From 37fb9a0231ee43d42d069863bdfd567fca2b61af Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 23 Nov 2011 20:07:17 +0000 Subject: powerpc/time: Handle wrapping of decrementer When re-enabling interrupts we have code to handle edge sensitive decrementers by resetting the decrementer to 1 whenever it is negative. If interrupts were disabled long enough that the decrementer wrapped to positive we do nothing. This means interrupts can be delayed for a long time until it finally goes negative again. While we hope interrupts are never disabled long enough for the decrementer to go positive, we have a very good test team that can drive any kernel into the ground. The softlockup data we get back from these failures could be seconds in the future, completely missing the cause of the lockup. We already keep track of the timebase of the next event so use that to work out if we should trigger a decrementer exception. Signed-off-by: Anton Blanchard Cc: stable@kernel.org Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 522bb1dfc35..5db163c9675 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -889,6 +889,15 @@ static void __init clocksource_init(void) clock->name, clock->mult, clock->shift); } +void decrementer_check_overflow(void) +{ + u64 now = get_tb_or_rtc(); + struct decrementer_clock *decrementer = &__get_cpu_var(decrementers); + + if (now >= decrementer->next_tb) + set_dec(1); +} + static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { -- cgit v1.2.3 From d8afc6fd95496204174f19af0cb39eefee0c3e8a Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 23 Nov 2011 20:07:18 +0000 Subject: powerpc/time: Use clockevents_calc_mult_shift We can use clockevents_calc_mult_shift instead of doing all the work ourselves.
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 30 ++---------------------------- 1 file changed, 2 insertions(+), 28 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 5db163c9675..fae3094c2a9 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -112,8 +112,6 @@ static void decrementer_set_mode(enum clock_event_mode mode, static struct clock_event_device decrementer_clockevent = { .name = "decrementer", .rating = 200, - .shift = 0, /* To be filled in */ - .mult = 0, /* To be filled in */ .irq = 0, .set_next_event = decrementer_set_next_event, .set_mode = decrementer_set_mode, @@ -913,31 +911,6 @@ static void decrementer_set_mode(enum clock_event_mode mode, decrementer_set_next_event(DECREMENTER_MAX, dev); } -static inline uint64_t div_sc64(unsigned long ticks, unsigned long nsec, - int shift) -{ - uint64_t tmp = ((uint64_t)ticks) << shift; - - do_div(tmp, nsec); - return tmp; -} - -static void __init setup_clockevent_multiplier(unsigned long hz) -{ - u64 mult, shift = 32; - - while (1) { - mult = div_sc64(hz, NSEC_PER_SEC, shift); - if (mult && (mult >> 32UL) == 0UL) - break; - - shift--; - } - - decrementer_clockevent.shift = shift; - decrementer_clockevent.mult = mult; -} - static void register_decrementer_clockevent(int cpu) { struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; @@ -955,7 +928,8 @@ static void __init init_decrementer_clockevent(void) { int cpu = smp_processor_id(); - setup_clockevent_multiplier(ppc_tb_freq); + clockevents_calc_mult_shift(&decrementer_clockevent, ppc_tb_freq, 4); + decrementer_clockevent.max_delta_ns = clockevent_delta2ns(DECREMENTER_MAX, &decrementer_clockevent); decrementer_clockevent.min_delta_ns = -- cgit v1.2.3 From 11b8633ada8633991e584951d0027f2741162201 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 23 Nov 2011 20:07:19 +0000 Subject: 
powerpc/time: Use clocksource_register_hz Use clocksource_register_hz which calculates the shift/mult factors for us. Also remove the shift = 22 assumption in vsyscall_update - thanks to Paul Mackerras and John Stultz for catching that. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index fae3094c2a9..d204b726a18 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -86,8 +86,6 @@ static struct clocksource clocksource_rtc = { .rating = 400, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .mult = 0, /* To be filled in */ .read = rtc_read, }; @@ -97,8 +95,6 @@ static struct clocksource clocksource_timebase = { .rating = 400, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, - .mult = 0, /* To be filled in */ .read = timebase_read, }; @@ -822,9 +818,8 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, ++vdso_data->tb_update_count; smp_mb(); - /* XXX this assumes clock->shift == 22 */ - /* 4611686018 ~= 2^(20+64-22) / 1e9 */ - new_tb_to_xs = (u64) mult * 4611686018ULL; + /* 19342813113834067 ~= 2^(20+64) / 1e9 */ + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; do_div(new_stamp_xsec, 1000000000); new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; @@ -875,9 +870,7 @@ static void __init clocksource_init(void) else clock = &clocksource_timebase; - clock->mult = clocksource_hz2mult(tb_ticks_per_sec, clock->shift); - - if (clocksource_register(clock)) { + if (clocksource_register_hz(clock, tb_ticks_per_sec)) { printk(KERN_ERR "clocksource: %s is already registered\n", clock->name); return; -- cgit v1.2.3 From 68568add2ca70153cca3dd1858eaa0776821cf75 
Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 23 Nov 2011 20:07:20 +0000 Subject: powerpc/time: Remove unnecessary sanity check of decrementer expiration The clockevents code uses max_delta_ns to avoid calling a clockevent with too large a value. Remove the redundant version of this in the timer_interrupt code. Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index d204b726a18..2eaaa242c2e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -572,7 +572,6 @@ void timer_interrupt(struct pt_regs * regs) struct pt_regs *old_regs; struct decrementer_clock *decrementer = &__get_cpu_var(decrementers); struct clock_event_device *evt = &decrementer->event; - u64 now; /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. @@ -607,16 +606,9 @@ void timer_interrupt(struct pt_regs * regs) get_lppaca()->int_dword.fields.decr_int = 0; #endif - now = get_tb_or_rtc(); - if (now >= decrementer->next_tb) { - decrementer->next_tb = ~(u64)0; - if (evt->event_handler) - evt->event_handler(evt); - } else { - now = decrementer->next_tb - now; - if (now <= DECREMENTER_MAX) - set_dec((int)now); - } + decrementer->next_tb = ~(u64)0; + if (evt->event_handler) + evt->event_handler(evt); #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES) && hvlpevent_is_pending()) -- cgit v1.2.3 From 621692cb7efb6d0e38c62e41844a6360c6719b20 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 23 Nov 2011 20:07:21 +0000 Subject: powerpc/time: Fix some style issues Fix some formatting issues and use the DECREMENTER_MAX define instead of 0x7fffffff. 
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/time.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2eaaa242c2e..b1990b987e2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -106,12 +106,12 @@ static void decrementer_set_mode(enum clock_event_mode mode, struct clock_event_device *dev); static struct clock_event_device decrementer_clockevent = { - .name = "decrementer", - .rating = 200, - .irq = 0, - .set_next_event = decrementer_set_next_event, - .set_mode = decrementer_set_mode, - .features = CLOCK_EVT_FEAT_ONESHOT, + .name = "decrementer", + .rating = 200, + .irq = 0, + .set_next_event = decrementer_set_next_event, + .set_mode = decrementer_set_mode, + .features = CLOCK_EVT_FEAT_ONESHOT, }; struct decrementer_clock { @@ -435,7 +435,7 @@ EXPORT_SYMBOL(profile_pc); /* * This function recalibrates the timebase based on the 49-bit time-of-day * value in the Titan chip. The Titan is much more accurate than the value - * returned by the service processor for the timebase frequency. + * returned by the service processor for the timebase frequency. */ static int __init iSeries_tb_recal(void) @@ -636,9 +636,9 @@ static void generic_suspend_disable_irqs(void) * with suspending. 
*/ - set_dec(0x7fffffff); + set_dec(DECREMENTER_MAX); local_irq_disable(); - set_dec(0x7fffffff); + set_dec(DECREMENTER_MAX); } static void generic_suspend_enable_irqs(void) @@ -982,10 +982,10 @@ void __init time_init(void) boot_tb = get_tb_or_rtc(); /* If platform provided a timezone (pmac), we correct the time */ - if (timezone_offset) { + if (timezone_offset) { sys_tz.tz_minuteswest = -timezone_offset / 60; sys_tz.tz_dsttime = 0; - } + } vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; -- cgit v1.2.3 From 7df1027542c9353bef4d027cb4ab8e99f69017b7 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Wed, 23 Nov 2011 20:07:22 +0000 Subject: powerpc/time: Optimise decrementer_check_overflow decrementer_check_overflow is called from arch_local_irq_restore so we want to make it as light weight as possible. As such, turn decrementer_check_overflow into an inline function. To avoid a circular mess of includes, separate out the two components of struct decrementer_clock and keep the struct clock_event_device part local to time.c. The fast path improves from: arch_local_irq_restore 0: mflr r0 4: std r0,16(r1) 8: stdu r1,-112(r1) c: stb r3,578(r13) 10: cmpdi cr7,r3,0 14: beq- cr7,24 <.arch_local_irq_restore+0x24> ... 24: addi r1,r1,112 28: ld r0,16(r1) 2c: mtlr r0 30: blr to: arch_local_irq_restore 0: std r30,-16(r1) 4: ld r30,0(r2) 8: stb r3,578(r13) c: cmpdi cr7,r3,0 10: beq- cr7,6c <.arch_local_irq_restore+0x6c> ... 6c: ld r30,-16(r1) 70: blr Unfortunately we still setup a local TOC (due to -mminimal-toc). Yet another sign we should be moving to -mcmodel=medium. 
Signed-off-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/time.h | 2 +- arch/powerpc/kernel/irq.c | 9 +++++++++ arch/powerpc/kernel/time.c | 27 +++++++-------------------- 3 files changed, 17 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index bc3c745cb90..7eb10fb96cd 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -219,7 +219,7 @@ DECLARE_PER_CPU(struct cpu_usage, cpu_usage_array); extern void secondary_cpu_time_init(void); extern void iSeries_time_init_early(void); -extern void decrementer_check_overflow(void); +DECLARE_PER_CPU(u64, decrementers_next_tb); #endif /* __KERNEL__ */ #endif /* __POWERPC_TIME_H */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 745c1e7c10f..2ff4f5e5962 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -115,6 +115,15 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } +static inline notrace void decrementer_check_overflow(void) +{ + u64 now = get_tb_or_rtc(); + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + + if (now >= *next_tb) + set_dec(1); +} + notrace void arch_local_irq_restore(unsigned long en) { /* diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index b1990b987e2..9754743db8b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -114,12 +114,8 @@ static struct clock_event_device decrementer_clockevent = { .features = CLOCK_EVT_FEAT_ONESHOT, }; -struct decrementer_clock { - struct clock_event_device event; - u64 next_tb; -}; - -static DEFINE_PER_CPU(struct decrementer_clock, decrementers); +DEFINE_PER_CPU(u64, decrementers_next_tb); +static DEFINE_PER_CPU(struct clock_event_device, decrementers); #ifdef CONFIG_PPC_ISERIES static unsigned long __initdata 
iSeries_recal_titan; @@ -570,8 +566,8 @@ void arch_irq_work_raise(void) void timer_interrupt(struct pt_regs * regs) { struct pt_regs *old_regs; - struct decrementer_clock *decrementer = &__get_cpu_var(decrementers); - struct clock_event_device *evt = &decrementer->event; + u64 *next_tb = &__get_cpu_var(decrementers_next_tb); + struct clock_event_device *evt = &__get_cpu_var(decrementers); /* Ensure a positive value is written to the decrementer, or else * some CPUs will continue to take decrementer exceptions. @@ -606,7 +602,7 @@ void timer_interrupt(struct pt_regs * regs) get_lppaca()->int_dword.fields.decr_int = 0; #endif - decrementer->next_tb = ~(u64)0; + *next_tb = ~(u64)0; if (evt->event_handler) evt->event_handler(evt); @@ -872,19 +868,10 @@ static void __init clocksource_init(void) clock->name, clock->mult, clock->shift); } -void decrementer_check_overflow(void) -{ - u64 now = get_tb_or_rtc(); - struct decrementer_clock *decrementer = &__get_cpu_var(decrementers); - - if (now >= decrementer->next_tb) - set_dec(1); -} - static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { - __get_cpu_var(decrementers).next_tb = get_tb_or_rtc() + evt; + __get_cpu_var(decrementers_next_tb) = get_tb_or_rtc() + evt; set_dec(evt); return 0; } @@ -898,7 +885,7 @@ static void decrementer_set_mode(enum clock_event_mode mode, static void register_decrementer_clockevent(int cpu) { - struct clock_event_device *dec = &per_cpu(decrementers, cpu).event; + struct clock_event_device *dec = &per_cpu(decrementers, cpu); *dec = decrementer_clockevent; dec->cpumask = cpumask_of(cpu); -- cgit v1.2.3 From 9f5072d4f63f28d30d343573830ac6c85fc0deff Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Fri, 9 Dec 2011 11:35:08 +0000 Subject: powerpc: Fix wrong divisor in usecs_to_cputime Commit d57af9b (taskstats: use real microsecond granularity for CPU times) renamed msecs_to_cputime to usecs_to_cputime, but failed to update all numbers on the way. 
This causes nonsensical cpu idle/iowait values to be displayed in /proc/stat (the only user of usecs_to_cputime so far). This also renames __cputime_msec_factor to __cputime_usec_factor, adapting its value and using it directly in cputime_to_usecs instead of doing two multiplications. Signed-off-by: Andreas Schwab Acked-by: Anton Blanchard Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/include/asm/cputime.h | 6 +++--- arch/powerpc/kernel/time.c | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/kernel/time.c') diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 1cf20bdfbec..33a35801f7c 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -126,11 +126,11 @@ static inline u64 cputime64_to_jiffies64(const cputime_t ct) /* * Convert cputime <-> microseconds */ -extern u64 __cputime_msec_factor; +extern u64 __cputime_usec_factor; static inline unsigned long cputime_to_usecs(const cputime_t ct) { - return mulhdu(ct, __cputime_msec_factor) * USEC_PER_MSEC; + return mulhdu(ct, __cputime_usec_factor); } static inline cputime_t usecs_to_cputime(const unsigned long us) @@ -143,7 +143,7 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) sec = us / 1000000; if (ct) { ct *= tb_ticks_per_sec; - do_div(ct, 1000); + do_div(ct, 1000000); } if (sec) ct += (cputime_t) sec * tb_ticks_per_sec; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 9754743db8b..567dd7c3ac2 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -158,13 +158,13 @@ EXPORT_SYMBOL_GPL(ppc_tb_freq); #ifdef CONFIG_VIRT_CPU_ACCOUNTING /* * Factors for converting from cputime_t (timebase ticks) to - * jiffies, milliseconds, seconds, and clock_t (1/USER_HZ seconds). + * jiffies, microseconds, seconds, and clock_t (1/USER_HZ seconds). * These are all stored as 0.64 fixed-point binary fractions. 
*/ u64 __cputime_jiffies_factor; EXPORT_SYMBOL(__cputime_jiffies_factor); -u64 __cputime_msec_factor; -EXPORT_SYMBOL(__cputime_msec_factor); +u64 __cputime_usec_factor; +EXPORT_SYMBOL(__cputime_usec_factor); u64 __cputime_sec_factor; EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; @@ -182,8 +182,8 @@ static void calc_cputime_factors(void) div128_by_32(HZ, 0, tb_ticks_per_sec, &res); __cputime_jiffies_factor = res.result_low; - div128_by_32(1000, 0, tb_ticks_per_sec, &res); - __cputime_msec_factor = res.result_low; + div128_by_32(1000000, 0, tb_ticks_per_sec, &res); + __cputime_usec_factor = res.result_low; div128_by_32(1, 0, tb_ticks_per_sec, &res); __cputime_sec_factor = res.result_low; div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); -- cgit v1.2.3