diff options
author | Mandeep Singh Baines <msb@chromium.org> | 2011-05-22 22:10:23 -0700 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2011-05-23 11:58:59 +0200 |
commit | 4eec42f392043063d0f019640b4ccc2a45570002 (patch) | |
tree | 32db1c354f9c12d1275093efed8101a2bd5db232 | |
parent | 586692a5a5fc5740c8a46abc0f2365495c2d7c5f (diff) |
watchdog: Change the default timeout and configure nmi watchdog period based on watchdog_thresh
Before the conversion of the NMI watchdog to perf event, the
watchdog timeout was 5 seconds. Now it is 60 seconds. For my
particular application, netbooks, 5 seconds was a better
timeout. With a short timeout, we catch faults earlier and are
able to send back a panic. With a 60 second timeout, the user is
unlikely to wait and will instead hit the power button, causing
us to lose the panic info.
This change configures the NMI period to watchdog_thresh and
sets the softlockup_thresh to watchdog_thresh * 2. In addition,
watchdog_thresh was reduced to 10 seconds as suggested by Ingo
Molnar.
Signed-off-by: Mandeep Singh Baines <msb@chromium.org>
Cc: Marcin Slusarz <marcin.slusarz@gmail.com>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20110517071642.GF22305@elte.hu>
-rw-r--r-- | arch/x86/kernel/apic/hw_nmi.c | 4 | ||||
-rw-r--r-- | include/linux/nmi.h | 2 | ||||
-rw-r--r-- | kernel/watchdog.c | 19 |
3 files changed, 18 insertions, 7 deletions
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 5260fe91bcb6..d5e57db0f7be 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,9 +19,9 @@ #include <linux/delay.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR -u64 hw_nmi_get_sample_period(void) +u64 hw_nmi_get_sample_period(int watchdog_thresh) { - return (u64)(cpu_khz) * 1000 * 60; + return (u64)(cpu_khz) * 1000 * watchdog_thresh; } #endif diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 5317b8b2198f..2d304efc89df 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -45,7 +45,7 @@ static inline bool trigger_all_cpu_backtrace(void) #ifdef CONFIG_LOCKUP_DETECTOR int hw_nmi_is_cpu_stuck(struct pt_regs *); -u64 hw_nmi_get_sample_period(void); +u64 hw_nmi_get_sample_period(int watchdog_thresh); extern int watchdog_enabled; extern int watchdog_thresh; struct ctl_table; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 60301916f62e..6e63097fa73a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -28,7 +28,7 @@ #include <linux/perf_event.h> int watchdog_enabled = 1; -int __read_mostly watchdog_thresh = 60; +int __read_mostly watchdog_thresh = 10; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); /* */ +/* + * Hard-lockup warnings should be triggered after just a few seconds. Soft- + * lockups can have false positives under extreme conditions. So we generally + * want a higher threshold for soft lockups than for hard lockups. So we couple + * the thresholds with a factor: we make the soft threshold twice the amount of + * time the hard threshold is. + */ +static int get_softlockup_thresh() +{ + return watchdog_thresh * 2; +} /* * Returns seconds, approximately. We don't need nanosecond @@ -110,7 +121,7 @@ static unsigned long get_sample_period(void) * increment before the hardlockup detector generates * a warning */ - return watchdog_thresh * (NSEC_PER_SEC / 5); + return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts) unsigned long now = get_timestamp(smp_processor_id()); /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + watchdog_thresh)) + if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; return 0; @@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(); + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); |