From 824c6b7f6294101f30e141117def224a56c203e6 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines
Date: Sun, 22 May 2011 22:10:20 -0700
Subject: watchdog: Fix rounding bug in get_sample_period()

In get_sample_period(), softlockup_thresh is integer divided by 5
before the multiplication by NSEC_PER_SEC. This results in
softlockup_thresh being rounded down to the nearest integer multiple
of 5. For example, a softlockup_thresh of 4 rounds down to 0.

Signed-off-by: Mandeep Singh Baines
Cc: Marcin Slusarz
Cc: Don Zickus
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Link: http://lkml.kernel.org/r/1306127423-3347-1-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar
---
 kernel/watchdog.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14733d4d156..a06972d7106 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -110,7 +110,7 @@ static unsigned long get_sample_period(void)
          * increment before the hardlockup detector generates
          * a warning
          */
-        return softlockup_thresh / 5 * NSEC_PER_SEC;
+        return softlockup_thresh * (NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
--
cgit v1.2.3

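The rounding behaviour described above is easy to reproduce outside the
kernel. The following standalone C sketch is illustrative only (it is not
part of the patch; NSEC_PER_SEC is defined locally to match the kernel's
value) and contrasts the old and new expressions for a 4 second threshold:

/* Standalone illustration of the rounding bug fixed above (not kernel code). */
#include <stdio.h>

#define NSEC_PER_SEC 1000000000UL

int main(void)
{
        unsigned long thresh = 4;       /* seconds, as in the commit message */

        /* Old expression: the integer division runs first, and 4 / 5 == 0. */
        unsigned long before = thresh / 5 * NSEC_PER_SEC;

        /* New expression: the constant is divided instead, so nothing is lost. */
        unsigned long after = thresh * (NSEC_PER_SEC / 5);

        printf("before: %lu ns, after: %lu ns\n", before, after); /* 0 vs 800000000 */
        return 0;
}
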
From e04ab2bc41b35c0cb6cdb07c8443f91aa738cf78 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines
Date: Sun, 22 May 2011 22:10:21 -0700
Subject: watchdog: Only disable/enable watchdog if necessary

Don't take any action on an unsuccessful write to /proc.

Signed-off-by: Mandeep Singh Baines
Cc: Marcin Slusarz
Cc: Don Zickus
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Link: http://lkml.kernel.org/r/1306127423-3347-2-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar
---
 kernel/watchdog.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a06972d7106..cf0e09f452e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -507,15 +507,19 @@ static void watchdog_disable_all_cpus(void)
 int proc_dowatchdog_enabled(struct ctl_table *table, int write,
                      void __user *buffer, size_t *length, loff_t *ppos)
 {
-        proc_dointvec(table, write, buffer, length, ppos);
+        int ret;
 
-        if (write) {
-                if (watchdog_enabled)
-                        watchdog_enable_all_cpus();
-                else
-                        watchdog_disable_all_cpus();
-        }
-        return 0;
+        ret = proc_dointvec(table, write, buffer, length, ppos);
+        if (ret || !write)
+                goto out;
+
+        if (watchdog_enabled)
+                watchdog_enable_all_cpus();
+        else
+                watchdog_disable_all_cpus();
+
+out:
+        return ret;
 }
 
 int proc_dowatchdog_thresh(struct ctl_table *table, int write,
--
cgit v1.2.3

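The guard added here returns early unless the proc write both succeeded and
was in fact a write. The userspace sketch below is only an analogy of that
control flow, not kernel code: parse_value(), apply_setting() and handle()
are made-up names standing in for proc_dointvec() and the enable/disable
calls.

/* Userspace analogy of the "act only on a successful write" flow (not kernel code). */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int enabled;                     /* stands in for watchdog_enabled */

static void apply_setting(void)         /* stands in for the enable/disable calls */
{
        printf("watchdog %s\n", enabled ? "enabled" : "disabled");
}

/* Plays the role of proc_dointvec(): 0 on success, -EINVAL on a malformed value. */
static int parse_value(const char *buf)
{
        char *end;
        long val = strtol(buf, &end, 10);

        if (end == buf || (*end != '\0' && *end != '\n'))
                return -EINVAL;
        enabled = (int)val;
        return 0;
}

static int handle(const char *buf, int write)
{
        int ret = 0;

        if (write)
                ret = parse_value(buf);

        if (ret || !write)              /* the patch's guard: no action on error or read */
                return ret;

        apply_setting();
        return 0;
}

int main(void)
{
        handle("1", 1);                 /* valid write: setting is applied */
        handle("junk", 1);              /* invalid write: nothing happens, error returned */
        return 0;
}
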
From 586692a5a5fc5740c8a46abc0f2365495c2d7c5f Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines
Date: Sun, 22 May 2011 22:10:22 -0700
Subject: watchdog: Disable watchdog when thresh is zero

This restores the previous behavior of softlockup_thresh.

Currently, setting watchdog_thresh to zero causes the watchdog
kthreads to consume a lot of CPU.

In addition, the logic of proc_dowatchdog_thresh and
proc_dowatchdog_enabled has been factored into proc_dowatchdog.

Signed-off-by: Mandeep Singh Baines
Cc: Marcin Slusarz
Cc: Don Zickus
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Link: http://lkml.kernel.org/r/1306127423-3347-3-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar
LKML-Reference: <20110517071018.GE22305@elte.hu>
---
 include/linux/nmi.h   |  5 +++--
 include/linux/sched.h |  1 -
 kernel/sysctl.c       | 12 ++++++++----
 kernel/watchdog.c     | 25 +++++++++----------------
 4 files changed, 20 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index c536f8545f7..5317b8b2198 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -47,9 +47,10 @@ static inline bool trigger_all_cpu_backtrace(void)
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
 u64 hw_nmi_get_sample_period(void);
 extern int watchdog_enabled;
+extern int watchdog_thresh;
 struct ctl_table;
-extern int proc_dowatchdog_enabled(struct ctl_table *, int ,
-                        void __user *, size_t *, loff_t *);
+extern int proc_dowatchdog(struct ctl_table *, int ,
+                        void __user *, size_t *, loff_t *);
 #endif
 
 #endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 12211e1666e..d8b2d0bec0d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -315,7 +315,6 @@ extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
                                   void __user *buffer,
                                   size_t *lenp, loff_t *ppos);
 extern unsigned int softlockup_panic;
-extern int softlockup_thresh;
 void lockup_detector_init(void);
 #else
 static inline void touch_softlockup_watchdog(void)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c0bb32414b1..3dd0c46fa3b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -730,14 +730,16 @@ static struct ctl_table kern_table[] = {
                 .data = &watchdog_enabled,
                 .maxlen = sizeof (int),
                 .mode = 0644,
-                .proc_handler = proc_dowatchdog_enabled,
+                .proc_handler = proc_dowatchdog,
+                .extra1 = &zero,
+                .extra2 = &one,
         },
         {
                 .procname = "watchdog_thresh",
-                .data = &softlockup_thresh,
+                .data = &watchdog_thresh,
                 .maxlen = sizeof(int),
                 .mode = 0644,
-                .proc_handler = proc_dowatchdog_thresh,
+                .proc_handler = proc_dowatchdog,
                 .extra1 = &neg_one,
                 .extra2 = &sixty,
         },
@@ -755,7 +757,9 @@ static struct ctl_table kern_table[] = {
                 .data = &watchdog_enabled,
                 .maxlen = sizeof (int),
                 .mode = 0644,
-                .proc_handler = proc_dowatchdog_enabled,
+                .proc_handler = proc_dowatchdog,
+                .extra1 = &zero,
+                .extra2 = &one,
         },
 #endif
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index cf0e09f452e..60301916f62 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
 #include
 
 int watchdog_enabled = 1;
-int __read_mostly softlockup_thresh = 60;
+int __read_mostly watchdog_thresh = 60;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -105,12 +105,12 @@ static unsigned long get_timestamp(int this_cpu)
 static unsigned long get_sample_period(void)
 {
         /*
-         * convert softlockup_thresh from seconds to ns
+         * convert watchdog_thresh from seconds to ns
          * the divide by 5 is to give hrtimer 5 chances to
          * increment before the hardlockup detector generates
          * a warning
          */
-        return softlockup_thresh * (NSEC_PER_SEC / 5);
+        return watchdog_thresh * (NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
@@ -182,7 +182,7 @@ static int is_softlockup(unsigned long touch_ts)
         unsigned long now = get_timestamp(smp_processor_id());
 
         /* Warn about unreasonable delays: */
-        if (time_after(now, touch_ts + softlockup_thresh))
+        if (time_after(now, touch_ts + watchdog_thresh))
                 return now - touch_ts;
 
         return 0;
@@ -501,19 +501,19 @@ static void watchdog_disable_all_cpus(void)
 
 /* sysctl functions */
 #ifdef CONFIG_SYSCTL
 /*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
  */
-int proc_dowatchdog_enabled(struct ctl_table *table, int write,
-                     void __user *buffer, size_t *length, loff_t *ppos)
+int proc_dowatchdog(struct ctl_table *table, int write,
+                    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
         int ret;
 
-        ret = proc_dointvec(table, write, buffer, length, ppos);
+        ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
         if (ret || !write)
                 goto out;
 
-        if (watchdog_enabled)
+        if (watchdog_enabled && watchdog_thresh)
                 watchdog_enable_all_cpus();
         else
                 watchdog_disable_all_cpus();
@@ -521,13 +521,6 @@ int proc_dowatchdog_enabled(struct ctl_table *table, int write,
 out:
         return ret;
 }
-
-int proc_dowatchdog_thresh(struct ctl_table *table, int write,
-                             void __user *buffer,
-                             size_t *lenp, loff_t *ppos)
-{
-        return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-}
 #endif /* CONFIG_SYSCTL */
--
cgit v1.2.3

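With this patch, writing 0 to the watchdog_thresh sysctl stops the watchdog
threads and writing an accepted non-zero value re-arms them. The minimal
userspace sketch below shows one way to drive the new handler; it is not
part of the patch, assumes a kernel built with CONFIG_LOCKUP_DETECTOR and
this change applied, and must run as root:

/*
 * Minimal sketch of exercising the combined proc_dowatchdog handler from
 * userspace (not part of the patch; requires root).
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static int set_watchdog_thresh(const char *val)
{
        FILE *f = fopen("/proc/sys/kernel/watchdog_thresh", "w");

        if (!f) {
                fprintf(stderr, "watchdog_thresh: %s\n", strerror(errno));
                return -1;
        }
        fprintf(f, "%s\n", val);
        return fclose(f);
}

int main(void)
{
        set_watchdog_thresh("0");       /* zero now disables the watchdog entirely */
        set_watchdog_thresh("10");      /* an accepted non-zero value re-enables it */
        return 0;
}
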
From 4eec42f392043063d0f019640b4ccc2a45570002 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines
Date: Sun, 22 May 2011 22:10:23 -0700
Subject: watchdog: Change the default timeout and configure nmi watchdog
 period based on watchdog_thresh

Before the conversion of the NMI watchdog to perf event, the watchdog
timeout was 5 seconds. Now it is 60 seconds. For my particular
application, netbooks, 5 seconds was a better timeout. With a short
timeout, we catch faults earlier and are able to send back a panic.
With a 60 second timeout, the user is unlikely to wait and will
instead hit the power button, causing us to lose the panic info.

This change configures the NMI period to watchdog_thresh and sets the
softlockup_thresh to watchdog_thresh * 2. In addition, watchdog_thresh
was reduced to 10 seconds as suggested by Ingo Molnar.

Signed-off-by: Mandeep Singh Baines
Cc: Marcin Slusarz
Cc: Don Zickus
Cc: Peter Zijlstra
Cc: Frederic Weisbecker
Link: http://lkml.kernel.org/r/1306127423-3347-4-git-send-email-msb@chromium.org
Signed-off-by: Ingo Molnar
LKML-Reference: <20110517071642.GF22305@elte.hu>
---
 arch/x86/kernel/apic/hw_nmi.c |  4 ++--
 include/linux/nmi.h           |  2 +-
 kernel/watchdog.c             | 19 +++++++++++++++----
 3 files changed, 18 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb..d5e57db0f7b 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
 #include
 
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(void)
+u64 hw_nmi_get_sample_period(int watchdog_thresh)
 {
-        return (u64)(cpu_khz) * 1000 * 60;
+        return (u64)(cpu_khz) * 1000 * watchdog_thresh;
 }
 #endif
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 5317b8b2198..2d304efc89d 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,7 +45,7 @@ static inline bool trigger_all_cpu_backtrace(void)
 
 #ifdef CONFIG_LOCKUP_DETECTOR
 int hw_nmi_is_cpu_stuck(struct pt_regs *);
-u64 hw_nmi_get_sample_period(void);
+u64 hw_nmi_get_sample_period(int watchdog_thresh);
 extern int watchdog_enabled;
 extern int watchdog_thresh;
 struct ctl_table;
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 60301916f62..6e63097fa73 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
 #include
 
 int watchdog_enabled = 1;
-int __read_mostly watchdog_thresh = 60;
+int __read_mostly watchdog_thresh = 10;
 
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
 __setup("nosoftlockup", nosoftlockup_setup);
 /* */
 
+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh()
+{
+        return watchdog_thresh * 2;
+}
 
 /*
  * Returns seconds, approximately. We don't need nanosecond
@@ -110,7 +121,7 @@ static unsigned long get_sample_period(void)
          * increment before the hardlockup detector generates
          * a warning
          */
-        return watchdog_thresh * (NSEC_PER_SEC / 5);
+        return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
 }
 
 /* Commands for resetting the watchdog */
@@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
         unsigned long now = get_timestamp(smp_processor_id());
 
         /* Warn about unreasonable delays: */
-        if (time_after(now, touch_ts + watchdog_thresh))
+        if (time_after(now, touch_ts + get_softlockup_thresh()))
                 return now - touch_ts;
 
         return 0;
@@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)
 
         /* Try to register using hardware perf events */
         wd_attr = &wd_hw_attr;
-        wd_attr->sample_period = hw_nmi_get_sample_period();
+        wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
         event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
         if (!IS_ERR(event)) {
                 printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
--
cgit v1.2.3

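The resulting defaults are easy to check with a few lines of ordinary C.
The sketch below is not kernel code; the cpu_khz value is an assumed
example for a 2 GHz machine (the kernel reads the real value at boot).
For watchdog_thresh = 10 it reproduces a 10 second hard-lockup window, a
20 second soft-lockup threshold, and a 4 second soft-lockup hrtimer period:

/* Standalone check of the timeout arithmetic introduced above (not kernel code). */
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        int watchdog_thresh = 10;               /* new default, in seconds */
        unsigned long long cpu_khz = 2000000;   /* assumed 2 GHz CPU for illustration */

        /* NMI (hard-lockup) period: cycles corresponding to watchdog_thresh seconds. */
        unsigned long long nmi_period = cpu_khz * 1000 * watchdog_thresh;

        /* Soft-lockup threshold is twice the hard-lockup threshold. */
        int softlockup_thresh = watchdog_thresh * 2;

        /* The soft-lockup hrtimer fires 5 times per threshold window. */
        unsigned long long sample_period =
                (unsigned long long)softlockup_thresh * (NSEC_PER_SEC / 5);

        printf("hard-lockup period: %llu cycles (%d s)\n", nmi_period, watchdog_thresh);
        printf("soft-lockup threshold: %d s, hrtimer period: %llu ns (%.1f s)\n",
               softlockup_thresh, sample_period, sample_period / 1e9);
        return 0;
}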