From dc61b1d65e353d638b2445f71fb8e5b5630f2415 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Jun 2010 11:40:42 +0200 Subject: sched: Fix PROVE_RCU vs cpu_cgroup PROVE_RCU has a few issues with the cpu_cgroup because the scheduler typically holds rq->lock around the css rcu derefs but the generic cgroup code doesn't (and can't) know about that lock. Provide means to add extra checks to the css dereference and use that in the scheduler to annotate its users. The addition of rq->lock to these checks is correct because the cgroup_subsys::attach() method takes the rq->lock for each task it moves, therefore by holding that lock, we ensure the task is pinned to the current cgroup and the RCU derefence is valid. That leaves one genuine race in __sched_setscheduler() where we used task_group() without holding any of the required locks and thus raced with the cgroup code. Solve this by moving the check under the appropriate lock. Signed-off-by: Peter Zijlstra Cc: "Paul E. McKenney" LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 115 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 59 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996228d..2aaceebd484 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; */ struct task_group init_task_group; -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_CGROUP_SCHED - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ - /* - * Strictly speaking this rcu_read_lock() is not needed since the - * task_group is tied to the cgroup, which in turn can never go away - * as long as there are tasks attached to it. - * - * However since task_group() uses task_subsys_state() which is an - * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. - */ - rcu_read_lock(); -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif - rcu_read_unlock(); -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - #endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ @@ -644,6 +598,49 @@ static inline int cpu_of(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&task_rq(p)->lock)); + return container_of(css, struct task_group, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + inline void update_rq_clock(struct rq *rq) { if (!rq->skip_clock_update) @@ -4465,16 +4462,6 @@ recheck: } if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4490,6 +4477,22 @@ recheck: * runqueue lock must be held. */ rq = __task_rq_lock(p); + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + } +#endif + /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; -- cgit v1.2.3 From 4673247562e39a17e09440fa1400819522ccd446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Jun 2010 17:53:51 +0200 Subject: genirq: Deal with desc->set_type() changing desc->chip The set_type() function can change the chip implementation when the trigger mode changes. That might result in using an non-initialized irq chip when called from __setup_irq() or when called via set_irq_type() on an already enabled irq. The set_irq_type() function should not be called on an enabled irq, but because we forgot to put a check into it, we have a bunch of users which grew the habit of doing that and it never blew up as the function is serialized via desc->lock against all users of desc->chip and they never hit the non-initialized irq chip issue. The easy fix for the __setup_irq() issue would be to move the irq_chip_set_defaults(desc->chip) call after the trigger setting to make sure that a chip change is covered. But as we have already users, which do the type setting after request_irq(), the safe fix for now is to call irq_chip_set_defaults() from __irq_set_trigger() when desc->set_type() changed the irq chip. It needs a deeper analysis whether we should refuse to change the chip on an already enabled irq, but that'd be a large scale change to fix all the existing users. So that's neither stable nor 2.6.35 material. Reported-by: Esben Haabendal Signed-off-by: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: linuxppc-dev Cc: stable@kernel.org --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3164ba7ce15..e1497481fe8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; + + if (chip != desc->chip) + irq_chip_set_defaults(desc->chip); } return ret; -- cgit v1.2.3 From dd4c4f17d722ffeb2515bf781400675a30fcead7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 28 May 2010 16:32:14 -0400 Subject: suspend: Move NVS save/restore code to generic suspend functionality Saving platform non-volatile state may be required for suspend to RAM as well as hibernation. Move it to more generic code. Signed-off-by: Matthew Garrett Acked-by: Rafael J. Wysocki Tested-by: Maxim Levitsky Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 2 +- drivers/acpi/sleep.c | 12 ++-- include/linux/suspend.h | 26 ++++----- kernel/power/Kconfig | 9 +-- kernel/power/Makefile | 2 +- kernel/power/hibernate_nvs.c | 136 ------------------------------------------- kernel/power/nvs.c | 136 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/suspend.c | 6 ++ 8 files changed, 168 insertions(+), 161 deletions(-) delete mode 100644 kernel/power/hibernate_nvs.c create mode 100644 kernel/power/nvs.c (limited to 'kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7bca3c6a02f..0d6fc71bedb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - hibernate_nvs_register(ei->addr, ei->size); + suspend_nvs_register(ei->addr, ei->size); } return 0; diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 4ab2275b446..bcaa6efa813 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -393,7 +393,7 @@ static int acpi_hibernation_begin(void) { int error; - error = s4_no_nvs ? 0 : hibernate_nvs_alloc(); + error = s4_no_nvs ? 0 : suspend_nvs_alloc(); if (!error) { acpi_target_sleep_state = ACPI_STATE_S4; acpi_sleep_tts_switch(acpi_target_sleep_state); @@ -407,7 +407,7 @@ static int acpi_hibernation_pre_snapshot(void) int error = acpi_pm_prepare(); if (!error) - hibernate_nvs_save(); + suspend_nvs_save(); return error; } @@ -432,7 +432,7 @@ static int acpi_hibernation_enter(void) static void acpi_hibernation_finish(void) { - hibernate_nvs_free(); + suspend_nvs_free(); acpi_pm_finish(); } @@ -452,7 +452,7 @@ static void acpi_hibernation_leave(void) panic("ACPI S4 hardware signature mismatch"); } /* Restore the NVS memory area */ - hibernate_nvs_restore(); + suspend_nvs_restore(); } static int acpi_pm_pre_restore(void) @@ -501,7 +501,7 @@ static int acpi_hibernation_begin_old(void) if (!error) { if (!s4_no_nvs) - error = hibernate_nvs_alloc(); + error = suspend_nvs_alloc(); if (!error) acpi_target_sleep_state = ACPI_STATE_S4; } @@ -513,7 +513,7 @@ static int acpi_hibernation_pre_snapshot_old(void) int error = acpi_pm_disable_gpes(); if (!error) - hibernate_nvs_save(); + suspend_nvs_save(); return error; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5e781d824e6..bc7d6bb4cd8 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -256,22 +256,22 @@ static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ -#ifdef CONFIG_HIBERNATION_NVS -extern int hibernate_nvs_register(unsigned long start, unsigned long size); -extern int hibernate_nvs_alloc(void); -extern void hibernate_nvs_free(void); -extern void hibernate_nvs_save(void); -extern void hibernate_nvs_restore(void); -#else /* CONFIG_HIBERNATION_NVS */ -static inline int hibernate_nvs_register(unsigned long a, unsigned long b) +#ifdef CONFIG_SUSPEND_NVS +extern int suspend_nvs_register(unsigned long start, unsigned long size); +extern int suspend_nvs_alloc(void); +extern void suspend_nvs_free(void); +extern void suspend_nvs_save(void); +extern void suspend_nvs_restore(void); +#else /* CONFIG_SUSPEND_NVS */ +static inline int suspend_nvs_register(unsigned long a, unsigned long b) { return 0; } -static inline int hibernate_nvs_alloc(void) { return 0; } -static inline void hibernate_nvs_free(void) {} -static inline void hibernate_nvs_save(void) {} -static inline void hibernate_nvs_restore(void) {} -#endif /* CONFIG_HIBERNATION_NVS */ +static inline int suspend_nvs_alloc(void) { return 0; } +static inline void suspend_nvs_free(void) {} +static inline void suspend_nvs_save(void) {} +static inline void suspend_nvs_restore(void) {} +#endif /* CONFIG_SUSPEND_NVS */ #ifdef CONFIG_PM_SLEEP void save_processor_state(void); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5c36ea9d55d..ca6066a6952 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n +config SUSPEND_NVS + bool + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE + select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -130,13 +134,10 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. -config HIBERNATION_NVS - bool - config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE - select HIBERNATION_NVS if HAS_IOMEM + select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 524e058dcf0..f9063c6b185 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o +obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c deleted file mode 100644 index fdcad9ed5a7..00000000000 --- a/kernel/power/hibernate_nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * hibernate_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - hibernate_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * hibernate_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void hibernate_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c new file mode 100644 index 00000000000..1836db60bbb --- /dev/null +++ b/kernel/power/nvs.c @@ -0,0 +1,136 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * suspend and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * suspend_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int suspend_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * suspend_nvs_free - free data pages allocated for saving NVS regions + */ +void suspend_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int suspend_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + suspend_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * suspend_nvs_save - save NVS memory regions + */ +void suspend_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * suspend_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void suspend_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 56e7dbb8b99..f37cb7dd440 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -16,6 +16,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "power.h" -- cgit v1.2.3 From a8fb2608053547bc3152ea61a5ec7cdfce5d942c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jun 2010 14:53:16 -0400 Subject: perf/tracing: Fix regression of perf losing kprobe events With the addition of the code to shrink the kernel tracepoint infrastructure, we lost kprobes being traced by perf. The reason is that I tested if the "tp_event->class->perf_probe" existed before enabling it. This prevents "ftrace only" events (like the function trace events) from being enabled by perf. Unfortunately, kprobe events do not use perf_probe. This causes kprobes to be missed by perf. To fix this, we add the test to see if "tp_event->class->reg" exists as well as perf_probe. Normal trace events have only "perf_probe" but no "reg" function, and kprobes and syscalls have the "reg" but no "perf_probe". The ftrace unique events do not have either, so this is a valid test. If a kprobe or syscall is not to be probed by perf, the "reg" function is called anyway, and will return a failure and prevent perf from probing it. Reported-by: Srikar Dronamraju Tested-by: Srikar Dronamraju Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e6f65887842..8a2b73f7c06 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -96,7 +96,9 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && tp_event->class->perf_probe && + tp_event->class && + (tp_event->class->perf_probe || + tp_event->class->reg) && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; -- cgit v1.2.3 From 3310d4d38fbc514e7b18bd3b1eea8effdd63b5aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Jun 2010 18:02:37 +0200 Subject: nohz: Fix nohz ratelimit Chris Wedgwood reports that 39c0cbe (sched: Rate-limit nohz) causes a serial console regression, unresponsiveness, and indeed it does. The reason is that the nohz code is skipped even when the tick was already stopped before the nohz_ratelimit(cpu) condition changed. Move the nohz_ratelimit() check to the other conditions which prevent long idle sleeps. Reported-by: Chris Wedgwood Tested-by: Brian Bloniarz Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Jiri Kosina Cc: Linus Torvalds Cc: Greg KH Cc: Alan Cox Cc: OGAWA Hirofumi Cc: Jef Driesen LKML-Reference: <1276790557.27822.516.camel@twins> Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c03..783fbadf220 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; } - if (nohz_ratelimit(cpu)) - goto end; - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { @@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { -- cgit v1.2.3 From 3c93717cfa51316e4dbb471e7c0f9d243359d5f8 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Thu, 17 Jun 2010 14:08:13 +0800 Subject: sched: Fix over-scheduling bug Commit e70971591 ("sched: Optimize unused cgroup configuration") introduced an imbalanced scheduling bug. If we do not use CGROUP, function update_h_load won't update h_load. When the system has a large number of tasks far more than logical CPU number, the incorrect cfs_rq[cpu]->h_load value will cause load_balance() to pull too many tasks to the local CPU from the busiest CPU. So the busiest CPU keeps going in a round robin. That will hurt performance. The issue was found originally by a scientific calculation workload that developed by Yanmin. With that commit, the workload performance drops about 40%. CPU before after 00 : 2 : 7 01 : 1 : 7 02 : 11 : 6 03 : 12 : 7 04 : 6 : 6 05 : 11 : 7 06 : 10 : 6 07 : 12 : 7 08 : 11 : 6 09 : 12 : 6 10 : 1 : 6 11 : 1 : 6 12 : 6 : 6 13 : 2 : 6 14 : 2 : 6 15 : 1 : 6 Reviewed-by: Yanmin zhang Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra LKML-Reference: <1276754893.9452.5442.camel@debian> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2aaceebd484..6c9e7c8735b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1657,9 +1657,6 @@ static void update_shares(struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.2.3 From f3b577dec1f2ce32d2db6d2ca6badff7002512af Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 14:06:13 +0100 Subject: rcu: apply RCU protection to wake_affine() The task_group() function returns a pointer that must be protected by either RCU, the ->alloc_lock, or the cgroup lock (see the rcu_dereference_check() in task_subsys_state(), which is invoked by task_group()). The wake_affine() function currently does none of these, which means that a concurrent update would be within its rights to free the structure returned by task_group(). Because wake_affine() uses this structure only to compute load-balancing heuristics, there is no reason to acquire either of the two locks. Therefore, this commit introduces an RCU read-side critical section that starts before the first call to task_group() and ends after the last use of the "tg" pointer returned from task_group(). Thanks to Li Zefan for pointing out the need to extend the RCU read-side critical section from that proposed by the original patch. Signed-off-by: Daniel J Blueman Signed-off-by: Paul E. McKenney --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index eed35eded60..a878b5332da 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1240,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ + rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; + rcu_read_unlock(); /* * If the currently running task will sleep within -- cgit v1.2.3 From 8695159967957015f8dfb49315d6f88e111d90e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jun 2010 11:44:53 +0200 Subject: sched: silence PROVE_RCU in sched_fork() Because cgroup_fork() is ran before sched_fork() [ from copy_process() ] and the child's pid is not yet visible the child is pinned to its cgroup. Therefore we can silence this warning. A nicer solution would be moving cgroup_fork() to right after dup_task_struct() and exclude PF_STARTING from task_subsys_state(). Signed-off-by: Peter Zijlstra Reviewed-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/sched.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996228d..a2d215d132f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2494,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.3 From 0d98bb2656e9bd2dfda2d089db1fe1dbdab41504 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 24 May 2010 12:11:43 -0700 Subject: sched: Prevent compiler from optimising the sched_avg_update() loop GCC 4.4.1 on ARM has been observed to replace the while loop in sched_avg_update with a call to uldivmod, resulting in the following build failure at link-time: kernel/built-in.o: In function `sched_avg_update': kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' make: *** [.tmp_vmlinux1] Error 1 This patch introduces a fake data hazard to the loop body to prevent the compiler optimising the loop away. Signed-off-by: Will Deacon Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra Cc: Catalin Marinas Cc: Russell King Cc: Linus Torvalds Cc: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c9e7c8735b..a24d6d5d83f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1254,6 +1254,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } -- cgit v1.2.3 From e05bd3367bd3d88715b53766f95bb3a8ec7ab59e Mon Sep 17 00:00:00 2001 From: Pavan Naregundi Date: Tue, 29 Jun 2010 15:05:28 -0700 Subject: kexec: fix Oops in crash_shrink_memory() When crashkernel is not enabled, "echo 0 > /sys/kernel/kexec_crash_size" OOPSes the kernel in crash_shrink_memory. This happens when crash_shrink_memory tries to release the 'crashk_res' resource which are not reserved. Also value of "/sys/kernel/kexec_crash_size" shows as 1, which should be 0. This patch fixes the OOPS in crash_shrink_memory and shows "/sys/kernel/kexec_crash_size" as 0 when crash kernel memory is not reserved. Signed-off-by: Pavan Naregundi Reviewed-by: WANG Cong Cc: Simon Horman Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 474a84715ea..131b1703936 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); crashk_res.end = end - 1; -- cgit v1.2.3 From 7a0ea09ad5352efce8fe79ed853150449903b9f5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 30 Jun 2010 09:51:19 +0200 Subject: futex: futex_find_get_task remove credentails check futex_find_get_task is currently used (through lookup_pi_state) from two contexts, futex_requeue and futex_lock_pi_atomic. None of the paths looks it needs the credentials check, though. Different (e)uids shouldn't matter at all because the only thing that is important for shared futex is the accessibility of the shared memory. The credentail check results in glibc assert failure or process hang (if glibc is compiled without assert support) for shared robust pthread mutex with priority inheritance if a process tries to lock already held lock owned by a process with a different euid: pthread_mutex_lock.c:312: __pthread_mutex_lock_full: Assertion `(-(e)) != 3 || !robust' failed. The problem is that futex_lock_pi_atomic which is called when we try to lock already held lock checks the current holder (tid is stored in the futex value) to get the PI state. It uses lookup_pi_state which in turn gets task struct from futex_find_get_task. ESRCH is returned either when the task is not found or if credentials check fails. futex_lock_pi_atomic simply returns if it gets ESRCH. glibc code, however, doesn't expect that robust lock returns with ESRCH because it should get either success or owner died. Signed-off-by: Michal Hocko Acked-by: Darren Hart Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Nick Piggin Cc: Alexey Kuznetsov Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/futex.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f1039e..6a3a5fa1526 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, -- cgit v1.2.3 From 8c215bd3890c347dfb6a2db4779755f8b9c298a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Jul 2010 09:07:17 +0200 Subject: sched: Cure nr_iowait_cpu() users Commit 0224cf4c5e (sched: Intoduce get_cpu_iowait_time_us()) broke things by not making sure preemption was indeed disabled by the callers of nr_iowait_cpu() which took the iowait value of the current cpu. This resulted in a heap of preempt warnings. Cure this by making nr_iowait_cpu() take a cpu number and fix up the callers to pass in the right number. Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Cc: Sergey Senozhatsky Cc: Rafael J. Wysocki Cc: Maxim Levitsky Cc: Len Brown Cc: Pavel Machek Cc: Jiri Slaby Cc: linux-pm@lists.linux-foundation.org LKML-Reference: <1277968037.1868.120.camel@laptop> Signed-off-by: Ingo Molnar --- drivers/cpuidle/governors/menu.c | 4 ++-- include/linux/sched.h | 2 +- kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 16 ++++++++-------- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 52ff8aa63f8..1b128702d30 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -143,7 +143,7 @@ static inline int which_bucket(unsigned int duration) * This allows us to calculate * E(duration)|iowait */ - if (nr_iowait_cpu()) + if (nr_iowait_cpu(smp_processor_id())) bucket = BUCKETS/2; if (duration < 10) @@ -175,7 +175,7 @@ static inline int performance_multiplier(void) mult += 2 * get_loadavg(); /* for IO wait tasks (per cpu!) we add 5x each */ - mult += 10 * nr_iowait_cpu(); + mult += 10 * nr_iowait_cpu(smp_processor_id()); return mult; } diff --git a/include/linux/sched.h b/include/linux/sched.h index f118809c953..747fcaedddb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -139,7 +139,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); -extern unsigned long nr_iowait_cpu(void); +extern unsigned long nr_iowait_cpu(int cpu); extern unsigned long this_cpu_load(void); diff --git a/kernel/sched.c b/kernel/sched.c index a24d6d5d83f..f87abe3b017 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2864,9 +2864,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c03..1a6f828e57a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now) * Updates the per cpu time idle statistics counters */ static void -update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - if (nr_iowait_cpu() > 0) + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { ktime_t now; now = ktime_get(); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } @@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->iowait_sleeptime); } @@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates -- cgit v1.2.3 From ff49d74ad383f54041378144ca1a229ee9aeaa59 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Sat, 3 Jul 2010 13:07:35 +1000 Subject: module: initialize module dynamic debug later We should initialize the module dynamic debug datastructures only after determining that the module is not loaded yet. This fixes a bug that introduced in 2.6.35-rc2, where when a trying to load a module twice, we also load it's dynamic printing data twice which causes all sorts of nasty issues. Also handle the dynamic debug cleanup later on failure. Signed-off-by: Yehuda Sadeh Signed-off-by: Rusty Russell (removed a #ifdef) Signed-off-by: Linus Torvalds --- include/linux/dynamic_debug.h | 4 ++-- kernel/module.c | 23 +++++++++++++++-------- lib/dynamic_debug.c | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index b3cd4de9432..52c0da4bdd1 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -40,7 +40,7 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); #if defined(CONFIG_DYNAMIC_DEBUG) -extern int ddebug_remove_module(char *mod_name); +extern int ddebug_remove_module(const char *mod_name); #define __dynamic_dbg_enabled(dd) ({ \ int __ret = 0; \ @@ -73,7 +73,7 @@ extern int ddebug_remove_module(char *mod_name); #else -static inline int ddebug_remove_module(char *mod) +static inline int ddebug_remove_module(const char *mod) { return 0; } diff --git a/kernel/module.c b/kernel/module.c index 8c6b42840dd..5d2d28197c8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2062,6 +2062,12 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -2124,6 +2130,8 @@ static noinline struct module *load_module(void __user *umod, void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; void __percpu *percpu; + struct _ddebug *debug = NULL; + unsigned int num_debug = 0; mm_segment_t old_fs; @@ -2476,15 +2484,9 @@ static noinline struct module *load_module(void __user *umod, kfree(strmap); strmap = NULL; - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - + if (!mod->taints) debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2526,10 +2528,13 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } + if (debug) + dynamic_debug_setup(debug, num_debug); + /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) - goto unlock; + goto ddebug; list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2557,6 +2562,8 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + ddebug: + dynamic_debug_remove(debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 3df8eb17a60..02afc253372 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -692,7 +692,7 @@ static void ddebug_table_free(struct ddebug_table *dt) * Called in response to a module being unloaded. Removes * any ddebug_table's which point at the module. */ -int ddebug_remove_module(char *mod_name) +int ddebug_remove_module(const char *mod_name) { struct ddebug_table *dt, *nextdt; int ret = -ENOENT; -- cgit v1.2.3 From 9078370c0d2cfe4a905aa34f398bbb0d65921a2b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 19 Jul 2010 11:54:15 +0100 Subject: kmemleak: Add support for NO_BOOTMEM configurations With commits 08677214 and 59be5a8e, alloc_bootmem()/free_bootmem() and friends use the early_res functions for memory management when NO_BOOTMEM is enabled. This patch adds the kmemleak calls in the corresponding code paths for bootmem allocations. Signed-off-by: Catalin Marinas Acked-by: Pekka Enberg Acked-by: Yinghai Lu Cc: H. Peter Anvin Cc: stable@kernel.org --- kernel/early_res.c | 6 ++++++ mm/page_alloc.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/early_res.c b/kernel/early_res.c index 31aa9332ef3..7bfae887f21 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include /* * Early reserved memory areas. @@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + if (start == end) return; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 431214b941a..68319dd20be 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3659,6 +3659,11 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, ptr = phys_to_virt(addr); memset(ptr, 0, size); reserve_early_without_check(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); return ptr; } -- cgit v1.2.3 From 1396a21ba0d4ec381db19bc9cd5b6f25a89cf633 Mon Sep 17 00:00:00 2001 From: Martin Hicks Date: Wed, 21 Jul 2010 19:27:05 -0500 Subject: kdb: break out of kdb_ll() when command is terminated Without this patch the "ll" linked-list traversal command won't terminate when you hit q/Q. Signed-off-by: Martin Hicks Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 184cd8209c3..a7fe2e98d61 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2291,6 +2291,9 @@ static int kdb_ll(int argc, const char **argv) while (va) { char buf[80]; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); diag = kdb_parse(buf); if (diag) -- cgit v1.2.3 From fb82c0ff27b2c40c6f7a3d1a94cafb154591fa80 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Jul 2010 19:27:05 -0500 Subject: repair gdbstub to match the gdbserial protocol specification The gdbserial protocol handler should return an empty packet instead of an error string when ever it responds to a command it does not implement. The problem cases come from a debugger client sending qTBuffer, qTStatus, qSearch, qSupported. The incorrect response from the gdbstub leads the debugger clients to not function correctly. Recent versions of gdb will not detach correctly as a result of this behavior. Signed-off-by: Jason Wessel Signed-off-by: Dongdong Deng --- kernel/debug/gdbstub.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b17b326952..e8fd6868682 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -621,10 +621,8 @@ static void gdb_cmd_query(struct kgdb_state *ks) switch (remcom_in_buffer[1]) { case 's': case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) break; - } i = 0; remcom_out_buffer[0] = 'm'; @@ -665,10 +663,9 @@ static void gdb_cmd_query(struct kgdb_state *ks) pack_threadid(remcom_out_buffer + 2, thref); break; case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) break; - } + ks->threadid = 0; ptr = remcom_in_buffer + 17; kgdb_hex2long(&ptr, &ks->threadid); -- cgit v1.2.3 From 9e8b624fcaebf9c237b5be9116f4424bf168e6d1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Jul 2010 19:27:06 -0500 Subject: Fix merge regression from external kdb to upstream kdb In the process of merging kdb to the mainline, the kdb lsmod command stopped printing the base load address of kernel modules. This is needed for using kdb in conjunction with external tools such as gdb. Simply restore the functionality by adding a kdb_printf for the base load address of the kernel modules. Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index a7fe2e98d61..7e9bfd54a0d 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1883,6 +1883,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); #ifdef CONFIG_MODULE_UNLOAD { -- cgit v1.2.3 From b0679c63db655fa12007558e267bc0eb1d486fdb Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Jul 2010 19:27:07 -0500 Subject: debug_core,kdb: fix kgdb_connected bit set in the wrong place Immediately following an exit from the kdb shell the kgdb_connected variable should be set to zero, unless there are breakpoints planted. If the kgdb_connected variable is not zeroed out with kdb, it is impossible to turn off kdb. This patch is merely a work around for now, the real fix will check for the breakpoints. Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cb7cd1de10..8bc5eeffec8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -605,13 +605,13 @@ cpu_master_loop: if (dbg_kdb_mode) { kgdb_connected = 1; error = kdb_stub(ks); + kgdb_connected = 0; } else { error = gdb_serial_stub(ks); } if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; - kgdb_connected = 0; } else if (error == DBG_SWITCH_CPU_EVENT) { dbg_cpu_switch(cpu, dbg_switch_cpu); goto cpu_loop; -- cgit v1.2.3 From edd63cb6b91024332d6983fc51058ac1ef0c081e Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Jul 2010 19:27:07 -0500 Subject: sysrq,kdb: Use __handle_sysrq() for kdb's sysrq function The kdb code should not toggle the sysrq state in case an end user wants to try and resume the normal kernel execution. Signed-off-by: Jason Wessel Acked-by: Dmitry Torokhov --- drivers/char/sysrq.c | 2 +- include/linux/sysrq.h | 1 + kernel/debug/kdb/kdb_main.c | 3 +-- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 5d64e3acb00..878ac0c2cc6 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -493,7 +493,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -static void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) { struct sysrq_key_op *op_p; int orig_log_level; diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 4496322e28d..609e8ca5f53 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -45,6 +45,7 @@ struct sysrq_key_op { */ void handle_sysrq(int key, struct tty_struct *tty); +void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); struct sysrq_key_op *__sysrq_get_key_op(int key); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 7e9bfd54a0d..ebe4a287419 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1820,9 +1820,8 @@ static int kdb_sr(int argc, const char **argv) { if (argc != 1) return KDB_ARGCOUNT; - sysrq_toggle_support(1); kdb_trap_printk++; - handle_sysrq(*argv[1], NULL); + __handle_sysrq(*argv[1], NULL, 0); kdb_trap_printk--; return 0; -- cgit v1.2.3 From b82bab4bbe9efa7bc7177fc20620fff19bd95484 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 27 Jul 2010 13:18:01 -0700 Subject: dynamic debug: move ddebug_remove_module() down into free_module() The command echo "file ec.c +p" >/sys/kernel/debug/dynamic_debug/control causes an oops. Move the call to ddebug_remove_module() down into free_module(). In this way it should be called from all error paths. Currently, we are missing the remove if the module init routine fails. Signed-off-by: Jason Baron Reported-by: Thomas Renninger Tested-by: Thomas Renninger Cc: [2.6.32+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 5d2d28197c8..6c562828c85 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -787,7 +787,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); free_module(mod); return 0; @@ -1550,6 +1549,9 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); + /* Arch-specific cleanup. */ module_arch_cleanup(mod); -- cgit v1.2.3 From de09a9771a5346029f4d11e4ac886be7f9bfdd75 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 29 Jul 2010 12:45:49 +0100 Subject: CRED: Fix get_task_cred() and task_state() to not resurrect dead credentials It's possible for get_task_cred() as it currently stands to 'corrupt' a set of credentials by incrementing their usage count after their replacement by the task being accessed. What happens is that get_task_cred() can race with commit_creds(): TASK_1 TASK_2 RCU_CLEANER -->get_task_cred(TASK_2) rcu_read_lock() __cred = __task_cred(TASK_2) -->commit_creds() old_cred = TASK_2->real_cred TASK_2->real_cred = ... put_cred(old_cred) call_rcu(old_cred) [__cred->usage == 0] get_cred(__cred) [__cred->usage == 1] rcu_read_unlock() -->put_cred_rcu() [__cred->usage == 1] panic() However, since a tasks credentials are generally not changed very often, we can reasonably make use of a loop involving reading the creds pointer and using atomic_inc_not_zero() to attempt to increment it if it hasn't already hit zero. If successful, we can safely return the credentials in the knowledge that, even if the task we're accessing has released them, they haven't gone to the RCU cleanup code. We then change task_state() in procfs to use get_task_cred() rather than calling get_cred() on the result of __task_cred(), as that suffers from the same problem. Without this change, a BUG_ON in __put_cred() or in put_cred_rcu() can be tripped when it is noticed that the usage count is not zero as it ought to be, for example: kernel BUG at kernel/cred.c:168! invalid opcode: 0000 [#1] SMP last sysfs file: /sys/kernel/mm/ksm/run CPU 0 Pid: 2436, comm: master Not tainted 2.6.33.3-85.fc13.x86_64 #1 0HR330/OptiPlex 745 RIP: 0010:[] [] __put_cred+0xc/0x45 RSP: 0018:ffff88019e7e9eb8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff880161514480 RCX: 00000000ffffffff RDX: 00000000ffffffff RSI: ffff880140c690c0 RDI: ffff880140c690c0 RBP: ffff88019e7e9eb8 R08: 00000000000000d0 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000040 R12: ffff880140c690c0 R13: ffff88019e77aea0 R14: 00007fff336b0a5c R15: 0000000000000001 FS: 00007f12f50d97c0(0000) GS:ffff880007400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f8f461bc000 CR3: 00000001b26ce000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process master (pid: 2436, threadinfo ffff88019e7e8000, task ffff88019e77aea0) Stack: ffff88019e7e9ec8 ffffffff810698cd ffff88019e7e9ef8 ffffffff81069b45 <0> ffff880161514180 ffff880161514480 ffff880161514180 0000000000000000 <0> ffff88019e7e9f28 ffffffff8106aace 0000000000000001 0000000000000246 Call Trace: [] put_cred+0x13/0x15 [] commit_creds+0x16b/0x175 [] set_current_groups+0x47/0x4e [] sys_setgroups+0xf6/0x105 [] system_call_fastpath+0x16/0x1b Code: 48 8d 71 ff e8 7e 4e 15 00 85 c0 78 0b 8b 75 ec 48 89 df e8 ef 4a 15 00 48 83 c4 18 5b c9 c3 55 8b 07 8b 07 48 89 e5 85 c0 74 04 <0f> 0b eb fe 65 48 8b 04 25 00 cc 00 00 48 3b b8 58 04 00 00 75 RIP [] __put_cred+0xc/0x45 RSP ---[ end trace df391256a100ebdd ]--- Signed-off-by: David Howells Acked-by: Jiri Olsa Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- include/linux/cred.h | 21 +-------------------- kernel/cred.c | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index 9b58d38bc91..fff6572676a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -176,7 +176,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, if (tracer) tpid = task_pid_nr_ns(tracer, ns); } - cred = get_cred((struct cred *) __task_cred(p)); + cred = get_task_cred(p); seq_printf(m, "State:\t%s\n" "Tgid:\t%d\n" diff --git a/include/linux/cred.h b/include/linux/cred.h index 75c0fa88130..ce40cbc791e 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -153,6 +153,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); +extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); @@ -281,26 +282,6 @@ static inline void put_cred(const struct cred *_cred) #define __task_cred(task) \ ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_tasklist_lock_is_held()))) -/** - * get_task_cred - Get another task's objective credentials - * @task: The task to query - * - * Get the objective credentials of a task, pinning them so that they can't go - * away. Accessing a task's credentials directly is not permitted. - * - * The caller must make sure task doesn't go away, either by holding a ref on - * task or by holding tasklist_lock to prevent it from being unlinked. - */ -#define get_task_cred(task) \ -({ \ - struct cred *__cred; \ - rcu_read_lock(); \ - __cred = (struct cred *) __task_cred((task)); \ - get_cred(__cred); \ - rcu_read_unlock(); \ - __cred; \ -}) - /** * get_current_cred - Get the current task's subjective credentials * diff --git a/kernel/cred.c b/kernel/cred.c index a2d5504fbcc..60bc8b1e32e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. -- cgit v1.2.3