From 68cd235dda15c5eb223c86803e9068d2cfbf64b5 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Mon, 25 Jul 2011 14:54:03 -0700 Subject: PM: Fix printing IRQ names for pending wakeup IRQs The IRQ name has moved to the struct irqaction list (so print first action's name). Change-Id: I65a627457f9abaf7c1dcc32d8814243ba2ff4717 Signed-off-by: Todd Poynor --- kernel/irq/pm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 85c13082b0c..f323a4cd58e 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -72,7 +72,9 @@ int check_wakeup_irqs(void) if (irqd_is_wakeup_set(&desc->irq_data)) { if (desc->istate & IRQS_PENDING) { pr_info("Wakeup IRQ %d %s pending, suspend aborted\n", - irq, desc->name ? desc->name : ""); + irq, + desc->action && desc->action->name ? + desc->action->name : ""); return -EBUSY; } continue; -- cgit v1.2.3 From 462fee3af72df0de7b60b96c525ffe8baf4db0f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 28 Jul 2011 20:47:10 +0200 Subject: perf: Fix software event overflow The below patch is for -stable only, upstream has a much larger patch that contains the below hunk in commit a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Vince found that under certain circumstances software event overflows go wrong and deadlock. Avoid trying to delete a timer from the timer callback. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9efe7108cca..32a61513f89 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5016,11 +5016,8 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; event->pending_kill = POLL_HUP; - if (nmi) { - event->pending_disable = 1; - irq_work_queue(&event->pending); - } else - perf_event_disable(event); + event->pending_disable = 1; + irq_work_queue(&event->pending); } if (event->overflow_handler) -- cgit v1.2.3 From f35869d69b65ac9323a0d7ee13c062e1636d4a1b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 11:36:06 -0400 Subject: tracing: Fix bug when reading system filters on module removal commit e9dbfae53eeb9fc3d4bb7da3df87fa9875f5da02 upstream. The event system is freed when its nr_events is set to zero. This happens when a module created an event system and then later the module is removed. Modules may share systems, so the system is allocated when it is created and freed when the modules are unloaded and all the events under the system are removed (nr_events set to zero). The problem arises when a task opened the "filter" file for the system. If the module is unloaded and it removed the last event for that system, the system structure is freed. If the task that opened the filter file accesses the "filter" file after the system has been freed, the system will access an invalid pointer. By adding a ref_count, and using it to keep track of what is using the event system, we can free it after all users are finished with the event system. Reported-by: Johannes Berg Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 86 +++++++++++++++++++++++++++++++++----- kernel/trace/trace_events_filter.c | 6 +++ 3 files changed, 82 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 229f8591f61..f8074072d11 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -677,6 +677,7 @@ struct event_subsystem { struct dentry *entry; struct event_filter *filter; int nr_events; + int ref_count; }; #define FILTER_PRED_INVALID ((unsigned short)-1) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 686ec399f2a..ffc5b2884af 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -244,6 +244,35 @@ static void ftrace_clear_events(void) mutex_unlock(&event_mutex); } +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system->ref_count == 0); + if (--system->ref_count) + return; + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system->ref_count == 0); + system->ref_count++; +} + +static void put_system(struct event_subsystem *system) +{ + mutex_lock(&event_mutex); + __put_system(system); + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -826,6 +855,47 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + int ret; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + list_for_each_entry(system, &event_subsystems, list) { + if (system == inode->i_private) { + /* Don't open systems with no events */ + if (!system->nr_events) { + system = NULL; + break; + } + __get_system(system); + break; + } + } + mutex_unlock(&event_mutex); + + if (system != inode->i_private) + return -ENODEV; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + put_system(system); + + return ret; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct event_subsystem *system = inode->i_private; + + put_system(system); + + return 0; +} + static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -963,10 +1033,11 @@ static const struct file_operations ftrace_event_filter_fops = { }; static const struct file_operations ftrace_subsystem_filter_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = subsystem_filter_read, .write = subsystem_filter_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_system_enable_fops = { @@ -1002,8 +1073,6 @@ static struct dentry *event_trace_events_dir(void) return d_events; } -static LIST_HEAD(event_subsystems); - static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { @@ -1013,6 +1082,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { + __get_system(system); system->nr_events++; return system->entry; } @@ -1035,6 +1105,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } system->nr_events = 1; + system->ref_count = 1; system->name = kstrdup(name, GFP_KERNEL); if (!system->name) { debugfs_remove(system->entry); @@ -1184,16 +1255,9 @@ static void remove_subsystem_dir(const char *name) list_for_each_entry(system, &event_subsystems, list) { if (strcmp(system->name, name) == 0) { if (!--system->nr_events) { - struct event_filter *filter = system->filter; - debugfs_remove_recursive(system->entry); list_del(&system->list); - if (filter) { - kfree(filter->filter_string); - kfree(filter); - } - kfree(system->name); - kfree(system); + __put_system(system); } break; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8008ddcfbf2..256764ecccd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1886,6 +1886,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system, mutex_lock(&event_mutex); + /* Make sure the system still has events */ + if (!system->nr_events) { + err = -ENODEV; + goto out_unlock; + } + if (!strcmp(strstrip(filter_string), "0")) { filter_free_subsystem_preds(system); remove_filter_string(system->filter); -- cgit v1.2.3 From ff7b3dc6a634832a912445db8bffd18b05c15043 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Jul 2011 14:32:51 -0400 Subject: tracing: Have "enable" file use refcounts like the "filter" file commit 40ee4dffff061399eb9358e0c8fcfbaf8de4c8fe upstream. The "enable" file for the event system can be removed when a module is unloaded and the event system only has events from that module. As the event system nr_events count goes to zero, it may be freed if its ref_count is also set to zero. Like the "filter" file, the "enable" file may be opened by a task and referenced later, after a module has been unloaded and the events for that event system have been removed. Although the "filter" file referenced the event system structure, the "enable" file only references a pointer to the event system name. Since the name is freed when the event system is removed, it is possible that an access to the "enable" file may reference a freed pointer. Update the "enable" file to use the subsystem_open() routine that the "filter" file uses, to keep a reference to the event system structure while the "enable" file is opened. Reported-by: Johannes Berg Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ffc5b2884af..3e2a7c91c54 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -557,7 +557,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; int set = 0; @@ -568,7 +568,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, if (!call->name || !call->class || !call->class->reg) continue; - if (system && strcmp(call->class->system, system) != 0) + if (system && strcmp(call->class->system, system->name) != 0) continue; /* @@ -598,7 +598,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - const char *system = filp->private_data; + struct event_subsystem *system = filp->private_data; + const char *name = NULL; unsigned long val; char buf[64]; ssize_t ret; @@ -622,7 +623,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - ret = __ftrace_set_clr_event(NULL, system, NULL, val); + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. + */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(NULL, name, NULL, val); if (ret) goto out; @@ -862,6 +870,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) struct event_subsystem *system = NULL; int ret; + if (!inode->i_private) + goto skip_search; + /* Make sure the system still exists */ mutex_lock(&event_mutex); list_for_each_entry(system, &event_subsystems, list) { @@ -880,8 +891,9 @@ static int subsystem_open(struct inode *inode, struct file *filp) if (system != inode->i_private) return -ENODEV; + skip_search: ret = tracing_open_generic(inode, filp); - if (ret < 0) + if (ret < 0 && system) put_system(system); return ret; @@ -891,7 +903,8 @@ static int subsystem_release(struct inode *inode, struct file *file) { struct event_subsystem *system = inode->i_private; - put_system(system); + if (system) + put_system(system); return 0; } @@ -1041,10 +1054,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { }; static const struct file_operations ftrace_system_enable_fops = { - .open = tracing_open_generic, + .open = subsystem_open, .read = system_enable_read, .write = system_enable_write, .llseek = default_llseek, + .release = subsystem_release, }; static const struct file_operations ftrace_show_header_fops = { @@ -1133,8 +1147,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, - (void *)system->name, + trace_create_file("enable", 0644, system->entry, system, &ftrace_system_enable_fops); return system->entry; -- cgit v1.2.3 From b045b9a265fb46d8197b7d78aff1a8f6ab8e23df Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 25 Jul 2011 17:12:32 -0700 Subject: mm/futex: fix futex writes on archs with SW tracking of dirty & young commit 2efaca927f5cd7ecd0f1554b8f9b6a9a2c329c03 upstream. I haven't reproduced it myself but the fail scenario is that on such machines (notably ARM and some embedded powerpc), if you manage to hit that futex path on a writable page whose dirty bit has gone from the PTE, you'll livelock inside the kernel from what I can tell. It will go in a loop of trying the atomic access, failing, trying gup to "fix it up", getting succcess from gup, go back to the atomic access, failing again because dirty wasn't fixed etc... So I think you essentially hang in the kernel. The scenario is probably rare'ish because affected architecture are embedded and tend to not swap much (if at all) so we probably rarely hit the case where dirty is missing or young is missing, but I think Shan has a piece of SW that can reliably reproduce it using a shared writable mapping & fork or something like that. On archs who use SW tracking of dirty & young, a page without dirty is effectively mapped read-only and a page without young unaccessible in the PTE. Additionally, some architectures might lazily flush the TLB when relaxing write protection (by doing only a local flush), and expect a fault to invalidate the stale entry if it's still present on another processor. The futex code assumes that if the "in_atomic()" access -EFAULT's, it can "fix it up" by causing get_user_pages() which would then be equivalent to taking the fault. However that isn't the case. get_user_pages() will not call handle_mm_fault() in the case where the PTE seems to have the right permissions, regardless of the dirty and young state. It will eventually update those bits ... in the struct page, but not in the PTE. Additionally, it will not handle the lazy TLB flushing that can be required by some architectures in the fault case. Basically, gup is the wrong interface for the job. The patch provides a more appropriate one which boils down to just calling handle_mm_fault() since what we are trying to do is simulate a real page fault. The futex code currently attempts to write to user memory within a pagefault disabled section, and if that fails, tries to fix it up using get_user_pages(). This doesn't work on archs where the dirty and young bits are maintained by software, since they will gate access permission in the TLB, and will not be updated by gup(). In addition, there's an expectation on some archs that a spurious write fault triggers a local TLB flush, and that is missing from the picture as well. I decided that adding those "features" to gup() would be too much for this already too complex function, and instead added a new simpler fixup_user_fault() which is essentially a wrapper around handle_mm_fault() which the futex code can call. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix some nits Darren saw, fiddle comment layout] Signed-off-by: Benjamin Herrenschmidt Reported-by: Shan Hai Tested-by: Shan Hai Cc: David Laight Acked-by: Peter Zijlstra Cc: Darren Hart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/mm.h | 2 ++ kernel/futex.c | 4 ++-- mm/memory.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 61 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9670f71d7be..1036614df0c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -985,6 +985,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); +extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); diff --git a/kernel/futex.c b/kernel/futex.c index fe28dc282ea..7a0a4ed7849 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -355,8 +355,8 @@ static int fault_in_user_writeable(u32 __user *uaddr) int ret; down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, (unsigned long)uaddr, - 1, 1, 0, NULL, NULL); + ret = fixup_user_fault(current, mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; diff --git a/mm/memory.c b/mm/memory.c index 9b8a01d941c..d961e1914d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1816,7 +1816,63 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. -- cgit v1.2.3 From ed27e538aa97278e26a6c00f14f6e2e076a1a2ae Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Mon, 8 Aug 2011 17:26:49 -0700 Subject: PM: wakelocks: Don't report wake up wakelock if suspend aborted If the wakelock driver aborts suspend due to an already-held wakelock, don't report the next wakelock held as the "wake up wakelock". Change-Id: I582ffbb87a3c361739a77d839a0c62921cff11a6 Signed-off-by: Todd Poynor --- kernel/power/wakelock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index c10d0ee7907..d45df2b151b 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -311,7 +311,7 @@ static int power_suspend_late(struct device *dev) { int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0; #ifdef CONFIG_WAKELOCK_STAT - wait_for_wakeup = 1; + wait_for_wakeup = !ret; #endif if (debug_mask & DEBUG_SUSPEND) pr_info("power_suspend_late return %d\n", ret); -- cgit v1.2.3 From ca64b0cd3a12d7704f4e98f4f5d51f41eb5047a2 Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Mon, 8 Aug 2011 16:06:54 -0700 Subject: PM: wakelocks: Display wakelocks preventing suspend by default Use DEBUG_WAKEUP flag to show wakelocks that abort suspend, in addition to showing wakelocks held during system resume. DEBUG_WAKEUP is enabled by default. Change-Id: If6fa68e8afbc482a5300ffab2964694b02b34f41 Signed-off-by: Todd Poynor --- kernel/power/wakelock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c index d45df2b151b..2ee459fe445 100644 --- a/kernel/power/wakelock.c +++ b/kernel/power/wakelock.c @@ -249,7 +249,7 @@ long has_wake_lock(int type) unsigned long irqflags; spin_lock_irqsave(&list_lock, irqflags); ret = has_wake_lock_locked(type); - if (ret && (debug_mask & DEBUG_SUSPEND) && type == WAKE_LOCK_SUSPEND) + if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND) print_active_locks(type); spin_unlock_irqrestore(&list_lock, irqflags); return ret; -- cgit v1.2.3