From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 171 +++++++++++++++++++++++++++++++++++++++++++ include/linux/sched.h | 9 +++ include/linux/syscalls.h | 6 ++ 3 files changed, 186 insertions(+) create mode 100644 include/linux/perf_counter.h (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 00000000000..22c4469abf4 --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,171 @@ +/* + * Performance counters: + * + * Copyright(C) 2008, Thomas Gleixner + * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include + +#include +#include +#include +#include +#include + +struct task_struct; + +/* + * Generalized hardware event types, used by the hw_event_type parameter + * of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + PERF_COUNT_CYCLES, + PERF_COUNT_INSTRUCTIONS, + PERF_COUNT_CACHE_REFERENCES, + PERF_COUNT_CACHE_MISSES, + PERF_COUNT_BRANCH_INSTRUCTIONS, + PERF_COUNT_BRANCH_MISSES, + /* + * If this bit is set in the type, then trigger NMI sampling: + */ + PERF_COUNT_NMI = (1 << 30), +}; + +/* + * IRQ-notification data record type: + */ +enum perf_record_type { + PERF_RECORD_SIMPLE, + PERF_RECORD_IRQ, + PERF_RECORD_GROUP, +}; + +/** + * struct hw_perf_counter - performance counter hardware details + */ +struct hw_perf_counter { + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + s32 next_count; + u64 irq_period; +}; + +/* + * Hardcoded buffer length limit for now, for IRQ-fed events: + */ +#define PERF_DATA_BUFLEN 2048 + +/** + * struct perf_data - performance counter IRQ data sampling ... 
+ */ +struct perf_data { + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { + struct list_head list; + int active; +#if BITS_PER_LONG == 64 + atomic64_t count; +#else + atomic_t count32[2]; +#endif + u64 __irq_period; + + struct hw_perf_counter hw; + + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * Protect attach/detach: + */ + struct mutex mutex; + + int oncpu; + int cpu; + + s32 hw_event_type; + enum perf_record_type record_type; + + /* read() / irq related data */ + wait_queue_head_t waitq; + /* optional: for NMIs */ + int wakeup_pending; + struct perf_data *irqdata; + struct perf_data *usrdata; + struct perf_data data[2]; +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { +#ifdef CONFIG_PERF_COUNTERS + /* + * Protect the list of counters: + */ + spinlock_t lock; + struct list_head counters; + int nr_counters; + int nr_active; + struct task_struct *task; +#endif +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { + struct perf_counter_context ctx; + struct perf_counter_context *task_ctx; + int active_oncpu; + int max_pertask; +}; + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +#ifdef CONFIG_PERF_COUNTERS +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_print_debug(void); +#else +static inline void +perf_counter_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_tick(struct task_struct *task, int cpu) { } +static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_print_debug(void) { } +#endif + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d11447..4c530278391 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif + struct perf_counter_context perf_counter_ctx; #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; @@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info); + + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 04fb47bfb92..6cce728a626 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,4 +624,10 @@ asmlinkage long 
sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); #endif -- cgit v1.2.3 From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++- drivers/acpi/processor_idle.c | 8 ++++++++ include/linux/perf_counter.h | 4 ++++ 3 files changed, 25 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f04d9..0a7f3bea2dc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5f8d746a9b8..cca804e6f1d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,8 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr, */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 22c4469abf4..5031b5614f2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void hw_perf_restore_ctrl(u64 ctrl); +extern u64 hw_perf_disable_all(void); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void hw_perf_restore_ctrl(u64 ctrl) { } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ -- cgit v1.2.3 From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:26:59 +0100 Subject: perf counters: clean up 'raw' type API Impact: cleanup Introduce a separate hw_event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ include/linux/syscalls.h | 8 +++----- kernel/perf_counter.c | 15 ++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5031b5614f2..daedd7d87c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -38,6 +38,7 @@ enum hw_event_types { * If this bit is set in the type, then trigger NMI sampling: */ PERF_COUNT_NMI = (1 << 30), + PERF_COUNT_RAW = (1 << 31), }; /* @@ -49,6 +50,12 @@ enum perf_record_type { PERF_RECORD_GROUP, }; +struct perf_counter_event { + u32 hw_event_type; + u32 hw_event_period; + u64 hw_raw_ctrl; +}; + /** * struct hw_perf_counter - performance counter hardware details */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6cce728a626..3ecd73d03da 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct perf_counter_event; #include #include @@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu); +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 20508f05365..96c333a5b0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) * @pid: target pid */ asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu) +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd) { struct perf_counter_context *ctx; + struct perf_counter_event 
event; struct perf_counter *counter; int ret; + if (copy_from_user(&event, uevent, sizeof(event)) != 0) + return -EFAULT; + ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(hw_event_period, cpu, record_type); + counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, hw_event_type); + ret = hw_perf_counter_init(counter, event.hw_event_type); if (ret) goto err_free_put_context; -- cgit v1.2.3 From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- include/linux/perf_counter.h | 4 +--- kernel/perf_counter.c | 10 +++++----- 3 files changed, 17 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3bea2dc..30e7ebf7827 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + counter->event.hw_event_type); 
perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index daedd7d87c2..1f0017673e7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -96,8 +96,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - u64 __irq_period; - + struct perf_counter_event event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -111,7 +110,6 @@ struct perf_counter { int oncpu; int cpu; - s32 hw_event_type; enum perf_record_type record_type; /* read() / irq related data */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 96c333a5b0f..2557c670a3b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex); * Architecture provided APIs - weak aliases: */ -int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +int __weak hw_perf_counter_init(struct perf_counter *counter) { return -EINVAL; } @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->record_type = record_type; - counter->__irq_period = hw_event_period; + counter->event = *event; counter->wakeup_pending = 0; return counter; @@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); + counter = perf_counter_alloc(&event, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, event.hw_event_type); + ret = hw_perf_counter_init(counter); if (ret) goto err_free_put_context; -- cgit v1.2.3 From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. 
) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 ++++++----- include/linux/perf_counter.h | 98 ++++++++++++++++++++++++-------------- include/linux/syscalls.h | 12 +++-- kernel/perf_counter.c | 38 ++++++++------- 4 files changed, 106 insertions(+), 71 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1f0017673e7..a2b4852e2d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,65 +24,93 @@ struct 
task_struct; /* - * Generalized hardware event types, used by the hw_event_type parameter - * of the sys_perf_counter_open() syscall: + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: */ enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, /* - * If this bit is set in the type, then trigger NMI sampling: + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_NMI = (1 << 30), - PERF_COUNT_RAW = (1 << 31), + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* * IRQ-notification data record type: */ -enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - PERF_RECORD_GROUP, +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -struct perf_counter_event { - u32 hw_event_type; - u32 hw_event_period; - u64 hw_raw_ctrl; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + u64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; }; +/* + * Kernel-internal data types: + */ + /** - * struct hw_perf_counter - performance counter hardware details + * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - u64 prev_count; - s32 next_count; - u64 irq_period; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + u64 irq_period; + s32 next_count; }; /* * Hardcoded buffer length limit for now, for IRQ-fed events: */ -#define PERF_DATA_BUFLEN 2048 +#define PERF_DATA_BUFLEN 2048 /** * struct perf_data - performance counter IRQ data sampling ... 
*/ struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; }; /** @@ -96,7 +124,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -110,8 +138,6 @@ struct perf_counter { int oncpu; int cpu; - enum perf_record_type record_type; - /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3ecd73d03da..a549678b7c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; -struct perf_counter_event; +struct perf_counter_hw_event; #include #include @@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd); + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2557c670a3b..0d323ceda3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: return perf_read_hw(counter, buf, count); @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) INIT_LIST_HEAD(&counter->list); init_waitqueue_head(&counter->waitq); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; - counter->cpu = cpu; - counter->record_type = record_type; - counter->event = *event; - counter->wakeup_pending = 0; + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->hw_event = *hw_event; + counter->wakeup_pending = 0; return counter; } /** - * sys_perf_task_open - open a performance counter associate it to a task - * @hw_event_type: event type for monitoring/sampling... 
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * + * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd) +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd) + { struct perf_counter_context *ctx; - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct perf_counter *counter; int ret; - if (copy_from_user(&event, uevent, sizeof(event)) != 0) + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; ctx = find_get_context(pid, cpu); @@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&event, cpu, record_type); + counter = perf_counter_alloc(&hw_event, cpu); if (!counter) goto err_put_context; -- cgit v1.2.3 From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++-- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 282 ++++++++++++++++++++++++++++--------- 3 files changed, 236 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a871a..54b4ad0cce6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a2b4852e2d7..7af7d896546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -117,7 +117,10 @@ struct perf_data { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { - struct list_head list; + struct list_head list_entry; + struct list_head sibling_list; + struct perf_counter *group_leader; + int active; #if BITS_PER_LONG == 64 atomic64_t count; @@ -158,7 +161,8 @@ struct perf_counter_context { * Protect the list of counters: */ spinlock_t lock; - struct list_head counters; + + struct list_head counter_list; int nr_counters; int nr_active; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0d323ceda3a..fa59fe8c02d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { } * Read the cached counter in counter safe against cross CPU / NMI * modifications. 64 bit version - no complications. */ -static inline u64 perf_read_counter_safe(struct perf_counter *counter) +static inline u64 perf_counter_read_safe(struct perf_counter *counter) { return (u64) atomic64_read(&counter->count); } @@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter) * Read the cached counter in counter safe against cross CPU / NMI * modifications. 32 bit version. 
*/ -static u64 perf_read_counter_safe(struct perf_counter *counter) +static u64 perf_counter_read_safe(struct perf_counter *counter) { u32 cntl, cnth; @@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter) #endif +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *group_leader = counter->group_leader; + + /* + * Depending on whether it is a standalone or sibling counter, + * add it straight to the context's counter list, or to the group + * leader's sibling list: + */ + if (counter->group_leader == counter) + list_add_tail(&counter->list_entry, &ctx->counter_list); + else + list_add_tail(&counter->list_entry, &group_leader->sibling_list); +} + +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *sibling, *tmp; + + list_del_init(&counter->list_entry); + + if (list_empty(&counter->sibling_list)) + return; + + /* + * If this was a group counter with sibling counters then + * upgrade the siblings to singleton counters by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, + &counter->sibling_list, list_entry) { + + list_del_init(&sibling->list_entry); + list_add_tail(&sibling->list_entry, &ctx->counter_list); + WARN_ON_ONCE(!sibling->group_leader); + WARN_ON_ONCE(sibling->group_leader == sibling); + sibling->group_leader = sibling; + } +} + /* * Cross CPU call to remove a performance counter * * We disable the counter on the hardware level first. After that we * remove it from the context list. */ -static void __perf_remove_from_context(void *info) +static void __perf_counter_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; @@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_del_init(&counter->list); + list_del_counter(counter, ctx); hw_perf_enable_all(); if (!ctx->task) { @@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info) * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. */ -static void perf_remove_from_context(struct perf_counter *counter) +static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; @@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter) * the removal is always sucessful. */ smp_call_function_single(counter->cpu, - __perf_remove_from_context, + __perf_counter_remove_from_context, counter, 1); return; } retry: - task_oncpu_function_call(task, __perf_remove_from_context, + task_oncpu_function_call(task, __perf_counter_remove_from_context, counter); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list)) { + if (ctx->nr_active && !list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if it the call above did not + * can remove the counter safely, if the call above did not * succeed. 
*/ - if (!list_empty(&counter->list)) { + if (!list_empty(&counter->list_entry)) { ctx->nr_counters--; - list_del_init(&counter->list); + list_del_counter(counter, ctx); counter->task = NULL; } spin_unlock_irq(&ctx->lock); @@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_add_tail(&counter->list, &ctx->counters); + list_add_counter(counter, ctx); hw_perf_enable_all(); ctx->nr_counters++; @@ -268,7 +311,7 @@ retry: * If the context is active and the counter has not been added * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list)) { + if (ctx->nr_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -278,13 +321,45 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list)) { - list_add_tail(&counter->list, &ctx->counters); + if (list_empty(&counter->list_entry)) { + list_add_counter(counter, ctx); ctx->nr_counters++; } spin_unlock_irq(&ctx->lock); } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (!counter->active) + return; + + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + + cpuctx->active_oncpu--; + ctx->nr_active--; +} + +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { - if (!ctx->nr_active) - break; - if (counter->active) { - hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; - ctx->nr_active--; - cpuctx->active_oncpu--; - } + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); } spin_unlock(&ctx->lock); cpuctx->task_ctx = NULL; } +static void +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (!counter->active) + return; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + + cpuctx->active_oncpu++; + ctx->nr_active++; +} + +static void +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter; + + counter_sched_in(group_counter, cpuctx, ctx, cpu); + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_in(counter, cpuctx, ctx, cpu); +} + /* * Called from scheduler to add the counters of the current task * with interrupts disabled. 
@@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (ctx->nr_active == cpuctx->max_pertask) break; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of counters: + */ if (counter->cpu != -1 && counter->cpu != cpu) continue; - hw_perf_counter_enable(counter); - counter->active = 1; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; + group_sched_in(counter, cpuctx, ctx, cpu); } spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; } @@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) spin_lock(&ctx->lock); /* - * Rotate the first entry last: + * Rotate the first entry last (works just fine for group counters too): */ hw_perf_disable_all(); - list_for_each_entry(counter, &ctx->counters, list) { - list_del(&counter->list); - list_add_tail(&counter->list, &ctx->counters); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_del(&counter->list_entry); + list_add_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_enable_all(); @@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->nr_counters = 0; + ctx->task = task; +} /* * Initialize the perf_counter context in task_struct */ void perf_counter_init_task(struct task_struct *task) { - struct perf_counter_context *ctx = &task->perf_counter_ctx; - - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counters); - ctx->nr_counters = 0; - ctx->task = task; + __perf_counter_init_context(&task->perf_counter_ctx, task); } /* @@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info) hw_perf_counter_read(info); } -static u64 perf_read_counter(struct perf_counter *counter) +static u64 perf_counter_read(struct perf_counter *counter) { /* * If counter is enabled and currently active on a CPU, update the @@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_read_counter_safe(counter); + return perf_counter_read_safe(counter); } /* @@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); @@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return -EINVAL; mutex_lock(&counter->mutex); - cntval = perf_read_counter(counter); + cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); return put_user(cntval, (u64 __user *) buf) ? 
-EFAULT : sizeof(cntval); @@ -707,15 +818,25 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, + int cpu, + struct perf_counter *group_leader) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; + /* + * Single counters are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = counter; + mutex_init(&counter->mutex); - INIT_LIST_HEAD(&counter->list); + INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); counter->irqdata = &counter->data[0]; @@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; + counter->group_leader = group_leader; return counter; } @@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open( int group_fd) { - struct perf_counter_context *ctx; + struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; - struct perf_counter *counter; + struct perf_counter_context *ctx; + struct file *group_file = NULL; + int fput_needed = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; + /* + * Look up the group leader: + */ + group_leader = NULL; + if (group_fd != -1) { + ret = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto out_fput; + if (group_file->f_op != &perf_fops) + goto out_fput; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy: + */ + if (group_leader->group_leader) + goto out_fput; + } + + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&hw_event, cpu); + counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; @@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open( if (ret < 0) goto err_remove_free_put_context; +out_fput: + fput_light(group_file, fput_needed); + return ret; err_remove_free_put_context: mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); err_free_put_context: @@ -783,40 +933,40 @@ err_free_put_context: err_put_context: put_context(ctx); - return ret; + goto out_fput; } -static void __cpuinit perf_init_cpu(int cpu) +static void __cpuinit perf_counter_init_cpu(int cpu) { - struct perf_cpu_context *ctx; + struct perf_cpu_context *cpuctx; - ctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_init(&ctx->ctx.lock); - INIT_LIST_HEAD(&ctx->ctx.counters); + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_counter_init_context(&cpuctx->ctx, NULL); mutex_lock(&perf_resource_mutex); - ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_exit_cpu(void *info) +static void __perf_counter_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = &cpuctx->ctx; struct perf_counter *counter, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counters, 
list) - __perf_remove_from_context(counter); + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) + __perf_counter_remove_from_context(counter); } -static void perf_exit_cpu(int cpu) +static void perf_counter_exit_cpu(int cpu) { - smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); } #else -static inline void perf_exit_cpu(int cpu) { } +static inline void perf_counter_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_init_cpu(cpu); + perf_counter_init_cpu(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_exit_cpu(cpu); + perf_counter_exit_cpu(cpu); break; default: -- cgit v1.2.3 From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++++++++++++++--------- include/linux/perf_counter.h | 15 +++++++++++++ kernel/perf_counter.c | 45 ++++++++++++++++++++------------------ 3 files changed, 66 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0cce6..718b635dece 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - 
__hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7af7d896546..27385641ecb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -113,6 +113,17 @@ struct perf_data { u8 data[PERF_DATA_BUFLEN]; }; +struct perf_counter; + +/** + * struct hw_perf_counter_ops - performance counter hw ops + */ +struct hw_perf_counter_ops { + void (*hw_perf_counter_enable) (struct perf_counter *counter); + void (*hw_perf_counter_disable) (struct perf_counter *counter); + void (*hw_perf_counter_read) (struct perf_counter *counter); +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -120,6 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; + struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -185,6 +197,9 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter); + extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 278209c547a..e6e41ca9546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ - -int __weak hw_perf_counter_init(struct perf_counter *counter) +extern __weak struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { - return -EINVAL; + return ERR_PTR(-EINVAL); } -void __weak 
hw_perf_counter_enable(struct perf_counter *counter) { } -void __weak hw_perf_counter_disable(struct perf_counter *counter) { } -void __weak hw_perf_counter_read(struct perf_counter *counter) { } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->active) { - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; ctx->nr_active--; cpuctx->active_oncpu--; @@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; ctx->nr_active++; @@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter, if (!counter->active) return; - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; counter->oncpu = -1; @@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task) */ static void __hw_perf_counter_read(void *info) { - hw_perf_counter_read(info); + struct perf_counter *counter = info; + + counter->hw_ops->hw_perf_counter_read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + struct hw_perf_counter_ops *hw_ops; + struct perf_counter *counter; + counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; @@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->hw_event = *hw_event; counter->wakeup_pending = 0; counter->group_leader = group_leader; + counter->hw_ops = NULL; + + hw_ops = hw_perf_counter_init(counter); + if (!hw_ops) { + kfree(counter); + return NULL; + } + counter->hw_ops = hw_ops; return counter; } @@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open( if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter); - if (ret) - goto err_free_put_context; - perf_install_in_context(ctx, counter, cpu); ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -927,8 +932,6 @@ err_remove_free_put_context: mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); - -err_free_put_context: kfree(counter); err_put_context: -- cgit v1.2.3 From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. 
(regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++------------- include/linux/perf_counter.h | 9 ++-- kernel/perf_counter.c | 95 ++++++++++++++++++++++++++++++++------ 3 files changed, 92 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635dece..43c8e9a38b4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 27385641ecb..9a1713a1be2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -131,7 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -197,7 +197,7 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern struct hw_perf_counter_ops * +extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); @@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern void hw_perf_restore_ctrl(u64 ctrl); extern u64 hw_perf_disable_all(void); +extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); +extern u64 
atomic64_counter_read(struct perf_counter *counter); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e6e41ca9546..506286e5ba6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak struct hw_perf_counter_ops * +extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { return ERR_PTR(-EINVAL); } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +u64 __weak hw_perf_disable_all(void) { return 0; } +void __weak hw_perf_restore_ctrl(u64 ctrl) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter) return (u64) atomic64_read(&counter->count); } +void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} + #else /* @@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter) return cntl | ((u64) cnth) << 32; } +void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} + #endif static void @@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_del_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); if (!ctx->task) { /* @@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_add_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); ctx->nr_counters++; @@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + u64 perf_flags; if (likely(!ctx->nr_counters)) return; @@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); spin_unlock(&ctx->lock); @@ -807,6 +834,42 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; +static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + + atomic64_counter_set(counter, cpu_clock(cpu)); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_clock = { + .hw_perf_counter_enable = cpu_clock_perf_counter_enable, + .hw_perf_counter_disable = cpu_clock_perf_counter_disable, + .hw_perf_counter_read = cpu_clock_perf_counter_read, +}; + +static const struct hw_perf_counter_ops * +sw_perf_counter_init(struct perf_counter *counter) +{ + const struct hw_perf_counter_ops *hw_ops = NULL; + + switch (counter->hw_event.type) { + case PERF_COUNT_CPU_CLOCK: + hw_ops = &perf_ops_cpu_clock; + break; + default: + break; + } + return hw_ops; +} + /* * Allocate and initialize a counter structure */ @@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; - hw_ops = hw_perf_counter_init(counter); + hw_ops = NULL; + if (!hw_event->raw && hw_event->type < 0) + hw_ops = sw_perf_counter_init(counter); + if (!hw_ops) { + hw_ops = hw_perf_counter_init(counter); + } + if (!hw_ops) { kfree(counter); return NULL; @@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open( goto err_put_context; } - ret = -ENOMEM; + ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; -- cgit v1.2.3 From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- drivers/acpi/processor_idle.c | 10 +++++----- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 16 ++++++++-------- 4 files changed, 22 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 
43c8e9a38b4..3e1dbebe22b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index cca804e6f1d..a3e66a33b7a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,11 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { - u64 pctrl; + u64 perf_flags; /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(perf_flags); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + pctrl = hw_perf_save_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9a1713a1be2..68f6e3ad531 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -67,7 +67,7 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - u64 type; + s64 type; u64 irq_period; u32 record_type; @@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); -extern void hw_perf_restore_ctrl(u64 ctrl); -extern u64 hw_perf_disable_all(void); +extern u64 hw_perf_save_disable(void); +extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); @@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } -static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline void hw_perf_restore(u64 ctrl) { } +static inline u64 hw_perf_save_disable(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 506286e5ba6..0e93fea1712 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter) return ERR_PTR(-EINVAL); } -u64 __weak hw_perf_disable_all(void) { return 0; } -void __weak hw_perf_restore_ctrl(u64 ctrl) { } +u64 __weak hw_perf_save_disable(void) { return 0; } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_del_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); if (!ctx->task) { /* @@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_add_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); ctx->nr_counters++; @@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); -- cgit v1.2.3 From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:03:20 +0100 Subject: perf counters: implement PERF_COUNT_TASK_CLOCK Impact: add new perf-counter type The 'task clock' counter counts the amount of time a task is executing, in nanoseconds. It stops ticking when a task is scheduled out either due to it blocking, sleeping or it being preempted. This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++-- kernel/perf_counter.c | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 68f6e3ad531..30c0ec8c1ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -50,8 +50,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, + /* + * Future software events: + */ + /* PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e93fea1712..a0fe8474ee2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .hw_perf_counter_read = cpu_clock_perf_counter_read, }; +static void task_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + atomic64_counter_set(counter, current->se.sum_exec_runtime); +} + +static const struct hw_perf_counter_ops perf_ops_task_clock = { + .hw_perf_counter_enable = task_clock_perf_counter_enable, + .hw_perf_counter_disable = task_clock_perf_counter_disable, + .hw_perf_counter_read = task_clock_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; break; + case PERF_COUNT_TASK_CLOCK: + hw_ops = &perf_ops_task_clock; + break; default: break; } -- cgit v1.2.3 From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:59:31 +0100 Subject: perf counters: add prctl interface to disable/enable counters Add a way for self-monitoring tasks to disable/enable counters summarily, via a prctl: PR_TASK_PERF_COUNTERS_DISABLE 31 PR_TASK_PERF_COUNTERS_ENABLE 32 Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++ include/linux/prctl.h | 3 ++ 
kernel/perf_counter.c | 86 ++++++++++++++++++++++++++++++++++++++++---- kernel/sys.c | 7 ++++ 4 files changed, 93 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 30c0ec8c1ee..97d86c293ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); +extern int perf_counter_task_disable(void); +extern int perf_counter_task_enable(void); #else static inline void @@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } +static inline int perf_counter_task_disable(void) { return -EINVAL; } +static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 48d887e3c6e..b00df4c79c6 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,4 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a0fe8474ee2..4e679b91d8b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { + if (counter->active == -1) + return; + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) cpuctx->task_ctx = ctx; } +int perf_counter_task_disable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + WARN_ON_ONCE(counter->active == 1); + counter->active = -1; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + local_irq_enable(); + + return 0; +} + +int perf_counter_task_enable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->active != -1) + continue; + counter->active = 0; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); + + local_irq_enable(); + + return 0; +} + void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; @@ -951,13 +1027,9 @@ 
perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd) - +asmlinkage int +sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, int cpu, int group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d1..0f66633be31 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_TASK_PERF_COUNTERS_DISABLE: + error = perf_counter_task_disable(); + break; + case PR_TASK_PERF_COUNTERS_ENABLE: + error = perf_counter_task_enable(); + break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; break; -- cgit v1.2.3 From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 29 ++++++++++++++--------------- 3 files changed, 25 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbebe22b..4854cca7fff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. 
We read it as well, when it diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 97d86c293ee..8cb095fa442 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,6 +127,15 @@ struct hw_perf_counter_ops { void (*hw_perf_counter_read) (struct perf_counter *counter); }; +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { + PERF_COUNTER_STATE_OFF = -1, + PERF_COUNTER_STATE_INACTIVE = 0, + PERF_COUNTER_STATE_ACTIVE = 1, +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -136,7 +145,7 @@ struct perf_counter { struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; - int active; + enum perf_counter_active_state state; #if BITS_PER_LONG == 64 atomic64_t count; #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4e679b91d8b..559130b8774 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; @@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info) if (cpuctx->active_oncpu < perf_max_counters) { counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; @@ -328,7 +328,6 @@ retry: spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter has not been added * we need to retry the smp call. 
*/ if (ctx->nr_active && list_empty(&counter->list_entry)) { @@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx) { - if (!counter->active) + if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; cpuctx->active_oncpu--; ctx->nr_active--; @@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->active == -1) + if (counter->state == PERF_COUNTER_STATE_OFF) return; counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; @@ -506,8 +505,8 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->active == 1); - counter->active = -1; + WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + counter->state = PERF_COUNTER_STATE_OFF; } hw_perf_restore(perf_flags); @@ -540,9 +539,9 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->active != -1) + if (counter->state != PERF_COUNTER_STATE_OFF) continue; - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; } hw_perf_restore(perf_flags); @@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter) * If counter is enabled and currently active on a CPU, update the * value in the counter structure: */ - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __hw_perf_counter_read, counter, 1); } @@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) retry: spin_lock_irq(&ctx->lock); - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { counter->irqdata = counter->usrdata; counter->usrdata = oldirqdata; spin_unlock_irq(&ctx->lock); -- cgit v1.2.3 From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. 
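As a quick illustration of this delta logic, here is a standalone sketch with made-up values (not part of the patch; it only reuses the 32-bit clipping expression that the new x86_perf_counter_update() below applies, and assumes the kernel integer types from <linux/types.h>). The signed subtract plus 32-bit clip recovers the right event count even when the low 32 bits of the raw counter value wrap between two reads:

	u64 prev_raw = 0xfffffff0;	/* raw value sampled at the previous update */
	u64 new_raw  = 0x00000010;	/* raw value read now: the low 32 bits wrapped */
	u64 delta;

	/*
	 * Not all hardware sign-extends above the physical counter width,
	 * so clip the difference to 32 bits:
	 */
	delta = (u64)(u32)((s32)new_raw - (s32)prev_raw);	/* == 0x20, i.e. 32 events elapsed */

The atomic cmpxchg on hwc->prev_count in the patch below then makes this update safe against an NMI modifying the previous value between the read and the update.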
Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++++++----------------- include/linux/perf_counter.h | 15 ++- kernel/perf_counter.c | 68 +---------- 4 files changed, 137 insertions(+), 178 deletions(-) (limited to 'include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2fdc186724..fe94490bab6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,7 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) - select HAVE_PERF_COUNTERS + select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8df72b..5afae13d8d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. 
+ * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. 
+ * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; + + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } 
local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; - - /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. 
- */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8cb095fa442..72460289c65 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -91,14 +91,16 @@ struct perf_counter_hw_event { * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS u64 config; unsigned long config_base; unsigned long counter_base; int nmi; unsigned int idx; - u64 prev_count; + atomic64_t prev_count; u64 irq_period; - s32 next_count; + atomic64_t period_left; +#endif }; /* @@ -140,17 +142,15 @@ enum perf_counter_active_state { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; -#if BITS_PER_LONG == 64 atomic64_t count; -#else - atomic_t count32[2]; -#endif + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -172,6 +172,7 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; +#endif }; /** @@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); -extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); -extern u64 atomic64_counter_read(struct perf_counter *counter); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 559130b8774..416861ce8b2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } -#if BITS_PER_LONG == 64 - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 64 bit version - no complications. - */ -static inline u64 perf_counter_read_safe(struct perf_counter *counter) -{ - return (u64) atomic64_read(&counter->count); -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} - -#else - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 32 bit version. 
- */ -static u64 perf_counter_read_safe(struct perf_counter *counter) -{ - u32 cntl, cnth; - - local_irq_disable(); - do { - cnth = atomic_read(&counter->count32[1]); - cntl = atomic_read(&counter->count32[0]); - } while (cnth != atomic_read(&counter->count32[1])); - - local_irq_enable(); - - return cntl | ((u64) cnth) << 32; -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} - -#endif - static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - counter->hw_ops->hw_perf_counter_enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; + counter->hw_ops->hw_perf_counter_enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_counter_read_safe(counter); + return atomic64_read(&counter->count); } /* @@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); - atomic64_counter_set(counter, cpu_clock(cpu)); + atomic64_set(&counter->count, cpu_clock(cpu)); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { @@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - atomic64_counter_set(counter, current->se.sum_exec_runtime); + atomic64_set(&counter->count, current->se.sum_exec_runtime); } static const struct hw_perf_counter_ops perf_ops_task_clock = { -- cgit v1.2.3 From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. 
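For illustration, a rough user-space sketch of the intended usage follows (not taken from the patch: the perf_counter_open() wrapper, the pid 0 / cpu -1 / group_fd -1 conventions, the header availability in user space and the elided error handling are assumptions of this sketch; the .inherit flag, the syscall argument order and reading the counter via read() follow this series):

	#include <unistd.h>
	#include <linux/perf_counter.h>	/* assumed visible to user space */

	int monitor_workload(void)
	{
		struct perf_counter_hw_event hw_event = {
			.type		= PERF_COUNT_INSTRUCTIONS,
			.inherit	= 1,	/* fork()/clone() children inherit this counter */
		};
		unsigned long long count;

		/* assumed thin wrapper around sys_perf_counter_open(&hw_event, pid, cpu, group_fd) */
		int fd = perf_counter_open(&hw_event, 0 /* current task */, -1 /* no cpu binding */, -1 /* no group */);

		/* ... fork() + exec() the workload here and wait for it to exit ... */

		read(fd, &count, sizeof(count));	/* now includes the counts the children fed back on exit */

		return 0;
	}
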
The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 +++-- kernel/exit.c | 7 +- kernel/perf_counter.c | 248 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 228 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 72460289c65..e5d25bf8f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -75,10 +75,11 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - __reserved_1 : 29; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + __reserved_1 : 28; u64 __reserved_2; }; @@ -138,6 +139,8 @@ enum perf_counter_active_state { PERF_COUNTER_STATE_ACTIVE = 1, }; +struct file; + /** * struct perf_counter - performance counter kernel representation: */ @@ -156,7 +159,10 @@ struct perf_counter { struct perf_counter_context *ctx; struct task_struct *task; + struct file *filp; + unsigned int nr_inherited; + struct perf_counter *parent; /* * Protect attach/detach: */ @@ -210,13 +216,16 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern void +perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_init_task(struct task_struct *child); +extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); @@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void); #else static inline void +perf_counter_show(struct perf_counter *counter, char *str, int trace) { } +static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_init_task(struct task_struct *child) { } +static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f..d336c90a5f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif -#ifdef CONFIG_FUTEX /* - * This must happen late, after the PID is not - * hashed anymore: + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: */ + 
perf_counter_exit_task(tsk); +#ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 416861ce8b2..f5e81dd193d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&sibling->list_entry); list_add_tail(&sibling->list_entry, &ctx->counter_list); - WARN_ON_ONCE(!sibling->group_leader); - WARN_ON_ONCE(sibling->group_leader == sibling); sibling->group_leader = sibling; } } @@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; u64 perf_flags; /* @@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); @@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + unsigned long flags; u64 perf_flags; /* @@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * Protect the list operation against NMI by disabling the @@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -446,10 +446,9 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + list_for_each_entry(counter, &ctx->counter_list, list_entry) counter->state = PERF_COUNTER_STATE_OFF; - } + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -525,26 +524,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counter_list); - ctx->nr_counters = 0; - ctx->task = task; -} -/* - * Initialize the perf_counter context in task_struct - */ -void perf_counter_init_task(struct task_struct *task) -{ - __perf_counter_init_context(&task->perf_counter_ctx, task); -} - /* * Cross CPU call to read the hardware counter */ @@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; - WARN_ON_ONCE(ctx->task); return ctx; } @@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, - struct perf_counter *group_leader) + struct perf_counter 
*group_leader, + gfp_t gfpflags) { const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; - counter = kzalloc(sizeof(*counter), GFP_KERNEL); + counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) return NULL; @@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) { + if (!hw_ops) hw_ops = hw_perf_counter_init(counter); - } if (!hw_ops) { kfree(counter); @@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; struct perf_counter_context *ctx; + struct file *counter_file = NULL; struct file *group_file = NULL; int fput_needed = 0; + int fput_needed2 = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) @@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader); + counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); if (!counter) goto err_put_context; - perf_install_in_context(ctx, counter, cpu); - ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); if (ret < 0) - goto err_remove_free_put_context; + goto err_free_put_context; + + counter_file = fget_light(ret, &fput_needed2); + if (!counter_file) + goto err_free_put_context; + + counter->filp = counter_file; + perf_install_in_context(ctx, counter, cpu); + + fput_light(counter_file, fput_needed2); out_fput: fput_light(group_file, fput_needed); return ret; -err_remove_free_put_context: - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&counter->mutex); +err_free_put_context: kfree(counter); err_put_context: @@ -1044,6 +1028,186 @@ err_put_context: goto out_fput; } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->task = task; +} + +/* + * inherit a counter from parent task to child task: + */ +static int +inherit_counter(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *child_counter; + + child_counter = perf_counter_alloc(&parent_counter->hw_event, + parent_counter->cpu, NULL, + GFP_ATOMIC); + if (!child_counter) + return -ENOMEM; + + /* + * Link it up in the child's context: + */ + child_counter->ctx = child_ctx; + child_counter->task = child; + list_add_counter(child_counter, child_ctx); + child_ctx->nr_counters++; + + child_counter->parent = parent_counter; + parent_counter->nr_inherited++; + /* + * inherit into child's child as well: + */ + child_counter->hw_event.inherit = 1; + + /* + * Get a reference to the parent filp - we will fput it + * when the child counter exits. 
This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_counter->filp->f_count); + + return 0; +} + +static void +__perf_counter_exit_task(struct task_struct *child, + struct perf_counter *child_counter, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *parent_counter; + u64 parent_val, child_val; + u64 perf_flags; + + /* + * Disable and unlink this counter. + * + * Be careful about zapping the list - IRQ/NMI context + * could still be processing it: + */ + local_irq_disable(); + perf_flags = hw_perf_save_disable(); + + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) + child_counter->hw_ops->hw_perf_counter_disable(child_counter); + list_del_init(&child_counter->list_entry); + + hw_perf_restore(perf_flags); + local_irq_enable(); + + parent_counter = child_counter->parent; + /* + * It can happen that parent exits first, and has counters + * that are still around due to the child reference. These + * counters need to be zapped - but otherwise linger. + */ + if (!parent_counter) + return; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + fput(parent_counter->filp); + + kfree(child_counter); +} + +/* + * When a child task exist, feed back counter values to parent counters. + * + * Note: we are running in child context, but the PID is not hashed + * anymore so new counters will not be added. + */ +void perf_counter_exit_task(struct task_struct *child) +{ + struct perf_counter *child_counter, *tmp; + struct perf_counter_context *child_ctx; + + child_ctx = &child->perf_counter_ctx; + + if (likely(!child_ctx->nr_counters)) + return; + + list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, + list_entry) + __perf_counter_exit_task(child, child_counter, child_ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *child) +{ + struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter *counter, *parent_counter; + struct task_struct *parent = current; + unsigned long flags; + + child_ctx = &child->perf_counter_ctx; + parent_ctx = &parent->perf_counter_ctx; + + __perf_counter_init_context(child_ctx, child); + + /* + * This is executed from the parent task context, so inherit + * counters that have been marked for cloning: + */ + + if (likely(!parent_ctx->nr_counters)) + return; + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. 
+ */ + spin_lock_irqsave(&parent_ctx->lock, flags); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + if (!counter->hw_event.inherit || counter->group_leader != counter) + continue; + + /* + * Instead of creating recursive hierarchies of counters, + * we link inheritd counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + parent_counter = counter; + if (counter->parent) + parent_counter = counter->parent; + + if (inherit_counter(parent_counter, parent, + parent_ctx, child, child_ctx)) + break; + } + + spin_unlock_irqrestore(&parent_ctx->lock, flags); +} + static void __cpuinit perf_counter_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; -- cgit v1.2.3 From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:28:33 +0100 Subject: perfcounters: add context switch counter Impact: add new feature, new sw counter Add a counter that counts the number of context-switches a task is doing. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e5d25bf8f74..d2a16563415 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,8 +53,8 @@ enum hw_event_types { /* * Future software events: */ - /* PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, */ + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1f81cde0dc4..09287091c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .hw_perf_counter_read = task_clock_perf_counter_read, }; +static u64 get_context_switches(void) +{ + struct task_struct *curr = current; + + return curr->nvcsw + curr->nivcsw; +} + +static void context_switches_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_context_switches(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void context_switches_perf_counter_read(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static void context_switches_perf_counter_enable(struct perf_counter *counter) +{ + /* + * ->nvcsw + curr->nivcsw is a per-task value already, + * so we dont have to clear it on switch-in. 
+ */ +} + +static void context_switches_perf_counter_disable(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_context_switches = { + .hw_perf_counter_enable = context_switches_perf_counter_enable, + .hw_perf_counter_disable = context_switches_perf_counter_disable, + .hw_perf_counter_read = context_switches_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_TASK_CLOCK: hw_ops = &perf_ops_task_clock; break; + case PERF_COUNT_CONTEXT_SWITCHES: + hw_ops = &perf_ops_context_switches; + break; default: break; } -- cgit v1.2.3 From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++--- include/linux/sched.h | 3 ++- kernel/perf_counter.c | 49 ++++++++++++++++++++++++++++++++++++++++++++ kernel/sched.c | 7 +++++-- 4 files changed, 61 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d2a16563415..f30486fc55d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,6 +42,8 @@ enum hw_event_types { PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_HW_EVENTS_MAX = 6, + /* * Special "software" counters provided by the kernel, even if * the hardware does not support performance counters. 
These @@ -50,11 +52,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - /* - * Future software events: - */ PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c530278391..2e15be8fc79 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1014,6 +1014,8 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1029,7 +1031,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09287091c52..fb11e351e44 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .hw_perf_counter_read = context_switches_perf_counter_read, }; +static inline u64 get_cpu_migrations(void) +{ + return current->se.nr_migrations; +} + +static void cpu_migrations_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_cpu_migrations(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +{ + /* + * se.nr_migrations is a per-task value already, + * so we dont have to clear it on switch-in. 
+ */ +} + +static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { + .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, + .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, + .hw_perf_counter_read = cpu_migrations_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CONTEXT_SWITCHES: hw_ops = &perf_ops_context_switches; break; + case PERF_COUNT_CPU_MIGRATIONS: + hw_ops = &perf_ops_cpu_migrations; + break; default: break; } diff --git a/kernel/sched.c b/kernel/sched.c index 5c3f4106314..382cfdb5e38 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; -- cgit v1.2.3 From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- include/linux/perf_counter.h | 4 ---- kernel/perf_counter.c | 8 -------- 3 files changed, 19 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56eaa3..89fad5d4fb3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. 
* @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f30486fc55d..d038450de87 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -218,8 +218,6 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern void -perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); @@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void); #else static inline void -perf_counter_show(struct perf_counter *counter, char *str, int trace) { } -static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5431e790b5d..aab6c123b02 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } -- cgit v1.2.3 From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. 
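As a rough sketch of the new layout (condensed from the hunks below; the fixed-function counters themselves are not scheduled yet, this patch only reserves the bookkeeping for them):

	struct cpu_hw_counters {
		struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
		unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];

		struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
		unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
	};

The old single MAX_HW_COUNTERS array is split so that generic (programmable) and fixed-function PMCs can be tracked independently.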
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 11 ++++++++ arch/x86/kernel/cpu/perf_counter.c | 53 ++++++++++++++++++------------------- include/linux/perf_counter.h | 1 + 3 files changed, 38 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 9dadce1124e..dd5a4a559e2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_PERF_COUNTER_H #define _ASM_X86_PERF_COUNTER_H +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -20,6 +27,10 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ union cpuid10_eax { struct { unsigned int version_id:8; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09a654..fc3af868823 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. 
@@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... 
num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d038450de87..984da540224 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #include #include -- cgit v1.2.3 From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++--- include/linux/perf_counter.h | 6 ++--- kernel/perf_counter.c | 50 +++++++++++++++++++------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af526640..b6755712142 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 984da540224..48f76d2e54c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,9 +128,9 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*hw_perf_counter_enable) (struct perf_counter *counter); - void (*hw_perf_counter_disable) (struct perf_counter *counter); - void (*hw_perf_counter_read) (struct perf_counter *counter); + void (*enable) (struct perf_counter *counter); + void (*disable) (struct perf_counter *counter); + void (*read) (struct perf_counter *counter); }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8a4d9a5d5d..961d651aa57 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - 
counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; @@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info) counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; @@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter, * * We stop each counter and update the counter value in counter->count. * - * This does not protect us against NMI, but hw_perf_counter_disable() + * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * not restart the counter. @@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_OFF) return; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter, * * We restore the counter value and then enable it. * - * This does not protect us against NMI, but hw_perf_counter_enable() + * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * keep the counter running. 
@@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __hw_perf_counter_read(void *info) +static void __read(void *info) { struct perf_counter *counter = info; - counter->hw_ops->hw_perf_counter_read(counter); + counter->hw_ops->read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __hw_perf_counter_read, counter, 1); + __read, counter, 1); } return atomic64_read(&counter->count); @@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { - .hw_perf_counter_enable = cpu_clock_perf_counter_enable, - .hw_perf_counter_disable = cpu_clock_perf_counter_disable, - .hw_perf_counter_read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_counter_enable, + .disable = cpu_clock_perf_counter_disable, + .read = cpu_clock_perf_counter_read, }; static void task_clock_perf_counter_update(struct perf_counter *counter) @@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_task_clock = { - .hw_perf_counter_enable = task_clock_perf_counter_enable, - .hw_perf_counter_disable = task_clock_perf_counter_disable, - .hw_perf_counter_read = task_clock_perf_counter_read, + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, }; static u64 get_page_faults(void) @@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_page_faults = { - .hw_perf_counter_enable = page_faults_perf_counter_enable, - .hw_perf_counter_disable = page_faults_perf_counter_disable, - .hw_perf_counter_read = page_faults_perf_counter_read, + .enable = page_faults_perf_counter_enable, + .disable = page_faults_perf_counter_disable, + .read = page_faults_perf_counter_read, }; static u64 get_context_switches(void) @@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_context_switches = { - .hw_perf_counter_enable = context_switches_perf_counter_enable, - .hw_perf_counter_disable = context_switches_perf_counter_disable, - .hw_perf_counter_read = context_switches_perf_counter_read, + .enable = context_switches_perf_counter_enable, + .disable = context_switches_perf_counter_disable, + .read = context_switches_perf_counter_read, }; static inline u64 get_cpu_migrations(void) @@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { - .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, - .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, - .hw_perf_counter_read = cpu_migrations_perf_counter_read, + .enable = cpu_migrations_perf_counter_enable, + .disable = cpu_migrations_perf_counter_disable, + .read = cpu_migrations_perf_counter_read, }; static const struct hw_perf_counter_ops * @@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->hw_perf_counter_disable(child_counter); + 
child_counter->hw_ops->disable(child_counter); child_counter->state = PERF_COUNTER_STATE_INACTIVE; child_counter->oncpu = -1; -- cgit v1.2.3 From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 8 ++++++ kernel/exit.c | 17 +++++++---- kernel/perf_counter.c | 70 ++++++++++++++++++++++++++++++++++----------- kernel/sched.c | 49 +++++++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..1b2e3242497 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */ +extern void curr_rq_lock_irq_save(unsigned long *flags); +extern void curr_rq_unlock_irq_restore(unsigned long *flags); +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); + extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f1..244edfd9686 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). 
*/ tsk->state = TASK_DEAD; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 961d651aa57..f1110ac1267 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->disable(counter); @@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } @@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the @@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } /* @@ -438,15 +443,19 @@ int perf_counter_task_disable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -463,7 +472,7 @@ int perf_counter_task_disable(void) spin_unlock(&ctx->lock); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -473,15 +482,19 @@ int perf_counter_task_enable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + spin_lock(&ctx->lock); /* @@ -493,6 +506,7 @@ int perf_counter_task_enable(void) if (counter->state != PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -500,7 +514,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + unsigned long flags; + curr_rq_lock_irq_save(&flags); counter->hw_ops->read(counter); + curr_rq_unlock_irq_restore(&flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_update(struct perf_counter *counter) +/* + * Called from within the scheduler: + */ +static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) { - u64 prev, now; + struct task_struct *curr = counter->task; + u64 
delta; + + WARN_ON_ONCE(counter->task != current); + + delta = __task_delta_exec(curr, update); + + return curr->se.sum_exec_runtime + delta; +} + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = current->se.sum_exec_runtime; atomic64_set(&counter->hw.prev_count, now); @@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 1); + + task_clock_perf_counter_update(counter, now); } static void task_clock_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); + u64 now = task_clock_perf_counter_val(counter, 0); + + atomic64_set(&counter->hw.prev_count, now); } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 0); + + task_clock_perf_counter_update(counter, now); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; + unsigned long flags; u64 perf_flags; /* @@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - local_irq_disable(); + curr_rq_lock_irq_save(&flags); perf_flags = hw_perf_save_disable(); if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { @@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child, list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); parent_counter = child_counter->parent; /* diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38..4d84ff4c877 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. 
+ */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 9 +++++++++ kernel/exit.c | 21 +++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e..54fa2fa2c8e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,14 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_ctx.counter_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -180,6 +188,7 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PERF_COUNTERS(tsk) \ } diff --git a/kernel/exit.c b/kernel/exit.c index 244edfd9686..101b7eeff44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. 
- */ - perf_counter_exit_task(tsk); - write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); - preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; @@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; -- cgit v1.2.3 From 78b6084c907cea15bb40a564b974e072f5163781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 15:07:49 +0100 Subject: perfcounters: fix init context lock Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 54fa2fa2c8e..467cff545c3 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -116,7 +116,9 @@ extern struct group_info init_groups; #ifdef CONFIG_PERF_COUNTERS # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.lock = \ + __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else # define INIT_PERF_COUNTERS(tsk) #endif -- cgit v1.2.3 From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. 
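The core side of the new contract, condensed from the kernel/perf_counter.c hunk below: counter_sched_in() now propagates a failed ->enable() as -EAGAIN, so group_sched_in() can undo a partially scheduled group:

	static int
	counter_sched_in(struct perf_counter *counter,
			 struct perf_cpu_context *cpuctx,
			 struct perf_counter_context *ctx,
			 int cpu)
	{
		if (counter->state == PERF_COUNTER_STATE_OFF)
			return 0;

		/* the lowlevel op can now refuse the counter: */
		if (counter->hw_ops->enable(counter))
			return -EAGAIN;

		counter->state = PERF_COUNTER_STATE_ACTIVE;
		counter->oncpu = cpu;

		cpuctx->active_oncpu++;
		ctx->nr_active++;

		return 0;
	}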
Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 62 +++++++++++++++++++++++++++----------- 3 files changed, 51 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b6755712142..74090a393a7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48f76d2e54c..53af11d3767 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,7 +128,7 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*enable) (struct perf_counter *counter); + int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f1110ac1267..2e73929a695 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } -static void +static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { if (counter->state == PERF_COUNTER_STATE_OFF) - return; + return 0; + + if (counter->hw_ops->enable(counter)) + return -EAGAIN; - counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; ctx->nr_active++; + + return 0; } static int @@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter, struct perf_counter_context *ctx, int cpu) { - struct perf_counter *counter; - int was_group = 0; + struct perf_counter *counter, *partial_group; + int ret = 0; - counter_sched_in(group_counter, cpuctx, ctx, cpu); + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter_sched_in(counter, cpuctx, ctx, cpu); - was_group = 1; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + ret = -EAGAIN; } - return was_group; + return ret; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if 
(counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; } /* @@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (ctx->nr_active == cpuctx->max_pertask) - break; - /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -856,8 +875,9 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; -static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + return 0; } static void cpu_clock_perf_counter_disable(struct perf_counter *counter) @@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, now); } -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_counter_enable(struct perf_counter *counter) { u64 now = task_clock_perf_counter_val(counter, 0); atomic64_set(&counter->hw.prev_count, now); + + return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) @@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) page_faults_perf_counter_update(counter); } -static void page_faults_perf_counter_enable(struct perf_counter *counter) +static int page_faults_perf_counter_enable(struct perf_counter *counter) { /* * page-faults is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void page_faults_perf_counter_disable(struct perf_counter *counter) @@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) context_switches_perf_counter_update(counter); } -static void context_switches_perf_counter_enable(struct perf_counter *counter) +static int context_switches_perf_counter_enable(struct perf_counter *counter) { /* * ->nvcsw + curr->nivcsw is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void context_switches_perf_counter_disable(struct perf_counter *counter) @@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { /* * se.nr_migrations is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -- cgit v1.2.3 From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:29:25 +0100 Subject: perfcounters: remove ->nr_inherited Impact: remove dead code nr_inherited was not maintained correctly (not decremented) - and also not used - remove it. 
Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 53af11d3767..1ea08e9f31c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,7 +164,6 @@ struct perf_counter { struct task_struct *task; struct file *filp; - unsigned int nr_inherited; struct perf_counter *parent; /* * Protect attach/detach: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2e73929a695..48e1dbcdc1c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter, child_ctx->nr_counters++; child_counter->parent = parent_counter; - parent_counter->nr_inherited++; /* * inherit into child's child as well: */ -- cgit v1.2.3 From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2b391..86b2fdd344a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1ea08e9f31c..ec77d1643d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -36,14 +36,15 @@ enum hw_event_types { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CYCLES = 0, + PERF_COUNT_CPU_CYCLES = 0, PERF_COUNT_INSTRUCTIONS = 1, PERF_COUNT_CACHE_REFERENCES = 2, PERF_COUNT_CACHE_MISSES = 3, PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 6, + PERF_HW_EVENTS_MAX = 7, /* * Special "software" counters provided by the kernel, even if -- cgit v1.2.3 From e44aef58ecfbe061eb4c53b939bcc35fb1bee82d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 09:02:11 +0100 Subject: perfcounters: include asm/perf_counter.h only if CONFIG_PERF_COUNTERS=y Impact: build fix on ia64 KOSAKI Motohiro reported that -tip doesnt build on ia64 because asm/perf_counter.h only exists on x86 for now. Fix it. 
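The guard, condensed from the hunk below (only x86 provides asm/perf_counter.h at this point, so the include has to be conditional):

	#ifdef CONFIG_PERF_COUNTERS
	# include <asm/perf_counter.h>
	#endif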
Reported-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Acked-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ec77d1643d3..cc3a75a239a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,7 +14,10 @@ #define _LINUX_PERF_COUNTER_H #include -#include + +#ifdef CONFIG_PERF_COUNTERS +# include +#endif #include #include -- cgit v1.2.3 From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:43:42 +1100 Subject: perf_counter: Add optional hw_perf_group_sched_in arch function Impact: extend perf_counter infrastructure This adds an optional hw_perf_group_sched_in() arch function that enables a whole group of counters in one go. It returns 1 if it added the group successfully, 0 if it did nothing (and therefore the core needs to add the counters individually), or a negative number if an error occurred. It should add all the counters and enable any software counters in the group, or else add none of them and return an error. There are a couple of related changes/improvements in the group handling here: * As an optimization, group_sched_out() and group_sched_in() now check the state of the group leader, and do nothing if the leader is not active or disabled. * We now call hw_perf_save_disable/hw_perf_restore around the complete set of counter enable/disable calls in __perf_counter_sched_in/out, to give the arch code the opportunity to defer updating the hardware state until the hw_perf_restore call if it wants. * We no longer stop adding groups after we get to a group that has more than one counter. We will ultimately add an option for a group to be exclusive. The current code doesn't really implement exclusive groups anyway, since a group could end up going on with other counters that get added before it. 
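The arch hook's calling convention, condensed from the hunks below (a weak stub keeps the existing per-counter path as the default):

	int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
					  struct perf_cpu_context *cpuctx,
					  struct perf_counter_context *ctx, int cpu)
	{
		return 0;	/* arch did nothing: add counters individually */
	}

	/* caller side, in group_sched_in(): */
	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
	if (ret)
		return ret < 0 ? ret : 0;	/* <0: error, 1: whole group added */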
Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 31 ++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cc3a75a239a..b21d1ea4c05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -236,6 +236,9 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu); #else static inline void diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b7a027a2ef0..9ad11e44d9a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -47,6 +47,12 @@ hw_perf_counter_init(struct perf_counter *counter) u64 __weak hw_perf_save_disable(void) { return 0; } void __weak hw_perf_restore(u64 ctrl) { barrier(); } void __weak hw_perf_counter_setup(void) { barrier(); } +int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + return 0; +} static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -341,6 +347,9 @@ group_sched_out(struct perf_counter *group_counter, { struct perf_counter *counter; + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + counter_sched_out(group_counter, cpuctx, ctx); /* @@ -354,15 +363,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } @@ -402,7 +414,14 @@ group_sched_in(struct perf_counter *group_counter, int cpu) { struct perf_counter *counter, *partial_group; - int ret = 0; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? 
ret : 0; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -415,10 +434,9 @@ group_sched_in(struct perf_counter *group_counter, partial_group = counter; goto group_error; } - ret = -EAGAIN; } - return ret; + return 0; group_error: /* @@ -440,11 +458,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { /* * Listen to the 'cpu' scheduling filter constraint @@ -454,12 +474,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, continue; /* - * If we scheduled in a group atomically and - * exclusively, break out: + * If we scheduled in a group atomically and exclusively, + * or if this group can't go on, break out: */ if (group_sched_in(counter, cpuctx, ctx, cpu)) break; } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } -- cgit v1.2.3 From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 21:00:30 +1100 Subject: perf_counter: Add support for pinned and exclusive counter groups Impact: New perf_counter features A pinned counter group is one that the user wants to have on the CPU whenever possible, i.e. whenever the associated task is running, for a per-task group, or always for a per-cpu group. If the system cannot satisfy that, it puts the group into an error state where it is not scheduled any more and reads from it return EOF (i.e. 0 bytes read). The group can be released from error state and made readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained enable/disable controls on counters we'll be able to reset the error state on individual groups. An exclusive group is one that the user wants to be the only group using the CPU performance monitor hardware whenever it is on. The counter group scheduler will not schedule an exclusive group if there are already other groups on the CPU and will not schedule other groups onto the CPU if there is an exclusive group scheduled (that statement does not apply to groups containing only software counters, which can always go on and which do not prevent an exclusive group from going on). With an exclusive group, we will be able to let users program PMU registers at a low level without the concern that those settings will perturb other measurements. Along the way this reorganizes things a little: - is_software_counter() is moved to perf_counter.h. - cpuctx->active_oncpu now records the number of hardware counters on the CPU, i.e. it now excludes software counters. Nothing was reading cpuctx->active_oncpu before, so this change is harmless. - A new cpuctx->exclusive field records whether we currently have an exclusive group on the CPU. - counter_sched_out moves higher up in perf_counter.c and gets called from __perf_counter_remove_from_context and __perf_counter_exit_task, where we used to have essentially the same code. - __perf_counter_sched_in now goes through the counter list twice, doing the pinned counters in the first loop and the non-pinned counters in the second loop, in order to give the pinned counters the best chance to be scheduled in. Note that only a group leader can be exclusive or pinned, and that attribute applies to the whole group. This avoids some awkwardness in some corner cases (e.g. 
where a group leader is closed and the other group members get added to the context list). If we want to relax that restriction later, we can, and it is easier to relax a restriction than to apply a new one. This doesn't yet handle the case where a pinned counter is inherited and goes into error state in the child - the error state is not propagated up to the parent when the child exits, and arguably it should. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 10 +- include/linux/perf_counter.h | 15 ++- kernel/perf_counter.c | 226 +++++++++++++++++++++++++------------ 3 files changed, 169 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 85ad25923c2..5b0211348c7 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -35,14 +35,6 @@ void perf_counter_print_debug(void) { } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - /* * Read one performance monitor counter (PMC). */ @@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, */ for (i = n0; i < n0 + n; ++i) cpuhw->counter[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; n = 1; counter_sched_in(group_leader, cpu); list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { @@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, ++n; } } - cpuctx->active_oncpu += n; ctx->nr_active += n; return 1; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b21d1ea4c05..7ab8e5f96f5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -86,7 +86,10 @@ struct perf_counter_hw_event { nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ - __reserved_1 : 28; + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only counter on PMU */ + + __reserved_1 : 26; u64 __reserved_2; }; @@ -141,6 +144,7 @@ struct hw_perf_counter_ops { * enum perf_counter_active_state - the states of a counter */ enum perf_counter_active_state { + PERF_COUNTER_STATE_ERROR = -2, PERF_COUNTER_STATE_OFF = -1, PERF_COUNTER_STATE_INACTIVE = 0, PERF_COUNTER_STATE_ACTIVE = 1, @@ -214,6 +218,7 @@ struct perf_cpu_context { struct perf_counter_context *task_ctx; int active_oncpu; int max_pertask; + int exclusive; }; /* @@ -240,6 +245,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52f2f526248..faf671b2956 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); 
+ counter->oncpu = -1; + + if (!is_software_counter(counter)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - ctx->nr_active--; - cpuctx->active_oncpu--; - counter->task = NULL; - counter->oncpu = -1; - } + counter_sched_out(counter, cpuctx, ctx); + + counter->task = NULL; ctx->nr_counters--; /* @@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state <= PERF_COUNTER_STATE_OFF) return 0; counter->state = PERF_COUNTER_STATE_ACTIVE; @@ -223,12 +237,63 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - cpuctx->active_oncpu++; + if (!is_software_counter(counter)) + cpuctx->active_oncpu++; ctx->nr_active++; + if (counter->hw_event.exclusive) + cpuctx->exclusive = 1; + return 0; } +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software counters can always go on. + */ + if (is_software_only_group(counter)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * counters can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * counters on the CPU, it can't go on. + */ + if (counter->hw_event.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info) int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; + int err; /* * If this is a task context, we need to check whether it is @@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; - counter_sched_in(counter, cpuctx, ctx, cpu); + /* + * An exclusive counter can't go on if there are already active + * hardware counters, and no hardware counter can go on if there + * is already an exclusive counter on. 
+ */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE && + !group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, cpu); + + if (err && counter->hw_event.pinned) + counter->state = PERF_COUNTER_STATE_ERROR; - if (!ctx->task && cpuctx->max_pertask) + if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; hw_perf_restore(perf_flags); @@ -326,22 +404,6 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - counter->oncpu = -1; - - cpuctx->active_oncpu--; - ctx->nr_active--; -} - static void group_sched_out(struct perf_counter *group_counter, struct perf_cpu_context *cpuctx, @@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter, */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -455,30 +520,6 @@ group_error: return -EAGAIN; } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - return 1; -} - static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) @@ -492,7 +533,38 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, spin_lock(&ctx->lock); flags = hw_perf_save_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state <= PERF_COUNTER_STATE_OFF || + !counter->hw_event.pinned) + continue; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_ERROR; + } + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + /* + * Ignore counters in OFF or ERROR state, and + * ignore pinned counters since we did them already. + */ + if (counter->state <= PERF_COUNTER_STATE_OFF || + counter->hw_event.pinned) + continue; + /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -500,14 +572,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - /* - * If we scheduled in a group atomically and exclusively, - * or if this group can't go on, don't add any more - * hardware counters. 
- */ - if (can_add_hw || is_software_only_group(counter)) + if (group_can_go_on(counter, cpuctx, can_add_hw)) { if (group_sched_in(counter, cpuctx, ctx, cpu)) can_add_hw = 0; + } } hw_perf_restore(flags); spin_unlock(&ctx->lock); @@ -567,8 +635,10 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) - counter->state = PERF_COUNTER_STATE_OFF; + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + } hw_perf_restore(perf_flags); @@ -607,7 +677,7 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_OFF) + if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->hw_event.disabled = 0; @@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (count != sizeof(cntval)) return -EINVAL; + /* + * Return end-of-file for a read on a counter that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + return 0; + mutex_lock(&counter->mutex); cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); @@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter *counter, { struct perf_data *irqdata, *usrdata; DECLARE_WAITQUEUE(wait, current); - ssize_t res; + ssize_t res, res2; irqdata = counter->irqdata; usrdata = counter->usrdata; @@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter *counter, if (signal_pending(current)) break; + if (counter->state == PERF_COUNTER_STATE_ERROR) + break; + spin_unlock_irq(&counter->waitq.lock); schedule(); spin_lock_irq(&counter->waitq.lock); @@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter *counter, __set_current_state(TASK_RUNNING); spin_unlock_irq(&counter->waitq.lock); - if (usrdata->len + irqdata->len < count) + if (usrdata->len + irqdata->len < count && + counter->state != PERF_COUNTER_STATE_ERROR) return -ERESTARTSYS; read_pending: mutex_lock(&counter->mutex); @@ -925,11 +1007,12 @@ read_pending: /* Switch irq buffer: */ usrdata = perf_switch_irq_data(counter); - if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + res2 = perf_copy_usrdata(usrdata, buf + res, count - res); + if (res2 < 0) { if (!res) res = -EFAULT; } else { - res = count; + res += res2; } out: mutex_unlock(&counter->mutex); @@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, */ if (group_leader->ctx != ctx) goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (hw_event.exclusive || hw_event.pinned) + goto err_put_context; } ret = -EINVAL; @@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - child_counter->hw_ops->disable(child_counter); - cpuctx->active_oncpu--; - child_ctx->nr_active--; - child_counter->oncpu = -1; - } + counter_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); -- cgit v1.2.3 From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 17 Jan 2009 18:10:22 +1100 Subject: perf_counter: Add 
counter enable/disable ioctls Impact: New perf_counter features This primarily adds a way for perf_counter users to enable and disable counters and groups. Enabling or disabling a counter or group also enables or disables all of the child counters that have been cloned from it to monitor children of the task monitored by the top-level counter. The userspace interface to enable/disable counters is via ioctl on the counter file descriptor. Along the way this extends the code that handles child counters to handle child counter groups properly. A group with multiple counters will be cloned to child tasks if and only if the group leader has the hw_event.inherit bit set - if it is set the whole group is cloned as a group in the child task. In order to be able to enable or disable all child counters of a given top-level counter, we need a way to find them all. Hence I have added a child_list field to struct perf_counter, which is the head of the list of children for a top-level counter, or the link in that list for a child counter. That list is protected by the perf_counter.mutex field. This also adds a mutex to the perf_counter_context struct. Previously the list of counters was protected just by the lock field in the context, which meant that perf_counter_init_task had to take that lock and then take whatever lock/mutex protects the top-level counter's child_list. But the counter enable/disable functions need to take that lock in order to traverse the list, then for each counter take the lock in that counter's context in order to change the counter's state safely, which would lead to a deadlock. To solve this, we now have both a mutex and a spinlock in the context, and taking either is sufficient to ensure the list of counters can't change - you have to take both before changing the list. Now perf_counter_init_task takes the mutex instead of the lock (which incidentally means that inherit_counter can use GFP_KERNEL instead of GFP_ATOMIC) and thus avoids the possible deadlock. Similarly the new enable/disable functions can take the mutex while traversing the list of child counters without incurring a possible deadlock when the counter manipulation code locks the context for a child counter. We also had a misfeature that the first counter added to a context would possibly not go on until the next sched-in, because we were using ctx->nr_active to detect if the context was running on a CPU. But nr_active is the number of active counters, and if that was zero (because the context didn't have any counters yet) it would look like the context wasn't running on a CPU and so the retry code in __perf_install_in_context wouldn't retry. So this adds an 'is_active' field that is set when the context is on a CPU, even if it has no counters. The is_active field is only used for task contexts, not for per-cpu contexts. If we enable a subsidiary counter in a group that is active on a CPU, and the arch code can't enable the counter, then we have to pull the whole group off the CPU. We do this with group_sched_out, which gets moved up in the file so it comes before all its callers. This also adds similar logic to __perf_install_in_context so that the "all on, or none" invariant of groups is preserved when adding a new counter to a group.
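A usage sketch (not part of the patch) of the new ioctls, assuming a __NR_perf_counter_open syscall number is available to userspace, the header above is visible as <linux/perf_counter.h>, and the sys_perf_counter_open(hw_event, pid, cpu, group_fd, flags) calling convention:

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int main(void)
{
	struct perf_counter_hw_event hw_event;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type     = PERF_COUNT_INSTRUCTIONS;
	hw_event.disabled = 1;	/* created off; turned on via ioctl below */
	hw_event.inherit  = 1;	/* children of this task get cloned counters */

	/* pid 0 = current task, cpu -1 = any cpu, no group leader, no flags */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	ioctl(fd, PERF_COUNTER_IOC_ENABLE);	/* this counter and its children */
	/* ... workload of interest ... */
	ioctl(fd, PERF_COUNTER_IOC_DISABLE);	/* this counter and its children */

	close(fd);
	return 0;
}

Unlike prctl(PR_TASK_PERF_COUNTERS_ENABLE), which toggles all of a task's counters, the ioctl affects only the counter it is issued on plus the children cloned from it.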
Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 21 +- kernel/perf_counter.c | 455 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 415 insertions(+), 61 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7ab8e5f96f5..33ba9fe0a78 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #ifdef CONFIG_PERF_COUNTERS # include @@ -94,6 +95,12 @@ struct perf_counter_hw_event { u64 __reserved_2; }; +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + /* * Kernel-internal data types: */ @@ -173,8 +180,10 @@ struct perf_counter { struct file *filp; struct perf_counter *parent; + struct list_head child_list; + /* - * Protect attach/detach: + * Protect attach/detach and child_list: */ struct mutex mutex; @@ -199,13 +208,21 @@ struct perf_counter { struct perf_counter_context { #ifdef CONFIG_PERF_COUNTERS /* - * Protect the list of counters: + * Protect the states of the counters in the list, + * nr_active, and the list: */ spinlock_t lock; + /* + * Protect the list of counters. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; struct list_head counter_list; int nr_counters; int nr_active; + int is_active; struct task_struct *task; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index faf671b2956..1ac18daa424 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter, cpuctx->exclusive = 0; } +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex held. + * Must be called with counter->mutex and ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. @@ -215,6 +237,99 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; + + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + /* + * If the counter is on, turn it off. + * If it is in error state, leave it in error state. 
+ */ + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + if (counter == counter->group_leader) + group_sched_out(counter, cpuctx, ctx); + else + counter_sched_out(counter, cpuctx, ctx); + counter->state = PERF_COUNTER_STATE_OFF; + } + + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Disable a counter. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_disable, + counter, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_counter_disable, counter); + + spin_lock_irq(&ctx->lock); + /* + * If the counter is still active, we need to retry the cross-call. + */ + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_OFF; + + spin_unlock_irq(&ctx->lock); +} + +/* + * Disable a counter and all its children. + */ +static void perf_counter_disable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_disable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_disable(child); + mutex_unlock(&counter->mutex); +} + static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; @@ -327,23 +443,40 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + /* + * Don't put the counter on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (counter->state != PERF_COUNTER_STATE_INACTIVE || + (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + goto unlock; + /* * An exclusive counter can't go on if there are already active * hardware counters, and no hardware counter can go on if there * is already an exclusive counter on. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE && - !group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) err = -EEXIST; else err = counter_sched_in(counter, cpuctx, ctx, cpu); - if (err && counter->hw_event.pinned) - counter->state = PERF_COUNTER_STATE_ERROR; + if (err) { + /* + * This counter couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the counter group is pinned then put it in error state. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; + unlock: hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info) * If the counter is attached to a task which is on a CPU we use a smp * call to enable it in the task context. 
The task might have been * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_counter_context *ctx, @@ -387,7 +522,7 @@ retry: /* * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list_entry)) { + if (ctx->is_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -404,26 +539,131 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) { - struct perf_counter *counter; + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; + unsigned long flags; + int err; - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) return; - counter_sched_out(group_counter, cpuctx, ctx); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto unlock; + counter->state = PERF_COUNTER_STATE_INACTIVE; /* - * Schedule out siblings (if any): + * If the counter is in a group and isn't the group leader, + * then don't put it on unless the group is on. */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); + if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + goto unlock; - if (group_counter->hw_event.exclusive) - cpuctx->exclusive = 0; + if (!group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + + if (err) { + /* + * If this counter can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } + + unlock: + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Enable a counter. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_enable, + counter, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto out; + + /* + * If the counter is in error state, clear that first. + * That way, if we see the counter in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_counter_enable, counter); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the counter is still off, + * we need to retry the cross-call. 
+ */ + if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_OFF) + counter->state = PERF_COUNTER_STATE_INACTIVE; + out: + spin_unlock_irq(&ctx->lock); +} + +/* + * Enable a counter and all its children. + */ +static void perf_counter_enable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_enable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_enable(child); + mutex_unlock(&counter->mutex); } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_counter *counter; u64 flags; + spin_lock(&ctx->lock); + ctx->is_active = 0; if (likely(!ctx->nr_counters)) - return; + goto out; - spin_lock(&ctx->lock); flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, u64 flags; int can_add_hw = 1; + spin_lock(&ctx->lock); + ctx->is_active = 1; if (likely(!ctx->nr_counters)) - return; + goto out; - spin_lock(&ctx->lock); flags = hw_perf_save_disable(); /* @@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; + mutex_lock(&ctx->mutex); mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); + mutex_unlock(&ctx->mutex); kfree(counter); @@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_counter *counter = file->private_data; + int err = 0; + + switch (cmd) { + case PERF_COUNTER_IOC_ENABLE: + perf_counter_enable_family(counter); + break; + case PERF_COUNTER_IOC_DISABLE: + perf_counter_disable_family(counter); + break; + default: + err = -ENOTTY; + } + return err; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, }; static int cpu_clock_perf_counter_enable(struct perf_counter *counter) @@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + INIT_LIST_HEAD(&counter->child_list); + counter->irqdata = &counter->data[0]; counter->usrdata = &counter->data[1]; counter->cpu = cpu; @@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, goto err_free_put_context; counter->filp = counter_file; + mutex_lock(&ctx->mutex); perf_install_in_context(ctx, counter, cpu); + mutex_unlock(&ctx->mutex); fput_light(counter_file, fput_needed2); @@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, { memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); ctx->task = task; } 
@@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx, /* * inherit a counter from parent task to child task: */ -static int +static struct perf_counter * inherit_counter(struct perf_counter *parent_counter, struct task_struct *parent, struct perf_counter_context *parent_ctx, struct task_struct *child, + struct perf_counter *group_leader, struct perf_counter_context *child_ctx) { struct perf_counter *child_counter; + /* + * Instead of creating recursive hierarchies of counters, + * we link inherited counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_counter->parent) + parent_counter = parent_counter->parent; + child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, NULL, - GFP_ATOMIC); + parent_counter->cpu, group_leader, + GFP_KERNEL); if (!child_counter) - return -ENOMEM; + return NULL; /* * Link it up in the child's context: @@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter, */ atomic_long_inc(&parent_counter->filp->f_count); + /* + * Link this into the parent counter's child list + */ + mutex_lock(&parent_counter->mutex); + list_add_tail(&child_counter->child_list, &parent_counter->child_list); + + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + + mutex_unlock(&parent_counter->mutex); + + return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *leader; + struct perf_counter *sub; + + leader = inherit_counter(parent_counter, parent, parent_ctx, + child, NULL, child_ctx); + if (!leader) + return -ENOMEM; + list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { + if (!inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx)) + return -ENOMEM; + } return 0; } +static void sync_child_counter(struct perf_counter *child_counter, + struct perf_counter *parent_counter) +{ + u64 parent_val, child_val; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + /* + * Remove this counter from the parent's list + */ + mutex_lock(&parent_counter->mutex); + list_del_init(&child_counter->child_list); + mutex_unlock(&parent_counter->mutex); + + /* + * Release the parent counter, if this was the last + * reference to it. 
+ */ + fput(parent_counter->filp); +} + static void __perf_counter_exit_task(struct task_struct *child, struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - u64 parent_val, child_val; + struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - counter_sched_out(child_counter, cpuctx, child_ctx); + group_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); @@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child, * that are still around due to the child reference. These * counters need to be zapped - but otherwise linger. */ - if (!parent_counter) - return; - - parent_val = atomic64_read(&parent_counter->count); - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - - fput(parent_counter->filp); + if (parent_counter) { + sync_child_counter(child_counter, parent_counter); + list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, + list_entry) { + if (sub->parent) + sync_child_counter(sub, sub->parent); + kfree(sub); + } + } kfree(child_counter); } /* - * When a child task exist, feed back counter values to parent counters. + * When a child task exits, feed back counter values to parent counters. * - * Note: we are running in child context, but the PID is not hashed + * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. */ void perf_counter_exit_task(struct task_struct *child) @@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child) void perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter *counter, *parent_counter; + struct perf_counter *counter; struct task_struct *parent = current; - unsigned long flags; child_ctx = &child->perf_counter_ctx; parent_ctx = &parent->perf_counter_ctx; @@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child) * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. 
*/ - spin_lock_irqsave(&parent_ctx->lock, flags); + mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { - if (!counter->hw_event.inherit || counter->group_leader != counter) + if (!counter->hw_event.inherit) continue; - /* - * Instead of creating recursive hierarchies of counters, - * we link inheritd counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - parent_counter = counter; - if (counter->parent) - parent_counter = counter->parent; - - if (inherit_counter(parent_counter, parent, + if (inherit_group(counter, parent, parent_ctx, child, child_ctx)) break; } - spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); } static void __cpuinit perf_counter_init_cpu(int cpu) @@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info) list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) __perf_counter_remove_from_context(counter); - } static void perf_counter_exit_cpu(int cpu) { + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); } #else static inline void perf_counter_exit_cpu(int cpu) { } -- cgit v1.2.3 From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea: it can easily lock the system due to perfcounter IRQ overload. So add throttling: if a new IRQ arrives less than PERFMON_MIN_PERIOD_NS after the previous one, turn off perfcounters and unthrottle them at the next timer tick.
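To make the overload concrete (back-of-the-envelope only; the 2 GHz clock rate is an assumed example, not taken from the patch): counting cycles with an irq_period of 100 means a PMU interrupt roughly every 50 ns, about 200 times more often than the 10 us floor (PERFMON_MIN_PERIOD_NS = 10000) added below permits. A throwaway sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
	double cpu_hz        = 2e9;	/* assumed example clock rate    */
	double irq_period    = 100;	/* events per interrupt (-c 100) */
	double min_period_ns = 10000;	/* PERFMON_MIN_PERIOD_NS         */

	double ns_between_irqs = 1e9 / (cpu_hz / irq_period);

	printf("one PMU irq every %.0f ns, throttle floor %.0f ns\n",
	       ns_between_irqs, min_period_ns);
	return 0;
}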
Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++++++++++++++------ include/linux/perf_counter.h | 4 ++++ 3 files changed, 38 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5b14c..849c23009bf 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771f757..1a040b179b5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 33ba9fe0a78..91f1ca4c01c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); @@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +#define PERFMON_MIN_PERIOD_NS 10000 + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ 
-281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3 From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ include/linux/perf_counter.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b179b5..a56d4cf92f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } } +/* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; } + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + 
cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 91f1ca4c01c..f55381fbcac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } -#define PERFMON_MIN_PERIOD_NS 10000 - #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -- cgit v1.2.3 From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. 
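For reference, the configuration that used to crash is simply a software counter opened per-cpu rather than per-task; a hedged userspace sketch (syscall number, header visibility and calling convention assumed as in the earlier example):

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

/*
 * Open a task-clock software counter bound to one CPU instead of a task.
 * Before this fix the task_clock code ended up passing a NULL task to
 * __task_delta_exec; now it falls back to the cpu_clock implementation.
 */
int open_percpu_task_clock(int cpu)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = PERF_COUNT_TASK_CLOCK;

	/* pid -1 = not bound to a task, cpu >= 0 = bound to that cpu */
	return syscall(__NR_perf_counter_open, &hw_event, -1, cpu, -1, 0);
}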
Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/perf_counter.c | 78 +++++++++++++++++++++++++++++---------------------- kernel/sched.c | 17 +++++++++++ 3 files changed, 64 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b10abf77..1e5f70062a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern u64 cpu_nr_switches(int cpu); +extern u64 cpu_nr_migrations(int cpu); struct seq_file; struct cfs_rq; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f27a7e9f3c4..544193cbc47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include /* * Each CPU has a list of per CPU counters: @@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx, { struct task_struct *task = ctx->task; - counter->ctx = ctx; if (!task) { /* * Per cpu counters are installed via an smp call and @@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -static u64 get_page_faults(void) +#ifdef CONFIG_VM_EVENT_COUNTERS +#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] +#else +#define cpu_page_faults() 0 +#endif + +static u64 get_page_faults(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->maj_flt + curr->min_flt; + if (curr) + return curr->maj_flt + curr->min_flt; + return cpu_page_faults(); } static void page_faults_perf_counter_update(struct perf_counter *counter) @@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(); + now = get_page_faults(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - /* - * page-faults is a per-task value already, - * so we dont have to clear it on switch-in. 
- */ - + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; -static u64 get_context_switches(void) +static u64 get_context_switches(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->nvcsw + curr->nivcsw; + if (curr) + return curr->nvcsw + curr->nivcsw; + return cpu_nr_switches(smp_processor_id()); } static void context_switches_perf_counter_update(struct perf_counter *counter) @@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(); + now = get_context_switches(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - /* - * ->nvcsw + curr->nivcsw is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); return 0; } @@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; -static inline u64 get_cpu_migrations(void) +static inline u64 get_cpu_migrations(struct perf_counter *counter) { - return current->se.nr_migrations; + struct task_struct *curr = counter->ctx->task; + + if (curr) + return curr->se.nr_migrations; + return cpu_nr_migrations(smp_processor_id()); } static void cpu_migrations_perf_counter_update(struct perf_counter *counter) @@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(); + now = get_cpu_migrations(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - /* - * se.nr_migrations is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); return 0; } @@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: - hw_ops = &perf_ops_task_clock; + /* + * If the user instantiates this as a per-cpu counter, + * use the cpu_clock counter instead. 
+ */ + if (counter->ctx->task) + hw_ops = &perf_ops_task_clock; + else + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: hw_ops = &perf_ops_page_faults; @@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, + struct perf_counter_context *ctx, struct perf_counter *group_leader, gfp_t gfpflags) { @@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; + counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) + else hw_ops = hw_perf_counter_init(counter); if (!hw_ops) { @@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); + counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + GFP_KERNEL); if (!counter) goto err_put_context; @@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter, parent_counter = parent_counter->parent; child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, group_leader, - GFP_KERNEL); + parent_counter->cpu, child_ctx, + group_leader, GFP_KERNEL); if (!child_counter) return NULL; /* * Link it up in the child's context: */ - child_counter->ctx = child_ctx; child_counter->task = child; list_add_counter(child_counter, child_ctx); child_ctx->nr_counters++; diff --git a/kernel/sched.c b/kernel/sched.c index 8db1a4cf208..173768f142a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -558,6 +558,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; + new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2810,6 +2812,21 @@ unsigned long nr_active(void) return running + uninterruptible; } +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_switches(cpu) - number of context switches on that cpu + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_switches(int cpu) +{ + return cpu_rq(cpu)->nr_switches; +} + +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; +} + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). -- cgit v1.2.3 From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. 
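A usage sketch of the new bits (again not part of the patch; syscall number, header visibility and calling convention assumed as in the earlier examples), counting only user-mode cycles for the current task:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_counter.h>

int open_user_only_cycles(void)
{
	struct perf_counter_hw_event hw_event;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type           = PERF_COUNT_CYCLES;
	hw_event.exclude_kernel = 1;	/* x86: EVENTSEL_OS stays clear; powerpc: MMCR0_FCS */
	hw_event.exclude_hv     = 1;	/* powerpc: MMCR0_FCHV; forced to 0 when not on a hypervisor */

	/* pid 0 = current task, cpu -1 = any cpu */
	return syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0);
}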
For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++++------- include/linux/perf_counter.h | 19 ++++++----- kernel/perf_counter.c | 26 ++++++++++++--- 4 files changed, 117 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5b0211348c7..bd6ba85beb5 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -16,6 +16,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev) return 0; } +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. + */ +static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +{ + int eu, ek, eh; + int i, n; + struct perf_counter *counter; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + eu = ctrs[0]->hw_event.exclude_user; + ek = ctrs[0]->hw_event.exclude_kernel; + eh = ctrs[0]->hw_event.exclude_hv; + if (n_prev == 0) + n_prev = 1; + for (i = n_prev; i < n; ++i) { + counter = ctrs[i]; + if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) + return -EAGAIN; + } + return 0; +} + static void power_perf_read(struct perf_counter *counter) { long val, delta, prev; @@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable) goto out; } + /* + * Add in MMCR0 freeze bits corresponding to the + * hw_event.exclude_* bits for the first counter. + * We have already checked that all counters have the + * same values for these bits as the first counter. + */ + counter = cpuhw->counter[0]; + if (counter->hw_event.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (counter->hw_event.exclude_kernel) + cpuhw->mmcr[0] |= MMCR0_FCS; + if (counter->hw_event.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + /* * Write the new configuration to MMCR* with the freeze * bit set and set the hardware counters to their initial values. 
@@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, &cpuhw->counter[n0], &cpuhw->events[n0]); if (n < 0) return -EAGAIN; + if (check_excludes(cpuhw->counter, n0, n)) + return -EAGAIN; if (power_check_constraints(cpuhw->events, n + n0)) return -EAGAIN; cpuhw->n_counters = n0 + n; @@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; + if (check_excludes(cpuhw->counter, n0, 1)) + goto out; if (power_check_constraints(cpuhw->events, n0 + 1)) goto out; @@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config_base = ev; counter->hw.idx = 0; + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. This also means that we don't + * set the MMCR0_FCHV bit, which unconditionally freezes + * the counters on the PPC970 variants used in Apple G5 + * machines (since MSR.HV is always 1 on those machines). + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + counter->hw_event.exclude_hv = 0; + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. We assume the counter @@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter) if (n < 0) return NULL; } - events[n++] = ev; - if (power_check_constraints(events, n)) + events[n] = ev; + if (check_excludes(ctrs, n, 1)) + return NULL; + if (power_check_constraints(events, n + 1)) return NULL; - counter->hw.config = events[n - 1]; + counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); return &power_perf_ops; } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d..383d4c6423a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. 
*/ - hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + + /* + * If privileged enough, allow NMI events: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f55381fbcac..c83f51d6e35 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,14 +83,17 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only counter on PMU */ - - __reserved_1 : 26; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + + __reserved_1 : 23; u64 __reserved_2; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 544193cbc47..89d5e3fe970 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter) { const struct hw_perf_counter_ops *hw_ops = NULL; + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv)) + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: + if (counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv) + break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. 
@@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - hw_ops = &perf_ops_page_faults; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel)) + hw_ops = &perf_ops_page_faults; break; case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_context_switches; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_context_switches; break; case PERF_COUNT_CPU_MIGRATIONS: - hw_ops = &perf_ops_cpu_migrations; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_cpu_migrations; break; default: break; -- cgit v1.2.3 From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Feb 2009 22:10:34 +1100 Subject: perfcounters: make context switch and migration software counters work again Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the context switch and migration software counters to report zero always. With that commit, the software counters only count events that occur between sched-in and sched-out for a task. This is necessary for the counter enable/disable prctls and ioctls to work. However, the context switch and migration counts are incremented after sched-out for one task and before sched-in for the next. Since the increment doesn't occur while a task is scheduled in (as far as the software counters are concerned) it doesn't count towards any counter. Thus the context switch and migration counters need to count events that occur at any time, provided the counter is enabled, not just those that occur while the task is scheduled in (from the perf_counter subsystem's point of view). The problem though is that the software counter code can't tell the difference between being enabled and being scheduled in, and between being disabled and being scheduled out, since we use the one pair of enable/disable entry points for both. That is, the high-level disable operation simply arranges for the counter to not be scheduled in any more, and the high-level enable operation arranges for it to be scheduled in again. One way to solve this would be to have sched_in/out operations in the hw_perf_counter_ops struct as well as enable/disable. However, this takes a simpler approach: it adds a 'prev_state' field to the perf_counter struct that allows a counter's enable method to know whether the counter was previously disabled or just inactive (scheduled out), and therefore whether the enable method is being called as a result of a high-level enable or a schedule-in operation. This then allows the context switch, migration and page fault counters to reset their hw.prev_count value in their enable functions only if they are called as a result of a high-level enable operation. Although page faults would normally only occur while the counter is scheduled in, this changes the page fault counter code too in case there are ever circumstances where page faults get counted against a task while its counters are not scheduled in. 
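The enable-side pattern this change introduces is the same for each affected software counter and is worth seeing in isolation: prev_state records the counter's state just before the enable method was called, so comparing it against PERF_COUNTER_STATE_OFF tells a real (re-)enable apart from a routine schedule-in, and only the former resets the baseline. A sketch of the shape of such an enable method, with the hypothetical helper read_current_value() standing in for get_page_faults(), get_context_switches(), get_cpu_migrations() and friends:

	static int example_sw_counter_enable(struct perf_counter *counter)
	{
		/*
		 * Only a high-level enable (previous state at or below OFF)
		 * takes a fresh baseline; on a plain schedule-in the old
		 * prev_count is kept, so events that occurred while the task
		 * was scheduled out are still counted.
		 */
		if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
			atomic64_set(&counter->hw.prev_count,
				     read_current_value(counter));
		return 0;
	}
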
Reported-by: Jaswinder Singh Rajput Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c83f51d6e35..32cd1acb738 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -173,6 +173,7 @@ struct perf_counter { const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; + enum perf_counter_active_state prev_state; atomic64_t count; struct perf_counter_hw_event hw_event; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fcefb0a726f..ad62965828d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -444,6 +444,7 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; /* * Don't put the counter on if it is disabled or if @@ -562,6 +563,7 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; @@ -733,6 +735,7 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; + group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -740,6 +743,7 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -1398,9 +1402,9 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) static int task_clock_perf_counter_enable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); - - atomic64_set(&counter->hw.prev_count, now); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + task_clock_perf_counter_val(counter, 0)); return 0; } @@ -1455,7 +1459,8 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1501,7 +1506,9 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_context_switches(counter)); return 0; } @@ -1547,7 +1554,9 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_cpu_migrations(counter)); return 0; } -- cgit v1.2.3 From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Feb 2009 
22:43:46 +1100 Subject: perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/systbl.h | 2 +- include/linux/perf_counter.h | 43 +++++++++++++++++++++------------------ include/linux/syscalls.h | 9 +++----- kernel/perf_counter.c | 6 +++--- 4 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 4c8095f6bec..d312eec8abb 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL(perf_counter_open) +SYSCALL_SPU(perf_counter_open) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 32cd1acb738..186efaf4966 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -13,20 +13,8 @@ #ifndef _LINUX_PERF_COUNTER_H #define _LINUX_PERF_COUNTER_H -#include -#include - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include - -struct task_struct; +#include +#include /* * User-space ABI bits: @@ -78,12 +66,12 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - s64 type; + __s64 type; - u64 irq_period; - u32 record_type; + __u64 irq_period; + __u32 record_type; - u32 disabled : 1, /* off by default */ + __u32 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -95,7 +83,7 @@ struct perf_counter_hw_event { __reserved_1 : 23; - u64 __reserved_2; + __u64 __reserved_2; }; /* @@ -104,10 +92,24 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#ifdef __KERNEL__ /* - * Kernel-internal data types: + * Kernel-internal data types and definitions: */ +#ifdef CONFIG_PERF_COUNTERS +# include +#endif + +#include +#include +#include +#include +#include +#include + +struct task_struct; + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif +#endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88255d3261a..28ef2be839c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd); +asmlinkage long sys_perf_counter_open( + const struct perf_counter_hw_event 
__user *hw_event_uptr, + pid_t pid, int cpu, int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ad62965828d..16b14ba99d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, int cpu, int group_fd) +SYSCALL_DEFINE4(perf_counter_open, + const struct perf_counter_hw_event __user *, hw_event_uptr, + pid_t, pid, int, cpu, int, group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; -- cgit v1.2.3 From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2009 20:36:51 +1100 Subject: perfcounters: provide expansion room in the ABI Impact: ABI change This expands several fields in the perf_counter_hw_event struct and adds a "flags" argument to the perf_counter_open system call, in order that features can be added in future without ABI changes. In particular the record_type field is expanded to 64 bits, and the space for flag bits has been expanded from 32 to 64 bits. This also adds some new fields: * read_format (64 bits) is intended to provide a way to specify what userspace wants to get back when it does a read() on a simple (non-interrupting) counter; * exclude_idle (1 bit) provides a way for userspace to ask that events that occur when the cpu is idle be excluded; * extra_config_len will provide a way for userspace to supply an arbitrary amount of extra machine-specific PMU configuration data immediately following the perf_counter_hw_event struct, to allow sophisticated users to program things such as instruction matching CAMs and address range registers; * __reserved_3 and __reserved_4 provide space for future expansion. 
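Taken together, the open call now takes a fifth "flags" argument, which must be zero for the time being, and the event structure has room to grow without further ABI breaks. Continuing the user-space sketch from earlier in this series (same assumed syscall wrapper; the field values are purely illustrative):

	struct perf_counter_hw_event hw_event;
	int fd;

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type         = PERF_COUNT_TASK_CLOCK;
	hw_event.irq_period   = 0;	/* plain counting, no sampling */
	hw_event.read_format  = 0;	/* default read() format */
	hw_event.exclude_idle = 1;	/* don't count while the cpu is idle */

	/* the new flags argument is reserved; non-zero values fail with -EINVAL */
	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0UL);
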
Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 12 +++++++++--- include/linux/syscalls.h | 2 +- kernel/perf_counter.c | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 186efaf4966..c42455ab155 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -69,9 +69,10 @@ struct perf_counter_hw_event { __s64 type; __u64 irq_period; - __u32 record_type; + __u64 record_type; + __u64 read_format; - __u32 disabled : 1, /* off by default */ + __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -80,10 +81,15 @@ struct perf_counter_hw_event { exclude_user : 1, /* don't count user */ exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 23; + __reserved_1 : 55; + + __u32 extra_config_len; + __u32 __reserved_4; __u64 __reserved_2; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 28ef2be839c..ab1d7724739 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( const struct perf_counter_hw_event __user *hw_event_uptr, - pid_t pid, int cpu, int group_fd); + pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16b14ba99d3..b2e838959f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, } /** - * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ -SYSCALL_DEFINE4(perf_counter_open, +SYSCALL_DEFINE5(perf_counter_open, const struct perf_counter_hw_event __user *, hw_event_uptr, - pid_t, pid, int, cpu, int, group_fd) + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; @@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open, int fput_needed2 = 0; int ret; + /* for future expandability... */ + if (flags) + return -EINVAL; + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; -- cgit v1.2.3 From 2485e5184452ccbda34ff83883883d9107800dff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 12:33:16 +0100 Subject: perfcounters: fix reserved bits sizing The sum of bits is 65 currently not 64 - so reduce the # of reserved bits from 55 to 54. 
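The arithmetic is quick to check against the struct as just extended: ten single-bit flags (disabled, nmi, raw, inherit, pinned, exclusive, exclude_user, exclude_kernel, exclude_hv, exclude_idle) plus a 55-bit reserved field gives 10 + 55 = 65 bits, one too many for the __u64 that holds them; with 54 reserved bits the total is exactly 64.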
Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c42455ab155..dde564517b6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,7 +83,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; -- cgit v1.2.3 From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:32 +0100 Subject: perf_counter: software counter event infrastructure Provide generic software counter infrastructure that supports software events. This will be used to allow sample based profiling based on software events such as pagefaults. The current infrastructure can only provide a count of such events, no place information. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dde564517b6..3fefc3b8150 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -126,6 +126,7 @@ struct hw_perf_counter { unsigned long counter_base; int nmi; unsigned int idx; + atomic64_t count; /* software */ atomic64_t prev_count; u64 irq_period; atomic64_t period_left; @@ -283,6 +284,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -295,10 +298,13 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } +static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } + +static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0fe22c916e2..eeb1b46cf70 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1328,6 +1328,185 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 prev, now; + s64 delta; + +again: + prev = atomic64_read(&hwc->prev_count); + now = atomic64_read(&hwc->count); + if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + s64 left = 
atomic64_read(&hwc->period_left); + s64 period = hwc->irq_period; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_add(period, &hwc->period_left); + } + + atomic64_set(&hwc->prev_count, -left); + atomic64_set(&hwc->count, -left); +} + +static void perf_swcounter_save_and_restart(struct perf_counter *counter) +{ + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); +} + +static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_swcounter_handle_group(struct perf_counter *sibling) +{ + struct perf_counter *counter, *group_leader = sibling->group_leader; + + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + perf_swcounter_update(counter); + perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); + } +} + +static void perf_swcounter_interrupt(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + break; + + case PERF_RECORD_IRQ: + perf_swcounter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_swcounter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else + wake_up(&counter->waitq); +} + +static int perf_swcounter_match(struct perf_counter *counter, + enum hw_event_types event, + struct pt_regs *regs) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + if (counter->hw_event.raw) + return 0; + + if (counter->hw_event.type != event) + return 0; + + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; + + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + + return 1; +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, + enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_counter *counter; + unsigned long flags; + int neg; + + if (list_empty(&ctx->counter_list)) + return; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * XXX: make counter_list RCU safe + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (perf_swcounter_match(counter, event, regs)) { + neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_interrupt(counter, nmi, regs); + } + } + + spin_unlock_irqrestore(&ctx->lock, flags); +} + +void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + + perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); + if (cpuctx->task_ctx) + perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + + put_cpu_var(perf_cpu_context); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +static int perf_swcounter_enable(struct perf_counter *counter) +{ + perf_swcounter_set_period(counter); + return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ + 
perf_swcounter_update(counter); +} + +/* + * Software counter: cpu wall time clock + */ + static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1364,6 +1543,10 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; +/* + * Software counter: task time clock + */ + /* * Called from within the scheduler: */ @@ -1420,6 +1603,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; +/* + * Software counter: page faults + */ + #ifdef CONFIG_VM_EVENT_COUNTERS #define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] #else @@ -1473,6 +1660,10 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; +/* + * Software counter: context switches + */ + static u64 get_context_switches(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1521,6 +1712,10 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; +/* + * Software counter: cpu migrations + */ + static inline u64 get_cpu_migrations(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1572,7 +1767,9 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct hw_perf_counter_ops *hw_ops = NULL; + struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -1618,6 +1815,10 @@ sw_perf_counter_init(struct perf_counter *counter) default: break; } + + if (hw_ops) + hwc->irq_period = hw_event->irq_period; + return hw_ops; } -- cgit v1.2.3 From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. 
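The new counters are driven by one-line perf_swcounter_event() calls placed where the kernel already accounts the event, as the fault-handler hunks below show. Any other software event source can be instrumented the same way; a sketch for a hypothetical call site (EXAMPLE_SW_EVENT is a made-up event id, not one defined by these patches):

	/*
	 * Arguments: the event id, the number of events that occurred,
	 * whether we are in NMI context, and the interrupted register state
	 * (used for exclude_user/exclude_kernel filtering and for recording
	 * the instruction pointer when sampling).
	 */
	perf_swcounter_event(EXAMPLE_SW_EVENT, 1, 0, regs);
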
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 5 ++++- arch/x86/mm/fault.c | 7 +++++-- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 22 +++++++++------------- 4 files changed, 21 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eda5b0ca4af..17bbf6f91fb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -312,6 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -319,8 +320,10 @@ good_area: preempt_enable(); } #endif - } else + } else { current->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c8725752b6c..f2d3324d921 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1140,10 +1140,13 @@ good_area: return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3fefc3b8150..4b14a8e9dbf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -49,8 +49,10 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, PERF_COUNT_CPU_MIGRATIONS = -5, + PERF_COUNT_PAGE_FAULTS_MIN = -6, + PERF_COUNT_PAGE_FAULTS_MAJ = -7, - PERF_SW_EVENTS_MIN = -6, + PERF_SW_EVENTS_MIN = -8, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1773c5d7427..68950a3a52b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } +static const struct hw_perf_counter_ops perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + /* * Software counter: cpu wall time clock */ @@ -1603,16 +1609,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: page faults - */ - -static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, -}; - /* * Software counter: context switches */ @@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel)) - hw_ops = &perf_ops_page_faults; + case PERF_COUNT_PAGE_FAULTS_MIN: + case PERF_COUNT_PAGE_FAULTS_MAJ: + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CONTEXT_SWITCHES: if (!counter->hw_event.exclude_kernel) -- cgit v1.2.3 From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:35 +0100 Subject: perf_counter: hrtimer based sampling for software time events Use hrtimers to profile timer based sampling for the software time counters. This allows platforms without hardware counter support to still perform sample based profiling. 
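The mechanism is the same for both software clock counters: enable arms a per-counter hrtimer with the requested irq_period, the timer callback samples the counter and feeds perf_swcounter_interrupt(), then forwards the timer and restarts it, and disable cancels the timer. A condensed sketch of that arm/fire/disarm cycle, stitched together from the helpers the patch below introduces (these are excerpts of the pattern, not one function):

	/* arm, from the counter's ->enable method */
	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hwc->hrtimer.function = perf_swcounter_hrtimer;
	__hrtimer_start_range_ns(&hwc->hrtimer,
				 ns_to_ktime(hwc->irq_period), 0,
				 HRTIMER_MODE_REL, 0);

	/* fire: the callback samples, reports and re-arms itself */
	counter->hw_ops->read(counter);
	perf_swcounter_interrupt(counter, 0, regs);
	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
	return HRTIMER_RESTART;

	/* disarm, from the counter's ->disable method */
	hrtimer_cancel(&counter->hw.hrtimer);
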
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 20 ++++--- kernel/perf_counter.c | 123 ++++++++++++++++++++++++++++++------------- 2 files changed, 100 insertions(+), 43 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4b14a8e9dbf..dfb4c7ce18b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,6 +114,7 @@ struct perf_counter_hw_event { #include #include #include +#include #include struct task_struct; @@ -123,12 +124,19 @@ struct task_struct; */ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - atomic64_t count; /* software */ + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; atomic64_t prev_count; u64 irq_period; atomic64_t period_left; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68950a3a52b..f9330d5827c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,7 +1395,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) struct perf_counter *counter, *group_leader = sibling->group_leader; list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - perf_swcounter_update(counter); + counter->hw_ops->read(counter); perf_swcounter_store_irq(sibling, counter->hw_event.type); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } @@ -1404,8 +1404,6 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) static void perf_swcounter_interrupt(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); - switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: break; @@ -1426,6 +1424,38 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, wake_up(&counter->waitq); } +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + struct perf_counter *counter; + struct pt_regs *regs; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->hw_ops->read(counter); + + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->hw_event.exclude_kernel || !regs) && + !counter->hw_event.exclude_user) + regs = task_pt_regs(current); + + if (regs) + perf_swcounter_interrupt(counter, 0, regs); + + hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + + return HRTIMER_RESTART; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + perf_swcounter_interrupt(counter, nmi, regs); +} + static int perf_swcounter_match(struct perf_counter *counter, enum hw_event_types event, struct pt_regs *regs) @@ -1448,13 +1478,20 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct pt_regs *regs) +{ + int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_overflow(counter, nmi, regs); +} + static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum hw_event_types event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; unsigned long flags; - int neg; if (list_empty(&ctx->counter_list)) return; @@ -1465,11 +1502,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, * XXX: make counter_list RCU safe */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (perf_swcounter_match(counter, event, regs)) { - neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) - perf_swcounter_interrupt(counter, nmi, regs); - } + if (perf_swcounter_match(counter, event, regs)) + perf_swcounter_add(counter, nr, nmi, regs); } spin_unlock_irqrestore(&ctx->lock, flags); @@ -1513,14 +1547,6 @@ static const struct hw_perf_counter_ops perf_ops_generic = { * Software counter: cpu wall time clock */ -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); - return 0; -} - static void cpu_clock_perf_counter_update(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1533,8 +1559,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter) atomic64_add(now - prev, &counter->count); } +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -1580,27 +1624,33 @@ static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now atomic64_add(delta, &counter->count); } -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 now = task_clock_perf_counter_val(counter, 1); - - task_clock_perf_counter_update(counter, now); -} - static int task_clock_perf_counter_enable(struct perf_counter *counter) { - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - task_clock_perf_counter_val(counter, 0)); + struct hw_perf_counter *hwc = &counter->hw; + + atomic64_set(&hwc->prev_count, 
task_clock_perf_counter_val(counter, 0)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); + hrtimer_cancel(&counter->hw.hrtimer); + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 0)); +} - task_clock_perf_counter_update(counter, now); +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 1)); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1729,16 +1779,12 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv)) - hw_ops = &perf_ops_cpu_clock; + hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: - if (counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv) - break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1747,6 +1793,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_task_clock; else hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: -- cgit v1.2.3 From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:36 +0100 Subject: perf_counter: add an event_list I noticed that the counter_list only includes top-level counters, thus perf_swcounter_event() will miss sw-counters in groups. Since perf_swcounter_event() also wants an RCU safe list, create a new event_list that includes all counters and uses RCU list ops and use call_rcu to free the counter structure. 
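The payoff of the separate list is on the reader side: perf_swcounter_event() can walk every counter in a context without taking ctx->lock, because entries are added with list_add_rcu(), removed with list_del_rcu() and freed through call_rcu(). The traversal then reduces to the pattern the patch below uses:

	rcu_read_lock();
	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
		if (perf_swcounter_match(counter, event, regs))
			perf_swcounter_add(counter, nr, nmi, regs);
	}
	rcu_read_unlock();
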
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ kernel/perf_counter.c | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dfb4c7ce18b..08c11a6afeb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -187,6 +187,7 @@ struct file; struct perf_counter { #ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; + struct list_head event_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; @@ -220,6 +221,8 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; + + struct rcu_head rcu_head; #endif }; @@ -243,6 +246,7 @@ struct perf_counter_context { struct mutex mutex; struct list_head counter_list; + struct list_head event_list; int nr_counters; int nr_active; int is_active; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f9330d5827c..8d6ecfa64c0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Each CPU has a list of per CPU counters: @@ -72,6 +73,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_tail(&counter->list_entry, &ctx->counter_list); else list_add_tail(&counter->list_entry, &group_leader->sibling_list); + + list_add_rcu(&counter->event_entry, &ctx->event_list); } static void @@ -80,6 +83,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) struct perf_counter *sibling, *tmp; list_del_init(&counter->list_entry); + list_del_rcu(&counter->event_entry); /* * If this was a group counter with sibling counters then @@ -1133,6 +1137,14 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ctx; } +static void free_counter_rcu(struct rcu_head *head) +{ + struct perf_counter *counter; + + counter = container_of(head, struct perf_counter, rcu_head); + kfree(counter); +} + /* * Called when the last reference to the file is gone. 
*/ @@ -1151,7 +1163,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - kfree(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); put_context(ctx); return 0; @@ -1491,22 +1503,16 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, int nmi, struct pt_regs *regs) { struct perf_counter *counter; - unsigned long flags; - if (list_empty(&ctx->counter_list)) + if (list_empty(&ctx->event_list)) return; - spin_lock_irqsave(&ctx->lock, flags); - - /* - * XXX: make counter_list RCU safe - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } - - spin_unlock_irqrestore(&ctx->lock, flags); + rcu_read_unlock(); } void perf_swcounter_event(enum hw_event_types event, u64 nr, @@ -1846,6 +1852,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mutex); INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); @@ -1992,6 +1999,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); ctx->task = task; } -- cgit v1.2.3 From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:11 +0100 Subject: perf_counter: fix uninitialized usage of event_list Impact: fix boot crash When doing the generic context switch event I ran into some early boot hangs, which were caused by inf func recursion (event, fault, event, fault). I eventually tracked it down to event_list not being initialized at the time of the first event. Fix this. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.195392657@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 ++ kernel/perf_counter.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 219748d0026..ca226a91abe 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,6 +124,8 @@ extern struct cred init_cred; # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.event_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ .perf_counter_ctx.lock = \ __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b39456ad74a..4c4e9eb37ab 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1506,7 +1506,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, { struct perf_counter *counter; - if (list_empty(&ctx->event_list)) + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); -- cgit v1.2.3 From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches. 
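In practice the cleanup removes the dedicated context-switch counter ops and reports the event straight from the scheduling path, so it is counted even though it occurs while no task is scheduled in from the counters' point of view. The entire hook added by the diff below is:

	regs = task_pt_regs(task);
	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
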
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/perf_counter.c | 60 ++++----------------------------------------------- kernel/sched.c | 6 ------ 3 files changed, 4 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 75b2fc5306d..7ed41f7c5ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,7 +138,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern u64 cpu_nr_switches(int cpu); extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c4e9eb37ab..99d5930f0a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -710,10 +710,13 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct pt_regs *regs; if (likely(!cpuctx->task_ctx)) return; + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1667,58 +1670,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: context switches - */ - -static u64 get_context_switches(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->nvcsw + curr->nivcsw; - return cpu_nr_switches(smp_processor_id()); -} - -static void context_switches_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void context_switches_perf_counter_read(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static int context_switches_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_context_switches(counter)); - return 0; -} - -static void context_switches_perf_counter_disable(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static const struct hw_perf_counter_ops perf_ops_context_switches = { - .enable = context_switches_perf_counter_enable, - .disable = context_switches_perf_counter_disable, - .read = context_switches_perf_counter_read, -}; - /* * Software counter: cpu migrations */ @@ -1808,11 +1759,8 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: - hw_ops = &perf_ops_generic; - break; case PERF_COUNT_CONTEXT_SWITCHES: - if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_context_switches; + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) diff --git a/kernel/sched.c b/kernel/sched.c index 39e70860216..f76e3c0188a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2900,14 +2900,8 @@ unsigned long nr_active(void) /* * Externally visible per-cpu 
scheduler statistics: - * cpu_nr_switches(cpu) - number of context switches on that cpu * cpu_nr_migrations(cpu) - number of migrations into that cpu */ -u64 cpu_nr_switches(int cpu) -{ - return cpu_rq(cpu)->nr_switches; -} - u64 cpu_nr_migrations(int cpu) { return cpu_rq(cpu)->nr_migrations_in; -- cgit v1.2.3 From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:17 +0100 Subject: perf_counter: hook up the tracepoint events Impact: new perfcounters feature Enable usage of tracepoints as perf counter events. tracepoint event ids can be found in /debug/tracing/event/*/*/id and (for now) are represented as -65536+id in the type field. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.744044174@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ init/Kconfig | 5 +++++ kernel/perf_counter.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 08c11a6afeb..065984c1ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,6 +53,8 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS_MAJ = -7, PERF_SW_EVENTS_MIN = -8, + + PERF_TP_EVENTS_MIN = -65536 }; /* @@ -222,6 +224,7 @@ struct perf_counter { struct perf_data *usrdata; struct perf_data data[2]; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif }; diff --git a/init/Kconfig b/init/Kconfig index 38a2ecd47c3..4f647142f2e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -947,6 +947,11 @@ config PERF_COUNTERS Say Y if unsure. +config EVENT_PROFILE + bool "Tracepoint profile sources" + depends on PERF_COUNTERS && EVENT_TRACER + default y + endmenu config VM_EVENT_COUNTERS diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 97f891ffeb4..0bbe3e45ba0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1152,6 +1152,9 @@ static void free_counter_rcu(struct rcu_head *head) static void free_counter(struct perf_counter *counter) { + if (counter->destroy) + counter->destroy(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1727,6 +1730,45 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { .read = cpu_migrations_perf_counter_read, }; +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ + perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, + task_pt_regs(current)); +} + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + + ftrace_profile_disable(event_id); +} + +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int ret; + + ret = ftrace_profile_enable(event_id); + if (ret) + return NULL; + + counter->destroy = tp_perf_counter_destroy; + + return &perf_ops_generic; +} +#else +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} +#endif + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -1772,6 +1814,7 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_migrations; break; default: + hw_ops = tp_perf_counter_init(counter); break; } -- cgit v1.2.3 
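From user space, using a tracepoint as a counter at this stage means reading the tracepoint's numeric id from debugfs and biasing it by -65536 before passing it in hw_event.type, as the changelog above describes. A rough sketch with includes and error handling omitted; the concrete tracepoint path is only an example, CONFIG_EVENT_PROFILE must be enabled, and the syscall wrapper is assumed as in the earlier sketches:

	struct perf_counter_hw_event hw_event;
	int id, fd;
	FILE *f = fopen("/debug/tracing/event/sched/sched_switch/id", "r");

	fscanf(f, "%d", &id);
	fclose(f);

	memset(&hw_event, 0, sizeof(hw_event));
	hw_event.type = -65536 + id;	/* PERF_TP_EVENTS_MIN + id */

	fd = syscall(__NR_perf_counter_open, &hw_event, 0, -1, -1, 0UL);
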
From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 10 ++-- include/linux/perf_counter.h | 95 ++++++++++++++++++++++++-------------- kernel/perf_counter.c | 83 ++++++++++++++++++++------------- 4 files changed, 117 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5008762e8bf..26f69dc7130 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.type; + ev = counter->hw_event.event_id; if (!counter->hw_event.raw) { if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, sub->hw_event.event_config); perf_store_irq_data(counter, atomic64_read(&sub->count)); } } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d47b71..d844ae41d5a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 065984c1ff5..8f939490550 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -21,56 +21,81 @@ */ /* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: + * hw_event.type */ -enum hw_event_types { +enum perf_event_types { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + 
PERF_TYPE_TRACEPOINT = 2, + /* - * Common hardware events, generalized by the kernel: + * available TYPE space, raw is the max value. */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_TYPE_RAW = 128, +}; +/* + * Generalized performance counter event types, used by the hw_event.event_id + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_ids { /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - PERF_COUNT_PAGE_FAULTS_MIN = -6, - PERF_COUNT_PAGE_FAULTS_MAJ = -7, - - PERF_SW_EVENTS_MIN = -8, + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, +}; - PERF_TP_EVENTS_MIN = -65536 +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { + PERF_COUNT_CPU_CLOCK = 0, + PERF_COUNT_TASK_CLOCK = 1, + PERF_COUNT_PAGE_FAULTS = 2, + PERF_COUNT_CONTEXT_SWITCHES = 3, + PERF_COUNT_CPU_MIGRATIONS = 4, + PERF_COUNT_PAGE_FAULTS_MIN = 5, + PERF_COUNT_PAGE_FAULTS_MAJ = 6, + + PERF_SW_EVENTS_MAX = 7, }; /* * IRQ-notification data record type: */ enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - __s64 type; + union { + struct { + __u64 event_id : 56, + type : 8; + }; + struct { + __u64 raw_event_id : 63, + raw_type : 1; + }; + __u64 event_config; + }; __u64 irq_period; __u64 record_type; @@ -78,7 +103,6 @@ struct perf_counter_hw_event { __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -87,7 +111,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 54; + __reserved_1 : 55; __u32 extra_config_len; __u32 __reserved_4; @@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw && counter->hw_event.type < 0; + return !counter->hw_event.raw_type && + counter->hw_event.type != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); #else static inline void @@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int 
perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, +static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bbe3e45ba0..68a56a68bc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_save_and_restart(struct perf_counter *counter) -{ - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); -} - static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) { struct perf_data *irqdata = counter->irqdata; @@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, counter->hw_event.event_config); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } } @@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); perf_swcounter_interrupt(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, - enum hw_event_types event, - struct pt_regs *regs) + enum perf_event_types type, + u32 event, struct pt_regs *regs) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw) + if (counter->hw_event.raw_type) + return 0; + + if (counter->hw_event.type != type) return 0; - if (counter->hw_event.type != event) + if (counter->hw_event.event_id != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) + enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; @@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, event, regs)) + if (perf_swcounter_match(counter, type, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } rcu_read_unlock(); } -void perf_swcounter_event(enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) +static void __perf_swcounter_event(enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); - if (cpuctx->task_ctx) - perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + if (cpuctx->task_ctx) { + perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, + nr, nmi, regs); + } put_cpu_var(perf_cpu_context); } +void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +{ 
+ __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); +} + static void perf_swcounter_read(struct perf_counter *counter) { perf_swcounter_update(counter); @@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, - task_pt_regs(current)); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); } extern int ftrace_profile_enable(int); @@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; - - ftrace_profile_disable(event_id); + ftrace_profile_disable(counter->hw_event.event_id); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int event_id = counter->hw_event.event_id; int ret; ret = ftrace_profile_enable(event_id); @@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; + counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.type) { + switch (counter->hw_event.event_id) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter) if (!counter->hw_event.exclude_kernel) hw_ops = &perf_ops_cpu_migrations; break; - default: - hw_ops = tp_perf_counter_init(counter); - break; } if (hw_ops) @@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->state = PERF_COUNTER_STATE_OFF; hw_ops = NULL; - if (!hw_event->raw && hw_event->type < 0) - hw_ops = sw_perf_counter_init(counter); - else + + if (hw_event->raw_type) + hw_ops = hw_perf_counter_init(counter); + else switch (hw_event->type) { + case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + hw_ops = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + hw_ops = tp_perf_counter_init(counter); + break; + } if (!hw_ops) { kfree(counter); -- cgit v1.2.3 From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. 
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 51 +----------------- arch/x86/kernel/cpu/perf_counter.c | 53 +------------------ include/linux/perf_counter.h | 2 + kernel/perf_counter.c | 106 ++++++++++++++++++++----------------- 4 files changed, 61 insertions(+), 151 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 26f69dc7130..88b72eb4af1 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -662,41 +662,6 @@ void perf_counter_do_pending(void) } } -/* - * Record data for an irq counter. - * This function was lifted from the x86 code; maybe it should - * go in the core? - */ -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -/* - * Record all the values of the counters in a group - */ -static void perf_handle_group(struct perf_counter *counter) -{ - struct perf_counter *leader, *sub; - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.event_config); - perf_store_irq_data(counter, atomic64_read(&sub->count)); - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. */ - if (record) { - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - counter->wakeup_pending = 1; - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter); - counter->wakeup_pending = 1; - break; - } - } + if (record) + perf_counter_output(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae41d5a..902282d68b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. 
Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8f939490550..a4b76c0175f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68a56a68bc7..f054b8c9bf9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1353,6 +1353,60 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Output + */ + +static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_counter_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, atomic64_read(&sub->count)); + } +} + +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + return; + + case PERF_RECORD_IRQ: + perf_counter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_counter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); +} + /* * Generic software counter infrastructure */ @@ -1395,54 +1449,6 @@ static void 
perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -static void perf_swcounter_handle_group(struct perf_counter *sibling) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.event_config); - perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); - } -} - -static void perf_swcounter_interrupt(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - - case PERF_RECORD_IRQ: - perf_swcounter_store_irq(counter, instruction_pointer(regs)); - break; - - case PERF_RECORD_GROUP: - perf_swcounter_handle_group(counter); - break; - } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); -} - static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { struct perf_counter *counter; @@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) - perf_swcounter_interrupt(counter, 0, regs); + perf_counter_output(counter, 0, regs); hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); @@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_swcounter_interrupt(counter, nmi, regs); + perf_counter_output(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3 From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 21 Mar 2009 15:31:47 +1100 Subject: perf_counter: fix type/event_id layout on big-endian systems Impact: build fix for powerpc Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI") expanded the hw_event.type field into a union of structs containing bitfields. In particular it introduced a type field and a raw_type field, with the intention that the 1-bit raw_type field should overlay the most-significant bit of the 8-bit type field, and in fact perf_counter_alloc() now assumes that (or at least, assumes that raw_type doesn't overlay any of the bits that are 1 in the values of PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}). Unfortunately this is not true on big-endian systems such as PowerPC, where bitfields are laid out from left to right, i.e. from most significant bit to least significant. This means that setting hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1. This fixes it by making the layout depend on whether or not __BIG_ENDIAN_BITFIELD is defined. It's a bit ugly, but that's what we get for using bitfields in a user/kernel ABI. Also, that commit didn't fix up some places in arch/powerpc/kernel/ perf_counter.c where hw_event.raw and hw_event.event_id were used. This fixes them too. 
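To make the aliasing concrete, here is a small illustration (not from the patch; it assumes the usual convention, as on powerpc with gcc, that big-endian targets also allocate bitfields most-significant-bit first):

#include <stdio.h>
#include <stdint.h>

/* same declaration order as the little-endian layout in the ABI header */
union cfg {
	struct {
		uint64_t event_id : 56,
			 type     :  8;
	};
	uint64_t word;
};

int main(void)
{
	union cfg c = { .word = 0 };

	c.type = 1;		/* PERF_TYPE_SOFTWARE */

	/*
	 * little-endian bitfields: type lands in bits 56-63 -> 0x0100000000000000
	 * big-endian bitfields:    type lands in bits 0-7   -> 0x1, the same bit
	 * the 1-bit raw_type field occupies there, so the kernel would see a
	 * "raw" counter request.
	 */
	printf("%#llx\n", (unsigned long long)c.word);
	return 0;
}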
Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 9 +++++---- include/linux/perf_counter.h | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 830ca9c4494..6413d9c0313 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.event_id; - if (!counter->hw_event.raw) { - if (ev >= ppmu->n_generic || - ppmu->generic_events[ev] == 0) + if (!counter->hw_event.raw_type) { + ev = counter->hw_event.event_id; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; + } else { + ev = counter->hw_event.raw_event_id; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a4b76c0175f..98f5990be1e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -15,6 +15,7 @@ #include #include +#include /* * User-space ABI bits: @@ -86,6 +87,7 @@ enum perf_counter_record_type { */ struct perf_counter_hw_event { union { +#ifndef __BIG_ENDIAN_BITFIELD struct { __u64 event_id : 56, type : 8; @@ -94,6 +96,16 @@ struct perf_counter_hw_event { __u64 raw_event_id : 63, raw_type : 1; }; +#else + struct { + __u64 type : 8, + event_id : 56; + }; + struct { + __u64 raw_type : 1, + raw_event_id : 63; + }; +#endif /* __BIT_ENDIAN_BITFIELD */ __u64 event_config; }; -- cgit v1.2.3 From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. 
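The replacement encoding is plain shift-and-mask arithmetic on a single config word. A hedged sketch of how a userspace tool might build that word, mirroring the PERF_COUNTER_*_SHIFT/MASK definitions added below (the helper names are made up for illustration):

#include <stdint.h>

/* these mirror the values introduced in perf_counter.h below */
#define PERF_COUNTER_TYPE_SHIFT		56
#define PERF_COUNTER_EVENT_MASK		((1ULL << 56) - 1)
#define PERF_COUNTER_RAW_MASK		(1ULL << 63)

/* e.g. make_config(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS) */
static inline uint64_t make_config(uint64_t type, uint64_t event_id)
{
	return (type << PERF_COUNTER_TYPE_SHIFT) |
	       (event_id & PERF_COUNTER_EVENT_MASK);
}

/* raw, CPU-specific configuration: just set the most-significant bit */
static inline uint64_t make_raw_config(uint64_t raw_event)
{
	return PERF_COUNTER_RAW_MASK | raw_event;
}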
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 ++-- arch/x86/kernel/cpu/perf_counter.c | 8 ++--- include/linux/perf_counter.h | 74 +++++++++++++++++++++++++------------- kernel/perf_counter.c | 22 +++++++----- 4 files changed, 70 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6413d9c0313..d05651584d4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - if (!counter->hw_event.raw_type) { - ev = counter->hw_event.event_id; + if (!perf_event_raw(&counter->hw_event)) { + ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; } else { - ev = counter->hw_event.raw_event_id; + ev = perf_event_config(&counter->hw_event); } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d68b0..3f95b0cdc55 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98f5990be1e..56099e52970 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -82,32 +82,37 @@ enum perf_counter_record_type { PERF_RECORD_GROUP = 2, }; +#define __PERF_COUNTER_MASK(name) \ + (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ + PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW_BITS 1 +#define PERF_COUNTER_RAW_SHIFT 63 +#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) + +#define PERF_COUNTER_CONFIG_BITS 63 +#define PERF_COUNTER_CONFIG_SHIFT 0 +#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) + +#define PERF_COUNTER_TYPE_BITS 7 +#define PERF_COUNTER_TYPE_SHIFT 56 +#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) + +#define PERF_COUNTER_EVENT_BITS 56 +#define PERF_COUNTER_EVENT_SHIFT 0 +#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - union { -#ifndef __BIG_ENDIAN_BITFIELD - struct { - __u64 event_id : 56, - type : 8; - }; - struct { - __u64 raw_event_id : 63, - raw_type : 1; - }; -#else - struct { - __u64 type : 8, - event_id : 56; - }; - struct { - __u64 raw_type : 1, - raw_event_id : 63; - }; -#endif /* __BIT_ENDIAN_BITFIELD */ - __u64 event_config; - }; + /* + * The MSB of the config word signifies if the rest contains cpu + * specific (raw) counter configuration data, if unset, the next + * 7 bits are an event type and the rest of the bits 
are the event + * identifier. + */ + __u64 config; __u64 irq_period; __u64 record_type; @@ -157,6 +162,27 @@ struct perf_counter_hw_event { struct task_struct; +static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_RAW_MASK; +} + +static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_CONFIG_MASK; +} + +static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +{ + return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + PERF_COUNTER_TYPE_SHIFT; +} + +static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_EVENT_MASK; +} + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw_type && - counter->hw_event.type != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->hw_event) && + perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f054b8c9bf9..ca14fc41ccd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, sub->hw_event.config); perf_counter_store_irq(counter, atomic64_read(&sub->count)); } } @@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw_type) + if (perf_event_raw(&counter->hw_event)) return 0; - if (counter->hw_event.type != type) + if (perf_event_type(&counter->hw_event) != type) return 0; - if (counter->hw_event.event_id != event) + if (perf_event_id(&counter->hw_event) != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(counter->hw_event.event_id); + ftrace_profile_disable(perf_event_id(&counter->hw_event)); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.event_id; + int event_id = perf_event_id(&counter->hw_event); int ret; ret = ftrace_profile_enable(event_id); @@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. 
*/ - switch (counter->hw_event.event_id) { + switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; - if (hw_event->raw_type) + if (perf_event_raw(hw_event)) { hw_ops = hw_perf_counter_init(counter); - else switch (hw_event->type) { + goto done; + } + + switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); break; @@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, kfree(counter); return NULL; } +done: counter->hw_ops = hw_ops; return counter; -- cgit v1.2.3 From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:07 +0100 Subject: perf_counter: avoid recursion Tracepoint events like lock_acquire and software counters like pagefaults can recurse into the perf counter code again, avoid that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.152096433@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ kernel/perf_counter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 56099e52970..18dc17d0a61 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -328,6 +328,13 @@ struct perf_cpu_context { int active_oncpu; int max_pertask; int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ca14fc41ccd..ce34bff07bd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_unlock(); } +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + static void __perf_swcounter_event(enum perf_event_types type, u32 event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swcounter_recursion_context(cpuctx); + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); if (cpuctx->task_ctx) { @@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, nr, nmi, regs); } + barrier(); + (*recursion)--; + +out: put_cpu_var(perf_cpu_context); } -- cgit v1.2.3 From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 18:22:08 +0100 Subject: perf_counter: add an mmap method to allow userspace to read hardware counters Impact: new feature giving performance improvement This adds the ability for userspace to do an mmap on a hardware counter fd and get access to a read-only page that contains the information needed to translate a hardware counter value to the full 64-bit counter value that would be returned by a read on the fd. This is useful on architectures that allow user programs to read the hardware counters, such as PowerPC. 
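Not part of the patch: a minimal sketch of the intended userspace read path, assuming the perf_counter_mmap_page layout added below is visible to the program and that read_pmc() stands in for the architecture's "read hardware counter n" instruction (mfspr on PowerPC, rdpmc on x86). The lock word is bumped before and after each kernel update, so it is odd while an update is in flight:

#define barrier()	asm volatile("" ::: "memory")

unsigned long long read_pmc(unsigned int idx);	/* assumed, arch-specific */

static unsigned long long
read_counter(volatile struct perf_counter_mmap_page *pg)
{
	unsigned long long value;
	unsigned int seq;

	do {
		seq = pg->lock;
		barrier();

		value = pg->offset;
		if (pg->index)	/* counter is live on the PMU; index is 1-based here */
			value += read_pmc(pg->index - 1);

		barrier();
	} while (pg->lock != seq || (seq & 1));	/* retry if the kernel updated it */

	return value;
}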
The mmap will only succeed if the counter is a hardware counter monitoring the current process. On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter and translate it to the full 64-bit value in about 30ns using the mmapped page, compared to about 830ns for the read syscall on the counter, so this does give a significant performance improvement. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.297057964@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 +++ include/linux/perf_counter.h | 15 ++++++++ kernel/perf_counter.c | 76 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d05651584d4..e4349281b07 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); + if (counter->user_page) + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; + if (counter->user_page) + perf_counter_update_userpage(counter); break; } } @@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); + if (counter->user_page) + perf_counter_update_userpage(counter); /* * Finally record data if requested. 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 18dc17d0a61..40b324e91bf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -143,6 +143,17 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -278,6 +289,9 @@ struct perf_counter { int oncpu; int cpu; + /* pointer to page shared with userspace via mmap */ + unsigned long user_page; + /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ @@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_update_userpage(struct perf_counter *counter); extern void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ce34bff07bd..d9cfd902140 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); + free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_counter_mmap_page *userpg; + + if (!counter->user_page) + return; + userpg = (struct perf_counter_mmap_page *) counter->user_page; + + ++userpg->lock; + smp_wmb(); + userpg->index = counter->hw.idx; + userpg->offset = atomic64_read(&counter->count); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + userpg->offset -= atomic64_read(&counter->hw.prev_count); + smp_wmb(); + ++userpg->lock; +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (!counter->user_page) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(counter->user_page); + get_page(vmf->page); + return 0; +} + +static struct vm_operations_struct perf_mmap_vmops = { + .fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_counter *counter = file->private_data; + unsigned long userpg; + + if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + return -EINVAL; + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + /* + * For now, restrict to the case of a hardware counter + * on the current task. 
+ */ + if (is_software_counter(counter) || counter->task != current) + return -EINVAL; + + userpg = counter->user_page; + if (!userpg) { + userpg = get_zeroed_page(GFP_KERNEL); + mutex_lock(&counter->mutex); + if (counter->user_page) { + free_page(userpg); + userpg = counter->user_page; + } else { + counter->user_page = userpg; + } + mutex_unlock(&counter->mutex); + if (!userpg) + return -ENOMEM; + } + + perf_counter_update_userpage(counter); + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, }; /* -- cgit v1.2.3 From 9ab772cd535c4b256a577eae516f9c7462346b2d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function, in which we take and hold a spin_lock if we drop the atomic to 0, this function takes and holds the mutex if we dec the atomic to 0. Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif -- cgit v1.2.3 From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:10 +0100 Subject: perf_counter: new output ABI - part 1 Impact: Rework the perfcounter output ABI. Use sys_read() only for instant data and provide mmap() output for all async overflow data. The first mmap() determines the size of the output buffer. The mmap() size must be a PAGE_SIZE multiple of 1+pages, where pages must be a power of 2 or 0. Further mmap()s of the same fd must have the same size. Once all maps are gone, you can again mmap() with a new size. In case of 0 extra pages there is no data output and the first page only contains meta data. When there are data pages, a poll() event will be generated for each full page of data. Furthermore, the output is circular. This means that although 1 page is a valid configuration, it's useless, since we'll start overwriting it the instant we report a full page. Future work will focus on the output format (currently maintained) where we'll likely want each entry denoted by a header which includes a type and length.
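A hedged sketch of what a consumer of this ABI might do with the sizing rule just described (fd is assumed to be an already-open counter descriptor; error handling and the actual record parsing are omitted):

#include <stddef.h>
#include <sys/mman.h>
#include <unistd.h>

struct perf_counter_mmap_page;		/* from include/linux/perf_counter.h */

/* map one metadata page plus 2^n data pages; returns the start of the data */
static void *map_counter_output(int fd, unsigned int n_data_pages,
				struct perf_counter_mmap_page **metap)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	size_t len = (1 + n_data_pages) * page_size;
	void *base;

	/* the kernel insists on a shared, read-only mapping */
	base = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return NULL;

	*metap = base;			/* data_head in here tracks the writer */
	return (char *)base + page_size;	/* circular data area */
}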
Further future work will allow to splice() the fd, also containing the async overflow data -- splice() would be mutually exclusive with mmap() of the data. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.470536358@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 +- include/linux/perf_counter.h | 36 ++- kernel/perf_counter.c | 464 ++++++++++++++++++++----------------- 3 files changed, 263 insertions(+), 246 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index e4349281b07..d48596ab655 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); break; } } @@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); /* * Finally record data if requested. diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40b324e91bf..2b5e66d5ebd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -152,6 +152,8 @@ struct perf_counter_mmap_page { __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + + __u32 data_head; /* head in the data section */ }; #ifdef __KERNEL__ @@ -218,21 +220,6 @@ struct hw_perf_counter { #endif }; -/* - * Hardcoded buffer length limit for now, for IRQ-fed events: - */ -#define PERF_DATA_BUFLEN 2048 - -/** - * struct perf_data - performance counter IRQ data sampling ... 
- */ -struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; -}; - struct perf_counter; /** @@ -256,6 +243,14 @@ enum perf_counter_active_state { struct file; +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; + atomic_t head; + struct perf_counter_mmap_page *user_page; + void *data_pages[0]; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -289,16 +284,15 @@ struct perf_counter { int oncpu; int cpu; - /* pointer to page shared with userspace via mmap */ - unsigned long user_page; + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; - /* read() / irq related data */ + /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ int wakeup_pending; - struct perf_data *irqdata; - struct perf_data *usrdata; - struct perf_data data[2]; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d9cfd902140..0dfe91094fd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4,7 +4,8 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * - * For licencing details see kernel-base/COPYING + * + * For licensing details see kernel-base/COPYING */ #include @@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } -/* - * Cross CPU call to switch performance data pointers - */ -static void __perf_switch_irq_data(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. 
- */ - if (ctx->task) { - if (cpuctx->task_ctx != ctx) - return; - spin_lock(&ctx->lock); - } - - /* Change the pointer NMI safe */ - atomic_long_set((atomic_long_t *)&counter->irqdata, - (unsigned long) counter->usrdata); - counter->usrdata = oldirqdata; - - if (ctx->task) - spin_unlock(&ctx->lock); -} - -static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - struct task_struct *task = ctx->task; - - if (!task) { - smp_call_function_single(counter->cpu, - __perf_switch_irq_data, - counter, 1); - return counter->usrdata; - } - -retry: - spin_lock_irq(&ctx->lock); - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - counter->irqdata = counter->usrdata; - counter->usrdata = oldirqdata; - spin_unlock_irq(&ctx->lock); - return oldirqdata; - } - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_switch_irq_data, counter); - /* Might have failed, because task was scheduled out */ - if (counter->irqdata == oldirqdata) - goto retry; - - return counter->usrdata; -} - static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { u64 cntval; - if (count != sizeof(cntval)) + if (count < sizeof(cntval)) return -EINVAL; /* @@ -1210,122 +1150,21 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); } -static ssize_t -perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) -{ - if (!usrdata->len) - return 0; - - count = min(count, (size_t)usrdata->len); - if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) - return -EFAULT; - - /* Adjust the counters */ - usrdata->len -= count; - if (!usrdata->len) - usrdata->rd_idx = 0; - else - usrdata->rd_idx += count; - - return count; -} - -static ssize_t -perf_read_irq_data(struct perf_counter *counter, - char __user *buf, - size_t count, - int nonblocking) -{ - struct perf_data *irqdata, *usrdata; - DECLARE_WAITQUEUE(wait, current); - ssize_t res, res2; - - irqdata = counter->irqdata; - usrdata = counter->usrdata; - - if (usrdata->len + irqdata->len >= count) - goto read_pending; - - if (nonblocking) - return -EAGAIN; - - spin_lock_irq(&counter->waitq.lock); - __add_wait_queue(&counter->waitq, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (usrdata->len + irqdata->len >= count) - break; - - if (signal_pending(current)) - break; - - if (counter->state == PERF_COUNTER_STATE_ERROR) - break; - - spin_unlock_irq(&counter->waitq.lock); - schedule(); - spin_lock_irq(&counter->waitq.lock); - } - __remove_wait_queue(&counter->waitq, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&counter->waitq.lock); - - if (usrdata->len + irqdata->len < count && - counter->state != PERF_COUNTER_STATE_ERROR) - return -ERESTARTSYS; -read_pending: - mutex_lock(&counter->mutex); - - /* Drain pending data first: */ - res = perf_copy_usrdata(usrdata, buf, count); - if (res < 0 || res == count) - goto out; - - /* Switch irq buffer: */ - usrdata = perf_switch_irq_data(counter); - res2 = perf_copy_usrdata(usrdata, buf + res, count - res); - if (res2 < 0) { - if (!res) 
- res = -EFAULT; - } else { - res += res2; - } -out: - mutex_unlock(&counter->mutex); - - return res; -} - static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return perf_read_hw(counter, buf, count); - - case PERF_RECORD_IRQ: - case PERF_RECORD_GROUP: - return perf_read_irq_data(counter, buf, count, - file->f_flags & O_NONBLOCK); - } - return -EINVAL; + return perf_read_hw(counter, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = 0; - unsigned long flags; + unsigned int events = POLLIN; poll_wait(file, &counter->waitq, wait); - spin_lock_irqsave(&counter->waitq.lock, flags); - if (counter->usrdata->len || counter->irqdata->len) - events |= POLLIN; - spin_unlock_irqrestore(&counter->waitq.lock, flags); - return events; } @@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -void perf_counter_update_userpage(struct perf_counter *counter) +static void __perf_counter_update_userpage(struct perf_counter *counter, + struct perf_mmap_data *data) { - struct perf_counter_mmap_page *userpg; - - if (!counter->user_page) - return; - userpg = (struct perf_counter_mmap_page *) counter->user_page; + struct perf_counter_mmap_page *userpg = data->user_page; + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); ++userpg->lock; smp_wmb(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + + userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; + preempt_enable(); +} + +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + __perf_counter_update_userpage(counter, data); + rcu_read_unlock(); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_counter *counter = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; - if (!counter->user_page) - return VM_FAULT_SIGBUS; + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; - vmf->page = virt_to_page(counter->user_page); + if ((unsigned)nr > data->nr_pages) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } get_page(vmf->page); + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&counter->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + + rcu_assign_pointer(counter->data, 
data); + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data = container_of(rcu_head, + struct perf_mmap_data, rcu_head); + int i; + + free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ + struct perf_mmap_data *data = counter->data; + + WARN_ON(atomic_read(&counter->mmap_count)); + + rcu_assign_pointer(counter->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&counter->mmap_count, + &counter->mmap_mutex)) { + perf_mmap_data_free(counter); + mutex_unlock(&counter->mmap_mutex); + } } static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, .fault = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; - unsigned long userpg; + unsigned long vma_size; + unsigned long nr_pages; + unsigned long locked, lock_limit; + int ret = 0; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (nr_pages == 0 || !is_power_of_2(nr_pages)) return -EINVAL; - /* - * For now, restrict to the case of a hardware counter - * on the current task. 
- */ - if (is_software_counter(counter) || counter->task != current) + if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - userpg = counter->user_page; - if (!userpg) { - userpg = get_zeroed_page(GFP_KERNEL); - mutex_lock(&counter->mutex); - if (counter->user_page) { - free_page(userpg); - userpg = counter->user_page; - } else { - counter->user_page = userpg; - } - mutex_unlock(&counter->mutex); - if (!userpg) - return -ENOMEM; - } + if (vma->vm_pgoff != 0) + return -EINVAL; + + locked = vma_size >> PAGE_SHIFT; + locked += vma->vm_mm->locked_vm; - perf_counter_update_userpage(counter); + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) + return -EPERM; + + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) + goto out; + + WARN_ON(counter->data); + ret = perf_mmap_data_alloc(counter, nr_pages); + if (!ret) + atomic_set(&counter->mmap_count, 1); +out: + mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; - return 0; + + return ret; } static const struct file_operations perf_fops = { @@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = { * Output */ -static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +static int perf_output_write(struct perf_counter *counter, int nmi, + void *buf, ssize_t size) { - struct perf_data *irqdata = counter->irqdata; + struct perf_mmap_data *data; + unsigned int offset, head, nr; + unsigned int len; + int ret, wakeup; - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; + rcu_read_lock(); + ret = -ENOSPC; + data = rcu_dereference(counter->data); + if (!data) + goto out; + + if (!data->nr_pages) + goto out; + + ret = -EINVAL; + if (size > PAGE_SIZE) + goto out; + + do { + offset = head = atomic_read(&data->head); + head += sizeof(u64); + } while (atomic_cmpxchg(&data->head, offset, head) != offset); + + wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); - *p = data; - irqdata->len += sizeof(u64); + nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); + offset &= PAGE_SIZE - 1; + + len = min_t(unsigned int, PAGE_SIZE - offset, size); + memcpy(data->data_pages[nr] + offset, buf, len); + size -= len; + + if (size) { + nr = (nr + 1) & (data->nr_pages - 1); + memcpy(data->data_pages[nr], buf + len, size); + } + + /* + * generate a poll() wakeup for every page boundary crossed + */ + if (wakeup) { + __perf_counter_update_userpage(counter, data); + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); } + ret = 0; +out: + rcu_read_unlock(); + + return ret; } -static void perf_counter_handle_group(struct perf_counter *counter) +static void perf_output_simple(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + u64 entry; + + entry = instruction_pointer(regs); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); +} + +struct group_entry { + u64 event; + u64 counter; +}; + +static void perf_output_group(struct perf_counter *counter, int nmi) { struct perf_counter *leader, *sub; leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { + struct group_entry entry; + if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.config); - perf_counter_store_irq(counter, 
atomic64_read(&sub->count)); + + entry.event = sub->hw_event.config; + entry.counter = atomic64_read(&sub->count); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); } } @@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter, return; case PERF_RECORD_IRQ: - perf_counter_store_irq(counter, instruction_pointer(regs)); + perf_output_simple(counter, nmi, regs); break; case PERF_RECORD_GROUP: - perf_counter_handle_group(counter); + perf_output_group(counter, nmi); break; } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); } /* @@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + mutex_init(&counter->mmap_mutex); + INIT_LIST_HEAD(&counter->child_list); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; -- cgit v1.2.3 From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Mar 2009 13:18:16 +0100 Subject: perf_counter: fix perf_poll() Impact: fix kerneltop 100% CPU usage Only return a poll event when there's actually been one, poll_wait() doesn't actually wait for the waitq you pass it, it only enqueues you on it. Only once all FDs have been iterated and none of thm returned a poll-event will it schedule(). Also make it return POLL_HUP when there's not mmap() area to read from. Further, fix a silly bug in the write code. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arjan van de Ven Orig-LKML-Reference: <1237897096.24918.181.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b5e66d5ebd..48212c15b7d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -246,6 +246,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; + atomic_t wakeup; atomic_t head; struct perf_counter_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0dfe91094fd..affe227d56a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1161,7 +1161,16 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = POLLIN; + struct perf_mmap_data *data; + unsigned int events; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + events = atomic_xchg(&data->wakeup, 0); + else + events = POLL_HUP; + rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1425,7 +1434,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, do { offset = head = atomic_read(&data->head); - head += sizeof(u64); + head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1446,6 +1455,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, * generate a poll() wakeup for every page boundary crossed */ if (wakeup) { + atomic_xchg(&data->wakeup, POLL_IN); __perf_counter_update_userpage(counter, data); if (nmi) { counter->wakeup_pending = 
1; -- cgit v1.2.3 From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:23 +0100 Subject: perf_counter: output objects Provide a {type,size} header for each output entry. This should provide extensible output, and the ability to mix multiple streams. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.831607932@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +++++++++ kernel/perf_counter.c | 53 ++++++++++++++++++++++++++++++++++---------- 2 files changed, 52 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48212c15b7d..c256635377d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,16 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +struct perf_event_header { + __u32 type; + __u32 size; +}; + +enum perf_event_type { + PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -260,6 +270,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; + int nr_siblings; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0422fd9bf62..d76e3112d38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -75,8 +75,10 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) */ if (counter->group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); - else + else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } list_add_rcu(&counter->event_entry, &ctx->event_list); } @@ -89,6 +91,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); + if (counter->group_leader != counter) + counter->group_leader->nr_siblings--; + /* * If this was a group counter with sibling counters then * upgrade the siblings to singleton counters by adding them @@ -381,9 +386,11 @@ static int is_software_only_group(struct perf_counter *leader) if (!is_software_counter(leader)) return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) if (!is_software_counter(counter)) return 0; + return 1; } @@ -1480,6 +1487,9 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; } +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { @@ -1514,34 +1524,53 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - u64 entry; + struct { + struct perf_event_header header; + u64 ip; + } event; - entry = instruction_pointer(regs); + event.header.type = PERF_EVENT_IP; + event.header.size = sizeof(event); + event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_write(counter, nmi, &event, sizeof(event)); } -struct group_entry { - u64 event; - u64 counter; -}; - static void perf_output_group(struct perf_counter *counter, int nmi) { + struct perf_output_handle handle; + struct perf_event_header header; struct 
perf_counter *leader, *sub; + unsigned int size; + struct { + u64 event; + u64 counter; + } entry; + int ret; + + size = sizeof(header) + counter->nr_siblings * sizeof(entry); + + ret = perf_output_begin(&handle, counter, size); + if (ret) + return; + + header.type = PERF_EVENT_GROUP; + header.size = size; + + perf_output_put(&handle, header); leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { - struct group_entry entry; - if (sub != counter) sub->hw_ops->read(sub); entry.event = sub->hw_event.config; entry.counter = atomic64_read(&sub->count); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_put(&handle, entry); } + + perf_output_end(&handle, nmi); } void perf_counter_output(struct perf_counter *counter, -- cgit v1.2.3 From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:25 +0100 Subject: perf_counter: optionally provide the pid/tid of the sampled task Allow cpu wide counters to profile userspace by providing what process the sample belongs to. This raises the first issue with the output type, lots of these options: group, tid, callchain, etc.. are non-exclusive and could be combined, suggesting a bitfield. However, things like the mmap() data stream doesn't fit in that. How to split the type field... Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.013775235@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- kernel/perf_counter.c | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c256635377d..7fdbdf8be77 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,8 +127,9 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ + include_tid : 1, /* include the tid */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; @@ -164,6 +165,8 @@ struct perf_event_header { enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + + __PERF_EVENT_TID = 0x100, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7669afe82cc..f3e1b27bc1b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1528,16 +1528,30 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + unsigned int size; struct { struct perf_event_header header; u64 ip; + u32 pid, tid; } event; event.header.type = PERF_EVENT_IP; - event.header.size = sizeof(event); event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &event, sizeof(event)); + size = sizeof(event); + + if (counter->hw_event.include_tid) { + /* namespace issues */ + event.pid = current->group_leader->pid; + event.tid = current->pid; + + event.header.type |= __PERF_EVENT_TID; + } else + size -= sizeof(u64); + + event.header.size = size; + + perf_output_write(counter, nmi, &event, size); } static void perf_output_group(struct perf_counter *counter, int nmi) -- cgit v1.2.3 From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Mar 2009 22:46:58 +1100 Subject: perf_counter: record time running and time enabled for each counter Impact: new functionality Currently, if there are 
more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled. This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.) Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count: true_count_estimate = count * total_time_enabled / total_time_running This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0. This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly. In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running. This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses. It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.) 
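Before comparing overheads further, here is a minimal, hedged userspace sketch of the read-and-scale sequence described above. The counter fd, and the assumption that both PERF_FORMAT_TOTAL_TIME_ENABLED and PERF_FORMAT_TOTAL_TIME_RUNNING were requested in hw_event.read_format at creation time, are illustrative assumptions rather than part of this patch; the kernel side is in the perf_read_hw() hunk further below.

#include <stdint.h>
#include <unistd.h>

/*
 * Hedged sketch: read the counter value plus the two time fields and
 * scale the count. Assumes the counter was created with both
 * PERF_FORMAT_TOTAL_TIME_* bits set in hw_event.read_format.
 */
static int read_scaled_count(int counter_fd, uint64_t *estimate)
{
	uint64_t values[3];	/* count, total_time_enabled, total_time_running */

	if (read(counter_fd, values, sizeof(values)) != sizeof(values))
		return -1;

	if (values[2] == 0) {
		/* total_time_running == 0: the counter never got onto the CPU. */
		*estimate = 0;
		return -1;
	}

	/* true_count_estimate = count * total_time_enabled / total_time_running */
	*estimate = values[0] * values[1] / values[2];
	return 0;
}

The task_clock-per-group alternative above, then, is both more work for userspace and measurably slower.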
In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation). [ v2: address review comments by Andrew Morton. ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Andrew Morton Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 + include/linux/perf_counter.h | 53 +++++++++++++ kernel/perf_counter.c | 157 ++++++++++++++++++++++++++++++++----- 3 files changed, 191 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d48596ab655..df007fe0cc0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) { counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; + counter->tstamp_running += counter->ctx->time_now - + counter->tstamp_stopped; if (is_software_counter(counter)) counter->hw_ops->enable(counter); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7fdbdf8be77..6bf67ce1762 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,16 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1, + PERF_FORMAT_TOTAL_TIME_RUNNING = 2, +}; + /* * Hardware event to monitor via a performance monitoring counter: */ @@ -281,6 +291,32 @@ struct perf_counter { enum perf_counter_active_state prev_state; atomic64_t count; + /* + * These are the total time in nanoseconds that the counter + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task counter) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the counter is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the counter was enabled + * tstamp_running: the notional time when the counter was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * counter was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -291,6 +327,13 @@ struct perf_counter { struct perf_counter *parent; struct list_head child_list; + /* + * These accumulate total time (in nanoseconds) that children + * counters have been enabled and running, respectively. 
+ */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + /* * Protect attach/detach and child_list: */ @@ -339,6 +382,16 @@ struct perf_counter_context { int nr_active; int is_active; struct task_struct *task; + + /* + * time_now is the current time in nanoseconds since an arbitrary + * point in the past. For per-task counters, this is based on the + * task clock, and for per-cpu counters it is based on the cpu clock. + * time_lost is an offset from the task/cpu clock, used to make it + * appear that time only passes while the context is scheduled in. + */ + u64 time_now; + u64 time_lost; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 95e02575546..3b862a7988c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_stopped = ctx->time_now; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -251,6 +252,60 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Get the current time for this context. + * If this is a task context, we use the task's task clock, + * or for a per-cpu context, we use the cpu clock. + */ +static u64 get_context_time(struct perf_counter_context *ctx, int update) +{ + struct task_struct *curr = ctx->task; + + if (!curr) + return cpu_clock(smp_processor_id()); + + return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx, int update) +{ + ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + counter->total_time_enabled = ctx->time_now - + counter->tstamp_enabled; + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time_now; + counter->total_time_running = run_end - counter->tstamp_running; + } +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + /* * Cross CPU call to disable a performance counter */ @@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx, 1); + update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. 
*/ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } spin_unlock_irq(&ctx->lock); } @@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } + counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + if (!is_software_counter(counter)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; + counter->tstamp_enabled = ctx->time_now; + counter->tstamp_running = ctx->time_now; + counter->tstamp_stopped = ctx->time_now; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); /* * Protect the list operation against NMI by disabling the @@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info) */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - ctx->nr_counters++; - counter->prev_state = PERF_COUNTER_STATE_OFF; + add_counter_to_ctx(counter, ctx); /* * Don't put the counter on if it is disabled or if @@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } if (!err && !ctx->task && cpuctx->max_pertask) @@ -548,10 +621,8 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) { - list_add_counter(counter, ctx); - ctx->nr_counters++; - } + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); spin_unlock_irq(&ctx->lock); } @@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } unlock: @@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. 
*/ - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; + } out: spin_unlock_irq(&ctx->lock); } @@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; + update_context_time(ctx, 0); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; + /* + * Add any time since the last sched_out to the lost time + * so it doesn't get included in the total_time_enabled and + * total_time_running measures for counters in the context. + */ + ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + flags = hw_perf_save_disable(); /* @@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_ERROR; + } } list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -902,8 +990,10 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) + if (counter->state != PERF_COUNTER_STATE_ERROR) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } } hw_perf_restore(perf_flags); @@ -946,6 +1036,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; unsigned long flags; curr_rq_lock_irq_save(&flags); + if (ctx->is_active) + update_context_time(ctx, 1); counter->hw_ops->read(counter); + update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); } @@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); } return atomic64_read(&counter->count); @@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 cntval; - - if (count < sizeof(cntval)) - return -EINVAL; + u64 values[3]; + int n; /* * Return end-of-file for a read on a counter that is in @@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return 0; mutex_lock(&counter->mutex); - cntval = perf_counter_read(counter); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + 
atomic64_read(&counter->child_total_time_running); mutex_unlock(&counter->mutex); - return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; } static ssize_t @@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter, * Link it up in the child's context: */ child_counter->task = child; - list_add_counter(child_counter, child_ctx); - child_ctx->nr_counters++; + add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; /* @@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter, * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); /* * Remove this counter from the parent's list @@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child, if (child != current) { wait_task_inactive(child, 0); list_del_init(&child_counter->list_entry); + update_counter_times(child_counter); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); group_sched_out(child_counter, cpuctx, child_ctx); + update_counter_times(child_counter); list_del_init(&child_counter->list_entry); -- cgit v1.2.3 From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. 
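As a rough, hedged illustration of the cmpxchg()-based single linked list described above: the sketch below is standalone, uses the GCC __atomic builtins as stand-ins for the kernel's cmpxchg()/xchg(), and invents its own names; the real per-cpu implementation is in the kernel/perf_counter.c hunk further below.

struct pending_entry {
	struct pending_entry *next;
};

/* Sentinel: marks the end of the list; a NULL ->next means "not queued". */
#define PENDING_TAIL ((struct pending_entry *)-1UL)

static struct pending_entry *pending_head = PENDING_TAIL;

static void pending_queue(struct pending_entry *entry)
{
	struct pending_entry *expected = NULL;

	/* Claim the entry; a non-NULL ->next means it is already queued. */
	if (!__atomic_compare_exchange_n(&entry->next, &expected, PENDING_TAIL,
					 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
		return;

	/* Lock-free push onto the singly linked list. */
	do {
		expected = pending_head;
		entry->next = expected;
	} while (!__atomic_compare_exchange_n(&pending_head, &expected, entry,
					      0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}

static void pending_run(void (*wakeup)(struct pending_entry *entry))
{
	struct pending_entry *list;

	/* Detach the whole list with one atomic exchange, then walk it. */
	list = __atomic_exchange_n(&pending_head, PENDING_TAIL, __ATOMIC_SEQ_CST);
	while (list != PENDING_TAIL) {
		struct pending_entry *next = list->next;

		list->next = NULL;
		wakeup(list);
		list = next;
	}
}

The NMI-safety comes from the fact that neither the push nor the detach takes a lock; the only shared mutation is the compare-and-swap on the list head.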
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/hw_irq.h | 4 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/perf_counter.c | 22 +------ arch/x86/include/asm/perf_counter.h | 5 +- arch/x86/include/asm/thread_info.h | 4 +- arch/x86/kernel/cpu/perf_counter.c | 29 -------- arch/x86/kernel/signal.c | 6 -- include/linux/perf_counter.h | 15 +++-- kernel/perf_counter.c | 128 +++++++++++++++++++++++++++++++++--- kernel/timer.c | 3 + 10 files changed, 142 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index cb32d571c9c..20a44d0c9fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags) struct irq_chip; #ifdef CONFIG_PERF_COUNTERS -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { unsigned long x; @@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void); #else -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { return 0; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 469e9635ff0..2cd471f92fe 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) { + if (test_perf_counter_pending()) { clear_perf_counter_pending(); perf_counter_do_pending(); } diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df007fe0cc0..cde720fc495 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter) return &power_perf_ops; } -/* - * Handle wakeups. - */ -void perf_counter_do_pending(void) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter && counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; long val; - int need_wakeup = 0, found = 0; + int found = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * immediately; otherwise we'll have do the wakeup when interrupts * get soft-enabled. 
*/ - if (get_perf_counter_pending() && regs->softe) { + if (test_perf_counter_pending() && regs->softe) { irq_enter(); clear_perf_counter_pending(); perf_counter_do_pending(); diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 1662043b340..e2b0e66b235 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,8 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() \ - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); +#define set_perf_counter_pending() do { } while (0) +#define clear_perf_counter_pending() do { } while (0) +#define test_perf_counter_pending() (0) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 3ffd5d2a367..8820a73ae09 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,7 +83,6 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,7 +106,6 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -141,7 +139,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0cdc55..7aab177fb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } - counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a92c9..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, 
__u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bf67ce1762..0d833228eee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -275,6 +275,10 @@ struct perf_mmap_data { void *data_pages[0]; }; +struct perf_wakeup_entry { + struct perf_wakeup_entry *next; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -350,7 +354,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ - int wakeup_pending; + struct perf_wakeup_entry wakeup; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; @@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); @@ -461,7 +465,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } @@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(u32 event, u64 nr, - int nmi, struct pt_regs *regs) { } +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b862a7988c..f70ff80e79d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void perf_pending_sync(struct perf_counter *counter); + static void free_counter(struct perf_counter *counter) { + perf_pending_sync(counter); + if (counter->destroy) counter->destroy(counter); @@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = { .mmap = perf_mmap, }; +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) { + (void)atomic_xchg(&data->wakeup, POLL_IN); + __perf_counter_update_userpage(counter, data); + } + rcu_read_unlock(); + + wake_up_all(&counter->waitq); +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. 
+ * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_counter *counter) +{ + struct perf_wakeup_entry **head; + struct perf_wakeup_entry *prev, *next; + + if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + return; + + head = &get_cpu_var(perf_wakeup_head); + + do { + prev = counter->wakeup.next = *head; + next = &counter->wakeup; + } while (cmpxchg(head, prev, next) != prev); + + set_perf_counter_pending(); + + put_cpu_var(perf_wakeup_head); +} + +static int __perf_pending_run(void) +{ + struct perf_wakeup_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + struct perf_counter *counter = container_of(list, + struct perf_counter, wakeup); + + list = list->next; + + counter->wakeup.next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + perf_counter_wakeup(counter); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->wakeup.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + /* * Output */ @@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { - (void)atomic_xchg(&handle->data->wakeup, POLL_IN); - __perf_counter_update_userpage(handle->counter, handle->data); - if (nmi) { - handle->counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&handle->counter->waitq); + if (nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); } rcu_read_unlock(); } @@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; counter->ctx = ctx; diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e..672ca25fbc4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); + perf_counter_do_pending(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) -- cgit v1.2.3 From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:03 +0200 Subject: perf_counter: fix update_userpage() It just occured to me it is possible to have multiple contending updates of the userpage (mmap information vs overflow vs counter). This would break the seqlock logic. 
It appear the arch code uses this from NMI context, so we cannot possibly serialize its use, therefore separate the data_head update from it and let it return to its original use. The arch code needs to make sure there are no contending callers by disabling the counter before using it -- powerpc appears to do this nicely. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.241410660@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 35 +++++++++++++++++++++++++++++++++++ kernel/perf_counter.c | 38 +++++++++++++++++++++++--------------- 2 files changed, 58 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0d833228eee..8ac18852dcf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,10 +160,45 @@ struct perf_counter_hw_event { struct perf_counter_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * The index and offset should be read atomically using the seqlock: + * + * __u32 seq, index; + * __s64 offset; + * + * again: + * rmb(); + * seq = pc->lock; + * + * if (unlikely(seq & 1)) { + * cpu_relax(); + * goto again; + * } + * + * index = pc->index; + * offset = pc->offset; + * + * rmb(); + * if (pc->lock != seq) + * goto again; + * + * After this, index contains architecture specific counter index + 1, + * so that 0 means unavailable, offset contains the value to be added + * to the result of the raw timer read to obtain this counter's value. + */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + /* + * Control data for the mmap() data buffer. + * + * User-space reading this value should issue an rmb(), on SMP capable + * platforms, after reading this value -- see perf_counter_wakeup(). + */ __u32 data_head; /* head in the data section */ }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f70ff80e79d..c95e92329b9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1316,10 +1316,22 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -static void __perf_counter_update_userpage(struct perf_counter *counter, - struct perf_mmap_data *data) +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. 
+ */ +void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_counter_mmap_page *userpg = data->user_page; + struct perf_mmap_data *data; + struct perf_counter_mmap_page *userpg; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + userpg = data->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -1333,20 +1345,10 @@ static void __perf_counter_update_userpage(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; preempt_enable(); -} - -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - __perf_counter_update_userpage(counter, data); +unlock: rcu_read_unlock(); } @@ -1547,7 +1549,13 @@ void perf_counter_wakeup(struct perf_counter *counter) data = rcu_dereference(counter->data); if (data) { (void)atomic_xchg(&data->wakeup, POLL_IN); - __perf_counter_update_userpage(counter, data); + /* + * Ensure all data writes are issued before updating the + * user-space data head information. The matching rmb() + * will be in userspace after reading this value. + */ + smp_wmb(); + data->user_page->data_head = atomic_read(&data->head); } rcu_read_unlock(); -- cgit v1.2.3 From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:05 +0200 Subject: perf_counter: executable mmap() information Currently the profiling information returns userspace IPs but no way to correlate them to userspace code. Userspace could look into /proc/$pid/maps but that might not be current or even present anymore at the time of analyzing the IPs. Therefore provide means to track the mmap information and provide it in the output stream. XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing. 
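For orientation, a hedged sketch of the record layout a reader of the event stream would see for these mmap/munmap events. The struct name is made up; the fields mirror the perf_mmap_event::event structure added in the kernel/perf_counter.c hunk below, and header.size covers the fixed part plus the file name padded up to a multiple of 8 bytes.

struct mmap_record {				/* illustrative name only */
	struct perf_event_header	header;	/* type: PERF_EVENT_MMAP or PERF_EVENT_MUNMAP */
	__u32				pid;	/* group leader pid */
	__u32				tid;	/* thread id */
	__u64				start;	/* start address of the mapping */
	__u64				len;	/* length of the mapping */
	__u64				pgoff;	/* page offset into the file */
	char				filename[];	/* path, length rounded up to a u64 multiple */
};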
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Andrew Morton Orig-LKML-Reference: <20090330171023.417259499@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++- kernel/perf_counter.c | 145 +++++++++++++++++++++++++++++++++++++++++++ mm/mmap.c | 10 +++ 3 files changed, 177 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8ac18852dcf..037a81145ac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -137,9 +137,11 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ + include_tid : 1, /* include the tid */ + mmap : 1, /* include mmap data */ + munmap : 1, /* include munmap data */ - __reserved_1 : 54; + __reserved_1 : 52; __u32 extra_config_len; __u32 __reserved_4; @@ -211,6 +213,9 @@ enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + PERF_EVENT_MMAP = 2, + PERF_EVENT_MUNMAP = 3, + __PERF_EVENT_TID = 0x100, }; @@ -491,6 +496,12 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + +extern void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -511,6 +522,15 @@ static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + +static inline void +perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + +static inline void +perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c95e92329b9..f35e89e3d6a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -1843,6 +1844,150 @@ void perf_counter_output(struct perf_counter *counter, } } +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct file *file; + char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size); + + if (ret) + return; + + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle, 0); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->hw_event.mmap && + mmap_event->event.header.type == PERF_EVENT_MMAP) + return 1; + + if (counter->hw_event.munmap && + mmap_event->event.header.type == PERF_EVENT_MUNMAP) + return 1; + + return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING 
|| list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct file *file = mmap_event->file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = dentry_path(file->f_dentry, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name), sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + + kfree(buf); +} + +void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MUNMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + /* * Generic software counter infrastructure */ diff --git a/mm/mmap.c b/mm/mmap.c index 4a3841186c1..1df63f614f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1223,6 +1224,9 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: + if (vm_flags & VM_EXEC) + perf_counter_mmap(addr, len, pgoff, file); + mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { @@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) do { long nrpages = vma_pages(vma); + if (vma->vm_flags & VM_EXEC) { + perf_counter_munmap(vma->vm_start, + nrpages << PAGE_SHIFT, + vma->vm_pgoff, vma->vm_file); + } + mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); -- cgit v1.2.3 From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:12 +0200 Subject: perf_counter: re-arrange the perf_event_type Breaks ABI yet again :-) Change the event type so that [0, 2^31-1] are regular event types, but [2^31, 2^32-1] forms a bitmask for overflow events. 
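A hedged sketch of how a consumer might split the re-arranged type space; the helper is invented for illustration and assumes the enum values added in the include/linux/perf_counter.h hunk below, plus <stdio.h> for the diagnostics.

#include <stdio.h>

static void decode_event_type(__u32 type)
{
	if (type & PERF_EVENT_OVERFLOW) {
		/*
		 * Upper half of the type space: the remaining bits form a
		 * bitmask describing the overflow record's optional fields.
		 */
		int has_ip  = (type & __PERF_EVENT_IP)  != 0;
		int has_tid = (type & __PERF_EVENT_TID) != 0;

		printf("overflow record: ip=%d tid=%d\n", has_ip, has_tid);
		return;
	}

	/* Lower half: plain enumerated record types. */
	switch (type) {
	case PERF_EVENT_GROUP:
		printf("group read record\n");
		break;
	case PERF_EVENT_MMAP:
		printf("mmap record\n");
		break;
	case PERF_EVENT_MUNMAP:
		printf("munmap record\n");
		break;
	}
}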
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.047961770@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++-- kernel/perf_counter.c | 56 ++++++++++++++++++++------------------------ 2 files changed, 29 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 037a81145ac..edf5bfb7ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -210,13 +210,15 @@ struct perf_event_header { }; enum perf_event_type { - PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, PERF_EVENT_MMAP = 2, PERF_EVENT_MUNMAP = 3, - __PERF_EVENT_TID = 0x100, + PERF_EVENT_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = 1UL << 30, + __PERF_EVENT_TID = 1UL << 29, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4471e7e2c10..d93e9ddf784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1754,50 +1754,44 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) -{ - struct perf_output_handle handle; - int ret; - - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - goto out; - - perf_output_copy(&handle, buf, size); - perf_output_end(&handle); - -out: - return ret; -} - static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - unsigned int size; + int ret; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; struct { - struct perf_event_header header; - u64 ip; u32 pid, tid; - } event; + } tid_entry; - event.header.type = PERF_EVENT_IP; - event.ip = instruction_pointer(regs); + header.type = PERF_EVENT_OVERFLOW; + header.size = sizeof(header); - size = sizeof(event); + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); if (counter->hw_event.include_tid) { /* namespace issues */ - event.pid = current->group_leader->pid; - event.tid = current->pid; + tid_entry.pid = current->group_leader->pid; + tid_entry.tid = current->pid; - event.header.type |= __PERF_EVENT_TID; - } else - size -= sizeof(u64); + header.type |= __PERF_EVENT_TID; + header.size += sizeof(tid_entry); + } - event.header.size = size; + ret = perf_output_begin(&handle, counter, header.size, nmi); + if (ret) + return; + + perf_output_put(&handle, header); + perf_output_put(&handle, ip); + + if (counter->hw_event.include_tid) + perf_output_put(&handle, tid_entry); - perf_output_write(counter, nmi, &event, size); + perf_output_end(&handle); } static void perf_output_group(struct perf_counter *counter, int nmi) -- cgit v1.2.3 From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:14 +0200 Subject: perf_counter: provide generic callchain bits Provide the generic callchain support bits. If hw_event->callchain is set the arch specific perf_callchain() function is called upon to provide a perf_callchain_entry structure filled with the current callchain. If it does so, it is added to the overflow output event. 
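For context, a hedged sketch of the shape an architecture-specific override of the weak perf_callchain() hook could take; the per-cpu buffer name and the elided unwind loop are illustrative assumptions, not the actual x86 or powerpc code.

static DEFINE_PER_CPU(struct perf_callchain_entry, callchain_entry);

struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
	struct perf_callchain_entry *entry = &__get_cpu_var(callchain_entry);

	entry->nr = 0;

	/* Record the sampled IP first, then walk the stack for callers. */
	if (entry->nr < MAX_STACK_DEPTH)
		entry->ip[entry->nr++] = instruction_pointer(regs);

	/* ... arch-specific unwind loop appending return addresses ... */

	return entry;
}

perf_output_simple() then copies (1 + entry->nr) * sizeof(u64) bytes of this structure into the overflow record, as the callchain_size computation in the hunk below shows.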
Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.254266860@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++++++++- kernel/perf_counter.c | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index edf5bfb7ff5..43083afffe0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,8 +140,9 @@ struct perf_counter_hw_event { include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + callchain : 1, /* add callchain data */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 __reserved_4; @@ -219,6 +220,7 @@ enum perf_event_type { PERF_EVENT_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = 1UL << 30, __PERF_EVENT_TID = 1UL << 29, + __PERF_EVENT_CALLCHAIN = 1UL << 28, }; #ifdef __KERNEL__ @@ -504,6 +506,15 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +#define MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + u64 nr; + u64 ip[MAX_STACK_DEPTH]; +}; + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d93e9ddf784..860cdc26bd7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1653,6 +1653,17 @@ void perf_counter_do_pending(void) __perf_pending_run(); } +/* + * Callchain support -- arch specific + */ + +struct perf_callchain_entry * +__attribute__((weak)) +perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + /* * Output */ @@ -1764,6 +1775,8 @@ static void perf_output_simple(struct perf_counter *counter, struct { u32 pid, tid; } tid_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; header.type = PERF_EVENT_OVERFLOW; header.size = sizeof(header); @@ -1781,6 +1794,17 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (counter->hw_event.callchain) { + callchain = perf_callchain(regs); + + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= __PERF_EVENT_CALLCHAIN; + header.size += callchain_size; + } + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1791,6 +1815,9 @@ static void perf_output_simple(struct perf_counter *counter, if (counter->hw_event.include_tid) perf_output_put(&handle, tid_entry); + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + perf_output_end(&handle); } -- cgit v1.2.3 From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:11:59 +0200 Subject: perf_counter: move the event overflow output bits to record_type Per suggestion from Paul, move the event overflow bits to record_type and sanitize the enums a bit. 
Breaks the ABI -- again ;-) Suggested-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.151921176@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 50 +++++++++++---------- kernel/perf_counter.c | 101 +++++++++++++++++-------------------------- 2 files changed, 68 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 43083afffe0..06a6fba9f53 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,15 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - #define __PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -102,6 +93,17 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.record_type to request information + * in the overflow packets. + */ +enum perf_counter_record_format { + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_GROUP = 1U << 2, + PERF_RECORD_CALLCHAIN = 1U << 3, +}; + /* * Bits that can be set in hw_event.read_format to request that * reads on the counter should return the indicated quantities, @@ -125,8 +127,8 @@ struct perf_counter_hw_event { __u64 config; __u64 irq_period; - __u64 record_type; - __u64 read_format; + __u32 record_type; + __u32 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -137,12 +139,10 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ - callchain : 1, /* add callchain data */ - __reserved_1 : 51; + __reserved_1 : 53; __u32 extra_config_len; __u32 __reserved_4; @@ -212,15 +212,21 @@ struct perf_event_header { enum perf_event_type { - PERF_EVENT_GROUP = 1, - - PERF_EVENT_MMAP = 2, - PERF_EVENT_MUNMAP = 3, + PERF_EVENT_MMAP = 1, + PERF_EVENT_MUNMAP = 2, - PERF_EVENT_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = 1UL << 30, - __PERF_EVENT_TID = 1UL << 29, - __PERF_EVENT_CALLCHAIN = 1UL << 28, + /* + * Half the event type space is reserved for the counter overflow + * bitfields, as found in hw_event.record_type. 
+ * + * These events will have types of the form: + * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + */ + PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = PERF_RECORD_IP, + __PERF_EVENT_TID = PERF_RECORD_TID, + __PERF_EVENT_GROUP = PERF_RECORD_GROUP, + __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 860cdc26bd7..995063df910 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void perf_output_simple(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; + u64 record_type = counter->hw_event.record_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; struct { u32 pid, tid; } tid_entry; + struct { + u64 event; + u64 counter; + } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; - header.type = PERF_EVENT_OVERFLOW; + header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); - ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; - header.size += sizeof(ip); + if (record_type & PERF_RECORD_IP) { + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); + } - if (counter->hw_event.include_tid) { + if (record_type & PERF_RECORD_TID) { /* namespace issues */ tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; @@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } - if (counter->hw_event.callchain) { + if (record_type & PERF_RECORD_GROUP) { + header.type |= __PERF_EVENT_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (record_type & PERF_RECORD_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { @@ -1810,69 +1823,35 @@ static void perf_output_simple(struct perf_counter *counter, return; perf_output_put(&handle, header); - perf_output_put(&handle, ip); - if (counter->hw_event.include_tid) - perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_IP) + perf_output_put(&handle, ip); - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - - perf_output_end(&handle); -} - -static void perf_output_group(struct perf_counter *counter, int nmi) -{ - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_counter *leader, *sub; - unsigned int size; - struct { - u64 event; - u64 counter; - } entry; - int ret; - - size = sizeof(header) + counter->nr_siblings * sizeof(entry); + if (record_type & PERF_RECORD_TID) + perf_output_put(&handle, tid_entry); - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - return; + if (record_type & PERF_RECORD_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; - header.type = PERF_EVENT_GROUP; - header.size = size; + perf_output_put(&handle, nr); - perf_output_put(&handle, header); + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); + group_entry.event = sub->hw_event.config; + group_entry.counter = atomic64_read(&sub->count); - entry.event = 
sub->hw_event.config; - entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, entry); + perf_output_put(&handle, group_entry); + } } - perf_output_end(&handle); -} - -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return; - - case PERF_RECORD_IRQ: - perf_output_simple(counter, nmi, regs); - break; + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); - case PERF_RECORD_GROUP: - perf_output_group(counter, nmi); - break; - } + perf_output_end(&handle); } /* -- cgit v1.2.3 From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:01 +0200 Subject: perf_counter: per event wakeups By request, provide a way to request a wakeup every 'n' events instead of every page of output. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.323309784@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 06a6fba9f53..5428ba120d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -145,7 +145,7 @@ struct perf_counter_hw_event { __reserved_1 : 53; __u32 extra_config_len; - __u32 __reserved_4; + __u32 wakeup_events; /* wakeup every n events */ __u64 __reserved_2; __u64 __reserved_3; @@ -321,6 +321,7 @@ struct perf_mmap_data { int nr_pages; atomic_t wakeup; atomic_t head; + atomic_t events; struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 995063df910..9bcab10e735 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) + int wakeup_events = handle->counter->hw_event.wakeup_events; + + if (wakeup_events) { + int events = atomic_inc_return(&handle->data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &handle->data->events); + __perf_output_wakeup(handle); + } + } else if (handle->wakeup) __perf_output_wakeup(handle); rcu_read_unlock(); } -- cgit v1.2.3 From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: perf_counter: add more context information Put in counts to tell which ips belong to what context. 
----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a160ca..c74e20d593a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5428ba120d7..90cce0c74a0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 254 struct perf_callchain_entry { - u64 nr; + u32 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9bcab10e735..f105a6e696c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); + callchain_size = (2 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3 From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:04 +0200 Subject: perf_counter: update mmap() counter read Paul noted that we don't need SMP barriers for the mmap() counter read because its always on the same cpu (otherwise you can't access the hw counter anyway). So remove the SMP barriers and replace them with regular compiler barriers. Further, update the comment to include a race free method of reading said hardware counter. The primary change is putting the pmc_read inside the seq-loop, otherwise we can still race and read rubbish. 
Noticed-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.577951445@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 22 ++++++++++------------ kernel/perf_counter.c | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 90cce0c74a0..f2b914de3f0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -167,30 +167,28 @@ struct perf_counter_mmap_page { /* * Bits needed to read the hw counters in user-space. * - * The index and offset should be read atomically using the seqlock: - * - * __u32 seq, index; - * __s64 offset; + * u32 seq; + * s64 count; * * again: - * rmb(); * seq = pc->lock; - * * if (unlikely(seq & 1)) { * cpu_relax(); * goto again; * } * - * index = pc->index; - * offset = pc->offset; + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * rmb(); + * barrier(); * if (pc->lock != seq) * goto again; * - * After this, index contains architecture specific counter index + 1, - * so that 0 means unavailable, offset contains the value to be added - * to the result of the raw timer read to obtain this counter's value. + * NOTE: for obvious reason this only works on self-monitoring + * processes. */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f105a6e696c..2a5d4f52556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter) */ preempt_disable(); ++userpg->lock; - smp_wmb(); + barrier(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - smp_wmb(); + barrier(); ++userpg->lock; preempt_enable(); unlock: -- cgit v1.2.3 From a2e87d06ddbe6e6fdb8d6d2e5e985efe4efb07dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:44:59 +0200 Subject: perf_counter: update mmap() counter read, take 2 Update the userspace read method. Paul noted that: - userspace cannot observe ->lock & 1 on the same cpu. - we need a barrier() between reading ->lock and ->index to ensure we read them in that prticular order. 
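Written out as a compilable user-space sketch of the documented sequence; pmc_read(), read_counter_fd() and the barrier() compiler-barrier macro are assumed user-side helpers, and pc points at the mmap()ed perf_counter_mmap_page.

/* Self-monitoring read sketch -- only valid on the counter's own CPU. */
static __s64 read_self_counter(volatile struct perf_counter_mmap_page *pc, int fd)
{
	__u32 seq;
	__s64 count;

	do {
		seq = pc->lock;
		barrier();			/* read ->lock before ->index */

		if (!pc->index)			/* 0: not readable from user space */
			return read_counter_fd(fd);	/* assumed fallback to read() */

		count  = pmc_read(pc->index - 1);	/* assumed rdpmc-style helper */
		count += pc->offset;

		barrier();			/* re-check the seqcount last */
	} while (pc->lock != seq);

	return count;
}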
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.368446033@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f2b914de3f0..e22ab47a2f4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -170,22 +170,18 @@ struct perf_counter_mmap_page { * u32 seq; * s64 count; * - * again: - * seq = pc->lock; - * if (unlikely(seq & 1)) { - * cpu_relax(); - * goto again; - * } + * do { + * seq = pc->lock; * - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * barrier(); - * if (pc->lock != seq) - * goto again; + * barrier(); + * } while (pc->lock != seq); * * NOTE: for obvious reason this only works on self-monitoring * processes. -- cgit v1.2.3 From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:00 +0200 Subject: perf_counter: add more context information Change the callchain context entries to u16, so as to gain some space. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.457320003@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e22ab47a2f4..f9d5cf0bfbd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -507,10 +507,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 254 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u32 nr, hv, kernel, user; + u16 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2a5d4f52556..727624db507 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1657,9 +1657,7 @@ void perf_counter_do_pending(void) * Callchain support -- arch specific */ -struct perf_callchain_entry * -__attribute__((weak)) -perf_callchain(struct pt_regs *regs) +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) { return NULL; } @@ -1819,7 +1817,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (2 + callchain->nr) * sizeof(u64); + callchain_size = (1 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; -- cgit v1.2.3 From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:01 +0200 Subject: perf_counter: SIGIO support Provide support for fcntl() I/O availability signals. 
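For a self-profiling task this might be hooked up with plain fcntl() calls, roughly as in the sketch below; fd is a counter file descriptor and a SIGIO handler is assumed to be installed elsewhere.

/* Sketch: ask for SIGIO on counter data availability. */
#include <fcntl.h>
#include <unistd.h>

static int request_sigio(int fd)
{
	if (fcntl(fd, F_SETOWN, getpid()) < 0)
		return -1;

	return fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC);
}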
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.579788800@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f9d5cf0bfbd..8d5d11b8d01 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -238,6 +238,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -398,6 +399,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; + struct fasync_struct *fasync; /* optional: for NMIs */ struct perf_wakeup_entry wakeup; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 727624db507..c58cc64319e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1526,6 +1526,22 @@ out: return ret; } +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct perf_counter *counter = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, @@ -1533,6 +1549,7 @@ static const struct file_operations perf_fops = { .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, .mmap = perf_mmap, + .fasync = perf_fasync, }; /* @@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_lock(); data = rcu_dereference(counter->data); if (data) { - (void)atomic_xchg(&data->wakeup, POLL_IN); + atomic_set(&data->wakeup, POLL_IN); /* * Ensure all data writes are issued before updating the * user-space data head information. The matching rmb() @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); + kill_fasync(&counter->fasync, SIGIO, POLL_IN); } /* -- cgit v1.2.3 From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:02 +0200 Subject: perf_counter: generalize pending infrastructure Prepare the pending infrastructure to do more than wakeups. 
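After this change a user of the infrastructure queues an entry together with the function that should run once the pending work is processed, roughly as in this sketch; the callback is hypothetical, the API names are from the patch below.

/* Sketch: deferring work from NMI context via the generalized API. */
static void my_pending_func(struct perf_pending_entry *entry)
{
	/* runs later, from perf_counter_do_pending() */
}

static void defer_from_nmi(struct perf_counter *counter)
{
	perf_pending_queue(&counter->pending, my_pending_func);
}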
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.634732847@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++--- kernel/perf_counter.c | 53 ++++++++++++++++++++++++++------------------ 2 files changed, 36 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8d5d11b8d01..977fb15a53f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -321,8 +321,9 @@ struct perf_mmap_data { void *data_pages[0]; }; -struct perf_wakeup_entry { - struct perf_wakeup_entry *next; +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); }; /** @@ -401,7 +402,7 @@ struct perf_counter { wait_queue_head_t waitq; struct fasync_struct *fasync; /* optional: for NMIs */ - struct perf_wakeup_entry wakeup; + struct perf_pending_entry pending; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c58cc64319e..0a2ade2e4f1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1581,6 +1581,14 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } +static void perf_pending_wakeup(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + perf_counter_wakeup(counter); +} + /* * Pending wakeups * @@ -1590,45 +1598,47 @@ void perf_counter_wakeup(struct perf_counter *counter) * single linked list and use cmpxchg() to add entries lockless. */ -#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) -static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { PENDING_TAIL, }; -static void perf_pending_queue(struct perf_counter *counter) +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) { - struct perf_wakeup_entry **head; - struct perf_wakeup_entry *prev, *next; + struct perf_pending_entry **head; - if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) return; - head = &get_cpu_var(perf_wakeup_head); + entry->func = func; + + head = &get_cpu_var(perf_pending_head); do { - prev = counter->wakeup.next = *head; - next = &counter->wakeup; - } while (cmpxchg(head, prev, next) != prev); + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); set_perf_counter_pending(); - put_cpu_var(perf_wakeup_head); + put_cpu_var(perf_pending_head); } static int __perf_pending_run(void) { - struct perf_wakeup_entry *list; + struct perf_pending_entry *list; int nr = 0; - list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); while (list != PENDING_TAIL) { - struct perf_counter *counter = container_of(list, - struct perf_counter, wakeup); + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; list = list->next; - counter->wakeup.next = NULL; + func = entry->func; + entry->next = NULL; /* * Ensure we observe the unqueue before we issue the wakeup, * so that we won't be waiting forever. 
@@ -1636,7 +1646,7 @@ static int __perf_pending_run(void) */ smp_wmb(); - perf_counter_wakeup(counter); + func(entry); nr++; } @@ -1658,7 +1668,7 @@ static inline int perf_not_pending(struct perf_counter *counter) * so that we do not miss the wakeup. -- see perf_pending_handle() */ smp_rmb(); - return counter->wakeup.next == NULL; + return counter->pending.next == NULL; } static void perf_pending_sync(struct perf_counter *counter) @@ -1695,9 +1705,10 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { - if (handle->nmi) - perf_pending_queue(handle->counter); - else + if (handle->nmi) { + perf_pending_queue(&handle->counter->pending, + perf_pending_wakeup); + } else perf_counter_wakeup(handle->counter); } -- cgit v1.2.3 From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 29 +++++++++++++++++++++++------ 4 files changed, 28 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0a4d14f279a..f88c35d0710 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. 
*/ if (record) - perf_counter_output(counter, 1, regs); + perf_counter_overflow(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 438415866fe..1116a41bc7b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 977fb15a53f..ca2d4df29e0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs); +extern int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a2ade2e4f1..195e976eb07 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +static void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; u64 record_type = counter->hw_event.record_type; @@ -2033,6 +2033,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_counter_output(counter, nmi, regs); + return 0; +} + /* * Generic software counter infrastructure */ @@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { + enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; @@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) !counter->hw_event.exclude_user) regs = task_pt_regs(current); - if (regs) - perf_counter_output(counter, 0, regs); + if (regs) { + if (perf_counter_overflow(counter, 0, regs)) + ret = HRTIMER_NORESTART; + } hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); - return HRTIMER_RESTART; + return ret; } static void perf_swcounter_overflow(struct perf_counter *counter, @@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + /* soft-disable the counter */ + ; + } static int perf_swcounter_match(struct perf_counter *counter, -- cgit v1.2.3 From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:06 +0200 Subject: perf_counter: PERF_RECORD_TIME By popular request, provide means to log a timestamp along with the counter overflow event. 
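From the user side this is just another record_type bit at counter creation time, as in this sketch; the config and period values are placeholders.

/* Sketch: request IP + timestamp in every overflow record. */
struct perf_counter_hw_event hw_event = {
	.config		= 0,			/* caller-chosen event */
	.irq_period	= 100000,		/* example sampling period */
	.record_type	= PERF_RECORD_IP | PERF_RECORD_TIME,
};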
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.024173282@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ca2d4df29e0..928a7fae096 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,7 @@ enum perf_counter_record_format { PERF_RECORD_TID = 1U << 1, PERF_RECORD_GROUP = 1U << 2, PERF_RECORD_CALLCHAIN = 1U << 3, + PERF_RECORD_TIME = 1U << 4, }; /* @@ -221,6 +222,7 @@ enum perf_event_type { __PERF_EVENT_TID = PERF_RECORD_TID, __PERF_EVENT_GROUP = PERF_RECORD_GROUP, __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, + __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c841563de04..19990d1f021 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1826,6 +1826,7 @@ static void perf_counter_output(struct perf_counter *counter, } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; + u64 time; header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); @@ -1862,6 +1863,16 @@ static void perf_counter_output(struct perf_counter *counter, } } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= __PERF_EVENT_TIME; + header.size += sizeof(u64); + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1895,6 +1906,9 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + perf_output_end(&handle); } -- cgit v1.2.3 From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:07 +0200 Subject: perf_counter: counter overflow limit Provide means to auto-disable the counter after 'n' overflow events. Create the counter with hw_event.disabled = 1, and then issue an ioctl(fd, PERF_COUNTER_IOC_REFRESH, n); to set the limit and enable the counter.
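User-space usage might look like the following sketch; fd is a counter created with hw_event.disabled = 1, and error handling is elided.

/* Sketch: allow n more overflows, then the counter auto-disables. */
#include <sys/ioctl.h>

static int arm_counter(int fd, unsigned int n)
{
	return ioctl(fd, PERF_COUNTER_IOC_REFRESH, n);
}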
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.083139737@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 ++++++++--- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 50 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 928a7fae096..ef4dcbff75a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -155,8 +155,9 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) /* * Structure of the page that can be mapped via mmap @@ -403,9 +404,14 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; - /* optional: for NMIs */ + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_disable; struct perf_pending_entry pending; + atomic_t event_limit; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 19990d1f021..c05e10354bc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -744,6 +744,12 @@ static void perf_counter_enable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } +static void perf_counter_refresh(struct perf_counter *counter, int refresh) +{ + atomic_add(refresh, &counter->event_limit); + perf_counter_enable(counter); +} + /* * Enable a counter and all its children. */ @@ -1311,6 +1317,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_DISABLE: perf_counter_disable_family(counter); break; + case PERF_COUNTER_IOC_REFRESH: + perf_counter_refresh(counter, arg); + break; default: err = -ENOTTY; } @@ -1590,14 +1599,6 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } -static void perf_pending_wakeup(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - perf_counter_wakeup(counter); -} - /* * Pending wakeups * @@ -1607,6 +1608,22 @@ static void perf_pending_wakeup(struct perf_pending_entry *entry) * single linked list and use cmpxchg() to add entries lockless. 
*/ +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { @@ -1715,8 +1732,9 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { if (handle->nmi) { + handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, - perf_pending_wakeup); + perf_pending_counter); } else perf_counter_wakeup(handle->counter); } @@ -2063,8 +2081,21 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + int events = atomic_read(&counter->event_limit); + int ret = 0; + + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); + } + perf_counter_output(counter, nmi, regs); - return 0; + return ret; } /* -- cgit v1.2.3 From 0c593b3411341e3a05a61f5527df36ab02bd11e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:08 +0200 Subject: perf_counter: comment the perf_event_type stuff Describe the event format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.211174347@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ef4dcbff75a..81220188d05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -208,6 +208,20 @@ struct perf_event_header { enum perf_event_type { + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, @@ -217,6 +231,24 @@ enum perf_event_type { * * These events will have types of the form: * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && __PERF_EVENT_IP + * { u32 pid, tid; } && __PERF_EVENT_TID + * + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * + * { u64 time; } && __PERF_EVENT_TIME + * }; */ PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = PERF_RECORD_IP, -- cgit v1.2.3 From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:09 +0200 Subject: perf_counter: change event definition Currently the definition of an event is slightly ambiguous. We have wakeup events, for poll() and SIGIO, which are either generated when a record crosses a page boundary (hw_events.wakeup_events == 0), or every wakeup_events new records. 
Now a record can be either a counter overflow record, or a number of different things, like the mmap PROT_EXEC region notifications. Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only considers counter overflows. This patch changes then wakeup_events and SIGIO notification to only consider overflow events. Furthermore it changes the SIGIO notification to report SIGHUP when the event limit is reached and the counter will be disabled. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.266679874@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81220188d05..0f5a4005048 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -439,6 +439,7 @@ struct perf_counter { /* delayed work for NMIs and such */ int pending_wakeup; + int pending_kill; int pending_disable; struct perf_pending_entry pending; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c05e10354bc..8c8eaf0625f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1596,7 +1596,11 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); - kill_fasync(&counter->fasync, SIGIO, POLL_IN); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } } /* @@ -1727,6 +1731,7 @@ struct perf_output_handle { unsigned int head; int wakeup; int nmi; + int overflow; }; static inline void __perf_output_wakeup(struct perf_output_handle *handle) @@ -1741,7 +1746,7 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi) + int nmi, int overflow) { struct perf_mmap_data *data; unsigned int offset, head; @@ -1751,8 +1756,9 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->counter = counter; - handle->nmi = nmi; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; if (!data->nr_pages) goto fail; @@ -1816,7 +1822,7 @@ static void perf_output_end(struct perf_output_handle *handle) { int wakeup_events = handle->counter->hw_event.wakeup_events; - if (wakeup_events) { + if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&handle->data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &handle->data->events); @@ -1891,7 +1897,7 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - ret = perf_output_begin(&handle, counter, header.size, nmi); + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1955,7 +1961,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, { struct perf_output_handle handle; int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0); + int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; @@ -2084,8 +2090,10 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&counter->event_limit)) { ret = 1; + counter->pending_kill = POLL_HUP; if (nmi) { 
counter->pending_disable = 1; perf_pending_queue(&counter->pending, -- cgit v1.2.3 From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:10 +0200 Subject: perf_counter: rework context time Since perf_counter_context is switched along with tasks, we can maintain the context time without using the task runtime clock. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.353552838@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++---- kernel/perf_counter.c | 78 +++++++++++++++++++------------------------- 2 files changed, 37 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0f5a4005048..7f5d353d78a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -477,14 +477,10 @@ struct perf_counter_context { struct task_struct *task; /* - * time_now is the current time in nanoseconds since an arbitrary - * point in the past. For per-task counters, this is based on the - * task clock, and for per-cpu counters it is based on the cpu clock. - * time_lost is an offset from the task/cpu clock, used to make it - * appear that time only passes while the context is scheduled in. + * Context clock, runs when context enabled. */ - u64 time_now; - u64 time_lost; + u64 time; + u64 timestamp; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8c8eaf0625f..84d85ab4e16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -117,7 +117,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_stopped = ctx->time; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -253,27 +253,20 @@ retry: spin_unlock_irq(&ctx->lock); } -/* - * Get the current time for this context. - * If this is a task context, we use the task's task clock, - * or for a per-cpu context, we use the cpu clock. - */ -static u64 get_context_time(struct perf_counter_context *ctx, int update) +static inline u64 perf_clock(void) { - struct task_struct *curr = ctx->task; - - if (!curr) - return cpu_clock(smp_processor_id()); - - return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; + return cpu_clock(smp_processor_id()); } /* * Update the record of the current time in a context. 
*/ -static void update_context_time(struct perf_counter_context *ctx, int update) +static void update_context_time(struct perf_counter_context *ctx) { - ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; } /* @@ -284,15 +277,17 @@ static void update_counter_times(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; u64 run_end; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - counter->total_time_enabled = ctx->time_now - - counter->tstamp_enabled; - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time_now; - counter->total_time_running = run_end - counter->tstamp_running; - } + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; } /* @@ -332,7 +327,7 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx, 1); + update_context_time(ctx); update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); @@ -426,7 +421,7 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + counter->tstamp_running += ctx->time - counter->tstamp_stopped; if (!is_software_counter(counter)) cpuctx->active_oncpu++; @@ -493,9 +488,9 @@ static void add_counter_to_ctx(struct perf_counter *counter, list_add_counter(counter, ctx); ctx->nr_counters++; counter->prev_state = PERF_COUNTER_STATE_OFF; - counter->tstamp_enabled = ctx->time_now; - counter->tstamp_running = ctx->time_now; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; } /* @@ -522,7 +517,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); /* * Protect the list operation against NMI by disabling the @@ -648,13 +643,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -737,8 +732,8 @@ static void perf_counter_enable(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; } out: spin_unlock_irq(&ctx->lock); @@ -778,7 +773,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; - update_context_time(ctx, 0); + update_context_time(ctx); flags = hw_perf_save_disable(); if 
(ctx->nr_active) { @@ -883,12 +878,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; - /* - * Add any time since the last sched_out to the lost time - * so it doesn't get included in the total_time_enabled and - * total_time_running measures for counters in the context. - */ - ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + ctx->timestamp = perf_clock(); flags = hw_perf_save_disable(); @@ -1043,8 +1033,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1113,7 +1103,7 @@ static void __read(void *info) curr_rq_lock_irq_save(&flags); if (ctx->is_active) - update_context_time(ctx, 1); + update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); -- cgit v1.2.3 From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:12 +0200 Subject: perf_counter: remove rq->lock usage Now that all the task runtime clock users are gone, remove the ugly rq->lock usage from perf counters, which solves the nasty deadlock seen when a software task clock counter was read from an NMI overflow context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.531137582@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 2 -- kernel/perf_counter.c | 42 ++++++++++++++++-------------------------- kernel/sched.c | 20 -------------------- 3 files changed, 16 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b6d2887a5d8..080d1fd461d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,8 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern void curr_rq_lock_irq_save(unsigned long *flags); -extern void curr_rq_unlock_irq_restore(unsigned long *flags); extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 56b7eb53d67..f4f7596f784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -172,8 +172,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); counter_sched_out(counter, cpuctx, ctx); @@ -198,8 +197,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -319,8 +317,7 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * If the counter is on, turn it off. 
@@ -336,8 +333,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -515,8 +511,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); /* @@ -565,8 +560,7 @@ static void __perf_install_in_context(void *info) unlock: hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -641,8 +635,7 @@ static void __perf_counter_enable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); counter->prev_state = counter->state; @@ -678,8 +671,7 @@ static void __perf_counter_enable(void *info) } unlock: - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -971,7 +963,7 @@ int perf_counter_task_disable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -992,9 +984,7 @@ int perf_counter_task_disable(void) hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -1011,7 +1001,7 @@ int perf_counter_task_enable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -1037,7 +1027,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); return 0; } @@ -1095,12 +1085,12 @@ static void __read(void *info) struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -2890,7 +2880,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -2903,7 +2893,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } parent_counter = child_counter->parent; diff --git a/kernel/sched.c b/kernel/sched.c index f76e3c0188a..0de2f814fb1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void curr_rq_lock_irq_save(unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_save(*flags); - rq = cpu_rq(smp_processor_id()); - spin_lock(&rq->lock); -} - -void curr_rq_unlock_irq_restore(unsigned long *flags) - __releases(rq->lock) -{ - struct rq *rq; - - rq = cpu_rq(smp_processor_id()); - spin_unlock(&rq->lock); - local_irq_restore(*flags); -} - 
void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); -- cgit v1.2.3 From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:26 +0200 Subject: perf_counter: provide misc bits in the event header Limit the size of each record to 64k (or should we count in multiples of u64 and have a 512K limit?), this gives 16 bits or spare room in the header, which we can use for misc bits, so as to not have to grow the record with u64 every time we have a few bits to report. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.769271806@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++++- kernel/perf_counter.c | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7f5d353d78a..5bd8817b12d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,9 +201,13 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) + struct perf_event_header { __u32 type; - __u32 size; + __u16 misc; + __u16 size; }; enum perf_event_type { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 84a39081344..4af98f943d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1831,6 +1831,9 @@ static void perf_counter_output(struct perf_counter *counter, header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); + header.misc = user_mode(regs) ? + PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); header.type |= __PERF_EVENT_IP; -- cgit v1.2.3 From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:27 +0200 Subject: perf_counter: use misc field to widen type Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that we can have the full 32bit for PERF_RECORD_ bits. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.891867663@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 28 ++++++++++------------------ kernel/perf_counter.c | 15 ++++++++------- 2 files changed, 18 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5bd8817b12d..4809ae18a94 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,8 +201,9 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -230,36 +231,27 @@ enum perf_event_type { PERF_EVENT_MUNMAP = 2, /* - * Half the event type space is reserved for the counter overflow - * bitfields, as found in hw_event.record_type. 
- * - * These events will have types of the form: - * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * will be PERF_RECORD_* * * struct { * struct perf_event_header header; * - * { u64 ip; } && __PERF_EVENT_IP - * { u32 pid, tid; } && __PERF_EVENT_TID + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, * kernel, * user; - * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * - * { u64 time; } && __PERF_EVENT_TIME + * { u64 time; } && PERF_RECORD_TIME * }; */ - PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = PERF_RECORD_IP, - __PERF_EVENT_TID = PERF_RECORD_TID, - __PERF_EVENT_GROUP = PERF_RECORD_GROUP, - __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, - __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4af98f943d3..bf12df6f353 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1828,15 +1828,16 @@ static void perf_counter_output(struct perf_counter *counter, int callchain_size = 0; u64 time; - header.type = PERF_EVENT_COUNTER_OVERFLOW; + header.type = 0; header.size = sizeof(header); - header.misc = user_mode(regs) ? + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= user_mode(regs) ? PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; + header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } @@ -1845,12 +1846,12 @@ static void perf_counter_output(struct perf_counter *counter, tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; - header.type |= __PERF_EVENT_TID; + header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); } if (record_type & PERF_RECORD_GROUP) { - header.type |= __PERF_EVENT_GROUP; + header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -1861,7 +1862,7 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= __PERF_EVENT_CALLCHAIN; + header.type |= PERF_RECORD_CALLCHAIN; header.size += callchain_size; } } @@ -1872,7 +1873,7 @@ static void perf_counter_output(struct perf_counter *counter, */ time = sched_clock(); - header.type |= __PERF_EVENT_TIME; + header.type |= PERF_RECORD_TIME; header.size += sizeof(u64); } -- cgit v1.2.3 From 8740f9418c78dcad694b46ab25d1645d5aef1f5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:29 +0200 Subject: perf_counter: add some comments Add a few comments because I was forgetting what field what for what functionality. 
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.036984214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4809ae18a94..8bf764fc622 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -344,10 +344,12 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; - int nr_pages; - atomic_t wakeup; - atomic_t head; - atomic_t events; + int nr_pages; /* nr of data pages */ + + atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t head; /* write position */ + atomic_t events; /* event limit */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; -- cgit v1.2.3 From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_counter.h | 16 +++++++- kernel/perf_counter.c | 93 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/exec.c b/fs/exec.c index e015c0b5a08..bf47ed0278f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); + perf_counter_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bf764fc622..a70a55f2759 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -142,8 +142,9 @@ struct perf_counter_hw_event { exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + comm : 1, /* include comm data */ - __reserved_1 : 53; + __reserved_1 : 52; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -230,6 +231,16 @@ enum perf_event_type { PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -545,6 +556,8 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +extern void perf_counter_comm(struct task_struct *tsk); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -583,6 +596,7 @@ static inline void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } +static inline void perf_counter_comm(struct task_struct *tsk) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index bf12df6f353..2d4aebb2982 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1916,6 +1916,99 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct 
*task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + if (counter->hw_event.comm && + comm_event->event.header.type == PERF_EVENT_COMM) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter, comm_event)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm), sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event = { + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + .pid = task->group_leader->pid, + .tid = task->pid, + }, + }; + + perf_counter_comm_event(&comm_event); +} + /* * mmap tracking */ -- cgit v1.2.3 From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:32 +0200 Subject: perf_counter: move PERF_RECORD_TIME Move PERF_RECORD_TIME so that all the fixed length items come before the variable length ones. 
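With the fixed-length items grouped up front, a consumer can overlay a single struct on the head of every overflow record and only compute offsets for the variable-length tails. A minimal sketch, assuming a counter that requested IP, TID and TIME (the struct name is illustrative and not part of the ABI):

	struct overflow_record_fixed {
		struct perf_event_header header;
		__u64	ip;		/* PERF_RECORD_IP   */
		__u32	pid, tid;	/* PERF_RECORD_TID  */
		__u64	time;		/* PERF_RECORD_TIME */
		/* PERF_RECORD_GROUP / PERF_RECORD_CALLCHAIN data follows */
	};
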
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.307926436@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 ++++----- kernel/perf_counter.c | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a70a55f2759..8bd1be58c93 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -100,9 +100,9 @@ enum sw_event_ids { enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, - PERF_RECORD_GROUP = 1U << 2, - PERF_RECORD_CALLCHAIN = 1U << 3, - PERF_RECORD_TIME = 1U << 4, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_GROUP = 1U << 3, + PERF_RECORD_CALLCHAIN = 1U << 4, }; /* @@ -250,6 +250,7 @@ enum perf_event_type { * * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -259,8 +260,6 @@ enum perf_event_type { * kernel, * user; * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN - * - * { u64 time; } && PERF_RECORD_TIME * }; */ }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2d4aebb2982..4dc8600d282 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1850,6 +1850,16 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_RECORD_TIME; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1867,16 +1877,6 @@ static void perf_counter_output(struct perf_counter *counter, } } - if (record_type & PERF_RECORD_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.type |= PERF_RECORD_TIME; - header.size += sizeof(u64); - } - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1889,6 +1889,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -1910,9 +1913,6 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); - if (record_type & PERF_RECORD_TIME) - perf_output_put(&handle, time); - perf_output_end(&handle); } -- cgit v1.2.3 From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. 
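A sketch of the widened call, mirroring the fault-handler hunks in the diff below (the surrounding context is illustrative): software events that have a meaningful data address pass it as the new last argument, everything else passes 0, and counters that set PERF_RECORD_ADDR in their record_type get the value back as a u64 in the overflow record.

	/* e.g. from an architecture page-fault path: */
	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);

	/* events without a data address simply pass 0: */
	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
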
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/powerpc/mm/fault.c | 8 ++++--- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/mm/fault.c | 8 ++++--- include/linux/perf_counter.h | 14 +++++++----- kernel/perf_counter.c | 46 ++++++++++++++++++++++++-------------- 6 files changed, 49 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0697ade84dd..c9d019f1907 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_overflow(counter, 1, regs); + perf_counter_overflow(counter, 1, regs, 0); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 17bbf6f91fb..ac0e112031b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,8 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -322,7 +323,8 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41bc7b..0fcbaab83f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f2d3324d921..6f9df2babe4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,10 +1142,12 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bd1be58c93..c22363a4f74 100644 --- a/include/linux/perf_counter.h +++ 
b/include/linux/perf_counter.h @@ -101,8 +101,9 @@ enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_GROUP = 1U << 3, - PERF_RECORD_CALLCHAIN = 1U << 4, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, }; /* @@ -251,6 +252,7 @@ enum perf_event_type { * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs); + int nmi, struct pt_regs *regs, u64 addr); /* * Return 1 for a software counter, 0 for a hardware counter */ @@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter) perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); @@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } - +perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } static inline void perf_counter_mmap(unsigned long addr, unsigned long len, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4dc8600d282..321c57e3556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) update_context_time(ctx); regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle) } static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int ret; u64 record_type = counter->hw_event.record_type; @@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TIME) perf_output_put(&handle, time); + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, */ int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); int ret = 0; @@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct 
perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs); + perf_counter_output(counter, nmi, regs, addr); return ret; } @@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, regs)) + if (perf_counter_overflow(counter, 0, regs, 0)) ret = HRTIMER_NORESTART; } @@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, addr)) /* soft-disable the counter */ ; @@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs); + perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; @@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs); + perf_swcounter_add(counter, nr, nmi, regs, addr); } rcu_read_unlock(); } @@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) } static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); if (cpuctx->task_ctx) { perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs); + nr, nmi, regs, addr); } barrier(); @@ -2349,9 +2360,10 @@ out: put_cpu_var(perf_cpu_context); } -void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); } static void perf_swcounter_read(struct perf_counter *counter) @@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id) if (!regs) regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } extern int ftrace_profile_enable(int); -- cgit v1.2.3 From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 
10:53:45 +0200 Subject: perf_counter: sysctl for system wide perf counters Impact: add sysctl for paranoid/relaxed perfcounters policy Allow the use of system wide perf counters to everybody, but provide a sysctl to disable it for the paranoid security minded. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.514046352@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 4 +++- kernel/sysctl.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c22363a4f74..98143288530 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -568,6 +568,8 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int sysctl_perf_counter_priv; + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 76376ecb23b..7efb7ebaaae 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,8 @@ static atomic_t nr_mmap_tracking __read_mostly; static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; +int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -1132,7 +1134,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (!capable(CAP_SYS_ADMIN)) + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4286b62b34a..8ba457838d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -920,6 +921,16 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_PERF_COUNTERS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_privileged", + .data = &sysctl_perf_counter_priv, + .maxlen = sizeof(sysctl_perf_counter_priv), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt -- cgit v1.2.3 From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 20:38:21 +0200 Subject: perfcounters, sched: remove __task_delta_exec() This function was left orphan by the latest round of sw-counter cleanups. 
[ Impact: remove unused kernel function ] Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 - kernel/sched.c | 23 ----------------------- 2 files changed, 24 deletions(-) (limited to 'include') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 080d1fd461d..a77c6007dc9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,7 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t, cputime_t); diff --git a/kernel/sched.c b/kernel/sched.c index b66a08c2480..a69278eef42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4546,29 +4546,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); -/* - * Return any ns on the sched_clock that have not yet been banked in - * @p in case that task is currently running. - */ -unsigned long long __task_delta_exec(struct task_struct *p, int update) -{ - s64 delta_exec; - struct rq *rq; - - rq = task_rq(p); - WARN_ON_ONCE(!runqueue_is_locked()); - WARN_ON_ONCE(!task_current(rq, p)); - - if (update) - update_rq_clock(rq); - - delta_exec = rq->clock - p->se.exec_start; - - WARN_ON_ONCE(delta_exec < 0); - - return delta_exec; -} - /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. -- cgit v1.2.3 From 829b42dd395c5801f6ae87da87ecbdcfd5ef1a6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:59 +0200 Subject: perf_counter, x86: declare perf_max_counters only for CONFIG_PERF_COUNTERS This is only needed for CONFIG_PERF_COUNTERS enabled. [ Impact: cleanup ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98143288530..be10b3ffe32 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -512,12 +512,13 @@ struct perf_cpu_context { int recursion[4]; }; +#ifdef CONFIG_PERF_COUNTERS + /* * Set by architecture code: */ extern int perf_max_counters; -#ifdef CONFIG_PERF_COUNTERS extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); -- cgit v1.2.3 From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. 
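After the rename, an architecture backend describes its counter hardware with a single struct pmu instance; a minimal sketch (the handler names are placeholders, the real x86 and powerpc instances appear in the diff below):

	static const struct pmu example_pmu = {
		.enable		= example_pmu_enable,	/* schedule counter onto the hardware */
		.disable	= example_pmu_disable,	/* take it off again */
		.read		= example_pmu_read,	/* fold the hardware value into counter->count */
	};
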
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 25 +++++++------- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++++----------- include/linux/perf_counter.h | 9 +++-- kernel/perf_counter.c | 68 ++++++++++++++++++-------------------- 4 files changed, 66 insertions(+), 73 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd76d0fa2c3..d9bbe5efc64 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) return 0; } -static void power_perf_read(struct perf_counter *counter) +static void power_pmu_read(struct perf_counter *counter) { long val, delta, prev; @@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_perf_read(counter); + power_pmu_read(counter); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; } @@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) counter->oncpu = cpu; counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) - counter->hw_ops->enable(counter); + counter->pmu->enable(counter); } /* @@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, * re-enable the PMU in order to get hw_perf_restore to do the * actual work of reconfiguring the PMU. */ -static int power_perf_enable(struct perf_counter *counter) +static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; @@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter) /* * Remove a counter from the PMU. 
*/ -static void power_perf_disable(struct perf_counter *counter) +static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; @@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_save(flags); pmudis = hw_perf_save_disable(); - power_perf_read(counter); + power_pmu_read(counter); cpuhw = &__get_cpu_var(cpu_hw_counters); for (i = 0; i < cpuhw->n_counters; ++i) { @@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_restore(flags); } -struct hw_perf_counter_ops power_perf_ops = { - .enable = power_perf_enable, - .disable = power_perf_disable, - .read = power_perf_read +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, }; /* Number of perf_counters counting hardware events */ @@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { unsigned long ev; struct perf_counter *ctrs[MAX_HWCOUNTERS]; @@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &power_perf_ops; + return &power_pmu; } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ad663d5ad2d..95de980c74a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__pmc_generic_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) +__x86_pmu_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter, } static void -__pmc_generic_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +__x86_pmu_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); @@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static int pmc_generic_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -667,7 +667,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; /* @@ -676,7 +676,7 @@ try_generic: barrier(); __hw_perf_counter_set_period(counter, hwc, idx); - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ 
-767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index be10b3ffe32..c3db52dc876 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -334,9 +334,9 @@ struct hw_perf_counter { struct perf_counter; /** - * struct hw_perf_counter_ops - performance counter hw ops + * struct pmu - generic performance monitoring unit */ -struct hw_perf_counter_ops { +struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); @@ -381,7 +381,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; enum perf_counter_active_state state; enum perf_counter_active_state prev_state; @@ -519,8 +519,7 @@ struct perf_cpu_context { */ extern int perf_max_counters; -extern const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter); +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09396098dd0..582108addef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } @@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter, counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_stopped = ctx->time; - counter->hw_ops->disable(counter); + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { 
counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; @@ -1096,7 +1095,7 @@ static void __read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->hw_ops->read(counter); + counter->pmu->read(counter); update_counter_times(counter); local_irq_restore(flags); } @@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter, leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) - sub->hw_ops->read(sub); + sub->pmu->read(sub); group_entry.event = sub->hw_event.config; group_entry.counter = atomic64_read(&sub->count); @@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) struct pt_regs *regs; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->hw_ops->read(counter); + counter->pmu->read(counter); regs = get_irq_regs(); /* @@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_generic = { +static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, @@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_task_clock = { +static const struct pmu perf_ops_task_clock = { .enable = task_clock_perf_counter_enable, .disable = task_clock_perf_counter_disable, .read = task_clock_perf_counter_read, @@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, @@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) ftrace_profile_disable(perf_event_id(&counter->hw_event)); } -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { int event_id = perf_event_id(&counter->hw_event); int ret; @@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter) return &perf_ops_generic; } #else -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { return NULL; } #endif -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; + const struct pmu *pmu = NULL; struct hw_perf_counter *hwc = &counter->hw; /* @@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter) */ 
switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter) * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_generic; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) + if (pmu) hwc->irq_period = hw_event->irq_period; - return hw_ops; + return pmu; } /* @@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; long err; @@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; + pmu = NULL; if (perf_event_raw(hw_event)) { - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); goto done; } switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); break; case PERF_TYPE_SOFTWARE: - hw_ops = sw_perf_counter_init(counter); + pmu = sw_perf_counter_init(counter); break; case PERF_TYPE_TRACEPOINT: - hw_ops = tp_perf_counter_init(counter); + pmu = tp_perf_counter_init(counter); break; } done: err = 0; - if (!hw_ops) + if (!pmu) err = -EINVAL; - else if (IS_ERR(hw_ops)) - err = PTR_ERR(hw_ops); + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); if (err) { kfree(counter); return ERR_PTR(err); } - counter->hw_ops = hw_ops; + counter->pmu = pmu; if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); -- cgit v1.2.3 From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. 
[ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- include/linux/perf_counter.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a35515..d8beebeb270 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c3db52dc876..41aed427005 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -318,7 +318,7 @@ struct hw_perf_counter { unsigned long config_base; unsigned long counter_base; int nmi; - unsigned int idx; + int idx; }; union { /* software */ atomic64_t count; -- cgit v1.2.3 From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:16 +0200 Subject: perf_counter: fix race in perf_output_* When two (or more) contexts output to the same buffer, it is possible to observe half written output. Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing perf_counter_overflow(). If CPU1 does a wakeup and exposes head to user-space, then CPU2 can observe the data CPU0 is still writing. 
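The fix, shown in the diff below, serializes writers per CPU and lets only the outer-most writer on a CPU publish the completed head. A rough usage sketch of the resulting output path (the record payload is illustrative):

	if (perf_output_begin(&handle, counter, size, nmi, 1))
		return;				/* also takes the per-buffer data->lock */
	perf_output_put(&handle, record);	/* copy the payload into the ring buffer */
	perf_output_end(&handle);		/* drops the lock; the outer-most writer
						 * publishes data_head and wakes readers */
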
[ Impact: fix occasionally corrupted profiling records ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.007821627@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +- kernel/perf_counter.c | 130 +++++++++++++++++++++++++++++++++---------- 2 files changed, 105 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 41aed427005..f776851f8c4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,10 +358,13 @@ struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ - atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ atomic_t events; /* event limit */ + atomic_t wakeup_head; /* completed head */ + atomic_t lock; /* concurrent writes */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75f2b6c8239..8660ae57953 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; struct perf_mmap_data *data; - unsigned int events; + unsigned int events = POLL_HUP; rcu_read_lock(); data = rcu_dereference(counter->data); if (data) - events = atomic_xchg(&data->wakeup, 0); - else - events = POLL_HUP; + events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = { void perf_counter_wakeup(struct perf_counter *counter) { - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) { - atomic_set(&data->wakeup, POLL_IN); - /* - * Ensure all data writes are issued before updating the - * user-space data head information. The matching rmb() - * will be in userspace after reading this value. - */ - smp_wmb(); - data->user_page->data_head = atomic_read(&data->head); - } - rcu_read_unlock(); - wake_up_all(&counter->waitq); if (counter->pending_kill) { @@ -1721,10 +1703,14 @@ struct perf_output_handle { int wakeup; int nmi; int overflow; + int locked; + unsigned long flags; }; -static inline void __perf_output_wakeup(struct perf_output_handle *handle) +static void perf_output_wakeup(struct perf_output_handle *handle) { + atomic_set(&handle->data->poll, POLL_IN); + if (handle->nmi) { handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, @@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) perf_counter_wakeup(handle->counter); } +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. 
+ */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int head, cpu; + + if (handle->wakeup) + data->wakeup_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_xchg(&data->wakeup_head, 0))) { + data->user_page->data_head = head; + handle->wakeup = 1; + } + + /* + * NMI can happen here, which means we can miss a wakeup_head update. + */ + + cpu = atomic_xchg(&data->lock, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_read(&data->wakeup_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + goto again; + } + + if (handle->wakeup) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, int nmi, int overflow) @@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; + handle->data = data; handle->counter = counter; handle->nmi = nmi; handle->overflow = overflow; @@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data->nr_pages) goto fail; + perf_output_lock(handle); + do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->data = data; handle->offset = offset; handle->head = head; handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - __perf_output_wakeup(handle); + perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - int wakeup_events = handle->counter->hw_event.wakeup_events; + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->hw_event.wakeup_events; if (handle->overflow && wakeup_events) { - int events = atomic_inc_return(&handle->data->events); + int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &handle->data->events); - __perf_output_wakeup(handle); + atomic_sub(wakeup_events, &data->events); + handle->wakeup = 1; } - } else if (handle->wakeup) - __perf_output_wakeup(handle); + } + + perf_output_unlock(handle); rcu_read_unlock(); } -- cgit v1.2.3 From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:13:30 +0200 Subject: perf_counter: initialize the per-cpu context earlier percpu scheduling for perfcounters wants to take the context lock, but that lock first needs to be initialized. 
Currently it is an early_initcall() - but that is too late, the task tick runs much sooner than that. Call it explicitly from the scheduler init sequence instead. [ Impact: fix access-before-init crash ] LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- kernel/perf_counter.c | 5 +---- kernel/sched.c | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f776851f8c4..a356fa69796 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -573,6 +573,8 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern void perf_counter_init(void); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -600,9 +602,10 @@ perf_counter_mmap(unsigned long addr, unsigned long len, static inline void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } + unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_init(void) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b9679c36bcc..fcdafa234a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3265,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, }; -static int __init perf_counter_init(void) +void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); - - return 0; } -early_initcall(perf_counter_init); static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) { diff --git a/kernel/sched.c b/kernel/sched.c index 2f600e30dcf..a728976a3a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -8996,7 +8997,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9097,6 +9098,8 @@ void __init sched_init(void) alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } -- cgit v1.2.3 From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:22 +0200 Subject: perf_counter: uncouple data_head updates from wakeups Keep data_head up-to-date irrespective of notifications. This fixes the case where you disable a counter and don't get a notification for the last few pending events, and it also allows polling usage. 
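One consequence on the user-space side (an illustrative sketch, not part of the patch): a reader can poll the mmap()ed control page and trust data_head even when no wakeup arrives, as long as it keeps the usual read-barrier pairing:

	struct perf_counter_mmap_page *pc = control_page;	/* first mmap()ed page */
	unsigned int head = pc->data_head;	/* now advances on every write, not only on wakeups */
	rmb();					/* pairs with the kernel's publish barrier */
	process_records(data_pages, tail, head);	/* hypothetical consumer helper */
	tail = head;
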
[ Impact: increase precision of perfcounter mmap-ed fields ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155436.925084300@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 20 +++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a356fa69796..17b63105f2a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -362,9 +362,11 @@ struct perf_mmap_data { atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t wakeup_head; /* completed head */ + atomic_t done_head; /* completed head */ atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5f86a1156c9..ba5e921e1f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1696,7 +1696,6 @@ struct perf_output_handle { struct perf_mmap_data *data; unsigned int offset; unsigned int head; - int wakeup; int nmi; int overflow; int locked; @@ -1752,8 +1751,7 @@ static void perf_output_unlock(struct perf_output_handle *handle) struct perf_mmap_data *data = handle->data; int head, cpu; - if (handle->wakeup) - data->wakeup_head = data->head; + data->done_head = data->head; if (!handle->locked) goto out; @@ -1764,13 +1762,11 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->wakeup_head, 0))) { + while ((head = atomic_xchg(&data->done_head, 0))) data->user_page->data_head = head; - handle->wakeup = 1; - } /* - * NMI can happen here, which means we can miss a wakeup_head update. + * NMI can happen here, which means we can miss a done_head update. */ cpu = atomic_xchg(&data->lock, 0); @@ -1779,7 +1775,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->wakeup_head))) { + if (unlikely(atomic_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -1789,7 +1785,7 @@ again: goto again; } - if (handle->wakeup) + if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: local_irq_restore(handle->flags); @@ -1824,7 +1820,9 @@ static int perf_output_begin(struct perf_output_handle *handle, handle->offset = offset; handle->head = head; - handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + atomic_set(&data->wakeup, 1); return 0; @@ -1882,7 +1880,7 @@ static void perf_output_end(struct perf_output_handle *handle) int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); - handle->wakeup = 1; + atomic_set(&data->wakeup, 1); } } -- cgit v1.2.3 From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:23 +0200 Subject: perf_counter: add ioctl(PERF_COUNTER_IOC_RESET) Provide a way to reset an existing counter - this eases PAPI libraries around perfcounters. Similar to read() it doesn't collapse pending child counters. 
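For illustration, a minimal user-space use of the new ioctl, assuming 'fd' was returned by the perf_counter_open() syscall:

	unsigned long long count;

	read(fd, &count, sizeof(count));	/* last value before the reset */
	ioctl(fd, PERF_COUNTER_IOC_RESET, 0);	/* zero this counter only      */

As with read(), pending counts in inherited child counters are left alone.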
[ Impact: new perfcounter fd ioctl method to reset counters ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155437.022272933@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 17b63105f2a..0fcbf34a4f7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,6 +160,7 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) /* * Structure of the page that can be mapped via mmap diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ba5e921e1f3..6e6834e0587 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1288,6 +1288,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static void perf_counter_reset(struct perf_counter *counter) +{ + atomic_set(&counter->count, 0); +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1303,6 +1308,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: perf_counter_refresh(counter, arg); break; + case PERF_COUNTER_IOC_RESET: + perf_counter_reset(counter); + break; default: err = -ENOTTY; } -- cgit v1.2.3 From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:24 +0200 Subject: perf_counter: provide an mlock threshold Provide a threshold to relax the mlock accounting, increasing usability. Each counter gets perf_counter_mlock_kb for free. 
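A sketch of the resulting accounting, simplified from the perf_mmap() hunk below: only the pages beyond the per-counter gift are charged against RLIMIT_MEMLOCK:

	long extra = nr_pages;		/* the +1 control page is no longer charged */

	extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);	/* kb -> pages */
	if (extra < 0)
		extra = 0;

	locked     = vma->vm_mm->locked_vm + extra;
	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;

	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
		return -EPERM;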
[ Impact: allow more mmap buffering ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.112113632@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 15 +++++++++++---- kernel/sysctl.c | 8 ++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0fcbf34a4f7..00081d84169 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,6 +358,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ + int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ @@ -575,6 +576,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6e6834e0587..2d134273830 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,6 +44,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1461,7 +1462,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; + vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } @@ -1480,6 +1481,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; unsigned long locked, lock_limit; int ret = 0; + long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1507,8 +1509,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - locked = vma->vm_mm->locked_vm; - locked += nr_pages + 1; + extra = nr_pages /* + 1 only account the data pages */; + extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + if (extra < 0) + extra = 0; + + locked = vma->vm_mm->locked_vm + extra; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; @@ -1524,7 +1530,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); - vma->vm_mm->locked_vm += nr_pages + 1; + vma->vm_mm->locked_vm += extra; + counter->data->nr_locked = extra; unlock: mutex_unlock(&counter->mmap_mutex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8203d70928d..3b05c2b088d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -920,6 +920,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_mlock_kb", + .data = &sysctl_perf_counter_mlock, + .maxlen = sizeof(sysctl_perf_counter_mlock), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3 From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:22 +0200 Subject: perf_counter: rework ioctl()s Corey 
noticed that ioctl()s on grouped counters didn't work on the whole group. This extends the ioctl() interface to take a second argument that is interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle the behaviour. Having this flag gives the greatest flexibility, allowing you to individually enable/disable/reset counters in a group, or all together. [ Impact: fix group counter enable/disable semantics ] Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090508170028.837558214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++-- kernel/perf_counter.c | 104 ++++++++++++++++++++++++------------------- 2 files changed, 65 insertions(+), 49 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 00081d84169..88f863ec274 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -157,10 +157,14 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) +#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; /* * Structure of the page that can be mapped via mmap diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fdb0d242127..f4883f1f47e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) * add it straight to the context's counter list, or to the group * leader's sibling list: */ - if (counter->group_leader == counter) + if (group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); @@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } -/* - * Disable a counter and all its children. - */ -static void perf_counter_disable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_disable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_disable(child); - mutex_unlock(&counter->mutex); -} - static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) return 0; } -/* - * Enable a counter and all its children. 
- */ -static void perf_counter_enable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_enable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_enable(child); - mutex_unlock(&counter->mutex); -} - void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { @@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_counter_reset(struct perf_counter *counter) { + (void)perf_counter_read(counter); atomic_set(&counter->count, 0); + perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; + + spin_lock_irq(&ctx->lock); + counter = counter->group_leader; + + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + func(sibling); + spin_unlock_irq(&ctx->lock); +} + +static void perf_counter_for_each_child(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + func(counter); + list_for_each_entry(child, &counter->child_list, child_list) + func(child); + mutex_unlock(&counter->mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + perf_counter_for_each_sibling(counter, func); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_for_each_sibling(child, func); + mutex_unlock(&counter->mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; - int err = 0; + void (*func)(struct perf_counter *); + u32 flags = arg; switch (cmd) { case PERF_COUNTER_IOC_ENABLE: - perf_counter_enable_family(counter); + func = perf_counter_enable; break; case PERF_COUNTER_IOC_DISABLE: - perf_counter_disable_family(counter); - break; - case PERF_COUNTER_IOC_REFRESH: - err = perf_counter_refresh(counter, arg); + func = perf_counter_disable; break; case PERF_COUNTER_IOC_RESET: - perf_counter_reset(counter); + func = perf_counter_reset; break; + + case PERF_COUNTER_IOC_REFRESH: + return perf_counter_refresh(counter, arg); default: - err = -ENOTTY; + return -ENOTTY; } - return err; + + if (flags & PERF_IOC_FLAG_GROUP) + perf_counter_for_each(counter, func); + else + perf_counter_for_each_child(counter, func); + + return 0; } /* -- cgit v1.2.3 From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:23 +0200 Subject: perf_counter: add PERF_RECORD_CONFIG Much like CONFIG_RECORD_GROUP records the hw_event.config to identify the values, allow to record this for all counters. 
[ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.923228280@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 88f863ec274..0e6303d36c6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,6 +104,7 @@ enum perf_counter_record_format { PERF_RECORD_ADDR = 1U << 3, PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, }; /* @@ -258,6 +259,7 @@ enum perf_event_type { * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f4883f1f47e..c615f52aa40 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CONFIG) { + header.type |= PERF_RECORD_CONFIG; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_ADDR) perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_CONFIG) + perf_output_put(&handle, counter->hw_event.config); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ -- cgit v1.2.3 From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:24 +0200 Subject: perf_counter: add PERF_RECORD_CPU Allow recording the CPU number the event was generated on. RFC: this leaves a u32 as reserved, should we fill in the node_id() there, or leave this open for future extension, as userspace can already easily do the cpu->node mapping if needed. 
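A hedged reader-side sketch; the struct name is illustrative, the layout matches the { u32 cpu, res; } entry documented in the layout comment below:

	struct cpu_entry {
		__u32 cpu;	/* raw_smp_processor_id() at overflow time */
		__u32 res;	/* reserved for now, see the RFC note above */
	};

Requesting it works like the other fields: hw_event.record_type |= PERF_RECORD_CPU.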
[ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170029.008627711@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e6303d36c6..614f921d616 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -105,6 +105,7 @@ enum perf_counter_record_format { PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c615f52aa40..d850a1fb8d4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter, struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; + struct { + u32 cpu, reserved; + } cpu_entry; header.type = 0; header.size = sizeof(header); @@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CPU) { + header.type |= PERF_RECORD_CPU; + header.size += sizeof(cpu_entry); + + cpu_entry.cpu = raw_smp_processor_id(); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_CONFIG) perf_output_put(&handle, counter->hw_event.config); + if (record_type & PERF_RECORD_CPU) + perf_output_put(&handle, cpu_entry); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ -- cgit v1.2.3 From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. 
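The nesting behaviour this gives, illustrated (hardware state shown in the comments):

	perf_disable();		/* depth 1: PMU disabled       */
	perf_disable();		/* depth 2: stays disabled     */
	/* ... NMI/INT throttling, list manipulation, etc. ... */
	perf_enable();		/* depth 1: still disabled     */
	perf_enable();		/* depth 0: PMU enabled again  */

The count is kept per CPU, so callers only need to keep their own disable/enable pairs balanced.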
[ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 24 ++++---- arch/x86/kernel/cpu/perf_counter.c | 113 ++++++++++++++----------------------- drivers/acpi/processor_idle.c | 6 +- include/linux/perf_counter.h | 10 ++-- kernel/perf_counter.c | 76 +++++++++++++++---------- 5 files changed, 109 insertions(+), 120 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 15cdc8e6722..bb1b463c136 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. */ -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { struct cpu_hw_counters *cpuhw; unsigned long ret; @@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void) mb(); } local_irq_restore(flags); - return ret; } /* @@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void) * If we were previously disabled and counters were added, then * put the new config on the PMU. */ -void hw_perf_restore(u64 disable) +void hw_perf_enable(void) { struct perf_counter *counter; struct cpu_hw_counters *cpuhw; @@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable) int n_lim; int idx; - if (disable) - return; local_irq_save(flags); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; @@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, /* * Add a counter to the PMU. * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_restore to do the + * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. 
*/ static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; - u64 pmudis; int n0; int ret = -EAGAIN; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); /* * Add the counter to the list (if there is room) @@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter) ret = 0; out: - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); return ret; } @@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; - u64 pmudis; unsigned long flags; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); power_pmu_read(counter); @@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c014f8f..313638cecbb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the @@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. 
*/ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! 
Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d2830f39d46..9645758c047 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -763,11 +763,9 @@ static int acpi_idle_bm_check(void) */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { - u64 perf_flags; - /* Don't trace irqs off for idle */ stop_critical_timings(); - perf_flags = hw_perf_save_disable(); + perf_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -782,7 +780,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. 
*/ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore(perf_flags); + perf_enable(); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 614f921d616..e543ecc129f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); -extern u64 hw_perf_save_disable(void); -extern void hw_perf_restore(u64 ctrl); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } -static inline u64 hw_perf_save_disable(void) { return 0; } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 985be0b662a..e814ff04d7c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte return NULL; } -u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, @@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} +EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} +EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. 
*/ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_del_counter(counter, ctx); - hw_perf_restore(perf_flags); + perf_enable(); if (!ctx->task) { /* @@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info) struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; - u64 perf_flags; int err; /* @@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); add_counter_to_ctx(counter, ctx); @@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info) cpuctx->max_pertask--; unlock: - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long pmuflags; unsigned long flags; int err; @@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; } else { - pmuflags = hw_perf_save_disable(); + perf_disable(); if (counter == leader) err = group_sched_in(counter, cpuctx, ctx, smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); - hw_perf_restore(pmuflags); + perf_enable(); } if (err) { @@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; - u64 flags; spin_lock(&ctx->lock); ctx->is_active = 0; @@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, goto out; update_context_time(ctx); - flags = hw_perf_save_disable(); + perf_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; - u64 flags; int can_add_hw = 1; spin_lock(&ctx->lock); @@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, ctx->timestamp = perf_clock(); - flags = hw_perf_save_disable(); + perf_disable(); /* * First go through the list and put on any pinned groups @@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, can_add_hw = 0; } } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -955,7 +977,6 @@ int perf_counter_task_disable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; if (likely(!ctx->nr_counters)) return 0; @@ -969,7 +990,7 @@ int perf_counter_task_disable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ERROR) { @@ -978,7 +999,7 @@ int perf_counter_task_disable(void) } } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); @@ -991,7 +1012,6 @@ int perf_counter_task_enable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; int cpu; 
if (likely(!ctx->nr_counters)) @@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state > PERF_COUNTER_STATE_OFF) @@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void) ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); @@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void) static void rotate_ctx(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 perf_flags; if (!ctx->nr_counters) return; @@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_move_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); } @@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child, } else { struct perf_cpu_context *cpuctx; unsigned long flags; - u64 perf_flags; /* * Disable and unlink this counter. @@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child, * could still be processing it: */ local_irq_save(flags); - perf_flags = hw_perf_save_disable(); + perf_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; - hw_perf_restore(perf_flags); + perf_enable(); local_irq_restore(flags); } -- cgit v1.2.3 From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:27 +0200 Subject: perf_counter: per user mlock gift Instead of a per-process mlock gift for perf-counters, use a per-user gift so that there is less of a DoS potential. 
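A simplified sketch of the new accounting in perf_mmap() (see the hunk below): the buffer is charged to the per-user gift first, and only the spill-over counts toward the task's RLIMIT_MEMLOCK:

	user_extra      = nr_pages + 1;		/* data pages + control page */
	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
	user_locked     = atomic_long_read(&user->locked_vm) + user_extra;

	extra = 0;
	if (user_locked > user_lock_limit)
		extra = user_locked - user_lock_limit;

	/* 'extra' is then checked against RLIMIT_MEMLOCK as before */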
[ Impact: allow less worst-case unprivileged memory consumption ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.496182835@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/perf_counter.c | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index d1857580a13..ff59d123151 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,6 +674,10 @@ struct user_struct { struct work_struct work; #endif #endif + +#ifdef CONFIG_PERF_COUNTERS + atomic_long_t locked_vm; +#endif }; extern int uids_sysfs_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0173738dd54..93f4a0e4b87 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ -int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1522,6 +1522,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); @@ -1537,11 +1540,13 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + struct user_struct *user = current_user(); unsigned long vma_size; unsigned long nr_pages; + unsigned long user_locked, user_lock_limit; unsigned long locked, lock_limit; + long user_extra, extra; int ret = 0; - long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1569,15 +1574,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - extra = nr_pages /* + 1 only account the data pages */; - extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - if (extra < 0) - extra = 0; + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; - locked = vma->vm_mm->locked_vm + extra; + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { ret = -EPERM; @@ -1590,6 +1597,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; unlock: -- cgit v1.2.3 From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this 
frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 13 ++++---- arch/x86/kernel/cpu/perf_counter.c | 9 ++---- include/linux/perf_counter.h | 10 ++++-- kernel/perf_counter.c | 63 ++++++++++++++++++++++++++++++-------- 4 files changed, 68 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bb1b463c136..db8d5cafc15 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -534,7 +534,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw_event.irq_period) { + if (counter->hw.irq_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if ((s64)counter->hw_event.irq_period < 0) - return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.irq_period); /* * See if we need to reserve the PMU. @@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { + u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; @@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val, */ val = 0; left = atomic64_read(&counter->hw.period_left) - delta; - if (counter->hw_event.irq_period) { + if (period) { if (left <= 0) { - left += counter->hw_event.irq_period; + left += period; if (left <= 0) - left = counter->hw_event.irq_period; + left = period; record = 1; } if (left < 0x80000000L) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718eb1e..886dcf334bc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e543ecc129f..004b6e162b9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -130,7 +130,11 @@ struct perf_counter_hw_event { */ __u64 config; - __u64 irq_period; + union { + __u64 irq_period; + __u64 irq_freq; + }; + __u32 
record_type; __u32 read_format; @@ -146,8 +150,9 @@ struct perf_counter_hw_event { mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -337,6 +342,7 @@ struct hw_perf_counter { atomic64_t prev_count; u64 irq_period; atomic64_t period_left; + u64 interrupts; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 93f4a0e4b87..0ad1db4f3d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void) return 0; } +void perf_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + u64 irq_period; + u64 events, period; + s64 delta; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + continue; + + events = HZ * counter->hw.interrupts * counter->hw.irq_period; + period = div64_u64(events, counter->hw_event.irq_freq); + + delta = (s64)(1 + period - counter->hw.irq_period); + delta >>= 1; + + irq_period = counter->hw.irq_period + delta; + + if (!irq_period) + irq_period = 1; + + counter->hw.irq_period = irq_period; + counter->hw.interrupts = 0; + } + spin_unlock(&ctx->lock); +} + /* * Round-robin a context's counters: */ @@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &curr->perf_counter_ctx; + perf_adjust_freq(&cpuctx->ctx); + perf_adjust_freq(ctx); + perf_counter_cpu_sched_out(cpuctx); __perf_counter_task_sched_out(ctx); @@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->hw.interrupts++; + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; + u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); @@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + period = max_t(u64, 10000, counter->hw.irq_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } @@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2811,9 +2852,7 @@ static const struct pmu *tp_perf_counter_init(struct 
perf_counter *counter) static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct pmu *pmu = NULL; - struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: /* @@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) else pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: @@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) break; } - if (pmu) - hwc->irq_period = hw_event->irq_period; - return pmu; } @@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct pmu *pmu; struct perf_counter *counter; + struct hw_perf_counter *hwc; long err; counter = kzalloc(sizeof(*counter), gfpflags); @@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + hwc = &counter->hw; + if (hw_event->freq && hw_event->irq_freq) + hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + else + hwc->irq_period = hw_event->irq_period; + /* * we currently do not support PERF_RECORD_GROUP on inherited counters */ -- cgit v1.2.3 From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 21:48:08 +1000 Subject: perf_counter: allow arch to supply event misc flags and instruction pointer At present the values we put in overflow events for the misc flags indicating processor mode and the instruction pointer are obtained using the standard user_mode() and instruction_pointer() functions. Those functions tell you where the performance monitor interrupt was taken, which might not be exactly where the counter overflow occurred, for example because interrupts were disabled at the point where the overflow occurred, or because the processor had many instructions in flight and chose to complete some more instructions beyond the one that caused the counter overflow. Some architectures (e.g. powerpc) can supply more precise information about where the counter overflow occurred and the processor mode at that point. This introduces new functions, perf_misc_flags() and perf_instruction_pointer(), which arch code can override to provide more precise information if available. They have default implementations which are identical to the existing code. This also adds a new misc flag value, PERF_EVENT_MISC_HYPERVISOR, for the case where a counter overflow occurred in the hypervisor. We encode the processor mode in the 2 bits previously used to indicate user or kernel mode; the values for user and kernel mode are unchanged and hypervisor mode is indicated by both bits being set. 
[ Impact: generalize perfcounter core facilities ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 5 ++--- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 004b6e162b9..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -215,8 +215,11 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) #define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { @@ -596,6 +599,12 @@ extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ + PERF_EVENT_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 728a595399b..57840a94b16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2042,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= user_mode(regs) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + header.misc |= perf_misc_flags(regs); if (record_type & PERF_RECORD_IP) { - ip = instruction_pointer(regs); + ip = perf_instruction_pointer(regs); header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } -- cgit v1.2.3 From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: perf_counter: fix counter inheritance race Context rotation should not occur when we are in the middle of walking the counter list when inheriting counters ... 
[ Impact: fix occasionally incorrect perf stat results ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..13cb2fbbf33 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,6 +508,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + int rr_allowed; struct task_struct *task; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7af16d1c480..4d8f97375f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx->rr_allowed) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); + ctx->rr_allowed = 1; ctx->task = task; } @@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); + parent_ctx->rr_allowed = 0; + barrier(); /* irqs */ + /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: @@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child) break; } + barrier(); /* irqs */ + parent_ctx->rr_allowed = 1; + mutex_unlock(&parent_ctx->mutex); } -- cgit v1.2.3 From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:19 +0200 Subject: perf_counter: Solve the rotate_ctx vs inherit race differently Instead of disabling RR scheduling of the counters, use a different list that does not get rotated to iterate the counters on inheritance. 
[ Impact: cleanup, optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.237504544@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 15 +++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 13cb2fbbf33..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,7 +508,6 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; - int rr_allowed; struct task_struct *task; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4d8f97375f3..64113e6d194 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,8 +1120,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - if (ctx->rr_allowed) - rotate_ctx(ctx); + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3109,7 +3108,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); - ctx->rr_allowed = 1; ctx->task = task; } @@ -3350,14 +3348,14 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); - parent_ctx->rr_allowed = 0; - barrier(); /* irqs */ - /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { + if (counter != counter->group_leader) + continue; + if (!counter->hw_event.inherit) continue; @@ -3366,9 +3364,6 @@ void perf_counter_init_task(struct task_struct *child) break; } - barrier(); /* irqs */ - parent_ctx->rr_allowed = 1; - mutex_unlock(&parent_ctx->mutex); } -- cgit v1.2.3 From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:20 +0200 Subject: perf_counter: Log irq_period changes For the dynamic irq_period code, log whenever we change the period so that analyzing code can normalize the event flow. 
[ Impact: add new feature to allow more precise profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.298769743@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 ++++++++ kernel/perf_counter.c | 40 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..f612941ef46 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -257,6 +257,14 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u64 irq_period; + * }; + */ + PERF_EVENT_PERIOD = 4, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 64113e6d194..db02eb16c77 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,7 +1046,9 @@ int perf_counter_task_enable(void) return 0; } -void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; u64 irq_period; @@ -1072,6 +1074,8 @@ void perf_adjust_freq(struct perf_counter_context *ctx) if (!irq_period) irq_period = 1; + perf_log_period(counter, irq_period); + counter->hw.irq_period = irq_period; counter->hw.interrupts = 0; } @@ -2406,6 +2410,40 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * + */ + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 period; + } freq_event = { + .header = { + .type = PERF_EVENT_PERIOD, + .misc = 0, + .size = sizeof(freq_event), + }, + .time = sched_clock(), + .period = period, + }; + + if (counter->hw.irq_period == period) + return; + + ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, freq_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ -- cgit v1.2.3 From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. 
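The lifetime rule in the previous paragraph boils down to a reference count on the context: one reference for the task, one per attached counter, free on the last put. A tiny user-space illustration of that rule (names are illustrative; the patch below implements it with get_ctx()/put_ctx() and an atomic_t refcount):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct ctx {
        atomic_int refcount;
};

static void get_ctx(struct ctx *c)
{
        atomic_fetch_add(&c->refcount, 1);
}

static void put_ctx(struct ctx *c)
{
        if (atomic_fetch_sub(&c->refcount, 1) == 1) {
                printf("context freed\n");
                free(c);
        }
}

int main(void)
{
        struct ctx *c = calloc(1, sizeof(*c));

        if (!c)
                return 1;
        atomic_store(&c->refcount, 1);  /* reference held by the task */
        get_ctx(c);                     /* reference held by one counter */

        put_ctx(c);     /* task exits; the context survives ... */
        put_ctx(c);     /* ... until the counter's fd is finally closed */
        return 0;
}

Because the counter holds its own reference, closing the counter's fd after the task has exited is what finally frees the context.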
Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + include/linux/init_task.h | 13 --- include/linux/perf_counter.h | 4 +- include/linux/sched.h | 6 +- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 +++++++++++++++++++++++++++---------------- 7 files changed, 145 insertions(+), 101 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a90802..b4f64402a82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa..d87247d2641 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46..07130900546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d123151..9714d450f41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af..99ad4063ee4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e..e72a09f5355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049..06ea3eae886 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. 
+ */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int 
perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. 
+ */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; - - /* - * Disable and unlink this counter. 
- * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); - - cpuctx = &__get_cpu_var(perf_cpu_context); + mutex_lock(&child_counter->mutex); - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) + return; + + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) return; + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. -- cgit v1.2.3 From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:27:22 +1000 Subject: perf_counter: Optimize context switch between identical inherited contexts When monitoring a process and its descendants with a set of inherited counters, we can often get the situation in a context switch where both the old (outgoing) and new (incoming) process have the same set of counters, and their values are ultimately going to be added together. In that situation it doesn't matter which set of counters are used to count the activity for the new process, so there is really no need to go through the process of reading the hardware counters and updating the old task's counters and then setting up the PMU for the new task. This optimizes the context switch in this situation. 
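Roughly, the fast path this enables is a pointer swap, as the following paragraphs describe in detail: if the two tasks' inherited contexts are equivalent, exchange the context pointers and leave the PMU programmed. A sketch with illustrative types; the equivalence test mirrors the context_equiv() added below, ignoring the enabled-counter count that the patch also compares.

#include <stdbool.h>

struct task;

struct perf_ctx {
        struct perf_ctx *parent_ctx;    /* context this one was cloned from */
        unsigned int parent_gen;        /* parent's generation at clone time */
        struct task *task;
};

struct task {
        struct perf_ctx *perf_ctx;
};

/* Two inherited contexts are interchangeable when they were cloned from
 * the same version of the same parent context. */
bool context_equiv(struct perf_ctx *a, struct perf_ctx *b)
{
        return a->parent_ctx && a->parent_ctx == b->parent_ctx &&
               a->parent_gen == b->parent_gen;
}

void sched_out(struct task *prev, struct task *next)
{
        struct perf_ctx *ctx = prev->perf_ctx, *next_ctx = next->perf_ctx;

        if (ctx && next_ctx && context_equiv(ctx, next_ctx)) {
                /* Swap the contexts; the hardware counters keep running. */
                prev->perf_ctx = next_ctx;
                next->perf_ctx = ctx;
                ctx->task = next;
                next_ctx->task = prev;
                return;
        }
        /* slow path: flush prev's counters and program next's */
}
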
Instead of scheduling out the perf_counter_context for the old task and scheduling in the new context, we simply transfer the old context to the new task and keep using it without interruption. The new context gets transferred to the old task. This means that both tasks still have a valid perf_counter_context, so no special case is introduced when the old task gets scheduled in again, either on this CPU or another CPU. The equivalence of contexts is detected by keeping a pointer in each cloned context pointing to the context it was cloned from. To cope with the situation where a context is changed by adding or removing counters after it has been cloned, we also keep a generation number on each context which is incremented every time a context is changed. When a context is cloned we take a copy of the parent's generation number, and two cloned contexts are equivalent only if they have the same parent and the same generation number. In order that the parent context pointer remains valid (and is not reused), we increment the parent context's reference count for each context cloned from it. Since we don't have individual fds for the counters in a cloned context, the only thing that can make two clones of a given parent different after they have been cloned is enabling or disabling all counters with prctl. To account for this, we keep a count of the number of enabled counters in each context. Two contexts must have the same number of enabled counters to be considered equivalent. Here are some measurements of the context switch time as measured with the lat_ctx benchmark from lmbench, comparing the times obtained with and without this patch series: -----Unmodified----- With this patch series Counters: none 2 HW 4H+4S none 2 HW 4H+4S 2 processes: Average 3.44 6.45 11.24 3.12 3.39 3.60 St dev 0.04 0.04 0.13 0.05 0.17 0.19 8 processes: Average 6.45 8.79 14.00 5.57 6.23 7.57 St dev 1.27 1.04 0.88 1.42 1.46 1.42 32 processes: Average 5.56 8.43 13.78 5.28 5.55 7.15 St dev 0.41 0.47 0.53 0.54 0.57 0.81 The numbers are the mean and standard deviation of 20 runs of lat_ctx. The "none" columns are lat_ctx run directly without any counters. The "2 HW" columns are with lat_ctx run under perfstat, counting cycles and instructions. The "4H+4S" columns are lat_ctx run under perfstat with 4 hardware counters and 4 software counters (cycles, instructions, cache references, cache misses, task clock, context switch, cpu migrations, and page faults). [ Impact: performance optimization of counter context-switches ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 ++++- kernel/perf_counter.c | 109 +++++++++++++++++++++++++++++++++++++------ kernel/sched.c | 2 +- 3 files changed, 107 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 07130900546..4cae01a5045 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,6 +513,7 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; + int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; @@ -522,6 +523,14 @@ struct perf_counter_context { */ u64 time; u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. 
+ */ + struct perf_counter_context *parent_ctx; + u32 parent_gen; + u32 generation; }; /** @@ -552,7 +561,8 @@ extern int perf_max_counters; extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 06ea3eae886..c10055416de 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx) static void put_ctx(struct perf_counter_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); kfree(ctx); + } } static void @@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled++; } /* @@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -203,6 +210,22 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } +/* + * Mark this context as not being a clone of another. + * Called when counters are added to or removed from this context. + * We also increment our generation number so that anything that + * was cloned from this context before this will not match anything + * cloned from this context after this. 
+ */ +static void unclone_ctx(struct perf_counter_context *ctx) +{ + ++ctx->generation; + if (!ctx->parent_ctx) + return; + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; +} + /* * Cross CPU call to remove a performance counter * @@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; + unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -861,6 +889,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, spin_unlock(&ctx->lock); } +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, + struct perf_counter_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && ctx1->nr_enabled == ctx2->nr_enabled; +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, * accessing the counter control register. If a NMI hits, then it will * not restart the counter. 
*/ -void perf_counter_task_sched_out(struct task_struct *task, int cpu) +void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter_context *next_ctx; struct pt_regs *regs; if (likely(!ctx || !cpuctx->task_ctx)) @@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + + next_ctx = next->perf_counter_ctxp; + if (next_ctx && context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + return; + } + __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) if (likely(!ctx)) return; + if (cpuctx->task_ctx == ctx) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -3252,6 +3313,16 @@ inherit_counter(struct perf_counter *parent_counter, if (IS_ERR(child_counter)) return child_counter; + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + /* * Link it up in the child's context: */ @@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter, mutex_lock(&parent_counter->mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - /* - * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - mutex_unlock(&parent_counter->mutex); return child_counter; @@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; + int inherited_all = 1; child->perf_counter_ctxp = NULL; @@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) + if (!counter->hw_event.inherit) { + inherited_all = 0; continue; + } if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) + parent_ctx, child, child_ctx)) { + inherited_all = 0; break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. 
+ */ + if (parent_ctx->parent_ctx) { + child_ctx->parent_ctx = parent_ctx->parent_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); } mutex_unlock(&parent_ctx->mutex); diff --git a/kernel/sched.c b/kernel/sched.c index 419a39d0988..4c0d58bce6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5091,7 +5091,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, cpu); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; -- cgit v1.2.3 From 910431c7f2e963017d767b29c80ae706421e569f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 May 2009 12:32:15 +0200 Subject: perf_counter: fix !PERF_COUNTERS build failure Update the !CONFIG_PERF_COUNTERS prototype too, for perf_counter_task_sched_out(). [ Impact: build fix ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4cae01a5045..2eedae8498d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -625,7 +625,8 @@ extern void perf_counter_init(void); static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void -perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } -- cgit v1.2.3 From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:55 +0200 Subject: perf_counter: Fix dynamic irq_period logging We call perf_adjust_freq() from perf_counter_task_tick() which is is called under the rq->lock causing lock recursion. However, it's no longer required to be called under the rq->lock, so remove it from under it. Also, fix up some related comments. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.476197912@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 3 ++- kernel/sched.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2eedae8498d..23ddd29730f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -260,6 +260,7 @@ enum perf_event_type { /* * struct { * struct perf_event_header header; + * u64 time; * u64 irq_period; * }; */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c10055416de..2f410ea2cb3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2559,7 +2559,8 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * + * Log irq_period changes so that analyzing tools can re-normalize the + * event flow. 
*/ static void perf_log_period(struct perf_counter *counter, u64 period) diff --git a/kernel/sched.c b/kernel/sched.c index 4c0d58bce6b..ad079f07c9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4875,9 +4875,10 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); -- cgit v1.2.3 From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:56 +0200 Subject: perf_counter: Sanitize counter->mutex s/counter->mutex/counter->child_mutex/ and make sure its only used to protect child_list. The usage in __perf_counter_exit_task() doesn't appear to be problematic since ctx->mutex also covers anything related to fd tear-down. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.533186528@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++---- kernel/perf_counter.c | 47 ++++++++++++++++++-------------------------- 2 files changed, 22 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 23ddd29730f..4ab8050eb9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -452,9 +452,6 @@ struct perf_counter { struct perf_counter_context *ctx; struct file *filp; - struct perf_counter *parent; - struct list_head child_list; - /* * These accumulate total time (in nanoseconds) that children * counters have been enabled and running, respectively. @@ -465,7 +462,9 @@ struct perf_counter { /* * Protect attach/detach and child_list: */ - struct mutex mutex; + struct mutex child_mutex; + struct list_head child_list; + struct perf_counter *parent; int oncpu; int cpu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2f410ea2cb3..679c3b5bb7d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -111,6 +111,10 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Add a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -136,7 +140,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) /* * Remove a counter from the lists for its context. - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -276,7 +280,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. 
@@ -1407,11 +1411,7 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; mutex_lock(&ctx->mutex); - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - - mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); free_counter(counter); @@ -1437,7 +1437,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -1446,7 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) return -EINVAL; @@ -1510,11 +1510,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); func(counter); list_for_each_entry(child, &counter->child_list, child_list) func(child); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static void perf_counter_for_each(struct perf_counter *counter, @@ -1522,11 +1522,11 @@ static void perf_counter_for_each(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); perf_counter_for_each_sibling(counter, func); list_for_each_entry(child, &counter->child_list, child_list) perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3106,7 +3106,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, if (!group_leader) group_leader = counter; - mutex_init(&counter->mutex); + mutex_init(&counter->child_mutex); + INIT_LIST_HEAD(&counter->child_list); + INIT_LIST_HEAD(&counter->list_entry); INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); @@ -3114,8 +3116,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); - INIT_LIST_HEAD(&counter->child_list); - counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; @@ -3346,10 +3346,9 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link this into the parent counter's child list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); return child_counter; } @@ -3396,9 +3395,9 @@ static void sync_child_counter(struct perf_counter *child_counter, /* * Remove this counter from the parent's list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); /* * Release the parent counter, if this was the last @@ -3414,17 +3413,9 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; - /* - * Protect against concurrent operations on child_counter - * due its fd getting closed, etc. 
- */ - mutex_lock(&child_counter->mutex); - update_counter_times(child_counter); list_del_counter(child_counter, child_ctx); - mutex_unlock(&child_counter->mutex); - parent_counter = child_counter->parent; /* * It can happen that parent exits first, and has counters -- cgit v1.2.3 From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:00 +0200 Subject: perf_counter: Change pctrl() behaviour Instead of en/dis-abling all counters acting on a particular task, en/dis- able all counters we created. [ v2: fix crash on first counter enable ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.916937244@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 +++++ include/linux/perf_counter.h | 3 ++ include/linux/sched.h | 2 + kernel/perf_counter.c | 87 ++++++++++++-------------------------------- 4 files changed, 39 insertions(+), 63 deletions(-) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d87247d2641..353c0ac7723 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,6 +108,15 @@ extern struct group_info init_groups; extern struct cred init_cred; +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ + .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -171,6 +180,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4ab8050eb9e..4159ee5940f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -469,6 +469,9 @@ struct perf_counter { int oncpu; int cpu; + struct list_head owner_entry; + struct task_struct *owner; + /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9714d450f41..bc9326dcdde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1389,6 +1389,8 @@ struct task_struct { #endif #ifdef CONFIG_PERF_COUNTERS struct perf_counter_context *perf_counter_ctxp; + struct mutex perf_counter_mutex; + struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e97f896133..4c86a636976 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1076,79 +1076,26 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } -int perf_counter_task_disable(void) +int perf_counter_task_enable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) { - update_group_times(counter); - counter->state = 
PERF_COUNTER_STATE_OFF; - } - } - - perf_enable(); - - spin_unlock_irqrestore(&ctx->lock, flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_enable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } -int perf_counter_task_enable(void) +int perf_counter_task_disable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - int cpu; - - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - cpu = smp_processor_id(); - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state > PERF_COUNTER_STATE_OFF) - continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - counter->hw_event.disabled = 0; - } - perf_enable(); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(curr, cpu); - - local_irq_restore(flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_disable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } @@ -1416,6 +1363,11 @@ static int perf_release(struct inode *inode, struct file *file) perf_counter_remove_from_context(counter); mutex_unlock(&ctx->mutex); + mutex_lock(&counter->owner->perf_counter_mutex); + list_del_init(&counter->owner_entry); + mutex_unlock(&counter->owner->perf_counter_mutex); + put_task_struct(counter->owner); + free_counter(counter); put_context(ctx); @@ -3272,6 +3224,12 @@ SYSCALL_DEFINE5(perf_counter_open, perf_install_in_context(ctx, counter, cpu); mutex_unlock(&ctx->mutex); + counter->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_counter_mutex); + list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); + mutex_unlock(¤t->perf_counter_mutex); + fput_light(counter_file, fput_needed2); out_fput: @@ -3488,6 +3446,9 @@ void perf_counter_init_task(struct task_struct *child) child->perf_counter_ctxp = NULL; + mutex_init(&child->perf_counter_mutex); + INIT_LIST_HEAD(&child->perf_counter_list); + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. -- cgit v1.2.3 From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:01 +0200 Subject: perf_counter: Remove perf_counter_context::nr_enabled now that pctrl() no longer disables other people's counters, remove the PMU cache code that deals with that. 
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163013.032998331@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 11 +---------- 2 files changed, 1 insertion(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4159ee5940f..2ddf5e3c551 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -516,7 +516,6 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; - int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c86a636976..cb4062559b4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -134,8 +134,6 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled++; } /* @@ -150,8 +148,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -406,7 +402,6 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -448,7 +443,6 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -759,7 +753,6 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -850,7 +843,6 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -910,8 +902,7 @@ static int context_equiv(struct perf_counter_context *ctx1, struct perf_counter_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && ctx1->nr_enabled == ctx2->nr_enabled; + && ctx1->parent_gen == ctx2->parent_gen; } /* -- cgit v1.2.3 From e527ea312f31e88a7fa5472b71db71c565b0d44f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:25 +0200 Subject: perf_counter: Remove unused ABI bits extra_config_len isn't used for anything, remove it. 
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.116035832@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2ddf5e3c551..b1f2bac09f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -154,11 +154,11 @@ struct perf_counter_hw_event { __reserved_1 : 51; - __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ + __u32 __reserved_2; - __u64 __reserved_2; __u64 __reserved_3; + __u64 __reserved_4; }; /* -- cgit v1.2.3 From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:27 +0200 Subject: perf_counter: Propagate inheritance failures down the fork() path Fail fork() when we fail inheritance for some reason (-ENOMEM most likely). Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.324656474@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/fork.c | 6 +++++- kernel/perf_counter.c | 20 ++++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b1f2bac09f9..d3e85de9bf1 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -566,7 +566,7 @@ extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *child); +extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); @@ -631,7 +631,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } diff --git a/kernel/fork.c b/kernel/fork.c index 675e01e9072..c07c3335cea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1095,7 +1095,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. 
*/ sched_fork(p, clone_flags); - perf_counter_init_task(p); + + retval = perf_counter_init_task(p); + if (retval) + goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; @@ -1295,6 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: + perf_counter_exit_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 217dbcce2eb..7a7a144870e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3434,18 +3434,23 @@ again: /* * Initialize the perf_counter context in task_struct */ -void perf_counter_init_task(struct task_struct *child) +int perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; + int ret = 0; child->perf_counter_ctxp = NULL; mutex_init(&child->perf_counter_mutex); INIT_LIST_HEAD(&child->perf_counter_list); + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) + return 0; + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. @@ -3454,11 +3459,7 @@ void perf_counter_init_task(struct task_struct *child) child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); if (!child_ctx) - return; - - parent_ctx = parent->perf_counter_ctxp; - if (likely(!parent_ctx || !parent_ctx->nr_counters)) - return; + return -ENOMEM; __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; @@ -3482,8 +3483,9 @@ void perf_counter_init_task(struct task_struct *child) continue; } - if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) { + ret = inherit_group(counter, parent, parent_ctx, + child, child_ctx); + if (ret) { inherited_all = 0; break; } @@ -3505,6 +3507,8 @@ void perf_counter_init_task(struct task_struct *child) } mutex_unlock(&parent_ctx->mutex); + + return ret; } static void __cpuinit perf_counter_init_cpu(int cpu) -- cgit v1.2.3 From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++---------------------------------- include/linux/perf_counter.h | 2 -- 3 files changed, 5 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82..89b63b5fad3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d..8c8177f859f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - /* * This 
handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d3e85de9bf1..0c160be2078 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); -extern void perf_counter_unthrottle(void); extern void __perf_disable(void); extern bool __perf_enable(void); extern void perf_disable(void); @@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -static inline void perf_counter_unthrottle(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } -- cgit v1.2.3 From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. 
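In rough outline — this is a condensed sketch of the logic in the patch below, not additional patch text; locking and the non-throttling fallback are omitted — the overflow path latches the counter's per-tick interrupt count at MAX_INTERRUPTS once it crosses sysctl_perf_counter_limit, and the regular timer tick undoes that through the new pmu->unthrottle() method:

	/* overflow path (perf_counter_overflow), sketch only: */
	if (counter->pmu->unthrottle &&
	    counter->hw.interrupts != MAX_INTERRUPTS) {
		counter->hw.interrupts++;
		if (HZ * counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
			counter->hw.interrupts = MAX_INTERRUPTS;
			perf_log_throttle(counter, 0);
			ret = 1;	/* ask the caller to quick-disable */
		}
	}

	/* tick path (perf_adjust_freq), sketch only: */
	if (interrupts == MAX_INTERRUPTS) {
		perf_log_throttle(counter, 1);
		counter->pmu->unthrottle(counter);
	}
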
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++++ include/linux/perf_counter.h | 11 +++++++ kernel/perf_counter.c | 59 +++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 8 ++++++ 4 files changed, 87 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f859f..c4b543d1a86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0c160be2078..e3a7585d3e4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -266,6 +266,15 @@ enum perf_event_type { */ PERF_EVENT_PERIOD = 4, + /* + * struct { + * struct perf_event_header header; + * u64 time; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -367,6 +376,7 @@ struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); + void (*unthrottle) (struct perf_counter *counter); }; /** @@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_limit; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 14b1fe98483..ec9c4007a7f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 irq_period; + u64 interrupts, irq_period; u64 events, period; s64 delta; @@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; + interrupts = counter->hw.interrupts; + counter->hw.interrupts = 
0; + + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(counter, 1); + counter->pmu->unthrottle(counter); + interrupts = 2*sysctl_perf_counter_limit/HZ; + } + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) continue; - events = HZ * counter->hw.interrupts * counter->hw.irq_period; + events = HZ * interrupts * counter->hw.irq_period; period = div64_u64(events, counter->hw_event.irq_freq); delta = (s64)(1 + period - counter->hw.irq_period); @@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) perf_log_period(counter, irq_period); counter->hw.irq_period = irq_period; - counter->hw.interrupts = 0; } spin_unlock(&ctx->lock); } @@ -2543,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period) perf_output_end(&handle); } +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + } throttle_event = { + .header = { + .type = PERF_EVENT_THROTTLE + 1, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = sched_clock(), + }; + + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ @@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); + int throttle = counter->pmu->unthrottle != NULL; int ret = 0; - counter->hw.interrupts++; + if (!throttle) { + counter->hw.interrupts++; + } else if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } /* * XXX event_limit might not quite work as expected on inherited diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3cb1849f598..0c4bf863afa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_int_limit", + .data = &sysctl_perf_counter_limit, + .maxlen = sizeof(sysctl_perf_counter_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read -- cgit v1.2.3 From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 22:03:26 +0200 Subject: perf_counter: fix warning & lockup - remove bogus warning - fix wakeup from NMI path lockup - also fix up whitespace noise in perf_counter.h Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 78 ++++++++++++++++++++++---------------------- kernel/perf_counter.c | 4 +-- 2 files changed, 40 insertions(+), 42 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e3a7585d3e4..2b16ed37b74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,7 +73,7 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ +#define __PERF_COUNTER_MASK(name) \ (((1ULL << 
PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -98,14 +98,14 @@ enum sw_event_ids { * in the overflow packets. */ enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -235,13 +235,13 @@ enum perf_event_type { * correlate userspace IPs to code. They have the following structure: * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; * }; */ PERF_EVENT_MMAP = 1, @@ -249,27 +249,27 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * char comm[]; + * u32 pid, tid; + * char comm[]; * }; */ PERF_EVENT_COMM = 3, /* * struct { - * struct perf_event_header header; - * u64 time; - * u64 irq_period; + * struct perf_event_header header; + * u64 time; + * u64 irq_period; * }; */ PERF_EVENT_PERIOD = 4, /* * struct { - * struct perf_event_header header; - * u64 time; + * struct perf_event_header header; + * u64 time; * }; */ PERF_EVENT_THROTTLE = 5, @@ -280,23 +280,23 @@ enum perf_event_type { * will be PERF_RECORD_* * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * { u64 ip; } && PERF_RECORD_IP - * { u32 pid, tid; } && PERF_RECORD_TID - * { u64 time; } && PERF_RECORD_TIME - * { u64 addr; } && PERF_RECORD_ADDR - * { u64 config; } && PERF_RECORD_CONFIG - * { u32 cpu, res; } && PERF_RECORD_CPU + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * - * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * - * { u16 nr, - * hv, - * kernel, - * user; - * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * }; */ }; @@ -406,7 +406,7 @@ struct perf_mmap_data { atomic_t wakeup; /* needs a wakeup */ struct perf_counter_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[0]; }; struct perf_pending_entry { @@ -422,7 +422,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; - int nr_siblings; + int nr_siblings; struct perf_counter *group_leader; const struct pmu *pmu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ec9c4007a7f..070f92d3232 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2576,7 +2576,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .time = sched_clock(), }; - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; @@ -3449,8 +3449,6 @@ void perf_counter_exit_task(struct task_struct *child) struct 
perf_counter_context *child_ctx; unsigned long flags; - WARN_ON_ONCE(child != current); - child_ctx = child->perf_counter_ctxp; if (likely(!child_ctx)) -- cgit v1.2.3 From d3e78ee3d015dac1794433abb6403b6fc8e70e10 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 May 2009 11:41:50 +0200 Subject: perf_counter: Fix perf_counter_init_task() on !CONFIG_PERF_COUNTERS Pointed out by compiler warnings: tip/include/linux/perf_counter.h:644: warning: no return statement in function returning non-void Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b16ed37b74..a65ddc58051 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -641,7 +641,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline int perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { return 0; } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -- cgit v1.2.3 From c93f7669098eb97c5376e5396e3dfb734c17df4f Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 28 May 2009 22:18:17 +1000 Subject: perf_counter: Fix race in attaching counters to tasks and exiting Commit 564c2b21 ("perf_counter: Optimize context switch between identical inherited contexts") introduced a race where it is possible that a counter being attached to a task could get attached to the wrong task, if the task is one that has inherited its context from another task via fork. This happens because the optimized context switch could switch the context to another task after find_get_context has read task->perf_counter_ctxp. In fact, it's possible that the context could then get freed, if the other task then exits. This fixes the problem by protecting both the context switch and the critical code in find_get_context with spinlocks. The context switch locks the cxt->lock of both the outgoing and incoming contexts before swapping them. That means that once code such as find_get_context has obtained the spinlock for the context associated with a task, the context can't get swapped to another task. However, the context may have been swapped in the interval between reading task->perf_counter_ctxp and getting the lock, so it is necessary to check and retry. To make sure that none of the contexts being looked at in find_get_context can get freed, this changes the context freeing code to use RCU. Thus an rcu_read_lock() is sufficient to ensure that no contexts can get freed. This part of the patch is lifted from a patch posted by Peter Zijlstra. This also adds a check to make sure that we can't add a counter to a task that is exiting. There is also a race between perf_counter_exit_task and find_get_context; this solves the race by moving the get_ctx that was in perf_counter_alloc into the locked region in find_get_context, so that once find_get_context has got the context for a task, it won't get freed even if the task calls perf_counter_exit_task. 
It doesn't matter if new top-level (non-inherited) counters get attached to the context after perf_counter_exit_task has detached the context from the task. They will just stay there and never get scheduled in until the counters' fds get closed, and then perf_release will remove them from the context and eventually free the context. With this, we are now doing the unclone in find_get_context rather than when a counter was added to or removed from a context (actually, we were missing the unclone_ctx() call when adding a counter to a context). We don't need to unclone when removing a counter from a context because we have no way to remove a counter from a cloned context. This also takes out the smp_wmb() in find_get_context, which Peter Zijlstra pointed out was unnecessary because the cmpxchg implies a full barrier anyway. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Mike Galbraith Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +- kernel/perf_counter.c | 205 ++++++++++++++++++++++++++++++------------- 2 files changed, 148 insertions(+), 62 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a65ddc58051..717bf3b59ba 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -541,8 +541,9 @@ struct perf_counter_context { * been cloned (inherited) from a common ancestor. */ struct perf_counter_context *parent_ctx; - u32 parent_gen; - u32 generation; + u64 parent_gen; + u64 generation; + struct rcu_head rcu_head; }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 367299f91aa..52e5a15321d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx) atomic_inc(&ctx->refcount); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_counter_context *ctx; + + ctx = container_of(head, struct perf_counter_context, rcu_head); + kfree(ctx); +} + static void put_ctx(struct perf_counter_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - kfree(ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -211,22 +221,6 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } -/* - * Mark this context as not being a clone of another. - * Called when counters are added to or removed from this context. - * We also increment our generation number so that anything that - * was cloned from this context before this will not match anything - * cloned from this context after this. - */ -static void unclone_ctx(struct perf_counter_context *ctx) -{ - ++ctx->generation; - if (!ctx->parent_ctx) - return; - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; -} - /* * Cross CPU call to remove a performance counter * @@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info) * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. 
+ * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. */ static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; - unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info) /* * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. */ static void perf_counter_disable(struct perf_counter *counter) { @@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info) /* * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. */ static void perf_counter_enable(struct perf_counter *counter) { @@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; struct perf_counter_context *next_ctx; + struct perf_counter_context *parent; struct pt_regs *regs; + int do_switch = 1; regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); @@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task, return; update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); next_ctx = next->perf_counter_ctxp; - if (next_ctx && context_equiv(ctx, next_ctx)) { - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - return; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. 
+ */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); } + rcu_read_unlock(); - __perf_counter_sched_out(ctx, cpuctx); - - cpuctx->task_ctx = NULL; + if (do_switch) { + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } } static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) @@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx, ctx->task = task; } -static void put_context(struct perf_counter_context *ctx) -{ - if (ctx->task) - put_task_struct(ctx->task); -} - static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct perf_counter_context *tctx; + struct perf_counter_context *parent_ctx; struct task_struct *task; + int err; /* * If cpu is not a wildcard then this is a percpu counter: @@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; + get_ctx(ctx); return ctx; } @@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); + /* + * Can't attach counters to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + /* Reuse ptrace permission checks for now. */ - if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_task_struct(task); - return ERR_PTR(-EACCES); + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry_lock: + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more and we can + * unclone it if necessary. + * Once it's not a clone things will be stable. + */ + spin_lock_irq(&ctx->lock); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + parent_ctx = ctx->parent_ctx; + if (parent_ctx) { + put_ctx(parent_ctx); + ctx->parent_ctx = NULL; /* no longer a clone */ + } + ++ctx->generation; + /* + * Get an extra reference before dropping the lock so that + * this context won't get freed if the task exits. + */ + get_ctx(ctx); + spin_unlock_irq(&ctx->lock); } + rcu_read_unlock(); - ctx = task->perf_counter_ctxp; if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!ctx) { - put_task_struct(task); - return ERR_PTR(-ENOMEM); - } + err = -ENOMEM; + if (!ctx) + goto errout; __perf_counter_init_context(ctx, task); - /* - * Make sure other cpus see correct values for *ctx - * once task->perf_counter_ctxp is visible to them. - */ - smp_wmb(); - tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); - if (tctx) { + get_ctx(ctx); + if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. 
*/ kfree(ctx); - ctx = tctx; + goto retry_lock; } + get_task_struct(task); } + put_task_struct(task); return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); } static void free_counter_rcu(struct rcu_head *head) @@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); - put_ctx(counter->ctx); kfree(counter); } @@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter) if (counter->destroy) counter->destroy(counter); + put_ctx(counter->ctx); call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file) put_task_struct(counter->owner); free_counter(counter); - put_context(ctx); return 0; } @@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, mutex_unlock(&ctx->mutex); } +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. + */ static void perf_counter_for_each_child(struct perf_counter *counter, void (*func)(struct perf_counter *)) { @@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->ctx = ctx; counter->oncpu = -1; - get_ctx(ctx); - counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -3290,7 +3370,7 @@ err_free_put_context: kfree(counter); err_put_context: - put_context(ctx); + put_ctx(ctx); goto out_fput; } @@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) return child_counter; + get_ctx(child_ctx); /* * Make the child state follow the state of the parent counter, @@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child, /* * When a child task exits, feed back counter values to parent counters. - * - * Note: we may be running in child context, but the PID is not hashed - * anymore so new counters will not be added. - * (XXX not sure that is true when we get called from flush_old_exec. - * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { @@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child) local_irq_save(flags); __perf_counter_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_counter_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); child->perf_counter_ctxp = NULL; + spin_unlock(&child_ctx->lock); local_irq_restore(flags); mutex_lock(&child_ctx->mutex); @@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child) __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; + get_task_struct(child); /* * Lock the parent list. No need to lock the child - not PID -- cgit v1.2.3 From bbbee90829304d156c12b171c0ac7e6e1aba8b90 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 May 2009 14:25:58 +0200 Subject: perf_counter: Ammend cleanup in fork() fail When fork() fails we cannot use perf_counter_exit_task() since that assumes to operate on current. Write a new helper that cleans up unused/clean contexts. 
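In outline, the new helper — sketched here in condensed form; the complete version in the diff below additionally unlinks each counter from its parent's child list and drops the parent's file reference — walks the child's freshly inherited context and frees it without the assumptions perf_counter_exit_task() makes about current:

	/* condensed sketch of perf_counter_free_task(): */
	void perf_counter_free_task(struct task_struct *task)
	{
		struct perf_counter_context *ctx = task->perf_counter_ctxp;
		struct perf_counter *counter, *tmp;

		if (!ctx)
			return;

		mutex_lock(&ctx->mutex);
		list_for_each_entry_safe(counter, tmp,
					 &ctx->counter_list, list_entry) {
			list_del_counter(counter, ctx);
			free_counter(counter);
		}
		mutex_unlock(&ctx->mutex);

		put_ctx(ctx);
	}
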
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/fork.c | 2 +- kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 43 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 717bf3b59ba..519a41bba24 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -579,6 +579,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); +extern void perf_counter_free_task(struct task_struct *task); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void __perf_disable(void); @@ -644,6 +645,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline int perf_counter_init_task(struct task_struct *child) { return 0; } static inline void perf_counter_exit_task(struct task_struct *child) { } +static inline void perf_counter_free_task(struct task_struct *task) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_disable(void) { } diff --git a/kernel/fork.c b/kernel/fork.c index c07c3335cea..23bf757ed32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1298,7 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_exit_task(p); + perf_counter_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0c000d305e0..79c3f26541d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3538,8 +3538,7 @@ static void sync_child_counter(struct perf_counter *child_counter, } static void -__perf_counter_exit_task(struct task_struct *child, - struct perf_counter *child_counter, +__perf_counter_exit_task(struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; @@ -3605,7 +3604,7 @@ void perf_counter_exit_task(struct task_struct *child) again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) - __perf_counter_exit_task(child, child_counter, child_ctx); + __perf_counter_exit_task(child_counter, child_ctx); /* * If the last counter was a group counter, it will have appended all @@ -3620,6 +3619,44 @@ again: put_ctx(child_ctx); } +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. 
+ */ +void perf_counter_free_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter *counter, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { + struct perf_counter *parent = counter->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&counter->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_counter(counter, ctx); + free_counter(counter); + } + + if (!list_empty(&ctx->counter_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + /* * Initialize the perf_counter context in task_struct */ -- cgit v1.2.3 From 25346b93ca079080c9cb23331db5c4f6404e8530 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:48:12 +1000 Subject: perf_counter: Provide functions for locking and pinning the context for a task This abstracts out the code for locking the context associated with a task. Because the context might get transferred from one task to another concurrently, we have to check after locking the context that it is still the right context for the task and retry if not. This was open-coded in find_get_context() and perf_counter_init_task(). This adds a further function for pinning the context for a task, i.e. marking it so it can't be transferred to another task. This adds a 'pin_count' field to struct perf_counter_context to indicate that a context is pinned, instead of the previous method of setting the parent_gen count to all 1s. Pinning the context with a pin_count is easier to undo and doesn't require saving the parent_gen value. This also adds a perf_unpin_context() to undo the effect of perf_pin_task_context() and changes perf_counter_init_task to use it. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.34748.755674.596386@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 128 +++++++++++++++++++++++++------------------ 2 files changed, 75 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 519a41bba24..81ec79c9f19 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -543,6 +543,7 @@ struct perf_counter_context { struct perf_counter_context *parent_ctx; u64 parent_gen; u64 generation; + int pin_count; struct rcu_head rcu_head; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 79c3f26541d..da8dfef4b47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -122,6 +122,69 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context *perf_lock_task_context( + struct task_struct *task, unsigned long *flags) +{ + struct perf_counter_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_counter_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_counter_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. 
Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_counter_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + get_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + /* * Add a counter from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1, { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen - && ctx1->parent_gen != ~0ull; + && !ctx1->pin_count && !ctx2->pin_count; } /* @@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) struct perf_counter_context *ctx; struct perf_counter_context *parent_ctx; struct task_struct *task; + unsigned long flags; int err; /* @@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry_lock: - rcu_read_lock(); retry: - ctx = rcu_dereference(task->perf_counter_ctxp); + ctx = perf_lock_task_context(task, &flags); if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more and we can - * unclone it if necessary. - * Once it's not a clone things will be stable. - */ - spin_lock_irq(&ctx->lock); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } parent_ctx = ctx->parent_ctx; if (parent_ctx) { put_ctx(parent_ctx); @@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) * this context won't get freed if the task exits. */ get_ctx(ctx); - spin_unlock_irq(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } - rcu_read_unlock(); if (!ctx) { ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); @@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) * the context they set. 
*/ kfree(ctx); - goto retry_lock; + goto retry; } get_task_struct(task); } @@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; - u64 cloned_gen; int ret = 0; child->perf_counter_ctxp = NULL; @@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child) get_task_struct(child); /* - * If the parent's context is a clone, temporarily set its - * parent_gen to an impossible value (all 1s) so it won't get - * swapped under us. The rcu_read_lock makes sure that - * parent_ctx continues to exist even if it gets swapped to - * another process and then freed while we are trying to get - * its lock. + * If the parent's context is a clone, pin it so it won't get + * swapped under us. */ - rcu_read_lock(); - retry: - parent_ctx = rcu_dereference(parent->perf_counter_ctxp); + parent_ctx = perf_pin_task_context(parent); + /* * No need to check if parent_ctx != NULL here; since we saw * it non-NULL earlier, the only reason for it to become NULL * is if we exit, and since we're currently in the middle of * a fork we can't be exiting at the same time. */ - spin_lock_irq(&parent_ctx->lock); - if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) { - spin_unlock_irq(&parent_ctx->lock); - goto retry; - } - cloned_gen = parent_ctx->parent_gen; - if (parent_ctx->parent_ctx) - parent_ctx->parent_gen = ~0ull; - spin_unlock_irq(&parent_ctx->lock); - rcu_read_unlock(); /* * Lock the parent list. No need to lock the child - not PID @@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child) cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); if (cloned_ctx) { child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = cloned_gen; + child_ctx->parent_gen = parent_ctx->parent_gen; } else { child_ctx->parent_ctx = parent_ctx; child_ctx->parent_gen = parent_ctx->generation; @@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child) mutex_unlock(&parent_ctx->mutex); - /* - * Restore the clone status of the parent. 
- */ - if (parent_ctx->parent_ctx) { - spin_lock_irq(&parent_ctx->lock); - if (parent_ctx->parent_ctx) - parent_ctx->parent_gen = cloned_gen; - spin_unlock_irq(&parent_ctx->lock); - } + perf_unpin_context(parent_ctx); return ret; } -- cgit v1.2.3 From 22a4f650d686eeaac3629dae1c4294381485efdf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Jun 2009 10:13:37 +0200 Subject: perf_counter: Tidy up style details - whitespace fixlets - make local variable definitions more consistent [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81ec79c9f19..0e57d8cc5a3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -562,7 +562,7 @@ struct perf_cpu_context { * * task, softirq, irq, nmi context */ - int recursion[4]; + int recursion[4]; }; #ifdef CONFIG_PERF_COUNTERS diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff8b4636f84..df319c48c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -16,8 +16,9 @@ #include #include #include -#include +#include #include +#include #include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include @@ -65,7 +65,9 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } -int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { @@ -127,8 +129,8 @@ static void put_ctx(struct perf_counter_context *ctx) * This has to cope with with the fact that until it is locked, * the context could get moved to another task. 
*/ -static struct perf_counter_context *perf_lock_task_context( - struct task_struct *task, unsigned long *flags) +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) { struct perf_counter_context *ctx; @@ -1330,9 +1332,9 @@ __perf_counter_init_context(struct perf_counter_context *ctx, static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; struct perf_counter_context *parent_ctx; + struct perf_counter_context *ctx; + struct perf_cpu_context *cpuctx; struct task_struct *task; unsigned long flags; int err; @@ -1664,8 +1666,8 @@ int perf_counter_task_disable(void) */ void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_mmap_data *data; struct perf_counter_mmap_page *userpg; + struct perf_mmap_data *data; rcu_read_lock(); data = rcu_dereference(counter->data); @@ -1769,10 +1771,11 @@ fail: static void __perf_mmap_data_free(struct rcu_head *rcu_head) { - struct perf_mmap_data *data = container_of(rcu_head, - struct perf_mmap_data, rcu_head); + struct perf_mmap_data *data; int i; + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) free_page((unsigned long)data->data_pages[i]); @@ -1801,8 +1804,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_counter *counter = vma->vm_file->private_data; WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, - &counter->mmap_mutex)) { + if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { struct user_struct *user = current_user(); atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); @@ -1821,11 +1823,11 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); + unsigned long locked, lock_limit; unsigned long vma_size; unsigned long nr_pages; - unsigned long user_locked, user_lock_limit; - unsigned long locked, lock_limit; long user_extra, extra; int ret = 0; @@ -1900,8 +1902,8 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { - struct perf_counter *counter = filp->private_data; struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_counter *counter = filp->private_data; int retval; mutex_lock(&inode->i_mutex); @@ -2412,8 +2414,8 @@ static void perf_counter_output(struct perf_counter *counter, */ struct perf_comm_event { - struct task_struct *task; - char *comm; + struct task_struct *task; + char *comm; int comm_size; struct { @@ -2932,6 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3526,7 +3529,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. + * so we won't race with perf_counter_{en, dis}able_family. 
*/ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) child_counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3 From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:52:30 +1000 Subject: perf_counter: Fix cpu migration counter This fixes the cpu migration software counter to count correctly even when contexts get swapped from one task to another. Previously the cpu migration counts reported by perf stat were bogus, ranging from negative to several thousand for a single "lat_ctx 2 8 32" run. With this patch the cpu migration count reported for "lat_ctx 2 8 32" is almost always between 35 and 44. This fixes the problem by adding a call into the perf_counter code from set_task_cpu when tasks are migrated. This enables us to use the generic swcounter code (with some modifications) for the cpu migration counter. This modifies the swcounter code to allow a NULL regs pointer to be passed in to perf_swcounter_ctx_event() etc. The cpu migration counter does this because there isn't necessarily a pt_regs struct for the task available. In this case, the counter will not have interrupt capability - but the migration counter didn't have interrupt capability before, so this is no loss. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++ kernel/perf_counter.c | 74 +++++++++++++------------------------------- kernel/sched.c | 1 + 3 files changed, 26 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e57d8cc5a3..deb9acf9ad2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len, extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_task_migration(struct task_struct *task, int cpu); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len, static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } +static inline void perf_counter_task_migration(struct task_struct *task, + int cpu) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d2653f137e..cd94cf3bf9e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2921,11 +2921,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->hw_event.config != event_config) return 0; - if (counter->hw_event.exclude_user && user_mode(regs)) - return 0; + if (regs) { + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) - return 0; + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + } return 1; } @@ -2935,7 +2937,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) + if (counter->hw.irq_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3151,55 +3153,24 @@ static const struct pmu perf_ops_task_clock = { /* * Software counter: cpu migrations */ - -static inline u64 get_cpu_migrations(struct 
perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->se.nr_migrations; - return cpu_nr_migrations(smp_processor_id()); -} - -static void cpu_migrations_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +void perf_counter_task_migration(struct task_struct *task, int cpu) { - cpu_migrations_perf_counter_update(counter); -} + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx; -static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_cpu_migrations(counter)); - return 0; -} + perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_CPU_MIGRATIONS, + 1, 1, NULL, 0); -static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) -{ - cpu_migrations_perf_counter_update(counter); + ctx = perf_pin_task_context(task); + if (ctx) { + perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, + PERF_COUNT_CPU_MIGRATIONS, + 1, 1, NULL, 0); + perf_unpin_context(ctx); + } } -static const struct pmu perf_ops_cpu_migrations = { - .enable = cpu_migrations_perf_counter_enable, - .disable = cpu_migrations_perf_counter_disable, - .read = cpu_migrations_perf_counter_read, -}; - #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { @@ -3272,11 +3243,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - pmu = &perf_ops_generic; - break; case PERF_COUNT_CPU_MIGRATIONS: - if (!counter->hw_event.exclude_kernel) - pmu = &perf_ops_cpu_migrations; + pmu = &perf_ops_generic; break; } diff --git a/kernel/sched.c b/kernel/sched.c index 3226cc132e9..8d43347a0c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif + perf_counter_task_migration(p, new_cpu); } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; -- cgit v1.2.3 From bf4e0ed3d027ce581be18496036862131b5f32aa Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 1 Jun 2009 17:53:16 +1000 Subject: perf_counter: Remove unused prev_state field This removes the prev_state field of struct perf_counter since it is now unused. It was only used by the cpu migration counter, which doesn't use it any more. 
Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <18979.35052.915728.626374@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 4 ---- 2 files changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index deb9acf9ad2..d970fbc16af 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -427,7 +427,6 @@ struct perf_counter { const struct pmu *pmu; enum perf_counter_active_state state; - enum perf_counter_active_state prev_state; atomic64_t count; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index cd94cf3bf9e..fbed4d28ad7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -572,7 +572,6 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; - group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -580,7 +579,6 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -657,7 +655,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; counter->tstamp_stopped = ctx->time; @@ -820,7 +817,6 @@ static void __perf_counter_enable(void *info) ctx->is_active = 1; update_context_time(ctx); - counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; -- cgit v1.2.3 From 709e50cf870e61745b39552044aa6c7c38e4f9e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 14:13:15 +0200 Subject: perf_counter: Use PID namespaces properly Stop using task_struct::pid and start using PID namespaces. PIDs will be reported in the PID namespace of the monitoring task at the moment of counter creation. Signed-off-by: Peter Zijlstra Cc: Eric W. 
Biederman Cc: Oleg Nesterov Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 42 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d970fbc16af..9ec20fc6bd3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -500,6 +501,8 @@ struct perf_counter { void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; + + struct pid_namespace *ns; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fbed4d28ad7..caa012cfe49 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1432,6 +1432,8 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + if (counter->ns) + put_pid_ns(counter->ns); kfree(counter); } @@ -2267,6 +2269,28 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ + /* + * only top level counters have the pid namespace they were created in + */ + if (counter->parent) + counter = counter->parent; + + return task_pid_nr_ns(p, counter->ns); +} + static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { @@ -2303,8 +2327,8 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) { /* namespace issues */ - tid_entry.pid = current->group_leader->pid; - tid_entry.tid = current->pid; + tid_entry.pid = perf_counter_pid(counter, current); + tid_entry.tid = perf_counter_tid(counter, current); header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); @@ -2432,6 +2456,9 @@ static void perf_counter_comm_output(struct perf_counter *counter, if (ret) return; + comm_event->event.pid = perf_counter_pid(counter, comm_event->task); + comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + perf_output_put(&handle, comm_event->event); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); @@ -2504,8 +2531,6 @@ void perf_counter_comm(struct task_struct *task) .task = task, .event = { .header = { .type = PERF_EVENT_COMM, }, - .pid = task->group_leader->pid, - .tid = task->pid, }, }; @@ -2542,6 +2567,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter, if (ret) return; + mmap_event->event.pid = perf_counter_pid(counter, current); + mmap_event->event.tid = perf_counter_tid(counter, current); + perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -2641,8 +2669,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, .file = file, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, .start = addr, .len = len, .pgoff = pgoff, @@ -2664,8 +2690,6 @@ void 
perf_counter_munmap(unsigned long addr, unsigned long len, .file = file, .event = { .header = { .type = PERF_EVENT_MUNMAP, }, - .pid = current->group_leader->pid, - .tid = current->pid, .start = addr, .len = len, .pgoff = pgoff, @@ -3445,6 +3469,8 @@ SYSCALL_DEFINE5(perf_counter_open, list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); mutex_unlock(¤t->perf_counter_mutex); + counter->ns = get_pid_ns(current->nsproxy->pid_ns); + fput_light(counter_file, fput_needed2); out_fput: -- cgit v1.2.3 From 53e111a730ea8b002d57dd226098c12789993329 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 17:01:58 +0200 Subject: x86: Fix atomic_long_xchg() on 64bit Apparently I'm the first to use it :-) Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/asm-generic/atomic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 3673a13b670..81d3be459ef 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u) #define atomic_long_cmpxchg(l, old, new) \ (atomic64_cmpxchg((atomic64_t *)(l), (old), (new))) #define atomic_long_xchg(v, new) \ - (atomic64_xchg((atomic64_t *)(l), (new))) + (atomic64_xchg((atomic64_t *)(v), (new))) #else /* BITS_PER_LONG == 64 */ -- cgit v1.2.3 From 8e5799b1ad2a0567fdfaaf0e91b40efee010f2c1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:08:15 +0200 Subject: perf_counter: Add unique counter id Stephan raised the issue that we currently cannot distinguish between similar counters within a group (PERF_RECORD_GROUP uses the config value as identifier). Therefore, generate a new ID for each counter using a global u64 sequence counter. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++++--- kernel/perf_counter.c | 9 +++++++-- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9ec20fc6bd3..4845a214b9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,8 +114,9 @@ enum perf_counter_record_format { * in increasing order of bit value, after the counter value. 
*/ enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1, - PERF_FORMAT_TOTAL_TIME_RUNNING = 2, + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, }; /* @@ -290,7 +291,7 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 id, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, @@ -503,6 +504,7 @@ struct perf_counter { struct rcu_head rcu_head; struct pid_namespace *ns; + u64 id; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index caa012cfe49..978ecfcc7aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1510,6 +1510,8 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); + if (counter->hw_event.read_format & PERF_FORMAT_ID) + values[n++] = counter->id; mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -2303,7 +2305,7 @@ static void perf_counter_output(struct perf_counter *counter, u32 pid, tid; } tid_entry; struct { - u64 event; + u64 id; u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; @@ -2416,7 +2418,7 @@ static void perf_counter_output(struct perf_counter *counter, if (sub != counter) sub->pmu->read(sub); - group_entry.event = sub->hw_event.config; + group_entry.id = sub->id; group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -3375,6 +3377,8 @@ done: return counter; } +static atomic64_t perf_counter_id; + /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * @@ -3470,6 +3474,7 @@ SYSCALL_DEFINE5(perf_counter_open, mutex_unlock(¤t->perf_counter_mutex); counter->ns = get_pid_ns(current->nsproxy->pid_ns); + counter->id = atomic64_inc_return(&perf_counter_id); fput_light(counter_file, fput_needed2); -- cgit v1.2.3 From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:13:03 +0200 Subject: perf_counter: Rename various fields A few renames: s/irq_period/sample_period/ s/irq_freq/sample_freq/ s/PERF_RECORD_/PERF_SAMPLE_/ s/record_type/sample_type/ And change both the new sample_type and read_format to u64. 
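To make the renamed sample_type bits concrete, below is a small user-space sketch (not part of the patch) that computes how many bytes the fixed-width portion of one overflow record occupies after struct perf_event_header, following the field order used by perf_counter_output() in this series. The enum values are copied from the hunk above; PERF_SAMPLE_GROUP and PERF_SAMPLE_CALLCHAIN are variable-length and deliberately left out.

#include <stddef.h>
#include <stdint.h>

/* Bit values copied from enum perf_counter_sample_format above. */
enum {
	PERF_SAMPLE_IP        = 1U << 0,
	PERF_SAMPLE_TID       = 1U << 1,
	PERF_SAMPLE_TIME      = 1U << 2,
	PERF_SAMPLE_ADDR      = 1U << 3,
	PERF_SAMPLE_GROUP     = 1U << 4,
	PERF_SAMPLE_CALLCHAIN = 1U << 5,
	PERF_SAMPLE_CONFIG    = 1U << 6,
	PERF_SAMPLE_CPU       = 1U << 7,
};

/*
 * Sketch: size of the fixed-width sample fields that follow the event
 * header, in the order perf_counter_output() emits them.  The GROUP and
 * CALLCHAIN sections depend on the number of siblings / call-chain
 * entries and are not covered here.
 */
static size_t sample_fixed_body_size(uint64_t sample_type)
{
	size_t size = 0;

	if (sample_type & PERF_SAMPLE_IP)
		size += sizeof(uint64_t);	/* instruction pointer */
	if (sample_type & PERF_SAMPLE_TID)
		size += 2 * sizeof(uint32_t);	/* { u32 pid, tid; }   */
	if (sample_type & PERF_SAMPLE_TIME)
		size += sizeof(uint64_t);	/* timestamp           */
	if (sample_type & PERF_SAMPLE_ADDR)
		size += sizeof(uint64_t);	/* data address        */
	if (sample_type & PERF_SAMPLE_CONFIG)
		size += sizeof(uint64_t);	/* counter config      */
	if (sample_type & PERF_SAMPLE_CPU)
		size += 2 * sizeof(uint32_t);	/* { u32 cpu, res; }   */

	return size;
}
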
Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 12 ++--- arch/x86/kernel/cpu/perf_counter.c | 8 +-- include/linux/perf_counter.h | 32 ++++++------ kernel/perf_counter.c | 104 ++++++++++++++++++------------------- 4 files changed, 78 insertions(+), 78 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index f96d55f55bd..c9633321e7a 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -535,7 +535,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw.irq_period) { + if (counter->hw.sample_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter) s64 val, left; unsigned long flags; - if (!counter->hw.idx || !counter->hw.irq_period) + if (!counter->hw.idx || !counter->hw.sample_period) return; local_irq_save(flags); perf_disable(); power_pmu_read(counter); - left = counter->hw.irq_period; + left = counter->hw.sample_period; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, if (counter->hw_event.exclude_user || counter->hw_event.exclude_kernel || counter->hw_event.exclude_hv - || counter->hw_event.irq_period) + || counter->hw_event.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.sample_period); /* * See if we need to reserve the PMU. 
@@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { - u64 period = counter->hw.irq_period; + u64 period = counter->hw.sample_period; s64 prev, delta, left; int record = 0; u64 addr, mmcra, sdsync; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 316b0c995f3..ec06aa5e928 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hw_event->nmi = 1; - if (!hwc->irq_period) - hwc->irq_period = x86_pmu.max_period; + if (!hwc->sample_period) + hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, - min(x86_pmu.max_period, hwc->irq_period)); + min(x86_pmu.max_period, hwc->sample_period)); /* * Raw event type provide the config in the event structure @@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = min(x86_pmu.max_period, hwc->irq_period); + s64 period = min(x86_pmu.max_period, hwc->sample_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4845a214b9e..1fcd3cc9385 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -94,18 +94,18 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.record_type to request information + * Bits that can be set in hw_event.sample_type to request information * in the overflow packets. */ -enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, }; /* @@ -132,12 +132,12 @@ struct perf_counter_hw_event { __u64 config; union { - __u64 irq_period; - __u64 irq_freq; + __u64 sample_period; + __u64 sample_freq; }; - __u32 record_type; - __u32 read_format; + __u64 sample_type; + __u64 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -262,7 +262,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; - * u64 irq_period; + * u64 sample_period; * }; */ PERF_EVENT_PERIOD = 4, @@ -363,7 +363,7 @@ struct hw_perf_counter { }; }; atomic64_t prev_count; - u64 irq_period; + u64 sample_period; atomic64_t period_left; u64 interrupts; #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 978ecfcc7aa..5ecd9981c03 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1186,7 +1186,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 interrupts, irq_period; + u64 interrupts, sample_period; u64 events, period; s64 delta; @@ -1204,23 +1204,23 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || 
!counter->hw_event.irq_freq) + if (!counter->hw_event.freq || !counter->hw_event.sample_freq) continue; - events = HZ * interrupts * counter->hw.irq_period; - period = div64_u64(events, counter->hw_event.irq_freq); + events = HZ * interrupts * counter->hw.sample_period; + period = div64_u64(events, counter->hw_event.sample_freq); - delta = (s64)(1 + period - counter->hw.irq_period); + delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; - irq_period = counter->hw.irq_period + delta; + sample_period = counter->hw.sample_period + delta; - if (!irq_period) - irq_period = 1; + if (!sample_period) + sample_period = 1; - perf_log_period(counter, irq_period); + perf_log_period(counter, sample_period); - counter->hw.irq_period = irq_period; + counter->hw.sample_period = sample_period; } spin_unlock(&ctx->lock); } @@ -2297,7 +2297,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 record_type = counter->hw_event.record_type; + u64 sample_type = counter->hw_event.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2321,61 +2321,61 @@ static void perf_counter_output(struct perf_counter *counter, header.misc = PERF_EVENT_MISC_OVERFLOW; header.misc |= perf_misc_flags(regs); - if (record_type & PERF_RECORD_IP) { + if (sample_type & PERF_SAMPLE_IP) { ip = perf_instruction_pointer(regs); - header.type |= PERF_RECORD_IP; + header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } - if (record_type & PERF_RECORD_TID) { + if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ tid_entry.pid = perf_counter_pid(counter, current); tid_entry.tid = perf_counter_tid(counter, current); - header.type |= PERF_RECORD_TID; + header.type |= PERF_SAMPLE_TID; header.size += sizeof(tid_entry); } - if (record_type & PERF_RECORD_TIME) { + if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ time = sched_clock(); - header.type |= PERF_RECORD_TIME; + header.type |= PERF_SAMPLE_TIME; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_ADDR) { - header.type |= PERF_RECORD_ADDR; + if (sample_type & PERF_SAMPLE_ADDR) { + header.type |= PERF_SAMPLE_ADDR; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CONFIG) { - header.type |= PERF_RECORD_CONFIG; + if (sample_type & PERF_SAMPLE_CONFIG) { + header.type |= PERF_SAMPLE_CONFIG; header.size += sizeof(u64); } - if (record_type & PERF_RECORD_CPU) { - header.type |= PERF_RECORD_CPU; + if (sample_type & PERF_SAMPLE_CPU) { + header.type |= PERF_SAMPLE_CPU; header.size += sizeof(cpu_entry); cpu_entry.cpu = raw_smp_processor_id(); } - if (record_type & PERF_RECORD_GROUP) { - header.type |= PERF_RECORD_GROUP; + if (sample_type & PERF_SAMPLE_GROUP) { + header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } - if (record_type & PERF_RECORD_CALLCHAIN) { + if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= PERF_RECORD_CALLCHAIN; + header.type |= PERF_SAMPLE_CALLCHAIN; header.size += callchain_size; } } @@ -2386,28 +2386,28 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, header); - if (record_type & PERF_RECORD_IP) + if (sample_type & PERF_SAMPLE_IP) perf_output_put(&handle, ip); - if (record_type & PERF_RECORD_TID) + if (sample_type & PERF_SAMPLE_TID) perf_output_put(&handle, tid_entry); - if 
(record_type & PERF_RECORD_TIME) + if (sample_type & PERF_SAMPLE_TIME) perf_output_put(&handle, time); - if (record_type & PERF_RECORD_ADDR) + if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (record_type & PERF_RECORD_CONFIG) + if (sample_type & PERF_SAMPLE_CONFIG) perf_output_put(&handle, counter->hw_event.config); - if (record_type & PERF_RECORD_CPU) + if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); /* - * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ - if (record_type & PERF_RECORD_GROUP) { + if (sample_type & PERF_SAMPLE_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2702,7 +2702,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * Log irq_period changes so that analyzing tools can re-normalize the + * Log sample_period changes so that analyzing tools can re-normalize the * event flow. */ @@ -2725,7 +2725,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .period = period, }; - if (counter->hw.irq_period == period) + if (counter->hw.sample_period == period) return; ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); @@ -2834,7 +2834,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = hwc->sample_period; if (unlikely(left <= -period)) { left = period; @@ -2874,7 +2874,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.irq_period); + period = max_t(u64, 10000, counter->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; @@ -2959,7 +2959,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, { int neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg && regs) + if (counter->hw.sample_period && !neg && regs) perf_swcounter_overflow(counter, nmi, regs, addr); } @@ -3080,8 +3080,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3092,7 +3092,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -3132,8 +3132,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->irq_period) { - u64 period = max_t(u64, 10000, hwc->irq_period); + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); @@ -3144,7 +3144,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static 
void task_clock_perf_counter_disable(struct perf_counter *counter) { - if (counter->hw.irq_period) + if (counter->hw.sample_period) hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); @@ -3223,7 +3223,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.irq_period = counter->hw_event.irq_period; + counter->hw.sample_period = counter->hw_event.sample_period; return &perf_ops_generic; } @@ -3323,15 +3323,15 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); + if (hw_event->freq && hw_event->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); else - hwc->irq_period = hw_event->irq_period; + hwc->sample_period = hw_event->sample_period; /* - * we currently do not support PERF_RECORD_GROUP on inherited counters + * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) goto done; if (perf_event_raw(hw_event)) { -- cgit v1.2.3 From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 15:27:45 +0200 Subject: perf_counter: Remove the last nmi/irq bits IRQ (non-NMI) sampling is not used anymore - remove the last few bits. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ------ include/linux/perf_counter.h | 4 +--- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ec06aa5e928..9e144fbebd2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hw_event->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - /* - * Use NMI events all the time: - */ - hwc->nmi = 1; - hw_event->nmi = 1; - if (!hwc->sample_period) hwc->sample_period = x86_pmu.max_period; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fcd3cc9385..cef9931793f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,7 +140,6 @@ struct perf_counter_hw_event { __u64 read_format; __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -153,7 +152,7 @@ struct perf_counter_hw_event { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 51; + __reserved_1 : 52; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -354,7 +353,6 @@ struct hw_perf_counter { u64 config; unsigned long config_base; unsigned long counter_base; - int nmi; int idx; }; union { /* software */ -- cgit v1.2.3 From 8e3747c13c39246c7e46def7cf495d9d21d4c5f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:16:02 +0200 Subject: perf_counter: Change data head from u32 to u64 Since some people worried that 4G might not be a 
large enough as an mmap data window, extend it to 64 bit for capable platforms. Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 ++++--- kernel/perf_counter.c | 15 ++++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cef9931793f..c046f7d97cf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -212,7 +212,7 @@ struct perf_counter_mmap_page { * User-space reading this value should issue an rmb(), on SMP capable * platforms, after reading this value -- see perf_counter_wakeup(). */ - __u32 data_head; /* head in the data section */ + __u64 data_head; /* head in the data section */ }; #define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) @@ -397,10 +397,11 @@ struct perf_mmap_data { int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ - atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t done_head; /* completed head */ + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + atomic_t lock; /* concurrent writes */ atomic_t wakeup; /* needs a wakeup */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5ecd9981c03..3f11a2bc6c7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2067,8 +2067,8 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; - unsigned int offset; - unsigned int head; + unsigned long head; + unsigned long offset; int nmi; int overflow; int locked; @@ -2122,7 +2122,8 @@ static void perf_output_lock(struct perf_output_handle *handle) static void perf_output_unlock(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int head, cpu; + unsigned long head; + int cpu; data->done_head = data->head; @@ -2135,7 +2136,7 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->done_head, 0))) + while ((head = atomic_long_xchg(&data->done_head, 0))) data->user_page->data_head = head; /* @@ -2148,7 +2149,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->done_head))) { + if (unlikely(atomic_long_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -2195,7 +2196,7 @@ static int perf_output_begin(struct perf_output_handle *handle, do { offset = head = atomic_read(&data->head); head += size; - } while (atomic_cmpxchg(&data->head, offset, head) != offset); + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); handle->offset = offset; handle->head = head; @@ -2246,7 +2247,7 @@ static void perf_output_copy(struct perf_output_handle *handle, * Check we didn't copy past our reservation window, taking the * possible unsigned int wrap into account. 
*/ - WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ -- cgit v1.2.3 From 08247e31ca79b8f02cce47b7e8120797a8726606 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 16:46:57 +0200 Subject: perf_counter: Add ioctl for changing the sample period/frequency Reported-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 +++++---- kernel/perf_counter.c | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c046f7d97cf..45bdd3b95d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,10 +164,11 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) -#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) -#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) enum perf_counter_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3f11a2bc6c7..abe2f3b6c42 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1604,6 +1604,43 @@ static void perf_counter_for_each(struct perf_counter *counter, mutex_unlock(&counter->child_mutex); } +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ + struct perf_counter_context *ctx = counter->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!counter->hw_event.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (counter->hw_event.freq) { + if (value > sysctl_perf_counter_limit) { + ret = -EINVAL; + goto unlock; + } + + counter->hw_event.sample_freq = value; + } else { + counter->hw_event.sample_period = value; + counter->hw.sample_period = value; + + perf_log_period(counter, value); + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1623,6 +1660,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: return perf_counter_refresh(counter, arg); + + case PERF_COUNTER_IOC_PERIOD: + return perf_counter_period(counter, (u64 __user *)arg); + default: return -ENOTTY; } -- cgit v1.2.3 From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Jun 2009 19:22:16 +0200 Subject: perf_counter: Rename perf_counter_hw_event => perf_counter_attr The structure isn't hw only and when I read event, I think about those things that fall out the other end. Rename the thing. 
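A short user-space sketch of driving the PERF_COUNTER_IOC_PERIOD ioctl introduced a couple of patches above, written with the post-rename terminology. The macro value mirrors the definition added to the header (with a user-space type); the counter fd is assumed to come from an earlier sys_perf_counter_open() call, and error handling is trimmed.

#include <stdint.h>
#include <sys/ioctl.h>

/* Mirrors the header definition added above; u64 spelled as uint64_t here. */
#ifndef PERF_COUNTER_IOC_PERIOD
#define PERF_COUNTER_IOC_PERIOD	_IOW('$', 4, uint64_t)
#endif

/*
 * Sketch: update a counter's sampling period (or frequency) at runtime.
 * The kernel side rejects a zero value and counters opened without a
 * sample_period, and interprets the value as a frequency when the
 * counter's attr.freq flag is set.
 */
static int set_sample_period(int counter_fd, uint64_t period)
{
	return ioctl(counter_fd, PERF_COUNTER_IOC_PERIOD, &period);
}
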
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur Cc: Stephane Eranian LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 38 ++++++------ arch/x86/kernel/cpu/perf_counter.c | 16 ++--- include/linux/perf_counter.h | 34 +++++------ include/linux/syscalls.h | 4 +- kernel/perf_counter.c | 116 ++++++++++++++++++------------------- 5 files changed, 104 insertions(+), 104 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9633321e7a..ea54686cb78 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], } counter = ctrs[i]; if (first) { - eu = counter->hw_event.exclude_user; - ek = counter->hw_event.exclude_kernel; - eh = counter->hw_event.exclude_hv; + eu = counter->attr.exclude_user; + ek = counter->attr.exclude_kernel; + eh = counter->attr.exclude_hv; first = 0; - } else if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) { + } else if (counter->attr.exclude_user != eu || + counter->attr.exclude_kernel != ek || + counter->attr.exclude_hv != eh) { return -EAGAIN; } } @@ -483,16 +483,16 @@ void hw_perf_enable(void) /* * Add in MMCR0 freeze bits corresponding to the - * hw_event.exclude_* bits for the first counter. + * attr.exclude_* bits for the first counter. * We have already checked that all counters have the * same values for these bits as the first counter. */ counter = cpuhw->counter[0]; - if (counter->hw_event.exclude_user) + if (counter->attr.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->hw_event.exclude_kernel) + if (counter->attr.exclude_kernel) cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->hw_event.exclude_hv) + if (counter->attr.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; /* @@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, int n; u64 alt[MAX_EVENT_ALTERNATIVES]; - if (counter->hw_event.exclude_user - || counter->hw_event.exclude_kernel - || counter->hw_event.exclude_hv - || counter->hw_event.sample_period) + if (counter->attr.exclude_user + || counter->attr.exclude_kernel + || counter->attr.exclude_hv + || counter->attr.sample_period) return 0; if (ppmu->limited_pmc_event(ev)) @@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->hw_event)) { - ev = perf_event_id(&counter->hw_event); + if (!perf_event_raw(&counter->attr)) { + ev = perf_event_id(&counter->attr); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->hw_event); + ev = perf_event_config(&counter->attr); } counter->hw.config_base = ev; counter->hw.idx = 0; @@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * the user set it to. 
*/ if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->hw_event.exclude_hv = 0; + counter->attr.exclude_hv = 0; /* * If this is a per-task counter, then we can use @@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { addr = 0; - if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + if (counter->attr.record_type & PERF_RECORD_ADDR) { /* * The user wants a data address recorded. * If we're not doing instruction sampling, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 904571bea71..e16e8c13132 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void) } /* - * Setup the hardware configuration for a given hw_event_type + * Setup the hardware configuration for a given attr_type */ static int __hw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; + struct perf_counter_attr *attr = &counter->attr; struct hw_perf_counter *hwc = &counter->hw; int err; @@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Count user and OS events unless requested not to. */ - if (!hw_event->exclude_user) + if (!attr->exclude_user) hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!hw_event->exclude_kernel) + if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); + if (perf_event_raw(attr)) { + hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); } else { - if (perf_event_id(hw_event) >= x86_pmu.max_events) + if (perf_event_id(attr) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(attr)); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 45bdd3b95d3..37d5541d74c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -22,7 +22,7 @@ */ /* - * hw_event.type + * attr.type */ enum perf_event_types { PERF_TYPE_HARDWARE = 0, @@ -37,10 +37,10 @@ enum perf_event_types { }; /* - * Generalized performance counter event types, used by the hw_event.event_id + * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_ids { +enum attr_ids { /* * Common hardware events, generalized by the kernel: */ @@ -94,7 +94,7 @@ enum sw_event_ids { #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) /* - * Bits that can be set in hw_event.sample_type to request information + * Bits that can be set in attr.sample_type to request information * in the overflow packets. */ enum perf_counter_sample_format { @@ -109,7 +109,7 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in hw_event.read_format to request that + * Bits that can be set in attr.read_format to request that * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. 
*/ @@ -122,7 +122,7 @@ enum perf_counter_read_format { /* * Hardware event to monitor via a performance monitoring counter: */ -struct perf_counter_hw_event { +struct perf_counter_attr { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -323,25 +323,25 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_raw(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_RAW_MASK; + return attr->config & PERF_COUNTER_RAW_MASK; } -static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_config(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_CONFIG_MASK; + return attr->config & PERF_COUNTER_CONFIG_MASK; } -static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_type(struct perf_counter_attr *attr) { - return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + return (attr->config & PERF_COUNTER_TYPE_MASK) >> PERF_COUNTER_TYPE_SHIFT; } -static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +static inline u64 perf_event_id(struct perf_counter_attr *attr) { - return hw_event->config & PERF_COUNTER_EVENT_MASK; + return attr->config & PERF_COUNTER_EVENT_MASK; } /** @@ -457,7 +457,7 @@ struct perf_counter { u64 tstamp_running; u64 tstamp_stopped; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->hw_event) && - perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->attr) && + perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 79faae950e2..c6c84ad8bd7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_hw_event; +struct perf_counter_attr; #include #include @@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( - const struct perf_counter_hw_event __user *hw_event_uptr, + const struct perf_counter_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index abe2f3b6c42..317cef78a38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter, if (!is_software_counter(counter)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + if (counter->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } @@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter, list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); - if (group_counter->hw_event.exclusive) + if (group_counter->attr.exclusive) cpuctx->exclusive = 0; } @@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter, cpuctx->active_oncpu++; 
ctx->nr_active++; - if (counter->hw_event.exclusive) + if (counter->attr.exclusive) cpuctx->exclusive = 1; return 0; @@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter, * If this group is exclusive and there are already * counters on the CPU, it can't go on. */ - if (counter->hw_event.exclusive && cpuctx->active_oncpu) + if (counter->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) { + if (leader->attr.pinned) { update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; } @@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) /* * not supported on inherited counters */ - if (counter->hw_event.inherit) + if (counter->attr.inherit) return -EINVAL; atomic_add(refresh, &counter->event_limit); @@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->hw_event.pinned) + !counter->attr.pinned) continue; if (counter->cpu != -1 && counter->cpu != cpu) continue; @@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * ignore pinned counters since we did them already. */ if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->hw_event.pinned) + counter->attr.pinned) continue; /* @@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) interrupts = 2*sysctl_perf_counter_limit/HZ; } - if (!counter->hw_event.freq || !counter->hw_event.sample_freq) + if (!counter->attr.freq || !counter->attr.sample_freq) continue; events = HZ * interrupts * counter->hw.sample_period; - period = div64_u64(events, counter->hw_event.sample_freq); + period = div64_u64(events, counter->attr.sample_freq); delta = (s64)(1 + period - counter->hw.sample_period); delta >>= 1; @@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter) perf_pending_sync(counter); atomic_dec(&nr_counters); - if (counter->hw_event.mmap) + if (counter->attr.mmap) atomic_dec(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_dec(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_dec(&nr_comm_tracking); if (counter->destroy) @@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) values[n++] = counter->total_time_enabled + atomic64_read(&counter->child_total_time_enabled); - if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - if (counter->hw_event.read_format & PERF_FORMAT_ID) + if (counter->attr.read_format & PERF_FORMAT_ID) values[n++] = counter->id; 
mutex_unlock(&counter->child_mutex); @@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) int ret = 0; u64 value; - if (!counter->hw_event.sample_period) + if (!counter->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->hw_event.freq) { + if (counter->attr.freq) { if (value > sysctl_perf_counter_limit) { ret = -EINVAL; goto unlock; } - counter->hw_event.sample_freq = value; + counter->attr.sample_freq = value; } else { - counter->hw_event.sample_period = value; + counter->attr.sample_period = value; counter->hw.sample_period = value; perf_log_period(counter, value); @@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle) struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->hw_event.wakeup_events; + int wakeup_events = counter->attr.wakeup_events; if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int ret; - u64 sample_type = counter->hw_event.sample_type; + u64 sample_type = counter->attr.sample_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; @@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, addr); if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->hw_event.config); + perf_output_put(&handle, counter->attr.config); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); @@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter, static int perf_counter_comm_match(struct perf_counter *counter, struct perf_comm_event *comm_event) { - if (counter->hw_event.comm && + if (counter->attr.comm && comm_event->event.header.type == PERF_EVENT_COMM) return 1; @@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->hw_event.mmap && + if (counter->attr.mmap && mmap_event->event.header.type == PERF_EVENT_MMAP) return 1; - if (counter->hw_event.munmap && + if (counter->attr.munmap && mmap_event->event.header.type == PERF_EVENT_MUNMAP) return 1; @@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. 
*/ - if ((counter->hw_event.exclude_kernel || !regs) && - !counter->hw_event.exclude_user) + if ((counter->attr.exclude_kernel || !regs) && + !counter->attr.exclude_user) regs = task_pt_regs(current); if (regs) { @@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter, if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->hw_event.config != event_config) + if (counter->attr.config != event_config) return 0; if (regs) { - if (counter->hw_event.exclude_user && user_mode(regs)) + if (counter->attr.exclude_user && user_mode(regs)) return 0; - if (counter->hw_event.exclude_kernel && !user_mode(regs)) + if (counter->attr.exclude_kernel && !user_mode(regs)) return 0; } @@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(perf_event_id(&counter->hw_event)); + ftrace_profile_disable(perf_event_id(&counter->attr)); } static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { - int event_id = perf_event_id(&counter->hw_event); + int event_id = perf_event_id(&counter->attr); int ret; ret = ftrace_profile_enable(event_id); @@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->hw_event.sample_period; + counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->hw_event)) { + switch (perf_event_id(&counter->attr)) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, +perf_counter_alloc(struct perf_counter_attr *attr, int cpu, struct perf_counter_context *ctx, struct perf_counter *group_leader, @@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); counter->cpu = cpu; - counter->hw_event = *hw_event; + counter->attr = *attr; counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; counter->oncpu = -1; counter->state = PERF_COUNTER_STATE_INACTIVE; - if (hw_event->disabled) + if (attr->disabled) counter->state = PERF_COUNTER_STATE_OFF; pmu = NULL; hwc = &counter->hw; - if (hw_event->freq && hw_event->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq); + if (attr->freq && attr->sample_freq) + hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); else - hwc->sample_period = hw_event->sample_period; + hwc->sample_period = attr->sample_period; /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters */ - if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(hw_event)) { + if (perf_event_raw(attr)) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(hw_event)) { + switch (perf_event_type(attr)) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; @@ -3409,11 +3409,11 @@ done: counter->pmu = pmu; atomic_inc(&nr_counters); - if (counter->hw_event.mmap) + if 
(counter->attr.mmap) atomic_inc(&nr_mmap_tracking); - if (counter->hw_event.munmap) + if (counter->attr.munmap) atomic_inc(&nr_munmap_tracking); - if (counter->hw_event.comm) + if (counter->attr.comm) atomic_inc(&nr_comm_tracking); return counter; @@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id; /** * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * - * @hw_event_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ SYSCALL_DEFINE5(perf_counter_open, - const struct perf_counter_hw_event __user *, hw_event_uptr, + const struct perf_counter_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; - struct perf_counter_hw_event hw_event; + struct perf_counter_attr attr; struct perf_counter_context *ctx; struct file *counter_file = NULL; struct file *group_file = NULL; @@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open, if (flags) return -EINVAL; - if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) + if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; /* @@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open, /* * Only a group leader can be exclusive or pinned */ - if (hw_event.exclusive || hw_event.pinned) + if (attr.exclusive || attr.pinned) goto err_put_context; } - counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, GFP_KERNEL); ret = PTR_ERR(counter); if (IS_ERR(counter)) @@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter, if (parent_counter->parent) parent_counter = parent_counter->parent; - child_counter = perf_counter_alloc(&parent_counter->hw_event, + child_counter = perf_counter_alloc(&parent_counter->attr, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); if (IS_ERR(child_counter)) @@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, + * not its attr.disabled bit. We hold the parent's mutex, * so we won't race with perf_counter_{en, dis}able_family. */ if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) @@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter, /* * inherit into child's child as well: */ - child_counter->hw_event.inherit = 1; + child_counter->attr.inherit = 1; /* * Get a reference to the parent filp - we will fput it @@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) { + if (!counter->attr.inherit) { inherited_all = 0; continue; } -- cgit v1.2.3 From 60313ebed739b331e8e61079da27a11ee3b73a30 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 16:53:44 +0200 Subject: perf_counter: Add fork event Create a fork event so that we can easily clone the comm and dso maps without having to generate all those events. 
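For consumers of the ring buffer the new record is small; below is an illustrative user-space view of it, with the perf_event_header layout taken from this tree. A profiling tool would react to such a record by cloning the comm and map state it holds for the parent into an entry for the child; the helper below is hypothetical.

#include <stdint.h>
#include <stdio.h>

/* Layouts as in include/linux/perf_counter.h in this tree. */
struct perf_event_header {
	uint32_t type;		/* PERF_EVENT_FORK == 7 for this record */
	uint16_t misc;
	uint16_t size;
};

struct perf_fork_record {
	struct perf_event_header header;
	uint32_t pid;		/* child  tgid, in the counter's pid namespace */
	uint32_t ppid;		/* parent tgid, likewise                       */
};

/* Hypothetical ring-buffer callback: clone parent state into the child. */
static void handle_fork_record(const struct perf_fork_record *rec)
{
	printf("fork: parent %u -> child %u\n", rec->ppid, rec->pid);
	/* ...copy comm and mmap bookkeeping from ppid to pid here... */
}
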
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++ kernel/fork.c | 4 +- kernel/perf_counter.c | 131 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 126 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 37d5541d74c..380247bdb91 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -276,6 +276,14 @@ enum perf_event_type { PERF_EVENT_THROTTLE = 5, PERF_EVENT_UNTHROTTLE = 6, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * }; + */ + PERF_EVENT_FORK = 7, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -618,6 +626,7 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); @@ -673,6 +682,7 @@ perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } static inline void perf_counter_task_migration(struct task_struct *task, int cpu) { } diff --git a/kernel/fork.c b/kernel/fork.c index b7d7a9f0bd7..f4466ca37ec 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1412,12 +1412,12 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); - } else { + } else if (!(clone_flags & CLONE_VM)) { /* * vfork will do an exec which will call * set_task_comm() */ - perf_counter_comm(p); + perf_counter_fork(p); } audit_finish_fork(p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bb03f15a5b..78c58623a0d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -40,9 +40,9 @@ static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_tracking __read_mostly; -static atomic_t nr_munmap_tracking __read_mostly; -static atomic_t nr_comm_tracking __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_munmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ @@ -1447,11 +1447,11 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_counters); if (counter->attr.mmap) - atomic_dec(&nr_mmap_tracking); + atomic_dec(&nr_mmap_counters); if (counter->attr.munmap) - atomic_dec(&nr_munmap_tracking); + atomic_dec(&nr_munmap_counters); if (counter->attr.comm) - atomic_dec(&nr_comm_tracking); + atomic_dec(&nr_comm_counters); if (counter->destroy) counter->destroy(counter); @@ -2475,6 +2475,105 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * fork tracking + */ + +struct perf_fork_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + } event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, + struct 
perf_fork_event *fork_event) +{ + struct perf_output_handle handle; + int size = fork_event->event.header.size; + struct task_struct *task = fork_event->task; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + fork_event->event.pid = perf_counter_pid(counter, task); + fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + + perf_output_put(&handle, fork_event->event); + perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ + if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap) + return 1; + + return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, + struct perf_fork_event *fork_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_fork_match(counter)) + perf_counter_fork_output(counter, fork_event); + } + rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_counter_ctxp); + if (ctx) + perf_counter_fork_ctx(ctx, fork_event); + rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ + struct perf_fork_event fork_event; + + if (!atomic_read(&nr_comm_counters) && + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_munmap_counters)) + return; + + fork_event = (struct perf_fork_event){ + .task = task, + .event = { + .header = { + .type = PERF_EVENT_FORK, + .size = sizeof(fork_event.event), + }, + }, + }; + + perf_counter_fork_event(&fork_event); +} + /* * comm tracking */ @@ -2511,11 +2610,9 @@ static void perf_counter_comm_output(struct perf_counter *counter, perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter, - struct perf_comm_event *comm_event) +static int perf_counter_comm_match(struct perf_counter *counter) { - if (counter->attr.comm && - comm_event->event.header.type == PERF_EVENT_COMM) + if (counter->attr.comm) return 1; return 0; @@ -2531,7 +2628,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter, comm_event)) + if (perf_counter_comm_match(counter)) perf_counter_comm_output(counter, comm_event); } rcu_read_unlock(); @@ -2570,7 +2667,7 @@ void perf_counter_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (!atomic_read(&nr_comm_tracking)) + if (!atomic_read(&nr_comm_counters)) return; comm_event = (struct perf_comm_event){ @@ -2708,7 +2805,7 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_mmap_tracking)) + if (!atomic_read(&nr_mmap_counters)) return; mmap_event = (struct perf_mmap_event){ @@ -2729,7 +2826,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_munmap_tracking)) + if (!atomic_read(&nr_munmap_counters)) return; mmap_event = (struct perf_mmap_event){ @@ -3427,11 +3524,11 @@ done: 
atomic_inc(&nr_counters); if (counter->attr.mmap) - atomic_inc(&nr_mmap_tracking); + atomic_inc(&nr_mmap_counters); if (counter->attr.munmap) - atomic_inc(&nr_munmap_tracking); + atomic_inc(&nr_munmap_counters); if (counter->attr.comm) - atomic_inc(&nr_comm_tracking); + atomic_inc(&nr_comm_counters); return counter; } -- cgit v1.2.3 From d99e9446200c1ffab28cb0e39b76c34a2bfafd06 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jun 2009 17:08:58 +0200 Subject: perf_counter: Remove munmap stuff In name of keeping it simple, only track mmap events. Userspace will have to remove old overlapping maps when it encounters them. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 +---------- kernel/perf_counter.c | 38 +++----------------------------------- mm/mmap.c | 6 ------ 3 files changed, 4 insertions(+), 51 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 380247bdb91..6ca403acd41 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -148,11 +148,10 @@ struct perf_counter_attr { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ - munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 53; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -246,7 +245,6 @@ enum perf_event_type { * }; */ PERF_EVENT_MMAP = 1, - PERF_EVENT_MUNMAP = 2, /* * struct { @@ -622,9 +620,6 @@ extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -extern void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file); - extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); @@ -677,10 +672,6 @@ static inline void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } -static inline void -perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } - static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 78c58623a0d..195712e20d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -41,7 +41,6 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_munmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ @@ -1448,8 +1447,6 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_counters); if (counter->attr.mmap) atomic_dec(&nr_mmap_counters); - if (counter->attr.munmap) - atomic_dec(&nr_munmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); @@ -2510,7 +2507,7 @@ static void perf_counter_fork_output(struct perf_counter *counter, static int perf_counter_fork_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap) + if 
(counter->attr.comm || counter->attr.mmap) return 1; return 0; @@ -2557,8 +2554,7 @@ void perf_counter_fork(struct task_struct *task) struct perf_fork_event fork_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_munmap_counters)) + !atomic_read(&nr_mmap_counters)) return; fork_event = (struct perf_fork_event){ @@ -2722,12 +2718,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, static int perf_counter_mmap_match(struct perf_counter *counter, struct perf_mmap_event *mmap_event) { - if (counter->attr.mmap && - mmap_event->event.header.type == PERF_EVENT_MMAP) - return 1; - - if (counter->attr.munmap && - mmap_event->event.header.type == PERF_EVENT_MUNMAP) + if (counter->attr.mmap) return 1; return 0; @@ -2821,27 +2812,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } -void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_munmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .file = file, - .event = { - .header = { .type = PERF_EVENT_MUNMAP, }, - .start = addr, - .len = len, - .pgoff = pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - /* * Log sample_period changes so that analyzing tools can re-normalize the * event flow. @@ -3525,8 +3495,6 @@ done: atomic_inc(&nr_counters); if (counter->attr.mmap) atomic_inc(&nr_mmap_counters); - if (counter->attr.munmap) - atomic_inc(&nr_munmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); diff --git a/mm/mmap.c b/mm/mmap.c index 2c1c2cb0e2e..6451ce2854b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1756,12 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) do { long nrpages = vma_pages(vma); - if (vma->vm_flags & VM_EXEC) { - perf_counter_munmap(vma->vm_start, - nrpages << PAGE_SHIFT, - vma->vm_pgoff, vma->vm_file); - } - mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); -- cgit v1.2.3 From 089dd79db9264dc0da602bad45d42f1b3e7d1e07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:04:55 +0200 Subject: perf_counter: Generate mmap events for install_special_mapping() In order to track the vdso also generate mmap events for install_special_mapping(). 
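A hedged user-space sketch of what consumes these records: a profiler keeps the PERF_EVENT_MMAP ranges it has seen ("[vdso]" and "//anon" names now included) and resolves sample IPs against them; with munmap events gone since the previous patch, overlapping ranges are handled by letting the newest record win. The structures and lookup below are illustrative only:

  #include <limits.h>
  #include <linux/types.h>

  struct map {
          __u64 start;
          __u64 len;
          char  filename[PATH_MAX];  /* e.g. "/lib/libc.so", "[vdso]", "//anon" */
  };

  static struct map maps[1024];
  static int        nr_maps;

  /* newest mapping wins, which also retires stale overlapping ranges */
  static struct map *find_map(__u64 ip)
  {
          int i;

          for (i = nr_maps - 1; i >= 0; i--) {
                  if (ip >= maps[i].start && ip < maps[i].start + maps[i].len)
                          return &maps[i];
          }
          return NULL;
  }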
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 14 ++++++++------ kernel/perf_counter.c | 34 ++++++++++++++++++++++------------ mm/mmap.c | 5 +++-- 3 files changed, 33 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6ca403acd41..40dc0e273d9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -617,8 +617,13 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); -extern void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file); +extern void __perf_counter_mmap(struct vm_area_struct *vma); + +static inline void perf_counter_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_counter_mmap(vma); +} extern void perf_counter_comm(struct task_struct *tsk); extern void perf_counter_fork(struct task_struct *tsk); @@ -668,10 +673,7 @@ static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } -static inline void -perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } - +static inline void perf_counter_mmap(struct vm_area_struct *vma) { } static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a5d3e2aedd2..37a5a241ca7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2255,7 +2255,7 @@ out: } static void perf_output_copy(struct perf_output_handle *handle, - void *buf, unsigned int len) + const void *buf, unsigned int len) { unsigned int pages_mask; unsigned int offset; @@ -2681,9 +2681,10 @@ void perf_counter_comm(struct task_struct *task) */ struct perf_mmap_event { - struct file *file; - char *file_name; - int file_size; + struct vm_area_struct *vma; + + const char *file_name; + int file_size; struct { struct perf_event_header header; @@ -2744,11 +2745,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; - struct file *file = mmap_event->file; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + const char *name; if (file) { buf = kzalloc(PATH_MAX, GFP_KERNEL); @@ -2762,6 +2764,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } } else { + name = arch_vma_name(mmap_event->vma); + if (name) + goto got_name; + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + name = strncpy(tmp, "//anon", sizeof(tmp)); goto got_name; } @@ -2791,8 +2802,7 @@ got_name: kfree(buf); } -void perf_counter_mmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) +void __perf_counter_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; @@ -2800,12 +2810,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, return; mmap_event = (struct perf_mmap_event){ - .file = file, + .vma = vma, .event = { .header = { .type = PERF_EVENT_MMAP, }, - .start = addr, - .len = len, - .pgoff = pgoff, + .start = vma->vm_start, + .len = 
vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, }, }; diff --git a/mm/mmap.c b/mm/mmap.c index 6451ce2854b..8101de490c7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1220,8 +1220,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - if (vm_flags & VM_EXEC) - perf_counter_mmap(addr, len, pgoff, file); + perf_counter_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2309,6 +2308,8 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; + perf_counter_mmap(vma); + return 0; } -- cgit v1.2.3 From ac4bcf889469ffbca88f234d3184452886a47905 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 14:44:52 +0200 Subject: perf_counter: Change PERF_SAMPLE_CONFIG into PERF_SAMPLE_ID The purpose of PERF_SAMPLE_CONFIG was to identify the counters, since then we've added counter ids, use those instead. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40dc0e273d9..9cea32a0655 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,7 +104,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_ADDR = 1U << 3, PERF_SAMPLE_GROUP = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_CONFIG = 1U << 6, + PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 37a5a241ca7..e75b91a76a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2392,8 +2392,8 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_CONFIG) { - header.type |= PERF_SAMPLE_CONFIG; + if (sample_type & PERF_SAMPLE_ID) { + header.type |= PERF_SAMPLE_ID; header.size += sizeof(u64); } @@ -2439,8 +2439,8 @@ static void perf_counter_output(struct perf_counter *counter, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, addr); - if (sample_type & PERF_SAMPLE_CONFIG) - perf_output_put(&handle, counter->attr.config); + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); -- cgit v1.2.3 From 689802b2d0536e72281dc959ab9cb34fb3c304cf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 15:05:43 +0200 Subject: perf_counter: Add PERF_SAMPLE_PERIOD In order to allow easy tracking of the period, also provide means of adding it to the sample data. 
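Taken together with the previous patch, a sample can now carry both the counter id and the period that was in force when the sample was taken. A minimal user-space sketch (hypothetical; the optional u64 fields appear in the record in the order perf_counter_output() emits them for this particular sample_type):

  struct perf_counter_attr attr = {
          .type        = PERF_TYPE_HARDWARE,
          .config      = PERF_COUNT_CPU_CYCLES,
          .freq        = 1,
          .sample_freq = 1000,
          .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ID | PERF_SAMPLE_PERIOD,
  };

  /* layout of an overflow sample for the sample_type above */
  struct sample {
          struct perf_event_header header;
          __u64 ip;      /* PERF_SAMPLE_IP     */
          __u64 id;      /* PERF_SAMPLE_ID     */
          __u64 period;  /* PERF_SAMPLE_PERIOD */
  };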
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9cea32a0655..6bc25003973 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -106,6 +106,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; + * u64 id; * u64 sample_period; * }; */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e75b91a76a5..f8390668c39 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2404,6 +2404,11 @@ static void perf_counter_output(struct perf_counter *counter, cpu_entry.cpu = raw_smp_processor_id(); } + if (sample_type & PERF_SAMPLE_PERIOD) { + header.type |= PERF_SAMPLE_PERIOD; + header.size += sizeof(u64); + } + if (sample_type & PERF_SAMPLE_GROUP) { header.type |= PERF_SAMPLE_GROUP; header.size += sizeof(u64) + @@ -2445,6 +2450,9 @@ static void perf_counter_output(struct perf_counter *counter, if (sample_type & PERF_SAMPLE_CPU) perf_output_put(&handle, cpu_entry); + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(&handle, counter->hw.sample_period); + /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. */ @@ -2835,6 +2843,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) struct { struct perf_event_header header; u64 time; + u64 id; u64 period; } freq_event = { .header = { @@ -2843,6 +2852,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period) .size = sizeof(freq_event), }, .time = sched_clock(), + .id = counter->id, .period = period, }; -- cgit v1.2.3 From 6a24ed6c6082ec65d19331a4bfa30c0512a1a822 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Jun 2009 18:01:29 +0200 Subject: perf_counter: Fix frequency adjustment for < HZ Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 32 +++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bc25003973..4f9d39ecdc0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -373,6 +373,9 @@ struct hw_perf_counter { u64 sample_period; atomic64_t period_left; u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8390668c39..47c92fb927f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1187,8 +1187,9 @@ static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; + struct hw_perf_counter *hwc; u64 interrupts, sample_period; - u64 events, period; + u64 events, period, freq; s64 delta; spin_lock(&ctx->lock); @@ -1196,8 +1197,10 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; - interrupts = counter->hw.interrupts; - counter->hw.interrupts = 0; + hwc = &counter->hw; 
+ + interrupts = hwc->interrupts; + hwc->interrupts = 0; if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); @@ -1208,20 +1211,35 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; - events = HZ * interrupts * counter->hw.sample_period; + if (counter->attr.sample_freq < HZ) { + freq = counter->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + events = freq * interrupts * hwc->sample_period; period = div64_u64(events, counter->attr.sample_freq); - delta = (s64)(1 + period - counter->hw.sample_period); + delta = (s64)(1 + period - hwc->sample_period); delta >>= 1; - sample_period = counter->hw.sample_period + delta; + sample_period = hwc->sample_period + delta; if (!sample_period) sample_period = 1; perf_log_period(counter, sample_period); - counter->hw.sample_period = sample_period; + hwc->sample_period = sample_period; } spin_unlock(&ctx->lock); } -- cgit v1.2.3 From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Jun 2009 09:58:57 +0200 Subject: perf_counter: Separate out attr->type from attr->config Counter type is a frequently used value and we do a lot of bit juggling by encoding and decoding it from attr->config. Clean this up by creating a separate attr->type field. Also clean up the various similarly complex user-space bits all around counter attribute management. The net improvement is significant, and it will be easier to add a new major type (which is what triggered this cleanup). (This changes the ABI, all tools are adapted.) (PowerPC build-tested.) 
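After the split, specifying an event is a plain (type, config) pair with no shift-and-mask encoding. A minimal sketch of opening a hardware instruction counter the way the tools below do, using sys_perf_counter_open(), the tools' syscall wrapper from perf.h:

  struct perf_counter_attr attr = {
          .type          = PERF_TYPE_HARDWARE,
          .config        = PERF_COUNT_INSTRUCTIONS,
          .sample_period = 100000,
  };

  int fd = sys_perf_counter_open(&attr, 0 /* pid: self */, -1 /* any cpu */,
                                 -1 /* no group */, 0 /* flags */);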
Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 105 ++++++++++------------ Documentation/perf_counter/builtin-stat.c | 76 +++++++--------- Documentation/perf_counter/builtin-top.c | 67 +++++--------- Documentation/perf_counter/perf.h | 2 - Documentation/perf_counter/util/parse-events.c | 120 ++++++++++++++----------- Documentation/perf_counter/util/parse-events.h | 7 +- arch/powerpc/kernel/perf_counter.c | 6 +- arch/x86/kernel/cpu/perf_counter.c | 8 +- include/linux/perf_counter.h | 65 +++----------- kernel/perf_counter.c | 14 ++- 10 files changed, 196 insertions(+), 274 deletions(-) (limited to 'include') diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index c22ea0c7472..130fd88266b 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -20,10 +20,10 @@ #define ALIGN(x, a) __ALIGN_MASK(x, (typeof(x))(a)-1) #define __ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) -static long default_interval = 100000; -static long event_count[MAX_COUNTERS]; - static int fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static long default_interval = 100000; + static int nr_cpus = 0; static unsigned int page_size; static unsigned int mmap_pages = 128; @@ -38,22 +38,44 @@ static int inherit = 1; static int force = 0; static int append_file = 0; -const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, +static long samples; +static struct timeval last_read; +static struct timeval this_read; + +static __u64 bytes_written; + +static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; + +static int nr_poll; +static int nr_cpu; + +struct mmap_event { + struct perf_event_header header; + __u32 pid; + __u32 tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; +}; + +struct comm_event { + struct perf_event_header header; + __u32 pid; + __u32 tid; + char comm[16]; }; + struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; + int counter; + void *base; + unsigned int mask; + unsigned int prev; }; +static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + static unsigned int mmap_read_head(struct mmap_data *md) { struct perf_counter_mmap_page *pc = md->base; @@ -65,11 +87,6 @@ static unsigned int mmap_read_head(struct mmap_data *md) return head; } -static long samples; -static struct timeval last_read, this_read; - -static __u64 bytes_written; - static void mmap_read(struct mmap_data *md) { unsigned int head = mmap_read_head(md); @@ -157,29 +174,6 @@ static void sig_handler(int sig) done = 1; } -static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; -static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; - -static int nr_poll; -static int nr_cpu; - -struct mmap_event { - struct perf_event_header header; - __u32 pid; - __u32 tid; - __u64 start; - __u64 len; - __u64 pgoff; - char filename[PATH_MAX]; -}; - -struct comm_event { - struct perf_event_header header; - __u32 pid; - __u32 tid; - char comm[16]; -}; - static void pid_synthesize_comm_event(pid_t pid, int full) { struct comm_event comm_ev; @@ -341,24 +335,21 @@ static int group_fd; static void create_counter(int counter, int cpu, pid_t pid) { - struct perf_counter_attr attr; + struct perf_counter_attr *attr = attrs + counter; int track = 1; - memset(&attr, 0, sizeof(attr)); - attr.config = 
event_id[counter]; - attr.sample_period = event_count[counter]; - attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD; + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD; if (freq) { - attr.freq = 1; - attr.sample_freq = freq; + attr->freq = 1; + attr->sample_freq = freq; } - attr.mmap = track; - attr.comm = track; - attr.inherit = (cpu < 0) && inherit; + attr->mmap = track; + attr->comm = track; + attr->inherit = (cpu < 0) && inherit; track = 0; /* only the first counter needs these */ - fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0); + fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); if (fd[nr_cpu][counter] < 0) { int err = errno; @@ -542,16 +533,14 @@ int cmd_record(int argc, const char **argv, const char *prefix) if (!argc && target_pid == -1 && !system_wide) usage_with_options(record_usage, options); - if (!nr_counters) { + if (!nr_counters) nr_counters = 1; - event_id[0] = 0; - } for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) + if (attrs[counter].sample_period) continue; - event_count[counter] = default_interval; + attrs[counter].sample_period = default_interval; } return __cmd_record(argc, argv); diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 4fc0d80440e..9711e552423 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -44,23 +44,22 @@ #include -static int system_wide = 0; -static int inherit = 1; +static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { -static __u64 default_event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS }, - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES }, }; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; +static int system_wide = 0; +static int inherit = 1; + static int fd[MAX_NR_CPUS][MAX_COUNTERS]; static int target_pid = -1; @@ -86,22 +85,16 @@ static __u64 walltime_nsecs; static void create_perfstat_counter(int counter) { - struct perf_counter_attr attr; - - memset(&attr, 0, sizeof(attr)); - attr.config = event_id[counter]; - attr.sample_type = 0; - attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL; - attr.exclude_user = event_mask[counter] & EVENT_MASK_USER; + struct perf_counter_attr *attr = attrs + counter; if (scale) - attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) { int cpu; for (cpu = 0; cpu < nr_cpus; cpu ++) { - fd[cpu][counter] = 
sys_perf_counter_open(&attr, -1, cpu, -1, 0); + fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); if (fd[cpu][counter] < 0) { printf("perfstat error: syscall returned with %d (%s)\n", fd[cpu][counter], strerror(errno)); @@ -109,10 +102,10 @@ static void create_perfstat_counter(int counter) } } } else { - attr.inherit = inherit; - attr.disabled = 1; + attr->inherit = inherit; + attr->disabled = 1; - fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0); + fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0); if (fd[0][counter] < 0) { printf("perfstat error: syscall returned with %d (%s)\n", fd[0][counter], strerror(errno)); @@ -126,9 +119,13 @@ static void create_perfstat_counter(int counter) */ static inline int nsec_counter(int counter) { - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK)) + if (attrs[counter].type != PERF_TYPE_SOFTWARE) + return 0; + + if (attrs[counter].config == PERF_COUNT_CPU_CLOCK) return 1; - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) + + if (attrs[counter].config == PERF_COUNT_TASK_CLOCK) return 1; return 0; @@ -177,7 +174,8 @@ static void read_counter(int counter) /* * Save the full runtime - to allow normalization during printout: */ - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_TASK_CLOCK) runtime_nsecs = count[0]; } @@ -203,8 +201,8 @@ static void print_counter(int counter) fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); - if (event_id[counter] == - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_TASK_CLOCK) { fprintf(stderr, " # %11.3f CPU utilization factor", (double)count[0] / (double)walltime_nsecs); @@ -300,8 +298,6 @@ static char events_help_msg[EVENTS_HELP_MAX]; static const struct option options[] = { OPT_CALLBACK('e', "event", NULL, "event", events_help_msg, parse_events), - OPT_INTEGER('c', "count", &default_interval, - "event period to sample"), OPT_BOOLEAN('i', "inherit", &inherit, "child tasks inherit counters"), OPT_INTEGER('p', "pid", &target_pid, @@ -315,27 +311,19 @@ static const struct option options[] = { int cmd_stat(int argc, const char **argv, const char *prefix) { - int counter; - page_size = sysconf(_SC_PAGE_SIZE); create_events_help(events_help_msg); - memcpy(event_id, default_event_id, sizeof(default_event_id)); + + memcpy(attrs, default_attrs, sizeof(attrs)); argc = parse_options(argc, argv, options, stat_usage, 0); if (!argc) usage_with_options(stat_usage, options); - if (!nr_counters) { + if (!nr_counters) nr_counters = 8; - } - - for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) - continue; - event_count[counter] = default_interval; - } nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); assert(nr_cpus <= MAX_NR_CPUS); assert(nr_cpus >= 0); diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index b2f480b5a13..98a6d53e17b 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -48,22 +48,11 @@ #include #include -static int system_wide = 0; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; -static __u64 default_event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - EID(PERF_TYPE_SOFTWARE, 
PERF_COUNT_PAGE_FAULTS), +static int system_wide = 0; - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), -}; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; -static int fd[MAX_NR_CPUS][MAX_COUNTERS]; +static int default_interval = 100000; static __u64 count_filter = 5; static int print_entries = 15; @@ -85,15 +74,6 @@ static int delay_secs = 2; static int zero; static int dump_symtab; -static const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, -}; - /* * Symbols */ @@ -112,7 +92,7 @@ struct sym_entry { struct sym_entry *sym_filter_entry; -struct dso *kernel_dso; +struct dso *kernel_dso; /* * Symbols will be added here in record_ip and will get out @@ -213,7 +193,7 @@ static void print_sym_table(void) 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); if (nr_counters == 1) { - printf("%d", event_count[0]); + printf("%Ld", attrs[0].sample_period); if (freq) printf("Hz "); else @@ -421,10 +401,10 @@ static void process_event(uint64_t ip, int counter) } struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; + int counter; + void *base; + unsigned int mask; + unsigned int prev; }; static unsigned int mmap_read_head(struct mmap_data *md) @@ -539,7 +519,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static int __cmd_top(void) { - struct perf_counter_attr attr; + struct perf_counter_attr *attr; pthread_t thread; int i, counter, group_fd, nr_poll = 0; unsigned int cpu; @@ -553,13 +533,12 @@ static int __cmd_top(void) if (target_pid == -1 && profile_cpu == -1) cpu = i; - memset(&attr, 0, sizeof(attr)); - attr.config = event_id[counter]; - attr.sample_period = event_count[counter]; - attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; - attr.freq = freq; + attr = attrs + counter; - fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0); + attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID; + attr->freq = freq; + + fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); if (fd[i][counter] < 0) { int err = errno; @@ -670,7 +649,6 @@ int cmd_top(int argc, const char **argv, const char *prefix) page_size = sysconf(_SC_PAGE_SIZE); create_events_help(events_help_msg); - memcpy(event_id, default_event_id, sizeof(default_event_id)); argc = parse_options(argc, argv, options, top_usage, 0); if (argc) @@ -688,19 +666,22 @@ int cmd_top(int argc, const char **argv, const char *prefix) profile_cpu = -1; } - if (!nr_counters) { + if (!nr_counters) nr_counters = 1; - event_id[0] = 0; - } if (delay_secs < 1) delay_secs = 1; + parse_symbols(); + + /* + * Fill in the ones not specifically initialized via -c: + */ for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) + if (attrs[counter].sample_period) continue; - event_count[counter] = default_interval; + attrs[counter].sample_period = default_interval; } nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); @@ -710,7 +691,5 @@ int cmd_top(int argc, const char **argv, const char *prefix) if (target_pid != -1 || profile_cpu != -1) nr_cpus = 1; - parse_symbols(); - return __cmd_top(); } diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index 10622a48b40..af0a5046d74 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -64,6 +64,4 @@ 
sys_perf_counter_open(struct perf_counter_attr *attr_uptr, #define MAX_COUNTERS 256 #define MAX_NR_CPUS 256 -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) - #endif diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c index 2fdfd1d923f..eb56bd99657 100644 --- a/Documentation/perf_counter/util/parse-events.c +++ b/Documentation/perf_counter/util/parse-events.c @@ -6,37 +6,39 @@ #include "exec_cmd.h" #include "string.h" -int nr_counters; +int nr_counters; -__u64 event_id[MAX_COUNTERS] = { }; -int event_mask[MAX_COUNTERS]; +struct perf_counter_attr attrs[MAX_COUNTERS]; struct event_symbol { - __u64 event; - char *symbol; + __u8 type; + __u64 config; + char *symbol; }; +#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y + static struct event_symbol event_symbols[] = { - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, - - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, + { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, + { C(HARDWARE, CPU_CYCLES), "cycles", }, + { C(HARDWARE, INSTRUCTIONS), "instructions", }, + { C(HARDWARE, CACHE_REFERENCES), "cache-references", }, + { C(HARDWARE, CACHE_MISSES), "cache-misses", }, + { C(HARDWARE, BRANCH_INSTRUCTIONS), "branch-instructions", }, + { C(HARDWARE, BRANCH_INSTRUCTIONS), "branches", }, + { C(HARDWARE, BRANCH_MISSES), "branch-misses", }, + { C(HARDWARE, BUS_CYCLES), "bus-cycles", }, + + { C(SOFTWARE, CPU_CLOCK), "cpu-clock", }, + { C(SOFTWARE, TASK_CLOCK), "task-clock", }, + { C(SOFTWARE, PAGE_FAULTS), "page-faults", }, + { C(SOFTWARE, PAGE_FAULTS), "faults", }, + { C(SOFTWARE, PAGE_FAULTS_MIN), "minor-faults", }, + { C(SOFTWARE, PAGE_FAULTS_MAJ), "major-faults", }, + { C(SOFTWARE, CONTEXT_SWITCHES), "context-switches", }, + { C(SOFTWARE, CONTEXT_SWITCHES), "cs", }, + { C(SOFTWARE, CPU_MIGRATIONS), "cpu-migrations", }, + { C(SOFTWARE, CPU_MIGRATIONS), "migrations", }, }; #define __PERF_COUNTER_FIELD(config, name) \ @@ -67,27 +69,26 @@ static char *sw_event_names[] = { "major faults", }; -char *event_name(int ctr) +char *event_name(int counter) { - __u64 config = event_id[ctr]; - int type = PERF_COUNTER_TYPE(config); - int id = PERF_COUNTER_ID(config); + __u64 config = attrs[counter].config; + int type = 
attrs[counter].type; static char buf[32]; - if (PERF_COUNTER_RAW(config)) { - sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); + if (attrs[counter].type == PERF_TYPE_RAW) { + sprintf(buf, "raw 0x%llx", config); return buf; } switch (type) { case PERF_TYPE_HARDWARE: - if (id < PERF_HW_EVENTS_MAX) - return hw_event_names[id]; + if (config < PERF_HW_EVENTS_MAX) + return hw_event_names[config]; return "unknown-hardware"; case PERF_TYPE_SOFTWARE: - if (id < PERF_SW_EVENTS_MAX) - return sw_event_names[id]; + if (config < PERF_SW_EVENTS_MAX) + return sw_event_names[config]; return "unknown-software"; default: @@ -101,15 +102,19 @@ char *event_name(int ctr) * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static __u64 match_event_symbols(const char *str) +static int match_event_symbols(const char *str, struct perf_counter_attr *attr) { __u64 config, id; int type; unsigned int i; const char *sep, *pstr; - if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) - return config | PERF_COUNTER_RAW_MASK; + if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) { + attr->type = PERF_TYPE_RAW; + attr->config = config; + + return 0; + } pstr = str; sep = strchr(pstr, ':'); @@ -121,35 +126,45 @@ static __u64 match_event_symbols(const char *str) if (sep) { pstr = sep + 1; if (strchr(pstr, 'k')) - event_mask[nr_counters] |= EVENT_MASK_USER; + attr->exclude_user = 1; if (strchr(pstr, 'u')) - event_mask[nr_counters] |= EVENT_MASK_KERNEL; + attr->exclude_kernel = 1; } - return EID(type, id); + attr->type = type; + attr->config = id; + + return 0; } for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { if (!strncmp(str, event_symbols[i].symbol, - strlen(event_symbols[i].symbol))) - return event_symbols[i].event; + strlen(event_symbols[i].symbol))) { + + attr->type = event_symbols[i].type; + attr->config = event_symbols[i].config; + + return 0; + } } - return ~0ULL; + return -EINVAL; } int parse_events(const struct option *opt, const char *str, int unset) { - __u64 config; + struct perf_counter_attr attr; + int ret; + memset(&attr, 0, sizeof(attr)); again: if (nr_counters == MAX_COUNTERS) return -1; - config = match_event_symbols(str); - if (config == ~0ULL) - return -1; + ret = match_event_symbols(str, &attr); + if (ret < 0) + return ret; - event_id[nr_counters] = config; + attrs[nr_counters] = attr; nr_counters++; str = strstr(str, ","); @@ -168,7 +183,6 @@ void create_events_help(char *events_help_msg) { unsigned int i; char *str; - __u64 e; str = events_help_msg; @@ -178,9 +192,8 @@ void create_events_help(char *events_help_msg) for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { int type, id; - e = event_symbols[i].event; - type = PERF_COUNTER_TYPE(e); - id = PERF_COUNTER_ID(e); + type = event_symbols[i].type; + id = event_symbols[i].config; if (i) str += sprintf(str, "|"); @@ -191,4 +204,3 @@ void create_events_help(char *events_help_msg) str += sprintf(str, "|rNNN]"); } - diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h index 0da306bb902..542971c495b 100644 --- a/Documentation/perf_counter/util/parse-events.h +++ b/Documentation/perf_counter/util/parse-events.h @@ -3,12 +3,9 @@ * Parse symbolic events/counts passed in as options: */ -extern int nr_counters; -extern __u64 event_id[MAX_COUNTERS]; -extern int event_mask[MAX_COUNTERS]; +extern int nr_counters; -#define EVENT_MASK_KERNEL 1 -#define EVENT_MASK_USER 2 +extern struct perf_counter_attr attrs[MAX_COUNTERS]; extern char *event_name(int ctr); diff 
--git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 232b00a36f7..4786ad9a288 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if (!perf_event_raw(&counter->attr)) { - ev = perf_event_id(&counter->attr); + if (counter->attr.type != PERF_TYPE_RAW) { + ev = counter->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { - ev = perf_event_config(&counter->attr); + ev = counter->attr.config; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8f53f3a7da2..430e048f285 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (perf_event_raw(attr)) { - hwc->config |= x86_pmu.raw_event(perf_event_config(attr)); + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); } else { - if (perf_event_id(attr) >= x86_pmu.max_events) + if (attr->config >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu.event_map(perf_event_id(attr)); + hwc->config |= x86_pmu.event_map(attr->config); } counter->destroy = hw_perf_counter_destroy; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4f9d39ecdc0..f794c69b34c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,26 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ - (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ - PERF_COUNTER_##name##_SHIFT) - -#define PERF_COUNTER_RAW_BITS 1 -#define PERF_COUNTER_RAW_SHIFT 63 -#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) - -#define PERF_COUNTER_CONFIG_BITS 63 -#define PERF_COUNTER_CONFIG_SHIFT 0 -#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) - -#define PERF_COUNTER_TYPE_BITS 7 -#define PERF_COUNTER_TYPE_SHIFT 56 -#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) - -#define PERF_COUNTER_EVENT_BITS 56 -#define PERF_COUNTER_EVENT_SHIFT 0 -#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) - /* * Bits that can be set in attr.sample_type to request information * in the overflow packets. @@ -125,10 +105,13 @@ enum perf_counter_read_format { */ struct perf_counter_attr { /* - * The MSB of the config word signifies if the rest contains cpu - * specific (raw) counter configuration data, if unset, the next - * 7 bits are an event type and the rest of the bits are the event - * identifier. + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + __u32 __reserved_1; + + /* + * Type specific configuration information. 
*/ __u64 config; @@ -152,12 +135,11 @@ struct perf_counter_attr { comm : 1, /* include comm data */ freq : 1, /* use freq, not period */ - __reserved_1 : 53; + __reserved_2 : 53; __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_2; + __u32 __reserved_3; - __u64 __reserved_3; __u64 __reserved_4; }; @@ -278,8 +260,8 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u32 pid, ppid; + * struct perf_event_header header; + * u32 pid, ppid; * }; */ PERF_EVENT_FORK = 7, @@ -331,27 +313,6 @@ enum perf_event_type { struct task_struct; -static inline u64 perf_event_raw(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_RAW_MASK; -} - -static inline u64 perf_event_config(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_CONFIG_MASK; -} - -static inline u64 perf_event_type(struct perf_counter_attr *attr) -{ - return (attr->config & PERF_COUNTER_TYPE_MASK) >> - PERF_COUNTER_TYPE_SHIFT; -} - -static inline u64 perf_event_id(struct perf_counter_attr *attr) -{ - return attr->config & PERF_COUNTER_EVENT_MASK; -} - /** * struct hw_perf_counter - performance counter hardware details: */ @@ -616,8 +577,8 @@ extern int perf_counter_overflow(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !perf_event_raw(&counter->attr) && - perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE; + return (counter->attr.type != PERF_TYPE_RAW) && + (counter->attr.type != PERF_TYPE_HARDWARE); } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 47c92fb927f..75ae76796df 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3091,14 +3091,12 @@ static int perf_swcounter_match(struct perf_counter *counter, enum perf_event_types type, u32 event, struct pt_regs *regs) { - u64 event_config; - - event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event; - if (!perf_swcounter_is_counting(counter)) return 0; - if (counter->attr.config != event_config) + if (counter->attr.type != type) + return 0; + if (counter->attr.config != event) return 0; if (regs) { @@ -3403,7 +3401,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (perf_event_id(&counter->attr)) { + switch (counter->attr.config) { case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -3496,12 +3494,12 @@ perf_counter_alloc(struct perf_counter_attr *attr, if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) goto done; - if (perf_event_raw(attr)) { + if (attr->type == PERF_TYPE_RAW) { pmu = hw_perf_counter_init(counter); goto done; } - switch (perf_event_type(attr)) { + switch (attr->type) { case PERF_TYPE_HARDWARE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3 From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2009 20:22:46 +0200 Subject: perf_counter: Implement generalized cache event types Extend generic event enumeration with the PERF_TYPE_HW_CACHE method. This is a 3-dimensional space: { L1-D, L1-I, L2, ITLB, DTLB, BPU } x { load, store, prefetch } x { accesses, misses } User-space passes in the 3 coordinates and the kernel provides a counter. (if the hardware supports that type and if the combination makes sense.) Combinations that make no sense produce a -EINVAL. Combinations that are not supported by the hardware produce -ENOTSUP. 
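For example, L1 data-cache read misses are requested by byte-packing the three coordinates into attr.config, matching the decoding in set_ext_hw_attr() and the packing in parse_generic_hw_symbols() further down; an illustrative fragment:

  struct perf_counter_attr attr = {
          .type   = PERF_TYPE_HW_CACHE,
          .config = PERF_COUNT_HW_CACHE_L1D |
                    (PERF_COUNT_HW_CACHE_OP_READ     <<  8) |
                    (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
  };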
Extend the tools to deal with this, and rewrite the event symbol parsing code with various popular aliases for the units and access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are both valid aliases. ( x86 is supported for now, with the Nehalem event table filled in, and with Core2 and Atom having placeholder tables. ) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/util/parse-events.c | 104 ++++++++++++- arch/x86/kernel/cpu/perf_counter.c | 201 ++++++++++++++++++++++++- include/linux/perf_counter.h | 34 +++++ kernel/perf_counter.c | 1 + 4 files changed, 329 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c index eb56bd99657..de9a77c4715 100644 --- a/Documentation/perf_counter/util/parse-events.c +++ b/Documentation/perf_counter/util/parse-events.c @@ -6,6 +6,8 @@ #include "exec_cmd.h" #include "string.h" +extern char *strcasestr(const char *haystack, const char *needle); + int nr_counters; struct perf_counter_attr attrs[MAX_COUNTERS]; @@ -17,6 +19,7 @@ struct event_symbol { }; #define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y +#define CR(x, y) .type = PERF_TYPE_##x, .config = y static struct event_symbol event_symbols[] = { { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, @@ -69,6 +72,28 @@ static char *sw_event_names[] = { "major faults", }; +#define MAX_ALIASES 8 + +static char *hw_cache [][MAX_ALIASES] = { + { "l1-d" , "l1d" , "l1", "l1-data-cache" }, + { "l1-i" , "l1i" , "l1-instruction-cache" }, + { "l2" , }, + { "dtlb", }, + { "itlb", }, + { "bpu" , "btb", "branch-cache", NULL }, +}; + +static char *hw_cache_op [][MAX_ALIASES] = { + { "read" , "load" }, + { "write" , "store" }, + { "prefetch" , "speculative-read", "speculative-load" }, +}; + +static char *hw_cache_result [][MAX_ALIASES] = { + { "access", "ops" }, + { "miss", }, +}; + char *event_name(int counter) { __u64 config = attrs[counter].config; @@ -86,6 +111,30 @@ char *event_name(int counter) return hw_event_names[config]; return "unknown-hardware"; + case PERF_TYPE_HW_CACHE: { + __u8 cache_type, cache_op, cache_result; + static char name[100]; + + cache_type = (config >> 0) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_MAX) + return "unknown-ext-hardware-cache-type"; + + cache_op = (config >> 8) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX) + return "unknown-ext-hardware-cache-op-type"; + + cache_result = (config >> 16) & 0xff; + if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX) + return "unknown-ext-hardware-cache-result-type"; + + sprintf(name, "%s:%s:%s", + hw_cache[cache_type][0], + hw_cache_op[cache_op][0], + hw_cache_result[cache_result][0]); + + return name; + } + case PERF_TYPE_SOFTWARE: if (config < PERF_SW_EVENTS_MAX) return sw_event_names[config]; @@ -98,11 +147,60 @@ char *event_name(int counter) return "unknown"; } +static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size) +{ + int i, j; + + for (i = 0; i < size; i++) { + for (j = 0; j < MAX_ALIASES; j++) { + if (!names[i][j]) + break; + if (strcasestr(str, names[i][j])) + return i; + } + } + + return 0; +} + +static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr) +{ + __u8 cache_type = -1, cache_op = 0, cache_result = 0; + + cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX); + /* + * No fallback - if we 
cannot get a clear cache type + * then bail out: + */ + if (cache_type == -1) + return -EINVAL; + + cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX); + /* + * Fall back to reads: + */ + if (cache_type == -1) + cache_type = PERF_COUNT_HW_CACHE_OP_READ; + + cache_result = parse_aliases(str, hw_cache_result, + PERF_COUNT_HW_CACHE_RESULT_MAX); + /* + * Fall back to accesses: + */ + if (cache_result == -1) + cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS; + + attr->config = cache_type | (cache_op << 8) | (cache_result << 16); + attr->type = PERF_TYPE_HW_CACHE; + + return 0; +} + /* * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static int match_event_symbols(const char *str, struct perf_counter_attr *attr) +static int parse_event_symbols(const char *str, struct perf_counter_attr *attr) { __u64 config, id; int type; @@ -147,7 +245,7 @@ static int match_event_symbols(const char *str, struct perf_counter_attr *attr) } } - return -EINVAL; + return parse_generic_hw_symbols(str, attr); } int parse_events(const struct option *opt, const char *str, int unset) @@ -160,7 +258,7 @@ again: if (nr_counters == MAX_COUNTERS) return -1; - ret = match_event_symbols(str, &attr); + ret = parse_event_symbols(str, &attr); if (ret < 0) return ret; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 430e048f285..e86679fa521 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. 
+ */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(L2 ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ + [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + /* To be filled in */ +}; + static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL @@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void) return x86_pmu.handle_irq != NULL; } +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + 
return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + /* * Setup the hardware configuration for a given attr_type */ @@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->sample_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->sample_period); + counter->destroy = hw_perf_counter_destroy; /* * Raw event type provide the config in the event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); - } else { - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - /* - * The generic map: - */ - hwc->config |= x86_pmu.event_map(attr->config); + return 0; } - counter->destroy = hw_perf_counter_destroy; + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + /* + * The generic map: + */ + hwc->config |= x86_pmu.event_map(attr->config); return 0; } @@ -989,6 +1147,33 @@ static int intel_pmu_init(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + /* + * Nehalem: + */ + switch (boot_cpu_data.x86_model) { + case 17: + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Core2 event tables\n"); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Nehalem/Corei7 event tables\n"); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(u64)*PERF_COUNT_HW_CACHE_MAX* + PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX); + + pr_info("... installed Atom event tables\n"); + break; + } return 0; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f794c69b34c..3586df840f6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -28,6 +28,7 @@ enum perf_event_types { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, /* * available TYPE space, raw is the max value. @@ -55,6 +56,39 @@ enum attr_ids { PERF_HW_EVENTS_MAX = 7, }; +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum hw_cache_id { + PERF_COUNT_HW_CACHE_L1D, + PERF_COUNT_HW_CACHE_L1I, + PERF_COUNT_HW_CACHE_L2, + PERF_COUNT_HW_CACHE_DTLB, + PERF_COUNT_HW_CACHE_ITLB, + PERF_COUNT_HW_CACHE_BPU, + + PERF_COUNT_HW_CACHE_MAX, +}; + +enum hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ, + PERF_COUNT_HW_CACHE_OP_WRITE, + PERF_COUNT_HW_CACHE_OP_PREFETCH, + + PERF_COUNT_HW_CACHE_OP_MAX, +}; + +enum hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS, + PERF_COUNT_HW_CACHE_RESULT_MISS, + + PERF_COUNT_HW_CACHE_RESULT_MAX, +}; + /* * Special "software" counters provided by the kernel, even if the hardware * does not support performance counters. 
These counters measure various diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75ae76796df..5eacaaf3f9c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3501,6 +3501,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, switch (attr->type) { case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: pmu = hw_perf_counter_init(counter); break; -- cgit v1.2.3 From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 13:40:57 +0200 Subject: perf_counter: More aggressive frequency adjustment Also employ the overflow handler to adjust the frequency, this results in a stable frequency in about 40~50 samples, instead of that many ticks. This also means we can start sampling at a sample period of 1 without running head-first into the throttle. It relies on sched_clock() to accurately measure the time difference between the overflow NMIs. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 130 +++++++++++++++++++++++++------------ 3 files changed, 92 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 49f258537cb..240ca563063 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!attr->exclude_kernel) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) + if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } - atomic64_set(&hwc->period_left, hwc->sample_period); counter->destroy = hw_perf_counter_destroy; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3586df840f6..282d8cc4898 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -371,6 +371,7 @@ struct hw_perf_counter { u64 freq_count; u64 freq_interrupts; + u64 freq_stamp; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5eacaaf3f9c..51c571ee4d0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1184,13 +1184,33 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); -static void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, counter->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + perf_log_period(counter, sample_period); + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; struct hw_perf_counter *hwc; - u64 interrupts, sample_period; - u64 events, period, freq; - s64 delta; + u64 interrupts, freq; spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -1202,6 +1222,9 @@ static void 
perf_adjust_freq(struct perf_counter_context *ctx) interrupts = hwc->interrupts; hwc->interrupts = 0; + /* + * unthrottle counters on the tick + */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); @@ -1211,6 +1234,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (!counter->attr.freq || !counter->attr.sample_freq) continue; + /* + * if the specified freq < HZ then we need to skip ticks + */ if (counter->attr.sample_freq < HZ) { freq = counter->attr.sample_freq; @@ -1226,20 +1252,20 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) } else freq = HZ; - events = freq * interrupts * hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(1 + period - hwc->sample_period); - delta >>= 1; - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; + perf_adjust_period(counter, freq * interrupts); - perf_log_period(counter, sample_period); - - hwc->sample_period = sample_period; + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + counter->pmu->disable(counter); + atomic_set(&hwc->period_left, 0); + counter->pmu->enable(counter); + perf_enable(); + } } spin_unlock(&ctx->lock); } @@ -1279,9 +1305,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = curr->perf_counter_ctxp; - perf_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) - perf_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); if (ctx) @@ -1647,10 +1673,10 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { + perf_log_period(counter, value); + counter->attr.sample_period = value; counter->hw.sample_period = value; - - perf_log_period(counter, value); } unlock: spin_unlock_irq(&ctx->lock); @@ -2853,35 +2879,41 @@ void __perf_counter_mmap(struct vm_area_struct *vma) * event flow. 
*/ +struct freq_event { + struct perf_event_header header; + u64 time; + u64 id; + u64 period; +}; + static void perf_log_period(struct perf_counter *counter, u64 period) { struct perf_output_handle handle; + struct freq_event event; int ret; - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; - } freq_event = { + if (counter->hw.sample_period == period) + return; + + if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) + return; + + event = (struct freq_event) { .header = { .type = PERF_EVENT_PERIOD, .misc = 0, - .size = sizeof(freq_event), + .size = sizeof(event), }, .time = sched_clock(), .id = counter->id, .period = period, }; - if (counter->hw.sample_period == period) - return; - - ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); if (ret) return; - perf_output_put(&handle, freq_event); + perf_output_put(&handle, event); perf_output_end(&handle); } @@ -2923,15 +2955,16 @@ int perf_counter_overflow(struct perf_counter *counter, { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; + struct hw_perf_counter *hwc = &counter->hw; int ret = 0; if (!throttle) { - counter->hw.interrupts++; + hwc->interrupts++; } else { - if (counter->hw.interrupts != MAX_INTERRUPTS) { - counter->hw.interrupts++; - if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { - counter->hw.interrupts = MAX_INTERRUPTS; + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; } @@ -2945,6 +2978,16 @@ int perf_counter_overflow(struct perf_counter *counter, } } + if (counter->attr.freq) { + u64 now = sched_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + } + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -3379,7 +3422,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; - counter->hw.sample_period = counter->attr.sample_period; return &perf_ops_generic; } @@ -3483,10 +3525,11 @@ perf_counter_alloc(struct perf_counter_attr *attr, pmu = NULL; hwc = &counter->hw; + hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) - hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq); - else - hwc->sample_period = attr->sample_period; + hwc->sample_period = 1; + + atomic64_set(&hwc->period_left, hwc->sample_period); /* * we currently do not support PERF_SAMPLE_GROUP on inherited counters @@ -3687,6 +3730,9 @@ inherit_counter(struct perf_counter *parent_counter, else child_counter->state = PERF_COUNTER_STATE_OFF; + if (parent_counter->attr.freq) + child_counter->hw.sample_period = parent_counter->hw.sample_period; + /* * Link it up in the child's context: */ -- cgit v1.2.3 From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:02:22 +0200 Subject: perf_counter: Introduce struct for sample data For easy extension of the sample data, put it in a structure. 
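The change is the usual refactoring of a growing argument list into a single aggregate, so that adding another field (as a later patch does for the sample period) only touches the call sites that care about it. A minimal user-space sketch of the pattern, not the kernel code itself, with stand-in names:

#include <stdint.h>
#include <stddef.h>

struct pt_regs;                         /* opaque here; the real type is per-architecture */

/* stand-in for the kernel's struct perf_sample_data */
struct sample_data {
        struct pt_regs *regs;
        uint64_t        addr;
};

/* consumers take one pointer instead of an ever-growing argument list */
static int handle_overflow(int nmi, const struct sample_data *data)
{
        (void)nmi;
        return data->addr != 0;
}

int main(void)
{
        struct sample_data data = { .regs = NULL, .addr = 0x1000 };

        return handle_overflow(0, &data) ? 0 : 1;
}
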
Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 10 +++++++--- arch/x86/kernel/cpu/perf_counter.c | 15 +++++++++++---- include/linux/perf_counter.h | 10 ++++++++-- kernel/perf_counter.c | 38 ++++++++++++++++++++++---------------- 4 files changed, 48 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4786ad9a288..5e0bf399c43 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) { - addr = 0; + struct perf_sample_data data = { + .regs = regs, + .addr = 0, + }; + if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { /* * The user wants a data address recorded. @@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val, sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - addr = mfspr(SPRN_SDAR); + data.addr = mfspr(SPRN_SDAR); } - if (perf_counter_overflow(counter, nmi, regs, addr)) { + if (perf_counter_overflow(counter, nmi, &data)) { /* * Interrupts are coming too fast - throttle them * by setting the counter to 0, so it will be diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 240ca563063..82a23d487f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void) */ static int intel_pmu_handle_irq(struct pt_regs *regs) { + struct perf_sample_data data; struct cpu_hw_counters *cpuc; - struct cpu_hw_counters; int bit, cpu, loops; u64 ack, status; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1210,7 +1213,7 @@ again: if (!intel_pmu_save_and_restart(counter)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1230,12 +1233,16 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs) { - int cpu, idx, handled = 0; + struct perf_sample_data data; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; + int cpu, idx, handled = 0; u64 val; + data.regs = regs; + data.addr = 0; + cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, regs, 0)) + if (perf_counter_overflow(counter, 1, &data)) amd_pmu_disable_counter(hwc, idx); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 282d8cc4898..d8c0eb480f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -605,8 +605,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr); +struct perf_sample_data { + struct pt_regs *regs; + u64 addr; +}; + +extern int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct 
perf_sample_data *data); + /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ae591a1275a..4fe85e804f4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2378,8 +2378,8 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int ret; u64 sample_type = counter->attr.sample_type; @@ -2404,10 +2404,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= perf_misc_flags(regs); + header.misc |= perf_misc_flags(data->regs); if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(regs); + ip = perf_instruction_pointer(data->regs); header.type |= PERF_SAMPLE_IP; header.size += sizeof(ip); } @@ -2460,7 +2460,7 @@ static void perf_counter_output(struct perf_counter *counter, } if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(regs); + callchain = perf_callchain(data->regs); if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); @@ -2486,7 +2486,7 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_put(&handle, time); if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, addr); + perf_output_put(&handle, data->addr); if (sample_type & PERF_SAMPLE_ID) perf_output_put(&handle, counter->id); @@ -2950,8 +2950,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) * Generic counter overflow handling. */ -int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs, u64 addr) +int perf_counter_overflow(struct perf_counter *counter, int nmi, + struct perf_sample_data *data) { int events = atomic_read(&counter->event_limit); int throttle = counter->pmu->unthrottle != NULL; @@ -3005,7 +3005,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs, addr); + perf_counter_output(counter, nmi, data); return ret; } @@ -3054,24 +3054,25 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; struct perf_counter *counter; - struct pt_regs *regs; u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); - regs = get_irq_regs(); + data.addr = 0; + data.regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. 
*/ - if ((counter->attr.exclude_kernel || !regs) && + if ((counter->attr.exclude_kernel || !data.regs) && !counter->attr.exclude_user) - regs = task_pt_regs(current); + data.regs = task_pt_regs(current); - if (regs) { - if (perf_counter_overflow(counter, 0, regs, 0)) + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) ret = HRTIMER_NORESTART; } @@ -3084,9 +3085,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { + struct perf_sample_data data = { + .regs = regs, + .addr = addr, + }; + perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs, addr)) + if (perf_counter_overflow(counter, nmi, &data)) /* soft-disable the counter */ ; -- cgit v1.2.3 From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Jun 2009 21:34:59 +0200 Subject: perf_counter: Accurate period data We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is incorrect. When we adjust the period, it will only take effect the next cycle but report it for the current cycle. So when we adjust the period for every cycle, we're always wrong. Solve this by keeping track of the last_period. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 ++++++--- arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++--- include/linux/perf_counter.h | 6 ++++-- kernel/perf_counter.c | 9 ++++++--- 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5e0bf399c43..4990ce2e5f0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter) perf_disable(); power_pmu_read(counter); left = counter->hw.sample_period; + counter->hw.last_period = left; val = 0; if (left < 0x80000000L) val = 0x80000000L - left; @@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw.sample_period); + counter->hw.last_period = counter->hw.sample_period; + atomic64_set(&counter->hw.period_left, counter->hw.last_period); /* * See if we need to reserve the PMU. 
@@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val, */ if (record) { struct perf_sample_data data = { - .regs = regs, - .addr = 0, + .regs = regs, + .addr = 0, + .period = counter->hw.last_period, }; if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82a23d487f9..57ae1bec81b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; atomic64_set(&hwc->period_left, hwc->sample_period); } @@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter, if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; ret = 1; } /* @@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; - /* counter overflow */ - handled = 1; - inc_irq_stat(apic_perf_irqs); + /* + * counter overflow + */ + handled = 1; + data.period = counter->hw.last_period; + if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; @@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) amd_pmu_disable_counter(hwc, idx); } + if (handled) + inc_irq_stat(apic_perf_irqs); + return handled; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d8c0eb480f9..5b966472b45 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -366,6 +366,7 @@ struct hw_perf_counter { }; atomic64_t prev_count; u64 sample_period; + u64 last_period; atomic64_t period_left; u64 interrupts; @@ -606,8 +607,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4fe85e804f4..8b89b40bd0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2495,7 +2495,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, perf_output_put(&handle, cpu_entry); if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, counter->hw.sample_period); + perf_output_put(&handle, data->period); /* * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. 
@@ -3040,11 +3040,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter) if (unlikely(left <= -period)) { left = period; atomic64_set(&hwc->period_left, left); + hwc->last_period = period; } if (unlikely(left <= 0)) { left += period; atomic64_add(period, &hwc->period_left); + hwc->last_period = period; } atomic64_set(&hwc->prev_count, -left); @@ -3086,8 +3088,9 @@ static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { - .regs = regs, - .addr = addr, + .regs = regs, + .addr = addr, + .period = counter->hw.last_period, }; perf_swcounter_update(counter); -- cgit v1.2.3 From 0764771dab80d7b84b9a271bee7f1b21a04a3f0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:18:36 +0200 Subject: perf_counter: More paranoia settings Rename the perf_counter_priv knob to perf_counter_paranoia (because priv can be read as private, as opposed to privileged) and provide one more level: 0 - permissive 1 - restrict cpu counters to privilidged contexts 2 - restrict kernel-mode code counting and profiling Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 25 +++++++++++++++++++++++-- kernel/sysctl.c | 6 +++--- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5b966472b45..386be915baa 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -648,7 +648,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_paranoid; extern int sysctl_perf_counter_mlock; extern int sysctl_perf_counter_limit; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8b89b40bd0f..63f1987c1c1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,7 +43,23 @@ static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; -int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +/* + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */ + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_counter_paranoid > 1; +} + int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ @@ -1385,7 +1401,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -3618,6 +3634,11 @@ SYSCALL_DEFINE5(perf_counter_open, if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0) return -EFAULT; + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + /* * Get the target context (task or percpu): */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0c4bf863afa..344a65981de 
100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -916,9 +916,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_PERF_COUNTERS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_privileged", - .data = &sysctl_perf_counter_priv, - .maxlen = sizeof(sysctl_perf_counter_priv), + .procname = "perf_counter_paranoid", + .data = &sysctl_perf_counter_paranoid, + .maxlen = sizeof(sysctl_perf_counter_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, -- cgit v1.2.3 From df58ab24bf26b166874bfb18b3b5a2e0a8e63179 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 11:25:05 +0200 Subject: perf_counter: Rename perf_counter_limit sysctl Rename perf_counter_limit to perf_counter_max_sample_rate and prohibit creation of counters with a known higher sample frequency. Signed-off-by: Peter Zijlstra Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 27 +++++++++++++++++++-------- kernel/sysctl.c | 6 +++--- 3 files changed, 23 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 386be915baa..95c797c480e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -650,7 +650,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_paranoid; extern int sysctl_perf_counter_mlock; -extern int sysctl_perf_counter_limit; +extern int sysctl_perf_counter_sample_rate; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 63f1987c1c1..3b2829de559 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,11 +44,12 @@ static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; /* - * 0 - not paranoid - * 1 - disallow cpu counters to unpriv - * 2 - disallow kernel profiling to unpriv + * perf counter paranoia level: + * 0 - not paranoid + * 1 - disallow cpu counters to unpriv + * 2 - disallow kernel profiling to unpriv */ -int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_paranoid __read_mostly; static inline bool perf_paranoid_cpu(void) { @@ -61,7 +62,11 @@ static inline bool perf_paranoid_kernel(void) } int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ -int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; static atomic64_t perf_counter_id; @@ -1244,7 +1249,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(counter, 1); counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_limit/HZ; + interrupts = 2*sysctl_perf_counter_sample_rate/HZ; } if (!counter->attr.freq || !counter->attr.sample_freq) @@ -1682,7 +1687,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) spin_lock_irq(&ctx->lock); if (counter->attr.freq) { - if (value > sysctl_perf_counter_limit) { + if (value > sysctl_perf_counter_sample_rate) { ret = -EINVAL; goto unlock; } @@ -2979,7 +2984,8 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, } else { if (hwc->interrupts != MAX_INTERRUPTS) { hwc->interrupts++; - if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) { + if (HZ * hwc->interrupts > + 
(u64)sysctl_perf_counter_sample_rate) { hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(counter, 0); ret = 1; @@ -3639,6 +3645,11 @@ SYSCALL_DEFINE5(perf_counter_open, return -EACCES; } + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_counter_sample_rate) + return -EINVAL; + } + /* * Get the target context (task or percpu): */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 344a65981de..9fd4e436b69 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -932,9 +932,9 @@ static struct ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_int_limit", - .data = &sysctl_perf_counter_limit, - .maxlen = sizeof(sysctl_perf_counter_limit), + .procname = "perf_counter_max_sample_rate", + .data = &sysctl_perf_counter_sample_rate, + .maxlen = sizeof(sysctl_perf_counter_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, -- cgit v1.2.3 From 1c432d899d32d36371ee4ee310fa3609cf0e5742 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 13:19:29 +0200 Subject: perf_counter: Rename enums Rename the perf enums to be in the 'perf_' namespace and strictly enumerate the ABI bits. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 53 +++++++++++++++++++++----------------------- kernel/perf_counter.c | 6 ++--- 2 files changed, 28 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 95c797c480e..d5911b02bc8 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,24 +24,21 @@ /* * attr.type */ -enum perf_event_types { +enum perf_type_id { PERF_TYPE_HARDWARE = 0, PERF_TYPE_SOFTWARE = 1, PERF_TYPE_TRACEPOINT = 2, PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, - /* - * available TYPE space, raw is the max value. 
- */ - - PERF_TYPE_RAW = 128, + PERF_TYPE_MAX, /* non ABI */ }; /* * Generalized performance counter event types, used by the attr.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum attr_ids { +enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ @@ -53,7 +50,7 @@ enum attr_ids { PERF_COUNT_BRANCH_MISSES = 5, PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_HW_EVENTS_MAX, /* non ABI */ }; /* @@ -63,30 +60,30 @@ enum attr_ids { * { read, write, prefetch } x * { accesses, misses } */ -enum hw_cache_id { - PERF_COUNT_HW_CACHE_L1D, - PERF_COUNT_HW_CACHE_L1I, - PERF_COUNT_HW_CACHE_L2, - PERF_COUNT_HW_CACHE_DTLB, - PERF_COUNT_HW_CACHE_ITLB, - PERF_COUNT_HW_CACHE_BPU, - - PERF_COUNT_HW_CACHE_MAX, +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non ABI */ }; -enum hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ, - PERF_COUNT_HW_CACHE_OP_WRITE, - PERF_COUNT_HW_CACHE_OP_PREFETCH, +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - PERF_COUNT_HW_CACHE_OP_MAX, + PERF_COUNT_HW_CACHE_OP_MAX, /* non ABI */ }; -enum hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS, - PERF_COUNT_HW_CACHE_RESULT_MISS, +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - PERF_COUNT_HW_CACHE_RESULT_MAX, + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non ABI */ }; /* @@ -95,7 +92,7 @@ enum hw_cache_op_result_id { * physical and sw events of the kernel (and allow the profiling of them as * well): */ -enum sw_event_ids { +enum perf_sw_ids { PERF_COUNT_CPU_CLOCK = 0, PERF_COUNT_TASK_CLOCK = 1, PERF_COUNT_PAGE_FAULTS = 2, @@ -104,7 +101,7 @@ enum sw_event_ids { PERF_COUNT_PAGE_FAULTS_MIN = 5, PERF_COUNT_PAGE_FAULTS_MAJ = 6, - PERF_SW_EVENTS_MAX = 7, + PERF_SW_EVENTS_MAX, /* non ABI */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b2829de559..c02535bed26 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3162,7 +3162,7 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) } static int perf_swcounter_match(struct perf_counter *counter, - enum perf_event_types type, + enum perf_type_id type, u32 event, struct pt_regs *regs) { if (!perf_swcounter_is_counting(counter)) @@ -3194,7 +3194,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_event_types type, u32 event, + enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { @@ -3225,7 +3225,7 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void __perf_swcounter_event(enum perf_event_types type, u32 event, +static void __perf_swcounter_event(enum perf_type_id type, u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { -- cgit v1.2.3 From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:06:28 +0200 Subject: perf_counter: Standardize event names Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*. 
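In user code the rename shows up as an explicit namespace on both halves of an attribute. A hedged sketch, mirroring the tools/perf/builtin-stat.c defaults touched below and assuming the in-tree include/linux/perf_counter.h is reachable from the include path:

#include <stdio.h>
#include <linux/perf_counter.h>        /* assumption: the in-tree header is on the include path */

int main(void)
{
        struct perf_counter_attr hw = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
        };
        struct perf_counter_attr sw = {
                .type   = PERF_TYPE_SOFTWARE,
                .config = PERF_COUNT_SW_TASK_CLOCK,
        };

        printf("hw config %llu, sw config %llu\n",
               (unsigned long long)hw.config, (unsigned long long)sw.config);
        return 0;
}
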
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power4-pmu.c | 12 +++++------ arch/powerpc/kernel/power5+-pmu.c | 12 +++++------ arch/powerpc/kernel/power5-pmu.c | 12 +++++------ arch/powerpc/kernel/power6-pmu.c | 12 +++++------ arch/powerpc/kernel/ppc970-pmu.c | 12 +++++------ arch/powerpc/mm/fault.c | 6 +++--- arch/x86/kernel/cpu/perf_counter.c | 32 +++++++++++++-------------- arch/x86/mm/fault.c | 6 +++--- include/linux/perf_counter.h | 36 +++++++++++++++---------------- kernel/perf_counter.c | 20 ++++++++--------- tools/perf/builtin-record.c | 4 ++-- tools/perf/builtin-stat.c | 31 ++++++++++++++------------- tools/perf/builtin-top.c | 4 ++-- tools/perf/design.txt | 28 ++++++++++++------------ tools/perf/util/parse-events.c | 44 +++++++++++++++++++------------------- 15 files changed, 136 insertions(+), 135 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 0e94b685722..73956f084b2 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int p4_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 7, - [PERF_COUNT_INSTRUCTIONS] = 0x1001, - [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 7, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x1001, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index bbf2cbb0738..5f8b7741e97 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power5p_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0xf, - [PERF_COUNT_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 0xf, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 670cf10b91e..d54723ab627 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power5_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0xf, - [PERF_COUNT_INSTRUCTIONS] = 0x100009, - [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ - 
[PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ + [PERF_COUNT_HW_CPU_CYCLES] = 0xf, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 4da70786609..0cd406ee765 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int power6_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 0x1e, - [PERF_COUNT_INSTRUCTIONS] = 2, - [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ - [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x1e, + [PERF_COUNT_HW_INSTRUCTIONS] = 2, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 336adf1736a..46a20640942 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) } static int ppc970_generic_events[] = { - [PERF_COUNT_CPU_CYCLES] = 7, - [PERF_COUNT_INSTRUCTIONS] = 1, - [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ - [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ - [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ + [PERF_COUNT_HW_CPU_CYCLES] = 7, + [PERF_COUNT_HW_INSTRUCTIONS] = 1, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ + [PERF_COUNT_HW_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ + [PERF_COUNT_HW_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ }; #define C(x) PERF_COUNT_HW_CACHE_##x diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index ac0e112031b..5beffc8f481 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -312,7 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +323,7 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 57ae1bec81b..572fb434a66 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { */ static const u64 intel_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x003c, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_CACHE_MISSES] = 0x412e, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; static u64 intel_pmu_event_map(int event) @@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_CPU_CYCLES] = 0x0076, - [PERF_COUNT_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_CACHE_MISSES] = 0x0081, - [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; static u64 amd_pmu_event_map(int event) @@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 6f9df2babe4..5c6d816f30b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,11 +1142,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, 
address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d5911b02bc8..887df88a9c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,15 +42,15 @@ enum perf_hw_id { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non ABI */ }; /* @@ -93,15 +93,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, - - PERF_SW_EVENTS_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non ABI */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c02535bed26..8859b97390e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1024,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -3411,13 +3411,13 @@ void perf_counter_task_migration(struct task_struct *task, int cpu) struct perf_counter_context *ctx; perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); ctx = perf_pin_task_context(task); if (ctx) { perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, - PERF_COUNT_CPU_MIGRATIONS, + PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); perf_unpin_context(ctx); } @@ -3475,11 +3475,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * events. */ switch (counter->attr.config) { - case PERF_COUNT_CPU_CLOCK: + case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_TASK_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. 
@@ -3490,11 +3490,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) pmu = &perf_ops_cpu_clock; break; - case PERF_COUNT_PAGE_FAULTS: - case PERF_COUNT_PAGE_FAULTS_MIN: - case PERF_COUNT_PAGE_FAULTS_MAJ: - case PERF_COUNT_CONTEXT_SWITCHES: - case PERF_COUNT_CPU_MIGRATIONS: + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: pmu = &perf_ops_generic; break; } diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 84cd336ae79..29259e74dcf 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -378,12 +378,12 @@ try_again: * is always available even if no PMU support: */ if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_CPU_CYCLES) { + && attr->config == PERF_COUNT_HW_CPU_CYCLES) { if (verbose) warning(" ... trying to fall back to cpu-clock-ticks\n"); attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_CPU_CLOCK; + attr->config = PERF_COUNT_SW_CPU_CLOCK; goto try_again; } printf("\n"); diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6404906924f..c43e4a97dc4 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -46,15 +46,16 @@ static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS }, - { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS }, - - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, + + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES}, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, + }; static int system_wide = 0; @@ -120,10 +121,10 @@ static inline int nsec_counter(int counter) if (attrs[counter].type != PERF_TYPE_SOFTWARE) return 0; - if (attrs[counter].config == PERF_COUNT_CPU_CLOCK) + if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK) return 1; - if (attrs[counter].config == PERF_COUNT_TASK_CLOCK) + if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) return 1; return 0; @@ -176,10 +177,10 @@ static void read_counter(int counter) * Save the full runtime - to allow normalization during printout: */ if (attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_TASK_CLOCK) + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) runtime_nsecs = count[0]; if (attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_CPU_CYCLES) + attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) runtime_cycles = count[0]; } @@ -206,7 +207,7 @@ static void print_counter(int counter) fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); if 
(attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_TASK_CLOCK) { + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { if (walltime_nsecs) fprintf(stderr, " # %11.3f CPU utilization factor", @@ -220,7 +221,7 @@ static void print_counter(int counter) (double)count[0]/runtime_nsecs*1000.0); if (runtime_cycles && attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_INSTRUCTIONS) { + attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { fprintf(stderr, " # %1.3f per cycle", (double)count[0] / (double)runtime_cycles); diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 309dbc76ec8..fe338d3c5d7 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -562,13 +562,13 @@ try_again: * is always available even if no PMU support: */ if (attr->type == PERF_TYPE_HARDWARE - && attr->config == PERF_COUNT_CPU_CYCLES) { + && attr->config == PERF_COUNT_HW_CPU_CYCLES) { if (verbose) warning(" ... trying to fall back to cpu-clock-ticks\n"); attr->type = PERF_TYPE_SOFTWARE; - attr->config = PERF_COUNT_CPU_CLOCK; + attr->config = PERF_COUNT_SW_CPU_CLOCK; goto try_again; } printf("\n"); diff --git a/tools/perf/design.txt b/tools/perf/design.txt index d3250763dc9..860e116d979 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -99,13 +99,13 @@ enum hw_event_ids { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, }; These are standardized types of events that work relatively uniformly @@ -130,13 +130,13 @@ software events, selected by 'event_id': * well): */ enum sw_event_ids { - PERF_COUNT_CPU_CLOCK = 0, - PERF_COUNT_TASK_CLOCK = 1, - PERF_COUNT_PAGE_FAULTS = 2, - PERF_COUNT_CONTEXT_SWITCHES = 3, - PERF_COUNT_CPU_MIGRATIONS = 4, - PERF_COUNT_PAGE_FAULTS_MIN = 5, - PERF_COUNT_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, }; Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f18a9a006e1..9d5f1ca50e6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -22,26 +22,26 @@ struct event_symbol { #define CR(x, y) .type = PERF_TYPE_##x, .config = y static struct event_symbol event_symbols[] = { - { C(HARDWARE, CPU_CYCLES), "cpu-cycles", }, - { C(HARDWARE, CPU_CYCLES), "cycles", }, - { C(HARDWARE, INSTRUCTIONS), "instructions", }, - { C(HARDWARE, CACHE_REFERENCES), "cache-references", }, - { C(HARDWARE, CACHE_MISSES), "cache-misses", }, - { C(HARDWARE, BRANCH_INSTRUCTIONS), "branch-instructions", }, - { C(HARDWARE, BRANCH_INSTRUCTIONS), "branches", }, - { C(HARDWARE, BRANCH_MISSES), "branch-misses", }, - { C(HARDWARE, BUS_CYCLES), "bus-cycles", }, - - { C(SOFTWARE, CPU_CLOCK), "cpu-clock", }, - { C(SOFTWARE, TASK_CLOCK), "task-clock", }, - { C(SOFTWARE, PAGE_FAULTS), "page-faults", }, - { 
C(SOFTWARE, PAGE_FAULTS), "faults", }, - { C(SOFTWARE, PAGE_FAULTS_MIN), "minor-faults", }, - { C(SOFTWARE, PAGE_FAULTS_MAJ), "major-faults", }, - { C(SOFTWARE, CONTEXT_SWITCHES), "context-switches", }, - { C(SOFTWARE, CONTEXT_SWITCHES), "cs", }, - { C(SOFTWARE, CPU_MIGRATIONS), "cpu-migrations", }, - { C(SOFTWARE, CPU_MIGRATIONS), "migrations", }, + { C(HARDWARE, HW_CPU_CYCLES), "cpu-cycles", }, + { C(HARDWARE, HW_CPU_CYCLES), "cycles", }, + { C(HARDWARE, HW_INSTRUCTIONS), "instructions", }, + { C(HARDWARE, HW_CACHE_REFERENCES), "cache-references", }, + { C(HARDWARE, HW_CACHE_MISSES), "cache-misses", }, + { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions", }, + { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches", }, + { C(HARDWARE, HW_BRANCH_MISSES), "branch-misses", }, + { C(HARDWARE, HW_BUS_CYCLES), "bus-cycles", }, + + { C(SOFTWARE, SW_CPU_CLOCK), "cpu-clock", }, + { C(SOFTWARE, SW_TASK_CLOCK), "task-clock", }, + { C(SOFTWARE, SW_PAGE_FAULTS), "page-faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS), "faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS_MIN), "minor-faults", }, + { C(SOFTWARE, SW_PAGE_FAULTS_MAJ), "major-faults", }, + { C(SOFTWARE, SW_CONTEXT_SWITCHES), "context-switches", }, + { C(SOFTWARE, SW_CONTEXT_SWITCHES), "cs", }, + { C(SOFTWARE, SW_CPU_MIGRATIONS), "cpu-migrations", }, + { C(SOFTWARE, SW_CPU_MIGRATIONS), "migrations", }, }; #define __PERF_COUNTER_FIELD(config, name) \ @@ -107,7 +107,7 @@ char *event_name(int counter) switch (type) { case PERF_TYPE_HARDWARE: - if (config < PERF_HW_EVENTS_MAX) + if (config < PERF_COUNT_HW_MAX) return hw_event_names[config]; return "unknown-hardware"; @@ -136,7 +136,7 @@ char *event_name(int counter) } case PERF_TYPE_SOFTWARE: - if (config < PERF_SW_EVENTS_MAX) + if (config < PERF_COUNT_SW_MAX) return sw_event_names[config]; return "unknown-software"; -- cgit v1.2.3 From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:19:11 +0200 Subject: perf_counter: Rename L2 to LL cache The top (fastest) and last level (biggest) caches are the most interesting ones, performance wise. 
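The generalized cache encoding is unchanged (cache id in byte 0, op in byte 1, result in byte 2), so after the rename a last-level-cache read-miss counter is still composed the same way. An illustrative sketch, assuming the in-tree include/linux/perf_counter.h is reachable from the include path:

#include <stdio.h>
#include <linux/perf_counter.h>        /* assumption: the in-tree header is on the include path */

int main(void)
{
        struct perf_counter_attr attr = {
                .type   = PERF_TYPE_HW_CACHE,
                .config = PERF_COUNT_HW_CACHE_LL |
                          (PERF_COUNT_HW_CACHE_OP_READ     <<  8) |
                          (PERF_COUNT_HW_CACHE_RESULT_MISS << 16),
        };

        printf("LL read-miss config: 0x%llx\n", (unsigned long long)attr.config);
        return 0;
}
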
Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: [ Fixed the Nehalem LL table to LLC Reference/Miss events ] Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/power4-pmu.c | 2 +- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power5-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- arch/powerpc/kernel/power7-pmu.c | 2 +- arch/powerpc/kernel/ppc970-pmu.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ include/linux/perf_counter.h | 4 ++-- 8 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 73956f084b2..07bd308a5fa 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc34, 0 }, diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 5f8b7741e97..41e5d2d958d 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc50c3, 0 }, diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index d54723ab627..05600b66221 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0x3c309b }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0xc50c3, 0 }, diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index 0cd406ee765..46f74bebcfd 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0x4008c, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0x150730, 0x250532 }, [C(OP_WRITE)] = { 0x250432, 0x150432 }, [C(OP_PREFETCH)] = { 0x810a6, 0 }, diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 060e0deb399..b3f7d1216ba 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0x408a, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0x6080, 0x6084 }, [C(OP_WRITE)] = { 0x6082, 0x6086 }, [C(OP_PREFETCH)] = { 0, 0 }, diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 46a20640942..ba0a357a89f 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -445,7 +445,7 @@ static int 
ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { [C(OP_WRITE)] = { -1, -1 }, [C(OP_PREFETCH)] = { 0, 0 }, }, - [C(L2)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ [C(OP_READ)] = { 0, 0 }, [C(OP_WRITE)] = { 0, 0 }, [C(OP_PREFETCH)] = { 0x733, 0 }, diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 572fb434a66..895c82e7845 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ @@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ }, [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES */ - [ C(RESULT_MISS) ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS */ + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ }, }, [ C(DTLB) ] = { @@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ @@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids [ C(RESULT_MISS) ] = 0, }, }, - [ C(L2 ) ] = { + [ C(LL ) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0, [ C(RESULT_MISS) ] = 0, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 887df88a9c2..20cf5af27ad 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -56,14 +56,14 @@ enum perf_hw_id { /* * Generalized hardware cache counters: * - * { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x * { read, write, prefetch } x * { accesses, misses } */ enum perf_hw_cache_id { PERF_COUNT_HW_CACHE_L1D = 0, PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_L2 = 2, + PERF_COUNT_HW_CACHE_LL = 2, PERF_COUNT_HW_CACHE_DTLB = 3, PERF_COUNT_HW_CACHE_ITLB = 4, PERF_COUNT_HW_CACHE_BPU = 5, -- cgit v1.2.3 From a308444ceb576d3089f9ca0dfd097eba6f1e623f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Jun 2009 14:44:26 +0200 Subject: perf_counter: Better align code Whitespace and comment bits. Also update copyrights. [ Impact: cleanup ] Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: --- include/linux/perf_counter.h | 165 ++++++++++++++++++++++--------------------- 1 file changed, 85 insertions(+), 80 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 20cf5af27ad..1fa1a26cb1b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -1,12 +1,13 @@ /* * Performance counters: * - * Copyright(C) 2008, Thomas Gleixner - * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra * * Data type definitions, declarations, prototypes. 
* - * Started by: Thomas Gleixner and Ingo Molnar + * Started by: Thomas Gleixner and Ingo Molnar * * For licencing details see kernel-base/COPYING */ @@ -25,18 +26,19 @@ * attr.type */ enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, - PERF_TYPE_MAX, /* non ABI */ + PERF_TYPE_MAX, /* non-ABI */ }; /* - * Generalized performance counter event types, used by the attr.event_id - * parameter of the sys_perf_counter_open() syscall: + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: */ enum perf_hw_id { /* @@ -50,7 +52,7 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, - PERF_COUNT_HW_MAX, /* non ABI */ + PERF_COUNT_HW_MAX, /* non-ABI */ }; /* @@ -61,29 +63,29 @@ enum perf_hw_id { * { accesses, misses } */ enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ }; enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - PERF_COUNT_HW_CACHE_OP_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ }; enum perf_hw_cache_op_result_id { PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non ABI */ + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ }; /* @@ -93,15 +95,15 @@ enum perf_hw_cache_op_result_id { * well): */ enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - - PERF_COUNT_SW_MAX, /* non ABI */ + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ }; /* @@ -109,15 +111,15 @@ enum perf_sw_ids { * in the overflow packets. */ enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, }; /* @@ -126,9 +128,9 @@ enum perf_counter_sample_format { * in increasing order of bit value, after the counter value. 
*/ enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, }; /* @@ -229,12 +231,12 @@ struct perf_counter_mmap_page { __u64 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -351,14 +353,14 @@ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS union { struct { /* hardware */ - u64 config; - unsigned long config_base; - unsigned long counter_base; - int idx; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int idx; }; union { /* software */ - atomic64_t count; - struct hrtimer hrtimer; + atomic64_t count; + struct hrtimer hrtimer; }; }; atomic64_t prev_count; @@ -523,37 +525,37 @@ struct perf_counter_context { * Protect the states of the counters in the list, * nr_active, and the list: */ - spinlock_t lock; + spinlock_t lock; /* * Protect the list of counters. Locking either mutex or lock * is sufficient to ensure the list doesn't change; to change * the list you need to lock both the mutex and the spinlock. */ - struct mutex mutex; + struct mutex mutex; - struct list_head counter_list; - struct list_head event_list; - int nr_counters; - int nr_active; - int is_active; - atomic_t refcount; - struct task_struct *task; + struct list_head counter_list; + struct list_head event_list; + int nr_counters; + int nr_active; + int is_active; + atomic_t refcount; + struct task_struct *task; /* * Context clock, runs when context enabled. */ - u64 time; - u64 timestamp; + u64 time; + u64 timestamp; /* * These fields let us detect when two contexts have both * been cloned (inherited) from a common ancestor. 
*/ - struct perf_counter_context *parent_ctx; - u64 parent_gen; - u64 generation; - int pin_count; - struct rcu_head rcu_head; + struct perf_counter_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; }; /** @@ -604,9 +606,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; - u64 addr; - u64 period; + struct pt_regs *regs; + u64 addr; + u64 period; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, @@ -636,11 +638,14 @@ extern void perf_counter_fork(struct task_struct *tsk); extern void perf_counter_task_migration(struct task_struct *task, int cpu); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u16 nr, hv, kernel, user; - u64 ip[MAX_STACK_DEPTH]; + u16 nr; + u16 hv; + u16 kernel; + u16 user; + u64 ip[MAX_STACK_DEPTH]; }; extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); -- cgit v1.2.3 From cca3f454a85ff42d426401bce7ac804541b2bd03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 11 Jun 2009 14:57:55 +0200 Subject: perf_counter: Add counter->id to the throttle event So as to be able to distinguish between multiple counters. Signed-off-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1fa1a26cb1b..6e133954e2e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -286,6 +286,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u64 time; + * u64 id; * }; */ PERF_EVENT_THROTTLE = 5, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8859b97390e..ef5d8a5b245 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2950,13 +2950,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct { struct perf_event_header header; u64 time; + u64 id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), + .time = sched_clock(), + .id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); -- cgit v1.2.3
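
To make the new field concrete, below is a minimal user-space sketch that walks a buffer of event records and prints the counter id carried by each throttle record. Only the record layout (perf_event_header followed by time and id) and PERF_EVENT_THROTTLE = 5 are taken from the hunks above; the standalone struct definitions, the parse_throttle_records() helper, and the hand-built record in main() are assumptions made purely for illustration. A real consumer would take these records from the counter's mmap()'ed data area rather than from a local buffer.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Mirrors the kernel's struct perf_event_header (__u32 type; __u16 misc; __u16 size;). */
struct perf_event_header {
	uint32_t	type;
	uint16_t	misc;
	uint16_t	size;
};

/* Layout of the throttle record after the patch above: header, time, id. */
struct perf_throttle_event {
	struct perf_event_header	header;
	uint64_t			time;
	uint64_t			id;
};

#define PERF_EVENT_THROTTLE	5

/*
 * Walk a contiguous buffer of event records and print the counter id of
 * every throttle-style record. Also accept PERF_EVENT_THROTTLE + 1, the
 * type value written by the perf_log_throttle() hunk above; both use the
 * same layout in this sketch.
 */
static void parse_throttle_records(const unsigned char *buf, size_t len)
{
	size_t pos = 0;

	while (pos + sizeof(struct perf_event_header) <= len) {
		const struct perf_event_header *hdr =
			(const void *)(buf + pos);

		if (hdr->size == 0 || pos + hdr->size > len)
			break;		/* malformed or truncated record */

		if ((hdr->type == PERF_EVENT_THROTTLE ||
		     hdr->type == PERF_EVENT_THROTTLE + 1) &&
		    hdr->size >= sizeof(struct perf_throttle_event)) {
			const struct perf_throttle_event *ev =
				(const void *)hdr;

			printf("throttle event: time=%llu counter id=%llu\n",
			       (unsigned long long)ev->time,
			       (unsigned long long)ev->id);
		}

		pos += hdr->size;	/* records are laid out back to back */
	}
}

int main(void)
{
	/* One hand-built record, standing in for data read from the ring buffer. */
	struct perf_throttle_event ev = {
		.header = {
			.type = PERF_EVENT_THROTTLE,
			.misc = 0,
			.size = sizeof(ev),
		},
		.time	= 123456789ULL,
		.id	= 42,
	};

	parse_throttle_records((const unsigned char *)&ev, sizeof(ev));

	return 0;
}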