diff options
Diffstat (limited to 'kernel/events')
-rw-r--r-- | kernel/events/callchain.c | 48 | ||||
-rw-r--r-- | kernel/events/core.c | 710 | ||||
-rw-r--r-- | kernel/events/internal.h | 25 | ||||
-rw-r--r-- | kernel/events/ring_buffer.c | 15 | ||||
-rw-r--r-- | kernel/events/uprobes.c | 49 |
5 files changed, 599 insertions, 248 deletions
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index b9325e7dcba1..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -19,11 +19,13 @@ struct callchain_cpus_entries { }; int sysctl_perf_event_max_stack __read_mostly = PERF_MAX_STACK_DEPTH; +int sysctl_perf_event_max_contexts_per_stack __read_mostly = PERF_MAX_CONTEXTS_PER_STACK; static inline size_t perf_callchain_entry__sizeof(void) { return (sizeof(struct perf_callchain_entry) + - sizeof(__u64) * sysctl_perf_event_max_stack); + sizeof(__u64) * (sysctl_perf_event_max_stack + + sysctl_perf_event_max_contexts_per_stack)); } static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); @@ -32,12 +34,12 @@ static DEFINE_MUTEX(callchain_mutex); static struct callchain_cpus_entries *callchain_cpus_entries; -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, +__weak void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } -__weak void perf_callchain_user(struct perf_callchain_entry *entry, +__weak void perf_callchain_user(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { } @@ -102,7 +104,7 @@ fail: return -ENOMEM; } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack) { int err = 0; int count; @@ -119,6 +121,15 @@ int get_callchain_buffers(void) /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) + err = -EOVERFLOW; goto exit; } @@ -172,18 +183,20 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); } struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, - bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark) { struct perf_callchain_entry *entry; + struct perf_callchain_entry_ctx ctx; int rctx; entry = get_callchain_entry(&rctx); @@ -193,12 +206,16 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, if (!entry) goto exit_put; - entry->nr = init_nr; + ctx.entry = entry; + ctx.max_stack = max_stack; + ctx.nr = entry->nr = init_nr; + ctx.contexts = 0; + ctx.contexts_maxed = false; if (kernel && !user_mode(regs)) { if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); + perf_callchain_store_context(&ctx, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(&ctx, regs); } if (user) { @@ -214,8 +231,8 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, goto exit_put; if (add_mark) - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + perf_callchain_user(&ctx, regs); } } @@ -225,10 +242,15 @@ exit_put: return entry; } +/* + * Used for sysctl_perf_event_max_stack and + * sysctl_perf_event_max_contexts_per_stack. + */ int perf_event_max_stack_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int new_value = sysctl_perf_event_max_stack, ret; + int *value = table->data; + int new_value = *value, ret; struct ctl_table new_table = *table; new_table.data = &new_value; @@ -240,7 +262,7 @@ int perf_event_max_stack_handler(struct ctl_table *table, int write, if (atomic_read(&nr_callchain_events)) ret = -EBUSY; else - sysctl_perf_event_max_stack = new_value; + *value = new_value; mutex_unlock(&callchain_mutex); diff --git a/kernel/events/core.c b/kernel/events/core.c index 274450efea90..c6e47e97b33f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -242,18 +242,6 @@ unlock: return ret; } -static void event_function_local(struct perf_event *event, event_f func, void *data) -{ - struct event_function_struct efs = { - .event = event, - .func = func, - .data = data, - }; - - int ret = event_function(&efs); - WARN_ON_ONCE(ret); -} - static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; @@ -303,6 +291,54 @@ again: raw_spin_unlock_irq(&ctx->lock); } +/* + * Similar to event_function_call() + event_function(), but hard assumes IRQs + * are already disabled and we're on the right CPU. + */ +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct task_struct *task = READ_ONCE(ctx->task); + struct perf_event_context *task_ctx = NULL; + + WARN_ON_ONCE(!irqs_disabled()); + + if (task) { + if (task == TASK_TOMBSTONE) + return; + + task_ctx = ctx; + } + + perf_ctx_lock(cpuctx, task_ctx); + + task = ctx->task; + if (task == TASK_TOMBSTONE) + goto unlock; + + if (task) { + /* + * We must be either inactive or active and the right task, + * otherwise we're screwed, since we cannot IPI to somewhere + * else. + */ + if (ctx->is_active) { + if (WARN_ON_ONCE(task != current)) + goto unlock; + + if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) + goto unlock; + } + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + func(event, cpuctx, ctx, data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -335,6 +371,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -396,6 +433,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -440,7 +484,7 @@ static u64 __report_allowed; static void perf_duration_warn(struct irq_work *w) { - printk_ratelimited(KERN_WARNING + printk_ratelimited(KERN_INFO "perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", __report_avg, __report_allowed, @@ -835,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event, } } } + +/* + * Update cpuctx->cgrp so that it is set when first cgroup event is added and + * cleared when last cgroup event is removed. + */ +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ + struct perf_cpu_context *cpuctx; + + if (!is_cgroup_event(event)) + return; + + if (add && ctx->nr_cgroups++) + return; + else if (!add && --ctx->nr_cgroups) + return; + /* + * Because cgroup events are always per-cpu events, + * this will always be called from the right CPU. + */ + cpuctx = __get_cpu_context(ctx); + cpuctx->cgrp = add ? event->cgrp : NULL; +} + #else /* !CONFIG_CGROUP_PERF */ static inline bool @@ -912,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event, struct perf_event_context *ctx) { } + +static inline void +list_update_cgroup_event(struct perf_event *event, + struct perf_event_context *ctx, bool add) +{ +} + #endif /* @@ -1384,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); @@ -1397,15 +1475,13 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (event->group_leader == event) { struct list_head *list; - if (is_software_event(event)) - event->group_flags |= PERF_GROUP_SOFTWARE; + event->group_caps = event->event_caps; list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); } - if (is_cgroup_event(event)) - ctx->nr_cgroups++; + list_update_cgroup_event(event, ctx, true); list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; @@ -1553,9 +1629,7 @@ static void perf_group_attach(struct perf_event *event) WARN_ON_ONCE(group_leader->ctx != event->ctx); - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + group_leader->group_caps &= event->event_caps; list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; @@ -1573,8 +1647,6 @@ static void perf_group_attach(struct perf_event *event) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_cpu_context *cpuctx; - WARN_ON_ONCE(event->ctx != ctx); lockdep_assert_held(&ctx->lock); @@ -1586,20 +1658,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->attach_state &= ~PERF_ATTACH_CONTEXT; - if (is_cgroup_event(event)) { - ctx->nr_cgroups--; - /* - * Because cgroup events are always per-cpu events, this will - * always be called from the right CPU. - */ - cpuctx = __get_cpu_context(ctx); - /* - * If there are no more cgroup events then clear cgrp to avoid - * stale pointer in update_cgrp_time_from_cpuctx(). - */ - if (!ctx->nr_cgroups) - cpuctx->cgrp = NULL; - } + list_update_cgroup_event(event, ctx, false); ctx->nr_events--; if (event->attr.inherit_stat) @@ -1661,7 +1720,7 @@ static void perf_group_detach(struct perf_event *event) sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ - sibling->group_flags = event->group_flags; + sibling->group_caps = event->group_caps; WARN_ON_ONCE(sibling->ctx != event->ctx); } @@ -1678,17 +1737,38 @@ static bool is_orphaned_event(struct perf_event *event) return event->state == PERF_EVENT_STATE_DEAD; } -static inline int pmu_filter_match(struct perf_event *event) +static inline int __pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; return pmu->filter_match ? pmu->filter_match(event) : 1; } +/* + * Check whether we should attempt to schedule an event group based on + * PMU-specific filtering. An event group can consist of HW and SW events, + * potentially with a SW leader, so we must check all the filters, to + * determine whether a group is schedulable: + */ +static inline int pmu_filter_match(struct perf_event *event) +{ + struct perf_event *child; + + if (!__pmu_filter_match(event)) + return 0; + + list_for_each_entry(child, &event->sibling_list, group_entry) { + if (!__pmu_filter_match(child)) + return 0; + } + + return 1; +} + static inline int event_filter_match(struct perf_event *event) { - return (event->cpu == -1 || event->cpu == smp_processor_id()) - && perf_cgroup_match(event) && pmu_filter_match(event); + return (event->cpu == -1 || event->cpu == smp_processor_id()) && + perf_cgroup_match(event) && pmu_filter_match(event); } static void @@ -1708,8 +1788,8 @@ event_sched_out(struct perf_event *event, * maintained, otherwise bogus information is return * via read() for time_enabled, time_running: */ - if (event->state == PERF_EVENT_STATE_INACTIVE - && !event_filter_match(event)) { + if (event->state == PERF_EVENT_STATE_INACTIVE && + !event_filter_match(event)) { delta = tstamp - event->tstamp_stopped; event->tstamp_running += delta; event->tstamp_stopped = tstamp; @@ -1749,6 +1829,8 @@ group_sched_out(struct perf_event *group_event, struct perf_event *event; int state = group_event->state; + perf_pmu_disable(ctx->pmu); + event_sched_out(group_event, cpuctx, ctx); /* @@ -1757,6 +1839,8 @@ group_sched_out(struct perf_event *group_event, list_for_each_entry(event, &group_event->sibling_list, group_entry) event_sched_out(event, cpuctx, ctx); + perf_pmu_enable(ctx->pmu); + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) cpuctx->exclusive = 0; } @@ -2062,7 +2146,7 @@ static int group_can_go_on(struct perf_event *event, /* * Groups consisting entirely of software events can always go on. */ - if (event->group_flags & PERF_GROUP_SOFTWARE) + if (event->group_caps & PERF_EV_CAP_SOFTWARE) return 1; /* * If an exclusive group is already on, no other hardware @@ -2207,10 +2291,15 @@ perf_install_in_context(struct perf_event_context *ctx, lockdep_assert_held(&ctx->mutex); - event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; + /* + * Ensures that if we can observe event->ctx, both the event and ctx + * will be 'complete'. See perf_iterate_sb_cpu(). + */ + smp_store_release(&event->ctx, ctx); + if (!task) { cpu_function_call(cpu, __perf_install_in_context, event); return; @@ -2403,16 +2492,16 @@ static int __perf_event_stop(void *info) * while restarting. */ if (sd->restart) - event->pmu->start(event, PERF_EF_START); + event->pmu->start(event, 0); return 0; } -static int perf_event_restart(struct perf_event *event) +static int perf_event_stop(struct perf_event *event, int restart) { struct stop_event_data sd = { .event = event, - .restart = 1, + .restart = restart, }; int ret = 0; @@ -2749,19 +2838,36 @@ unlock: } } +static DEFINE_PER_CPU(struct list_head, sched_cb_list); + void perf_sched_cb_dec(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + this_cpu_dec(perf_sched_cb_usages); + + if (!--cpuctx->sched_cb_usage) + list_del(&cpuctx->sched_cb_entry); } + void perf_sched_cb_inc(struct pmu *pmu) { + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + if (!cpuctx->sched_cb_usage++) + list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list)); + this_cpu_inc(perf_sched_cb_usages); } /* * This function provides the context switch callback to the lower code * layer. It is invoked ONLY when the context switch callback is enabled. + * + * This callback is relevant even to per-cpu events; for example multi event + * PEBS requires this to provide PID/TID information. This requires we flush + * all queued PEBS records before we context switch to a new task. */ static void perf_pmu_sched_task(struct task_struct *prev, struct task_struct *next, @@ -2769,34 +2875,24 @@ static void perf_pmu_sched_task(struct task_struct *prev, { struct perf_cpu_context *cpuctx; struct pmu *pmu; - unsigned long flags; if (prev == next) return; - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - if (pmu->sched_task) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); + list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) { + pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */ - perf_pmu_disable(pmu); + if (WARN_ON_ONCE(!pmu->sched_task)) + continue; - pmu->sched_task(cpuctx->task_ctx, sched_in); + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + perf_pmu_disable(pmu); - perf_pmu_enable(pmu); + pmu->sched_task(cpuctx->task_ctx, sched_in); - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } + perf_pmu_enable(pmu); + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); } - - rcu_read_unlock(); - - local_irq_restore(flags); } static void perf_event_switch(struct task_struct *task, @@ -3328,6 +3424,22 @@ struct perf_read_data { int ret; }; +static int find_cpu_to_read(struct perf_event *event, int local_cpu) +{ + int event_cpu = event->oncpu; + u16 local_pkg, event_pkg; + + if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) { + event_pkg = topology_physical_package_id(event_cpu); + local_pkg = topology_physical_package_id(local_cpu); + + if (event_pkg == local_pkg) + return local_cpu; + } + + return event_cpu; +} + /* * Cross CPU call to read the hardware event */ @@ -3449,7 +3561,7 @@ u64 perf_event_read_local(struct perf_event *event) static int perf_event_read(struct perf_event *event, bool group) { - int ret = 0; + int ret = 0, cpu_to_read, local_cpu; /* * If event is enabled and currently active on a CPU, update the @@ -3461,8 +3573,22 @@ static int perf_event_read(struct perf_event *event, bool group) .group = group, .ret = 0, }; - smp_call_function_single(event->oncpu, - __perf_event_read, &data, 1); + + local_cpu = get_cpu(); + cpu_to_read = find_cpu_to_read(event, local_cpu); + put_cpu(); + + /* + * Purposely ignore the smp_call_function_single() return + * value. + * + * If event->oncpu isn't a valid CPU it means the event got + * scheduled out and that will have updated the event count. + * + * Therefore, either way, we'll have an up-to-date event count + * after this. + */ + (void)smp_call_function_single(cpu_to_read, __perf_event_read, &data, 1); ret = data.ret; } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; @@ -3665,6 +3791,39 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static bool is_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return false; + + if (event->attach_state & PERF_ATTACH_TASK) + return false; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + return true; + return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3728,6 +3887,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -3797,7 +3958,7 @@ static void exclusive_event_destroy(struct perf_event *event) static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) { - if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + if ((e1->pmu == e2->pmu) && (e1->cpu == e2->cpu || e1->cpu == -1 || e2->cpu == -1)) @@ -3862,10 +4023,8 @@ static void _free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } + exclusive_event_destroy(event); + module_put(event->pmu->module); call_rcu(&event->rcu_head, free_event_rcu); } @@ -4715,6 +4874,19 @@ static void ring_buffer_attach(struct perf_event *event, spin_unlock_irqrestore(&rb->event_lock, flags); } + /* + * Avoid racing with perf_mmap_close(AUX): stop the event + * before swizzling the event::rb pointer; if it's getting + * unmapped, its aux_mmap_count will be 0 and it won't + * restart. See the comment in __perf_pmu_output_stop(). + * + * Data will inevitably be lost when set_output is done in + * mid-air, but then again, whoever does it like this is + * not in for the data anyway. + */ + if (has_aux(event)) + perf_event_stop(event, 0); + rcu_assign_pointer(event->rb, rb); if (old_rb) { @@ -5207,9 +5379,10 @@ perf_output_sample_regs(struct perf_output_handle *handle, struct pt_regs *regs, u64 mask) { int bit; + DECLARE_BITMAP(_mask, 64); - for_each_set_bit(bit, (const unsigned long *) &mask, - sizeof(mask) * BITS_PER_BYTE) { + bitmap_from_u64(_mask, mask); + for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) { u64 val; val = perf_reg_value(regs, bit); @@ -5555,16 +5728,26 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - u32 raw_size = data->raw->size; - u32 real_size = round_up(raw_size + sizeof(u32), - sizeof(u64)) - sizeof(u32); - u64 zero = 0; - - perf_output_put(handle, real_size); - __output_copy(handle, data->raw->data, raw_size); - if (real_size - raw_size) - __output_copy(handle, &zero, real_size - raw_size); + struct perf_raw_record *raw = data->raw; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + + perf_output_put(handle, raw->size); + do { + if (frag->copy) { + __output_custom(handle, frag->copy, + frag->data, frag->size); + } else { + __output_copy(handle, frag->data, + frag->size); + } + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + if (frag->pad) + __output_skip(handle, NULL, frag->pad); } else { struct { u32 size; @@ -5689,14 +5872,28 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); + struct perf_raw_record *raw = data->raw; + int size; + + if (raw) { + struct perf_raw_frag *frag = &raw->frag; + u32 sum = 0; + + do { + sum += frag->size; + if (perf_raw_frag_last(frag)) + break; + frag = frag->next; + } while (1); + + size = round_up(sum + sizeof(u32), sizeof(u64)); + raw->size = size - sizeof(u32); + frag->pad = raw->size - sum; + } else { + size = sizeof(u64); + } - header->size += round_up(size, sizeof(u64)); + header->size += size; } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { @@ -5856,11 +6053,11 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, void *data, bool all) { struct perf_event *event; @@ -5877,52 +6074,63 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } -static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { - rcu_read_lock(); - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data, false); - preempt_enable(); - rcu_read_unlock(); + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { + /* + * Skip events that are not fully formed yet; ensure that + * if we observe event->ctx, both event and ctx will be + * complete enough. See perf_install_in_context(). + */ + if (!smp_load_acquire(&event->ctx)) + continue; + + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + output(event, data); + } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */ static void -perf_event_aux(perf_event_aux_output_cb output, void *data, +perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; + rcu_read_lock(); + preempt_disable(); + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; + perf_iterate_ctx(task_ctx, output, data, false); + goto done; } - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data, false); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; + perf_iterate_sb_cpu(output, data); + + for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data, false); -next: - put_cpu_ptr(pmu->pmu_cpu_context); + perf_iterate_ctx(ctx, output, data, false); } +done: + preempt_enable(); rcu_read_unlock(); } @@ -5955,7 +6163,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } void perf_event_exec(void) @@ -5971,7 +6179,7 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctxn); - perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); } rcu_read_unlock(); @@ -5999,7 +6207,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data) /* * In case of inheritance, it will be the parent that links to the - * ring-buffer, but it will be the child that's actually using it: + * ring-buffer, but it will be the child that's actually using it. + * + * We are using event::rb to determine if the event should be stopped, + * however this may race with ring_buffer_attach() (through set_output), + * which will make us skip the event that actually needs to be stopped. + * So ring_buffer_attach() has to stop an aux event before re-assigning + * its rb pointer. */ if (rcu_dereference(parent->rb) == rb) ro->err = __perf_event_stop(&sd); @@ -6009,15 +6223,15 @@ static int __perf_pmu_output_stop(void *info) { struct perf_event *event = info; struct pmu *pmu = event->pmu; - struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); struct remote_output ro = { .rb = event->rb, }; rcu_read_lock(); - perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) - perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); @@ -6146,7 +6360,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -6225,7 +6439,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6456,7 +6670,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -6464,15 +6678,6 @@ got_name: } /* - * Whether this @filter depends on a dynamic object which is not loaded - * yet or its load addresses are not known. - */ -static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter) -{ - return filter->filter && filter->inode; -} - -/* * Check whether inode and address range match filter criteria. */ static bool perf_addr_filter_match(struct perf_addr_filter *filter, @@ -6522,7 +6727,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data) raw_spin_unlock_irqrestore(&ifh->lock, flags); if (restart) - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -6533,13 +6738,20 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) struct perf_event_context *ctx; int ctxn; + /* + * Data tracing isn't supported yet and as such there is no need + * to keep track of anything that isn't related to executable code: + */ + if (!(vma->vm_flags & VM_EXEC)) + return; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (!ctx) continue; - perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); } rcu_read_unlock(); } @@ -6726,7 +6938,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -6867,7 +7079,7 @@ static int __perf_event_overflow(struct perf_event *event, irq_work_queue(&event->pending); } - event->overflow_handler(event, data, regs); + READ_ONCE(event->overflow_handler)(event, data, regs); if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; @@ -7333,7 +7545,7 @@ static struct pmu perf_swevent = { static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { - void *record = data->raw->data; + void *record = data->raw->frag.data; /* only top level events have filters set */ if (event->parent) @@ -7389,8 +7601,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct perf_event *event; struct perf_raw_record raw = { - .size = entry_size, - .data = record, + .frag = { + .size = entry_size, + .data = record, + }, }; perf_sample_data_init(&data, 0, 0); @@ -7480,11 +7694,83 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +#ifdef CONFIG_BPF_SYSCALL +static void bpf_overflow_handler(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct bpf_perf_event_data_kern ctx = { + .data = data, + .regs = regs, + }; + int ret = 0; + + preempt_disable(); + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) + goto out; + rcu_read_lock(); + ret = BPF_PROG_RUN(event->prog, (void *)&ctx); + rcu_read_unlock(); +out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + if (!ret) + return; + + event->orig_overflow_handler(event, data, regs); +} + +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->overflow_handler_context) + /* hw breakpoint or kernel counter */ + return -EINVAL; + + if (event->prog) + return -EEXIST; + + prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + event->prog = prog; + event->orig_overflow_handler = READ_ONCE(event->overflow_handler); + WRITE_ONCE(event->overflow_handler, bpf_overflow_handler); + return 0; +} + +static void perf_event_free_bpf_handler(struct perf_event *event) +{ + struct bpf_prog *prog = event->prog; + + if (!prog) + return; + + WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler); + event->prog = NULL; + bpf_prog_put(prog); +} +#else +static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd) +{ + return -EOPNOTSUPP; +} +static void perf_event_free_bpf_handler(struct perf_event *event) +{ +} +#endif + static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) { bool is_kprobe, is_tracepoint; struct bpf_prog *prog; + if (event->attr.type == PERF_TYPE_HARDWARE || + event->attr.type == PERF_TYPE_SOFTWARE) + return perf_event_set_bpf_handler(event, prog_fd); + if (event->attr.type != PERF_TYPE_TRACEPOINT) return -EINVAL; @@ -7525,6 +7811,8 @@ static void perf_event_free_bpf_prog(struct perf_event *event) { struct bpf_prog *prog; + perf_event_free_bpf_handler(event); + if (!event->tp_event) return; @@ -7683,7 +7971,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event) list_for_each_entry(filter, &ifh->list, entry) { event->addr_filters_offs[count] = 0; - if (perf_addr_filter_needs_mmap(filter)) + /* + * Adjust base offset if the filter is associated to a binary + * that needs to be mapped: + */ + if (filter->inode) event->addr_filters_offs[count] = perf_addr_filter_apply(filter, mm); @@ -7698,7 +7990,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) mmput(mm); restart: - perf_event_restart(event); + perf_event_stop(event, 1); } /* @@ -7814,8 +8106,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, goto fail; } - if (token == IF_SRC_FILE) { - filename = match_strdup(&args[2]); + if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { + int fpos = filter->range ? 2 : 1; + + filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; goto fail; @@ -8648,6 +8942,28 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ +static void account_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -8728,6 +9044,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -8811,6 +9129,19 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING) + if (overflow_handler == bpf_overflow_handler) { + struct bpf_prog *prog = bpf_prog_inc(parent_event->prog); + + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto err_ns; + } + event->prog = prog; + event->orig_overflow_handler = + parent_event->orig_overflow_handler; + } +#endif } if (overflow_handler) { @@ -8876,7 +9207,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } @@ -9198,6 +9529,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -9271,7 +9605,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } @@ -9288,6 +9622,9 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if (pmu->task_ctx_nr == perf_sw_context) + event->event_caps |= PERF_EV_CAP_SOFTWARE; + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -9301,7 +9638,7 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = group_leader->pmu; } else if (is_software_event(group_leader) && - (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) { /* * In case the group is a pure software group, and we * try to add a hardware event, move the whole group to @@ -10233,10 +10570,15 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); + + INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu)); } } -static void perf_event_init_cpu(int cpu) +int perf_event_init_cpu(unsigned int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -10249,6 +10591,7 @@ static void perf_event_init_cpu(int cpu) rcu_assign_pointer(swhash->swevent_hlist, hlist); } mutex_unlock(&swhash->hlist_mutex); + return 0; } #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE @@ -10280,14 +10623,17 @@ static void perf_event_exit_cpu_context(int cpu) } srcu_read_unlock(&pmus_srcu, idx); } +#else + +static void perf_event_exit_cpu_context(int cpu) { } -static void perf_event_exit_cpu(int cpu) +#endif + +int perf_event_exit_cpu(unsigned int cpu) { perf_event_exit_cpu_context(cpu); + return 0; } -#else -static inline void perf_event_exit_cpu(int cpu) { } -#endif static int perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) @@ -10309,46 +10655,6 @@ static struct notifier_block perf_reboot_notifier = { .priority = INT_MIN, }; -static int -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - - case CPU_UP_PREPARE: - /* - * This must be done before the CPU comes alive, because the - * moment we can run tasks we can encounter (software) events. - * - * Specifically, someone can have inherited events on kthreadd - * or a pre-existing worker thread that gets re-bound. - */ - perf_event_init_cpu(cpu); - break; - - case CPU_DOWN_PREPARE: - /* - * This must be done before the CPU dies because after that an - * active event might want to IPI the CPU and that'll not work - * so great for dead CPUs. - * - * XXX smp_call_function_single() return -ENXIO without a warn - * so we could possibly deal with this. - * - * This is safe against new events arriving because - * sys_perf_event_open() serializes against hotplug using - * get_online_cpus(). - */ - perf_event_exit_cpu(cpu); - break; - default: - break; - } - - return NOTIFY_OK; -} - void __init perf_event_init(void) { int ret; @@ -10361,7 +10667,7 @@ void __init perf_event_init(void) perf_pmu_register(&perf_cpu_clock, NULL, -1); perf_pmu_register(&perf_task_clock, NULL, -1); perf_tp_register(); - perf_cpu_notifier(perf_cpu_notify); + perf_event_init_cpu(smp_processor_id()); register_reboot_notifier(&perf_reboot_notifier); ret = init_hw_breakpoint(); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 05f9f6d626df..486fd78eb8d5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -123,21 +123,19 @@ static inline unsigned long perf_aux_size(struct ring_buffer *rb) return rb->aux_nr_pages << PAGE_SHIFT; } -#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ -static inline unsigned long \ -func_name(struct perf_output_handle *handle, \ - const void *buf, unsigned long len) \ +#define __DEFINE_OUTPUT_COPY_BODY(advance_buf, memcpy_func, ...) \ { \ unsigned long size, written; \ \ do { \ size = min(handle->size, len); \ - written = memcpy_func(handle->addr, buf, size); \ + written = memcpy_func(__VA_ARGS__); \ written = size - written; \ \ len -= written; \ handle->addr += written; \ - buf += written; \ + if (advance_buf) \ + buf += written; \ handle->size -= written; \ if (!handle->size) { \ struct ring_buffer *rb = handle->rb; \ @@ -152,6 +150,21 @@ func_name(struct perf_output_handle *handle, \ return len; \ } +#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ +static inline unsigned long \ +func_name(struct perf_output_handle *handle, \ + const void *buf, unsigned long len) \ +__DEFINE_OUTPUT_COPY_BODY(true, memcpy_func, handle->addr, buf, size) + +static inline unsigned long +__output_custom(struct perf_output_handle *handle, perf_copy_f copy_func, + const void *buf, unsigned long len) +{ + unsigned long orig_len = len; + __DEFINE_OUTPUT_COPY_BODY(false, copy_func, handle->addr, buf, + orig_len - len, size) +} + static inline unsigned long memcpy_common(void *dst, const void *src, unsigned long n) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ae9b90dc9a5a..257fa460b846 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!rb) return NULL; - if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + if (!rb_has_aux(rb)) goto err; /* - * If rb::aux_mmap_count is zero (and rb_has_aux() above went through), - * the aux buffer is in perf_mmap_close(), about to get freed. + * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), + * about to get freed, so we leave immediately. + * + * Checking rb::aux_mmap_count and rb::refcount has to be done in + * the same order, see perf_mmap_close. Otherwise we end up freeing + * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) - goto err_put; + goto err; + + if (!atomic_inc_not_zero(&rb->aux_refcount)) + goto err; /* * Nesting is not supported for AUX area, make sure nested diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7edc95edfaee..d4129bb05e5d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -150,7 +150,7 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) * Returns 0 on success, -EFAULT on failure. */ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, - struct page *page, struct page *kpage) + struct page *old_page, struct page *new_page) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; @@ -161,48 +161,49 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + err = mem_cgroup_try_charge(new_page, vma->vm_mm, GFP_KERNEL, &memcg, false); if (err) return err; /* For try_to_free_swap() and munlock_vma_page() below */ - lock_page(page); + lock_page(old_page); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; - ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + ptep = page_check_address(old_page, mm, addr, &ptl, 0); + if (!ptep) { + mem_cgroup_cancel_charge(new_page, memcg, false); goto unlock; + } - get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr, false); - mem_cgroup_commit_charge(kpage, memcg, false, false); - lru_cache_add_active_or_unevictable(kpage, vma); + get_page(new_page); + page_add_new_anon_rmap(new_page, vma, addr, false); + mem_cgroup_commit_charge(new_page, memcg, false, false); + lru_cache_add_active_or_unevictable(new_page, vma); - if (!PageAnon(page)) { - dec_mm_counter(mm, mm_counter_file(page)); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, addr, pte_pfn(*ptep)); ptep_clear_flush_notify(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + set_pte_at_notify(mm, addr, ptep, mk_pte(new_page, vma->vm_page_prot)); - page_remove_rmap(page, false); - if (!page_mapped(page)) - try_to_free_swap(page); + page_remove_rmap(old_page, false); + if (!page_mapped(old_page)) + try_to_free_swap(old_page); pte_unmap_unlock(ptep, ptl); if (vma->vm_flags & VM_LOCKED) - munlock_vma_page(page); - put_page(page); + munlock_vma_page(old_page); + put_page(old_page); err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - unlock_page(page); + unlock_page(old_page); return err; } @@ -1130,7 +1131,9 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) struct vm_area_struct *vma; int ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (mm->uprobes_state.xol_area) { ret = -EALREADY; goto fail; @@ -1469,7 +1472,8 @@ static void dup_xol_work(struct callback_head *work) if (current->flags & PF_EXITING) return; - if (!__create_xol_area(current->utask->dup_xol_addr)) + if (!__create_xol_area(current->utask->dup_xol_addr) && + !fatal_signal_pending(current)) uprobe_warn(current, "dup xol area"); } @@ -1694,8 +1698,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) |