Diffstat (limited to 'kernel/events/core.c')
-rw-r--r--  kernel/events/core.c | 244
 1 file changed, 179 insertions(+), 65 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ab15509fab8c..e5aaa806702d 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1469,7 +1469,6 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
-
lockdep_assert_held(&ctx->lock);
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1624,6 +1623,8 @@ static void perf_group_attach(struct perf_event *event)
{
struct perf_event *group_leader = event->group_leader, *pos;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double attach due to group movement in perf_event_open.
*/
@@ -1697,6 +1698,8 @@ static void perf_group_detach(struct perf_event *event)
struct perf_event *sibling, *tmp;
struct list_head *list = NULL;
+ lockdep_assert_held(&event->ctx->lock);
+
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -1895,9 +1898,29 @@ __perf_remove_from_context(struct perf_event *event,
*/
static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- lockdep_assert_held(&event->ctx->mutex);
+ struct perf_event_context *ctx = event->ctx;
+
+ lockdep_assert_held(&ctx->mutex);
event_function_call(event, __perf_remove_from_context, (void *)flags);
+
+ /*
+ * The above event_function_call() can NO-OP when it hits
+ * TASK_TOMBSTONE. In that case we must already have been detached
+ * from the context (by perf_event_exit_event()), but the grouping
+ * might still be intact.
+ */
+ WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
+ if ((flags & DETACH_GROUP) &&
+ (event->attach_state & PERF_ATTACH_GROUP)) {
+ /*
+ * Since in that case we cannot possibly be scheduled, simply
+ * detach now.
+ */
+ raw_spin_lock_irq(&ctx->lock);
+ perf_group_detach(event);
+ raw_spin_unlock_irq(&ctx->lock);
+ }
}
/*
@@ -2249,7 +2272,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2257,27 +2280,26 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU;
+ * otherwise we cannot reprogram things.
+ *
+ * If it's not running, we don't care; ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
@@ -2328,13 +2350,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry; this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in(), ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by using task_function_call(); if the IPI fails to hit the
+ * task, we know any future context switch of the task must see the
+ * perf_event_ctxp[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2348,12 +2393,16 @@ again:
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will prevent it from becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*
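The ordering argument in the comment above is the classic store-buffering pattern: each side stores to its own variable, issues a full barrier, then loads the other side's variable, so at least one side is guaranteed to observe the other's store. A minimal user-space sketch of that pattern, assuming C11 atomics; all names below are illustrative and none of this is kernel API:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic(void *) event_ctxp;   /* stands in for task->perf_event_ctxp[] */
static atomic_bool     task_on_cpu;  /* stands in for task_curr(task)         */

/* installer side: publish the context, fence, then check whether the task runs */
static bool installer_sees_running(void *ctx)
{
        atomic_store_explicit(&event_ctxp, ctx, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);       /* the smp_mb() above */
        return atomic_load_explicit(&task_on_cpu, memory_order_relaxed);
}

/* context-switch side: mark the task running, fence, then look for a context */
static void *sched_in_sees_ctx(void)
{
        atomic_store_explicit(&task_on_cpu, true, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        return atomic_load_explicit(&event_ctxp, memory_order_relaxed);
}

/*
 * With both fences present it cannot happen that installer_sees_running()
 * returns false *and* sched_in_sees_ctx() returns NULL: if the installer
 * saw the task as not running, a later switch-in must observe the
 * published context and will take ctx->lock, which serializes the install.
 */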
@@ -6583,6 +6632,27 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
char *buf = NULL;
char *name;
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
if (file) {
struct inode *inode;
dev_t dev;
@@ -6609,27 +6679,6 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
maj = MAJOR(dev);
min = MINOR(dev);
- if (vma->vm_flags & VM_READ)
- prot |= PROT_READ;
- if (vma->vm_flags & VM_WRITE)
- prot |= PROT_WRITE;
- if (vma->vm_flags & VM_EXEC)
- prot |= PROT_EXEC;
-
- if (vma->vm_flags & VM_MAYSHARE)
- flags = MAP_SHARED;
- else
- flags = MAP_PRIVATE;
-
- if (vma->vm_flags & VM_DENYWRITE)
- flags |= MAP_DENYWRITE;
- if (vma->vm_flags & VM_MAYEXEC)
- flags |= MAP_EXECUTABLE;
- if (vma->vm_flags & VM_LOCKED)
- flags |= MAP_LOCKED;
- if (vma->vm_flags & VM_HUGETLB)
- flags |= MAP_HUGETLB;
-
goto got_name;
} else {
if (vma->vm_ops && vma->vm_ops->name) {
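The prot/flags computation is hoisted above the file check, so it now runs for anonymous mappings as well; the resulting values surface in the mmap records perf emits (PERF_RECORD_MMAP2 carries prot and flags fields). A small user-space sketch of how a consumer might decode them; the helper name is made up:

#include <stdio.h>
#include <sys/mman.h>

/* print a record's protection and sharing the way /proc/<pid>/maps does */
static void print_mmap_perms(unsigned int prot, unsigned int flags)
{
        printf("%c%c%c%c\n",
               (prot & PROT_READ)   ? 'r' : '-',
               (prot & PROT_WRITE)  ? 'w' : '-',
               (prot & PROT_EXEC)   ? 'x' : '-',
               (flags & MAP_SHARED) ? 's' : 'p');
}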
@@ -7034,25 +7083,12 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
-/*
- * Generic event overflow handling, sampling.
- */
-
-static int __perf_event_overflow(struct perf_event *event,
- int throttle, struct perf_sample_data *data,
- struct pt_regs *regs)
+static int
+__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
- int events = atomic_read(&event->event_limit);
struct hw_perf_event *hwc = &event->hw;
- u64 seq;
int ret = 0;
-
- /*
- * Non-sampling counters might still use the PMI to fold short
- * hardware counters, ignore those.
- */
- if (unlikely(!is_sampling_event(event)))
- return 0;
+ u64 seq;
seq = __this_cpu_read(perf_throttled_seq);
if (seq != hwc->interrupts_seq) {
@@ -7080,6 +7116,34 @@ static int __perf_event_overflow(struct perf_event *event,
perf_adjust_period(event, delta, hwc->last_period, true);
}
+ return ret;
+}
+
+int perf_event_account_interrupt(struct perf_event *event)
+{
+ return __perf_event_account_interrupt(event, 1);
+}
+
+/*
+ * Generic event overflow handling, sampling.
+ */
+
+static int __perf_event_overflow(struct perf_event *event,
+ int throttle, struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ int events = atomic_read(&event->event_limit);
+ int ret = 0;
+
+ /*
+ * Non-sampling counters might still use the PMI to fold short
+ * hardware counters, ignore those.
+ */
+ if (unlikely(!is_sampling_event(event)))
+ return 0;
+
+ ret = __perf_event_account_interrupt(event, throttle);
+
/*
* XXX event_limit might not quite work as expected on inherited
* events
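Factoring the accounting out of __perf_event_overflow() and giving perf_event_account_interrupt() a non-static entry point lets a PMU driver account (and throttle) an interrupt without generating a sample. A hedged sketch of such a caller; the handler and the stop helper are invented for illustration:

/* hypothetical PMU interrupt handler, illustration only */
static void example_pmu_handle_irq(struct perf_event *event)
{
        /*
         * Account the interrupt even though no sample is emitted here;
         * a non-zero return means the event was throttled and should be
         * stopped until the next unthrottle tick.
         */
        if (perf_event_account_interrupt(event))
                example_pmu_stop(event);        /* made-up helper */
}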
@@ -9503,6 +9567,37 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
return 0;
}
+/*
+ * Variation on perf_event_ctx_lock_nested(), except we take two context
+ * mutexes.
+ */
+static struct perf_event_context *
+__perf_event_ctx_lock_double(struct perf_event *group_leader,
+ struct perf_event_context *ctx)
+{
+ struct perf_event_context *gctx;
+
+again:
+ rcu_read_lock();
+ gctx = READ_ONCE(group_leader->ctx);
+ if (!atomic_inc_not_zero(&gctx->refcount)) {
+ rcu_read_unlock();
+ goto again;
+ }
+ rcu_read_unlock();
+
+ mutex_lock_double(&gctx->mutex, &ctx->mutex);
+
+ if (group_leader->ctx != gctx) {
+ mutex_unlock(&ctx->mutex);
+ mutex_unlock(&gctx->mutex);
+ put_ctx(gctx);
+ goto again;
+ }
+
+ return gctx;
+}
+
/**
* sys_perf_event_open - open a performance event, associate it to a task/cpu
*
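__perf_event_ctx_lock_double() above is the take-a-reference / lock / revalidate / retry pattern: group_leader->ctx can change while we sleep on the mutexes, so after locking we re-check that the leader still points at the context we locked. A user-space analogue, assuming for simplicity that contexts are never freed (the kernel version additionally needs rcu_read_lock() and atomic_inc_not_zero() for lifetime); names are illustrative:

#include <pthread.h>
#include <stdatomic.h>

struct ctx {
        pthread_mutex_t mutex;
};

struct leader {
        _Atomic(struct ctx *) ctx;      /* may be switched by a concurrent group move */
};

static struct ctx *lock_stable_ctx(struct leader *l)
{
        struct ctx *c;

again:
        c = atomic_load(&l->ctx);
        pthread_mutex_lock(&c->mutex);
        if (atomic_load(&l->ctx) != c) {        /* raced with a move */
                pthread_mutex_unlock(&c->mutex);
                goto again;
        }
        return c;                               /* locked and still current */
}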
@@ -9746,12 +9841,31 @@ SYSCALL_DEFINE5(perf_event_open,
}
if (move_group) {
- gctx = group_leader->ctx;
- mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ gctx = __perf_event_ctx_lock_double(group_leader, ctx);
+
if (gctx->task == TASK_TOMBSTONE) {
err = -ESRCH;
goto err_locked;
}
+
+ /*
+ * Check if we raced against another sys_perf_event_open() call
+ * moving the software group underneath us.
+ */
+ if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
+ /*
+ * If someone moved the group out from under us, check
+ * if this new event wound up on the same ctx, if so
+ * it's the regular !move_group case; otherwise fail.
+ */
+ if (gctx != ctx) {
+ err = -EINVAL;
+ goto err_locked;
+ } else {
+ perf_event_ctx_unlock(group_leader, gctx);
+ move_group = 0;
+ }
+ }
} else {
mutex_lock(&ctx->mutex);
}
@@ -9853,7 +9967,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_unpin_context(ctx);
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
if (task) {
@@ -9879,7 +9993,7 @@ SYSCALL_DEFINE5(perf_event_open,
err_locked:
if (move_group)
- mutex_unlock(&gctx->mutex);
+ perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
/* err_file: */
fput(event_file);