summaryrefslogtreecommitdiff
path: root/kernel/trace
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/trace')
-rw-r--r--kernel/trace/Kconfig40
-rw-r--r--kernel/trace/Makefile3
-rw-r--r--kernel/trace/blktrace.c27
-rw-r--r--kernel/trace/bpf_trace.c357
-rw-r--r--kernel/trace/fgraph.c4
-rw-r--r--kernel/trace/fprobe.c332
-rw-r--r--kernel/trace/ftrace.c64
-rw-r--r--kernel/trace/rethook.c317
-rw-r--r--kernel/trace/trace.c73
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events.c104
-rw-r--r--kernel/trace/trace_events_hist.c33
-rw-r--r--kernel/trace/trace_events_synth.c14
-rw-r--r--kernel/trace/trace_events_user.c1690
-rw-r--r--kernel/trace/trace_osnoise.c4
-rw-r--r--kernel/trace/trace_sched_switch.c3
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
17 files changed, 3014 insertions, 54 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a5eb5e7fd624..9bb54c0b3b2d 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -10,6 +10,17 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
+config HAVE_RETHOOK
+ bool
+
+config RETHOOK
+ bool
+ depends on HAVE_RETHOOK
+ help
+ Enable generic return hooking feature. This is an internal
+ API, which will be used by other function-entry hooking
+ features like fprobe and kprobes.
+
config HAVE_FUNCTION_TRACER
bool
help
@@ -236,6 +247,21 @@ config DYNAMIC_FTRACE_WITH_ARGS
depends on DYNAMIC_FTRACE
depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+config FPROBE
+ bool "Kernel Function Probe (fprobe)"
+ depends on FUNCTION_TRACER
+ depends on DYNAMIC_FTRACE_WITH_REGS
+ depends on HAVE_RETHOOK
+ select RETHOOK
+ default n
+ help
+ This option enables kernel function probe (fprobe) based on ftrace.
+ The fprobe is similar to kprobes, but probes only for kernel function
+ entries and exits. This also can probe multiple functions by one
+ fprobe.
+
+ If unsure, say N.
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
@@ -737,6 +763,20 @@ config SYNTH_EVENTS
If in doubt, say N.
+config USER_EVENTS
+ bool "User trace events"
+ select TRACING
+ select DYNAMIC_EVENTS
+ help
+ User trace events are user-defined trace events that
+ can be used like an existing kernel trace event. User trace
+ events are generated by writing to a tracefs file. User
+ processes can determine if their tracing events should be
+ generated by memory mapping a tracefs file and checking for
+ an associated byte being non-zero.
+
+ If in doubt, say N.
+
config HIST_TRIGGERS
bool "Histogram triggers"
depends on ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index bedc5caceec7..d77cd8032213 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_eprobe.o
obj-$(CONFIG_TRACE_EVENT_INJECT) += trace_events_inject.o
obj-$(CONFIG_SYNTH_EVENTS) += trace_events_synth.o
obj-$(CONFIG_HIST_TRIGGERS) += trace_events_hist.o
+obj-$(CONFIG_USER_EVENTS) += trace_events_user.o
obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o
obj-$(CONFIG_KPROBE_EVENTS) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += error_report-traces.o
@@ -97,6 +98,8 @@ obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o
obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
+obj-$(CONFIG_FPROBE) += fprobe.o
+obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index af68a67179b4..4d5629196d01 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -310,10 +310,20 @@ record_it:
local_irq_restore(flags);
}
-static void blk_trace_free(struct blk_trace *bt)
+static void blk_trace_free(struct request_queue *q, struct blk_trace *bt)
{
relay_close(bt->rchan);
- debugfs_remove(bt->dir);
+
+ /*
+ * If 'bt->dir' is not set, then both 'dropped' and 'msg' are created
+ * under 'q->debugfs_dir', thus lookup and remove them.
+ */
+ if (!bt->dir) {
+ debugfs_remove(debugfs_lookup("dropped", q->debugfs_dir));
+ debugfs_remove(debugfs_lookup("msg", q->debugfs_dir));
+ } else {
+ debugfs_remove(bt->dir);
+ }
free_percpu(bt->sequence);
free_percpu(bt->msg_data);
kfree(bt);
@@ -335,10 +345,10 @@ static void put_probe_ref(void)
mutex_unlock(&blk_probe_mutex);
}
-static void blk_trace_cleanup(struct blk_trace *bt)
+static void blk_trace_cleanup(struct request_queue *q, struct blk_trace *bt)
{
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
put_probe_ref();
}
@@ -352,7 +362,7 @@ static int __blk_trace_remove(struct request_queue *q)
return -EINVAL;
if (bt->trace_state != Blktrace_running)
- blk_trace_cleanup(bt);
+ blk_trace_cleanup(q, bt);
return 0;
}
@@ -572,7 +582,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
ret = 0;
err:
if (ret)
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1616,7 +1626,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
put_probe_ref();
synchronize_rcu();
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return 0;
}
@@ -1647,7 +1657,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
return 0;
free_bt:
- blk_trace_free(bt);
+ blk_trace_free(q, bt);
return ret;
}
@@ -1892,7 +1902,6 @@ void blk_fill_rwbs(char *rwbs, unsigned int op)
switch (op & REQ_OP_MASK) {
case REQ_OP_WRITE:
- case REQ_OP_WRITE_SAME:
rwbs[i++] = 'W';
break;
case REQ_OP_DISCARD:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 21aa30644219..7fa2ebc07f60 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -17,6 +17,9 @@
#include <linux/error-injection.h>
#include <linux/btf_ids.h>
#include <linux/bpf_lsm.h>
+#include <linux/fprobe.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <net/bpf_sk_storage.h>
@@ -77,6 +80,8 @@ u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size,
u64 flags, const struct btf **btf,
s32 *btf_id);
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx);
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx);
/**
* trace_call_bpf - invoke BPF program
@@ -332,8 +337,6 @@ BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
if (unlikely(in_interrupt() ||
current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -835,8 +838,6 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type)
*/
if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
return -EPERM;
- if (unlikely(uaccess_kernel()))
- return -EPERM;
if (unlikely(!nmi_uaccess_okay()))
return -EPERM;
@@ -1036,6 +1037,30 @@ static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe = {
.arg1_type = ARG_PTR_TO_CTX,
};
+BPF_CALL_1(bpf_get_func_ip_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_entry_ip(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_func_ip_proto_kprobe_multi = {
+ .func = bpf_get_func_ip_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
+BPF_CALL_1(bpf_get_attach_cookie_kprobe_multi, struct pt_regs *, regs)
+{
+ return bpf_kprobe_multi_cookie(current->bpf_ctx);
+}
+
+static const struct bpf_func_proto bpf_get_attach_cookie_proto_kmulti = {
+ .func = bpf_get_attach_cookie_kprobe_multi,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+};
+
BPF_CALL_1(bpf_get_attach_cookie_trace, void *, ctx)
{
struct bpf_trace_run_ctx *run_ctx;
@@ -1235,6 +1260,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_task_stack_proto;
case BPF_FUNC_copy_from_user:
return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL;
+ case BPF_FUNC_copy_from_user_task:
+ return prog->aux->sleepable ? &bpf_copy_from_user_task_proto : NULL;
case BPF_FUNC_snprintf_btf:
return &bpf_snprintf_btf_proto;
case BPF_FUNC_per_cpu_ptr:
@@ -1277,9 +1304,13 @@ kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_override_return_proto;
#endif
case BPF_FUNC_get_func_ip:
- return &bpf_get_func_ip_proto_kprobe;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_func_ip_proto_kprobe_multi :
+ &bpf_get_func_ip_proto_kprobe;
case BPF_FUNC_get_attach_cookie:
- return &bpf_get_attach_cookie_proto_trace;
+ return prog->expected_attach_type == BPF_TRACE_KPROBE_MULTI ?
+ &bpf_get_attach_cookie_proto_kmulti :
+ &bpf_get_attach_cookie_proto_trace;
default:
return bpf_tracing_func_proto(func_id, prog);
}
@@ -1562,6 +1593,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
extern const struct bpf_func_proto bpf_skb_output_proto;
extern const struct bpf_func_proto bpf_xdp_output_proto;
+extern const struct bpf_func_proto bpf_xdp_get_buff_len_trace_proto;
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
struct bpf_map *, map, u64, flags)
@@ -1661,6 +1693,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sock_from_file_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_ptr_cookie_proto;
+ case BPF_FUNC_xdp_get_buff_len:
+ return &bpf_xdp_get_buff_len_trace_proto;
#endif
case BPF_FUNC_seq_printf:
return prog->expected_attach_type == BPF_TRACE_ITER ?
@@ -2176,3 +2210,314 @@ static int __init bpf_event_init(void)
fs_initcall(bpf_event_init);
#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_FPROBE
+struct bpf_kprobe_multi_link {
+ struct bpf_link link;
+ struct fprobe fp;
+ unsigned long *addrs;
+ u64 *cookies;
+ u32 cnt;
+};
+
+struct bpf_kprobe_multi_run_ctx {
+ struct bpf_run_ctx run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ unsigned long entry_ip;
+};
+
+static void bpf_kprobe_multi_link_release(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ unregister_fprobe(&kmulti_link->fp);
+}
+
+static void bpf_kprobe_multi_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_kprobe_multi_link *kmulti_link;
+
+ kmulti_link = container_of(link, struct bpf_kprobe_multi_link, link);
+ kvfree(kmulti_link->addrs);
+ kvfree(kmulti_link->cookies);
+ kfree(kmulti_link);
+}
+
+static const struct bpf_link_ops bpf_kprobe_multi_link_lops = {
+ .release = bpf_kprobe_multi_link_release,
+ .dealloc = bpf_kprobe_multi_link_dealloc,
+};
+
+static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void *priv)
+{
+ const struct bpf_kprobe_multi_link *link = priv;
+ unsigned long *addr_a = a, *addr_b = b;
+ u64 *cookie_a, *cookie_b;
+ unsigned long tmp1;
+ u64 tmp2;
+
+ cookie_a = link->cookies + (addr_a - link->addrs);
+ cookie_b = link->cookies + (addr_b - link->addrs);
+
+ /* swap addr_a/addr_b and cookie_a/cookie_b values */
+ tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1;
+ tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2;
+}
+
+static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b)
+{
+ const unsigned long *addr_a = a, *addr_b = b;
+
+ if (*addr_a == *addr_b)
+ return 0;
+ return *addr_a < *addr_b ? -1 : 1;
+}
+
+static int bpf_kprobe_multi_cookie_cmp(const void *a, const void *b, const void *priv)
+{
+ return __bpf_kprobe_multi_cookie_cmp(a, b);
+}
+
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+ struct bpf_kprobe_multi_link *link;
+ u64 *cookie, entry_ip;
+ unsigned long *addr;
+
+ if (WARN_ON_ONCE(!ctx))
+ return 0;
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ link = run_ctx->link;
+ if (!link->cookies)
+ return 0;
+ entry_ip = run_ctx->entry_ip;
+ addr = bsearch(&entry_ip, link->addrs, link->cnt, sizeof(entry_ip),
+ __bpf_kprobe_multi_cookie_cmp);
+ if (!addr)
+ return 0;
+ cookie = link->cookies + (addr - link->addrs);
+ return *cookie;
+}
+
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ struct bpf_kprobe_multi_run_ctx *run_ctx;
+
+ run_ctx = container_of(current->bpf_ctx, struct bpf_kprobe_multi_run_ctx, run_ctx);
+ return run_ctx->entry_ip;
+}
+
+static int
+kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
+ unsigned long entry_ip, struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_run_ctx run_ctx = {
+ .link = link,
+ .entry_ip = entry_ip,
+ };
+ struct bpf_run_ctx *old_run_ctx;
+ int err;
+
+ if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
+ err = 0;
+ goto out;
+ }
+
+ migrate_disable();
+ rcu_read_lock();
+ old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx);
+ err = bpf_prog_run(link->link.prog, regs);
+ bpf_reset_run_ctx(old_run_ctx);
+ rcu_read_unlock();
+ migrate_enable();
+
+ out:
+ __this_cpu_dec(bpf_prog_active);
+ return err;
+}
+
+static void
+kprobe_multi_link_handler(struct fprobe *fp, unsigned long entry_ip,
+ struct pt_regs *regs)
+{
+ struct bpf_kprobe_multi_link *link;
+
+ link = container_of(fp, struct bpf_kprobe_multi_link, fp);
+ kprobe_multi_link_prog_run(link, entry_ip, regs);
+}
+
+static int
+kprobe_multi_resolve_syms(const void *usyms, u32 cnt,
+ unsigned long *addrs)
+{
+ unsigned long addr, size;
+ const char **syms;
+ int err = -ENOMEM;
+ unsigned int i;
+ char *func;
+
+ size = cnt * sizeof(*syms);
+ syms = kvzalloc(size, GFP_KERNEL);
+ if (!syms)
+ return -ENOMEM;
+
+ func = kmalloc(KSYM_NAME_LEN, GFP_KERNEL);
+ if (!func)
+ goto error;
+
+ if (copy_from_user(syms, usyms, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+
+ for (i = 0; i < cnt; i++) {
+ err = strncpy_from_user(func, syms[i], KSYM_NAME_LEN);
+ if (err == KSYM_NAME_LEN)
+ err = -E2BIG;
+ if (err < 0)
+ goto error;
+ err = -EINVAL;
+ addr = kallsyms_lookup_name(func);
+ if (!addr)
+ goto error;
+ if (!kallsyms_lookup_size_offset(addr, &size, NULL))
+ goto error;
+ addr = ftrace_location_range(addr, addr + size - 1);
+ if (!addr)
+ goto error;
+ addrs[i] = addr;
+ }
+
+ err = 0;
+error:
+ kvfree(syms);
+ kfree(func);
+ return err;
+}
+
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ struct bpf_kprobe_multi_link *link = NULL;
+ struct bpf_link_primer link_primer;
+ void __user *ucookies;
+ unsigned long *addrs;
+ u32 flags, cnt, size;
+ void __user *uaddrs;
+ u64 *cookies = NULL;
+ void __user *usyms;
+ int err;
+
+ /* no support for 32bit archs yet */
+ if (sizeof(u64) != sizeof(void *))
+ return -EOPNOTSUPP;
+
+ if (prog->expected_attach_type != BPF_TRACE_KPROBE_MULTI)
+ return -EINVAL;
+
+ flags = attr->link_create.kprobe_multi.flags;
+ if (flags & ~BPF_F_KPROBE_MULTI_RETURN)
+ return -EINVAL;
+
+ uaddrs = u64_to_user_ptr(attr->link_create.kprobe_multi.addrs);
+ usyms = u64_to_user_ptr(attr->link_create.kprobe_multi.syms);
+ if (!!uaddrs == !!usyms)
+ return -EINVAL;
+
+ cnt = attr->link_create.kprobe_multi.cnt;
+ if (!cnt)
+ return -EINVAL;
+
+ size = cnt * sizeof(*addrs);
+ addrs = kvmalloc(size, GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+
+ if (uaddrs) {
+ if (copy_from_user(addrs, uaddrs, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ } else {
+ err = kprobe_multi_resolve_syms(usyms, cnt, addrs);
+ if (err)
+ goto error;
+ }
+
+ ucookies = u64_to_user_ptr(attr->link_create.kprobe_multi.cookies);
+ if (ucookies) {
+ cookies = kvmalloc(size, GFP_KERNEL);
+ if (!cookies) {
+ err = -ENOMEM;
+ goto error;
+ }
+ if (copy_from_user(cookies, ucookies, size)) {
+ err = -EFAULT;
+ goto error;
+ }
+ }
+
+ link = kzalloc(sizeof(*link), GFP_KERNEL);
+ if (!link) {
+ err = -ENOMEM;
+ goto error;
+ }
+
+ bpf_link_init(&link->link, BPF_LINK_TYPE_KPROBE_MULTI,
+ &bpf_kprobe_multi_link_lops, prog);
+
+ err = bpf_link_prime(&link->link, &link_primer);
+ if (err)
+ goto error;
+
+ if (flags & BPF_F_KPROBE_MULTI_RETURN)
+ link->fp.exit_handler = kprobe_multi_link_handler;
+ else
+ link->fp.entry_handler = kprobe_multi_link_handler;
+
+ link->addrs = addrs;
+ link->cookies = cookies;
+ link->cnt = cnt;
+
+ if (cookies) {
+ /*
+ * Sorting addresses will trigger sorting cookies as well
+ * (check bpf_kprobe_multi_cookie_swap). This way we can
+ * find cookie based on the address in bpf_get_attach_cookie
+ * helper.
+ */
+ sort_r(addrs, cnt, sizeof(*addrs),
+ bpf_kprobe_multi_cookie_cmp,
+ bpf_kprobe_multi_cookie_swap,
+ link);
+ }
+
+ err = register_fprobe_ips(&link->fp, addrs, cnt);
+ if (err) {
+ bpf_link_cleanup(&link_primer);
+ return err;
+ }
+
+ return bpf_link_settle(&link_primer);
+
+error:
+ kfree(link);
+ kvfree(addrs);
+ kvfree(cookies);
+ return err;
+}
+#else /* !CONFIG_FPROBE */
+int bpf_kprobe_multi_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
+{
+ return -EOPNOTSUPP;
+}
+static u64 bpf_kprobe_multi_cookie(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
+{
+ return 0;
+}
+#endif
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 22061d38fc00..19028e072cdb 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -415,7 +415,9 @@ free:
static void
ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
unsigned long long timestamp;
int index;
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
new file mode 100644
index 000000000000..8b2dd5b9dcd1
--- /dev/null
+++ b/kernel/trace/fprobe.c
@@ -0,0 +1,332 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * fprobe - Simple ftrace probe wrapper for function entry.
+ */
+#define pr_fmt(fmt) "fprobe: " fmt
+
+#include <linux/err.h>
+#include <linux/fprobe.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+#include "trace.h"
+
+struct fprobe_rethook_node {
+ struct rethook_node node;
+ unsigned long entry_ip;
+};
+
+static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe_rethook_node *fpr;
+ struct rethook_node *rh;
+ struct fprobe *fp;
+ int bit;
+
+ fp = container_of(ops, struct fprobe, ops);
+ if (fprobe_disabled(fp))
+ return;
+
+ bit = ftrace_test_recursion_trylock(ip, parent_ip);
+ if (bit < 0) {
+ fp->nmissed++;
+ return;
+ }
+
+ if (fp->entry_handler)
+ fp->entry_handler(fp, ip, ftrace_get_regs(fregs));
+
+ if (fp->exit_handler) {
+ rh = rethook_try_get(fp->rethook);
+ if (!rh) {
+ fp->nmissed++;
+ goto out;
+ }
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+ fpr->entry_ip = ip;
+ rethook_hook(rh, ftrace_get_regs(fregs), true);
+ }
+
+out:
+ ftrace_test_recursion_unlock(bit);
+}
+NOKPROBE_SYMBOL(fprobe_handler);
+
+static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *ops, struct ftrace_regs *fregs)
+{
+ struct fprobe *fp = container_of(ops, struct fprobe, ops);
+
+ if (unlikely(kprobe_running())) {
+ fp->nmissed++;
+ return;
+ }
+ kprobe_busy_begin();
+ fprobe_handler(ip, parent_ip, ops, fregs);
+ kprobe_busy_end();
+}
+
+static void fprobe_exit_handler(struct rethook_node *rh, void *data,
+ struct pt_regs *regs)
+{
+ struct fprobe *fp = (struct fprobe *)data;
+ struct fprobe_rethook_node *fpr;
+
+ if (!fp || fprobe_disabled(fp))
+ return;
+
+ fpr = container_of(rh, struct fprobe_rethook_node, node);
+
+ fp->exit_handler(fp, fpr->entry_ip, regs);
+}
+NOKPROBE_SYMBOL(fprobe_exit_handler);
+
+/* Convert ftrace location address from symbols */
+static unsigned long *get_ftrace_locations(const char **syms, int num)
+{
+ unsigned long addr, size;
+ unsigned long *addrs;
+ int i;
+
+ /* Convert symbols to symbol address */
+ addrs = kcalloc(num, sizeof(*addrs), GFP_KERNEL);
+ if (!addrs)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < num; i++) {
+ addr = kallsyms_lookup_name(syms[i]);
+ if (!addr) /* Maybe wrong symbol */
+ goto error;
+
+ /* Convert symbol address to ftrace location. */
+ if (!kallsyms_lookup_size_offset(addr, &size, NULL) || !size)
+ goto error;
+
+ addr = ftrace_location_range(addr, addr + size - 1);
+ if (!addr) /* No dynamic ftrace there. */
+ goto error;
+
+ addrs[i] = addr;
+ }
+
+ return addrs;
+
+error:
+ kfree(addrs);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static void fprobe_init(struct fprobe *fp)
+{
+ fp->nmissed = 0;
+ if (fprobe_shared_with_kprobes(fp))
+ fp->ops.func = fprobe_kprobe_handler;
+ else
+ fp->ops.func = fprobe_handler;
+ fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
+}
+
+static int fprobe_init_rethook(struct fprobe *fp, int num)
+{
+ int i, size;
+
+ if (num < 0)
+ return -EINVAL;
+
+ if (!fp->exit_handler) {
+ fp->rethook = NULL;
+ return 0;
+ }
+
+ /* Initialize rethook if needed */
+ size = num * num_possible_cpus() * 2;
+ if (size < 0)
+ return -E2BIG;
+
+ fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler);
+ for (i = 0; i < size; i++) {
+ struct rethook_node *node;
+
+ node = kzalloc(sizeof(struct fprobe_rethook_node), GFP_KERNEL);
+ if (!node) {
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ return -ENOMEM;
+ }
+ rethook_add_node(fp->rethook, node);
+ }
+ return 0;
+}
+
+static void fprobe_fail_cleanup(struct fprobe *fp)
+{
+ if (fp->rethook) {
+ /* Don't need to cleanup rethook->handler because this is not used. */
+ rethook_free(fp->rethook);
+ fp->rethook = NULL;
+ }
+ ftrace_free_filter(&fp->ops);
+}
+
+/**
+ * register_fprobe() - Register fprobe to ftrace by pattern.
+ * @fp: A fprobe data structure to be registered.
+ * @filter: A wildcard pattern of probed symbols.
+ * @notfilter: A wildcard pattern of NOT probed symbols.
+ *
+ * Register @fp to ftrace for enabling the probe on the symbols matched to @filter.
+ * If @notfilter is not NULL, the symbols matched the @notfilter are not probed.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
+{
+ struct ftrace_hash *hash;
+ unsigned char *str;
+ int ret, len;
+
+ if (!fp || !filter)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ len = strlen(filter);
+ str = kstrdup(filter, GFP_KERNEL);
+ ret = ftrace_set_filter(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ return ret;
+
+ if (notfilter) {
+ len = strlen(notfilter);
+ str = kstrdup(notfilter, GFP_KERNEL);
+ ret = ftrace_set_notrace(&fp->ops, str, len, 0);
+ kfree(str);
+ if (ret)
+ goto out;
+ }
+
+ /* TODO:
+ * correctly calculate the total number of filtered symbols
+ * from both filter and notfilter.
+ */
+ hash = fp->ops.local_hash.filter_hash;
+ if (WARN_ON_ONCE(!hash))
+ goto out;
+
+ ret = fprobe_init_rethook(fp, (int)hash->count);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+out:
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe);
+
+/**
+ * register_fprobe_ips() - Register fprobe to ftrace by address.
+ * @fp: A fprobe data structure to be registered.
+ * @addrs: An array of target ftrace location addresses.
+ * @num: The number of entries of @addrs.
+ *
+ * Register @fp to ftrace for enabling the probe on the address given by @addrs.
+ * The @addrs must be the addresses of ftrace location address, which may be
+ * the symbol address + arch-dependent offset.
+ * If you unsure what this mean, please use other registration functions.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ int ret;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ fprobe_init(fp);
+
+ ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ ret = fprobe_init_rethook(fp, num);
+ if (!ret)
+ ret = register_ftrace_function(&fp->ops);
+
+ if (ret)
+ fprobe_fail_cleanup(fp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_ips);
+
+/**
+ * register_fprobe_syms() - Register fprobe to ftrace by symbols.
+ * @fp: A fprobe data structure to be registered.
+ * @syms: An array of target symbols.
+ * @num: The number of entries of @syms.
+ *
+ * Register @fp to the symbols given by @syms array. This will be useful if
+ * you are sure the symbols exist in the kernel.
+ *
+ * Return 0 if @fp is registered successfully, -errno if not.
+ */
+int register_fprobe_syms(struct fprobe *fp, const char **syms, int num)
+{
+ unsigned long *addrs;
+ int ret;
+
+ if (!fp || !syms || num <= 0)
+ return -EINVAL;
+
+ addrs = get_ftrace_locations(syms, num);
+ if (IS_ERR(addrs))
+ return PTR_ERR(addrs);
+
+ ret = register_fprobe_ips(fp, addrs, num);
+
+ kfree(addrs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(register_fprobe_syms);
+
+/**
+ * unregister_fprobe() - Unregister fprobe from ftrace
+ * @fp: A fprobe data structure to be unregistered.
+ *
+ * Unregister fprobe (and remove ftrace hooks from the function entries).
+ *
+ * Return 0 if @fp is unregistered successfully, -errno if not.
+ */
+int unregister_fprobe(struct fprobe *fp)
+{
+ int ret;
+
+ if (!fp || fp->ops.func != fprobe_handler)
+ return -EINVAL;
+
+ /*
+ * rethook_free() starts disabling the rethook, but the rethook handlers
+ * may be running on other processors at this point. To make sure that all
+ * current running handlers are finished, call unregister_ftrace_function()
+ * after this.
+ */
+ if (fp->rethook)
+ rethook_free(fp->rethook);
+
+ ret = unregister_ftrace_function(&fp->ops);
+ if (ret < 0)
+ return ret;
+
+ ftrace_free_filter(&fp->ops);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_fprobe);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6105b7036482..2e114659f7f8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4958,7 +4958,7 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
}
static int
-ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
+__ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
{
struct ftrace_func_entry *entry;
@@ -4977,8 +4977,29 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
}
static int
+ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
+ unsigned int cnt, int remove)
+{
+ unsigned int i;
+ int err;
+
+ for (i = 0; i < cnt; i++) {
+ err = __ftrace_match_addr(hash, ips[i], remove);
+ if (err) {
+ /*
+ * This expects the @hash is a temporary hash and if this
+ * fails the caller must free the @hash.
+ */
+ return err;
+ }
+ }
+ return 0;
+}
+
+static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
- unsigned long ip, int remove, int reset, int enable)
+ unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5008,8 +5029,8 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
ret = -EINVAL;
goto out_regex_unlock;
}
- if (ip) {
- ret = ftrace_match_addr(hash, ip, remove);
+ if (ips) {
+ ret = ftrace_match_addr(hash, ips, cnt, remove);
if (ret < 0)
goto out_regex_unlock;
}
@@ -5026,10 +5047,10 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
static int
-ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove,
- int reset, int enable)
+ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
+ int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ip, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -5634,11 +5655,30 @@ int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip,
int remove, int reset)
{
ftrace_ops_init(ops);
- return ftrace_set_addr(ops, ip, remove, reset, 1);
+ return ftrace_set_addr(ops, &ip, 1, remove, reset, 1);
}
EXPORT_SYMBOL_GPL(ftrace_set_filter_ip);
/**
+ * ftrace_set_filter_ips - set functions to filter on in ftrace by addresses
+ * @ops - the ops to set the filter with
+ * @ips - the array of addresses to add to or remove from the filter.
+ * @cnt - the number of addresses in @ips
+ * @remove - non zero to remove ips from the filter
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled
+ * If @ips array or any ip specified within is NULL , it fails to update filter.
+ */
+int ftrace_set_filter_ips(struct ftrace_ops *ops, unsigned long *ips,
+ unsigned int cnt, int remove, int reset)
+{
+ ftrace_ops_init(ops);
+ return ftrace_set_addr(ops, ips, cnt, remove, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_filter_ips);
+
+/**
* ftrace_ops_set_global_filter - setup ops to use global filters
* @ops - the ops which will use the global filters
*
@@ -5659,7 +5699,7 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable);
+ return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
}
/**
@@ -7096,6 +7136,8 @@ void __init ftrace_free_init_mem(void)
void *start = (void *)(&__init_begin);
void *end = (void *)(&__init_end);
+ ftrace_boot_snapshot();
+
ftrace_free_mem(NULL, start, end);
}
@@ -7346,7 +7388,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
static void
ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *pid_list;
diff --git a/kernel/trace/rethook.c b/kernel/trace/rethook.c
new file mode 100644
index 000000000000..ab463a4d2b23
--- /dev/null
+++ b/kernel/trace/rethook.c
@@ -0,0 +1,317 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "rethook: " fmt
+
+#include <linux/bug.h>
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/preempt.h>
+#include <linux/rethook.h>
+#include <linux/slab.h>
+#include <linux/sort.h>
+
+/* Return hook list (shadow stack by list) */
+
+/*
+ * This function is called from delayed_put_task_struct() when a task is
+ * dead and cleaned up to recycle any kretprobe instances associated with
+ * this task. These left over instances represent probed functions that
+ * have been called but will never return.
+ */
+void rethook_flush_task(struct task_struct *tk)
+{
+ struct rethook_node *rhn;
+ struct llist_node *node;
+
+ node = __llist_del_all(&tk->rethooks);
+ while (node) {
+ rhn = container_of(node, struct rethook_node, llist);
+ node = node->next;
+ preempt_disable();
+ rethook_recycle(rhn);
+ preempt_enable();
+ }
+}
+
+static void rethook_free_rcu(struct rcu_head *head)
+{
+ struct rethook *rh = container_of(head, struct rethook, rcu);
+ struct rethook_node *rhn;
+ struct freelist_node *node;
+ int count = 1;
+
+ node = rh->pool.head;
+ while (node) {
+ rhn = container_of(node, struct rethook_node, freelist);
+ node = node->next;
+ kfree(rhn);
+ count++;
+ }
+
+ /* The rh->ref is the number of pooled node + 1 */
+ if (refcount_sub_and_test(count, &rh->ref))
+ kfree(rh);
+}
+
+/**
+ * rethook_free() - Free struct rethook.
+ * @rh: the struct rethook to be freed.
+ *
+ * Free the rethook. Before calling this function, user must ensure the
+ * @rh::data is cleaned if needed (or, the handler can access it after
+ * calling this function.) This function will set the @rh to be freed
+ * after all rethook_node are freed (not soon). And the caller must
+ * not touch @rh after calling this.
+ */
+void rethook_free(struct rethook *rh)
+{
+ rcu_assign_pointer(rh->handler, NULL);
+
+ call_rcu(&rh->rcu, rethook_free_rcu);
+}
+
+/**
+ * rethook_alloc() - Allocate struct rethook.
+ * @data: a data to pass the @handler when hooking the return.
+ * @handler: the return hook callback function.
+ *
+ * Allocate and initialize a new rethook with @data and @handler.
+ * Return NULL if memory allocation fails or @handler is NULL.
+ * Note that @handler == NULL means this rethook is going to be freed.
+ */
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler)
+{
+ struct rethook *rh = kzalloc(sizeof(struct rethook), GFP_KERNEL);
+
+ if (!rh || !handler)
+ return NULL;
+
+ rh->data = data;
+ rh->handler = handler;
+ rh->pool.head = NULL;
+ refcount_set(&rh->ref, 1);
+
+ return rh;
+}
+
+/**
+ * rethook_add_node() - Add a new node to the rethook.
+ * @rh: the struct rethook.
+ * @node: the struct rethook_node to be added.
+ *
+ * Add @node to @rh. User must allocate @node (as a part of user's
+ * data structure.) The @node fields are initialized in this function.
+ */
+void rethook_add_node(struct rethook *rh, struct rethook_node *node)
+{
+ node->rethook = rh;
+ freelist_add(&node->freelist, &rh->pool);
+ refcount_inc(&rh->ref);
+}
+
+static void free_rethook_node_rcu(struct rcu_head *head)
+{
+ struct rethook_node *node = container_of(head, struct rethook_node, rcu);
+
+ if (refcount_dec_and_test(&node->rethook->ref))
+ kfree(node->rethook);
+ kfree(node);
+}
+
+/**
+ * rethook_recycle() - return the node to rethook.
+ * @node: The struct rethook_node to be returned.
+ *
+ * Return back the @node to @node::rethook. If the @node::rethook is already
+ * marked as freed, this will free the @node.
+ */
+void rethook_recycle(struct rethook_node *node)
+{
+ lockdep_assert_preemption_disabled();
+
+ if (likely(READ_ONCE(node->rethook->handler)))
+ freelist_add(&node->freelist, &node->rethook->pool);
+ else
+ call_rcu(&node->rcu, free_rethook_node_rcu);
+}
+NOKPROBE_SYMBOL(rethook_recycle);
+
+/**
+ * rethook_try_get() - get an unused rethook node.
+ * @rh: The struct rethook which pools the nodes.
+ *
+ * Get an unused rethook node from @rh. If the node pool is empty, this
+ * will return NULL. Caller must disable preemption.
+ */
+struct rethook_node *rethook_try_get(struct rethook *rh)
+{
+ rethook_handler_t handler = READ_ONCE(rh->handler);
+ struct freelist_node *fn;
+
+ lockdep_assert_preemption_disabled();
+
+ /* Check whether @rh is going to be freed. */
+ if (unlikely(!handler))
+ return NULL;
+
+ fn = freelist_try_get(&rh->pool);
+ if (!fn)
+ return NULL;
+
+ return container_of(fn, struct rethook_node, freelist);
+}
+NOKPROBE_SYMBOL(rethook_try_get);
+
+/**
+ * rethook_hook() - Hook the current function return.
+ * @node: The struct rethook node to hook the function return.
+ * @regs: The struct pt_regs for the function entry.
+ * @mcount: True if this is called from mcount(ftrace) context.
+ *
+ * Hook the current running function return. This must be called when the
+ * function entry (or at least @regs must be the registers of the function
+ * entry.) @mcount is used for identifying the context. If this is called
+ * from ftrace (mcount) callback, @mcount must be set true. If this is called
+ * from the real function entry (e.g. kprobes) @mcount must be set false.
+ * This is because the way to hook the function return depends on the context.
+ */
+void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount)
+{
+ arch_rethook_prepare(node, regs, mcount);
+ __llist_add(&node->llist, &current->rethooks);
+}
+NOKPROBE_SYMBOL(rethook_hook);
+
+/* This assumes the 'tsk' is the current task or is not running. */
+static unsigned long __rethook_find_ret_addr(struct task_struct *tsk,
+ struct llist_node **cur)
+{
+ struct rethook_node *rh = NULL;
+ struct llist_node *node = *cur;
+
+ if (!node)
+ node = tsk->rethooks.first;
+ else
+ node = node->next;
+
+ while (node) {
+ rh = container_of(node, struct rethook_node, llist);
+ if (rh->ret_addr != (unsigned long)arch_rethook_trampoline) {
+ *cur = node;
+ return rh->ret_addr;
+ }
+ node = node->next;
+ }
+ return 0;
+}
+NOKPROBE_SYMBOL(__rethook_find_ret_addr);
+
+/**
+ * rethook_find_ret_addr -- Find correct return address modified by rethook
+ * @tsk: Target task
+ * @frame: A frame pointer
+ * @cur: a storage of the loop cursor llist_node pointer for next call
+ *
+ * Find the correct return address modified by a rethook on @tsk in unsigned
+ * long type.
+ * The @tsk must be 'current' or a task which is not running. @frame is a hint
+ * to get the currect return address - which is compared with the
+ * rethook::frame field. The @cur is a loop cursor for searching the
+ * kretprobe return addresses on the @tsk. The '*@cur' should be NULL at the
+ * first call, but '@cur' itself must NOT NULL.
+ *
+ * Returns found address value or zero if not found.
+ */
+unsigned long rethook_find_ret_addr(struct task_struct *tsk, unsigned long frame,
+ struct llist_node **cur)
+{
+ struct rethook_node *rhn = NULL;
+ unsigned long ret;
+
+ if (WARN_ON_ONCE(!cur))
+ return 0;
+
+ if (WARN_ON_ONCE(tsk != current && task_is_running(tsk)))
+ return 0;
+
+ do {
+ ret = __rethook_find_ret_addr(tsk, cur);
+ if (!ret)
+ break;
+ rhn = container_of(*cur, struct rethook_node, llist);
+ } while (rhn->frame != frame);
+
+ return ret;
+}
+NOKPROBE_SYMBOL(rethook_find_ret_addr);
+
+void __weak arch_rethook_fixup_return(struct pt_regs *regs,
+ unsigned long correct_ret_addr)
+{
+ /*
+ * Do nothing by default. If the architecture which uses a
+ * frame pointer to record real return address on the stack,
+ * it should fill this function to fixup the return address
+ * so that stacktrace works from the rethook handler.
+ */
+}
+
+/* This function will be called from each arch-defined trampoline. */
+unsigned long rethook_trampoline_handler(struct pt_regs *regs,
+ unsigned long frame)
+{
+ struct llist_node *first, *node = NULL;
+ unsigned long correct_ret_addr;
+ rethook_handler_t handler;
+ struct rethook_node *rhn;
+
+ correct_ret_addr = __rethook_find_ret_addr(current, &node);
+ if (!correct_ret_addr) {
+ pr_err("rethook: Return address not found! Maybe there is a bug in the kernel\n");
+ BUG_ON(1);
+ }
+
+ instruction_pointer_set(regs, correct_ret_addr);
+
+ /*
+ * These loops must be protected from rethook_free_rcu() because those
+ * are accessing 'rhn->rethook'.
+ */
+ preempt_disable();
+
+ /*
+ * Run the handler on the shadow stack. Do not unlink the list here because
+ * stackdump inside the handlers needs to decode it.
+ */
+ first = current->rethooks.first;
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ if (WARN_ON_ONCE(rhn->frame != frame))
+ break;
+ handler = READ_ONCE(rhn->rethook->handler);
+ if (handler)
+ handler(rhn, rhn->rethook->data, regs);
+
+ if (first == node)
+ break;
+ first = first->next;
+ }
+
+ /* Fixup registers for returning to correct address. */
+ arch_rethook_fixup_return(regs, correct_ret_addr);
+
+ /* Unlink used shadow stack */
+ first = current->rethooks.first;
+ current->rethooks.first = node->next;
+ node->next = NULL;
+
+ while (first) {
+ rhn = container_of(first, struct rethook_node, llist);
+ first = first->next;
+ rethook_recycle(rhn);
+ }
+ preempt_enable();
+
+ return correct_ret_addr;
+}
+NOKPROBE_SYMBOL(rethook_trampoline_handler);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 96265a717ca4..f4de111fa18f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -185,6 +185,7 @@ static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
static bool allocate_snapshot;
+static bool snapshot_at_boot;
static int __init set_cmdline_ftrace(char *str)
{
@@ -230,6 +231,15 @@ static int __init boot_alloc_snapshot(char *str)
__setup("alloc_snapshot", boot_alloc_snapshot);
+static int __init boot_snapshot(char *str)
+{
+ snapshot_at_boot = true;
+ boot_alloc_snapshot(str);
+ return 1;
+}
+__setup("ftrace_boot_snapshot", boot_snapshot);
+
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
@@ -7730,7 +7740,7 @@ const struct file_operations trace_min_max_fops = {
struct err_info {
const char **errs; /* ptr to loc-specific array of err strings */
u8 type; /* index into errs -> specific err string */
- u8 pos; /* MAX_FILTER_STR_VAL = 256 */
+ u16 pos; /* caret position */
u64 ts;
};
@@ -7738,26 +7748,52 @@ struct tracing_log_err {
struct list_head list;
struct err_info info;
char loc[TRACING_LOG_LOC_MAX]; /* err location */
- char cmd[MAX_FILTER_STR_VAL]; /* what caused err */
+ char *cmd; /* what caused err */
};
static DEFINE_MUTEX(tracing_err_log_lock);
-static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
+static struct tracing_log_err *alloc_tracing_log_err(int len)
+{
+ struct tracing_log_err *err;
+
+ err = kzalloc(sizeof(*err), GFP_KERNEL);
+ if (!err)
+ return ERR_PTR(-ENOMEM);
+
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd) {
+ kfree(err);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return err;
+}
+
+static void free_tracing_log_err(struct tracing_log_err *err)
+{
+ kfree(err->cmd);
+ kfree(err);
+}
+
+static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr,
+ int len)
{
struct tracing_log_err *err;
if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) {
- err = kzalloc(sizeof(*err), GFP_KERNEL);
- if (!err)
- err = ERR_PTR(-ENOMEM);
- else
+ err = alloc_tracing_log_err(len);
+ if (PTR_ERR(err) != -ENOMEM)
tr->n_err_log_entries++;
return err;
}
err = list_first_entry(&tr->err_log, struct tracing_log_err, list);
+ kfree(err->cmd);
+ err->cmd = kzalloc(len, GFP_KERNEL);
+ if (!err->cmd)
+ return ERR_PTR(-ENOMEM);
list_del(&err->list);
return err;
@@ -7818,22 +7854,25 @@ unsigned int err_pos(char *cmd, const char *str)
*/
void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos)
+ const char **errs, u8 type, u16 pos)
{
struct tracing_log_err *err;
+ int len = 0;
if (!tr)
tr = &global_trace;
+ len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
+
mutex_lock(&tracing_err_log_lock);
- err = get_tracing_log_err(tr);
+ err = get_tracing_log_err(tr, len);
if (PTR_ERR(err) == -ENOMEM) {
mutex_unlock(&tracing_err_log_lock);
return;
}
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
- snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd);
+ snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
err->info.errs = errs;
err->info.type = type;
@@ -7851,7 +7890,7 @@ static void clear_tracing_err_log(struct trace_array *tr)
mutex_lock(&tracing_err_log_lock);
list_for_each_entry_safe(err, next, &tr->err_log, list) {
list_del(&err->list);
- kfree(err);
+ free_tracing_log_err(err);
}
tr->n_err_log_entries = 0;
@@ -7879,9 +7918,9 @@ static void tracing_err_log_seq_stop(struct seq_file *m, void *v)
mutex_unlock(&tracing_err_log_lock);
}
-static void tracing_err_log_show_pos(struct seq_file *m, u8 pos)
+static void tracing_err_log_show_pos(struct seq_file *m, u16 pos)
{
- u8 i;
+ u16 i;
for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++)
seq_putc(m, ' ');
@@ -10127,6 +10166,14 @@ out:
return ret;
}
+void __init ftrace_boot_snapshot(void)
+{
+ if (snapshot_at_boot) {
+ tracing_snapshot();
+ internal_trace_puts("** Boot snapshot taken **\n");
+ }
+}
+
void __init early_trace_init(void)
{
if (tracepoint_printk) {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5b09c31e077..07d990270e2a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1877,7 +1877,7 @@ extern ssize_t trace_parse_run_command(struct file *file,
extern unsigned int err_pos(char *cmd, const char *str);
extern void tracing_log_err(struct trace_array *tr,
const char *loc, const char *cmd,
- const char **errs, u8 type, u8 pos);
+ const char **errs, u8 type, u16 pos);
/*
* Normal trace_printk() and friends allocates special buffers
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3147614c1812..e11e167b7809 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -40,6 +40,14 @@ static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
static bool eventdir_initialized;
+static LIST_HEAD(module_strings);
+
+struct module_string {
+ struct list_head next;
+ struct module *module;
+ char *str;
+};
+
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
static struct kmem_cache *field_cachep;
@@ -384,6 +392,12 @@ static void test_event_printk(struct trace_event_call *call)
if (!(dereference_flags & (1ULL << arg)))
goto next_arg;
+ /* Check for __get_sockaddr */;
+ if (str_has_prefix(fmt + i, "__get_sockaddr(")) {
+ dereference_flags &= ~(1ULL << arg);
+ goto next_arg;
+ }
+
/* Find the REC-> in the argument */
c = strchr(fmt + i, ',');
r = strstr(fmt + i, "REC->");
@@ -759,7 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
static void
event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -783,7 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
static void
event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
- struct task_struct *prev, struct task_struct *next)
+ unsigned int prev_state,
+ struct task_struct *prev,
+ struct task_struct *next)
{
struct trace_array *tr = data;
struct trace_pid_list *no_pid_list;
@@ -2633,6 +2651,76 @@ static void update_event_printk(struct trace_event_call *call,
}
}
+static void add_str_to_module(struct module *module, char *str)
+{
+ struct module_string *modstr;
+
+ modstr = kmalloc(sizeof(*modstr), GFP_KERNEL);
+
+ /*
+ * If we failed to allocate memory here, then we'll just
+ * let the str memory leak when the module is removed.
+ * If this fails to allocate, there's worse problems than
+ * a leaked string on module removal.
+ */
+ if (WARN_ON_ONCE(!modstr))
+ return;
+
+ modstr->module = module;
+ modstr->str = str;
+
+ list_add(&modstr->next, &module_strings);
+}
+
+static void update_event_fields(struct trace_event_call *call,
+ struct trace_eval_map *map)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+ char *ptr;
+ char *str;
+ int len = strlen(map->eval_string);
+
+ /* Dynamic events should never have field maps */
+ if (WARN_ON_ONCE(call->flags & TRACE_EVENT_FL_DYNAMIC))
+ return;
+
+ head = trace_get_fields(call);
+ list_for_each_entry(field, head, link) {
+ ptr = strchr(field->type, '[');
+ if (!ptr)
+ continue;
+ ptr++;
+
+ if (!isalpha(*ptr) && *ptr != '_')
+ continue;
+
+ if (strncmp(map->eval_string, ptr, len) != 0)
+ continue;
+
+ str = kstrdup(field->type, GFP_KERNEL);
+ if (WARN_ON_ONCE(!str))
+ return;
+ ptr = str + (ptr - field->type);
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
+ if (WARN_ON_ONCE(!ptr)) {
+ kfree(str);
+ continue;
+ }
+
+ /*
+ * If the event is part of a module, then we need to free the string
+ * when the module is removed. Otherwise, it will stay allocated
+ * until a reboot.
+ */
+ if (call->module)
+ add_str_to_module(call->module, str);
+
+ field->type = str;
+ }
+}
+
void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
@@ -2668,6 +2756,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
first = false;
}
update_event_printk(call, map[i]);
+ update_event_fields(call, map[i]);
}
}
}
@@ -2758,6 +2847,7 @@ int trace_add_event_call(struct trace_event_call *call)
mutex_unlock(&trace_types_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(trace_add_event_call);
/*
* Must be called under locking of trace_types_lock, event_mutex and
@@ -2819,6 +2909,7 @@ int trace_remove_event_call(struct trace_event_call *call)
return ret;
}
+EXPORT_SYMBOL_GPL(trace_remove_event_call);
#define for_each_event(event, start, end) \
for (event = start; \
@@ -2853,6 +2944,7 @@ static void trace_module_add_events(struct module *mod)
static void trace_module_remove_events(struct module *mod)
{
struct trace_event_call *call, *p;
+ struct module_string *modstr, *m;
down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -2861,6 +2953,14 @@ static void trace_module_remove_events(struct module *mod)
if (call->module == mod)
__trace_remove_event_call(call);
}
+ /* Check for any strings allocade for this module */
+ list_for_each_entry_safe(modstr, m, &module_strings, next) {
+ if (modstr->module != mod)
+ continue;
+ list_del(&modstr->next);
+ kfree(modstr->str);
+ kfree(modstr);
+ }
up_write(&trace_event_sem);
/*
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index dc7f733b4cb3..44db5ba9cabb 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -727,11 +727,16 @@ static struct track_data *track_data_alloc(unsigned int key_len,
return data;
}
-static char last_cmd[MAX_FILTER_STR_VAL];
+#define HIST_PREFIX "hist:"
+
+static char *last_cmd;
static char last_cmd_loc[MAX_FILTER_STR_VAL];
static int errpos(char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -739,12 +744,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
{
const char *system = NULL, *name = NULL;
struct trace_event_call *call;
+ int len;
if (!str)
return;
- strcpy(last_cmd, "hist:");
- strncat(last_cmd, str, MAX_FILTER_STR_VAL - 1 - sizeof("hist:"));
+ /* sizeof() contains the nul byte */
+ len = sizeof(HIST_PREFIX) + strlen(str);
+ kfree(last_cmd);
+ last_cmd = kzalloc(len, GFP_KERNEL);
+ if (!last_cmd)
+ return;
+
+ strcpy(last_cmd, HIST_PREFIX);
+ /* Again, sizeof() contains the nul byte */
+ len -= sizeof(HIST_PREFIX);
+ strncat(last_cmd, str, len);
if (file) {
call = file->event_call;
@@ -757,18 +772,22 @@ static void last_cmd_set(struct trace_event_file *file, char *str)
}
if (system)
- snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name);
+ snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, HIST_PREFIX "%s:%s", system, name);
}
-static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos)
+static void hist_err(struct trace_array *tr, u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(tr, last_cmd_loc, last_cmd, err_text,
err_type, err_pos);
}
static void hist_err_clear(void)
{
- last_cmd[0] = '\0';
+ if (last_cmd)
+ last_cmd[0] = '\0';
last_cmd_loc[0] = '\0';
}
@@ -5610,7 +5629,7 @@ static int event_hist_trigger_print(struct seq_file *m,
bool have_var = false;
unsigned int i;
- seq_puts(m, "hist:");
+ seq_puts(m, HIST_PREFIX);
if (data->name)
seq_printf(m, "%s:", data->name);
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index 154db74dadbc..5e8c07aef071 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -42,10 +42,13 @@ enum { ERRORS };
static const char *err_text[] = { ERRORS };
-static char last_cmd[MAX_FILTER_STR_VAL];
+static char *last_cmd;
static int errpos(const char *str)
{
+ if (!str || !last_cmd)
+ return 0;
+
return err_pos(last_cmd, str);
}
@@ -54,11 +57,16 @@ static void last_cmd_set(const char *str)
if (!str)
return;
- strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1);
+ kfree(last_cmd);
+
+ last_cmd = kstrdup(str, GFP_KERNEL);
}
-static void synth_err(u8 err_type, u8 err_pos)
+static void synth_err(u8 err_type, u16 err_pos)
{
+ if (!last_cmd)
+ return;
+
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
}
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
new file mode 100644
index 000000000000..8b3d241a31c2
--- /dev/null
+++ b/kernel/trace/trace_events_user.c
@@ -0,0 +1,1690 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2021, Microsoft Corporation.
+ *
+ * Authors:
+ * Beau Belgrave <beaub@linux.microsoft.com>
+ */
+
+#include <linux/bitmap.h>
+#include <linux/cdev.h>
+#include <linux/hashtable.h>
+#include <linux/list.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/ioctl.h>
+#include <linux/jhash.h>
+#include <linux/trace_events.h>
+#include <linux/tracefs.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <uapi/linux/user_events.h>
+#include "trace.h"
+#include "trace_dynevent.h"
+
+#define USER_EVENTS_PREFIX_LEN (sizeof(USER_EVENTS_PREFIX)-1)
+
+#define FIELD_DEPTH_TYPE 0
+#define FIELD_DEPTH_NAME 1
+#define FIELD_DEPTH_SIZE 2
+
+/*
+ * Limits how many trace_event calls user processes can create:
+ * Must be a power of two of PAGE_SIZE.
+ */
+#define MAX_PAGE_ORDER 0
+#define MAX_PAGES (1 << MAX_PAGE_ORDER)
+#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE)
+
+/* Limit how long of an event name plus args within the subsystem. */
+#define MAX_EVENT_DESC 512
+#define EVENT_NAME(user_event) ((user_event)->tracepoint.name)
+#define MAX_FIELD_ARRAY_SIZE 1024
+#define MAX_FIELD_ARG_NAME 256
+
+#define MAX_BPF_COPY_SIZE PAGE_SIZE
+#define MAX_STACK_BPF_DATA 512
+
+static char *register_page_data;
+
+static DEFINE_MUTEX(reg_mutex);
+static DEFINE_HASHTABLE(register_table, 4);
+static DECLARE_BITMAP(page_bitmap, MAX_EVENTS);
+
+/*
+ * Stores per-event properties, as users register events
+ * within a file a user_event might be created if it does not
+ * already exist. These are globally used and their lifetime
+ * is tied to the refcnt member. These cannot go away until the
+ * refcnt reaches zero.
+ */
+struct user_event {
+ struct tracepoint tracepoint;
+ struct trace_event_call call;
+ struct trace_event_class class;
+ struct dyn_event devent;
+ struct hlist_node node;
+ struct list_head fields;
+ struct list_head validators;
+ atomic_t refcnt;
+ int index;
+ int flags;
+ int min_size;
+};
+
+/*
+ * Stores per-file events references, as users register events
+ * within a file this structure is modified and freed via RCU.
+ * The lifetime of this struct is tied to the lifetime of the file.
+ * These are not shared and only accessible by the file that created it.
+ */
+struct user_event_refs {
+ struct rcu_head rcu;
+ int count;
+ struct user_event *events[];
+};
+
+#define VALIDATOR_ENSURE_NULL (1 << 0)
+#define VALIDATOR_REL (1 << 1)
+
+struct user_event_validator {
+ struct list_head link;
+ int offset;
+ int flags;
+};
+
+typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted);
+
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser);
+
+static u32 user_event_key(char *name)
+{
+ return jhash(name, strlen(name), 0);
+}
+
+static __always_inline __must_check
+size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i)
+{
+ size_t ret;
+
+ pagefault_disable();
+
+ ret = copy_from_iter_nocache(addr, bytes, i);
+
+ pagefault_enable();
+
+ return ret;
+}
+
+static struct list_head *user_event_get_fields(struct trace_event_call *call)
+{
+ struct user_event *user = (struct user_event *)call->data;
+
+ return &user->fields;
+}
+
+/*
+ * Parses a register command for user_events
+ * Format: event_name[:FLAG1[,FLAG2...]] [field1[;field2...]]
+ *
+ * Example event named 'test' with a 20 char 'msg' field with an unsigned int
+ * 'id' field after:
+ * test char[20] msg;unsigned int id
+ *
+ * NOTE: Offsets are from the user data perspective, they are not from the
+ * trace_entry/buffer perspective. We automatically add the common properties
+ * sizes to the offset for the user.
+ *
+ * Upon success user_event has its ref count increased by 1.
+ */
+static int user_event_parse_cmd(char *raw_command, struct user_event **newuser)
+{
+ char *name = raw_command;
+ char *args = strpbrk(name, " ");
+ char *flags;
+
+ if (args)
+ *args++ = '\0';
+
+ flags = strpbrk(name, ":");
+
+ if (flags)
+ *flags++ = '\0';
+
+ return user_event_parse(name, args, flags, newuser);
+}
+
+static int user_field_array_size(const char *type)
+{
+ const char *start = strchr(type, '[');
+ char val[8];
+ char *bracket;
+ int size = 0;
+
+ if (start == NULL)
+ return -EINVAL;
+
+ if (strscpy(val, start + 1, sizeof(val)) <= 0)
+ return -EINVAL;
+
+ bracket = strchr(val, ']');
+
+ if (!bracket)
+ return -EINVAL;
+
+ *bracket = '\0';
+
+ if (kstrtouint(val, 0, &size))
+ return -EINVAL;
+
+ if (size > MAX_FIELD_ARRAY_SIZE)
+ return -EINVAL;
+
+ return size;
+}
+
+static int user_field_size(const char *type)
+{
+ /* long is not allowed from a user, since it's ambigious in size */
+ if (strcmp(type, "s64") == 0)
+ return sizeof(s64);
+ if (strcmp(type, "u64") == 0)
+ return sizeof(u64);
+ if (strcmp(type, "s32") == 0)
+ return sizeof(s32);
+ if (strcmp(type, "u32") == 0)
+ return sizeof(u32);
+ if (strcmp(type, "int") == 0)
+ return sizeof(int);
+ if (strcmp(type, "unsigned int") == 0)
+ return sizeof(unsigned int);
+ if (strcmp(type, "s16") == 0)
+ return sizeof(s16);
+ if (strcmp(type, "u16") == 0)
+ return sizeof(u16);
+ if (strcmp(type, "short") == 0)
+ return sizeof(short);
+ if (strcmp(type, "unsigned short") == 0)
+ return sizeof(unsigned short);
+ if (strcmp(type, "s8") == 0)
+ return sizeof(s8);
+ if (strcmp(type, "u8") == 0)
+ return sizeof(u8);
+ if (strcmp(type, "char") == 0)
+ return sizeof(char);
+ if (strcmp(type, "unsigned char") == 0)
+ return sizeof(unsigned char);
+ if (str_has_prefix(type, "char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "unsigned char["))
+ return user_field_array_size(type);
+ if (str_has_prefix(type, "__data_loc "))
+ return sizeof(u32);
+ if (str_has_prefix(type, "__rel_loc "))
+ return sizeof(u32);
+
+ /* Uknown basic type, error */
+ return -EINVAL;
+}
+
+static void user_event_destroy_validators(struct user_event *user)
+{
+ struct user_event_validator *validator, *next;
+ struct list_head *head = &user->validators;
+
+ list_for_each_entry_safe(validator, next, head, link) {
+ list_del(&validator->link);
+ kfree(validator);
+ }
+}
+
+static void user_event_destroy_fields(struct user_event *user)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+
+ list_for_each_entry_safe(field, next, head, link) {
+ list_del(&field->link);
+ kfree(field);
+ }
+}
+
+static int user_event_add_field(struct user_event *user, const char *type,
+ const char *name, int offset, int size,
+ int is_signed, int filter_type)
+{
+ struct user_event_validator *validator;
+ struct ftrace_event_field *field;
+ int validator_flags = 0;
+
+ field = kmalloc(sizeof(*field), GFP_KERNEL);
+
+ if (!field)
+ return -ENOMEM;
+
+ if (str_has_prefix(type, "__data_loc "))
+ goto add_validator;
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ validator_flags |= VALIDATOR_REL;
+ goto add_validator;
+ }
+
+ goto add_field;
+
+add_validator:
+ if (strstr(type, "char") != 0)
+ validator_flags |= VALIDATOR_ENSURE_NULL;
+
+ validator = kmalloc(sizeof(*validator), GFP_KERNEL);
+
+ if (!validator) {
+ kfree(field);
+ return -ENOMEM;
+ }
+
+ validator->flags = validator_flags;
+ validator->offset = offset;
+
+ /* Want sequential access when validating */
+ list_add_tail(&validator->link, &user->validators);
+
+add_field:
+ field->type = type;
+ field->name = name;
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
+ field->filter_type = filter_type;
+
+ list_add(&field->link, &user->fields);
+
+ /*
+ * Min size from user writes that are required, this does not include
+ * the size of trace_entry (common fields).
+ */
+ user->min_size = (offset + size) - sizeof(struct trace_entry);
+
+ return 0;
+}
+
+/*
+ * Parses the values of a field within the description
+ * Format: type name [size]
+ */
+static int user_event_parse_field(char *field, struct user_event *user,
+ u32 *offset)
+{
+ char *part, *type, *name;
+ u32 depth = 0, saved_offset = *offset;
+ int len, size = -EINVAL;
+ bool is_struct = false;
+
+ field = skip_spaces(field);
+
+ if (*field == '\0')
+ return 0;
+
+ /* Handle types that have a space within */
+ len = str_has_prefix(field, "unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "struct ");
+ if (len) {
+ is_struct = true;
+ goto skip_next;
+ }
+
+ len = str_has_prefix(field, "__data_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__data_loc ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc unsigned ");
+ if (len)
+ goto skip_next;
+
+ len = str_has_prefix(field, "__rel_loc ");
+ if (len)
+ goto skip_next;
+
+ goto parse;
+skip_next:
+ type = field;
+ field = strpbrk(field + len, " ");
+
+ if (field == NULL)
+ return -EINVAL;
+
+ *field++ = '\0';
+ depth++;
+parse:
+ name = NULL;
+
+ while ((part = strsep(&field, " ")) != NULL) {
+ switch (depth++) {
+ case FIELD_DEPTH_TYPE:
+ type = part;
+ break;
+ case FIELD_DEPTH_NAME:
+ name = part;
+ break;
+ case FIELD_DEPTH_SIZE:
+ if (!is_struct)
+ return -EINVAL;
+
+ if (kstrtou32(part, 10, &size))
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (depth < FIELD_DEPTH_SIZE || !name)
+ return -EINVAL;
+
+ if (depth == FIELD_DEPTH_SIZE)
+ size = user_field_size(type);
+
+ if (size == 0)
+ return -EINVAL;
+
+ if (size < 0)
+ return size;
+
+ *offset = saved_offset + size;
+
+ return user_event_add_field(user, type, name, saved_offset, size,
+ type[0] != 'u', FILTER_OTHER);
+}
+
+static void user_event_parse_flags(struct user_event *user, char *flags)
+{
+ char *flag;
+
+ if (flags == NULL)
+ return;
+
+ while ((flag = strsep(&flags, ",")) != NULL) {
+ if (strcmp(flag, "BPF_ITER") == 0)
+ user->flags |= FLAG_BPF_ITER;
+ }
+}
+
+static int user_event_parse_fields(struct user_event *user, char *args)
+{
+ char *field;
+ u32 offset = sizeof(struct trace_entry);
+ int ret = -EINVAL;
+
+ if (args == NULL)
+ return 0;
+
+ while ((field = strsep(&args, ";")) != NULL) {
+ ret = user_event_parse_field(field, user, &offset);
+
+ if (ret)
+ break;
+ }
+
+ return ret;
+}
+
+static struct trace_event_fields user_event_fields_array[1];
+
+static const char *user_field_format(const char *type)
+{
+ if (strcmp(type, "s64") == 0)
+ return "%lld";
+ if (strcmp(type, "u64") == 0)
+ return "%llu";
+ if (strcmp(type, "s32") == 0)
+ return "%d";
+ if (strcmp(type, "u32") == 0)
+ return "%u";
+ if (strcmp(type, "int") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned int") == 0)
+ return "%u";
+ if (strcmp(type, "s16") == 0)
+ return "%d";
+ if (strcmp(type, "u16") == 0)
+ return "%u";
+ if (strcmp(type, "short") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned short") == 0)
+ return "%u";
+ if (strcmp(type, "s8") == 0)
+ return "%d";
+ if (strcmp(type, "u8") == 0)
+ return "%u";
+ if (strcmp(type, "char") == 0)
+ return "%d";
+ if (strcmp(type, "unsigned char") == 0)
+ return "%u";
+ if (strstr(type, "char[") != 0)
+ return "%s";
+
+ /* Unknown, likely struct, allowed treat as 64-bit */
+ return "%llu";
+}
+
+static bool user_field_is_dyn_string(const char *type, const char **str_func)
+{
+ if (str_has_prefix(type, "__data_loc ")) {
+ *str_func = "__get_str";
+ goto check;
+ }
+
+ if (str_has_prefix(type, "__rel_loc ")) {
+ *str_func = "__get_rel_str";
+ goto check;
+ }
+
+ return false;
+check:
+ return strstr(type, "char") != 0;
+}
+
+#define LEN_OR_ZERO (len ? len - pos : 0)
+static int user_event_set_print_fmt(struct user_event *user, char *buf, int len)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int pos = 0, depth = 0;
+ const char *str_func;
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth != 0)
+ pos += snprintf(buf + pos, LEN_OR_ZERO, " ");
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "%s=%s",
+ field->name, user_field_format(field->type));
+
+ depth++;
+ }
+
+ pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (user_field_is_dyn_string(field->type, &str_func))
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", %s(%s)", str_func, field->name);
+ else
+ pos += snprintf(buf + pos, LEN_OR_ZERO,
+ ", REC->%s", field->name);
+ }
+
+ return pos + 1;
+}
+#undef LEN_OR_ZERO
+
+static int user_event_create_print_fmt(struct user_event *user)
+{
+ char *print_fmt;
+ int len;
+
+ len = user_event_set_print_fmt(user, NULL, 0);
+
+ print_fmt = kmalloc(len, GFP_KERNEL);
+
+ if (!print_fmt)
+ return -ENOMEM;
+
+ user_event_set_print_fmt(user, print_fmt, len);
+
+ user->call.print_fmt = print_fmt;
+
+ return 0;
+}
+
+static enum print_line_t user_event_print_trace(struct trace_iterator *iter,
+ int flags,
+ struct trace_event *event)
+{
+ /* Unsafe to try to decode user provided print_fmt, use hex */
+ trace_print_hex_dump_seq(&iter->seq, "", DUMP_PREFIX_OFFSET, 16,
+ 1, iter->ent, iter->ent_size, true);
+
+ return trace_handle_return(&iter->seq);
+}
+
+static struct trace_event_functions user_event_funcs = {
+ .trace = user_event_print_trace,
+};
+
+static int user_event_set_call_visible(struct user_event *user, bool visible)
+{
+ int ret;
+ const struct cred *old_cred;
+ struct cred *cred;
+
+ cred = prepare_creds();
+
+ if (!cred)
+ return -ENOMEM;
+
+ /*
+ * While by default tracefs is locked down, systems can be configured
+ * to allow user_event files to be less locked down. The extreme case
+ * being "other" has read/write access to user_events_data/status.
+ *
+ * When not locked down, processes may not have have permissions to
+ * add/remove calls themselves to tracefs. We need to temporarily
+ * switch to root file permission to allow for this scenario.
+ */
+ cred->fsuid = GLOBAL_ROOT_UID;
+
+ old_cred = override_creds(cred);
+
+ if (visible)
+ ret = trace_add_event_call(&user->call);
+ else
+ ret = trace_remove_event_call(&user->call);
+
+ revert_creds(old_cred);
+ put_cred(cred);
+
+ return ret;
+}
+
+static int destroy_user_event(struct user_event *user)
+{
+ int ret = 0;
+
+ /* Must destroy fields before call removal */
+ user_event_destroy_fields(user);
+
+ ret = user_event_set_call_visible(user, false);
+
+ if (ret)
+ return ret;
+
+ dyn_event_remove(&user->devent);
+
+ register_page_data[user->index] = 0;
+ clear_bit(user->index, page_bitmap);
+ hash_del(&user->node);
+
+ user_event_destroy_validators(user);
+ kfree(user->call.print_fmt);
+ kfree(EVENT_NAME(user));
+ kfree(user);
+
+ return ret;
+}
+
+static struct user_event *find_user_event(char *name, u32 *outkey)
+{
+ struct user_event *user;
+ u32 key = user_event_key(name);
+
+ *outkey = key;
+
+ hash_for_each_possible(register_table, user, node, key)
+ if (!strcmp(EVENT_NAME(user), name)) {
+ atomic_inc(&user->refcnt);
+ return user;
+ }
+
+ return NULL;
+}
+
+static int user_event_validate(struct user_event *user, void *data, int len)
+{
+ struct list_head *head = &user->validators;
+ struct user_event_validator *validator;
+ void *pos, *end = data + len;
+ u32 loc, offset, size;
+
+ list_for_each_entry(validator, head, link) {
+ pos = data + validator->offset;
+
+ /* Already done min_size check, no bounds check here */
+ loc = *(u32 *)pos;
+ offset = loc & 0xffff;
+ size = loc >> 16;
+
+ if (likely(validator->flags & VALIDATOR_REL))
+ pos += offset + sizeof(loc);
+ else
+ pos = data + offset;
+
+ pos += size;
+
+ if (unlikely(pos > end))
+ return -EFAULT;
+
+ if (likely(validator->flags & VALIDATOR_ENSURE_NULL))
+ if (unlikely(*(char *)(pos - 1) != '\0'))
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Writes the user supplied payload out to a trace file.
+ */
+static void user_event_ftrace(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct trace_event_file *file;
+ struct trace_entry *entry;
+ struct trace_event_buffer event_buffer;
+ size_t size = sizeof(*entry) + i->count;
+
+ file = (struct trace_event_file *)tpdata;
+
+ if (!file ||
+ !(file->flags & EVENT_FILE_FL_ENABLED) ||
+ trace_trigger_soft_disabled(file))
+ return;
+
+ /* Allocates and fills trace_entry, + 1 of this is data payload */
+ entry = trace_event_buffer_reserve(&event_buffer, file, size);
+
+ if (unlikely(!entry))
+ return;
+
+ if (unlikely(!copy_nofault(entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, entry, size)))
+ goto discard;
+
+ trace_event_buffer_commit(&event_buffer);
+
+ return;
+discard:
+ *faulted = true;
+ __trace_event_discard_commit(event_buffer.buffer,
+ event_buffer.event);
+}
+
+#ifdef CONFIG_PERF_EVENTS
+static void user_event_bpf(struct user_event *user, struct iov_iter *i)
+{
+ struct user_bpf_context context;
+ struct user_bpf_iter bpf_i;
+ char fast_data[MAX_STACK_BPF_DATA];
+ void *temp = NULL;
+
+ if ((user->flags & FLAG_BPF_ITER) && iter_is_iovec(i)) {
+ /* Raw iterator */
+ context.data_type = USER_BPF_DATA_ITER;
+ context.data_len = i->count;
+ context.iter = &bpf_i;
+
+ bpf_i.iov_offset = i->iov_offset;
+ bpf_i.iov = i->iov;
+ bpf_i.nr_segs = i->nr_segs;
+ } else if (i->nr_segs == 1 && iter_is_iovec(i)) {
+ /* Single buffer from user */
+ context.data_type = USER_BPF_DATA_USER;
+ context.data_len = i->count;
+ context.udata = i->iov->iov_base + i->iov_offset;
+ } else {
+ /* Multi buffer from user */
+ struct iov_iter copy = *i;
+ size_t copy_size = min_t(size_t, i->count, MAX_BPF_COPY_SIZE);
+
+ context.data_type = USER_BPF_DATA_KERNEL;
+ context.kdata = fast_data;
+
+ if (unlikely(copy_size > sizeof(fast_data))) {
+ temp = kmalloc(copy_size, GFP_NOWAIT);
+
+ if (temp)
+ context.kdata = temp;
+ else
+ copy_size = sizeof(fast_data);
+ }
+
+ context.data_len = copy_nofault(context.kdata,
+ copy_size, &copy);
+ }
+
+ trace_call_bpf(&user->call, &context);
+
+ kfree(temp);
+}
+
+/*
+ * Writes the user supplied payload out to perf ring buffer or eBPF program.
+ */
+static void user_event_perf(struct user_event *user, struct iov_iter *i,
+ void *tpdata, bool *faulted)
+{
+ struct hlist_head *perf_head;
+
+ if (bpf_prog_array_valid(&user->call))
+ user_event_bpf(user, i);
+
+ perf_head = this_cpu_ptr(user->call.perf_events);
+
+ if (perf_head && !hlist_empty(perf_head)) {
+ struct trace_entry *perf_entry;
+ struct pt_regs *regs;
+ size_t size = sizeof(*perf_entry) + i->count;
+ int context;
+
+ perf_entry = perf_trace_buf_alloc(ALIGN(size, 8),
+ &regs, &context);
+
+ if (unlikely(!perf_entry))
+ return;
+
+ perf_fetch_caller_regs(regs);
+
+ if (unlikely(!copy_nofault(perf_entry + 1, i->count, i)))
+ goto discard;
+
+ if (!list_empty(&user->validators) &&
+ unlikely(user_event_validate(user, perf_entry, size)))
+ goto discard;
+
+ perf_trace_buf_submit(perf_entry, size, context,
+ user->call.event.type, 1, regs,
+ perf_head, NULL);
+
+ return;
+discard:
+ *faulted = true;
+ perf_swevent_put_recursion_context(context);
+ }
+}
+#endif
+
+/*
+ * Update the register page that is shared between user processes.
+ */
+static void update_reg_page_for(struct user_event *user)
+{
+ struct tracepoint *tp = &user->tracepoint;
+ char status = 0;
+
+ if (atomic_read(&tp->key.enabled) > 0) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ probe_func = probe_func_ptr->func;
+
+ if (probe_func == user_event_ftrace)
+ status |= EVENT_STATUS_FTRACE;
+#ifdef CONFIG_PERF_EVENTS
+ else if (probe_func == user_event_perf)
+ status |= EVENT_STATUS_PERF;
+#endif
+ else
+ status |= EVENT_STATUS_OTHER;
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+ }
+
+ register_page_data[user->index] = status;
+}
+
+/*
+ * Register callback for our events from tracing sub-systems.
+ */
+static int user_event_reg(struct trace_event_call *call,
+ enum trace_reg type,
+ void *data)
+{
+ struct user_event *user = (struct user_event *)call->data;
+ int ret = 0;
+
+ if (!user)
+ return -ENOENT;
+
+ switch (type) {
+ case TRACE_REG_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->probe,
+ data);
+ goto dec;
+
+#ifdef CONFIG_PERF_EVENTS
+ case TRACE_REG_PERF_REGISTER:
+ ret = tracepoint_probe_register(call->tp,
+ call->class->perf_probe,
+ data);
+ if (!ret)
+ goto inc;
+ break;
+
+ case TRACE_REG_PERF_UNREGISTER:
+ tracepoint_probe_unregister(call->tp,
+ call->class->perf_probe,
+ data);
+ goto dec;
+
+ case TRACE_REG_PERF_OPEN:
+ case TRACE_REG_PERF_CLOSE:
+ case TRACE_REG_PERF_ADD:
+ case TRACE_REG_PERF_DEL:
+ break;
+#endif
+ }
+
+ return ret;
+inc:
+ atomic_inc(&user->refcnt);
+ update_reg_page_for(user);
+ return 0;
+dec:
+ update_reg_page_for(user);
+ atomic_dec(&user->refcnt);
+ return 0;
+}
+
+static int user_event_create(const char *raw_command)
+{
+ struct user_event *user;
+ char *name;
+ int ret;
+
+ if (!str_has_prefix(raw_command, USER_EVENTS_PREFIX))
+ return -ECANCELED;
+
+ raw_command += USER_EVENTS_PREFIX_LEN;
+ raw_command = skip_spaces(raw_command);
+
+ name = kstrdup(raw_command, GFP_KERNEL);
+
+ if (!name)
+ return -ENOMEM;
+
+ mutex_lock(&reg_mutex);
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (!ret)
+ atomic_dec(&user->refcnt);
+
+ mutex_unlock(&reg_mutex);
+
+ if (ret)
+ kfree(name);
+
+ return ret;
+}
+
+static int user_event_show(struct seq_file *m, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ struct ftrace_event_field *field, *next;
+ struct list_head *head;
+ int depth = 0;
+
+ seq_printf(m, "%s%s", USER_EVENTS_PREFIX, EVENT_NAME(user));
+
+ head = trace_get_fields(&user->call);
+
+ list_for_each_entry_safe_reverse(field, next, head, link) {
+ if (depth == 0)
+ seq_puts(m, " ");
+ else
+ seq_puts(m, "; ");
+
+ seq_printf(m, "%s %s", field->type, field->name);
+
+ if (str_has_prefix(field->type, "struct "))
+ seq_printf(m, " %d", field->size);
+
+ depth++;
+ }
+
+ seq_puts(m, "\n");
+
+ return 0;
+}
+
+static bool user_event_is_busy(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ return atomic_read(&user->refcnt) != 0;
+}
+
+static int user_event_free(struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+
+ if (atomic_read(&user->refcnt) != 0)
+ return -EBUSY;
+
+ return destroy_user_event(user);
+}
+
+static bool user_field_match(struct ftrace_event_field *field, int argc,
+ const char **argv, int *iout)
+{
+ char *field_name, *arg_name;
+ int len, pos, i = *iout;
+ bool colon = false, match = false;
+
+ if (i >= argc)
+ return false;
+
+ len = MAX_FIELD_ARG_NAME;
+ field_name = kmalloc(len, GFP_KERNEL);
+ arg_name = kmalloc(len, GFP_KERNEL);
+
+ if (!arg_name || !field_name)
+ goto out;
+
+ pos = 0;
+
+ for (; i < argc; ++i) {
+ if (i != *iout)
+ pos += snprintf(arg_name + pos, len - pos, " ");
+
+ pos += snprintf(arg_name + pos, len - pos, argv[i]);
+
+ if (strchr(argv[i], ';')) {
+ ++i;
+ colon = true;
+ break;
+ }
+ }
+
+ pos = 0;
+
+ pos += snprintf(field_name + pos, len - pos, field->type);
+ pos += snprintf(field_name + pos, len - pos, " ");
+ pos += snprintf(field_name + pos, len - pos, field->name);
+
+ if (colon)
+ pos += snprintf(field_name + pos, len - pos, ";");
+
+ *iout = i;
+
+ match = strcmp(arg_name, field_name) == 0;
+out:
+ kfree(arg_name);
+ kfree(field_name);
+
+ return match;
+}
+
+static bool user_fields_match(struct user_event *user, int argc,
+ const char **argv)
+{
+ struct ftrace_event_field *field, *next;
+ struct list_head *head = &user->fields;
+ int i = 0;
+
+ list_for_each_entry_safe_reverse(field, next, head, link)
+ if (!user_field_match(field, argc, argv, &i))
+ return false;
+
+ if (i != argc)
+ return false;
+
+ return true;
+}
+
+static bool user_event_match(const char *system, const char *event,
+ int argc, const char **argv, struct dyn_event *ev)
+{
+ struct user_event *user = container_of(ev, struct user_event, devent);
+ bool match;
+
+ match = strcmp(EVENT_NAME(user), event) == 0 &&
+ (!system || strcmp(system, USER_EVENTS_SYSTEM) == 0);
+
+ if (match && argc > 0)
+ match = user_fields_match(user, argc, argv);
+
+ return match;
+}
+
+static struct dyn_event_operations user_event_dops = {
+ .create = user_event_create,
+ .show = user_event_show,
+ .is_busy = user_event_is_busy,
+ .free = user_event_free,
+ .match = user_event_match,
+};
+
+static int user_event_trace_register(struct user_event *user)
+{
+ int ret;
+
+ ret = register_trace_event(&user->call.event);
+
+ if (!ret)
+ return -ENODEV;
+
+ ret = user_event_set_call_visible(user, true);
+
+ if (ret)
+ unregister_trace_event(&user->call.event);
+
+ return ret;
+}
+
+/*
+ * Parses the event name, arguments and flags then registers if successful.
+ * The name buffer lifetime is owned by this method for success cases only.
+ * Upon success the returned user_event has its ref count increased by 1.
+ */
+static int user_event_parse(char *name, char *args, char *flags,
+ struct user_event **newuser)
+{
+ int ret;
+ int index;
+ u32 key;
+ struct user_event *user;
+
+ /* Prevent dyn_event from racing */
+ mutex_lock(&event_mutex);
+ user = find_user_event(name, &key);
+ mutex_unlock(&event_mutex);
+
+ if (user) {
+ *newuser = user;
+ /*
+ * Name is allocated by caller, free it since it already exists.
+ * Caller only worries about failure cases for freeing.
+ */
+ kfree(name);
+ return 0;
+ }
+
+ index = find_first_zero_bit(page_bitmap, MAX_EVENTS);
+
+ if (index == MAX_EVENTS)
+ return -EMFILE;
+
+ user = kzalloc(sizeof(*user), GFP_KERNEL);
+
+ if (!user)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&user->class.fields);
+ INIT_LIST_HEAD(&user->fields);
+ INIT_LIST_HEAD(&user->validators);
+
+ user->tracepoint.name = name;
+
+ user_event_parse_flags(user, flags);
+
+ ret = user_event_parse_fields(user, args);
+
+ if (ret)
+ goto put_user;
+
+ ret = user_event_create_print_fmt(user);
+
+ if (ret)
+ goto put_user;
+
+ user->call.data = user;
+ user->call.class = &user->class;
+ user->call.name = name;
+ user->call.flags = TRACE_EVENT_FL_TRACEPOINT;
+ user->call.tp = &user->tracepoint;
+ user->call.event.funcs = &user_event_funcs;
+
+ user->class.system = USER_EVENTS_SYSTEM;
+ user->class.fields_array = user_event_fields_array;
+ user->class.get_fields = user_event_get_fields;
+ user->class.reg = user_event_reg;
+ user->class.probe = user_event_ftrace;
+#ifdef CONFIG_PERF_EVENTS
+ user->class.perf_probe = user_event_perf;
+#endif
+
+ mutex_lock(&event_mutex);
+ ret = user_event_trace_register(user);
+ mutex_unlock(&event_mutex);
+
+ if (ret)
+ goto put_user;
+
+ user->index = index;
+
+ /* Ensure we track ref */
+ atomic_inc(&user->refcnt);
+
+ dyn_event_init(&user->devent, &user_event_dops);
+ dyn_event_add(&user->devent, &user->call);
+ set_bit(user->index, page_bitmap);
+ hash_add(register_table, &user->node, key);
+
+ *newuser = user;
+ return 0;
+put_user:
+ user_event_destroy_fields(user);
+ user_event_destroy_validators(user);
+ kfree(user);
+ return ret;
+}
+
+/*
+ * Deletes a previously created event if it is no longer being used.
+ */
+static int delete_user_event(char *name)
+{
+ u32 key;
+ int ret;
+ struct user_event *user = find_user_event(name, &key);
+
+ if (!user)
+ return -ENOENT;
+
+ /* Ensure we are the last ref */
+ if (atomic_read(&user->refcnt) != 1) {
+ ret = -EBUSY;
+ goto put_ref;
+ }
+
+ ret = destroy_user_event(user);
+
+ if (ret)
+ goto put_ref;
+
+ return ret;
+put_ref:
+ /* No longer have this ref */
+ atomic_dec(&user->refcnt);
+
+ return ret;
+}
+
+/*
+ * Validates the user payload and writes via iterator.
+ */
+static ssize_t user_events_write_core(struct file *file, struct iov_iter *i)
+{
+ struct user_event_refs *refs;
+ struct user_event *user = NULL;
+ struct tracepoint *tp;
+ ssize_t ret = i->count;
+ int idx;
+
+ if (unlikely(copy_from_iter(&idx, sizeof(idx), i) != sizeof(idx)))
+ return -EFAULT;
+
+ rcu_read_lock_sched();
+
+ refs = rcu_dereference_sched(file->private_data);
+
+ /*
+ * The refs->events array is protected by RCU, and new items may be
+ * added. But the user retrieved from indexing into the events array
+ * shall be immutable while the file is opened.
+ */
+ if (likely(refs && idx < refs->count))
+ user = refs->events[idx];
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(user == NULL))
+ return -ENOENT;
+
+ if (unlikely(i->count < user->min_size))
+ return -EINVAL;
+
+ tp = &user->tracepoint;
+
+ /*
+ * It's possible key.enabled disables after this check, however
+ * we don't mind if a few events are included in this condition.
+ */
+ if (likely(atomic_read(&tp->key.enabled) > 0)) {
+ struct tracepoint_func *probe_func_ptr;
+ user_event_func_t probe_func;
+ struct iov_iter copy;
+ void *tpdata;
+ bool faulted;
+
+ if (unlikely(fault_in_iov_iter_readable(i, i->count)))
+ return -EFAULT;
+
+ faulted = false;
+
+ rcu_read_lock_sched();
+
+ probe_func_ptr = rcu_dereference_sched(tp->funcs);
+
+ if (probe_func_ptr) {
+ do {
+ copy = *i;
+ probe_func = probe_func_ptr->func;
+ tpdata = probe_func_ptr->data;
+ probe_func(user, &copy, tpdata, &faulted);
+ } while ((++probe_func_ptr)->func);
+ }
+
+ rcu_read_unlock_sched();
+
+ if (unlikely(faulted))
+ return -EFAULT;
+ }
+
+ return ret;
+}
+
+static ssize_t user_events_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
+{
+ struct iovec iov;
+ struct iov_iter i;
+
+ if (unlikely(*ppos != 0))
+ return -EFAULT;
+
+ if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i)))
+ return -EFAULT;
+
+ return user_events_write_core(file, &i);
+}
+
+static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i)
+{
+ return user_events_write_core(kp->ki_filp, i);
+}
+
+static int user_events_ref_add(struct file *file, struct user_event *user)
+{
+ struct user_event_refs *refs, *new_refs;
+ int i, size, count = 0;
+
+ refs = rcu_dereference_protected(file->private_data,
+ lockdep_is_held(&reg_mutex));
+
+ if (refs) {
+ count = refs->count;
+
+ for (i = 0; i < count; ++i)
+ if (refs->events[i] == user)
+ return i;
+ }
+
+ size = struct_size(refs, events, count + 1);
+
+ new_refs = kzalloc(size, GFP_KERNEL);
+
+ if (!new_refs)
+ return -ENOMEM;
+
+ new_refs->count = count + 1;
+
+ for (i = 0; i < count; ++i)
+ new_refs->events[i] = refs->events[i];
+
+ new_refs->events[i] = user;
+
+ atomic_inc(&user->refcnt);
+
+ rcu_assign_pointer(file->private_data, new_refs);
+
+ if (refs)
+ kfree_rcu(refs, rcu);
+
+ return i;
+}
+
+static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg)
+{
+ u32 size;
+ long ret;
+
+ ret = get_user(size, &ureg->size);
+
+ if (ret)
+ return ret;
+
+ if (size > PAGE_SIZE)
+ return -E2BIG;
+
+ return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
+}
+
+/*
+ * Registers a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_reg(struct file *file, unsigned long uarg)
+{
+ struct user_reg __user *ureg = (struct user_reg __user *)uarg;
+ struct user_reg reg;
+ struct user_event *user;
+ char *name;
+ long ret;
+
+ ret = user_reg_get(ureg, &reg);
+
+ if (ret)
+ return ret;
+
+ name = strndup_user((const char __user *)(uintptr_t)reg.name_args,
+ MAX_EVENT_DESC);
+
+ if (IS_ERR(name)) {
+ ret = PTR_ERR(name);
+ return ret;
+ }
+
+ ret = user_event_parse_cmd(name, &user);
+
+ if (ret) {
+ kfree(name);
+ return ret;
+ }
+
+ ret = user_events_ref_add(file, user);
+
+ /* No longer need parse ref, ref_add either worked or not */
+ atomic_dec(&user->refcnt);
+
+ /* Positive number is index and valid */
+ if (ret < 0)
+ return ret;
+
+ put_user((u32)ret, &ureg->write_index);
+ put_user(user->index, &ureg->status_index);
+
+ return 0;
+}
+
+/*
+ * Deletes a user_event on behalf of a user process.
+ */
+static long user_events_ioctl_del(struct file *file, unsigned long uarg)
+{
+ void __user *ubuf = (void __user *)uarg;
+ char *name;
+ long ret;
+
+ name = strndup_user(ubuf, MAX_EVENT_DESC);
+
+ if (IS_ERR(name))
+ return PTR_ERR(name);
+
+ /* event_mutex prevents dyn_event from racing */
+ mutex_lock(&event_mutex);
+ ret = delete_user_event(name);
+ mutex_unlock(&event_mutex);
+
+ kfree(name);
+
+ return ret;
+}
+
+/*
+ * Handles the ioctl from user mode to register or alter operations.
+ */
+static long user_events_ioctl(struct file *file, unsigned int cmd,
+ unsigned long uarg)
+{
+ long ret = -ENOTTY;
+
+ switch (cmd) {
+ case DIAG_IOCSREG:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_reg(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+
+ case DIAG_IOCSDEL:
+ mutex_lock(&reg_mutex);
+ ret = user_events_ioctl_del(file, uarg);
+ mutex_unlock(&reg_mutex);
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * Handles the final close of the file from user mode.
+ */
+static int user_events_release(struct inode *node, struct file *file)
+{
+ struct user_event_refs *refs;
+ struct user_event *user;
+ int i;
+
+ /*
+ * Ensure refs cannot change under any situation by taking the
+ * register mutex during the final freeing of the references.
+ */
+ mutex_lock(&reg_mutex);
+
+ refs = file->private_data;
+
+ if (!refs)
+ goto out;
+
+ /*
+ * The lifetime of refs has reached an end, it's tied to this file.
+ * The underlying user_events are ref counted, and cannot be freed.
+ * After this decrement, the user_events may be freed elsewhere.
+ */
+ for (i = 0; i < refs->count; ++i) {
+ user = refs->events[i];
+
+ if (user)
+ atomic_dec(&user->refcnt);
+ }
+out:
+ file->private_data = NULL;
+
+ mutex_unlock(&reg_mutex);
+
+ kfree(refs);
+
+ return 0;
+}
+
+static const struct file_operations user_data_fops = {
+ .write = user_events_write,
+ .write_iter = user_events_write_iter,
+ .unlocked_ioctl = user_events_ioctl,
+ .release = user_events_release,
+};
+
+/*
+ * Maps the shared page into the user process for checking if event is enabled.
+ */
+static int user_status_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long size = vma->vm_end - vma->vm_start;
+
+ if (size != MAX_EVENTS)
+ return -EINVAL;
+
+ return remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(register_page_data) >> PAGE_SHIFT,
+ size, vm_get_page_prot(VM_READ));
+}
+
+static void *user_seq_start(struct seq_file *m, loff_t *pos)
+{
+ if (*pos)
+ return NULL;
+
+ return (void *)1;
+}
+
+static void *user_seq_next(struct seq_file *m, void *p, loff_t *pos)
+{
+ ++*pos;
+ return NULL;
+}
+
+static void user_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static int user_seq_show(struct seq_file *m, void *p)
+{
+ struct user_event *user;
+ char status;
+ int i, active = 0, busy = 0, flags;
+
+ mutex_lock(&reg_mutex);
+
+ hash_for_each(register_table, i, user, node) {
+ status = register_page_data[user->index];
+ flags = user->flags;
+
+ seq_printf(m, "%d:%s", user->index, EVENT_NAME(user));
+
+ if (flags != 0 || status != 0)
+ seq_puts(m, " #");
+
+ if (status != 0) {
+ seq_puts(m, " Used by");
+ if (status & EVENT_STATUS_FTRACE)
+ seq_puts(m, " ftrace");
+ if (status & EVENT_STATUS_PERF)
+ seq_puts(m, " perf");
+ if (status & EVENT_STATUS_OTHER)
+ seq_puts(m, " other");
+ busy++;
+ }
+
+ if (flags & FLAG_BPF_ITER)
+ seq_puts(m, " FLAG:BPF_ITER");
+
+ seq_puts(m, "\n");
+ active++;
+ }
+
+ mutex_unlock(&reg_mutex);
+
+ seq_puts(m, "\n");
+ seq_printf(m, "Active: %d\n", active);
+ seq_printf(m, "Busy: %d\n", busy);
+ seq_printf(m, "Max: %ld\n", MAX_EVENTS);
+
+ return 0;
+}
+
+static const struct seq_operations user_seq_ops = {
+ .start = user_seq_start,
+ .next = user_seq_next,
+ .stop = user_seq_stop,
+ .show = user_seq_show,
+};
+
+static int user_status_open(struct inode *node, struct file *file)
+{
+ return seq_open(file, &user_seq_ops);
+}
+
+static const struct file_operations user_status_fops = {
+ .open = user_status_open,
+ .mmap = user_status_mmap,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Creates a set of tracefs files to allow user mode interactions.
+ */
+static int create_user_tracefs(void)
+{
+ struct dentry *edata, *emmap;
+
+ edata = tracefs_create_file("user_events_data", TRACE_MODE_WRITE,
+ NULL, NULL, &user_data_fops);
+
+ if (!edata) {
+ pr_warn("Could not create tracefs 'user_events_data' entry\n");
+ goto err;
+ }
+
+ /* mmap with MAP_SHARED requires writable fd */
+ emmap = tracefs_create_file("user_events_status", TRACE_MODE_WRITE,
+ NULL, NULL, &user_status_fops);
+
+ if (!emmap) {
+ tracefs_remove(edata);
+ pr_warn("Could not create tracefs 'user_events_mmap' entry\n");
+ goto err;
+ }
+
+ return 0;
+err:
+ return -ENODEV;
+}
+
+static void set_page_reservations(bool set)
+{
+ int page;
+
+ for (page = 0; page < MAX_PAGES; ++page) {
+ void *addr = register_page_data + (PAGE_SIZE * page);
+
+ if (set)
+ SetPageReserved(virt_to_page(addr));
+ else
+ ClearPageReserved(virt_to_page(addr));
+ }
+}
+
+static int __init trace_events_user_init(void)
+{
+ struct page *pages;
+ int ret;
+
+ /* Zero all bits beside 0 (which is reserved for failures) */
+ bitmap_zero(page_bitmap, MAX_EVENTS);
+ set_bit(0, page_bitmap);
+
+ pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER);
+ if (!pages)
+ return -ENOMEM;
+ register_page_data = page_address(pages);
+
+ set_page_reservations(true);
+
+ ret = create_user_tracefs();
+
+ if (ret) {
+ pr_warn("user_events could not register with tracefs\n");
+ set_page_reservations(false);
+ __free_pages(pages, MAX_PAGE_ORDER);
+ return ret;
+ }
+
+ if (dyn_event_register(&user_event_dops))
+ pr_warn("user_events could not register with dyn_events\n");
+
+ return 0;
+}
+
+fs_initcall(trace_events_user_init);
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index 5e3c62a08fc0..e9ae1f33a7f0 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1167,7 +1167,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
* used to record the beginning and to report the end of a thread noise window.
*/
static void
-trace_sched_switch_callback(void *data, bool preempt, struct task_struct *p,
+trace_sched_switch_callback(void *data, bool preempt,
+ unsigned int prev_state,
+ struct task_struct *p,
struct task_struct *n)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index e304196d7c28..45796d8bd4b2 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -22,6 +22,7 @@ static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
int flags;
@@ -44,7 +45,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee)
if (!flags)
return;
- tracing_record_taskinfo(current, flags);
+ tracing_record_taskinfo_sched_switch(current, wakee, flags);
}
static int tracing_sched_register(void)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 2402de520eca..46429f9a96fa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -426,6 +426,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
static void notrace
probe_wakeup_sched_switch(void *ignore, bool preempt,
+ unsigned int prev_state,
struct task_struct *prev, struct task_struct *next)
{
struct trace_array_cpu *data;