From 29e8077ae2beea6a85ad2d0bae9c550bd5d05ed9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:16 +0900 Subject: kprobes: Use bool type for functions which returns boolean value Use the 'bool' type instead of 'int' for the functions which returns a boolean value, because this makes clear that those functions don't return any error code. Link: https://lkml.kernel.org/r/163163041649.489837.17311187321419747536.stgit@devnote2 Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_kprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3a64ba4bbad6..0e1e7ce5f7ed 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -97,7 +97,7 @@ static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk) { - return !!(kprobe_gone(&tk->rp.kp)); + return kprobe_gone(&tk->rp.kp); } static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk, -- cgit v1.2.3 From adf8a61a940c49fea6fab9c3865f2b69b8ceef28 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:40:54 +0900 Subject: kprobes: treewide: Make it harder to refer kretprobe_trampoline directly Since now there is kretprobe_trampoline_addr() for referring the address of kretprobe trampoline code, we don't need to access kretprobe_trampoline directly. Make it harder to refer by renaming it to __kretprobe_trampoline(). 
Link: https://lkml.kernel.org/r/163163045446.489837.14510577516938803097.stgit@devnote2 Suggested-by: Ingo Molnar Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/arc/include/asm/kprobes.h | 2 +- arch/arc/kernel/kprobes.c | 11 ++++++----- arch/arm/probes/kprobes/core.c | 6 +++--- arch/arm64/include/asm/kprobes.h | 2 +- arch/arm64/kernel/probes/kprobes.c | 2 +- arch/arm64/kernel/probes/kprobes_trampoline.S | 4 ++-- arch/csky/include/asm/kprobes.h | 2 +- arch/csky/kernel/probes/kprobes.c | 2 +- arch/csky/kernel/probes/kprobes_trampoline.S | 4 ++-- arch/ia64/kernel/kprobes.c | 8 ++++---- arch/mips/kernel/kprobes.c | 12 ++++++------ arch/parisc/kernel/kprobes.c | 4 ++-- arch/powerpc/include/asm/kprobes.h | 2 +- arch/powerpc/kernel/kprobes.c | 16 ++++++++-------- arch/powerpc/kernel/optprobes.c | 2 +- arch/powerpc/kernel/stacktrace.c | 2 +- arch/riscv/include/asm/kprobes.h | 2 +- arch/riscv/kernel/probes/kprobes.c | 2 +- arch/riscv/kernel/probes/kprobes_trampoline.S | 4 ++-- arch/s390/include/asm/kprobes.h | 2 +- arch/s390/kernel/kprobes.c | 10 +++++----- arch/s390/kernel/stacktrace.c | 2 +- arch/sh/include/asm/kprobes.h | 2 +- arch/sh/kernel/kprobes.c | 10 +++++----- arch/sparc/include/asm/kprobes.h | 2 +- arch/sparc/kernel/kprobes.c | 10 +++++----- arch/x86/kernel/kprobes/core.c | 18 +++++++++--------- include/linux/kprobes.h | 4 ++-- kernel/trace/trace_output.c | 2 +- 29 files changed, 76 insertions(+), 75 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arc/include/asm/kprobes.h b/arch/arc/include/asm/kprobes.h index 2134721dce44..de1566e32cb8 100644 --- a/arch/arc/include/asm/kprobes.h +++ b/arch/arc/include/asm/kprobes.h @@ -46,7 +46,7 @@ struct kprobe_ctlblk { }; int kprobe_fault_handler(struct pt_regs *regs, unsigned long cause); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void trap_is_kprobe(unsigned long address, struct pt_regs *regs); #else #define trap_is_kprobe(address, regs) diff --git 
a/arch/arc/kernel/kprobes.c b/arch/arc/kernel/kprobes.c index 3cee75c87f97..e71d64119d71 100644 --- a/arch/arc/kernel/kprobes.c +++ b/arch/arc/kernel/kprobes.c @@ -363,8 +363,9 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, static void __used kretprobe_trampoline_holder(void) { - __asm__ __volatile__(".global kretprobe_trampoline\n" - "kretprobe_trampoline:\n" "nop\n"); + __asm__ __volatile__(".global __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n" + "nop\n"); } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, @@ -375,7 +376,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->blink = (unsigned long)&kretprobe_trampoline; + regs->blink = (unsigned long)&__kretprobe_trampoline; } static int __kprobes trampoline_probe_handler(struct kprobe *p, @@ -390,7 +391,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -402,7 +403,7 @@ int __init arch_init_kprobes(void) int __kprobes arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *) &kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *) &__kretprobe_trampoline) return 1; return 0; diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index 08098ed6f035..67ce7eb8f285 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -373,7 +373,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, * for kretprobe handlers which should normally be interested in r0 only * anyway. 
*/ -void __naked __kprobes kretprobe_trampoline(void) +void __naked __kprobes __kretprobe_trampoline(void) { __asm__ __volatile__ ( "stmdb sp!, {r0 - r11} \n\t" @@ -389,7 +389,7 @@ void __naked __kprobes kretprobe_trampoline(void) : : : "memory"); } -/* Called from kretprobe_trampoline */ +/* Called from __kretprobe_trampoline */ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) { return (void *)kretprobe_trampoline_handler(regs, (void *)regs->ARM_fp); @@ -402,7 +402,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = (void *)regs->ARM_fp; /* Replace the return addr with trampoline addr. */ - regs->ARM_lr = (unsigned long)&kretprobe_trampoline; + regs->ARM_lr = (unsigned long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index 5d38ff4a4806..05cd82eeca13 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -39,7 +39,7 @@ void arch_remove_kprobe(struct kprobe *); int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr); int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f627a12984a8..e7ad6da980e8 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -411,7 +411,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = (void *)kernel_stack_pointer(regs); /* replace return addr (x30) with trampoline */ - regs->regs[30] = (long)&kretprobe_trampoline; + regs->regs[30] = (long)&__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S 
b/arch/arm64/kernel/probes/kprobes_trampoline.S index 288a84e253cc..520ee8711db1 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -61,7 +61,7 @@ ldp x28, x29, [sp, #S_X28] .endm -SYM_CODE_START(kretprobe_trampoline) +SYM_CODE_START(__kretprobe_trampoline) sub sp, sp, #PT_REGS_SIZE save_all_base_regs @@ -79,4 +79,4 @@ SYM_CODE_START(kretprobe_trampoline) add sp, sp, #PT_REGS_SIZE ret -SYM_CODE_END(kretprobe_trampoline) +SYM_CODE_END(__kretprobe_trampoline) diff --git a/arch/csky/include/asm/kprobes.h b/arch/csky/include/asm/kprobes.h index b647bbde4d6d..55267cbf5204 100644 --- a/arch/csky/include/asm/kprobes.h +++ b/arch/csky/include/asm/kprobes.h @@ -41,7 +41,7 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); int kprobe_breakpoint_handler(struct pt_regs *regs); int kprobe_single_step_handler(struct pt_regs *regs); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ diff --git a/arch/csky/kernel/probes/kprobes.c b/arch/csky/kernel/probes/kprobes.c index 784c5aba7f66..42920f25e73c 100644 --- a/arch/csky/kernel/probes/kprobes.c +++ b/arch/csky/kernel/probes/kprobes.c @@ -394,7 +394,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, { ri->ret_addr = (kprobe_opcode_t *)regs->lr; ri->fp = NULL; - regs->lr = (unsigned long) &kretprobe_trampoline; + regs->lr = (unsigned long) &__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/csky/kernel/probes/kprobes_trampoline.S b/arch/csky/kernel/probes/kprobes_trampoline.S index b1fe3af24f03..ba48ad04a847 100644 --- a/arch/csky/kernel/probes/kprobes_trampoline.S +++ b/arch/csky/kernel/probes/kprobes_trampoline.S @@ -4,7 +4,7 @@ #include -ENTRY(kretprobe_trampoline) +ENTRY(__kretprobe_trampoline) SAVE_REGS_FTRACE mov a0, sp /* 
pt_regs */ @@ -16,4 +16,4 @@ ENTRY(kretprobe_trampoline) RESTORE_REGS_FTRACE rts -ENDPROC(kretprobe_trampoline) +ENDPROC(__kretprobe_trampoline) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 44c84c20b626..1a7bab1c5d7c 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -392,7 +392,7 @@ static void __kprobes set_current_kprobe(struct kprobe *p, __this_cpu_write(current_kprobe, p); } -void kretprobe_trampoline(void) +void __kretprobe_trampoline(void) { } @@ -414,7 +414,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->b0 = (unsigned long)dereference_function_descriptor(kretprobe_trampoline); + regs->b0 = (unsigned long)dereference_function_descriptor(__kretprobe_trampoline); } /* Check the instruction in the slot is break */ @@ -897,14 +897,14 @@ static struct kprobe trampoline_p = { int __init arch_init_kprobes(void) { trampoline_p.addr = - dereference_function_descriptor(kretprobe_trampoline); + dereference_function_descriptor(__kretprobe_trampoline); return register_kprobe(&trampoline_p); } int __kprobes arch_trampoline_kprobe(struct kprobe *p) { if (p->addr == - dereference_function_descriptor(kretprobe_trampoline)) + dereference_function_descriptor(__kretprobe_trampoline)) return 1; return 0; diff --git a/arch/mips/kernel/kprobes.c b/arch/mips/kernel/kprobes.c index b33bd2498651..6c7f3b143fdc 100644 --- a/arch/mips/kernel/kprobes.c +++ b/arch/mips/kernel/kprobes.c @@ -460,14 +460,14 @@ static void __used kretprobe_trampoline_holder(void) /* Keep the assembler from reordering and placing JR here. 
*/ ".set noreorder\n\t" "nop\n\t" - ".global kretprobe_trampoline\n" - "kretprobe_trampoline:\n\t" + ".global __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n\t" "nop\n\t" ".set pop" : : : "memory"); } -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) @@ -476,7 +476,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->regs[31] = (unsigned long)kretprobe_trampoline; + regs->regs[31] = (unsigned long)__kretprobe_trampoline; } /* @@ -496,14 +496,14 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, int __kprobes arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)__kretprobe_trampoline) return 1; return 0; } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *)kretprobe_trampoline, + .addr = (kprobe_opcode_t *)__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 4a35ac6e2ca2..e2bdb5a5f93e 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -175,7 +175,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) return 1; } -void kretprobe_trampoline(void) +void __kretprobe_trampoline(void) { asm volatile("nop"); asm volatile("nop"); @@ -217,6 +217,6 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { trampoline_p.addr = (kprobe_opcode_t *) - dereference_function_descriptor(kretprobe_trampoline); + dereference_function_descriptor(__kretprobe_trampoline); return register_kprobe(&trampoline_p); } diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 4fc0e15e23a5..bab364152b29 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ 
b/arch/powerpc/include/asm/kprobes.h @@ -51,7 +51,7 @@ extern kprobe_opcode_t optprobe_template_end[]; #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); extern void arch_remove_kprobe(struct kprobe *p); /* Architecture specific copy of original instruction */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 43c77142a262..86d77ff056a6 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -237,7 +237,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->link = (unsigned long)kretprobe_trampoline; + regs->link = (unsigned long)__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -403,12 +403,12 @@ NOKPROBE_SYMBOL(kprobe_handler); * - When the probed function returns, this probe * causes the handlers to fire */ -asm(".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" +asm(".global __kretprobe_trampoline\n" + ".type __kretprobe_trampoline, @function\n" + "__kretprobe_trampoline:\n" "nop\n" "blr\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n"); + ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n"); /* * Called when the probe at kretprobe trampoline is hit @@ -427,7 +427,7 @@ static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * as it is used to determine the return address from the trap. * For (2), since nip is not honoured with optprobes, we instead setup * the link register properly so that the subsequent 'blr' in - * kretprobe_trampoline jumps back to the right instruction. + * __kretprobe_trampoline jumps back to the right instruction. 
* * For nip, we should set the address to the previous instruction since * we end up emulating it in kprobe_handler(), which increments the nip @@ -543,7 +543,7 @@ int kprobe_fault_handler(struct pt_regs *regs, int trapnr) NOKPROBE_SYMBOL(kprobe_fault_handler); static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -554,7 +554,7 @@ int __init arch_init_kprobes(void) int arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return 1; return 0; diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 325ba544883c..ce1903064031 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -56,7 +56,7 @@ static unsigned long can_optimize(struct kprobe *p) * has a 'nop' instruction, which can be emulated. * So further checks can be skipped. */ - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return addr + sizeof(kprobe_opcode_t); /* diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 9e4a4a7af380..a2443d61728e 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -155,7 +155,7 @@ int __no_sanitize_address arch_stack_walk_reliable(stack_trace_consume_fn consum * Mark stacktraces with kretprobed functions on them * as unreliable. 
*/ - if (ip == (unsigned long)kretprobe_trampoline) + if (ip == (unsigned long)__kretprobe_trampoline) return -EINVAL; #endif diff --git a/arch/riscv/include/asm/kprobes.h b/arch/riscv/include/asm/kprobes.h index 9ea9b5ec3113..217ef89f22b9 100644 --- a/arch/riscv/include/asm/kprobes.h +++ b/arch/riscv/include/asm/kprobes.h @@ -40,7 +40,7 @@ void arch_remove_kprobe(struct kprobe *p); int kprobe_fault_handler(struct pt_regs *regs, unsigned int trapnr); bool kprobe_breakpoint_handler(struct pt_regs *regs); bool kprobe_single_step_handler(struct pt_regs *regs); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); #endif /* CONFIG_KPROBES */ diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 62d477cf11da..e6e950b7cf32 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -355,7 +355,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, { ri->ret_addr = (kprobe_opcode_t *)regs->ra; ri->fp = NULL; - regs->ra = (unsigned long) &kretprobe_trampoline; + regs->ra = (unsigned long) &__kretprobe_trampoline; } int __kprobes arch_trampoline_kprobe(struct kprobe *p) diff --git a/arch/riscv/kernel/probes/kprobes_trampoline.S b/arch/riscv/kernel/probes/kprobes_trampoline.S index 6e85d021e2a2..7bdb09ded39b 100644 --- a/arch/riscv/kernel/probes/kprobes_trampoline.S +++ b/arch/riscv/kernel/probes/kprobes_trampoline.S @@ -75,7 +75,7 @@ REG_L x31, PT_T6(sp) .endm -ENTRY(kretprobe_trampoline) +ENTRY(__kretprobe_trampoline) addi sp, sp, -(PT_SIZE_ON_STACK) save_all_base_regs @@ -90,4 +90,4 @@ ENTRY(kretprobe_trampoline) addi sp, sp, PT_SIZE_ON_STACK ret -ENDPROC(kretprobe_trampoline) +ENDPROC(__kretprobe_trampoline) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index 09cdb632a490..5eb722c984e4 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ 
-70,7 +70,7 @@ struct kprobe_ctlblk { }; void arch_remove_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); int kprobe_fault_handler(struct pt_regs *regs, int trapnr); int kprobe_exceptions_notify(struct notifier_block *self, diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 5fa86e54f129..c505c0ee5f47 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -242,7 +242,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->gprs[14] = (unsigned long) &kretprobe_trampoline; + regs->gprs[14] = (unsigned long) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ -334,8 +334,8 @@ NOKPROBE_SYMBOL(kprobe_handler); */ static void __used kretprobe_trampoline_holder(void) { - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline: bcr 0,0\n"); + asm volatile(".global __kretprobe_trampoline\n" + "__kretprobe_trampoline: bcr 0,0\n"); } /* @@ -509,7 +509,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, NOKPROBE_SYMBOL(kprobe_exceptions_notify); static struct kprobe trampoline = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -520,6 +520,6 @@ int __init arch_init_kprobes(void) int arch_trampoline_kprobe(struct kprobe *p) { - return p->addr == (kprobe_opcode_t *) &kretprobe_trampoline; + return p->addr == (kprobe_opcode_t *) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_trampoline_kprobe); diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 101477b3e263..b7bb1981e9ee 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -46,7 +46,7 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, * Mark stacktraces with kretprobed functions on them * as unreliable. 
*/ - if (state.ip == (unsigned long)kretprobe_trampoline) + if (state.ip == (unsigned long)__kretprobe_trampoline) return -EINVAL; #endif diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h index 6171682f7798..eeba83e0a7d2 100644 --- a/arch/sh/include/asm/kprobes.h +++ b/arch/sh/include/asm/kprobes.h @@ -26,7 +26,7 @@ typedef insn_size_t kprobe_opcode_t; struct kprobe; void arch_remove_kprobe(struct kprobe *); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 8e76a35e6e33..aed1ea8e2c2f 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -207,7 +207,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, ri->fp = NULL; /* Replace the return addr with trampoline addr */ - regs->pr = (unsigned long)kretprobe_trampoline; + regs->pr = (unsigned long)__kretprobe_trampoline; } static int __kprobes kprobe_handler(struct pt_regs *regs) @@ -293,13 +293,13 @@ no_kprobe: */ static void __used kretprobe_trampoline_holder(void) { - asm volatile (".globl kretprobe_trampoline\n" - "kretprobe_trampoline:\n\t" + asm volatile (".globl __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n\t" "nop\n"); } /* - * Called when we hit the probe point at kretprobe_trampoline + * Called when we hit the probe point at __kretprobe_trampoline */ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { @@ -442,7 +442,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *)&kretprobe_trampoline, + .addr = (kprobe_opcode_t *)&__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; diff --git a/arch/sparc/include/asm/kprobes.h b/arch/sparc/include/asm/kprobes.h index bfcaa6326c20..06c2bc767ef7 100644 --- a/arch/sparc/include/asm/kprobes.h +++ 
b/arch/sparc/include/asm/kprobes.h @@ -24,7 +24,7 @@ do { flushi(&(p)->ainsn.insn[0]); \ flushi(&(p)->ainsn.insn[1]); \ } while (0) -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/sparc/kernel/kprobes.c b/arch/sparc/kernel/kprobes.c index 401534236c2e..535c7b35cb59 100644 --- a/arch/sparc/kernel/kprobes.c +++ b/arch/sparc/kernel/kprobes.c @@ -440,7 +440,7 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, /* Replace the return addr with trampoline addr */ regs->u_regs[UREG_RETPC] = - ((unsigned long)kretprobe_trampoline) - 8; + ((unsigned long)__kretprobe_trampoline) - 8; } /* @@ -465,13 +465,13 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, static void __used kretprobe_trampoline_holder(void) { - asm volatile(".global kretprobe_trampoline\n" - "kretprobe_trampoline:\n" + asm volatile(".global __kretprobe_trampoline\n" + "__kretprobe_trampoline:\n" "\tnop\n" "\tnop\n"); } static struct kprobe trampoline_p = { - .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .addr = (kprobe_opcode_t *) &__kretprobe_trampoline, .pre_handler = trampoline_probe_handler }; @@ -482,7 +482,7 @@ int __init arch_init_kprobes(void) int __kprobes arch_trampoline_kprobe(struct kprobe *p) { - if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + if (p->addr == (kprobe_opcode_t *)&__kretprobe_trampoline) return 1; return 0; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 0c59ef5971de..79cd23dba5b5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -809,7 +809,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) ri->fp = sara; /* Replace the return addr with trampoline addr */ - *sara = (unsigned long) &kretprobe_trampoline; + *sara = (unsigned long) &__kretprobe_trampoline; } NOKPROBE_SYMBOL(arch_prepare_kretprobe); @@ 
-1019,9 +1019,9 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); */ asm( ".text\n" - ".global kretprobe_trampoline\n" - ".type kretprobe_trampoline, @function\n" - "kretprobe_trampoline:\n" + ".global __kretprobe_trampoline\n" + ".type __kretprobe_trampoline, @function\n" + "__kretprobe_trampoline:\n" /* We don't bother saving the ss register */ #ifdef CONFIG_X86_64 " pushq %rsp\n" @@ -1045,14 +1045,14 @@ asm( " popfl\n" #endif " ret\n" - ".size kretprobe_trampoline, .-kretprobe_trampoline\n" + ".size __kretprobe_trampoline, .-__kretprobe_trampoline\n" ); -NOKPROBE_SYMBOL(kretprobe_trampoline); -STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +NOKPROBE_SYMBOL(__kretprobe_trampoline); +STACK_FRAME_NON_STANDARD(__kretprobe_trampoline); /* - * Called from kretprobe_trampoline + * Called from __kretprobe_trampoline */ __used __visible void *trampoline_handler(struct pt_regs *regs) { @@ -1061,7 +1061,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs) #ifdef CONFIG_X86_32 regs->gs = 0; #endif - regs->ip = (unsigned long)&kretprobe_trampoline; + regs->ip = (unsigned long)&__kretprobe_trampoline; regs->orig_ax = ~0UL; return (void *)kretprobe_trampoline_handler(regs, &regs->sp); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 96f5df93e36e..b6b2370f4a4c 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -188,14 +188,14 @@ extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); -void kretprobe_trampoline(void); +void __kretprobe_trampoline(void); /* * Since some architecture uses structured function pointer, * use dereference_function_descriptor() to get real function address. 
*/ static nokprobe_inline void *kretprobe_trampoline_addr(void) { - return dereference_kernel_function_descriptor(kretprobe_trampoline); + return dereference_kernel_function_descriptor(__kretprobe_trampoline); } /* If the trampoline handler called from a kprobe, use this version */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index c2ca40e8595b..5a5949c659d0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -349,7 +349,7 @@ EXPORT_SYMBOL_GPL(trace_output_call); #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { - static const char tramp_name[] = "kretprobe_trampoline"; + static const char tramp_name[] = "__kretprobe_trampoline"; int size = sizeof(tramp_name); if (strncmp(tramp_name, name, size) == 0) -- cgit v1.2.3 From 7da89495d500d6a1e6fe1019587c3b611c7bd217 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Sep 2021 23:42:40 +0900 Subject: tracing: Show kretprobe unknown indicator only for kretprobe_trampoline ftrace shows "[unknown/kretprobe'd]" indicator all addresses in the kretprobe_trampoline, but the modified address by kretprobe should be only kretprobe_trampoline+0. Link: https://lkml.kernel.org/r/163163056044.489837.794883849706638013.stgit@devnote2 Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt (VMware) Tested-by: Andrii Nakryiko Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_output.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5a5949c659d0..3547e7176ff7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -346,22 +347,12 @@ int trace_output_call(struct trace_iterator *iter, char *name, char *fmt, ...) 
} EXPORT_SYMBOL_GPL(trace_output_call); -#ifdef CONFIG_KRETPROBES -static inline const char *kretprobed(const char *name) +static inline const char *kretprobed(const char *name, unsigned long addr) { - static const char tramp_name[] = "__kretprobe_trampoline"; - int size = sizeof(tramp_name); - - if (strncmp(tramp_name, name, size) == 0) + if (is_kretprobe_trampoline(addr)) return "[unknown/kretprobe'd]"; return name; } -#else -static inline const char *kretprobed(const char *name) -{ - return name; -} -#endif /* CONFIG_KRETPROBES */ void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) @@ -374,7 +365,7 @@ trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) sprint_symbol(str, address); else kallsyms_lookup(address, NULL, NULL, NULL, str); - name = kretprobed(str); + name = kretprobed(str, address); if (name && strlen(name)) { trace_seq_puts(s, name); -- cgit v1.2.3 From 6954e415264eeb5ee6be0d22d789ad12c995ee64 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Sep 2021 21:03:49 -0400 Subject: tracing: Place trace_pid_list logic into abstract functions Instead of having the logic that does trace_pid_list open coded, wrap it in abstract functions. This will allow a rewrite of the logic that implements the trace_pid_list without affecting the users. Note, this causes a change in behavior. Every time a pid is written into the set_*_pid file, it creates a new list and uses RCU to update it. If pid_max is lowered, but there was a pid currently in the list that was higher than pid_max, those pids will now be removed on updating the list. The old behavior kept that from happening. The rewrite of the pid_list logic will no longer depend on pid_max, and will return the old behavior. 
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 6 +- kernel/trace/pid_list.c | 160 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/pid_list.h | 13 ++++ kernel/trace/trace.c | 78 ++++++++------------- kernel/trace/trace.h | 14 ++-- kernel/trace/trace_events.c | 6 +- 7 files changed, 217 insertions(+), 61 deletions(-) create mode 100644 kernel/trace/pid_list.c create mode 100644 kernel/trace/pid_list.h (limited to 'kernel/trace') diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 6de5d4d63165..bedc5caceec7 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -47,6 +47,7 @@ obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_seq.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_TRACING) += pid_list.o obj-$(CONFIG_TRACING_MAP) += tracing_map.o obj-$(CONFIG_PREEMPTIRQ_DELAY_TEST) += preemptirq_delay_test.o obj-$(CONFIG_SYNTH_EVENT_GEN_TEST) += synth_event_gen_test.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7efbc8aaf7f6..3eec6792f115 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7184,10 +7184,10 @@ static void clear_ftrace_pids(struct trace_array *tr, int type) synchronize_rcu(); if ((type & TRACE_PIDS) && pid_list) - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) - trace_free_pid_list(no_pid_list); + trace_pid_list_free(no_pid_list); } void ftrace_clear_pids(struct trace_array *tr) @@ -7428,7 +7428,7 @@ pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { synchronize_rcu(); - trace_free_pid_list(filtered_pids); + trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { /* Register a probe to set whether to ignore the tracing of a task */ register_trace_sched_switch(ftrace_filter_pid_sched_switch_probe, tr); diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c new file mode 
100644 index 000000000000..4483ef70b562 --- /dev/null +++ b/kernel/trace/pid_list.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 VMware Inc, Steven Rostedt + */ +#include +#include +#include "trace.h" + +/** + * trace_pid_list_is_set - test if the pid is set in the list + * @pid_list: The pid list to test + * @pid: The pid to see if set in the list. + * + * Tests if @pid is set in the @pid_list. This is usually called + * from the scheduler when a task is scheduled. Its pid is checked + * if it should be traced or not. + * + * Return true if the pid is in the list, false otherwise. + */ +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* + * If pid_max changed after filtered_pids was created, we + * by default ignore all pids greater than the previous pid_max. + */ + if (pid >= pid_list->pid_max) + return false; + + return test_bit(pid, pid_list->pids); +} + +/** + * trace_pid_list_set - add a pid to the list + * @pid_list: The pid list to add the @pid to. + * @pid: The pid to add. + * + * Adds @pid to @pid_list. This is usually done explicitly by a user + * adding a task to be traced, or indirectly by the fork function + * when children should be traced and a task's pid is in the list. + * + * Return 0 on success, negative otherwise. + */ +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* Sorry, but we don't support pid_max changing after setting */ + if (pid >= pid_list->pid_max) + return -EINVAL; + + set_bit(pid, pid_list->pids); + + return 0; +} + +/** + * trace_pid_list_clear - remove a pid from the list + * @pid_list: The pid list to remove the @pid from. + * @pid: The pid to remove. + * + * Removes @pid from @pid_list. This is usually done explicitly by a user + * removing tasks from tracing, or indirectly by the exit function + * when a task that is set to be traced exits. + * + * Return 0 on success, negative otherwise. 
+ */ +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) +{ + /* Sorry, but we don't support pid_max changing after setting */ + if (pid >= pid_list->pid_max) + return -EINVAL; + + clear_bit(pid, pid_list->pids); + + return 0; +} + +/** + * trace_pid_list_next - return the next pid in the list + * @pid_list: The pid list to examine. + * @pid: The pid to start from + * @next: The pointer to place the pid that is set starting from @pid. + * + * Looks for the next consecutive pid that is in @pid_list starting + * at the pid specified by @pid. If one is set (including @pid), then + * that pid is placed into @next. + * + * Return 0 when a pid is found, -1 if there are no more pids included. + */ +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next) +{ + pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + + if (pid < pid_list->pid_max) { + *next = pid; + return 0; + } + return -1; +} + +/** + * trace_pid_list_first - return the first pid in the list + * @pid_list: The pid list to examine. + * @pid: The pointer to place the pid first found pid that is set. + * + * Looks for the first pid that is set in @pid_list, and places it + * into @pid if found. + * + * Return 0 when a pid is found, -1 if there are no pids set. + */ +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) +{ + unsigned int first; + + first = find_first_bit(pid_list->pids, pid_list->pid_max); + + if (first < pid_list->pid_max) { + *pid = first; + return 0; + } + return -1; +} + +/** + * trace_pid_list_alloc - create a new pid_list + * + * Allocates a new pid_list to store pids into. + * + * Returns the pid_list on success, NULL otherwise. 
+ */ +struct trace_pid_list *trace_pid_list_alloc(void) +{ + struct trace_pid_list *pid_list; + + pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + if (!pid_list) + return NULL; + + pid_list->pid_max = READ_ONCE(pid_max); + + pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); + if (!pid_list->pids) { + kfree(pid_list); + return NULL; + } + return pid_list; +} + +/** + * trace_pid_list_free - Frees an allocated pid_list. + * + * Frees the memory for a pid_list that was allocated. + */ +void trace_pid_list_free(struct trace_pid_list *pid_list) +{ + if (!pid_list) + return; + + vfree(pid_list->pids); + kfree(pid_list); +} diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h new file mode 100644 index 000000000000..80d0ecfe1536 --- /dev/null +++ b/kernel/trace/pid_list.h @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* Do not include this file directly. */ + +#ifndef _TRACE_INTERNAL_PID_LIST_H +#define _TRACE_INTERNAL_PID_LIST_H + +struct trace_pid_list { + int pid_max; + unsigned long *pids; +}; + +#endif /* _TRACE_INTERNAL_PID_LIST_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7896d30d90f7..dcced07a45e6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -512,12 +512,6 @@ int call_filter_check_discard(struct trace_event_call *call, void *rec, return 0; } -void trace_free_pid_list(struct trace_pid_list *pid_list) -{ - vfree(pid_list->pids); - kfree(pid_list); -} - /** * trace_find_filtered_pid - check if a pid exists in a filtered_pid list * @filtered_pids: The list of pids to check @@ -528,14 +522,7 @@ void trace_free_pid_list(struct trace_pid_list *pid_list) bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid) { - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the previous pid_max. 
- */ - if (search_pid >= filtered_pids->pid_max) - return false; - - return test_bit(search_pid, filtered_pids->pids); + return trace_pid_list_is_set(filtered_pids, search_pid); } /** @@ -592,15 +579,11 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, return; } - /* Sorry, but we don't support pid_max changing after setting */ - if (task->pid >= pid_list->pid_max) - return; - /* "self" is set for forks, and NULL for exits */ if (self) - set_bit(task->pid, pid_list->pids); + trace_pid_list_set(pid_list, task->pid); else - clear_bit(task->pid, pid_list->pids); + trace_pid_list_clear(pid_list, task->pid); } /** @@ -617,18 +600,19 @@ void trace_filter_add_remove_task(struct trace_pid_list *pid_list, */ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) { - unsigned long pid = (unsigned long)v; + long pid = (unsigned long)v; + unsigned int next; (*pos)++; /* pid already is +1 of the actual previous bit */ - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + if (trace_pid_list_next(pid_list, pid, &next) < 0) + return NULL; - /* Return pid + 1 to allow zero to be represented */ - if (pid < pid_list->pid_max) - return (void *)(pid + 1); + pid = next; - return NULL; + /* Return pid + 1 to allow zero to be represented */ + return (void *)(pid + 1); } /** @@ -645,12 +629,14 @@ void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos) void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos) { unsigned long pid; + unsigned int first; loff_t l = 0; - pid = find_first_bit(pid_list->pids, pid_list->pid_max); - if (pid >= pid_list->pid_max) + if (trace_pid_list_first(pid_list, &first) < 0) return NULL; + pid = first; + /* Return pid + 1 so that zero can be the exit value */ for (pid++; pid && l < *pos; pid = (unsigned long)trace_pid_next(pid_list, (void *)pid, &l)) @@ -686,7 +672,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, unsigned long val; int nr_pids = 0; ssize_t read = 0; - 
ssize_t ret = 0; + ssize_t ret; loff_t pos; pid_t pid; @@ -699,34 +685,23 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * the user. If the operation fails, then the current list is * not modified. */ - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + pid_list = trace_pid_list_alloc(); if (!pid_list) { trace_parser_put(&parser); return -ENOMEM; } - pid_list->pid_max = READ_ONCE(pid_max); - - /* Only truncating will shrink pid_max */ - if (filtered_pids && filtered_pids->pid_max > pid_list->pid_max) - pid_list->pid_max = filtered_pids->pid_max; - - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - trace_parser_put(&parser); - kfree(pid_list); - return -ENOMEM; - } - if (filtered_pids) { /* copy the current bits to the new max */ - for_each_set_bit(pid, filtered_pids->pids, - filtered_pids->pid_max) { - set_bit(pid, pid_list->pids); + ret = trace_pid_list_first(filtered_pids, &pid); + while (!ret) { + trace_pid_list_set(pid_list, pid); + ret = trace_pid_list_next(filtered_pids, pid + 1, &pid); nr_pids++; } } + ret = 0; while (cnt > 0) { pos = 0; @@ -742,12 +717,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; - if (val >= pid_list->pid_max) - break; pid = (pid_t)val; - set_bit(pid, pid_list->pids); + if (trace_pid_list_set(pid_list, pid) < 0) { + ret = -1; + break; + } nr_pids++; trace_parser_clear(&parser); @@ -756,13 +732,13 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, trace_parser_put(&parser); if (ret < 0) { - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); return ret; } if (!nr_pids) { /* Cleared the list of pids */ - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); read = ret; pid_list = NULL; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b7c0f8e160fb..2ec500186992 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -22,6 +22,8 @@ #include #include +#include 
"pid_list.h" + #ifdef CONFIG_FTRACE_SYSCALLS #include /* For NR_SYSCALLS */ #include /* some archs define it here */ @@ -188,10 +190,14 @@ struct trace_options { struct trace_option_dentry *topts; }; -struct trace_pid_list { - int pid_max; - unsigned long *pids; -}; +struct trace_pid_list *trace_pid_list_alloc(void); +void trace_pid_list_free(struct trace_pid_list *pid_list); +bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid); +int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid); +int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, + unsigned int *next); enum { TRACE_PIDS = BIT(0), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 830b3b9940f4..bf54d2803261 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,10 +885,10 @@ static void __ftrace_clear_event_pids(struct trace_array *tr, int type) tracepoint_synchronize_unregister(); if ((type & TRACE_PIDS) && pid_list) - trace_free_pid_list(pid_list); + trace_pid_list_free(pid_list); if ((type & TRACE_NO_PIDS) && no_pid_list) - trace_free_pid_list(no_pid_list); + trace_pid_list_free(no_pid_list); } static void ftrace_clear_event_pids(struct trace_array *tr, int type) @@ -1967,7 +1967,7 @@ event_pid_write(struct file *filp, const char __user *ubuf, if (filtered_pids) { tracepoint_synchronize_unregister(); - trace_free_pid_list(filtered_pids); + trace_pid_list_free(filtered_pids); } else if (pid_list && !other_pids) { register_pid_events(tr); } -- cgit v1.2.3 From 8d6e90983ade25ec7925211ac31d9ccaf64b7edf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Sep 2021 22:20:57 -0400 Subject: tracing: Create a sparse bitmask for pid filtering When the trace_pid_list was created, the default pid max was 32768. 
Creating a bitmask that can hold one bit for all 32768 took up 4096 bytes (one page). Having a one page bitmask was not much of a problem, and that was used for mapping pids. But today, systems are bigger and can run more tasks, and now the default pid_max is usually set to 4194304. This means that handling that many pids requires 524288 bytes. Worse yet, the pid_max can be set to 2^30 (1073741824 or 1G) which would take 134217728 (128M) of memory to store this array. Since the pid_list array is very sparsely populated, it is a huge waste of memory to store all possible bits for each pid when most will not be set. Instead, use a page table scheme to store the array, and allow this to handle up to 30 bit pids. The pid_mask will start out with 256 entries for the first 8 MSB bits. This will cost 1K for 32 bit architectures and 2K for 64 bit. Each of these will have a 256 entry array to store the next 8 bits of the pid (another 1 or 2K). These will hold a 2K byte bitmask (which will cover the LSB 14 bits or 16384 pids). When the trace_pid_list is allocated, it will have the 1 or 2K upper bits allocated, and then it will allocate a cache for the next upper chunks and the lower chunks (default 6 of each). Then when a bit is "set", these chunks will be pulled from the free list and added to the array. If the free list gets down to a level (default 2), it will trigger an irqwork that will refill the cache back up. On clearing a bit, if the clear causes the bitmask to be zero, that chunk will then be placed back into the free cache for later use, keeping the need to allocate more down to a minimum.
Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/pid_list.c | 401 ++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/pid_list.h | 79 +++++++++- 2 files changed, 445 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index 4483ef70b562..cbf8031b2b99 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -2,10 +2,119 @@ /* * Copyright (C) 2021 VMware Inc, Steven Rostedt */ -#include +#include +#include #include #include "trace.h" +/* See pid_list.h for details */ + +static inline union lower_chunk *get_lower_chunk(struct trace_pid_list *pid_list) +{ + union lower_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->lower_list) + return NULL; + + chunk = pid_list->lower_list; + pid_list->lower_list = chunk->next; + pid_list->free_lower_chunks--; + WARN_ON_ONCE(pid_list->free_lower_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. + */ + if (pid_list->free_lower_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline union upper_chunk *get_upper_chunk(struct trace_pid_list *pid_list) +{ + union upper_chunk *chunk; + + lockdep_assert_held(&pid_list->lock); + + if (!pid_list->upper_list) + return NULL; + + chunk = pid_list->upper_list; + pid_list->upper_list = chunk->next; + pid_list->free_upper_chunks--; + WARN_ON_ONCE(pid_list->free_upper_chunks < 0); + chunk->next = NULL; + /* + * If a refill needs to happen, it can not happen here + * as the scheduler run queue locks are held. 
+ */ + if (pid_list->free_upper_chunks <= CHUNK_REALLOC) + irq_work_queue(&pid_list->refill_irqwork); + + return chunk; +} + +static inline void put_lower_chunk(struct trace_pid_list *pid_list, + union lower_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; +} + +static inline void put_upper_chunk(struct trace_pid_list *pid_list, + union upper_chunk *chunk) +{ + lockdep_assert_held(&pid_list->lock); + + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; +} + +static inline bool upper_empty(union upper_chunk *chunk) +{ + /* + * If chunk->data has no lower chunks, it will be the same + * as a zeroed bitmask. Use find_first_bit() to test it + * and if it doesn't find any bits set, then the array + * is empty. + */ + int bit = find_first_bit((unsigned long *)chunk->data, + sizeof(chunk->data) * 8); + return bit >= sizeof(chunk->data) * 8; +} + +static inline int pid_split(unsigned int pid, unsigned int *upper1, + unsigned int *upper2, unsigned int *lower) +{ + /* MAX_PID should cover all pids */ + BUILD_BUG_ON(MAX_PID < PID_MAX_LIMIT); + + /* In case a bad pid is passed in, then fail */ + if (unlikely(pid >= MAX_PID)) + return -1; + + *upper1 = (pid >> UPPER1_SHIFT) & UPPER_MASK; + *upper2 = (pid >> UPPER2_SHIFT) & UPPER_MASK; + *lower = pid & LOWER_MASK; + + return 0; +} + +static inline unsigned int pid_join(unsigned int upper1, + unsigned int upper2, unsigned int lower) +{ + return ((upper1 & UPPER_MASK) << UPPER1_SHIFT) | + ((upper2 & UPPER_MASK) << UPPER2_SHIFT) | + (lower & LOWER_MASK); +} + /** * trace_pid_list_is_set - test if the pid is set in the list * @pid_list: The pid list to test @@ -19,14 +128,30 @@ */ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) { - /* - * If pid_max changed after filtered_pids was created, we - * by default ignore all pids greater than the 
previous pid_max. - */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + bool ret = false; + + if (!pid_list) return false; - return test_bit(pid, pid_list->pids); + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return false; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (upper_chunk) { + lower_chunk = upper_chunk->data[upper2]; + if (lower_chunk) + ret = test_bit(lower, lower_chunk->data); + } + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + + return ret; } /** @@ -42,13 +167,44 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid) */ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) { - /* Sorry, but we don't support pid_max changing after setting */ - if (pid >= pid_list->pid_max) - return -EINVAL; + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + int ret; - set_bit(pid, pid_list->pids); + if (!pid_list) + return -ENODEV; - return 0; + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; + + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) { + upper_chunk = get_upper_chunk(pid_list); + if (!upper_chunk) { + ret = -ENOMEM; + goto out; + } + pid_list->upper[upper1] = upper_chunk; + } + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) { + lower_chunk = get_lower_chunk(pid_list); + if (!lower_chunk) { + ret = -ENOMEM; + goto out; + } + upper_chunk->data[upper2] = lower_chunk; + } + set_bit(lower, lower_chunk->data); + ret = 0; + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + return ret; } /** @@ -64,12 +220,41 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid) */ int 
trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) { - /* Sorry, but we don't support pid_max changing after setting */ - if (pid >= pid_list->pid_max) + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) return -EINVAL; - clear_bit(pid, pid_list->pids); + raw_spin_lock_irqsave(&pid_list->lock, flags); + upper_chunk = pid_list->upper[upper1]; + if (!upper_chunk) + goto out; + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + goto out; + + clear_bit(lower, lower_chunk->data); + + /* if there's no more bits set, add it to the free list */ + if (find_first_bit(lower_chunk->data, LOWER_MAX) >= LOWER_MAX) { + put_lower_chunk(pid_list, lower_chunk); + upper_chunk->data[upper2] = NULL; + if (upper_empty(upper_chunk)) { + put_upper_chunk(pid_list, upper_chunk); + pid_list->upper[upper1] = NULL; + } + } + out: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); return 0; } @@ -88,13 +273,45 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid) int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, unsigned int *next) { - pid = find_next_bit(pid_list->pids, pid_list->pid_max, pid); + union upper_chunk *upper_chunk; + union lower_chunk *lower_chunk; + unsigned long flags; + unsigned int upper1; + unsigned int upper2; + unsigned int lower; + + if (!pid_list) + return -ENODEV; + + if (pid_split(pid, &upper1, &upper2, &lower) < 0) + return -EINVAL; - if (pid < pid_list->pid_max) { - *next = pid; - return 0; + raw_spin_lock_irqsave(&pid_list->lock, flags); + for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) { + upper_chunk = pid_list->upper[upper1]; + + if (!upper_chunk) + continue; + + for (; upper2 <= UPPER_MASK; upper2++, lower = 0) { + lower_chunk = upper_chunk->data[upper2]; + if (!lower_chunk) + 
continue; + + lower = find_next_bit(lower_chunk->data, LOWER_MAX, + lower); + if (lower < LOWER_MAX) + goto found; + } } - return -1; + + found: + raw_spin_unlock_irqrestore(&pid_list->lock, flags); + if (upper1 > UPPER_MASK) + return -1; + + *next = pid_join(upper1, upper2, lower); + return 0; } /** @@ -109,15 +326,79 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid, */ int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) { - unsigned int first; + return trace_pid_list_next(pid_list, 0, pid); +} + +static void pid_list_refill_irq(struct irq_work *iwork) +{ + struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, + refill_irqwork); + union upper_chunk *upper; + union lower_chunk *lower; + union upper_chunk **upper_next = &upper; + union lower_chunk **lower_next = &lower; + int upper_count; + int lower_count; + int ucnt = 0; + int lcnt = 0; - first = find_first_bit(pid_list->pids, pid_list->pid_max); + again: + raw_spin_lock(&pid_list->lock); + upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks; + lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks; + raw_spin_unlock(&pid_list->lock); + + if (upper_count <= 0 && lower_count <= 0) + return; - if (first < pid_list->pid_max) { - *pid = first; - return 0; + while (upper_count-- > 0) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *upper_next = chunk; + upper_next = &chunk->next; + ucnt++; } - return -1; + + while (lower_count-- > 0) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + *lower_next = chunk; + lower_next = &chunk->next; + lcnt++; + } + + raw_spin_lock(&pid_list->lock); + if (upper) { + *upper_next = pid_list->upper_list; + pid_list->upper_list = upper; + pid_list->free_upper_chunks += ucnt; + } + if (lower) { + *lower_next = pid_list->lower_list; + pid_list->lower_list = lower; + pid_list->free_lower_chunks += lcnt; + } 
+ raw_spin_unlock(&pid_list->lock); + + /* + * On success of allocating all the chunks, both counters + * will be less than zero. If they are not, then an allocation + * failed, and we should not try again. + */ + if (upper_count >= 0 || lower_count >= 0) + return; + /* + * When the locks were released, free chunks could have + * been used and allocation needs to be done again. Might as + * well allocate it now. + */ + goto again; } /** @@ -130,18 +411,41 @@ int trace_pid_list_first(struct trace_pid_list *pid_list, unsigned int *pid) struct trace_pid_list *trace_pid_list_alloc(void) { struct trace_pid_list *pid_list; + int i; + + /* According to linux/thread.h, pids can be no bigger that 30 bits */ + WARN_ON_ONCE(pid_max > (1 << 30)); - pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); + pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL); if (!pid_list) return NULL; - pid_list->pid_max = READ_ONCE(pid_max); + init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq); - pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); - if (!pid_list->pids) { - kfree(pid_list); - return NULL; + raw_spin_lock_init(&pid_list->lock); + + for (i = 0; i < CHUNK_ALLOC; i++) { + union upper_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->upper_list; + pid_list->upper_list = chunk; + pid_list->free_upper_chunks++; } + + for (i = 0; i < CHUNK_ALLOC; i++) { + union lower_chunk *chunk; + + chunk = kzalloc(sizeof(*chunk), GFP_KERNEL); + if (!chunk) + break; + chunk->next = pid_list->lower_list; + pid_list->lower_list = chunk; + pid_list->free_lower_chunks++; + } + return pid_list; } @@ -152,9 +456,40 @@ struct trace_pid_list *trace_pid_list_alloc(void) */ void trace_pid_list_free(struct trace_pid_list *pid_list) { + union upper_chunk *upper; + union lower_chunk *lower; + int i, j; + if (!pid_list) return; - vfree(pid_list->pids); + irq_work_sync(&pid_list->refill_irqwork); + + while (pid_list->lower_list) { + union 
lower_chunk *chunk; + + chunk = pid_list->lower_list; + pid_list->lower_list = pid_list->lower_list->next; + kfree(chunk); + } + + while (pid_list->upper_list) { + union upper_chunk *chunk; + + chunk = pid_list->upper_list; + pid_list->upper_list = pid_list->upper_list->next; + kfree(chunk); + } + + for (i = 0; i < UPPER1_SIZE; i++) { + upper = pid_list->upper[i]; + if (upper) { + for (j = 0; j < UPPER2_SIZE; j++) { + lower = upper->data[j]; + kfree(lower); + } + kfree(upper); + } + } kfree(pid_list); } diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h index 80d0ecfe1536..62e73f1ac85f 100644 --- a/kernel/trace/pid_list.h +++ b/kernel/trace/pid_list.h @@ -5,9 +5,84 @@ #ifndef _TRACE_INTERNAL_PID_LIST_H #define _TRACE_INTERNAL_PID_LIST_H +/* + * In order to keep track of what pids to trace, a tree is created much + * like page tables are used. This creates a sparse bit map, where + * the tree is filled in when needed. A PID is at most 30 bits (see + * linux/thread.h), and is broken up into 3 sections based on the bit map + * of the bits. The 8 MSB is the "upper1" section. The next 8 MSB is the + * "upper2" section and the 14 LSB is the "lower" section. + * + * A trace_pid_list structure holds the "upper1" section, in an + * array of 256 pointers (1 or 2K in size) to "upper_chunk" unions, where + * each has an array of 256 pointers (1 or 2K in size) to the "lower_chunk" + * structures, where each has an array of size 2K bytes representing a bitmask + * of the 14 LSB of the PID (256 * 8 = 2048) + * + * When a trace_pid_list is allocated, it includes the 256 pointer array + * of the upper1 unions. Then a "cache" of upper and lower is allocated + * where these will be assigned as needed. + * + * When a bit is set in the pid_list bitmask, the pid to use has + * the 8 MSB masked, and this is used to index the array in the + * pid_list to find the next upper union. If the element is NULL, + * then one is retrieved from the upper_list cache. 
If none is + * available, then -ENOMEM is returned. + * + * The next 8 MSB is used to index into the "upper2" section. If this + * element is NULL, then it is retrieved from the lower_list cache. + * Again, if one is not available -ENOMEM is returned. + * + * Finally the 14 LSB of the PID is used to set the bit in the 16384 + * bitmask (made up of 2K bytes). + * + * When the second upper section or the lower section has their last + * bit cleared, they are added back to the free list to be reused + * when needed. + */ + +#define UPPER_BITS 8 +#define UPPER_MAX (1 << UPPER_BITS) +#define UPPER1_SIZE (1 << UPPER_BITS) +#define UPPER2_SIZE (1 << UPPER_BITS) + +#define LOWER_BITS 14 +#define LOWER_MAX (1 << LOWER_BITS) +#define LOWER_SIZE (LOWER_MAX / BITS_PER_LONG) + +#define UPPER1_SHIFT (LOWER_BITS + UPPER_BITS) +#define UPPER2_SHIFT LOWER_BITS +#define LOWER_MASK (LOWER_MAX - 1) + +#define UPPER_MASK (UPPER_MAX - 1) + +/* According to linux/thread.h pids can not be bigger than or equal to 1 << 30 */ +#define MAX_PID (1 << 30) + +/* Just keep 6 chunks of both upper and lower in the cache on alloc */ +#define CHUNK_ALLOC 6 + +/* Have 2 chunks free, trigger a refill of the cache */ +#define CHUNK_REALLOC 2 + +union lower_chunk { + union lower_chunk *next; + unsigned long data[LOWER_SIZE]; // 2K in size +}; + +union upper_chunk { + union upper_chunk *next; + union lower_chunk *data[UPPER2_SIZE]; // 1 or 2K in size +}; + struct trace_pid_list { - int pid_max; - unsigned long *pids; + raw_spinlock_t lock; + struct irq_work refill_irqwork; + union upper_chunk *upper[UPPER1_SIZE]; // 1 or 2K in size + union upper_chunk *upper_list; + union lower_chunk *lower_list; + int free_upper_chunks; + int free_lower_chunks; }; #endif /* _TRACE_INTERNAL_PID_LIST_H */ -- cgit v1.2.3 From b30a779d5c557e99b93917f33d441948c9aead97 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 7 Oct 2021 09:53:53 -0400 Subject: tracing: Initialize upper and lower vars in 
pid_list_refill_irq() The upper and lower variables are set up as linked lists to add into the sparse array. If they are NULL, after the needed allocations are done, then there is nothing to add. But they need to be initialized to NULL for this to work. Link: https://lore.kernel.org/all/221bc7ba-a475-1cb9-1bbe-730bb9c2d448@canonical.com/ Fixes: 8d6e90983ade ("tracing: Create a sparse bitmask for pid filtering") Reported-by: Colin Ian King Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/pid_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c index cbf8031b2b99..a2ef1d18126a 100644 --- a/kernel/trace/pid_list.c +++ b/kernel/trace/pid_list.c @@ -333,8 +333,8 @@ static void pid_list_refill_irq(struct irq_work *iwork) { struct trace_pid_list *pid_list = container_of(iwork, struct trace_pid_list, refill_irqwork); - union upper_chunk *upper; - union lower_chunk *lower; + union upper_chunk *upper = NULL; + union lower_chunk *lower = NULL; union upper_chunk **upper_next = &upper; union lower_chunk **lower_next = &lower; int upper_count; -- cgit v1.2.3 From 21ccc9cd72116289469e5519b6159c675a2fa58f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 18 Aug 2021 11:24:51 -0400 Subject: tracing: Disable "other" permission bits in the tracefs files When building the files in the tracefs file system, do not by default set any permissions for OTH (other). This will make it easier for admins who want to define a group for accessing tracefs and not have to first disable all the permission bits for "other" in the file system. As tracing can leak sensitive information, it should never by default allow all users access. An admin can still set the permission bits for others to have access, which may be useful for creating a honeypot and seeing who takes advantage of it and roots the machine.
Link: https://lkml.kernel.org/r/20210818153038.864149276@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 23 +++++------ kernel/trace/trace.c | 73 ++++++++++++++++++----------------- kernel/trace/trace.h | 3 ++ kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_events.c | 42 ++++++++++---------- kernel/trace/trace_events_synth.c | 4 +- kernel/trace/trace_functions_graph.c | 2 +- kernel/trace/trace_hwlat.c | 6 +-- kernel/trace/trace_kprobe.c | 8 ++-- kernel/trace/trace_osnoise.c | 14 +++---- kernel/trace/trace_printk.c | 2 +- kernel/trace/trace_recursion_record.c | 4 +- kernel/trace/trace_stack.c | 6 +-- kernel/trace/trace_stat.c | 6 +-- kernel/trace/trace_uprobe.c | 4 +- 15 files changed, 103 insertions(+), 96 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3eec6792f115..0a0dbc2d411b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -988,8 +988,9 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) } } - entry = tracefs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); + entry = tracefs_create_file("function_profile_enabled", + TRACE_MODE_WRITE, d_tracer, NULL, + &ftrace_profile_fops); if (!entry) pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } @@ -6109,10 +6110,10 @@ void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent) { - trace_create_file("set_ftrace_filter", 0644, parent, + trace_create_file("set_ftrace_filter", TRACE_MODE_WRITE, parent, ops, &ftrace_filter_fops); - trace_create_file("set_ftrace_notrace", 0644, parent, + trace_create_file("set_ftrace_notrace", TRACE_MODE_WRITE, parent, ops, &ftrace_notrace_fops); } @@ -6139,19 +6140,19 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) static __init int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { - trace_create_file("available_filter_functions", 0444, + 
trace_create_file("available_filter_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_avail_fops); - trace_create_file("enabled_functions", 0444, + trace_create_file("enabled_functions", TRACE_MODE_READ, d_tracer, NULL, &ftrace_enabled_fops); ftrace_create_filter_files(&global_ops, d_tracer); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - trace_create_file("set_graph_function", 0644, d_tracer, + trace_create_file("set_graph_function", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_fops); - trace_create_file("set_graph_notrace", 0644, d_tracer, + trace_create_file("set_graph_notrace", TRACE_MODE_WRITE, d_tracer, NULL, &ftrace_graph_notrace_fops); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -7494,10 +7495,10 @@ static const struct file_operations ftrace_no_pid_fops = { void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer) { - trace_create_file("set_ftrace_pid", 0644, d_tracer, + trace_create_file("set_ftrace_pid", TRACE_MODE_WRITE, d_tracer, tr, &ftrace_pid_fops); - trace_create_file("set_ftrace_notrace_pid", 0644, d_tracer, - tr, &ftrace_no_pid_fops); + trace_create_file("set_ftrace_notrace_pid", TRACE_MODE_WRITE, + d_tracer, tr, &ftrace_no_pid_fops); } void __init ftrace_init_tracefs_toplevel(struct trace_array *tr, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dcced07a45e6..985390cb8441 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1690,7 +1690,8 @@ static void trace_create_maxlat_file(struct trace_array *tr, { INIT_WORK(&tr->fsnotify_work, latency_fsnotify_workfn); init_irq_work(&tr->fsnotify_irqwork, latency_fsnotify_workfn_irq); - tr->d_max_latency = trace_create_file("tracing_max_latency", 0644, + tr->d_max_latency = trace_create_file("tracing_max_latency", + TRACE_MODE_WRITE, d_tracer, &tr->max_latency, &tracing_max_lat_fops); } @@ -1727,8 +1728,8 @@ void latency_fsnotify(struct trace_array *tr) #else #define trace_create_maxlat_file(tr, d_tracer) \ - trace_create_file("tracing_max_latency", 0644, d_tracer, 
\ - &tr->max_latency, &tracing_max_lat_fops) + trace_create_file("tracing_max_latency", TRACE_MODE_WRITE, \ + d_tracer, &tr->max_latency, &tracing_max_lat_fops) #endif @@ -6054,7 +6055,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start, static void trace_create_eval_file(struct dentry *d_tracer) { - trace_create_file("eval_map", 0444, d_tracer, + trace_create_file("eval_map", TRACE_MODE_READ, d_tracer, NULL, &tracing_eval_map_fops); } @@ -8567,27 +8568,27 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) } /* per cpu trace_pipe */ - trace_create_cpu_file("trace_pipe", 0444, d_cpu, + trace_create_cpu_file("trace_pipe", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_pipe_fops); /* per cpu trace */ - trace_create_cpu_file("trace", 0644, d_cpu, + trace_create_cpu_file("trace", TRACE_MODE_WRITE, d_cpu, tr, cpu, &tracing_fops); - trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu, + trace_create_cpu_file("trace_pipe_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_buffers_fops); - trace_create_cpu_file("stats", 0444, d_cpu, + trace_create_cpu_file("stats", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_stats_fops); - trace_create_cpu_file("buffer_size_kb", 0444, d_cpu, + trace_create_cpu_file("buffer_size_kb", TRACE_MODE_READ, d_cpu, tr, cpu, &tracing_entries_fops); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_cpu_file("snapshot", 0644, d_cpu, + trace_create_cpu_file("snapshot", TRACE_MODE_WRITE, d_cpu, tr, cpu, &snapshot_fops); - trace_create_cpu_file("snapshot_raw", 0444, d_cpu, + trace_create_cpu_file("snapshot_raw", TRACE_MODE_READ, d_cpu, tr, cpu, &snapshot_raw_fops); #endif } @@ -8793,8 +8794,8 @@ create_trace_option_file(struct trace_array *tr, topt->opt = opt; topt->tr = tr; - topt->entry = trace_create_file(opt->name, 0644, t_options, topt, - &trace_options_fops); + topt->entry = trace_create_file(opt->name, TRACE_MODE_WRITE, + t_options, topt, &trace_options_fops); } @@ -8869,7 +8870,7 @@ create_trace_option_core_file(struct 
trace_array *tr, if (!t_options) return NULL; - return trace_create_file(option, 0644, t_options, + return trace_create_file(option, TRACE_MODE_WRITE, t_options, (void *)&tr->trace_flags_index[index], &trace_options_core_fops); } @@ -9394,28 +9395,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) struct trace_event_file *file; int cpu; - trace_create_file("available_tracers", 0444, d_tracer, + trace_create_file("available_tracers", TRACE_MODE_READ, d_tracer, tr, &show_traces_fops); - trace_create_file("current_tracer", 0644, d_tracer, + trace_create_file("current_tracer", TRACE_MODE_WRITE, d_tracer, tr, &set_tracer_fops); - trace_create_file("tracing_cpumask", 0644, d_tracer, + trace_create_file("tracing_cpumask", TRACE_MODE_WRITE, d_tracer, tr, &tracing_cpumask_fops); - trace_create_file("trace_options", 0644, d_tracer, + trace_create_file("trace_options", TRACE_MODE_WRITE, d_tracer, tr, &tracing_iter_fops); - trace_create_file("trace", 0644, d_tracer, + trace_create_file("trace", TRACE_MODE_WRITE, d_tracer, tr, &tracing_fops); - trace_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("trace_pipe", TRACE_MODE_READ, d_tracer, tr, &tracing_pipe_fops); - trace_create_file("buffer_size_kb", 0644, d_tracer, + trace_create_file("buffer_size_kb", TRACE_MODE_WRITE, d_tracer, tr, &tracing_entries_fops); - trace_create_file("buffer_total_size_kb", 0444, d_tracer, + trace_create_file("buffer_total_size_kb", TRACE_MODE_READ, d_tracer, tr, &tracing_total_entries_fops); trace_create_file("free_buffer", 0200, d_tracer, @@ -9426,25 +9427,25 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) file = __find_event_file(tr, "ftrace", "print"); if (file && file->dir) - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); tr->trace_marker_file = file; trace_create_file("trace_marker_raw", 0220, d_tracer, tr, 
&tracing_mark_raw_fops); - trace_create_file("trace_clock", 0644, d_tracer, tr, + trace_create_file("trace_clock", TRACE_MODE_WRITE, d_tracer, tr, &trace_clock_fops); - trace_create_file("tracing_on", 0644, d_tracer, + trace_create_file("tracing_on", TRACE_MODE_WRITE, d_tracer, tr, &rb_simple_fops); - trace_create_file("timestamp_mode", 0444, d_tracer, tr, + trace_create_file("timestamp_mode", TRACE_MODE_READ, d_tracer, tr, &trace_time_stamp_mode_fops); tr->buffer_percent = 50; - trace_create_file("buffer_percent", 0444, d_tracer, + trace_create_file("buffer_percent", TRACE_MODE_READ, d_tracer, tr, &buffer_percent_fops); create_trace_options_dir(tr); @@ -9457,11 +9458,11 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) MEM_FAIL(1, "Could not allocate function filter files"); #ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, + trace_create_file("snapshot", TRACE_MODE_WRITE, d_tracer, tr, &snapshot_fops); #endif - trace_create_file("error_log", 0644, d_tracer, + trace_create_file("error_log", TRACE_MODE_WRITE, d_tracer, tr, &tracing_err_log_fops); for_each_tracing_cpu(cpu) @@ -9654,19 +9655,19 @@ static __init int tracer_init_tracefs(void) init_tracer_tracefs(&global_trace, NULL); ftrace_init_tracefs_toplevel(&global_trace, NULL); - trace_create_file("tracing_thresh", 0644, NULL, + trace_create_file("tracing_thresh", TRACE_MODE_WRITE, NULL, &global_trace, &tracing_thresh_fops); - trace_create_file("README", 0444, NULL, + trace_create_file("README", TRACE_MODE_READ, NULL, NULL, &tracing_readme_fops); - trace_create_file("saved_cmdlines", 0444, NULL, + trace_create_file("saved_cmdlines", TRACE_MODE_READ, NULL, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("saved_cmdlines_size", 0644, NULL, + trace_create_file("saved_cmdlines_size", TRACE_MODE_WRITE, NULL, NULL, &tracing_saved_cmdlines_size_fops); - trace_create_file("saved_tgids", 0444, NULL, + trace_create_file("saved_tgids", TRACE_MODE_READ, NULL, NULL, 
&tracing_saved_tgids_fops); trace_eval_init(); @@ -9678,7 +9679,7 @@ static __init int tracer_init_tracefs(void) #endif #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("dyn_ftrace_total_info", 0444, NULL, + trace_create_file("dyn_ftrace_total_info", TRACE_MODE_READ, NULL, NULL, &tracing_dyn_info_fops); #endif diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2ec500186992..6c3808132b16 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -29,6 +29,9 @@ #include /* some archs define it here */ #endif +#define TRACE_MODE_WRITE 0640 +#define TRACE_MODE_READ 0440 + enum trace_type { __TRACE_FIRST_TYPE = 0, diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 1110112e55bd..e34e8182ee4b 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -262,7 +262,7 @@ static __init int init_dynamic_event(void) if (ret) return 0; - entry = tracefs_create_file("dynamic_events", 0644, NULL, + entry = tracefs_create_file("dynamic_events", TRACE_MODE_WRITE, NULL, NULL, &dynamic_events_ops); /* Event list interface */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bf54d2803261..4021b9a79f93 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2312,7 +2312,8 @@ event_subsystem_dir(struct trace_array *tr, const char *name, /* the ftrace system is special, do not create enable or filter files */ if (strcmp(name, "ftrace") != 0) { - entry = tracefs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", TRACE_MODE_WRITE, + dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); @@ -2320,7 +2321,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, pr_warn("Could not create tracefs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, dir->entry, dir, + trace_create_file("enable", TRACE_MODE_WRITE, dir->entry, dir, &ftrace_system_enable_fops); } @@ -2402,12 +2403,12 @@ 
event_create_dir(struct dentry *parent, struct trace_event_file *file) } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, file->dir, file, + trace_create_file("enable", TRACE_MODE_WRITE, file->dir, file, &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, file->dir, + trace_create_file("id", TRACE_MODE_READ, file->dir, (void *)(long)call->event.type, &ftrace_event_id_fops); #endif @@ -2423,22 +2424,22 @@ event_create_dir(struct dentry *parent, struct trace_event_file *file) * triggers or filters. */ if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) { - trace_create_file("filter", 0644, file->dir, file, - &ftrace_event_filter_fops); + trace_create_file("filter", TRACE_MODE_WRITE, file->dir, + file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + trace_create_file("trigger", TRACE_MODE_WRITE, file->dir, + file, &event_trigger_fops); } #ifdef CONFIG_HIST_TRIGGERS - trace_create_file("hist", 0444, file->dir, file, + trace_create_file("hist", TRACE_MODE_READ, file->dir, file, &event_hist_fops); #endif #ifdef CONFIG_HIST_TRIGGERS_DEBUG - trace_create_file("hist_debug", 0444, file->dir, file, + trace_create_file("hist_debug", TRACE_MODE_READ, file->dir, file, &event_hist_debug_fops); #endif - trace_create_file("format", 0444, file->dir, call, + trace_create_file("format", TRACE_MODE_READ, file->dir, call, &ftrace_event_format_fops); #ifdef CONFIG_TRACE_EVENT_INJECT @@ -3433,7 +3434,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = tracefs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warn("Could not create tracefs 'set_event' entry\n"); @@ -3446,7 +3447,7 @@ create_event_toplevel_files(struct 
dentry *parent, struct trace_array *tr) return -ENOMEM; } - entry = trace_create_file("enable", 0644, d_events, + entry = trace_create_file("enable", TRACE_MODE_WRITE, d_events, tr, &ftrace_tr_enable_fops); if (!entry) { pr_warn("Could not create tracefs 'enable' entry\n"); @@ -3455,24 +3456,25 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) /* There are not as crucial, just warn if they are not created */ - entry = tracefs_create_file("set_event_pid", 0644, parent, + entry = tracefs_create_file("set_event_pid", TRACE_MODE_WRITE, parent, tr, &ftrace_set_event_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_pid' entry\n"); - entry = tracefs_create_file("set_event_notrace_pid", 0644, parent, - tr, &ftrace_set_event_notrace_pid_fops); + entry = tracefs_create_file("set_event_notrace_pid", + TRACE_MODE_WRITE, parent, tr, + &ftrace_set_event_notrace_pid_fops); if (!entry) pr_warn("Could not create tracefs 'set_event_notrace_pid' entry\n"); /* ring buffer internal formats */ - entry = trace_create_file("header_page", 0444, d_events, + entry = trace_create_file("header_page", TRACE_MODE_READ, d_events, ring_buffer_print_page_header, &ftrace_show_header_fops); if (!entry) pr_warn("Could not create tracefs 'header_page' entry\n"); - entry = trace_create_file("header_event", 0444, d_events, + entry = trace_create_file("header_event", TRACE_MODE_READ, d_events, ring_buffer_print_entry_header, &ftrace_show_header_fops); if (!entry) @@ -3689,8 +3691,8 @@ __init int event_trace_init(void) if (!tr) return -ENODEV; - entry = tracefs_create_file("available_events", 0444, NULL, - tr, &ftrace_avail_fops); + entry = tracefs_create_file("available_events", TRACE_MODE_READ, + NULL, tr, &ftrace_avail_fops); if (!entry) pr_warn("Could not create tracefs 'available_events' entry\n"); diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index d54094b7a9d7..22db3ce95e74 100644 --- 
a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -2227,8 +2227,8 @@ static __init int trace_events_synth_init(void) if (err) goto err; - entry = tracefs_create_file("synthetic_events", 0644, NULL, - NULL, &synth_events_fops); + entry = tracefs_create_file("synthetic_events", TRACE_MODE_WRITE, + NULL, NULL, &synth_events_fops); if (!entry) { err = -ENODEV; goto err; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0de6837722da..6b5ff3ba4251 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1340,7 +1340,7 @@ static __init int init_graph_tracefs(void) if (ret) return 0; - trace_create_file("max_graph_depth", 0644, NULL, + trace_create_file("max_graph_depth", TRACE_MODE_WRITE, NULL, NULL, &graph_depth_fops); return 0; diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1b83d75eb103..d0a730d99a33 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -782,21 +782,21 @@ static int init_tracefs(void) if (!top_dir) return -ENOMEM; - hwlat_sample_window = tracefs_create_file("window", 0640, + hwlat_sample_window = tracefs_create_file("window", TRACE_MODE_WRITE, top_dir, &hwlat_window, &trace_min_max_fops); if (!hwlat_sample_window) goto err; - hwlat_sample_width = tracefs_create_file("width", 0644, + hwlat_sample_width = tracefs_create_file("width", TRACE_MODE_WRITE, top_dir, &hwlat_width, &trace_min_max_fops); if (!hwlat_sample_width) goto err; - hwlat_thread_mode = trace_create_file("mode", 0644, + hwlat_thread_mode = trace_create_file("mode", TRACE_MODE_WRITE, top_dir, NULL, &thread_mode_fops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 0e1e7ce5f7ed..33272a7b6912 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1925,16 +1925,16 @@ static __init int init_kprobe_trace(void) if (ret) return 0; - entry = tracefs_create_file("kprobe_events", 0644, 
NULL, - NULL, &kprobe_events_ops); + entry = tracefs_create_file("kprobe_events", TRACE_MODE_WRITE, + NULL, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ - entry = tracefs_create_file("kprobe_profile", 0444, NULL, - NULL, &kprobe_profile_ops); + entry = tracefs_create_file("kprobe_profile", TRACE_MODE_READ, + NULL, NULL, &kprobe_profile_ops); if (!entry) pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index ce053619f289..c4f14fb98aaa 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1856,38 +1856,38 @@ static int init_tracefs(void) if (!top_dir) return 0; - tmp = tracefs_create_file("period_us", 0640, top_dir, + tmp = tracefs_create_file("period_us", TRACE_MODE_WRITE, top_dir, &osnoise_period, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("runtime_us", 0644, top_dir, + tmp = tracefs_create_file("runtime_us", TRACE_MODE_WRITE, top_dir, &osnoise_runtime, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("stop_tracing_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_in, &trace_min_max_fops); if (!tmp) goto err; - tmp = tracefs_create_file("stop_tracing_total_us", 0640, top_dir, + tmp = tracefs_create_file("stop_tracing_total_us", TRACE_MODE_WRITE, top_dir, &osnoise_stop_tracing_total, &trace_min_max_fops); if (!tmp) goto err; - tmp = trace_create_file("cpus", 0644, top_dir, NULL, &cpus_fops); + tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops); if (!tmp) goto err; #ifdef CONFIG_TIMERLAT_TRACER #ifdef CONFIG_STACKTRACE - tmp = tracefs_create_file("print_stack", 0640, top_dir, + tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir, &osnoise_print_stack, &trace_min_max_fops); if (!tmp) goto err; #endif 
- tmp = tracefs_create_file("timerlat_period_us", 0640, top_dir, + tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir, &timerlat_period, &trace_min_max_fops); if (!tmp) goto err; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 4b320fe7df70..29f6e95439b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -384,7 +384,7 @@ static __init int init_trace_printk_function_export(void) if (ret) return 0; - trace_create_file("printk_formats", 0444, NULL, + trace_create_file("printk_formats", TRACE_MODE_READ, NULL, NULL, &ftrace_formats_fops); return 0; diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c index b2edac1fe156..4d4b78c8ca25 100644 --- a/kernel/trace/trace_recursion_record.c +++ b/kernel/trace/trace_recursion_record.c @@ -226,8 +226,8 @@ __init static int create_recursed_functions(void) { struct dentry *dentry; - dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, - &recursed_functions_fops); + dentry = trace_create_file("recursed_functions", TRACE_MODE_WRITE, + NULL, NULL, &recursed_functions_fops); if (!dentry) pr_warn("WARNING: Failed to create recursed_functions\n"); return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 63c285042051..5a48dba912ea 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -559,14 +559,14 @@ static __init int stack_trace_init(void) if (ret) return 0; - trace_create_file("stack_max_size", 0644, NULL, + trace_create_file("stack_max_size", TRACE_MODE_WRITE, NULL, &stack_trace_max_size, &stack_max_size_fops); - trace_create_file("stack_trace", 0444, NULL, + trace_create_file("stack_trace", TRACE_MODE_READ, NULL, NULL, &stack_trace_fops); #ifdef CONFIG_DYNAMIC_FTRACE - trace_create_file("stack_trace_filter", 0644, NULL, + trace_create_file("stack_trace_filter", TRACE_MODE_WRITE, NULL, &trace_ops, &stack_trace_filter_fops); #endif diff --git 
a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 8d141c3825a9..bb247beec447 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -297,9 +297,9 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && (ret = tracing_stat_init())) return ret; - session->file = tracefs_create_file(session->ts->name, 0644, - stat_dir, - session, &tracing_stat_fops); + session->file = tracefs_create_file(session->ts->name, TRACE_MODE_WRITE, + stat_dir, session, + &tracing_stat_fops); if (!session->file) return -ENOMEM; return 0; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 225ce569bf8f..0a5c0db3137e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1655,10 +1655,10 @@ static __init int init_uprobe_trace(void) if (ret) return 0; - trace_create_file("uprobe_events", 0644, NULL, + trace_create_file("uprobe_events", TRACE_MODE_WRITE, NULL, NULL, &uprobe_events_ops); /* Profile interface */ - trace_create_file("uprobe_profile", 0444, NULL, + trace_create_file("uprobe_profile", TRACE_MODE_READ, NULL, NULL, &uprobe_profile_ops); return 0; } -- cgit v1.2.3 From 6644c654ea70e0d8b8d5111e1272f8f29df00f21 Mon Sep 17 00:00:00 2001 From: Weizhao Ouyang Date: Thu, 9 Sep 2021 17:02:16 +0800 Subject: ftrace: Cleanup ftrace_dyn_arch_init() Most architectures use an empty ftrace_dyn_arch_init(); introduce a weak common ftrace_dyn_arch_init() to clean them up.
Link: https://lkml.kernel.org/r/20210909090216.1955240-1-o451686892@gmail.com Acked-by: Heiko Carstens (s390) Acked-by: Helge Deller (parisc) Signed-off-by: Weizhao Ouyang Signed-off-by: Steven Rostedt (VMware) --- arch/arm/kernel/ftrace.c | 5 ----- arch/arm64/kernel/ftrace.c | 5 ----- arch/csky/kernel/ftrace.c | 5 ----- arch/ia64/kernel/ftrace.c | 6 ------ arch/microblaze/kernel/ftrace.c | 5 ----- arch/nds32/kernel/ftrace.c | 5 ----- arch/parisc/kernel/ftrace.c | 5 ----- arch/riscv/kernel/ftrace.c | 5 ----- arch/s390/kernel/ftrace.c | 5 ----- arch/sh/kernel/ftrace.c | 5 ----- arch/sparc/kernel/ftrace.c | 5 ----- arch/x86/kernel/ftrace.c | 5 ----- kernel/trace/ftrace.c | 5 +++++ 13 files changed, 5 insertions(+), 61 deletions(-) (limited to 'kernel/trace') diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 3c83b5d29697..a006585e1c09 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -193,11 +193,6 @@ int ftrace_make_nop(struct module *mod, return ret; } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 7f467bd9db7a..fc62dfe73f93 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -236,11 +236,6 @@ void arch_ftrace_update_code(int command) command |= FTRACE_MAY_SLEEP; ftrace_modify_all_code(command); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/csky/kernel/ftrace.c b/arch/csky/kernel/ftrace.c index b4a7ec1517ff..50bfcf129078 100644 --- a/arch/csky/kernel/ftrace.c +++ b/arch/csky/kernel/ftrace.c @@ -133,11 +133,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) (unsigned long)func, true, true); return ret; } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff 
--git a/arch/ia64/kernel/ftrace.c b/arch/ia64/kernel/ftrace.c index b2ab2d58fb30..d6360fd404ab 100644 --- a/arch/ia64/kernel/ftrace.c +++ b/arch/ia64/kernel/ftrace.c @@ -194,9 +194,3 @@ int ftrace_update_ftrace_func(ftrace_func_t func) flush_icache_range(addr, addr + 16); return 0; } - -/* run from kstop_machine */ -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c index 224eea40e1ee..188749d62709 100644 --- a/arch/microblaze/kernel/ftrace.c +++ b/arch/microblaze/kernel/ftrace.c @@ -163,11 +163,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ret; } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 0e23e3a8df6b..f0ef4842d191 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -84,11 +84,6 @@ void _ftrace_caller(unsigned long parent_ip) /* restore all state needed by the compiler epilogue */ } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - static unsigned long gen_sethi_insn(unsigned long addr) { unsigned long opcode = 0x46000000; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 0a1e75af5382..01581f715737 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -94,11 +94,6 @@ int ftrace_disable_ftrace_graph_caller(void) #endif #ifdef CONFIG_DYNAMIC_FTRACE - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} int ftrace_update_ftrace_func(ftrace_func_t func) { return 0; diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 7f1e5203de88..4716f4cdc038 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -154,11 +154,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return ret; } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} 
#endif #ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 1d94ffdf347b..5165bf344f95 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -262,11 +262,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) return 0; } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - void arch_ftrace_update_code(int command) { if (ftrace_shared_hotpatch_trampoline(NULL)) diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 295c43315bbe..930001bb8c6a 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -252,11 +252,6 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) return ftrace_modify_code(rec->ip, old, new); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 684b84ce397f..eaead3da8e03 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -82,11 +82,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func) new = ftrace_call_replace(ip, (unsigned long)func); return ftrace_modify_code(ip, old, new); } - -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1b3ce3b4a2a2..23d221a9a3cd 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -252,11 +252,6 @@ void arch_ftrace_update_code(int command) ftrace_modify_all_code(command); } -int __init ftrace_dyn_arch_init(void) -{ - return 0; -} - /* Currently only x86_64 supports dynamic trampolines */ #ifdef CONFIG_X86_64 diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0a0dbc2d411b..2c3e9760df7f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6847,6 +6847,11 @@ void __init ftrace_free_init_mem(void) ftrace_free_mem(NULL, start, end); } +int __init __weak 
ftrace_dyn_arch_init(void) +{ + return 0; +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; -- cgit v1.2.3 From 43c9dd8ddf4efdce126e0a0b176d729c72445b0f Mon Sep 17 00:00:00 2001 From: Carles Pey Date: Sat, 18 Sep 2021 19:30:43 +0400 Subject: ftrace: Add unit test for removing trace function A self test is provided for the trace function removal functionality. Link: https://lkml.kernel.org/r/20210918153043.318016-2-carles.pey@gmail.com Signed-off-by: Carles Pey Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index adf7ef194005..875b4f1a0476 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -287,6 +287,40 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) if (trace_selftest_test_probe3_cnt != 4) goto out_free; + /* Remove trace function from probe 3 */ + func1_name = "!" 
__stringify(DYN_FTRACE_TEST_NAME); + len1 = strlen(func1_name); + + ftrace_set_filter(&test_probe3, func1_name, len1, 0); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + if (cnt > 1) { + if (trace_selftest_test_global_cnt == 0) + goto out_free; + } + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 3) + goto out_free; + if (trace_selftest_test_probe2_cnt != 3) + goto out_free; + if (trace_selftest_test_probe3_cnt != 5) + goto out_free; + ret = 0; out_free: unregister_ftrace_function(dyn_ops); -- cgit v1.2.3 From affc659246293df42ba2d184c674cc959c05aa02 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Thu, 30 Sep 2021 08:03:42 +0800 Subject: tracing: in_irq() cleanup Replace the obsolete and ambiguous macro in_irq() with new macro in_hardirq(). Link: https://lkml.kernel.org/r/20210930000342.6016-1-changbin.du@gmail.com Reviewed-by: Petr Mladek Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 2 +- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6c3808132b16..6b60ab9475ed 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -890,7 +890,7 @@ static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) * is set, and called by an interrupt handler, we still * want to trace it.
*/ - if (in_irq()) + if (in_hardirq()) trace_recursion_set(TRACE_IRQ_BIT); else trace_recursion_clear(TRACE_IRQ_BIT); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6b5ff3ba4251..203204cadf92 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -120,7 +120,7 @@ static inline int ftrace_graph_ignore_irqs(void) if (!ftrace_graph_skip_irqs || trace_recursion_test(TRACE_IRQ_BIT)) return 0; - return in_irq(); + return in_hardirq(); } int trace_graph_entry(struct ftrace_graph_ent *trace) -- cgit v1.2.3 From 34cdd18b8d245f3e901e5325313c27de727ab80d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 17 Jun 2020 16:56:16 -0400 Subject: tracing: Use linker magic instead of recasting ftrace_ops_list_func() In an effort to enable -Wcast-function-type in the top-level Makefile to support Control Flow Integrity builds, all function casts need to be removed. This means that ftrace_ops_list_func() can no longer be defined as ftrace_ops_no_ops(). The reason for ftrace_ops_no_ops() is to use that when an architecture calls ftrace_ops_list_func() with only two parameters (called from assembly). And to make sure there's no C side-effects, those archs call ftrace_ops_no_ops() which only has two parameters, as ftrace_ops_list_func() has four parameters. Instead of a typecast, use vmlinux.lds.h to define ftrace_ops_list_func() to arch_ftrace_ops_list_func() that will define the proper set of parameters. 
Link: https://lore.kernel.org/r/20200614070154.6039-1-oscar.carter@gmx.com Link: https://lkml.kernel.org/r/20200617165616.52241bde@oasis.local.home Link: https://lore.kernel.org/all/20211005053922.GA702049@embeddedor/ Requested-by: Oscar Carter Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- include/asm-generic/vmlinux.lds.h | 10 ++++++++-- include/linux/ftrace.h | 12 ++++++++++-- kernel/trace/ftrace.c | 23 ++++++++++------------- 3 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f2984af2b85b..8771c435f34b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -164,16 +164,22 @@ * Need to also make ftrace_stub_graph point to ftrace_stub * so that the same stub location may have different protocols * and not mess up with C verifiers. + * + * ftrace_ops_list_func will be defined as arch_ftrace_ops_list_func + * as some archs will have a different prototype for that function + * but ftrace_ops_list_func() will have a single prototype. */ #define MCOUNT_REC() . 
= ALIGN(8); \ __start_mcount_loc = .; \ KEEP(*(__mcount_loc)) \ KEEP(*(__patchable_function_entries)) \ __stop_mcount_loc = .; \ - ftrace_stub_graph = ftrace_stub; + ftrace_stub_graph = ftrace_stub; \ + ftrace_ops_list_func = arch_ftrace_ops_list_func; #else # ifdef CONFIG_FUNCTION_TRACER -# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; +# define MCOUNT_REC() ftrace_stub_graph = ftrace_stub; \ + ftrace_ops_list_func = arch_ftrace_ops_list_func; # else # define MCOUNT_REC() # endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 832e65f06754..12fcfa2d23ea 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -30,16 +30,26 @@ #define ARCH_SUPPORTS_FTRACE_OPS 0 #endif +#ifdef CONFIG_FUNCTION_TRACER +struct ftrace_ops; +struct ftrace_regs; /* * If the arch's mcount caller does not support all of ftrace's * features, then it must call an indirect function that * does. Or at least does enough to prevent any unwelcome side effects. + * + * Also define the function prototype that these architectures use + * to call the ftrace_ops_list_func(). 
*/ #if !ARCH_SUPPORTS_FTRACE_OPS # define FTRACE_FORCE_LIST_FUNC 1 +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); #else # define FTRACE_FORCE_LIST_FUNC 0 +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); #endif +#endif /* CONFIG_FUNCTION_TRACER */ /* Main tracing buffer and events set up */ #ifdef CONFIG_TRACING @@ -88,8 +98,6 @@ extern int ftrace_enable_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); -struct ftrace_ops; - #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS struct ftrace_regs { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2c3e9760df7f..8b5801881271 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -119,14 +119,9 @@ struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; struct ftrace_ops global_ops; -#if ARCH_SUPPORTS_FTRACE_OPS -static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct ftrace_regs *fregs); -#else -/* See comment below, where ftrace_ops_list_func is defined */ -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); -#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) -#endif +/* Defined by vmlinux.lds.h see the commment above arch_ftrace_ops_list_func for details */ +void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); static inline void ftrace_ops_init(struct ftrace_ops *ops) { @@ -7032,21 +7027,23 @@ out: * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still * set the ARCH_SUPPORTS_FTRACE_OPS. + * + * In vmlinux.lds.h, ftrace_ops_list_func() is defined to be + * arch_ftrace_ops_list_func. 
*/ #if ARCH_SUPPORTS_FTRACE_OPS -static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct ftrace_regs *fregs) +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) { __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } -NOKPROBE_SYMBOL(ftrace_ops_list_func); #else -static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) +void arch_ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } -NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif +NOKPROBE_SYMBOL(arch_ftrace_ops_list_func); /* * If there's only one function registered but it does not support -- cgit v1.2.3 From 7ce1bb83a14019f8c396d57ec704d19478747716 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Wed, 13 Oct 2021 21:52:17 -0700 Subject: tracing/cfi: Fix cmp_entries_* functions signature mismatch If CONFIG_CFI_CLANG=y, attempting to read an event histogram will cause the kernel to panic due to failed CFI check. 1. echo 'hist:keys=common_pid' >> events/sched/sched_switch/trigger 2. cat events/sched/sched_switch/hist 3. kernel panics on attempting to read hist This happens because the sort() function expects a generic int (*)(const void *, const void *) pointer for the compare function. To prevent this CFI failure, change tracing map cmp_entries_* function signatures to match this. Also, fix the build error reported by the kernel test robot [1]. 
[1] https://lore.kernel.org/r/202110141140.zzi4dRh4-lkp@intel.com/ Link: https://lkml.kernel.org/r/20211014045217.3265162-1-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/tracing_map.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index d6bddb157ef2..39bb56d2dcbe 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -834,29 +834,35 @@ int tracing_map_init(struct tracing_map *map) return err; } -static int cmp_entries_dup(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_dup(const void *A, const void *B) { + const struct tracing_map_sort_entry *a, *b; int ret = 0; - if (memcmp((*a)->key, (*b)->key, (*a)->elt->map->key_size)) + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + if (memcmp(a->key, b->key, a->elt->map->key_size)) ret = 1; return ret; } -static int cmp_entries_sum(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_sum(const void *A, const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -873,18 +879,21 @@ static int cmp_entries_sum(const struct tracing_map_sort_entry **a, return ret; } -static int cmp_entries_key(const struct tracing_map_sort_entry **a, - const struct tracing_map_sort_entry **b) +static int cmp_entries_key(const void *A, 
const void *B) { const struct tracing_map_elt *elt_a, *elt_b; + const struct tracing_map_sort_entry *a, *b; struct tracing_map_sort_key *sort_key; struct tracing_map_field *field; tracing_map_cmp_fn_t cmp_fn; void *val_a, *val_b; int ret = 0; - elt_a = (*a)->elt; - elt_b = (*b)->elt; + a = *(const struct tracing_map_sort_entry **)A; + b = *(const struct tracing_map_sort_entry **)B; + + elt_a = a->elt; + elt_b = b->elt; sort_key = &elt_a->map->sort_key; @@ -989,10 +998,8 @@ static void sort_secondary(struct tracing_map *map, struct tracing_map_sort_key *primary_key, struct tracing_map_sort_key *secondary_key) { - int (*primary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); - int (*secondary_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*primary_fn)(const void *, const void *); + int (*secondary_fn)(const void *, const void *); unsigned i, start = 0, n_sub = 1; if (is_key(map, primary_key->field_idx)) @@ -1061,8 +1068,7 @@ int tracing_map_sort_entries(struct tracing_map *map, unsigned int n_sort_keys, struct tracing_map_sort_entry ***sort_entries) { - int (*cmp_entries_fn)(const struct tracing_map_sort_entry **, - const struct tracing_map_sort_entry **); + int (*cmp_entries_fn)(const void *, const void *); struct tracing_map_sort_entry *sort_entry, **entries; int i, n_entries, ret; -- cgit v1.2.3 From 9b84fadc444de5456ab5f5487e2108311c724c3f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 13:42:40 -0400 Subject: tracing: Reuse logic from perf's get_recursion_context() Instead of having branches that adds noise to the branch prediction, use the addition logic to set the bit for the level of interrupt context that the state is currently in. This copies the logic from perf's get_recursion_context() function. 
Link: https://lore.kernel.org/all/20211015161702.GF174703@worktop.programming.kicks-ass.net/ Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 11 ++++++----- kernel/trace/ring_buffer.c | 12 ++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index a9f9c5714e65..f6da7a03bff0 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -137,12 +137,13 @@ enum { static __always_inline int trace_get_context_bit(void) { unsigned long pc = preempt_count(); + unsigned char bit = 0; - if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) - return TRACE_CTX_NORMAL; - else - return pc & NMI_MASK ? TRACE_CTX_NMI : - pc & HARDIRQ_MASK ? TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; + bit += !!(pc & (NMI_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + return TRACE_CTX_NORMAL - bit; } #ifdef CONFIG_FTRACE_RECORD_RECURSION diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c5a3fbf19617..15d4380006e3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3168,13 +3168,13 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = cpu_buffer->current_context; unsigned long pc = preempt_count(); - int bit; + int bit = 0; - if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) - bit = RB_CTX_NORMAL; - else - bit = pc & NMI_MASK ? RB_CTX_NMI : - pc & HARDIRQ_MASK ? 
RB_CTX_IRQ : RB_CTX_SOFTIRQ; + bit += !!(pc & (NMI_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + bit = RB_CTX_NORMAL - bit; if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { /* -- cgit v1.2.3 From 91ebe8bcbff9d2ff21303e73bf7434f39a98b255 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 15 Oct 2021 15:01:19 -0400 Subject: tracing/perf: Add interrupt_context_level() helper Now that there are three different instances of doing the addition trick to the preempt_count() and NMI_MASK, HARDIRQ_MASK and SOFTIRQ_OFFSET macros, it deserves a helper function defined in the preempt.h header. Add the interrupt_context_level() helper and replace the three instances that do that logic with it. Link: https://lore.kernel.org/all/20211015142541.4badd8a9@gandalf.local.home/ Signed-off-by: Steven Rostedt (VMware) --- include/linux/preempt.h | 21 +++++++++++++++++++++ include/linux/trace_recursion.h | 7 +------ kernel/events/internal.h | 7 +------ kernel/trace/ring_buffer.c | 7 +------ 4 files changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel/trace') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 4d244e295e85..b32e3dabe28b 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -77,6 +77,27 @@ /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include +/** + * interrupt_context_level - return interrupt context level + * + * Returns the current interrupt context level. 
+ * 0 - normal context + * 1 - softirq context + * 2 - hardirq context + * 3 - NMI context + */ +static __always_inline unsigned char interrupt_context_level(void) +{ + unsigned long pc = preempt_count(); + unsigned char level = 0; + + level += !!(pc & (NMI_MASK)); + level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + + return level; +} + #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index f6da7a03bff0..1d8cce02c3fb 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -136,12 +136,7 @@ enum { static __always_inline int trace_get_context_bit(void) { - unsigned long pc = preempt_count(); - unsigned char bit = 0; - - bit += !!(pc & (NMI_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + unsigned char bit = interrupt_context_level(); return TRACE_CTX_NORMAL - bit; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 228801e20788..082832738c8f 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,12 +205,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - unsigned int pc = preempt_count(); - unsigned char rctx = 0; - - rctx += !!(pc & (NMI_MASK)); - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + unsigned char rctx = interrupt_context_level(); if (recursion[rctx]) return -1; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 15d4380006e3..f6520d0a4c8c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3167,12 +3167,7 @@ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { unsigned int val = 
cpu_buffer->current_context; - unsigned long pc = preempt_count(); - int bit = 0; - - bit += !!(pc & (NMI_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK)); - bit += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); + int bit = interrupt_context_level(); bit = RB_CTX_NORMAL - bit; -- cgit v1.2.3 From 0c0593b45c9b4e5b212ffb3fb28bb8d3c0ec0dc8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 8 Oct 2021 11:13:31 +0200 Subject: x86/ftrace: Make function graph use ftrace directly We don't need special hook for graph tracer entry point, but instead we can use graph_ops::func function to install the return_hooker. This moves the graph tracing setup _before_ the direct trampoline prepares the stack, so the return_hooker will be called when the direct trampoline is finished. This simplifies the code, because we don't need to take into account the direct trampoline setup when preparing the graph tracer hooker and we can allow function graph tracer on entries registered with direct trampoline. 
Link: https://lkml.kernel.org/r/20211008091336.33616-4-jolsa@kernel.org [fixed compile error reported by kernel test robot ] Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/ftrace.h | 9 +++++++-- arch/x86/kernel/ftrace.c | 37 ++++++++++++++++++++++++++++++++++--- arch/x86/kernel/ftrace_64.S | 29 +---------------------------- include/linux/ftrace.h | 9 +++++++++ kernel/trace/fgraph.c | 6 ++++-- 5 files changed, 55 insertions(+), 35 deletions(-) (limited to 'kernel/trace') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9f3130f40807..024d9797646e 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -57,6 +57,13 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) #define ftrace_instruction_pointer_set(fregs, _ip) \ do { (fregs)->regs.ip = (_ip); } while (0) + +struct ftrace_ops; +#define ftrace_graph_func ftrace_graph_func +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs); +#else +#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR #endif #ifdef CONFIG_DYNAMIC_FTRACE @@ -65,8 +72,6 @@ struct dyn_arch_ftrace { /* No extra data needed for x86 */ }; -#define FTRACE_GRAPH_TRAMP_ADDR FTRACE_GRAPH_ADDR - #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 7f12eacdf1ae..c39f906cdc4e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -522,7 +522,7 @@ static void *addr_from_call(void *ptr) return ptr + CALL_INSN_SIZE + call.disp; } -void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, +void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer); /* @@ -536,7 +536,8 @@ static void *static_tramp_func(struct ftrace_ops *ops, struct dyn_ftrace *rec) void *ptr; if (ops && 
ops->trampoline) { -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#if !defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) && \ + defined(CONFIG_FUNCTION_GRAPH_TRACER) /* * We only know about function graph tracer setting as static * trampoline. @@ -584,8 +585,9 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) #ifdef CONFIG_FUNCTION_GRAPH_TRACER #ifdef CONFIG_DYNAMIC_FTRACE -extern void ftrace_graph_call(void); +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +extern void ftrace_graph_call(void); static const char *ftrace_jmp_replace(unsigned long ip, unsigned long addr) { return text_gen_insn(JMP32_INSN_OPCODE, (void *)ip, (void *)addr); @@ -613,7 +615,17 @@ int ftrace_disable_ftrace_graph_caller(void) return ftrace_mod_jmp(ip, &ftrace_stub); } +#else /* !CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ +int ftrace_enable_ftrace_graph_caller(void) +{ + return 0; +} +int ftrace_disable_ftrace_graph_caller(void) +{ + return 0; +} +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ #endif /* !CONFIG_DYNAMIC_FTRACE */ /* @@ -624,6 +636,7 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, unsigned long frame_pointer) { unsigned long return_hooker = (unsigned long)&return_to_handler; + int bit; /* * When resuming from suspend-to-ram, this function can be indirectly @@ -643,7 +656,25 @@ void prepare_ftrace_return(unsigned long ip, unsigned long *parent, if (unlikely(atomic_read(&current->tracing_graph_pause))) return; + bit = ftrace_test_recursion_trylock(ip, *parent); + if (bit < 0) + return; + if (!function_graph_enter(*parent, ip, frame_pointer, parent)) *parent = return_hooker; + + ftrace_test_recursion_unlock(bit); +} + +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +void ftrace_graph_func(unsigned long ip, unsigned long parent_ip, + struct ftrace_ops *op, struct ftrace_regs *fregs) +{ + struct pt_regs *regs = &fregs->regs; + unsigned long *stack = (unsigned long *)kernel_stack_pointer(regs); + + prepare_ftrace_return(ip, (unsigned long *)stack, 0); } +#endif + #endif /* 
CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index a8eb084a7a9a..7a879901f103 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -174,11 +174,6 @@ SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL) SYM_FUNC_END(ftrace_caller); SYM_FUNC_START(ftrace_epilogue) -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL) - jmp ftrace_stub -#endif - /* * This is weak to keep gas from relaxing the jumps. * It is also used to copy the retq for trampolines. @@ -288,15 +283,6 @@ SYM_FUNC_START(__fentry__) cmpq $ftrace_stub, ftrace_trace_function jnz trace -fgraph_trace: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpq $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpq $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif - SYM_INNER_LABEL(ftrace_stub, SYM_L_GLOBAL) retq @@ -314,25 +300,12 @@ trace: CALL_NOSPEC r8 restore_mcount_regs - jmp fgraph_trace + jmp ftrace_stub SYM_FUNC_END(__fentry__) EXPORT_SYMBOL(__fentry__) #endif /* CONFIG_DYNAMIC_FTRACE */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER -SYM_FUNC_START(ftrace_graph_caller) - /* Saves rbp into %rdx and fills first parameter */ - save_mcount_regs - - leaq MCOUNT_REG_SIZE+8(%rsp), %rsi - movq $0, %rdx /* No framepointers needed */ - call prepare_ftrace_return - - restore_mcount_regs - - retq -SYM_FUNC_END(ftrace_graph_caller) - SYM_FUNC_START(return_to_handler) subq $24, %rsp diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 12fcfa2d23ea..16a7baaba702 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -803,6 +803,15 @@ static inline bool is_ftrace_trampoline(unsigned long addr) } #endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +#ifndef ftrace_graph_func +#define ftrace_graph_func ftrace_stub +#define FTRACE_OPS_GRAPH_STUB FTRACE_OPS_FL_STUB +#else +#define FTRACE_OPS_GRAPH_STUB 0 +#endif +#endif /* 
CONFIG_FUNCTION_GRAPH_TRACER */ + /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index b8a0d1d564fb..22061d38fc00 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -115,6 +115,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, { struct ftrace_graph_ent trace; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS /* * Skip graph tracing if the return location is served by direct trampoline, * since call sequence and return addresses are unpredictable anyway. @@ -124,6 +125,7 @@ int function_graph_enter(unsigned long ret, unsigned long func, if (ftrace_direct_func_count && ftrace_find_rec_direct(ret - MCOUNT_INSN_SIZE)) return -EBUSY; +#endif trace.func = func; trace.depth = ++current->curr_ret_depth; @@ -333,10 +335,10 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, #endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ static struct ftrace_ops graph_ops = { - .func = ftrace_stub, + .func = ftrace_graph_func, .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | - FTRACE_OPS_FL_STUB, + FTRACE_OPS_GRAPH_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, /* trampoline_size is only needed for dynamically allocated tramps */ -- cgit v1.2.3 From 130c08065848a98163b243b55e99f66c24609efb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:32 +0200 Subject: tracing: Add trampoline/graph selftest Adding selftest for checking that direct trampoline can co-exist together with graph tracer on same function. This is supported for CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS config option, which is defined only for x86_64 for now. 
Link: https://lkml.kernel.org/r/20211008091336.33616-5-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 54 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 875b4f1a0476..3404a245417e 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,6 +784,8 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; +noinline __noclone static void trace_direct_tramp(void) { } + /* * Pretty much the same than for the function tracer from which the selftest * has been borrowed. @@ -794,6 +796,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, { int ret; unsigned long count; + char *func_name __maybe_unused; #ifdef CONFIG_DYNAMIC_FTRACE if (ftrace_filter_param) { @@ -842,8 +845,57 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } - /* Don't test dynamic tracing, the function tracer already did */ +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + tracing_reset_online_cpus(&tr->array_buffer); + set_graph_array(tr); + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* + * Register direct function together with graph tracer + * and make sure we get graph trace. 
+ */ + ret = register_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, + (unsigned long) trace_direct_tramp); + if (ret) + goto out; + + ret = register_ftrace_graph(&fgraph_ops); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + DYN_FTRACE_TEST_NAME(); + + count = 0; + + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(&tr->array_buffer, &count); + + unregister_ftrace_graph(&fgraph_ops); + + ret = unregister_ftrace_direct((unsigned long) DYN_FTRACE_TEST_NAME, + (unsigned long) trace_direct_tramp); + if (ret) + goto out; + + tracing_start(); + if (!ret && !count) { + ret = -1; + goto out; + } +#endif + + /* Don't test dynamic tracing, the function tracer already did */ out: /* Stop it if we failed */ if (ret) -- cgit v1.2.3 From 4e341cad6b7a58376bfc6d1c8347727d094a6274 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 21 Oct 2021 13:43:57 -0400 Subject: tracing: Fix selftest config check for function graph start up test There's a new test in trace_selftest_startup_function_graph() that requires the use of ftrace args being supported as well does some tricks with dynamic tracing. Although this code checks HAVE_DYNAMIC_FTRACE_WITH_ARGS it fails to check DYNAMIC_FTRACE, and the kernel fails to build due to that dependency. Also only define the prototype of trace_direct_tramp() if it is used. 
Link: https://lkml.kernel.org/r/20211021134357.7f48e173@gandalf.local.home Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3404a245417e..afd937a46496 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -784,7 +784,11 @@ static struct fgraph_ops fgraph_ops __initdata = { .retfunc = &trace_graph_return, }; +#if defined(CONFIG_DYNAMIC_FTRACE) && \ + defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) +#define TEST_DIRECT_TRAMP noinline __noclone static void trace_direct_tramp(void) { } +#endif /* * Pretty much the same than for the function tracer from which the selftest @@ -845,7 +849,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, goto out; } -#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +#ifdef TEST_DIRECT_TRAMP tracing_reset_online_cpus(&tr->array_buffer); set_graph_array(tr); -- cgit v1.2.3 From 1904a8144598031af85406873c5fbec806ee3fd7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:33 +0200 Subject: ftrace: Add ftrace_add_rec_direct function Factor out the code that adds (ip, addr) tuple to direct_functions hash in new ftrace_add_rec_direct function. It will be used in following patches. 
Link: https://lkml.kernel.org/r/20211008091336.33616-6-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 64 +++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 27 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8b5801881271..ccbd8377e580 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2390,6 +2390,39 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) return entry->direct; } +static struct ftrace_func_entry* +ftrace_add_rec_direct(unsigned long ip, unsigned long addr, + struct ftrace_hash **free_hash) +{ + struct ftrace_func_entry *entry; + + if (ftrace_hash_empty(direct_functions) || + direct_functions->count > 2 * (1 << direct_functions->size_bits)) { + struct ftrace_hash *new_hash; + int size = ftrace_hash_empty(direct_functions) ? 0 : + direct_functions->count + 1; + + if (size < 32) + size = 32; + + new_hash = dup_hash(direct_functions, size); + if (!new_hash) + return NULL; + + *free_hash = direct_functions; + direct_functions = new_hash; + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return NULL; + + entry->ip = ip; + entry->direct = addr; + __add_hash_entry(direct_functions, entry); + return entry; +} + static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops *ops, struct ftrace_regs *fregs) { @@ -5106,39 +5139,16 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr) } ret = -ENOMEM; - if (ftrace_hash_empty(direct_functions) || - direct_functions->count > 2 * (1 << direct_functions->size_bits)) { - struct ftrace_hash *new_hash; - int size = ftrace_hash_empty(direct_functions) ? 
0 : - direct_functions->count + 1; - - if (size < 32) - size = 32; - - new_hash = dup_hash(direct_functions, size); - if (!new_hash) - goto out_unlock; - - free_hash = direct_functions; - direct_functions = new_hash; - } - - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - goto out_unlock; - direct = ftrace_find_direct_func(addr); if (!direct) { direct = ftrace_alloc_direct_func(addr); - if (!direct) { - kfree(entry); + if (!direct) goto out_unlock; - } } - entry->ip = ip; - entry->direct = addr; - __add_hash_entry(direct_functions, entry); + entry = ftrace_add_rec_direct(ip, addr, &free_hash); + if (!entry) + goto out_unlock; ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0); if (ret) -- cgit v1.2.3 From f64dd4627ec6edc39bf1430fe6dbc923d2300a88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:34 +0200 Subject: ftrace: Add multi direct register/unregister interface Adding interface to register multiple direct functions within single call. Adding following functions: register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The register_ftrace_direct_multi registers direct function (addr) with all functions in ops filter. The ops filter can be updated before with ftrace_set_filter_ip calls. All requested functions must not have direct function currently registered, otherwise register_ftrace_direct_multi will fail. The unregister_ftrace_direct_multi unregisters ops related direct functions. 
Link: https://lkml.kernel.org/r/20211008091336.33616-7-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 11 ++++ kernel/trace/ftrace.c | 142 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 16a7baaba702..0158261cac9f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -324,7 +324,10 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long old_addr, unsigned long new_addr); unsigned long ftrace_find_rec_direct(unsigned long ip); +int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); #else +struct ftrace_ops; # define ftrace_direct_func_count 0 static inline int register_ftrace_direct(unsigned long ip, unsigned long addr) { @@ -354,6 +357,14 @@ static inline unsigned long ftrace_find_rec_direct(unsigned long ip) { return 0; } +static inline int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} +static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccbd8377e580..a05b25fb77d8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5401,6 +5401,148 @@ int modify_ftrace_direct(unsigned long ip, return ret; } EXPORT_SYMBOL_GPL(modify_ftrace_direct); + +#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \ + FTRACE_OPS_FL_SAVE_REGS) + +static int check_direct_multi(struct ftrace_ops *ops) +{ + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if ((ops->flags & MULTI_FLAGS) != MULTI_FLAGS) + return -EINVAL; + return 0; +} + 
+static void remove_direct_functions_hash(struct ftrace_hash *hash, unsigned long addr) +{ + struct ftrace_func_entry *entry, *del; + int size, i; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + del = __ftrace_lookup_ip(direct_functions, entry->ip); + if (del && del->direct == addr) { + remove_hash_entry(direct_functions, del); + kfree(del); + } + } + } +} + +/** + * register_ftrace_direct_multi - Call a custom trampoline directly + * for multiple functions registered in @ops + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the trampoline to call at @ops functions + * + * This is used to connect direct calls to @addr from the nop locations + * of the functions registered in @ops (by the ftrace_set_filter_ip + * function). + * + * The location that it calls (@addr) must be able to handle a direct call, + * and save the parameters of the function being traced, and restore them + * (or inject new ones if needed), before returning. + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was already registered with this call or + * when there are no functions in @ops object. + * -EBUSY - Another direct function is already attached (there can be only one) + * -ENODEV - @ip does not point to a ftrace nop location (or not supported) + * -ENOMEM - There was an allocation failure. + */ +int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash, *free_hash = NULL; + struct ftrace_func_entry *entry, *new; + int err = -EBUSY, size, i; + + if (ops->func || ops->trampoline) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) + return -EINVAL; + if (ops->flags & FTRACE_OPS_FL_ENABLED) + return -EINVAL; + + hash = ops->func_hash->filter_hash; + if (ftrace_hash_empty(hash)) + return -EINVAL; + + mutex_lock(&direct_mutex); + + /* Make sure requested entries are not already registered.. 
*/ + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_find_rec_direct(entry->ip)) + goto out_unlock; + } + } + + /* ... and insert them to direct_functions hash. */ + err = -ENOMEM; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + new = ftrace_add_rec_direct(entry->ip, addr, &free_hash); + if (!new) + goto out_remove; + entry->direct = addr; + } + } + + ops->func = call_direct_funcs; + ops->flags = MULTI_FLAGS; + ops->trampoline = FTRACE_REGS_ADDR; + + err = register_ftrace_function(ops); + + out_remove: + if (err) + remove_direct_functions_hash(hash, addr); + + out_unlock: + mutex_unlock(&direct_mutex); + + if (free_hash) { + synchronize_rcu_tasks(); + free_ftrace_hash(free_hash); + } + return err; +} +EXPORT_SYMBOL_GPL(register_ftrace_direct_multi); + +/** + * unregister_ftrace_direct_multi - Remove calls to custom trampoline + * previously registered by register_ftrace_direct_multi for @ops object. + * @ops: The address of the struct ftrace_ops object + * + * This is used to remove direct calls to @addr from the nop locations + * of the functions registered in @ops (by the ftrace_set_filter_ip + * function). + * + * Returns: + * 0 on success + * -EINVAL - The @ops object was not properly registered. 
+ */ +int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + err = unregister_ftrace_function(ops); + remove_direct_functions_hash(hash, addr); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From ccf5a89efd6f0a9483cea8acd4a0822b1a47e59a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 8 Oct 2021 11:13:35 +0200 Subject: ftrace: Add multi direct modify interface Adding interface to modify registered direct function for ftrace_ops. Adding following function: modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) The function changes the currently registered direct function for all attached functions. Link: https://lkml.kernel.org/r/20211008091336.33616-8-jolsa@kernel.org Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 6 +++++ kernel/trace/ftrace.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'kernel/trace') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0158261cac9f..9999e29187de 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -326,6 +326,8 @@ int ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ftrace_find_rec_direct(unsigned long ip); int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr); + #else struct ftrace_ops; # define ftrace_direct_func_count 0 @@ -365,6 +367,10 @@ static inline int unregister_ftrace_direct_multi(struct ftrace_ops *ops, 
unsigne { return -ENODEV; } +static inline int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + return -ENODEV; +} #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ #ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a05b25fb77d8..30120342176e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5543,6 +5543,68 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return err; } EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); + +/** + * modify_ftrace_direct_multi - Modify an existing direct 'multi' call + * to call something else + * @ops: The address of the struct ftrace_ops object + * @addr: The address of the new trampoline to call at @ops functions + * + * This is used to unregister currently registered direct caller and + * register new one @addr on functions registered in @ops object. + * + * Note there's window between ftrace_shutdown and ftrace_startup calls + * where there will be no callbacks called. + * + * Returns: zero on success. Non zero on error, which includes: + * -EINVAL - The @ops object was not properly registered. + */ +int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + struct ftrace_func_entry *entry, *iter; + int i, size; + int err; + + if (check_direct_multi(ops)) + return -EINVAL; + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EINVAL; + + mutex_lock(&direct_mutex); + mutex_lock(&ftrace_lock); + + /* + * Shutdown the ops, change 'direct' pointer for each + * ops entry in direct_functions hash and startup the + * ops back again. + * + * Note there is no callback called for @ops object after + * this ftrace_shutdown call until ftrace_startup is called + * later on. 
+ */ + err = ftrace_shutdown(ops, 0); + if (err) + goto out_unlock; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(iter, &hash->buckets[i], hlist) { + entry = __ftrace_lookup_ip(direct_functions, iter->ip); + if (!entry) + continue; + entry->direct = addr; + } + } + + err = ftrace_startup(ops, 0); + + out_unlock: + mutex_unlock(&ftrace_lock); + mutex_unlock(&direct_mutex); + return err; +} +EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi); #endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */ /** -- cgit v1.2.3 From ed29271894aa92826d308231593b7ee7ac5a4932 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 14 Oct 2021 16:11:14 -0400 Subject: ftrace/direct: Do not disable when switching direct callers Currently to switch a set of "multi" direct trampolines from one trampoline to another, a full shutdown of the current set needs to be done, followed by an update to what trampoline the direct callers would call, and then re-enabling the callers. This leaves a time when the functions will not be calling anything, and events may be missed. Instead, use a trick to allow all the functions with direct trampolines attached will always call either the new or old trampoline while the switch is happening. To do this, first attach a "dummy" callback via ftrace to all the functions that the current direct trampoline is attached to. This will cause the functions to call the "list func" instead of the direct trampoline. The list function will call the direct trampoline "helper" that will set the function it should call as it returns back to the ftrace trampoline. At this moment, the direct caller descriptor can safely update the direct call trampoline. The list function will pick either the new or old function (depending on the memory coherency model of the architecture). Now removing the dummy function from each of the locations of the direct trampoline caller, will put back the direct call, but now to the new trampoline. 
A better visual is: [ Changing direct call from my_direct_1 to my_direct_2 ] : call my_direct_1 |||||||||||||||||||| vvvvvvvvvvvvvvvvvvvv : call ftrace_caller : [..] call ftrace_ops_list_func ftrace_ops_list_func() { ops->func() -> direct_helper -> set rax to my_direct_1 or my_direct_2 } call rax (to either my_direct_1 or my_direct_2 |||||||||||||||||||| vvvvvvvvvvvvvvvvvvvv : call my_direct_2 Link: https://lore.kernel.org/all/20211014162819.5c85618b@gandalf.local.home/ Acked-by: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 30120342176e..f90ed00a6d5b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5561,8 +5561,12 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi); */ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; + struct ftrace_hash *hash; struct ftrace_func_entry *entry, *iter; + static struct ftrace_ops tmp_ops = { + .func = ftrace_stub, + .flags = FTRACE_OPS_FL_STUB, + }; int i, size; int err; @@ -5572,21 +5576,22 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) return -EINVAL; mutex_lock(&direct_mutex); - mutex_lock(&ftrace_lock); + + /* Enable the tmp_ops to have the same functions as the direct ops */ + ftrace_ops_init(&tmp_ops); + tmp_ops.func_hash = ops->func_hash; + + err = register_ftrace_function(&tmp_ops); + if (err) + goto out_direct; /* - * Shutdown the ops, change 'direct' pointer for each - * ops entry in direct_functions hash and startup the - * ops back again. - * - * Note there is no callback called for @ops object after - * this ftrace_shutdown call until ftrace_startup is called - * later on. + * Now the ftrace_ops_list_func() is called to do the direct callers. 
+ * We can safely change the direct functions attached to each entry. */ - err = ftrace_shutdown(ops, 0); - if (err) - goto out_unlock; + mutex_lock(&ftrace_lock); + hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -5597,10 +5602,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr) } } - err = ftrace_startup(ops, 0); + /* Removing the tmp_ops will add the updated direct callers to the functions */ + unregister_ftrace_function(&tmp_ops); - out_unlock: mutex_unlock(&ftrace_lock); + out_direct: mutex_unlock(&direct_mutex); return err; } -- cgit v1.2.3 From 8720aeecc246837bc6da64c5118dc3177c162e14 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Oct 2021 17:33:13 +0200 Subject: tracing: use %ps format string to print symbols clang started warning about excessive stack usage in hist_trigger_print_key() kernel/trace/trace_events_hist.c:4723:13: error: stack frame size (1336) exceeds limit (1024) in function 'hist_trigger_print_key' [-Werror,-Wframe-larger-than] The problem is that there are two 512-byte arrays on the stack if hist_trigger_stacktrace_print() gets inlined. I don't think this has changed in the past five years, but something probably changed the inlining decisions made by the compiler, so the problem is now made more obvious. Rather than printing the symbol names into separate buffers, it seems we can simply use the special %ps format string modifier to print the pointers symbolically and get rid of both buffers. Marking hist_trigger_stacktrace_print() would be a simpler way of avoiding the warning, but that would not address the excessive stack usage. 
Link: https://lkml.kernel.org/r/20211019153337.294790-1-arnd@kernel.org Fixes: 69a0200c2e25 ("tracing: Add hist trigger support for stacktraces as keys") Link: https://lore.kernel.org/all/20211015095704.49a99859@gandalf.local.home/ Reviewed-by: Tom Zanussi Tested-by: Tom Zanussi Signed-off-by: Arnd Bergmann Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a6061a69aa84..b64aed538628 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4706,7 +4706,6 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, unsigned long *stacktrace_entries, unsigned int max_entries) { - char str[KSYM_SYMBOL_LEN]; unsigned int spaces = 8; unsigned int i; @@ -4715,8 +4714,7 @@ static void hist_trigger_stacktrace_print(struct seq_file *m, return; seq_printf(m, "%*c", 1 + spaces, ' '); - sprint_symbol(str, stacktrace_entries[i]); - seq_printf(m, "%s\n", str); + seq_printf(m, "%pS\n", (void*)stacktrace_entries[i]); } } @@ -4726,7 +4724,6 @@ static void hist_trigger_print_key(struct seq_file *m, struct tracing_map_elt *elt) { struct hist_field *key_field; - char str[KSYM_SYMBOL_LEN]; bool multiline = false; const char *field_name; unsigned int i; @@ -4747,14 +4744,12 @@ static void hist_trigger_print_key(struct seq_file *m, seq_printf(m, "%s: %llx", field_name, uval); } else if (key_field->flags & HIST_FIELD_FL_SYM) { uval = *(u64 *)(key + key_field->offset); - sprint_symbol_no_offset(str, uval); - seq_printf(m, "%s: [%llx] %-45s", field_name, - uval, str); + seq_printf(m, "%s: [%llx] %-45ps", field_name, + uval, (void *)(uintptr_t)uval); } else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) { uval = *(u64 *)(key + key_field->offset); - sprint_symbol(str, uval); - seq_printf(m, "%s: [%llx] %-55s", field_name, - uval, str); + 
seq_printf(m, "%s: [%llx] %-55pS", field_name, + uval, (void *)(uintptr_t)uval); } else if (key_field->flags & HIST_FIELD_FL_EXECNAME) { struct hist_elt_data *elt_data = elt->private_data; char *comm; -- cgit v1.2.3 From 172f7ba9772cae12f099fc563352e905dc9a1921 Mon Sep 17 00:00:00 2001 From: chongjiapeng Date: Tue, 19 Oct 2021 18:48:54 +0800 Subject: ftrace: Make ftrace_profile_pages_init static This symbol is not used outside of ftrace.c, so mark it static. Fixes the following sparse warning: kernel/trace/ftrace.c:579:5: warning: symbol 'ftrace_profile_pages_init' was not declared. Should it be static? Link: https://lkml.kernel.org/r/1634640534-18280-1-git-send-email-jiapeng.chong@linux.alibaba.com Reported-by: Abaci Robot Fixes: cafb168a1c92 ("tracing: make the function profiler per cpu") Signed-off-by: chongjiapeng Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f90ed00a6d5b..2057ad363772 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -576,7 +576,7 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +static int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int functions; -- cgit v1.2.3 From e0f3b18be733ac4a3b6deb2ff586bc1936ad0368 Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 17:07:50 +0200 Subject: trace/osnoise: Add migrate-disabled field to the osnoise header Since "54357f0c9149 tracing: Add migrate-disabled counter to tracing output," the migrate disabled field is also printed in the !PREEMPT_RT kernel config. While this information was added to the vast majority of tracers, osnoise and timerlat were not updated (because they are new tracers).
Fix osnoise header by adding the information about migrate disabled. Link: https://lkml.kernel.org/r/9cb3d54e29e0588dbba12e81486bd8a09adcd8ca.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 54357f0c9149 ("tracing: Add migrate-disabled counter to tracing output.") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index c4f14fb98aaa..34f26c632442 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -294,19 +294,19 @@ static void print_osnoise_headers(struct seq_file *s) seq_puts(s, "# _-----=> irqs-off\n"); seq_puts(s, "# / _----=> need-resched\n"); seq_puts(s, "# | / _---=> hardirq/softirq\n"); - seq_puts(s, "# || / _--=> preempt-depth "); - seq_puts(s, " MAX\n"); - - seq_puts(s, "# || / "); + seq_puts(s, "# || / _--=> preempt-depth\n"); + seq_puts(s, "# ||| / _-=> migrate-disable "); + seq_puts(s, " MAX\n"); + seq_puts(s, "# |||| / delay "); seq_puts(s, " SINGLE Interference counters:\n"); - seq_puts(s, "# |||| RUNTIME "); + seq_puts(s, "# ||||| RUNTIME "); seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n"); - seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP IN US "); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP IN US "); seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n"); - seq_puts(s, "# | | | |||| | | "); + seq_puts(s, "# | | | ||||| | | "); seq_puts(s, " | | | | | | | |\n"); } #endif /* CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From aeafcb82d99c97ff5c6054a4091eeb12aefca9ab Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 15 Oct 2021 
17:07:51 +0200 Subject: trace/timerlat: Add migrate-disabled field to the timerlat header Since "54357f0c9149 tracing: Add migrate-disabled counter to tracing output," the migrate disabled field is also printed in the !PREEMPT_RT kernel config. While this information was added to the vast majority of tracers, osnoise and timerlat were not updated (because they are new tracers). Fix timerlat header by adding the information about migrate disabled. Link: https://lkml.kernel.org/r/bc0c234ab49946cdd63effa6584e1d5e8662cb44.1634308385.git.bristot@kernel.org Cc: Daniel Bristot de Oliveira Cc: Jonathan Corbet Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Sebastian Andrzej Siewior Cc: x86@kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Fixes: 54357f0c9149 ("tracing: Add migrate-disabled counter to tracing output.") Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_osnoise.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 34f26c632442..d11b41784fac 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -378,11 +378,12 @@ static void print_timerlat_headers(struct seq_file *s) seq_puts(s, "# / _----=> need-resched\n"); seq_puts(s, "# | / _---=> hardirq/softirq\n"); seq_puts(s, "# || / _--=> preempt-depth\n"); - seq_puts(s, "# || /\n"); - seq_puts(s, "# |||| ACTIVATION\n"); - seq_puts(s, "# TASK-PID CPU# |||| TIMESTAMP ID "); - seq_puts(s, " CONTEXT LATENCY\n"); - seq_puts(s, "# | | | |||| | | "); + seq_puts(s, "# ||| / _-=> migrate-disable\n"); + seq_puts(s, "# |||| / delay\n"); + seq_puts(s, "# ||||| ACTIVATION\n"); + seq_puts(s, "# TASK-PID CPU# ||||| TIMESTAMP ID "); + seq_puts(s, " CONTEXT LATENCY\n"); + seq_puts(s, "# | | | ||||| | | "); seq_puts(s, " | |\n"); } #endif /* CONFIG_PREEMPT_RT */ -- cgit v1.2.3 From
3c20bd3af535d64771b193bb4dd41ed662c464ce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 15 Oct 2021 15:55:50 -0400 Subject: tracing: Fix missing trace_boot_init_histograms kstrdup NULL checks trace_boot_init_histograms misses NULL pointer checks for kstrdup failure. Link: https://lkml.kernel.org/r/20211015195550.22742-1-mathieu.desnoyers@efficios.com Fixes: 64dc7f6958ef5 ("tracing/boot: Show correct histogram error command") Acked-by: Masami Hiramatsu Signed-off-by: Mathieu Desnoyers Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_boot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 8d252f63cd78..0580287d7a0d 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -430,6 +430,8 @@ trace_boot_init_histograms(struct trace_event_file *file, /* All digit started node should be instances. */ if (trace_boot_compose_hist_cmd(node, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply hist trigger: %s\n", tmp); kfree(tmp); @@ -439,6 +441,8 @@ trace_boot_init_histograms(struct trace_event_file *file, if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return; if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply hist trigger: %s\n", tmp); kfree(tmp); -- cgit v1.2.3 From 1d6288914264a22c0efdfb3a5748c101c0d12baa Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Thu, 21 Oct 2021 11:52:25 +0800 Subject: tracing/hwlat: Make some internal symbols static The sparse tool complains as follows: kernel/trace/trace_hwlat.c:82:27: warning: symbol 'hwlat_single_cpu_data' was not declared. Should it be static? kernel/trace/trace_hwlat.c:83:1: warning: symbol '__pcpu_scope_hwlat_per_cpu_data' was not declared. Should it be static? 
These symbols are not used outside of trace_hwlat.c, so this commit marks them static. Link: https://lkml.kernel.org/r/20211021035225.1050685-1-bobo.shaobowang@huawei.com Signed-off-by: Wang ShaoBo Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index d0a730d99a33..56bb7b890578 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -79,8 +79,8 @@ struct hwlat_kthread_data { int nmi_cpu; }; -struct hwlat_kthread_data hwlat_single_cpu_data; -DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); +static struct hwlat_kthread_data hwlat_single_cpu_data; +static DEFINE_PER_CPU(struct hwlat_kthread_data, hwlat_per_cpu_data); /* Tells NMIs to call back to the hwlat tracer to record timestamps */ bool trace_hwlat_callback_enabled; -- cgit v1.2.3 From 52cfb373536a7fb744b0ec4b748518e5dc874fb7 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:33 -0700 Subject: tracing: Add support for creating hist trigger variables from literal Currently hist trigger expressions don't support the use of numeric literals: e.g. echo 'hist:keys=common_pid:x=$y-1234' --> is not valid expression syntax Having the ability to use numeric constants in hist triggers supports a wider range of expressions for creating variables. Add support for creating trace event histogram variables from numeric literals. e.g. echo 'hist:keys=common_pid:x=1234,y=size-1024' >> event/trigger A negative numeric constant is created using the unary minus operator (parentheses are required). e.g. echo 'hist:keys=common_pid:z=-(2)' >> event/trigger Constants can be used with division/multiplication (added in the next patch in this series) to implement granularity filters for frequent trace events.
For instance we can limit emitting the rss_stat trace event to when there is a 512KB cross over in the rss size: # Create a synthetic event to monitor instead of the high frequency # rss_stat event echo 'rss_stat_throttled unsigned int mm_id; unsigned int curr; int member; long size' >> tracing/synthetic_events # Create a hist trigger that emits the synthetic rss_stat_throttled # event only when the rss size crosses a 512KB boundary. echo 'hist:keys=keys=mm_id,member:bucket=size/0x80000:onchange($bucket) .rss_stat_throttled(mm_id,curr,member,size)' >> events/kmem/rss_stat/trigger A use case for using constants with addition/subtraction is not yet known, but for completeness the use of constants are supported for all operators. Link: https://lkml.kernel.org/r/20211025200852.3002369-2-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 71 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index b64aed538628..e6165e36d3b6 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -66,7 +66,8 @@ C(EMPTY_SORT_FIELD, "Empty sort field"), \ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ - C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), + C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ + C(EXPECT_NUMBER, "Expecting numeric literal"), #undef C #define C(a, b) HIST_ERR_##a @@ -89,6 +90,7 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field, #define HIST_FIELD_OPERANDS_MAX 2 #define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX) #define HIST_ACTIONS_MAX 8 +#define HIST_CONST_DIGITS_MAX 21 enum field_op_id { FIELD_OP_NONE, @@ -152,6 +154,9 @@ struct hist_field { bool 
read_once; unsigned int var_str_idx; + + /* Numeric literals are represented as u64 */ + u64 constant; }; static u64 hist_field_none(struct hist_field *field, @@ -163,6 +168,15 @@ static u64 hist_field_none(struct hist_field *field, return 0; } +static u64 hist_field_const(struct hist_field *field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + return field->constant; +} + static u64 hist_field_counter(struct hist_field *field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -341,6 +355,7 @@ enum hist_field_flags { HIST_FIELD_FL_CPU = 1 << 15, HIST_FIELD_FL_ALIAS = 1 << 16, HIST_FIELD_FL_BUCKET = 1 << 17, + HIST_FIELD_FL_CONST = 1 << 18, }; struct var_defs { @@ -1516,6 +1531,12 @@ static void expr_field_str(struct hist_field *field, char *expr) { if (field->flags & HIST_FIELD_FL_VAR_REF) strcat(expr, "$"); + else if (field->flags & HIST_FIELD_FL_CONST) { + char str[HIST_CONST_DIGITS_MAX]; + + snprintf(str, HIST_CONST_DIGITS_MAX, "%llu", field->constant); + strcat(expr, str); + } strcat(expr, hist_field_name(field, 0)); @@ -1689,6 +1710,15 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; } + if (flags & HIST_FIELD_FL_CONST) { + hist_field->fn = hist_field_const; + hist_field->size = sizeof(u64); + hist_field->type = kstrdup("u64", GFP_KERNEL); + if (!hist_field->type) + goto free; + goto out; + } + if (flags & HIST_FIELD_FL_STACKTRACE) { hist_field->fn = hist_field_none; goto out; @@ -2090,6 +2120,29 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, return alias; } +static struct hist_field *parse_const(struct hist_trigger_data *hist_data, + char *str, char *var_name, + unsigned long *flags) +{ + struct trace_array *tr = hist_data->event_file->tr; + struct hist_field *field = NULL; + u64 constant; + + if (kstrtoull(str, 0, &constant)) { + hist_err(tr, HIST_ERR_EXPECT_NUMBER, errpos(str)); + return NULL; + } + + 
*flags |= HIST_FIELD_FL_CONST; + field = create_hist_field(hist_data, NULL, *flags, var_name); + if (!field) + return NULL; + + field->constant = constant; + + return field; +} + static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long *flags, char *var_name) @@ -2100,6 +2153,15 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, unsigned long buckets = 0; int ret = 0; + if (isdigit(str[0])) { + hist_field = parse_const(hist_data, str, var_name, flags); + if (!hist_field) { + ret = -EINVAL; + goto out; + } + return hist_field; + } + s = strchr(str, '.'); if (s) { s = strchr(++s, '.'); @@ -4945,6 +5007,8 @@ static void hist_field_debug_show_flags(struct seq_file *m, if (flags & HIST_FIELD_FL_ALIAS) seq_puts(m, " HIST_FIELD_FL_ALIAS\n"); + else if (flags & HIST_FIELD_FL_CONST) + seq_puts(m, " HIST_FIELD_FL_CONST\n"); } static int hist_field_debug_show(struct seq_file *m, @@ -4966,6 +5030,9 @@ static int hist_field_debug_show(struct seq_file *m, field->var.idx); } + if (field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, " constant: %llu\n", field->constant); + if (field->flags & HIST_FIELD_FL_ALIAS) seq_printf(m, " var_ref_idx (into hist_data->var_refs[]): %u\n", field->var_ref_idx); @@ -5208,6 +5275,8 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) if (hist_field->flags & HIST_FIELD_FL_CPU) seq_puts(m, "common_cpu"); + else if (hist_field->flags & HIST_FIELD_FL_CONST) + seq_printf(m, "%llu", hist_field->constant); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS) -- cgit v1.2.3 From bcef044150320217e2a00c65050114e509c222b8 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:34 -0700 Subject: tracing: Add division and multiplication support for hist triggers Adds basic support for division and multiplication operations for hist trigger variable 
expressions. For simplicity this patch only supports, division and multiplication for a single operation expression (e.g. x=$a/$b), as currently expressions are always evaluated right to left. This can lead to some incorrect results: e.g. echo 'hist:keys=common_pid:x=8-4-2' >> event/trigger 8-4-2 should evaluate to 2 i.e. (8-4)-2 but currently x evaluate to 6 i.e. 8-(4-2) Multiplication and division in sub-expressions will work correctly, once correct operator precedence support is added (See next patch in this series). For the undefined case of division by 0, the histogram expression evaluates to (u64)(-1). Since this cannot be detected when the expression is created, it is the responsibility of the user to be aware and account for this possibility. Examples: echo 'hist:keys=common_pid:a=8,b=4,x=$a/$b' \ >> event/trigger echo 'hist:keys=common_pid:y=5*$b' \ >> event/trigger Link: https://lkml.kernel.org/r/20211025200852.3002369-3-kaleshsingh@google.com Signed-off-by: Kalesh Singh Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 72 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index e6165e36d3b6..1edec5d471c1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -97,6 +97,8 @@ enum field_op_id { FIELD_OP_PLUS, FIELD_OP_MINUS, FIELD_OP_UNARY_MINUS, + FIELD_OP_DIV, + FIELD_OP_MULT, }; /* @@ -285,6 +287,40 @@ static u64 hist_field_minus(struct hist_field *hist_field, return val1 - val2; } +static u64 hist_field_div(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = 
operand2->fn(operand2, elt, buffer, rbe, event); + + /* Return -1 for the undefined case */ + if (!val2) + return -1; + + return div64_u64(val1, val2); +} + +static u64 hist_field_mult(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + struct hist_field *operand1 = hist_field->operands[0]; + struct hist_field *operand2 = hist_field->operands[1]; + + u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + + return val1 * val2; +} + static u64 hist_field_unary_minus(struct hist_field *hist_field, struct tracing_map_elt *elt, struct trace_buffer *buffer, @@ -1592,6 +1628,12 @@ static char *expr_str(struct hist_field *field, unsigned int level) case FIELD_OP_PLUS: strcat(expr, "+"); break; + case FIELD_OP_DIV: + strcat(expr, "/"); + break; + case FIELD_OP_MULT: + strcat(expr, "*"); + break; default: kfree(expr); return NULL; @@ -1607,7 +1649,7 @@ static int contains_operator(char *str) enum field_op_id field_op = FIELD_OP_NONE; char *op; - op = strpbrk(str, "+-"); + op = strpbrk(str, "+-/*"); if (!op) return FIELD_OP_NONE; @@ -1628,6 +1670,12 @@ static int contains_operator(char *str) case '+': field_op = FIELD_OP_PLUS; break; + case '/': + field_op = FIELD_OP_DIV; + break; + case '*': + field_op = FIELD_OP_MULT; + break; default: break; } @@ -2361,10 +2409,26 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, case FIELD_OP_PLUS: sep = "+"; break; + case FIELD_OP_DIV: + sep = "/"; + break; + case FIELD_OP_MULT: + sep = "*"; + break; default: goto free; } + /* + * Multiplication and division are only supported in single operator + * expressions, since the expression is always evaluated from right + * to left. 
+ */ + if ((field_op == FIELD_OP_DIV || field_op == FIELD_OP_MULT) && level > 0) { + hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); + return ERR_PTR(-EINVAL); + } + operand1_str = strsep(&str, sep); if (!operand1_str || !str) goto free; @@ -2436,6 +2500,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, case FIELD_OP_PLUS: expr->fn = hist_field_plus; break; + case FIELD_OP_DIV: + expr->fn = hist_field_div; + break; + case FIELD_OP_MULT: + expr->fn = hist_field_mult; + break; default: ret = -EINVAL; goto free; -- cgit v1.2.3 From 9710b2f341a0d96f35b911580639853cfda4677d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:35 -0700 Subject: tracing: Fix operator precedence for hist triggers expression The current histogram expression evaluation logic evaluates the expression from right to left. This can lead to incorrect results if the operations are not associative (as is the case for subtraction and, the now added, division operators). e.g. 16-8-4-2 should be 2 not 10 --> 16-8-4-2 = ((16-8)-4)-2 64/8/4/2 should be 1 not 16 --> 64/8/4/2 = ((64/8)/4)/2 Division and multiplication are currently limited to single operation expression due to operator precedence support not yet implemented. Rework the expression parsing to support the correct evaluation of expressions containing operators of different precedences; and fix the associativity error by evaluating expressions with operators of the same precedence from left to right. 
Examples: (1) echo 'hist:keys=common_pid:a=8,b=4,c=2,d=1,w=$a-$b-$c-$d' \ >> event/trigger (2) echo 'hist:keys=common_pid:x=$a/$b/3/2' >> event/trigger (3) echo 'hist:keys=common_pid:y=$a+10/$c*1024' >> event/trigger (4) echo 'hist:keys=common_pid:z=$a/$b+$c*$d' >> event/trigger Link: https://lkml.kernel.org/r/20211025200852.3002369-4-kaleshsingh@google.com Signed-off-by: Kalesh Singh Reviewed-by: Namhyung Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 210 ++++++++++++++++++++++++++------------- 1 file changed, 140 insertions(+), 70 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1edec5d471c1..7a50ea2ac6b1 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -67,7 +67,9 @@ C(TOO_MANY_SORT_FIELDS, "Too many sort fields (Max = 2)"), \ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ - C(EXPECT_NUMBER, "Expecting numeric literal"), + C(EXPECT_NUMBER, "Expecting numeric literal"), \ + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ + C(SYM_OFFSET_SUBEXPR, ".sym-offset not supported in sub-expressions"), #undef C #define C(a, b) HIST_ERR_##a @@ -1644,40 +1646,96 @@ static char *expr_str(struct hist_field *field, unsigned int level) return expr; } -static int contains_operator(char *str) +/* + * If field_op != FIELD_OP_NONE, *sep points to the root operator + * of the expression tree to be evaluated. + */ +static int contains_operator(char *str, char **sep) { enum field_op_id field_op = FIELD_OP_NONE; - char *op; + char *minus_op, *plus_op, *div_op, *mult_op; + + + /* + * Report the last occurrence of the operators first, so that the + * expression is evaluated left to right. This is important since + * subtraction and division are not associative. 
+ * + * e.g + * 64/8/4/2 is 1, i.e 64/8/4/2 = ((64/8)/4)/2 + * 14-7-5-2 is 0, i.e 14-7-5-2 = ((14-7)-5)-2 + */ - op = strpbrk(str, "+-/*"); - if (!op) - return FIELD_OP_NONE; + /* + * First, find lower precedence addition and subtraction + * since the expression will be evaluated recursively. + */ + minus_op = strrchr(str, '-'); + if (minus_op) { + /* Unfortunately, the modifier ".sym-offset" can confuse things. */ + if (minus_op - str >= 4 && !strncmp(minus_op - 4, ".sym-offset", 11)) + goto out; - switch (*op) { - case '-': /* - * Unfortunately, the modifier ".sym-offset" - * can confuse things. + * Unary minus is not supported in sub-expressions. If + * present, it is always the next root operator. */ - if (op - str >= 4 && !strncmp(op - 4, ".sym-offset", 11)) - return FIELD_OP_NONE; - - if (*str == '-') + if (minus_op == str) { field_op = FIELD_OP_UNARY_MINUS; - else - field_op = FIELD_OP_MINUS; - break; - case '+': - field_op = FIELD_OP_PLUS; - break; - case '/': + goto out; + } + + field_op = FIELD_OP_MINUS; + } + + plus_op = strrchr(str, '+'); + if (plus_op || minus_op) { + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. + */ + if (plus_op > minus_op) + field_op = FIELD_OP_PLUS; + goto out; + } + + /* + * Multiplication and division have higher precedence than addition and + * subtraction. + */ + div_op = strrchr(str, '/'); + if (div_op) field_op = FIELD_OP_DIV; - break; - case '*': + + mult_op = strrchr(str, '*'); + /* + * For operators of the same precedence use to rightmost as the + * root, so that the expression is evaluated left to right. 
+ */ + if (mult_op > div_op) field_op = FIELD_OP_MULT; - break; - default: - break; + +out: + if (sep) { + switch (field_op) { + case FIELD_OP_UNARY_MINUS: + case FIELD_OP_MINUS: + *sep = minus_op; + break; + case FIELD_OP_PLUS: + *sep = plus_op; + break; + case FIELD_OP_DIV: + *sep = div_op; + break; + case FIELD_OP_MULT: + *sep = mult_op; + break; + case FIELD_OP_NONE: + default: + *sep = NULL; + break; + } } return field_op; @@ -2003,7 +2061,7 @@ static char *field_name_from_var(struct hist_trigger_data *hist_data, if (strcmp(var_name, name) == 0) { field = hist_data->attrs->var_defs.expr[i]; - if (contains_operator(field) || is_var_ref(field)) + if (contains_operator(field, NULL) || is_var_ref(field)) continue; return field; } @@ -2266,21 +2324,24 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level); + char *var_name, unsigned int *n_subexprs); static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1, *expr = NULL; unsigned long operand_flags; int ret = 0; char *s; + /* Unary minus operator, increment n_subexprs */ + ++*n_subexprs; + /* we support only -(xxx) i.e. 
explicit parens required */ - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); ret = -EINVAL; goto free; @@ -2297,8 +2358,16 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } s = strrchr(str, ')'); - if (s) + if (s) { + /* unary minus not supported in sub-expressions */ + if (*(s+1) != '\0') { + hist_err(file->tr, HIST_ERR_UNARY_MINUS_SUBEXPR, + errpos(str)); + ret = -EINVAL; + goto free; + } *s = '\0'; + } else { ret = -EINVAL; /* no closing ')' */ goto free; @@ -2312,7 +2381,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, } operand_flags = 0; - operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand1 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); goto free; @@ -2382,60 +2451,61 @@ static int check_expr_operands(struct trace_array *tr, static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, struct trace_event_file *file, char *str, unsigned long flags, - char *var_name, unsigned int level) + char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; unsigned long operand_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - if (level > 3) { + if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); return ERR_PTR(-EINVAL); } - field_op = contains_operator(str); + /* + * ".sym-offset" in expressions has no effect on their evaluation, + * but can confuse operator parsing. 
+ */ + if (*n_subexprs == 0) { + sep = strstr(str, ".sym-offset"); + if (sep) { + *sep = '\0'; + if (strpbrk(str, "+-/*") || strpbrk(sep + 11, "+-/*")) { + *sep = '.'; + hist_err(file->tr, HIST_ERR_SYM_OFFSET_SUBEXPR, + errpos(sep)); + return ERR_PTR(-EINVAL); + } + *sep = '.'; + } + } + + field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) return parse_atom(hist_data, file, str, &flags, var_name); if (field_op == FIELD_OP_UNARY_MINUS) - return parse_unary(hist_data, file, str, flags, var_name, ++level); + return parse_unary(hist_data, file, str, flags, var_name, n_subexprs); - switch (field_op) { - case FIELD_OP_MINUS: - sep = "-"; - break; - case FIELD_OP_PLUS: - sep = "+"; - break; - case FIELD_OP_DIV: - sep = "/"; - break; - case FIELD_OP_MULT: - sep = "*"; - break; - default: - goto free; - } + /* Binary operator found, increment n_subexprs */ + ++*n_subexprs; - /* - * Multiplication and division are only supported in single operator - * expressions, since the expression is always evaluated from right - * to left. - */ - if ((field_op == FIELD_OP_DIV || field_op == FIELD_OP_MULT) && level > 0) { - hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); - return ERR_PTR(-EINVAL); - } + /* Split the expression string at the root operator */ + if (!sep) + goto free; + *sep = '\0'; + operand1_str = str; + str = sep+1; - operand1_str = strsep(&str, sep); if (!operand1_str || !str) goto free; operand_flags = 0; - operand1 = parse_atom(hist_data, file, operand1_str, - &operand_flags, NULL); + + /* LHS of string is an expression e.g. a+b in a+b+c */ + operand1 = parse_expr(hist_data, file, operand1_str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand1)) { ret = PTR_ERR(operand1); operand1 = NULL; @@ -2447,9 +2517,9 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - /* rest of string could be another expression e.g. b+c in a+b+c */ + /* RHS of string is another expression e.g. 
c in a+b+c */ operand_flags = 0; - operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, ++level); + operand2 = parse_expr(hist_data, file, str, operand_flags, NULL, n_subexprs); if (IS_ERR(operand2)) { ret = PTR_ERR(operand2); operand2 = NULL; @@ -3883,9 +3953,9 @@ static int __create_val_field(struct hist_trigger_data *hist_data, unsigned long flags) { struct hist_field *hist_field; - int ret = 0; + int ret = 0, n_subexprs = 0; - hist_field = parse_expr(hist_data, file, field_str, flags, var_name, 0); + hist_field = parse_expr(hist_data, file, field_str, flags, var_name, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; @@ -4026,7 +4096,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, struct hist_field *hist_field = NULL; unsigned long flags = 0; unsigned int key_size; - int ret = 0; + int ret = 0, n_subexprs = 0; if (WARN_ON(key_idx >= HIST_FIELDS_MAX)) return -EINVAL; @@ -4039,7 +4109,7 @@ static int create_key_field(struct hist_trigger_data *hist_data, hist_field = create_hist_field(hist_data, NULL, flags, NULL); } else { hist_field = parse_expr(hist_data, file, field_str, flags, - NULL, 0); + NULL, &n_subexprs); if (IS_ERR(hist_field)) { ret = PTR_ERR(hist_field); goto out; -- cgit v1.2.3 From c5eac6ee8bc5d32e48b3845472b547574061f49f Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:36 -0700 Subject: tracing/histogram: Simplify handling of .sym-offset in expressions The '-' in .sym-offset can confuse the hist trigger arithmetic expression parsing. Simplify the handling of this by replacing the 'sym-offset' with 'symXoffset'. This allows us to correctly evaluate expressions where the user may have inadvertently added a .sym-offset modifier to one of the operands in an expression, instead of bailing out. In this case the .sym-offset has no effect on the evaluation of the expression. The only valid use of the .sym-offset is as a hist key modifier. 
Link: https://lkml.kernel.org/r/20211025200852.3002369-5-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 43 ++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7a50ea2ac6b1..bbaf2e16b7ae 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -68,8 +68,7 @@ C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \ C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \ C(EXPECT_NUMBER, "Expecting numeric literal"), \ - C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \ - C(SYM_OFFSET_SUBEXPR, ".sym-offset not supported in sub-expressions"), + C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), #undef C #define C(a, b) HIST_ERR_##a @@ -1672,10 +1671,6 @@ static int contains_operator(char *str, char **sep) */ minus_op = strrchr(str, '-'); if (minus_op) { - /* Unfortunately, the modifier ".sym-offset" can confuse things. */ - if (minus_op - str >= 4 && !strncmp(minus_op - 4, ".sym-offset", 11)) - goto out; - /* * Unary minus is not supported in sub-expressions. If * present, it is always the next root operator. @@ -2138,7 +2133,11 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, *flags |= HIST_FIELD_FL_HEX; else if (strcmp(modifier, "sym") == 0) *flags |= HIST_FIELD_FL_SYM; - else if (strcmp(modifier, "sym-offset") == 0) + /* + * 'sym-offset' occurrences in the trigger string are modified + * to 'symXoffset' to simplify arithmetic expression parsing. 
+ */ + else if (strcmp(modifier, "symXoffset") == 0) *flags |= HIST_FIELD_FL_SYM_OFFSET; else if ((strcmp(modifier, "execname") == 0) && (strcmp(field_name, "common_pid") == 0)) @@ -2463,24 +2462,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, return ERR_PTR(-EINVAL); } - /* - * ".sym-offset" in expressions has no effect on their evaluation, - * but can confuse operator parsing. - */ - if (*n_subexprs == 0) { - sep = strstr(str, ".sym-offset"); - if (sep) { - *sep = '\0'; - if (strpbrk(str, "+-/*") || strpbrk(sep + 11, "+-/*")) { - *sep = '.'; - hist_err(file->tr, HIST_ERR_SYM_OFFSET_SUBEXPR, - errpos(sep)); - return ERR_PTR(-EINVAL); - } - *sep = '.'; - } - } - field_op = contains_operator(str, &sep); if (field_op == FIELD_OP_NONE) @@ -5999,7 +5980,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, struct synth_event *se; const char *se_name; bool remove = false; - char *trigger, *p; + char *trigger, *p, *start; int ret = 0; lockdep_assert_held(&event_mutex); @@ -6047,6 +6028,16 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, trigger = strstrip(trigger); } + /* + * To simplify arithmetic expression parsing, replace occurrences of + * '.sym-offset' modifier with '.symXoffset' + */ + start = strstr(trigger, ".sym-offset"); + while (start) { + *(start + 4) = 'X'; + start = strstr(start + 11, ".sym-offset"); + }; + attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) return PTR_ERR(attrs); -- cgit v1.2.3 From f47716b7a955e40e2591b960d1eccb1fde967a70 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:37 -0700 Subject: tracing/histogram: Covert expr to const if both operands are constants If both operands of a hist trigger expression are constants, convert the expression to a constant. 
This optimization avoids having to perform the same calculation multiple times and also saves on memory since the merged constants are represented by a single struct hist_field instead of multiple. Link: https://lkml.kernel.org/r/20211025200852.3002369-6-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 104 ++++++++++++++++++++++++++++----------- 1 file changed, 74 insertions(+), 30 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index bbaf2e16b7ae..71b453576d85 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2411,9 +2411,15 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, return ERR_PTR(ret); } +/* + * If the operands are var refs, return pointers to the + * variable(s) referenced in var1 and var2, else NULL. + */ static int check_expr_operands(struct trace_array *tr, struct hist_field *operand1, - struct hist_field *operand2) + struct hist_field *operand2, + struct hist_field **var1, + struct hist_field **var2) { unsigned long operand1_flags = operand1->flags; unsigned long operand2_flags = operand2->flags; @@ -2426,6 +2432,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand1_flags = var->flags; + *var1 = var; } if ((operand2_flags & HIST_FIELD_FL_VAR_REF) || @@ -2436,6 +2443,7 @@ static int check_expr_operands(struct trace_array *tr, if (!var) return -EINVAL; operand2_flags = var->flags; + *var2 = var; } if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != @@ -2453,9 +2461,12 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, char *var_name, unsigned int *n_subexprs) { struct hist_field *operand1 = NULL, *operand2 = NULL, *expr = NULL; - unsigned long operand_flags; + struct hist_field *var1 = NULL, *var2 = NULL; + unsigned long operand_flags, 
operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; + hist_field_fn_t op_fn; + bool combine_consts; if (*n_subexprs > 3) { hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); @@ -2512,11 +2523,38 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, goto free; } - ret = check_expr_operands(file->tr, operand1, operand2); + switch (field_op) { + case FIELD_OP_MINUS: + op_fn = hist_field_minus; + break; + case FIELD_OP_PLUS: + op_fn = hist_field_plus; + break; + case FIELD_OP_DIV: + op_fn = hist_field_div; + break; + case FIELD_OP_MULT: + op_fn = hist_field_mult; + break; + default: + ret = -EINVAL; + goto free; + } + + ret = check_expr_operands(file->tr, operand1, operand2, &var1, &var2); if (ret) goto free; - flags |= HIST_FIELD_FL_EXPR; + operand_flags = var1 ? var1->flags : operand1->flags; + operand2_flags = var2 ? var2->flags : operand2->flags; + + /* + * If both operands are constant, the expression can be + * collapsed to a single constant. + */ + combine_consts = operand_flags & operand2_flags & HIST_FIELD_FL_CONST; + + flags |= combine_consts ? 
HIST_FIELD_FL_CONST : HIST_FIELD_FL_EXPR; flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); @@ -2533,37 +2571,43 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->operands[0] = operand1; expr->operands[1] = operand2; - /* The operand sizes should be the same, so just pick one */ - expr->size = operand1->size; + if (combine_consts) { + if (var1) + expr->operands[0] = var1; + if (var2) + expr->operands[1] = var2; - expr->operator = field_op; - expr->name = expr_str(expr, 0); - expr->type = kstrdup_const(operand1->type, GFP_KERNEL); - if (!expr->type) { - ret = -ENOMEM; - goto free; - } + expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); - switch (field_op) { - case FIELD_OP_MINUS: - expr->fn = hist_field_minus; - break; - case FIELD_OP_PLUS: - expr->fn = hist_field_plus; - break; - case FIELD_OP_DIV: - expr->fn = hist_field_div; - break; - case FIELD_OP_MULT: - expr->fn = hist_field_mult; - break; - default: - ret = -EINVAL; - goto free; + expr->operands[0] = NULL; + expr->operands[1] = NULL; + + /* + * var refs won't be destroyed immediately + * See: destroy_hist_field() + */ + destroy_hist_field(operand2, 0); + destroy_hist_field(operand1, 0); + + expr->name = expr_str(expr, 0); + } else { + expr->fn = op_fn; + + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + + expr->operator = field_op; + expr->type = kstrdup_const(operand1->type, GFP_KERNEL); + if (!expr->type) { + ret = -ENOMEM; + goto free; + } + + expr->name = expr_str(expr, 0); } return expr; - free: +free: destroy_hist_field(operand1, 0); destroy_hist_field(operand2, 0); destroy_hist_field(expr, 0); -- cgit v1.2.3 From 722eddaa4043acee8f031cf238ced5f7514ad638 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Mon, 25 Oct 2021 13:08:38 -0700 Subject: tracing/histogram: Optimize division by a power of 2 The division is a slow operation. If the divisor is a power of 2, use a shift instead. 
Results were obtained using Android's version of perf (simpleperf[1]) as described below: 1. hist_field_div() is modified to call 2 test functions: test_hist_field_div_[not]_optimized(); passing them the same args. Use noinline and volatile to ensure these are not optimized out by the compiler. 2. Create a hist event trigger that uses division: events/kmem/rss_stat$ echo 'hist:keys=common_pid:x=size/<divisor>' >> trigger events/kmem/rss_stat$ echo 'hist:keys=common_pid:vals=$x' >> trigger 3. Run Android's lmkd_test[2] to generate rss_stat events, and record CPU samples with Android's simpleperf: simpleperf record -a --exclude-perf --post-unwind=yes -m 16384 -g -f 2000 -o perf.data == Results == Divisor is a power of 2 (divisor == 32): test_hist_field_div_not_optimized | 8,717,091 cpu-cycles test_hist_field_div_optimized | 1,643,137 cpu-cycles If the divisor is a power of 2, the optimized version is ~5.3x faster. Divisor is not a power of 2 (divisor == 33): test_hist_field_div_not_optimized | 4,444,324 cpu-cycles test_hist_field_div_optimized | 5,497,958 cpu-cycles If the divisor is not a power of 2, as expected, the optimized version is slightly slower (~24% slower). 
[1] https://android.googlesource.com/platform/system/extras/+/master/simpleperf/doc/README.md [2] https://cs.android.com/android/platform/superproject/+/master:system/memory/lmkd/tests/lmkd_test.cpp Link: https://lkml.kernel.org/r/20211025200852.3002369-7-kaleshsingh@google.com Signed-off-by: Kalesh Singh Suggested-by: Steven Rostedt Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 71b453576d85..452daad7cfb3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -304,6 +304,10 @@ static u64 hist_field_div(struct hist_field *hist_field, if (!val2) return -1; + /* Use shift if the divisor is a power of 2 */ + if (!(val2 & (val2 - 1))) + return val1 >> __ffs64(val2); + return div64_u64(val1, val2); } -- cgit v1.2.3 From ce5e48036c9e76a2a5bd4d9079eac273087a533a Mon Sep 17 00:00:00 2001 From: 王贇 Date: Wed, 27 Oct 2021 11:14:44 +0800 Subject: ftrace: disable preemption when recursion locked As the documentation explains, ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() are supposed to disable and enable preemption properly. However, this work is currently done outside of these functions, so a caller can forget it by mistake. And since the internal users of trace_test_and_set_recursion() and trace_clear_recursion() also require preemption to be disabled, we can just merge the logic. This patch makes sure that preemption is disabled when trace_test_and_set_recursion() returns bit >= 0, and that trace_clear_recursion() re-enables preemption if it was previously enabled. Link: https://lkml.kernel.org/r/13bde807-779c-aa4c-0672-20515ae365ea@linux.alibaba.com CC: Petr Mladek Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: Nicholas Piggin Cc: Jisheng Zhang CC: Steven Rostedt CC: Miroslav Benes Reported-by: Abaci Suggested-by: Peter Zijlstra Signed-off-by: Michael Wang [ Removed extra line in comment - SDR ] Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 2 -- arch/parisc/kernel/ftrace.c | 2 -- arch/powerpc/kernel/kprobes-ftrace.c | 2 -- arch/riscv/kernel/probes/ftrace.c | 2 -- arch/x86/kernel/kprobes/ftrace.c | 2 -- include/linux/trace_recursion.h | 11 ++++++++++- kernel/livepatch/patch.c | 12 ++++++------ kernel/trace/ftrace.c | 15 +++++---------- kernel/trace/trace_functions.c | 5 ----- 9 files changed, 21 insertions(+), 32 deletions(-) (limited to 'kernel/trace') diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index b388228abbf2..834cffcfbce3 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -17,7 +17,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, return; regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { p = get_kprobe((kprobe_opcode_t *)(ip - MCOUNT_INSN_SIZE)); @@ -57,7 +56,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 01581f715737..b14011d3c2f1 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -211,7 +211,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, return; regs = ftrace_get_regs(fregs); - 
preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -240,7 +239,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, } __this_cpu_write(current_kprobe, NULL); out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 7154d58338cc..072ebe7f290b 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -26,7 +26,6 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, return; regs = ftrace_get_regs(fregs); - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -61,7 +60,6 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/riscv/kernel/probes/ftrace.c b/arch/riscv/kernel/probes/ftrace.c index aab85a82f419..7142ec42e889 100644 --- a/arch/riscv/kernel/probes/ftrace.c +++ b/arch/riscv/kernel/probes/ftrace.c @@ -15,7 +15,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -52,7 +51,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 596de2f6d3a5..dd2ec14adb77 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -25,7 +25,6 @@ void kprobe_ftrace_handler(unsigned long ip, 
unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) goto out; @@ -59,7 +58,6 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, __this_cpu_write(current_kprobe, NULL); } out: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } NOKPROBE_SYMBOL(kprobe_ftrace_handler); diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index 24f284eb55a7..a13f23b04d73 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -155,6 +155,9 @@ extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); # define do_ftrace_record_recursion(ip, pip) do { } while (0) #endif +/* + * Preemption is promised to be disabled when return bit >= 0. + */ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, int start, int max) { @@ -189,14 +192,20 @@ static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsign current->trace_recursion = val; barrier(); + preempt_disable_notrace(); + return bit + 1; } +/* + * Preemption will be enabled (if it was previously enabled). + */ static __always_inline void trace_clear_recursion(int bit) { if (!bit) return; + preempt_enable_notrace(); barrier(); bit--; trace_recursion_clear(bit); @@ -209,7 +218,7 @@ static __always_inline void trace_clear_recursion(int bit) * tracing recursed in the same context (normal vs interrupt), * * Returns: -1 if a recursion happened. - * >= 0 if no recursion + * >= 0 if no recursion. 
*/ static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, unsigned long parent_ip) diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index e8029aea67f1..fe316c021d73 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); + /* + * The ftrace_test_recursion_trylock() will disable preemption, + * which is required for the variant of synchronize_rcu() that is + * used to allow patching functions where RCU is not watching. + * See klp_synchronize_transition() for more details. + */ bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; - /* - * A variant of synchronize_rcu() is used to allow patching functions - * where RCU is not watching, see klp_synchronize_transition(). - */ - preempt_disable_notrace(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); @@ -120,7 +121,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: - preempt_enable_notrace(); ftrace_test_recursion_unlock(bit); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2057ad363772..b4ed1a301232 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -7198,16 +7198,15 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; + /* + * The ftrace_test_and_set_recursion() will disable preemption, + * which is required since some of the ops may be dynamically + * allocated, they must be freed after a synchronize_rcu(). + */ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_rcu(). 
- */ - preempt_disable_notrace(); - do_for_each_ftrace_op(op, ftrace_ops_list) { /* Stub functions don't need to be called nor tested */ if (op->flags & FTRACE_OPS_FL_STUB) @@ -7231,7 +7230,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, } } while_for_each_ftrace_op(op); out: - preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -7279,12 +7277,9 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); - if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) op->func(ip, parent_ip, op, fregs); - preempt_enable_notrace(); trace_clear_recursion(bit); } NOKPROBE_SYMBOL(ftrace_ops_assist_func); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 1f0e63f5d1f9..9f1bfbe105e8 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -186,7 +186,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, return; trace_ctx = tracing_gen_ctx(); - preempt_disable_notrace(); cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); @@ -194,7 +193,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, trace_function(tr, ip, parent_ip, trace_ctx); ftrace_test_recursion_unlock(bit); - preempt_enable_notrace(); } #ifdef CONFIG_UNWINDER_ORC @@ -298,8 +296,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; - preempt_disable_notrace(); - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (atomic_read(&data->disabled)) @@ -324,7 +320,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip, out: ftrace_test_recursion_unlock(bit); - preempt_enable_notrace(); } static void -- cgit v1.2.3 From d33cc657372366a8959f099c619a208b4c5dc664 Mon Sep 17 00:00:00 2001 From: 王贇 Date: Wed, 27 Oct 2021 11:15:11 +0800 Subject: ftrace: do CPU checking after preemption disabled With CONFIG_DEBUG_PREEMPT we 
observed reports like: BUG: using smp_processor_id() in preemptible caller is perf_ftrace_function_call+0x6f/0x2e0 CPU: 1 PID: 680 Comm: a.out Not tainted Call Trace: dump_stack_lvl+0x8d/0xcf check_preemption_disabled+0x104/0x110 ? optimize_nops.isra.7+0x230/0x230 ? text_poke_bp_batch+0x9f/0x310 perf_ftrace_function_call+0x6f/0x2e0 ... __text_poke+0x5/0x620 text_poke_bp_batch+0x9f/0x310 This tells us that the CPU can change once the task is preempted, so a CPU check performed before preemption is disabled is not reliable. Since ftrace_test_recursion_trylock() now disables preemption, this patch simply moves the check after trylock() to address the issue. Link: https://lkml.kernel.org/r/54880691-5fe2-33e7-d12f-1fa6136f5183@linux.alibaba.com CC: Steven Rostedt Cc: Guo Ren Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Masami Hiramatsu Cc: "Peter Zijlstra (Intel)" Cc: Nicholas Piggin Cc: Jisheng Zhang Reported-by: Abaci Signed-off-by: Michael Wang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 6aed10e2f7ce..fba8cb77a73a 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -441,13 +441,13 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if (!rcu_is_watching()) return; - if ((unsigned long)ops->private != smp_processor_id()) - return; - bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + if ((unsigned long)ops->private != smp_processor_id()) + goto out; + event = container_of(ops, struct perf_event, ftrace_ops); /* -- cgit v1.2.3 From a90afe8d020da9298c98fddb19b7a6372e2feb45 Mon Sep 17 00:00:00 2001 From: "Robin H. Johnson" Date: Mon, 30 Aug 2021 21:37:22 -0700 Subject: tracing: Show size of requested perf buffer If the perf buffer isn't large enough, provide a hint about how large it needs to be for whatever is running. Link: https://lkml.kernel.org/r/20210831043723.13481-1-robbat2@gentoo.org Signed-off-by: Robin H. 
Johnson Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fba8cb77a73a..a114549720d6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -400,7 +400,8 @@ void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp) BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, - "perf buffer not large enough")) + "perf buffer not large enough, wanted %d, have %d", + size, PERF_MAX_TRACE_SIZE)) return NULL; *rctxp = rctx = perf_swevent_get_recursion_context(); -- cgit v1.2.3 From feea69ec121f067073868cebe0cb9d003e64ad80 Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Sat, 30 Oct 2021 08:56:15 +0800 Subject: tracing/histogram: Fix semicolon.cocci warnings kernel/trace/trace_events_hist.c:6039:2-3: Unneeded semicolon Remove unneeded semicolon. Generated by: scripts/coccinelle/misc/semicolon.cocci Link: https://lkml.kernel.org/r/20211030005615.GA41257@3074f0d39c61 Fixes: c5eac6ee8bc5 ("tracing/histogram: Simplify handling of .sym-offset in expressions") CC: Kalesh Singh Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_hist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/trace') diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 452daad7cfb3..682870d004c4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -6084,7 +6084,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, while (start) { *(start + 4) = 'X'; start = strstr(start + 11, ".sym-offset"); - }; + } attrs = parse_hist_trigger_attrs(file->tr, trigger); if (IS_ERR(attrs)) -- cgit v1.2.3