From 17d9ddc72fb8bba0d4f67868c9c612e472a594a9 Mon Sep 17 00:00:00 2001
From: "Pallipadi, Venkatesh"
Date: Wed, 10 Feb 2010 15:23:44 -0800
Subject: rbtree: Add support for augmented rbtrees

Add support for augmented rbtrees in core rbtree code. This will be used
in subsequent patches, in x86 PAT code, which needs interval trees to
efficiently keep track of PAT ranges.

Signed-off-by: Venkatesh Pallipadi
LKML-Reference: <20100210232343.GA11465@linux-os.sc.intel.com>
Signed-off-by: Suresh Siddha
Signed-off-by: H. Peter Anvin
---
 Documentation/rbtree.txt | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 (limited to 'Documentation')

diff --git a/Documentation/rbtree.txt b/Documentation/rbtree.txt
index aae8355d316..221f38be98f 100644
--- a/Documentation/rbtree.txt
+++ b/Documentation/rbtree.txt
@@ -190,3 +190,61 @@ Example:
   for (node = rb_first(&mytree); node; node = rb_next(node))
 	printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring);
 
+Support for Augmented rbtrees
+-----------------------------
+
+An augmented rbtree is an rbtree with some additional data stored in each node.
+This data can be used to add new functionality to the rbtree.
+Augmented rbtree is an optional feature built on top of the basic rbtree
+infrastructure. An rbtree user who wants this feature must initialize an
+augment callback function in rb_root.
+
+This callback function will be called from the rbtree core routines whenever
+a node has a change in one or both of its children. It is the responsibility
+of the callback function to recalculate the additional data that is in the
+rb node using the new children information. Note that if this new additional
+data affects the parent node's additional data, then the callback function has
+to handle it and do the recursive updates.
+
+
+An interval tree is an example of an augmented rb tree. Reference:
+"Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein.
+More details about interval trees:
+
+A classical rbtree has a single key and cannot be directly used to store
+interval ranges like [lo:hi] and do a quick lookup for any overlap with a new
+lo:hi, or to find whether there is an exact match for a new lo:hi.
+
+However, an rbtree can be augmented to store such interval ranges in a
+structured way, making efficient lookup and exact matching possible.
+
+The "extra information" stored in each node is the maximum hi
+(max_hi) value among all the nodes that are its descendants. This
+information can be maintained at each node just by looking at the node
+and its immediate children. It is used for an O(log n) lookup
+of the lowest match (lowest start address among all possible matches)
+with something like:
+
+find_lowest_match(lo, hi, node)
+{
+	lowest_match = NULL;
+	while (node) {
+		if (max_hi(node->left) > lo) {
+			// Lowest overlap if any must be on left side
+			node = node->left;
+		} else if (overlap(lo, hi, node)) {
+			lowest_match = node;
+			break;
+		} else if (lo > node->lo) {
+			// Lowest overlap if any must be on right side
+			node = node->right;
+		} else {
+			break;
+		}
+	}
+	return lowest_match;
+}
+
+An exact match is found by first finding the lowest match and then following
+successor nodes looking for an exact match, until the start of a node is beyond
+the hi value we are looking for.
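As a concrete illustration of the callback contract described above, the
following is a minimal sketch of an interval-tree augment callback that
maintains max_hi. The structure, field and function names here are
illustrative only and assume the callback is handed the rb_node whose
children just changed; they are not part of this patch.

	#include <linux/rbtree.h>

	struct interval_node {
		struct rb_node rb;	/* embedded rbtree node */
		unsigned long lo, hi;	/* the interval [lo:hi] */
		unsigned long max_hi;	/* max hi over this node's subtree */
	};

	static unsigned long node_max_hi(struct rb_node *rb)
	{
		return rb ? rb_entry(rb, struct interval_node, rb)->max_hi : 0;
	}

	/* Recompute max_hi from the node and its immediate children. */
	static void interval_augment_cb(struct rb_node *rb)
	{
		struct interval_node *n = rb_entry(rb, struct interval_node, rb);
		unsigned long m = n->hi;

		if (node_max_hi(rb->rb_left) > m)
			m = node_max_hi(rb->rb_left);
		if (node_max_hi(rb->rb_right) > m)
			m = node_max_hi(rb->rb_right);
		n->max_hi = m;
	}

If the recomputed max_hi differs from the old value, the same recalculation
has to be propagated toward the root, as the documentation above notes.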
-- cgit v1.2.3 From 5cc718b9dad682329a60e73547c6e708faa5bbe4 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 15 Mar 2010 13:00:54 -0400 Subject: kprobes: Hide CONFIG_OPTPROBES and set if arch supports optimized kprobes Hide CONFIG_OPTPROBES and set if the arch supports optimized kprobes (IOW, HAVE_OPTPROBES=y), since this option doesn't change the major behavior of kprobes, and workarounds for minor changes are documented. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Dieter Ries Cc: Ananth N Mavinakayanahalli Cc: OGAWA Hirofumi Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20100315170054.31593.3153.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- Documentation/kprobes.txt | 10 ++-------- arch/Kconfig | 9 ++------- 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 2f9115c0ae6..61c291cddf1 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -165,8 +165,8 @@ the user entry_handler invocation is also skipped. 1.4 How Does Jump Optimization Work? -If you configured your kernel with CONFIG_OPTPROBES=y (currently -this option is supported on x86/x86-64, non-preemptive kernel) and +If your kernel is built with CONFIG_OPTPROBES=y (currently this flag +is automatically set 'y' on x86/x86-64, non-preemptive kernel) and the "debug.kprobes_optimization" kernel parameter is set to 1 (see sysctl(8)), Kprobes tries to reduce probe-hit overhead by using a jump instruction instead of a breakpoint instruction at each probepoint. @@ -271,8 +271,6 @@ tweak the kernel's execution path, you need to suppress optimization, using one of the following techniques: - Specify an empty function for the kprobe's post_handler or break_handler. or -- Config CONFIG_OPTPROBES=n. - or - Execute 'sysctl -w debug.kprobes_optimization=n' 2. Architectures Supported @@ -307,10 +305,6 @@ it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO), so you can use "objdump -d -l vmlinux" to see the source-to-object code mapping. -If you want to reduce probing overhead, set "Kprobes jump optimization -support" (CONFIG_OPTPROBES) to "y". You can find this option under the -"Kprobes" line. - 4. API Reference The Kprobes API includes a "register" function and an "unregister" diff --git a/arch/Kconfig b/arch/Kconfig index e5eb1337a53..f06010fb483 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -42,15 +42,10 @@ config KPROBES If in doubt, say "N". config OPTPROBES - bool "Kprobes jump optimization support (EXPERIMENTAL)" - default y - depends on KPROBES + def_bool y + depends on KPROBES && HAVE_OPTPROBES depends on !PREEMPT - depends on HAVE_OPTPROBES select KALLSYMS_ALL - help - This option will allow kprobes to optimize breakpoint to - a jump for reducing its overhead. config HAVE_EFFICIENT_UNALIGNED_ACCESS bool -- cgit v1.2.3 From 4bd96a7a8185755b091233b16034c7436cbf57af Mon Sep 17 00:00:00 2001 From: Shane Wang Date: Wed, 10 Mar 2010 14:36:10 +0800 Subject: x86, tboot: Add support for S3 memory integrity protection This patch adds support for S3 memory integrity protection within an Intel(R) TXT launched kernel, for all kernel and userspace memory. All RAM used by the kernel and userspace, as indicated by memory ranges of type E820_RAM and E820_RESERVED_KERN in the e820 table, will be integrity protected. 
The MAINTAINERS file is also updated to reflect the maintainers of the TXT-related code. All MACing is done in tboot, based on a complexity analysis and tradeoff. v3: Compared with v2, this patch adds a check of array size in tboot.c, and a note to specify which c/s of tboot supports this kind of MACing in intel_txt.txt. Signed-off-by: Shane Wang LKML-Reference: <4B973DDA.6050902@intel.com> Signed-off-by: Joseph Cihula Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki Signed-off-by: H. Peter Anvin --- Documentation/intel_txt.txt | 16 +++++++++------- MAINTAINERS | 11 +++++++++++ arch/x86/include/asm/e820.h | 7 ++++++- arch/x86/kernel/tboot.c | 20 +++++++++++--------- 4 files changed, 37 insertions(+), 17 deletions(-) (limited to 'Documentation') diff --git a/Documentation/intel_txt.txt b/Documentation/intel_txt.txt index f40a1f03001..87c8990dbbd 100644 --- a/Documentation/intel_txt.txt +++ b/Documentation/intel_txt.txt @@ -161,13 +161,15 @@ o In order to put a system into any of the sleep states after a TXT has been restored, it will restore the TPM PCRs and then transfer control back to the kernel's S3 resume vector. In order to preserve system integrity across S3, the kernel - provides tboot with a set of memory ranges (kernel - code/data/bss, S3 resume code, and AP trampoline) that tboot - will calculate a MAC (message authentication code) over and then - seal with the TPM. On resume and once the measured environment - has been re-established, tboot will re-calculate the MAC and - verify it against the sealed value. Tboot's policy determines - what happens if the verification fails. + provides tboot with a set of memory ranges (RAM and RESERVED_KERN + in the e820 table, but not any memory that BIOS might alter over + the S3 transition) that tboot will calculate a MAC (message + authentication code) over and then seal with the TPM. On resume + and once the measured environment has been re-established, tboot + will re-calculate the MAC and verify it against the sealed value. + Tboot's policy determines what happens if the verification fails. + Note that the c/s 194 of tboot which has the new MAC code supports + this. That's pretty much it for TXT support. 
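The memory-range selection described above reduces to a walk over the e820
map. The condensed sketch below mirrors the tboot_setup_sleep() change further
down in this patch (bounds checking and the resume-vector setup are omitted);
it is shown only to connect the documentation text with the implementation.

	int i;

	tboot->num_mac_regions = 0;

	for (i = 0; i < e820.nr_map; i++) {
		/* Only RAM and RESERVED_KERN ranges are MACed across S3. */
		if (e820.map[i].type != E820_RAM &&
		    e820.map[i].type != E820_RESERVED_KERN)
			continue;

		add_mac_region(e820.map[i].addr, e820.map[i].size);
	}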
diff --git a/MAINTAINERS b/MAINTAINERS index 47cc449d89d..d3072cb8805 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2940,6 +2940,17 @@ S: Odd Fixes F: Documentation/networking/README.ipw2200 F: drivers/net/wireless/ipw2x00/ipw2200.* +INTEL(R) TRUSTED EXECUTION TECHNOLOGY (TXT) +M: Joseph Cihula +M: Shane Wang +L: tboot-devel@lists.sourceforge.net +W: http://tboot.sourceforge.net +T: Mercurial http://www.bughost.org/repos.hg/tboot.hg +S: Supported +F: Documentation/intel_txt.txt +F: include/linux/tboot.h +F: arch/x86/kernel/tboot.c + INTEL WIRELESS WIMAX CONNECTION 2400 M: Inaky Perez-Gonzalez M: linux-wimax@intel.com diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 0e22296790d..ec8a52d14ab 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -45,7 +45,12 @@ #define E820_NVS 4 #define E820_UNUSABLE 5 -/* reserved RAM used by kernel itself */ +/* + * reserved RAM used by kernel itself + * if CONFIG_INTEL_TXT is enabled, memory of this type will be + * included in the S3 integrity calculation and so should not include + * any memory that BIOS might alter over the S3 transition + */ #define E820_RESERVED_KERN 128 #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 86c9f91b48a..cc2c60474fd 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -175,6 +175,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size) struct tboot_mac_region *mr; phys_addr_t end = start + size; + if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS) + panic("tboot: Too many MAC regions\n"); + if (start && size) { mr = &tboot->mac_regions[tboot->num_mac_regions++]; mr->start = round_down(start, PAGE_SIZE); @@ -184,18 +187,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size) static int tboot_setup_sleep(void) { + int i; + tboot->num_mac_regions = 0; - /* S3 resume code */ - add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + for (i = 0; i < e820.nr_map; i++) { + if ((e820.map[i].type != E820_RAM) + && (e820.map[i].type != E820_RESERVED_KERN)) + continue; -#ifdef CONFIG_X86_TRAMPOLINE - /* AP trampoline code */ - add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); -#endif - - /* kernel code + data + bss */ - add_mac_region(virt_to_phys(_text), _end - _text); + add_mac_region(e820.map[i].addr, e820.map[i].size); + } tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; -- cgit v1.2.3 From 25c2d55c00c6097e6792ebf21e31342f23b9b768 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 24 Mar 2010 13:17:50 +0800 Subject: sched: Remove USER_SCHED from documentation USER_SCHED has been removed, so update the documentation accordingly. Signed-off-by: Li Zefan Signed-off-by: Peter Zijlstra Acked-by: Serge E. Hallyn LKML-Reference: <4BA9A07E.8070508@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-design-CFS.txt | 54 ++-------------------------- Documentation/scheduler/sched-rt-group.txt | 20 +++-------- 2 files changed, 7 insertions(+), 67 deletions(-) (limited to 'Documentation') diff --git a/Documentation/scheduler/sched-design-CFS.txt b/Documentation/scheduler/sched-design-CFS.txt index 6f33593e59e..8239ebbcddc 100644 --- a/Documentation/scheduler/sched-design-CFS.txt +++ b/Documentation/scheduler/sched-design-CFS.txt @@ -211,7 +211,7 @@ provide fair CPU time to each such task group. For example, it may be desirable to first provide fair CPU time to each user on the system and then to each task belonging to a user. 
-CONFIG_GROUP_SCHED strives to achieve exactly that. It lets tasks to be +CONFIG_CGROUP_SCHED strives to achieve exactly that. It lets tasks to be grouped and divides CPU time fairly among such groups. CONFIG_RT_GROUP_SCHED permits to group real-time (i.e., SCHED_FIFO and @@ -220,38 +220,11 @@ SCHED_RR) tasks. CONFIG_FAIR_GROUP_SCHED permits to group CFS (i.e., SCHED_NORMAL and SCHED_BATCH) tasks. -At present, there are two (mutually exclusive) mechanisms to group tasks for -CPU bandwidth control purposes: - - - Based on user id (CONFIG_USER_SCHED) - - With this option, tasks are grouped according to their user id. - - - Based on "cgroup" pseudo filesystem (CONFIG_CGROUP_SCHED) - - This options needs CONFIG_CGROUPS to be defined, and lets the administrator + These options need CONFIG_CGROUPS to be defined, and let the administrator create arbitrary groups of tasks, using the "cgroup" pseudo filesystem. See Documentation/cgroups/cgroups.txt for more information about this filesystem. -Only one of these options to group tasks can be chosen and not both. - -When CONFIG_USER_SCHED is defined, a directory is created in sysfs for each new -user and a "cpu_share" file is added in that directory. - - # cd /sys/kernel/uids - # cat 512/cpu_share # Display user 512's CPU share - 1024 - # echo 2048 > 512/cpu_share # Modify user 512's CPU share - # cat 512/cpu_share # Display user 512's CPU share - 2048 - # - -CPU bandwidth between two users is divided in the ratio of their CPU shares. -For example: if you would like user "root" to get twice the bandwidth of user -"guest," then set the cpu_share for both the users such that "root"'s cpu_share -is twice "guest"'s cpu_share. - -When CONFIG_CGROUP_SCHED is defined, a "cpu.shares" file is created for each +When CONFIG_FAIR_GROUP_SCHED is defined, a "cpu.shares" file is created for each group created using the pseudo filesystem. See example steps below to create task groups and modify their CPU share using the "cgroups" pseudo filesystem. @@ -273,24 +246,3 @@ task groups and modify their CPU share using the "cgroups" pseudo filesystem. # #Launch gmplayer (or your favourite movie player) # echo > multimedia/tasks - -8. Implementation note: user namespaces - -User namespaces are intended to be hierarchical. But they are currently -only partially implemented. Each of those has ramifications for CFS. - -First, since user namespaces are hierarchical, the /sys/kernel/uids -presentation is inadequate. Eventually we will likely want to use sysfs -tagging to provide private views of /sys/kernel/uids within each user -namespace. - -Second, the hierarchical nature is intended to support completely -unprivileged use of user namespaces. So if using user groups, then -we want the users in a user namespace to be children of the user -who created it. - -That is currently unimplemented. So instead, every user in a new -user namespace will receive 1024 shares just like any user in the -initial user namespace. Note that at the moment creation of a new -user namespace requires each of CAP_SYS_ADMIN, CAP_SETUID, and -CAP_SETGID. diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 86eabe6c341..605b0d40329 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -126,23 +126,12 @@ priority! 2.3 Basis for grouping tasks ---------------------------- -There are two compile-time settings for allocating CPU bandwidth. 
These are -configured using the "Basis for grouping tasks" multiple choice menu under -General setup > Group CPU Scheduler: - -a. CONFIG_USER_SCHED (aka "Basis for grouping tasks" = "user id") - -This lets you use the virtual files under -"/sys/kernel/uids//cpu_rt_runtime_us" to control he CPU time reserved for -each user . - -The other option is: - -.o CONFIG_CGROUP_SCHED (aka "Basis for grouping tasks" = "Control groups") +Enabling CONFIG_RT_GROUP_SCHED lets you explicitly allocate real +CPU bandwidth to task groups. This uses the /cgroup virtual file system and "/cgroup//cpu.rt_runtime_us" to control the CPU time reserved for each -control group instead. +control group. For more information on working with control groups, you should read Documentation/cgroups/cgroups.txt as well. @@ -161,8 +150,7 @@ For now, this can be simplified to just the following (but see Future plans): =============== There is work in progress to make the scheduling period for each group -("/sys/kernel/uids//cpu_rt_period_us" or -"/cgroup//cpu.rt_period_us" respectively) configurable as well. +("/cgroup//cpu.rt_period_us") configurable as well. The constraint on the period is that a subgroup must have a smaller or equal period to its parent. But realistically its not very useful _yet_ -- cgit v1.2.3 From 93ccae7a2227466a0d071fe52c51319f2f34c365 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 12 Apr 2010 13:17:08 -0400 Subject: tracing/kprobes: Support basic types on dynamic events Support basic types of integer (u8, u16, u32, u64, s8, s16, s32, s64) in kprobe tracer. With this patch, users can specify above basic types on each arguments after ':'. If omitted, the argument type is set as unsigned long (u32 or u64, arch-dependent). e.g. echo 'p account_system_time+0 hardirq_offset=%si:s32' > kprobe_events adds a probe recording hardirq_offset in signed-32bits value on the entry of account_system_time. Cc: Ingo Molnar Cc: Steven Rostedt Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Frederic Weisbecker LKML-Reference: <20100412171708.3790.18599.stgit@localhost6.localdomain6> Signed-off-by: Masami Hiramatsu Signed-off-by: Arnaldo Carvalho de Melo --- Documentation/trace/kprobetrace.txt | 4 +- kernel/trace/trace.h | 16 +- kernel/trace/trace_kprobe.c | 535 ++++++++++++++++++++++-------------- 3 files changed, 334 insertions(+), 221 deletions(-) (limited to 'Documentation') diff --git a/Documentation/trace/kprobetrace.txt b/Documentation/trace/kprobetrace.txt index a9100b28eb8..ec94748ae65 100644 --- a/Documentation/trace/kprobetrace.txt +++ b/Documentation/trace/kprobetrace.txt @@ -40,7 +40,9 @@ Synopsis of kprobe_events $stack : Fetch stack address. $retval : Fetch return value.(*) +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**) - NAME=FETCHARG: Set NAME as the argument name of FETCHARG. + NAME=FETCHARG : Set NAME as the argument name of FETCHARG. + FETCHARG:TYPE : Set TYPE as the type of FETCHARG. Currently, basic types + (u8/u16/u32/u64/s8/s16/s32/s64) are supported. (*) only for return probe. (**) this is useful for fetching a field of data structures. 
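To make the new ':TYPE' suffix concrete, here is roughly what the basic-type
print template introduced in the trace_kprobe.c hunks below expands to for
s32, i.e. an illustrative expansion of
DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long); it is not additional code in
this patch.

	static __kprobes int print_type_s32(struct trace_seq *s,
					    const char *name, void *data)
	{
		/* Read the signed 32-bit value recorded for this argument. */
		return trace_seq_printf(s, " %s=%ld", name, (long)*(s32 *)data);
	}
	static const char print_type_format_s32[] = "%ld";

Each probe argument records only its type-sized raw bytes at a per-argument
offset behind the entry header, and the matching print function formats those
bytes when the trace is read.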
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index bec2c973ff0..3ebdb6bd236 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -102,29 +102,17 @@ struct syscall_trace_exit { long ret; }; -struct kprobe_trace_entry { +struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - -struct kretprobe_trace_entry { +struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; - int nargs; - unsigned long args[]; }; -#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ - (offsetof(struct kretprobe_trace_entry, args) + \ - (sizeof(unsigned long) * (n))) - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 1251e367bae..a7514326052 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include "trace.h" #include "trace_output.h" @@ -40,7 +42,6 @@ /* Reserved field names */ #define FIELD_STRING_IP "__probe_ip" -#define FIELD_STRING_NARGS "__probe_nargs" #define FIELD_STRING_RETIP "__probe_ret_ip" #define FIELD_STRING_FUNC "__probe_func" @@ -52,56 +53,102 @@ const char *reserved_field_names[] = { "common_tgid", "common_lock_depth", FIELD_STRING_IP, - FIELD_STRING_NARGS, FIELD_STRING_RETIP, FIELD_STRING_FUNC, }; -struct fetch_func { - unsigned long (*func)(struct pt_regs *, void *); +/* Printing function type */ +typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *); +#define PRINT_TYPE_FUNC_NAME(type) print_type_##type +#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type + +/* Printing in basic type function template */ +#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \ +static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \ + const char *name, void *data)\ +{ \ + return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\ +} \ +static const char PRINT_TYPE_FMT_NAME(type)[] = fmt; + +DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int) +DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long) +DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int) +DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long) +DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long) + +/* Data fetch function type */ +typedef void (*fetch_func_t)(struct pt_regs *, void *, void *); + +struct fetch_param { + fetch_func_t fn; void *data; }; -static __kprobes unsigned long call_fetch(struct fetch_func *f, - struct pt_regs *regs) +static __kprobes void call_fetch(struct fetch_param *fprm, + struct pt_regs *regs, void *dest) { - return f->func(regs, f->data); + return fprm->fn(regs, fprm->data, dest); } -/* fetch handlers */ -static __kprobes unsigned long fetch_register(struct pt_regs *regs, - void *offset) -{ - return regs_get_register(regs, (unsigned int)((unsigned long)offset)); +#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type +/* + * Define macro for basic types - we don't need to define s* types, because + * we have to care only about bitwidth at recording time. 
+ */ +#define DEFINE_BASIC_FETCH_FUNCS(kind) \ +DEFINE_FETCH_##kind(u8) \ +DEFINE_FETCH_##kind(u16) \ +DEFINE_FETCH_##kind(u32) \ +DEFINE_FETCH_##kind(u64) + +#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \ + ((FETCH_FUNC_NAME(kind, u8) == fn) || \ + (FETCH_FUNC_NAME(kind, u16) == fn) || \ + (FETCH_FUNC_NAME(kind, u32) == fn) || \ + (FETCH_FUNC_NAME(kind, u64) == fn)) + +/* Data fetch function templates */ +#define DEFINE_FETCH_reg(type) \ +static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_register(regs, \ + (unsigned int)((unsigned long)offset)); \ } - -static __kprobes unsigned long fetch_stack(struct pt_regs *regs, - void *num) -{ - return regs_get_kernel_stack_nth(regs, - (unsigned int)((unsigned long)num)); +DEFINE_BASIC_FETCH_FUNCS(reg) + +#define DEFINE_FETCH_stack(type) \ +static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\ + void *offset, void *dest) \ +{ \ + *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \ + (unsigned int)((unsigned long)offset)); \ } +DEFINE_BASIC_FETCH_FUNCS(stack) -static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) -{ - unsigned long retval; - - if (probe_kernel_address(addr, retval)) - return 0; - return retval; +#define DEFINE_FETCH_retval(type) \ +static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\ + void *dummy, void *dest) \ +{ \ + *(type *)dest = (type)regs_return_value(regs); \ } - -static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, - void *dummy) -{ - return regs_return_value(regs); -} - -static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, - void *dummy) -{ - return kernel_stack_pointer(regs); +DEFINE_BASIC_FETCH_FUNCS(retval) + +#define DEFINE_FETCH_memory(type) \ +static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\ + void *addr, void *dest) \ +{ \ + type retval; \ + if (probe_kernel_address(addr, retval)) \ + *(type *)dest = 0; \ + else \ + *(type *)dest = retval; \ } +DEFINE_BASIC_FETCH_FUNCS(memory) /* Memory fetching by symbol */ struct symbol_cache { @@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) return sc; } -static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) -{ - struct symbol_cache *sc = data; - - if (sc->addr) - return fetch_memory(regs, (void *)sc->addr); - else - return 0; +#define DEFINE_FETCH_symbol(type) \ +static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct symbol_cache *sc = data; \ + if (sc->addr) \ + fetch_memory_##type(regs, (void *)sc->addr, dest); \ + else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(symbol) -/* Special indirect memory access interface */ -struct indirect_fetch_data { - struct fetch_func orig; +/* Dereference memory access function */ +struct deref_fetch_param { + struct fetch_param orig; long offset; }; -static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) -{ - struct indirect_fetch_data *ind = data; - unsigned long addr; - - addr = call_fetch(&ind->orig, regs); - if (addr) { - addr += ind->offset; - return fetch_memory(regs, (void *)addr); - } else - return 0; +#define DEFINE_FETCH_deref(type) \ +static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\ + void *data, void *dest) \ +{ \ + struct deref_fetch_param *dprm = data; \ + unsigned long addr; \ + call_fetch(&dprm->orig, regs, &addr); \ + if 
(addr) { \ + addr += dprm->offset; \ + fetch_memory_##type(regs, (void *)addr, dest); \ + } else \ + *(type *)dest = 0; \ } +DEFINE_BASIC_FETCH_FUNCS(deref) -static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) +static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data) { - if (data->orig.func == fetch_indirect) - free_indirect_fetch_data(data->orig.data); - else if (data->orig.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn)) + free_deref_fetch_param(data->orig.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn)) free_symbol_cache(data->orig.data); kfree(data); } +/* Default (unsigned long) fetch type */ +#define __DEFAULT_FETCH_TYPE(t) u##t +#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t) +#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG) +#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE) + +#define ASSIGN_FETCH_FUNC(kind, type) \ + .kind = FETCH_FUNC_NAME(kind, type) + +#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \ + {.name = #ptype, \ + .size = sizeof(ftype), \ + .is_signed = sign, \ + .print = PRINT_TYPE_FUNC_NAME(ptype), \ + .fmt = PRINT_TYPE_FMT_NAME(ptype), \ +ASSIGN_FETCH_FUNC(reg, ftype), \ +ASSIGN_FETCH_FUNC(stack, ftype), \ +ASSIGN_FETCH_FUNC(retval, ftype), \ +ASSIGN_FETCH_FUNC(memory, ftype), \ +ASSIGN_FETCH_FUNC(symbol, ftype), \ +ASSIGN_FETCH_FUNC(deref, ftype), \ + } + +/* Fetch type information table */ +static const struct fetch_type { + const char *name; /* Name of type */ + size_t size; /* Byte size of type */ + int is_signed; /* Signed flag */ + print_type_func_t print; /* Print functions */ + const char *fmt; /* Fromat string */ + /* Fetch functions */ + fetch_func_t reg; + fetch_func_t stack; + fetch_func_t retval; + fetch_func_t memory; + fetch_func_t symbol; + fetch_func_t deref; +} fetch_type_table[] = { + ASSIGN_FETCH_TYPE(u8, u8, 0), + ASSIGN_FETCH_TYPE(u16, u16, 0), + ASSIGN_FETCH_TYPE(u32, u32, 0), + ASSIGN_FETCH_TYPE(u64, u64, 0), + ASSIGN_FETCH_TYPE(s8, u8, 1), + ASSIGN_FETCH_TYPE(s16, u16, 1), + ASSIGN_FETCH_TYPE(s32, u32, 1), + ASSIGN_FETCH_TYPE(s64, u64, 1), +}; + +static const struct fetch_type *find_fetch_type(const char *type) +{ + int i; + + if (!type) + type = DEFAULT_FETCH_TYPE_STR; + + for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++) + if (strcmp(type, fetch_type_table[i].name) == 0) + return &fetch_type_table[i]; + return NULL; +} + +/* Special function : only accept unsigned long */ +static __kprobes void fetch_stack_address(struct pt_regs *regs, + void *dummy, void *dest) +{ + *(unsigned long *)dest = kernel_stack_pointer(regs); +} + /** * Kprobe event core functions */ struct probe_arg { - struct fetch_func fetch; - const char *name; + struct fetch_param fetch; + unsigned int offset; /* Offset from argument entry */ + const char *name; /* Name of this argument */ + const char *comm; /* Command of this argument */ + const struct fetch_type *type; /* Type of this argument */ }; /* Flags for trace_probe */ @@ -204,6 +326,7 @@ struct trace_probe { const char *symbol; /* symbol name */ struct ftrace_event_call call; struct trace_event event; + ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; }; @@ -212,6 +335,7 @@ struct trace_probe { (offsetof(struct trace_probe, args) + \ (sizeof(struct probe_arg) * (n))) + static __kprobes int probe_is_return(struct trace_probe *tp) { return tp->rp.handler != NULL; @@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp) return 
tp->symbol ? tp->symbol : "unknown"; } -static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) -{ - int ret = -EINVAL; - - if (ff->func == fetch_register) { - const char *name; - name = regs_query_register_name((unsigned int)((long)ff->data)); - ret = snprintf(buf, n, "%%%s", name); - } else if (ff->func == fetch_stack) - ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); - else if (ff->func == fetch_memory) - ret = snprintf(buf, n, "@0x%p", ff->data); - else if (ff->func == fetch_symbol) { - struct symbol_cache *sc = ff->data; - if (sc->offset) - ret = snprintf(buf, n, "@%s%+ld", sc->symbol, - sc->offset); - else - ret = snprintf(buf, n, "@%s", sc->symbol); - } else if (ff->func == fetch_retvalue) - ret = snprintf(buf, n, "$retval"); - else if (ff->func == fetch_stack_address) - ret = snprintf(buf, n, "$stack"); - else if (ff->func == fetch_indirect) { - struct indirect_fetch_data *id = ff->data; - size_t l = 0; - ret = snprintf(buf, n, "%+ld(", id->offset); - if (ret >= n) - goto end; - l += ret; - ret = probe_arg_string(buf + l, n - l, &id->orig); - if (ret < 0) - goto end; - l += ret; - ret = snprintf(buf + l, n - l, ")"); - ret += l; - } -end: - if (ret >= n) - return -ENOSPC; - return ret; -} - static int register_probe_event(struct trace_probe *tp); static void unregister_probe_event(struct trace_probe *tp); @@ -347,11 +428,12 @@ error: static void free_probe_arg(struct probe_arg *arg) { - if (arg->fetch.func == fetch_symbol) + if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn)) + free_deref_fetch_param(arg->fetch.data); + else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn)) free_symbol_cache(arg->fetch.data); - else if (arg->fetch.func == fetch_indirect) - free_indirect_fetch_data(arg->fetch.data); kfree(arg->name); + kfree(arg->comm); } static void free_trace_probe(struct trace_probe *tp) @@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset) #define PARAM_MAX_ARGS 16 #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) -static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_vars(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; if (strcmp(arg, "retval") == 0) { - if (is_return) { - ff->func = fetch_retvalue; - ff->data = NULL; - } else + if (is_return) + f->fn = t->retval; + else ret = -EINVAL; } else if (strncmp(arg, "stack", 5) == 0) { if (arg[5] == '\0') { - ff->func = fetch_stack_address; - ff->data = NULL; + if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0) + f->fn = fetch_stack_address; + else + ret = -EINVAL; } else if (isdigit(arg[5])) { ret = strict_strtoul(arg + 5, 10, ¶m); if (ret || param > PARAM_MAX_STACK) ret = -EINVAL; else { - ff->func = fetch_stack; - ff->data = (void *)param; + f->fn = t->stack; + f->data = (void *)param; } } else ret = -EINVAL; @@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) } /* Recursive argument parser */ -static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int __parse_probe_arg(char *arg, const struct fetch_type *t, + struct fetch_param *f, int is_return) { int ret = 0; unsigned long param; @@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) switch (arg[0]) { case '$': - ret = parse_probe_vars(arg + 1, ff, is_return); + ret = parse_probe_vars(arg + 1, t, f, is_return); break; case '%': /* named register */ ret = 
regs_query_register_offset(arg + 1); if (ret >= 0) { - ff->func = fetch_register; - ff->data = (void *)(unsigned long)ret; + f->fn = t->reg; + f->data = (void *)(unsigned long)ret; ret = 0; } break; @@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) ret = strict_strtoul(arg + 1, 0, ¶m); if (ret) break; - ff->func = fetch_memory; - ff->data = (void *)param; + f->fn = t->memory; + f->data = (void *)param; } else { ret = split_symbol_offset(arg + 1, &offset); if (ret) break; - ff->data = alloc_symbol_cache(arg + 1, offset); - if (ff->data) - ff->func = fetch_symbol; - else - ret = -EINVAL; + f->data = alloc_symbol_cache(arg + 1, offset); + if (f->data) + f->fn = t->symbol; } break; - case '+': /* indirect memory */ + case '+': /* deref memory */ case '-': tmp = strchr(arg, '('); - if (!tmp) { - ret = -EINVAL; + if (!tmp) break; - } *tmp = '\0'; ret = strict_strtol(arg + 1, 0, &offset); if (ret) @@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) arg = tmp + 1; tmp = strrchr(arg, ')'); if (tmp) { - struct indirect_fetch_data *id; + struct deref_fetch_param *dprm; + const struct fetch_type *t2 = find_fetch_type(NULL); *tmp = '\0'; - id = kzalloc(sizeof(struct indirect_fetch_data), - GFP_KERNEL); - if (!id) + dprm = kzalloc(sizeof(struct deref_fetch_param), + GFP_KERNEL); + if (!dprm) return -ENOMEM; - id->offset = offset; - ret = __parse_probe_arg(arg, &id->orig, is_return); + dprm->offset = offset; + ret = __parse_probe_arg(arg, t2, &dprm->orig, + is_return); if (ret) - kfree(id); + kfree(dprm); else { - ff->func = fetch_indirect; - ff->data = (void *)id; + f->fn = t->deref; + f->data = (void *)dprm; } - } else - ret = -EINVAL; + } break; - default: - /* TODO: support custom handler */ - ret = -EINVAL; } + if (!ret && !f->fn) + ret = -EINVAL; return ret; } /* String length checking wrapper */ -static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) +static int parse_probe_arg(char *arg, struct trace_probe *tp, + struct probe_arg *parg, int is_return) { + const char *t; + if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; } - return __parse_probe_arg(arg, ff, is_return); + parg->comm = kstrdup(arg, GFP_KERNEL); + if (!parg->comm) { + pr_info("Failed to allocate memory for command '%s'.\n", arg); + return -ENOMEM; + } + t = strchr(parg->comm, ':'); + if (t) { + arg[t - parg->comm] = '\0'; + t++; + } + parg->type = find_fetch_type(t); + if (!parg->type) { + pr_info("Unsupported type: %s\n", t); + return -EINVAL; + } + parg->offset = tp->size; + tp->size += parg->type->size; + return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return); } /* Return 1 if name is reserved or already used by another argument */ @@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv) * @ADDR : fetch memory at ADDR (ADDR should be in kernel) * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) * %REG : fetch register REG - * Indirect memory fetch: + * Dereferencing memory fetch: * +|-offs(ARG) : fetch memory at ARG +|- offs address. * Alias name of args: * NAME=FETCHARG : set NAME as alias of FETCHARG. + * Type of args: + * FETCHARG:TYPE : use TYPE instead of unsigned long. 
*/ struct trace_probe *tp; int i, ret = 0; int is_return = 0, is_delete = 0; - char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; + char *symbol = NULL, *event = NULL, *group = NULL; + char *arg, *tmp; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; @@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv) else arg = argv[i]; - if (conflict_field_name(argv[i], tp->args, i)) { - pr_info("Argument%d name '%s' conflicts with " - "another field.\n", i, argv[i]); - ret = -EINVAL; - goto error; - } - tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); if (!tp->args[i].name) { pr_info("Failed to allocate argument%d name '%s'.\n", @@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv) ret = -ENOMEM; goto error; } + tmp = strchr(tp->args[i].name, ':'); + if (tmp) + *tmp = '_'; /* convert : to _ */ + + if (conflict_field_name(tp->args[i].name, tp->args, i)) { + pr_info("Argument%d name '%s' conflicts with " + "another field.\n", i, argv[i]); + ret = -EINVAL; + goto error; + } /* Parse fetch argument */ - ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); + ret = parse_probe_arg(arg, tp, &tp->args[i], is_return); if (ret) { pr_info("Parse error at argument%d. (%d)\n", i, ret); kfree(tp->args[i].name); @@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_probe *tp = v; - int i, ret; - char buf[MAX_ARGSTR_LEN + 1]; + int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); @@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v) else seq_printf(m, " %s", probe_symbol(tp)); - for (i = 0; i < tp->nr_args; i++) { - ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); - if (ret < 0) { - pr_warning("Argument%d decoding error(%d).\n", i, ret); - return ret; - } - seq_printf(m, " %s=%s", tp->args[i].name, buf); - } + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); seq_printf(m, "\n"); + return 0; } @@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = { static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, struct 
pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; + u8 *data; int size, i, pc; unsigned long irq_flags; struct ftrace_event_call *call = &tp->call; @@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, local_save_flags(irq_flags); pc = preempt_count(); - size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + size = sizeof(*entry) + tp->size; event = trace_current_buffer_lock_reserve(&buffer, call->id, size, irq_flags, pc); @@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, return; entry = ring_buffer_event_data(event); - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); @@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, enum print_line_t print_kprobe_event(struct trace_iterator *iter, int flags) { - struct kprobe_trace_entry *field; + struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kprobe_trace_entry *)iter->ent; + field = (struct kprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1046,13 +1151,14 @@ partial: enum print_line_t print_kretprobe_event(struct trace_iterator *iter, int flags) { - struct kretprobe_trace_entry *field; + struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; struct trace_event *event; struct trace_probe *tp; + u8 *data; int i; - field = (struct kretprobe_trace_entry *)iter->ent; + field = (struct kretprobe_trace_entry_head *)iter->ent; event = ftrace_find_event(field->ent.type); tp = container_of(event, struct trace_probe, event); @@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags) if (!trace_seq_puts(s, ")")) goto partial; - for (i = 0; i < field->nargs; i++) - if (!trace_seq_printf(s, " %s=%lx", - tp->args[i].name, field->args[i])) + data = (u8 *)&field[1]; + for (i = 0; i < tp->nr_args; i++) + if (!tp->args[i].type->print(s, tp->args[i].name, + data + tp->args[i].offset)) goto partial; if (!trace_seq_puts(s, "\n")) @@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call) static int kprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kprobe_trace_entry field; + struct kprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); - DEFINE_FIELD(int, nargs, 
FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) { int ret, i; - struct kretprobe_trace_entry field; + struct kretprobe_trace_entry_head field; struct trace_probe *tp = (struct trace_probe *)event_call->data; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); - DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); /* Set argument names as fields */ - for (i = 0; i < tp->nr_args; i++) - DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); + for (i = 0; i < tp->nr_args; i++) { + ret = trace_define_field(event_call, tp->args[i].type->name, + tp->args[i].name, + sizeof(field) + tp->args[i].offset, + tp->args[i].type->size, + tp->args[i].type->is_signed, + FILTER_OTHER); + if (ret) + return ret; + } return 0; } @@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); for (i = 0; i < tp->nr_args; i++) { - pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", - tp->args[i].name); + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s", + tp->args[i].name, tp->args[i].type->fmt); } pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); @@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; - struct kprobe_trace_entry *entry; + struct kprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, if (!entry) return; - entry->nargs = tp->nr_args; entry->ip = (unsigned long)kp->addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } @@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; - struct kretprobe_trace_entry *entry; + struct kretprobe_trace_entry_head *entry; + u8 *data; int size, __size, i; unsigned long irq_flags; int rctx; - __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); + __size = sizeof(*entry) + tp->size; size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, @@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, if (!entry) return; - entry->nargs = tp->nr_args; entry->func = (unsigned long)tp->rp.kp.addr; entry->ret_ip = (unsigned long)ri->ret_addr; + data = (u8 *)&entry[1]; for (i = 0; i < tp->nr_args; i++) - entry->args[i] = call_fetch(&tp->args[i].fetch, regs); + 
call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags, regs); -- cgit v1.2.3 From cecbca96da387428e220e307a9c945e37e2f4d9e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 18 Apr 2010 19:08:41 +0200 Subject: tracing: Dump either the oops's cpu source or all cpus buffers The ftrace_dump_on_oops kernel parameter, sysctl and sysrq let one dump every cpu buffers when an oops or panic happens. It's nice when you have few cpus but it may take ages if have many, plus you miss the real origin of the problem in all the cpu traces. Sometimes, all you need is to dump the cpu buffer that triggered the opps, most of the time it is our main interest. This patch modifies ftrace_dump_on_oops to handle this choice. The ftrace_dump_on_oops kernel parameter, when it comes alone, has the same behaviour than before. But ftrace_dump_on_oops=orig_cpu will only dump the buffer of the cpu that oops'ed. Similarly, sysctl kernel.ftrace_dump_on_oops=1 and echo 1 > /proc/sys/kernel/ftrace_dump_on_oops keep their previous behaviour. But setting 2 jumps into cpu origin dump mode. v2: Fix double setup v3: Fix spelling issues reported by Randy Dunlap v4: Also update __ftrace_dump in the selftests Signed-off-by: Frederic Weisbecker Acked-by: David S. Miller Acked-by: Steven Rostedt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Li Zefan Cc: Lai Jiangshan --- Documentation/kernel-parameters.txt | 6 ++++- Documentation/trace/ftrace.txt | 6 +++-- drivers/char/sysrq.c | 2 +- include/linux/ftrace.h | 4 ++- include/linux/kernel.h | 11 ++++++-- kernel/trace/trace.c | 51 ++++++++++++++++++++++++++++--------- kernel/trace/trace_selftest.c | 5 ++-- 7 files changed, 64 insertions(+), 21 deletions(-) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e4cbca58536..ab67b33300f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -789,8 +789,12 @@ and is between 256 and 4096 characters. It is defined in the file as early as possible in order to facilitate early boot debugging. - ftrace_dump_on_oops + ftrace_dump_on_oops[=orig_cpu] [FTRACE] will dump the trace buffers on oops. + If no parameter is passed, ftrace will dump + buffers of all CPUs, but if you pass orig_cpu, it will + dump only the buffer of the CPU that triggered the + oops. ftrace_filter=[function-list] [FTRACE] Limit the functions traced by the function diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 03485bfbd79..52011815c90 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1337,12 +1337,14 @@ ftrace_dump_on_oops must be set. To set ftrace_dump_on_oops, one can either use the sysctl function or set it via the proc system interface. - sysctl kernel.ftrace_dump_on_oops=1 + sysctl kernel.ftrace_dump_on_oops=n or - echo 1 > /proc/sys/kernel/ftrace_dump_on_oops + echo n > /proc/sys/kernel/ftrace_dump_on_oops +If n = 1, ftrace will dump buffers of all CPUs, if n = 2 ftrace will +only dump the buffer of the CPU that triggered the oops. 
Here's an example of such a dump after a null pointer dereference in a kernel module: diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 59de2525d30..d4e8b213a46 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -289,7 +289,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { static void sysrq_ftrace_dump(int key, struct tty_struct *tty) { - ftrace_dump(); + ftrace_dump(DUMP_ALL); } static struct sysrq_key_op sysrq_ftrace_dump_op = { .handler = sysrq_ftrace_dump, diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 01e6adea07e..ea5b1aae0e8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -492,7 +492,9 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) return tsk->trace & TSK_TRACE_FL_GRAPH; } -extern int ftrace_dump_on_oops; +enum ftrace_dump_mode; + +extern enum ftrace_dump_mode ftrace_dump_on_oops; #ifdef CONFIG_PREEMPT #define INIT_TRACE_RECURSION .trace_recursion = 0, diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9365227dbaf..9fb1c129903 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -490,6 +490,13 @@ static inline void tracing_off(void) { } static inline void tracing_off_permanent(void) { } static inline int tracing_is_on(void) { return 0; } #endif + +enum ftrace_dump_mode { + DUMP_NONE, + DUMP_ALL, + DUMP_ORIG, +}; + #ifdef CONFIG_TRACING extern void tracing_start(void); extern void tracing_stop(void); @@ -571,7 +578,7 @@ __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap); extern int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap); -extern void ftrace_dump(void); +extern void ftrace_dump(enum ftrace_dump_mode oops_dump_mode); #else static inline void ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { } @@ -592,7 +599,7 @@ ftrace_vprintk(const char *fmt, va_list ap) { return 0; } -static inline void ftrace_dump(void) { } +static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #endif /* CONFIG_TRACING */ /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bed83cab6da..7b516c7ef9a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask; * * It is default off, but you can enable it with either specifying * "ftrace_dump_on_oops" in the kernel command line, or setting - * /proc/sys/kernel/ftrace_dump_on_oops to true. 
+ * /proc/sys/kernel/ftrace_dump_on_oops + * Set 1 if you want to dump buffers of all CPUs + * Set 2 if you want to dump the buffer of the CPU that triggered oops */ -int ftrace_dump_on_oops; + +enum ftrace_dump_mode ftrace_dump_on_oops; static int tracing_set_tracer(const char *buf); @@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace); static int __init set_ftrace_dump_on_oops(char *str) { - ftrace_dump_on_oops = 1; - return 1; + if (*str++ != '=' || !*str) { + ftrace_dump_on_oops = DUMP_ALL; + return 1; + } + + if (!strcmp("orig_cpu", str)) { + ftrace_dump_on_oops = DUMP_ORIG; + return 1; + } + + return 0; } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); @@ -4338,7 +4350,7 @@ static int trace_panic_handler(struct notifier_block *this, unsigned long event, void *unused) { if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); return NOTIFY_OK; } @@ -4355,7 +4367,7 @@ static int trace_die_handler(struct notifier_block *self, switch (val) { case DIE_OOPS: if (ftrace_dump_on_oops) - ftrace_dump(); + ftrace_dump(ftrace_dump_on_oops); break; default: break; @@ -4396,7 +4408,8 @@ trace_printk_seq(struct trace_seq *s) trace_seq_init(s); } -static void __ftrace_dump(bool disable_tracing) +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) { static arch_spinlock_t ftrace_dump_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -4429,12 +4442,25 @@ static void __ftrace_dump(bool disable_tracing) /* don't look at user memory in panic mode */ trace_flags &= ~TRACE_ITER_SYM_USEROBJ; - printk(KERN_TRACE "Dumping ftrace buffer:\n"); - /* Simulate the iterator */ iter.tr = &global_trace; iter.trace = current_trace; - iter.cpu_file = TRACE_PIPE_ALL_CPU; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = TRACE_PIPE_ALL_CPU; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = TRACE_PIPE_ALL_CPU; + } + + printk(KERN_TRACE "Dumping ftrace buffer:\n"); /* * We need to stop all tracing on all CPUS to read the @@ -4473,6 +4499,7 @@ static void __ftrace_dump(bool disable_tracing) else printk(KERN_TRACE "---------------------------------\n"); + out_enable: /* Re-enable tracing if requested */ if (!disable_tracing) { trace_flags |= old_userobj; @@ -4489,9 +4516,9 @@ static void __ftrace_dump(bool disable_tracing) } /* By default: disable tracing after the dump */ -void ftrace_dump(void) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - __ftrace_dump(true); + __ftrace_dump(true, oops_dump_mode); } __init static int tracer_alloc_buffers(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9398034f814..6a9d36ddfcf 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -256,7 +256,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void __ftrace_dump(bool disable_tracing); +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -267,7 +268,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); if (ftrace_dump_on_oops) 
- __ftrace_dump(false); + __ftrace_dump(false, DUMP_ALL); return 0; } -- cgit v1.2.3 From 07271aa42d13378e67ebd79ea9ca1c4a5e2ad46f Mon Sep 17 00:00:00 2001 From: Chase Douglas Date: Fri, 23 Apr 2010 14:02:39 -0400 Subject: tracing: Add documentation for trace commands mod, traceon/traceoff The mod command went in as commit 64e7c440618998fd69eee6ab490b042d12248021 The traceon/traceoff commands went in as commit 23b4ff3aa479c9e3bb23cb6b2d0a97878399784a Signed-off-by: Chase Douglas LKML-Reference: <1272045759-32018-1-git-send-email-chase.douglas@canonical.com> Signed-off-by: Steven Rostedt --- Documentation/trace/ftrace.txt | 44 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) (limited to 'Documentation') diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index 52011815c90..557c1edecca 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -155,6 +155,9 @@ of ftrace. Here is a list of some of the key files: to be traced. Echoing names of functions into this file will limit the trace to only those functions. + This interface also allows for commands to be used. See the + "Filter commands" section for more details. + set_ftrace_notrace: This has an effect opposite to that of @@ -1824,6 +1827,47 @@ this special filter via: echo > set_graph_function +Filter commands +--------------- + +A few commands are supported by the set_ftrace_filter interface. +Trace commands have the following format: + +:: + +The following commands are supported: + +- mod + This command enables function filtering per module. The + parameter defines the module. For example, if only the write* + functions in the ext3 module are desired, run: + + echo 'write*:mod:ext3' > set_ftrace_filter + + This command interacts with the filter in the same way as + filtering based on function names. Thus, adding more functions + in a different module is accomplished by appending (>>) to the + filter file. Remove specific module functions by prepending + '!': + + echo '!writeback*:mod:ext3' >> set_ftrace_filter + +- traceon/traceoff + These commands turn tracing on and off when the specified + functions are hit. The parameter determines how many times the + tracing system is turned on and off. If unspecified, there is + no limit. For example, to disable tracing when a schedule bug + is hit the first 5 times, run: + + echo '__schedule_bug:traceoff:5' > set_ftrace_filter + + These commands are cumulative whether or not they are appended + to set_ftrace_filter. To remove a command, prepend it by '!' + and drop the parameter: + + echo '!__schedule_bug:traceoff' > set_ftrace_filter + + trace_pipe ---------- -- cgit v1.2.3 From 03d646e62b06e9364e2dbb939d67934c6c9826cd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 21 Dec 2009 14:27:24 +0800 Subject: tracing: Make the documentation clear on trace_event boot option Make it clear that event-list is a comma separated list of events. 
Reported-by: KOSAKI Motohiro Signed-off-by: Li Zefan LKML-Reference: <4B2F154C.2060503@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- Documentation/trace/events.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index 02ac6ed38b2..778ddf38b82 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -90,7 +90,8 @@ In order to facilitate early boot debugging, use boot option: trace_event=[event-list] -The format of this boot option is the same as described in section 2.1. +event-list is a comma separated list of events. See section 2.1 for event +format. 3. Defining an event-enabled tracepoint ======================================= -- cgit v1.2.3 From 969c79215a35b06e5e3efe69b9412f858df7856c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 May 2010 18:49:21 +0200 Subject: sched: replace migration_thread with cpu_stop Currently migration_thread is serving three purposes - migration pusher, context to execute active_load_balance() and forced context switcher for expedited RCU synchronize_sched. All three roles are hardcoded into migration_thread() and determining which job is scheduled is slightly messy. This patch kills migration_thread and replaces all three uses with cpu_stop. The three different roles of migration_thread() are splitted into three separate cpu_stop callbacks - migration_cpu_stop(), active_load_balance_cpu_stop() and synchronize_sched_expedited_cpu_stop() - and each use case now simply asks cpu_stop to execute the callback as necessary. synchronize_sched_expedited() was implemented with private preallocated resources and custom multi-cpu queueing and waiting logic, both of which are provided by cpu_stop. synchronize_sched_expedited_count is made atomic and all other shared resources along with the mutex are dropped. synchronize_sched_expedited() also implemented a check to detect cases where not all the callback got executed on their assigned cpus and fall back to synchronize_sched(). If called with cpu hotplug blocked, cpu_stop already guarantees that and the condition cannot happen; otherwise, stop_machine() would break. However, this patch preserves the paranoid check using a cpumask to record on which cpus the stopper ran so that it can serve as a bisection point if something actually goes wrong theree. Because the internal execution state is no longer visible, rcu_expedited_torture_stats() is removed. This patch also renames cpu_stop threads to from "stopper/%d" to "migration/%d". The names of these threads ultimately don't matter and there's no reason to make unnecessary userland visible changes. With this patch applied, stop_machine() and sched now share the same resources. stop_machine() is faster without wasting any resources and sched migration users are much cleaner. Signed-off-by: Tejun Heo Acked-by: Peter Zijlstra Cc: Ingo Molnar Cc: Dipankar Sarma Cc: Josh Triplett Cc: Paul E. 
McKenney Cc: Oleg Nesterov Cc: Dimitri Sivanich --- Documentation/RCU/torture.txt | 10 -- include/linux/rcutiny.h | 2 - include/linux/rcutree.h | 1 - kernel/rcutorture.c | 2 +- kernel/sched.c | 315 ++++++++++++------------------------------ kernel/sched_fair.c | 48 +++++-- kernel/stop_machine.c | 2 +- 7 files changed, 127 insertions(+), 253 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index 0e50bc2aa1e..5d9016795fd 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -182,16 +182,6 @@ Similarly, sched_expedited RCU provides the following: sched_expedited-torture: Reader Pipe: 12660320201 95875 0 0 0 0 0 0 0 0 0 sched_expedited-torture: Reader Batch: 12660424885 0 0 0 0 0 0 0 0 0 0 sched_expedited-torture: Free-Block Circulation: 1090795 1090795 1090794 1090793 1090792 1090791 1090790 1090789 1090788 1090787 0 - state: -1 / 0:0 3:0 4:0 - -As before, the first four lines are similar to those for RCU. -The last line shows the task-migration state. The first number is --1 if synchronize_sched_expedited() is idle, -2 if in the process of -posting wakeups to the migration kthreads, and N when waiting on CPU N. -Each of the colon-separated fields following the "/" is a CPU:state pair. -Valid states are "0" for idle, "1" for waiting for quiescent state, -"2" for passed through quiescent state, and "3" when a race with a -CPU-hotplug event forces use of the synchronize_sched() primitive. USAGE diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index a5195875480..0006b2df00e 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -60,8 +60,6 @@ static inline long rcu_batches_completed_bh(void) return 0; } -extern int rcu_expedited_torture_stats(char *page); - static inline void rcu_force_quiescent_state(void) { } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 42cc3a04779..24e467e526b 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -35,7 +35,6 @@ struct notifier_block; extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); extern int rcu_needs_cpu(int cpu); -extern int rcu_expedited_torture_stats(char *page); #ifdef CONFIG_TREE_PREEMPT_RCU diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 58df55bf83e..2b676f3a0f2 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -669,7 +669,7 @@ static struct rcu_torture_ops sched_expedited_ops = { .sync = synchronize_sched_expedited, .cb_barrier = NULL, .fqs = rcu_sched_force_quiescent_state, - .stats = rcu_expedited_torture_stats, + .stats = NULL, .irq_capable = 1, .name = "sched_expedited" }; diff --git a/kernel/sched.c b/kernel/sched.c index 4956ed09283..f1d577a0a8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -55,9 +55,9 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -539,15 +539,13 @@ struct rq { int post_schedule; int active_balance; int push_cpu; + struct cpu_stop_work active_balance_work; /* cpu of this runqueue: */ int cpu; int online; unsigned long avg_load_per_task; - struct task_struct *migration_thread; - struct list_head migration_queue; - u64 rt_avg; u64 age_stamp; u64 idle_stamp; @@ -2037,21 +2035,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) __set_task_cpu(p, new_cpu); } -struct migration_req { - struct list_head list; - +struct migration_arg { struct task_struct *task; int dest_cpu; - - struct completion done; }; +static int migration_cpu_stop(void *data); + /* * The 
task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ -static int -migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) +static bool migrate_task(struct task_struct *p, int dest_cpu) { struct rq *rq = task_rq(p); @@ -2059,15 +2054,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) * If the task is not on a runqueue (and not running), then * the next wake-up will properly place the task. */ - if (!p->se.on_rq && !task_running(rq, p)) - return 0; - - init_completion(&req->done); - req->task = p; - req->dest_cpu = dest_cpu; - list_add(&req->list, &rq->migration_queue); - - return 1; + return p->se.on_rq || task_running(rq, p); } /* @@ -3110,7 +3097,6 @@ static void update_cpu_load(struct rq *this_rq) void sched_exec(void) { struct task_struct *p = current; - struct migration_req req; unsigned long flags; struct rq *rq; int dest_cpu; @@ -3124,17 +3110,11 @@ void sched_exec(void) * select_task_rq() can race against ->cpus_allowed */ if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && - likely(cpu_active(dest_cpu)) && - migrate_task(p, dest_cpu, &req)) { - /* Need to wait for migration thread (might exit: take ref). */ - struct task_struct *mt = rq->migration_thread; + likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); - + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); return; } unlock: @@ -5290,17 +5270,15 @@ static inline void sched_init_granularity(void) /* * This is how migration works: * - * 1) we queue a struct migration_req structure in the source CPU's - * runqueue and wake up that CPU's migration thread. - * 2) we down() the locked semaphore => thread blocks. - * 3) migration thread wakes up (implicitly it forces the migrated - * thread off the CPU) - * 4) it gets the migration request and checks whether the migrated - * task is still in the wrong runqueue. - * 5) if it's in the wrong runqueue then the migration thread removes + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes * it and puts it into the right queue. - * 6) migration thread up()s the semaphore. - * 7) we wake up and the migration is done. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. */ /* @@ -5314,9 +5292,9 @@ static inline void sched_init_granularity(void) */ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - struct migration_req req; unsigned long flags; struct rq *rq; + unsigned int dest_cpu; int ret = 0; /* @@ -5354,15 +5332,12 @@ again: if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (migrate_task(p, dest_cpu)) { + struct migration_arg arg = { p, dest_cpu }; /* Need help from migration thread: drop lock and wait. 
*/ - struct task_struct *mt = rq->migration_thread; - - get_task_struct(mt); task_rq_unlock(rq, &flags); - wake_up_process(mt); - put_task_struct(mt); - wait_for_completion(&req.done); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); tlb_migrate_finish(p->mm); return 0; } @@ -5420,70 +5395,22 @@ fail: return ret; } -#define RCU_MIGRATION_IDLE 0 -#define RCU_MIGRATION_NEED_QS 1 -#define RCU_MIGRATION_GOT_QS 2 -#define RCU_MIGRATION_MUST_SYNC 3 - /* - * migration_thread - this is a highprio system thread that performs - * thread migration by bumping thread off CPU then 'pushing' onto - * another runqueue. + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. */ -static int migration_thread(void *data) +static int migration_cpu_stop(void *data) { - int badcpu; - int cpu = (long)data; - struct rq *rq; - - rq = cpu_rq(cpu); - BUG_ON(rq->migration_thread != current); - - set_current_state(TASK_INTERRUPTIBLE); - while (!kthread_should_stop()) { - struct migration_req *req; - struct list_head *head; - - raw_spin_lock_irq(&rq->lock); - - if (cpu_is_offline(cpu)) { - raw_spin_unlock_irq(&rq->lock); - break; - } - - if (rq->active_balance) { - active_load_balance(rq, cpu); - rq->active_balance = 0; - } - - head = &rq->migration_queue; - - if (list_empty(head)) { - raw_spin_unlock_irq(&rq->lock); - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - continue; - } - req = list_entry(head->next, struct migration_req, list); - list_del_init(head->next); - - if (req->task != NULL) { - raw_spin_unlock(&rq->lock); - __migrate_task(req->task, cpu, req->dest_cpu); - } else if (likely(cpu == (badcpu = smp_processor_id()))) { - req->dest_cpu = RCU_MIGRATION_GOT_QS; - raw_spin_unlock(&rq->lock); - } else { - req->dest_cpu = RCU_MIGRATION_MUST_SYNC; - raw_spin_unlock(&rq->lock); - WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); - } - local_irq_enable(); - - complete(&req->done); - } - __set_current_state(TASK_RUNNING); + struct migration_arg *arg = data; + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); + local_irq_enable(); return 0; } @@ -5850,35 +5777,20 @@ static void set_rq_offline(struct rq *rq) static int __cpuinit migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { - struct task_struct *p; int cpu = (long)hcpu; unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); - if (IS_ERR(p)) - return NOTIFY_BAD; - kthread_bind(p, cpu); - /* Must be high prio: stop_machine expects to yield to it. */ - rq = task_rq_lock(p, &flags); - __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - task_rq_unlock(rq, &flags); - get_task_struct(p); - cpu_rq(cpu)->migration_thread = p; rq->calc_load_update = calc_load_update; break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - /* Strictly unnecessary, as first user will wake it. 
*/ - wake_up_process(cpu_rq(cpu)->migration_thread); - /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -5889,25 +5801,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!cpu_rq(cpu)->migration_thread) - break; - /* Unbind it from offline cpu so it can run. Fall thru. */ - kthread_bind(cpu_rq(cpu)->migration_thread, - cpumask_any(cpu_online_mask)); - kthread_stop(cpu_rq(cpu)->migration_thread); - put_task_struct(cpu_rq(cpu)->migration_thread); - cpu_rq(cpu)->migration_thread = NULL; - break; - case CPU_DEAD: case CPU_DEAD_FROZEN: migrate_live_tasks(cpu); - rq = cpu_rq(cpu); - kthread_stop(rq->migration_thread); - put_task_struct(rq->migration_thread); - rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ raw_spin_lock_irq(&rq->lock); deactivate_task(rq, rq->idle, 0); @@ -5918,29 +5814,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); calc_global_load_remove(rq); - /* - * No need to migrate the tasks: it was best-effort if - * they didn't take sched_hotcpu_mutex. Just wake up - * the requestors. - */ - raw_spin_lock_irq(&rq->lock); - while (!list_empty(&rq->migration_queue)) { - struct migration_req *req; - - req = list_entry(rq->migration_queue.next, - struct migration_req, list); - list_del_init(&req->list); - raw_spin_unlock_irq(&rq->lock); - complete(&req->done); - raw_spin_lock_irq(&rq->lock); - } - raw_spin_unlock_irq(&rq->lock); break; case CPU_DYING: case CPU_DYING_FROZEN: /* Update our root-domain */ - rq = cpu_rq(cpu); raw_spin_lock_irqsave(&rq->lock, flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7757,10 +7635,8 @@ void __init sched_init(void) rq->push_cpu = 0; rq->cpu = i; rq->online = 0; - rq->migration_thread = NULL; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - INIT_LIST_HEAD(&rq->migration_queue); rq_attach_root(rq, &def_root_domain); #endif init_rq_hrtick(rq); @@ -9054,43 +8930,39 @@ struct cgroup_subsys cpuacct_subsys = { #ifndef CONFIG_SMP -int rcu_expedited_torture_stats(char *page) -{ - return 0; -} -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - void synchronize_sched_expedited(void) { + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. 
+ */ + smp_mb(); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); #else /* #ifndef CONFIG_SMP */ -static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); -static DEFINE_MUTEX(rcu_sched_expedited_mutex); - -#define RCU_EXPEDITED_STATE_POST -2 -#define RCU_EXPEDITED_STATE_IDLE -1 +static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); -static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - -int rcu_expedited_torture_stats(char *page) +static int synchronize_sched_expedited_cpu_stop(void *data) { - int cnt = 0; - int cpu; + static DEFINE_SPINLOCK(done_mask_lock); + struct cpumask *done_mask = data; - cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); - for_each_online_cpu(cpu) { - cnt += sprintf(&page[cnt], " %d:%d", - cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); + if (done_mask) { + spin_lock(&done_mask_lock); + cpumask_set_cpu(smp_processor_id(), done_mask); + spin_unlock(&done_mask_lock); } - cnt += sprintf(&page[cnt], "\n"); - return cnt; + return 0; } -EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); - -static long synchronize_sched_expedited_count; /* * Wait for an rcu-sched grace period to elapse, but use "big hammer" @@ -9104,60 +8976,55 @@ static long synchronize_sched_expedited_count; */ void synchronize_sched_expedited(void) { - int cpu; - unsigned long flags; - bool need_full_sync = 0; - struct rq *rq; - struct migration_req *req; - long snap; - int trycount = 0; + cpumask_var_t done_mask_var; + struct cpumask *done_mask = NULL; + int snap, trycount = 0; + + /* + * done_mask is used to check that all cpus actually have + * finished running the stopper, which is guaranteed by + * stop_cpus() if it's called with cpu hotplug blocked. Keep + * the paranoia for now but it's best effort if cpumask is off + * stack. + */ + if (zalloc_cpumask_var(&done_mask_var, GFP_ATOMIC)) + done_mask = done_mask_var; smp_mb(); /* ensure prior mod happens before capturing snap. 
*/ - snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; + snap = atomic_read(&synchronize_sched_expedited_count) + 1; get_online_cpus(); - while (!mutex_trylock(&rcu_sched_expedited_mutex)) { + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + done_mask) == -EAGAIN) { put_online_cpus(); if (trycount++ < 10) udelay(trycount * num_online_cpus()); else { synchronize_sched(); - return; + goto free_out; } - if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { + if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { smp_mb(); /* ensure test happens before caller kfree */ - return; + goto free_out; } get_online_cpus(); } - rcu_expedited_state = RCU_EXPEDITED_STATE_POST; - for_each_online_cpu(cpu) { - rq = cpu_rq(cpu); - req = &per_cpu(rcu_migration_req, cpu); - init_completion(&req->done); - req->task = NULL; - req->dest_cpu = RCU_MIGRATION_NEED_QS; - raw_spin_lock_irqsave(&rq->lock, flags); - list_add(&req->list, &rq->migration_queue); - raw_spin_unlock_irqrestore(&rq->lock, flags); - wake_up_process(rq->migration_thread); - } - for_each_online_cpu(cpu) { - rcu_expedited_state = cpu; - req = &per_cpu(rcu_migration_req, cpu); - rq = cpu_rq(cpu); - wait_for_completion(&req->done); - raw_spin_lock_irqsave(&rq->lock, flags); - if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) - need_full_sync = 1; - req->dest_cpu = RCU_MIGRATION_IDLE; - raw_spin_unlock_irqrestore(&rq->lock, flags); - } - rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; - synchronize_sched_expedited_count++; - mutex_unlock(&rcu_sched_expedited_mutex); + atomic_inc(&synchronize_sched_expedited_count); + if (done_mask) + cpumask_xor(done_mask, done_mask, cpu_online_mask); put_online_cpus(); - if (need_full_sync) + + /* paranoia - this can't happen */ + if (done_mask && cpumask_weight(done_mask)) { + char buf[80]; + + cpulist_scnprintf(buf, sizeof(buf), done_mask); + WARN_ONCE(1, "synchronize_sched_expedited: cpu online and done masks disagree on %d cpus: %s\n", + cpumask_weight(done_mask), buf); synchronize_sched(); + } +free_out: + free_cpumask_var(done_mask_var); } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cbd8b8a296d..217e4a9393e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2798,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } +static int active_load_balance_cpu_stop(void *data); + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2887,8 +2889,9 @@ redo: if (need_active_balance(sd, sd_idle, idle)) { raw_spin_lock_irqsave(&busiest->lock, flags); - /* don't kick the migration_thread, if the curr - * task on busiest cpu can't be moved to this_cpu + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu */ if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { @@ -2898,14 +2901,22 @@ redo: goto out_one_pinned; } + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished. 
+ */ if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; active_balance = 1; } raw_spin_unlock_irqrestore(&busiest->lock, flags); + if (active_balance) - wake_up_process(busiest->migration_thread); + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); /* * We've kicked active balancing, reset the failure @@ -3012,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq) } /* - * active_load_balance is run by migration threads. It pushes running tasks - * off the busiest CPU onto idle CPUs. It requires at least 1 task to be - * running on each physical CPU where possible, and avoids physical / - * logical imbalances. - * - * Called with busiest_rq locked. + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. */ -static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) +static int active_load_balance_cpu_stop(void *data) { + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; - struct rq *target_rq; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; /* Is there any task to move? */ if (busiest_rq->nr_running <= 1) - return; - - target_rq = cpu_rq(target_cpu); + goto out_unlock; /* * This condition is "impossible", if it occurs @@ -3058,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_failed); } double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; } #ifdef CONFIG_NO_HZ diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 884c7a1afee..5b20141a5ec 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -301,7 +301,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: BUG_ON(stopper->thread || stopper->enabled || !list_empty(&stopper->works)); - p = kthread_create(cpu_stopper_thread, stopper, "stopper/%d", + p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; -- cgit v1.2.3 From d21670acab9fcb4bc74a40b68a6941059234c55c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 14 Apr 2010 17:39:26 -0700 Subject: rcu: reduce the number of spurious RCU_SOFTIRQ invocations Lai Jiangshan noted that up to 10% of the RCU_SOFTIRQ are spurious, and traced this down to the fact that the current grace-period machinery will uselessly raise RCU_SOFTIRQ when a given CPU needs to go through a quiescent state, but has not yet done so. In this situation, there might well be nothing that RCU_SOFTIRQ can do, and the overhead can be worth worrying about in the ksoftirqd case. This patch therefore avoids raising RCU_SOFTIRQ in this situation. Changes since v1 (http://lkml.org/lkml/2010/3/30/122 from Lai Jiangshan): o Omit the rcu_qs_pending() prechecks, as they aren't that much less expensive than the quiescent-state checks. o Merge with the set_need_resched() patch that reduces IPIs. 
o Add the new n_rp_report_qs field to the rcu_pending tracing output. o Update the tracing documentation accordingly. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 35 +++++++++++++++++++---------------- kernel/rcutree.c | 11 ++++++----- kernel/rcutree.h | 1 + kernel/rcutree_trace.c | 4 +++- 4 files changed, 29 insertions(+), 22 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 8608fd85e92..efd8cc95c06 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -256,23 +256,23 @@ o Each element of the form "1/1 0:127 ^0" represents one struct The output of "cat rcu/rcu_pending" looks as follows: rcu_sched: - 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 - 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 - 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 - 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 - 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 - 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 - 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 - 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 + 0 np=255892 qsp=53936 rpq=85 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 + 1 np=261224 qsp=54638 rpq=33 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 + 2 np=237496 qsp=49664 rpq=23 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 + 3 np=236249 qsp=48766 rpq=98 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 + 4 np=221310 qsp=46850 rpq=7 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 + 5 np=237332 qsp=48449 rpq=9 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 + 6 np=219995 qsp=46718 rpq=12 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 + 7 np=249893 qsp=49390 rpq=42 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 rcu_bh: - 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 - 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 - 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 - 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 - 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 - 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 - 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 - 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 + 0 np=146741 qsp=1419 rpq=6 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 + 1 np=155792 qsp=12597 rpq=3 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 + 2 np=136629 qsp=18680 rpq=1 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 + 3 np=137723 qsp=2843 rpq=0 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 + 4 np=123110 qsp=12433 rpq=0 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 + 5 np=137456 qsp=4210 rpq=1 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 + 6 np=120834 qsp=9902 rpq=2 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 + 7 np=144888 qsp=26336 rpq=0 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 As always, this is once again split into "rcu_sched" and "rcu_bh" portions, with CONFIG_TREE_PREEMPT_RCU kernels having an additional @@ -284,6 +284,9 @@ o "np" is the number of times that __rcu_pending() has been invoked o "qsp" is the number of times that the RCU was waiting for a quiescent state from this CPU. +o "rpq" is the number of times that the CPU had passed through + a quiescent state, but not yet reported it to RCU. 
+ o "cbr" is the number of times that this CPU had RCU callbacks that had passed through a grace period, and were thus ready to be invoked. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c60fd74e7ec..ba6996943e2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1161,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - if (!rcu_pending(cpu)) - return; /* if nothing for RCU to do. */ if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1194,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user) rcu_bh_qs(cpu); } rcu_preempt_check_callbacks(cpu); - raise_softirq(RCU_SOFTIRQ); + if (rcu_pending(cpu)) + raise_softirq(RCU_SOFTIRQ); } #ifdef CONFIG_SMP @@ -1534,18 +1533,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) { + if (rdp->qs_pending && !rdp->passed_quiesc) { /* * If force_quiescent_state() coming soon and this CPU * needs a quiescent state, and this is either RCU-sched * or RCU-bh, force a local reschedule. */ + rdp->n_rp_qs_pending++; if (!rdp->preemptable && ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); - rdp->n_rp_qs_pending++; + } else if (rdp->qs_pending && rdp->passed_quiesc) { + rdp->n_rp_report_qs++; return 1; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 11f171121ad..14c040b18ed 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -223,6 +223,7 @@ struct rcu_data { /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ unsigned long n_rp_qs_pending; + unsigned long n_rp_report_qs; unsigned long n_rp_cb_ready; unsigned long n_rp_cpu_needs_gp; unsigned long n_rp_gp_completed; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d45db2e35d2..36c95b45738 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = { static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) { seq_printf(m, "%3d%cnp=%ld " - "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + "qsp=%ld rpq=%ld cbr=%ld cng=%ld " + "gpc=%ld gps=%ld nf=%ld nn=%ld\n", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->n_rcu_pending, rdp->n_rp_qs_pending, + rdp->n_rp_report_qs, rdp->n_rp_cb_ready, rdp->n_rp_cpu_needs_gp, rdp->n_rp_gp_completed, -- cgit v1.2.3 From f1d507beeab1d1d60a1c58eac7dc81522c6f4629 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 15 Apr 2010 15:49:46 -0700 Subject: rcu: improve the RCU CPU-stall warning documentation The existing Documentation/RCU/stallwarn.txt has proven unhelpful, so rework it a bit. In particular, show how to interpret the stall-warning messages. Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.txt | 94 +++++++++++++++++++++++++++++++---------- 1 file changed, 71 insertions(+), 23 deletions(-) (limited to 'Documentation') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1423d2570d7..44c6dcc93d6 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -3,35 +3,79 @@ Using RCU's CPU Stall Detector The CONFIG_RCU_CPU_STALL_DETECTOR kernel config parameter enables RCU's CPU stall detector, which detects conditions that unduly delay RCU grace periods. 
The stall detector's idea of what constitutes -"unduly delayed" is controlled by a pair of C preprocessor macros: +"unduly delayed" is controlled by a set of C preprocessor macros: RCU_SECONDS_TILL_STALL_CHECK This macro defines the period of time that RCU will wait from the beginning of a grace period until it issues an RCU CPU - stall warning. It is normally ten seconds. + stall warning. This time period is normally ten seconds. RCU_SECONDS_TILL_STALL_RECHECK This macro defines the period of time that RCU will wait after - issuing a stall warning until it issues another stall warning. - It is normally set to thirty seconds. + issuing a stall warning until it issues another stall warning + for the same stall. This time period is normally set to thirty + seconds. RCU_STALL_RAT_DELAY - The CPU stall detector tries to make the offending CPU rat on itself, - as this often gives better-quality stack traces. However, if - the offending CPU does not detect its own stall in the number - of jiffies specified by RCU_STALL_RAT_DELAY, then other CPUs will - complain. This is normally set to two jiffies. + The CPU stall detector tries to make the offending CPU print its + own warnings, as this often gives better-quality stack traces. + However, if the offending CPU does not detect its own stall in + the number of jiffies specified by RCU_STALL_RAT_DELAY, then + some other CPU will complain. This delay is normally set to + two jiffies. -The following problems can result in an RCU CPU stall warning: +When a CPU detects that it is stalling, it will print a message similar +to the following: + +INFO: rcu_sched_state detected stall on CPU 5 (t=2500 jiffies) + +This message indicates that CPU 5 detected that it was causing a stall, +and that the stall was affecting RCU-sched. This message will normally be +followed by a stack dump of the offending CPU. On TREE_RCU kernel builds, +RCU and RCU-sched are implemented by the same underlying mechanism, +while on TREE_PREEMPT_RCU kernel builds, RCU is instead implemented +by rcu_preempt_state. + +On the other hand, if the offending CPU fails to print out a stall-warning +message quickly enough, some other CPU will print a message similar to +the following: + +INFO: rcu_bh_state detected stalls on CPUs/tasks: { 3 5 } (detected by 2, 2502 jiffies) + +This message indicates that CPU 2 detected that CPUs 3 and 5 were both +causing stalls, and that the stall was affecting RCU-bh. This message +will normally be followed by stack dumps for each CPU. Please note that +TREE_PREEMPT_RCU builds can be stalled by tasks as well as by CPUs, +and that the tasks will be indicated by PID, for example, "P3421". +It is even possible for a rcu_preempt_state stall to be caused by both +CPUs -and- tasks, in which case the offending CPUs and tasks will all +be called out in the list. + +Finally, if the grace period ends just as the stall warning starts +printing, there will be a spurious stall-warning message: + +INFO: rcu_bh_state detected stalls on CPUs/tasks: { } (detected by 4, 2502 jiffies) + +This is rare, but does happen from time to time in real life. + +So your kernel printed an RCU CPU stall warning. The next question is +"What caused it?" The following problems can result in RCU CPU stall +warnings: o A CPU looping in an RCU read-side critical section. -o A CPU looping with interrupts disabled. +o A CPU looping with interrupts disabled. This condition can + result in RCU-sched and RCU-bh stalls. -o A CPU looping with preemption disabled. 
+o A CPU looping with preemption disabled. This condition can + result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh + stalls. + +o A CPU looping with bottom halves disabled. This condition can + result in RCU-sched and RCU-bh stalls. o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel without invoking schedule(). @@ -39,20 +83,24 @@ o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel o A bug in the RCU implementation. o A hardware failure. This is quite unlikely, but has occurred - at least once in a former life. A CPU failed in a running system, + at least once in real life. A CPU failed in a running system, becoming unresponsive, but not causing an immediate crash. This resulted in a series of RCU CPU stall warnings, eventually leading the realization that the CPU had failed. -The RCU, RCU-sched, and RCU-bh implementations have CPU stall warning. -SRCU does not do so directly, but its calls to synchronize_sched() will -result in RCU-sched detecting any CPU stalls that might be occurring. - -To diagnose the cause of the stall, inspect the stack traces. The offending -function will usually be near the top of the stack. If you have a series -of stall warnings from a single extended stall, comparing the stack traces -can often help determine where the stall is occurring, which will usually -be in the function nearest the top of the stack that stays the same from -trace to trace. +The RCU, RCU-sched, and RCU-bh implementations have CPU stall +warning. SRCU does not have its own CPU stall warnings, but its +calls to synchronize_sched() will result in RCU-sched detecting +RCU-sched-related CPU stalls. Please note that RCU only detects +CPU stalls when there is a grace period in progress. No grace period, +no CPU stall warnings. + +To diagnose the cause of the stall, inspect the stack traces. +The offending function will usually be near the top of the stack. +If you have a series of stall warnings from a single extended stall, +comparing the stack traces can often help determine where the stall +is occurring, which will usually be in the function nearest the top of +that portion of the stack which remains the same from trace to trace. +If you can reliably trigger the stall, ftrace can be quite helpful. RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE. -- cgit v1.2.3 From a52357259680fe5368c2fabf5949209e231f2aa2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 11 May 2010 17:12:33 +0200 Subject: x86/amd-iommu: Add amd_iommu=off command line option This patch adds a command line option to tell the AMD IOMMU driver to not initialize any IOMMU it finds. Signed-off-by: Joerg Roedel --- Documentation/kernel-parameters.txt | 2 ++ arch/x86/kernel/amd_iommu_init.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'Documentation') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 839b21b0699..0c6c56076d1 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -324,6 +324,8 @@ and is between 256 and 4096 characters. It is defined in the file they are unmapped. 
Otherwise they are flushed before they will be reused, which is a lot of faster + off - do not initialize any AMD IOMMU found in + the system amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6360abf993d..3bacb4d0844 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -120,6 +120,7 @@ struct ivmd_header { bool amd_iommu_dump; static int __initdata amd_iommu_detected; +static bool __initdata amd_iommu_disabled; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ @@ -1372,6 +1373,9 @@ void __init amd_iommu_detect(void) if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; + if (amd_iommu_disabled) + return; + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; @@ -1401,6 +1405,8 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; + if (strncmp(str, "off", 3) == 0) + amd_iommu_disabled = true; } return 1; -- cgit v1.2.3
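A brief usage sketch, not part of the patch itself: on a machine whose AMD IOMMU should be left uninitialized, boot with

	amd_iommu=off

on the kernel command line. As the hunk above shows, parse_amd_iommu_options() simply scans the option string for known keywords, so the pre-existing fullflush keyword keeps its old meaning alongside the new off keyword.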