author     John Stultz <john.stultz@linaro.org>  2011-04-05 12:21:10 -0700
committer  John Stultz <john.stultz@linaro.org>  2011-04-05 12:21:10 -0700
commit     4ce7ea0bfbb301ffb79154b6cecd2ef030db4cdf
tree       a7ad87580793912a9da208e7a9ea21d6c7768f08 /kernel
parent     4450182f400f1a5f50b1680faec25af2315c2849
parent     7c4bc9c2662c6d9840afed0e29eb01314af9bb78
Merge branch 'upstream/linaro.38' into linaro-android.38
Conflicts:
arch/arm/kernel/signal.c
drivers/mmc/card/block.c
drivers/mtd/nand/Kconfig
include/linux/amba/mmci.h
kernel/printk.c
mm/shmem.c
net/socket.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                      |    4
-rw-r--r--  kernel/cgroup.c                      |   12
-rw-r--r--  kernel/exit.c                        |    4
-rw-r--r--  kernel/fork.c                        |   10
-rw-r--r--  kernel/irq/handle.c                  |    8
-rw-r--r--  kernel/irq/irqdesc.c                 |    2
-rw-r--r--  kernel/itimer.c                      |    7
-rw-r--r--  kernel/kexec.c                       |    8
-rw-r--r--  kernel/lockdep.c                     |   22
-rw-r--r--  kernel/ltt-channels.c                |  388
-rw-r--r--  kernel/marker.c                      | 1262
-rw-r--r--  kernel/module.c                      |  110
-rw-r--r--  kernel/notifier.c                    |   31
-rw-r--r--  kernel/panic.c                       |    7
-rw-r--r--  kernel/perf_event.c                  |   17
-rw-r--r--  kernel/printk.c                      |    7
-rw-r--r--  kernel/rcutree.c                     |    7
-rw-r--r--  kernel/sched.c                       |   56
-rw-r--r--  kernel/signal.c                      |   16
-rw-r--r--  kernel/smp.c                         |   55
-rw-r--r--  kernel/softirq.c                     |   27
-rw-r--r--  kernel/sysctl.c                      |   18
-rw-r--r--  kernel/time/Makefile                 |    1
-rw-r--r--  kernel/time/tsc-sync.c               |  313
-rw-r--r--  kernel/timer.c                       |   16
-rw-r--r--  kernel/trace/Makefile                |    2
-rw-r--r--  kernel/trace/ftrace.c                |   52
-rw-r--r--  kernel/trace/trace-clock-32-to-64.c  |  296
-rw-r--r--  kernel/trace/trace-clock.c           |   97
-rw-r--r--  kernel/trace/trace_printk.c          |    1
30 files changed, 2811 insertions, 45 deletions
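Most of the new code in the diff below is the LTTng marker/channel infrastructure (kernel/marker.c, kernel/ltt-channels.c). For orientation only, a minimal user of that API might look roughly like the sketch below. The channel name "mychan", event name "myevent" and the module itself are invented; only the marker_probe_register()/marker_probe_unregister() signatures, the probe prototype and the trace_mark(channel, name, format, ...) form are taken from the marker.c code added by this merge.

/*
 * Hypothetical example module: instruments one site with trace_mark()
 * and connects a probe to it.  Assumes the <linux/marker.h> shipped in
 * this tree; "mychan"/"myevent" are made-up names.
 */
#include <linux/module.h>
#include <linux/marker.h>

/* Marker site: channel "mychan", event "myevent", one int argument. */
static void do_work(int value)
{
	trace_mark(mychan, myevent, "value %d", value);
}

/* Probe: receives the marker format string and the argument list. */
static void my_probe(const struct marker *mdata, void *probe_private,
		     void *call_private, const char *fmt, va_list *args)
{
	/* Decode *args according to fmt, or hand it off to a tracer. */
}

static int __init example_init(void)
{
	int ret;

	/* Format string must match the one used at the trace_mark() site. */
	ret = marker_probe_register("mychan", "myevent", "value %d",
				    my_probe, NULL);
	if (ret)
		return ret;
	do_work(42);	/* fires the marker, invoking my_probe() */
	return 0;
}

static void __exit example_exit(void)
{
	marker_probe_unregister("mychan", "myevent", my_probe, NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");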
diff --git a/kernel/Makefile b/kernel/Makefile index 353d3fe8ba3..c039580ba3b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,6 +91,7 @@ obj-$(CONFIG_RELAY) += relay.o obj-$(CONFIG_SYSCTL) += utsname_sysctl.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o +obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_BINFMT_ELF) += elfcore.o @@ -99,7 +100,10 @@ obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_MARKERS) += ltt-channels.o obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace/ +obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace/ obj-$(CONFIG_TRACEPOINTS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_IRQ_WORK) += irq_work.o diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d96bd1eb562..d83723e7ee0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1820,10 +1820,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) /* Update the css_set linked lists if we're using them */ write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) { - list_del(&tsk->cg_list); - list_add(&tsk->cg_list, &newcg->tasks); - } + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); write_unlock(&css_set_lock); for_each_subsys(root, ss) { @@ -3670,12 +3668,12 @@ again: spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) - list_del(&cgrp->release_list); + list_del_init(&cgrp->release_list); spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ - list_del(&cgrp->sibling); + list_del_init(&cgrp->sibling); cgroup_unlock_hierarchy(cgrp->root); d = dget(cgrp->dentry); @@ -3893,7 +3891,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = NULL; /* remove subsystem from rootnode's list of subsystems */ - list_del(&ss->sibling); + list_del_init(&ss->sibling); /* * disentangle the css from all css_sets attached to the dummytop. as diff --git a/kernel/exit.c b/kernel/exit.c index f9a45ebcc7b..0d9a3444614 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -514,6 +514,8 @@ struct files_struct *get_files_struct(struct task_struct *task) return files; } +EXPORT_SYMBOL(get_files_struct); + void put_files_struct(struct files_struct *files) { struct fdtable *fdt; @@ -535,6 +537,8 @@ void put_files_struct(struct files_struct *files) } } +EXPORT_SYMBOL(put_files_struct); + void reset_files_struct(struct files_struct *files) { struct task_struct *tsk = current; diff --git a/kernel/fork.c b/kernel/fork.c index 515abda4084..131ce90d19d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -88,6 +88,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL(tasklist_lock); #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) @@ -1266,6 +1267,15 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); + /* + * The state of the parent's TIF_KTRACE flag may have changed + * since it was copied in dup_task_struct() so we re-copy it here. 
+ */ + if (test_thread_flag(TIF_KERNEL_TRACE)) + set_tsk_thread_flag(p, TIF_KERNEL_TRACE); + else + clear_tsk_thread_flag(p, TIF_KERNEL_TRACE); + /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 3540a719012..db864334a95 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include <linux/kernel_stat.h> #include <trace/events/irq.h> +#include <trace/irq.h> #include "internals.h" @@ -51,6 +52,9 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } +DEFINE_TRACE(irq_entry); +DEFINE_TRACE(irq_exit); + /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number @@ -63,6 +67,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) irqreturn_t ret, retval = IRQ_NONE; unsigned int status = 0; + trace_irq_entry(irq, NULL, action); + do { trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); @@ -116,5 +122,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) add_interrupt_randomness(irq); local_irq_disable(); + trace_irq_exit(retval); + return retval; } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2039bea31bd..1c07afd307f 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -109,6 +109,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } +EXPORT_SYMBOL_GPL(irq_to_desc); static void delete_irq_desc(unsigned int irq) { @@ -273,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } +EXPORT_SYMBOL_GPL(irq_to_desc); struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node) { diff --git a/kernel/itimer.c b/kernel/itimer.c index d802883153d..18fd8e919c0 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -13,9 +13,13 @@ #include <linux/posix-timers.h> #include <linux/hrtimer.h> #include <trace/events/timer.h> +#include <trace/timer.h> #include <asm/uaccess.h> +DEFINE_TRACE(timer_itimer_expired); +DEFINE_TRACE(timer_itimer_set); + /** * itimer_get_remtime - get remaining time for the timer * @@ -124,6 +128,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) container_of(timer, struct signal_struct, real_timer); trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); + trace_timer_itimer_expired(sig); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; @@ -201,6 +206,8 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) !timeval_valid(&value->it_interval)) return -EINVAL; + trace_timer_itimer_set(which, value); + switch (which) { case ITIMER_REAL: again: diff --git a/kernel/kexec.c b/kernel/kexec.c index ec19b92c7eb..779f0031929 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -33,6 +33,7 @@ #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/kmsg_dump.h> +#include <trace/kernel.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -40,6 +41,9 @@ #include <asm/system.h> #include <asm/sections.h> +DEFINE_TRACE(kernel_kernel_kexec); +DEFINE_TRACE(kernel_crash_kexec); + /* Per cpu memory for storing cpu states in case of system crash. 
*/ note_buf_t __percpu *crash_notes; @@ -1066,6 +1070,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, void crash_kexec(struct pt_regs *regs) { + trace_kernel_crash_kexec(kexec_crash_image, regs); + /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. @@ -1495,6 +1501,8 @@ int kernel_kexec(void) { int error = 0; + trace_kernel_kernel_kexec(kexec_image); + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; if (!kexec_image) { diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0d2058da80f..e0841c537db 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -49,6 +49,8 @@ #include "lockdep_internals.h" +#include <trace/lockdep.h> + #define CREATE_TRACE_POINTS #include <trace/events/lock.h> @@ -66,6 +68,13 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif +DEFINE_TRACE(lockdep_hardirqs_on); +DEFINE_TRACE(lockdep_hardirqs_off); +DEFINE_TRACE(lockdep_softirqs_on); +DEFINE_TRACE(lockdep_softirqs_off); +DEFINE_TRACE(lockdep_lock_acquire); +DEFINE_TRACE(lockdep_lock_release); + /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -2300,6 +2309,8 @@ void trace_hardirqs_on_caller(unsigned long ip) time_hardirqs_on(CALLER_ADDR0, ip); + trace_lockdep_hardirqs_on(ip); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2358,6 +2369,8 @@ void trace_hardirqs_off_caller(unsigned long ip) time_hardirqs_off(CALLER_ADDR0, ip); + trace_lockdep_hardirqs_off(ip); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2390,6 +2403,8 @@ void trace_softirqs_on(unsigned long ip) { struct task_struct *curr = current; + trace_lockdep_softirqs_on(ip); + if (unlikely(!debug_locks)) return; @@ -2424,6 +2439,8 @@ void trace_softirqs_off(unsigned long ip) { struct task_struct *curr = current; + trace_lockdep_softirqs_off(ip); + if (unlikely(!debug_locks)) return; @@ -2730,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int class_idx; u64 chain_key; + trace_lockdep_lock_acquire(ip, subclass, lock, trylock, read, + hardirqs_off); + if (!prove_locking) check = 1; @@ -3108,6 +3128,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; + trace_lockdep_lock_release(ip, lock, nested); + if (!check_unlock(curr, lock, ip)) return; diff --git a/kernel/ltt-channels.c b/kernel/ltt-channels.c new file mode 100644 index 00000000000..102513874ad --- /dev/null +++ b/kernel/ltt-channels.c @@ -0,0 +1,388 @@ +/* + * ltt/ltt-channels.c + * + * (C) Copyright 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * LTTng channel management. + * + * Author: + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Dual LGPL v2.1/GPL v2 license. + */ + +#include <linux/module.h> +#include <linux/ltt-channels.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +/* + * ltt_channel_mutex may be nested inside the LTT trace mutex. + * ltt_channel_mutex mutex may be nested inside markers mutex. + */ +static DEFINE_MUTEX(ltt_channel_mutex); +static LIST_HEAD(ltt_channels); +/* + * Index of next channel in array. Makes sure that as long as a trace channel is + * allocated, no array index will be re-used when a channel is freed and then + * another channel is allocated. 
This index is cleared and the array indexeds + * get reassigned when the index_kref goes back to 0, which indicates that no + * more trace channels are allocated. + */ +static unsigned int free_index; +/* index_kref is protected by both ltt_channel_mutex and lock_markers */ +static struct kref index_kref; /* Keeps track of allocated trace channels */ + +static struct ltt_channel_setting *lookup_channel(const char *name) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + if (strcmp(name, iter->name) == 0) + return iter; + return NULL; +} + +/* + * Must be called when channel refcount falls to 0 _and_ also when the last + * trace is freed. This function is responsible for compacting the channel and + * event IDs when no users are active. + * + * Called with lock_markers() and channels mutex held. + */ +static void release_channel_setting(struct kref *kref) +{ + struct ltt_channel_setting *setting = container_of(kref, + struct ltt_channel_setting, kref); + struct ltt_channel_setting *iter; + + if (atomic_read(&index_kref.refcount) == 0 + && atomic_read(&setting->kref.refcount) == 0) { + list_del(&setting->list); + kfree(setting); + + free_index = 0; + list_for_each_entry(iter, <t_channels, list) { + iter->index = free_index++; + iter->free_event_id = 0; + } + } +} + +/* + * Perform channel index compaction when the last trace channel is freed. + * + * Called with lock_markers() and channels mutex held. + */ +static void release_trace_channel(struct kref *kref) +{ + struct ltt_channel_setting *iter, *n; + + list_for_each_entry_safe(iter, n, <t_channels, list) + release_channel_setting(&iter->kref); + if (atomic_read(&index_kref.refcount) == 0) + markers_compact_event_ids(); +} + +/* + * ltt_channel_trace_ref : Is there an existing trace session ? + * + * Must be called with lock_markers() held. + */ +int ltt_channels_trace_ref(void) +{ + return !!atomic_read(&index_kref.refcount); +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_ref); + +/** + * ltt_channels_register - Register a trace channel. + * @name: channel name + * + * Uses refcounting. + */ +int ltt_channels_register(const char *name) +{ + struct ltt_channel_setting *setting; + int ret = 0; + + mutex_lock(<t_channel_mutex); + setting = lookup_channel(name); + if (setting) { + if (atomic_read(&setting->kref.refcount) == 0) + goto init_kref; + else { + kref_get(&setting->kref); + goto end; + } + } + setting = kzalloc(sizeof(*setting), GFP_KERNEL); + if (!setting) { + ret = -ENOMEM; + goto end; + } + list_add(&setting->list, <t_channels); + strncpy(setting->name, name, PATH_MAX-1); + setting->index = free_index++; +init_kref: + kref_init(&setting->kref); +end: + mutex_unlock(<t_channel_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(ltt_channels_register); + +/** + * ltt_channels_unregister - Unregister a trace channel. + * @name: channel name + * @compacting: performing compaction + * + * Must be called with markers mutex held. 
+ */ +int ltt_channels_unregister(const char *name, int compacting) +{ + struct ltt_channel_setting *setting; + int ret = 0; + + if (!compacting) + mutex_lock(<t_channel_mutex); + setting = lookup_channel(name); + if (!setting || atomic_read(&setting->kref.refcount) == 0) { + ret = -ENOENT; + goto end; + } + kref_put(&setting->kref, release_channel_setting); + if (!compacting && atomic_read(&index_kref.refcount) == 0) + markers_compact_event_ids(); +end: + if (!compacting) + mutex_unlock(<t_channel_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(ltt_channels_unregister); + +/** + * ltt_channels_set_default - Set channel default behavior. + * @name: default channel name + * @sb_size: size of the subbuffers + * @n_sb: number of subbuffers + */ +int ltt_channels_set_default(const char *name, + unsigned int sb_size, + unsigned int n_sb) +{ + struct ltt_channel_setting *setting; + int ret = 0; + + mutex_lock(<t_channel_mutex); + setting = lookup_channel(name); + if (!setting || atomic_read(&setting->kref.refcount) == 0) { + ret = -ENOENT; + goto end; + } + setting->sb_size = sb_size; + setting->n_sb = n_sb; +end: + mutex_unlock(<t_channel_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(ltt_channels_set_default); + +/** + * ltt_channels_get_name_from_index - get channel name from channel index + * @index: channel index + * + * Allows to lookup the channel name given its index. Done to keep the name + * information outside of each trace channel instance. + */ +const char *ltt_channels_get_name_from_index(unsigned int index) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + if (iter->index == index && atomic_read(&iter->kref.refcount)) + return iter->name; + return NULL; +} +EXPORT_SYMBOL_GPL(ltt_channels_get_name_from_index); + +static struct ltt_channel_setting * +ltt_channels_get_setting_from_name(const char *name) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + if (!strcmp(iter->name, name) + && atomic_read(&iter->kref.refcount)) + return iter; + return NULL; +} + +/** + * ltt_channels_get_index_from_name - get channel index from channel name + * @name: channel name + * + * Allows to lookup the channel index given its name. Done to keep the name + * information outside of each trace channel instance. + * Returns -1 if not found. + */ +int ltt_channels_get_index_from_name(const char *name) +{ + struct ltt_channel_setting *setting; + + setting = ltt_channels_get_setting_from_name(name); + if (setting) + return setting->index; + else + return -1; +} +EXPORT_SYMBOL_GPL(ltt_channels_get_index_from_name); + +/** + * ltt_channels_trace_alloc - Allocate channel structures for a trace + * @sb_size: subbuffer size. 0 uses default. + * @n_sb: number of subbuffers per per-cpu buffers. 0 uses default. + * @flags: Default channel flags + * + * Use the current channel list to allocate the channels for a trace. + * Called with trace lock held. Does not perform the trace buffer allocation, + * because we must let the user overwrite specific channel sizes. 
+ */ +struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels, + int overwrite, int active) +{ + struct ltt_chan *chan = NULL; + struct ltt_channel_setting *iter; + + lock_markers(); + mutex_lock(<t_channel_mutex); + if (!free_index) + goto end; + if (!atomic_read(&index_kref.refcount)) + kref_init(&index_kref); + else + kref_get(&index_kref); + *nr_channels = free_index; + chan = kzalloc(sizeof(struct ltt_chan) * free_index, GFP_KERNEL); + if (!chan) + goto end; + list_for_each_entry(iter, <t_channels, list) { + if (!atomic_read(&iter->kref.refcount)) + continue; + chan[iter->index].a.sb_size = iter->sb_size; + chan[iter->index].a.n_sb = iter->n_sb; + chan[iter->index].overwrite = overwrite; + chan[iter->index].active = active; + strncpy(chan[iter->index].a.filename, iter->name, NAME_MAX - 1); + chan[iter->index].switch_timer_interval = 0; + } +end: + mutex_unlock(<t_channel_mutex); + unlock_markers(); + return chan; +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_alloc); + +/** + * ltt_channels_trace_free - Free one trace's channels + * @channels: channels to free + * + * Called with trace lock held. The actual channel buffers must be freed before + * this function is called. + */ +void ltt_channels_trace_free(struct ltt_chan *channels, + unsigned int nr_channels) +{ + lock_markers(); + mutex_lock(<t_channel_mutex); + kfree(channels); + kref_put(&index_kref, release_trace_channel); + mutex_unlock(<t_channel_mutex); + unlock_markers(); + marker_update_probes(); +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_free); + +/** + * ltt_channels_trace_set_timer - set switch timer + * @channel: channel + * @interval: interval of timer interrupt, in jiffies. 0 inhibits timer. + */ + +void ltt_channels_trace_set_timer(struct ltt_chan *chan, + unsigned long interval) +{ + chan->switch_timer_interval = interval; +} +EXPORT_SYMBOL_GPL(ltt_channels_trace_set_timer); + +/** + * _ltt_channels_get_event_id - get next event ID for a marker + * @channel: channel name + * @name: event name + * + * Returns a unique event ID (for this channel) or < 0 on error. + * Must be called with channels mutex held. + */ +int _ltt_channels_get_event_id(const char *channel, const char *name) +{ + struct ltt_channel_setting *setting; + int ret; + + setting = ltt_channels_get_setting_from_name(channel); + if (!setting) { + ret = -ENOENT; + goto end; + } + if (strcmp(channel, "metadata") == 0) { + if (strcmp(name, "core_marker_id") == 0) + ret = 0; + else if (strcmp(name, "core_marker_format") == 0) + ret = 1; + else + ret = -ENOENT; + goto end; + } + if (setting->free_event_id == EVENTS_PER_CHANNEL - 1) { + ret = -ENOSPC; + goto end; + } + ret = setting->free_event_id++; +end: + return ret; +} + +/** + * ltt_channels_get_event_id - get next event ID for a marker + * @channel: channel name + * @name: event name + * + * Returns a unique event ID (for this channel) or < 0 on error. + */ +int ltt_channels_get_event_id(const char *channel, const char *name) +{ + int ret; + + mutex_lock(<t_channel_mutex); + ret = _ltt_channels_get_event_id(channel, name); + mutex_unlock(<t_channel_mutex); + return ret; +} + +/** + * ltt_channels_reset_event_ids - reset event IDs at compaction + * + * Called with lock marker and channel mutex held. 
+ */ +void _ltt_channels_reset_event_ids(void) +{ + struct ltt_channel_setting *iter; + + list_for_each_entry(iter, <t_channels, list) + iter->free_event_id = 0; +} + +MODULE_LICENSE("GPL and additional rights"); +MODULE_AUTHOR("Mathieu Desnoyers"); +MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Channel Management"); diff --git a/kernel/marker.c b/kernel/marker.c new file mode 100644 index 00000000000..eac8ebfc3b9 --- /dev/null +++ b/kernel/marker.c @@ -0,0 +1,1262 @@ +/* + * Copyright (C) 2007 Mathieu Desnoyers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/types.h> +#include <linux/jhash.h> +#include <linux/hash.h> +#include <linux/list.h> +#include <linux/rcupdate.h> +#include <linux/marker.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/immediate.h> +#include <linux/ltt-channels.h> + +extern struct marker __start___markers[]; +extern struct marker __stop___markers[]; + +/* Set to 1 to enable marker debug output */ +static const int marker_debug; + +/* + * markers_mutex nests inside module_mutex. Markers mutex protects the builtin + * and module markers and the hash table. + * markers_mutex nests inside the trace lock, to ensure event ID consistency + * between the hash table and the marker section. + */ +static DEFINE_MUTEX(markers_mutex); + +void lock_markers(void) +{ + mutex_lock(&markers_mutex); +} +EXPORT_SYMBOL_GPL(lock_markers); + +void unlock_markers(void) +{ + mutex_unlock(&markers_mutex); +} +EXPORT_SYMBOL_GPL(unlock_markers); + +/* + * Marker hash table, containing the active markers. + * Protected by module_mutex. + */ +#define MARKER_HASH_BITS 6 +#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) +static struct hlist_head marker_table[MARKER_TABLE_SIZE]; +static struct hlist_head id_table[MARKER_TABLE_SIZE]; + +struct marker_probe_array { + struct rcu_head rcu; + struct marker_probe_closure c[0]; +}; + +/* + * Note about RCU : + * It is used to make sure every handler has finished using its private data + * between two consecutive operation (add or remove) on a given marker. It is + * also used to delay the free of multiple probes array until a quiescent state + * is reached. + * marker entries modifications are protected by the markers_mutex. + */ +struct marker_entry { + struct hlist_node hlist; + struct hlist_node id_list; + char *format; + char *name; + /* Probe wrapper */ + void (*call)(const struct marker *mdata, void *call_private, ...); + struct marker_probe_closure single; + struct marker_probe_array *multi; + int refcount; /* Number of times armed. 0 if disarmed. 
*/ + u16 channel_id; + u16 event_id; + unsigned char ptype:1; + unsigned char format_allocated:1; + char channel[0]; /* Contains channel'\0'name'\0'format'\0' */ +}; + +/** + * __mark_empty_function - Empty probe callback + * @mdata: marker data + * @probe_private: probe private data + * @call_private: call site private data + * @fmt: format string + * @...: variable argument list + * + * Empty callback provided as a probe to the markers. By providing this to a + * disabled marker, we make sure the execution flow is always valid even + * though the function pointer change and the marker enabling are two distinct + * operations that modifies the execution flow of preemptible code. + */ +notrace void __mark_empty_function(const struct marker *mdata, + void *probe_private, void *call_private, const char *fmt, va_list *args) +{ +} +EXPORT_SYMBOL_GPL(__mark_empty_function); + +/* + * marker_probe_cb Callback that prepares the variable argument list for probes. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @...: Variable argument list. + * + * Since we do not use "typical" pointer based RCU in the 1 argument case, we + * need to put a full smp_rmb() in this branch. This is why we do not use + * rcu_dereference() for the pointer read. + */ +notrace void marker_probe_cb(const struct marker *mdata, + void *call_private, ...) +{ + va_list args; + char ptype; + + /* + * rcu_read_lock_sched does two things : disabling preemption to make + * sure the teardown of the callbacks can be done correctly when they + * are in modules and they insure RCU read coherency. + */ + rcu_read_lock_sched_notrace(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. */ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + va_start(args, call_private); + func(mdata, mdata->single.probe_private, call_private, + mdata->format, &args); + va_end(args); + } else { + struct marker_probe_array *multi; + int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + for (i = 0; multi->c[i].func; i++) { + va_start(args, call_private); + multi->c[i].func(mdata, multi->c[i].probe_private, + call_private, mdata->format, &args); + va_end(args); + } + } + rcu_read_unlock_sched_notrace(); +} +EXPORT_SYMBOL_GPL(marker_probe_cb); + +/* + * marker_probe_cb Callback that does not prepare the variable argument list. + * @mdata: pointer of type struct marker + * @call_private: caller site private data + * @...: Variable argument list. + * + * Should be connected to markers "MARK_NOARGS". + */ +static notrace void marker_probe_cb_noarg(const struct marker *mdata, + void *call_private, ...) +{ + va_list args; /* not initialized */ + char ptype; + + rcu_read_lock_sched_notrace(); + ptype = mdata->ptype; + if (likely(!ptype)) { + marker_probe_func *func; + /* Must read the ptype before ptr. They are not data dependant, + * so we put an explicit smp_rmb() here. 
*/ + smp_rmb(); + func = mdata->single.func; + /* Must read the ptr before private data. They are not data + * dependant, so we put an explicit smp_rmb() here. */ + smp_rmb(); + func(mdata, mdata->single.probe_private, call_private, + mdata->format, &args); + } else { + struct marker_probe_array *multi; + int i; + /* + * Read mdata->ptype before mdata->multi. + */ + smp_rmb(); + multi = mdata->multi; + /* + * multi points to an array, therefore accessing the array + * depends on reading multi. However, even in this case, + * we must insure that the pointer is read _before_ the array + * data. Same as rcu_dereference, but we need a full smp_rmb() + * in the fast path, so put the explicit barrier here. + */ + smp_read_barrier_depends(); + for (i = 0; multi->c[i].func; i++) + multi->c[i].func(mdata, multi->c[i].probe_private, + call_private, mdata->format, &args); + } + rcu_read_unlock_sched_notrace(); +} + +static void free_old_closure(struct rcu_head *head) +{ + struct marker_probe_array *multi = container_of(head, struct marker_probe_array, rcu); + kfree(multi); +} + +static void debug_print_probes(struct marker_entry *entry) +{ + int i; + + if (!marker_debug) + return; + + if (!entry->ptype) { + printk(KERN_DEBUG "Single probe : %p %p\n", + entry->single.func, + entry->single.probe_private); + } else { + for (i = 0; entry->multi->c[i].func; i++) + printk(KERN_DEBUG "Multi probe %d : %p %p\n", i, + entry->multi->c[i].func, + entry->multi->c[i].probe_private); + } +} + +static struct marker_probe_array * +marker_entry_add_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0; + struct marker_probe_array *old, *new; + + WARN_ON(!probe); + + debug_print_probes(entry); + old = entry->multi; + if (!entry->ptype) { + if (entry->single.func == probe && + entry->single.probe_private == probe_private) + return ERR_PTR(-EBUSY); + if (entry->single.func == __mark_empty_function) { + /* 0 -> 1 probes */ + entry->single.func = probe; + entry->single.probe_private = probe_private; + entry->refcount = 1; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* 1 -> 2 probes */ + nr_probes = 1; + old = NULL; + } + } else { + /* (N -> N+1), (N != 0, 1) probes */ + for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) + if (old->c[nr_probes].func == probe + && old->c[nr_probes].probe_private + == probe_private) + return ERR_PTR(-EBUSY); + } + /* + 2 : one for new probe, one for NULL func */ + new = kzalloc(sizeof(struct marker_probe_array) + + ((nr_probes + 2) * sizeof(struct marker_probe_closure)), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + if (!old) + new->c[0] = entry->single; + else + memcpy(&new->c[0], &old->c[0], + nr_probes * sizeof(struct marker_probe_closure)); + new->c[nr_probes].func = probe; + new->c[nr_probes].probe_private = probe_private; + entry->refcount = nr_probes + 1; + entry->multi = new; + entry->ptype = 1; + debug_print_probes(entry); + return old; +} + +static struct marker_probe_array * +marker_entry_remove_probe(struct marker_entry *entry, + marker_probe_func *probe, void *probe_private) +{ + int nr_probes = 0, nr_del = 0, i; + struct marker_probe_array *old, *new; + + old = entry->multi; + + debug_print_probes(entry); + if (!entry->ptype) { + /* 0 -> N is an error */ + WARN_ON(entry->single.func == __mark_empty_function); + /* 1 -> 0 probes */ + WARN_ON(probe && entry->single.func != probe); + WARN_ON(entry->single.probe_private != probe_private); + entry->single.func = 
__mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + debug_print_probes(entry); + return NULL; + } else { + /* (N -> M), (N > 1, M >= 0) probes */ + for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) { + if ((!probe || old->c[nr_probes].func == probe) + && old->c[nr_probes].probe_private + == probe_private) + nr_del++; + } + } + + if (nr_probes - nr_del == 0) { + /* N -> 0, (N > 1) */ + entry->single.func = __mark_empty_function; + entry->refcount = 0; + entry->ptype = 0; + } else if (nr_probes - nr_del == 1) { + /* N -> 1, (N > 1) */ + for (i = 0; old->c[i].func; i++) + if ((probe && old->c[i].func != probe) || + old->c[i].probe_private != probe_private) + entry->single = old->c[i]; + entry->refcount = 1; + entry->ptype = 0; + } else { + int j = 0; + /* N -> M, (N > 1, M > 1) */ + /* + 1 for NULL */ + new = kzalloc(sizeof(struct marker_probe_array) + + ((nr_probes - nr_del + 1) + * sizeof(struct marker_probe_closure)), + GFP_KERNEL); + if (new == NULL) + return ERR_PTR(-ENOMEM); + for (i = 0; old->c[i].func; i++) + if ((probe && old->c[i].func != probe) || + old->c[i].probe_private != probe_private) + new->c[j++] = old->c[i]; + entry->refcount = nr_probes - nr_del; + entry->ptype = 1; + entry->multi = new; + } + debug_print_probes(entry); + return old; +} + +/* + * Get marker if the marker is present in the marker hash table. + * Must be called with markers_mutex held. + * Returns NULL if not present. + */ +static struct marker_entry *get_marker(const char *channel, const char *name) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + u32 hash; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) + return e; + } + return NULL; +} + +/* + * Add the marker to the marker hash table. Must be called with markers_mutex + * held. + */ +static struct marker_entry *add_marker(const char *channel, const char *name, + const char *format) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + size_t format_len = 0; + u32 hash; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + if (format) + format_len = strlen(format) + 1; + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) { + printk(KERN_NOTICE + "Marker %s.%s busy\n", channel, name); + return ERR_PTR(-EBUSY); /* Already there */ + } + } + /* + * Using kmalloc here to allocate a variable length element. Could + * cause some memory fragmentation if overused. 
+ */ + e = kmalloc(sizeof(struct marker_entry) + + channel_len + name_len + format_len, + GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + memcpy(e->channel, channel, channel_len); + e->name = &e->channel[channel_len]; + memcpy(e->name, name, name_len); + if (format) { + e->format = &e->name[name_len]; + memcpy(e->format, format, format_len); + if (strcmp(e->format, MARK_NOARGS) == 0) + e->call = marker_probe_cb_noarg; + else + e->call = marker_probe_cb; + trace_mark(metadata, core_marker_format, + "channel %s name %s format %s", + e->channel, e->name, e->format); + } else { + e->format = NULL; + e->call = marker_probe_cb; + } + e->single.func = __mark_empty_function; + e->single.probe_private = NULL; + e->multi = NULL; + e->ptype = 0; + e->format_allocated = 0; + e->refcount = 0; + hlist_add_head(&e->hlist, head); + return e; +} + +/* + * Remove the marker from the marker hash table. Must be called with mutex_lock + * held. Parameter "registered" indicates if the channel registration has been + * performed. + */ +static int remove_marker(const char *channel, const char *name, int registered, + int compacting) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + int found = 0; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + u32 hash; + int ret; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) { + found = 1; + break; + } + } + if (!found) + return -ENOENT; + if (e->single.func != __mark_empty_function) + return -EBUSY; + + if (registered && ltt_channels_trace_ref()) + return 0; + + hlist_del(&e->hlist); + hlist_del(&e->id_list); + if (registered) { + ret = ltt_channels_unregister(e->channel, compacting); + WARN_ON(ret); + } + if (e->format_allocated) + kfree(e->format); + kfree(e); + return 0; +} + +/* + * Set the mark_entry format to the format found in the element. + */ +static int marker_set_format(struct marker_entry *entry, const char *format) +{ + entry->format = kstrdup(format, GFP_KERNEL); + if (!entry->format) + return -ENOMEM; + entry->format_allocated = 1; + + trace_mark(metadata, core_marker_format, + "channel %s name %s format %s", + entry->channel, entry->name, entry->format); + return 0; +} + +/* + * Sets the probe callback corresponding to one marker. + */ +static int set_marker(struct marker_entry *entry, struct marker *elem, + int active) +{ + int ret = 0; + WARN_ON(strcmp(entry->name, elem->name) != 0); + + if (entry->format) { + if (strcmp(entry->format, elem->format) != 0) { + printk(KERN_NOTICE + "Format mismatch for probe %s " + "(%s), marker (%s)\n", + entry->name, + entry->format, + elem->format); + return -EPERM; + } + } else { + ret = marker_set_format(entry, elem->format); + if (ret) + return ret; + } + + /* + * probe_cb setup (statically known) is done here. It is + * asynchronous with the rest of execution, therefore we only + * pass from a "safe" callback (with argument) to an "unsafe" + * callback (does not set arguments). + */ + elem->call = entry->call; + elem->channel_id = entry->channel_id; + elem->event_id = entry->event_id; + /* + * Sanity check : + * We only update the single probe private data when the ptr is + * set to a _non_ single probe! 
(0 -> 1 and N -> 1, N != 1) + */ + WARN_ON(elem->single.func != __mark_empty_function + && elem->single.probe_private != entry->single.probe_private + && !elem->ptype); + elem->single.probe_private = entry->single.probe_private; + /* + * Make sure the private data is valid when we update the + * single probe ptr. + */ + smp_wmb(); + elem->single.func = entry->single.func; + /* + * We also make sure that the new probe callbacks array is consistent + * before setting a pointer to it. + */ + rcu_assign_pointer(elem->multi, entry->multi); + /* + * Update the function or multi probe array pointer before setting the + * ptype. + */ + smp_wmb(); + elem->ptype = entry->ptype; + + if (elem->tp_name && (active ^ _imv_read(elem->state))) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + + if (active) { + /* + * try_module_get should always succeed because we hold + * lock_module() to get the tp_cb address. + */ + ret = try_module_get(__module_text_address( + (unsigned long)elem->tp_cb)); + BUG_ON(!ret); + ret = tracepoint_probe_register_noupdate( + elem->tp_name, + elem->tp_cb, NULL); + } else { + ret = tracepoint_probe_unregister_noupdate( + elem->tp_name, + elem->tp_cb, NULL); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address( + (unsigned long)elem->tp_cb)); + } + } + elem->state__imv = active; + + return ret; +} + +/* + * Disable a marker and its probe callback. + * Note: only waiting an RCU period after setting elem->call to the empty + * function insures that the original callback is not used anymore. This insured + * by rcu_read_lock_sched around the call site. + */ +static void disable_marker(struct marker *elem) +{ + int ret; + + /* leave "call" as is. It is known statically. */ + if (elem->tp_name && _imv_read(elem->state)) { + WARN_ON(!elem->tp_cb); + /* + * It is ok to directly call the probe registration because type + * checking has been done in the __trace_mark_tp() macro. + */ + ret = tracepoint_probe_unregister_noupdate(elem->tp_name, + elem->tp_cb, NULL); + WARN_ON(ret); + /* + * tracepoint_probe_update_all() must be called + * before the module containing tp_cb is unloaded. + */ + module_put(__module_text_address((unsigned long)elem->tp_cb)); + } + elem->state__imv = 0; + elem->single.func = __mark_empty_function; + /* Update the function before setting the ptype */ + smp_wmb(); + elem->ptype = 0; /* single probe */ + /* + * Leave the private data and channel_id/event_id there, because removal + * is racy and should be done only after an RCU period. These are never + * used until the next initialization anyway. + */ +} + +/* + * is_marker_present - Check if a marker is present in kernel. + * @channel: channel name + * @name: marker name + * + * We cannot take the marker lock around calls to this function because it needs + * to take the module mutex within the iterator. Marker mutex nests inside + * module mutex. + * Returns 1 if the marker is present, 0 if not. 
+ */ +int is_marker_present(const char *channel, const char *name) +{ + int ret; + struct marker_iter iter; + + ret = 0; + + marker_iter_reset(&iter); + marker_iter_start(&iter); + for (; iter.marker != NULL; marker_iter_next(&iter)) { + if (!strcmp(iter.marker->channel, channel) && + !strcmp(iter.marker->name, name)) { + ret = 1; + goto end; + } + } +end: + marker_iter_stop(&iter); + return ret; +} +EXPORT_SYMBOL_GPL(is_marker_present); + +/* + * _is_marker_enabled - Check if a marker is enabled, must be called with + * markers_mutex held. + * @channel: channel name + * @name: marker name + * + * Returns 1 if the marker is enabled, 0 if disabled. + */ +int _is_marker_enabled(const char *channel, const char *name) +{ + struct marker_entry *entry; + + entry = get_marker(channel, name); + + return entry && !!entry->refcount; +} +EXPORT_SYMBOL_GPL(_is_marker_enabled); + +/* + * is_marker_enabled - the wrapper of _is_marker_enabled + * @channel: channel name + * @name: marker name + * + * Returns 1 if the marker is enabled, 0 if disabled. + */ +int is_marker_enabled(const char *channel, const char *name) +{ + int ret; + + lock_markers(); + ret = _is_marker_enabled(channel, name); + unlock_markers(); + + return ret; +} +EXPORT_SYMBOL_GPL(is_marker_enabled); + +/** + * marker_update_probe_range - Update a probe range + * @begin: beginning of the range + * @end: end of the range + * + * Updates the probe callback corresponding to a range of markers. + */ +void marker_update_probe_range(struct marker *begin, + struct marker *end) +{ + struct marker *iter; + struct marker_entry *mark_entry; + + mutex_lock(&markers_mutex); + for (iter = begin; iter < end; iter++) { + mark_entry = get_marker(iter->channel, iter->name); + if (mark_entry) { + set_marker(mark_entry, iter, !!mark_entry->refcount); + /* + * ignore error, continue + */ + } else { + disable_marker(iter); + } + } + mutex_unlock(&markers_mutex); +} + +/* + * Update probes, removing the faulty probes. + * + * Internal callback only changed before the first probe is connected to it. + * Single probe private data can only be changed on 0 -> 1 and 2 -> 1 + * transitions. All other transitions will leave the old private data valid. + * This makes the non-atomicity of the callback/private data updates valid. + * + * "special case" updates : + * 0 -> 1 callback + * 1 -> 0 callback + * 1 -> 2 callbacks + * 2 -> 1 callbacks + * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates. + * Site effect : marker_set_format may delete the marker entry (creating a + * replacement). + */ +void marker_update_probes(void) +{ + /* Core kernel markers */ + marker_update_probe_range(__start___markers, __stop___markers); + /* Markers in modules. */ + module_update_markers(); + tracepoint_probe_update_all(); + /* Update immediate values */ + core_imv_update(); + module_imv_update(); +} + +/** + * marker_probe_register - Connect a probe to a marker + * @channel: marker channel + * @name: marker name + * @format: format string + * @probe: probe handler + * @probe_private: probe private data + * + * private data must be a valid allocated memory address, or NULL. + * Returns 0 if ok, error value on error. + * The probe address must at least be aligned on the architecture pointer size. 
+ */ +int marker_probe_register(const char *channel, const char *name, + const char *format, marker_probe_func *probe, + void *probe_private) +{ + struct marker_entry *entry; + int ret = 0, ret_err; + struct marker_probe_array *old; + int first_probe = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(channel, name); + if (!entry) { + first_probe = 1; + entry = add_marker(channel, name, format); + if (IS_ERR(entry)) + ret = PTR_ERR(entry); + if (ret) + goto end; + ret = ltt_channels_register(channel); + if (ret) + goto error_remove_marker; + ret = ltt_channels_get_index_from_name(channel); + if (ret < 0) + goto error_unregister_channel; + entry->channel_id = ret; + ret = ltt_channels_get_event_id(channel, name); + if (ret < 0) + goto error_unregister_channel; + entry->event_id = ret; + hlist_add_head(&entry->id_list, id_table + hash_32( + (entry->channel_id << 16) | entry->event_id, + MARKER_HASH_BITS)); + ret = 0; + trace_mark(metadata, core_marker_id, + "channel %s name %s event_id %hu " + "int #1u%zu long #1u%zu pointer #1u%zu " + "size_t #1u%zu alignment #1u%u", + channel, name, entry->event_id, + sizeof(int), sizeof(long), sizeof(void *), + sizeof(size_t), ltt_get_alignment()); + } else if (format) { + if (!entry->format) + ret = marker_set_format(entry, format); + else if (strcmp(entry->format, format)) + ret = -EPERM; + if (ret) + goto end; + } + + old = marker_entry_add_probe(entry, probe, probe_private); + if (IS_ERR(old)) { + ret = PTR_ERR(old); + if (first_probe) + goto error_unregister_channel; + else + goto end; + } + mutex_unlock(&markers_mutex); + + marker_update_probes(); + if (old) + call_rcu_sched(&old->rcu, free_old_closure); + return ret; + +error_unregister_channel: + ret_err = ltt_channels_unregister(channel, 1); + WARN_ON(ret_err); +error_remove_marker: + ret_err = remove_marker(channel, name, 0, 0); + WARN_ON(ret_err); +end: + mutex_unlock(&markers_mutex); + marker_update_probes(); /* for compaction on error path */ + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_register); + +/** + * marker_probe_unregister - Disconnect a probe from a marker + * @channel: marker channel + * @name: marker name + * @probe: probe function pointer + * @probe_private: probe private data + * + * Returns the private data given to marker_probe_register, or an ERR_PTR(). + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. 
+ */ +int marker_probe_unregister(const char *channel, const char *name, + marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + struct marker_probe_array *old; + int ret = 0; + + mutex_lock(&markers_mutex); + entry = get_marker(channel, name); + if (!entry) { + ret = -ENOENT; + goto end; + } + old = marker_entry_remove_probe(entry, probe, probe_private); + remove_marker(channel, name, 1, 0); /* Ignore busy error message */ + mutex_unlock(&markers_mutex); + + marker_update_probes(); + if (old) + call_rcu_sched(&old->rcu, free_old_closure); + return ret; + +end: + mutex_unlock(&markers_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister); + +static struct marker_entry * +get_marker_from_private_data(marker_probe_func *probe, void *probe_private) +{ + struct marker_entry *entry; + unsigned int i; + struct hlist_head *head; + struct hlist_node *node; + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + if (!entry->ptype) { + if (entry->single.func == probe + && entry->single.probe_private + == probe_private) + return entry; + } else { + struct marker_probe_array *closure; + closure = entry->multi; + for (i = 0; closure->c[i].func; i++) { + if (closure->c[i].func == probe && + closure->c[i].probe_private + == probe_private) + return entry; + } + } + } + } + return NULL; +} + +/** + * marker_probe_unregister_private_data - Disconnect a probe from a marker + * @probe: probe function + * @probe_private: probe private data + * + * Unregister a probe by providing the registered private data. + * Only removes the first marker found in hash table. + * Return 0 on success or error value. + * We do not need to call a synchronize_sched to make sure the probes have + * finished running before doing a module unload, because the module unload + * itself uses stop_machine(), which insures that every preempt disabled section + * have finished. + */ +int marker_probe_unregister_private_data(marker_probe_func *probe, + void *probe_private) +{ + struct marker_entry *entry; + int ret = 0; + struct marker_probe_array *old; + const char *channel = NULL, *name = NULL; + + mutex_lock(&markers_mutex); + entry = get_marker_from_private_data(probe, probe_private); + if (!entry) { + ret = -ENOENT; + goto unlock; + } + old = marker_entry_remove_probe(entry, NULL, probe_private); + channel = kstrdup(entry->channel, GFP_KERNEL); + name = kstrdup(entry->name, GFP_KERNEL); + remove_marker(channel, name, 1, 0); /* Ignore busy error message */ + mutex_unlock(&markers_mutex); + + marker_update_probes(); + if (old) + call_rcu_sched(&old->rcu, free_old_closure); + goto end; + +unlock: + mutex_unlock(&markers_mutex); +end: + kfree(channel); + kfree(name); + return ret; +} +EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); + +/** + * marker_get_private_data - Get a marker's probe private data + * @channel: marker channel + * @name: marker name + * @probe: probe to match + * @num: get the nth matching probe's private data + * + * Returns the nth private data pointer (starting from 0) matching, or an + * ERR_PTR. + * Returns the private data pointer, or an ERR_PTR. + * The private data pointer should _only_ be dereferenced if the caller is the + * owner of the data, or its content could vanish. This is mostly used to + * confirm that a caller is the owner of a registered probe. 
+ */ +void *marker_get_private_data(const char *channel, const char *name, + marker_probe_func *probe, int num) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e; + size_t channel_len = strlen(channel) + 1; + size_t name_len = strlen(name) + 1; + int i; + u32 hash; + + hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0); + head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) { + if (!e->ptype) { + if (num == 0 && e->single.func == probe) + return e->single.probe_private; + } else { + struct marker_probe_array *closure; + int match = 0; + closure = e->multi; + for (i = 0; closure->c[i].func; i++) { + if (closure->c[i].func != probe) + continue; + if (match++ == num) + return closure->c[i].probe_private; + } + } + break; + } + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(marker_get_private_data); + +static struct marker_entry *get_entry_from_id(u16 channel_id, u16 event_id) +{ + struct hlist_head *head; + struct hlist_node *node; + struct marker_entry *e, *found = NULL; + u32 hash = hash_32((channel_id << 16) | event_id, MARKER_HASH_BITS); + + mutex_lock(&markers_mutex); + head = id_table + hash; + hlist_for_each_entry(e, node, head, id_list) { + if (e->channel_id == channel_id && e->event_id == event_id) { + found = e; + break; + } + } + mutex_unlock(&markers_mutex); + return found; +} + +/* must call when ids/marker_entry are kept alive */ +const char *marker_get_name_from_id(u16 channel_id, u16 event_id) +{ + struct marker_entry *e = get_entry_from_id(channel_id, event_id); + return e ? e->name : NULL; +} +EXPORT_SYMBOL_GPL(marker_get_name_from_id); + +const char *marker_get_fmt_from_id(u16 channel_id, u16 event_id) +{ + struct marker_entry *e = get_entry_from_id(channel_id, event_id); + return e ? e->format : NULL; +} +EXPORT_SYMBOL_GPL(marker_get_fmt_from_id); + +/** + * markers_compact_event_ids - Compact markers event IDs and reassign channels + * + * Called when no channel users are active by the channel infrastructure. + * Called with trace lock, lock_markers() and channel mutex held. + * + * marker_update_probes() must be executed after compaction before releasing the + * trace lock. + */ +void markers_compact_event_ids(void) +{ + struct marker_entry *entry; + unsigned int i; + struct hlist_head *head; + struct hlist_node *node, *next; + int ret; + + _ltt_channels_reset_event_ids(); + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry_safe(entry, node, next, head, hlist) { + if (!entry->refcount) { + remove_marker(entry->channel, entry->name, + 1, 1); + continue; + } + ret = ltt_channels_get_index_from_name(entry->channel); + WARN_ON(ret < 0); + entry->channel_id = ret; + ret = _ltt_channels_get_event_id(entry->channel, + entry->name); + WARN_ON(ret < 0); + entry->event_id = ret; + } + } + + memset(id_table, 0, sizeof(id_table)); + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + hlist_add_head(&entry->id_list, id_table + hash_32( + (entry->channel_id << 16) + | entry->event_id, MARKER_HASH_BITS)); + } + } +} + +#ifdef CONFIG_MODULES + +/** + * marker_get_iter_range - Get a next marker iterator given a range. + * @marker: current markers (in), next marker (out) + * @begin: beginning of the range + * @end: end of the range + * + * Returns whether a next marker has been found (1) or not (0). 
+ * Will return the first marker in the range if the input marker is NULL. + */ +int marker_get_iter_range(struct marker **marker, struct marker *begin, + struct marker *end) +{ + if (!*marker && begin != end) { + *marker = begin; + return 1; + } + if (*marker >= begin && *marker < end) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(marker_get_iter_range); + +static void marker_get_iter(struct marker_iter *iter) +{ + int found = 0; + + /* Core kernel markers */ + if (!iter->module) { + found = marker_get_iter_range(&iter->marker, + __start___markers, __stop___markers); + if (found) + goto end; + } + /* Markers in modules. */ + found = module_get_iter_markers(iter); +end: + if (!found) + marker_iter_reset(iter); +} + +void marker_iter_start(struct marker_iter *iter) +{ + marker_get_iter(iter); +} +EXPORT_SYMBOL_GPL(marker_iter_start); + +void marker_iter_next(struct marker_iter *iter) +{ + iter->marker++; + /* + * iter->marker may be invalid because we blindly incremented it. + * Make sure it is valid by marshalling on the markers, getting the + * markers from following modules if necessary. + */ + marker_get_iter(iter); +} +EXPORT_SYMBOL_GPL(marker_iter_next); + +void marker_iter_stop(struct marker_iter *iter) +{ +} +EXPORT_SYMBOL_GPL(marker_iter_stop); + +void marker_iter_reset(struct marker_iter *iter) +{ + iter->module = NULL; + iter->marker = NULL; +} +EXPORT_SYMBOL_GPL(marker_iter_reset); + +int marker_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + case MODULE_STATE_GOING: + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + break; + } + return 0; +} + +struct notifier_block marker_module_nb = { + .notifier_call = marker_module_notify, + .priority = 0, +}; + +static int init_markers(void) +{ + return register_module_notifier(&marker_module_nb); +} +__initcall(init_markers); + +#endif /* CONFIG_MODULES */ + +void ltt_dump_marker_state(struct ltt_trace *trace) +{ + struct marker_entry *entry; + struct ltt_probe_private_data call_data; + struct hlist_head *head; + struct hlist_node *node; + unsigned int i; + + mutex_lock(&markers_mutex); + call_data.trace = trace; + call_data.serializer = NULL; + + for (i = 0; i < MARKER_TABLE_SIZE; i++) { + head = &marker_table[i]; + hlist_for_each_entry(entry, node, head, hlist) { + __trace_mark(0, metadata, core_marker_id, + &call_data, + "channel %s name %s event_id %hu " + "int #1u%zu long #1u%zu pointer #1u%zu " + "size_t #1u%zu alignment #1u%u", + entry->channel, + entry->name, + entry->event_id, + sizeof(int), sizeof(long), + sizeof(void *), sizeof(size_t), + ltt_get_alignment()); + if (entry->format) + __trace_mark(0, metadata, + core_marker_format, + &call_data, + "channel %s name %s format %s", + entry->channel, + entry->name, + entry->format); + } + } + mutex_unlock(&markers_mutex); +} +EXPORT_SYMBOL_GPL(ltt_dump_marker_state); diff --git a/kernel/module.c b/kernel/module.c index efa290ea94b..2767c8eaf12 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -57,6 +57,7 @@ #include <linux/kmemleak.h> #include <linux/jump_label.h> #include <linux/pfn.h> +#include <trace/kernel.h> #define CREATE_TRACE_POINTS #include <trace/events/module.h> @@ -99,7 +100,9 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. 
- * (delete uses stop_machine/add uses RCU list operations). */ + * (delete uses stop_machine/add uses RCU list operations). + * Sorted by ascending list node address. + */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -120,6 +123,9 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list); * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; +DEFINE_TRACE(kernel_module_load); +DEFINE_TRACE(kernel_module_free); + int register_module_notifier(struct notifier_block * nb) { return blocking_notifier_chain_register(&module_notify_list, nb); @@ -1675,6 +1681,7 @@ static inline void unset_section_ro_nx(struct module *mod, void *module_region) /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { + trace_kernel_module_free(mod); trace_module_free(mod); /* Delete from various lists */ @@ -2272,6 +2279,12 @@ static int copy_and_check(struct load_info *info, if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) return -ENOMEM; + /* + * Make sure the module text or data access never generates any page + * fault. + */ + vmalloc_sync_all(); + if (copy_from_user(hdr, umod, len) != 0) { err = -EFAULT; goto free_hdr; @@ -2459,6 +2472,10 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ctors), &mod->num_ctors); #endif +#ifdef CONFIG_MARKERS + mod->markers = section_objs(info, "__markers", + sizeof(*mod->markers), &mod->num_markers); +#endif #ifdef CONFIG_TRACEPOINTS mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", sizeof(*mod->tracepoints_ptrs), @@ -2717,7 +2734,7 @@ static struct module *load_module(void __user *umod, const char __user *uargs) { struct load_info info = { NULL, }; - struct module *mod; + struct module *mod, *iter; long err; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2799,7 +2816,23 @@ static struct module *load_module(void __user *umod, goto ddebug; module_bug_finalize(info.hdr, info.sechdrs, mod); + /* + * We sort the modules by struct module pointer address to permit + * correct iteration over modules of, at least, kallsyms for preemptible + * operations, such as read(). Sorting by struct module pointer address + * is equivalent to sort by list node address. + */ + list_for_each_entry_reverse(iter, &modules, list) { + BUG_ON(iter == mod); /* Should never be in the list twice */ + if (iter < mod) { + /* We belong to the location right after iter. */ + list_add_rcu(&mod->list, &iter->list); + goto module_added; + } + } + /* We should be added at the head of the list */ list_add_rcu(&mod->list, &modules); +module_added: mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ @@ -2817,6 +2850,7 @@ static struct module *load_module(void __user *umod, free_copy(&info); /* Done! 
*/ + trace_kernel_module_load(mod); trace_module_load(mod); return mod; @@ -3196,12 +3230,12 @@ static char *module_flags(struct module *mod, char *buf) static void *m_start(struct seq_file *m, loff_t *pos) { mutex_lock(&module_mutex); - return seq_list_start(&modules, *pos); + return seq_sorted_list_start(&modules, pos); } static void *m_next(struct seq_file *m, void *p, loff_t *pos) { - return seq_list_next(p, &modules, pos); + return seq_sorted_list_next(p, &modules, pos); } static void m_stop(struct seq_file *m, void *p) @@ -3266,6 +3300,27 @@ static int __init proc_modules_init(void) module_init(proc_modules_init); #endif +void list_modules(void *call_data) +{ + /* Enumerate loaded modules */ + struct list_head *i; + struct module *mod; + unsigned long refcount = 0; + + mutex_lock(&module_mutex); + list_for_each(i, &modules) { + mod = list_entry(i, struct module, list); +#ifdef CONFIG_MODULE_UNLOAD + refcount = module_refcount(mod); +#endif + __trace_mark(0, module_state, list_module, call_data, + "name %s state %d refcount %lu", + mod->name, mod->state, refcount); + } + mutex_unlock(&module_mutex); +} +EXPORT_SYMBOL_GPL(list_modules); + /* Given an address, look for it in the module exception tables. */ const struct exception_table_entry *search_module_extables(unsigned long addr) { @@ -3393,12 +3448,59 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, + struct marker *marker, struct tracepoint * const *tp) { } EXPORT_SYMBOL(module_layout); #endif +#ifdef CONFIG_MARKERS +void module_update_markers(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!(mod->taints & TAINT_FORCED_MODULE)) + marker_update_probe_range(mod->markers, + mod->markers + mod->num_markers); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. + * Returns 1 if current found. 
+ */ +int module_get_iter_markers(struct marker_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!(iter_mod->taints & TAINT_FORCED_MODULE)) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->marker = NULL; + found = marker_get_iter_range(&iter->marker, + iter_mod->markers, + iter_mod->markers + iter_mod->num_markers); + if (found) { + iter->module = iter_mod; + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif + #ifdef CONFIG_TRACEPOINTS void module_update_tracepoints(void) { diff --git a/kernel/notifier.c b/kernel/notifier.c index 2488ba7eb56..e8481427153 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -5,6 +5,7 @@ #include <linux/rcupdate.h> #include <linux/vmalloc.h> #include <linux/reboot.h> +#include <linux/idle.h> /* * Notifier list for kernel code which wants to be called @@ -148,7 +149,7 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); spin_unlock_irqrestore(&nh->lock, flags); - synchronize_rcu(); + synchronize_sched(); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); @@ -178,9 +179,9 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, { int ret; - rcu_read_lock(); + rcu_read_lock_sched_notrace(); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); - rcu_read_unlock(); + rcu_read_unlock_sched_notrace(); return ret; } EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); @@ -584,3 +585,27 @@ int unregister_die_notifier(struct notifier_block *nb) return atomic_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL_GPL(unregister_die_notifier); + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +/* + * Trace last event before calling notifiers. Notifiers flush data from buffers + * before going to idle. + */ +int notrace notify_idle(enum idle_val val) +{ + return atomic_notifier_call_chain(&idle_notifier, val, NULL); +} +EXPORT_SYMBOL_GPL(notify_idle); + +void register_idle_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(register_idle_notifier); + +void unregister_idle_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(unregister_idle_notifier); diff --git a/kernel/panic.c b/kernel/panic.c index e510d28b3e6..79708502b00 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,9 @@ #include <linux/init.h> #include <linux/nmi.h> #include <linux/dmi.h> +#include <trace/kernel.h> + +DEFINE_TRACE(kernel_panic); #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -67,6 +70,10 @@ NORET_TYPE void panic(const char * fmt, ...) long i, i_next = 0; int state = 0; + va_start(args, fmt); + trace_kernel_panic(fmt, args); + va_end(args); + /* * It's possible to come here directly from a panic-assertion and * not have preempt disabled. 
Some functions called from here want diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 656222fcf76..ad02feadb6b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4567,7 +4567,7 @@ static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) - return 0; + return 1; if (regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -4923,6 +4923,8 @@ static int perf_tp_event_match(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; /* * All tracepoints are from kernel-space. */ @@ -6113,17 +6115,20 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - struct perf_event *parent_event; + if (child_event->parent) { + raw_spin_lock_irq(&child_ctx->lock); + perf_group_detach(child_event); + raw_spin_unlock_irq(&child_ctx->lock); + } perf_event_remove_from_context(child_event); - parent_event = child_event->parent; /* - * It can happen that parent exits first, and has events + * It can happen that the parent exits first, and has events * that are still around due to the child reference. These - * events need to be zapped - but otherwise linger. + * events need to be zapped. */ - if (parent_event) { + if (child_event->parent) { sync_child_event(child_event, child); free_event(child_event); } diff --git a/kernel/printk.c b/kernel/printk.c index f629e802fdc..92d1e3b6d37 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -40,6 +40,7 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/rculist.h> +#include <trace/kernel.h> #include <asm/uaccess.h> @@ -71,6 +72,7 @@ int console_printk[4] = { MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +EXPORT_SYMBOL_GPL(console_printk); /* * Low level drivers may need that to know if they can schedule in @@ -140,6 +142,9 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +DEFINE_TRACE(kernel_printk); +DEFINE_TRACE(kernel_vprintk); + #ifdef CONFIG_PRINTK static char __log_buf[__LOG_BUF_LEN]; @@ -701,6 +706,7 @@ asmlinkage int printk(const char *fmt, ...) 
} #endif va_start(args, fmt); + trace_kernel_printk(_RET_IP_); r = vprintk(fmt, args); va_end(args); @@ -827,6 +833,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) #ifdef CONFIG_DEBUG_LL printascii(printk_buf); #endif + trace_kernel_vprintk(_RET_IP_, printk_buf, printed_len); p = printk_buf; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dd4aea806f8..a86e46b6bc1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -47,6 +47,7 @@ #include <linux/mutex.h> #include <linux/time.h> #include <linux/kernel_stat.h> +#include <trace/rcu.h> #include "rcutree.h" @@ -145,6 +146,10 @@ int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT; module_param(rcu_cpu_stall_suppress, int, 0644); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +DEFINE_TRACE(rcu_tree_call_rcu); +DEFINE_TRACE(rcu_tree_call_rcu_bh); +DEFINE_TRACE(rcu_tree_callback); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -1143,6 +1148,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); + trace_rcu_tree_callback(list); list->func(list); list = next; if (++count >= rdp->blimit) @@ -1488,6 +1494,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched); */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { + trace_rcu_tree_call_rcu_bh(head, _RET_IP_); __call_rcu(head, func, &rcu_bh_state); } EXPORT_SYMBOL_GPL(call_rcu_bh); diff --git a/kernel/sched.c b/kernel/sched.c index b294a1882ff..e50b1bf0374 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5573,7 +5573,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) * The idle tasks have their own, simple scheduling class: */ idle->sched_class = &idle_sched_class; - ftrace_graph_init_task(idle); + ftrace_graph_init_idle_task(idle, cpu); } /* @@ -9344,3 +9344,57 @@ struct cgroup_subsys cpuacct_subsys = { }; #endif /* CONFIG_CGROUP_CPUACCT */ +static DEFINE_MUTEX(kernel_trace_mutex); +static int kernel_trace_refcount; + +/** + * clear_kernel_trace_flag_all_tasks - clears all TIF_KERNEL_TRACE thread flags. + * + * This function iterates on all threads in the system to clear their + * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the + * tasklist_lock held in copy_process() makes sure that once we finish clearing + * the thread flags, all threads have their flags cleared. + */ +void clear_kernel_trace_flag_all_tasks(void) +{ + struct task_struct *p; + struct task_struct *t; + + mutex_lock(&kernel_trace_mutex); + if (--kernel_trace_refcount) + goto end; + read_lock(&tasklist_lock); + do_each_thread(p, t) { + clear_tsk_thread_flag(t, TIF_KERNEL_TRACE); + } while_each_thread(p, t); + read_unlock(&tasklist_lock); +end: + mutex_unlock(&kernel_trace_mutex); +} +EXPORT_SYMBOL_GPL(clear_kernel_trace_flag_all_tasks); + +/** + * set_kernel_trace_flag_all_tasks - sets all TIF_KERNEL_TRACE thread flags. + * + * This function iterates on all threads in the system to set their + * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the + * tasklist_lock held in copy_process() makes sure that once we finish setting + * the thread flags, all threads have their flags set. 
+ */ +void set_kernel_trace_flag_all_tasks(void) +{ + struct task_struct *p; + struct task_struct *t; + + mutex_lock(&kernel_trace_mutex); + if (kernel_trace_refcount++) + goto end; + read_lock(&tasklist_lock); + do_each_thread(p, t) { + set_tsk_thread_flag(t, TIF_KERNEL_TRACE); + } while_each_thread(p, t); + read_unlock(&tasklist_lock); +end: + mutex_unlock(&kernel_trace_mutex); +} +EXPORT_SYMBOL_GPL(set_kernel_trace_flag_all_tasks); diff --git a/kernel/signal.c b/kernel/signal.c index 4e3cff10fdc..31751868de8 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2421,9 +2421,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return -EFAULT; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info.si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info.si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); return -EPERM; + } info.si_signo = sig; /* POSIX.1b doesn't mention process groups. */ @@ -2437,9 +2441,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) return -EINVAL; /* Not even root can pretend to send signals from the kernel. - Nor can they impersonate a kill(), which adds source info. */ - if (info->si_code >= 0) + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code != SI_QUEUE) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); return -EPERM; + } info->si_signo = sig; return do_send_specific(tgid, pid, sig, info); diff --git a/kernel/smp.c b/kernel/smp.c index 9910744f085..954548906af 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask, { struct call_function_data *data; unsigned long flags; - int cpu, next_cpu, this_cpu = smp_processor_id(); + int refs, cpu, next_cpu, this_cpu = smp_processor_id(); /* * Can deadlock when called with interrupts disabled. @@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress && !early_boot_irqs_disabled); - /* So, what's a CPU they want? Ignoring this one. */ + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); @@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); + + /* This BUG_ON verifies our reuse assertions and can be removed */ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); + /* + * The global call function queue list add and delete are protected + * by a lock, but the list is traversed without any lock, relying + * on the rcu list add and delete to allow safe concurrent traversal. + * We reuse the call function data without waiting for any grace + * period after some other cpu removes it from the global queue. + * This means a cpu might find our data block as it is being + * filled out. + * + * We hold off the interrupt handler on the other cpu by + * ordering our writes to the cpu mask vs our setting of the + * refs counter. We assert only the cpu owning the data block + * will set a bit in cpumask, and each bit will only be cleared + * by the subject cpu. 
Each cpu must first find its bit is + * set and then check that refs is set indicating the element is + * ready to be processed, otherwise it must skip the entry. + * + * On the previous iteration refs was set to 0 by another cpu. + * To avoid the use of transitivity, set the counter to 0 here + * so the wmb will pair with the rmb in the interrupt handler. + */ + atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ + data->csd.func = func; data->csd.info = info; - cpumask_and(data->cpumask, mask, cpu_online_mask); - cpumask_clear_cpu(this_cpu, data->cpumask); - /* - * To ensure the interrupt handler gets an complete view - * we order the cpumask and refs writes and order the read - * of them in the interrupt handler. In addition we may - * only clear our own cpu bit from the mask. - */ + /* Ensure 0 refs is visible before mask. Also orders func and info */ smp_wmb(); - atomic_set(&data->refs, cpumask_weight(data->cpumask)); + /* We rely on the "and" being processed before the store */ + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, data->cpumask); + refs = cpumask_weight(data->cpumask); + + /* Some callers race with other cpus changing the passed mask */ + if (unlikely(!refs)) { + csd_unlock(&data->csd); + return; + } raw_spin_lock_irqsave(&call_function.lock, flags); /* @@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask, * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); + /* + * We rely on the wmb() in list_add_rcu to complete our writes + * to the cpumask before this write to refs, which indicates + * data is on the list and is ready to be processed. + */ + atomic_set(&data->refs, refs); raw_spin_unlock_irqrestore(&call_function.lock, flags); /* diff --git a/kernel/softirq.c b/kernel/softirq.c index 68eb5efec38..a25bf611d13 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -23,7 +23,10 @@ #include <linux/rcupdate.h> #include <linux/ftrace.h> #include <linux/smp.h> +#include <linux/marker.h> +#include <linux/kallsyms.h> #include <linux/tick.h> +#include <trace/irq.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -54,6 +57,20 @@ EXPORT_SYMBOL(irq_stat); static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; +void ltt_dump_softirq_vec(void *call_data) +{ + int i; + char namebuf[KSYM_NAME_LEN]; + + for (i = 0; i < 32; i++) { + sprint_symbol(namebuf, (unsigned long)softirq_vec[i].action); + __trace_mark(0, softirq_state, softirq_vec, call_data, + "id %d address %p symbol %s", + i, softirq_vec[i].action, namebuf); + } +} +EXPORT_SYMBOL_GPL(ltt_dump_softirq_vec); + static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); char *softirq_to_name[NR_SOFTIRQS] = { @@ -61,6 +78,11 @@ char *softirq_to_name[NR_SOFTIRQS] = { "TASKLET", "SCHED", "HRTIMER", "RCU" }; +DEFINE_TRACE(irq_tasklet_high_entry); +DEFINE_TRACE(irq_tasklet_high_exit); +DEFINE_TRACE(irq_tasklet_low_entry); +DEFINE_TRACE(irq_tasklet_low_exit); + /* * we cannot loop indefinitely here to avoid userspace starvation, * but we also don't want to introduce a worst case 1/HZ latency @@ -341,6 +363,7 @@ void irq_exit(void) */ inline void raise_softirq_irqoff(unsigned int nr) { + trace_softirq_raise(nr); __raise_softirq_irqoff(nr); /* @@ -440,7 +463,9 @@ static void tasklet_action(struct softirq_action *a) if (!atomic_read(&t->count)) { if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); + trace_irq_tasklet_low_entry(t); t->func(t->data); + 
trace_irq_tasklet_low_exit(t); tasklet_unlock(t); continue; } @@ -475,7 +500,9 @@ static void tasklet_hi_action(struct softirq_action *a) if (!atomic_read(&t->count)) { if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) BUG(); + trace_irq_tasklet_high_entry(t); t->func(t->data); + trace_irq_tasklet_high_exit(t); tasklet_unlock(t); continue; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a26c37df1b1..4bc1435706b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -170,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; @@ -714,7 +719,7 @@ static struct ctl_table kern_table[] = { .data = &kptr_restrict, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dmesg_restrict, .extra1 = &zero, .extra2 = &two, }, @@ -2405,6 +2410,17 @@ static int proc_taint(struct ctl_table *table, int write, return err; } +#ifdef CONFIG_PRINTK +static int proc_dmesg_restrict(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index ee266620b06..dbaa0648631 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o +obj-$(CONFIG_HAVE_UNSYNCHRONIZED_TSC) += tsc-sync.o diff --git a/kernel/time/tsc-sync.c b/kernel/time/tsc-sync.c new file mode 100644 index 00000000000..2ac1544ee22 --- /dev/null +++ b/kernel/time/tsc-sync.c @@ -0,0 +1,313 @@ +/* + * kernel/time/tsc-sync.c + * + * Test TSC synchronization + * + * marks the tsc as unstable _and_ keep a simple "_tsc_is_sync" variable, which + * is fast to read when a simple test must determine which clock source to use + * for kernel tracing. + * + * - CPU init : + * + * We check whether all boot CPUs have their TSC's synchronized, + * print a warning if not and turn off the TSC clock-source. + * + * Only two CPUs may participate - they can enter in any order. + * ( The serial nature of the boot logic and the CPU hotplug lock + * protects against more than 2 CPUs entering this code. + * + * - When CPUs are up : + * + * TSC synchronicity of all CPUs can be checked later at run-time by calling + * test_tsc_synchronization(). + * + * Copyright 2007, 2008 + * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> + */ +#include <linux/module.h> +#include <linux/timer.h> +#include <linux/timex.h> +#include <linux/jiffies.h> +#include <linux/trace-clock.h> +#include <linux/cpu.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/cpu.h> + +#define MAX_CYCLES_DELTA 3000ULL + +/* + * Number of loops to take care of MCE, NMIs, SMIs. 
+ */ +#define NR_LOOPS 200 + +static DEFINE_MUTEX(tscsync_mutex); + +struct sync_data { + int nr_waits; + int wait_sync; + cycles_t tsc_count; +} ____cacheline_aligned; + +/* 0 is master, 1 is slave */ +static struct sync_data sync_data[2] = { + [0 ... 1] = { + .nr_waits = 3 * NR_LOOPS + 1, + .wait_sync = 3 * NR_LOOPS + 1, + }, +}; + +int _tsc_is_sync = 1; +EXPORT_SYMBOL(_tsc_is_sync); + +static int force_tsc_sync; +static cycles_t slave_offset; +static int slave_offset_ready; /* for 32-bits architectures */ + +static int __init force_tsc_sync_setup(char *str) +{ + force_tsc_sync = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("force_tsc_sync=", force_tsc_sync_setup); + +/* + * Mark it noinline so we make sure it is not unrolled. + * Wait until value is reached. + */ +static noinline void tsc_barrier(long this_cpu) +{ + sync_core(); + sync_data[this_cpu].wait_sync--; + smp_mb(); /* order master/slave sync_data read/write */ + while (unlikely(sync_data[1 - this_cpu].wait_sync >= + sync_data[this_cpu].nr_waits)) + barrier(); /* + * barrier is used because faster and + * more predictable than cpu_idle(). + */ + smp_mb(); /* order master/slave sync_data read/write */ + sync_data[this_cpu].nr_waits--; + get_cycles_barrier(); + sync_data[this_cpu].tsc_count = get_cycles(); + get_cycles_barrier(); +} + +/* + * Worker thread called on each CPU. + * First wait with interrupts enabled, then wait with interrupt disabled, + * for precision. We are already bound to one CPU. + * this_cpu 0 : master + * this_cpu 1 : slave + */ +static void test_sync(void *arg) +{ + long this_cpu = (long)arg; + unsigned long flags; + + local_irq_save(flags); + /* Make sure the instructions are in I-CACHE */ + tsc_barrier(this_cpu); + tsc_barrier(this_cpu); + sync_data[this_cpu].wait_sync--; + smp_mb(); /* order master/slave sync_data read/write */ + while (unlikely(sync_data[1 - this_cpu].wait_sync >= + sync_data[this_cpu].nr_waits)) + barrier(); /* + * barrier is used because faster and + * more predictable than cpu_idle(). + */ + smp_mb(); /* order master/slave sync_data read/write */ + sync_data[this_cpu].nr_waits--; + /* + * Here, only the master will wait for the slave to reach this barrier. + * This makes sure that the master, which holds the mutex and will reset + * the barriers, waits for the slave to stop using the barrier values + * before it continues. This is only done at the complete end of all the + * loops. This is why there is a + 1 in original wait_sync value. + */ + if (sync_data[this_cpu].nr_waits == 1) + sync_data[this_cpu].wait_sync--; + local_irq_restore(flags); +} + +/* + * Each CPU (master and target) must decrement the wait_sync value twice (one + * for priming in cache), and also once after the get_cycles. After all the + * loops, one last synchronization is required to make sure the master waits + * for the slave before resetting the barriers. + */ +static void reset_barriers(void) +{ + int i; + + /* + * Wait until slave is done so that we don't overwrite + * wait_end_sync prematurely. + */ + smp_mb(); /* order master/slave sync_data read/write */ + while (unlikely(sync_data[1].wait_sync >= sync_data[0].nr_waits)) + barrier(); /* + * barrier is used because faster and + * more predictable than cpu_idle(). 
+ */ + smp_mb(); /* order master/slave sync_data read/write */ + + for (i = 0; i < 2; i++) { + WARN_ON(sync_data[i].wait_sync != 0); + WARN_ON(sync_data[i].nr_waits != 1); + sync_data[i].wait_sync = 3 * NR_LOOPS + 1; + sync_data[i].nr_waits = 3 * NR_LOOPS + 1; + } +} + +/* + * Do loops (making sure no unexpected event changes the timing), keep the best + * one. The result of each loop is the highest tsc delta between the master CPU + * and the slaves. Stop CPU hotplug when this code is executed to make sure we + * are concurrency-safe wrt CPU hotplug also using this code. Test TSC + * synchronization even if we already "know" CPUs were not synchronized. This + * can be used as a test to check if, for some reason, the CPUs eventually got + * in sync after a CPU has been unplugged. This code is kept separate from the + * CPU hotplug code because the slave CPU executes in an IPI, which we want to + * keep as short as possible (this is happening while the system is running). + * Therefore, we do not send a single IPI for all the test loops, but rather + * send one IPI per loop. + */ +int test_tsc_synchronization(void) +{ + long cpu, master; + cycles_t max_diff = 0, diff, best_loop, worse_loop = 0; + int i; + + mutex_lock(&tscsync_mutex); + get_online_cpus(); + + printk(KERN_INFO + "checking TSC synchronization across all online CPUs:"); + + preempt_disable(); + master = smp_processor_id(); + for_each_online_cpu(cpu) { + if (master == cpu) + continue; + best_loop = (cycles_t)ULLONG_MAX; + for (i = 0; i < NR_LOOPS; i++) { + smp_call_function_single(cpu, test_sync, + (void *)1UL, 0); + test_sync((void *)0UL); + diff = abs(sync_data[1].tsc_count + - sync_data[0].tsc_count); + best_loop = min(best_loop, diff); + worse_loop = max(worse_loop, diff); + } + reset_barriers(); + max_diff = max(best_loop, max_diff); + } + preempt_enable(); + if (max_diff >= MAX_CYCLES_DELTA) { + printk(KERN_WARNING + "Measured %llu cycles TSC offset between CPUs," + " turning off TSC clock.\n", (u64)max_diff); + mark_tsc_unstable("check_tsc_sync_source failed"); + _tsc_is_sync = 0; + } else { + printk(" passed.\n"); + } + put_online_cpus(); + mutex_unlock(&tscsync_mutex); + return max_diff < MAX_CYCLES_DELTA; +} +EXPORT_SYMBOL_GPL(test_tsc_synchronization); + +/* + * Test synchronicity of a single core when it is hotplugged. + * Source CPU calls into this - waits for the freshly booted target CPU to + * arrive and then start the measurement: + */ +void __cpuinit check_tsc_sync_source(int cpu) +{ + cycles_t diff, abs_diff, + best_loop = (cycles_t)ULLONG_MAX, worse_loop = 0; + int i; + + /* + * No need to check if we already know that the TSC is not synchronized: + */ + if (!force_tsc_sync && unsynchronized_tsc()) { + /* + * Make sure we mark _tsc_is_sync to 0 if the TSC is found + * to be unsynchronized for other causes than non-synchronized + * TSCs across CPUs. 
+ */ + _tsc_is_sync = 0; + set_trace_clock_is_sync(0); + return; + } + + printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); + + for (i = 0; i < NR_LOOPS; i++) { + test_sync((void *)0UL); + diff = sync_data[1].tsc_count - sync_data[0].tsc_count; + abs_diff = abs(diff); + best_loop = min(best_loop, abs_diff); + worse_loop = max(worse_loop, abs_diff); + if (force_tsc_sync && best_loop == abs_diff) + slave_offset = diff; + } + reset_barriers(); + + if (!force_tsc_sync && best_loop >= MAX_CYCLES_DELTA) { + printk(" failed.\n"); + printk(KERN_WARNING + "Measured %llu cycles TSC offset between CPUs," + " turning off TSC clock.\n", (u64)best_loop); + mark_tsc_unstable("check_tsc_sync_source failed"); + _tsc_is_sync = 0; + set_trace_clock_is_sync(0); + } else { + printk(" %s.\n", !force_tsc_sync ? "passed" : "forced"); + } + if (force_tsc_sync) { + /* order slave_offset and slave_offset_ready writes */ + smp_wmb(); + slave_offset_ready = 1; + } +} + +/* + * Freshly booted CPUs call into this: + */ +void __cpuinit check_tsc_sync_target(void) +{ + int i; + + if (!force_tsc_sync && unsynchronized_tsc()) + return; + + for (i = 0; i < NR_LOOPS; i++) + test_sync((void *)1UL); + + /* + * Force slave synchronization if requested. + */ + if (force_tsc_sync) { + unsigned long flags; + cycles_t new_tsc; + + while (!slave_offset_ready) + cpu_relax(); + /* order slave_offset and slave_offset_ready reads */ + smp_rmb(); + local_irq_save(flags); + /* + * slave_offset is read when master has finished writing to it, + * and is protected by cpu hotplug serialization. + */ + new_tsc = get_cycles() - slave_offset; + write_tsc((u32)new_tsc, (u32)((u64)new_tsc >> 32)); + local_irq_restore(flags); + } +} diff --git a/kernel/timer.c b/kernel/timer.c index d6459923d24..65cc58ce148 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -40,12 +40,14 @@ #include <linux/irq_work.h> #include <linux/sched.h> #include <linux/slab.h> +#include <trace/timer.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> +#include <asm/irq_regs.h> #define CREATE_TRACE_POINTS #include <trace/events/timer.h> @@ -54,6 +56,10 @@ u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); +DEFINE_TRACE(timer_set); +DEFINE_TRACE(timer_update_time); +DEFINE_TRACE(timer_timeout); + /* * per-CPU timer vector definitions: */ @@ -366,6 +372,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; } + trace_timer_set(timer); /* * Timers are FIFO: */ @@ -1303,8 +1310,13 @@ void run_local_timers(void) void do_timer(unsigned long ticks) { + struct timespec curtime, wtom; + jiffies_64 += ticks; update_wall_time(); + curtime = __current_kernel_time(); + wtom = __get_wall_to_monotonic(); + trace_timer_update_time(&curtime, &wtom); calc_global_load(ticks); } @@ -1387,7 +1399,9 @@ SYSCALL_DEFINE0(getegid) static void process_timeout(unsigned long __data) { - wake_up_process((struct task_struct *)__data); + struct task_struct *task = (struct task_struct *)__data; + trace_timer_timeout(task); + wake_up_process(task); } /** diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c..614d9153a24 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif 
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace-clock-32-to-64.o +obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace-clock.o libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f3dadae8388..888b611897d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3328,7 +3328,7 @@ static int start_graph_tracing(void) /* The cpu_boot init_task->ret_stack will never be freed */ for_each_online_cpu(cpu) { if (!idle_task(cpu)->ret_stack) - ftrace_graph_init_task(idle_task(cpu)); + ftrace_graph_init_idle_task(idle_task(cpu), cpu); } do { @@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void) mutex_unlock(&ftrace_lock); } +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visable before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. + */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { @@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - atomic_set(&t->tracing_graph_pause, 0); - atomic_set(&t->trace_overrun, 0); - t->ftrace_timestamp = 0; - /* make curr_ret_stack visable before we add the ret_stack */ - smp_wmb(); - t->ret_stack = ret_stack; + graph_init_task(t, ret_stack); } } diff --git a/kernel/trace/trace-clock-32-to-64.c b/kernel/trace/trace-clock-32-to-64.c new file mode 100644 index 00000000000..c036f5c5586 --- /dev/null +++ b/kernel/trace/trace-clock-32-to-64.c @@ -0,0 +1,296 @@ +/* + * kernel/trace/trace-clock-32-to-64.c + * + * (C) Copyright 2006,2007,2008 - + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Extends a 32 bits clock source to a full 64 bits count, readable atomically + * from any execution context. + * + * notes : + * - trace clock 32->64 bits extended timer-based clock cannot be used for early + * tracing in the boot process, as it depends on timer interrupts. + * - The timer is only on one CPU to support hotplug. + * - We have the choice between schedule_delayed_work_on and an IPI to get each + * CPU to write the heartbeat. IPI has been chosen because it is considered + * faster than passing through the timer to get the work scheduled on all the + * CPUs. 
+ */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/workqueue.h> +#include <linux/cpu.h> +#include <linux/timex.h> +#include <linux/bitops.h> +#include <linux/trace-clock.h> +#include <linux/smp.h> +#include <linux/sched.h> /* needed due to include order problem on m68k */ +#include <linux/math64.h> + +#define HW_BITMASK ((1ULL << TC_HW_BITS) - 1) +#define HW_LS32(hw) ((hw) & HW_BITMASK) +#define SW_MS32(sw) ((sw) & ~HW_BITMASK) + +static DEFINE_SPINLOCK(synthetic_tsc_lock); +static int synthetic_tsc_refcount; /* Number of readers */ +static int synthetic_tsc_enabled; /* synth. TSC enabled on all online CPUs */ + +static DEFINE_PER_CPU(struct timer_list, tsc_timer); +static unsigned int precalc_expire; + +struct synthetic_tsc_struct { + union { + u64 val; + struct { +#ifdef __BIG_ENDIAN + u32 ms32; + u32 ls32; +#else + u32 ls32; + u32 ms32; +#endif + } sel; + } tsc[2]; + unsigned int index; /* Index of the current synth. tsc. */ +}; + +static DEFINE_PER_CPU(struct synthetic_tsc_struct, synthetic_tsc); + +/* Called from IPI or timer interrupt */ +static void update_synthetic_tsc(void) +{ + struct synthetic_tsc_struct *cpu_synth; + u32 tsc; + + cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id()); + tsc = trace_clock_read32(); /* Hardware clocksource read */ + + if (tsc < HW_LS32(cpu_synth->tsc[cpu_synth->index].sel.ls32)) { + unsigned int new_index = 1 - cpu_synth->index; /* 0 <-> 1 */ + /* + * Overflow + * Non atomic update of the non current synthetic TSC, followed + * by an atomic index change. There is no write concurrency, + * so the index read/write does not need to be atomic. + */ + cpu_synth->tsc[new_index].val = + (SW_MS32(cpu_synth->tsc[cpu_synth->index].val) + | (u64)tsc) + (1ULL << TC_HW_BITS); + /* + * Ensure the compiler does not reorder index write. It makes + * sure all nested interrupts will see the new value before the + * new index is written. + */ + barrier(); + cpu_synth->index = new_index; /* atomic change of index */ + } else { + /* + * No overflow : We know that the only bits changed are + * contained in the 32 LS32s, which can be written to atomically. + */ + cpu_synth->tsc[cpu_synth->index].sel.ls32 = + SW_MS32(cpu_synth->tsc[cpu_synth->index].sel.ls32) | tsc; + } +} + +/* + * Should only be called when interrupts are off. Affects only current CPU. 
+ */ +void _trace_clock_write_synthetic_tsc(u64 value) +{ + struct synthetic_tsc_struct *cpu_synth; + unsigned int new_index; + + cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id()); + new_index = 1 - cpu_synth->index; /* 0 <-> 1 */ + cpu_synth->tsc[new_index].val = value; + barrier(); + cpu_synth->index = new_index; /* atomic change of index */ +} + +/* Called from buffer switch : in _any_ context (even NMI) */ +u64 notrace trace_clock_read_synthetic_tsc(void) +{ + struct synthetic_tsc_struct *cpu_synth; + u64 ret; + unsigned int index; + u32 tsc; + + preempt_disable_notrace(); + cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id()); + index = ACCESS_ONCE(cpu_synth->index); /* atomic read */ + tsc = trace_clock_read32(); /* Hardware clocksource read */ + + /* Overflow detection */ + if (unlikely(tsc < HW_LS32(cpu_synth->tsc[index].sel.ls32))) + ret = (SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc) + + (1ULL << TC_HW_BITS); + else + ret = SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc; + preempt_enable_notrace(); + return ret; +} +EXPORT_SYMBOL_GPL(trace_clock_read_synthetic_tsc); + +static void synthetic_tsc_ipi(void *info) +{ + update_synthetic_tsc(); +} + +/* + * tsc_timer_fct : - Timer function synchronizing synthetic TSC. + * @data: unused + * + * Guarantees at least 1 execution before low word of TSC wraps. + */ +static void tsc_timer_fct(unsigned long data) +{ + update_synthetic_tsc(); + + mod_timer_pinned(&per_cpu(tsc_timer, smp_processor_id()), + jiffies + precalc_expire); +} + +/* + * precalc_stsc_interval: - Precalculates the interval between the clock + * wraparounds. + */ +static int __init precalc_stsc_interval(void) +{ + u64 rem_freq, rem_interval; + + precalc_expire = + __iter_div_u64_rem(HW_BITMASK, ( + __iter_div_u64_rem(trace_clock_frequency(), + HZ * trace_clock_freq_scale(), &rem_freq) << 1 + ) + - 1 + - (TC_EXPECTED_INTERRUPT_LATENCY * HZ / 1000), &rem_interval) + >> 1; + WARN_ON(precalc_expire == 0); + printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n", + precalc_expire); + return 0; +} + +static void prepare_synthetic_tsc(int cpu) +{ + struct synthetic_tsc_struct *cpu_synth; + u64 local_count; + + cpu_synth = &per_cpu(synthetic_tsc, cpu); + local_count = trace_clock_read_synthetic_tsc(); + cpu_synth->tsc[0].val = local_count; + cpu_synth->index = 0; + smp_wmb(); /* Writing in data of CPU about to come up */ + init_timer_deferrable(&per_cpu(tsc_timer, cpu)); + per_cpu(tsc_timer, cpu).function = tsc_timer_fct; + per_cpu(tsc_timer, cpu).expires = jiffies + precalc_expire; +} + +static void enable_synthetic_tsc(int cpu) +{ + smp_call_function_single(cpu, synthetic_tsc_ipi, NULL, 1); + add_timer_on(&per_cpu(tsc_timer, cpu), cpu); +} + +static void disable_synthetic_tsc(int cpu) +{ + del_timer_sync(&per_cpu(tsc_timer, cpu)); +} + +/* + * hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Sets the new CPU's current synthetic TSC to the same value as the + * currently running CPU. + * + * Returns the success/failure of the operation. 
(NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount) + prepare_synthetic_tsc(hotcpu); + spin_unlock(&synthetic_tsc_lock); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount) + enable_synthetic_tsc(hotcpu); + spin_unlock(&synthetic_tsc_lock); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount) + disable_synthetic_tsc(hotcpu); + spin_unlock(&synthetic_tsc_lock); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +void get_synthetic_tsc(void) +{ + int cpu; + + spin_lock(&synthetic_tsc_lock); + if (synthetic_tsc_refcount++) + goto end; + + synthetic_tsc_enabled = 1; + for_each_online_cpu(cpu) { + prepare_synthetic_tsc(cpu); + enable_synthetic_tsc(cpu); + } +end: + spin_unlock(&synthetic_tsc_lock); +} +EXPORT_SYMBOL_GPL(get_synthetic_tsc); + +void put_synthetic_tsc(void) +{ + int cpu; + + spin_lock(&synthetic_tsc_lock); + WARN_ON(synthetic_tsc_refcount <= 0); + if (synthetic_tsc_refcount != 1 || !synthetic_tsc_enabled) + goto end; + + for_each_online_cpu(cpu) + disable_synthetic_tsc(cpu); + synthetic_tsc_enabled = 0; +end: + synthetic_tsc_refcount--; + spin_unlock(&synthetic_tsc_lock); +} +EXPORT_SYMBOL_GPL(put_synthetic_tsc); + +/* Called from CPU 0, before any tracing starts, to init each structure */ +static int __init init_synthetic_tsc(void) +{ + precalc_stsc_interval(); + hotcpu_notifier(hotcpu_callback, 3); + return 0; +} + +/* Before SMP is up */ +/* workaround for omap4 */ +late_initcall(init_synthetic_tsc); diff --git a/kernel/trace/trace-clock.c b/kernel/trace/trace-clock.c new file mode 100644 index 00000000000..3ed1667aacb --- /dev/null +++ b/kernel/trace/trace-clock.c @@ -0,0 +1,97 @@ +/* + * kernel/trace/trace-clock.c + * + * (C) Copyright 2008 - + * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) + * + * Generic kernel tracing clock for architectures without TSC. + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/timer.h> +#include <linux/workqueue.h> +#include <linux/cpu.h> +#include <linux/timex.h> +#include <linux/bitops.h> +#include <linux/trace-clock.h> +#include <linux/jiffies.h> + +static int trace_clock_refcount; +static DEFINE_MUTEX(trace_clock_mutex); +static struct timer_list trace_clock_timer; +/* + * bits 0..12 : counter, atomically incremented + * bits 13..{32,64} : time counter, incremented each jiffy. + */ +atomic_long_t trace_clock_var; +EXPORT_SYMBOL(trace_clock_var); + +static void trace_clock_update(void) +{ + long old_clock, new_clock; + unsigned long ticks; + + /* + * Make sure we keep track of delayed timer. + */ + ticks = jiffies - trace_clock_timer.expires + 1; + /* Don't update if ticks is zero, time would go backward. 
*/ + if (unlikely(!ticks)) + return; + do { + old_clock = atomic_long_read(&trace_clock_var); + new_clock = (old_clock + (ticks << TRACE_CLOCK_SHIFT)) + & (~((1 << TRACE_CLOCK_SHIFT) - 1)); + } while (atomic_long_cmpxchg(&trace_clock_var, old_clock, new_clock) + != old_clock); +} + +static void trace_clock_timer_fct(unsigned long data) +{ + trace_clock_update(); + trace_clock_timer.expires = jiffies + 1; + add_timer(&trace_clock_timer); +} + +static void enable_trace_clock(void) +{ + init_timer(&trace_clock_timer); + /* trace_clock_update() reads expires */ + trace_clock_timer.function = trace_clock_timer_fct; + trace_clock_timer.expires = jiffies + 1; + trace_clock_update(); + add_timer(&trace_clock_timer); +} + +static void disable_trace_clock(void) +{ + del_timer_sync(&trace_clock_timer); +} + +void get_trace_clock(void) +{ + get_synthetic_tsc(); + mutex_lock(&trace_clock_mutex); + if (trace_clock_refcount++) + goto end; + enable_trace_clock(); +end: + mutex_unlock(&trace_clock_mutex); +} +EXPORT_SYMBOL_GPL(get_trace_clock); + +void put_trace_clock(void) +{ + mutex_lock(&trace_clock_mutex); + WARN_ON(trace_clock_refcount <= 0); + if (trace_clock_refcount != 1) + goto end; + disable_trace_clock(); +end: + trace_clock_refcount--; + mutex_unlock(&trace_clock_mutex); + put_synthetic_tsc(); +} +EXPORT_SYMBOL_GPL(put_trace_clock); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2547d8813cf..687699d365a 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -11,6 +11,7 @@ #include <linux/ftrace.h> #include <linux/string.h> #include <linux/module.h> +#include <linux/marker.h> #include <linux/mutex.h> #include <linux/ctype.h> #include <linux/list.h> |
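
The new kernel/trace/trace-clock-32-to-64.c above never reads more than 32 bits of hardware counter at a time: the 64-bit value is reconstructed by folding in one wraparound whenever a fresh read is below the low word of the last published value, and a per-CPU timer/IPI guarantees at least one update per wrap period. Below is a minimal, single-threaded user-space sketch of that folding arithmetic only; hw_read32(), fake_hw and the other names are stand-ins invented for the example, not kernel APIs, and the real code additionally double-buffers the per-CPU value and flips an index so readers in NMI context never see a torn update.

/*
 * Sketch of the 32->64 bit extension used by trace_clock_read_synthetic_tsc().
 * Assumes a 32-bit hardware counter and an updater that runs at least once
 * per wraparound period.
 */
#include <stdint.h>
#include <stdio.h>

#define HW_BITS		32
#define HW_BITMASK	((1ULL << HW_BITS) - 1)
#define HW_LS(v)	((v) & HW_BITMASK)	/* low (hardware) word */
#define SW_MS(v)	((v) & ~HW_BITMASK)	/* high (software) word */

static uint64_t fake_hw;			/* stand-in for the clocksource */
static uint32_t hw_read32(void) { return (uint32_t)fake_hw; }

static uint64_t synthetic;			/* last full 64-bit value published */

/* Periodic update: must run at least once per 32-bit wraparound. */
static void update_synthetic(void)
{
	uint32_t hw = hw_read32();

	if (hw < HW_LS(synthetic))		/* low word wrapped since last update */
		synthetic = (SW_MS(synthetic) | hw) + (1ULL << HW_BITS);
	else
		synthetic = SW_MS(synthetic) | hw;
}

/* Reader: combine the stored upper bits with a fresh hardware read. */
static uint64_t read_synthetic(void)
{
	uint32_t hw = hw_read32();

	if (hw < HW_LS(synthetic))		/* wrap not yet folded in by the updater */
		return (SW_MS(synthetic) | hw) + (1ULL << HW_BITS);
	return SW_MS(synthetic) | hw;
}

int main(void)
{
	fake_hw = 0xfffffff0ULL;
	update_synthetic();
	printf("%llx\n", (unsigned long long)read_synthetic());	/* fffffff0 */
	fake_hw += 0x20;			/* crosses the 32-bit boundary */
	printf("%llx\n", (unsigned long long)read_synthetic());	/* 100000010 */
	return 0;
}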
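
The load_module() hunk above keeps the modules list sorted by ascending struct module address so that iterators which drop module_mutex stay correct: module_get_iter_markers(), visible in this patch, resumes by skipping every module whose address is below the one it last visited rather than by holding a pointer into a list that may have changed. Here is a small, self-contained sketch of that resume-by-address idea on a plain singly linked list; the struct and helper names are invented for illustration and the kernel does the equivalent with struct module, list_add_rcu() and a reverse scan.

/*
 * Keep a list sorted by node address; resume iteration by remembering only
 * the last address returned and restarting at the first node above it.
 */
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;	/* singly linked, ascending node address */
	int payload;
};

static struct node *head;

/* Insert so the list stays sorted by ascending address of the nodes. */
static void insert_sorted(struct node *n)
{
	struct node **pos = &head;

	while (*pos && *pos < n)
		pos = &(*pos)->next;
	n->next = *pos;
	*pos = n;
}

/* Resume point: first node whose address is strictly above @last. */
static struct node *first_above(const struct node *last)
{
	struct node *n;

	if (!last)
		return head;
	for (n = head; n; n = n->next)
		if (n > last)
			return n;
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		struct node *n = malloc(sizeof(*n));
		n->payload = i;
		insert_sorted(n);
	}
	/* Walk by repeatedly resuming from the last address seen. */
	for (struct node *n = first_above(NULL); n; n = first_above(n))
		printf("%p payload=%d\n", (void *)n, n->payload);
	return 0;
}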
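
Several pieces of this patch share one refcounted enable/disable shape: set_kernel_trace_flag_all_tasks()/clear_kernel_trace_flag_all_tasks(), get_synthetic_tsc()/put_synthetic_tsc() and get_trace_clock()/put_trace_clock() all make only the 0->1 transition pay for the expensive global enable and only the 1->0 transition tear it down, under a lock. A generic sketch of that pattern, with purely illustrative names:

/*
 * First user enables, last user disables; everyone in between only counts.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int refcount;

static void do_enable(void)  { puts("enable heavy machinery"); }
static void do_disable(void) { puts("disable heavy machinery"); }

void resource_get(void)
{
	pthread_mutex_lock(&lock);
	if (refcount++ == 0)		/* 0 -> 1: first user enables */
		do_enable();
	pthread_mutex_unlock(&lock);
}

void resource_put(void)
{
	pthread_mutex_lock(&lock);
	if (--refcount == 0)		/* 1 -> 0: last user disables */
		do_disable();
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	resource_get();		/* enables */
	resource_get();		/* counts only */
	resource_put();		/* counts only */
	resource_put();		/* disables */
	return 0;
}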