author     John Stultz <john.stultz@linaro.org>   2011-04-05 12:21:10 -0700
committer  John Stultz <john.stultz@linaro.org>   2011-04-05 12:21:10 -0700
commit     4ce7ea0bfbb301ffb79154b6cecd2ef030db4cdf (patch)
tree       a7ad87580793912a9da208e7a9ea21d6c7768f08 /kernel
parent     4450182f400f1a5f50b1680faec25af2315c2849 (diff)
parent     7c4bc9c2662c6d9840afed0e29eb01314af9bb78 (diff)
Merge branch 'upstream/linaro.38' into linaro-android.38
Conflicts:
	arch/arm/kernel/signal.c
	drivers/mmc/card/block.c
	drivers/mtd/nand/Kconfig
	include/linux/amba/mmci.h
	kernel/printk.c
	mm/shmem.c
	net/socket.c
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                      |    4
-rw-r--r--  kernel/cgroup.c                      |   12
-rw-r--r--  kernel/exit.c                        |    4
-rw-r--r--  kernel/fork.c                        |   10
-rw-r--r--  kernel/irq/handle.c                  |    8
-rw-r--r--  kernel/irq/irqdesc.c                 |    2
-rw-r--r--  kernel/itimer.c                      |    7
-rw-r--r--  kernel/kexec.c                       |    8
-rw-r--r--  kernel/lockdep.c                     |   22
-rw-r--r--  kernel/ltt-channels.c                |  388
-rw-r--r--  kernel/marker.c                      | 1262
-rw-r--r--  kernel/module.c                      |  110
-rw-r--r--  kernel/notifier.c                    |   31
-rw-r--r--  kernel/panic.c                       |    7
-rw-r--r--  kernel/perf_event.c                  |   17
-rw-r--r--  kernel/printk.c                      |    7
-rw-r--r--  kernel/rcutree.c                     |    7
-rw-r--r--  kernel/sched.c                       |   56
-rw-r--r--  kernel/signal.c                      |   16
-rw-r--r--  kernel/smp.c                         |   55
-rw-r--r--  kernel/softirq.c                     |   27
-rw-r--r--  kernel/sysctl.c                      |   18
-rw-r--r--  kernel/time/Makefile                 |    1
-rw-r--r--  kernel/time/tsc-sync.c               |  313
-rw-r--r--  kernel/timer.c                       |   16
-rw-r--r--  kernel/trace/Makefile                |    2
-rw-r--r--  kernel/trace/ftrace.c                |   52
-rw-r--r--  kernel/trace/trace-clock-32-to-64.c  |  296
-rw-r--r--  kernel/trace/trace-clock.c           |   97
-rw-r--r--  kernel/trace/trace_printk.c          |    1
30 files changed, 2811 insertions(+), 45 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba3..c039580ba3b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -91,6 +91,7 @@ obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
+obj-$(CONFIG_MARKERS) += marker.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
obj-$(CONFIG_BINFMT_ELF) += elfcore.o
@@ -99,7 +100,10 @@ obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_X86_DS) += trace/
+obj-$(CONFIG_MARKERS) += ltt-channels.o
obj-$(CONFIG_RING_BUFFER) += trace/
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace/
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d96bd1eb562..d83723e7ee0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1820,10 +1820,8 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
/* Update the css_set linked lists if we're using them */
write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
@@ -3670,12 +3668,12 @@ again:
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
+ list_del_init(&cgrp->release_list);
spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
+ list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
d = dget(cgrp->dentry);
@@ -3893,7 +3891,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
- list_del(&ss->sibling);
+ list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
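The cgroup.c hunks above swap plain list_del() (and list_del()+list_add()) for list_del_init() and list_move(). A minimal sketch, not part of the patch, of why the _init variant matters when the same node may later be tested with list_empty(); the helper name and the WARN_ON are illustrative only:

#include <linux/list.h>
#include <linux/bug.h>

/*
 * After list_del_init() the node points back to itself, so list_empty(node)
 * reads true and the node can safely be re-checked or re-added later.
 * Plain list_del() leaves poison pointers behind instead.
 */
static inline void example_detach(struct list_head *node)
{
	if (!list_empty(node))
		list_del_init(node);
	/* Safe only because list_del_init() reinitialized the node. */
	WARN_ON(!list_empty(node));
}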
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b..0d9a3444614 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -514,6 +514,8 @@ struct files_struct *get_files_struct(struct task_struct *task)
return files;
}
+EXPORT_SYMBOL(get_files_struct);
+
void put_files_struct(struct files_struct *files)
{
struct fdtable *fdt;
@@ -535,6 +537,8 @@ void put_files_struct(struct files_struct *files)
}
}
+EXPORT_SYMBOL(put_files_struct);
+
void reset_files_struct(struct files_struct *files)
{
struct task_struct *tsk = current;
diff --git a/kernel/fork.c b/kernel/fork.c
index 515abda4084..131ce90d19d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -88,6 +88,7 @@ int max_threads; /* tunable limit on nr_threads */
DEFINE_PER_CPU(unsigned long, process_counts) = 0;
__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
+EXPORT_SYMBOL(tasklist_lock);
#ifdef CONFIG_PROVE_RCU
int lockdep_tasklist_lock_is_held(void)
@@ -1266,6 +1267,15 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
+ /*
+ * The state of the parent's TIF_KERNEL_TRACE flag may have changed
+ * since it was copied in dup_task_struct() so we re-copy it here.
+ */
+ if (test_thread_flag(TIF_KERNEL_TRACE))
+ set_tsk_thread_flag(p, TIF_KERNEL_TRACE);
+ else
+ clear_tsk_thread_flag(p, TIF_KERNEL_TRACE);
+
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a719012..db864334a95 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
#include <linux/kernel_stat.h>
#include <trace/events/irq.h>
+#include <trace/irq.h>
#include "internals.h"
@@ -51,6 +52,9 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
+DEFINE_TRACE(irq_entry);
+DEFINE_TRACE(irq_exit);
+
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -63,6 +67,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
irqreturn_t ret, retval = IRQ_NONE;
unsigned int status = 0;
+ trace_irq_entry(irq, NULL, action);
+
do {
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
@@ -116,5 +122,7 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
add_interrupt_randomness(irq);
local_irq_disable();
+ trace_irq_exit(retval);
+
return retval;
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bd..1c07afd307f 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -109,6 +109,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return radix_tree_lookup(&irq_desc_tree, irq);
}
+EXPORT_SYMBOL_GPL(irq_to_desc);
static void delete_irq_desc(unsigned int irq)
{
@@ -273,6 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
+EXPORT_SYMBOL_GPL(irq_to_desc);
struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
{
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153d..18fd8e919c0 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -13,9 +13,13 @@
#include <linux/posix-timers.h>
#include <linux/hrtimer.h>
#include <trace/events/timer.h>
+#include <trace/timer.h>
#include <asm/uaccess.h>
+DEFINE_TRACE(timer_itimer_expired);
+DEFINE_TRACE(timer_itimer_set);
+
/**
* itimer_get_remtime - get remaining time for the timer
*
@@ -124,6 +128,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
container_of(timer, struct signal_struct, real_timer);
trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0);
+ trace_timer_itimer_expired(sig);
kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid);
return HRTIMER_NORESTART;
@@ -201,6 +206,8 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
!timeval_valid(&value->it_interval))
return -EINVAL;
+ trace_timer_itimer_set(which, value);
+
switch (which) {
case ITIMER_REAL:
again:
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7eb..779f0031929 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/kmsg_dump.h>
+#include <trace/kernel.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -40,6 +41,9 @@
#include <asm/system.h>
#include <asm/sections.h>
+DEFINE_TRACE(kernel_kernel_kexec);
+DEFINE_TRACE(kernel_crash_kexec);
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -1066,6 +1070,8 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
void crash_kexec(struct pt_regs *regs)
{
+ trace_kernel_crash_kexec(kexec_crash_image, regs);
+
/* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
@@ -1495,6 +1501,8 @@ int kernel_kexec(void)
{
int error = 0;
+ trace_kernel_kernel_kexec(kexec_image);
+
if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
if (!kexec_image) {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f..e0841c537db 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,6 +49,8 @@
#include "lockdep_internals.h"
+#include <trace/lockdep.h>
+
#define CREATE_TRACE_POINTS
#include <trace/events/lock.h>
@@ -66,6 +68,13 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+DEFINE_TRACE(lockdep_hardirqs_on);
+DEFINE_TRACE(lockdep_hardirqs_off);
+DEFINE_TRACE(lockdep_softirqs_on);
+DEFINE_TRACE(lockdep_softirqs_off);
+DEFINE_TRACE(lockdep_lock_acquire);
+DEFINE_TRACE(lockdep_lock_release);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -2300,6 +2309,8 @@ void trace_hardirqs_on_caller(unsigned long ip)
time_hardirqs_on(CALLER_ADDR0, ip);
+ trace_lockdep_hardirqs_on(ip);
+
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2358,6 +2369,8 @@ void trace_hardirqs_off_caller(unsigned long ip)
time_hardirqs_off(CALLER_ADDR0, ip);
+ trace_lockdep_hardirqs_off(ip);
+
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
@@ -2390,6 +2403,8 @@ void trace_softirqs_on(unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_softirqs_on(ip);
+
if (unlikely(!debug_locks))
return;
@@ -2424,6 +2439,8 @@ void trace_softirqs_off(unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_softirqs_off(ip);
+
if (unlikely(!debug_locks))
return;
@@ -2730,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int class_idx;
u64 chain_key;
+ trace_lockdep_lock_acquire(ip, subclass, lock, trylock, read,
+ hardirqs_off);
+
if (!prove_locking)
check = 1;
@@ -3108,6 +3128,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
struct task_struct *curr = current;
+ trace_lockdep_lock_release(ip, lock, nested);
+
if (!check_unlock(curr, lock, ip))
return;
diff --git a/kernel/ltt-channels.c b/kernel/ltt-channels.c
new file mode 100644
index 00000000000..102513874ad
--- /dev/null
+++ b/kernel/ltt-channels.c
@@ -0,0 +1,388 @@
+/*
+ * ltt/ltt-channels.c
+ *
+ * (C) Copyright 2008 - Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * LTTng channel management.
+ *
+ * Author:
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Dual LGPL v2.1/GPL v2 license.
+ */
+
+#include <linux/module.h>
+#include <linux/ltt-channels.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+/*
+ * ltt_channel_mutex may be nested inside the LTT trace mutex.
+ * ltt_channel_mutex may be nested inside the markers mutex.
+ */
+static DEFINE_MUTEX(ltt_channel_mutex);
+static LIST_HEAD(ltt_channels);
+/*
+ * Index of next channel in array. Makes sure that as long as a trace channel is
+ * allocated, no array index will be re-used when a channel is freed and then
+ * another channel is allocated. This index is cleared and the array indexes
+ * get reassigned when the index_kref goes back to 0, which indicates that no
+ * more trace channels are allocated.
+ */
+static unsigned int free_index;
+/* index_kref is protected by both ltt_channel_mutex and lock_markers */
+static struct kref index_kref; /* Keeps track of allocated trace channels */
+
+static struct ltt_channel_setting *lookup_channel(const char *name)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (strcmp(name, iter->name) == 0)
+ return iter;
+ return NULL;
+}
+
+/*
+ * Must be called when channel refcount falls to 0 _and_ also when the last
+ * trace is freed. This function is responsible for compacting the channel and
+ * event IDs when no users are active.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_channel_setting(struct kref *kref)
+{
+ struct ltt_channel_setting *setting = container_of(kref,
+ struct ltt_channel_setting, kref);
+ struct ltt_channel_setting *iter;
+
+ if (atomic_read(&index_kref.refcount) == 0
+ && atomic_read(&setting->kref.refcount) == 0) {
+ list_del(&setting->list);
+ kfree(setting);
+
+ free_index = 0;
+ list_for_each_entry(iter, &ltt_channels, list) {
+ iter->index = free_index++;
+ iter->free_event_id = 0;
+ }
+ }
+}
+
+/*
+ * Perform channel index compaction when the last trace channel is freed.
+ *
+ * Called with lock_markers() and channels mutex held.
+ */
+static void release_trace_channel(struct kref *kref)
+{
+ struct ltt_channel_setting *iter, *n;
+
+ list_for_each_entry_safe(iter, n, &ltt_channels, list)
+ release_channel_setting(&iter->kref);
+ if (atomic_read(&index_kref.refcount) == 0)
+ markers_compact_event_ids();
+}
+
+/*
+ * ltt_channels_trace_ref - Is there an existing trace session?
+ *
+ * Must be called with lock_markers() held.
+ */
+int ltt_channels_trace_ref(void)
+{
+ return !!atomic_read(&index_kref.refcount);
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_ref);
+
+/**
+ * ltt_channels_register - Register a trace channel.
+ * @name: channel name
+ *
+ * Uses refcounting.
+ */
+int ltt_channels_register(const char *name)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (setting) {
+ if (atomic_read(&setting->kref.refcount) == 0)
+ goto init_kref;
+ else {
+ kref_get(&setting->kref);
+ goto end;
+ }
+ }
+ setting = kzalloc(sizeof(*setting), GFP_KERNEL);
+ if (!setting) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ list_add(&setting->list, &ltt_channels);
+ strncpy(setting->name, name, PATH_MAX-1);
+ setting->index = free_index++;
+init_kref:
+ kref_init(&setting->kref);
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_register);
+
+/**
+ * ltt_channels_unregister - Unregister a trace channel.
+ * @name: channel name
+ * @compacting: performing compaction
+ *
+ * Must be called with markers mutex held.
+ */
+int ltt_channels_unregister(const char *name, int compacting)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ if (!compacting)
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+ kref_put(&setting->kref, release_channel_setting);
+ if (!compacting && atomic_read(&index_kref.refcount) == 0)
+ markers_compact_event_ids();
+end:
+ if (!compacting)
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_unregister);
+
+/**
+ * ltt_channels_set_default - Set channel default behavior.
+ * @name: default channel name
+ * @sb_size: size of the subbuffers
+ * @n_sb: number of subbuffers
+ */
+int ltt_channels_set_default(const char *name,
+ unsigned int sb_size,
+ unsigned int n_sb)
+{
+ struct ltt_channel_setting *setting;
+ int ret = 0;
+
+ mutex_lock(&ltt_channel_mutex);
+ setting = lookup_channel(name);
+ if (!setting || atomic_read(&setting->kref.refcount) == 0) {
+ ret = -ENOENT;
+ goto end;
+ }
+ setting->sb_size = sb_size;
+ setting->n_sb = n_sb;
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_set_default);
+
+/**
+ * ltt_channels_get_name_from_index - get channel name from channel index
+ * @index: channel index
+ *
+ * Looks up the channel name given its index. Done to keep the name
+ * information outside of each trace channel instance.
+ */
+const char *ltt_channels_get_name_from_index(unsigned int index)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (iter->index == index && atomic_read(&iter->kref.refcount))
+ return iter->name;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_name_from_index);
+
+static struct ltt_channel_setting *
+ltt_channels_get_setting_from_name(const char *name)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ if (!strcmp(iter->name, name)
+ && atomic_read(&iter->kref.refcount))
+ return iter;
+ return NULL;
+}
+
+/**
+ * ltt_channels_get_index_from_name - get channel index from channel name
+ * @name: channel name
+ *
+ * Looks up the channel index given its name. Done to keep the name
+ * information outside of each trace channel instance.
+ * Returns -1 if not found.
+ */
+int ltt_channels_get_index_from_name(const char *name)
+{
+ struct ltt_channel_setting *setting;
+
+ setting = ltt_channels_get_setting_from_name(name);
+ if (setting)
+ return setting->index;
+ else
+ return -1;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_get_index_from_name);
+
+/**
+ * ltt_channels_trace_alloc - Allocate channel structures for a trace
+ * @sb_size: subbuffer size. 0 uses default.
+ * @n_sb: number of subbuffers per per-cpu buffers. 0 uses default.
+ * @flags: Default channel flags
+ *
+ * Use the current channel list to allocate the channels for a trace.
+ * Called with trace lock held. Does not perform the trace buffer allocation,
+ * because we must let the user overwrite specific channel sizes.
+ */
+struct ltt_chan *ltt_channels_trace_alloc(unsigned int *nr_channels,
+ int overwrite, int active)
+{
+ struct ltt_chan *chan = NULL;
+ struct ltt_channel_setting *iter;
+
+ lock_markers();
+ mutex_lock(&ltt_channel_mutex);
+ if (!free_index)
+ goto end;
+ if (!atomic_read(&index_kref.refcount))
+ kref_init(&index_kref);
+ else
+ kref_get(&index_kref);
+ *nr_channels = free_index;
+ chan = kzalloc(sizeof(struct ltt_chan) * free_index, GFP_KERNEL);
+ if (!chan)
+ goto end;
+ list_for_each_entry(iter, &ltt_channels, list) {
+ if (!atomic_read(&iter->kref.refcount))
+ continue;
+ chan[iter->index].a.sb_size = iter->sb_size;
+ chan[iter->index].a.n_sb = iter->n_sb;
+ chan[iter->index].overwrite = overwrite;
+ chan[iter->index].active = active;
+ strncpy(chan[iter->index].a.filename, iter->name, NAME_MAX - 1);
+ chan[iter->index].switch_timer_interval = 0;
+ }
+end:
+ mutex_unlock(&ltt_channel_mutex);
+ unlock_markers();
+ return chan;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_alloc);
+
+/**
+ * ltt_channels_trace_free - Free one trace's channels
+ * @channels: channels to free
+ *
+ * Called with trace lock held. The actual channel buffers must be freed before
+ * this function is called.
+ */
+void ltt_channels_trace_free(struct ltt_chan *channels,
+ unsigned int nr_channels)
+{
+ lock_markers();
+ mutex_lock(&ltt_channel_mutex);
+ kfree(channels);
+ kref_put(&index_kref, release_trace_channel);
+ mutex_unlock(&ltt_channel_mutex);
+ unlock_markers();
+ marker_update_probes();
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_free);
+
+/**
+ * ltt_channels_trace_set_timer - set switch timer
+ * @channel: channel
+ * @interval: interval of timer interrupt, in jiffies. 0 inhibits timer.
+ */
+
+void ltt_channels_trace_set_timer(struct ltt_chan *chan,
+ unsigned long interval)
+{
+ chan->switch_timer_interval = interval;
+}
+EXPORT_SYMBOL_GPL(ltt_channels_trace_set_timer);
+
+/**
+ * _ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ * Must be called with channels mutex held.
+ */
+int _ltt_channels_get_event_id(const char *channel, const char *name)
+{
+ struct ltt_channel_setting *setting;
+ int ret;
+
+ setting = ltt_channels_get_setting_from_name(channel);
+ if (!setting) {
+ ret = -ENOENT;
+ goto end;
+ }
+ if (strcmp(channel, "metadata") == 0) {
+ if (strcmp(name, "core_marker_id") == 0)
+ ret = 0;
+ else if (strcmp(name, "core_marker_format") == 0)
+ ret = 1;
+ else
+ ret = -ENOENT;
+ goto end;
+ }
+ if (setting->free_event_id == EVENTS_PER_CHANNEL - 1) {
+ ret = -ENOSPC;
+ goto end;
+ }
+ ret = setting->free_event_id++;
+end:
+ return ret;
+}
+
+/**
+ * ltt_channels_get_event_id - get next event ID for a marker
+ * @channel: channel name
+ * @name: event name
+ *
+ * Returns a unique event ID (for this channel) or < 0 on error.
+ */
+int ltt_channels_get_event_id(const char *channel, const char *name)
+{
+ int ret;
+
+ mutex_lock(&ltt_channel_mutex);
+ ret = _ltt_channels_get_event_id(channel, name);
+ mutex_unlock(&ltt_channel_mutex);
+ return ret;
+}
+
+/**
+ * _ltt_channels_reset_event_ids - reset event IDs at compaction
+ *
+ * Called with lock marker and channel mutex held.
+ */
+void _ltt_channels_reset_event_ids(void)
+{
+ struct ltt_channel_setting *iter;
+
+ list_for_each_entry(iter, &ltt_channels, list)
+ iter->free_event_id = 0;
+}
+
+MODULE_LICENSE("GPL and additional rights");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit Next Generation Channel Management");
diff --git a/kernel/marker.c b/kernel/marker.c
new file mode 100644
index 00000000000..eac8ebfc3b9
--- /dev/null
+++ b/kernel/marker.c
@@ -0,0 +1,1262 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/marker.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/immediate.h>
+#include <linux/ltt-channels.h>
+
+extern struct marker __start___markers[];
+extern struct marker __stop___markers[];
+
+/* Set to 1 to enable marker debug output */
+static const int marker_debug;
+
+/*
+ * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
+ * and module markers and the hash table.
+ * markers_mutex nests inside the trace lock, to ensure event ID consistency
+ * between the hash table and the marker section.
+ */
+static DEFINE_MUTEX(markers_mutex);
+
+void lock_markers(void)
+{
+ mutex_lock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(lock_markers);
+
+void unlock_markers(void)
+{
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(unlock_markers);
+
+/*
+ * Marker hash table, containing the active markers.
+ * Protected by module_mutex.
+ */
+#define MARKER_HASH_BITS 6
+#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
+static struct hlist_head id_table[MARKER_TABLE_SIZE];
+
+struct marker_probe_array {
+ struct rcu_head rcu;
+ struct marker_probe_closure c[0];
+};
+
+/*
+ * Note about RCU :
+ * It is used to make sure every handler has finished using its private data
+ * between two consecutive operation (add or remove) on a given marker. It is
+ * also used to delay the free of multiple probes array until a quiescent state
+ * is reached.
+ * marker entries modifications are protected by the markers_mutex.
+ */
+struct marker_entry {
+ struct hlist_node hlist;
+ struct hlist_node id_list;
+ char *format;
+ char *name;
+ /* Probe wrapper */
+ void (*call)(const struct marker *mdata, void *call_private, ...);
+ struct marker_probe_closure single;
+ struct marker_probe_array *multi;
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ u16 channel_id;
+ u16 event_id;
+ unsigned char ptype:1;
+ unsigned char format_allocated:1;
+ char channel[0]; /* Contains channel'\0'name'\0'format'\0' */
+};
+
+/**
+ * __mark_empty_function - Empty probe callback
+ * @mdata: marker data
+ * @probe_private: probe private data
+ * @call_private: call site private data
+ * @fmt: format string
+ * @...: variable argument list
+ *
+ * Empty callback provided as a probe to the markers. By providing this to a
+ * disabled marker, we make sure the execution flow is always valid even
+ * though the function pointer change and the marker enabling are two distinct
+ * operations that modify the execution flow of preemptible code.
+ */
+notrace void __mark_empty_function(const struct marker *mdata,
+ void *probe_private, void *call_private, const char *fmt, va_list *args)
+{
+}
+EXPORT_SYMBOL_GPL(__mark_empty_function);
+
+/*
+ * marker_probe_cb - Callback that prepares the variable argument list for probes.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Since we do not use "typical" pointer based RCU in the 1 argument case, we
+ * need to put a full smp_rmb() in this branch. This is why we do not use
+ * rcu_dereference() for the pointer read.
+ */
+notrace void marker_probe_cb(const struct marker *mdata,
+ void *call_private, ...)
+{
+ va_list args;
+ char ptype;
+
+ /*
+ * rcu_read_lock_sched does two things: it disables preemption to make
+ * sure the teardown of the callbacks can be done correctly when they
+ * are in modules, and it ensures RCU read coherency.
+ */
+ rcu_read_lock_sched_notrace();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ va_start(args, call_private);
+ func(mdata, mdata->single.probe_private, call_private,
+ mdata->format, &args);
+ va_end(args);
+ } else {
+ struct marker_probe_array *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi->c[i].func; i++) {
+ va_start(args, call_private);
+ multi->c[i].func(mdata, multi->c[i].probe_private,
+ call_private, mdata->format, &args);
+ va_end(args);
+ }
+ }
+ rcu_read_unlock_sched_notrace();
+}
+EXPORT_SYMBOL_GPL(marker_probe_cb);
+
+/*
+ * marker_probe_cb_noarg - Callback that does not prepare the variable argument list.
+ * @mdata: pointer of type struct marker
+ * @call_private: caller site private data
+ * @...: Variable argument list.
+ *
+ * Should be connected to markers "MARK_NOARGS".
+ */
+static notrace void marker_probe_cb_noarg(const struct marker *mdata,
+ void *call_private, ...)
+{
+ va_list args; /* not initialized */
+ char ptype;
+
+ rcu_read_lock_sched_notrace();
+ ptype = mdata->ptype;
+ if (likely(!ptype)) {
+ marker_probe_func *func;
+ /* Must read the ptype before ptr. They are not data dependent,
+ * so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func = mdata->single.func;
+ /* Must read the ptr before private data. They are not data
+ * dependent, so we put an explicit smp_rmb() here. */
+ smp_rmb();
+ func(mdata, mdata->single.probe_private, call_private,
+ mdata->format, &args);
+ } else {
+ struct marker_probe_array *multi;
+ int i;
+ /*
+ * Read mdata->ptype before mdata->multi.
+ */
+ smp_rmb();
+ multi = mdata->multi;
+ /*
+ * multi points to an array, therefore accessing the array
+ * depends on reading multi. However, even in this case,
+ * we must ensure that the pointer is read _before_ the array
+ * data. Same as rcu_dereference, but we need a full smp_rmb()
+ * in the fast path, so put the explicit barrier here.
+ */
+ smp_read_barrier_depends();
+ for (i = 0; multi->c[i].func; i++)
+ multi->c[i].func(mdata, multi->c[i].probe_private,
+ call_private, mdata->format, &args);
+ }
+ rcu_read_unlock_sched_notrace();
+}
+
+static void free_old_closure(struct rcu_head *head)
+{
+ struct marker_probe_array *multi = container_of(head, struct marker_probe_array, rcu);
+ kfree(multi);
+}
+
+static void debug_print_probes(struct marker_entry *entry)
+{
+ int i;
+
+ if (!marker_debug)
+ return;
+
+ if (!entry->ptype) {
+ printk(KERN_DEBUG "Single probe : %p %p\n",
+ entry->single.func,
+ entry->single.probe_private);
+ } else {
+ for (i = 0; entry->multi->c[i].func; i++)
+ printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
+ entry->multi->c[i].func,
+ entry->multi->c[i].probe_private);
+ }
+}
+
+static struct marker_probe_array *
+marker_entry_add_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0;
+ struct marker_probe_array *old, *new;
+
+ WARN_ON(!probe);
+
+ debug_print_probes(entry);
+ old = entry->multi;
+ if (!entry->ptype) {
+ if (entry->single.func == probe &&
+ entry->single.probe_private == probe_private)
+ return ERR_PTR(-EBUSY);
+ if (entry->single.func == __mark_empty_function) {
+ /* 0 -> 1 probes */
+ entry->single.func = probe;
+ entry->single.probe_private = probe_private;
+ entry->refcount = 1;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* 1 -> 2 probes */
+ nr_probes = 1;
+ old = NULL;
+ }
+ } else {
+ /* (N -> N+1), (N != 0, 1) probes */
+ for (nr_probes = 0; old->c[nr_probes].func; nr_probes++)
+ if (old->c[nr_probes].func == probe
+ && old->c[nr_probes].probe_private
+ == probe_private)
+ return ERR_PTR(-EBUSY);
+ }
+ /* + 2 : one for new probe, one for NULL func */
+ new = kzalloc(sizeof(struct marker_probe_array)
+ + ((nr_probes + 2) * sizeof(struct marker_probe_closure)),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ if (!old)
+ new->c[0] = entry->single;
+ else
+ memcpy(&new->c[0], &old->c[0],
+ nr_probes * sizeof(struct marker_probe_closure));
+ new->c[nr_probes].func = probe;
+ new->c[nr_probes].probe_private = probe_private;
+ entry->refcount = nr_probes + 1;
+ entry->multi = new;
+ entry->ptype = 1;
+ debug_print_probes(entry);
+ return old;
+}
+
+static struct marker_probe_array *
+marker_entry_remove_probe(struct marker_entry *entry,
+ marker_probe_func *probe, void *probe_private)
+{
+ int nr_probes = 0, nr_del = 0, i;
+ struct marker_probe_array *old, *new;
+
+ old = entry->multi;
+
+ debug_print_probes(entry);
+ if (!entry->ptype) {
+ /* 0 -> N is an error */
+ WARN_ON(entry->single.func == __mark_empty_function);
+ /* 1 -> 0 probes */
+ WARN_ON(probe && entry->single.func != probe);
+ WARN_ON(entry->single.probe_private != probe_private);
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ debug_print_probes(entry);
+ return NULL;
+ } else {
+ /* (N -> M), (N > 1, M >= 0) probes */
+ for (nr_probes = 0; old->c[nr_probes].func; nr_probes++) {
+ if ((!probe || old->c[nr_probes].func == probe)
+ && old->c[nr_probes].probe_private
+ == probe_private)
+ nr_del++;
+ }
+ }
+
+ if (nr_probes - nr_del == 0) {
+ /* N -> 0, (N > 1) */
+ entry->single.func = __mark_empty_function;
+ entry->refcount = 0;
+ entry->ptype = 0;
+ } else if (nr_probes - nr_del == 1) {
+ /* N -> 1, (N > 1) */
+ for (i = 0; old->c[i].func; i++)
+ if ((probe && old->c[i].func != probe) ||
+ old->c[i].probe_private != probe_private)
+ entry->single = old->c[i];
+ entry->refcount = 1;
+ entry->ptype = 0;
+ } else {
+ int j = 0;
+ /* N -> M, (N > 1, M > 1) */
+ /* + 1 for NULL */
+ new = kzalloc(sizeof(struct marker_probe_array)
+ + ((nr_probes - nr_del + 1)
+ * sizeof(struct marker_probe_closure)),
+ GFP_KERNEL);
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ for (i = 0; old->c[i].func; i++)
+ if ((probe && old->c[i].func != probe) ||
+ old->c[i].probe_private != probe_private)
+ new->c[j++] = old->c[i];
+ entry->refcount = nr_probes - nr_del;
+ entry->ptype = 1;
+ entry->multi = new;
+ }
+ debug_print_probes(entry);
+ return old;
+}
+
+/*
+ * Get marker if the marker is present in the marker hash table.
+ * Must be called with markers_mutex held.
+ * Returns NULL if not present.
+ */
+static struct marker_entry *get_marker(const char *channel, const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name))
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * Add the marker to the marker hash table. Must be called with markers_mutex
+ * held.
+ */
+static struct marker_entry *add_marker(const char *channel, const char *name,
+ const char *format)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ size_t format_len = 0;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ if (format)
+ format_len = strlen(format) + 1;
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ printk(KERN_NOTICE
+ "Marker %s.%s busy\n", channel, name);
+ return ERR_PTR(-EBUSY); /* Already there */
+ }
+ }
+ /*
+ * Using kmalloc here to allocate a variable length element. Could
+ * cause some memory fragmentation if overused.
+ */
+ e = kmalloc(sizeof(struct marker_entry)
+ + channel_len + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return ERR_PTR(-ENOMEM);
+ memcpy(e->channel, channel, channel_len);
+ e->name = &e->channel[channel_len];
+ memcpy(e->name, name, name_len);
+ if (format) {
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ if (strcmp(e->format, MARK_NOARGS) == 0)
+ e->call = marker_probe_cb_noarg;
+ else
+ e->call = marker_probe_cb;
+ trace_mark(metadata, core_marker_format,
+ "channel %s name %s format %s",
+ e->channel, e->name, e->format);
+ } else {
+ e->format = NULL;
+ e->call = marker_probe_cb;
+ }
+ e->single.func = __mark_empty_function;
+ e->single.probe_private = NULL;
+ e->multi = NULL;
+ e->ptype = 0;
+ e->format_allocated = 0;
+ e->refcount = 0;
+ hlist_add_head(&e->hlist, head);
+ return e;
+}
+
+/*
+ * Remove the marker from the marker hash table. Must be called with mutex_lock
+ * held. Parameter "registered" indicates if the channel registration has been
+ * performed.
+ */
+static int remove_marker(const char *channel, const char *name, int registered,
+ int compacting)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ int found = 0;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ u32 hash;
+ int ret;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ return -ENOENT;
+ if (e->single.func != __mark_empty_function)
+ return -EBUSY;
+
+ if (registered && ltt_channels_trace_ref())
+ return 0;
+
+ hlist_del(&e->hlist);
+ hlist_del(&e->id_list);
+ if (registered) {
+ ret = ltt_channels_unregister(e->channel, compacting);
+ WARN_ON(ret);
+ }
+ if (e->format_allocated)
+ kfree(e->format);
+ kfree(e);
+ return 0;
+}
+
+/*
+ * Set the mark_entry format to the format found in the element.
+ */
+static int marker_set_format(struct marker_entry *entry, const char *format)
+{
+ entry->format = kstrdup(format, GFP_KERNEL);
+ if (!entry->format)
+ return -ENOMEM;
+ entry->format_allocated = 1;
+
+ trace_mark(metadata, core_marker_format,
+ "channel %s name %s format %s",
+ entry->channel, entry->name, entry->format);
+ return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one marker.
+ */
+static int set_marker(struct marker_entry *entry, struct marker *elem,
+ int active)
+{
+ int ret = 0;
+ WARN_ON(strcmp(entry->name, elem->name) != 0);
+
+ if (entry->format) {
+ if (strcmp(entry->format, elem->format) != 0) {
+ printk(KERN_NOTICE
+ "Format mismatch for probe %s "
+ "(%s), marker (%s)\n",
+ entry->name,
+ entry->format,
+ elem->format);
+ return -EPERM;
+ }
+ } else {
+ ret = marker_set_format(entry, elem->format);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * probe_cb setup (statically known) is done here. It is
+ * asynchronous with the rest of execution, therefore we only
+ * pass from a "safe" callback (with argument) to an "unsafe"
+ * callback (does not set arguments).
+ */
+ elem->call = entry->call;
+ elem->channel_id = entry->channel_id;
+ elem->event_id = entry->event_id;
+ /*
+ * Sanity check :
+ * We only update the single probe private data when the ptr is
+ * set to a _non_ single probe! (0 -> 1 and N -> 1, N != 1)
+ */
+ WARN_ON(elem->single.func != __mark_empty_function
+ && elem->single.probe_private != entry->single.probe_private
+ && !elem->ptype);
+ elem->single.probe_private = entry->single.probe_private;
+ /*
+ * Make sure the private data is valid when we update the
+ * single probe ptr.
+ */
+ smp_wmb();
+ elem->single.func = entry->single.func;
+ /*
+ * We also make sure that the new probe callbacks array is consistent
+ * before setting a pointer to it.
+ */
+ rcu_assign_pointer(elem->multi, entry->multi);
+ /*
+ * Update the function or multi probe array pointer before setting the
+ * ptype.
+ */
+ smp_wmb();
+ elem->ptype = entry->ptype;
+
+ if (elem->tp_name && (active ^ _imv_read(elem->state))) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+
+ if (active) {
+ /*
+ * try_module_get should always succeed because we hold
+ * lock_module() to get the tp_cb address.
+ */
+ ret = try_module_get(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ BUG_ON(!ret);
+ ret = tracepoint_probe_register_noupdate(
+ elem->tp_name,
+ elem->tp_cb, NULL);
+ } else {
+ ret = tracepoint_probe_unregister_noupdate(
+ elem->tp_name,
+ elem->tp_cb, NULL);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address(
+ (unsigned long)elem->tp_cb));
+ }
+ }
+ elem->state__imv = active;
+
+ return ret;
+}
+
+/*
+ * Disable a marker and its probe callback.
+ * Note: only waiting for an RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is ensured
+ * by rcu_read_lock_sched around the call site.
+ */
+static void disable_marker(struct marker *elem)
+{
+ int ret;
+
+ /* leave "call" as is. It is known statically. */
+ if (elem->tp_name && _imv_read(elem->state)) {
+ WARN_ON(!elem->tp_cb);
+ /*
+ * It is ok to directly call the probe registration because type
+ * checking has been done in the __trace_mark_tp() macro.
+ */
+ ret = tracepoint_probe_unregister_noupdate(elem->tp_name,
+ elem->tp_cb, NULL);
+ WARN_ON(ret);
+ /*
+ * tracepoint_probe_update_all() must be called
+ * before the module containing tp_cb is unloaded.
+ */
+ module_put(__module_text_address((unsigned long)elem->tp_cb));
+ }
+ elem->state__imv = 0;
+ elem->single.func = __mark_empty_function;
+ /* Update the function before setting the ptype */
+ smp_wmb();
+ elem->ptype = 0; /* single probe */
+ /*
+ * Leave the private data and channel_id/event_id there, because removal
+ * is racy and should be done only after an RCU period. These are never
+ * used until the next initialization anyway.
+ */
+}
+
+/*
+ * is_marker_present - Check if a marker is present in kernel.
+ * @channel: channel name
+ * @name: marker name
+ *
+ * We cannot take the marker lock around calls to this function because it needs
+ * to take the module mutex within the iterator. Marker mutex nests inside
+ * module mutex.
+ * Returns 1 if the marker is present, 0 if not.
+ */
+int is_marker_present(const char *channel, const char *name)
+{
+ int ret;
+ struct marker_iter iter;
+
+ ret = 0;
+
+ marker_iter_reset(&iter);
+ marker_iter_start(&iter);
+ for (; iter.marker != NULL; marker_iter_next(&iter)) {
+ if (!strcmp(iter.marker->channel, channel) &&
+ !strcmp(iter.marker->name, name)) {
+ ret = 1;
+ goto end;
+ }
+ }
+end:
+ marker_iter_stop(&iter);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(is_marker_present);
+
+/*
+ * _is_marker_enabled - Check if a marker is enabled, must be called with
+ * markers_mutex held.
+ * @channel: channel name
+ * @name: marker name
+ *
+ * Returns 1 if the marker is enabled, 0 if disabled.
+ */
+int _is_marker_enabled(const char *channel, const char *name)
+{
+ struct marker_entry *entry;
+
+ entry = get_marker(channel, name);
+
+ return entry && !!entry->refcount;
+}
+EXPORT_SYMBOL_GPL(_is_marker_enabled);
+
+/*
+ * is_marker_enabled - the wrapper of _is_marker_enabled
+ * @channel: channel name
+ * @name: marker name
+ *
+ * Returns 1 if the marker is enabled, 0 if disabled.
+ */
+int is_marker_enabled(const char *channel, const char *name)
+{
+ int ret;
+
+ lock_markers();
+ ret = _is_marker_enabled(channel, name);
+ unlock_markers();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(is_marker_enabled);
+
+/**
+ * marker_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Updates the probe callback corresponding to a range of markers.
+ */
+void marker_update_probe_range(struct marker *begin,
+ struct marker *end)
+{
+ struct marker *iter;
+ struct marker_entry *mark_entry;
+
+ mutex_lock(&markers_mutex);
+ for (iter = begin; iter < end; iter++) {
+ mark_entry = get_marker(iter->channel, iter->name);
+ if (mark_entry) {
+ set_marker(mark_entry, iter, !!mark_entry->refcount);
+ /*
+ * ignore error, continue
+ */
+ } else {
+ disable_marker(iter);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ *
+ * Internal callback only changed before the first probe is connected to it.
+ * Single probe private data can only be changed on 0 -> 1 and 2 -> 1
+ * transitions. All other transitions will leave the old private data valid.
+ * This makes the non-atomicity of the callback/private data updates valid.
+ *
+ * "special case" updates :
+ * 0 -> 1 callback
+ * 1 -> 0 callback
+ * 1 -> 2 callbacks
+ * 2 -> 1 callbacks
+ * Other updates all behave the same, just like the 2 -> 3 or 3 -> 2 updates.
+ * Side effect: marker_set_format may delete the marker entry (creating a
+ * replacement).
+ */
+void marker_update_probes(void)
+{
+ /* Core kernel markers */
+ marker_update_probe_range(__start___markers, __stop___markers);
+ /* Markers in modules. */
+ module_update_markers();
+ tracepoint_probe_update_all();
+ /* Update immediate values */
+ core_imv_update();
+ module_imv_update();
+}
+
+/**
+ * marker_probe_register - Connect a probe to a marker
+ * @channel: marker channel
+ * @name: marker name
+ * @format: format string
+ * @probe: probe handler
+ * @probe_private: probe private data
+ *
+ * private data must be a valid allocated memory address, or NULL.
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int marker_probe_register(const char *channel, const char *name,
+ const char *format, marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0, ret_err;
+ struct marker_probe_array *old;
+ int first_probe = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(channel, name);
+ if (!entry) {
+ first_probe = 1;
+ entry = add_marker(channel, name, format);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ if (ret)
+ goto end;
+ ret = ltt_channels_register(channel);
+ if (ret)
+ goto error_remove_marker;
+ ret = ltt_channels_get_index_from_name(channel);
+ if (ret < 0)
+ goto error_unregister_channel;
+ entry->channel_id = ret;
+ ret = ltt_channels_get_event_id(channel, name);
+ if (ret < 0)
+ goto error_unregister_channel;
+ entry->event_id = ret;
+ hlist_add_head(&entry->id_list, id_table + hash_32(
+ (entry->channel_id << 16) | entry->event_id,
+ MARKER_HASH_BITS));
+ ret = 0;
+ trace_mark(metadata, core_marker_id,
+ "channel %s name %s event_id %hu "
+ "int #1u%zu long #1u%zu pointer #1u%zu "
+ "size_t #1u%zu alignment #1u%u",
+ channel, name, entry->event_id,
+ sizeof(int), sizeof(long), sizeof(void *),
+ sizeof(size_t), ltt_get_alignment());
+ } else if (format) {
+ if (!entry->format)
+ ret = marker_set_format(entry, format);
+ else if (strcmp(entry->format, format))
+ ret = -EPERM;
+ if (ret)
+ goto end;
+ }
+
+ old = marker_entry_add_probe(entry, probe, probe_private);
+ if (IS_ERR(old)) {
+ ret = PTR_ERR(old);
+ if (first_probe)
+ goto error_unregister_channel;
+ else
+ goto end;
+ }
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ return ret;
+
+error_unregister_channel:
+ ret_err = ltt_channels_unregister(channel, 1);
+ WARN_ON(ret_err);
+error_remove_marker:
+ ret_err = remove_marker(channel, name, 0, 0);
+ WARN_ON(ret_err);
+end:
+ mutex_unlock(&markers_mutex);
+ marker_update_probes(); /* for compaction on error path */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_register);
+
+/**
+ * marker_probe_unregister - Disconnect a probe from a marker
+ * @channel: marker channel
+ * @name: marker name
+ * @probe: probe function pointer
+ * @probe_private: probe private data
+ *
+ * Returns 0 on success, or a negative error value if the marker is not found.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister(const char *channel, const char *name,
+ marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ struct marker_probe_array *old;
+ int ret = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(channel, name);
+ if (!entry) {
+ ret = -ENOENT;
+ goto end;
+ }
+ old = marker_entry_remove_probe(entry, probe, probe_private);
+ remove_marker(channel, name, 1, 0); /* Ignore busy error message */
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ return ret;
+
+end:
+ mutex_unlock(&markers_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister);
+
+static struct marker_entry *
+get_marker_from_private_data(marker_probe_func *probe, void *probe_private)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node;
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ if (!entry->ptype) {
+ if (entry->single.func == probe
+ && entry->single.probe_private
+ == probe_private)
+ return entry;
+ } else {
+ struct marker_probe_array *closure;
+ closure = entry->multi;
+ for (i = 0; closure->c[i].func; i++) {
+ if (closure->c[i].func == probe &&
+ closure->c[i].probe_private
+ == probe_private)
+ return entry;
+ }
+ }
+ }
+ }
+ return NULL;
+}
+
+/**
+ * marker_probe_unregister_private_data - Disconnect a probe from a marker
+ * @probe: probe function
+ * @probe_private: probe private data
+ *
+ * Unregister a probe by providing the registered private data.
+ * Only removes the first marker found in hash table.
+ * Return 0 on success or error value.
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt-disabled section
+ * has finished.
+ */
+int marker_probe_unregister_private_data(marker_probe_func *probe,
+ void *probe_private)
+{
+ struct marker_entry *entry;
+ int ret = 0;
+ struct marker_probe_array *old;
+ const char *channel = NULL, *name = NULL;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker_from_private_data(probe, probe_private);
+ if (!entry) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+ old = marker_entry_remove_probe(entry, NULL, probe_private);
+ channel = kstrdup(entry->channel, GFP_KERNEL);
+ name = kstrdup(entry->name, GFP_KERNEL);
+ remove_marker(channel, name, 1, 0); /* Ignore busy error message */
+ mutex_unlock(&markers_mutex);
+
+ marker_update_probes();
+ if (old)
+ call_rcu_sched(&old->rcu, free_old_closure);
+ goto end;
+
+unlock:
+ mutex_unlock(&markers_mutex);
+end:
+ kfree(channel);
+ kfree(name);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
+
+/**
+ * marker_get_private_data - Get a marker's probe private data
+ * @channel: marker channel
+ * @name: marker name
+ * @probe: probe to match
+ * @num: get the nth matching probe's private data
+ *
+ * Returns the nth private data pointer (starting from 0) matching, or an
+ * ERR_PTR.
+ * The private data pointer should _only_ be dereferenced if the caller is the
+ * owner of the data, or its content could vanish. This is mostly used to
+ * confirm that a caller is the owner of a registered probe.
+ */
+void *marker_get_private_data(const char *channel, const char *name,
+ marker_probe_func *probe, int num)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t channel_len = strlen(channel) + 1;
+ size_t name_len = strlen(name) + 1;
+ int i;
+ u32 hash;
+
+ hash = jhash(channel, channel_len-1, 0) ^ jhash(name, name_len-1, 0);
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(channel, e->channel) && !strcmp(name, e->name)) {
+ if (!e->ptype) {
+ if (num == 0 && e->single.func == probe)
+ return e->single.probe_private;
+ } else {
+ struct marker_probe_array *closure;
+ int match = 0;
+ closure = e->multi;
+ for (i = 0; closure->c[i].func; i++) {
+ if (closure->c[i].func != probe)
+ continue;
+ if (match++ == num)
+ return closure->c[i].probe_private;
+ }
+ }
+ break;
+ }
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(marker_get_private_data);
+
+static struct marker_entry *get_entry_from_id(u16 channel_id, u16 event_id)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e, *found = NULL;
+ u32 hash = hash_32((channel_id << 16) | event_id, MARKER_HASH_BITS);
+
+ mutex_lock(&markers_mutex);
+ head = id_table + hash;
+ hlist_for_each_entry(e, node, head, id_list) {
+ if (e->channel_id == channel_id && e->event_id == event_id) {
+ found = e;
+ break;
+ }
+ }
+ mutex_unlock(&markers_mutex);
+ return found;
+}
+
+/* must call when ids/marker_entry are kept alive */
+const char *marker_get_name_from_id(u16 channel_id, u16 event_id)
+{
+ struct marker_entry *e = get_entry_from_id(channel_id, event_id);
+ return e ? e->name : NULL;
+}
+EXPORT_SYMBOL_GPL(marker_get_name_from_id);
+
+const char *marker_get_fmt_from_id(u16 channel_id, u16 event_id)
+{
+ struct marker_entry *e = get_entry_from_id(channel_id, event_id);
+ return e ? e->format : NULL;
+}
+EXPORT_SYMBOL_GPL(marker_get_fmt_from_id);
+
+/**
+ * markers_compact_event_ids - Compact markers event IDs and reassign channels
+ *
+ * Called when no channel users are active by the channel infrastructure.
+ * Called with trace lock, lock_markers() and channel mutex held.
+ *
+ * marker_update_probes() must be executed after compaction before releasing the
+ * trace lock.
+ */
+void markers_compact_event_ids(void)
+{
+ struct marker_entry *entry;
+ unsigned int i;
+ struct hlist_head *head;
+ struct hlist_node *node, *next;
+ int ret;
+
+ _ltt_channels_reset_event_ids();
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry_safe(entry, node, next, head, hlist) {
+ if (!entry->refcount) {
+ remove_marker(entry->channel, entry->name,
+ 1, 1);
+ continue;
+ }
+ ret = ltt_channels_get_index_from_name(entry->channel);
+ WARN_ON(ret < 0);
+ entry->channel_id = ret;
+ ret = _ltt_channels_get_event_id(entry->channel,
+ entry->name);
+ WARN_ON(ret < 0);
+ entry->event_id = ret;
+ }
+ }
+
+ memset(id_table, 0, sizeof(id_table));
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ hlist_add_head(&entry->id_list, id_table + hash_32(
+ (entry->channel_id << 16)
+ | entry->event_id, MARKER_HASH_BITS));
+ }
+ }
+}
+
+#ifdef CONFIG_MODULES
+
+/**
+ * marker_get_iter_range - Get a next marker iterator given a range.
+ * @marker: current marker (in), next marker (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next marker has been found (1) or not (0).
+ * Will return the first marker in the range if the input marker is NULL.
+ */
+int marker_get_iter_range(struct marker **marker, struct marker *begin,
+ struct marker *end)
+{
+ if (!*marker && begin != end) {
+ *marker = begin;
+ return 1;
+ }
+ if (*marker >= begin && *marker < end)
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(marker_get_iter_range);
+
+static void marker_get_iter(struct marker_iter *iter)
+{
+ int found = 0;
+
+ /* Core kernel markers */
+ if (!iter->module) {
+ found = marker_get_iter_range(&iter->marker,
+ __start___markers, __stop___markers);
+ if (found)
+ goto end;
+ }
+ /* Markers in modules. */
+ found = module_get_iter_markers(iter);
+end:
+ if (!found)
+ marker_iter_reset(iter);
+}
+
+void marker_iter_start(struct marker_iter *iter)
+{
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_start);
+
+void marker_iter_next(struct marker_iter *iter)
+{
+ iter->marker++;
+ /*
+ * iter->marker may be invalid because we blindly incremented it.
+ * Make sure it is valid by marshalling on the markers, getting the
+ * markers from following modules if necessary.
+ */
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_next);
+
+void marker_iter_stop(struct marker_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(marker_iter_stop);
+
+void marker_iter_reset(struct marker_iter *iter)
+{
+ iter->module = NULL;
+ iter->marker = NULL;
+}
+EXPORT_SYMBOL_GPL(marker_iter_reset);
+
+int marker_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
+{
+ struct module *mod = data;
+
+ switch (val) {
+ case MODULE_STATE_COMING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ case MODULE_STATE_GOING:
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ break;
+ }
+ return 0;
+}
+
+struct notifier_block marker_module_nb = {
+ .notifier_call = marker_module_notify,
+ .priority = 0,
+};
+
+static int init_markers(void)
+{
+ return register_module_notifier(&marker_module_nb);
+}
+__initcall(init_markers);
+
+#endif /* CONFIG_MODULES */
+
+void ltt_dump_marker_state(struct ltt_trace *trace)
+{
+ struct marker_entry *entry;
+ struct ltt_probe_private_data call_data;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ unsigned int i;
+
+ mutex_lock(&markers_mutex);
+ call_data.trace = trace;
+ call_data.serializer = NULL;
+
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ __trace_mark(0, metadata, core_marker_id,
+ &call_data,
+ "channel %s name %s event_id %hu "
+ "int #1u%zu long #1u%zu pointer #1u%zu "
+ "size_t #1u%zu alignment #1u%u",
+ entry->channel,
+ entry->name,
+ entry->event_id,
+ sizeof(int), sizeof(long),
+ sizeof(void *), sizeof(size_t),
+ ltt_get_alignment());
+ if (entry->format)
+ __trace_mark(0, metadata,
+ core_marker_format,
+ &call_data,
+ "channel %s name %s format %s",
+ entry->channel,
+ entry->name,
+ entry->format);
+ }
+ }
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(ltt_dump_marker_state);
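As a reference point, a hedged sketch (not from this patch) of a module-side client of the marker API added above: a probe with the marker_probe_func layout used by __mark_empty_function(), connected and disconnected with marker_probe_register()/marker_probe_unregister(). The probe body, the "example"/"my_event" names, and the "value %d" format string are illustrative assumptions.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/marker.h>

/* Probe callback with the same layout as __mark_empty_function(). */
static void example_probe(const struct marker *mdata, void *probe_private,
			  void *call_private, const char *fmt, va_list *args)
{
	/* A real probe would serialize the arguments described by fmt. */
	pr_debug("marker %s.%s hit\n", mdata->channel, mdata->name);
}

static int __init example_init(void)
{
	/* Connect to the "example" channel / "my_event" marker. */
	return marker_probe_register("example", "my_event", "value %d",
				     example_probe, NULL);
}

static void __exit example_exit(void)
{
	marker_probe_unregister("example", "my_event", example_probe, NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");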
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94b..2767c8eaf12 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
+#include <trace/kernel.h>
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -99,7 +100,9 @@
* 1) List of modules (also safely readable with preempt_disable),
* 2) module_use links,
* 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete uses stop_machine/add uses RCU list operations).
+ * Sorted by ascending list node address.
+ */
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
@@ -120,6 +123,9 @@ static BLOCKING_NOTIFIER_HEAD(module_notify_list);
* Protected by module_mutex. */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
+DEFINE_TRACE(kernel_module_load);
+DEFINE_TRACE(kernel_module_free);
+
int register_module_notifier(struct notifier_block * nb)
{
return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -1675,6 +1681,7 @@ static inline void unset_section_ro_nx(struct module *mod, void *module_region)
/* Free a module, remove from lists, etc. */
static void free_module(struct module *mod)
{
+ trace_kernel_module_free(mod);
trace_module_free(mod);
/* Delete from various lists */
@@ -2272,6 +2279,12 @@ static int copy_and_check(struct load_info *info,
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return -ENOMEM;
+ /*
+ * Make sure the module text or data access never generates any page
+ * fault.
+ */
+ vmalloc_sync_all();
+
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
@@ -2459,6 +2472,10 @@ static void find_module_sections(struct module *mod, struct load_info *info)
sizeof(*mod->ctors), &mod->num_ctors);
#endif
+#ifdef CONFIG_MARKERS
+ mod->markers = section_objs(info, "__markers",
+ sizeof(*mod->markers), &mod->num_markers);
+#endif
#ifdef CONFIG_TRACEPOINTS
mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs",
sizeof(*mod->tracepoints_ptrs),
@@ -2717,7 +2734,7 @@ static struct module *load_module(void __user *umod,
const char __user *uargs)
{
struct load_info info = { NULL, };
- struct module *mod;
+ struct module *mod, *iter;
long err;
DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2799,7 +2816,23 @@ static struct module *load_module(void __user *umod,
goto ddebug;
module_bug_finalize(info.hdr, info.sechdrs, mod);
+ /*
+ * We sort the modules by struct module pointer address to permit
+ * correct iteration over modules by, at least, kallsyms during
+ * preemptible operations such as read(). Sorting by struct module
+ * pointer address is equivalent to sorting by list node address.
+ */
+ list_for_each_entry_reverse(iter, &modules, list) {
+ BUG_ON(iter == mod); /* Should never be in the list twice */
+ if (iter < mod) {
+ /* We belong to the location right after iter. */
+ list_add_rcu(&mod->list, &iter->list);
+ goto module_added;
+ }
+ }
+ /* We should be added at the head of the list */
list_add_rcu(&mod->list, &modules);
+module_added:
mutex_unlock(&module_mutex);
/* Module is ready to execute: parsing args may do that. */
@@ -2817,6 +2850,7 @@ static struct module *load_module(void __user *umod,
free_copy(&info);
/* Done! */
+ trace_kernel_module_load(mod);
trace_module_load(mod);
return mod;
@@ -3196,12 +3230,12 @@ static char *module_flags(struct module *mod, char *buf)
static void *m_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&module_mutex);
- return seq_list_start(&modules, *pos);
+ return seq_sorted_list_start(&modules, pos);
}
static void *m_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &modules, pos);
+ return seq_sorted_list_next(p, &modules, pos);
}
static void m_stop(struct seq_file *m, void *p)
@@ -3266,6 +3300,27 @@ static int __init proc_modules_init(void)
module_init(proc_modules_init);
#endif
+void list_modules(void *call_data)
+{
+ /* Enumerate loaded modules */
+ struct list_head *i;
+ struct module *mod;
+ unsigned long refcount = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each(i, &modules) {
+ mod = list_entry(i, struct module, list);
+#ifdef CONFIG_MODULE_UNLOAD
+ refcount = module_refcount(mod);
+#endif
+ __trace_mark(0, module_state, list_module, call_data,
+ "name %s state %d refcount %lu",
+ mod->name, mod->state, refcount);
+ }
+ mutex_unlock(&module_mutex);
+}
+EXPORT_SYMBOL_GPL(list_modules);
+
/* Given an address, look for it in the module exception tables. */
const struct exception_table_entry *search_module_extables(unsigned long addr)
{
@@ -3393,12 +3448,59 @@ void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
+ struct marker *marker,
struct tracepoint * const *tp)
{
}
EXPORT_SYMBOL(module_layout);
#endif
+#ifdef CONFIG_MARKERS
+void module_update_markers(void)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!(mod->taints & TAINT_FORCED_MODULE))
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers);
+ mutex_unlock(&module_mutex);
+}
+
+/*
+ * Returns 0 if the current marker is not found.
+ * Returns 1 if the current marker is found.
+ */
+int module_get_iter_markers(struct marker_iter *iter)
+{
+ struct module *iter_mod;
+ int found = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(iter_mod, &modules, list) {
+ if (!(iter_mod->taints & TAINT_FORCED_MODULE)) {
+ /*
+ * Sorted module list
+ */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->marker = NULL;
+ found = marker_get_iter_range(&iter->marker,
+ iter_mod->markers,
+ iter_mod->markers + iter_mod->num_markers);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&module_mutex);
+ return found;
+}
+#endif
+
#ifdef CONFIG_TRACEPOINTS
void module_update_tracepoints(void)
{
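
The load_module() hunk above keeps the global module list sorted by ascending struct module address so that preemptible readers (the sorted /proc/modules seq_file walk, the marker iterator) can resume a walk from a remembered pointer. A small sanity-check sketch of that invariant (illustrative, not part of this patch; the pointer comparison mirrors the iter < mod test used above):

#include <linux/module.h>
#include <linux/list.h>

/* Must be called with module_mutex held. */
static bool module_list_is_sorted(struct list_head *head)
{
	struct module *mod, *prev = NULL;

	list_for_each_entry(mod, head, list) {
		if (prev && !(prev < mod))
			return false;	/* out of order: invariant broken */
		prev = mod;
	}
	return true;
}
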
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2488ba7eb56..e8481427153 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -5,6 +5,7 @@
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#include <linux/idle.h>
/*
* Notifier list for kernel code which wants to be called
@@ -148,7 +149,7 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh,
spin_lock_irqsave(&nh->lock, flags);
ret = notifier_chain_unregister(&nh->head, n);
spin_unlock_irqrestore(&nh->lock, flags);
- synchronize_rcu();
+ synchronize_sched();
return ret;
}
EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
@@ -178,9 +179,9 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
{
int ret;
- rcu_read_lock();
+ rcu_read_lock_sched_notrace();
ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
- rcu_read_unlock();
+ rcu_read_unlock_sched_notrace();
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
@@ -584,3 +585,27 @@ int unregister_die_notifier(struct notifier_block *nb)
return atomic_notifier_chain_unregister(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(unregister_die_notifier);
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+/*
+ * Trace the last event before calling the notifiers. Notifiers flush data
+ * from their buffers before the CPU goes idle.
+ */
+int notrace notify_idle(enum idle_val val)
+{
+ return atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(notify_idle);
+
+void register_idle_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(register_idle_notifier);
+
+void unregister_idle_notifier(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(unregister_idle_notifier);
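
A hypothetical consumer of the idle notifier chain added above, e.g. a tracer flushing per-CPU buffers before the CPU idles. The enum idle_val values come from the <linux/idle.h> header this series adds and are not shown here, so the sketch only demonstrates the registration pattern:

#include <linux/notifier.h>
#include <linux/idle.h>

static int tracer_idle_notify(struct notifier_block *nb,
			      unsigned long val, void *unused)
{
	/* flush per-CPU trace buffers here; keep it short and notrace-safe */
	return NOTIFY_OK;
}

static struct notifier_block tracer_idle_nb = {
	.notifier_call = tracer_idle_notify,
};

static void tracer_idle_setup(void)
{
	register_idle_notifier(&tracer_idle_nb);
}
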
diff --git a/kernel/panic.c b/kernel/panic.c
index e510d28b3e6..79708502b00 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,9 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/dmi.h>
+#include <trace/kernel.h>
+
+DEFINE_TRACE(kernel_panic);
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -67,6 +70,10 @@ NORET_TYPE void panic(const char * fmt, ...)
long i, i_next = 0;
int state = 0;
+ va_start(args, fmt);
+ trace_kernel_panic(fmt, args);
+ va_end(args);
+
/*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 656222fcf76..ad02feadb6b 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4567,7 +4567,7 @@ static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
- return 0;
+ return 1;
if (regs) {
if (event->attr.exclude_user && user_mode(regs))
@@ -4923,6 +4923,8 @@ static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
/*
* All tracepoints are from kernel-space.
*/
@@ -6113,17 +6115,20 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- struct perf_event *parent_event;
+ if (child_event->parent) {
+ raw_spin_lock_irq(&child_ctx->lock);
+ perf_group_detach(child_event);
+ raw_spin_unlock_irq(&child_ctx->lock);
+ }
perf_event_remove_from_context(child_event);
- parent_event = child_event->parent;
/*
- * It can happen that parent exits first, and has events
+ * It can happen that the parent exits first, and has events
* that are still around due to the child reference. These
- * events need to be zapped - but otherwise linger.
+ * events need to be zapped.
*/
- if (parent_event) {
+ if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
}
diff --git a/kernel/printk.c b/kernel/printk.c
index f629e802fdc..92d1e3b6d37 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -40,6 +40,7 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/rculist.h>
+#include <trace/kernel.h>
#include <asm/uaccess.h>
@@ -71,6 +72,7 @@ int console_printk[4] = {
MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */
DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
};
+EXPORT_SYMBOL_GPL(console_printk);
/*
* Low level drivers may need that to know if they can schedule in
@@ -140,6 +142,9 @@ EXPORT_SYMBOL(console_set_on_cmdline);
/* Flag: console code may call schedule() */
static int console_may_schedule;
+DEFINE_TRACE(kernel_printk);
+DEFINE_TRACE(kernel_vprintk);
+
#ifdef CONFIG_PRINTK
static char __log_buf[__LOG_BUF_LEN];
@@ -701,6 +706,7 @@ asmlinkage int printk(const char *fmt, ...)
}
#endif
va_start(args, fmt);
+ trace_kernel_printk(_RET_IP_);
r = vprintk(fmt, args);
va_end(args);
@@ -827,6 +833,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
#ifdef CONFIG_DEBUG_LL
printascii(printk_buf);
#endif
+ trace_kernel_vprintk(_RET_IP_, printk_buf, printed_len);
p = printk_buf;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8..a86e46b6bc1 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -47,6 +47,7 @@
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
+#include <trace/rcu.h>
#include "rcutree.h"
@@ -145,6 +146,10 @@ int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
module_param(rcu_cpu_stall_suppress, int, 0644);
#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+DEFINE_TRACE(rcu_tree_call_rcu);
+DEFINE_TRACE(rcu_tree_call_rcu_bh);
+DEFINE_TRACE(rcu_tree_callback);
+
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -1143,6 +1148,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
+ trace_rcu_tree_callback(list);
list->func(list);
list = next;
if (++count >= rdp->blimit)
@@ -1488,6 +1494,7 @@ EXPORT_SYMBOL_GPL(call_rcu_sched);
*/
void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
+ trace_rcu_tree_call_rcu_bh(head, _RET_IP_);
__call_rcu(head, func, &rcu_bh_state);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
diff --git a/kernel/sched.c b/kernel/sched.c
index b294a1882ff..e50b1bf0374 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5573,7 +5573,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
@@ -9344,3 +9344,57 @@ struct cgroup_subsys cpuacct_subsys = {
};
#endif /* CONFIG_CGROUP_CPUACCT */
+static DEFINE_MUTEX(kernel_trace_mutex);
+static int kernel_trace_refcount;
+
+/**
+ * clear_kernel_trace_flag_all_tasks - clears all TIF_KERNEL_TRACE thread flags.
+ *
+ * This function iterates over all threads in the system to clear their
+ * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the
+ * tasklist_lock held in copy_process() makes sure that once we finish clearing
+ * the thread flags, all threads have their flags cleared.
+ */
+void clear_kernel_trace_flag_all_tasks(void)
+{
+ struct task_struct *p;
+ struct task_struct *t;
+
+ mutex_lock(&kernel_trace_mutex);
+ if (--kernel_trace_refcount)
+ goto end;
+ read_lock(&tasklist_lock);
+ do_each_thread(p, t) {
+ clear_tsk_thread_flag(t, TIF_KERNEL_TRACE);
+ } while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
+end:
+ mutex_unlock(&kernel_trace_mutex);
+}
+EXPORT_SYMBOL_GPL(clear_kernel_trace_flag_all_tasks);
+
+/**
+ * set_kernel_trace_flag_all_tasks - sets all TIF_KERNEL_TRACE thread flags.
+ *
+ * This function iterates over all threads in the system to set their
+ * TIF_KERNEL_TRACE flag. Setting the TIF_KERNEL_TRACE flag with the
+ * tasklist_lock held in copy_process() makes sure that once we finish setting
+ * the thread flags, all threads have their flags set.
+ */
+void set_kernel_trace_flag_all_tasks(void)
+{
+ struct task_struct *p;
+ struct task_struct *t;
+
+ mutex_lock(&kernel_trace_mutex);
+ if (kernel_trace_refcount++)
+ goto end;
+ read_lock(&tasklist_lock);
+ do_each_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_KERNEL_TRACE);
+ } while_each_thread(p, t);
+ read_unlock(&tasklist_lock);
+end:
+ mutex_unlock(&kernel_trace_mutex);
+}
+EXPORT_SYMBOL_GPL(set_kernel_trace_flag_all_tasks);
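
The two refcounted helpers above are meant to be paired by a tracer around its sessions: the first user sets TIF_KERNEL_TRACE on every thread, the last user clears it. A usage sketch (function names below are illustrative, not part of this patch; assumes the declarations exported above):

static void trace_sessions_enable(void)
{
	set_kernel_trace_flag_all_tasks();	/* first caller flags all threads */
	/* ... arm probes / start writers ... */
}

static void trace_sessions_disable(void)
{
	/* ... disarm probes / stop writers ... */
	clear_kernel_trace_flag_all_tasks();	/* last caller clears the flags */
}
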
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdc..31751868de8 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2421,9 +2421,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return -EFAULT;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info.si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info.si_code != SI_QUEUE) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info.si_code < 0);
return -EPERM;
+ }
info.si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2441,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info->si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info->si_code != SI_QUEUE) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
+ }
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f085..954548906af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
{
struct call_function_data *data;
unsigned long flags;
- int cpu, next_cpu, this_cpu = smp_processor_id();
+ int refs, cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress && !early_boot_irqs_disabled);
- /* So, what's a CPU they want? Ignoring this one. */
+ /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
+
+ /* This BUG_ON verifies our reuse assertions and can be removed */
BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
+ /*
+ * The global call function queue list add and delete are protected
+ * by a lock, but the list is traversed without any lock, relying
+ * on the rcu list add and delete to allow safe concurrent traversal.
+ * We reuse the call function data without waiting for any grace
+ * period after some other cpu removes it from the global queue.
+ * This means a cpu might find our data block as it is being
+ * filled out.
+ *
+ * We hold off the interrupt handler on the other cpu by
+ * ordering our writes to the cpu mask vs our setting of the
+ * refs counter. We assert only the cpu owning the data block
+ * will set a bit in cpumask, and each bit will only be cleared
+ * by the subject cpu. Each cpu must first find its bit is
+ * set and then check that refs is set indicating the element is
+ * ready to be processed, otherwise it must skip the entry.
+ *
+ * On the previous iteration refs was set to 0 by another cpu.
+ * To avoid the use of transitivity, set the counter to 0 here
+ * so the wmb will pair with the rmb in the interrupt handler.
+ */
+ atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
+
data->csd.func = func;
data->csd.info = info;
- cpumask_and(data->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, data->cpumask);
- /*
- * To ensure the interrupt handler gets an complete view
- * we order the cpumask and refs writes and order the read
- * of them in the interrupt handler. In addition we may
- * only clear our own cpu bit from the mask.
- */
+ /* Ensure 0 refs is visible before mask. Also orders func and info */
smp_wmb();
- atomic_set(&data->refs, cpumask_weight(data->cpumask));
+ /* We rely on the "and" being processed before the store */
+ cpumask_and(data->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, data->cpumask);
+ refs = cpumask_weight(data->cpumask);
+
+ /* Some callers race with other cpus changing the passed mask */
+ if (unlikely(!refs)) {
+ csd_unlock(&data->csd);
+ return;
+ }
raw_spin_lock_irqsave(&call_function.lock, flags);
/*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
+ /*
+ * We rely on the wmb() in list_add_rcu to complete our writes
+ * to the cpumask before this write to refs, which indicates
+ * data is on the list and is ready to be processed.
+ */
+ atomic_set(&data->refs, refs);
raw_spin_unlock_irqrestore(&call_function.lock, flags);
/*
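
The ordering contract spelled out in the comments above, shown in isolation (illustrative only; struct call_function_data and its refs/cpumask fields are local to kernel/smp.c): the receiving CPU must test its cpumask bit, order that read against the refs read, and skip entries whose refs are still zero.

/* Sketch of the reader-side check that pairs with the sender's writes above. */
static bool csd_entry_ready(struct call_function_data *data, int cpu)
{
	if (!cpumask_test_cpu(cpu, data->cpumask))
		return false;		/* not addressed to this CPU (yet) */

	/* Pairs with the wmb in list_add_rcu(), done before refs is set. */
	smp_rmb();

	return atomic_read(&data->refs) != 0;	/* 0 means still being filled */
}
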
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec38..a25bf611d13 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -23,7 +23,10 @@
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
+#include <linux/marker.h>
+#include <linux/kallsyms.h>
#include <linux/tick.h>
+#include <trace/irq.h>
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
@@ -54,6 +57,20 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
+void ltt_dump_softirq_vec(void *call_data)
+{
+ int i;
+ char namebuf[KSYM_NAME_LEN];
+
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ sprint_symbol(namebuf, (unsigned long)softirq_vec[i].action);
+ __trace_mark(0, softirq_state, softirq_vec, call_data,
+ "id %d address %p symbol %s",
+ i, softirq_vec[i].action, namebuf);
+ }
+}
+EXPORT_SYMBOL_GPL(ltt_dump_softirq_vec);
+
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
@@ -61,6 +78,11 @@ char *softirq_to_name[NR_SOFTIRQS] = {
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
+DEFINE_TRACE(irq_tasklet_high_entry);
+DEFINE_TRACE(irq_tasklet_high_exit);
+DEFINE_TRACE(irq_tasklet_low_entry);
+DEFINE_TRACE(irq_tasklet_low_exit);
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@@ -341,6 +363,7 @@ void irq_exit(void)
*/
inline void raise_softirq_irqoff(unsigned int nr)
{
+ trace_softirq_raise(nr);
__raise_softirq_irqoff(nr);
/*
@@ -440,7 +463,9 @@ static void tasklet_action(struct softirq_action *a)
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
+ trace_irq_tasklet_low_entry(t);
t->func(t->data);
+ trace_irq_tasklet_low_exit(t);
tasklet_unlock(t);
continue;
}
@@ -475,7 +500,9 @@ static void tasklet_hi_action(struct softirq_action *a)
if (!atomic_read(&t->count)) {
if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state))
BUG();
+ trace_irq_tasklet_high_entry(t);
t->func(t->data);
+ trace_irq_tasklet_high_exit(t);
tasklet_unlock(t);
continue;
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a26c37df1b1..4bc1435706b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -170,6 +170,11 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -714,7 +719,7 @@ static struct ctl_table kern_table[] = {
.data = &kptr_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dmesg_restrict,
.extra1 = &zero,
.extra2 = &two,
},
@@ -2405,6 +2410,17 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
struct do_proc_dointvec_minmax_conv_param {
int *min;
int *max;
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06..dbaa0648631 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o
obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
+obj-$(CONFIG_HAVE_UNSYNCHRONIZED_TSC) += tsc-sync.o
diff --git a/kernel/time/tsc-sync.c b/kernel/time/tsc-sync.c
new file mode 100644
index 00000000000..2ac1544ee22
--- /dev/null
+++ b/kernel/time/tsc-sync.c
@@ -0,0 +1,313 @@
+/*
+ * kernel/time/tsc-sync.c
+ *
+ * Test TSC synchronization
+ *
+ * Marks the TSC as unstable _and_ keeps a simple "_tsc_is_sync" variable, which
+ * is fast to read when a simple test must determine which clock source to use
+ * for kernel tracing.
+ *
+ * - CPU init :
+ *
+ * We check whether all boot CPUs have their TSC's synchronized,
+ * print a warning if not and turn off the TSC clock-source.
+ *
+ * Only two CPUs may participate - they can enter in any order.
+ * (The serial nature of the boot logic and the CPU hotplug lock
+ * protect against more than 2 CPUs entering this code.)
+ *
+ * - When CPUs are up :
+ *
+ * TSC synchronicity of all CPUs can be checked later at run-time by calling
+ * test_tsc_synchronization().
+ *
+ * Copyright 2007, 2008
+ * Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ */
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+#include <linux/trace-clock.h>
+#include <linux/cpu.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+
+#define MAX_CYCLES_DELTA 3000ULL
+
+/*
+ * Number of loops, to filter out disturbances from MCEs, NMIs and SMIs.
+ */
+#define NR_LOOPS 200
+
+static DEFINE_MUTEX(tscsync_mutex);
+
+struct sync_data {
+ int nr_waits;
+ int wait_sync;
+ cycles_t tsc_count;
+} ____cacheline_aligned;
+
+/* 0 is master, 1 is slave */
+static struct sync_data sync_data[2] = {
+ [0 ... 1] = {
+ .nr_waits = 3 * NR_LOOPS + 1,
+ .wait_sync = 3 * NR_LOOPS + 1,
+ },
+};
+
+int _tsc_is_sync = 1;
+EXPORT_SYMBOL(_tsc_is_sync);
+
+static int force_tsc_sync;
+static cycles_t slave_offset;
+static int slave_offset_ready; /* for 32-bits architectures */
+
+static int __init force_tsc_sync_setup(char *str)
+{
+ force_tsc_sync = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+__setup("force_tsc_sync=", force_tsc_sync_setup);
+
+/*
+ * Marked noinline to make sure it is not inlined/unrolled.
+ * Wait until the value is reached.
+ */
+static noinline void tsc_barrier(long this_cpu)
+{
+ sync_core();
+ sync_data[this_cpu].wait_sync--;
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1 - this_cpu].wait_sync >=
+ sync_data[this_cpu].nr_waits))
+ barrier(); /*
+ * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ sync_data[this_cpu].nr_waits--;
+ get_cycles_barrier();
+ sync_data[this_cpu].tsc_count = get_cycles();
+ get_cycles_barrier();
+}
+
+/*
+ * Worker thread called on each CPU.
+ * First wait with interrupts enabled, then wait with interrupt disabled,
+ * for precision. We are already bound to one CPU.
+ * this_cpu 0 : master
+ * this_cpu 1 : slave
+ */
+static void test_sync(void *arg)
+{
+ long this_cpu = (long)arg;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ /* Make sure the instructions are in I-CACHE */
+ tsc_barrier(this_cpu);
+ tsc_barrier(this_cpu);
+ sync_data[this_cpu].wait_sync--;
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1 - this_cpu].wait_sync >=
+ sync_data[this_cpu].nr_waits))
+ barrier(); /*
+ * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ sync_data[this_cpu].nr_waits--;
+ /*
+ * Here, only the master will wait for the slave to reach this barrier.
+ * This makes sure that the master, which holds the mutex and will reset
+ * the barriers, waits for the slave to stop using the barrier values
+ * before it continues. This is only done at the complete end of all the
+ * loops. This is why there is a + 1 in the original wait_sync value.
+ */
+ if (sync_data[this_cpu].nr_waits == 1)
+ sync_data[this_cpu].wait_sync--;
+ local_irq_restore(flags);
+}
+
+/*
+ * Each CPU (master and target) must decrement the wait_sync value twice (once
+ * for priming the cache), and once more after get_cycles(). After all the
+ * loops, one last synchronization is required to make sure the master waits
+ * for the slave before resetting the barriers.
+ */
+static void reset_barriers(void)
+{
+ int i;
+
+ /*
+ * Wait until the slave is done so that we don't overwrite its
+ * wait_sync value prematurely.
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+ while (unlikely(sync_data[1].wait_sync >= sync_data[0].nr_waits))
+ barrier(); /*
+ * barrier() is used because it is faster and
+ * more predictable than cpu_idle().
+ */
+ smp_mb(); /* order master/slave sync_data read/write */
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(sync_data[i].wait_sync != 0);
+ WARN_ON(sync_data[i].nr_waits != 1);
+ sync_data[i].wait_sync = 3 * NR_LOOPS + 1;
+ sync_data[i].nr_waits = 3 * NR_LOOPS + 1;
+ }
+}
+
+/*
+ * Run the loops (making sure no unexpected event changes the timing) and keep
+ * the best one. The result of each loop is the highest TSC delta between the
+ * master CPU and the slaves. CPU hotplug is held off while this code runs, so
+ * we stay concurrency-safe wrt the hotplug path, which also uses it. Test TSC
+ * synchronization even if we already "know" CPUs were not synchronized. This
+ * can be used as a test to check if, for some reason, the CPUs eventually got
+ * in sync after a CPU has been unplugged. This code is kept separate from the
+ * CPU hotplug code because the slave CPU executes in an IPI, which we want to
+ * keep as short as possible (this is happening while the system is running).
+ * Therefore, we do not send a single IPI for all the test loops, but rather
+ * send one IPI per loop.
+ */
+int test_tsc_synchronization(void)
+{
+ long cpu, master;
+ cycles_t max_diff = 0, diff, best_loop, worse_loop = 0;
+ int i;
+
+ mutex_lock(&tscsync_mutex);
+ get_online_cpus();
+
+ printk(KERN_INFO
+ "checking TSC synchronization across all online CPUs:");
+
+ preempt_disable();
+ master = smp_processor_id();
+ for_each_online_cpu(cpu) {
+ if (master == cpu)
+ continue;
+ best_loop = (cycles_t)ULLONG_MAX;
+ for (i = 0; i < NR_LOOPS; i++) {
+ smp_call_function_single(cpu, test_sync,
+ (void *)1UL, 0);
+ test_sync((void *)0UL);
+ diff = abs(sync_data[1].tsc_count
+ - sync_data[0].tsc_count);
+ best_loop = min(best_loop, diff);
+ worse_loop = max(worse_loop, diff);
+ }
+ reset_barriers();
+ max_diff = max(best_loop, max_diff);
+ }
+ preempt_enable();
+ if (max_diff >= MAX_CYCLES_DELTA) {
+ printk(KERN_WARNING
+ "Measured %llu cycles TSC offset between CPUs,"
+ " turning off TSC clock.\n", (u64)max_diff);
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ _tsc_is_sync = 0;
+ } else {
+ printk(" passed.\n");
+ }
+ put_online_cpus();
+ mutex_unlock(&tscsync_mutex);
+ return max_diff < MAX_CYCLES_DELTA;
+}
+EXPORT_SYMBOL_GPL(test_tsc_synchronization);
+
+/*
+ * Test synchronicity of a single core when it is hotplugged.
+ * Source CPU calls into this - waits for the freshly booted target CPU to
+ * arrive and then start the measurement:
+ */
+void __cpuinit check_tsc_sync_source(int cpu)
+{
+ cycles_t diff, abs_diff,
+ best_loop = (cycles_t)ULLONG_MAX, worse_loop = 0;
+ int i;
+
+ /*
+ * If we already know that the TSC is not synchronized, there is no need to check:
+ */
+ if (!force_tsc_sync && unsynchronized_tsc()) {
+ /*
+ * Make sure we set _tsc_is_sync to 0 if the TSC is found to be
+ * unsynchronized for causes other than non-synchronized TSCs
+ * across CPUs.
+ */
+ _tsc_is_sync = 0;
+ set_trace_clock_is_sync(0);
+ return;
+ }
+
+ printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
+ smp_processor_id(), cpu);
+
+ for (i = 0; i < NR_LOOPS; i++) {
+ test_sync((void *)0UL);
+ diff = sync_data[1].tsc_count - sync_data[0].tsc_count;
+ abs_diff = abs(diff);
+ best_loop = min(best_loop, abs_diff);
+ worse_loop = max(worse_loop, abs_diff);
+ if (force_tsc_sync && best_loop == abs_diff)
+ slave_offset = diff;
+ }
+ reset_barriers();
+
+ if (!force_tsc_sync && best_loop >= MAX_CYCLES_DELTA) {
+ printk(" failed.\n");
+ printk(KERN_WARNING
+ "Measured %llu cycles TSC offset between CPUs,"
+ " turning off TSC clock.\n", (u64)best_loop);
+ mark_tsc_unstable("check_tsc_sync_source failed");
+ _tsc_is_sync = 0;
+ set_trace_clock_is_sync(0);
+ } else {
+ printk(" %s.\n", !force_tsc_sync ? "passed" : "forced");
+ }
+ if (force_tsc_sync) {
+ /* order slave_offset and slave_offset_ready writes */
+ smp_wmb();
+ slave_offset_ready = 1;
+ }
+}
+
+/*
+ * Freshly booted CPUs call into this:
+ */
+void __cpuinit check_tsc_sync_target(void)
+{
+ int i;
+
+ if (!force_tsc_sync && unsynchronized_tsc())
+ return;
+
+ for (i = 0; i < NR_LOOPS; i++)
+ test_sync((void *)1UL);
+
+ /*
+ * Force slave synchronization if requested.
+ */
+ if (force_tsc_sync) {
+ unsigned long flags;
+ cycles_t new_tsc;
+
+ while (!slave_offset_ready)
+ cpu_relax();
+ /* order slave_offset and slave_offset_ready reads */
+ smp_rmb();
+ local_irq_save(flags);
+ /*
+ * slave_offset is read when master has finished writing to it,
+ * and is protected by cpu hotplug serialization.
+ */
+ new_tsc = get_cycles() - slave_offset;
+ write_tsc((u32)new_tsc, (u32)((u64)new_tsc >> 32));
+ local_irq_restore(flags);
+ }
+}
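
How a tracer is expected to use this file (a sketch, assuming the exports above and a synthetic-TSC fallback such as trace_clock_read_synthetic_tsc() when CONFIG_HAVE_TRACE_CLOCK_32_TO_64 is set): re-test synchronicity when a session starts, then use the cheap _tsc_is_sync flag on the hot read path.

#include <linux/kernel.h>
#include <linux/timex.h>
#include <linux/trace-clock.h>

/* Session start: re-check whether the online CPUs' TSCs still agree. */
static void trace_session_check_clock(void)
{
	if (!test_tsc_synchronization())
		pr_info("TSCs drifted apart, falling back to synthetic clock\n");
}

/* Hot path: pick the clock source with a single flag test. */
static inline u64 trace_read_timestamp(void)
{
	if (likely(_tsc_is_sync))
		return get_cycles();			/* raw, synchronized TSC */
	return trace_clock_read_synthetic_tsc();	/* assumed fallback */
}
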
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d24..65cc58ce148 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -40,12 +40,14 @@
#include <linux/irq_work.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <trace/timer.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/div64.h>
#include <asm/timex.h>
#include <asm/io.h>
+#include <asm/irq_regs.h>
#define CREATE_TRACE_POINTS
#include <trace/events/timer.h>
@@ -54,6 +56,10 @@ u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
+DEFINE_TRACE(timer_set);
+DEFINE_TRACE(timer_update_time);
+DEFINE_TRACE(timer_timeout);
+
/*
* per-CPU timer vector definitions:
*/
@@ -366,6 +372,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
vec = base->tv5.vec + i;
}
+ trace_timer_set(timer);
/*
* Timers are FIFO:
*/
@@ -1303,8 +1310,13 @@ void run_local_timers(void)
void do_timer(unsigned long ticks)
{
+ struct timespec curtime, wtom;
+
jiffies_64 += ticks;
update_wall_time();
+ curtime = __current_kernel_time();
+ wtom = __get_wall_to_monotonic();
+ trace_timer_update_time(&curtime, &wtom);
calc_global_load(ticks);
}
@@ -1387,7 +1399,9 @@ SYSCALL_DEFINE0(getegid)
static void process_timeout(unsigned long __data)
{
- wake_up_process((struct task_struct *)__data);
+ struct task_struct *task = (struct task_struct *)__data;
+ trace_timer_timeout(task);
+ wake_up_process(task);
}
/**
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..614d9153a24 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -56,5 +56,7 @@ obj-$(CONFIG_TRACEPOINTS) += power-traces.o
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_HAVE_TRACE_CLOCK_32_TO_64) += trace-clock-32-to-64.o
+obj-$(CONFIG_HAVE_TRACE_CLOCK_GENERIC) += trace-clock.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae8388..888b611897d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3328,7 +3328,7 @@ static int start_graph_tracing(void)
/* The cpu_boot init_task->ret_stack will never be freed */
for_each_online_cpu(cpu) {
if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_task(idle_task(cpu));
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
}
do {
@@ -3418,6 +3418,49 @@ void unregister_ftrace_graph(void)
mutex_unlock(&ftrace_lock);
}
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visible before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+ t->curr_ret_stack = -1;
+ /*
+ * The idle task has no parent, it either has its own
+ * stack or no stack at all.
+ */
+ if (t->ret_stack)
+ WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = per_cpu(idle_ret_stack, cpu);
+ if (!ret_stack) {
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ per_cpu(idle_ret_stack, cpu) = ret_stack;
+ }
+ graph_init_task(t, ret_stack);
+ }
+}
+
/* Allocate a return stack for newly created task */
void ftrace_graph_init_task(struct task_struct *t)
{
@@ -3433,12 +3476,7 @@ void ftrace_graph_init_task(struct task_struct *t)
GFP_KERNEL);
if (!ret_stack)
return;
- atomic_set(&t->tracing_graph_pause, 0);
- atomic_set(&t->trace_overrun, 0);
- t->ftrace_timestamp = 0;
- /* make curr_ret_stack visable before we add the ret_stack */
- smp_wmb();
- t->ret_stack = ret_stack;
+ graph_init_task(t, ret_stack);
}
}
diff --git a/kernel/trace/trace-clock-32-to-64.c b/kernel/trace/trace-clock-32-to-64.c
new file mode 100644
index 00000000000..c036f5c5586
--- /dev/null
+++ b/kernel/trace/trace-clock-32-to-64.c
@@ -0,0 +1,296 @@
+/*
+ * kernel/trace/trace-clock-32-to-64.c
+ *
+ * (C) Copyright 2006,2007,2008 -
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Extends a 32-bit clock source to a full 64-bit count, readable atomically
+ * from any execution context.
+ *
+ * Notes:
+ * - The timer-based 32->64-bit extended trace clock cannot be used for early
+ * tracing in the boot process, as it depends on timer interrupts.
+ * - The timer is only on one CPU to support hotplug.
+ * - We have the choice between schedule_delayed_work_on and an IPI to get each
+ * CPU to write the heartbeat. IPI has been chosen because it is considered
+ * faster than passing through the timer to get the work scheduled on all the
+ * CPUs.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/trace-clock.h>
+#include <linux/smp.h>
+#include <linux/sched.h> /* needed due to include order problem on m68k */
+#include <linux/math64.h>
+
+#define HW_BITMASK ((1ULL << TC_HW_BITS) - 1)
+#define HW_LS32(hw) ((hw) & HW_BITMASK)
+#define SW_MS32(sw) ((sw) & ~HW_BITMASK)
+
+static DEFINE_SPINLOCK(synthetic_tsc_lock);
+static int synthetic_tsc_refcount; /* Number of readers */
+static int synthetic_tsc_enabled; /* synth. TSC enabled on all online CPUs */
+
+static DEFINE_PER_CPU(struct timer_list, tsc_timer);
+static unsigned int precalc_expire;
+
+struct synthetic_tsc_struct {
+ union {
+ u64 val;
+ struct {
+#ifdef __BIG_ENDIAN
+ u32 ms32;
+ u32 ls32;
+#else
+ u32 ls32;
+ u32 ms32;
+#endif
+ } sel;
+ } tsc[2];
+ unsigned int index; /* Index of the current synth. tsc. */
+};
+
+static DEFINE_PER_CPU(struct synthetic_tsc_struct, synthetic_tsc);
+
+/* Called from IPI or timer interrupt */
+static void update_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u32 tsc;
+
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ tsc = trace_clock_read32(); /* Hardware clocksource read */
+
+ if (tsc < HW_LS32(cpu_synth->tsc[cpu_synth->index].sel.ls32)) {
+ unsigned int new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ /*
+ * Overflow
+ * Non-atomic update of the non-current synthetic TSC, followed
+ * by an atomic index change. There is no write concurrency,
+ * so the index read/write does not need to be atomic.
+ */
+ cpu_synth->tsc[new_index].val =
+ (SW_MS32(cpu_synth->tsc[cpu_synth->index].val)
+ | (u64)tsc) + (1ULL << TC_HW_BITS);
+ /*
+ * Ensure the compiler does not reorder the index write. This makes
+ * sure any nested interrupt that sees the new index also sees the
+ * fully written new value.
+ */
+ barrier();
+ cpu_synth->index = new_index; /* atomic change of index */
+ } else {
+ /*
+ * No overflow: we know that the only bits that changed are
+ * contained in the low-order 32 bits (ls32), which can be written to atomically.
+ */
+ cpu_synth->tsc[cpu_synth->index].sel.ls32 =
+ SW_MS32(cpu_synth->tsc[cpu_synth->index].sel.ls32) | tsc;
+ }
+}
+
+/*
+ * Should only be called when interrupts are off. Affects only current CPU.
+ */
+void _trace_clock_write_synthetic_tsc(u64 value)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ unsigned int new_index;
+
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ new_index = 1 - cpu_synth->index; /* 0 <-> 1 */
+ cpu_synth->tsc[new_index].val = value;
+ barrier();
+ cpu_synth->index = new_index; /* atomic change of index */
+}
+
+/* Called from buffer switch : in _any_ context (even NMI) */
+u64 notrace trace_clock_read_synthetic_tsc(void)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 ret;
+ unsigned int index;
+ u32 tsc;
+
+ preempt_disable_notrace();
+ cpu_synth = &per_cpu(synthetic_tsc, smp_processor_id());
+ index = ACCESS_ONCE(cpu_synth->index); /* atomic read */
+ tsc = trace_clock_read32(); /* Hardware clocksource read */
+
+ /* Overflow detection */
+ if (unlikely(tsc < HW_LS32(cpu_synth->tsc[index].sel.ls32)))
+ ret = (SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc)
+ + (1ULL << TC_HW_BITS);
+ else
+ ret = SW_MS32(cpu_synth->tsc[index].val) | (u64)tsc;
+ preempt_enable_notrace();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_clock_read_synthetic_tsc);
+
+static void synthetic_tsc_ipi(void *info)
+{
+ update_synthetic_tsc();
+}
+
+/*
+ * tsc_timer_fct - Timer function synchronizing the synthetic TSC.
+ * @data: unused
+ *
+ * Guarantees at least one execution before the low word of the TSC wraps.
+ */
+static void tsc_timer_fct(unsigned long data)
+{
+ update_synthetic_tsc();
+
+ mod_timer_pinned(&per_cpu(tsc_timer, smp_processor_id()),
+ jiffies + precalc_expire);
+}
+
+/*
+ * precalc_stsc_interval - Precalculates the interval between the clock
+ * wraparounds.
+ */
+static int __init precalc_stsc_interval(void)
+{
+ u64 rem_freq, rem_interval;
+
+ precalc_expire =
+ __iter_div_u64_rem(HW_BITMASK, (
+ __iter_div_u64_rem(trace_clock_frequency(),
+ HZ * trace_clock_freq_scale(), &rem_freq) << 1
+ )
+ - 1
+ - (TC_EXPECTED_INTERRUPT_LATENCY * HZ / 1000), &rem_interval)
+ >> 1;
+ WARN_ON(precalc_expire == 0);
+ printk(KERN_DEBUG "Synthetic TSC timer will fire each %u jiffies.\n",
+ precalc_expire);
+ return 0;
+}
+
+static void prepare_synthetic_tsc(int cpu)
+{
+ struct synthetic_tsc_struct *cpu_synth;
+ u64 local_count;
+
+ cpu_synth = &per_cpu(synthetic_tsc, cpu);
+ local_count = trace_clock_read_synthetic_tsc();
+ cpu_synth->tsc[0].val = local_count;
+ cpu_synth->index = 0;
+ smp_wmb(); /* Writing in data of CPU about to come up */
+ init_timer_deferrable(&per_cpu(tsc_timer, cpu));
+ per_cpu(tsc_timer, cpu).function = tsc_timer_fct;
+ per_cpu(tsc_timer, cpu).expires = jiffies + precalc_expire;
+}
+
+static void enable_synthetic_tsc(int cpu)
+{
+ smp_call_function_single(cpu, synthetic_tsc_ipi, NULL, 1);
+ add_timer_on(&per_cpu(tsc_timer, cpu), cpu);
+}
+
+static void disable_synthetic_tsc(int cpu)
+{
+ del_timer_sync(&per_cpu(tsc_timer, cpu));
+}
+
+/*
+ * hotcpu_callback - CPU hotplug callback
+ * @nb: notifier block
+ * @action: hotplug action to take
+ * @hcpu: CPU number
+ *
+ * Sets the new CPU's current synthetic TSC to the same value as the
+ * currently running CPU.
+ *
+ * Returns the success/failure of the operation. (NOTIFY_OK, NOTIFY_BAD)
+ */
+static int __cpuinit hotcpu_callback(struct notifier_block *nb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int hotcpu = (unsigned long)hcpu;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ prepare_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ enable_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount)
+ disable_synthetic_tsc(hotcpu);
+ spin_unlock(&synthetic_tsc_lock);
+ break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ }
+ return NOTIFY_OK;
+}
+
+void get_synthetic_tsc(void)
+{
+ int cpu;
+
+ spin_lock(&synthetic_tsc_lock);
+ if (synthetic_tsc_refcount++)
+ goto end;
+
+ synthetic_tsc_enabled = 1;
+ for_each_online_cpu(cpu) {
+ prepare_synthetic_tsc(cpu);
+ enable_synthetic_tsc(cpu);
+ }
+end:
+ spin_unlock(&synthetic_tsc_lock);
+}
+EXPORT_SYMBOL_GPL(get_synthetic_tsc);
+
+void put_synthetic_tsc(void)
+{
+ int cpu;
+
+ spin_lock(&synthetic_tsc_lock);
+ WARN_ON(synthetic_tsc_refcount <= 0);
+ if (synthetic_tsc_refcount != 1 || !synthetic_tsc_enabled)
+ goto end;
+
+ for_each_online_cpu(cpu)
+ disable_synthetic_tsc(cpu);
+ synthetic_tsc_enabled = 0;
+end:
+ synthetic_tsc_refcount--;
+ spin_unlock(&synthetic_tsc_lock);
+}
+EXPORT_SYMBOL_GPL(put_synthetic_tsc);
+
+/* Called from CPU 0, before any tracing starts, to init each structure */
+static int __init init_synthetic_tsc(void)
+{
+ precalc_stsc_interval();
+ hotcpu_notifier(hotcpu_callback, 3);
+ return 0;
+}
+
+/* Before SMP is up */
+/* workaround for omap4 */
+late_initcall(init_synthetic_tsc);
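
Typical lifetime of the synthetic TSC above (illustrative, not part of this patch): a tracing session takes a reference so the per-CPU overflow timers stay armed, reads timestamps from any context (including NMI), and drops the reference when it stops.

#include <linux/trace-clock.h>

static u64 first_event_tsc;

static void trace_session_start_clock(void)
{
	get_synthetic_tsc();		/* arms per-CPU timers, IPIs all online CPUs */
	first_event_tsc = trace_clock_read_synthetic_tsc();
}

static void trace_session_stop_clock(void)
{
	put_synthetic_tsc();		/* timers are stopped with the last user */
}
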
diff --git a/kernel/trace/trace-clock.c b/kernel/trace/trace-clock.c
new file mode 100644
index 00000000000..3ed1667aacb
--- /dev/null
+++ b/kernel/trace/trace-clock.c
@@ -0,0 +1,97 @@
+/*
+ * kernel/trace/trace-clock.c
+ *
+ * (C) Copyright 2008 -
+ * Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca)
+ *
+ * Generic kernel tracing clock for architectures without TSC.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/cpu.h>
+#include <linux/timex.h>
+#include <linux/bitops.h>
+#include <linux/trace-clock.h>
+#include <linux/jiffies.h>
+
+static int trace_clock_refcount;
+static DEFINE_MUTEX(trace_clock_mutex);
+static struct timer_list trace_clock_timer;
+/*
+ * bits 0..12 : counter, atomically incremented
+ * bits 13..{32,64} : time counter, incremented each jiffy.
+ */
+atomic_long_t trace_clock_var;
+EXPORT_SYMBOL(trace_clock_var);
+
+static void trace_clock_update(void)
+{
+ long old_clock, new_clock;
+ unsigned long ticks;
+
+ /*
+ * Make sure we keep track of delayed timer.
+ */
+ ticks = jiffies - trace_clock_timer.expires + 1;
+ /* Don't update if ticks is zero, time would go backward. */
+ if (unlikely(!ticks))
+ return;
+ do {
+ old_clock = atomic_long_read(&trace_clock_var);
+ new_clock = (old_clock + (ticks << TRACE_CLOCK_SHIFT))
+ & (~((1 << TRACE_CLOCK_SHIFT) - 1));
+ } while (atomic_long_cmpxchg(&trace_clock_var, old_clock, new_clock)
+ != old_clock);
+}
+
+static void trace_clock_timer_fct(unsigned long data)
+{
+ trace_clock_update();
+ trace_clock_timer.expires = jiffies + 1;
+ add_timer(&trace_clock_timer);
+}
+
+static void enable_trace_clock(void)
+{
+ init_timer(&trace_clock_timer);
+ /* trace_clock_update() reads expires */
+ trace_clock_timer.function = trace_clock_timer_fct;
+ trace_clock_timer.expires = jiffies + 1;
+ trace_clock_update();
+ add_timer(&trace_clock_timer);
+}
+
+static void disable_trace_clock(void)
+{
+ del_timer_sync(&trace_clock_timer);
+}
+
+void get_trace_clock(void)
+{
+ get_synthetic_tsc();
+ mutex_lock(&trace_clock_mutex);
+ if (trace_clock_refcount++)
+ goto end;
+ enable_trace_clock();
+end:
+ mutex_unlock(&trace_clock_mutex);
+}
+EXPORT_SYMBOL_GPL(get_trace_clock);
+
+void put_trace_clock(void)
+{
+ mutex_lock(&trace_clock_mutex);
+ WARN_ON(trace_clock_refcount <= 0);
+ if (trace_clock_refcount != 1)
+ goto end;
+ disable_trace_clock();
+end:
+ trace_clock_refcount--;
+ mutex_unlock(&trace_clock_mutex);
+ put_synthetic_tsc();
+}
+EXPORT_SYMBOL_GPL(put_trace_clock);
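
Given the trace_clock_var layout documented above (the low TRACE_CLOCK_SHIFT bits are an event counter, the upper bits carry jiffy-granularity time reset each tick by trace_clock_update()), one possible read side is a single atomic increment, so events landing in the same jiffy still get distinct, monotonically increasing timestamps. This is only a sketch; the real reader lives in <linux/trace-clock.h> and may differ:

#include <linux/trace-clock.h>

/* Sketch: one read = time bits (updated each jiffy) + bumped counter bits. */
static inline unsigned long trace_clock_read_sketch(void)
{
	return atomic_long_add_return(1, &trace_clock_var);
}
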
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf..687699d365a 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -11,6 +11,7 @@
#include <linux/ftrace.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/marker.h>
#include <linux/mutex.h>
#include <linux/ctype.h>
#include <linux/list.h>