author     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>  2011-03-16 19:05:58 -0400
committer  Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>    2011-03-16 19:05:58 -0400
commit     e4e9699bcf4b5ea207c859e056bb705e30f06f10 (patch)
tree       bb004ac5b3d639ebf9830d0430992065a5091d4f
parent     710944c4e1f7468dba1a0c4fa36b4cda5846f350 (diff)
trace-clock-userspace
TRACE_CLOCK and TRACE_CLOCK_FREQ in clock_gettime

These new clock_gettime options allow the user to retrieve the TSC frequency and the current TSC value from userspace. We use the LTTng infrastructure to make sure the TSC is synchronized across CPUs. If it is not, we fall back to a syscall (which for the moment does the same thing, but will later be modified to ensure consistency between user- and kernel-space tracing).

The main difference with using the TSC clocksource directly is that time starts at machine boot rather than at Linux boot, which makes it possible to correlate userspace and kernel events. We also export the frequency and raw cycle counts; we do not convert to sec.nsec in the kernel since we do not need that conversion.

Changes since v1:
- We validated that the clock_gettime vDSO does not exist on 32-bit, which cleans up the vDSO code.
- The syscall is now properly defined through the POSIX timer infrastructure.
- We export the frequency to userspace, so we no longer convert cycles to sec.nsec. On 64-bit machines the nsec field holds the whole cycle counter; on 32-bit the value is split between the sec and nsec fields.
- Removed the rdtsc_barrier(), which is overkill for tracing purposes.
- The trace_clock_is_sync field is updated as soon as the LTTng trace clock detects an inconsistency.

Updated benchmarks (20000000 iterations, reading the TSC before and after each call, on an i7 920):

64-bit with vDSO
  average cycles for clock_realtime:  101
  average cycles for clock_monotonic: 104
  average cycles for clock_trace:      52

64-bit without vDSO (using the syscall)
  average cycles for clock_realtime:  240
  average cycles for clock_monotonic: 256
  average cycles for clock_trace:     219

32-bit (without vDSO)
  average cycles for clock_realtime:  649
  average cycles for clock_monotonic: 661
  average cycles for clock_trace:     616

Signed-off-by: Julien Desfossez <julien.desfossez@polymtl.ca>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
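For illustration only (not part of this patch): a minimal userspace sketch that queries the two new clock ids through clock_gettime() and reinterprets the returned timespec through the same union overlay the kernel writes into. The local CLOCK_TRACE / CLOCK_TRACE_FREQ fallback defines mirror the values added to <linux/time.h> below, and the frequency is assumed here to be reported in cycles per second.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TRACE_FREQ
#define CLOCK_TRACE_FREQ	14	/* matches the patched <linux/time.h> */
#endif
#ifndef CLOCK_TRACE
#define CLOCK_TRACE		15
#endif

/* Same overlay the kernel uses to carry a u64 inside a timespec. */
union lttng_timespec {
	struct timespec ts;
	uint64_t lttng_ts;
};

int main(void)
{
	union lttng_timespec freq, t0, t1;

	/* Trace clock frequency, queried once per tracing session. */
	if (clock_gettime(CLOCK_TRACE_FREQ, &freq.ts))
		return 1;

	clock_gettime(CLOCK_TRACE, &t0.ts);
	/* ... workload being traced ... */
	clock_gettime(CLOCK_TRACE, &t1.ts);

	printf("elapsed: %llu cycles at %llu cycles/s\n",
	       (unsigned long long)(t1.lttng_ts - t0.lttng_ts),
	       (unsigned long long)freq.lttng_ts);
	return 0;
}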
 arch/x86/include/asm/trace-clock.h |  7
 arch/x86/include/asm/vgtod.h       |  1
 arch/x86/include/asm/vsyscall.h    |  8
 arch/x86/kernel/trace-clock.c      | 58
 arch/x86/kernel/vsyscall_64.c      | 14
 arch/x86/vdso/vclock_gettime.c     | 48
 include/linux/time.h               |  2
 7 files changed, 138 insertions(+), 0 deletions(-)
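The benchmark figures in the commit message were gathered by reading the TSC before and after each clock_gettime() call. A rough sketch of such a measurement loop, assuming an x86 target and a hand-rolled rdtsc() helper (the actual harness used for those numbers is not included in this patch):

#include <stdint.h>
#include <stdio.h>
#include <time.h>

#ifndef CLOCK_TRACE
#define CLOCK_TRACE	15	/* matches the patched <linux/time.h> */
#endif

/* Read the TSC directly; x86-only helper for benchmarking. */
static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	asm volatile("rdtsc" : "=a" (lo), "=d" (hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	enum { ITERS = 20000000 };
	struct timespec ts;
	uint64_t total = 0;
	long i;

	for (i = 0; i < ITERS; i++) {
		uint64_t before = rdtsc();

		clock_gettime(CLOCK_TRACE, &ts);
		total += rdtsc() - before;
	}
	printf("average cycles for clock_trace: %llu\n",
	       (unsigned long long)(total / ITERS));
	return 0;
}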
diff --git a/arch/x86/include/asm/trace-clock.h b/arch/x86/include/asm/trace-clock.h
index 5f6a36684c9..8ca73323366 100644
--- a/arch/x86/include/asm/trace-clock.h
+++ b/arch/x86/include/asm/trace-clock.h
@@ -11,12 +11,19 @@
*/
#include <linux/timex.h>
+#include <linux/time.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/atomic.h>
/* Minimum duration of a probe, in cycles */
#define TRACE_CLOCK_MIN_PROBE_DURATION 200
+#define TRACE_CLOCK_RES TRACE_CLOCK_MIN_PROBE_DURATION
+
+union lttng_timespec {
+ struct timespec ts;
+ u64 lttng_ts;
+};
extern cycles_t trace_clock_async_tsc_read(void);
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826..06abe8f409a 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -12,6 +12,7 @@ struct vsyscall_gtod_data {
u32 wall_time_nsec;
int sysctl_enabled;
+ int trace_clock_is_sync;
struct timezone sys_tz;
struct { /* extract of a clocksource struct */
cycle_t (*vread)(void);
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fb..47b80f3ba4d 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -39,6 +39,14 @@ extern struct timezone sys_tz;
extern void map_vsyscall(void);
+#ifdef CONFIG_X86_64
+extern void update_trace_clock_is_sync_vdso(void);
+#else
+static inline void update_trace_clock_is_sync_vdso(void)
+{
+}
+#endif
+
#endif /* __KERNEL__ */
#endif /* _ASM_X86_VSYSCALL_H */
diff --git a/arch/x86/kernel/trace-clock.c b/arch/x86/kernel/trace-clock.c
index 37cf125a940..47539e28276 100644
--- a/arch/x86/kernel/trace-clock.c
+++ b/arch/x86/kernel/trace-clock.c
@@ -11,6 +11,8 @@
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/cpu.h>
+#include <linux/posix-timers.h>
+#include <asm/vgtod.h>
static cycles_t trace_clock_last_tsc;
static DEFINE_PER_CPU(struct timer_list, update_timer);
@@ -22,11 +24,19 @@ int _trace_clock_is_sync = 1;
EXPORT_SYMBOL_GPL(_trace_clock_is_sync);
/*
+ * Is the trace clock being used by user-space? We leave the trace clock active
+ * as soon as user-space starts using it. We never unref the trace clock
+ * reference taken by user-space.
+ */
+static atomic_t user_trace_clock_ref;
+
+/*
* Called by check_tsc_sync_source from CPU hotplug.
*/
void set_trace_clock_is_sync(int state)
{
_trace_clock_is_sync = state;
+ update_trace_clock_is_sync_vdso();
}
#if BITS_PER_LONG == 64
@@ -236,8 +246,56 @@ end:
}
EXPORT_SYMBOL_GPL(put_trace_clock);
+static int posix_get_trace(clockid_t which_clock, struct timespec *tp)
+{
+ union lttng_timespec *lts = (union lttng_timespec *) tp;
+ int ret;
+
+ /*
+ * Yes, there is a race here that would lead to refcount being
+ * incremented more than once, but all we care about is leaving the trace
+ * clock active forever, so precise accounting is not needed.
+ */
+ if (unlikely(!atomic_read(&user_trace_clock_ref))) {
+ ret = get_trace_clock();
+ if (ret)
+ return ret;
+ atomic_inc(&user_trace_clock_ref);
+ }
+ lts->lttng_ts = trace_clock_read64();
+ return 0;
+}
+
+static int posix_get_trace_freq(clockid_t which_clock, struct timespec *tp)
+{
+ union lttng_timespec *lts = (union lttng_timespec *) tp;
+
+ lts->lttng_ts = trace_clock_frequency();
+ return 0;
+}
+
+static int posix_get_trace_res(const clockid_t which_clock, struct timespec *tp)
+{
+ union lttng_timespec *lts = (union lttng_timespec *) tp;
+
+ lts->lttng_ts = TRACE_CLOCK_RES;
+ return 0;
+}
+
static __init int init_unsync_trace_clock(void)
{
+ struct k_clock clock_trace = {
+ .clock_getres = posix_get_trace_res,
+ .clock_get = posix_get_trace,
+ };
+ struct k_clock clock_trace_freq = {
+ .clock_getres = posix_get_trace_res,
+ .clock_get = posix_get_trace_freq,
+ };
+
+ register_posix_clock(CLOCK_TRACE, &clock_trace);
+ register_posix_clock(CLOCK_TRACE_FREQ, &clock_trace_freq);
+
hotcpu_notifier(hotcpu_callback, 4);
return 0;
}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b69..df18f14c473 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -44,6 +44,8 @@
#include <asm/desc.h>
#include <asm/topology.h>
#include <asm/vgtod.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
#define __vsyscall(nr) \
__attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
@@ -61,6 +63,7 @@ struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
{
.lock = SEQLOCK_UNLOCKED,
.sysctl_enabled = 1,
+ .trace_clock_is_sync = 1,
};
void update_vsyscall_tz(void)
@@ -73,6 +76,16 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
+void update_trace_clock_is_sync_vdso(void)
+{
+ unsigned long flags;
+
+ write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags);
+ vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
+ write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
+}
+EXPORT_SYMBOL_GPL(update_trace_clock_is_sync_vdso);
+
void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
struct clocksource *clock, u32 mult)
{
@@ -89,6 +102,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
vsyscall_gtod_data.wall_to_monotonic = *wtm;
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
+ vsyscall_gtod_data.trace_clock_is_sync = _trace_clock_is_sync;
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c..7bc481508d0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -22,6 +22,8 @@
#include <asm/hpet.h>
#include <asm/unistd.h>
#include <asm/io.h>
+#include <asm/trace-clock.h>
+#include <asm/timer.h>
#include "vextern.h"
#define gtod vdso_vsyscall_gtod_data
@@ -111,6 +113,46 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
return 0;
}
+/*
+ * If the TSC is synchronized across all CPUs, read the current TSC
+ * and export its value in the nsec field of the timespec
+ */
+notrace static noinline int do_trace_clock(struct timespec *ts)
+{
+ unsigned long seq;
+ union lttng_timespec *lts = (union lttng_timespec *) ts;
+
+ do {
+ seq = read_seqbegin(&gtod->lock);
+ if (unlikely(!gtod->trace_clock_is_sync))
+ return vdso_fallback_gettime(CLOCK_TRACE, ts);
+ /*
+ * We don't protect the rdtsc with rdtsc_barrier() because tracing
+ * cannot achieve that level of precision anyway.
+ * Recording an event is not an atomic operation, therefore
+ * the small chance of imprecision doesn't justify the overhead
+ * of a barrier.
+ */
+ /*
+ * TODO: check that vget_cycles(), using paravirt ops, will
+ * match the TSC read by get_cycles() at the kernel level.
+ */
+ lts->lttng_ts = vget_cycles();
+ } while (unlikely(read_seqretry(&gtod->lock, seq)));
+
+ return 0;
+}
+
+/*
+ * Returns the cpu_khz. It needs to be a syscall because we can't access
+ * this value from userspace, and it is only called at the beginning
+ * of the tracing session.
+ */
+notrace static noinline int do_trace_clock_freq(struct timespec *ts)
+{
+ return vdso_fallback_gettime(CLOCK_TRACE_FREQ, ts);
+}
+
notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
{
if (likely(gtod->sysctl_enabled))
@@ -127,6 +169,12 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
return do_realtime_coarse(ts);
case CLOCK_MONOTONIC_COARSE:
return do_monotonic_coarse(ts);
+ case CLOCK_TRACE:
+ return do_trace_clock(ts);
+ case CLOCK_TRACE_FREQ:
+ return do_trace_clock_freq(ts);
+ default:
+ return -EINVAL;
}
return vdso_fallback_gettime(clock, ts);
}
diff --git a/include/linux/time.h b/include/linux/time.h
index 1e6d3b59238..8ae676f1e7c 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -292,6 +292,8 @@ struct itimerval {
#define CLOCK_MONOTONIC_RAW 4
#define CLOCK_REALTIME_COARSE 5
#define CLOCK_MONOTONIC_COARSE 6
+#define CLOCK_TRACE_FREQ 14
+#define CLOCK_TRACE 15
/*
* The IDs of various hardware clocks: