From 9e55cca889cdadbd94f4ea658b41f4cb43ab3fcb Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Tue, 25 Apr 2017 15:12:50 +0100
Subject: wsim: Add rtavg balancer

An improved version of the rt balancer that tracks the average latency
on each engine, rather than the last instantaneous execution latency.
This makes it much less sensitive to rapid changes, which is both a
positive and a negative.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 benchmarks/ewma.h     |  71 +++++++++++++++++++++++++++
 benchmarks/gem_wsim.c | 133 ++++++++++++++++++++++++++++++++++++++++++--------
 benchmarks/ilog2.h    | 104 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 288 insertions(+), 20 deletions(-)
 create mode 100644 benchmarks/ewma.h
 create mode 100644 benchmarks/ilog2.h

(limited to 'benchmarks')

diff --git a/benchmarks/ewma.h b/benchmarks/ewma.h
new file mode 100644
index 00000000..8711004e
--- /dev/null
+++ b/benchmarks/ewma.h
@@ -0,0 +1,71 @@
+#ifndef EWMA_H
+#define EWMA_H
+
+#include "ilog2.h" /* local header: quoted form searches this directory first */
+
+#define BUILD_BUG_ON(expr) /* stub: expands to nothing, so the check is not enforced */
+#define BUILD_BUG_ON_NOT_POWER_OF_2(expr) /* stub: expands to nothing */
+
+/*
+ * Exponentially weighted moving average (EWMA)
+ *
+ * This implements a fixed-precision EWMA algorithm, with both the
+ * precision and fall-off coefficient determined at compile-time
+ * and built into the generated helper functions.
+ *
+ * The first argument to the macro is the integer type used for the
+ * stored average; the second is the name of the struct and helpers.
+ *
+ * The third argument, the precision, expresses how many bits are
+ * used for the fractional part of the fixed-precision values.
+ *
+ * The fourth argument, the weight reciprocal, determines how the
+ * new values will be weighed vs. the old state: new values get
+ * weight 1/weight_rcp and old values 1-1/weight_rcp. Note
+ * that this parameter must be a power of two for efficiency.
+ */
+
+#define DECLARE_EWMA(T, name, _precision, _weight_rcp)			\
+	struct ewma_##name {						\
+		T internal;					\
+	};								\
+	static inline void ewma_##name##_init(struct ewma_##name *e)	\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		/*							\
+		 * Even if you want to feed it just 0/1 you should have	\
+		 * some bits for the non-fractional part...		\
+		 */							\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		e->internal = 0;					\
+	}								\
+	static inline T							\
+	ewma_##name##_read(struct ewma_##name *e)			\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		return e->internal >> (_precision);			\
+	}								\
+	static inline void ewma_##name##_add(struct ewma_##name *e,	\
+					     T val)			\
+	{								\
+		const T weight_rcp = ilog2(_weight_rcp);		\
+		const T precision = _precision;				\
+		T internal = e->internal;				\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+									\
+		e->internal = internal ?				\
+			(((internal << weight_rcp) - internal) +	\
+				(val << precision)) >> weight_rcp :	\
+			(val << precision);				\
+	}
+
+#endif /* EWMA_H */
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 47b070e2..9f690369 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -50,6 +50,8 @@
 #include "igt_aux.h"
 #include "igt_rand.h"
 
+#include "ewma.h"
+
 enum intel_engine_id {
 	RCS,
 	BCS,
@@ -103,6 +105,8 @@ struct w_step
 	unsigned int mapped_len;
 };
 
+DECLARE_EWMA(uint64_t, rt, 4, 2)
+
 struct workload
 {
 	unsigned int nr_steps;
@@ -127,6 +131,13 @@ struct workload
 
 	struct igt_list requests[NUM_ENGINES];
 	unsigned int nrequest[NUM_ENGINES];
+
+	union {
+		struct rtavg {
+			struct ewma_rt avg[NUM_ENGINES];
+			uint32_t last[NUM_ENGINES];
+		} rt;
+	};
 };
 
 static const unsigned int nop_calibration_us = 1000;
@@ -787,13 +798,32 @@ static const struct workload_balancer qd_balancer = {
 	.balance = qd_balance,
 };
 
+static enum intel_engine_id
+__rt_select_engine(struct workload *wrk, unsigned long *qd, bool random)
+{
+	unsigned int n;
+
+	qd[VCS1] >>= 10;
+	qd[VCS2] >>= 10;
+
+	if (qd[VCS1] < qd[VCS2])
+		n = 0;
+	else if (qd[VCS2] < qd[VCS1])
+		n = 1;
+	else if (random)
+		n = hars_petruska_f54_1_random(&wrk->prng) & 1;
+	else
+		n = wrk->vcs_rr;
+	wrk->vcs_rr = n ^ 1;
+
+	return get_vcs_engine(n);
+}
+
 static enum intel_engine_id
 __rt_balance(const struct workload_balancer *balancer,
 	     struct workload *wrk, struct w_step *w, bool random)
 {
-	enum intel_engine_id engine;
-	long qd[NUM_ENGINES];
-	unsigned int n;
+	unsigned long qd[NUM_ENGINES];
 
 	igt_assert(w->engine == VCS);
 
@@ -827,22 +857,7 @@ __rt_balance(const struct workload_balancer *balancer,
 	       qd[VCS2]);
 #endif
 
-	qd[VCS1] >>= 10;
-	qd[VCS2] >>= 10;
-
-	if (qd[VCS1] < qd[VCS2])
-		n = 0;
-	else if (qd[VCS2] < qd[VCS1])
-		n = 1;
-	else if (random)
-		n = hars_petruska_f54_1_random(&wrk->prng) & 1;
-	else
-		n = wrk->vcs_rr;
-
-	engine = get_vcs_engine(n);
-	wrk->vcs_rr = n ^ 1;
-
-	return engine;
+	return __rt_select_engine(wrk, qd, random);
 }
 
 static enum intel_engine_id
@@ -870,6 +885,68 @@ static const struct workload_balancer rtr_balancer = {
 	.balance = rtr_balance,
 };
 
+static enum intel_engine_id
+rtavg_balance(const struct workload_balancer *balancer,
+	   struct workload *wrk, struct w_step *w)
+{
+	unsigned long qd[NUM_ENGINES];
+
+	igt_assert(w->engine == VCS);
+
+	/* Estimate the average "speed" of the most recent batches
+	 *    (finish time - submit time)
+	 * and use that as an approximation of the total remaining time for
+	 * all batches on that engine plus the time we expect to execute in.
+	 * We try to keep the total remaining balanced between the engines.
+	 */
+	if (wrk->status_page[VCS_SEQNO_IDX(VCS1)] != wrk->rt.last[VCS1]) {
+		igt_assert((long)(wrk->status_page[2] - wrk->status_page[1]) > 0);
+		ewma_rt_add(&wrk->rt.avg[VCS1],
+			    wrk->status_page[2] - wrk->status_page[1]);
+		wrk->rt.last[VCS1] = wrk->status_page[VCS_SEQNO_IDX(VCS1)];
+	}
+
+	qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
+	wrk->qd_sum[VCS1] += qd[VCS1];
+	qd[VCS1] = (qd[VCS1] + 1) * ewma_rt_read(&wrk->rt.avg[VCS1]);
+
+#ifdef DEBUG
+	printf("qd[0] = %d (%d - %d) x %ld (%d) = %ld\n",
+	       wrk->seqno[VCS1] - wrk->status_page[0],
+	       wrk->seqno[VCS1], wrk->status_page[0],
+	       ewma_rt_read(&wrk->rt.avg[VCS1]),
+	       wrk->status_page[2] -  wrk->status_page[1],
+	       qd[VCS1]);
+#endif
+
+	if (wrk->status_page[VCS_SEQNO_IDX(VCS2)] != wrk->rt.last[VCS2]) {
+		igt_assert((long)(wrk->status_page[2+16] - wrk->status_page[1+16]) > 0);
+		ewma_rt_add(&wrk->rt.avg[VCS2],
+			    wrk->status_page[2+16] - wrk->status_page[1+16]);
+		wrk->rt.last[VCS2] = wrk->status_page[VCS_SEQNO_IDX(VCS2)];
+	}
+
+	qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
+	wrk->qd_sum[VCS2] += qd[VCS2];
+	qd[VCS2] = (qd[VCS2] + 1) * ewma_rt_read(&wrk->rt.avg[VCS2]);
+
+#ifdef DEBUG
+	printf("qd[1] = %d (%d - %d) x %ld (%d) = %ld\n",
+	       wrk->seqno[VCS2] - wrk->status_page[16],
+	       wrk->seqno[VCS2], wrk->status_page[16],
+	       ewma_rt_read(&wrk->rt.avg[VCS2]),
+	       wrk->status_page[18] - wrk->status_page[17],
+	       qd[VCS2]);
+#endif
+
+	return __rt_select_engine(wrk, qd, false);
+}
+
+static const struct workload_balancer rtavg_balancer = {
+	.get_qd = get_qd_depth,
+	.balance = rtavg_balance,
+};
+
 static void
 update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
 {
@@ -1277,7 +1354,7 @@ add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
 
 static int parse_balancing_mode(char *str)
 {
-	const char *modes[] = { "rr", "qd", "rt", "rtr" };
+	const char *modes[] = { "rr", "qd", "rt", "rtr" , "rtavg" };
 	int mode = -1;
 	unsigned int i;
 
@@ -1394,27 +1471,43 @@ int main(int argc, char **argv)
 			}
 			switch (i) {
 			case 0:
+				if (!quiet)
+					printf("Using rr balancer\n");
 				balancer = &rr_balancer;
 				flags |= BALANCE;
 				break;
 			case 1:
+				if (!quiet)
+					printf("Using qd balancer\n");
 				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
 					   8);
 				balancer = &qd_balancer;
 				flags |= SEQNO | BALANCE;
 				break;
 			case 2:
+				if (!quiet)
+					printf("Using rt balancer\n");
 				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
 					   8);
 				balancer = &rt_balancer;
 				flags |= SEQNO | BALANCE | RT;
 				break;
 			case 3:
+				if (!quiet)
+					printf("Using rtr balancer\n");
 				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
 					   8);
 				balancer = &rtr_balancer;
 				flags |= SEQNO | BALANCE | RT;
 				break;
+			case 4:
+				if (!quiet)
+					printf("Using rtavg balancer\n");
+				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
+					   8);
+				balancer = &rtavg_balancer;
+				flags |= SEQNO | BALANCE | RT;
+				break;
 			default:
 				if (!quiet)
 					fprintf(stderr,
diff --git a/benchmarks/ilog2.h b/benchmarks/ilog2.h
new file mode 100644
index 00000000..596d7c23
--- /dev/null
+++ b/benchmarks/ilog2.h
@@ -0,0 +1,104 @@
+#ifndef ILOG2_H
+#define ILOG2_H
+
+#include <stdint.h>
+
+static inline int fls(int x)
+{
+        int r = -1;
+        asm("bsrl %1,%0" : "=r" (r) : "rm" (x), "0" (-1));
+        return r + 1;
+}
+
+static inline int fls64(uint64_t x) /* 1-based position of the highest set bit in x */
+{
+        int r = -1; /* NOTE(review): x == 0 relies on bsrq leaving r untouched -- confirm */
+        asm("bsrq %1,%q0" : "+r" (r) : "rm" (x));
+        return r + 1;
+}
+
+static inline __attribute__((const))
+int __ilog2_u32(uint32_t n)
+{
+	return fls(n) - 1;
+}
+
+static inline __attribute__((const))
+int __ilog2_u64(uint64_t n)
+{
+	return fls64(n) - 1;
+}
+
+#define ilog2(n)				\
+(						\
+	__builtin_constant_p(n) ? (		\
+		(n) < 2 ? 0 :			\
+		(n) & (1ULL << 63) ? 63 :	\
+		(n) & (1ULL << 62) ? 62 :	\
+		(n) & (1ULL << 61) ? 61 :	\
+		(n) & (1ULL << 60) ? 60 :	\
+		(n) & (1ULL << 59) ? 59 :	\
+		(n) & (1ULL << 58) ? 58 :	\
+		(n) & (1ULL << 57) ? 57 :	\
+		(n) & (1ULL << 56) ? 56 :	\
+		(n) & (1ULL << 55) ? 55 :	\
+		(n) & (1ULL << 54) ? 54 :	\
+		(n) & (1ULL << 53) ? 53 :	\
+		(n) & (1ULL << 52) ? 52 :	\
+		(n) & (1ULL << 51) ? 51 :	\
+		(n) & (1ULL << 50) ? 50 :	\
+		(n) & (1ULL << 49) ? 49 :	\
+		(n) & (1ULL << 48) ? 48 :	\
+		(n) & (1ULL << 47) ? 47 :	\
+		(n) & (1ULL << 46) ? 46 :	\
+		(n) & (1ULL << 45) ? 45 :	\
+		(n) & (1ULL << 44) ? 44 :	\
+		(n) & (1ULL << 43) ? 43 :	\
+		(n) & (1ULL << 42) ? 42 :	\
+		(n) & (1ULL << 41) ? 41 :	\
+		(n) & (1ULL << 40) ? 40 :	\
+		(n) & (1ULL << 39) ? 39 :	\
+		(n) & (1ULL << 38) ? 38 :	\
+		(n) & (1ULL << 37) ? 37 :	\
+		(n) & (1ULL << 36) ? 36 :	\
+		(n) & (1ULL << 35) ? 35 :	\
+		(n) & (1ULL << 34) ? 34 :	\
+		(n) & (1ULL << 33) ? 33 :	\
+		(n) & (1ULL << 32) ? 32 :	\
+		(n) & (1ULL << 31) ? 31 :	\
+		(n) & (1ULL << 30) ? 30 :	\
+		(n) & (1ULL << 29) ? 29 :	\
+		(n) & (1ULL << 28) ? 28 :	\
+		(n) & (1ULL << 27) ? 27 :	\
+		(n) & (1ULL << 26) ? 26 :	\
+		(n) & (1ULL << 25) ? 25 :	\
+		(n) & (1ULL << 24) ? 24 :	\
+		(n) & (1ULL << 23) ? 23 :	\
+		(n) & (1ULL << 22) ? 22 :	\
+		(n) & (1ULL << 21) ? 21 :	\
+		(n) & (1ULL << 20) ? 20 :	\
+		(n) & (1ULL << 19) ? 19 :	\
+		(n) & (1ULL << 18) ? 18 :	\
+		(n) & (1ULL << 17) ? 17 :	\
+		(n) & (1ULL << 16) ? 16 :	\
+		(n) & (1ULL << 15) ? 15 :	\
+		(n) & (1ULL << 14) ? 14 :	\
+		(n) & (1ULL << 13) ? 13 :	\
+		(n) & (1ULL << 12) ? 12 :	\
+		(n) & (1ULL << 11) ? 11 :	\
+		(n) & (1ULL << 10) ? 10 :	\
+		(n) & (1ULL <<  9) ?  9 :	\
+		(n) & (1ULL <<  8) ?  8 :	\
+		(n) & (1ULL <<  7) ?  7 :	\
+		(n) & (1ULL <<  6) ?  6 :	\
+		(n) & (1ULL <<  5) ?  5 :	\
+		(n) & (1ULL <<  4) ?  4 :	\
+		(n) & (1ULL <<  3) ?  3 :	\
+		(n) & (1ULL <<  2) ?  2 :	\
+		1 ) :				\
+	(sizeof(n) <= 4) ?			\
+	__ilog2_u32(n) :			\
+	__ilog2_u64(n)				\
+ )
+
+#endif /* ILOG2_H */
-- 
cgit v1.2.3