3 files changed, 288 insertions, 20 deletions
diff --git a/benchmarks/ewma.h b/benchmarks/ewma.h
new file mode 100644
index 00000000..8711004e
--- /dev/null
+++ b/benchmarks/ewma.h
@@ -0,0 +1,71 @@
+#ifndef EWMA_H
+#define EWMA_H
+
+#include <ilog2.h>
+
+#define BUILD_BUG_ON(expr) /* stub: expands to nothing, so no check is performed */
+#define BUILD_BUG_ON_NOT_POWER_OF_2(expr) /* stub: expands to nothing as well */
+
+/*
+ * Exponentially weighted moving average (EWMA)
+ *
+ * This implements a fixed-precision EWMA algorithm, with both the
+ * precision and fall-off coefficient determined at compile-time
+ * and built into the generated helper functions.
+ *
+ * The first argument to the macro is the integer type used for the
+ * internal state and for the values fed in and read back.
+ *
+ * The second argument is the name that will be used for the struct
+ * and helper functions.  The third argument, the precision, expresses
+ * how many bits are used for the fractional part of the fixed-precision
+ * values.  The fourth argument, the weight reciprocal, determines how
+ * the new values will be weighed vs. the old state: new values will
+ * get weight 1/weight_rcp and old values 1-1/weight_rcp.  Note
+ * that this parameter must be a power of two for efficiency.
+ */
+
+#define DECLARE_EWMA(T, name, _precision, _weight_rcp)			\
+	struct ewma_##name {						\
+		T internal; /* average scaled by 2^_precision */	\
+	};								\
+	static inline void ewma_##name##_init(struct ewma_##name *e)	\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		/*							\
+		 * Even if you want to feed it just 0/1 you should have	\
+		 * some bits for the non-fractional part...		\
+		 */							\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		e->internal = 0; /* 0 is treated as "empty" by _add */	\
+	}								\
+	static inline T							\
+	ewma_##name##_read(struct ewma_##name *e)			\
+	{								\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		return e->internal >> (_precision); /* integer part */	\
+	}								\
+	static inline void ewma_##name##_add(struct ewma_##name *e,	\
+					     T val)			\
+	{								\
+		const T weight_rcp = ilog2(_weight_rcp); /* as shift */	\
+		const T precision = _precision;				\
+		T internal = e->internal;				\
+									\
+		BUILD_BUG_ON(!__builtin_constant_p(_precision));	\
+		BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp));	\
+		BUILD_BUG_ON((_precision) > 30);			\
+		BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp);		\
+		/* (old * (rcp - 1) + new) / rcp; zero state is seeded */ \
+		e->internal = internal ?				\
+			(((internal << weight_rcp) - internal) +	\
+				(val << precision)) >> weight_rcp :	\
+			(val << precision);				\
+	}
+
+#endif /* EWMA_H */
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 47b070e2..9f690369 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -50,6 +50,8 @@
 #include "igt_aux.h"
 #include "igt_rand.h"
 
+#include "ewma.h"
+
 enum intel_engine_id {
 	RCS,
 	BCS,
@@ -103,6 +105,8 @@ struct w_step
 	unsigned int mapped_len;
 };
 
+DECLARE_EWMA(uint64_t, rt, 4, 2) /* struct ewma_rt + ewma_rt_{init,read,add}: 4 fractional bits, new samples weigh 1/2 */
+
 struct workload
 {
 	unsigned int nr_steps;
@@ -127,6 +131,13 @@ struct workload
 
 	struct igt_list requests[NUM_ENGINES];
 	unsigned int nrequest[NUM_ENGINES];
+
+	union {
+		struct rtavg {
+			struct ewma_rt avg[NUM_ENGINES];
+			uint32_t last[NUM_ENGINES];
+		} rt;
+	};
 };
 
 static const unsigned int nop_calibration_us = 1000;
@@ -788,12 +799,31 @@ static const struct workload_balancer qd_balancer = {
 };
 
 static enum intel_engine_id
+__rt_select_engine(struct workload *wrk, unsigned long *qd, bool random)
+{
+	unsigned int n; /* index handed to get_vcs_engine(): 0 or 1 */
+	/* Pick the VCS with the smaller qd[]; ties go to the PRNG or round-robin. */
+	qd[VCS1] >>= 10; /* drop the low 10 bits (modifies the caller's array) */
+	qd[VCS2] >>= 10; /* so that values differing only there compare as a tie */
+
+	if (qd[VCS1] < qd[VCS2])
+		n = 0;
+	else if (qd[VCS2] < qd[VCS1])
+		n = 1;
+	else if (random)
+		n = hars_petruska_f54_1_random(&wrk->prng) & 1; /* tie: one PRNG bit */
+	else
+		n = wrk->vcs_rr; /* tie: take the stored round-robin index */
+	wrk->vcs_rr = n ^ 1; /* always store the other index for the next tie */
+
+	return get_vcs_engine(n);
+}
+
+static enum intel_engine_id
 __rt_balance(const struct workload_balancer *balancer,
 	     struct workload *wrk, struct w_step *w, bool random)
 {
-	enum intel_engine_id engine;
-	long qd[NUM_ENGINES];
-	unsigned int n;
+	unsigned long qd[NUM_ENGINES];
 
 	igt_assert(w->engine == VCS);
 
@@ -827,22 +857,7 @@ __rt_balance(const struct workload_balancer *balancer,
 	       qd[VCS2]);
 #endif
 
-	qd[VCS1] >>= 10;
-	qd[VCS2] >>= 10;
-
-	if (qd[VCS1] < qd[VCS2])
-		n = 0;
-	else if (qd[VCS2] < qd[VCS1])
-		n = 1;
-	else if (random)
-		n = hars_petruska_f54_1_random(&wrk->prng) & 1;
-	else
-		n = wrk->vcs_rr;
-
-	engine = get_vcs_engine(n);
-	wrk->vcs_rr = n ^ 1;
-
-	return engine;
+	return __rt_select_engine(wrk, qd, random);
 }
 
 static enum intel_engine_id
@@ -870,6 +885,68 @@ static const struct workload_balancer rtr_balancer = {
 	.balance = rtr_balance,
 };
 
+static enum intel_engine_id
+rtavg_balance(const struct workload_balancer *balancer,
+	   struct workload *wrk, struct w_step *w)
+{
+	unsigned long qd[NUM_ENGINES];
+
+	igt_assert(w->engine == VCS);
+
+	/* Estimate the average "speed" of the most recent batches
+	 *    (finish time - submit time)
+	 * and use that as an approximation of the total remaining time for
+	 * all batches on that engine plus the time we expect to execute in.
+	 * We try to keep the total remaining balanced between the engines.
+	 */
+	if (wrk->status_page[VCS_SEQNO_IDX(VCS1)] != wrk->rt.last[VCS1]) { /* seqno changed since last sample */
+		igt_assert((long)(wrk->status_page[2] - wrk->status_page[1]) > 0);
+		ewma_rt_add(&wrk->rt.avg[VCS1], /* feed the new sample into the average */
+			    wrk->status_page[2] - wrk->status_page[1]); /* presumably finish - submit; confirm */
+		wrk->rt.last[VCS1] = wrk->status_page[VCS_SEQNO_IDX(VCS1)];
+	}
+
+	qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
+	wrk->qd_sum[VCS1] += qd[VCS1]; /* accumulate the raw value before scaling */
+	qd[VCS1] = (qd[VCS1] + 1) * ewma_rt_read(&wrk->rt.avg[VCS1]); /* (queued + this one) x average */
+
+#ifdef DEBUG /* NOTE(review): %ld below is paired with unsigned long / uint64_t arguments */
+	printf("qd[0] = %d (%d - %d) x %ld (%d) = %ld\n",
+	       wrk->seqno[VCS1] - wrk->status_page[0],
+	       wrk->seqno[VCS1], wrk->status_page[0],
+	       ewma_rt_read(&wrk->rt.avg[VCS1]),
+	       wrk->status_page[2] -  wrk->status_page[1],
+	       qd[VCS1]);
+#endif
+
+	if (wrk->status_page[VCS_SEQNO_IDX(VCS2)] != wrk->rt.last[VCS2]) { /* same steps for VCS2 */
+		igt_assert((long)(wrk->status_page[2+16] - wrk->status_page[1+16]) > 0);
+		ewma_rt_add(&wrk->rt.avg[VCS2], /* same two slots, 16 entries further on */
+			    wrk->status_page[2+16] - wrk->status_page[1+16]);
+		wrk->rt.last[VCS2] = wrk->status_page[VCS_SEQNO_IDX(VCS2)];
+	}
+
+	qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
+	wrk->qd_sum[VCS2] += qd[VCS2]; /* accumulate the raw value before scaling */
+	qd[VCS2] = (qd[VCS2] + 1) * ewma_rt_read(&wrk->rt.avg[VCS2]); /* (queued + this one) x average */
+
+#ifdef DEBUG /* NOTE(review): %ld below is paired with unsigned long / uint64_t arguments */
+	printf("qd[1] = %d (%d - %d) x %ld (%d) = %ld\n",
+	       wrk->seqno[VCS2] - wrk->status_page[16],
+	       wrk->seqno[VCS2], wrk->status_page[16],
+	       ewma_rt_read(&wrk->rt.avg[VCS2]),
+	       wrk->status_page[18] - wrk->status_page[17],
+	       qd[VCS2]);
+#endif
+
+	return __rt_select_engine(wrk, qd, false); /* random == false: ties use wrk->vcs_rr */
+}
+
+static const struct workload_balancer rtavg_balancer = { /* assigned in main() for mode case 4 */
+	.get_qd = get_qd_depth, /* reached through balancer->get_qd() in rtavg_balance */
+	.balance = rtavg_balance,
+};
+
 static void
 update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
 {
@@ -1277,7 +1354,7 @@ add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
 
 static int parse_balancing_mode(char *str)
 {
-	const char *modes[] = { "rr", "qd", "rt", "rtr" };
+	const char *modes[] = { "rr", "qd", "rt", "rtr" , "rtavg" };
 	int mode = -1;
 	unsigned int i;
 
@@ -1394,27 +1471,43 @@ int main(int argc, char **argv)
 			}
 			switch (i) {
 			case 0:
+				if (!quiet)
+					printf("Using rr balancer\n");
 				balancer = &rr_balancer;
 				flags |= BALANCE;
 				break;
 			case 1:
+				if (!quiet)
+					printf("Using qd balancer\n");
 				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
 					   8);
 				balancer = &qd_balancer;
 				flags |= SEQNO | BALANCE;
 				break;
 			case 2:
+				if (!quiet)
+					printf("Using rt balancer\n");
 				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
 					   8);
 				balancer = &rt_balancer;
 				flags |= SEQNO | BALANCE | RT;
 				break;
 			case 3:
+				if (!quiet)
+					printf("Using rtr balancer\n");
 				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
 					   8);
 				balancer = &rtr_balancer;
 				flags |= SEQNO | BALANCE | RT;
 				break;
+			case 4:
+				if (!quiet)
+					printf("Using rtavg balancer\n");
+				igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
+					   8);
+				balancer = &rtavg_balancer;
+				flags |= SEQNO | BALANCE | RT;
+				break;
 			default:
 				if (!quiet)
 					fprintf(stderr,
diff --git a/benchmarks/ilog2.h b/benchmarks/ilog2.h
new file mode 100644
index 00000000..596d7c23
--- /dev/null
+++ b/benchmarks/ilog2.h
@@ -0,0 +1,104 @@
+#ifndef ILOG2_H
+#define ILOG2_H
+
+#include <stdint.h>
+
+static inline int fls(int x)
+{
+        /* 1-based index of the most significant set bit; fls(0) == 0. */
+        /* __builtin_clz(0) is undefined, hence the explicit zero test. */
+        return x ? 32 - __builtin_clz((unsigned int)x) : 0;
+}
+
+static inline int fls64(uint64_t x)
+{
+        /* 1-based index of the most significant set bit; fls64(0) == 0. */
+        /* __builtin_clzll(0) is undefined, hence the explicit zero test. */
+        return x ? 64 - __builtin_clzll(x) : 0;
+}
+
+static inline __attribute__((const))
+int __ilog2_u32(uint32_t n)
+{
+	return fls(n) - 1; /* fls() is 1-based, so this is the 0-based top bit index */
+}
+
+static inline __attribute__((const))
+int __ilog2_u64(uint64_t n)
+{
+	return fls64(n) - 1; /* fls64() is 1-based, so this is the 0-based top bit index */
+}
+
+#define ilog2(n) /* index of the highest set bit of n */	\
+(						\
+	__builtin_constant_p(n) ? ( /* constant n: use the ladder below */ \
+		(n) < 2 ? 0 : /* anything below 2 yields 0 */	\
+		(n) & (1ULL << 63) ? 63 :	\
+		(n) & (1ULL << 62) ? 62 :	\
+		(n) & (1ULL << 61) ? 61 :	\
+		(n) & (1ULL << 60) ? 60 :	\
+		(n) & (1ULL << 59) ? 59 :	\
+		(n) & (1ULL << 58) ? 58 :	\
+		(n) & (1ULL << 57) ? 57 :	\
+		(n) & (1ULL << 56) ? 56 :	\
+		(n) & (1ULL << 55) ? 55 :	\
+		(n) & (1ULL << 54) ? 54 :	\
+		(n) & (1ULL << 53) ? 53 :	\
+		(n) & (1ULL << 52) ? 52 :	\
+		(n) & (1ULL << 51) ? 51 :	\
+		(n) & (1ULL << 50) ? 50 :	\
+		(n) & (1ULL << 49) ? 49 :	\
+		(n) & (1ULL << 48) ? 48 :	\
+		(n) & (1ULL << 47) ? 47 :	\
+		(n) & (1ULL << 46) ? 46 :	\
+		(n) & (1ULL << 45) ? 45 :	\
+		(n) & (1ULL << 44) ? 44 :	\
+		(n) & (1ULL << 43) ? 43 :	\
+		(n) & (1ULL << 42) ? 42 :	\
+		(n) & (1ULL << 41) ? 41 :	\
+		(n) & (1ULL << 40) ? 40 :	\
+		(n) & (1ULL << 39) ? 39 :	\
+		(n) & (1ULL << 38) ? 38 :	\
+		(n) & (1ULL << 37) ? 37 :	\
+		(n) & (1ULL << 36) ? 36 :	\
+		(n) & (1ULL << 35) ? 35 :	\
+		(n) & (1ULL << 34) ? 34 :	\
+		(n) & (1ULL << 33) ? 33 :	\
+		(n) & (1ULL << 32) ? 32 :	\
+		(n) & (1ULL << 31) ? 31 :	\
+		(n) & (1ULL << 30) ? 30 :	\
+		(n) & (1ULL << 29) ? 29 :	\
+		(n) & (1ULL << 28) ? 28 :	\
+		(n) & (1ULL << 27) ? 27 :	\
+		(n) & (1ULL << 26) ? 26 :	\
+		(n) & (1ULL << 25) ? 25 :	\
+		(n) & (1ULL << 24) ? 24 :	\
+		(n) & (1ULL << 23) ? 23 :	\
+		(n) & (1ULL << 22) ? 22 :	\
+		(n) & (1ULL << 21) ? 21 :	\
+		(n) & (1ULL << 20) ? 20 :	\
+		(n) & (1ULL << 19) ? 19 :	\
+		(n) & (1ULL << 18) ? 18 :	\
+		(n) & (1ULL << 17) ? 17 :	\
+		(n) & (1ULL << 16) ? 16 :	\
+		(n) & (1ULL << 15) ? 15 :	\
+		(n) & (1ULL << 14) ? 14 :	\
+		(n) & (1ULL << 13) ? 13 :	\
+		(n) & (1ULL << 12) ? 12 :	\
+		(n) & (1ULL << 11) ? 11 :	\
+		(n) & (1ULL << 10) ? 10 :	\
+		(n) & (1ULL <<  9) ?  9 :	\
+		(n) & (1ULL <<  8) ?  8 :	\
+		(n) & (1ULL <<  7) ?  7 :	\
+		(n) & (1ULL <<  6) ?  6 :	\
+		(n) & (1ULL <<  5) ?  5 :	\
+		(n) & (1ULL <<  4) ?  4 :	\
+		(n) & (1ULL <<  3) ?  3 :	\
+		(n) & (1ULL <<  2) ?  2 :	\
+		1 ) :				\
+	(sizeof(n) <= 4) ? /* otherwise: pick a helper by operand size */ \
+	__ilog2_u32(n) :			\
+	__ilog2_u64(n)				\
+ )
+
+#endif /* ILOG2_H */