summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--benchmarks/ewma.h71
-rw-r--r--benchmarks/gem_wsim.c133
-rw-r--r--benchmarks/ilog2.h104
3 files changed, 288 insertions, 20 deletions
diff --git a/benchmarks/ewma.h b/benchmarks/ewma.h
new file mode 100644
index 00000000..8711004e
--- /dev/null
+++ b/benchmarks/ewma.h
@@ -0,0 +1,71 @@
+#ifndef EWMA_H
+#define EWMA_H
+
+#include <ilog2.h>
+
+#define BUILD_BUG_ON(expr) _Static_assert(!(expr), "BUILD_BUG_ON(" #expr ")") /* break the build when the constant expression expr is true */
+#define BUILD_BUG_ON_NOT_POWER_OF_2(expr) BUILD_BUG_ON((expr) == 0 || (((expr) & ((expr) - 1)) != 0)) /* break the build unless expr is a non-zero power of two */
+
+/*
+ * Exponentially weighted moving average (EWMA)
+ *
+ * This implements a fixed-precision EWMA algorithm, with both the
+ * precision and fall-off coefficient determined at compile-time
+ * and built into the generated helper functions.
+ *
+ * The first argument to the macro, T, is the integer type used for
+ * the state and the samples; the second is the name that will be
+ * used for the struct and helper functions.
+ * The third argument, the precision, expresses how many bits are
+ * used for the fractional part of the fixed-precision values.
+ *
+ * The fourth argument, the weight reciprocal, determines how the
+ * new values will be weighed vs. the old state: new values will
+ * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note
+ * that this parameter must be a power of two for efficiency.
+ */
+
+#define DECLARE_EWMA(T, name, _precision, _weight_rcp) /* emits struct ewma_<name> plus its _init/_read/_add helpers */ \
+ struct ewma_##name { \
+ T internal; /* running average, kept scaled by 2^_precision */ \
+ }; \
+ static inline void ewma_##name##_init(struct ewma_##name *e) \
+ { \
+ BUILD_BUG_ON(!__builtin_constant_p(_precision)); \
+ BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \
+ /* \
+ * Even if you want to feed it just 0/1 you should have \
+ * some bits for the non-fractional part... \
+ */ \
+ BUILD_BUG_ON((_precision) > 30); \
+ BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \
+ e->internal = 0; /* zero is what _add treats as "no sample yet" */ \
+ } \
+ static inline T \
+ ewma_##name##_read(struct ewma_##name *e) \
+ { \
+ BUILD_BUG_ON(!__builtin_constant_p(_precision)); \
+ BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \
+ BUILD_BUG_ON((_precision) > 30); \
+ BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \
+ return e->internal >> (_precision); /* drop the fractional bits */ \
+ } \
+ static inline void ewma_##name##_add(struct ewma_##name *e, \
+ T val) \
+ { \
+ const T weight_rcp = ilog2(_weight_rcp); /* log2 of the reciprocal, used as a shift count below */ \
+ const T precision = _precision; \
+ T internal = e->internal; \
+ \
+ BUILD_BUG_ON(!__builtin_constant_p(_precision)); \
+ BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \
+ BUILD_BUG_ON((_precision) > 30); \
+ BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \
+ \
+ e->internal = internal ? /* non-zero state: blend; zero state: seed with the sample */ \
+ (((internal << weight_rcp) - internal) + /* old * (_weight_rcp - 1) */ \
+ (val << precision)) >> weight_rcp : /* plus scaled sample, all divided by _weight_rcp */ \
+ (val << precision); \
+ }
+
+#endif /* EWMA_H */
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
index 47b070e2..9f690369 100644
--- a/benchmarks/gem_wsim.c
+++ b/benchmarks/gem_wsim.c
@@ -50,6 +50,8 @@
#include "igt_aux.h"
#include "igt_rand.h"
+#include "ewma.h"
+
enum intel_engine_id {
RCS,
BCS,
@@ -103,6 +105,8 @@ struct w_step
unsigned int mapped_len;
};
+DECLARE_EWMA(uint64_t, rt, 4, 2) /* struct ewma_rt and ewma_rt_{init,read,add}: 4 fractional bits, each new sample weighted 1/2 */
+
struct workload
{
unsigned int nr_steps;
@@ -127,6 +131,13 @@ struct workload
struct igt_list requests[NUM_ENGINES];
unsigned int nrequest[NUM_ENGINES];
+
+ union {
+ struct rtavg {
+ struct ewma_rt avg[NUM_ENGINES];
+ uint32_t last[NUM_ENGINES];
+ } rt;
+ };
};
static const unsigned int nop_calibration_us = 1000;
@@ -788,12 +799,31 @@ static const struct workload_balancer qd_balancer = {
};
static enum intel_engine_id
+__rt_select_engine(struct workload *wrk, unsigned long *qd, bool random) /* choose between the two VCS candidates from qd[VCS1]/qd[VCS2]; both entries are modified in place */
+{
+ unsigned int n;
+
+ qd[VCS1] >>= 10; /* compare at 1024-unit granularity: differences in the low 10 bits become ties */
+ qd[VCS2] >>= 10;
+
+ if (qd[VCS1] < qd[VCS2]) /* the smaller value wins */
+ n = 0;
+ else if (qd[VCS2] < qd[VCS1])
+ n = 1;
+ else if (random) /* tie: take one bit from the workload's PRNG */
+ n = hars_petruska_f54_1_random(&wrk->prng) & 1;
+ else /* tie: fall back to the stored round-robin index */
+ n = wrk->vcs_rr;
+ wrk->vcs_rr = n ^ 1; /* not part of the else: always stores the opposite index for the next tie */
+
+ return get_vcs_engine(n); /* presumably maps 0/1 to VCS1/VCS2 - helper is defined outside this hunk */
+}
+
+static enum intel_engine_id
__rt_balance(const struct workload_balancer *balancer,
struct workload *wrk, struct w_step *w, bool random)
{
- enum intel_engine_id engine;
- long qd[NUM_ENGINES];
- unsigned int n;
+ unsigned long qd[NUM_ENGINES];
igt_assert(w->engine == VCS);
@@ -827,22 +857,7 @@ __rt_balance(const struct workload_balancer *balancer,
qd[VCS2]);
#endif
- qd[VCS1] >>= 10;
- qd[VCS2] >>= 10;
-
- if (qd[VCS1] < qd[VCS2])
- n = 0;
- else if (qd[VCS2] < qd[VCS1])
- n = 1;
- else if (random)
- n = hars_petruska_f54_1_random(&wrk->prng) & 1;
- else
- n = wrk->vcs_rr;
-
- engine = get_vcs_engine(n);
- wrk->vcs_rr = n ^ 1;
-
- return engine;
+ return __rt_select_engine(wrk, qd, random);
}
static enum intel_engine_id
@@ -870,6 +885,68 @@ static const struct workload_balancer rtr_balancer = {
.balance = rtr_balance,
};
+static enum intel_engine_id
+rtavg_balance(const struct workload_balancer *balancer, /* balance callback: weighs each VCS engine by get_qd() result times averaged runtime */
+ struct workload *wrk, struct w_step *w)
+{
+ unsigned long qd[NUM_ENGINES]; /* only the VCS1 and VCS2 entries are written here */
+
+ igt_assert(w->engine == VCS);
+
+ /* Estimate the average "speed" of the most recent batches
+ * (finish time - submit time)
+ * and use that as an approximation of the total remaining time for
+ * all batches on that engine plus the time we expect to execute in.
+ * We try to keep the total remaining time balanced between the engines.
+ */
+ if (wrk->status_page[VCS_SEQNO_IDX(VCS1)] != wrk->rt.last[VCS1]) { /* value changed since the last sample was folded in */
+ igt_assert((long)(wrk->status_page[2] - wrk->status_page[1]) > 0); /* NOTE(review): if status_page holds 32-bit unsigned values, this cast cannot go negative where long is 64-bit, so only 0 is rejected - confirm intent */
+ ewma_rt_add(&wrk->rt.avg[VCS1], /* presumably slots 1 and 2 are the submit/finish timestamps - layout is defined outside this hunk */
+ wrk->status_page[2] - wrk->status_page[1]);
+ wrk->rt.last[VCS1] = wrk->status_page[VCS_SEQNO_IDX(VCS1)]; /* remember which value this sample belongs to */
+ }
+
+ qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
+ wrk->qd_sum[VCS1] += qd[VCS1]; /* accumulate the raw value before it is scaled */
+ qd[VCS1] = (qd[VCS1] + 1) * ewma_rt_read(&wrk->rt.avg[VCS1]); /* (get_qd() result + 1) times the averaged runtime */
+
+#ifdef DEBUG /* NOTE(review): %ld is used for the uint64_t average and the unsigned long qd; an unsigned conversion would match the types */
+ printf("qd[0] = %d (%d - %d) x %ld (%d) = %ld\n",
+ wrk->seqno[VCS1] - wrk->status_page[0],
+ wrk->seqno[VCS1], wrk->status_page[0],
+ ewma_rt_read(&wrk->rt.avg[VCS1]),
+ wrk->status_page[2] - wrk->status_page[1],
+ qd[VCS1]);
+#endif
+
+ if (wrk->status_page[VCS_SEQNO_IDX(VCS2)] != wrk->rt.last[VCS2]) { /* same steps for VCS2, reading slots 16 entries further on */
+ igt_assert((long)(wrk->status_page[2+16] - wrk->status_page[1+16]) > 0); /* NOTE(review): same cast caveat as the VCS1 assert above */
+ ewma_rt_add(&wrk->rt.avg[VCS2],
+ wrk->status_page[2+16] - wrk->status_page[1+16]);
+ wrk->rt.last[VCS2] = wrk->status_page[VCS_SEQNO_IDX(VCS2)];
+ }
+
+ qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
+ wrk->qd_sum[VCS2] += qd[VCS2];
+ qd[VCS2] = (qd[VCS2] + 1) * ewma_rt_read(&wrk->rt.avg[VCS2]);
+
+#ifdef DEBUG /* NOTE(review): same format-specifier caveat as the VCS1 debug print */
+ printf("qd[1] = %d (%d - %d) x %ld (%d) = %ld\n",
+ wrk->seqno[VCS2] - wrk->status_page[16],
+ wrk->seqno[VCS2], wrk->status_page[16],
+ ewma_rt_read(&wrk->rt.avg[VCS2]),
+ wrk->status_page[18] - wrk->status_page[17],
+ qd[VCS2]);
+#endif
+
+ return __rt_select_engine(wrk, qd, false); /* false: ties are broken by wrk->vcs_rr, not by the PRNG */
+}
+
+static const struct workload_balancer rtavg_balancer = { /* installed by the "Using rtavg balancer" case in main() */
+ .get_qd = get_qd_depth,
+ .balance = rtavg_balance,
+};
+
static void
update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
{
@@ -1277,7 +1354,7 @@ add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
static int parse_balancing_mode(char *str)
{
- const char *modes[] = { "rr", "qd", "rt", "rtr" };
+ const char *modes[] = { "rr", "qd", "rt", "rtr" , "rtavg" };
int mode = -1;
unsigned int i;
@@ -1394,27 +1471,43 @@ int main(int argc, char **argv)
}
switch (i) {
case 0:
+ if (!quiet)
+ printf("Using rr balancer\n");
balancer = &rr_balancer;
flags |= BALANCE;
break;
case 1:
+ if (!quiet)
+ printf("Using qd balancer\n");
igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
8);
balancer = &qd_balancer;
flags |= SEQNO | BALANCE;
break;
case 2:
+ if (!quiet)
+ printf("Using rt balancer\n");
igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
8);
balancer = &rt_balancer;
flags |= SEQNO | BALANCE | RT;
break;
case 3:
+ if (!quiet)
+ printf("Using rtr balancer\n");
igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
8);
balancer = &rtr_balancer;
flags |= SEQNO | BALANCE | RT;
break;
+ case 4:
+ if (!quiet)
+ printf("Using rtavg balancer\n");
+ igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
+ 8);
+ balancer = &rtavg_balancer;
+ flags |= SEQNO | BALANCE | RT;
+ break;
default:
if (!quiet)
fprintf(stderr,
diff --git a/benchmarks/ilog2.h b/benchmarks/ilog2.h
new file mode 100644
index 00000000..596d7c23
--- /dev/null
+++ b/benchmarks/ilog2.h
@@ -0,0 +1,104 @@
+#ifndef ILOG2_H
+#define ILOG2_H
+
+#include <stdint.h>
+
+static inline int fls(int x) /* find last set: 1-based position of the most significant set bit of x */
+{
+ int r = -1;
+ asm("bsrl %1,%0" : "=r" (r) : "rm" (x), "0" (-1)); /* the "0" (-1) input preloads the output register with -1 */
+ return r + 1; /* NOTE(review): yields 0 for x == 0 only if bsrl leaves the preloaded -1 untouched - confirm for the target CPUs */
+}
+
+static inline int fls64(uint64_t x) /* 64-bit find last set; takes uint64_t because <stdint.h>, the only header included here, does not declare __u64 */
+{
+ int r = -1; /* value kept when x == 0, assuming bsrq leaves its destination untouched for a zero source - TODO confirm */
+ asm("bsrq %1,%q0" : "+r" (r) : "rm" (x)); /* %q0 names the 64-bit form of r's register */
+ return r + 1; /* 1-based position of the most significant set bit */
+}
+
+static inline __attribute__((const))
+int __ilog2_u32(uint32_t n) /* run-time helper behind ilog2() for operands of at most 4 bytes */
+{
+ return fls(n) - 1; /* fls() is 1-based, so subtract one; evaluates to -1 whenever fls() returns 0 */
+}
+
+static inline __attribute__((const))
+int __ilog2_u64(uint64_t n) /* run-time helper behind ilog2() for operands wider than 4 bytes */
+{
+ return fls64(n) - 1; /* fls64() is 1-based, so subtract one; evaluates to -1 whenever fls64() returns 0 */
+}
+
+#define ilog2(n) /* index of the highest set bit of n: ternary chain for constants, fls-based helpers otherwise */ \
+( \
+ __builtin_constant_p(n) ? ( /* constant operand: the whole chain can be folded by the compiler */ \
+ (n) < 2 ? 0 : /* anything below 2 yields 0 */ \
+ (n) & (1ULL << 63) ? 63 : \
+ (n) & (1ULL << 62) ? 62 : \
+ (n) & (1ULL << 61) ? 61 : \
+ (n) & (1ULL << 60) ? 60 : \
+ (n) & (1ULL << 59) ? 59 : \
+ (n) & (1ULL << 58) ? 58 : \
+ (n) & (1ULL << 57) ? 57 : \
+ (n) & (1ULL << 56) ? 56 : \
+ (n) & (1ULL << 55) ? 55 : \
+ (n) & (1ULL << 54) ? 54 : \
+ (n) & (1ULL << 53) ? 53 : \
+ (n) & (1ULL << 52) ? 52 : \
+ (n) & (1ULL << 51) ? 51 : \
+ (n) & (1ULL << 50) ? 50 : \
+ (n) & (1ULL << 49) ? 49 : \
+ (n) & (1ULL << 48) ? 48 : \
+ (n) & (1ULL << 47) ? 47 : \
+ (n) & (1ULL << 46) ? 46 : \
+ (n) & (1ULL << 45) ? 45 : \
+ (n) & (1ULL << 44) ? 44 : \
+ (n) & (1ULL << 43) ? 43 : \
+ (n) & (1ULL << 42) ? 42 : \
+ (n) & (1ULL << 41) ? 41 : \
+ (n) & (1ULL << 40) ? 40 : \
+ (n) & (1ULL << 39) ? 39 : \
+ (n) & (1ULL << 38) ? 38 : \
+ (n) & (1ULL << 37) ? 37 : \
+ (n) & (1ULL << 36) ? 36 : \
+ (n) & (1ULL << 35) ? 35 : \
+ (n) & (1ULL << 34) ? 34 : \
+ (n) & (1ULL << 33) ? 33 : \
+ (n) & (1ULL << 32) ? 32 : \
+ (n) & (1ULL << 31) ? 31 : \
+ (n) & (1ULL << 30) ? 30 : \
+ (n) & (1ULL << 29) ? 29 : \
+ (n) & (1ULL << 28) ? 28 : \
+ (n) & (1ULL << 27) ? 27 : \
+ (n) & (1ULL << 26) ? 26 : \
+ (n) & (1ULL << 25) ? 25 : \
+ (n) & (1ULL << 24) ? 24 : \
+ (n) & (1ULL << 23) ? 23 : \
+ (n) & (1ULL << 22) ? 22 : \
+ (n) & (1ULL << 21) ? 21 : \
+ (n) & (1ULL << 20) ? 20 : \
+ (n) & (1ULL << 19) ? 19 : \
+ (n) & (1ULL << 18) ? 18 : \
+ (n) & (1ULL << 17) ? 17 : \
+ (n) & (1ULL << 16) ? 16 : \
+ (n) & (1ULL << 15) ? 15 : \
+ (n) & (1ULL << 14) ? 14 : \
+ (n) & (1ULL << 13) ? 13 : \
+ (n) & (1ULL << 12) ? 12 : \
+ (n) & (1ULL << 11) ? 11 : \
+ (n) & (1ULL << 10) ? 10 : \
+ (n) & (1ULL << 9) ? 9 : \
+ (n) & (1ULL << 8) ? 8 : \
+ (n) & (1ULL << 7) ? 7 : \
+ (n) & (1ULL << 6) ? 6 : \
+ (n) & (1ULL << 5) ? 5 : \
+ (n) & (1ULL << 4) ? 4 : \
+ (n) & (1ULL << 3) ? 3 : \
+ (n) & (1ULL << 2) ? 2 : \
+ 1 ) : /* only the values 2 and 3 fall through to here */ \
+ (sizeof(n) <= 4) ? /* non-constant operand: pick the helper by operand size */ \
+ __ilog2_u32(n) : \
+ __ilog2_u64(n) \
+ )
+
+#endif /* ILOG2_H */