diff options
-rw-r--r-- | benchmarks/ewma.h | 71 | ||||
-rw-r--r-- | benchmarks/gem_wsim.c | 133 | ||||
-rw-r--r-- | benchmarks/ilog2.h | 104 |
3 files changed, 288 insertions, 20 deletions
diff --git a/benchmarks/ewma.h b/benchmarks/ewma.h new file mode 100644 index 00000000..8711004e --- /dev/null +++ b/benchmarks/ewma.h @@ -0,0 +1,71 @@ +#ifndef EWMA_H +#define EWMA_H + +#include <ilog2.h> + +#define BUILD_BUG_ON(expr) +#define BUILD_BUG_ON_NOT_POWER_OF_2(expr) + +/* + * Exponentially weighted moving average (EWMA) + * + * This implements a fixed-precision EWMA algorithm, with both the + * precision and fall-off coefficient determined at compile-time + * and built into the generated helper functions. + * + * The first argument to the macro is the integer type that holds the + * state; the second is the name used for the struct and helper functions. + * + * The third argument, the precision, expresses how many bits are + * used for the fractional part of the fixed-precision values. + * + * The fourth argument, the weight reciprocal, determines how the + * new values will be weighted against the old state: new values + * get weight 1/weight_rcp and old values 1-1/weight_rcp. Note + * that this parameter must be a power of two for efficiency. + */ + +#define DECLARE_EWMA(T, name, _precision, _weight_rcp) \ + struct ewma_##name { \ + T internal; \ + }; \ + static inline void ewma_##name##_init(struct ewma_##name *e) \ + { \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + /* \ + * Even if you want to feed it just 0/1 you should have \ + * some bits for the non-fractional part... 
\ + */ \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ + e->internal = 0; \ + } \ + static inline T \ + ewma_##name##_read(struct ewma_##name *e) \ + { \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ + return e->internal >> (_precision); \ + } \ + static inline void ewma_##name##_add(struct ewma_##name *e, \ + T val) \ + { \ + const T weight_rcp = ilog2(_weight_rcp); \ + const T precision = _precision; \ + T internal = e->internal; \ + \ + BUILD_BUG_ON(!__builtin_constant_p(_precision)); \ + BUILD_BUG_ON(!__builtin_constant_p(_weight_rcp)); \ + BUILD_BUG_ON((_precision) > 30); \ + BUILD_BUG_ON_NOT_POWER_OF_2(_weight_rcp); \ + \ + e->internal = internal ? \ + (((internal << weight_rcp) - internal) + \ + (val << precision)) >> weight_rcp : \ + (val << precision); \ + } + +#endif /* EWMA_H */ diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c index 47b070e2..9f690369 100644 --- a/benchmarks/gem_wsim.c +++ b/benchmarks/gem_wsim.c @@ -50,6 +50,8 @@ #include "igt_aux.h" #include "igt_rand.h" +#include "ewma.h" + enum intel_engine_id { RCS, BCS, @@ -103,6 +105,8 @@ struct w_step unsigned int mapped_len; }; +DECLARE_EWMA(uint64_t, rt, 4, 2) + struct workload { unsigned int nr_steps; @@ -127,6 +131,13 @@ struct workload struct igt_list requests[NUM_ENGINES]; unsigned int nrequest[NUM_ENGINES]; + + union { + struct rtavg { + struct ewma_rt avg[NUM_ENGINES]; + uint32_t last[NUM_ENGINES]; + } rt; + }; }; static const unsigned int nop_calibration_us = 1000; @@ -788,12 +799,31 @@ static const struct workload_balancer qd_balancer = { }; static enum intel_engine_id +__rt_select_engine(struct workload *wrk, unsigned long *qd, bool random) +{ + unsigned int n; + + qd[VCS1] >>= 10; + qd[VCS2] >>= 10; + + if (qd[VCS1] < qd[VCS2]) + n = 0; + else if (qd[VCS2] < qd[VCS1]) + n = 1; + else if (random) + 
n = hars_petruska_f54_1_random(&wrk->prng) & 1; + else + n = wrk->vcs_rr; + wrk->vcs_rr = n ^ 1; + + return get_vcs_engine(n); +} + +static enum intel_engine_id __rt_balance(const struct workload_balancer *balancer, struct workload *wrk, struct w_step *w, bool random) { - enum intel_engine_id engine; - long qd[NUM_ENGINES]; - unsigned int n; + unsigned long qd[NUM_ENGINES]; igt_assert(w->engine == VCS); @@ -827,22 +857,7 @@ __rt_balance(const struct workload_balancer *balancer, qd[VCS2]); #endif - qd[VCS1] >>= 10; - qd[VCS2] >>= 10; - - if (qd[VCS1] < qd[VCS2]) - n = 0; - else if (qd[VCS2] < qd[VCS1]) - n = 1; - else if (random) - n = hars_petruska_f54_1_random(&wrk->prng) & 1; - else - n = wrk->vcs_rr; - - engine = get_vcs_engine(n); - wrk->vcs_rr = n ^ 1; - - return engine; + return __rt_select_engine(wrk, qd, random); } static enum intel_engine_id @@ -870,6 +885,68 @@ static const struct workload_balancer rtr_balancer = { .balance = rtr_balance, }; +static enum intel_engine_id +rtavg_balance(const struct workload_balancer *balancer, + struct workload *wrk, struct w_step *w) +{ + unsigned long qd[NUM_ENGINES]; + + igt_assert(w->engine == VCS); + + /* Estimate the average "speed" of the most recent batches + * (finish time - submit time) + * and use that as an approximation of the total remaining time for + * all batches on that engine plus the time we expect to execute in. + * We try to keep the total remaining time balanced between the engines. 
+ */ + if (wrk->status_page[VCS_SEQNO_IDX(VCS1)] != wrk->rt.last[VCS1]) { + igt_assert((long)(wrk->status_page[2] - wrk->status_page[1]) > 0); + ewma_rt_add(&wrk->rt.avg[VCS1], + wrk->status_page[2] - wrk->status_page[1]); + wrk->rt.last[VCS1] = wrk->status_page[VCS_SEQNO_IDX(VCS1)]; + } + + qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1); + wrk->qd_sum[VCS1] += qd[VCS1]; + qd[VCS1] = (qd[VCS1] + 1) * ewma_rt_read(&wrk->rt.avg[VCS1]); + +#ifdef DEBUG + printf("qd[0] = %d (%d - %d) x %ld (%d) = %ld\n", + wrk->seqno[VCS1] - wrk->status_page[0], + wrk->seqno[VCS1], wrk->status_page[0], + ewma_rt_read(&wrk->rt.avg[VCS1]), + wrk->status_page[2] - wrk->status_page[1], + qd[VCS1]); +#endif + + if (wrk->status_page[VCS_SEQNO_IDX(VCS2)] != wrk->rt.last[VCS2]) { + igt_assert((long)(wrk->status_page[2+16] - wrk->status_page[1+16]) > 0); + ewma_rt_add(&wrk->rt.avg[VCS2], + wrk->status_page[2+16] - wrk->status_page[1+16]); + wrk->rt.last[VCS2] = wrk->status_page[VCS_SEQNO_IDX(VCS2)]; + } + + qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2); + wrk->qd_sum[VCS2] += qd[VCS2]; + qd[VCS2] = (qd[VCS2] + 1) * ewma_rt_read(&wrk->rt.avg[VCS2]); + +#ifdef DEBUG + printf("qd[1] = %d (%d - %d) x %ld (%d) = %ld\n", + wrk->seqno[VCS2] - wrk->status_page[16], + wrk->seqno[VCS2], wrk->status_page[16], + ewma_rt_read(&wrk->rt.avg[VCS2]), + wrk->status_page[18] - wrk->status_page[17], + qd[VCS2]); +#endif + + return __rt_select_engine(wrk, qd, false); +} + +static const struct workload_balancer rtavg_balancer = { + .get_qd = get_qd_depth, + .balance = rtavg_balance, +}; + static void update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno) { @@ -1277,7 +1354,7 @@ add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg) static int parse_balancing_mode(char *str) { - const char *modes[] = { "rr", "qd", "rt", "rtr" }; + const char *modes[] = { "rr", "qd", "rt", "rtr" , "rtavg" }; int mode = -1; unsigned int i; @@ -1394,27 +1471,43 @@ int main(int argc, char **argv) 
} switch (i) { case 0: + if (!quiet) + printf("Using rr balancer\n"); balancer = &rr_balancer; flags |= BALANCE; break; case 1: + if (!quiet) + printf("Using qd balancer\n"); igt_assert(intel_gen(intel_get_drm_devid(fd)) >= 8); balancer = &qd_balancer; flags |= SEQNO | BALANCE; break; case 2: + if (!quiet) + printf("Using rt balancer\n"); igt_assert(intel_gen(intel_get_drm_devid(fd)) >= 8); balancer = &rt_balancer; flags |= SEQNO | BALANCE | RT; break; case 3: + if (!quiet) + printf("Using rtr balancer\n"); igt_assert(intel_gen(intel_get_drm_devid(fd)) >= 8); balancer = &rtr_balancer; flags |= SEQNO | BALANCE | RT; break; + case 4: + if (!quiet) + printf("Using rtavg balancer\n"); + igt_assert(intel_gen(intel_get_drm_devid(fd)) >= + 8); + balancer = &rtavg_balancer; + flags |= SEQNO | BALANCE | RT; + break; default: if (!quiet) fprintf(stderr, diff --git a/benchmarks/ilog2.h b/benchmarks/ilog2.h new file mode 100644 index 00000000..596d7c23 --- /dev/null +++ b/benchmarks/ilog2.h @@ -0,0 +1,104 @@ +#ifndef ILOG2_H +#define ILOG2_H + +#include <stdint.h> + +static inline int fls(int x) +{ + int r = -1; + asm("bsrl %1,%0" : "=r" (r) : "rm" (x), "0" (-1)); + return r + 1; +} + +static inline int fls64(__u64 x) +{ + int r = -1; + asm("bsrq %1,%q0" : "+r" (r) : "rm" (x)); + return r + 1; +} + +static inline __attribute__((const)) +int __ilog2_u32(uint32_t n) +{ + return fls(n) - 1; +} + +static inline __attribute__((const)) +int __ilog2_u64(uint64_t n) +{ + return fls64(n) - 1; +} + +#define ilog2(n) \ +( \ + __builtin_constant_p(n) ? ( \ + (n) < 2 ? 0 : \ + (n) & (1ULL << 63) ? 63 : \ + (n) & (1ULL << 62) ? 62 : \ + (n) & (1ULL << 61) ? 61 : \ + (n) & (1ULL << 60) ? 60 : \ + (n) & (1ULL << 59) ? 59 : \ + (n) & (1ULL << 58) ? 58 : \ + (n) & (1ULL << 57) ? 57 : \ + (n) & (1ULL << 56) ? 56 : \ + (n) & (1ULL << 55) ? 55 : \ + (n) & (1ULL << 54) ? 54 : \ + (n) & (1ULL << 53) ? 53 : \ + (n) & (1ULL << 52) ? 52 : \ + (n) & (1ULL << 51) ? 51 : \ + (n) & (1ULL << 50) ? 
50 : \ + (n) & (1ULL << 49) ? 49 : \ + (n) & (1ULL << 48) ? 48 : \ + (n) & (1ULL << 47) ? 47 : \ + (n) & (1ULL << 46) ? 46 : \ + (n) & (1ULL << 45) ? 45 : \ + (n) & (1ULL << 44) ? 44 : \ + (n) & (1ULL << 43) ? 43 : \ + (n) & (1ULL << 42) ? 42 : \ + (n) & (1ULL << 41) ? 41 : \ + (n) & (1ULL << 40) ? 40 : \ + (n) & (1ULL << 39) ? 39 : \ + (n) & (1ULL << 38) ? 38 : \ + (n) & (1ULL << 37) ? 37 : \ + (n) & (1ULL << 36) ? 36 : \ + (n) & (1ULL << 35) ? 35 : \ + (n) & (1ULL << 34) ? 34 : \ + (n) & (1ULL << 33) ? 33 : \ + (n) & (1ULL << 32) ? 32 : \ + (n) & (1ULL << 31) ? 31 : \ + (n) & (1ULL << 30) ? 30 : \ + (n) & (1ULL << 29) ? 29 : \ + (n) & (1ULL << 28) ? 28 : \ + (n) & (1ULL << 27) ? 27 : \ + (n) & (1ULL << 26) ? 26 : \ + (n) & (1ULL << 25) ? 25 : \ + (n) & (1ULL << 24) ? 24 : \ + (n) & (1ULL << 23) ? 23 : \ + (n) & (1ULL << 22) ? 22 : \ + (n) & (1ULL << 21) ? 21 : \ + (n) & (1ULL << 20) ? 20 : \ + (n) & (1ULL << 19) ? 19 : \ + (n) & (1ULL << 18) ? 18 : \ + (n) & (1ULL << 17) ? 17 : \ + (n) & (1ULL << 16) ? 16 : \ + (n) & (1ULL << 15) ? 15 : \ + (n) & (1ULL << 14) ? 14 : \ + (n) & (1ULL << 13) ? 13 : \ + (n) & (1ULL << 12) ? 12 : \ + (n) & (1ULL << 11) ? 11 : \ + (n) & (1ULL << 10) ? 10 : \ + (n) & (1ULL << 9) ? 9 : \ + (n) & (1ULL << 8) ? 8 : \ + (n) & (1ULL << 7) ? 7 : \ + (n) & (1ULL << 6) ? 6 : \ + (n) & (1ULL << 5) ? 5 : \ + (n) & (1ULL << 4) ? 4 : \ + (n) & (1ULL << 3) ? 3 : \ + (n) & (1ULL << 2) ? 2 : \ + 1 ) : \ + (sizeof(n) <= 4) ? \ + __ilog2_u32(n) : \ + __ilog2_u64(n) \ + ) + +#endif /* ILOG2_H */ |