author    Tvrtko Ursulin <tvrtko.ursulin@intel.com>  2017-03-30 14:32:29 +0100
committer Tvrtko Ursulin <tvrtko.ursulin@intel.com>  2017-04-25 14:53:46 +0100
commit    054eb1abecd1cce2e4ee0516f3ff8a67a35dca22 (patch)
tree      8083331eb239e3c5f49d776adac040f3fffff717 /benchmarks
parent    cf6f2c9be161e3ca6dd210f4d576cea52883c6bc (diff)
benchmarks/gem_wsim: Command submission workload simulator
Tool which emits batch buffers to engines with configurable sequences,
durations, contexts, dependencies and userspace waits.

Unfinished but shows promise so sending out for early feedback.

v2:
 * Load workload descriptors from files. (also -w)
 * Help text.
 * Calibration control if needed. (-t)
 * NORELOC | LUT to eb flags.
 * Added sample workload to wsim/workload1.

v3:
 * Multiple parallel different workloads (-w -w ...).
 * Multi-context workloads.
 * Variable (random) batch length.
 * Load balancing (round robin and queue depth estimation).
 * Workload delays and explicit sync steps.
 * Workload frequency (period) control.

v4:
 * Fixed queue-depth estimation by creating separate batches per engine
   when qd load balancing is on.
 * Dropped separate -s cmd line option. It can turn itself on
   automatically when needed.
 * Keep a single status page and lie about the write hazard as suggested
   by Chris.
 * Use batch_start_offset for controlling the batch duration. (Chris)
 * Set status page object cache level. (Chris)
 * Moved workload description to a README.
 * Tidied example workloads.
 * Some other cleanups and refactorings.

v5:
 * Master and background workloads (-W / -w).
 * Single batch per step is enough even when balancing. (Chris)
 * Use hars_petruska_f54_1_random IGT functions and seed to zero at
   start. (Chris)
 * Use WC cache domain when WC mapping. (Chris)
 * Keep seqnos 64 bytes apart in the status page. (Chris)
 * Add workload throttling and queue-depth throttling commands. (Chris)

v6:
 * Added two more workloads.
 * Merged RT balancer from Chris.

v7:
 * Merged NO_RELOC patch from Chris.
 * Added missing RT balancer to help text.

TODO list:
 * Fence support.
 * Batch buffer caching (re-use pool).
 * Better error handling.
 * Less 1980's workload parsing.
 * More workloads.
 * Threads?
 * ... ?

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Diffstat (limited to 'benchmarks')
 -rw-r--r--  benchmarks/Makefile.sources                     1
 -rw-r--r--  benchmarks/gem_wsim.c                        1320
 -rw-r--r--  benchmarks/wsim/README                         56
 -rw-r--r--  benchmarks/wsim/media_17i7.wsim                 7
 -rw-r--r--  benchmarks/wsim/media_19.wsim                  10
 -rw-r--r--  benchmarks/wsim/media_load_balance_17i7.wsim    7
 -rw-r--r--  benchmarks/wsim/media_load_balance_19.wsim     10
 -rw-r--r--  benchmarks/wsim/vcs1.wsim                      26
 -rw-r--r--  benchmarks/wsim/vcs_balanced.wsim              26
 9 files changed, 1463 insertions(+), 0 deletions(-)
diff --git a/benchmarks/Makefile.sources b/benchmarks/Makefile.sources
index 3af54ebe..3a941150 100644
--- a/benchmarks/Makefile.sources
+++ b/benchmarks/Makefile.sources
@@ -14,6 +14,7 @@ benchmarks_prog_list = \
gem_prw \
gem_set_domain \
gem_syslatency \
+ gem_wsim \
kms_vblank \
prime_lookup \
vgem_mmap \
diff --git a/benchmarks/gem_wsim.c b/benchmarks/gem_wsim.c
new file mode 100644
index 00000000..21445cac
--- /dev/null
+++ b/benchmarks/gem_wsim.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright © 2017 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <poll.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <assert.h>
+#include <limits.h>
+
+
+#include "intel_chipset.h"
+#include "drm.h"
+#include "ioctl_wrappers.h"
+#include "drmtest.h"
+#include "intel_io.h"
+#include "igt_rand.h"
+
+enum intel_engine_id {
+ RCS,
+ BCS,
+ VCS,
+ VCS1,
+ VCS2,
+ VECS,
+ NUM_ENGINES
+};
+
+struct duration {
+ unsigned int min, max;
+};
+
+enum w_type
+{
+ BATCH,
+ SYNC,
+ DELAY,
+ PERIOD,
+ THROTTLE,
+ QD_THROTTLE
+};
+
+struct w_step
+{
+ /* Workload step metadata */
+ enum w_type type;
+ unsigned int context;
+ unsigned int engine;
+ struct duration duration;
+ int dependency;
+ int wait;
+
+ /* Implementation details */
+ unsigned int idx;
+
+ struct drm_i915_gem_execbuffer2 eb;
+ struct drm_i915_gem_exec_object2 obj[4];
+ struct drm_i915_gem_relocation_entry reloc[3];
+ unsigned long bb_sz;
+ uint32_t bb_handle;
+ uint32_t *mapped_batch;
+ uint32_t *seqno_value;
+ uint32_t *seqno_address;
+ uint32_t *rt0_value;
+ uint32_t *rt0_address;
+ uint32_t *rt1_address;
+ unsigned int mapped_len;
+};
+
+struct workload
+{
+ unsigned int nr_steps;
+ struct w_step *steps;
+
+ struct timespec repeat_start;
+
+ int pipe[2];
+
+ unsigned int nr_ctxs;
+ uint32_t *ctx_id;
+
+ uint32_t seqno[NUM_ENGINES];
+ uint32_t status_page_handle;
+ uint32_t *status_page;
+ unsigned int vcs_rr;
+
+ unsigned long qd_sum[NUM_ENGINES];
+ unsigned long nr_bb[NUM_ENGINES];
+};
+
+static const unsigned int eb_engine_map[NUM_ENGINES] = {
+ [RCS] = I915_EXEC_RENDER,
+ [BCS] = I915_EXEC_BLT,
+ [VCS] = I915_EXEC_BSD,
+ [VCS1] = I915_EXEC_BSD | I915_EXEC_BSD_RING1,
+ [VCS2] = I915_EXEC_BSD | I915_EXEC_BSD_RING2,
+ [VECS] = I915_EXEC_VEBOX
+};
+
+static const unsigned int nop_calibration_us = 1000;
+static unsigned long nop_calibration;
+
+static bool quiet;
+static int fd;
+
+#define SWAPVCS (1<<0)
+#define SEQNO (1<<1)
+#define BALANCE (1<<2)
+#define RT (1<<3)
+
+#define VCS_SEQNO_IDX(engine) (((engine) - VCS1) * 16)
+#define VCS_SEQNO_OFFSET(engine) (VCS_SEQNO_IDX(engine) * sizeof(uint32_t))
+
+#define RCS_TIMESTAMP (0x2000 + 0x358)
+#define REG(x) (volatile uint32_t *)((volatile char *)igt_global_mmio + x)
+
+/*
+ * Workload descriptor:
+ *
+ * ctx.engine.duration.dependency.wait,...
+ * <uint>.<str>.<uint>.<int <= 0>.<0|1>,...
+ *
+ * Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+ *
+ * "1.VCS1.3000.0.1,1.RCS.1000.-1.0,1.RCS.3700.0.0,1.RCS.1000.-2.0,1.VCS2.2300.-2.0,1.RCS.4700.-1.0,1.VCS2.600.-1.1"
+ */
+
+static const char *ring_str_map[NUM_ENGINES] = {
+ [RCS] = "RCS",
+ [BCS] = "BCS",
+ [VCS] = "VCS",
+ [VCS1] = "VCS1",
+ [VCS2] = "VCS2",
+ [VECS] = "VECS",
+};
+
+static struct workload *parse_workload(char *_desc)
+{
+ struct workload *wrk;
+ unsigned int nr_steps = 0;
+ char *desc = strdup(_desc);
+ char *_token, *token, *tctx = NULL, *tstart = desc;
+ char *field, *fctx = NULL, *fstart;
+ struct w_step step, *steps = NULL;
+ unsigned int valid;
+ int tmp;
+
+ while ((_token = strtok_r(tstart, ",", &tctx)) != NULL) {
+ tstart = NULL;
+ token = strdup(_token);
+ fstart = token;
+ valid = 0;
+ memset(&step, 0, sizeof(step));
+
+ if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+ fstart = NULL;
+
+ if (!strcasecmp(field, "d")) {
+ if ((field = strtok_r(fstart, ".", &fctx)) !=
+ NULL) {
+ tmp = atoi(field);
+ if (tmp <= 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid delay at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+
+ step.type = DELAY;
+ step.wait = tmp;
+ goto add_step;
+ }
+ } else if (!strcasecmp(field, "p")) {
+ if ((field = strtok_r(fstart, ".", &fctx)) !=
+ NULL) {
+ tmp = atoi(field);
+ if (tmp <= 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid period at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+
+ step.type = PERIOD;
+ step.wait = tmp;
+ goto add_step;
+ }
+ } else if (!strcasecmp(field, "s")) {
+ if ((field = strtok_r(fstart, ".", &fctx)) !=
+ NULL) {
+ tmp = atoi(field);
+ if (tmp >= 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid sync target at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+
+ step.type = SYNC;
+ step.wait = tmp;
+ goto add_step;
+ }
+ } else if (!strcasecmp(field, "t")) {
+ if ((field = strtok_r(fstart, ".", &fctx)) !=
+ NULL) {
+ tmp = atoi(field);
+ if (tmp < 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid throttle at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+
+ step.type = THROTTLE;
+ step.wait = tmp;
+ goto add_step;
+ }
+ } else if (!strcasecmp(field, "q")) {
+ if ((field = strtok_r(fstart, ".", &fctx)) !=
+ NULL) {
+ tmp = atoi(field);
+ if (tmp < 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid qd throttle at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+
+ step.type = QD_THROTTLE;
+ step.wait = tmp;
+ goto add_step;
+ }
+ }
+
+ tmp = atoi(field);
+ if (tmp < 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid ctx id at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+ step.context = tmp;
+
+ valid++;
+ }
+
+ if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+ unsigned int i, old_valid = valid;
+
+ fstart = NULL;
+
+ for (i = 0; i < ARRAY_SIZE(ring_str_map); i++) {
+ if (!strcasecmp(field, ring_str_map[i])) {
+ step.engine = i;
+ valid++;
+ break;
+ }
+ }
+
+ if (old_valid == valid) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid engine id at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+ }
+
+ if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+ char *sep = NULL;
+ long int tmpl;
+
+ fstart = NULL;
+
+ tmpl = strtol(field, &sep, 10);
+ if (tmpl == LONG_MIN || tmpl == LONG_MAX) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid duration at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+ step.duration.min = tmpl;
+
+ if (sep && *sep == '-') {
+ tmpl = strtol(sep + 1, NULL, 10);
+ if (tmpl == LONG_MIN || tmpl == LONG_MAX) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid duration range at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+ step.duration.max = tmpl;
+ } else {
+ step.duration.max = step.duration.min;
+ }
+
+ valid++;
+ }
+
+ if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+ fstart = NULL;
+
+ tmp = atoi(field);
+ if (tmp > 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid forward dependency at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+ step.dependency = tmp;
+
+ valid++;
+ }
+
+ if ((field = strtok_r(fstart, ".", &fctx)) != NULL) {
+ fstart = NULL;
+
+ tmp = atoi(field);
+ if (tmp != 0 && tmp != 1) {
+ if (!quiet)
+ fprintf(stderr,
+ "Invalid wait boolean at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+ step.wait = tmp;
+
+ valid++;
+ }
+
+ if (valid != 5) {
+ if (!quiet)
+ fprintf(stderr, "Invalid record at step %u!\n",
+ nr_steps);
+ return NULL;
+ }
+
+ step.type = BATCH;
+
+add_step:
+ step.idx = nr_steps++;
+ steps = realloc(steps, sizeof(step) * nr_steps);
+ igt_assert(steps);
+
+ memcpy(&steps[nr_steps - 1], &step, sizeof(step));
+
+ free(token);
+ }
+
+ wrk = malloc(sizeof(*wrk));
+ igt_assert(wrk);
+
+ wrk->nr_steps = nr_steps;
+ wrk->steps = steps;
+
+ free(desc);
+
+ return wrk;
+}
+
+static struct workload *
+clone_workload(struct workload *_wrk)
+{
+ struct workload *wrk;
+
+ wrk = malloc(sizeof(*wrk));
+ igt_assert(wrk);
+ memset(wrk, 0, sizeof(*wrk));
+
+ wrk->nr_steps = _wrk->nr_steps;
+ wrk->steps = calloc(wrk->nr_steps, sizeof(struct w_step));
+ igt_assert(wrk->steps);
+
+ memcpy(wrk->steps, _wrk->steps, sizeof(struct w_step) * wrk->nr_steps);
+
+ return wrk;
+}
+
+#define rounddown(x, y) ((x) - ((x) % (y)))
+#ifndef PAGE_SIZE
+#define PAGE_SIZE (4096)
+#endif
+
+static unsigned int get_duration(struct duration *dur)
+{
+ if (dur->min == dur->max)
+ return dur->min;
+ else
+ return dur->min + hars_petruska_f54_1_random_unsafe() %
+ (dur->max + 1 - dur->min);
+}
+
+static unsigned long get_bb_sz(unsigned int duration)
+{
+ return ALIGN(duration * nop_calibration * sizeof(uint32_t) /
+ nop_calibration_us, sizeof(uint32_t));
+}
+
+static void
+terminate_bb(struct w_step *w, unsigned int flags)
+{
+ const uint32_t bbe = 0xa << 23;
+ unsigned long mmap_start, mmap_len;
+ unsigned long batch_start = w->bb_sz;
+ uint32_t *ptr, *cs;
+
+ igt_assert(((flags & RT) && (flags & SEQNO)) || !(flags & RT));
+
+ batch_start -= sizeof(uint32_t); /* bbend */
+ if (flags & SEQNO)
+ batch_start -= 4 * sizeof(uint32_t);
+ if (flags & RT)
+ batch_start -= 8 * sizeof(uint32_t);
+
+ mmap_start = rounddown(batch_start, PAGE_SIZE);
+ mmap_len = w->bb_sz - mmap_start;
+
+ gem_set_domain(fd, w->bb_handle,
+ I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+
+ ptr = gem_mmap__wc(fd, w->bb_handle, mmap_start, mmap_len, PROT_WRITE);
+ cs = (uint32_t *)((char *)ptr + batch_start - mmap_start);
+
+ if (flags & SEQNO) {
+ w->reloc[0].offset = batch_start + sizeof(uint32_t);
+ batch_start += 4 * sizeof(uint32_t);
+
+ *cs++ = MI_STORE_DWORD_IMM;
+ w->seqno_address = cs;
+ *cs++ = 0;
+ *cs++ = 0;
+ w->seqno_value = cs;
+ *cs++ = 0;
+ }
+
+ if (flags & RT) {
+ w->reloc[1].offset = batch_start + sizeof(uint32_t);
+ batch_start += 4 * sizeof(uint32_t);
+
+ *cs++ = MI_STORE_DWORD_IMM;
+ w->rt0_address = cs;
+ *cs++ = 0;
+ *cs++ = 0;
+ w->rt0_value = cs;
+ *cs++ = 0;
+
+ w->reloc[2].offset = batch_start + 2 * sizeof(uint32_t);
+ batch_start += 4 * sizeof(uint32_t);
+
+ *cs++ = 0x24 << 23 | 2; /* MI_STORE_REG_MEM */
+ *cs++ = RCS_TIMESTAMP;
+ w->rt1_address = cs;
+ *cs++ = 0;
+ *cs++ = 0;
+ }
+
+ *cs = bbe;
+
+ w->mapped_batch = ptr;
+ w->mapped_len = mmap_len;
+}
+
+static void
+eb_update_flags(struct w_step *w, enum intel_engine_id engine,
+ unsigned int flags)
+{
+ w->eb.flags = eb_engine_map[engine];
+ w->eb.flags |= I915_EXEC_HANDLE_LUT;
+ w->eb.flags |= I915_EXEC_NO_RELOC;
+}
+
+static void
+alloc_step_batch(struct workload *wrk, struct w_step *w, unsigned int flags)
+{
+ enum intel_engine_id engine = w->engine;
+ unsigned int bb_i, j = 0;
+
+ w->obj[j].handle = gem_create(fd, 4096);
+ w->obj[j].flags = EXEC_OBJECT_WRITE;
+ j++;
+
+ if (flags & SEQNO) {
+ w->obj[j].handle = wrk->status_page_handle;
+ j++;
+ }
+
+ bb_i = j++;
+ w->bb_sz = get_bb_sz(w->duration.max);
+ w->bb_handle = w->obj[bb_i].handle = gem_create(fd, w->bb_sz);
+ terminate_bb(w, flags);
+
+ igt_assert(w->dependency <= 0);
+ if (w->dependency) {
+ int dep_idx = w->idx + w->dependency;
+
+ igt_assert(dep_idx >= 0 && dep_idx < wrk->nr_steps);
+ igt_assert(wrk->steps[dep_idx].type == BATCH);
+
+ w->obj[j].handle = w->obj[bb_i].handle;
+ bb_i = j;
+ w->obj[j - 1].handle = wrk->steps[dep_idx].obj[0].handle;
+ j++;
+ }
+
+ if (flags & SEQNO) {
+ w->obj[bb_i].relocs_ptr = to_user_pointer(&w->reloc);
+ if (flags & RT)
+ w->obj[bb_i].relocation_count = 3;
+ else
+ w->obj[bb_i].relocation_count = 1;
+ for (int i = 0; i < w->obj[bb_i].relocation_count; i++)
+ w->reloc[i].target_handle = 1;
+ }
+
+ w->eb.buffers_ptr = to_user_pointer(w->obj);
+ w->eb.buffer_count = j;
+ w->eb.rsvd1 = wrk->ctx_id[w->context];
+
+ if (flags & SWAPVCS && engine == VCS1)
+ engine = VCS2;
+ else if (flags & SWAPVCS && engine == VCS2)
+ engine = VCS1;
+ eb_update_flags(w, engine, flags);
+#ifdef DEBUG
+ printf("%u: %u:%x|%x|%x|%x %10lu flags=%llx bb=%x[%u] ctx[%u]=%u\n",
+ w->idx, w->eb.buffer_count, w->obj[0].handle,
+ w->obj[1].handle, w->obj[2].handle, w->obj[3].handle,
+ w->bb_sz, w->eb.flags, w->bb_handle, bb_i,
+ w->context, wrk->ctx_id[w->context]);
+#endif
+}
+
+static void
+prepare_workload(struct workload *wrk, unsigned int flags)
+{
+ int max_ctx = -1;
+ struct w_step *w;
+ int i;
+
+ if (flags & SEQNO) {
+ const unsigned int status_sz = sizeof(uint32_t);
+ uint32_t handle = gem_create(fd, status_sz);
+
+ gem_set_caching(fd, handle, I915_CACHING_CACHED);
+ wrk->status_page_handle = handle;
+ wrk->status_page = gem_mmap__cpu(fd, handle, 0, status_sz,
+ PROT_READ);
+ }
+
+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+ if ((int)w->context > max_ctx) {
+ int delta = w->context + 1 - wrk->nr_ctxs;
+
+ wrk->nr_ctxs += delta;
+ wrk->ctx_id = realloc(wrk->ctx_id,
+ wrk->nr_ctxs * sizeof(uint32_t));
+ memset(&wrk->ctx_id[wrk->nr_ctxs - delta], 0,
+ delta * sizeof(uint32_t));
+
+ max_ctx = w->context;
+ }
+
+ if (!wrk->ctx_id[w->context]) {
+ struct drm_i915_gem_context_create arg = {};
+
+ drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE, &arg);
+ igt_assert(arg.ctx_id);
+
+ wrk->ctx_id[w->context] = arg.ctx_id;
+ }
+ }
+
+ for (i = 0, w = wrk->steps; i < wrk->nr_steps; i++, w++) {
+ unsigned int _flags = flags;
+ enum intel_engine_id engine = w->engine;
+
+ if (w->type != BATCH)
+ continue;
+
+ if (engine != VCS && engine != VCS1 && engine != VCS2)
+ _flags &= ~(SEQNO | RT);
+
+ if (engine == VCS)
+ _flags &= ~SWAPVCS;
+
+ alloc_step_batch(wrk, w, _flags);
+ }
+}
+
+static double elapsed(const struct timespec *start, const struct timespec *end)
+{
+ return (end->tv_sec - start->tv_sec) +
+ (end->tv_nsec - start->tv_nsec) / 1e9;
+}
+
+static int elapsed_us(const struct timespec *start, const struct timespec *end)
+{
+ return elapsed(start, end) * 1e6;
+}
+
+static enum intel_engine_id get_vcs_engine(unsigned int n)
+{
+ const enum intel_engine_id vcs_engines[2] = { VCS1, VCS2 };
+
+ igt_assert(n < ARRAY_SIZE(vcs_engines));
+
+ return vcs_engines[n];
+}
+
+struct workload_balancer {
+ unsigned int (*get_qd)(const struct workload_balancer *balancer,
+ struct workload *wrk,
+ enum intel_engine_id engine);
+ enum intel_engine_id (*balance)(const struct workload_balancer *balancer,
+ struct workload *wrk, struct w_step *w);
+};
+
+static enum intel_engine_id
+rr_balance(const struct workload_balancer *balancer,
+ struct workload *wrk, struct w_step *w)
+{
+ unsigned int engine;
+
+ engine = get_vcs_engine(wrk->vcs_rr);
+ wrk->vcs_rr ^= 1;
+
+ return engine;
+}
+
+static const struct workload_balancer rr_balancer = {
+ .balance = rr_balance,
+};
+
+static unsigned int
+get_qd_depth(const struct workload_balancer *balancer,
+ struct workload *wrk, enum intel_engine_id engine)
+{
+ return wrk->seqno[engine] -
+ wrk->status_page[VCS_SEQNO_IDX(engine)];
+}
+
+static enum intel_engine_id
+qd_balance(const struct workload_balancer *balancer,
+ struct workload *wrk, struct w_step *w)
+{
+ enum intel_engine_id engine;
+ long qd[NUM_ENGINES];
+ unsigned int n;
+
+ igt_assert(w->engine == VCS);
+
+ qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
+ wrk->qd_sum[VCS1] += qd[VCS1];
+
+ qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
+ wrk->qd_sum[VCS2] += qd[VCS2];
+
+ if (qd[VCS1] < qd[VCS2])
+ n = 0;
+ else if (qd[VCS2] < qd[VCS1])
+ n = 1;
+ else
+ n = wrk->vcs_rr;
+
+ engine = get_vcs_engine(n);
+ wrk->vcs_rr = n ^ 1;
+
+#ifdef DEBUG
+ printf("qd_balance: 1:%ld 2:%ld rr:%u = %u\t(%lu - %u) (%lu - %u)\n",
+ qd[VCS1], qd[VCS2], wrk->vcs_rr, engine,
+ wrk->seqno[VCS1], wrk->status_page[VCS_SEQNO_IDX(VCS1)],
+ wrk->seqno[VCS2], wrk->status_page[VCS_SEQNO_IDX(VCS2)]);
+#endif
+ return engine;
+}
+
+static const struct workload_balancer qd_balancer = {
+ .get_qd = get_qd_depth,
+ .balance = qd_balance,
+};
+
+static enum intel_engine_id
+rt_balance(const struct workload_balancer *balancer,
+ struct workload *wrk, struct w_step *w)
+{
+ enum intel_engine_id engine;
+ long qd[NUM_ENGINES];
+ unsigned int n;
+
+ igt_assert(w->engine == VCS);
+
+ /* Estimate the "speed" of the most recent batch
+ * (finish time - submit time) and use that as an approximation of
+ * the total remaining time for all batches on that engine. We try
+ * to keep the total remaining time balanced between the engines.
+ */
+ qd[VCS1] = balancer->get_qd(balancer, wrk, VCS1);
+ wrk->qd_sum[VCS1] += qd[VCS1];
+ qd[VCS1] *= wrk->status_page[2] - wrk->status_page[1];
+#ifdef DEBUG
+ printf("qd[0] = %d (%d - %d) x %d (%d - %d) = %ld\n",
+ wrk->seqno[VCS1] - wrk->status_page[0],
+ wrk->seqno[VCS1], wrk->status_page[0],
+ wrk->status_page[2] - wrk->status_page[1],
+ wrk->status_page[2], wrk->status_page[1],
+ qd[VCS1]);
+#endif
+
+ qd[VCS2] = balancer->get_qd(balancer, wrk, VCS2);
+ wrk->qd_sum[VCS2] += qd[VCS2];
+ qd[VCS2] *= wrk->status_page[2 + 16] - wrk->status_page[1 + 16];
+#ifdef DEBUG
+ printf("qd[1] = %d (%d - %d) x %d (%d - %d) = %ld\n",
+ wrk->seqno[VCS2] - wrk->status_page[16],
+ wrk->seqno[VCS2], wrk->status_page[16],
+ wrk->status_page[18] - wrk->status_page[17],
+ wrk->status_page[18], wrk->status_page[17],
+ qd[VCS2]);
+#endif
+
+ if (qd[VCS1] < qd[VCS2])
+ n = 0;
+ else if (qd[VCS2] < qd[VCS1])
+ n = 1;
+ else
+ n = wrk->vcs_rr;
+
+ engine = get_vcs_engine(n);
+ wrk->vcs_rr = n ^ 1;
+
+ return engine;
+}
+
+static const struct workload_balancer rt_balancer = {
+ .get_qd = get_qd_depth,
+ .balance = rt_balance,
+};
+
+static void
+update_bb_seqno(struct w_step *w, enum intel_engine_id engine, uint32_t seqno)
+{
+ igt_assert(engine == VCS1 || engine == VCS2);
+
+ gem_set_domain(fd, w->bb_handle,
+ I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+
+ w->reloc[0].delta = VCS_SEQNO_OFFSET(engine);
+
+ *w->seqno_value = seqno;
+ *w->seqno_address = w->reloc[0].presumed_offset + w->reloc[0].delta;
+
+ /* If not using NO_RELOC, force the relocations */
+ if (!(w->eb.flags & I915_EXEC_NO_RELOC))
+ w->reloc[0].presumed_offset = -1;
+}
+
+static void
+update_bb_rt(struct w_step *w, enum intel_engine_id engine)
+{
+ igt_assert(engine == VCS1 || engine == VCS2);
+
+ gem_set_domain(fd, w->bb_handle,
+ I915_GEM_DOMAIN_WC, I915_GEM_DOMAIN_WC);
+
+ w->reloc[1].delta = VCS_SEQNO_OFFSET(engine) + sizeof(uint32_t);
+ w->reloc[2].delta = VCS_SEQNO_OFFSET(engine) + 2 * sizeof(uint32_t);
+
+ *w->rt0_value = *REG(RCS_TIMESTAMP);
+ *w->rt0_address = w->reloc[1].presumed_offset + w->reloc[1].delta;
+ *w->rt1_address = w->reloc[1].presumed_offset + w->reloc[1].delta;
+
+ /* If not using NO_RELOC, force the relocations */
+ if (!(w->eb.flags & I915_EXEC_NO_RELOC)) {
+ w->reloc[1].presumed_offset = -1;
+ w->reloc[2].presumed_offset = -1;
+ }
+}
+
+static void w_sync_to(struct workload *wrk, struct w_step *w, int target)
+{
+ if (target < 0)
+ target = wrk->nr_steps + target;
+
+ igt_assert(target < wrk->nr_steps);
+
+ while (wrk->steps[target].type != BATCH) {
+ if (--target < 0)
+ target = wrk->nr_steps + target;
+ }
+
+ igt_assert(target < wrk->nr_steps);
+ igt_assert(wrk->steps[target].type == BATCH);
+
+ gem_sync(fd, wrk->steps[target].obj[0].handle);
+}
+
+static void
+run_workload(unsigned int id, struct workload *wrk,
+ bool background, int pipe_fd,
+ const struct workload_balancer *balancer,
+ unsigned int repeat,
+ unsigned int flags)
+{
+ struct timespec t_start, t_end;
+ struct w_step *w;
+ bool run = true;
+ int throttle = -1;
+ int qd_throttle = -1;
+ double t;
+ int i, j;
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+ hars_petruska_f54_1_random_seed(0);
+
+ for (j = 0; run && (background || j < repeat); j++) {
+ clock_gettime(CLOCK_MONOTONIC, &wrk->repeat_start);
+
+ for (i = 0, w = wrk->steps; run && (i < wrk->nr_steps);
+ i++, w++) {
+ enum intel_engine_id engine = w->engine;
+ int do_sleep = 0;
+
+ if (w->type == DELAY) {
+ do_sleep = w->wait;
+ } else if (w->type == PERIOD) {
+ struct timespec now;
+
+ clock_gettime(CLOCK_MONOTONIC, &now);
+ do_sleep = w->wait -
+ elapsed_us(&wrk->repeat_start, &now);
+ if (do_sleep < 0) {
+ if (!quiet)
+ printf("%u: Dropped period @ %u/%u (%dus late)!\n",
+ id, j, i, do_sleep);
+ continue;
+ }
+ } else if (w->type == SYNC) {
+ int s_idx = i + w->wait;
+
+ igt_assert(s_idx >= 0 && s_idx < i);
+ igt_assert(wrk->steps[s_idx].type == BATCH);
+ gem_sync(fd, wrk->steps[s_idx].obj[0].handle);
+ continue;
+ } else if (w->type == THROTTLE) {
+ throttle = w->wait;
+ continue;
+ } else if (w->type == QD_THROTTLE) {
+ qd_throttle = w->wait;
+ continue;
+ }
+
+ if (do_sleep) {
+ usleep(do_sleep);
+ continue;
+ }
+
+ wrk->nr_bb[engine]++;
+
+ if (engine == VCS && balancer) {
+ engine = balancer->balance(balancer, wrk, w);
+ wrk->nr_bb[engine]++;
+
+ eb_update_flags(w, engine, flags);
+
+ if (flags & SEQNO)
+ update_bb_seqno(w, engine,
+ ++wrk->seqno[engine]);
+ if (flags & RT)
+ update_bb_rt(w, engine);
+ }
+
+ if (w->duration.min != w->duration.max) {
+ unsigned int d = get_duration(&w->duration);
+ unsigned long offset;
+
+ offset = ALIGN(w->bb_sz - get_bb_sz(d),
+ 2 * sizeof(uint32_t));
+ w->eb.batch_start_offset = offset;
+ }
+
+ /* If the workload wants queue-depth throttling but no queue
+ * depth information is available, approximate it with normal
+ * throttling. */
+ if (qd_throttle > 0 && throttle < 0 &&
+ !(balancer && balancer->get_qd))
+ throttle = qd_throttle;
+
+ if (throttle > 0)
+ w_sync_to(wrk, w, i - throttle);
+
+ if (qd_throttle > 0 && balancer && balancer->get_qd) {
+ unsigned int target;
+
+ for (target = wrk->nr_steps - 1; target > 0;
+ target--) {
+ if (balancer->get_qd(balancer, wrk,
+ engine) <
+ qd_throttle)
+ break;
+ w_sync_to(wrk, w, i - target);
+ }
+ }
+
+ gem_execbuf(fd, &w->eb);
+
+ if (pipe_fd >= 0) {
+ struct pollfd fds;
+
+ fds.fd = pipe_fd;
+ fds.events = POLLHUP;
+ if (poll(&fds, 1, 0)) {
+ run = false;
+ break;
+ }
+ }
+
+ if (w->wait)
+ gem_sync(fd, w->obj[0].handle);
+ }
+ }
+
+ if (run)
+ gem_sync(fd, wrk->steps[wrk->nr_steps - 1].obj[0].handle);
+
+ clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+ t = elapsed(&t_start, &t_end);
+ if (!quiet && !balancer)
+ printf("%c%u: %.3fs elapsed (%.3f workloads/s)\n",
+ background ? ' ' : '*', id, t, repeat / t);
+ else if (!quiet && !balancer->get_qd)
+ printf("%c%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches.\n",
+ background ? ' ' : '*', id, t, repeat / t,
+ wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2]);
+ else if (!quiet && balancer)
+ printf("%c%u: %.3fs elapsed (%.3f workloads/s). %lu (%lu + %lu) total VCS batches. Average queue depths %.3f, %.3f.\n",
+ background ? ' ' : '*', id, t, repeat / t,
+ wrk->nr_bb[VCS], wrk->nr_bb[VCS1], wrk->nr_bb[VCS2],
+ (double)wrk->qd_sum[VCS1] / wrk->nr_bb[VCS],
+ (double)wrk->qd_sum[VCS2] / wrk->nr_bb[VCS]);
+}
+
+static void fini_workload(struct workload *wrk)
+{
+ free(wrk->steps);
+ free(wrk);
+}
+
+static unsigned long calibrate_nop(unsigned int tolerance_pct)
+{
+ const uint32_t bbe = 0xa << 23;
+ unsigned int loops = 17;
+ unsigned int usecs = nop_calibration_us;
+ struct drm_i915_gem_exec_object2 obj = {};
+ struct drm_i915_gem_execbuffer2 eb =
+ { .buffer_count = 1, .buffers_ptr = (uintptr_t)&obj};
+ long size, last_size;
+ struct timespec t_0, t_end;
+
+ clock_gettime(CLOCK_MONOTONIC, &t_0);
+
+ size = 256 * 1024;
+ do {
+ struct timespec t_start;
+
+ obj.handle = gem_create(fd, size);
+ gem_write(fd, obj.handle, size - sizeof(bbe), &bbe,
+ sizeof(bbe));
+ gem_execbuf(fd, &eb);
+ gem_sync(fd, obj.handle);
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+ for (int loop = 0; loop < loops; loop++)
+ gem_execbuf(fd, &eb);
+ gem_sync(fd, obj.handle);
+ clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+ gem_close(fd, obj.handle);
+
+ last_size = size;
+ size = loops * size / elapsed(&t_start, &t_end) / 1e6 * usecs;
+ size = ALIGN(size, sizeof(uint32_t));
+ } while (elapsed(&t_0, &t_end) < 5 ||
+ abs(size - last_size) > (size * tolerance_pct / 100));
+
+ return size / sizeof(uint32_t);
+}
+
+static void print_help(void)
+{
+ puts(
+"Usage: gem_wsim [OPTIONS]\n"
+"\n"
+"Runs a simulated workload on the GPU.\n"
+"When ran without arguments performs a GPU calibration result of which needs\n"
+"to be provided when running the simulation in subsequent invocations.\n"
+"\n"
+"Options:\n"
+" -h This text.\n"
+" -q Be quiet - do not output anything to stdout.\n"
+" -n <n> Nop calibration value.\n"
+" -t <n> Nop calibration tolerance percentage.\n"
+" Use when there is a difficulty obtaining calibration\n"
+" with the default settings.\n"
+" -w <desc|path> Filename or a workload descriptor.\n"
+" Can be given multiple times.\n"
+" -W <desc|path> Filename or a master workload descriptor.\n"
+" Only one master workload can be optinally specified\n"
+" in which case all other workloads become background\n"
+" ones and run as long as the master.\n"
+" -r <n> How many times to emit the workload.\n"
+" -c <n> Fork N clients emitting the workload simultaneously.\n"
+" -x Swap VCS1 and VCS2 engines in every other client.\n"
+" -b <n> Load balancing to use. (0: rr, 1: qd, 2: rt)\n"
+ );
+}
+
+static char *load_workload_descriptor(char *filename)
+{
+ struct stat sbuf;
+ char *buf;
+ int infd, ret, i;
+ ssize_t len;
+
+ ret = stat(filename, &sbuf);
+ if (ret || !S_ISREG(sbuf.st_mode))
+ return filename;
+
+ igt_assert(sbuf.st_size < 1024 * 1024); /* Just so. */
+ buf = malloc(sbuf.st_size + 1);
+ igt_assert(buf);
+
+ infd = open(filename, O_RDONLY);
+ igt_assert(infd >= 0);
+ len = read(infd, buf, sbuf.st_size);
+ igt_assert(len == sbuf.st_size);
+ close(infd);
+ buf[len] = 0; /* NUL terminate the descriptor string. */
+
+ for (i = 0; i < len; i++) {
+ if (buf[i] == '\n')
+ buf[i] = ',';
+ }
+
+ len--;
+ while (len >= 0 && buf[len] == ',')
+ buf[len--] = 0;
+
+ return buf;
+}
+
+static char **
+add_workload_arg(char **w_args, unsigned int nr_args, char *w_arg)
+{
+ w_args = realloc(w_args, sizeof(char *) * nr_args);
+ igt_assert(w_args);
+ w_args[nr_args - 1] = w_arg;
+
+ return w_args;
+}
+
+int main(int argc, char **argv)
+{
+ unsigned int repeat = 1;
+ unsigned int clients = 1;
+ unsigned int flags = 0;
+ struct timespec t_start, t_end;
+ struct workload **w, **wrk = NULL;
+ unsigned int nr_w_args = 0;
+ int master_workload = -1;
+ char **w_args = NULL;
+ unsigned int tolerance_pct = 1;
+ const struct workload_balancer *balancer = NULL;
+ double t;
+ int i, c;
+
+ fd = drm_open_driver(DRIVER_INTEL);
+ intel_register_access_init(intel_get_pci_device(), false, fd);
+
+ while ((c = getopt(argc, argv, "qc:n:r:xw:W:t:b:h")) != -1) {
+ switch (c) {
+ case 'W':
+ if (master_workload >= 0) {
+ if (!quiet)
+ fprintf(stderr,
+ "Only one master workload can be given!\n");
+ return 1;
+ }
+ master_workload = nr_w_args;
+ /* Fall through */
+ case 'w':
+ w_args = add_workload_arg(w_args, ++nr_w_args, optarg);
+ break;
+ case 'c':
+ clients = strtol(optarg, NULL, 0);
+ break;
+ case 't':
+ tolerance_pct = strtol(optarg, NULL, 0);
+ break;
+ case 'n':
+ nop_calibration = strtol(optarg, NULL, 0);
+ break;
+ case 'r':
+ repeat = strtol(optarg, NULL, 0);
+ break;
+ case 'q':
+ quiet = true;
+ break;
+ case 'x':
+ flags |= SWAPVCS;
+ break;
+ case 'b':
+ switch (strtol(optarg, NULL, 0)) {
+ case 0:
+ balancer = &rr_balancer;
+ flags |= BALANCE;
+ break;
+ case 1:
+ igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
+ 8);
+ balancer = &qd_balancer;
+ flags |= SEQNO | BALANCE;
+ break;
+ case 2:
+ igt_assert(intel_gen(intel_get_drm_devid(fd)) >=
+ 8);
+ balancer = &rt_balancer;
+ flags |= SEQNO | BALANCE | RT;
+ break;
+ default:
+ if (!quiet)
+ fprintf(stderr,
+ "Unknown balancing mode '%s'!\n",
+ optarg);
+ return 1;
+ }
+ break;
+ case 'h':
+ print_help();
+ return 0;
+ default:
+ return 1;
+ }
+ }
+
+ if (!nop_calibration) {
+ if (!quiet)
+ printf("Calibrating nop delay with %u%% tolerance...\n",
+ tolerance_pct);
+ nop_calibration = calibrate_nop(tolerance_pct);
+ if (!quiet)
+ printf("Nop calibration for %uus delay is %lu.\n",
+ nop_calibration_us, nop_calibration);
+
+ return 0;
+ }
+
+ if (!nr_w_args) {
+ if (!quiet)
+ fprintf(stderr, "No workload descriptor(s)!\n");
+ return 1;
+ }
+
+ if (nr_w_args > 1 && clients > 1) {
+ if (!quiet)
+ fprintf(stderr,
+ "Cloned clients cannot be combined with multiple workloads!\n");
+ return 1;
+ }
+
+ wrk = calloc(nr_w_args, sizeof(*wrk));
+ igt_assert(wrk);
+
+ for (i = 0; i < nr_w_args; i++) {
+ w_args[i] = load_workload_descriptor(w_args[i]);
+ if (!w_args[i]) {
+ if (!quiet)
+ fprintf(stderr,
+ "Failed to load workload descriptor %u!\n",
+ i);
+ return 1;
+ }
+
+ wrk[i] = parse_workload(w_args[i]);
+ if (!wrk[i]) {
+ if (!quiet)
+ fprintf(stderr,
+ "Failed to parse workload %u!\n", i);
+ return 1;
+ }
+ }
+
+ if (nr_w_args > 1)
+ clients = nr_w_args;
+
+ if (!quiet) {
+ printf("Using %lu nop calibration for %uus delay.\n",
+ nop_calibration, nop_calibration_us);
+ printf("%u client%s.\n", clients, clients > 1 ? "s" : "");
+ if (flags & SWAPVCS)
+ printf("Swapping VCS rings between clients.\n");
+ }
+
+ if (master_workload >= 0 && clients == 1)
+ master_workload = -1;
+
+ w = calloc(clients, sizeof(struct workload *));
+ igt_assert(w);
+
+ for (i = 0; i < clients; i++) {
+ unsigned int flags_ = flags;
+
+ w[i] = clone_workload(wrk[nr_w_args > 1 ? i : 0]);
+
+ if (master_workload >= 0) {
+ int ret = pipe(w[i]->pipe);
+
+ igt_assert(ret == 0);
+ }
+
+ if (flags & SWAPVCS && i & 1)
+ flags_ &= ~SWAPVCS;
+
+ prepare_workload(w[i], flags_);
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+
+ igt_fork(child, clients) {
+ int pipe_fd = -1;
+ bool background = false;
+
+ if (master_workload >= 0) {
+ close(w[child]->pipe[0]);
+ if (child != master_workload) {
+ pipe_fd = w[child]->pipe[1];
+ background = true;
+ } else {
+ close(w[child]->pipe[1]);
+ }
+ }
+
+ run_workload(child, w[child], background, pipe_fd, balancer,
+ repeat, flags);
+ }
+
+ if (master_workload >= 0) {
+ int status = -1;
+ pid_t pid;
+
+ for (i = 0; i < clients; i++)
+ close(w[i]->pipe[1]);
+
+ pid = wait(&status);
+ if (pid >= 0)
+ igt_child_done(pid);
+
+ for (i = 0; i < clients; i++)
+ close(w[i]->pipe[0]);
+ }
+
+ igt_waitchildren();
+
+ clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+ t = elapsed(&t_start, &t_end);
+ if (!quiet)
+ printf("%.3fs elapsed (%.3f workloads/s)\n",
+ t, clients * repeat / t);
+
+ for (i = 0; i < clients; i++)
+ fini_workload(w[i]);
+ free(w);
+ for (i = 0; i < nr_w_args; i++)
+ fini_workload(wrk[i]);
+ free(w_args);
+
+ return 0;
+}
diff --git a/benchmarks/wsim/README b/benchmarks/wsim/README
new file mode 100644
index 00000000..7aa0694a
--- /dev/null
+++ b/benchmarks/wsim/README
@@ -0,0 +1,56 @@
+Workload descriptor format
+==========================
+
+ctx.engine.duration_us.dependency.wait,...
+<uint>.<str>.<uint>[-<uint>].<int <= 0>.<0|1>,...
+d|p|s|t|q.<uint>,...
+
+For duration a range can be given from which a random value will be picked
+before every submit. Since this and seqno management require CPU access to
+objects, care needs to be taken to ensure the submit queue is deep enough
+that these operations do not affect the execution speed unless that is
+desired.
+
+Additional workload steps are also supported:
+
+ 'd' - Adds a delay (in microseconds).
+ 'p' - Adds a delay relative to the start of the previous loop so that each
+ loop starts execution with a given period.
+ 's' - Synchronises the pipeline to a batch relative to the step.
+ 't' - Throttles so that at most n batches are in flight.
+ 'q' - Throttles to a maximum queue depth of n (see the example after this
+ list).
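+
+For instance, a fragment like the one below (values are illustrative only)
+asks the tool, when run with a queue-depth aware balancer (-b 1 or -b 2), to
+keep at most three unfinished batches queued while submitting the steps which
+follow it:
+
+ q.3
+ 0.VCS.500-2000.0.0
+ 0.VCS.500-2000.0.0
+ 0.VCS.500-2000.0.0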
+
+Engine ids: RCS, BCS, VCS, VCS1, VCS2, VECS
+
+Example (leading spaces must not be present in the actual file):
+----------------------------------------------------------------
+
+ 1.VCS1.3000.0.1
+ 1.RCS.500-1000.-1.0
+ 1.RCS.3700.0.0
+ 1.RCS.1000.-2.0
+ 1.VCS2.2300.-2.0
+ 1.RCS.4700.-1.0
+ 1.VCS2.600.-1.1
+ p.16000
+
+The above workload described in human language works like this:
+
+ 1. A batch is sent to the VCS1 engine which will be executing for 3ms on the
+ GPU and userspace will wait until it is finished before proceeding.
+ 2-4. Now three batches are sent to RCS with durations of 0.5-1.0ms (random
+ duration range), 3.7ms and 1ms respectively. The first batch has a data
+ dependency on the preceding VCS1 batch, and the last of the group depends
+ on the first from the group.
+ 5. Now a 2.3ms batch is sent to VCS2, with a data dependency on the 3.7ms
+ RCS batch.
+ 6. This is followed by a 4.7ms RCS batch with a data dependency on the 2.3ms
+ VCS2 batch.
+ 7. Then a 0.6ms VCS2 batch is sent depending on the previous RCS one. In the
+ same step the tool is told to wait until the batch completes before
+ proceeding.
+ 8. Finally the tool is told to wait long enough to ensure the next iteration
+ starts 16ms after the previous one has started.
+
+When workload descriptors are provided on the command line, commas must be used
+instead of new lines.
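+
+For example, assuming a nop calibration value has already been obtained by a
+plain "gem_wsim" run, one of the bundled workloads could be exercised along
+these lines (the calibration number is only a placeholder):
+
+ gem_wsim -n 87654 -w benchmarks/wsim/media_17i7.wsim -r 100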
diff --git a/benchmarks/wsim/media_17i7.wsim b/benchmarks/wsim/media_17i7.wsim
new file mode 100644
index 00000000..5f533d8e
--- /dev/null
+++ b/benchmarks/wsim/media_17i7.wsim
@@ -0,0 +1,7 @@
+1.VCS1.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS2.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS2.600.-1.1
diff --git a/benchmarks/wsim/media_19.wsim b/benchmarks/wsim/media_19.wsim
new file mode 100644
index 00000000..f210d794
--- /dev/null
+++ b/benchmarks/wsim/media_19.wsim
@@ -0,0 +1,10 @@
+0.VECS.1400-1500.0.0
+0.RCS.1000-1500.-1.0
+s.-2
+2.VCS2.50-350.0.1
+1.VCS1.1300-1400.0.1
+0.VECS.1400-1500.0.0
+0.RCS.100-300.-1.1
+2.RCS.1300-1500.0.0
+2.VCS2.100-300.-1.1
+1.VCS1.900-1400.0.1
diff --git a/benchmarks/wsim/media_load_balance_17i7.wsim b/benchmarks/wsim/media_load_balance_17i7.wsim
new file mode 100644
index 00000000..25a69203
--- /dev/null
+++ b/benchmarks/wsim/media_load_balance_17i7.wsim
@@ -0,0 +1,7 @@
+1.VCS.3000.0.1
+1.RCS.1000.-1.0
+1.RCS.3700.0.0
+1.RCS.1000.-2.0
+1.VCS.2300.-2.0
+1.RCS.4700.-1.0
+1.VCS.600.-1.1
diff --git a/benchmarks/wsim/media_load_balance_19.wsim b/benchmarks/wsim/media_load_balance_19.wsim
new file mode 100644
index 00000000..03890776
--- /dev/null
+++ b/benchmarks/wsim/media_load_balance_19.wsim
@@ -0,0 +1,10 @@
+0.VECS.1400-1500.0.0
+0.RCS.1000-1500.-1.0
+s.-2
+1.VCS.50-350.0.1
+1.VCS.1300-1400.0.1
+0.VECS.1400-1500.0.0
+0.RCS.100-300.-1.1
+1.RCS.1300-1500.0.0
+1.VCS.100-300.-1.1
+1.VCS.900-1400.0.1
diff --git a/benchmarks/wsim/vcs1.wsim b/benchmarks/wsim/vcs1.wsim
new file mode 100644
index 00000000..9d3e682b
--- /dev/null
+++ b/benchmarks/wsim/vcs1.wsim
@@ -0,0 +1,26 @@
+t.5
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
+0.VCS1.500-2000.0.0
diff --git a/benchmarks/wsim/vcs_balanced.wsim b/benchmarks/wsim/vcs_balanced.wsim
new file mode 100644
index 00000000..e8958b8f
--- /dev/null
+++ b/benchmarks/wsim/vcs_balanced.wsim
@@ -0,0 +1,26 @@
+q.5
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0
+0.VCS.500-2000.0.0