summary refs log tree commit diff
path: root/benchmarks/gem_latency.c
diff options
context:
space:
mode:
author Chris Wilson <chris@chris-wilson.co.uk> 2015-12-19 12:07:26 +0000
committer Chris Wilson <chris@chris-wilson.co.uk> 2015-12-19 12:16:52 +0000
commit 646cab4c0c73dce310ff8020ffb96272647780ba (patch)
tree 46e9ac948531ed64513b9cc678458aefc2d2069b /benchmarks/gem_latency.c
parent e37a4c80922aaeb79a419a7d7f96b0ac4c6ffb88 (diff)
benchmarks/gem_latency: Split the nop/work/latency measurement
Split the distinct phases (generate interrupts, busywork, measure latency) into separate batches for finer control.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Diffstat (limited to 'benchmarks/gem_latency.c')
-rw-r--r-- benchmarks/gem_latency.c 204
1 files changed, 129 insertions, 75 deletions
diff --git a/benchmarks/gem_latency.c b/benchmarks/gem_latency.c
index 2571b65a..40251b77 100644
--- a/benchmarks/gem_latency.c
+++ b/benchmarks/gem_latency.c
@@ -57,9 +57,20 @@ struct consumer {
struct producer {
pthread_t thread;
uint32_t ctx;
- uint32_t nop_handle;
- struct drm_i915_gem_exec_object2 exec[2];
- struct drm_i915_gem_relocation_entry reloc[3];
+ struct {
+ struct drm_i915_gem_exec_object2 exec[1];
+ struct drm_i915_gem_execbuffer2 execbuf;
+ } nop_dispatch;
+ struct {
+ struct drm_i915_gem_exec_object2 exec[2];
+ struct drm_i915_gem_relocation_entry reloc[2];
+ struct drm_i915_gem_execbuffer2 execbuf;
+ } workload_dispatch;
+ struct {
+ struct drm_i915_gem_exec_object2 exec[1];
+ struct drm_i915_gem_relocation_entry reloc[1];
+ struct drm_i915_gem_execbuffer2 execbuf;
+ } latency_dispatch;
pthread_mutex_t lock;
pthread_cond_t p_cond, c_cond;
@@ -84,78 +95,114 @@ struct producer {
#define BCS_TIMESTAMP (0x22000 + 0x358)
-static void setup_workload(struct producer *p, int gen, uint32_t scratch)
+static uint32_t create_workload(int gen, uint32_t scratch)
{
const int has_64bit_reloc = gen >= 8;
- uint32_t *map;
+ uint32_t handle = gem_create(fd, 4096);
+ uint32_t buf[80];
int i = 0;
- p->exec[0].handle = scratch;
- p->exec[1].relocation_count = 3;
- p->exec[1].relocs_ptr = (uintptr_t)p->reloc;
- p->exec[1].handle = gem_create(fd, 4096);
- if (gem_has_llc(fd))
- map = gem_mmap__cpu(fd, p->exec[1].handle, 0, 4096, PROT_WRITE);
- else
- map = gem_mmap__gtt(fd, p->exec[1].handle, 4096, PROT_WRITE);
-
/* XY_SRC_COPY */
- map[i++] = COPY_BLT_CMD | BLT_WRITE_ALPHA | BLT_WRITE_RGB;
+ buf[i++] = COPY_BLT_CMD | BLT_WRITE_ALPHA | BLT_WRITE_RGB;
if (has_64bit_reloc)
- map[i-1] += 2;
- map[i++] = 0xcc << 16 | 1 << 25 | 1 << 24 | (4*WIDTH);
- map[i++] = 0;
- map[i++] = HEIGHT << 16 | WIDTH;
- p->reloc[0].offset = i * sizeof(uint32_t);
- p->reloc[0].delta = 0;
- p->reloc[0].target_handle = scratch;
- p->reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
- p->reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
- p->reloc[0].presumed_offset = 0;
- map[i++] = 0;
+ buf[i-1] += 2;
+ buf[i++] = 0xcc << 16 | 1 << 25 | 1 << 24 | (4*WIDTH);
+ buf[i++] = 0;
+ buf[i++] = HEIGHT << 16 | WIDTH;
+ buf[i++] = 0;
if (has_64bit_reloc)
- map[i++] = 0;
- map[i++] = 0;
- map[i++] = 4096;
- p->reloc[1].offset = i * sizeof(uint32_t);
- p->reloc[1].delta = 0;
- p->reloc[1].target_handle = scratch;
- p->reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
- p->reloc[1].write_domain = 0;
- p->reloc[1].presumed_offset = 0;
- map[i++] = 0;
+ buf[i++] = 0;
+ buf[i++] = 0;
+ buf[i++] = 4096;
+ buf[i++] = 0;
if (has_64bit_reloc)
- map[i++] = 0;
+ buf[i++] = 0;
+ buf[i++] = MI_BATCH_BUFFER_END;
- /* MI_FLUSH_DW */
- map[i++] = 0x26 << 23 | 1;
- if (has_64bit_reloc)
- map[i-1]++;
- map[i++] = 0;
- map[i++] = 0;
+ gem_write(fd, handle, 0, buf, i*sizeof(buf[0]));
+ return handle;
+}
+
+static void setup_workload(struct producer *p, int gen,
+ uint32_t scratch,
+ uint32_t batch)
+{
+ struct drm_i915_gem_execbuffer2 *eb;
+ const int has_64bit_reloc = gen >= 8;
+
+ p->workload_dispatch.exec[0].handle = scratch;
+ p->workload_dispatch.exec[1].relocation_count = 2;
+ p->workload_dispatch.exec[1].relocs_ptr = (uintptr_t)p->workload_dispatch.reloc;
+ p->workload_dispatch.exec[1].handle = batch;
+
+ p->workload_dispatch.reloc[0].offset = 4 * sizeof(uint32_t);
+ p->workload_dispatch.reloc[0].delta = 0;
+ p->workload_dispatch.reloc[0].target_handle = scratch;
+ p->workload_dispatch.reloc[0].read_domains = I915_GEM_DOMAIN_RENDER;
+ p->workload_dispatch.reloc[0].write_domain = I915_GEM_DOMAIN_RENDER;
+ p->workload_dispatch.reloc[0].presumed_offset = 0;
+
+ p->workload_dispatch.reloc[1].offset = 7 * sizeof(uint32_t);
if (has_64bit_reloc)
- map[i++] = 0;
+ p->workload_dispatch.reloc[1].offset += sizeof(uint32_t);
+ p->workload_dispatch.reloc[1].delta = 0;
+ p->workload_dispatch.reloc[1].target_handle = scratch;
+ p->workload_dispatch.reloc[1].read_domains = I915_GEM_DOMAIN_RENDER;
+ p->workload_dispatch.reloc[1].write_domain = 0;
+ p->workload_dispatch.reloc[1].presumed_offset = 0;
+
+ eb = memset(&p->workload_dispatch.execbuf, 0, sizeof(*eb));
+ eb->buffers_ptr = (uintptr_t)p->workload_dispatch.exec;
+ eb->buffer_count = 2;
+ eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
+ eb->rsvd1 = p->ctx;
+}
+
+static void setup_latency(struct producer *p, int gen)
+{
+ struct drm_i915_gem_execbuffer2 *eb;
+ const int has_64bit_reloc = gen >= 8;
+ uint32_t handle;
+ uint32_t *map;
+ int i = 0;
+
+ handle = gem_create(fd, 4096);
+ if (gem_has_llc(fd))
+ map = gem_mmap__cpu(fd, handle, 0, 4096, PROT_WRITE);
+ else
+ map = gem_mmap__gtt(fd, handle, 4096, PROT_WRITE);
+
+ p->latency_dispatch.exec[0].relocation_count = 1;
+ p->latency_dispatch.exec[0].relocs_ptr =
+ (uintptr_t)p->latency_dispatch.reloc;
+ p->latency_dispatch.exec[0].handle = handle;
/* MI_STORE_REG_MEM */
map[i++] = 0x24 << 23 | 1;
if (has_64bit_reloc)
map[i-1]++;
map[i++] = BCS_TIMESTAMP;
- p->reloc[2].offset = i * sizeof(uint32_t);
- p->reloc[2].delta = 4000;
- p->reloc[2].target_handle = p->exec[1].handle;
- p->reloc[2].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
- p->reloc[2].write_domain = 0; /* We lie! */
- p->reloc[2].presumed_offset = 0;
+ p->latency_dispatch.reloc[0].offset = i * sizeof(uint32_t);
+ p->latency_dispatch.reloc[0].delta = 4000;
+ p->latency_dispatch.reloc[0].target_handle = handle;
+ p->latency_dispatch.reloc[0].read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+ p->latency_dispatch.reloc[0].write_domain = 0; /* We lie! */
+ p->latency_dispatch.reloc[0].presumed_offset = 0;
p->last_timestamp = &map[1000];
map[i++] = 4000;
if (has_64bit_reloc)
map[i++] = 0;
map[i++] = MI_BATCH_BUFFER_END;
+
+ eb = memset(&p->latency_dispatch.execbuf, 0, sizeof(*eb));
+ eb->buffers_ptr = (uintptr_t)p->latency_dispatch.exec;
+ eb->buffer_count = 1;
+ eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
+ eb->rsvd1 = p->ctx;
}
-static uint32_t setup_nop(void)
+static uint32_t create_nop(void)
{
uint32_t buf = MI_BATCH_BUFFER_END;
uint32_t handle;
@@ -166,34 +213,31 @@ static uint32_t setup_nop(void)
return handle;
}
+static void setup_nop(struct producer *p, uint32_t batch)
+{
+ struct drm_i915_gem_execbuffer2 *eb;
+
+ p->nop_dispatch.exec[0].handle = batch;
+
+ eb = memset(&p->nop_dispatch.execbuf, 0, sizeof(*eb));
+ eb->buffers_ptr = (uintptr_t)p->nop_dispatch.exec;
+ eb->buffer_count = 1;
+ eb->flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
+ eb->rsvd1 = p->ctx;
+}
+
#define READ(x) *(volatile uint32_t *)((volatile char *)igt_global_mmio + x)
static void measure_latency(struct producer *p, igt_stats_t *stats)
{
- gem_sync(fd, p->exec[1].handle);
+ gem_sync(fd, p->latency_dispatch.exec[0].handle);
igt_stats_push(stats, READ(BCS_TIMESTAMP) - *p->last_timestamp);
}
static void *producer(void *arg)
{
struct producer *p = arg;
- struct drm_i915_gem_execbuffer2 nop, workload;
- struct drm_i915_gem_exec_object2 exec;
int n;
- memset(&exec, 0, sizeof(exec));
- exec.handle = p->nop_handle;
- memset(&nop, 0, sizeof(nop));
- nop.buffers_ptr = (uintptr_t)&exec;
- nop.buffer_count = 1;
- nop.flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
- nop.rsvd1 = p->ctx;
-
- memset(&workload, 0, sizeof(workload));
- workload.buffers_ptr = (uintptr_t)p->exec;
- workload.buffer_count = 2;
- workload.flags = I915_EXEC_BLT | LOCAL_EXEC_NO_RELOC;
- workload.rsvd1 = p->ctx;
-
while (!done) {
uint32_t start = READ(BCS_TIMESTAMP);
int batches;
@@ -206,7 +250,7 @@ static void *producer(void *arg)
*/
batches = p->nop;
while (batches--)
- gem_execbuf(fd, &nop);
+ gem_execbuf(fd, &p->nop_dispatch.execbuf);
/* Control the amount of work we do, similar to submitting
* empty buffers above, except this time we will load the
@@ -215,7 +259,12 @@ static void *producer(void *arg)
*/
batches = p->workload;
while (batches--)
- gem_execbuf(fd, &workload);
+ gem_execbuf(fd, &p->workload_dispatch.execbuf);
+
+ /* Finally, execute a batch that just reads the current
+ * TIMESTAMP so we can measure the latency.
+ */
+ gem_execbuf(fd, &p->latency_dispatch.execbuf);
/* Wake all the associated clients to wait upon our batch */
pthread_mutex_lock(&p->lock);
@@ -288,7 +337,9 @@ static int run(int seconds,
{
struct producer *p;
igt_stats_t latency, throughput;
- uint32_t scratch, batch;
+ uint32_t nop_batch;
+ uint32_t workload_batch;
+ uint32_t scratch;
int gen, n, m;
int complete;
int nrun;
@@ -305,16 +356,19 @@ static int run(int seconds,
intel_register_access_init(intel_get_pci_device(), false);
- batch = setup_nop();
scratch = gem_create(fd, 4*WIDTH*HEIGHT);
+ nop_batch = create_nop();
+ workload_batch = create_workload(gen, scratch);
p = calloc(nproducers, sizeof(*p));
for (n = 0; n < nproducers; n++) {
- p[n].nop_handle = batch;
- setup_workload(&p[n], gen, scratch);
if (flags & CONTEXT)
p[n].ctx = gem_context_create(fd);
+ setup_nop(&p[n], nop_batch);
+ setup_workload(&p[n], gen, scratch, workload_batch);
+ setup_latency(&p[n], gen);
+
pthread_mutex_init(&p[n].lock, NULL);
pthread_cond_init(&p[n].p_cond, NULL);
pthread_cond_init(&p[n].c_cond, NULL);
@@ -374,7 +428,7 @@ int main(int argc, char **argv)
int producers = 1;
int consumers = 0;
int nop = 0;
- int workload = 1;
+ int workload = 0;
unsigned flags = 0;
int c;