From 05ca171aa9a6902614241f9685de2f62f30126d8 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 3 Jun 2016 10:43:09 +0100
Subject: benchmarks/gem_exec_nop: Extend submission to check write
 inter-engine sync

Currently, we look at the throughput for submitting a read batch to a
single engine or to any engine. The kernel optimises for this by allowing
multiple engines to read at the same time, but writes are exclusive to a
single engine. So let's try to measure the impact of inserting the
barriers between writes on different engines.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 benchmarks/gem_exec_nop.c | 80 ++++++++++++++++++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 21 deletions(-)

(limited to 'benchmarks/gem_exec_nop.c')

diff --git a/benchmarks/gem_exec_nop.c b/benchmarks/gem_exec_nop.c
index ed4bf8d9..03a03541 100644
--- a/benchmarks/gem_exec_nop.c
+++ b/benchmarks/gem_exec_nop.c
@@ -53,6 +53,8 @@
 #define ENGINE_FLAGS  (I915_EXEC_RING_MASK | LOCAL_I915_EXEC_BSD_MASK)
 
 #define SYNC 0x1
+#define WRITE 0x2
+#define READ_ALL 0x4
 
 static double elapsed(const struct timespec *start,
 		      const struct timespec *end)
@@ -62,16 +64,18 @@ static double elapsed(const struct timespec *start,
 
 static uint32_t batch(int fd)
 {
-	const uint32_t buf[] = {MI_BATCH_BUFFER_END};
+	const uint32_t bbe = MI_BATCH_BUFFER_END;
 	uint32_t handle = gem_create(fd, 4096);
-	gem_write(fd, handle, 0, buf, sizeof(buf));
+	gem_write(fd, handle, 0, &bbe, sizeof(bbe));
 	return handle;
 }
 
 static int loop(unsigned ring, int reps, int ncpus, unsigned flags)
 {
 	struct drm_i915_gem_execbuffer2 execbuf;
-	struct drm_i915_gem_exec_object2 obj;
+	struct drm_i915_gem_exec_object2 obj[2];
+	unsigned all_engines[16];
+	unsigned all_nengine;
 	unsigned engines[16];
 	unsigned nengine;
 	double *shared;
@@ -81,12 +85,15 @@ static int loop(unsigned ring, int reps, int ncpus, unsigned flags)
 
 	fd = drm_open_driver(DRIVER_INTEL);
 
-	memset(&obj, 0, sizeof(obj));
-	obj.handle = batch(fd);
+	memset(obj, 0, sizeof(obj));
+	obj[0].handle = gem_create(fd, 4096);
+	if (flags & WRITE)
+		obj[0].flags = EXEC_OBJECT_WRITE;
+	obj[1].handle = batch(fd);
 
 	memset(&execbuf, 0, sizeof(execbuf));
-	execbuf.buffers_ptr = (uintptr_t)&obj;
-	execbuf.buffer_count = 1;
+	execbuf.buffers_ptr = (uintptr_t)obj;
+	execbuf.buffer_count = 2;
 	execbuf.flags |= LOCAL_I915_EXEC_HANDLE_LUT;
 	execbuf.flags |= LOCAL_I915_EXEC_NO_RELOC;
 	if (__gem_execbuf(fd, &execbuf)) {
@@ -95,47 +102,67 @@ static int loop(unsigned ring, int reps, int ncpus, unsigned flags)
 			return 77;
 	}
 
-	nengine = 0;
+	if (flags & WRITE && !(execbuf.flags & LOCAL_I915_EXEC_HANDLE_LUT))
+		return 77;
+
+	all_nengine = 0;
+	for (ring = 1; ring < 16; ring++) {
+		execbuf.flags &= ~ENGINE_FLAGS;
+		execbuf.flags |= ring;
+		if (__gem_execbuf(fd, &execbuf) == 0)
+			all_engines[all_nengine++] = ring;
+	}
+
 	if (ring == -1) {
-		for (ring = 1; ring < 16; ring++) {
-			execbuf.flags &= ~ENGINE_FLAGS;
-			execbuf.flags |= ring;
-			if (__gem_execbuf(fd, &execbuf) == 0)
-				engines[nengine++] = ring;
-		}
-	} else
-		engines[nengine++] = ring;
+		nengine = all_nengine;
+		memcpy(engines, all_engines, all_nengine*sizeof(engines[0]));
+	} else {
+		nengine = 1;
+		engines[0] = ring;
+	}
 
 	while (reps--) {
 		memset(shared, 0, 4096);
 
-		gem_set_domain(fd, obj.handle, I915_GEM_DOMAIN_GTT, 0);
+		gem_set_domain(fd, obj[1].handle, I915_GEM_DOMAIN_GTT, 0);
 		sleep(1); /* wait for the hw to go back to sleep */
 
 		igt_fork(child, ncpus) {
 			struct timespec start, end;
 			unsigned count = 0;
 
-			obj.handle = batch(fd);
+			obj[0].handle = gem_create(fd, 4096);
+			obj[1].handle = batch(fd);
 
 			clock_gettime(CLOCK_MONOTONIC, &start);
 			do {
 				for (int inner = 0; inner < 1024; inner++) {
+					if (flags & READ_ALL) {
+						obj[0].flags = 0;
+						for (int n = 0; n < all_nengine; n++) {
+							execbuf.flags &= ~ENGINE_FLAGS;
+							execbuf.flags |= all_engines[n];
+							gem_execbuf(fd, &execbuf);
+						}
+						if (flags & WRITE)
+							obj[0].flags = EXEC_OBJECT_WRITE;
+					}
 					execbuf.flags &= ~ENGINE_FLAGS;
 					execbuf.flags |= engines[count++ % nengine];
 					gem_execbuf(fd, &execbuf);
 					if (flags & SYNC)
-						gem_sync(fd, obj.handle);
+						gem_sync(fd, obj[1].handle);
 				}
 
 				clock_gettime(CLOCK_MONOTONIC, &end);
 			} while (elapsed(&start, &end) < 2.);
 
-			gem_sync(fd, obj.handle);
+			gem_sync(fd, obj[1].handle);
 			clock_gettime(CLOCK_MONOTONIC, &end);
 			shared[child] = 1e6*elapsed(&start, &end) / count;
 
-			gem_close(fd, obj.handle);
+			gem_close(fd, obj[1].handle);
+			gem_close(fd, obj[0].handle);
 		}
 		igt_waitchildren();
 
@@ -143,11 +170,14 @@ static int loop(unsigned ring, int reps, int ncpus, unsigned flags)
 			shared[ncpus] += shared[child];
 		printf("%7.3f\n", shared[ncpus] / ncpus);
 
+		obj[0].flags = 0;
 		for (int n = 0; n < nengine; n++) {
 			execbuf.flags &= ~ENGINE_FLAGS;
 			execbuf.flags |= engines[n];
 			gem_execbuf(fd, &execbuf);
 		}
+		if (flags & WRITE)
+			obj[0].flags = EXEC_OBJECT_WRITE;
 	}
 	return 0;
 }
@@ -191,6 +221,14 @@ int main(int argc, char **argv)
 			flags |= SYNC;
 			break;
 
+		case 'W':
+			flags |= WRITE;
+			break;
+
+		case 'A':
+			flags |= READ_ALL;
+			break;
+
 		default:
 			break;
 		}
-- 
cgit v1.2.3