/* * Copyright © 2017 Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "i915/gem.h" #include "i915/gem_create.h" #include "igt.h" #include "igt_core.h" #include "igt_device.h" #include "igt_kmod.h" #include "igt_perf.h" #include "igt_sysfs.h" #include "igt_pm.h" #include "intel_ctx.h" #include "sw_sync.h" IGT_TEST_DESCRIPTION("Test the i915 pmu perf interface"); const double tolerance = 0.05f; const unsigned long batch_duration_ns = 500e6; static int open_pmu(int i915, uint64_t config) { int fd; fd = perf_i915_open(i915, config); igt_skip_on(fd < 0 && errno == ENODEV); igt_assert(fd >= 0); return fd; } static int open_group(int i915, uint64_t config, int group) { int fd; fd = perf_i915_open_group(i915, config, group); igt_skip_on(fd < 0 && errno == ENODEV); igt_assert(fd >= 0); return fd; } static void init(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, uint8_t sample) { int fd, err = 0; bool exists; errno = 0; fd = perf_i915_open(gem_fd, __I915_PMU_ENGINE(e->class, e->instance, sample)); if (fd < 0) err = errno; exists = gem_context_has_engine(gem_fd, ctx->id, e->flags); if (intel_gen(intel_get_drm_devid(gem_fd)) < 6 && sample == I915_SAMPLE_SEMA) exists = false; if (exists) { igt_assert_eq(err, 0); igt_assert_fd(fd); close(fd); } else { igt_assert_lt(fd, 0); igt_assert_eq(err, ENODEV); } } static uint64_t __pmu_read_single(int fd, uint64_t *ts) { uint64_t data[2]; igt_assert_eq(read(fd, data, sizeof(data)), sizeof(data)); if (ts) *ts = data[1]; return data[0]; } static uint64_t pmu_read_single(int fd) { return __pmu_read_single(fd, NULL); } static uint64_t pmu_read_multi(int fd, unsigned int num, uint64_t *val) { uint64_t buf[2 + num]; unsigned int i; igt_assert_eq(read(fd, buf, sizeof(buf)), sizeof(buf)); for (i = 0; i < num; i++) val[i] = buf[2 + i]; return buf[1]; } #define __assert_within_epsilon(x, ref, tol_up, tol_down) \ igt_assert_f((double)(x) <= (1.0 + (tol_up)) * (double)(ref) && \ (double)(x) >= (1.0 - (tol_down)) * (double)(ref), \ "'%s' != '%s' (%f not within +%.1f%%/-%.1f%% tolerance of %f)\n",\ #x, #ref, (double)(x), \ (tol_up) * 100.0, (tol_down) * 100.0, \ (double)(ref)) #define assert_within_epsilon(x, ref, tolerance) \ __assert_within_epsilon(x, ref, tolerance, tolerance) /* * Helper for cases where we assert on time spent sleeping (directly or * indirectly), so make it more robust by ensuring the system sleep time * is within test tolerance to start with. */ static unsigned int measured_usleep(unsigned int usec) { struct timespec ts = { }; unsigned int slept; slept = igt_nsec_elapsed(&ts); igt_assert(slept == 0); do { usleep(usec - slept); slept = igt_nsec_elapsed(&ts) / 1000; } while (slept < usec); return igt_nsec_elapsed(&ts); } #define TEST_BUSY (1) #define FLAG_SYNC (2) #define TEST_TRAILING_IDLE (4) #define TEST_RUNTIME_PM (8) #define FLAG_LONG (16) #define FLAG_HANG (32) #define TEST_S3 (64) static igt_spin_t *__spin_poll(int fd, uint64_t ahnd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { struct igt_spin_factory opts = { .ahnd = ahnd, .ctx = ctx, .engine = e->flags, }; if (gem_class_can_store_dword(fd, e->class)) opts.flags |= IGT_SPIN_POLL_RUN; return __igt_spin_factory(fd, &opts); } static unsigned long __spin_wait(int fd, igt_spin_t *spin) { struct timespec start = { }; igt_nsec_elapsed(&start); if (igt_spin_has_poll(spin)) { unsigned long timeout = 0; while (!igt_spin_has_started(spin)) { unsigned long t = igt_nsec_elapsed(&start); igt_assert(gem_bo_busy(fd, spin->handle)); if ((t - timeout) > 250e6) { timeout = t; igt_warn("Spinner not running after %.2fms\n", (double)t / 1e6); igt_assert(t < 2e9); } } } else { igt_debug("__spin_wait - usleep mode\n"); usleep(500e3); /* Better than nothing! */ } igt_assert(gem_bo_busy(fd, spin->handle)); return igt_nsec_elapsed(&start); } static igt_spin_t *__spin_sync(int fd, uint64_t ahnd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { igt_spin_t *spin = __spin_poll(fd, ahnd, ctx, e); __spin_wait(fd, spin); return spin; } static igt_spin_t *spin_sync(int fd, uint64_t ahnd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { igt_require_gem(fd); return __spin_sync(fd, ahnd, ctx, e); } static igt_spin_t *spin_sync_flags(int fd, uint64_t ahnd, const intel_ctx_t *ctx, unsigned int flags) { struct intel_execution_engine2 e = { }; e.class = gem_execbuf_flags_to_engine_class(flags); e.instance = (flags & (I915_EXEC_BSD_MASK | I915_EXEC_RING_MASK)) == (I915_EXEC_BSD | I915_EXEC_BSD_RING2) ? 1 : 0; e.flags = flags; return spin_sync(fd, ahnd, ctx, &e); } static void end_spin(int fd, igt_spin_t *spin, unsigned int flags) { if (!spin) return; igt_spin_end(spin); if (flags & FLAG_SYNC) gem_sync(fd, spin->handle); if (flags & TEST_TRAILING_IDLE) { unsigned long t, timeout = 0; struct timespec start = { }; igt_nsec_elapsed(&start); do { t = igt_nsec_elapsed(&start); if (gem_bo_busy(fd, spin->handle) && (t - timeout) > 10e6) { timeout = t; igt_warn("Spinner not idle after %.2fms\n", (double)t / 1e6); } usleep(1e3); } while (t < batch_duration_ns / 5); } } static void single(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, unsigned int flags) { unsigned long slept; igt_spin_t *spin; uint64_t val; int fd; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); fd = open_pmu(gem_fd, I915_PMU_ENGINE_BUSY(e->class, e->instance)); if (flags & TEST_BUSY) spin = spin_sync(gem_fd, ahnd, ctx, e); else spin = NULL; val = pmu_read_single(fd); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) end_spin(gem_fd, spin, flags); val = pmu_read_single(fd) - val; if (flags & FLAG_HANG) igt_force_gpu_reset(gem_fd); else end_spin(gem_fd, spin, FLAG_SYNC); assert_within_epsilon(val, flags & TEST_BUSY ? slept : 0.f, tolerance); /* Check for idle after hang. */ if (flags & FLAG_HANG) { gem_quiescent_gpu(gem_fd); igt_assert(!gem_bo_busy(gem_fd, spin->handle)); val = pmu_read_single(fd); slept = measured_usleep(batch_duration_ns / 1000); val = pmu_read_single(fd) - val; assert_within_epsilon(val, 0, tolerance); } igt_spin_free(gem_fd, spin); close(fd); put_ahnd(ahnd); gem_quiescent_gpu(gem_fd); } static void busy_start(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { unsigned long slept; uint64_t val, ts[2]; igt_spin_t *spin; int fd; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); /* * Defeat the busy stats delayed disable, we need to guarantee we are * the first user. */ sleep(2); spin = __spin_sync(gem_fd, ahnd, ctx, e); fd = open_pmu(gem_fd, I915_PMU_ENGINE_BUSY(e->class, e->instance)); val = __pmu_read_single(fd, &ts[0]); slept = measured_usleep(batch_duration_ns / 1000); val = __pmu_read_single(fd, &ts[1]) - val; igt_debug("slept=%lu perf=%"PRIu64"\n", slept, ts[1] - ts[0]); igt_spin_free(gem_fd, spin); close(fd); put_ahnd(ahnd); assert_within_epsilon(val, ts[1] - ts[0], tolerance); gem_quiescent_gpu(gem_fd); } /* * This test has a potentially low rate of catching the issue it is trying to * catch. Or in other words, quite high rate of false negative successes. We * will depend on the CI systems running it a lot to detect issues. */ static void busy_double_start(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { unsigned long slept; uint64_t val, val2, ts[2]; igt_spin_t *spin[2]; const intel_ctx_t *tmp_ctx; int fd; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id), ahndN; tmp_ctx = intel_ctx_create(gem_fd, &ctx->cfg); ahndN = get_reloc_ahnd(gem_fd, tmp_ctx->id); /* * Defeat the busy stats delayed disable, we need to guarantee we are * the first user. */ sleep(2); /* * Submit two contexts, with a pause in between targeting the ELSP * re-submission in execlists mode. Make sure busyness is correctly * reported with the engine busy, and after the engine went idle. */ spin[0] = __spin_sync(gem_fd, ahnd, ctx, e); usleep(500e3); spin[1] = __igt_spin_new(gem_fd, .ahnd = ahndN, .ctx = tmp_ctx, .engine = e->flags); /* * Open PMU as fast as possible after the second spin batch in attempt * to be faster than the driver handling lite-restore. */ fd = open_pmu(gem_fd, I915_PMU_ENGINE_BUSY(e->class, e->instance)); val = __pmu_read_single(fd, &ts[0]); slept = measured_usleep(batch_duration_ns / 1000); val = __pmu_read_single(fd, &ts[1]) - val; igt_debug("slept=%lu perf=%"PRIu64"\n", slept, ts[1] - ts[0]); igt_spin_end(spin[0]); igt_spin_end(spin[1]); /* Wait for GPU idle to verify PMU reports idle. */ gem_quiescent_gpu(gem_fd); val2 = pmu_read_single(fd); usleep(batch_duration_ns / 1000); val2 = pmu_read_single(fd) - val2; igt_info("busy=%"PRIu64" idle=%"PRIu64"\n", val, val2); igt_spin_free(gem_fd, spin[0]); igt_spin_free(gem_fd, spin[1]); close(fd); intel_ctx_destroy(gem_fd, tmp_ctx); put_ahnd(ahnd); put_ahnd(ahndN); assert_within_epsilon(val, ts[1] - ts[0], tolerance); igt_assert_eq(val2, 0); gem_quiescent_gpu(gem_fd); } static void log_busy(unsigned int num_engines, uint64_t *val) { char buf[1024]; int rem = sizeof(buf); unsigned int i; char *p = buf; for (i = 0; i < num_engines; i++) { int len; len = snprintf(p, rem, "%u=%" PRIu64 "\n", i, val[i]); igt_assert(len > 0); rem -= len; p += len; } igt_info("%s", buf); } static void busy_check_all(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, const unsigned int num_engines, unsigned int flags) { struct intel_execution_engine2 *e_; uint64_t tval[2][num_engines]; unsigned int busy_idx = 0, i; uint64_t val[num_engines]; int fd[num_engines]; unsigned long slept; igt_spin_t *spin; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); i = 0; fd[0] = -1; for_each_ctx_engine(gem_fd, ctx, e_) { if (e->class == e_->class && e->instance == e_->instance) busy_idx = i; fd[i++] = open_group(gem_fd, I915_PMU_ENGINE_BUSY(e_->class, e_->instance), fd[0]); } igt_assert_eq(i, num_engines); spin = spin_sync(gem_fd, ahnd, ctx, e); pmu_read_multi(fd[0], num_engines, tval[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) end_spin(gem_fd, spin, flags); pmu_read_multi(fd[0], num_engines, tval[1]); end_spin(gem_fd, spin, FLAG_SYNC); igt_spin_free(gem_fd, spin); for (i = 0; i < num_engines; i++) close(fd[i]); put_ahnd(ahnd); for (i = 0; i < num_engines; i++) val[i] = tval[1][i] - tval[0][i]; log_busy(num_engines, val); assert_within_epsilon(val[busy_idx], slept, tolerance); for (i = 0; i < num_engines; i++) { if (i == busy_idx) continue; assert_within_epsilon(val[i], 0.0f, tolerance); } gem_quiescent_gpu(gem_fd); } static void __submit_spin(int gem_fd, igt_spin_t *spin, const struct intel_execution_engine2 *e, int offset) { struct drm_i915_gem_execbuffer2 eb = spin->execbuf; eb.flags &= ~(0x3f | I915_EXEC_BSD_MASK); eb.flags |= e->flags | I915_EXEC_NO_RELOC; eb.batch_start_offset += offset; gem_execbuf(gem_fd, &eb); } static void most_busy_check_all(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, const unsigned int num_engines, unsigned int flags) { struct intel_execution_engine2 *e_; uint64_t tval[2][num_engines]; uint64_t val[num_engines]; int fd[num_engines]; unsigned long slept; igt_spin_t *spin = NULL; unsigned int idle_idx, i; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); i = 0; for_each_ctx_engine(gem_fd, ctx, e_) { if (e->class == e_->class && e->instance == e_->instance) idle_idx = i; else if (spin) __submit_spin(gem_fd, spin, e_, 64); else spin = __spin_poll(gem_fd, ahnd, ctx, e_); val[i++] = I915_PMU_ENGINE_BUSY(e_->class, e_->instance); } igt_assert(i == num_engines); igt_require(spin); /* at least one busy engine */ fd[0] = -1; for (i = 0; i < num_engines; i++) fd[i] = open_group(gem_fd, val[i], fd[0]); /* Small delay to allow engines to start. */ usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3); pmu_read_multi(fd[0], num_engines, tval[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) end_spin(gem_fd, spin, flags); pmu_read_multi(fd[0], num_engines, tval[1]); end_spin(gem_fd, spin, FLAG_SYNC); igt_spin_free(gem_fd, spin); for (i = 0; i < num_engines; i++) close(fd[i]); put_ahnd(ahnd); for (i = 0; i < num_engines; i++) val[i] = tval[1][i] - tval[0][i]; log_busy(num_engines, val); for (i = 0; i < num_engines; i++) { if (i == idle_idx) assert_within_epsilon(val[i], 0.0f, tolerance); else assert_within_epsilon(val[i], slept, tolerance); } gem_quiescent_gpu(gem_fd); } static void all_busy_check_all(int gem_fd, const intel_ctx_t *ctx, const unsigned int num_engines, unsigned int flags) { struct intel_execution_engine2 *e; uint64_t tval[2][num_engines]; uint64_t val[num_engines]; int fd[num_engines]; unsigned long slept; igt_spin_t *spin = NULL; unsigned int i; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); i = 0; for_each_ctx_engine(gem_fd, ctx, e) { if (spin) __submit_spin(gem_fd, spin, e, 64); else spin = __spin_poll(gem_fd, ahnd, ctx, e); val[i++] = I915_PMU_ENGINE_BUSY(e->class, e->instance); } igt_assert(i == num_engines); fd[0] = -1; for (i = 0; i < num_engines; i++) fd[i] = open_group(gem_fd, val[i], fd[0]); /* Small delay to allow engines to start. */ usleep(__spin_wait(gem_fd, spin) * num_engines / 1e3); pmu_read_multi(fd[0], num_engines, tval[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) end_spin(gem_fd, spin, flags); pmu_read_multi(fd[0], num_engines, tval[1]); end_spin(gem_fd, spin, FLAG_SYNC); igt_spin_free(gem_fd, spin); for (i = 0; i < num_engines; i++) close(fd[i]); put_ahnd(ahnd); for (i = 0; i < num_engines; i++) val[i] = tval[1][i] - tval[0][i]; log_busy(num_engines, val); for (i = 0; i < num_engines; i++) assert_within_epsilon(val[i], slept, tolerance); gem_quiescent_gpu(gem_fd); } static void no_sema(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, unsigned int flags) { igt_spin_t *spin; uint64_t val[2][2]; int fd[2]; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); fd[0] = open_group(gem_fd, I915_PMU_ENGINE_SEMA(e->class, e->instance), -1); fd[1] = open_group(gem_fd, I915_PMU_ENGINE_WAIT(e->class, e->instance), fd[0]); if (flags & TEST_BUSY) spin = spin_sync(gem_fd, ahnd, ctx, e); else spin = NULL; pmu_read_multi(fd[0], 2, val[0]); measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) end_spin(gem_fd, spin, flags); pmu_read_multi(fd[0], 2, val[1]); val[0][0] = val[1][0] - val[0][0]; val[0][1] = val[1][1] - val[0][1]; if (spin) { end_spin(gem_fd, spin, FLAG_SYNC); igt_spin_free(gem_fd, spin); } close(fd[0]); close(fd[1]); put_ahnd(ahnd); assert_within_epsilon(val[0][0], 0.0f, tolerance); assert_within_epsilon(val[0][1], 0.0f, tolerance); } #define MI_INSTR(opcode, flags) (((opcode) << 23) | (flags)) #define MI_SEMAPHORE_WAIT MI_INSTR(0x1c, 2) /* GEN8+ */ #define MI_SEMAPHORE_POLL (1<<15) #define MI_SEMAPHORE_SAD_GTE_SDD (1<<12) #define MI_SEMAPHORE_SAD_NEQ_SDD (5 << 12) static void sema_wait(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, unsigned int flags) { struct drm_i915_gem_relocation_entry reloc[2] = {}; struct drm_i915_gem_exec_object2 obj[2] = {}; struct drm_i915_gem_execbuffer2 eb = {}; uint32_t bb_handle, obj_handle; unsigned long slept; uint32_t *obj_ptr; uint32_t batch[16]; uint64_t val[2], ts[2]; int fd; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); uint64_t obj_offset, bb_offset; igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); /** * Setup up a batchbuffer with a polling semaphore wait command which * will wait on an value in a shared bo to change. This way we are able * to control how much time we will spend in this bb. */ bb_handle = gem_create(gem_fd, 4096); obj_handle = gem_create(gem_fd, 4096); bb_offset = get_offset(ahnd, bb_handle, 4096, 0); obj_offset = get_offset(ahnd, obj_handle, 4096, 0); obj_ptr = gem_mmap__device_coherent(gem_fd, obj_handle, 0, 4096, PROT_WRITE); batch[0] = MI_STORE_DWORD_IMM; batch[1] = obj_offset + sizeof(*obj_ptr); batch[2] = (obj_offset + sizeof(*obj_ptr)) >> 32; batch[3] = 1; batch[4] = MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_GTE_SDD; batch[5] = 1; batch[6] = obj_offset; batch[7] = obj_offset >> 32; batch[8] = MI_BATCH_BUFFER_END; gem_write(gem_fd, bb_handle, 0, batch, sizeof(batch)); reloc[0].target_handle = obj_handle; reloc[0].offset = 1 * sizeof(uint32_t); reloc[0].read_domains = I915_GEM_DOMAIN_RENDER; reloc[0].write_domain = I915_GEM_DOMAIN_RENDER; reloc[0].delta = sizeof(*obj_ptr); reloc[1].target_handle = obj_handle; reloc[1].offset = 6 * sizeof(uint32_t); reloc[1].read_domains = I915_GEM_DOMAIN_RENDER; obj[0].handle = obj_handle; obj[1].handle = bb_handle; obj[1].relocation_count = !ahnd ? 2 : 0; obj[1].relocs_ptr = to_user_pointer(reloc); eb.buffer_count = 2; eb.buffers_ptr = to_user_pointer(obj); eb.flags = e->flags; eb.rsvd1 = ctx->id; if (ahnd) { obj[0].flags |= EXEC_OBJECT_PINNED | EXEC_OBJECT_WRITE; obj[0].offset = obj_offset; obj[1].flags |= EXEC_OBJECT_PINNED; obj[1].offset = bb_offset; } /** * Start the semaphore wait PMU and after some known time let the above * semaphore wait command finish. Then check that the PMU is reporting * to expected time spent in semaphore wait state. */ fd = open_pmu(gem_fd, I915_PMU_ENGINE_SEMA(e->class, e->instance)); val[0] = pmu_read_single(fd); gem_execbuf(gem_fd, &eb); do { /* wait for the batch to start executing */ usleep(5e3); } while (!obj_ptr[1]); igt_assert_f(igt_wait(pmu_read_single(fd) != val[0], 10, 1), "sampling failed to start withing 10ms\n"); val[0] = __pmu_read_single(fd, &ts[0]); slept = measured_usleep(batch_duration_ns / 1000); if (flags & TEST_TRAILING_IDLE) obj_ptr[0] = 1; val[1] = __pmu_read_single(fd, &ts[1]); igt_debug("slept %.3fms (perf %.3fms), sampled %.3fms\n", slept * 1e-6, (ts[1] - ts[0]) * 1e-6, (val[1] - val[0]) * 1e-6); obj_ptr[0] = 1; gem_sync(gem_fd, bb_handle); munmap(obj_ptr, 4096); gem_close(gem_fd, obj_handle); gem_close(gem_fd, bb_handle); close(fd); put_ahnd(ahnd); assert_within_epsilon(val[1] - val[0], slept, tolerance); } static uint32_t create_sema(int gem_fd, uint64_t ahnd, struct drm_i915_gem_relocation_entry *reloc, __u64 *poffset) { uint32_t cs[] = { /* Reset our semaphore wait */ MI_STORE_DWORD_IMM, 0, 0, 1, /* Wait until the semaphore value is set to 0 [by caller] */ MI_SEMAPHORE_WAIT | MI_SEMAPHORE_POLL | MI_SEMAPHORE_SAD_NEQ_SDD, 1, 0, 0, MI_BATCH_BUFFER_END }; uint32_t handle; igt_assert(poffset); handle = gem_create(gem_fd, 4096); *poffset = get_offset(ahnd, handle, 4096, 0); memset(reloc, 0, 2 * sizeof(*reloc)); reloc[0].target_handle = handle; reloc[0].offset = 64 + 1 * sizeof(uint32_t); reloc[1].target_handle = handle; reloc[1].offset = 64 + 6 * sizeof(uint32_t); if (ahnd) { cs[1] = *poffset; cs[2] = *poffset >> 32; cs[6] = *poffset; cs[7] = *poffset >> 32; } gem_write(gem_fd, handle, 64, cs, sizeof(cs)); return handle; } static void __sema_busy(int gem_fd, uint64_t ahnd, int pmu, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, int sema_pct, int busy_pct) { enum { SEMA = 0, BUSY, }; uint64_t total, sema, busy; uint64_t start[2], val[2]; struct drm_i915_gem_relocation_entry reloc[2]; struct drm_i915_gem_exec_object2 obj = { .handle = create_sema(gem_fd, ahnd, reloc, &obj.offset), .relocation_count = !ahnd ? 2 : 0, .relocs_ptr = to_user_pointer(reloc), }; struct drm_i915_gem_execbuffer2 eb = { .batch_start_offset = 64, .buffer_count = 1, .buffers_ptr = to_user_pointer(&obj), .flags = e->flags, .rsvd1 = ctx->id, }; igt_spin_t *spin; uint32_t *map; /* Time spent being busy includes time waiting on semaphores */ igt_assert(busy_pct >= sema_pct); gem_quiescent_gpu(gem_fd); map = gem_mmap__device_coherent(gem_fd, obj.handle, 0, 4096, PROT_WRITE); gem_execbuf(gem_fd, &eb); spin = igt_spin_new(gem_fd, .ahnd = ahnd, .ctx = ctx, .engine = e->flags); /* Wait until the batch is executed and the semaphore is busy-waiting */ while (!READ_ONCE(*map) && gem_bo_busy(gem_fd, obj.handle)) ; igt_assert(gem_bo_busy(gem_fd, obj.handle)); gem_close(gem_fd, obj.handle); total = pmu_read_multi(pmu, 2, start); sema = measured_usleep(batch_duration_ns * sema_pct / 100 / 1000); *map = 0; __sync_synchronize(); busy = measured_usleep(batch_duration_ns * (busy_pct - sema_pct) / 100 / 1000); igt_spin_end(spin); measured_usleep(batch_duration_ns * (100 - busy_pct) / 100 / 1000); total = pmu_read_multi(pmu, 2, val) - total; igt_spin_free(gem_fd, spin); munmap(map, 4096); busy += sema; val[SEMA] -= start[SEMA]; val[BUSY] -= start[BUSY]; igt_info("%s, target: {%.1f%% [%d], %.1f%% [%d]}, measured: {%.1f%%, %.1f%%}\n", e->name, sema * 100. / total, sema_pct, busy * 100. / total, busy_pct, val[SEMA] * 100. / total, val[BUSY] * 100. / total); assert_within_epsilon(val[SEMA], sema, tolerance); assert_within_epsilon(val[BUSY], busy, tolerance); igt_assert_f(val[SEMA] < val[BUSY] * (1 + tolerance), "Semaphore time (%.3fus, %.1f%%) greater than total time busy (%.3fus, %.1f%%)!\n", val[SEMA] * 1e-3, val[SEMA] * 100. / total, val[BUSY] * 1e-3, val[BUSY] * 100. / total); } static void sema_busy(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, unsigned int flags) { int fd[2]; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); igt_require(intel_gen(intel_get_drm_devid(gem_fd)) >= 8); fd[0] = open_group(gem_fd, I915_PMU_ENGINE_SEMA(e->class, e->instance), -1); fd[1] = open_group(gem_fd, I915_PMU_ENGINE_BUSY(e->class, e->instance), fd[0]); __sema_busy(gem_fd, ahnd, fd[0], ctx, e, 50, 100); __sema_busy(gem_fd, ahnd, fd[0], ctx, e, 25, 50); __sema_busy(gem_fd, ahnd, fd[0], ctx, e, 75, 75); close(fd[0]); close(fd[1]); put_ahnd(ahnd); } static void test_awake(int i915, const intel_ctx_t *ctx) { const struct intel_execution_engine2 *e; unsigned long slept; uint64_t val; int fd; uint64_t ahnd = get_reloc_ahnd(i915, ctx->id); fd = perf_i915_open(i915, I915_PMU_SOFTWARE_GT_AWAKE_TIME); igt_skip_on(fd < 0); /* Check that each engine is captured by the GT wakeref */ for_each_ctx_engine(i915, ctx, e) { igt_spin_new(i915, .ahnd = ahnd, .ctx = ctx, .engine = e->flags); val = pmu_read_single(fd); slept = measured_usleep(batch_duration_ns / 1000); val = pmu_read_single(fd) - val; gem_quiescent_gpu(i915); assert_within_epsilon(val, slept, tolerance); } /* And that the total GT wakeref matches walltime not summation */ for_each_ctx_engine(i915, ctx, e) igt_spin_new(i915, .ahnd = ahnd, .ctx = ctx, .engine = e->flags); val = pmu_read_single(fd); slept = measured_usleep(batch_duration_ns / 1000); val = pmu_read_single(fd) - val; gem_quiescent_gpu(i915); assert_within_epsilon(val, slept, tolerance); igt_free_spins(i915); close(fd); put_ahnd(ahnd); } #define MI_WAIT_FOR_PIPE_C_VBLANK (1<<21) #define MI_WAIT_FOR_PIPE_B_VBLANK (1<<11) #define MI_WAIT_FOR_PIPE_A_VBLANK (1<<3) typedef struct { igt_display_t display; struct igt_fb primary_fb; igt_output_t *output; enum pipe pipe; } data_t; static void prepare_crtc(data_t *data, int fd, igt_output_t *output) { drmModeModeInfo *mode; igt_display_t *display = &data->display; igt_plane_t *primary; /* select the pipe we want to use */ igt_output_set_pipe(output, data->pipe); /* create and set the primary plane fb */ mode = igt_output_get_mode(output); igt_create_color_fb(fd, mode->hdisplay, mode->vdisplay, DRM_FORMAT_XRGB8888, DRM_FORMAT_MOD_LINEAR, 0.0, 0.0, 0.0, &data->primary_fb); primary = igt_output_get_plane_type(output, DRM_PLANE_TYPE_PRIMARY); igt_plane_set_fb(primary, &data->primary_fb); igt_display_commit(display); igt_wait_for_vblank(fd, display->pipes[data->pipe].crtc_offset); } static void cleanup_crtc(data_t *data, int fd, igt_output_t *output) { igt_display_t *display = &data->display; igt_plane_t *primary; igt_remove_fb(fd, &data->primary_fb); primary = igt_output_get_plane_type(output, DRM_PLANE_TYPE_PRIMARY); igt_plane_set_fb(primary, NULL); igt_output_set_pipe(output, PIPE_ANY); igt_display_commit(display); } static int wait_vblank(int fd, union drm_wait_vblank *vbl) { int err; err = 0; if (igt_ioctl(fd, DRM_IOCTL_WAIT_VBLANK, vbl)) err = -errno; return err; } static int has_secure_batches(const int fd) { int v = -1; drm_i915_getparam_t gp = { .param = I915_PARAM_HAS_SECURE_BATCHES, .value = &v, }; drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp); return v > 0; } static void event_wait(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { struct drm_i915_gem_exec_object2 obj = { }; struct drm_i915_gem_execbuffer2 eb = { }; const uint32_t DERRMR = 0x44050; const uint32_t FORCEWAKE_MT = 0xa188; unsigned int valid_tests = 0; uint32_t batch[16], *b; uint16_t devid; igt_output_t *output; data_t data; enum pipe p; int fd; devid = intel_get_drm_devid(gem_fd); igt_require(intel_gen(devid) >= 7); igt_require(has_secure_batches(gem_fd)); igt_skip_on(IS_VALLEYVIEW(devid) || IS_CHERRYVIEW(devid)); igt_device_set_master(gem_fd); kmstest_set_vt_graphics_mode(); igt_display_require(&data.display, gem_fd); /** * We will use the display to render event forwarind so need to * program the DERRMR register and restore it at exit. * Note we assume that the default/desired value for DERRMR will always * be ~0u (all routing disable). To be fancy, we could do a SRM of the * reg beforehand and then LRM at the end. * * We will emit a MI_WAIT_FOR_EVENT listening for vblank events, * have a background helper to indirectly enable vblank irqs, and * listen to the recorded time spent in engine wait state as reported * by the PMU. */ obj.handle = gem_create(gem_fd, 4096); b = batch; *b++ = MI_LOAD_REGISTER_IMM; *b++ = FORCEWAKE_MT; *b++ = 2 << 16 | 2; *b++ = MI_LOAD_REGISTER_IMM; *b++ = DERRMR; *b++ = ~0u; *b++ = MI_WAIT_FOR_EVENT; *b++ = MI_LOAD_REGISTER_IMM; *b++ = DERRMR; *b++ = ~0u; *b++ = MI_LOAD_REGISTER_IMM; *b++ = FORCEWAKE_MT; *b++ = 2 << 16; *b++ = MI_BATCH_BUFFER_END; eb.buffer_count = 1; eb.buffers_ptr = to_user_pointer(&obj); eb.flags = e->flags | I915_EXEC_SECURE; eb.rsvd1 = ctx->id; for_each_pipe_with_valid_output(&data.display, p, output) { struct igt_helper_process waiter = { }; const unsigned int frames = 3; uint64_t val[2]; batch[6] = MI_WAIT_FOR_EVENT; switch (p) { case PIPE_A: batch[6] |= MI_WAIT_FOR_PIPE_A_VBLANK; batch[5] = ~(1 << 3); break; case PIPE_B: batch[6] |= MI_WAIT_FOR_PIPE_B_VBLANK; batch[5] = ~(1 << 11); break; case PIPE_C: batch[6] |= MI_WAIT_FOR_PIPE_C_VBLANK; batch[5] = ~(1 << 21); break; default: continue; } gem_write(gem_fd, obj.handle, 0, batch, sizeof(batch)); data.pipe = p; prepare_crtc(&data, gem_fd, output); fd = open_pmu(gem_fd, I915_PMU_ENGINE_WAIT(e->class, e->instance)); val[0] = pmu_read_single(fd); igt_fork_helper(&waiter) { const uint32_t pipe_id_flag = kmstest_get_vbl_flag(data.pipe); for (;;) { union drm_wait_vblank vbl = { }; vbl.request.type = _DRM_VBLANK_RELATIVE; vbl.request.type |= pipe_id_flag; vbl.request.sequence = 1; igt_assert_eq(wait_vblank(gem_fd, &vbl), 0); } } for (unsigned int frame = 0; frame < frames; frame++) { gem_execbuf(gem_fd, &eb); gem_sync(gem_fd, obj.handle); } igt_stop_helper(&waiter); val[1] = pmu_read_single(fd); close(fd); cleanup_crtc(&data, gem_fd, output); valid_tests++; igt_assert(val[1] - val[0] > 0); } gem_close(gem_fd, obj.handle); igt_require_f(valid_tests, "no valid crtc/connector combinations found\n"); } static void multi_client(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance); unsigned long slept[2]; uint64_t val[2], ts[2], perf_slept[2]; igt_spin_t *spin; int fd[2]; uint64_t ahnd = get_reloc_ahnd(gem_fd, ctx->id); gem_quiescent_gpu(gem_fd); fd[0] = open_pmu(gem_fd, config); /* * Second PMU client which is initialized after the first one, * and exists before it, should not affect accounting as reported * in the first client. */ fd[1] = open_pmu(gem_fd, config); spin = spin_sync(gem_fd, ahnd, ctx, e); val[0] = val[1] = __pmu_read_single(fd[0], &ts[0]); slept[1] = measured_usleep(batch_duration_ns / 1000); val[1] = __pmu_read_single(fd[1], &ts[1]) - val[1]; perf_slept[1] = ts[1] - ts[0]; igt_debug("slept=%lu perf=%"PRIu64"\n", slept[1], perf_slept[1]); close(fd[1]); slept[0] = measured_usleep(batch_duration_ns / 1000) + slept[1]; val[0] = __pmu_read_single(fd[0], &ts[1]) - val[0]; perf_slept[0] = ts[1] - ts[0]; igt_debug("slept=%lu perf=%"PRIu64"\n", slept[0], perf_slept[0]); igt_spin_end(spin); gem_sync(gem_fd, spin->handle); igt_spin_free(gem_fd, spin); close(fd[0]); put_ahnd(ahnd); assert_within_epsilon(val[0], perf_slept[0], tolerance); assert_within_epsilon(val[1], perf_slept[1], tolerance); } /** * Tests that i915 PMU corectly errors out in invalid initialization. * i915 PMU is uncore PMU, thus: * - sampling period is not supported * - pid > 0 is not supported since we can't count per-process (we count * per whole system) * - cpu != 0 is not supported since i915 PMU only allows running on one cpu * and that is normally CPU0. */ static void invalid_init(int i915) { struct perf_event_attr attr; #define ATTR_INIT() \ do { \ memset(&attr, 0, sizeof (attr)); \ attr.config = I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0); \ attr.type = i915_perf_type_id(i915); \ igt_assert(attr.type != 0); \ errno = 0; \ } while(0) ATTR_INIT(); attr.sample_period = 100; igt_assert_eq(perf_event_open(&attr, -1, 0, -1, 0), -1); igt_assert_eq(errno, EINVAL); ATTR_INIT(); igt_assert_eq(perf_event_open(&attr, 0, 0, -1, 0), -1); igt_assert_eq(errno, EINVAL); ATTR_INIT(); igt_assert_eq(perf_event_open(&attr, -1, 1, -1, 0), -1); igt_assert_eq(errno, EINVAL); } static void open_invalid(int i915) { int fd; fd = perf_i915_open(i915, -1ULL); igt_assert(fd < 0); } static bool cpu0_hotplug_support(void) { return access("/sys/devices/system/cpu/cpu0/online", W_OK) == 0; } static void cpu_hotplug(int gem_fd) { igt_spin_t *spin[2]; uint64_t ts[2]; uint64_t val; int link[2]; int fd, ret; int cur = 0; char buf; uint64_t ahnd = get_reloc_ahnd(gem_fd, 0); igt_require(cpu0_hotplug_support()); fd = open_pmu(gem_fd, I915_PMU_ENGINE_BUSY(I915_ENGINE_CLASS_RENDER, 0)); /* * Create two spinners so test can ensure shorter gaps in engine * busyness as it is terminating one and re-starting the other. */ spin[0] = igt_spin_new(gem_fd, .ahnd = ahnd, .engine = I915_EXEC_DEFAULT); spin[1] = __igt_spin_new(gem_fd, .ahnd = ahnd, .engine = I915_EXEC_DEFAULT); val = __pmu_read_single(fd, &ts[0]); ret = pipe2(link, O_NONBLOCK); igt_assert_eq(ret, 0); /* * Toggle online status of all the CPUs in a child process and ensure * this has not affected busyness stats in the parent. */ igt_fork(child, 1) { int cpu = 0; close(link[0]); for (;;) { char name[128]; int cpufd; igt_assert_lt(snprintf(name, sizeof(name), "/sys/devices/system/cpu/cpu%d/online", cpu), sizeof(name)); cpufd = open(name, O_WRONLY); if (cpufd == -1) { igt_assert(cpu > 0); /* * Signal parent that we cycled through all * CPUs and we are done. */ igt_assert_eq(write(link[1], "*", 1), 1); break; } /* Offline followed by online a CPU. */ ret = write(cpufd, "0", 2); if (ret < 0) { /* * If we failed to offline a CPU we don't want * to proceed. */ igt_warn("Failed to offline cpu%u! (%d)\n", cpu, errno); igt_assert_eq(write(link[1], "s", 1), 1); break; } usleep(1e6); ret = write(cpufd, "1", 2); if (ret < 0) { /* * Failed to bring a CPU back online is fatal * for the sanity of a test run so stop further * testing. */ igt_warn("Failed to online cpu%u! (%d)\n", cpu, errno); igt_fatal_error(); } close(cpufd); cpu++; } } close(link[1]); /* * Very long batches can be declared as GPU hangs so emit shorter ones * until the CPU core shuffler finishes one loop. */ for (;;) { usleep(500e3); end_spin(gem_fd, spin[cur], 0); /* Check if the child is signaling completion. */ ret = read(link[0], &buf, 1); if ( ret == 1 || (ret < 0 && errno != EAGAIN)) break; igt_spin_free(gem_fd, spin[cur]); spin[cur] = __igt_spin_new(gem_fd, .ahnd = ahnd, .engine = I915_EXEC_DEFAULT); cur ^= 1; } val = __pmu_read_single(fd, &ts[1]) - val; end_spin(gem_fd, spin[0], FLAG_SYNC); end_spin(gem_fd, spin[1], FLAG_SYNC); igt_spin_free(gem_fd, spin[0]); igt_spin_free(gem_fd, spin[1]); igt_waitchildren(); close(fd); close(link[0]); put_ahnd(ahnd); /* Skip if child signals a problem with offlining a CPU. */ igt_skip_on(buf == 's'); assert_within_epsilon(val, ts[1] - ts[0], tolerance); } static int target_num_interrupts(int i915) { const intel_ctx_cfg_t cfg = intel_ctx_cfg_all_physical(i915); return min(gem_submission_measure(i915, &cfg, I915_EXEC_DEFAULT), 30u); } static void test_interrupts(int gem_fd) { const int target = target_num_interrupts(gem_fd); const unsigned int test_duration_ms = 1000; igt_spin_t *spin[target]; struct pollfd pfd; uint64_t idle, busy; int fence_fd; int fd; uint64_t ahnd = get_reloc_ahnd(gem_fd, 0); gem_quiescent_gpu(gem_fd); fd = open_pmu(gem_fd, I915_PMU_INTERRUPTS); /* Queue spinning batches. */ for (int i = 0; i < target; i++) { spin[i] = __igt_spin_new(gem_fd, .ahnd = ahnd, .engine = I915_EXEC_DEFAULT, .flags = IGT_SPIN_FENCE_OUT); if (i == 0) { fence_fd = spin[i]->out_fence; } else { int old_fd = fence_fd; fence_fd = sync_fence_merge(old_fd, spin[i]->out_fence); close(old_fd); } igt_assert(fence_fd >= 0); } /* Wait for idle state. */ idle = pmu_read_single(fd); do { busy = idle; usleep(1e3); idle = pmu_read_single(fd); } while (idle != busy); /* Arm batch expiration. */ for (int i = 0; i < target; i++) igt_spin_set_timeout(spin[i], (i + 1) * test_duration_ms * 1e6 / target); /* Wait for last batch to finish. */ pfd.events = POLLIN; pfd.fd = fence_fd; igt_assert_eq(poll(&pfd, 1, 2 * test_duration_ms), 1); close(fence_fd); /* Free batches. */ for (int i = 0; i < target; i++) igt_spin_free(gem_fd, spin[i]); put_ahnd(ahnd); /* Check at least as many interrupts has been generated. */ busy = pmu_read_single(fd) - idle; close(fd); igt_assert_lte(target, busy); } static void test_interrupts_sync(int gem_fd) { const int target = target_num_interrupts(gem_fd); const unsigned int test_duration_ms = 1000; igt_spin_t *spin[target]; struct pollfd pfd; uint64_t idle, busy; int fd; uint64_t ahnd = get_reloc_ahnd(gem_fd, 0); gem_quiescent_gpu(gem_fd); fd = open_pmu(gem_fd, I915_PMU_INTERRUPTS); /* Queue spinning batches. */ for (int i = 0; i < target; i++) spin[i] = __igt_spin_new(gem_fd, .ahnd = ahnd, .flags = IGT_SPIN_FENCE_OUT); /* Wait for idle state. */ idle = pmu_read_single(fd); do { busy = idle; usleep(1e3); idle = pmu_read_single(fd); } while (idle != busy); /* Process the batch queue. */ pfd.events = POLLIN; for (int i = 0; i < target; i++) { const unsigned int timeout_ms = test_duration_ms / target; pfd.fd = spin[i]->out_fence; igt_spin_set_timeout(spin[i], timeout_ms * 1e6); igt_assert_eq(poll(&pfd, 1, 2 * timeout_ms), 1); igt_spin_free(gem_fd, spin[i]); } /* Check at least as many interrupts has been generated. */ busy = pmu_read_single(fd) - idle; close(fd); put_ahnd(ahnd); igt_assert_lte(target, busy); } static void test_frequency(int gem_fd) { uint32_t min_freq, max_freq, boost_freq; uint64_t val[2], start[2], slept; double min[2], max[2]; igt_spin_t *spin; int fd[2], sysfs; uint64_t ahnd = get_reloc_ahnd(gem_fd, 0); sysfs = igt_sysfs_open(gem_fd); igt_require(sysfs >= 0); min_freq = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz"); max_freq = igt_sysfs_get_u32(sysfs, "gt_RP0_freq_mhz"); boost_freq = igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz"); igt_info("Frequency: min=%u, max=%u, boost=%u MHz\n", min_freq, max_freq, boost_freq); igt_require(min_freq > 0 && max_freq > 0 && boost_freq > 0); igt_require(max_freq > min_freq); igt_require(boost_freq > min_freq); fd[0] = open_group(gem_fd, I915_PMU_REQUESTED_FREQUENCY, -1); fd[1] = open_group(gem_fd, I915_PMU_ACTUAL_FREQUENCY, fd[0]); /* * Set GPU to min frequency and read PMU counters. */ igt_require(igt_sysfs_set_u32(sysfs, "gt_min_freq_mhz", min_freq)); igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == min_freq); igt_require(igt_sysfs_set_u32(sysfs, "gt_max_freq_mhz", min_freq)); igt_require(igt_sysfs_get_u32(sysfs, "gt_max_freq_mhz") == min_freq); igt_require(igt_sysfs_set_u32(sysfs, "gt_boost_freq_mhz", min_freq)); igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == min_freq); gem_quiescent_gpu(gem_fd); /* Idle to be sure the change takes effect */ spin = spin_sync_flags(gem_fd, ahnd, 0, I915_EXEC_DEFAULT); slept = pmu_read_multi(fd[0], 2, start); measured_usleep(batch_duration_ns / 1000); slept = pmu_read_multi(fd[0], 2, val) - slept; min[0] = 1e9*(val[0] - start[0]) / slept; min[1] = 1e9*(val[1] - start[1]) / slept; igt_spin_free(gem_fd, spin); gem_quiescent_gpu(gem_fd); /* Don't leak busy bo into the next phase */ usleep(1e6); /* * Set GPU to max frequency and read PMU counters. */ igt_require(igt_sysfs_set_u32(sysfs, "gt_max_freq_mhz", max_freq)); igt_require(igt_sysfs_get_u32(sysfs, "gt_max_freq_mhz") == max_freq); igt_require(igt_sysfs_set_u32(sysfs, "gt_boost_freq_mhz", boost_freq)); igt_require(igt_sysfs_get_u32(sysfs, "gt_boost_freq_mhz") == boost_freq); igt_require(igt_sysfs_set_u32(sysfs, "gt_min_freq_mhz", max_freq)); igt_require(igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") == max_freq); gem_quiescent_gpu(gem_fd); spin = spin_sync_flags(gem_fd, ahnd, 0, I915_EXEC_DEFAULT); slept = pmu_read_multi(fd[0], 2, start); measured_usleep(batch_duration_ns / 1000); slept = pmu_read_multi(fd[0], 2, val) - slept; max[0] = 1e9*(val[0] - start[0]) / slept; max[1] = 1e9*(val[1] - start[1]) / slept; igt_spin_free(gem_fd, spin); gem_quiescent_gpu(gem_fd); /* * Restore min/max. */ igt_sysfs_set_u32(sysfs, "gt_min_freq_mhz", min_freq); if (igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz") != min_freq) igt_warn("Unable to restore min frequency to saved value [%u MHz], now %u MHz\n", min_freq, igt_sysfs_get_u32(sysfs, "gt_min_freq_mhz")); close(fd[0]); close(fd[1]); put_ahnd(ahnd); igt_info("Min frequency: requested %.1f, actual %.1f\n", min[0], min[1]); igt_info("Max frequency: requested %.1f, actual %.1f\n", max[0], max[1]); assert_within_epsilon(min[0], min_freq, tolerance); /* * On thermally throttled devices we cannot be sure maximum frequency * can be reached so use larger tolerance downards. */ __assert_within_epsilon(max[0], max_freq, tolerance, 0.15f); } static void test_frequency_idle(int gem_fd) { uint32_t min_freq; uint64_t val[2], start[2], slept; double idle[2]; int fd[2], sysfs; sysfs = igt_sysfs_open(gem_fd); igt_require(sysfs >= 0); min_freq = igt_sysfs_get_u32(sysfs, "gt_RPn_freq_mhz"); close(sysfs); /* While parked, our convention is to report the GPU at 0Hz */ fd[0] = open_group(gem_fd, I915_PMU_REQUESTED_FREQUENCY, -1); fd[1] = open_group(gem_fd, I915_PMU_ACTUAL_FREQUENCY, fd[0]); gem_quiescent_gpu(gem_fd); /* Be idle! */ measured_usleep(2000); /* Wait for timers to cease */ slept = pmu_read_multi(fd[0], 2, start); measured_usleep(batch_duration_ns / 1000); slept = pmu_read_multi(fd[0], 2, val) - slept; close(fd[0]); close(fd[1]); idle[0] = 1e9*(val[0] - start[0]) / slept; idle[1] = 1e9*(val[1] - start[1]) / slept; igt_info("Idle frequency: requested %.1f, actual %.1f; HW min %u\n", idle[0], idle[1], min_freq); igt_assert_f(idle[0] <= min_freq, "Request frequency should be 0 while parked!\n"); igt_assert_f(idle[1] <= min_freq, "Actual frequency should be 0 while parked!\n"); } static bool wait_for_rc6(int fd, int timeout) { struct timespec tv = {}; uint64_t start, now; /* First wait for roughly an RC6 Evaluation Interval */ usleep(160 * 1000); /* Then poll for RC6 to start ticking */ now = pmu_read_single(fd); do { start = now; usleep(5000); now = pmu_read_single(fd); if (now - start > 1e6) return true; } while (igt_seconds_elapsed(&tv) <= timeout); return false; } static void test_rc6(int gem_fd, unsigned int flags) { int64_t duration_ns = 2e9; uint64_t idle, busy, prev, ts[2]; unsigned long slept; int fd, fw; gem_quiescent_gpu(gem_fd); fd = open_pmu(gem_fd, I915_PMU_RC6_RESIDENCY); if (flags & TEST_RUNTIME_PM) { drmModeRes *res; res = drmModeGetResources(gem_fd); igt_require(res); /* force all connectors off */ kmstest_set_vt_graphics_mode(); kmstest_unset_all_crtcs(gem_fd, res); drmModeFreeResources(res); igt_require(igt_setup_runtime_pm(gem_fd)); igt_require(igt_wait_for_pm_status(IGT_RUNTIME_PM_STATUS_SUSPENDED)); /* * Sleep for a bit to see if once woken up estimated RC6 hasn't * drifted to far in advance of real RC6. */ if (flags & FLAG_LONG) { pmu_read_single(fd); sleep(5); pmu_read_single(fd); } } igt_require(wait_for_rc6(fd, 1)); /* While idle check full RC6. */ prev = __pmu_read_single(fd, &ts[0]); slept = measured_usleep(duration_ns / 1000); idle = __pmu_read_single(fd, &ts[1]); igt_debug("slept=%lu perf=%"PRIu64"\n", slept, ts[1] - ts[0]); assert_within_epsilon(idle - prev, ts[1] - ts[0], tolerance); if (flags & TEST_S3) { /* * I expect that the system remains almost completely idle * across suspend, and that the time we spend with rc6 disable * for S3 is minimal. So across suspend I would expect that * the rc6 residency was almost the full monotonic time (i.e. * excluding the suspend time). * * However, in practice it appears we are not entering rc6 * immediately after resume... A bug? */ prev = __pmu_read_single(fd, &ts[0]); igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE); idle = __pmu_read_single(fd, &ts[1]); igt_debug("suspend=%"PRIu64", rc6=%"PRIu64"\n", ts[1] - ts[0], idle -prev); //assert_within_epsilon(idle - prev, ts[1] - ts[0], tolerance); } igt_assert(wait_for_rc6(fd, 5)); prev = __pmu_read_single(fd, &ts[0]); slept = measured_usleep(duration_ns / 1000); idle = __pmu_read_single(fd, &ts[1]); igt_debug("slept=%lu perf=%"PRIu64"\n", slept, ts[1] - ts[0]); assert_within_epsilon(idle - prev, ts[1] - ts[0], tolerance); /* Wake up device and check no RC6. */ fw = igt_open_forcewake_handle(gem_fd); igt_assert(fw >= 0); usleep(1e3); /* wait for the rc6 cycle counter to stop ticking */ prev = pmu_read_single(fd); usleep(duration_ns / 1000); busy = pmu_read_single(fd); close(fw); close(fd); if (flags & TEST_RUNTIME_PM) igt_restore_runtime_pm(); assert_within_epsilon(busy - prev, 0.0, tolerance); } static void test_enable_race(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e) { uint64_t config = I915_PMU_ENGINE_BUSY(e->class, e->instance); struct igt_helper_process engine_load = { }; const uint32_t bbend = MI_BATCH_BUFFER_END; struct drm_i915_gem_exec_object2 obj = { }; struct drm_i915_gem_execbuffer2 eb = { }; int fd; igt_require(gem_scheduler_has_engine_busy_stats(gem_fd)); igt_require(gem_context_has_engine(gem_fd, 0, e->flags)); obj.handle = gem_create(gem_fd, 4096); gem_write(gem_fd, obj.handle, 0, &bbend, sizeof(bbend)); eb.buffer_count = 1; eb.buffers_ptr = to_user_pointer(&obj); eb.flags = e->flags; eb.rsvd1 = ctx->id; /* * This test is probabilistic so run in a few times to increase the * chance of hitting the race. */ igt_until_timeout(10) { /* * Defeat the busy stats delayed disable, we need to guarantee * we are the first PMU user. */ gem_quiescent_gpu(gem_fd); sleep(2); /* Apply interrupt-heavy load on the engine. */ igt_fork_helper(&engine_load) { for (;;) gem_execbuf(gem_fd, &eb); } /* Wait a bit to allow engine load to start. */ usleep(500e3); /* Enable the PMU. */ fd = open_pmu(gem_fd, config); /* Stop load and close the PMU. */ igt_stop_helper(&engine_load); close(fd); } /* Cleanup. */ gem_close(gem_fd, obj.handle); gem_quiescent_gpu(gem_fd); } #define __assert_within(x, ref, tol_up, tol_down) \ igt_assert_f((double)(x) <= ((double)(ref) + (tol_up)) && \ (double)(x) >= ((double)(ref) - (tol_down)), \ "%f not within +%f/-%f of %f! ('%s' vs '%s')\n", \ (double)(x), (double)(tol_up), (double)(tol_down), \ (double)(ref), #x, #ref) #define assert_within(x, ref, tolerance) \ __assert_within(x, ref, tolerance, tolerance) static void accuracy(int gem_fd, const intel_ctx_t *ctx, const struct intel_execution_engine2 *e, unsigned long target_busy_pct, unsigned long target_iters) { const unsigned long min_test_us = 1e6; unsigned long pwm_calibration_us; unsigned long test_us; unsigned long cycle_us, busy_us, idle_us; double busy_r, expected; uint64_t val[2]; uint64_t ts[2]; int link[2]; int fd; /* Sampling platforms cannot reach the high accuracy criteria. */ igt_require(gem_scheduler_has_engine_busy_stats(gem_fd)); /* Aim for approximately 100 iterations for calibration */ cycle_us = min_test_us / target_iters; busy_us = cycle_us * target_busy_pct / 100; idle_us = cycle_us - busy_us; while (idle_us < 2500 || busy_us < 2500) { busy_us *= 2; idle_us *= 2; } cycle_us = busy_us + idle_us; pwm_calibration_us = target_iters * cycle_us / 2; test_us = target_iters * cycle_us; igt_info("calibration=%lums, test=%lums, cycle=%lums; ratio=%.2f%% (%luus/%luus)\n", pwm_calibration_us / 1000, test_us / 1000, cycle_us / 1000, (double)busy_us / cycle_us * 100.0, busy_us, idle_us); assert_within_epsilon((double)busy_us / cycle_us, (double)target_busy_pct / 100.0, tolerance); igt_assert(pipe(link) == 0); /* Emit PWM pattern on the engine from a child. */ igt_fork(child, 1) { const unsigned long timeout[] = { pwm_calibration_us * 1000, test_us * 1000 }; uint64_t total_busy_ns = 0, total_ns = 0; igt_spin_t *spin; uint64_t ahnd; intel_allocator_init(); ahnd = get_reloc_ahnd(gem_fd, 0); /* Allocate our spin batch and idle it. */ spin = igt_spin_new(gem_fd, .ahnd = ahnd, .ctx = ctx, .engine = e->flags); igt_spin_end(spin); gem_sync(gem_fd, spin->handle); /* 1st pass is calibration, second pass is the test. */ for (int pass = 0; pass < ARRAY_SIZE(timeout); pass++) { unsigned int target_idle_us = idle_us; struct timespec start = { }; uint64_t busy_ns = 0; unsigned long pass_ns = 0; double avg = 0.0, var = 0.0; unsigned int n = 0; igt_nsec_elapsed(&start); do { unsigned long loop_ns, loop_busy; struct timespec _ts = { }; double err, tmp; uint64_t now; /* PWM idle sleep. */ _ts.tv_nsec = target_idle_us * 1000; nanosleep(&_ts, NULL); /* Restart the spinbatch. */ igt_spin_reset(spin); __submit_spin(gem_fd, spin, e, 0); /* PWM busy sleep. */ loop_busy = igt_nsec_elapsed(&start); _ts.tv_nsec = busy_us * 1000; nanosleep(&_ts, NULL); igt_spin_end(spin); /* Time accounting. */ now = igt_nsec_elapsed(&start); loop_busy = now - loop_busy; loop_ns = now - pass_ns; pass_ns = now; busy_ns += loop_busy; total_busy_ns += loop_busy; total_ns += loop_ns; /* Re-calibrate. */ err = (double)total_busy_ns / total_ns - (double)target_busy_pct / 100.0; target_idle_us = (double)target_idle_us * (1.0 + err); /* Running average and variance for debug. */ err = 100.0 * total_busy_ns / total_ns; tmp = avg; avg += (err - avg) / ++n; var += (err - avg) * (err - tmp); } while (pass_ns < timeout[pass]); pass_ns = igt_nsec_elapsed(&start); expected = (double)busy_ns / pass_ns; igt_info("%u: %d cycles, busy %"PRIu64"us, idle %"PRIu64"us -> %.2f%% (target: %lu%%; average=%.2f±%.3f%%)\n", pass, n, busy_ns / 1000, (pass_ns - busy_ns) / 1000, 100 * expected, target_busy_pct, avg, sqrt(var / n)); write(link[1], &expected, sizeof(expected)); } igt_spin_free(gem_fd, spin); put_ahnd(ahnd); } fd = open_pmu(gem_fd, I915_PMU_ENGINE_BUSY(e->class, e->instance)); /* Let the child run. */ read(link[0], &expected, sizeof(expected)); assert_within(100.0 * expected, target_busy_pct, 5); /* Collect engine busyness for an interesting part of child runtime. */ val[0] = __pmu_read_single(fd, &ts[0]); read(link[0], &expected, sizeof(expected)); val[1] = __pmu_read_single(fd, &ts[1]); close(fd); close(link[1]); close(link[0]); igt_waitchildren(); busy_r = (double)(val[1] - val[0]) / (ts[1] - ts[0]); igt_info("error=%.2f%% (%.2f%% vs %.2f%%)\n", (busy_r - expected) * 100, 100 * busy_r, 100 * expected); assert_within(100.0 * busy_r, 100.0 * expected, 2); } static void *create_mmap(int gem_fd, const struct mmap_offset *t, int sz) { uint32_t handle; void *ptr; handle = gem_create(gem_fd, sz); ptr = __gem_mmap_offset(gem_fd, handle, 0, sz, PROT_WRITE, t->type); gem_close(gem_fd, handle); return ptr; } static void faulting_read(int gem_fd, const struct mmap_offset *t) { void *ptr; int fd; /* * Trigger a pagefault within the perf read() so that we can * teach lockdep about the potential chains. */ ptr = create_mmap(gem_fd, t, 4096); igt_require(ptr != NULL); fd = open_pmu(gem_fd, I915_PMU_ENGINE_BUSY(0, 0)); igt_require(fd != -1); igt_assert_eq(read(fd, ptr, 4096), 2 * sizeof(uint64_t)); close(fd); munmap(ptr, 4096); } static void test_unload(unsigned int num_engines) { igt_fork(child, 1) { intel_ctx_cfg_t cfg; const struct intel_execution_engine2 *e; int fd[4 + num_engines * 3], i; uint64_t *buf; int count = 0, ret; char *who = NULL; int i915; i915 = __drm_open_driver(DRIVER_INTEL); igt_debug("Opening perf events\n"); fd[count] = open_group(i915, I915_PMU_INTERRUPTS, -1); if (fd[count] != -1) count++; fd[count] = perf_i915_open_group(i915, I915_PMU_REQUESTED_FREQUENCY, fd[count - 1]); if (fd[count] != -1) count++; fd[count] = perf_i915_open_group(i915, I915_PMU_ACTUAL_FREQUENCY, fd[count - 1]); if (fd[count] != -1) count++; cfg = intel_ctx_cfg_all_physical(i915); for_each_ctx_cfg_engine(i915, &cfg, e) { fd[count] = perf_i915_open_group(i915, I915_PMU_ENGINE_BUSY(e->class, e->instance), fd[count - 1]); if (fd[count] != -1) count++; fd[count] = perf_i915_open_group(i915, I915_PMU_ENGINE_SEMA(e->class, e->instance), fd[count - 1]); if (fd[count] != -1) count++; fd[count] = perf_i915_open_group(i915, I915_PMU_ENGINE_WAIT(e->class, e->instance), fd[count - 1]); if (fd[count] != -1) count++; } fd[count] = perf_i915_open_group(i915, I915_PMU_RC6_RESIDENCY, fd[count - 1]); if (fd[count] != -1) count++; close(i915); buf = calloc(count, sizeof(uint64_t)); igt_assert(buf); igt_debug("Read %d events from perf and trial unload\n", count); pmu_read_multi(fd[0], count, buf); ret = __igt_i915_driver_unload(&who); igt_assert(ret != 0 && !strcmp(who, "i915")); free(who); pmu_read_multi(fd[0], count, buf); igt_debug("Close perf\n"); for (i = 0; i < count; i++) close(fd[i]); free(buf); } igt_waitchildren(); igt_debug("Final unload\n"); igt_assert_eq(__igt_i915_driver_unload(NULL), 0); } #define test_each_engine(T, i915, ctx, e) \ igt_subtest_with_dynamic(T) for_each_ctx_engine(i915, ctx, e) \ igt_dynamic_f("%s", e->name) #define test_each_rcs(T, i915, ctx, e) \ igt_subtest_with_dynamic(T) for_each_ctx_engine(i915, ctx, e) \ for_each_if((e)->class == I915_ENGINE_CLASS_RENDER) \ igt_dynamic_f("%s", e->name) igt_main { const struct intel_execution_engine2 *e; unsigned int num_engines = 0; const intel_ctx_t *ctx = NULL; int fd = -1; /** * All PMU should be accompanied by a test. * * Including all the I915_PMU_OTHER(x). */ igt_fixture { fd = __drm_open_driver(DRIVER_INTEL); igt_require_gem(fd); igt_require(i915_perf_type_id(fd) > 0); ctx = intel_ctx_create_all_physical(fd); for_each_ctx_engine(fd, ctx, e) num_engines++; igt_require(num_engines); } /** * Test invalid access via perf API is rejected. */ igt_subtest("invalid-init") invalid_init(fd); /** * Double check the invalid metric does fail. */ igt_subtest("invalid-open") open_invalid(fd); igt_subtest_with_dynamic("faulting-read") { for_each_mmap_offset_type(fd, t) { igt_dynamic_f("%s", t->name) faulting_read(fd, t); } } /** * Test that a single engine metric can be initialized or it * is correctly rejected. */ test_each_engine("init-busy", fd, ctx, e) init(fd, ctx, e, I915_SAMPLE_BUSY); test_each_engine("init-wait", fd, ctx, e) init(fd, ctx, e, I915_SAMPLE_WAIT); test_each_engine("init-sema", fd, ctx, e) init(fd, ctx, e, I915_SAMPLE_SEMA); /** * Test that engines show no load when idle. */ test_each_engine("idle", fd, ctx, e) single(fd, ctx, e, 0); /** * Test that a single engine reports load correctly. */ test_each_engine("busy", fd, ctx, e) single(fd, ctx, e, TEST_BUSY); test_each_engine("busy-idle", fd, ctx, e) single(fd, ctx, e, TEST_BUSY | TEST_TRAILING_IDLE); /** * Test that when one engine is loaded other report no * load. */ test_each_engine("busy-check-all", fd, ctx, e) busy_check_all(fd, ctx, e, num_engines, TEST_BUSY); test_each_engine("busy-idle-check-all", fd, ctx, e) busy_check_all(fd, ctx, e, num_engines, TEST_BUSY | TEST_TRAILING_IDLE); /** * Test that when all except one engine are loaded all * loads are correctly reported. */ test_each_engine("most-busy-check-all", fd, ctx, e) most_busy_check_all(fd, ctx, e, num_engines, TEST_BUSY); test_each_engine("most-busy-idle-check-all", fd, ctx, e) most_busy_check_all(fd, ctx, e, num_engines, TEST_BUSY | TEST_TRAILING_IDLE); /** * Test that semphore counters report no activity on * idle or busy engines. */ test_each_engine("idle-no-semaphores", fd, ctx, e) no_sema(fd, ctx, e, 0); test_each_engine("busy-no-semaphores", fd, ctx, e) no_sema(fd, ctx, e, TEST_BUSY); test_each_engine("busy-idle-no-semaphores", fd, ctx, e) no_sema(fd, ctx, e, TEST_BUSY | TEST_TRAILING_IDLE); /** * Test that semaphore waits are correctly reported. */ test_each_engine("semaphore-wait", fd, ctx, e) sema_wait(fd, ctx, e, TEST_BUSY); test_each_engine("semaphore-wait-idle", fd, ctx, e) sema_wait(fd, ctx, e, TEST_BUSY | TEST_TRAILING_IDLE); test_each_engine("semaphore-busy", fd, ctx, e) sema_busy(fd, ctx, e, 0); /** * Check that two perf clients do not influence each * others observations. */ test_each_engine("multi-client", fd, ctx, e) multi_client(fd, ctx, e); /** * Check that reported usage is correct when PMU is * enabled after the batch is running. */ test_each_engine("busy-start", fd, ctx, e) busy_start(fd, ctx, e); /** * Check that reported usage is correct when PMU is * enabled after two batches are running. */ igt_subtest_group { igt_fixture gem_require_contexts(fd); test_each_engine("busy-double-start", fd, ctx, e) busy_double_start(fd, ctx, e); } /** * Check that the PMU can be safely enabled in face of * interrupt-heavy engine load. */ test_each_engine("enable-race", fd, ctx, e) test_enable_race(fd, ctx, e); igt_subtest_group { const unsigned int pct[] = { 2, 50, 98 }; /** * Check engine busyness accuracy is as expected. */ for (unsigned int i = 0; i < ARRAY_SIZE(pct); i++) { igt_subtest_with_dynamic_f("busy-accuracy-%u", pct[i]) { for_each_ctx_engine(fd, ctx, e) { igt_dynamic_f("%s", e->name) accuracy(fd, ctx, e, pct[i], 10); } } } } test_each_engine("busy-hang", fd, ctx, e) { igt_hang_t hang = igt_allow_hang(fd, ctx->id, 0); single(fd, ctx, e, TEST_BUSY | FLAG_HANG); igt_disallow_hang(fd, hang); } /** * Test that event waits are correctly reported. */ test_each_rcs("event-wait", fd, ctx, e) event_wait(fd, ctx, e); /** * Test that when all engines are loaded all loads are * correctly reported. */ igt_subtest("all-busy-check-all") all_busy_check_all(fd, ctx, num_engines, TEST_BUSY); igt_subtest("all-busy-idle-check-all") all_busy_check_all(fd, ctx, num_engines, TEST_BUSY | TEST_TRAILING_IDLE); /** * Test counters are not affected by CPU offline/online events. */ igt_subtest("cpu-hotplug") cpu_hotplug(fd); /** * Test GPU frequency. */ igt_subtest("frequency") test_frequency(fd); igt_subtest("frequency-idle") test_frequency_idle(fd); /** * Test interrupt count reporting. */ igt_subtest("interrupts") test_interrupts(fd); igt_subtest("interrupts-sync") test_interrupts_sync(fd); /** * Test RC6 residency reporting. */ igt_subtest("rc6") test_rc6(fd, 0); igt_subtest("rc6-runtime-pm") test_rc6(fd, TEST_RUNTIME_PM); igt_subtest("rc6-runtime-pm-long") test_rc6(fd, TEST_RUNTIME_PM | FLAG_LONG); igt_subtest("rc6-suspend") test_rc6(fd, TEST_S3); /** * Test GT wakeref tracking (similar to RC0, opposite of RC6) */ igt_subtest("gt-awake") test_awake(fd, ctx); /** * Check render nodes are counted. */ igt_subtest_group { int render_fd = -1; const intel_ctx_t *render_ctx = NULL; igt_fixture { render_fd = __drm_open_driver_render(DRIVER_INTEL); igt_require_gem(render_fd); render_ctx = intel_ctx_create_all_physical(render_fd); gem_quiescent_gpu(fd); } test_each_engine("render-node-busy", render_fd, render_ctx, e) single(render_fd, render_ctx, e, TEST_BUSY); test_each_engine("render-node-busy-idle", render_fd, render_ctx, e) single(render_fd, render_ctx, e, TEST_BUSY | TEST_TRAILING_IDLE); igt_fixture { intel_ctx_destroy(render_fd, render_ctx); close(render_fd); } } igt_fixture { intel_ctx_destroy(fd, ctx); close(fd); } igt_subtest("module-unload") { igt_require(igt_i915_driver_unload() == 0); for (int pass = 0; pass < 3; pass++) test_unload(num_engines); } }