/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2016 Intel Corporation
 */

#include "config.h"

#include <pthread.h>
#include <sys/poll.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <sys/syscall.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

#include "sync_file.h"

#include "i915/gem.h"
#include "i915/gem_create.h"
#include "igt.h"
#include "igt_rand.h"
#include "igt_rapl.h"
#include "igt_sysfs.h"
#include "igt_syncobj.h"
#include "igt_vgem.h"
#include "ioctl_wrappers.h"
#include "sw_sync.h"

IGT_TEST_DESCRIPTION("Check that GPU time and execution order is fairly distributed across clients");

#define NSEC64 ((uint64_t)NSEC_PER_SEC)

static int has_secure_batches(int i915)
{
	int v = -1;
	drm_i915_getparam_t gp = {
		.param = I915_PARAM_HAS_SECURE_BATCHES,
		.value = &v,
	};

	drmIoctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);

	return v > 0;
}

static bool has_mi_math(int i915, const struct intel_execution_engine2 *e)
{
	uint32_t devid = intel_get_drm_devid(i915);

	if (intel_gen(devid) >= 8)
		return true;

	if (!IS_HASWELL(devid))
		return false;

	if (!has_secure_batches(i915))
		return false;

	return e == NULL || e->class == I915_ENGINE_CLASS_RENDER;
}

static unsigned int offset_in_page(void *addr)
{
	return (uintptr_t)addr & 4095;
}

static uint32_t __batch_create(int i915, uint32_t offset)
{
	const uint32_t bbe = MI_BATCH_BUFFER_END;
	uint32_t handle;

	handle = gem_create(i915, ALIGN(offset + 4, 4096));
	gem_write(i915, handle, offset, &bbe, sizeof(bbe));

	return handle;
}

static uint32_t batch_create(int i915)
{
	return __batch_create(i915, 0);
}

static int read_timestamp_frequency(int i915)
{
	int value = 0;
	drm_i915_getparam_t gp = {
		.value = &value,
		.param = I915_PARAM_CS_TIMESTAMP_FREQUENCY,
	};
	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
	return value;
}

static uint64_t div64_u64_round_up(uint64_t x, uint64_t y)
{
	return (x + y - 1) / y;
}

static uint64_t ns_to_ctx_ticks(int i915, uint64_t ns)
{
	int f = read_timestamp_frequency(i915);
	if (intel_gen(intel_get_drm_devid(i915)) == 11)
		f = 12500000; /* gen11!!! are you feeling alright? CTX vs CS */
	return div64_u64_round_up(ns * f, NSEC64);
}

static uint64_t ticks_to_ns(int i915, uint64_t ticks)
{
	return div64_u64_round_up(ticks * NSEC64,
				  read_timestamp_frequency(i915));
}

static void delay(int i915,
		  const struct intel_execution_engine2 *e,
		  uint32_t handle,
		  uint64_t addr,
		  uint64_t ns)
{
	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
	const uint32_t base = gem_engine_mmio_base(i915, e->name);
	const uint32_t runtime = base + (use_64b ? 0x3a8 : 0x358);
#define CS_GPR(x) (base + 0x600 + 8 * (x))
	enum { START_TS, NOW_TS };
	uint32_t *map, *cs, *jmp;

	igt_require(base);
	igt_assert(use_64b || (addr >> 32) == 0);

	/* Loop until CTX_TIMESTAMP - initial > @ns */

	cs = map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);

	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(START_TS) + 4;
	*cs++ = 0;
	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = runtime;
	*cs++ = CS_GPR(START_TS);

	while (offset_in_page(cs) & 63)
		*cs++ = 0;
	jmp = cs;

	*cs++ = 0x5 << 23; /* MI_ARB_CHECK */

	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(NOW_TS) + 4;
	*cs++ = 0;
	*cs++ = MI_LOAD_REGISTER_REG;
	*cs++ = runtime;
	*cs++ = CS_GPR(NOW_TS);

	/* delta = now - start; inverted to match COND_BBE */
	*cs++ = MI_MATH(4);
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(NOW_TS));
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(START_TS));
	*cs++ = MI_MATH_SUB;
	*cs++ = MI_MATH_STOREINV(MI_MATH_REG(NOW_TS), MI_MATH_REG_ACCU);

	/* Save delta for reading by COND_BBE */
	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
	*cs++ = CS_GPR(NOW_TS);
	*cs++ = addr + 4000;
	*cs++ = addr >> 32;

	/* Delay between SRM and COND_BBE to post the writes */
	for (int n = 0; n < 8; n++) {
		*cs++ = MI_STORE_DWORD_IMM;
		if (use_64b) {
			*cs++ = addr + 4064;
			*cs++ = addr >> 32;
		} else {
			*cs++ = 0;
			*cs++ = addr + 4064;
		}
		*cs++ = 0;
	}

	/* Break if delta [time elapsed] > ns */
	*cs++ = MI_COND_BATCH_BUFFER_END | MI_DO_COMPARE | (1 + use_64b);
	*cs++ = ~ns_to_ctx_ticks(i915, ns);
	*cs++ = addr + 4000;
	*cs++ = addr >> 32;

	/* Otherwise back to recalculating delta */
	*cs++ = MI_BATCH_BUFFER_START | 1 << 8 | use_64b;
	*cs++ = addr + offset_in_page(jmp);
	*cs++ = addr >> 32;

	munmap(map, 4096);
}

static struct drm_i915_gem_exec_object2
delay_create(int i915, const intel_ctx_t *ctx,
	     const struct intel_execution_engine2 *e,
	     uint64_t target_ns)
{
	struct drm_i915_gem_exec_object2 obj = {
		.handle = batch_create(i915),
		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = to_user_pointer(&obj),
		.buffer_count = 1,
		.rsvd1 = ctx->id,
		.flags = e->flags,
	};

	obj.offset = obj.handle << 12;
	gem_execbuf(i915, &execbuf);
	gem_sync(i915, obj.handle);

	delay(i915, e, obj.handle, obj.offset, target_ns);

	obj.flags |= EXEC_OBJECT_PINNED;
	return obj;
}

static void tslog(int i915,
		  const struct intel_execution_engine2 *e,
		  uint32_t handle,
		  uint64_t addr)
{
	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
	const uint32_t base = gem_engine_mmio_base(i915, e->name);
#define CS_GPR(x) (base + 0x600 + 8 * (x))
#define CS_TIMESTAMP (base + 0x358)
	enum { INC, MASK, ADDR };
	uint32_t *timestamp_lo, *addr_lo;
	uint32_t *map, *cs;

	igt_require(base);
	igt_assert(use_64b || (addr >> 32) == 0);

	map = gem_mmap__device_coherent(i915, handle, 0, 4096, PROT_WRITE);
	cs = map + 512;

	/* Record the current CS_TIMESTAMP into a journal [a 512 slot ring]. */
	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
	*cs++ = CS_TIMESTAMP;
	timestamp_lo = cs;
	*cs++ = addr;
	*cs++ = addr >> 32;

	/* Load the address + inc & mask variables */
	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(ADDR);
	addr_lo = cs;
	*cs++ = addr;
	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(ADDR) + 4;
	*cs++ = addr >> 32;

	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(INC);
	*cs++ = 4;
	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(INC) + 4;
	*cs++ = 0;

	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(MASK);
	*cs++ = 0xfffff7ff;
	*cs++ = MI_LOAD_REGISTER_IMM;
	*cs++ = CS_GPR(MASK) + 4;
	*cs++ = 0xffffffff;

	/* Increment the [ring] address for saving CS_TIMESTAMP */
	*cs++ = MI_MATH(8);
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(INC));
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(ADDR));
	*cs++ = MI_MATH_ADD;
	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCA, MI_MATH_REG(ADDR));
	*cs++ = MI_MATH_LOAD(MI_MATH_REG_SRCB, MI_MATH_REG(MASK));
	*cs++ = MI_MATH_AND;
	*cs++ = MI_MATH_STORE(MI_MATH_REG(ADDR), MI_MATH_REG_ACCU);

	/* Rewrite the batch buffer for the next execution */
	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
	*cs++ = CS_GPR(ADDR);
	*cs++ = addr + offset_in_page(timestamp_lo);
	*cs++ = addr >> 32;
	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
	*cs++ = CS_GPR(ADDR);
	*cs++ = addr + offset_in_page(addr_lo);
	*cs++ = addr >> 32;

	*cs++ = MI_BATCH_BUFFER_END;

	munmap(map, 4096);
}

static struct drm_i915_gem_exec_object2
tslog_create(int i915, const intel_ctx_t *ctx,
	     const struct intel_execution_engine2 *e)
{
	struct drm_i915_gem_exec_object2 obj = {
		.handle = batch_create(i915),
		.flags = EXEC_OBJECT_SUPPORTS_48B_ADDRESS,
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = to_user_pointer(&obj),
		.buffer_count = 1,
		.rsvd1 = ctx->id,
		.flags = e->flags,
	};

	obj.offset = obj.handle << 12;
	gem_execbuf(i915, &execbuf);
	gem_sync(i915, obj.handle);

	tslog(i915, e, obj.handle, obj.offset);

	obj.flags |= EXEC_OBJECT_PINNED;
	return obj;
}

static int cmp_u32(const void *A, const void *B)
{
	const uint32_t *a = A, *b = B;

	if (*a < *b)
		return -1;
	else if (*a > *b)
		return 1;
	else
		return 0;
}

static uint32_t
read_ctx_timestamp(int i915, const intel_ctx_t *ctx,
		   const struct intel_execution_engine2 *e)
{
	struct drm_i915_gem_relocation_entry reloc;
	struct drm_i915_gem_exec_object2 obj = {
		.handle = gem_create(i915, 4096),
		.offset = 32 << 20,
		.relocs_ptr = to_user_pointer(&reloc),
		.relocation_count = 1,
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = to_user_pointer(&obj),
		.buffer_count = 1,
		.rsvd1 = ctx->id,
		.flags = e->flags,
	};
	const int use_64b = intel_gen(intel_get_drm_devid(i915)) >= 8;
	const uint32_t base = gem_engine_mmio_base(i915, e->name);
	const uint32_t runtime = base + (use_64b ? 0x3a8 : 0x358);
	uint32_t *map, *cs;
	uint32_t ts;
	bool has_relocs = gem_has_relocations(i915);

	cs = map = gem_mmap__device_coherent(i915, obj.handle,
					     0, 4096, PROT_WRITE);

	*cs++ = 0x24 << 23 | (1 + use_64b); /* SRM */
	*cs++ = runtime;

	memset(&reloc, 0, sizeof(reloc));
	reloc.target_handle = obj.handle;
	reloc.presumed_offset = obj.offset;
	reloc.offset = offset_in_page(cs);
	reloc.delta = 4000;
	*cs++ = obj.offset + 4000;
	*cs++ = obj.offset >> 32;

	*cs++ = MI_BATCH_BUFFER_END;

	if (!has_relocs) {
		obj.relocation_count = 0;
		obj.flags |= EXEC_OBJECT_PINNED;
	}

	gem_execbuf(i915, &execbuf);
	gem_sync(i915, obj.handle);
	ts = map[1000];

	if (!ts) {
		/* Twice for good luck (and avoid chance 0) */
		gem_execbuf(i915, &execbuf);
		gem_sync(i915, obj.handle);
		ts = map[1000];
	}

	gem_close(i915, obj.handle);
	munmap(map, 4096);

	return ts;
}

static bool has_ctx_timestamp(int i915, const intel_ctx_cfg_t *cfg,
			      const struct intel_execution_engine2 *e)
{
	const int gen = intel_gen(intel_get_drm_devid(i915));
	const intel_ctx_t *tmp_ctx;
	uint32_t timestamp;

	if (gen == 8 && e->class == I915_ENGINE_CLASS_VIDEO)
		return false; /* looks fubar */

	tmp_ctx = intel_ctx_create(i915, cfg);
	timestamp = read_ctx_timestamp(i915, tmp_ctx, e);
	intel_ctx_destroy(i915, tmp_ctx);

	return timestamp;
}

static struct intel_execution_engine2
pick_random_engine(int i915, const intel_ctx_cfg_t *cfg,
		   const struct intel_execution_engine2 *not)
{
	const struct intel_execution_engine2 *e;
	unsigned int count = 0;

	for_each_ctx_cfg_engine(i915, cfg, e) {
		if (e->flags == not->flags)
			continue;
		if (!gem_class_has_mutable_submission(i915, e->class))
			continue;
		count++;
	}
	if (!count)
		return *not;

	count = rand() % count;
	for_each_ctx_cfg_engine(i915, cfg, e) {
		if (e->flags == not->flags)
			continue;
		if (!gem_class_has_mutable_submission(i915, e->class))
			continue;
		if (!count--)
			break;
	}

	return *e;
}

static void fair_child(int i915, const intel_ctx_t *ctx,
		       const struct intel_execution_engine2 *e,
		       uint64_t frame_ns,
		       int timeline,
		       uint32_t common,
		       unsigned int flags,
		       unsigned long *ctl,
		       unsigned long *median,
		       unsigned long *iqr,
		       int sv, int rv)
#define F_SYNC		(1 << 0)
#define F_PACE		(1 << 1)
#define F_FLOW		(1 << 2)
#define F_HALF		(1 << 3)
#define F_SOLO		(1 << 4)
#define F_SPARE		(1 << 5)
#define F_NEXT		(1 << 6)
#define F_VIP		(1 << 7)
#define F_RRUL		(1 << 8)
#define F_SHARE		(1 << 9)
#define F_PING		(1 << 10)
#define F_THROTTLE	(1 << 11)
#define F_ISOLATE	(1 << 12)
{
	const int batches_per_frame = flags & F_SOLO ? 1 : 3;
	struct drm_i915_gem_exec_object2 obj[4] = {
		{},
		{
			.handle = common ?: gem_create(i915, 4096),
		},
		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
		delay_create(i915, ctx, e, frame_ns / batches_per_frame),
	};
	struct intel_execution_engine2 ping = *e;
	int p_fence = -1, n_fence = -1;
	unsigned long count = 0;
	unsigned int aux_flags;
	int n;

	srandom(getpid());
	if (flags & F_PING)
		ping = pick_random_engine(i915, &ctx->cfg, e);
	obj[0] = tslog_create(i915, ctx, &ping);

	/* Synchronize with other children/parent upon construction */
	if (sv != -1)
		write(sv, &p_fence, sizeof(p_fence));
	if (rv != -1)
		read(rv, &p_fence, sizeof(p_fence));
	igt_assert(p_fence == -1);

	aux_flags = 0;
	if (intel_gen(intel_get_drm_devid(i915)) < 8)
		aux_flags = I915_EXEC_SECURE;
	ping.flags |= aux_flags;
	aux_flags |= e->flags;

	while (!READ_ONCE(*ctl)) {
		struct drm_i915_gem_execbuffer2 execbuf = {
			.buffers_ptr = to_user_pointer(obj),
			.buffer_count = 3,
			.rsvd1 = ctx->id,
			.rsvd2 = -1,
			.flags = aux_flags,
		};

		if (flags & F_FLOW) {
			unsigned int seq;

			seq = count;
			if (flags & F_NEXT)
				seq++;

			execbuf.rsvd2 =
				sw_sync_timeline_create_fence(timeline, seq);
			execbuf.flags |= I915_EXEC_FENCE_IN;
		}

		execbuf.flags |= I915_EXEC_FENCE_OUT;
		gem_execbuf_wr(i915, &execbuf);
		n_fence = execbuf.rsvd2 >> 32;
		execbuf.flags &= ~(I915_EXEC_FENCE_OUT | I915_EXEC_FENCE_IN);
		for (n = 1; n < batches_per_frame; n++)
			gem_execbuf(i915, &execbuf);
		close(execbuf.rsvd2);

		execbuf.buffer_count = 1;
		execbuf.batch_start_offset = 2048;
		execbuf.flags = ping.flags | I915_EXEC_FENCE_IN;
		execbuf.rsvd2 = n_fence;
		gem_execbuf(i915, &execbuf);

		if (flags & F_PACE && p_fence != -1) {
			struct pollfd pfd = {
				.fd = p_fence,
				.events = POLLIN,
			};
			poll(&pfd, 1, -1);
		}
		close(p_fence);

		if (flags & F_SYNC) {
			struct pollfd pfd = {
				.fd = n_fence,
				.events = POLLIN,
			};
			poll(&pfd, 1, -1);
		}

		if (flags & F_THROTTLE)
			igt_ioctl(i915, DRM_IOCTL_I915_GEM_THROTTLE, 0);

		igt_swap(obj[2], obj[3]);
		igt_swap(p_fence, n_fence);
		count++;
	}
	close(p_fence);

	gem_close(i915, obj[3].handle);
	gem_close(i915, obj[2].handle);
	if (obj[1].handle != common)
		gem_close(i915, obj[1].handle);

	gem_sync(i915, obj[0].handle);
	if (median) {
		uint32_t *map;

		/*
		 * We recorded the CS_TIMESTAMP of each frame, and if
		 * the GPU is being shared completely fairly, we expect
		 * each frame to be at the same interval from the last.
		 *
		 * Compute the interval between frames and report back
		 * both the median interval and the range for this client.
		 */

		map = gem_mmap__device_coherent(i915, obj[0].handle,
						0, 4096, PROT_WRITE);
		igt_assert(map[0]);
		for (n = 1; n < min(count, 512ul); n++) {
			igt_assert(map[n]);
			map[n - 1] = map[n] - map[n - 1];
		}
		qsort(map, --n, sizeof(*map), cmp_u32);
		*iqr = ticks_to_ns(i915, map[(3 * n + 3) / 4] - map[n / 4]);
		*median = ticks_to_ns(i915, map[n / 2]);
		munmap(map, 4096);
	}
	gem_close(i915, obj[0].handle);
}

static int cmp_ul(const void *A, const void *B)
{
	const unsigned long *a = A, *b = B;

	if (*a < *b)
		return -1;
	else if (*a > *b)
		return 1;
	else
		return 0;
}

static uint64_t d_cpu_time(const struct rusage *a, const struct rusage *b)
{
	uint64_t cpu_time = 0;

	cpu_time += (a->ru_utime.tv_sec - b->ru_utime.tv_sec) * NSEC64;
	cpu_time += (a->ru_utime.tv_usec - b->ru_utime.tv_usec) * 1000;

	cpu_time += (a->ru_stime.tv_sec - b->ru_stime.tv_sec) * NSEC64;
	cpu_time += (a->ru_stime.tv_usec - b->ru_stime.tv_usec) * 1000;

	return cpu_time;
}

static void timeline_advance(int timeline, int delay_ns)
{
	struct timespec tv = { .tv_nsec = delay_ns };
	nanosleep(&tv, NULL);
	sw_sync_timeline_inc(timeline, 1);
}

static void fairness(int i915, const intel_ctx_cfg_t *cfg,
		     const struct intel_execution_engine2 *e,
		     int duration, unsigned int flags)
{
	const int frame_ns = 16666 * 1000;
	const int fence_ns = flags & F_HALF ? 2 * frame_ns : frame_ns;
	unsigned long *result, *iqr;
	uint32_t common = 0;
	struct {
		int child[2];
		int parent[2];
	} lnk;

	igt_require(has_ctx_timestamp(i915, cfg, e));
	igt_require(gem_class_has_mutable_submission(i915, e->class));
	if (flags & (F_ISOLATE | F_PING))
		igt_require(intel_gen(intel_get_drm_devid(i915)) >= 8);

	igt_assert(pipe(lnk.child) == 0);
	igt_assert(pipe(lnk.parent) == 0);

	if (flags & F_SHARE)
		common = gem_create(i915, 4095);

	result = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
	igt_assert(result != MAP_FAILED);
	iqr = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
	igt_assert(iqr != MAP_FAILED);

	/*
	 * The combined workload always runs at a 60fps target (unless F_HALF!).
	 * This gives a frame of interval of 16ms that is evenly split across
	 * all the clients, so simulating a system with a bunch of clients that
	 * are perfectly balanced and can sustain 60fps. Our job is to ensure
	 * that each client does run at a smooth 60fps.
	 *
	 * Each client runs a fixed length delay loop (as a single request,
	 * or split into 3) and then records the CS_TIMESTAMP after completing
	 * its delay. Given a fair allotment of GPU time to each client,
	 * that timestamp will [ideally] be at a precise 16ms intervals.
	 * In practice, time is wasted on context switches, so as the number
	 * of clients increases, the proprotion of time spent on context
	 * switches grows. As we get to 64 render clients, we will be spending
	 * as much time in context switches as executing the client workloads.
	 *
	 * Each client frame may be paced by some throttling technique found
	 * in the wild. i.e. each client may wait until a simulated vblank
	 * to indicate the start of a new frame, or it may wait until the
	 * completion of a previous frame. This causes submission from each
	 * client and across the system to be chunky and uneven.
	 *
	 * We look at the variation of frame intervals within each client, and
	 * the variation of the medians across the clients to see if the
	 * distribution (budget) of GPU time was fair enough.
	 *
	 * Alternative (and important) metrics will be more latency centric;
	 * looking at how well we can sustain meeting deadline given competition
	 * by clients for the GPU.
	 */

	for (int n = 2; n <= 256; n <<= 1) { /* 32 == 500us per client */
		int timeline = sw_sync_timeline_create();
		int nfences = duration * NSEC64 / fence_ns + 1;
		int nchild = n - 1; /* odd for easy medians */
		const int child_ns = frame_ns / (nchild + !!(flags & F_SPARE));
		const int lo = nchild / 4;
		const int hi = (3 * nchild + 3) / 4 - 1;
		struct rusage old_usage, usage;
		uint64_t cpu_time, d_time;
		struct timespec tv;
		struct igt_mean m;

		memset(result, 0, (nchild + 1) * sizeof(result[0]));

		if (flags & F_PING) { /* fill the others with light bg load */
			struct intel_execution_engine2 *ping;

			for_each_ctx_cfg_engine(i915, cfg, ping) {
				if (ping->flags == e->flags)
					continue;

				igt_fork(child, 1) {
					const intel_ctx_t *ctx = intel_ctx_create(i915, cfg);

					fair_child(i915, ctx, ping,
						   child_ns / 8,
						   -1, common,
						   F_SOLO | F_PACE | F_SHARE,
						   &result[nchild],
						   NULL, NULL, -1, -1);

					intel_ctx_destroy(i915, ctx);
				}
			}
		}

		getrusage(RUSAGE_CHILDREN, &old_usage);
		igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));
		igt_fork(child, nchild) {
			const intel_ctx_t *ctx;

			if (flags & F_ISOLATE) {
				int dmabuf = -1;

				if (common)
					dmabuf = prime_handle_to_fd(i915, common);

				i915 = gem_reopen_driver(i915);

				if (dmabuf != -1)
					common = prime_fd_to_handle(i915, dmabuf);
			}

			ctx = intel_ctx_create(i915, cfg);

			if (flags & F_VIP && child == 0) {
				gem_context_set_priority(i915, ctx->id, 1023);
				flags |= F_FLOW;
			}
			if (flags & F_RRUL && child == 0)
				flags |= F_SOLO | F_FLOW | F_SYNC;

			fair_child(i915, ctx, e, child_ns,
				   timeline, common, flags,
				   &result[nchild],
				   &result[child], &iqr[child],
				   lnk.child[1], lnk.parent[0]);

			intel_ctx_destroy(i915, ctx);
		}

		{
			int sync;
			for (int child = 0; child < nchild; child++)
				read(lnk.child[0], &sync, sizeof(sync));
			for (int child = 0; child < nchild; child++)
				write(lnk.parent[1], &sync, sizeof(sync));
		}

		while (nfences--)
			timeline_advance(timeline, fence_ns);

		result[nchild] = 1;
		for (int child = 0; child < nchild; child++) {
			while (!READ_ONCE(result[child]))
				timeline_advance(timeline, fence_ns);
		}

		igt_waitchildren();
		close(timeline);

		/*
		 * Are we running out of CPU time, and fail to submit frames?
		 *
		 * We try to rule out any undue impact on the GPU scheduling
		 * from the CPU scheduler by looking for core saturation. If
		 * we may be in a situation where the clients + kernel are
		 * taking a whole core (think lockdep), then it is increasingly
		 * likely that our measurements include delays from the CPU
		 * scheduler. Err on the side of caution.
		 */
		d_time = igt_nsec_elapsed(&tv);
		getrusage(RUSAGE_CHILDREN, &usage);
		cpu_time = d_cpu_time(&usage, &old_usage);
		igt_debug("CPU usage: %.0f%%\n", 100. * cpu_time / d_time);
		if (4 * cpu_time > 3 * d_time) {
			if (nchild > 7) /* good enough to judge pass/fail */
				break;

			igt_skip_on_f(4 * cpu_time > 3 * d_time,
				      "%.0f%% CPU usage, presuming capacity exceeded\n",
				      100. * cpu_time / d_time);
		}

		/* With no contention, we should match our target frametime */
		if (nchild == 1) {
			igt_info("Interval %.2fms, range %.2fms\n",
				 1e-6 * result[0], 1e-6 * iqr[0]);
			igt_assert(4 * result[0] > 3 * fence_ns &&
				   3 * result[0] < 4 * fence_ns);
			continue;
		}

		/*
		 * The VIP should always be able to hit the target frame rate;
		 * regardless of budget contention from lessor clients.
		 */
		if (flags & (F_VIP | F_RRUL)) {
			const char *who = flags & F_VIP ? "VIP" : "RRUL";
			igt_info("%s interval %.2fms, range %.2fms\n",
				 who, 1e-6 * result[0], 1e-6 * iqr[0]);
			if (flags & F_VIP) {
				igt_assert_f(4 * result[0] > 3 * fence_ns &&
					     3 * result[0] < 4 * fence_ns,
					     "%s expects to run exactly when it wants, expects an interval of %.2fms, was %.2fms\n",
					     who,
					     1e-6 * fence_ns,
					     1e-6 * result[0]);
			}
			igt_assert_f(iqr[0] < result[0],
				     "%s frame IQR %.2fms exceeded median threshold %.2fms\n",
				     who, 1e-6 * iqr[0], 1e-6 * result[0] / 2);
			if (!--nchild)
				continue;

			/* Exclude the VIP result from the plebian statistics */
			memmove(result, result + 1, nchild * sizeof(*result));
			memmove(iqr, iqr + 1, nchild * sizeof(*iqr));
		}

		igt_mean_init(&m);
		for (int child = 0; child < nchild; child++)
			igt_mean_add(&m, result[child]);

		qsort(result, nchild, sizeof(*result), cmp_ul);
		qsort(iqr, nchild, sizeof(*iqr), cmp_ul);

		/*
		 * The target interval for median/mean is 16ms (fence_ns).
		 * However, this work is evenly split across the clients so
		 * the range (and median) of client medians may be much less
		 * than 16ms [16/3N]. We present median of medians to try
		 * and avoid any instability while running in CI; at the cost
		 * of insensitivity!
		 */
		igt_info("%3d clients, range: [%.1f, %.1f], iqr: [%.1f, %.1f], median: %.1f [%.1f, %.1f], mean: %.1f ± %.2f ms, cpu: %.0f%%\n",
			 nchild,
			 1e-6 * result[0],  1e-6 * result[nchild - 1],
			 1e-6 * result[lo], 1e-6 * result[hi],
			 1e-6 * result[nchild / 2],
			 1e-6 * iqr[lo], 1e-6 * iqr[hi],
			 1e-6 * igt_mean_get(&m),
			 1e-6 * sqrt(igt_mean_get_variance(&m)),
			 100. * cpu_time / d_time);

		igt_assert_f(iqr[nchild / 2] < result[nchild / 2],
			     "Child frame IQR %.2fms exceeded median threshold %.2fms\n",
			     1e-6 * iqr[nchild / 2],
			     1e-6 * result[nchild / 2]);

		igt_assert_f(4 * igt_mean_get(&m) > 3 * result[nchild / 2] &&
			     3 * igt_mean_get(&m) < 4 * result[nchild / 2],
			     "Mean of client interval %.2fms differs from median %.2fms, distribution is skewed\n",

			     1e-6 * igt_mean_get(&m), 1e-6 * result[nchild / 2]);

		igt_assert_f(result[nchild / 2] > frame_ns / 2,
			     "Median client interval %.2fms did not match target interval %.2fms\n",
			     1e-6 * result[nchild / 2], 1e-6 * frame_ns);


		igt_assert_f(result[hi] - result[lo] < result[nchild / 2],
			     "Interquartile range of client intervals %.2fms is as large as the median threshold %.2fms, clients are not evenly distributed!\n",
			     1e-6 * (result[hi] - result[lo]),
			     1e-6 * result[nchild / 2]);

		/* May be slowed due to sheer volume of context switches */
		if (result[0] > 2 * fence_ns)
			break;
	}

	munmap(iqr, 4096);
	munmap(result, 4096);
	if (common)
		gem_close(i915, common);

	close(lnk.child[0]);
	close(lnk.child[1]);
	close(lnk.parent[0]);
	close(lnk.parent[1]);
}

static void deadline_child(int i915,
			   const intel_ctx_t *ctx,
			   const struct intel_execution_engine2 *e,
			   uint32_t handle,
			   int timeline,
			   int frame_ns,
			   int sv, int rv,
			   int *done,
			   unsigned int flags)
#define DL_PRIO (1 << 0)
{
	struct drm_i915_gem_exec_object2 obj[] = {
		{ handle }, delay_create(i915, ctx, e, frame_ns),
	};
	struct drm_i915_gem_exec_fence fence = {
		.flags = I915_EXEC_FENCE_SIGNAL,
	};
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = to_user_pointer(obj),
		.buffer_count = ARRAY_SIZE(obj),
		.flags = I915_EXEC_FENCE_OUT | e->flags,
		.rsvd1 = ctx->id,
	};
	unsigned int seq = 1;
	int prev = -1, next = -1;

	if (intel_gen(intel_get_drm_devid(i915)) < 8)
		execbuf.flags |= I915_EXEC_SECURE;

	gem_execbuf_wr(i915, &execbuf);
	execbuf.rsvd2 >>= 32;
	gem_execbuf_wr(i915, &execbuf);
	gem_sync(i915, obj[1].handle);

	execbuf.num_cliprects = 1;
	execbuf.cliprects_ptr = to_user_pointer(&fence);
	execbuf.flags |= I915_EXEC_FENCE_ARRAY;
	if (!(flags & DL_PRIO))
		execbuf.flags |= I915_EXEC_FENCE_IN;

	write(sv, &prev, sizeof(int));
	read(rv, &prev, sizeof(int));
	igt_assert(prev == -1);

	prev = execbuf.rsvd2;
	next = execbuf.rsvd2 >> 32;
	while (!READ_ONCE(*done)) {
		sync_fence_wait(prev, -1);
		igt_assert_eq(sync_fence_status(prev), 1);
		close(prev);

		fence.handle = syncobj_create(i915, 0);
		execbuf.rsvd2 = sw_sync_timeline_create_fence(timeline, seq);
		gem_execbuf_wr(i915, &execbuf);
		close(execbuf.rsvd2);

		write(sv, &fence.handle, sizeof(uint32_t));

		prev = next;
		next = execbuf.rsvd2 >> 32;
		seq++;
	}
	close(next);
	close(prev);
}

static struct intel_execution_engine2
pick_default(int i915, const intel_ctx_cfg_t *cfg)
{
	const struct intel_execution_engine2 *e;

	for_each_ctx_cfg_engine(i915, cfg, e) {
		if (!e->flags)
			return *e;
	}

	return (struct intel_execution_engine2){};
}

static struct intel_execution_engine2
pick_engine(int i915, const intel_ctx_cfg_t *cfg, const char *name)
{
	const struct intel_execution_engine2 *e;

	for_each_ctx_cfg_engine(i915, cfg, e) {
		if (!strcmp(e->name, name))
			return *e;
	}

	return (struct intel_execution_engine2){};
}

static bool has_syncobj(int i915)
{
	struct drm_get_cap cap = { .capability = DRM_CAP_SYNCOBJ };
	ioctl(i915, DRM_IOCTL_GET_CAP, &cap);
	return cap.value;
}

static bool has_fence_array(int i915)
{
	int value = 0;
	struct drm_i915_getparam gp = {
		.param = I915_PARAM_HAS_EXEC_FENCE_ARRAY,
		.value = &value,
	};

	ioctl(i915, DRM_IOCTL_I915_GETPARAM, &gp);
	errno = 0;

	return value;
}

static uint64_t time_get_mono_ns(void)
{
	struct timespec tv;

	igt_assert(clock_gettime(CLOCK_MONOTONIC, &tv) == 0);
	return tv.tv_sec * NSEC64 + tv.tv_nsec;
}

static void deadline(int i915, const intel_ctx_cfg_t *cfg,
		     int duration, unsigned int flags)
{
	const int64_t frame_ns = 33670 * 1000; /* 29.7fps */
	const int64_t parent_ns = 400 * 1000;
	const int64_t switch_ns = 50 * 1000;
	const int64_t overhead_ns = /* estimate timeslicing overhead */
		(frame_ns / 1000 / 1000 + 2) * switch_ns + parent_ns;
	struct intel_execution_engine2 pe = pick_default(i915, cfg);
	struct intel_execution_engine2 ve = pick_engine(i915, cfg, "vcs0");
	struct drm_i915_gem_exec_fence *fences = calloc(sizeof(*fences), 32);
	struct drm_i915_gem_exec_object2 *obj = calloc(sizeof(*obj), 32);
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = to_user_pointer(obj),
		.cliprects_ptr = to_user_pointer(fences),
		.flags =
			I915_EXEC_BATCH_FIRST |
			I915_EXEC_FENCE_ARRAY |
			I915_EXEC_FENCE_OUT
	};
	const intel_ctx_t *delay_ctx;
	int *ctl;

	igt_require(has_syncobj(i915));
	igt_require(has_fence_array(i915));
	igt_require(has_mi_math(i915, &pe));
	igt_require(has_ctx_timestamp(i915, cfg, &pe));
	igt_require(has_mi_math(i915, &ve));
	igt_require(has_ctx_timestamp(i915, cfg, &ve));
	igt_assert(obj && fences);
	if (flags & DL_PRIO)
		igt_require(gem_scheduler_has_preemption(i915));

	ctl = mmap(NULL, 4096, PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0);
	igt_assert(ctl != MAP_FAILED);

	delay_ctx = intel_ctx_create(i915, cfg);
	obj[0] = delay_create(i915, delay_ctx, &pe, parent_ns);
	if (flags & DL_PRIO)
		gem_context_set_priority(i915, delay_ctx->id, 1023);
	if (intel_gen(intel_get_drm_devid(i915)) < 8)
		execbuf.flags |= I915_EXEC_SECURE;
	for (int n = 1; n <= 5; n++) {
		int timeline = sw_sync_timeline_create();
		int nframes = duration * NSEC64 / frame_ns + 1;
		int num_children = (1 << n) - 1;
		int child_ns = (frame_ns - overhead_ns) / num_children - switch_ns;
		struct { int child[2], parent[2]; } *link;
		uint64_t start, over;
		int missed;

		if (child_ns < 0)
			break;

		execbuf.buffer_count = num_children + 1;
		execbuf.num_cliprects = num_children;

		link = malloc(sizeof(*link) * num_children);
		for (int i = 0; i < num_children; i++) {
			obj[i + 1].handle = gem_create(i915, 4096);
			pipe(link[i].child);
			pipe(link[i].parent);
		}

		*ctl = 0;
		igt_fork(child, num_children) {
			const intel_ctx_t *ctx = intel_ctx_create(i915, cfg);

			deadline_child(i915, ctx, &ve, obj[child + 1].handle,
				       timeline, child_ns,
				       link[child].child[1],
				       link[child].parent[0],
				       ctl, flags);

			intel_ctx_destroy(i915, ctx);
		}

		for (int i = 0; i < num_children; i++)
			read(link[i].child[0], &over, sizeof(int));
		igt_info("Testing %d children, with %'dns\n", num_children, child_ns);
		for (int i = 0; i < num_children; i++)
			write(link[i].parent[1], &over, sizeof(int));

		over = 0;
		missed = 0;
		start = time_get_mono_ns();
		for (int frame = 1; frame <= nframes; frame++) {
			struct rusage old_usage, usage;
			uint64_t cpu_time, d_time;
			struct timespec tv;
			uint64_t time;
			int fence;

			getrusage(RUSAGE_CHILDREN, &old_usage);
			igt_nsec_elapsed(memset(&tv, 0, sizeof(tv)));

			sw_sync_timeline_inc(timeline, 1);
			for (int i = 0; i < num_children; i++) {
				read(link[i].child[0], &fences[i].handle, sizeof(uint32_t));
				fences[i].flags = I915_EXEC_FENCE_WAIT;
			}

			gem_execbuf_wr(i915, &execbuf);
			for (int i = 0; i < num_children; i++)
				syncobj_destroy(i915, fences[i].handle);

			fence = execbuf.rsvd2 >> 32;
			sync_fence_wait(fence, -1);
			igt_assert_eq(sync_fence_status(fence), 1);
			time = sync_fence_timestamp(fence) - start;
			close(fence);

			d_time = igt_nsec_elapsed(&tv);
			getrusage(RUSAGE_CHILDREN, &usage);
			cpu_time = d_cpu_time(&usage, &old_usage);
			igt_debug("CPU usage: %.0f%%\n",
				  100. * cpu_time / d_time);
			if (4 * cpu_time > 3 * d_time)
				break;

			if (time > frame * frame_ns) {
				igt_warn("Frame %d: over by %'"PRIu64"ns\n",
					 frame, time - frame * frame_ns);
				over += time - frame * frame_ns;
				missed++;
			}
		}
		*ctl = 1;
		sw_sync_timeline_inc(timeline, 3);
		igt_waitchildren();
		close(timeline);

		igt_assert_f(missed == 0,
			     "%d child, missed %d frames, overran by %'"PRIu64"us\n",
			     num_children, missed, over / 1000);

		for (int i = 0; i < num_children; i++) {
			gem_close(i915, obj[i + 1].handle);
			close(link[i].child[0]);
			close(link[i].child[1]);
			close(link[i].parent[0]);
			close(link[i].parent[1]);
		}
		free(link);

		gem_quiescent_gpu(i915);
	}

	intel_ctx_destroy(i915, delay_ctx);
	gem_close(i915, obj[0].handle);
	free(obj);
	free(fences);
}

static bool set_heartbeat(int i915, const char *name, unsigned int value)
{
	unsigned int x;

	if (gem_engine_property_printf(i915, name,
				       "heartbeat_interval_ms",
				       "%d", value) < 0)
		return false;

	x = ~value;
	gem_engine_property_scanf(i915, name,
				  "heartbeat_interval_ms",
				  "%d", &x);
	igt_assert_eq(x, value);

	return true;
}

igt_main
{
	static const struct {
		const char *name;
		unsigned int flags;
		unsigned int basic;
#define BASIC		(1 << 0)
#define BASIC_ALL	(1 << 1)
	} fair[] = {
		/*
		 * none - maximal greed in each client
		 *
		 * Push as many frames from each client as fast as possible
		 */
		{ "none",       0, BASIC_ALL },
		{ "none-vip",   F_VIP, BASIC }, /* one vip client must meet deadlines */
		{ "none-solo",  F_SOLO, BASIC }, /* 1 batch per frame per client */
		{ "none-share", F_SHARE, BASIC }, /* read from a common buffer */
		{ "none-rrul",  F_RRUL, BASIC }, /* "realtime-response under load" */
		{ "none-ping",  F_PING }, /* measure inter-engine fairness */

		/*
		 * throttle - original per client throttling
		 *
		 * Used for front buffering rendering where there is no
		 * extenal frame marker. Each client tries to only keep
		 * 20ms of work submitted, though that measurement is
		 * flawed...
		 *
		 * This is used by Xorg to try and maintain some resembalance
		 * of input/output consistency when being feed a continuous
		 * stream of X11 draw requests straight into scanout, where
		 * the clients may submit the work faster than can be drawn.
		 *
		 * Throttling tracks requests per-file (and assumes that
		 * all requests are in submission order across the whole file),
		 * so we split each child to its own fd.
		 */
		{ "throttle",       F_THROTTLE | F_ISOLATE, BASIC },
		{ "throttle-vip",   F_THROTTLE | F_ISOLATE | F_VIP },
		{ "throttle-solo",  F_THROTTLE | F_ISOLATE | F_SOLO },
		{ "throttle-share", F_THROTTLE | F_ISOLATE | F_SHARE },
		{ "throttle-rrul",  F_THROTTLE | F_ISOLATE | F_RRUL },

		/*
		 * pace - mesa "submit double buffering"
		 *
		 * Submit a frame, wait for previous frame to start. This
		 * prevents each client from getting too far ahead of its
		 * rendering, maintaining a consistent input/output latency.
		 */
		{ "pace",       F_PACE, BASIC_ALL },
		{ "pace-solo",  F_PACE | F_SOLO, BASIC },
		{ "pace-share", F_PACE | F_SOLO | F_SHARE, BASIC },
		{ "pace-ping",  F_PACE | F_SOLO | F_SHARE | F_PING},

		/* sync - only submit a frame at a time */
		{ "sync",      F_SYNC, BASIC },
		{ "sync-vip",  F_SYNC | F_VIP },
		{ "sync-solo", F_SYNC | F_SOLO },

		/* flow - synchronise execution against the clock (vblank) */
		{ "flow",       F_PACE | F_FLOW, BASIC },
		{ "flow-solo",  F_PACE | F_FLOW | F_SOLO },
		{ "flow-share", F_PACE | F_FLOW | F_SHARE },
		{ "flow-ping",  F_PACE | F_FLOW | F_SHARE | F_PING },

		/* next - submit ahead of the clock (vblank double buffering) */
		{ "next",       F_PACE | F_FLOW | F_NEXT },
		{ "next-solo",  F_PACE | F_FLOW | F_NEXT | F_SOLO },
		{ "next-share", F_PACE | F_FLOW | F_NEXT | F_SHARE },
		{ "next-ping",  F_PACE | F_FLOW | F_NEXT | F_SHARE | F_PING },

		/* spare - underutilise by a single client timeslice */
		{ "spare",      F_PACE | F_FLOW | F_SPARE },
		{ "spare-solo", F_PACE | F_FLOW | F_SPARE | F_SOLO },

		/* half - run at half pace (submit 16ms of work every 32ms) */
		{ "half",       F_PACE | F_FLOW | F_HALF },
		{ "half-solo",  F_PACE | F_FLOW | F_HALF | F_SOLO },

		{}
	};
	const struct intel_execution_engine2 *e;
	intel_ctx_cfg_t cfg;
	int i915 = -1;

	igt_fixture {
		igt_require_sw_sync();

		i915 = drm_open_driver_master(DRIVER_INTEL);
		gem_submission_print_method(i915);
		gem_scheduler_print_capability(i915);

		igt_require_gem(i915);
		gem_require_mmap_wc(i915);
		gem_require_contexts(i915);
		igt_require(gem_scheduler_enabled(i915));
		igt_require(gem_scheduler_has_ctx_priority(i915));

		/*
		 * These tests are for a specific scheduling model which is
		 * not currently implemented by GuC. So skip on GuC platforms.
		 */
		igt_require(!gem_using_guc_submission(i915));

		cfg = intel_ctx_cfg_all_physical(i915);

		igt_info("CS timestamp frequency: %d\n",
			 read_timestamp_frequency(i915));
		igt_require(has_mi_math(i915, NULL));

		igt_fork_hang_detector(i915);
	}

	/* First we do a trimmed set of basic tests for faster CI */
	for (typeof(*fair) *f = fair; f->name; f++) {
		if (!f->basic)
			continue;

		igt_subtest_with_dynamic_f("basic-%s", f->name)  {
			for_each_ctx_cfg_engine(i915, &cfg, e) {
				if (!has_mi_math(i915, e))
					continue;

				if (!gem_class_can_store_dword(i915, e->class))
					continue;

				if (e->flags && !(f->basic & BASIC_ALL))
					continue;

				igt_dynamic_f("%s", e->name)
					fairness(i915, &cfg, e, 1, f->flags);
			}
		}
	}

	igt_subtest("basic-deadline")
		deadline(i915, &cfg, 2, 0);
	igt_subtest("deadline-prio")
		deadline(i915, &cfg, 2, DL_PRIO);

	for (typeof(*fair) *f = fair; f->name; f++) {
		igt_subtest_with_dynamic_f("fair-%s", f->name)  {
			for_each_ctx_cfg_engine(i915, &cfg, e) {
				if (!has_mi_math(i915, e))
					continue;

				if (!gem_class_can_store_dword(i915, e->class))
					continue;

				if (!set_heartbeat(i915, e->name, 5000))
					continue;

				igt_dynamic_f("%s", e->name)
					fairness(i915, &cfg, e, 5, f->flags);
			}
		}
	}

	igt_fixture {
		igt_stop_hang_detector();
		close(i915);
	}
}