/* basic set of prime tests between intel and nouveau */

/* test list -
   1. share buffer from intel -> nouveau.
   2. share buffer from nouveau -> intel
   3. share intel->nouveau, map on both, write intel, read nouveau
   4. share intel->nouveau, blit intel fill, readback on nouveau
   test 1 + map buffer, read/write, map other size.
   do some hw actions on the buffer
   some illegal operations -
       close prime fd try and map

   TODO add some nouveau rendering tests
*/


#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <errno.h>

#include "i915_drm.h"
#include "intel_bufmgr.h"
#include "nouveau.h"
#include "intel_gpu_tools.h"
#include "intel_batchbuffer.h"
#include "drmtest.h"

static int intel_fd = -1, nouveau_fd = -1;
static drm_intel_bufmgr *bufmgr;
static struct nouveau_device *ndev;
static struct nouveau_client *nclient;
static uint32_t devid;
static struct intel_batchbuffer *batch;
static struct nouveau_object *nchannel, *pcopy;
static struct nouveau_bufctx *nbufctx;
static struct nouveau_pushbuf *npush;

static struct nouveau_bo *query_bo;
static uint32_t query_counter;
static volatile uint32_t *query;
static uint32_t memtype_intel, tile_intel_y, tile_intel_x;

#define SUBC_COPY(x) 6, (x)
#define NV01_SUBCHAN_OBJECT 0

#define NV01_SUBC(subc, mthd) SUBC_##subc((NV01_SUBCHAN_##mthd))

#if 0
#define dbg(fmt...) fprintf(stderr, fmt);
#else
#define dbg(...) do { } while (0)
#endif

/*
 * Dimensions of a 2D buffer as filled in by nv_bo_alloc():
 * the size the caller asked for, and that size rounded up to the tile
 * dimensions of the selected tile mode.
 */
typedef struct {
	uint32_t w, h;		/* requested width and height */
	uint32_t pitch, lines;	/* width and height after tile alignment */
} rect;

/*
 * Allocate a new nouveau buffer object, or import one from a prime fd, and
 * describe its dimensions in @r.
 *
 * @bo:        receives the buffer object; set to NULL on entry.
 * @r:         receives the requested size (w, h) and the size rounded up to
 *             the tile dimensions (pitch, lines).
 * @w, @h:     requested width and height; the allocation size is computed
 *             as pitch * lines.
 * @tile_mode: 0 for linear, otherwise the tile mode stored in the bo's nv50
 *             config. A low nibble of 0xe gets the special handling below.
 * @handle:    negative to allocate (and map) a new bo in @dom, otherwise a
 *             prime fd to import. The fd is closed here whether or not the
 *             import succeeds.
 * @dom:       NOUVEAU_BO_* placement flags, only used for new allocations.
 *
 * Returns 0 for a successfully created and mapped bo; for imports, the
 * non-negative result of nouveau_bo_prime_handle_ref(). Failures return a
 * non-zero error value after printing a message to stderr.
 */
static int nv_bo_alloc(struct nouveau_bo **bo, rect *r,
		       uint32_t w, uint32_t h, uint32_t tile_mode,
		       int handle, uint32_t dom)
{
	uint32_t size;
	/* Alignment in x and y; 1 means "no alignment" for linear buffers. */
	uint32_t dx = 1, dy = 1, memtype = 0;
	int ret;

	*bo = NULL;
	if (tile_mode) {
		/* log2 of the tile height and width used for alignment */
		uint32_t tile_y;
		uint32_t tile_x;

		/* Y major tiling */
		if ((tile_mode & 0xf) == 0xe)
			/* but the internal layout is different */
			tile_x = 7;
		else
			tile_x = 6 + (tile_mode & 0xf);
		/* memtype and base tile height differ before/after chipset 0xc0 */
		if (ndev->chipset < 0xc0) {
			memtype = 0x70;
			tile_y = 2;
		} else {
			memtype = 0xfe;
			tile_y = 3;
		}
		/* The 0xe layout uses the file-level memtype_intel instead. */
		if ((tile_mode & 0xf) == 0xe)
			memtype = memtype_intel;
		/* The high nibble of tile_mode adds to the tile height exponent. */
		tile_y += ((tile_mode & 0xf0)>>4);

		dx = 1 << tile_x;
		dy = 1 << tile_y;
		dbg("Tiling requirements: x y %u %u\n", dx, dy);
	}

	r->w = w;
	r->h = h;

	/* Round both dimensions up to a multiple of the (power of two) tile size. */
	r->pitch = w = (w + dx-1) & ~(dx-1);
	r->lines = h = (h + dy-1) & ~(dy-1);
	size = w*h;

	if (handle < 0) {
		/* No prime fd given: create a fresh bo and map it for CPU access. */
		union nouveau_bo_config cfg;
		cfg.nv50.memtype = memtype;
		cfg.nv50.tile_mode = tile_mode;
		if (dom == NOUVEAU_BO_GART)
			dom |= NOUVEAU_BO_MAP;
		ret = nouveau_bo_new(ndev, dom, 4096, size, &cfg, bo);
		if (!ret)
			ret = nouveau_bo_map(*bo, NOUVEAU_BO_RDWR, nclient);
		if (ret) {
			fprintf(stderr, "creating bo failed with %i %s\n",
				ret, strerror(-ret));
			nouveau_bo_ref(NULL, bo);
			return ret;
		}

		dbg("new flags %08x memtype %08x tile %08x\n", (*bo)->flags, (*bo)->config.nv50.memtype, (*bo)->config.nv50.tile_mode);
		if (tile_mode == tile_intel_y || tile_mode == tile_intel_x) {
			dbg("tile mode was: %02x, now: %02x\n", (*bo)->config.nv50.tile_mode, tile_mode);
			/* Doesn't like intel tiling much.. */
			/* Force the requested mode back into the userspace config. */
			(*bo)->config.nv50.tile_mode = tile_mode;
		}
	} else {
		/* Import path: the fd is consumed (closed) regardless of the result. */
		ret = nouveau_bo_prime_handle_ref(ndev, handle, bo);
		close(handle);
		if (ret < 0) {
			fprintf(stderr, "receiving bo failed with %i %s\n",
				ret, strerror(-ret));
			return ret;
		}
		/* The imported bo must be able to hold the aligned rectangle. */
		if ((*bo)->size < size) {
			fprintf(stderr, "expected bo size to be at least %u,"
				"but received %"PRIu64"\n", size, (*bo)->size);
			nouveau_bo_ref(NULL, bo);
			return -1;
		}
		dbg("prime flags %08x memtype %08x tile %08x\n", (*bo)->flags, (*bo)->config.nv50.memtype, (*bo)->config.nv50.tile_mode);
		/* Overwrite the imported bo's config with what this test expects. */
		(*bo)->config.nv50.memtype = memtype;
		(*bo)->config.nv50.tile_mode = tile_mode;
	}
	dbg("size: %"PRIu64"\n", (*bo)->size);

	return ret;
}

/* Store one 32-bit word at the push buffer's cursor and advance the cursor. */
static inline void
PUSH_DATA(struct nouveau_pushbuf *push, uint32_t data)
{
	push->cur[0] = data;
	push->cur += 1;
}

/*
 * Emit an NV04-style method header: word count at bit 18, subchannel at
 * bit 13, method in the low bits.
 */
static inline void
BEGIN_NV04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
{
	const uint32_t header = 0x00000000 | (size << 18) | (subc << 13) | mthd;

	PUSH_DATA(push, header);
}

/*
 * Same header layout as BEGIN_NV04() but with bit 30 (0x40000000) set in
 * the header word.
 */
static inline void
BEGIN_NI04(struct nouveau_pushbuf *push, int subc, int mthd, int size)
{
	const uint32_t header = 0x40000000 | (size << 18) | (subc << 13) | mthd;

	PUSH_DATA(push, header);
}

/*
 * Emit an NVC0-style method header: 0x20000000 marker, word count at
 * bit 16, subchannel at bit 13, and the method divided by four.
 */
static inline void
BEGIN_NVC0(struct nouveau_pushbuf *push, int subc, int mthd, int size)
{
	const uint32_t header = 0x20000000 | (size << 16) | (subc << 13) | (mthd / 4);

	PUSH_DATA(push, header);
}

/*
 * Emit a method header in whichever encoding matches the opened device:
 * NVC0 style for chipset 0xc0 and later, NV04 style otherwise.
 */
static inline void
BEGIN_NVXX(struct nouveau_pushbuf *push, int subc, int mthd, int size)
{
	if (ndev->chipset >= 0xc0) {
		BEGIN_NVC0(push, subc, mthd, size);
		return;
	}

	BEGIN_NV04(push, subc, mthd, size);
}

/*
 * Submit a minimal intel batch that references @bo.
 *
 * The batch holds MI_NOOP and MI_BATCH_BUFFER_END followed by a relocation
 * entry for @bo in the render domain, and is flushed immediately.
 * NOTE(review): the relocation sits after the batch end, presumably so the
 * kernel tracks @bo as used by this batch without the GPU ever reading the
 * relocated dword - confirm against the i915 execbuffer documentation.
 */
static void
noop_intel(drm_intel_bo *bo)
{
	BEGIN_BATCH(3);
	OUT_BATCH(MI_NOOP);
	OUT_BATCH(MI_BATCH_BUFFER_END);
	OUT_RELOC(bo, I915_GEM_DOMAIN_RENDER,
			I915_GEM_DOMAIN_RENDER, 0);
	ADVANCE_BATCH();

	intel_batchbuffer_flush(batch);
}

/*
 * Scan /sys/class/drm/card0 .. card8 and open the /dev/dri node of every
 * card whose PCI vendor id is intel (0x8086) or nvidia (0x10de), storing the
 * descriptors in intel_fd and nouveau_fd.
 *
 * Scanning stops at the first card whose vendor file is missing or cannot
 * be opened. Cards of other vendors are skipped.
 *
 * Returns 0 when the scan completed (the globals stay -1 for a vendor that
 * was not found) and -1 when a matching device node could not be opened.
 */
static int find_and_open_devices(void)
{
	int i;
	char path[80];
	struct stat buf;
	FILE *fl;
	char vendor_id[8] = {0};
	int venid;

	for (i = 0; i < 9; i++) {
		snprintf(path, sizeof(path),
			 "/sys/class/drm/card%d/device/vendor", i);
		if (stat(path, &buf))
			break;

		fl = fopen(path, "r");
		if (!fl)
			break;

		/*
		 * On a failed read, clear the buffer so that the vendor id of
		 * the previously scanned card is not parsed a second time.
		 */
		if (!fgets(vendor_id, sizeof(vendor_id), fl))
			vendor_id[0] = '\0';
		fclose(fl);

		venid = strtoul(vendor_id, NULL, 16);
		snprintf(path, sizeof(path), "/dev/dri/card%d", i);
		if (venid == 0x8086) {
			intel_fd = open(path, O_RDWR);
			/* open() reports failure with -1; 0 is a valid fd. */
			if (intel_fd < 0)
				return -1;
		} else if (venid == 0x10de) {
			nouveau_fd = open(path, O_RDWR);
			if (nouveau_fd < 0)
				return -1;
		}
	}
	return 0;
}

/*
 * Set up all nouveau state shared by the tests: device and client wrappers
 * around nouveau_fd, the query buffer used to wait for copies, a FIFO
 * channel with push buffer and buffer context, and the PCOPY engine object
 * bound to the COPY subchannel. Also selects the tile_intel_x/y tile mode
 * values for the detected chipset generation.
 *
 * Returns 0 on success. Returns a non-zero value on failure, including -1
 * for chipsets this test does not support. Objects created before a
 * failure are not released here.
 */
static int init_nouveau(void)
{
	struct nv04_fifo nv04_data = { .vram = 0xbeef0201,
				       .gart = 0xbeef0202 };
	struct nvc0_fifo nvc0_data = { };
	struct nouveau_fifo *fifo;
	int size, ret;
	uint32_t class;
	void *data;

	ret = nouveau_device_wrap(nouveau_fd, 0, &ndev);
	if (ret < 0) {
		fprintf(stderr,"failed to wrap nouveau device\n");
		return ret;
	}

	ret = nouveau_client_new(ndev, &nclient);
	if (ret < 0) {
		fprintf(stderr,"failed to setup nouveau client\n");
		return ret;
	}

	/* Chipsets below 0xa3, plus 0xaa and 0xac, are rejected as lacking PCOPY. */
	if (ndev->chipset < 0xa3 || ndev->chipset == 0xaa || ndev->chipset == 0xac) {
		fprintf(stderr, "Your card doesn't support PCOPY\n");
		return -1;
	}

	// TODO: Get a kepler and add support for it
	if (ndev->chipset >= 0xe0) {
		fprintf(stderr, "Unsure how kepler works!\n");
		return -1;
	}
	/* One mapped GART page that the copy engine writes query results into. */
	ret = nouveau_bo_new(ndev,  NOUVEAU_BO_GART | NOUVEAU_BO_MAP,
			     4096, 4096, NULL, &query_bo);
	if (!ret)
		ret = nouveau_bo_map(query_bo, NOUVEAU_BO_RDWR, nclient);
	if (ret < 0) {
		fprintf(stderr,"failed to setup query counter\n");
		return ret;
	}
	query = query_bo->map;
	/* Start the CPU-visible value in sync with the software counter. */
	*query = query_counter;

	/* Pick the copy class and the channel creation data per generation. */
	if (ndev->chipset < 0xc0) {
		class = 0x85b5;
		data = &nv04_data;
		size = sizeof(nv04_data);
	} else {
		/* chipset >= 0xe0 already returned above, so 0xa0b5 is not reached here */
		class = ndev->chipset < 0xe0 ? 0x490b5 : 0xa0b5;
		data = &nvc0_data;
		size = sizeof(nvc0_data);
	}

	ret = nouveau_object_new(&ndev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
				 data, size, &nchannel);
	if (ret) {
		fprintf(stderr, "Error creating GPU channel: %d\n", ret);
		if (ret == -ENODEV) {
			fprintf(stderr, "Make sure nouveau_accel is active\n");
			fprintf(stderr, "nvd9 is likely broken regardless\n");
		}
		return ret;
	}

	fifo = nchannel->data;

	ret = nouveau_pushbuf_new(nclient, nchannel, 4, 32 * 1024,
				  true, &npush);
	if (ret) {
		fprintf(stderr, "Error allocating DMA push buffer: %d\n", ret);
		return ret;
	}

	ret = nouveau_bufctx_new(nclient, 1, &nbufctx);
	if (ret) {
		fprintf(stderr, "Error allocating buffer context: %d\n", ret);
		return ret;
	}

	npush->user_priv = nbufctx;

	/* Hope this is enough init for PCOPY */
	/* The full class value is passed as well as its low 16 bits. */
	ret = nouveau_object_new(nchannel, class, class & 0xffff, NULL, 0, &pcopy);
	if (ret) {
		fprintf(stderr, "Failed to allocate pcopy: %d\n", ret);
		return ret;
	}
	ret = nouveau_pushbuf_space(npush, 512, 0, 0);
	if (ret) {
		fprintf(stderr, "No space in pushbuf: %d\n", ret);
		return ret;
	}
	if (ndev->chipset < 0xc0) {
		struct nv04_fifo *nv04_fifo = (struct nv04_fifo*)fifo;
		tile_intel_y = 0x3e;
		tile_intel_x = 0x13;

		/* Bind pcopy to the COPY subchannel, then emit the vram handle three times at method 0x0180. */
		BEGIN_NV04(npush, NV01_SUBC(COPY, OBJECT), 1);
		PUSH_DATA(npush, pcopy->handle);
		BEGIN_NV04(npush, SUBC_COPY(0x0180), 3);
		PUSH_DATA(npush, nv04_fifo->vram);
		PUSH_DATA(npush, nv04_fifo->vram);
		PUSH_DATA(npush, nv04_fifo->vram);
	} else {
		tile_intel_y = 0x2e;
		tile_intel_x = 0x03;
		/* Only the subchannel binding is emitted on this path. */
		BEGIN_NVC0(npush, NV01_SUBC(COPY, OBJECT), 1);
		PUSH_DATA(npush, pcopy->handle);
	}
	/* Submit the setup commands; the kick's result is not checked. */
	nouveau_pushbuf_kick(npush, npush->channel);
	return ret;
}

/*
 * Write 16 bytes at @ptr as four identical 32-bit words. The word is @val
 * OR-ed with itself shifted left by 8, 16 and 24 bits, so a value below
 * 0x100 ends up replicated into every byte.
 */
static void fill16(void *ptr, uint32_t val)
{
	const uint32_t pattern = val | (val << 8) | (val << 16) | (val << 24);
	uint32_t *words = ptr;
	unsigned int i;

	for (i = 0; i < 4; ++i)
		words[i] = pattern;
}

#define TILE_SIZE 4096

/*
 * Software re-tiling of a linear @w x @h byte image into consecutive
 * TILE_SIZE tiles of 128x32 bytes each.
 *
 * Inside a tile the data is stored as 16-byte groups: the eight 16-byte
 * columns follow one another, and each column holds its 32 rows in order.
 * Tiles are written left to right, then top to bottom.
 *
 * @w must be a multiple of 128 and @h a multiple of 32 (asserted).
 * Always returns 0.
 */
static int swtile_y(uint8_t *out, const uint8_t *in, int w, int h)
{
	uint8_t *const expected_end = out + w * h;
	uint32_t tile_x, tile_y, group;

	assert(!(w % 128));
	assert(!(h % 32));

	for (tile_y = 0; tile_y < h; tile_y += 32) {
		for (tile_x = 0; tile_x < w; tile_x += 128) {
			/* 8 columns x 32 rows of 16-byte groups per tile */
			for (group = 0; group < 8 * 32; ++group) {
				const uint32_t col = group / 32;
				const uint32_t row = group % 32;
				const uint32_t out_ofs = group * 16;
				const uint32_t in_ofs = (tile_y + row) * w + (tile_x + 16 * col);

				assert(out_ofs < TILE_SIZE);
				assert(in_ofs < w*h);

				// To do the Y tiling quirk:
				// out_ofs = out_ofs ^ (((out_ofs >> 9) & 1) << 6);
				memcpy(out + out_ofs, in + in_ofs, 16);
			}
			out += TILE_SIZE;
		}
	}
	assert(out == expected_end);
	return 0;
}

/*
 * Software re-tiling of a linear @w x @h byte image into consecutive
 * TILE_SIZE tiles of 512x8 bytes each: every tile stores its eight
 * 512-byte rows back to back. Tiles are written left to right, then top
 * to bottom.
 *
 * @w must be a multiple of 512 and @h a multiple of 8 (asserted).
 * Always returns 0.
 */
static int swtile_x(uint8_t *out, const uint8_t *in, int w, int h)
{
	uint8_t *const expected_end = out + w * h;
	uint32_t tile_x, tile_y, row;

	assert(!(w % 512));
	assert(!(h % 8));

	tile_y = 0;
	while (tile_y < h) {
		for (tile_x = 0; tile_x < w; tile_x += 512) {
			for (row = 0; row < 8; ++row) {
				const uint32_t out_ofs = 512 * row;
				const uint32_t in_ofs = (tile_y + row) * w + tile_x;

				assert(out_ofs < TILE_SIZE);
				assert(in_ofs < w*h);
				memcpy(out + out_ofs, in + in_ofs, 512);
			}
			out += TILE_SIZE;
		}
		tile_y += 8;
	}
	assert(out == expected_end);
	return 0;
}

#if 0
/* X tiling is approximately linear, except tiled in 512x8 blocks, so lets abuse that
 *
 * How? Whole contiguous tiles can be copied safely as if linear
 */

/*
 * Disabled (#if 0) experiment: copy a rectangle where one side is intel
 * X-tiled by splitting it into column strips of at most 512 bytes and
 * submitting one copy per strip, with the X-tiled side addressed through a
 * manually computed byte offset.
 *
 * @nvbo/@dst describe the destination, @nvbi/@src the source; the x/y
 * arguments are the start coordinates and @w/@h the size of the rectangle.
 * Returns 0 after the final strip's query write has been observed, or -1
 * when push buffer space or buffer references cannot be obtained.
 *
 * NOTE(review): this code is not compiled; the remarks below are
 * observations for whoever revives it.
 */
static int perform_copy_hack(struct nouveau_bo *nvbo, const rect *dst,
			     uint32_t dst_x, uint32_t dst_y,
			     struct nouveau_bo *nvbi, const rect *src,
			     uint32_t src_x, uint32_t src_y,
			     uint32_t w, uint32_t h)
{
	struct nouveau_pushbuf_refn refs[] = {
		{ nvbi, (nvbi->flags & NOUVEAU_BO_APER) | NOUVEAU_BO_RD },
		{ nvbo, (nvbo->flags & NOUVEAU_BO_APER) | NOUVEAU_BO_WR },
		{ query_bo, NOUVEAU_BO_GART | NOUVEAU_BO_RDWR }
	};
	uint32_t exec = 0x00000000;
	uint32_t src_off = 0, dst_off = 0;
	struct nouveau_pushbuf *push = npush;
	uint32_t dw, tiles, tile_src = nvbi->config.nv50.tile_mode, tile_dst = nvbo->config.nv50.tile_mode;

	/* Width of the first strip: up to the next 512-byte tile boundary. */
	/* NOTE(review): the source branch uses "& 512" where the destination branch uses "% 512" - looks inconsistent, confirm intent. */
	if (tile_src == tile_intel_x)
		dw = 512 - (src_x & 512);
	else
		dw = 512 - (dst_x % 512);

	/* exec bits 0x10 / 0x100 are set when source / destination memtype is 0. */
	if (!nvbi->config.nv50.memtype)
		exec |= 0x00000010;
	if (!tile_src)
		src_off = src_y * src->pitch + src_x;

	if (!nvbo->config.nv50.memtype)
		exec |= 0x00000100;
	if (!tile_dst)
		dst_off = dst_y * dst->pitch + dst_x;

	if (dw > w)
		dw = w;
	/* First (possibly partial) strip plus the remaining 512-byte strips. */
	tiles = 1 + ((w - dw + 511)/512);

	if (nouveau_pushbuf_space(push, 8 + tiles * 32, 0, 0) ||
	    nouveau_pushbuf_refn(push, refs, 3))
		return -1;

	/* One iteration per strip; later strips are 512 bytes or the remainder. */
	for (; w; w -= dw, src_x += dw, dst_x += dw, dw = w > 512 ? 512 : w) {
		if (tile_src == tile_intel_x) {
			/* Find the correct tiled offset */
			/* NOTE(review): uses dst->pitch for a source offset - confirm whether src->pitch was meant. */
			src_off = 8 * dst->pitch * (src_y / 8);
			src_off += src_x / 512 * 4096;
			src_off += (src_x % 512) + 512 * (src_y % 8);

			if (!tile_dst)
				dst_off = dst_y * dst->pitch + dst_x;
		} else {
			if (!tile_src)
				src_off = src_y * src->pitch + src_x;

			dst_off = 8 * dst->pitch * (dst_y / 8);
			dst_off += dst_x / 512 * 4096;
			dst_off += (dst_x % 512) + 512 * (dst_y % 8);
		}

		fprintf(stderr, "Copying from %u to %u for %u bytes\n", src_x, dst_x, dw);
		fprintf(stderr, "src ofs: %u, dst ofs: %u\n", src_off, dst_off);
		/* Source surface description (7 words at method 0x0200). */
		BEGIN_NVXX(push, SUBC_COPY(0x0200), 7);
		PUSH_DATA (push, tile_src == tile_intel_x ? 0 : nvbi->config.nv50.tile_mode);
		PUSH_DATA (push, src->pitch);
		PUSH_DATA (push, src->h);
		PUSH_DATA (push, 1);
		PUSH_DATA (push, 0);
		PUSH_DATA (push, src_x);
		PUSH_DATA (push, src_y);

		/* Destination surface description (7 words at method 0x0220). */
		BEGIN_NVXX(push, SUBC_COPY(0x0220), 7);
		PUSH_DATA (push, tile_dst == tile_intel_x ? 0 : nvbo->config.nv50.tile_mode);
		PUSH_DATA (push, dst->pitch);
		PUSH_DATA (push, dst->h);
		PUSH_DATA (push, 1);
		PUSH_DATA (push, 0);
		PUSH_DATA (push, dst_x);
		PUSH_DATA (push, dst_y);

		/* Addresses (high/low), pitches and the strip size. */
		BEGIN_NVXX(push, SUBC_COPY(0x030c), 8);
		PUSH_DATA (push, (nvbi->offset + src_off) >> 32);
		PUSH_DATA (push, (nvbi->offset + src_off));
		PUSH_DATA (push, (nvbo->offset + dst_off) >> 32);
		PUSH_DATA (push, (nvbo->offset + dst_off));
		PUSH_DATA (push, src->pitch);
		PUSH_DATA (push, dst->pitch);
		PUSH_DATA (push, dw);
		PUSH_DATA (push, h);

		/* Only the final strip requests a query write. */
		if (w == dw) {
			exec |= 0x3000; /* QUERY|QUERY_SHORT */
			BEGIN_NVXX(push, SUBC_COPY(0x0338), 3);
			PUSH_DATA (push, (query_bo->offset) >> 32);
			PUSH_DATA (push, (query_bo->offset));
			PUSH_DATA (push, ++query_counter);
		}

		BEGIN_NVXX(push, SUBC_COPY(0x0300), 1);
		PUSH_DATA (push, exec);
	}
	nouveau_pushbuf_kick(push, push->channel);
	/* Busy-wait until the query value reaches the counter. */
	while (*query < query_counter) { }
	return 0;
}
#endif

/*
 * Copy a @w x @h rectangle from @nvbi to @nvbo with the PCOPY engine and
 * wait for it to complete.
 *
 * @nvbo, @dst:     destination buffer and its dimensions.
 * @dst_x, @dst_y:  top-left corner inside the destination.
 * @nvbi, @src:     source buffer and its dimensions.
 * @src_x, @src_y:  top-left corner inside the source.
 * @w, @h:          size of the rectangle (cpp is fixed to 1 here, so x and
 *                  w values are passed through unscaled).
 *
 * Completion is detected by polling the mapped query buffer until it
 * reaches the incremented query_counter.
 *
 * Returns 0 on success, -1 if push buffer space or buffer references cannot
 * be obtained, or the error returned by nouveau_pushbuf_kick().
 */
static int perform_copy(struct nouveau_bo *nvbo, const rect *dst,
			uint32_t dst_x, uint32_t dst_y,
			struct nouveau_bo *nvbi, const rect *src,
			uint32_t src_x, uint32_t src_y,
			uint32_t w, uint32_t h)
{
#if 0
	/* Too much effort */
	if (nvbi->config.nv50.tile_mode == tile_intel_x &&
	    nvbo->config.nv50.tile_mode == tile_intel_x)
		return -1;
	else if (nvbi->config.nv50.tile_mode == tile_intel_x ||
		 nvbo->config.nv50.tile_mode == tile_intel_x)
		return perform_copy_hack(nvbo, dst, dst_x, dst_y,
					 nvbi, src, src_x, src_y, w, h);
#endif
	/* Source is read, destination written, query buffer read and written. */
	struct nouveau_pushbuf_refn refs[] = {
		{ nvbi, (nvbi->flags & NOUVEAU_BO_APER) | NOUVEAU_BO_RD },
		{ nvbo, (nvbo->flags & NOUVEAU_BO_APER) | NOUVEAU_BO_WR },
		{ query_bo, NOUVEAU_BO_GART | NOUVEAU_BO_RDWR }
	};
	uint32_t cpp = 1, exec = 0x00003000; /* QUERY|QUERY_SHORT|FORMAT */
	uint32_t src_off = 0, dst_off = 0;
	struct nouveau_pushbuf *push = npush;
	int ret;

	if (nvbi->config.nv50.tile_mode == tile_intel_y)
		dbg("src is y-tiled\n");
	if (nvbo->config.nv50.tile_mode == tile_intel_y)
		dbg("dst is y-tiled\n");

	if (nouveau_pushbuf_space(push, 64, 0, 0) ||
	    nouveau_pushbuf_refn(push, refs, 3))
		return -1;

	/*
	 * Linear surfaces (tile_mode 0) are addressed by a byte offset and
	 * flagged in exec: 0x10 for the source, 0x100 for the destination.
	 * Tiled surfaces keep offset 0 and rely on the x/y words below.
	 */
	if (!nvbi->config.nv50.tile_mode) {
		src_off = src_y * src->pitch + src_x;
		exec |= 0x00000010;
	}

	if (!nvbo->config.nv50.tile_mode) {
		dst_off = dst_y * dst->pitch + dst_x;
		exec |= 0x00000100;
	}

	/* Source surface: tile mode, pitch, height, two constants, x, y. */
	BEGIN_NVXX(push, SUBC_COPY(0x0200), 7);
	PUSH_DATA (push, nvbi->config.nv50.tile_mode);
	PUSH_DATA (push, src->pitch / cpp);
	PUSH_DATA (push, src->h);
	PUSH_DATA (push, 1);
	PUSH_DATA (push, 0);
	PUSH_DATA (push, src_x / cpp);
	PUSH_DATA (push, src_y);

	/* Destination surface, same layout as the source block. */
	BEGIN_NVXX(push, SUBC_COPY(0x0220), 7);
	PUSH_DATA (push, nvbo->config.nv50.tile_mode);
	PUSH_DATA (push, dst->pitch / cpp);
	PUSH_DATA (push, dst->h);
	PUSH_DATA (push, 1);
	PUSH_DATA (push, 0);
	PUSH_DATA (push, dst_x / cpp);
	PUSH_DATA (push, dst_y);

	/* Addresses (high/low words), pitches, copy size and a final constant word. */
	BEGIN_NVXX(push, SUBC_COPY(0x030c), 9);
	PUSH_DATA (push, (nvbi->offset + src_off) >> 32);
	PUSH_DATA (push, (nvbi->offset + src_off));
	PUSH_DATA (push, (nvbo->offset + dst_off) >> 32);
	PUSH_DATA (push, (nvbo->offset + dst_off));
	PUSH_DATA (push, src->pitch);
	PUSH_DATA (push, dst->pitch);
	PUSH_DATA (push, w / cpp);
	PUSH_DATA (push, h);
	PUSH_DATA (push, 0x03333120);

	/* Query address and the new sequence value to be written there. */
	BEGIN_NVXX(push, SUBC_COPY(0x0338), 3);
	PUSH_DATA (push, (query_bo->offset) >> 32);
	PUSH_DATA (push, (query_bo->offset));
	PUSH_DATA (push, ++query_counter);

	/* Writing exec to method 0x0300 is the last command before the kick. */
	BEGIN_NVXX(push, SUBC_COPY(0x0300), 1);
	PUSH_DATA (push, exec);

	ret = nouveau_pushbuf_kick(push, push->channel);
	/* Poll in 1 ms steps until the query value catches up with the counter. */
	while (!ret && *query < query_counter) { usleep(1000); }
	return ret;
}

/*
 * Compare the 16-byte group at @p with four copies of @val. On a mismatch
 * print @msg followed by the four words found, and return -1; otherwise
 * return 0.
 */
static int check1_macro_group(const uint32_t *p, uint32_t val, const char *msg)
{
	if (p[0] == val && p[1] == val && p[2] == val && p[3] == val)
		return 0;

	fputs(msg, stderr);
	fprintf(stderr, "%08x %08x %08x %08x instead of %08x\n",
		p[0], p[1], p[2], p[3], val);
	return -1;
}

/*
 * Verify a re-tiled image made of @w x @h tiles of 256 16-byte groups each,
 * stored tile after tile starting at @p.
 *
 * Expected contents: the first tile counts up (group i holds byte value i),
 * the rest of the first tile row holds 0x3e, and in every later tile row
 * the first tile holds 0x7e and the remaining tiles 0xce.
 *
 * Returns 0 when everything matches and -1 after reporting the first
 * mismatch on stderr.
 */
static int check1_macro(uint32_t *p, uint32_t w, uint32_t h)
{
	const uint32_t rest = 256 * (w-1);
	uint32_t n, tile_row;

	for (n = 0; n < 256; ++n, p += 4) {
		const uint32_t val = (n) | (n << 8) | (n << 16) | (n << 24);

		if (check1_macro_group(p, val, "Retile check failed in first tile!\n"))
			return -1;
	}

	for (n = 0; n < rest; ++n, p += 4) {
		if (check1_macro_group(p, 0x3e3e3e3e, "Retile check failed in second tile!\n"))
			return -1;
	}

	for (tile_row = 1; tile_row < h; ++tile_row) {
		for (n = 0; n < 256; ++n, p += 4) {
			if (check1_macro_group(p, 0x7e7e7e7e, "Retile check failed in third tile!\n"))
				return -1;
		}

		for (n = 0; n < rest; ++n, p += 4) {
			if (check1_macro_group(p, 0xcececece, "Retile check failed in fourth tile!\n"))
				return -1;
		}
	}
	return 0;
}

/*
 * test 1: copy a linear 256x64 image into a buffer that uses the intel
 * Y tile mode and verify the resulting tile contents.
 *
 * The source holds four 128x32 regions: the top-left one contains a
 * different byte value in every 16-byte group (x * 2 + y), the other three
 * are filled with 0x3e, 0x7e and 0xce. The copy runs on PCOPY when it is
 * available and falls back to swtile_y() otherwise; check1_macro()
 * validates the result.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int test1_macro(void)
{
	const uint32_t w = 2 * 128, h = 2 * 32;
	struct nouveau_bo *nvbo = NULL, *nvbi = NULL;
	rect dst, src;
	int prime_fd = -1;
	uint32_t x, y;
	uint8_t *ptr;
	int ret;

	ret = nv_bo_alloc(&nvbi, &src, w, h, 0, -1, NOUVEAU_BO_GART);
	if (ret < 0)
		goto out;
	ret = nv_bo_alloc(&nvbo, &dst, w, h, tile_intel_y, -1, NOUVEAU_BO_GART);
	if (ret < 0)
		goto out;

	nouveau_bo_set_prime(nvbo, &prime_fd);

	/*
	 * Build the source image one 16-byte group at a time. Groups in the
	 * top-left 128x32 region get the value that should come out as a
	 * linearly increasing sequence once the region is stored as the
	 * first y-major tile.
	 */
	ptr = nvbi->map;
	for (y = 0; y < h; ++y) {
		for (x = 0; x < w; x += 16) {
			uint32_t val;

			if (y < 32)
				val = x < 128 ? x * 2 + y : 0x3e;
			else
				val = x < 128 ? 0x7e : 0xce;
			fill16(&ptr[y * w + x], val);
		}
	}
	/* Poison the destination so stale data cannot pass the check. */
	memset(nvbo->map, 0xfc, w * h);

	ret = pcopy ? perform_copy(nvbo, &dst, 0, 0, nvbi, &src, 0, 0, w, h)
		    : swtile_y(nvbo->map, nvbi->map, w, h);
	if (!ret)
		ret = check1_macro(nvbo->map, w/128, h/32);

out:
	nouveau_bo_ref(NULL, &nvbo);
	nouveau_bo_ref(NULL, &nvbi);
	close(prime_fd);
	return ret;
}

/*
 * Print 32 lines of 16 bytes each, starting at @map, to stderr as
 * two-digit hex values. Always returns -1 so that callers can use it
 * directly as their failure return value.
 */
static int dump_line(uint8_t *map)
{
	uint32_t row, col;

	fprintf(stderr, "Dumping sub-tile:\n");
	for (row = 0; row < 32; ++row) {
		const uint8_t *line = map + row * 16;

		for (col = 0; col < 16; ++col) {
			if (col < 15)
				fprintf(stderr, "%02x ", line[col]);
			else
				fprintf(stderr, "%02x\n", line[col]);
		}
	}
	return -1;
}

/*
 * Verify that the @w x @h byte rectangle at the start of @map holds the
 * pattern written by test1_micro(): bits 0-5 of each byte are the low six
 * bits of x and bits 6-7 the low two bits of y. Rows are @pitch bytes
 * apart. @lines, @dst_x and @dst_y are accepted but not used.
 *
 * Returns 0 on a full match. On the first mismatch it reports the position,
 * dumps memory starting at the failing row and returns -1.
 */
static int check1_micro(void *map, uint32_t pitch, uint32_t lines,
			uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
{
	uint8_t *row = map;
	uint32_t x, y;

	/* check only the relevant subrectangle [0..w) [0...h) */
	for (y = 0; y < h; ++y) {
		const uint8_t row_bits = (y & 3) << 6;

		for (x = 0; x < w; ++x) {
			const uint8_t expected = row_bits | (x & 0x3f);
			const uint8_t got = row[x];

			if (got == expected)
				continue;

			fprintf(stderr, "failed check at x=%u y=%u, expected %02x got %02x\n",
				x, y, expected, got);
			return dump_line(row);
		}
		row += pitch;
	}

	return 0;
}

/*
 * test 1, but check micro format, should be unaffected by bit9 swizzling
 *
 * Allocates a Y-tiled intel bo, shares it with nouveau through prime,
 * uploads a position-dependent byte pattern via an nvidia-tiled bo and
 * copies it into the intel bo. The result is then read back through the
 * intel GTT mapping and checked with check1_micro().
 *
 * Returns 0 on success and a non-zero value on any failure.
 */
static int test1_micro(void)
{
	struct nouveau_bo *bo_intel = NULL, *bo_nvidia = NULL, *bo_linear = NULL;
	rect intel, nvidia, linear;
	int ret = -1;
	uint32_t tiling = I915_TILING_Y;

	uint32_t src_x = 0, src_y = 0;
	uint32_t dst_x = 0, dst_y = 0;
	uint32_t x, y, w = 256, h = 64;

	drm_intel_bo *test_intel_bo;
	/* Stays -1 if the export fails without storing a descriptor. */
	int prime_fd = -1;

	test_intel_bo = drm_intel_bo_alloc(bufmgr, "test bo", w * h, 4096);
	if (!test_intel_bo)
		return -1;
	drm_intel_bo_set_tiling(test_intel_bo, &tiling, w);
	if (tiling != I915_TILING_Y) {
		fprintf(stderr, "Couldn't set y tiling\n");
		goto out;
	}
	ret = drm_intel_gem_bo_map_gtt(test_intel_bo);
	if (ret)
		goto out;

	/*
	 * A failed export must fail the test. The bo is released exactly
	 * once, at the out label, so it is not unreferenced here.
	 */
	if (drm_intel_bo_gem_export_to_prime(test_intel_bo, &prime_fd) ||
	    prime_fd < 0) {
		ret = -1;
		goto out;
	}
	noop_intel(test_intel_bo);

	/* nv_bo_alloc() takes ownership of prime_fd and closes it. */
	ret = nv_bo_alloc(&bo_intel, &intel, w, h, tile_intel_y, prime_fd, 0);
	if (!ret)
		ret = nv_bo_alloc(&bo_nvidia, &nvidia, w, h, 0x10, -1, NOUVEAU_BO_VRAM);
	if (!ret)
		ret = nv_bo_alloc(&bo_linear, &linear, w, h, 0, -1, NOUVEAU_BO_GART);
	if (ret)
		goto out;

	for (y = 0; y < linear.h; ++y) {
		uint8_t *map = bo_linear->map;
		map += y * linear.pitch;
		for (x = 0; x < linear.pitch; ++x) {
			uint8_t pos = x & 0x3f;
			/* low 4 bits: micro tile pos */
			/* 2 bits: x pos in tile (wraps) */
			/* 2 bits: y pos in tile (wraps) */
			pos |= (y & 3) << 6;
			map[x] = pos;
		}
	}

	/* Upload the pattern into the nvidia-tiled staging buffer. */
	ret = perform_copy(bo_nvidia, &nvidia, 0, 0, bo_linear, &linear, 0, 0, nvidia.pitch, nvidia.h);
	if (ret)
		goto out;

	/* Perform the actual sub rectangle copy */
	if (pcopy)
		ret = perform_copy(bo_intel, &intel, dst_x, dst_y, bo_nvidia, &nvidia, src_x, src_y, w, h);
	else
		ret = swtile_y(test_intel_bo->virtual, bo_linear->map, w, h);
	if (ret)
		goto out;

	noop_intel(test_intel_bo);
	ret = check1_micro(test_intel_bo->virtual, intel.pitch, intel.h, dst_x, dst_y, w, h);

out:
	nouveau_bo_ref(NULL, &bo_linear);
	nouveau_bo_ref(NULL, &bo_nvidia);
	nouveau_bo_ref(NULL, &bo_intel);
	drm_intel_bo_unreference(test_intel_bo);
	return ret;
}

/*
 * Compare the 16-byte group at @p with four copies of @val. On a mismatch
 * print @msg followed by the four words found, and return -1; otherwise
 * return 0.
 */
static int check1_swizzle_group(const uint32_t *p, uint32_t val, const char *msg)
{
	if (p[0] == val && p[1] == val && p[2] == val && p[3] == val)
		return 0;

	fputs(msg, stderr);
	fprintf(stderr, "%08x %08x %08x %08x instead of %08x\n",
		p[0], p[1], p[2], p[3], val);
	return -1;
}

/*
 * Verify the linear view of the image written by test1_swizzle(), row by
 * row. @pitch is the row stride and @w the row width, both in bytes; @h is
 * the number of rows. @lines, @dst_x and @dst_y are accepted but not used.
 *
 * Rows 0..31: the first eight 16-byte groups hold (group * 32 + row) in
 * every byte, the remaining groups 0x3e. Rows 32..@h-1: the first eight
 * groups hold 0x7e, the remaining groups 0xce.
 *
 * Returns 0 when everything matches and -1 after reporting the first
 * mismatch on stderr.
 */
static int check1_swizzle(uint32_t *p, uint32_t pitch, uint32_t lines,
			  uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
{
	const uint32_t groups = w/16;
	const uint32_t row_skip = (pitch - w)/4;
	uint32_t row, group;

	for (row = 0; row < 32; ++row) {
		for (group = 0; group < 8; ++group, p += 4) {
			uint32_t val = (group * 32) + row;

			val = (val) | (val << 8) | (val << 16) | (val << 24);
			if (check1_swizzle_group(p, val, "Retile check failed in first tile!\n"))
				return -1;
		}

		for (; group < groups; ++group, p += 4) {
			if (check1_swizzle_group(p, 0x3e3e3e3e, "Retile check failed in second tile!\n"))
				return -1;
		}

		/* step over the padding between w and pitch */
		p += row_skip;
	}

	for (row = 32; row < h; ++row) {
		for (group = 0; group < 8; ++group, p += 4) {
			if (check1_swizzle_group(p, 0x7e7e7e7e, "Retile check failed in third tile!\n"))
				return -1;
		}

		for (; group < groups; ++group, p += 4) {
			if (check1_swizzle_group(p, 0xcececece, "Retile check failed in fourth tile!\n"))
				return -1;
		}

		p += row_skip;
	}
	return 0;
}

/*
 * Create a new bo, set tiling to y, and see if macro swizzling is done
 * correctly.
 *
 * Allocates a Y-tiled intel bo, shares it with nouveau through prime,
 * uploads the four-region pattern (counting values, 0x3e, 0x7e, 0xce) via
 * an nvidia-tiled bo and copies it into the intel bo. The result is read
 * back through the intel GTT mapping and checked with check1_swizzle().
 *
 * Returns 0 on success and a non-zero value on any failure.
 */
static int test1_swizzle(void)
{
	struct nouveau_bo *bo_intel = NULL, *bo_nvidia = NULL, *bo_linear = NULL;
	rect intel, nvidia, linear;
	int ret = -1;
	uint32_t tiling = I915_TILING_Y;

	uint32_t src_x = 0, src_y = 0;
	uint32_t dst_x = 0, dst_y = 0;
	uint32_t x, y, w = 256, h = 64;
	uint8_t *ptr;

	drm_intel_bo *test_intel_bo;
	/* Stays -1 if the export fails without storing a descriptor. */
	int prime_fd = -1;

	test_intel_bo = drm_intel_bo_alloc(bufmgr, "test bo", w * h, 4096);
	if (!test_intel_bo)
		return -1;
	drm_intel_bo_set_tiling(test_intel_bo, &tiling, w);
	if (tiling != I915_TILING_Y) {
		fprintf(stderr, "Couldn't set y tiling\n");
		goto out;
	}
	ret = drm_intel_gem_bo_map_gtt(test_intel_bo);
	if (ret)
		goto out;

	/*
	 * A failed export must fail the test. The bo is released exactly
	 * once, at the out label, so it is not unreferenced here.
	 */
	if (drm_intel_bo_gem_export_to_prime(test_intel_bo, &prime_fd) ||
	    prime_fd < 0) {
		ret = -1;
		goto out;
	}

	/* nv_bo_alloc() takes ownership of prime_fd and closes it. */
	ret = nv_bo_alloc(&bo_intel, &intel, w, h, tile_intel_y, prime_fd, 0);
	if (!ret)
		ret = nv_bo_alloc(&bo_nvidia, &nvidia, w, h, 0x10, -1, NOUVEAU_BO_VRAM);
	if (!ret)
		ret = nv_bo_alloc(&bo_linear, &linear, w, h, 0, -1, NOUVEAU_BO_GART);
	if (ret)
		goto out;

	noop_intel(test_intel_bo);
	/* first tile: a different value in every 16-byte group */
	ptr = bo_linear->map;
	for (x = 0; x < 128; x += 16)
		for (y = 0; y < 32; ++y)
			fill16(&ptr[y * w + x], x * 2 + y);

	/* second tile */
	for (x = 128; x < w; x += 16)
		for (y = 0; y < 32; ++y)
			fill16(&ptr[y * w + x], 0x3e);

	/* third tile */
	for (x = 0; x < 128; x += 16)
		for (y = 32; y < h; ++y)
			fill16(&ptr[y * w + x], 0x7e);

	/* last tile */
	for (x = 128; x < w; x += 16)
		for (y = 32; y < h; ++y)
			fill16(&ptr[y * w + x], 0xce);

	/* Upload the pattern into the nvidia-tiled staging buffer. */
	ret = perform_copy(bo_nvidia, &nvidia, 0, 0, bo_linear, &linear, 0, 0, nvidia.pitch, nvidia.h);
	if (ret)
		goto out;

	/* Perform the actual sub rectangle copy */
	ret = perform_copy(bo_intel, &intel, dst_x, dst_y, bo_nvidia, &nvidia, src_x, src_y, w, h);
	if (ret)
		goto out;
	noop_intel(test_intel_bo);

	ret = check1_swizzle(test_intel_bo->virtual, intel.pitch, intel.h, dst_x, dst_y, w, h);

out:
	nouveau_bo_ref(NULL, &bo_linear);
	nouveau_bo_ref(NULL, &bo_nvidia);
	nouveau_bo_ref(NULL, &bo_intel);
	drm_intel_bo_unreference(test_intel_bo);
	return ret;
}

/*
 * test 2: copy a linear 1024x16 image into the intel X tile layout and
 * verify the resulting tile contents.
 *
 * PCOPY seems to lack a way to produce X-major tiling, so the hardware
 * path is compiled in but disabled and swtile_x() does the work. The test
 * is kept as a reference for potential future hardware tests.
 *
 * The source holds four 512x8 regions: the top-left one contains a
 * different byte value in every 16-byte group, the other three are filled
 * with 0x3e, 0x7e and 0xce.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int test2(void)
{
	const uint32_t w = 1024, h = 16;
	struct nouveau_bo *nvbo = NULL, *nvbi = NULL;
	rect dst, src;
	uint32_t x, y;
	uint8_t *ptr;
	int ret;

	ret = nv_bo_alloc(&nvbi, &src, w, h, 0, -1, NOUVEAU_BO_GART);
	if (ret < 0)
		goto out;
	ret = nv_bo_alloc(&nvbo, &dst, w, h, tile_intel_x, -1, NOUVEAU_BO_GART);
	if (ret < 0)
		goto out;

	/*
	 * Build the source image one 16-byte group at a time. Groups in the
	 * top-left 512x8 region are numbered in row-major order, which is
	 * also the order they are stored in within the first tile.
	 */
	ptr = nvbi->map;
	for (y = 0; y < h; ++y) {
		for (x = 0; x < w; x += 16) {
			uint32_t val;

			if (y < 8)
				val = x < 512 ? (y * 512 + x)/16 : 0x3e;
			else
				val = x < 512 ? 0x7e : 0xce;
			fill16(&ptr[y * w + x], val);
		}
	}
	/* Poison the destination so stale data cannot pass the check. */
	memset(nvbo->map, 0xfc, w * h);

	/* do this in software, there is no X major tiling in PCOPY (yet?) */
	if (0 && pcopy)
		ret = perform_copy(nvbo, &dst, 0, 0, nvbi, &src, 0, 0, w, h);
	else
		ret = swtile_x(nvbo->map, nvbi->map, w, h);
	if (!ret)
		ret = check1_macro(nvbo->map, w/512, h/8);

out:
	nouveau_bo_ref(NULL, &nvbo);
	nouveau_bo_ref(NULL, &nvbi);
	return ret;
}

/*
 * Verify the linear read-back produced by test3_base().
 *
 * @p points at @lines rows of @pitch bytes. Words whose byte column lies in
 * [@sub_x, @sub_x + @sub_w) and whose row lies in [@sub_y, @sub_y + @sub_h)
 * must be 0x04040404; every other word must be 0x80808080. If the very
 * last word still holds 0x03030303 the read-back is reported as incomplete.
 *
 * Returns 0 when the image matches and -1 after reporting the first
 * problem on stderr.
 */
static int check3(const uint32_t *p, uint32_t pitch, uint32_t lines,
		  uint32_t sub_x, uint32_t sub_y,
		  uint32_t sub_w, uint32_t sub_h)
{
	const uint32_t x_end = sub_x + sub_w;
	const uint32_t y_end = sub_y + sub_h;
	uint32_t row, col;

	if (p[pitch * lines / 4 - 1] == 0x03030303) {
		fprintf(stderr, "copy failed: Not all lines have been copied back!\n");
		return -1;
	}

	for (row = 0; row < lines; ++row) {
		const int row_inside = row >= sub_y && row < y_end;

		for (col = 0; col < pitch; col += 4, ++p) {
			const int inside = row_inside && col >= sub_x && col < x_end;
			const uint32_t expected = inside ? 0x04040404 : 0x80808080;

			if (*p == expected)
				continue;

			fprintf(stderr, "%u,%u should be %08x, but is %08x\n", col, row, expected, *p);
			return -1;
		}
	}
	return 0;
}

/*
 * Copy from nvidia bo to intel bo and copy to a linear bo to check if
 * tiling went successful.
 *
 * @tile_src: tile mode of the nouveau-allocated source bo (0 = linear).
 * @tile_dst: tile mode applied to the imported intel bo (0 = linear).
 *
 * The intel bo is first filled with 0x80 and the nvidia bo with 0x04, then
 * a 298x298 pixel sub rectangle is copied from the nvidia bo into the intel
 * bo. Finally the intel bo is read back into a linear bo and validated
 * with check3().
 *
 * Returns 0 on success and a non-zero value on any failure.
 */
static int test3_base(int tile_src, int tile_dst)
{
	struct nouveau_bo *bo_intel = NULL, *bo_nvidia = NULL, *bo_linear = NULL;
	rect intel, nvidia, linear;
	int ret;
	uint32_t cpp = 4;

	uint32_t src_x = 1 * cpp, src_y = 1;
	uint32_t dst_x = 2 * cpp, dst_y = 26;
	uint32_t w = 298 * cpp, h = 298;

	drm_intel_bo *test_intel_bo;
	/* Stays -1 if the export fails without storing a descriptor. */
	int prime_fd = -1;

	test_intel_bo = drm_intel_bo_alloc(bufmgr, "test bo", 2048 * cpp * 768, 4096);
	if (!test_intel_bo)
		return -1;

	if (drm_intel_bo_gem_export_to_prime(test_intel_bo, &prime_fd) ||
	    prime_fd < 0) {
		drm_intel_bo_unreference(test_intel_bo);
		return -1;
	}

	/* nv_bo_alloc() takes ownership of prime_fd and closes it. */
	ret = nv_bo_alloc(&bo_intel, &intel, 2048 * cpp, 768, tile_dst, prime_fd, 0);
	if (!ret)
		ret = nv_bo_alloc(&bo_nvidia, &nvidia, 300 * cpp, 300, tile_src, -1, NOUVEAU_BO_VRAM);
	if (!ret)
		ret = nv_bo_alloc(&bo_linear, &linear, 2048 * cpp, 768, 0, -1, NOUVEAU_BO_GART);
	if (ret)
		goto out;

	/* Background: fill the whole intel bo with 0x80. */
	noop_intel(test_intel_bo);
	memset(bo_linear->map, 0x80, bo_linear->size);
	ret = perform_copy(bo_intel, &intel, 0, 0, bo_linear, &linear, 0, 0, linear.pitch, linear.h);
	if (ret)
		goto out;
	noop_intel(test_intel_bo);

	/* Foreground: fill the nvidia bo with 0x04. */
	memset(bo_linear->map, 0x04, bo_linear->size);
	ret = perform_copy(bo_nvidia, &nvidia, 0, 0, bo_linear, &linear, 0, 0, nvidia.pitch, nvidia.h);
	if (ret)
		goto out;

	/* Perform the actual sub rectangle copy */
	noop_intel(test_intel_bo);
	ret = perform_copy(bo_intel, &intel, dst_x, dst_y, bo_nvidia, &nvidia, src_x, src_y, w, h);
	if (ret)
		goto out;
	noop_intel(test_intel_bo);

	/* Poison the linear bo with 0x03 so check3() can spot a short read-back. */
	memset(bo_linear->map, 0x3, bo_linear->size);
	noop_intel(test_intel_bo);
	ret = perform_copy(bo_linear, &linear, 0, 0, bo_intel, &intel, 0, 0, intel.pitch, intel.h);
	if (ret)
		goto out;
	noop_intel(test_intel_bo);

	ret = check3(bo_linear->map, linear.pitch, linear.h, dst_x, dst_y, w, h);

out:
	nouveau_bo_ref(NULL, &bo_linear);
	nouveau_bo_ref(NULL, &bo_nvidia);
	nouveau_bo_ref(NULL, &bo_intel);
	drm_intel_bo_unreference(test_intel_bo);
	return ret;
}

/* Sub rectangle copy: nvidia tiling (mode 0x40) to intel Y tiling. */
static int test3_1(void)
{
	const int src_tiling = 0x40;
	const int dst_tiling = tile_intel_y;

	return test3_base(src_tiling, dst_tiling);
}

/* Sub rectangle copy: intel Y tiling to nvidia tiling (mode 0x40). */
static int test3_2(void)
{
	const int src_tiling = tile_intel_y;
	const int dst_tiling = 0x40;

	return test3_base(src_tiling, dst_tiling);
}

/* Sub rectangle copy: intel Y tiling to linear. */
static int test3_3(void)
{
	const int src_tiling = tile_intel_y;
	const int dst_tiling = 0;

	return test3_base(src_tiling, dst_tiling);
}

/* Sub rectangle copy: linear to intel Y tiling. */
static int test3_4(void)
{
	const int src_tiling = 0;
	const int dst_tiling = tile_intel_y;

	return test3_base(src_tiling, dst_tiling);
}

/* Sub rectangle copy: linear to linear. */
static int test3_5(void)
{
	const int src_tiling = 0;
	const int dst_tiling = 0;

	return test3_base(src_tiling, dst_tiling);
}

/* Acquire when == SEQUENCE */
#define SEMA_ACQUIRE_EQUAL 1

/* Release, and write a 16 byte query structure to sema:
 * { (uint32)seq, (uint32)0, (uint64)timestamp } */
#define SEMA_WRITE_LONG 2

/* Acquire when >= SEQUENCE */
#define SEMA_ACQUIRE_GEQUAL 4

/*
 * Test only new style semaphores, old ones are AWFUL
 *
 * Shares a 4096-byte intel bo with nouveau and uses the dword at byte
 * offset 4 as a semaphore. One push buffer is submitted that alternates
 * between the GPU writing a value and the GPU waiting for the CPU to write
 * one; the CPU side checks each GPU write after sleeping for a
 * millisecond.
 *
 * Returns 0 when all four hand-overs were observed, non-zero otherwise
 * (including -1 for chipsets below 0x84).
 */
static int test_semaphore(void)
{
	drm_intel_bo *test_intel_bo = NULL;
	struct nouveau_bo *sema_bo = NULL;
	/* prime_fd stays -1 if the export fails without storing a descriptor. */
	int ret = -1, prime_fd = -1;
	/*
	 * The GPU writes this location behind the compiler's back, so it is
	 * accessed through a volatile pointer, like the global query pointer.
	 */
	volatile uint32_t *sema;
	struct nouveau_pushbuf *push = npush;

	if (ndev->chipset < 0x84)
		return -1;

	/* Should probably be kept in sysmem */
	test_intel_bo = drm_intel_bo_alloc(bufmgr, "semaphore bo", 4096, 4096);
	if (!test_intel_bo)
		goto out;

	if (drm_intel_bo_gem_export_to_prime(test_intel_bo, &prime_fd) ||
	    prime_fd < 0) {
		ret = -1;
		goto out;
	}
	ret = nouveau_bo_prime_handle_ref(ndev, prime_fd, &sema_bo);
	close(prime_fd);
	if (ret < 0)
		goto out;

	ret = drm_intel_gem_bo_map_gtt(test_intel_bo);
	if (ret != 0) {
		fprintf(stderr,"failed to map bo\n");
		goto out;
	}
	/* Use the second dword of the buffer; the GPU address below adds 4 to match. */
	sema = test_intel_bo->virtual;
	sema++;
	*sema = 0;

	ret = -1;
	if (nouveau_pushbuf_space(push, 64, 0, 0) ||
	    nouveau_pushbuf_refn(push, &(struct nouveau_pushbuf_refn)
	    { sema_bo, NOUVEAU_BO_GART|NOUVEAU_BO_RDWR }, 1))
		goto out;

	if (ndev->chipset < 0xc0) {
		struct nv04_fifo *nv04_fifo = nchannel->data;
		/* kernel binds its own dma object here and overwrites old one,
		 * so just rebind vram every time we submit
		 */
		BEGIN_NV04(npush, SUBC_COPY(0x0060), 1);
		PUSH_DATA(npush, nv04_fifo->vram);
	}
	/* Semaphore address (high, low), then the first sequence/trigger pair. */
	BEGIN_NVXX(push, SUBC_COPY(0x0010), 4);
	PUSH_DATA(push, sema_bo->offset >> 32);
	PUSH_DATA(push, sema_bo->offset + 4);
	PUSH_DATA(push, 2); // SEQUENCE
	PUSH_DATA(push, SEMA_WRITE_LONG); // TRIGGER

	/* Each following pair is a sequence value and a trigger. */
	BEGIN_NVXX(push, SUBC_COPY(0x0018), 2);
	PUSH_DATA(push, 3);
	PUSH_DATA(push, SEMA_ACQUIRE_EQUAL);
	BEGIN_NVXX(push, SUBC_COPY(0x0018), 2);
	PUSH_DATA(push, 4);
	PUSH_DATA(push, SEMA_WRITE_LONG);

	BEGIN_NVXX(push, SUBC_COPY(0x0018), 2);
	PUSH_DATA(push, 5);
	PUSH_DATA(push, SEMA_ACQUIRE_GEQUAL);
	BEGIN_NVXX(push, SUBC_COPY(0x0018), 2);
	PUSH_DATA(push, 6);
	PUSH_DATA(push, SEMA_WRITE_LONG);

	BEGIN_NVXX(push, SUBC_COPY(0x0018), 2);
	PUSH_DATA(push, 7);
	PUSH_DATA(push, SEMA_ACQUIRE_GEQUAL);
	BEGIN_NVXX(push, SUBC_COPY(0x0018), 2);
	PUSH_DATA(push, 9);
	PUSH_DATA(push, SEMA_WRITE_LONG);
	nouveau_pushbuf_kick(push, push->channel);

	/* The GPU writes 2 without waiting for the CPU. */
	usleep(1000);
	if (*sema != 2) {
		fprintf(stderr, "new sema should be 2 is %u\n", *sema);
		goto out;
	}

	/* Satisfy the "== 3" acquire; the GPU answers with 4. */
	*sema = 3;
	usleep(1000);
	if (*sema != 4) {
		fprintf(stderr, "new sema should be 4 is %u\n", *sema);
		goto out;
	}

	/* Satisfy the ">= 5" acquire; the GPU answers with 6. */
	*sema = 5;
	usleep(1000);
	if (*sema != 6) {
		fprintf(stderr, "new sema should be 6 is %u\n", *sema);
		goto out;
	}

	/* 8 satisfies the ">= 7" acquire; the GPU answers with 9. */
	*sema = 8;
	usleep(1000);
	if (*sema != 9) {
		fprintf(stderr, "new sema should be 9 is %u\n", *sema);
		goto out;
	}
	ret = 0;

out:
	nouveau_bo_ref(NULL, &sema_bo);
	if (test_intel_bo)
		drm_intel_bo_unreference(test_intel_bo);
	return ret;
}

/*
 * Test entry point: open the intel and nouveau devices, set up both buffer
 * managers, run every subtest selected through the drmtest subtest helpers,
 * and tear everything down again.
 *
 * Exit status: 77 when the required GPUs are missing or nouveau cannot be
 * initialised, a negative value when device opening or the intel buffer
 * manager fails, otherwise the number of failed subtests.
 */
int main(int argc, char **argv)
{
	int ret, failed = 0, run = 0;

	drmtest_subtest_init(argc, argv);

	ret = find_and_open_devices();
	if (ret < 0)
		return ret;

	if (nouveau_fd == -1 || intel_fd == -1) {
		fprintf(stderr,"failed to find intel and nouveau GPU\n");
		/* When only listing subtests, continue without both devices. */
		if (!drmtest_only_list_subtests())
			return 77;
	}

	/* set up intel bufmgr */
	bufmgr = drm_intel_bufmgr_gem_init(intel_fd, 4096);
	if (!bufmgr)
		return -1;
	/* Do not enable reuse, we share (almost) all buffers. */
	//drm_intel_bufmgr_gem_enable_reuse(bufmgr);

	/* set up nouveau bufmgr */
	ret = init_nouveau();
	if (ret < 0)
		return 77;

	/* set up an intel batch buffer */
	devid = intel_get_drm_devid(intel_fd);
	batch = intel_batchbuffer_alloc(bufmgr, devid);

/*
 * Run subtest x (optionally with args) if the subtest helper selects it,
 * counting it in "run" and, on a non-zero result, in "failed".
 */
#define xtest(x, args...) do { \
	if (!drmtest_run_subtest( #x )) break; \
	ret = ((x)(args)); \
	++run; \
	if (ret) { \
		++failed; \
		fprintf(stderr, "prime_pcopy: failed " #x "\n"); } \
	} while (0)

	xtest(test1_macro);
	xtest(test1_micro);
	xtest(test1_swizzle);
	xtest(test2);
	xtest(test3_1);
	xtest(test3_2);
	xtest(test3_3);
	xtest(test3_4);
	xtest(test3_5);
	xtest(test_semaphore);

	/* Release the nouveau objects created by init_nouveau(). */
	nouveau_bo_ref(NULL, &query_bo);
	nouveau_object_del(&pcopy);
	nouveau_bufctx_del(&nbufctx);
	nouveau_pushbuf_del(&npush);
	nouveau_object_del(&nchannel);

	intel_batchbuffer_free(batch);

	nouveau_client_del(&nclient);
	nouveau_device_del(&ndev);
	drm_intel_bufmgr_destroy(bufmgr);

	close(intel_fd);
	close(nouveau_fd);

	if (!drmtest_only_list_subtests())
		printf("Tests: %u run, %u failed\n", run, failed);

	return failed;
}