From 91e589724694c23db9669b987a9411b7ae152d0d Mon Sep 17 00:00:00 2001 From: Damien Lespiau Date: Wed, 27 Feb 2013 14:51:34 +0000 Subject: rendercopy/bdw: Fix the original implementation For posterity, I've squashed these commits against Damien's request. rendercopy/gen8: Fix the include guards rendercopy/gen8: Update the 3DSTATE_MULTISAMPLE opcode The opcode has changed in BDW. rendercopy/gen8: Add the VF_TOPOLOGY state The primitive type has moved out of the 3DPRIMITIVE to its own state, VF_TOPOLOGY. rendercopy/gen8: Fixup 3DSTATE_PS Update the state to the latest BSpec, in particular the thread count was using a wrong shift and we were missing kernel2 offset. rendercopy/gen8: Update 3DSTATE_BASE_ADDRESS This state has seen its fields moved around a bit, follow the BSpec. rendercopy/gen8: Allocate 64 VUEs The simulator screams at us if we try to allocate fewer than that. rendercopy/gen8: Surface states have to be 64 bytes aligned rendercopy/gen8: Vertical/horizontal align 2 does not exist any more So set them to 4. This should not matter with rendercopy (which is not using compressed textures), but it makes the simulator moan. rendercopy/gen8: Make sure the vertex buffer is 8 bytes aligned rendercopy/gen8: Adjust 3DSTATE_VERTEX_BUFFERS for gen8 The address of the buffer is now on 48 bits. Also the size was computed as offset + size where the field is really the size of the buffer itself, not the end address. rendercopy/gen8: Update the SF/SBE states for gen8 gen8 has a few changes around those states and two new ones, RASTER and SBE_SWIZ. rendercopy/gen8: Add the PS_EXTRA and PS_BLEND states rendercopy/gen8: Fix building with DEBUG_RENDERCPY defined The forward declaration was missing the final ';'. Let's move the whole function to the top instead. rendercopy/gen8: Update the PS and CONSTANT_PS states rendercopy/gen8: Fix the red channel selection Make it output red. rendercopy/gen8: Update the write -1 shader With the latest assembler changes from Haihao. 
rendercopy/gen8: Remove blit.g8a There is no diff between this file and blit.g7a. Remove it. rendercopy/gen8: Fix the surface relocation offset The surface base address is now at dwords 8/9 so the relocation has to mirror the change. rendercopy/gen8: Add the VF_INSTANCING state Should work without, but doesn't hurt to add it. rendercopy/gen8: Set the Attribute enable field in PS_EXTRA When the SF is set up to output some attributes, the pixel shader also has to be told there are attributes to care about. rendercopy/gen8: Set the force bits to read URB offset/length If we want to override the URB offset/length in the SBE state itself, we need to set the force bits on (new in gen8) Signed-off-by: Damien Lespiau Acked-by: Kenneth Graunke Signed-off-by: Ben Widawsky --- lib/rendercopy_gen8.c | 182 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 109 insertions(+), 73 deletions(-) (limited to 'lib/rendercopy_gen8.c') diff --git a/lib/rendercopy_gen8.c b/lib/rendercopy_gen8.c index cc371b3d..9dfcbaf8 100644 --- a/lib/rendercopy_gen8.c +++ b/lib/rendercopy_gen8.c @@ -7,7 +7,13 @@ #define VERTEX_SIZE (3*4) #if DEBUG_RENDERCPY -static void dump_batch(struct intel_batchbuffer *batch) +static void dump_batch(struct intel_batchbuffer *batch) { + int fd = open("/tmp/i965-batchbuffers.dump", O_WRONLY | O_CREAT, 0666); + if (fd != -1) { + write(fd, batch->buffer, 4096); + fd = close(fd); + } +} #else #define dump_batch(x) do { } while(0) #endif @@ -33,15 +39,15 @@ static const uint32_t ps_kernel[][4] = { { 0x05800031, 0x200022e0, 0x0e000e00, 0x90031000 }, #else /* Write all -1 */ - { 0x00600001, 0x2e000061, 0x00000000, 0x3f800000 }, - { 0x00600001, 0x2e200061, 0x00000000, 0x3f800000 }, - { 0x00600001, 0x2e400061, 0x00000000, 0x3f800000 }, - { 0x00600001, 0x2e600061, 0x00000000, 0x3f800000 }, - { 0x00600001, 0x2e800061, 0x00000000, 0x3f800000 }, - { 0x00600001, 0x2ea00061, 0x00000000, 0x3f800000 }, - { 0x00600001, 0x2ec00061, 0x00000000, 0x3f800000 }, - { 
0x00600001, 0x2ee00061, 0x00000000, 0x3f800000 }, - { 0x05800031, 0x20001e3c, 0x00000e00, 0x90031000 }, + { 0x00600001, 0x2e000608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2e200608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2e400608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2e600608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2e800608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2ea00608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2ec00608, 0x00000000, 0x3f800000 }, + { 0x00600001, 0x2ee00608, 0x00000000, 0x3f800000 }, + { 0x05800031, 0x200022e0, 0x0e000e00, 0x90031000 }, #endif }; @@ -107,10 +113,12 @@ gen8_bind_buf(struct intel_batchbuffer *batch, struct scratch_buf *buf, read_domain = I915_GEM_DOMAIN_SAMPLER; } - ss = batch_alloc(batch, sizeof(*ss), 32); + ss = batch_alloc(batch, sizeof(*ss), 64); ss->ss0.surface_type = GEN6_SURFACE_2D; ss->ss0.surface_format = format; ss->ss0.render_cache_read_write = 1; + ss->ss0.vertical_alignment = 1; /* align 4 */ + ss->ss0.horizontal_alignment = 1; /* align 4 */ if (buf->tiling == I915_TILING_X) ss->ss0.tiled_mode = 2; else if (buf->tiling == I915_TILING_Y) @@ -119,7 +127,7 @@ gen8_bind_buf(struct intel_batchbuffer *batch, struct scratch_buf *buf, ss->ss8.base_addr = buf->bo->offset; ret = drm_intel_bo_emit_reloc(batch->bo, - batch_offset(batch, ss) + 4, + batch_offset(batch, ss) + 8 * 4, buf->bo, 0, read_domain, write_domain); assert(ret == 0); @@ -128,7 +136,7 @@ gen8_bind_buf(struct intel_batchbuffer *batch, struct scratch_buf *buf, ss->ss2.width = buf_width(buf) - 1; ss->ss3.pitch = buf->stride - 1; - ss->ss7.shader_chanel_select_a = 4; + ss->ss7.shader_chanel_select_r = 4; ss->ss7.shader_chanel_select_g = 5; ss->ss7.shader_chanel_select_b = 6; ss->ss7.shader_chanel_select_a = 7; @@ -190,6 +198,7 @@ gen7_fill_vertex_buffer_data(struct intel_batchbuffer *batch, uint32_t width, uint32_t height) { void *ret; + batch_align(batch, 8); ret = batch->ptr; emit_vertex_2s(batch, dst_x + width, dst_y + height); @@ 
-272,14 +281,13 @@ gen6_emit_vertex_elements(struct intel_batchbuffer *batch) { */ static void gen7_emit_vertex_buffer(struct intel_batchbuffer *batch, uint32_t offset) { - OUT_BATCH(GEN6_3DSTATE_VERTEX_BUFFERS | (4 * 1 - 1)); + OUT_BATCH(GEN6_3DSTATE_VERTEX_BUFFERS | (1 + (4 * 1) - 2)); OUT_BATCH(0 << VB0_BUFFER_INDEX_SHIFT | /* VB 0th index */ - VB0_VERTEXDATA | GEN7_VB0_BUFFER_ADDR_MOD_EN | /* Address Modify Enable */ VERTEX_SIZE << VB0_BUFFER_PITCH_SHIFT); OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, offset); - OUT_RELOC(batch->bo, I915_GEM_DOMAIN_VERTEX, 0, offset + (VERTEX_SIZE * 3) - 1); OUT_BATCH(0); + OUT_BATCH(3 * VERTEX_SIZE); } static uint32_t @@ -361,30 +369,47 @@ gen7_emit_push_constants(struct intel_batchbuffer *batch) { } static void -gen7_emit_state_base_address(struct intel_batchbuffer *batch) { - OUT_BATCH(GEN6_STATE_BASE_ADDRESS | (10 - 2)); - /* general (stateless) */ - /* surface */ - /* instruction */ - /* indirect */ - /* dynamic */ +gen8_emit_state_base_address(struct intel_batchbuffer *batch) { + OUT_BATCH(GEN6_STATE_BASE_ADDRESS | (16 - 2)); + + /* general */ OUT_BATCH(0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(0); + + /* stateless data port */ + OUT_BATCH(0 | BASE_ADDRESS_MODIFY); + + /* surface */ OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY); + OUT_BATCH(0); + + /* dynamic */ OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY); - OUT_BATCH(0 | BASE_ADDRESS_MODIFY); + OUT_BATCH(0); + + /* indirect */ + OUT_BATCH(0); + OUT_BATCH(0); + + /* instruction */ OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY); + OUT_BATCH(0); - OUT_BATCH(0 | BASE_ADDRESS_MODIFY); - OUT_BATCH(0xfffff000 | BASE_ADDRESS_MODIFY); // copied from mesa - OUT_BATCH(0 | BASE_ADDRESS_MODIFY); - OUT_BATCH(0 | BASE_ADDRESS_MODIFY); + /* general state buffer size */ + OUT_BATCH(0xfffff000 | 1); + /* dynamic state buffer size */ + OUT_BATCH(1 << 12 | 1); + /* indirect object buffer 
size */ + OUT_BATCH(0xfffff000 | 1); + /* intruction buffer size */ + OUT_BATCH(1 << 12); } static void gen7_emit_urb(struct intel_batchbuffer *batch) { /* XXX: Min valid values from mesa */ - const int vs_entries = 32; + const int vs_entries = 64; const int vs_size = 2; const int vs_start = 2; @@ -408,8 +433,8 @@ gen8_emit_cc(struct intel_batchbuffer *batch) { } static void -gen7_emit_multisample(struct intel_batchbuffer *batch) { - OUT_BATCH(GEN6_3DSTATE_MULTISAMPLE | 2); +gen8_emit_multisample(struct intel_batchbuffer *batch) { + OUT_BATCH(GEN8_3DSTATE_MULTISAMPLE | 2); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); @@ -537,31 +562,32 @@ gen7_emit_clip(struct intel_batchbuffer *batch) { } static void -gen7_emit_sf(struct intel_batchbuffer *batch) { - OUT_BATCH(GEN7_3DSTATE_SBE | (14 - 2)); -#ifdef GPU_HANG - OUT_BATCH(0 << 22 | 1 << 11 | 1 << 4); -#else - OUT_BATCH(1 << 22 | 1 << 11 | 1 << 4); -#endif - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); - OUT_BATCH(0); +gen8_emit_sf(struct intel_batchbuffer *batch) +{ + int i; + + OUT_BATCH(GEN7_3DSTATE_SBE | (4 - 2)); + OUT_BATCH(1 << GEN7_SBE_NUM_OUTPUTS_SHIFT | + GEN8_SBE_FORCE_URB_ENTRY_READ_LENGTH | + GEN8_SBE_FORCE_URB_ENTRY_READ_OFFSET | + 1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT | + 1 << GEN8_SBE_URB_ENTRY_READ_OFFSET_SHIFT); OUT_BATCH(0); OUT_BATCH(0); + + OUT_BATCH(GEN8_3DSTATE_SBE_SWIZ | (11 - 2)); + for (i = 0; i < 8; i++) + OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); + + OUT_BATCH(GEN8_3DSTATE_RASTER | (5 - 2)); + OUT_BATCH(GEN8_RASTER_FRONT_WINDING_CCW | GEN8_RASTER_CULL_NONE); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); - OUT_BATCH(GEN6_3DSTATE_SF | (7 - 2)); - OUT_BATCH(0); - OUT_BATCH(GEN6_3DSTATE_SF_CULL_NONE); -// OUT_BATCH(2 << GEN6_3DSTATE_SF_TRIFAN_PROVOKE_SHIFT); - OUT_BATCH(0); + OUT_BATCH(GEN6_3DSTATE_SF | (4 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); @@ -569,17 +595,19 @@ gen7_emit_sf(struct intel_batchbuffer *batch) { static void gen8_emit_ps(struct intel_batchbuffer *batch, 
uint32_t kernel) { - const int max_threads = 86; + const int max_threads = 63; - OUT_BATCH(GEN6_3DSTATE_WM | (3 - 2)); - OUT_BATCH(GEN7_WM_DISPATCH_ENABLE | - /* XXX: I don't understand the BARYCENTRIC stuff, but it + OUT_BATCH(GEN6_3DSTATE_WM | (2 - 2)); + OUT_BATCH(/* XXX: I don't understand the BARYCENTRIC stuff, but it * appears we need it to put our setup data in the place we * expect (g6, see below) */ GEN7_3DSTATE_PS_PERSPECTIVE_PIXEL_BARYCENTRIC); - OUT_BATCH(0); - OUT_BATCH(GEN6_3DSTATE_CONSTANT_PS | (7-2)); + OUT_BATCH(GEN6_3DSTATE_CONSTANT_PS | (11-2)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); @@ -587,19 +615,26 @@ gen8_emit_ps(struct intel_batchbuffer *batch, uint32_t kernel) { OUT_BATCH(0); OUT_BATCH(0); - OUT_BATCH(GEN7_3DSTATE_PS | (10-2)); + OUT_BATCH(GEN7_3DSTATE_PS | (12-2)); OUT_BATCH(kernel); OUT_BATCH(0); /* kernel hi */ OUT_BATCH(1 << GEN6_3DSTATE_WM_SAMPLER_COUNT_SHITF | 2 << GEN6_3DSTATE_WM_BINDING_TABLE_ENTRY_COUNT_SHIFT); OUT_BATCH(0); /* scratch space stuff */ OUT_BATCH(0); /* scratch hi */ - OUT_BATCH((max_threads - 1) << GEN7_3DSTATE_WM_MAX_THREADS_SHIFT | - GEN7_3DSTATE_PS_ATTRIBUTE_ENABLED | + OUT_BATCH((max_threads - 1) << GEN8_3DSTATE_PS_MAX_THREADS_SHIFT | GEN6_3DSTATE_WM_16_DISPATCH_ENABLE); OUT_BATCH(6 << GEN6_3DSTATE_WM_DISPATCH_START_GRF_0_SHIFT); OUT_BATCH(0); // kernel 1 OUT_BATCH(0); /* kernel 1 hi */ + OUT_BATCH(0); // kernel 2 + OUT_BATCH(0); /* kernel 2 hi */ + + OUT_BATCH(GEN8_3DSTATE_PS_BLEND | (2 - 2)); + OUT_BATCH(GEN8_PS_BLEND_HAS_WRITEABLE_RT); + + OUT_BATCH(GEN8_3DSTATE_PS_EXTRA | (2 - 2)); + OUT_BATCH(GEN8_PSX_PIXEL_SHADER_VALID | GEN8_PSX_ATTRIBUTE_ENABLE); } static void @@ -637,11 +672,21 @@ gen6_emit_drawing_rectangle(struct intel_batchbuffer *batch, struct scratch_buf OUT_BATCH(0); } +static void gen8_emit_vf_topology(struct intel_batchbuffer *batch) +{ + OUT_BATCH(GEN8_3DSTATE_VF_TOPOLOGY); + OUT_BATCH(_3DPRIM_RECTLIST); +} + /* Vertex elements 
MUST be defined before this according to spec */ -static void gen7_emit_primitive(struct intel_batchbuffer *batch, uint32_t offset) +static void gen8_emit_primitive(struct intel_batchbuffer *batch, uint32_t offset) { + OUT_BATCH(GEN8_3DSTATE_VF_INSTANCING | (3 - 2)); + OUT_BATCH(0); + OUT_BATCH(0); + OUT_BATCH(GEN6_3DPRIMITIVE | (7-2)); - OUT_BATCH(_3DPRIM_RECTLIST); + OUT_BATCH(0); /* gen8+ ignore the topology type field */ OUT_BATCH(3); /* vertex count */ OUT_BATCH(0); /* We're specifying this instead with offset in GEN6_3DSTATE_VERTEX_BUFFERS */ OUT_BATCH(1); /* single instance */ @@ -719,7 +764,7 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch, gen7_emit_push_constants(batch); - gen7_emit_state_base_address(batch); + gen8_emit_state_base_address(batch); OUT_BATCH(GEN7_3DSTATE_VIEWPORT_STATE_POINTERS_CC); OUT_BATCH(viewport.cc_state); @@ -730,7 +775,7 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch, gen8_emit_cc(batch); - gen7_emit_multisample(batch); + gen8_emit_multisample(batch); gen7_emit_null_state(batch); @@ -740,7 +785,7 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch, gen7_emit_clip(batch); - gen7_emit_sf(batch); + gen8_emit_sf(batch); OUT_BATCH(GEN7_3DSTATE_BINDING_TABLE_POINTERS_PS); OUT_BATCH(ps_binding_table); @@ -762,7 +807,8 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch, gen7_emit_vertex_buffer(batch, vertex_buffer); gen6_emit_vertex_elements(batch); - gen7_emit_primitive(batch, vertex_buffer); + gen8_emit_vf_topology(batch); + gen8_emit_primitive(batch, vertex_buffer); OUT_BATCH(MI_BATCH_BUFFER_END); @@ -774,13 +820,3 @@ void gen8_render_copyfunc(struct intel_batchbuffer *batch, gen6_render_flush(batch, batch_end); intel_batchbuffer_reset(batch); } - -#if DEBUG_RENDERCPY -static void dump_batch(struct intel_batchbuffer *batch) { - int fd = open("/tmp/i965-batchbuffers.dump", O_WRONLY | O_CREAT, 0666); - if (fd != -1) { - write(fd, batch->buffer, 4096); - fd = close(fd); - } -} -#endif -- cgit 
v1.2.3