diff options
-rw-r--r-- | tools/intel_perf_counters.c | 146 |
1 files changed, 146 insertions, 0 deletions
diff --git a/tools/intel_perf_counters.c b/tools/intel_perf_counters.c index fd268b12..b5283614 100644 --- a/tools/intel_perf_counters.c +++ b/tools/intel_perf_counters.c @@ -22,9 +22,21 @@ * * Authors: * Eric Anholt <eric@anholt.net> + * Kenneth Graunke <kenneth@whitecape.org> + * + * While documentation for performance counters is suspiciously missing from the + * Sandybridge PRM, they were documented in Volume 1 Part 3 of the Ironlake PRM. + * + * A lot of the Ironlake PRM actually unintentionally documents Sandybridge + * due to mistakes made when updating the documentation for Gen6+. Many of + * these mislabeled sections carried forward to the public documentation. + * + * The Ironlake PRMs have been publicly available since 2010 and are online at: + * https://01.org/linuxgraphics/documentation/2010-intel-core-processor-family */ #include <unistd.h> +#include <stdbool.h> #include <stdlib.h> #include <stdio.h> #include <err.h> @@ -71,6 +83,60 @@ const char *gen5_counter_names[GEN5_COUNTER_COUNT] = { "cycles any EU is stalled for math", }; +#define GEN6_COUNTER_COUNT 29 + +/** + * Sandybridge: Counter Select = 001 + * A0 A1 A2 A3 A4 TIMESTAMP RPT_ID + * A5 A6 A7 A8 A9 A10 A11 A12 + * A13 A14 A15 A16 A17 A18 A19 A20 + * A21 A22 A23 A24 A25 A26 A27 A28 + */ +const int gen6_counter_format = 1; + +/** + * Names for aggregating counters A0-A28. + * + * While the Ironlake PRM clearly documents that there are 29 counters (A0-A28), + * it only lists the names for 28 of them; one is missing. However, careful + * examination reveals a pattern: there are five GS counters (Active, Stall, + * Core Stall, # threads loaded, and ready but not running time). There are + * also five PS counters, in the same order. But there are only four VS + * counters listed - the number of VS threads loaded is missing. Presumably, + * it exists and is counter 5, and the rest are shifted over one place. + */ +const char *gen6_counter_names[GEN6_COUNTER_COUNT] = { + [0] = "Aggregated Core Array Active", + [1] = "Aggregated Core Array Stalled", + [2] = "Vertex Shader Active Time", + [3] = "Vertex Shader Stall Time", + [4] = "Vertex Shader Stall Time - Core Stall", + [5] = "# VS threads loaded", + [6] = "Vertex Shader Ready but not running time", + [7] = "Geometry Shader Active Time", + [8] = "Geometry Shader Stall Time", + [9] = "Geometry Shader Stall Time - Core Stall", + [10] = "# GS threads loaded", + [11] = "Geometry Shader ready but not running Time", + [12] = "Pixel Shader Active Time", + [13] = "Pixel Shader Stall Time", + [14] = "Pixel Shader Stall Time - Core Stall", + [15] = "# PS threads loaded", + [16] = "Pixel Shader ready but not running Time", + [17] = "Early Z Test Pixels Passing", + [18] = "Early Z Test Pixels Failing", + [19] = "Early Stencil Test Pixels Passing", + [20] = "Early Stencil Test Pixels Failing", + [21] = "Pixel Kill Count", + [22] = "Alpha Test Pixels Failed", + [23] = "Post PS Stencil Pixels Failed", + [24] = "Post PS Z buffer Pixels Failed", + [25] = "Pixels/samples Written in the frame buffer", + [26] = "GPU Busy", + [27] = "CL active and not stalled", + [28] = "SF active and stalled", +}; + int have_totals = 0; uint32_t *totals; uint32_t *last_counter; @@ -85,6 +151,20 @@ struct intel_batchbuffer *batch; #define MI_COUNTER_ADDRESS_GTT (1 << 0) /* DW2: report ID */ +/** + * According to the Sandybridge PRM, Volume 1, Part 1, page 48, + * MI_REPORT_PERF_COUNT is now opcode 0x28. The Ironlake PRM, Volume 1, + * Part 3 details how it works. + */ +/* DW0 */ +#define GEN6_MI_REPORT_PERF_COUNT (0x28 << 23) +/* DW1 and 2 are the same as above */ + +/* OACONTROL exists on Gen6+ but is documented in the Ironlake PRM */ +#define OACONTROL 0x2360 +# define OACONTROL_COUNTER_SELECT_SHIFT 2 +# define PERFORMANCE_COUNTER_ENABLE (1 << 0) + static void gen5_get_counters(void) { @@ -124,6 +204,45 @@ gen5_get_counters(void) drm_intel_bo_unreference(stats_bo); } +static void +gen6_get_counters(void) +{ + int i; + drm_intel_bo *stats_bo; + uint32_t *stats_result; + + /* Map from counter names to their index in the buffer object */ + static const int buffer_index[GEN6_COUNTER_COUNT] = + { + 7, 6, 5, 4, 3, + 15, 14, 13, 12, 11, 10, 9, 8, + 23, 22, 21, 20, 19, 18, 17, 16, + 31, 30, 29, 28, 27, 26, 25, 24, + }; + + stats_bo = drm_intel_bo_alloc(bufmgr, "stats", 4096, 4096); + + BEGIN_BATCH(3); + OUT_BATCH(GEN6_MI_REPORT_PERF_COUNT | (3 - 2)); + OUT_RELOC(stats_bo, + I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, + MI_COUNTER_ADDRESS_GTT); + OUT_BATCH(0); + ADVANCE_BATCH(); + + intel_batchbuffer_flush_on_ring(batch, I915_EXEC_RENDER); + + drm_intel_bo_map(stats_bo, 0); + stats_result = stats_bo->virtual; + for (i = 0; i < GEN6_COUNTER_COUNT; i++) { + totals[i] += stats_result[buffer_index[i]] - last_counter[i]; + last_counter[i] = stats_result[buffer_index[i]]; + } + + drm_intel_bo_unmap(stats_bo); + drm_intel_bo_unreference(stats_bo); +} + #define STATS_CHECK_FREQUENCY 100 #define STATS_REPORT_FREQUENCY 2 @@ -131,6 +250,7 @@ int main(int argc, char **argv) { uint32_t devid; + int counter_format; int counter_count; const char **counter_name; void (*get_counters)(void); @@ -138,6 +258,7 @@ main(int argc, char **argv) char clear_screen[] = {0x1b, '[', 'H', 0x1b, '[', 'J', 0x0}; + bool oacontrol = true; int fd; int l; @@ -152,10 +273,27 @@ main(int argc, char **argv) counter_name = gen5_counter_names; counter_count = GEN5_COUNTER_COUNT; get_counters = gen5_get_counters; + oacontrol = false; + } else if (IS_GEN6(devid)) { + counter_name = gen6_counter_names; + counter_count = GEN6_COUNTER_COUNT; + counter_format = gen6_counter_format; + get_counters = gen6_get_counters; } else { printf("This tool is not yet supported on your platform.\n"); abort(); } + + if (oacontrol) { + /* Forcewake */ + intel_register_access_init(intel_get_pci_device(), 0); + + /* Enable performance counters */ + intel_register_write(OACONTROL, + counter_format << OACONTROL_COUNTER_SELECT_SHIFT | + PERFORMANCE_COUNTER_ENABLE); + } + totals = calloc(counter_count, sizeof(uint32_t)); last_counter = calloc(counter_count, sizeof(uint32_t)); @@ -180,6 +318,14 @@ main(int argc, char **argv) } } + if (oacontrol) { + /* Disable performance counters */ + intel_register_write(OACONTROL, 0); + + /* Forcewake */ + intel_register_access_fini(); + } + free(totals); free(last_counter); |