/*
 * Copyright © 2007 Intel Corporation
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt
 *    Eugeni Dodonov
 *
 */

#include "config.h"

/*
 * NOTE(review): the system header names were missing from this copy of the
 * file.  The list below is reconstructed from the library calls the code
 * makes; confirm it against the project's canonical source.
 */
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#ifdef HAVE_TERMIOS_H
#include <termios.h>
#endif

#include "intel_io.h"
#include "instdone.h"
#include "intel_reg.h"
#include "intel_chipset.h"
#include "drmtest.h"

#define  FORCEWAKE	    0xA18C
#define  FORCEWAKE_ACK	    0x130090

#define SAMPLES_PER_SEC             10000
#define SAMPLES_TO_PERCENT_RATIO    (SAMPLES_PER_SEC / 100)

#define MAX_NUM_TOP_BITS            100

#define HAS_STATS_REGS(devid)	    IS_965(devid)

/* Per-INSTDONE-bit sample counter; see update_idle_bit(). */
struct top_bit {
	struct instdone_bit *bit;
	int count;
} top_bits[MAX_NUM_TOP_BITS];

/* Pointers into top_bits[], re-sorted by descending count every refresh. */
struct top_bit *top_bits_sorted[MAX_NUM_TOP_BITS];

/* Most recently sampled INSTDONE register values. */
static uint32_t instdone, instdone1;

/* Bar glyphs indexed by the number of filled eighths of a character cell. */
static const char *bars[] = {
	" ",
	"▏",
	"▎",
	"▍",
	"▌",
	"▋",
	"▊",
	"▉",
	"█"
};

enum stats_counts {
	IA_VERTICES,
	IA_PRIMITIVES,
	VS_INVOCATION,
	GS_INVOCATION,
	GS_PRIMITIVES,
	CL_INVOCATION,
	CL_PRIMITIVES,
	PS_INVOCATION,
	PS_DEPTH,
	STATS_COUNT
};

/* Register offset of each statistics counter, indexed by enum stats_counts. */
const uint32_t stats_regs[STATS_COUNT] = {
	IA_VERTICES_COUNT_QW,
	IA_PRIMITIVES_COUNT_QW,
	VS_INVOCATION_COUNT_QW,
	GS_INVOCATION_COUNT_QW,
	GS_PRIMITIVES_COUNT_QW,
	CL_INVOCATION_COUNT_QW,
	CL_PRIMITIVES_COUNT_QW,
	PS_INVOCATION_COUNT_QW,
	PS_DEPTH_COUNT_QW,
};

/* Display name of each statistics counter, indexed by enum stats_counts. */
const char *stats_reg_names[STATS_COUNT] = {
	"vert fetch",
	"prim fetch",
	"VS invocations",
	"GS invocations",
	"GS prims",
	"CL invocations",
	"CL prims",
	"PS invocations",
	"PS depth pass",
};

/* Counter values from the current and the previous refresh. */
uint64_t stats[STATS_COUNT];
uint64_t last_stats[STATS_COUNT];

/*
 * Return the current wall-clock time in microseconds.
 *
 * The arithmetic is done in unsigned long long so that the multiplication
 * of tv_sec cannot overflow where long is 32 bits wide.
 */
static unsigned long long
gettime(void)
{
	struct timeval t;

	gettimeofday(&t, NULL);

	return (unsigned long long)t.tv_sec * 1000000ULL +
	       (unsigned long long)t.tv_usec;
}

/*
 * qsort() comparator for top_bits_sorted[]: orders entries by descending
 * sample count.
 */
static int
top_bits_sort(const void *a, const void *b)
{
	struct top_bit * const *bit_a = a;
	struct top_bit * const *bit_b = b;
	int a_count = (*bit_a)->count;
	int b_count = (*bit_b)->count;

	if (a_count < b_count)
		return 1;
	else if (a_count == b_count)
		return 0;
	else
		return -1;
}

/*
 * Bump the counter of @top_bit when its bit is clear in the most recently
 * sampled INSTDONE value (instdone1 for INSTDONE_1 bits, instdone otherwise).
 * The interactive display reports this count under "percent busy".
 */
static void
update_idle_bit(struct top_bit *top_bit)
{
	uint32_t reg_val;

	if (top_bit->bit->reg == INSTDONE_1)
		reg_val = instdone1;
	else
		reg_val = instdone;

	if ((reg_val & top_bit->bit->bit) == 0)
		top_bit->count++;
}

/* Print "<name> clock: <clock> Mhz", or "unknown" when @clock is -1. */
static void
print_clock(const char *name, int clock)
{
	if (clock == -1)
		printf("%s clock: unknown", name);
	else
		printf("%s clock: %d Mhz", name, clock);
}

/*
 * Decode the GCFGC PCI config register of the supported mobile chipsets and
 * print the clocks it describes, followed by a newline.  Chipsets that match
 * none of the branches only get the newline.
 *
 * The return value is always -1; the only caller ignores it.
 */
static int
print_clock_info(struct pci_device *pci_dev)
{
	uint32_t devid = pci_dev->device_id;
	uint16_t gcfgc;

	if (IS_GM45(devid)) {
		int core_clock = -1;

		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);

		switch (gcfgc & 0xf) {
		case 8:
			core_clock = 266;
			break;
		case 9:
			core_clock = 320;
			break;
		case 11:
			core_clock = 400;
			break;
		case 13:
			core_clock = 533;
			break;
		}
		print_clock("core", core_clock);
	} else if (IS_965(devid) && IS_MOBILE(devid)) {
		int render_clock = -1, sampler_clock = -1;

		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);

		switch (gcfgc & 0xf) {
		case 2:
			render_clock = 250; sampler_clock = 267;
			break;
		case 3:
			render_clock = 320; sampler_clock = 333;
			break;
		case 4:
			render_clock = 400; sampler_clock = 444;
			break;
		case 5:
			render_clock = 500; sampler_clock = 533;
			break;
		}

		print_clock("render", render_clock);
		printf("  ");
		print_clock("sampler", sampler_clock);
	} else if (IS_945(devid) && IS_MOBILE(devid)) {
		int render_clock = -1, display_clock = -1;

		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);

		switch (gcfgc & 0x7) {
		case 0:
			render_clock = 166;
			break;
		case 1:
			render_clock = 200;
			break;
		case 3:
			render_clock = 250;
			break;
		case 5:
			render_clock = 400;
			break;
		}

		/*
		 * The display clock field is bits 6:4, so the case labels
		 * must be shifted to match the masked value.  A bare
		 * "case 4" can never equal (gcfgc & 0x70).
		 */
		switch (gcfgc & 0x70) {
		case 0 << 4:
			display_clock = 200;
			break;
		case 4 << 4:
			display_clock = 320;
			break;
		}
		if (gcfgc & (1 << 7))
			display_clock = 133;

		print_clock("render", render_clock);
		printf("  ");
		print_clock("display", display_clock);
	} else if (IS_915(devid) && IS_MOBILE(devid)) {
		int render_clock = -1, display_clock = -1;

		pci_device_cfg_read_u16(pci_dev, &gcfgc, I915_GCFGC);

		switch (gcfgc & 0x7) {
		case 0:
			render_clock = 160;
			break;
		case 1:
			render_clock = 190;
			break;
		case 4:
			render_clock = 333;
			break;
		}
		if (gcfgc & (1 << 13))
			render_clock = 133;

		/* Bits 6:4 again; see the 945 branch above. */
		switch (gcfgc & 0x70) {
		case 0 << 4:
			display_clock = 190;
			break;
		case 4 << 4:
			display_clock = 333;
			break;
		}
		if (gcfgc & (1 << 7))
			display_clock = 133;

		print_clock("render", render_clock);
		printf("  ");
		print_clock("display", display_clock);
	}

	printf("\n");
	return -1;
}

#define STATS_LEN (20)
#define PERCENTAGE_BAR_END	(79 - STATS_LEN)

/*
 * Draw a horizontal bar for @percent, starting at column @cur_line_len, then
 * pad with spaces up to column PERCENTAGE_BAR_END.
 *
 * The bar length is computed in eighths of a character cell so that the last
 * cell can be one of the partial-block glyphs in bars[].
 */
static void
print_percentage_bar(float percent, int cur_line_len)
{
	int bar_avail_len = (PERCENTAGE_BAR_END - cur_line_len - 1) * 8;
	int bar_len = bar_avail_len * (percent + .5) / 100.0;
	int pad;
	int i;

	/*
	 * A line prefix longer than the bar area yields a negative length.
	 * Clamp it so the partial-cell lookup below cannot index bars[] with
	 * a negative value.
	 */
	if (bar_len < 0)
		bar_len = 0;

	for (i = bar_len; i >= 8; i -= 8) {
		printf("%s", bars[8]);
		cur_line_len++;
	}
	if (i) {
		printf("%s", bars[i]);
		cur_line_len++;
	}

	/* NB: We can't use a field width with utf8 so we manually
	 * guarantee that any bar ends at column PERCENTAGE_BAR_END.
	 * A negative width would make printf left-justify instead of
	 * printing nothing, so only pad when there is room left. */
	pad = PERCENTAGE_BAR_END - cur_line_len;
	if (pad > 0)
		printf("%*s", pad, "");
}

/* Sampling state of one command ring. */
struct ring {
	const char *name;
	uint32_t mmio;		/* base offset of the ring's registers */
	int head, tail, size;	/* size stays 0 until ring_init() is called */
	uint64_t full;		/* sum of (tail - head) over the samples */
	int idle;		/* number of samples with head == tail */
};

/* Read the register at offset @reg from the ring's MMIO base. */
static uint32_t
ring_read(struct ring *ring, uint32_t reg)
{
	return INREG(ring->mmio + reg);
}

/* Compute the ring size in bytes from the page count held in RING_LEN. */
static void
ring_init(struct ring *ring)
{
	ring->size = (((ring_read(ring, RING_LEN) & RING_NR_PAGES) >> 12) + 1) * 4096;
}

/* Clear the accumulators ahead of a new sampling period. */
static void
ring_reset(struct ring *ring)
{
	ring->idle = ring->full = 0;
}

/*
 * Take one head/tail sample of the ring and update its accumulators.
 * Rings that were never initialised (size == 0) are skipped.
 */
static void
ring_sample(struct ring *ring)
{
	int full;

	if (!ring->size)
		return;

	ring->head = ring_read(ring, RING_HEAD) & HEAD_ADDR;
	ring->tail = ring_read(ring, RING_TAIL) & TAIL_ADDR;

	if (ring->tail == ring->head)
		ring->idle++;

	/* The tail may have wrapped around behind the head. */
	full = ring->tail - ring->head;
	if (full < 0)
		full += ring->size;
	ring->full += full;
}

/* Write the two log column titles of this ring to @out. */
static void
ring_print_header(FILE *out, struct ring *ring)
{
	fprintf(out, "%.6s%%\tops\t",
		ring->name
	       );
}

/*
 * Print the interactive summary of the ring: busy percentage with a bar and
 * the average (tail - head) distance against the ring size.  Rings that were
 * never initialised are skipped.
 */
static void
ring_print(struct ring *ring, unsigned long samples_per_sec)
{
	int percent_busy, len;

	if (!ring->size)
		return;

	percent_busy = 100 - 100 * ring->idle / samples_per_sec;

	len = printf("%25s busy: %3d%%: ", ring->name, percent_busy);
	print_percentage_bar (percent_busy, len);
	printf("%24s space: %d/%d\n",
	       ring->name,
	       (int)(ring->full / samples_per_sec),
	       ring->size);
}

/*
 * Append the ring's two log columns (busy percentage, average fill) to
 * @output, or "-1\t-1\t" for a ring that was never initialised.
 */
static void
ring_log(struct ring *ring, unsigned long samples_per_sec,
	 FILE *output)
{
	if (ring->size)
		fprintf(output, "%3d\t%d\t",
			(int)(100 - 100 * ring->idle / samples_per_sec),
			(int)(ring->full / samples_per_sec));
	else
		fprintf(output, "-1\t-1\t");
}

/*
 * Read one 64-bit statistics counter.  The two 32-bit halves are read
 * separately, so the high half is read before and after the low half and the
 * whole read is retried until both high reads agree.
 */
static uint64_t
stats_read(uint32_t reg)
{
	uint32_t high, low, high_again;

	do {
		high = INREG(reg + 4);
		low = INREG(reg);
		high_again = INREG(reg + 4);
	} while (high != high_again);

	return (uint64_t)high << 32 | low;
}

/* Read every statistics counter into @dst, which holds STATS_COUNT values. */
static void
stats_sample(uint64_t *dst)
{
	int i;

	for (i = 0; i < STATS_COUNT; i++)
		dst[i] = stats_read(stats_regs[i]);
}

/* Print the command-line help to stdout. */
static void
usage(const char *appname)
{
	printf("intel_gpu_top - Display a top-like summary of Intel GPU usage\n"
			"\n"
			"usage: %s [parameters]\n"
			"\n"
			"The following parameters apply:\n"
			"[-s <samples>] samples per seconds (default %d)\n"
			"[-e <command>] command to profile\n"
			"[-o <file>] output statistics to file. If file is '-',"
			" run in batch mode and output statistics to stdio only \n"
			"[-h] show this help screen\n"
			"\n",
			appname,
			SAMPLES_PER_SEC
		  );
	return;
}

/*
 * Entry point.
 *
 * Parses the options, optionally forks a child that runs the command given
 * with -e, then loops forever: each iteration samples the INSTDONE bits and
 * the ring registers for roughly one second, and reports the result on the
 * terminal (interactive mode) and/or in the -o log.  When a child was
 * started, the loop ends once that child has terminated.
 */
int main(int argc, char **argv)
{
	uint32_t devid;
	struct pci_device *pci_dev;
	struct ring render_ring = {
		.name = "render",
		.mmio = 0x2030,
	}, bsd_ring = {
		.name = "bitstream",
		.mmio = 0x4030,
	}, bsd6_ring = {
		.name = "bitstream",
		.mmio = 0x12030,
	}, blt_ring = {
		.name = "blitter",
		.mmio = 0x22030,
	};
	int i, ch;
	int samples_per_sec = SAMPLES_PER_SEC;
	FILE *output = NULL;
	double elapsed_time = 0;
	int print_headers = 1;
	pid_t child_pid = -1;
	int child_stat;
	char *cmd = NULL;
	int interactive = 1;

	/* Parse options */
	while ((ch = getopt(argc, argv, "s:o:e:h")) != -1) {
		switch (ch) {
		case 'e':
			/* A repeated -e replaces the previous command. */
			free(cmd);
			cmd = strdup(optarg);
			if (!cmd) {
				perror("strdup");
				exit(1);
			}
			break;
		case 's': {
			char *end;
			long value = strtol(optarg, &end, 10);

			/* Reject empty, non-numeric and out-of-range input. */
			if (end == optarg || *end != '\0' || value > INT_MAX) {
				fprintf(stderr, "Error: invalid samples per second value '%s'\n",
					optarg);
				exit(1);
			}
			if (value < 100) {
				fprintf(stderr, "Error: samples per second must be >= 100\n");
				exit(1);
			}
			samples_per_sec = (int)value;
			break;
		}
		case 'o':
			if (!strcmp(optarg, "-")) {
				/* Running in non-interactive mode */
				interactive = 0;
				output = stdout;
			}
			else
				output = fopen(optarg, "w");
			if (!output)
			{
				perror("fopen");
				exit(1);
			}
			break;
		case 'h':
			usage(argv[0]);
			exit(0);
			break;
		default:
			fprintf(stderr, "Invalid flag %c!\n", (char)optopt);
			usage(argv[0]);
			exit(1);
			break;
		}
	}

	pci_dev = intel_get_pci_device();
	devid = pci_dev->device_id;
	intel_mmio_use_pci_bar(pci_dev);
	init_instdone_definitions(devid);

	/* Do we have a command to run? */
	if (cmd != NULL) {
		if (output) {
			fprintf(output, "# Profiling: %s\n", cmd);
			fflush(output);
		}
		child_pid = fork();
		if (child_pid < 0) {
			perror("fork");
			exit(1);
		}
		else if (child_pid == 0) {
			int res;

			res = system(cmd);
			if (res < 0)
				perror("running command");
			if (output) {
				fflush(output);
				fprintf(output, "# %s exited with status %d\n", cmd, res);
				fflush(output);
			}
			free(cmd);
			exit(0);
		} else {
			free(cmd);
		}
	}

	/* top_bits[] and top_bits_sorted[] have a fixed capacity. */
	if (num_instdone_bits > MAX_NUM_TOP_BITS) {
		fprintf(stderr, "Error: too many INSTDONE bits (max %d)\n",
			MAX_NUM_TOP_BITS);
		exit(1);
	}

	for (i = 0; i < num_instdone_bits; i++) {
		top_bits[i].bit = &instdone_bits[i];
		top_bits[i].count = 0;
		top_bits_sorted[i] = &top_bits[i];
	}

	/* Grab access to the registers */
	intel_register_access_init(pci_dev, 0, -1);

	ring_init(&render_ring);
	if (IS_GEN4(devid) || IS_GEN5(devid))
		ring_init(&bsd_ring);
	if (IS_GEN6(devid) || IS_GEN7(devid)) {
		ring_init(&bsd6_ring);
		ring_init(&blt_ring);
	}

	/* Initialize GPU stats */
	if (HAS_STATS_REGS(devid))
		stats_sample(last_stats);

	for (;;) {
		int j;
		unsigned long long t1, ti, tf, t2;
		unsigned long long def_sleep = 1000000 / samples_per_sec;
		unsigned long long last_samples_per_sec = samples_per_sec;
		int max_lines;
		struct winsize ws;
		char clear_screen[] = {0x1b, '[', 'H',
				       0x1b, '[', 'J',
				       0x0};
		int percent;
		int len;

		t1 = gettime();

		ring_reset(&render_ring);
		ring_reset(&bsd_ring);
		ring_reset(&bsd6_ring);
		ring_reset(&blt_ring);

		for (i = 0; i < samples_per_sec; i++) {
			long long interval;

			ti = gettime();
			if (IS_965(devid)) {
				instdone = INREG(INSTDONE_I965);
				instdone1 = INREG(INSTDONE_1);
			} else
				instdone = INREG(INSTDONE);

			for (j = 0; j < num_instdone_bits; j++)
				update_idle_bit(&top_bits[j]);

			ring_sample(&render_ring);
			ring_sample(&bsd_ring);
			ring_sample(&bsd6_ring);
			ring_sample(&blt_ring);

			tf = gettime();
			if (tf - t1 >= 1000000) {
				/* We are out of sync, bail out.  Remember how
				 * many samples were really taken so that the
				 * percentages below stay correct. */
				last_samples_per_sec = i + 1;
				break;
			}
			interval = def_sleep - (tf - ti);
			if (interval > 0)
				usleep(interval);
		}

		if (HAS_STATS_REGS(devid))
			stats_sample(stats);

		qsort(top_bits_sorted, num_instdone_bits,
		      sizeof(struct top_bit *), top_bits_sort);

		/* Limit the number of lines printed to the terminal height so the
		 * most important info (at the top) will stay on screen.
		 * A reported height of 0 rows is treated as "unknown" and
		 * leaves the output unlimited. */
		max_lines = num_instdone_bits;
		if (ioctl(0, TIOCGWINSZ, &ws) != -1 && ws.ws_row != 0) {
			int rows = (int)ws.ws_row - 6; /* exclude header lines */

			if (rows < 0)
				rows = 0;
			if (rows < max_lines)
				max_lines = rows;
		}

		t2 = gettime();
		elapsed_time += (t2 - t1) / 1000000.0;

		if (interactive) {
			printf("%s", clear_screen);
			print_clock_info(pci_dev);

			ring_print(&render_ring, last_samples_per_sec);
			ring_print(&bsd_ring, last_samples_per_sec);
			ring_print(&bsd6_ring, last_samples_per_sec);
			ring_print(&blt_ring, last_samples_per_sec);

			printf("\n%30s  %s\n", "task", "percent busy");
			for (i = 0; i < max_lines; i++) {
				if (top_bits_sorted[i]->count > 0) {
					percent = (top_bits_sorted[i]->count * 100) /
						last_samples_per_sec;
					len = printf("%30s: %3d%%: ",
							 top_bits_sorted[i]->bit->name,
							 percent);
					print_percentage_bar (percent, len);
				} else {
					printf("%*s", PERCENTAGE_BAR_END, "");
				}

				if (i < STATS_COUNT && HAS_STATS_REGS(devid)) {
					/* last_stats[] is advanced once, at
					 * the end of the iteration, so the
					 * log below sees the same deltas. */
					printf("%13s: %llu (%lld/sec)",
						   stats_reg_names[i],
						   (unsigned long long)stats[i],
						   (long long)(stats[i] - last_stats[i]));
				} else {
					if (!top_bits_sorted[i]->count)
						break;
				}
				printf("\n");
			}
		}
		if (output) {
			/* Print headers for columns at first run */
			if (print_headers) {
				fprintf(output, "# time\t");
				ring_print_header(output, &render_ring);
				ring_print_header(output, &bsd_ring);
				ring_print_header(output, &bsd6_ring);
				ring_print_header(output, &blt_ring);
				if (HAS_STATS_REGS(devid)) {
					for (i = 0; i < STATS_COUNT; i++)
						fprintf(output, "%.6s\t",
							stats_reg_names[i]
						       );
				}
				fprintf(output, "\n");
				print_headers = 0;
			}

			/* Print statistics */
			fprintf(output, "%.2f\t", elapsed_time);
			ring_log(&render_ring, last_samples_per_sec, output);
			ring_log(&bsd_ring, last_samples_per_sec, output);
			ring_log(&bsd6_ring, last_samples_per_sec, output);
			ring_log(&blt_ring, last_samples_per_sec, output);

			if (HAS_STATS_REGS(devid)) {
				for (i = 0; i < STATS_COUNT; i++)
					fprintf(output, "%"PRIu64"\t",
						stats[i] - last_stats[i]);
			}
			fprintf(output, "\n");
			fflush(output);
		}

		/* Start the next period from zero ... */
		for (i = 0; i < num_instdone_bits; i++)
			top_bits_sorted[i]->count = 0;

		/* ... and make this period's counters the new baseline. */
		for (i = 0; i < STATS_COUNT; i++)
			last_stats[i] = stats[i];

		/* Check if child has gone */
		if (child_pid > 0) {
			int res = waitpid(child_pid, &child_stat, WNOHANG);

			if (res == -1) {
				perror("waitpid");
				exit(1);
			}
			/* A non-zero result means the child was reaped,
			 * whether it exited normally or was killed. */
			if (res != 0)
				break;
		}
	}

	/* output is NULL when -o was not given. */
	if (output)
		fclose(output);

	intel_register_access_fini();
	return 0;
}