i915 GT timeline

#! /usr/bin/perl # # Copyright © 2017 Intel Corporation # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice (including the next # paragraph) shall be included in all copies or substantial portions of the # Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. # use strict; use warnings; use 5.010; my $gid = 0; my (%db, %queue, %submit, %notify, %rings, %ctxdb, %ringmap, %reqwait); my @freqs; my $max_items = 3000; my $width_us = 32000; my $correct_durations = 0; my %ignore_ring; my %skip_box; my $html = 0; my $trace = 0; my $avg_delay_stats = 0; my $squash_context_id = 0; my $gpu_timeline = 0; my @args; sub arg_help { return unless scalar(@_); if ($_[0] eq '--help' or $_[0] eq '-h') { shift @_; print <trace.log This file in turn should be piped into this tool which will generate some statistics out of it, or if --html was given HTML output. HTML can be viewed from a directory containing the 'vis' JavaScript module. On Ubuntu this can be installed like this: apt-get install npm npm install vis Usage: trace.pl

output-file --help / -h This help text --max-items=num / -m num Maximum number of boxes to put on the timeline. More boxes means more work for the JavaScript engine in the browser. --zoom-width-ms=ms / -z ms Width of the initial timeline zoom --split-requests / -s Try to split out request which were submitted together due coalescing in the driver. May not be 100% accurate and may influence the per-engine statistics so use with care. --ignore-ring=id / -i id Ignore ring with the numerical id when parsing the log (enum intel_engine_id). Can be given multiple times. --skip-box=name / -x name Do not put a certain type of a box on the timeline. One of: queue, ready, execute and ctxsave. Can be given multiple times. --html Generate HTML output. --trace cmd Trace the following command. --avg-delay-stats Print average delay stats. --squash-ctx-id Squash context id by substracting engine id from ctx id. --gpu-timeline Draw overall GPU busy timeline. ENDHELP exit 0; } return @_; } sub arg_html { return unless scalar(@_); if ($_[0] eq '--html') { shift @_; $html = 1; } return @_; } sub arg_avg_delay_stats { return unless scalar(@_); if ($_[0] eq '--avg-delay-stats') { shift @_; $avg_delay_stats = 1; } return @_; } sub arg_squash_ctx_id { return unless scalar(@_); if ($_[0] eq '--squash-ctx-id') { shift @_; $squash_context_id = 1; } return @_; } sub arg_gpu_timeline { return unless scalar(@_); if ($_[0] eq '--gpu-timeline') { shift @_; $gpu_timeline = 1; } return @_; } sub arg_trace { my @events = ( 'i915:intel_gpu_freq_change', 'i915:i915_gem_request_add', 'i915:i915_gem_request_submit', 'i915:i915_gem_request_in', 'i915:i915_gem_request_out', 'i915:intel_engine_notify', 'i915:i915_gem_request_wait_begin', 'i915:i915_gem_request_wait_end' ); return unless scalar(@_); if ($_[0] eq '--trace') { shift @_; unshift @_, join(',', @events); unshift @_, ('perf', 'record', '-a', '-c', '1', '-q', '-e'); exec @_; } return @_; } sub arg_max_items { my $val; return unless scalar(@_); if ($_[0] eq '--max-items' or $_[0] eq '-m') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--max-items=(\d+)/) { shift @_; $val = $1; } $max_items = int($val) if defined $val; return @_; } sub arg_zoom_width { my $val; return unless scalar(@_); if ($_[0] eq '--zoom-width-ms' or $_[0] eq '-z') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--zoom-width-ms=(\d+)/) { shift @_; $val = $1; } $width_us = int($val) * 1000 if defined $val; return @_; } sub arg_split_requests { return unless scalar(@_); if ($_[0] eq '--split-requests' or $_[0] eq '-s') { shift @_; $correct_durations = 1; } return @_; } sub arg_ignore_ring { my $val; return unless scalar(@_); if ($_[0] eq '--ignore-ring' or $_[0] eq '-i') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--ignore-ring=(\d+)/) { shift @_; $val = $1; } $ignore_ring{$val} = 1 if defined $val; return @_; } sub arg_skip_box { my $val; return unless scalar(@_); if ($_[0] eq '--skip-box' or $_[0] eq '-x') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--skip-box=(\d+)/) { shift @_; $val = $1; } $skip_box{$val} = 1 if defined $val; return @_; } @args = @ARGV; while (@args) { my $left = scalar(@args); @args = arg_help(@args); @args = arg_html(@args); @args = arg_avg_delay_stats(@args); @args = arg_squash_ctx_id(@args); @args = arg_gpu_timeline(@args); @args = arg_trace(@args); @args = arg_max_items(@args); @args = arg_zoom_width(@args); @args = arg_split_requests(@args); @args = arg_ignore_ring(@args); @args = arg_skip_box(@args); last if $left == scalar(@args); } die if scalar(@args); @ARGV = @args; sub parse_req { my ($line, $tp) = @_; state %cache; $cache{$tp} = qr/(\d+)\.(\d+):.*$tp.*ring=(\d+), ctx=(\d+), seqno=(\d+), global(?:_seqno)?=(\d+)/ unless exists $cache{$tp}; if ($line =~ $cache{$tp}) { return ($1, $2, $3, $4, $5, $6); } else { return undef; } } sub parse_req_hw { my ($line, $tp) = @_; state %cache; $cache{$tp} = qr/(\d+)\.(\d+):.*$tp.*ring=(\d+), ctx=(\d+), seqno=(\d+), global(?:_seqno)?=(\d+), port=(\d+)/ unless exists $cache{$tp}; if ($line =~ $cache{$tp}) { return ($1, $2, $3, $4, $5, $6, $7); } else { return undef; } } sub parse_req_wait_begin { my ($line, $tp) = @_; if ($line =~ /(\d+)\.(\d+):.*i915_gem_request_wait_begin.*ring=(\d+), ctx=(\d+), seqno=(\d+)/) { return ($1, $2, $3, $4, $5); } else { return undef; } } sub parse_notify { my ($line) = @_; if ($line =~ /(\d+)\.(\d+):.*intel_engine_notify.*ring=(\d+), seqno=(\d+)/) { return ($1, $2, $3, $4); } else { return undef; } } sub parse_freq { my ($line) = @_; if ($line =~ /(\d+)\.(\d+):.*intel_gpu_freq_change.*new_freq=(\d+)/) { return ($1, $2, $3); } else { return undef; } } sub us { my ($s, $us) = @_; return $s * 1000000 + $us; } sub db_key { my ($ring, $ctx, $seqno) = @_; return $ring . '/' . $ctx . '/' . $seqno; } sub global_key { my ($ring, $seqno) = @_; return $ring . '/' . $seqno; } sub sanitize_ctx { my ($ctx, $ring) = @_; $ctx = $ctx - $ring if $squash_context_id; if (exists $ctxdb{$ctx}) { return $ctx . '.' . $ctxdb{$ctx}; } else { return $ctx; } } sub ts { my ($us) = @_; my ($d, $h, $m, $s); $s = int($us / 1000000); $us = $us % 1000000; $m = int($s / 60); $s = $s % 60; $h = int($m / 60); $m = $m % 60; $d = 1 + int($h / 24); $h = $h % 24; return sprintf('2017-01-%02u %02u:%02u:%02u.%06u', int($d), int($h), int($m), int($s), int($us)); } # Main input loop - parse lines and build the internal representation of the # trace using a hash of requests and some auxilliary data structures. my $prev_freq = 0; my $prev_freq_ts = 0; my $oldkernelwa = 0; my ($no_queue, $no_in); while (<>) { my ($s, $us, $ring, $ctx, $seqno, $global_seqno, $port); my $freq; my $key; chomp; ($s, $us, $ring, $ctx, $seqno) = parse_req_wait_begin($_); if (defined $s) { my %rw; next if exists $ignore_ring{$ring}; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); next if exists $reqwait{$key}; $rw{'key'} = $key; $rw{'ring'} = $ring; $rw{'seqno'} = $seqno; $rw{'ctx'} = $ctx; $rw{'start'} = us($s, $us); $reqwait{$key} = \%rw; next; } ($s, $us, $ring, $ctx, $seqno, $global_seqno) = parse_req($_, 'i915:i915_gem_request_wait_end'); if (defined $s) { next if exists $ignore_ring{$ring}; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); next unless exists $reqwait{$key}; $reqwait{$key}->{'end'} = us($s, $us); next; } ($s, $us, $ring, $ctx, $seqno, $global_seqno) = parse_req($_, 'i915:i915_gem_request_add'); if (defined $s) { my $orig_ctx = $ctx; next if exists $ignore_ring{$ring}; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); if (exists $queue{$key}) { $ctxdb{$orig_ctx}++; $ctx = sanitize_ctx($orig_ctx, $ring); $key = db_key($ring, $ctx, $seqno); } $queue{$key} = us($s, $us); next; } ($s, $us, $ring, $ctx, $seqno, $global_seqno) = parse_req($_, 'i915:i915_gem_request_submit'); if (defined $s) { next if exists $ignore_ring{$ring}; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); die if exists $submit{$key}; die unless exists $queue{$key}; $submit{$key} = us($s, $us); next; } ($s, $us, $ring, $ctx, $seqno, $global_seqno, $port) = parse_req_hw($_, 'i915:i915_gem_request_in'); if (defined $s) { my %req; next if exists $ignore_ring{$ring}; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); die if exists $db{$key}; if (not exists $queue{$key} and $oldkernelwa) { $no_queue++; next; } die unless exists $queue{$key}; die unless exists $submit{$key}; $req{'start'} = us($s, $us); $req{'ring'} = $ring; $req{'seqno'} = $seqno; $req{'ctx'} = $ctx; $req{'name'} = $ctx . '/' . $seqno; $req{'global'} = $global_seqno; $req{'port'} = $port; $req{'queue'} = $queue{$key}; $req{'submit-delay'} = $submit{$key} - $queue{$key}; $req{'execute-delay'} = $req{'start'} - $submit{$key}; $rings{$ring} = $gid++ unless exists $rings{$ring}; $ringmap{$rings{$ring}} = $ring; $db{$key} = \%req; next; } ($s, $us, $ring, $ctx, $seqno, $global_seqno, $port) = parse_req($_, 'i915:i915_gem_request_out'); if (defined $s) { my $gkey = global_key($ring, $global_seqno); next if exists $ignore_ring{$ring}; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); if (not exists $db{$key} and $oldkernelwa) { $no_in++; next; } die unless exists $db{$key}; die unless exists $db{$key}->{'start'}; die if exists $db{$key}->{'end'}; $db{$key}->{'end'} = us($s, $us); if (exists $notify{$gkey}) { $db{$key}->{'notify'} = $notify{$gkey}; } else { # No notify so far. Maybe it will arrive later which # will be handled in the sanitation loop below. $db{$key}->{'notify'} = $db{$key}->{'end'}; $db{$key}->{'no-notify'} = 1; } $db{$key}->{'duration'} = $db{$key}->{'notify'} - $db{$key}->{'start'}; $db{$key}->{'context-complete-delay'} = $db{$key}->{'end'} - $db{$key}->{'notify'}; next; } ($s, $us, $ring, $seqno) = parse_notify($_); if (defined $s) { next if exists $ignore_ring{$ring}; $notify{global_key($ring, $seqno)} = us($s, $us); next; } ($s, $us, $freq) = parse_freq($_); if (defined $s) { my $cur = us($s, $us); push @freqs, [$prev_freq_ts, $cur, $prev_freq] if $prev_freq; $prev_freq_ts = $cur; $prev_freq = $freq; next; } } # Sanitation pass to fixup up out of order notify and context complete, and to # fine the largest seqno to be used for timeline sorting purposes. my $max_seqno = 0; foreach my $key (keys %db) { my $gkey = global_key($db{$key}->{'ring'}, $db{$key}->{'global'}); die unless exists $db{$key}->{'start'}; $max_seqno = $db{$key}->{'seqno'} if $db{$key}->{'seqno'} > $max_seqno; unless (exists $db{$key}->{'end'}) { # Context complete not received. if (exists $notify{$gkey}) { # No context complete due req merging - use notify. $db{$key}->{'notify'} = $notify{$gkey}; $db{$key}->{'end'} = $db{$key}->{'notify'}; $db{$key}->{'no-end'} = 1; } else { # No notify and no context complete - mark it. $db{$key}->{'no-end'} = 1; $db{$key}->{'end'} = $db{$key}->{'start'} + 999; $db{$key}->{'notify'} = $db{$key}->{'end'}; $db{$key}->{'incomplete'} = 1; } $db{$key}->{'duration'} = $db{$key}->{'notify'} - $db{$key}->{'start'}; $db{$key}->{'context-complete-delay'} = $db{$key}->{'end'} - $db{$key}->{'notify'}; } else { # Notify arrived after context complete. if (exists $db{$key}->{'no-notify'} and exists $notify{$gkey}) { delete $db{$key}->{'no-notify'}; $db{$key}->{'notify'} = $notify{$gkey}; $db{$key}->{'duration'} = $db{$key}->{'notify'} - $db{$key}->{'start'}; $db{$key}->{'context-complete-delay'} = $db{$key}->{'end'} - $db{$key}->{'notify'}; } } } # Fix up incompletes foreach my $key (keys %db) { next unless exists $db{$key}->{'incomplete'}; # End the incomplete batch at the time next one starts my $ring = $db{$key}->{'ring'}; my $ctx = $db{$key}->{'ctx'}; my $seqno = $db{$key}->{'seqno'}; my $next_key; my $i = 1; do { $next_key = db_key($ring, $ctx, $seqno + $i); $i++; } until ((exists $db{$next_key} and not exists $db{$next_key}->{'incomplete'}) or $i > scalar(keys(%db))); # ugly stop hack if (exists $db{$next_key}) { $db{$key}->{'notify'} = $db{$next_key}->{'end'}; $db{$key}->{'end'} = $db{$key}->{'notify'}; $db{$key}->{'duration'} = $db{$key}->{'notify'} - $db{$key}->{'start'}; $db{$key}->{'context-complete-delay'} = $db{$key}->{'end'} - $db{$key}->{'notify'}; } } # GPU time accounting my (%running, %runnable, %queued, %batch_avg, %batch_total_avg, %batch_count); my (%submit_avg, %execute_avg, %ctxsave_avg); my $last_ts = 0; my $first_ts; foreach my $key (sort {$db{$a}->{'start'} <=> $db{$b}->{'start'}} keys %db) { my $ring = $db{$key}->{'ring'}; my $end = $db{$key}->{'end'}; $first_ts = $db{$key}->{'queue'} if not defined $first_ts or $db{$key}->{'queue'} < $first_ts; $last_ts = $end if $end > $last_ts; # Adjust batch start with legacy execlists. # Port == 2 mean batch was merged udring queuing and hasn't actually # been submitted to the gpu until the batch with port < 2 is found. if ($correct_durations and $oldkernelwa and $db{$key}->{'port'} == 2) { my $ctx = $db{$key}->{'ctx'}; my $seqno = $db{$key}->{'seqno'}; my $next_key; my $i = 1; do { $next_key = db_key($ring, $ctx, $seqno + $i); $i++; } until ((exists $db{$next_key} and $db{$next_key}->{'port'} < 2) or $i > scalar(keys(%db))); # ugly stop hack if (exists $db{$next_key}) { $db{$key}->{'start'} = $db{$next_key}->{'start'}; $db{$key}->{'end'} = $db{$next_key}->{'end'}; die if $db{$key}->{'start'} > $db{$key}->{'end'}; } } $running{$ring} += $end - $db{$key}->{'start'} unless exists $db{$key}->{'no-end'}; $runnable{$ring} += $db{$key}->{'execute-delay'}; $queued{$ring} += $db{$key}->{'start'} - $db{$key}->{'execute-delay'} - $db{$key}->{'queue'}; $batch_count{$ring}++; # correct duration of merged batches if ($correct_durations and exists $db{$key}->{'no-end'}) { my $start = $db{$key}->{'start'}; my $ctx = $db{$key}->{'ctx'}; my $seqno = $db{$key}->{'seqno'}; my $next_key; my $i = 1; do { $next_key = db_key($ring, $ctx, $seqno + $i); $i++; } until (exists $db{$next_key} or $i > scalar(keys(%db))); # ugly stop hack # 20us tolerance if (exists $db{$next_key} and $db{$next_key}->{'start'} < $start + 20) { $db{$next_key}->{'start'} = $start + $db{$key}->{'duration'}; $db{$next_key}->{'start'} = $db{$next_key}->{'end'} if $db{$next_key}->{'start'} > $db{$next_key}->{'end'}; $db{$next_key}->{'duration'} = $db{$next_key}->{'notify'} - $db{$next_key}->{'start'}; $end = $db{$key}->{'notify'}; die if $db{$next_key}->{'start'} > $db{$next_key}->{'end'}; } die if $db{$key}->{'start'} > $db{$key}->{'end'}; } $batch_avg{$ring} += $db{$key}->{'duration'}; $batch_total_avg{$ring} += $end - $db{$key}->{'start'}; $submit_avg{$ring} += $db{$key}->{'submit-delay'}; $execute_avg{$ring} += $db{$key}->{'execute-delay'}; $ctxsave_avg{$ring} += $db{$key}->{'end'} - $db{$key}->{'notify'}; } foreach my $ring (keys %batch_avg) { $batch_avg{$ring} /= $batch_count{$ring}; $batch_total_avg{$ring} /= $batch_count{$ring}; $submit_avg{$ring} /= $batch_count{$ring}; $execute_avg{$ring} /= $batch_count{$ring}; $ctxsave_avg{$ring} /= $batch_count{$ring}; } # Calculate engine idle time my %flat_busy; foreach my $gid (sort keys %rings) { my $ring = $ringmap{$rings{$gid}}; my (@s_, @e_); # Extract all GPU busy intervals and sort them. foreach my $key (sort {$db{$a}->{'start'} <=> $db{$b}->{'start'}} keys %db) { next unless $db{$key}->{'ring'} == $ring; push @s_, $db{$key}->{'start'}; push @e_, $db{$key}->{'end'}; die if $db{$key}->{'start'} > $db{$key}->{'end'}; } die unless $#s_ == $#e_; # Flatten the intervals. for my $i (1..$#s_) { last if $i >= @s_; # End of array. die if $e_[$i] < $s_[$i]; if ($s_[$i] <= $e_[$i - 1]) { # Current entry overlaps with the previous one. We need # to merge end of the previous interval from the list # with the start of the current one. if ($e_[$i] >= $e_[$i - 1]) { splice @e_, $i - 1, 1; } else { splice @e_, $i, 1; } splice @s_, $i, 1; # Continue with the same element when list got squashed. redo; } } # Add up all busy times. my $total = 0; for my $i (0..$#s_) { die if $e_[$i] < $s_[$i]; $total = $total + ($e_[$i] - $s_[$i]); } $flat_busy{$ring} = $total; } # Calculate overall GPU idle time my @gpu_intervals; my (@s_, @e_); # Extract all GPU busy intervals and sort them. foreach my $key (sort {$db{$a}->{'start'} <=> $db{$b}->{'start'}} keys %db) { push @s_, $db{$key}->{'start'}; push @e_, $db{$key}->{'end'}; die if $db{$key}->{'start'} > $db{$key}->{'end'}; } die unless $#s_ == $#e_; # Flatten the intervals (copy & paste of the flattening loop above) for my $i (1..$#s_) { last if $i >= @s_; die if $e_[$i] < $s_[$i]; die if $s_[$i] < $s_[$i - 1]; if ($s_[$i] <= $e_[$i - 1]) { if ($e_[$i] >= $e_[$i - 1]) { splice @e_, $i - 1, 1; } else { splice @e_, $i, 1; } splice @s_, $i, 1; redo; } } # Add up all busy times. my $total = 0; for my $i (0..$#s_) { die if $e_[$i] < $s_[$i]; $total = $total + ($e_[$i] - $s_[$i]); } # Generate data for the GPU timeline if requested if ($gpu_timeline) { for my $i (0..$#s_) { push @gpu_intervals, [ $s_[$i], $e_[$i] ]; } } $flat_busy{'gpu-busy'} = $total / ($last_ts - $first_ts) * 100.0; $flat_busy{'gpu-idle'} = (1.0 - $total / ($last_ts - $first_ts)) * 100.0; # Add up all request waits per engine my %reqw; foreach my $key (keys %reqwait) { $reqw{$reqwait{$key}->{'ring'}} += $reqwait{$key}->{'end'} - $reqwait{$key}->{'start'}; } say sprintf('GPU: %.2f%% idle, %.2f%% busy', $flat_busy{'gpu-idle'}, $flat_busy{'gpu-busy'}) unless $html; print < i915 GT timeline

pink = requests executing on the GPU
grey = runnable requests waiting for a slot on GPU
blue = requests waiting on fences and dependencies before they are runnable

Boxes are in format 'ctx-id/seqno'.

Use Ctrl+scroll-action to zoom-in/out and scroll-action or dragging to move around the timeline.

GPU idle: $flat_busy{'gpu-idle'}%
GPU busy: $flat_busy{'gpu-busy'}%

ENDHTML