#! /usr/bin/perl # # Copyright © 2017 Intel Corporation # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), # to deal in the Software without restriction, including without limitation # the rights to use, copy, modify, merge, publish, distribute, sublicense, # and/or sell copies of the Software, and to permit persons to whom the # Software is furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice (including the next # paragraph) shall be included in all copies or substantial portions of the # Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS # IN THE SOFTWARE. # use strict; use warnings; use 5.010; my $gid = 0; my (%db, %queue, %submit, %notify, %rings, %ctxdb, %ringmap, %reqwait, %ctxtimelines); my @freqs; my $max_items = 3000; my $width_us = 32000; my $correct_durations = 0; my %ignore_ring; my %skip_box; my $html = 0; my $trace = 0; my $avg_delay_stats = 0; my $gpu_timeline = 0; my $colour_contexts = 0; my @args; sub arg_help { return unless scalar(@_); if ($_[0] eq '--help' or $_[0] eq '-h') { shift @_; print <trace.log This file in turn should be piped into this tool which will generate some statistics out of it, or if --html was given HTML output. HTML can be viewed from a directory containing the 'vis' JavaScript module. On Ubuntu this can be installed like this: apt-get install npm npm install vis Usage: trace.pl output-file --help / -h This help text --max-items=num / -m num Maximum number of boxes to put on the timeline. More boxes means more work for the JavaScript engine in the browser. --zoom-width-ms=ms / -z ms Width of the initial timeline zoom --split-requests / -s Try to split out request which were submitted together due coalescing in the driver. May not be 100% accurate and may influence the per-engine statistics so use with care. --ignore-ring=id / -i id Ignore ring with the numerical id when parsing the log (enum intel_engine_id). Can be given multiple times. --skip-box=name / -x name Do not put a certain type of a box on the timeline. One of: queue, ready, execute and ctxsave. Can be given multiple times. --html Generate HTML output. --trace cmd Trace the following command. --avg-delay-stats Print average delay stats. --gpu-timeline Draw overall GPU busy timeline. --colour-contexts / -c Use different colours for different context execution boxes. ENDHELP exit 0; } return @_; } sub arg_html { return unless scalar(@_); if ($_[0] eq '--html') { shift @_; $html = 1; } return @_; } sub arg_avg_delay_stats { return unless scalar(@_); if ($_[0] eq '--avg-delay-stats') { shift @_; $avg_delay_stats = 1; } return @_; } sub arg_gpu_timeline { return unless scalar(@_); if ($_[0] eq '--gpu-timeline') { shift @_; $gpu_timeline = 1; } return @_; } sub arg_trace { my @events = ( 'i915:intel_gpu_freq_change', 'i915:i915_request_add', 'i915:i915_request_submit', 'i915:i915_request_in', 'i915:i915_request_out', 'i915:intel_engine_notify', 'i915:i915_request_wait_begin', 'i915:i915_request_wait_end' ); return unless scalar(@_); if ($_[0] eq '--trace') { shift @_; unshift @_, join(',', @events); unshift @_, ('perf', 'record', '-a', '-c', '1', '-q', '-e'); exec @_; } return @_; } sub arg_max_items { my $val; return unless scalar(@_); if ($_[0] eq '--max-items' or $_[0] eq '-m') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--max-items=(\d+)/) { shift @_; $val = $1; } $max_items = int($val) if defined $val; return @_; } sub arg_zoom_width { my $val; return unless scalar(@_); if ($_[0] eq '--zoom-width-ms' or $_[0] eq '-z') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--zoom-width-ms=(\d+)/) { shift @_; $val = $1; } $width_us = int($val) * 1000 if defined $val; return @_; } sub arg_split_requests { return unless scalar(@_); if ($_[0] eq '--split-requests' or $_[0] eq '-s') { shift @_; $correct_durations = 1; } return @_; } sub arg_ignore_ring { my $val; return unless scalar(@_); if ($_[0] eq '--ignore-ring' or $_[0] eq '-i') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--ignore-ring=(\d+)/) { shift @_; $val = $1; } $ignore_ring{$val} = 1 if defined $val; return @_; } sub arg_skip_box { my $val; return unless scalar(@_); if ($_[0] eq '--skip-box' or $_[0] eq '-x') { shift @_; $val = shift @_; } elsif ($_[0] =~ /--skip-box=(\d+)/) { shift @_; $val = $1; } $skip_box{$val} = 1 if defined $val; return @_; } sub arg_colour_contexts { return unless scalar(@_); if ($_[0] eq '--colour-contexts' or $_[0] eq '--color-contexts' or $_[0] eq '-c') { shift @_; $colour_contexts = 1; } return @_; } @args = @ARGV; while (@args) { my $left = scalar(@args); @args = arg_help(@args); @args = arg_html(@args); @args = arg_avg_delay_stats(@args); @args = arg_gpu_timeline(@args); @args = arg_trace(@args); @args = arg_max_items(@args); @args = arg_zoom_width(@args); @args = arg_split_requests(@args); @args = arg_ignore_ring(@args); @args = arg_skip_box(@args); @args = arg_colour_contexts(@args); last if $left == scalar(@args); } die if scalar(@args); @ARGV = @args; sub db_key { my ($ring, $ctx, $seqno) = @_; return $ring . '/' . $ctx . '/' . $seqno; } sub global_key { my ($ring, $seqno) = @_; return $ring . '/' . $seqno; } sub sanitize_ctx { my ($ctx, $ring) = @_; if (exists $ctxdb{$ctx} and $ctxdb{$ctx} > 1) { return $ctx . '.' . $ctxdb{$ctx}; } else { return $ctx; } } # Main input loop - parse lines and build the internal representation of the # trace using a hash of requests and some auxilliary data structures. my $prev_freq = 0; my $prev_freq_ts = 0; while (<>) { my @fields; my $tp_name; my %tp; my ($time, $ctx, $ring, $seqno, $orig_ctx, $key); chomp; @fields = split ' '; chop $fields[3]; $time = int($fields[3] * 1000000.0 + 0.5); $tp_name = $fields[4]; splice @fields, 0, 5; foreach my $f (@fields) { my ($k, $v); next unless $f =~ m/=/; ($k, $v) = ($`, $'); $k = 'global' if $k eq 'global_seqno'; chop $v if substr($v, -1, 1) eq ','; $tp{$k} = $v; $tp{'ring'} = $tp{'engine'} if $k eq 'engine'; } next if exists $tp{'ring'} and exists $ignore_ring{$tp{'ring'}}; if (exists $tp{'ring'} and exists $tp{'seqno'}) { $ring = $tp{'ring'}; $seqno = $tp{'seqno'}; if (exists $tp{'ctx'}) { $ctx = $tp{'ctx'}; $orig_ctx = $ctx; $ctx = sanitize_ctx($ctx, $ring); $key = db_key($ring, $ctx, $seqno); } } if ($tp_name eq 'i915:i915_request_wait_begin:') { my %rw; next if exists $reqwait{$key}; $rw{'key'} = $key; $rw{'ring'} = $ring; $rw{'seqno'} = $seqno; $rw{'ctx'} = $ctx; $rw{'start'} = $time; $reqwait{$key} = \%rw; } elsif ($tp_name eq 'i915:i915_request_wait_end:') { next unless exists $reqwait{$key}; $reqwait{$key}->{'end'} = $time; } elsif ($tp_name eq 'i915:i915_request_add:') { if (exists $queue{$key}) { $ctxdb{$orig_ctx}++; $ctx = sanitize_ctx($orig_ctx, $ring); $key = db_key($ring, $ctx, $seqno); } else { $ctxdb{$orig_ctx} = 1; } $queue{$key} = $time; } elsif ($tp_name eq 'i915:i915_request_submit:') { die if exists $submit{$key}; die unless exists $queue{$key}; $submit{$key} = $time; } elsif ($tp_name eq 'i915:i915_request_in:') { my %req; # preemption delete $db{$key} if exists $db{$key}; die unless exists $queue{$key}; die unless exists $submit{$key}; $req{'start'} = $time; $req{'ring'} = $ring; $req{'seqno'} = $seqno; $req{'ctx'} = $ctx; $ctxtimelines{$ctx . '/' . $ring} = 1; $req{'name'} = $ctx . '/' . $seqno; $req{'global'} = $tp{'global'}; $req{'port'} = $tp{'port'}; $req{'queue'} = $queue{$key}; $req{'submit'} = $submit{$key}; $rings{$ring} = $gid++ unless exists $rings{$ring}; $ringmap{$rings{$ring}} = $ring; $db{$key} = \%req; } elsif ($tp_name eq 'i915:i915_request_out:') { my $gkey = global_key($ring, $tp{'global'}); die unless exists $db{$key}; die unless exists $db{$key}->{'start'}; die if exists $db{$key}->{'end'}; $db{$key}->{'end'} = $time; $db{$key}->{'notify'} = $notify{$gkey} if exists $notify{$gkey}; } elsif ($tp_name eq 'i915:intel_engine_notify:') { my $gkey = global_key($ring, $seqno); $notify{$gkey} = $time unless exists $notify{$gkey}; } elsif ($tp_name eq 'i915:intel_gpu_freq_change:') { push @freqs, [$prev_freq_ts, $time, $prev_freq] if $prev_freq; $prev_freq_ts = $time; $prev_freq = $tp{'new_freq'}; } } # Sanitation pass to fixup up out of order notify and context complete, and to # find the largest seqno to be used for timeline sorting purposes. my $max_seqno = 0; foreach my $key (keys %db) { my $gkey = global_key($db{$key}->{'ring'}, $db{$key}->{'global'}); die unless exists $db{$key}->{'start'}; $max_seqno = $db{$key}->{'seqno'} if $db{$key}->{'seqno'} > $max_seqno; # Notify arrived after context complete? $db{$key}->{'notify'} = $notify{$gkey} if not exists $db{$key}->{'notify'} and exists $notify{$gkey}; # No notify but we have end? $db{$key}->{'notify'} = $db{$key}->{'end'} if exists $db{$key}->{'end'} and not exists $db{$key}->{'notify'}; # If user interrupt arrived out of order push it back to be no later # than request out. if (exists $db{$key}->{'end'} and exists $db{$key}->{'notify'} and $db{$key}->{'notify'} > $db{$key}->{'end'}) { $db{$key}->{'notify'} = $db{$key}->{'end'}; } } my $key_count = scalar(keys %db); my %engine_timelines; sub sortEngine { my $as = $db{$a}->{'global'}; my $bs = $db{$b}->{'global'}; my $val; $val = $as <=> $bs; die if $val == 0; return $val; } sub get_engine_timeline { my ($ring) = @_; my @timeline; return $engine_timelines{$ring} if exists $engine_timelines{$ring}; @timeline = grep { $db{$_}->{'ring'} eq $ring } keys %db; # FIXME seqno restart @timeline = sort sortEngine @timeline; $engine_timelines{$ring} = \@timeline; return \@timeline; } # Fix up coalesced requests by ending them either when the following same # context request with known end ends, or when a different context starts. foreach my $gid (sort keys %rings) { my $ring = $ringmap{$rings{$gid}}; my $timeline = get_engine_timeline($ring); my $last_complete = -1; my $last_ctx = -1; my $complete; foreach my $pos (0..$#{$timeline}) { my $key = @{$timeline}[$pos]; my ($ctx, $end); next if exists $db{$key}->{'end'}; $db{$key}->{'no-end'} = 1; $ctx = $db{$key}->{'ctx'}; if ($pos > $last_complete or $ctx != $last_ctx) { my $next = $pos; undef $complete; while ($next < $#{$timeline}) { my $next_key = ${$timeline}[++$next]; if ($ctx == $db{$next_key}->{'ctx'} and exists $db{$next_key}->{'end'}) { $last_ctx = $db{$next_key}->{'ctx'}; $last_complete = $next; $complete = $next_key; last; } } } if (defined $complete) { if ($ctx == $db{$complete}->{'ctx'}) { $end = $db{$complete}->{'end'}; } else { $end = $db{$complete}->{'start'}; } } else { # No next submission. Use notify if available or give up. if (exists $db{$key}->{'notify'}) { $end = $db{$key}->{'notify'}; } else { $end = $db{$key}->{'start'}; $db{$key}->{'incomplete'} = 1; } } unless (exists $db{$key}->{'notify'}) { $db{$key}->{'notify'} = $end; $db{$key}->{'no-notify'} = 1; } $db{$key}->{'end'} = $end; } } sub sortStart { my $as = $db{$a}->{'start'}; my $bs = $db{$b}->{'start'}; my $val; $val = $as <=> $bs; $val = $a cmp $b if $val == 0; return $val; } my $re_sort = 1; my @sorted_keys; sub maybe_sort_keys { if ($re_sort) { @sorted_keys = sort sortStart keys %db; $re_sort = 0; die "Database changed size?!" unless scalar(@sorted_keys) == $key_count; } } maybe_sort_keys(); my %ctx_timelines; sub sortContext { my $as = $db{$a}->{'seqno'}; my $bs = $db{$b}->{'seqno'}; my $val; $val = $as <=> $bs; die if $val == 0; return $val; } sub get_ctx_timeline { my ($ctx, $ring, $key) = @_; my @timeline; return $ctx_timelines{$key} if exists $ctx_timelines{$key}; @timeline = grep { $db{$_}->{'ring'} eq $ring and $db{$_}->{'ctx'} == $ctx } @sorted_keys; # FIXME seqno restart @timeline = sort sortContext @timeline; $ctx_timelines{$key} = \@timeline; return \@timeline; } # Split out merged batches if requested. if ($correct_durations) { # Shift !port0 requests start time to after the previous context on the # same timeline has finished. foreach my $gid (sort keys %rings) { my $ring = $ringmap{$rings{$gid}}; my $timeline = get_engine_timeline($ring); my $complete; foreach my $pos (0..$#{$timeline}) { my $key = @{$timeline}[$pos]; my $prev = $complete; my $pkey; $complete = $key unless exists $db{$key}->{'no-end'}; $pkey = $complete; next if $db{$key}->{'port'} == 0; $pkey = $prev if $complete eq $key; die unless defined $pkey; $db{$key}->{'start'} = $db{$pkey}->{'end'}; $db{$key}->{'start'} = $db{$pkey}->{'notify'} if $db{$key}->{'start'} > $db{$key}->{'end'}; die if $db{$key}->{'start'} > $db{$key}->{'end'}; $re_sort = 1; } } maybe_sort_keys(); # Batch with no-end (no request_out) means it was submitted as part of # coalesced context. This means it's start time should be set to the end # time of a following request on this context timeline. foreach my $tkey (sort keys %ctxtimelines) { my ($ctx, $ring) = split '/', $tkey; my $timeline = get_ctx_timeline($ctx, $ring, $tkey); my $last_complete = -1; my $complete; foreach my $pos (0..$#{$timeline}) { my $key = @{$timeline}[$pos]; my $next_key; next unless exists $db{$key}->{'no-end'}; last if $pos == $#{$timeline}; # Shift following request to start after the current one $next_key = ${$timeline}[$pos + 1]; if (exists $db{$key}->{'notify'}) { $db{$next_key}->{'engine-start'} = $db{$next_key}->{'start'}; $db{$next_key}->{'start'} = $db{$key}->{'notify'}; $re_sort = 1; } } } } maybe_sort_keys(); # GPU time accounting my (%running, %runnable, %queued, %batch_avg, %batch_total_avg, %batch_count); my (%submit_avg, %execute_avg, %ctxsave_avg); my $last_ts = 0; my $first_ts; my $min_ctx; foreach my $key (@sorted_keys) { my $ring = $db{$key}->{'ring'}; my $end = $db{$key}->{'end'}; my $start = $db{$key}->{'start'}; my $engine_start = $db{$key}->{'engine_start'}; my $notify = $db{$key}->{'notify'}; $first_ts = $db{$key}->{'queue'} if not defined $first_ts or $db{$key}->{'queue'} < $first_ts; $last_ts = $end if $end > $last_ts; $min_ctx = $db{$key}->{'ctx'} if not defined $min_ctx or $db{$key}->{'ctx'} < $min_ctx; unless (exists $db{$key}->{'no-end'}) { $db{$key}->{'context-complete-delay'} = $end - $notify; } else { $db{$key}->{'context-complete-delay'} = 0; } $engine_start = $db{$key}->{'start'} unless defined $engine_start; $db{$key}->{'execute-delay'} = $engine_start - $db{$key}->{'submit'}; $db{$key}->{'submit-delay'} = $db{$key}->{'submit'} - $db{$key}->{'queue'}; unless (exists $db{$key}->{'no-notify'}) { $db{$key}->{'duration'} = $notify - $start; } else { $db{$key}->{'duration'} = 0; } $running{$ring} += $end - $start if $correct_durations or not exists $db{$key}->{'no-end'}; $runnable{$ring} += $db{$key}->{'execute-delay'}; $queued{$ring} += $start - $db{$key}->{'execute-delay'} - $db{$key}->{'queue'}; $batch_count{$ring}++; $batch_avg{$ring} += $db{$key}->{'duration'}; $batch_total_avg{$ring} += $end - $start; $submit_avg{$ring} += $db{$key}->{'submit-delay'}; $execute_avg{$ring} += $db{$key}->{'execute-delay'}; $ctxsave_avg{$ring} += $db{$key}->{'context-complete-delay'}; } foreach my $ring (sort keys %batch_avg) { $batch_avg{$ring} /= $batch_count{$ring}; $batch_total_avg{$ring} /= $batch_count{$ring}; $submit_avg{$ring} /= $batch_count{$ring}; $execute_avg{$ring} /= $batch_count{$ring}; $ctxsave_avg{$ring} /= $batch_count{$ring}; } # Calculate engine idle time my %flat_busy; foreach my $gid (sort keys %rings) { my $ring = $ringmap{$rings{$gid}}; my (@s_, @e_); # Extract all GPU busy intervals and sort them. foreach my $key (@sorted_keys) { next unless $db{$key}->{'ring'} eq $ring; push @s_, $db{$key}->{'start'}; push @e_, $db{$key}->{'end'}; die if $db{$key}->{'start'} > $db{$key}->{'end'}; } die unless $#s_ == $#e_; # Flatten the intervals. for my $i (1..$#s_) { last if $i >= @s_; # End of array. die if $e_[$i] < $s_[$i]; if ($s_[$i] <= $e_[$i - 1]) { # Current entry overlaps with the previous one. We need # to merge end of the previous interval from the list # with the start of the current one. if ($e_[$i] >= $e_[$i - 1]) { splice @e_, $i - 1, 1; } else { splice @e_, $i, 1; } splice @s_, $i, 1; # Continue with the same element when list got squashed. redo; } } # Add up all busy times. my $total = 0; for my $i (0..$#s_) { die if $e_[$i] < $s_[$i]; $total = $total + ($e_[$i] - $s_[$i]); } $flat_busy{$ring} = $total; } # Calculate overall GPU idle time my @gpu_intervals; my (@s_, @e_); # Extract all GPU busy intervals and sort them. foreach my $key (@sorted_keys) { push @s_, $db{$key}->{'start'}; push @e_, $db{$key}->{'end'}; die if $db{$key}->{'start'} > $db{$key}->{'end'}; } die unless $#s_ == $#e_; # Flatten the intervals (copy & paste of the flattening loop above) for my $i (1..$#s_) { last if $i >= @s_; die if $e_[$i] < $s_[$i]; die if $s_[$i] < $s_[$i - 1]; if ($s_[$i] <= $e_[$i - 1]) { if ($e_[$i] >= $e_[$i - 1]) { splice @e_, $i - 1, 1; } else { splice @e_, $i, 1; } splice @s_, $i, 1; redo; } } # Add up all busy times. my $total = 0; for my $i (0..$#s_) { die if $e_[$i] < $s_[$i]; $total = $total + ($e_[$i] - $s_[$i]); } # Generate data for the GPU timeline if requested if ($gpu_timeline) { for my $i (0..$#s_) { push @gpu_intervals, [ $s_[$i], $e_[$i] ]; } } $flat_busy{'gpu-busy'} = $total / ($last_ts - $first_ts) * 100.0; $flat_busy{'gpu-idle'} = (1.0 - $total / ($last_ts - $first_ts)) * 100.0; # Add up all request waits per engine my %reqw; foreach my $key (keys %reqwait) { $reqw{$reqwait{$key}->{'ring'}} += $reqwait{$key}->{'end'} - $reqwait{$key}->{'start'}; } say sprintf('GPU: %.2f%% idle, %.2f%% busy', $flat_busy{'gpu-idle'}, $flat_busy{'gpu-busy'}) unless $html; my $timeline_text = $colour_contexts ? 'per context coloured shading like' : 'box shading like'; my %ctx_colours; my $ctx_table; sub generate_ctx_table { my @states = ('queue', 'ready', 'execute', 'ctxsave', 'incomplete'); my $max_show = 6; my (@ctxts, @disp_ctxts); my $step; if( $colour_contexts ) { @ctxts = sort keys %ctxdb; } else { @ctxts = ($min_ctx); } # Limit number of shown context examples $step = int(scalar(@ctxts) / $max_show); if ($step) { foreach my $i (0..$#ctxts) { push @disp_ctxts, $ctxts[$i] unless $i % $step; last if scalar(@disp_ctxts) == $max_show; } } else { @disp_ctxts = @ctxts; } $ctx_table .= ''; foreach my $ctx (@disp_ctxts) { $ctx_table .= "\n"; $ctx_table .= " \n" if $colour_contexts; foreach my $state (@states) { $ctx_table .= " \n"; } $ctx_table .= "\n"; } $ctx_table .= '
Context $ctx
" . uc($state) . "
'; } sub generate_ctx_colours { my $num_ctx = keys %ctxdb; my $i = 0; foreach my $ctx (sort keys %ctxdb) { $ctx_colours{$ctx} = int(360 / $num_ctx * $i++); } } generate_ctx_colours() if $html and $colour_contexts; generate_ctx_table() if $html; print < i915 GT timeline

Timeline request view is $timeline_text:
$ctx_table QUEUE = requests executing on the GPU
READY = runnable requests waiting for a slot on GPU
EXECUTE = requests waiting on fences and dependencies before they are runnable
CTXSAVE = GPU saving the context image
INCOMPLETE = request of unknown completion time

Boxes are in format 'ctx-id/seqno'.

Use Ctrl+scroll-action to zoom-in/out and scroll-action or dragging to move around the timeline.

GPU idle: $flat_busy{'gpu-idle'}%
GPU busy: $flat_busy{'gpu-busy'}%

ENDHTML