#!/usr/bin/perl
# Copyright 2024 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

=head1 Description

This script bisects a performance regression down to the commit responsible for
it. It repeatedly compiles d8 at the start, middle and end of the suspect range,
benchmarks the three binaries, and narrows the range based on a Wilcoxon
rank-sum test until a single commit is left.

=head1 Install

You'll need to install Perl's Statistics::Test::WilcoxonRankSum module. You can
use these commands to do so:

    sudo apt install cpanminus                  # Install Perl's package manager
    cpanm -f Statistics::Test::WilcoxonRankSum  # Install the module

=head1 Usage

    perl bisect.pl --start <start_hash> --end <end_hash> --run <benchmark_command>

Options:

    --start <start_hash>    At which commit to start the bisect.
    --end <end_hash>        At which commit to end the bisect.
    --range <start>..<end>  Alternative to the above start/end.
    --run <command>         The command line to run the benchmark. Use V8 as a placeholder in there:
                            it will be replaced by the path to the actual V8 binary being tested.
    --run-dir <dir>         The directory from which benchmarks should be run: this script will `cd`
                            there to run the benchmarks.
    --compile <dir>         Which directory to compile. The default is `out/x64.release`
                            (`out/arm64.release` on arm64).
    --bisect-dir <dir>      Where to store the per-commit builds and the logs. The default is `out/bisect`.
    --perl-time             If specified, use Perl's timer to get the performance at each commit.
    --score-regex <regex>   If specified, use <regex> to extract the performance from the benchmark's
                            output. The regex is assumed to use a capture group to extract the result.
                            Either --perl-time or --score-regex should always be passed.
    --total-score           If specified, expect a JetStream style "Total-Score: xxx" output.
    --average-score         If specified, expect a JetStream style "Average-Score: xxx" output.
    --first-score           If specified, expect a JetStream style "First-Score: xxx" output.
    --nb-run <num>          How many repetitions of the benchmark to run for each commit. Default is 30.
    --retry <num>           How many times to retry the benchmark at a given commit when the bisect fails
                            because it doesn't find a statistical difference between the beginning and
                            the end of the range.
    --verbose/--noverbose   Enables or disables verbose output. Default is true.

=head1 Example

I bisected the regressions at https://chromeperf.appspot.com/group_report?rev=85409 with
(the offending CL was obvious, but this was just to showcase this script):

    perl bisect.pl --start b71cdae --end 54d255a --run "V8 --future cli.js -- ML" --run-dir v8-perf/v8-perf/benchmarks/JetStream2 --score-regex "Average-Score: (\d+)"

And the regression at https://chromeperf.appspot.com/group_report?bug_id=1409635&project_id=chromium with:

    perl bisect.pl --start aa7b016 --end 0033691 --run "V8 run.js -- spread_literal/es6" --run-dir test/js-perf-test/SixSpeed --score-regex ": (\d+)"

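For a purely illustrative invocation (the hashes and benchmark script below are
placeholders, not a real run), --range and --perl-time can be used instead of
--start/--end and --score-regex:

    perl bisect.pl --range <good_hash>..<bad_hash> --run "V8 my_benchmark.js" --perl-time --nb-run 50
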
=cut

use strict;
use warnings;
use feature qw(say);
use autodie qw(open close);

use List::Util qw(sum max);
use Statistics::Test::WilcoxonRankSum;
use Term::ANSIColor;
use Time::HiRes qw(time);
use Getopt::Long;
use File::Path qw(make_path remove_tree);
use Cwd qw(getcwd abs_path);

$| = 1; # auto-flushing

my $START = "";       # first commit
my $END = "";         # last commit
my $RANGE = "";
my $RUN_CMD = "";     # command to run to check perf
my $RUN_CMD_DIR = ""; # directory to run perf command from
my $COMPILE_DIR = `arch` eq "arm64\n" ? "out/arm64.release" : "out/x64.release"; # Thing to compile
my $BISECT_DIR = "out/bisect";
my $PERL_TIME = 0;    # If true, use Perl's timer for the comparison
my $SCORE_REGEX = ""; # If non-empty, use to extract score
my $TOTAL_SCORE_REGEX = "";
my $FIRST_SCORE_REGEX = "";
my $AVERAGE_SCORE_REGEX = "";
my $NB_RUN = 30;      # number of runs per measure
my $RETRY = 3;        # if no statistical difference is found, retry $RETRY times.
my $VERBOSE = 1;

my $PROBA_THRESHOLD = 0.05; # Wilcoxon's threshold

GetOptions("start=s"        => \$START,
           "end=s"          => \$END,
           "range=s"        => \$RANGE,
           "run=s"          => \$RUN_CMD,
           "run-dir=s"      => \$RUN_CMD_DIR,
           "compile=s"      => \$COMPILE_DIR,
           "bisect-dir=s"   => \$BISECT_DIR,
           "perl-time"      => \$PERL_TIME,
           "score-regex=s"  => \$SCORE_REGEX,
           "total-score"    => \$TOTAL_SCORE_REGEX,
           "average-score"  => \$AVERAGE_SCORE_REGEX,
           "first-score"    => \$FIRST_SCORE_REGEX,
           "nb-run=i"       => \$NB_RUN,
           "retry=i"        => \$RETRY,
           "verbose!"       => \$VERBOSE);

my $LOG_FILE = abs_path("$BISECT_DIR/logs.txt");

sub trace {
  if ($VERBOSE) {
    print @_;
  }
  # Append rather than truncate, so that earlier messages and the build logs
  # redirected into $LOG_FILE are kept.
  open my $FH, '>>', $LOG_FILE;
  print $FH @_;
  close $FH;
}

sub usage {
  say "Usage:\n\t./$0 --start <start_commit> --end <end_commit> --run <run_cmd> [--perl-time|--score-regex <regex>]";
  exit();
}

if (! -d $BISECT_DIR) {
  make_path $BISECT_DIR;
}
if (-f $LOG_FILE) {
  unlink $LOG_FILE;
}

if (!$START && !$END && $RANGE) {
  ($START, $END) = split('\.\.', $RANGE);
}

trace("Checking parameters...\n");
if (!$START || !$END || !$RUN_CMD) {
  my @missings = map { $_->[1] } grep { !$_->[0] }
      [$START, 'start'], [$END, 'end'], [$RUN_CMD, 'run'];
  say "Missing mandatory argument: ", join(", ", map { "--$_" } @missings);
  usage();
}
if (!-d $COMPILE_DIR) {
  say "Compile directory $COMPILE_DIR does not exist.";
  usage();
}
if ($RUN_CMD_DIR ne "" && !-d $RUN_CMD_DIR) {
  say "Run directory $RUN_CMD_DIR does not exist.";
  usage();
}
if ($TOTAL_SCORE_REGEX && !$SCORE_REGEX) {
  $SCORE_REGEX = "Total-Score: (\\d+\\.?\\d*)";
}
if ($AVERAGE_SCORE_REGEX && !$SCORE_REGEX) {
  $SCORE_REGEX = "Average-Score: (\\d+\\.?\\d*)";
}
if ($FIRST_SCORE_REGEX && !$SCORE_REGEX) {
  $SCORE_REGEX = "First-Score: (\\d+\\.?\\d*)";
}
if (!$PERL_TIME && !$SCORE_REGEX) {
  say "One of --perl-time and --score-regex must be specified.";
  usage();
}
if ($PERL_TIME && $SCORE_REGEX) {
  say "Only one of --perl-time and --score-regex can be specified.";
  usage();
}

trace("Starting bisect...\n");

my ($start, $end) = ($START, $END);
my $compile_dir = getcwd();

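# Main bisection loop: at each iteration, compile d8 at the start, middle and
# end of the current range, benchmark the three binaries, and use a Wilcoxon
# rank-sum test to decide which half of the range contains the regression.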
while (1) {
  chdir $compile_dir;

  my $mid = get_middle_commit($start, $end);
  if (!$mid) {
    # $start and $end are consecutive commits.
    say colored("Bisection done.", 'bold'), " Regression happened at ", colored($end, 'red'), ": ", colored(get_commit_title($end), "italic");
    say "(previous commit: $start)";
    exit(1);
  }

  trace("\nBisecting between ", colored($start, 'green'), " and ",
        colored($end, "red"), " (middle = ", colored($mid, 'yellow'), ")\n");

  trace(colored(" Compiling...\n", 'bold'));
  my $start_dir = "$BISECT_DIR/$start";
  my $mid_dir = "$BISECT_DIR/$mid";
  my $end_dir = "$BISECT_DIR/$end";
  compile($start, "$start_dir", 'green');
  compile($mid, "$mid_dir", 'yellow');
  compile($end, "$end_dir", 'red');

  my $start_bin = abs_path("$start_dir/d8");
  my $mid_bin = abs_path("$mid_dir/d8");
  my $end_bin = abs_path("$end_dir/d8");
  if ($RUN_CMD_DIR ne "") {
    chdir $RUN_CMD_DIR;
  }

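  # The benchmarking and analysis below are re-run (via `goto run`) up to
  # $RETRY times when the statistical results are inconclusive.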
  my $retry = 0;
  run:
  {
    trace(colored(" Running...\n", 'bold'));
    my %scores;
    for my $i (1 .. $NB_RUN) {
      for my $bin ($start_bin, $mid_bin, $end_bin) {
        # "\r\033[2K" moves back to the start of the line and clears it.
        trace("\r\033[2K $i/$NB_RUN: $bin");
        my $time = time();
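        # Replace the leading "V8" placeholder in the benchmark command with
        # the path to the d8 binary currently being measured.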
        my $cmd = $RUN_CMD =~ s/^V8/$bin/r;
        my $out = `$cmd`;
        if ($? != 0) {
          say "\n==== Error:";
          say "`$cmd` exited with $?:";
          say "====";
          say "$out";
          exit 1;
        }
        if ($PERL_TIME) {
          push @{$scores{$bin}}, time() - $time;
        } else {
          my ($score) = $out =~ /$SCORE_REGEX/;
          if (!defined $score) {
            say "\n==== Error:";
            say "`$cmd` did not return output matching $SCORE_REGEX:";
            say "====";
            say "$out";
            exit 1;
          }
          push @{$scores{$bin}}, $score;
        }
      }
    }
    trace("\r\033[2K All runs completed.\n");

    trace(colored(" Analyzing...\n", 'bold'));
    my ($start_avg, $start_stdev) = avg_and_stdev($scores{$start_bin});
    my ($mid_avg, $mid_stdev) = avg_and_stdev($scores{$mid_bin});
    my ($end_avg, $end_stdev) = avg_and_stdev($scores{$end_bin});
    trace(" Times:\n");
    trace(" start: $start_avg +- $start_stdev\n");
    trace(" mid: $mid_avg +- $mid_stdev\n");
    trace(" end: $end_avg +- $end_stdev\n");

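    # Compare the measurements pairwise with a Wilcoxon rank-sum test. A
    # probability below $PROBA_THRESHOLD means the two binaries perform
    # differently:
    #   - both start-mid and mid-end differ: two distinct regressions (not
    #     supported by this script);
    #   - neither differs: no signal, so re-run;
    #   - exactly one differs: narrow the bisection range to that half.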
    my $proba_start_mid = wilcoxon($scores{$start_bin}, $scores{$mid_bin});
    my $proba_mid_end = wilcoxon($scores{$mid_bin}, $scores{$end_bin});
    trace(" Proba:\n");
    trace(" start-mid: ", color_proba($proba_start_mid), "\n");
    trace(" mid-end: ", color_proba($proba_mid_end), "\n");
    if ($proba_start_mid < $PROBA_THRESHOLD && $proba_mid_end < $PROBA_THRESHOLD) {
      if ($retry++ == $RETRY) {
        say "Probabilities are $proba_start_mid and $proba_mid_end, which would indicate 2 regressions rather than 1; this is not supported by this script. Try to manually narrow the bisection range and rerun the script. Current range: $start - $mid - $end.";
        exit 1;
      } else {
        trace(" Two statistical differences (instead of one), re-running.\n");
        goto run;
      }
    }

    if ($proba_start_mid > $PROBA_THRESHOLD && $proba_mid_end > $PROBA_THRESHOLD) {
      if ($retry++ == $RETRY) {
        say "No statistical difference between $start, $mid and $end (after $RETRY retries). Aborting.";
        exit 1;
      } else {
        trace(" No statistical difference, re-running.\n");
        goto run;
      }
    }

    if ($proba_start_mid < $PROBA_THRESHOLD) {
      ($start, $end) = ($start, $mid);
    } else {
      ($start, $end) = ($mid, $end);
    }
  }
}

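# Builds d8 at commit $commit and copies the resulting binary (together with
# icudtl.dat and snapshot_blob.bin) into $dst. If $dst already contains a d8
# binary, it is reused instead of being rebuilt.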
sub compile {
  my ($commit, $dst, $color) = @_;
  my $commit_title = get_commit_title($commit);
  trace(" Compiling at ", colored($commit, $color), ": ", colored($commit_title, 'italic'), "\n");
  if (-d $dst && -f "$dst/d8") {
    trace(" - reusing existing d8\n");
    return;
  }
  system("git checkout $commit >>$LOG_FILE 2>&1") and die "Failed to checkout commit $commit";
  system("gclient sync >>$LOG_FILE 2>&1") and die "Failed to gclient sync";
  system("gn gen $COMPILE_DIR >>$LOG_FILE 2>&1") and die "Failed to gn gen";
  system("gn clean $COMPILE_DIR >>$LOG_FILE 2>&1") and die "Failed to clean $COMPILE_DIR";
  system("autoninja -C $COMPILE_DIR d8 >>$LOG_FILE 2>&1") and die "Failed to compile $COMPILE_DIR";
  system("mkdir -p $dst");
  system("cp $COMPILE_DIR/d8 $COMPILE_DIR/icudtl.dat $COMPILE_DIR/snapshot_blob.bin $dst/")
    and die "Failed to copy $COMPILE_DIR to $dst";
}

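# Returns a commit roughly halfway between $start and $end (excluding both
# endpoints), or undef if $start and $end are consecutive.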
sub get_middle_commit {
  my ($start, $end) = @_;
  my $cmd = "git log --oneline $start..$end";
  # say "About to run: '$cmd'";
  # say "Current dir: ", getcwd();
  my @commits = map { s/ .*//r } split "\n", `$cmd`;
  shift @commits; # Removing $end (listed first; $start is already excluded by the range)
  if (!@commits) { return undef }
  return $commits[@commits/2];
}

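# Returns the mean and (population) standard deviation of the values in the
# given array reference, both formatted with two decimals.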
sub avg_and_stdev {
  my $arr = shift;
  my $u = sum(@$arr)/@$arr; # mean
  my $s = ( sum( map {($_-$u)**2} @$arr ) / @$arr ) ** 0.5; # standard deviation
  return (sprintf("%.2f",$u), sprintf("%.2f",$s));
}

# Compute the Wilcoxon Rank-Sum test between two datasets and return its
# p-value.
sub wilcoxon {
  my ($dataset1, $dataset2) = @_;
  my $wilcox_test = Statistics::Test::WilcoxonRankSum->new();
  $wilcox_test->load_data($dataset1, $dataset2);
  return $wilcox_test->probability();
}

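# Colors a probability for terminal output: red when it is below the
# significance threshold (a statistically significant difference), yellow
# otherwise.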
sub color_proba {
  my $proba = shift;
  if ($proba < $PROBA_THRESHOLD) {
    return colored($proba, 'red');
  } else {
    return colored($proba, 'yellow');
  }
}

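# Returns the title (first line of the commit message) of the given commit.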
sub get_commit_title {
  my $commit = shift;
  my $commit_msg = `git show -s --format=%B $commit`;
  my ($title) = $commit_msg =~ /^(.*)$/m;
  return $title;
}