#!/usr/bin/perl

# Copyright 2024 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

=head1 Description

This script bisects a performance regression down to the commit responsible for
the regression.

=head1 Install

You'll need to install Perl's Statistics::Test::WilcoxonRankSum module. You can
use these commands to do so:

    sudo apt install cpanminus                  # Install Perl's package manager
    cpanm -f Statistics::Test::WilcoxonRankSum  # Install the module

=head1 Usage

    perl bisect.pl --start <start> --end <end> --run <cmd>

    Options:
        --start <commit>        At which commit to start the bisect.
        --end <commit>          At which commit to end the bisect.
        --range <start>..<end>  Alternative to the above start/end.
        --run <cmd>             The command line to run the benchmark. Use V8
                                as a placeholder at the start of the command:
                                it will be replaced by the path to the actual
                                V8 being tested.
        --run-dir <dir>         The directory from which benchmarks should be
                                run: this script will `cd` there to run the
                                benchmarks.
        --compile <dir>         Which directory to compile. The default is
                                `out/x64.release` (`out/arm64.release` when
                                `arch` reports arm64).
        --perl-time             If specified, use Perl's timer to get the
                                performance at each commit.
        --score-regex <regex>   If specified, use <regex> to extract the
                                performance from the benchmark's output. The
                                regex is assumed to use a capture group to
                                extract the result.
                                Either --perl-time or --score-regex should
                                always be passed.
        --total-score           If specified, expect a JetStream style
                                "Total-Score: xxx" output.
        --average-score         If specified, expect a JetStream style
                                "Average-Score: xxx" output.
        --first-score           If specified, expect a JetStream style
                                "First-Score: xxx" output.
        --nb-run <n>            How many repetitions of each benchmark to run
                                for each commit. Default is 30.
        --retry <n>             How many times to retry the benchmarks at a
                                given commit when the bisect fails because it
                                doesn't find a statistical difference between
                                the beginning and the end of the range.
                                Default is 3.
        --verbose/--noverbose   Enables or disables verbose output. Default is
                                true.

=head1 Example

I bisected the regressions at
https://chromeperf.appspot.com/group_report?rev=85409 with:
(the offending CL was obvious, but this was just to showcase this script)

    perl bisect.pl --start b71cdae --end 54d255a --run "V8 --future cli.js -- ML" --run-dir v8-perf/v8-perf/benchmarks/JetStream2 --score-regex "Average-Score: (\d+)"

And that regression
https://chromeperf.appspot.com/group_report?bug_id=1409635&project_id=chromium
with:

    perl bisect.pl --start aa7b016 --end 0033691 --run "V8 run.js -- spread_literal/es6" --run-dir test/js-perf-test/SixSpeed --score-regex ": (\d+)"

=cut

use strict;
use warnings;
use feature qw(say);
use autodie qw(open close);

use List::Util qw(sum max);
use Statistics::Test::WilcoxonRankSum;
use Term::ANSIColor;
use Time::HiRes qw(time);
use Getopt::Long;
use File::Path qw(make_path remove_tree);
use Cwd qw(getcwd abs_path);

$| = 1; # auto-flushing

# Configuration. Every value below is a default that the command-line options
# may override.
my $START = ""; # first commit
my $END = ""; # last commit
my $RANGE = ""; # "<start>..<end>", alternative to $START/$END
my $RUN_CMD = ""; # command to run to check perf
my $RUN_CMD_DIR = ""; # directory to run perf command from

# Pick the default build directory from the host architecture. The `arch`
# output is chomped and matched against both spellings of 64-bit ARM, since
# some hosts print "arm64" and others "aarch64".
my $HOST_ARCH = `arch` // "";
chomp $HOST_ARCH;
my $COMPILE_DIR = $HOST_ARCH =~ /^(?:arm64|aarch64)$/
    ? "out/arm64.release"
    : "out/x64.release"; # Thing to compile

my $BISECT_DIR = "out/bisect"; # where the per-commit binaries and the log go
my $PERL_TIME = 0; # If true, use Perl's timer for the comparison
my $SCORE_REGEX = ""; # If non-empty, use to extract score
my $TOTAL_SCORE_REGEX = ""; # set by --total-score
my $FIRST_SCORE_REGEX = ""; # set by --first-score
my $AVERAGE_SCORE_REGEX = ""; # set by --average-score
my $NB_RUN = 30; # number of runs per measure
my $RETRY = 3; # if no statistical difference is found, retry $RETRY times.
my $VERBOSE = 1;
my $PROBA_THRESHOLD = 0.05; # Wilcoxon's threshold

GetOptions("start=s" => \$START,
           "end=s" => \$END,
           "range=s" => \$RANGE,
           "run=s" => \$RUN_CMD,
           "run-dir=s" => \$RUN_CMD_DIR,
           "compile=s" => \$COMPILE_DIR,
           "bisect-dir=s" => \$BISECT_DIR,
           "perl-time" => \$PERL_TIME,
           "score-regex=s" => \$SCORE_REGEX,
           "total-score" => \$TOTAL_SCORE_REGEX,
           "average-score" => \$AVERAGE_SCORE_REGEX,
           "first-score" => \$FIRST_SCORE_REGEX,
           "nb-run=i" => \$NB_RUN,
           "retry=i" => \$RETRY,
           "verbose!" => \$VERBOSE);

# The bisect directory has to exist before the log path is resolved: abs_path()
# may fail to resolve a path whose parent directory is missing, which would
# leave $LOG_FILE undefined for the rest of the run.
if (! -d $BISECT_DIR) {
  make_path $BISECT_DIR;
}
my $LOG_FILE = abs_path("$BISECT_DIR/logs.txt");
defined $LOG_FILE
    or die "Cannot resolve the log file path under $BISECT_DIR\n";

# Prints its arguments to stdout (when $VERBOSE is set) and appends them to
# $LOG_FILE. The log is opened in append mode so that earlier messages and the
# output that compile() redirects into the same file are preserved.
sub trace {
  if ($VERBOSE) {
    print @_;
  }
  open my $FH, '>>', $LOG_FILE;
  print $FH @_;
  close $FH;
}

# Prints the usage line and terminates the script.
# NOTE(review): exit() without argument reports success even though usage() is
# only reached on invalid input; kept as-is so the exit status seen by callers
# does not change.
sub usage {
  say "Usage:\n\t./$0 --start <commit> --end <commit> --run <command> [--perl-time|--score-regex <regex>]";
  exit();
}

# Start each run with a fresh log.
if (-f $LOG_FILE) {
  unlink $LOG_FILE;
}

if (!$START && !$END && $RANGE) {
  ($START, $END) = split('\.\.', $RANGE);
}

trace("Checking parameters...\n");
if (!$START || !$END || !$RUN_CMD) {
  my @missings = map { $_->[1] } grep { !$_->[0] }
                 [$START, 'start'], [$END, 'end'], [$RUN_CMD, 'run'];
  say "Missing mandatory argument: ", join (", ", map { "--$_" } @missings);
  usage();
}
if (!-d $COMPILE_DIR) {
  say "Compile directory $COMPILE_DIR does not exist.";
  usage();
}
if ($RUN_CMD_DIR ne "" && !-d $RUN_CMD_DIR) {
  say "Run directory $RUN_CMD_DIR does not exist.";
  usage();
}
# The JetStream shortcuts only apply when no explicit --score-regex was given.
if ($TOTAL_SCORE_REGEX && !$SCORE_REGEX) {
  $SCORE_REGEX="Total-Score: (\\d+\\.?\\d*)"
}
if ($AVERAGE_SCORE_REGEX && !$SCORE_REGEX) {
  $SCORE_REGEX="Average-Score: (\\d+\\.?\\d*)"
}
if ($FIRST_SCORE_REGEX && !$SCORE_REGEX) {
  $SCORE_REGEX="First-Score: (\\d+\\.?\\d*)"
}
if (!$PERL_TIME && !$SCORE_REGEX) {
  say "One of --perl-time and --score-regex must be specified.";
  usage();
}
if ($PERL_TIME && $SCORE_REGEX) {
  say "Only one of --perl-time and --score-regex can be specified.";
  usage();
}
# Without at least one run per commit there is nothing to average or compare.
if ($NB_RUN < 1) {
  say "--nb-run must be at least 1.";
  usage();
}

trace("Starting bisect...\n");

my ($start, $end) = ($START, $END);
my $compile_dir = getcwd();

# Main bisection loop: each iteration builds the start, middle and end commits,
# benchmarks the three binaries and keeps the half of the range in which the
# scores differ significantly.
while (1) {
  # The previous iteration may have moved to $RUN_CMD_DIR; git and the build
  # tools have to run from the directory the script was started in.
  chdir $compile_dir or die "Failed to chdir to $compile_dir: $!";
  my $mid = get_middle_commit($start, $end);

  if (!$mid) {
    # $start and $end are consecutive commits.
    say colored("Bisection done.", 'bold'), " Regression happened at ",
        colored($end, 'red'), ": ", colored(get_commit_title($end), "italic");
    say "(previous commit: $start)";
    # NOTE(review): a non-zero status on success is surprising; kept unchanged
    # because callers may rely on it.
    exit(1);
  }

  trace("\nBisecting between ", colored($start, 'green'), " and ",
        colored($end, "red"), " (middle = ", colored($mid, 'yellow'), ")\n");

  trace(colored(" Compiling...\n", 'bold'));
  my $start_dir = "$BISECT_DIR/$start";
  my $mid_dir = "$BISECT_DIR/$mid";
  my $end_dir = "$BISECT_DIR/$end";
  compile($start, "$start_dir", 'green');
  compile($mid, "$mid_dir", 'yellow');
  compile($end, "$end_dir", 'red');

  # Absolute paths, because the benchmarks may run from another directory.
  my $start_bin = abs_path("$start_dir/d8");
  my $mid_bin = abs_path("$mid_dir/d8");
  my $end_bin = abs_path("$end_dir/d8");

  if ($RUN_CMD_DIR ne "") {
    chdir $RUN_CMD_DIR or die "Failed to chdir to $RUN_CMD_DIR: $!";
  }

  my $retry = 0;
  RUN: while (1) {
    trace(colored(" Running...\n", 'bold'));
    my $scores = run_benchmarks($start_bin, $mid_bin, $end_bin);
    trace("\r\033[2K All runs completed.\n");

    trace(colored(" Analyzing...\n", 'bold'));
    my ($start_avg, $start_stdev) = avg_and_stdev($scores->{$start_bin});
    my ($mid_avg, $mid_stdev) = avg_and_stdev($scores->{$mid_bin});
    my ($end_avg, $end_stdev) = avg_and_stdev($scores->{$end_bin});
    trace(" Times:\n");
    trace(" start: $start_avg +- $start_stdev\n");
    trace(" mid: $mid_avg +- $mid_stdev\n");
    trace(" end: $end_avg +- $end_stdev\n");

    my $proba_start_mid = wilcoxon($scores->{$start_bin}, $scores->{$mid_bin});
    my $proba_mid_end = wilcoxon($scores->{$mid_bin}, $scores->{$end_bin});
    trace(" Proba:\n");
    trace(" start-mid: ", color_proba($proba_start_mid), "\n");
    trace(" mid-end: ", color_proba($proba_mid_end), "\n");

    # A half of the range "differs" when its probability is strictly below the
    # threshold; a value exactly at the threshold counts as no difference.
    my $first_half_differs = $proba_start_mid < $PROBA_THRESHOLD;
    my $second_half_differs = $proba_mid_end < $PROBA_THRESHOLD;

    if ($first_half_differs && $second_half_differs) {
      if ($retry++ == $RETRY) {
        say "Probabilities are $proba_start_mid and $proba_mid_end, which would indicate 2 regressions rather than 1, which is not supported by this script. Try to manually narrow the bisection range and rerun the script. Current range: $start - $mid - $end.";
        exit 1;
      }
      trace(" Two statistical differences (instead of one), re-running.\n");
      next RUN;
    }

    if (!$first_half_differs && !$second_half_differs) {
      if ($retry++ == $RETRY) {
        say "No statistical difference between $start, $mid and $end (after $RETRY retries). Aborting.";
        exit 1;
      }
      trace(" No statistical difference, re-running.\n");
      next RUN;
    }

    # Exactly one half differs: continue the bisection in that half.
    if ($first_half_differs) {
      $end = $mid;
    } else {
      $start = $mid;
    }
    last RUN;
  }
}

# Runs the benchmark command $NB_RUN times against each of the given binaries,
# interleaving the binaries within every round.
# Returns a reference to a hash mapping each binary path to the list of its
# scores (elapsed seconds with --perl-time, otherwise the first capture of
# $SCORE_REGEX). Prints the command output and exits with status 1 if a command
# fails or its output does not match $SCORE_REGEX.
sub run_benchmarks {
  my @bins = @_;
  my %scores;
  for my $i (1 .. $NB_RUN) {
    for my $bin (@bins) {
      trace("\r\033[2K $i/$NB_RUN: $bin");
      my $time = time();
      # Only a leading "V8" in the command is replaced by the binary path.
      my $cmd = $RUN_CMD =~ s/^V8/$bin/r;
      my $out = `$cmd`;
      if ($? != 0) {
        say "\n==== Error:";
        say "`$cmd` exited with $?:";
        say "====";
        say "$out";
        exit 1;
      }
      if ($PERL_TIME) {
        push @{$scores{$bin}}, time() - $time;
      } else {
        my ($score) = $out =~ /$SCORE_REGEX/;
        if (!defined $score) {
          say "\n==== Error:";
          say "`$cmd` did not return output matching $SCORE_REGEX:";
          say "====";
          say "$out";
          exit 1;
        }
        push @{$scores{$bin}}, $score;
      }
    }
  }
  return \%scores;
}

# Builds d8 at $commit and copies d8, icudtl.dat and snapshot_blob.bin from
# $COMPILE_DIR into $dst. An existing $dst/d8 is reused without rebuilding.
# $color is the color used to print the commit. The output of the build
# commands is appended to $LOG_FILE; dies if any step fails.
sub compile {
  my ($commit, $dst, $color) = @_;

  my $commit_title = get_commit_title($commit);
  trace(" Compiling at ", colored($commit, $color), ": ",
        colored($commit_title, 'italic'), "\n");

  if (-d $dst && -f "$dst/d8") {
    trace(" - reusing existing d8\n");
    return;
  }

  system("git checkout $commit >>$LOG_FILE 2>&1")
      and die "Failed to checkout commit $commit";
  system("gclient sync >>$LOG_FILE 2>&1")
      and die "Failed to gclient sync";
  system("gn gen $COMPILE_DIR >>$LOG_FILE 2>&1")
      and die "Failed to gn gen";
  system("gn clean $COMPILE_DIR >>$LOG_FILE 2>&1")
      and die "Failed to clean $COMPILE_DIR";
  system("autoninja -C $COMPILE_DIR d8 >>$LOG_FILE 2>&1")
      and die "Failed to compile $COMPILE_DIR";

  if (!-d $dst) {
    make_path($dst);
  }
  -d $dst or die "Failed to create $dst";
  system("cp $COMPILE_DIR/d8 $COMPILE_DIR/icudtl.dat $COMPILE_DIR/snapshot_blob.bin $dst/")
      and die "Failed to copy $COMPILE_DIR to $dst";
}

# Returns the (abbreviated) hash of the commit in the middle of $start..$end,
# or nothing (undef in scalar context) when the range only contains $end, i.e.
# when $start and $end are consecutive. Dies if `git log` fails or if the range
# contains no commit at all, so that an invalid range is not mistaken for a
# finished bisection.
sub get_middle_commit {
  my ($start, $end) = @_;
  my $cmd = "git log --oneline $start..$end";
  my $log = `$cmd`;
  if ($? != 0) {
    die "`$cmd` failed (status $?)";
  }
  # Keep only the hash, i.e. everything before the first space of each line.
  my @commits = map { s/ .*//r } split "\n", $log;
  if (!@commits) {
    die "No commits found in range $start..$end";
  }
  shift @commits; # `git log` lists the newest commit first: this drops $end
  if (!@commits) {
    return;
  }
  return $commits[int(@commits / 2)];
}

# Takes a reference to an array of numbers and returns the list (mean,
# population standard deviation), both formatted with two decimals.
# Dies when the array is empty.
sub avg_and_stdev {
  my $arr = shift;
  if (!@$arr) {
    die "avg_and_stdev: empty dataset";
  }
  my $u = sum(@$arr)/@$arr; # mean
  my $s = ( sum( map {($_-$u)**2} @$arr ) / @$arr ) ** 0.5; # standard deviation
  return (sprintf("%.2f",$u), sprintf("%.2f",$s));
}

# Compute Wilcoxon Rank-Sum test between two datasets.
sub wilcoxon {
  # Loads both datasets (array references) into a fresh
  # Statistics::Test::WilcoxonRankSum object and returns what its
  # probability() method reports.
  my $first_sample = shift;
  my $second_sample = shift;
  my $rank_sum = Statistics::Test::WilcoxonRankSum->new();
  $rank_sum->load_data($first_sample, $second_sample);
  return $rank_sum->probability();
}

# Returns $proba wrapped in terminal color codes: red when it is below
# $PROBA_THRESHOLD, yellow otherwise.
sub color_proba {
  my ($proba) = @_;
  my $shade = $proba < $PROBA_THRESHOLD ? 'red' : 'yellow';
  return colored($proba, $shade);
}

# Returns the first line of the message of $commit, as printed by `git show`.
sub get_commit_title {
  my ($commit) = @_;
  my $message = `git show -s --format=%B $commit`;
  my ($first_line) = $message =~ /^(.*)$/m;
  return $first_line;
}