benchtests/scripts/compare_bench.py

   1 #!/usr/bin/python
   2 # Copyright (C) 2015-2022 Free Software Foundation, Inc.
   3 # This file is part of the GNU C Library.
   4 #
   5 # The GNU C Library is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU Lesser General Public
   7 # License as published by the Free Software Foundation; either
   8 # version 2.1 of the License, or (at your option) any later version.
   9 #
  10 # The GNU C Library is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 # Lesser General Public License for more details.
  14 #
  15 # You should have received a copy of the GNU Lesser General Public
  16 # License along with the GNU C Library; if not, see
  17 # <https://www.gnu.org/licenses/>.
  18 """Compare two benchmark results
  19
  20 Given two benchmark result files and a threshold, this script compares the
  21 benchmark results and flags differences in performance beyond a given
  22 threshold.
  23 """
  24 import sys
  25 import os
  26 import pylab
  27 import import_bench as bench
  28 import argparse
  29
  30 def do_compare(func, var, tl1, tl2, par, threshold):
  31     """Compare one of the aggregate measurements
  32
  33     Helper function to compare one of the aggregate measurements of a function
  34     variant.
  35
  36     Args:
  37         func: Function name
  38         var: Function variant name
  39         tl1: The first timings list
  40         tl2: The second timings list
  41         par: The aggregate to measure
  42         threshold: The threshold for differences, beyond which the script should
  43         print a warning.
  44     """
  45     try:
  46         v1 = tl1[str(par)]
  47         v2 = tl2[str(par)]
  48         d = abs(v2 - v1) * 100 / v1
  49     except KeyError:
  50         sys.stderr.write('%s(%s)[%s]: stat does not exist\n' % (func, var, par))
  51         return
  52     except ZeroDivisionError:
  53         return
  54
  55     if d > threshold:
  56         if v1 > v2:
  57             ind = '+++'
  58         else:
  59             ind = '---'
  60         print('%s %s(%s)[%s]: (%.2lf%%) from %g to %g' %
  61                 (ind, func, var, par, d, v1, v2))
  62
  63
  64 def compare_runs(pts1, pts2, threshold, stats):
  65     """Compare two benchmark runs
  66
  67     Args:
  68         pts1: Timing data from first machine
  69         pts2: Timing data from second machine
  70     """
  71
  72     # XXX We assume that the two benchmarks have identical functions and
  73     # variants.  We cannot compare two benchmarks that may have different
  74     # functions or variants.  Maybe that is something for the future.
  75     for func in pts1['functions'].keys():
  76         for var in pts1['functions'][func].keys():
  77             tl1 = pts1['functions'][func][var]
  78             tl2 = pts2['functions'][func][var]
  79
  80             # Compare the consolidated numbers
  81             # do_compare(func, var, tl1, tl2, 'max', threshold)
  82             for stat in stats.split():
  83                 do_compare(func, var, tl1, tl2, stat, threshold)
  84
  85             # Skip over to the next variant or function if there is no detailed
  86             # timing info for the function variant.
  87             if 'timings' not in pts1['functions'][func][var].keys() or \
  88                 'timings' not in pts2['functions'][func][var].keys():
  89                 continue
  90
  91             # If two lists do not have the same length then it is likely that
  92             # the performance characteristics of the function have changed.
  93             # XXX: It is also likely that there was some measurement that
  94             # strayed outside the usual range.  Such ouiers should not
  95             # happen on an idle machine with identical hardware and
  96             # configuration, but ideal environments are hard to come by.
  97             if len(tl1['timings']) != len(tl2['timings']):
  98                 print('* %s(%s): Timing characteristics changed' %
  99                         (func, var))
 100                 print('\tBefore: [%s]' %
 101                         ', '.join([str(x) for x in tl1['timings']]))
 102                 print('\tAfter: [%s]' %
 103                         ', '.join([str(x) for x in tl2['timings']]))
 104                 continue
 105
 106             # Collect numbers whose differences cross the threshold we have
 107             # set.
 108             issues = [(x, y) for x, y in zip(tl1['timings'], tl2['timings']) \
 109                         if abs(y - x) * 100 / x > threshold]
 110
 111             # Now print them.
 112             for t1, t2 in issues:
 113                 d = abs(t2 - t1) * 100 / t1
 114                 if t2 > t1:
 115                     ind = '-'
 116                 else:
 117                     ind = '+'
 118
 119                 print("%s %s(%s): (%.2lf%%) from %g to %g" %
 120                         (ind, func, var, d, t1, t2))
 121
 122
 123 def plot_graphs(bench1, bench2):
 124     """Plot graphs for functions
 125
 126     Make scatter plots for the functions and their variants.
 127
 128     Args:
 129         bench1: Set of points from the first machine
 130         bench2: Set of points from the second machine.
 131     """
 132     for func in bench1['functions'].keys():
 133         for var in bench1['functions'][func].keys():
 134             # No point trying to print a graph if there are no detailed
 135             # timings.
 136             if u'timings' not in bench1['functions'][func][var].keys():
 137                 sys.stderr.write('Skipping graph for %s(%s)\n' % (func, var))
 138                 continue
 139
 140             pylab.clf()
 141             pylab.ylabel('Time (cycles)')
 142
 143             # First set of points
 144             length = len(bench1['functions'][func][var]['timings'])
 145             X = [float(x) for x in range(length)]
 146             lines = pylab.scatter(X, bench1['functions'][func][var]['timings'],
 147                     1.5 + 100 / length)
 148             pylab.setp(lines, 'color', 'r')
 149
 150             # Second set of points
 151             length = len(bench2['functions'][func][var]['timings'])
 152             X = [float(x) for x in range(length)]
 153             lines = pylab.scatter(X, bench2['functions'][func][var]['timings'],
 154                     1.5 + 100 / length)
 155             pylab.setp(lines, 'color', 'g')
 156
 157             if var:
 158                 filename = "%s-%s.png" % (func, var)
 159             else:
 160                 filename = "%s.png" % func
 161             sys.stderr.write('Writing out %s' % filename)
 162             pylab.savefig(filename)
 163
 164 def main(bench1, bench2, schema, threshold, stats):
 165     bench1 = bench.parse_bench(bench1, schema)
 166     bench.do_for_all_timings(bench1, lambda b, f, v:
 167         b['functions'][f][v]['timings'].sort())
 168     bench2 = bench.parse_bench(bench2, schema)
 169     bench.do_for_all_timings(bench2, lambda b, f, v:
 170         b['functions'][f][v]['timings'].sort())
 171
 172     plot_graphs(bench1, bench2)
 173
 174     bench.compress_timings(bench1)
 175     bench.compress_timings(bench2)
 176
 177     compare_runs(bench1, bench2, threshold, stats)
 178
 179
 180 if __name__ == '__main__':
 181     parser = argparse.ArgumentParser(description='Take two benchmark and compare their timings.')
 182
 183     # Required parameters
 184     parser.add_argument('bench1', help='First bench to compare')
 185     parser.add_argument('bench2', help='Second bench to compare')
 186
 187     # Optional parameters
 188     parser.add_argument('--schema',
 189                         default=os.path.join(os.path.dirname(os.path.realpath(__file__)),'benchout.schema.json'),
 190                         help='JSON file to validate source/dest files (default: %(default)s)')
 191     parser.add_argument('--threshold', default=10.0, type=float, help='Only print those with equal or higher threshold (default: %(default)s)')
 192     parser.add_argument('--stats', default='min mean', type=str, help='Only consider values from the statistics specified as a space separated list (default: %(default)s)')
 193
 194     args = parser.parse_args()
 195
 196     main(args.bench1, args.bench2, args.schema, args.threshold, args.stats)