tools/auto_bisect/bisect_results.py
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import source_control
import ttest

from bisect_state import RevisionState

class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.

    If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

    If both of the above revisions are not None, the following properties
    are present:
    culprit_revisions: A list of revisions which contain the bad change
        introducing the failure.
    regression_size: For performance bisects, this is the relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, it is the pooled standard
        error for the groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, it is the confidence that the good
        and bad runs are distinct groups. Not used for non-performance
        bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    The first form creates an object representing successful bisect results;
    the second creates an error result.

    Args:
      bisect_state: BisectState object representing the latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    self.error = error
    self.abort_reason = abort_reason
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
        'Incorrect use of the BisectResults constructor. '
        'When error is None, all other arguments are required.')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    self.retest_results_tot = None
    self.retest_results_reverted = None

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Set these attributes so that the bisect printer does not break when
      # the regression cannot be reproduced (no broken revision was found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []

  def AddRetestResults(self, results_tot, results_reverted):
    if not results_tot or not results_reverted:
      self.warnings.append(
          'Failed to re-test reverted culprit CL against ToT.')
      return

    confidence = BisectResults.ConfidenceScore(
        results_reverted[0]['values'],
        results_tot[0]['values'])

    self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
    self.retest_results_tot.value = results_tot[0]

    self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
    self.retest_results_reverted.value = results_reverted[0]

    if confidence <= bisect_utils.HIGH_CONFIDENCE:
      self.warnings.append(
          'Confidence of re-test with reverted CL is not high.'
          ' Check that the regression hasn\'t already recovered.'
          ' There\'s still a chance this is a regression, as performance of'
          ' local builds may not match official builds.')

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, the regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with an increased repeat_test_count, a larger range, '
                      'or on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is based on a statistical hypothesis test. The null
    hypothesis is that the two groups of results have no difference,
    i.e. there is no performance regression. The alternative hypothesis
    is that there is some difference between the groups that's unlikely
    to occur by chance.

    The score returned by this function represents our confidence in the
    alternative hypothesis.

    Note that if there's only one item in either sample, only one revision
    was classified good or bad, so there's not much evidence on which to
    base a decision.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, compute a value even if
          there is only one bad or good revision.

    Returns:
      A float between 0 and 100; 0 if the samples aren't large enough.
    """
    if ((len(sample1) <= 1 or len(sample2) <= 1) and
        not accept_single_bad_or_good):
      return 0.0
    if not sample1 or not sample2:
      return 0.0
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)
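
  # For example (illustrative numbers only): if ttest.WelchsTTest returns a
  # p-value of 0.03, ConfidenceScore returns 100.0 * (1.0 - 0.03) = 97.0,
  # i.e. 97% confidence that the good and bad samples are distinct groups.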

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the last known good and first known bad revisions.

    Note that since revision_states is expected to be in reverse chronological
    order, the last known good revision is the first revision in the list that
    has the passed property set to 1, hence the name
    `first_working_revision`. The inverse applies to `last_broken_revision`.

    Args:
      revision_states: A list of RevisionState instances.

    Returns:
      A tuple containing the two revision states at the border (last
      known good and first known bad).
    """
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision
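
  # For example (hypothetical input, newest first): for states with passed
  # values [0, 0, 1, 1], the state at index 2 is the first with passed == 1
  # and becomes first_working_revision; the state at index 1 is the last one
  # assigned in the `not passed` branch and becomes last_broken_revision.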

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
    cwd = os.getcwd()

    # revision_states is ordered newest first, so this covers every state
    # from last_broken_rev up to (but not including) first_working_rev; any
    # of these could contain the culprit change.
    culprit_revisions = []
    for i in xrange(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
    # TODO(sergiyb): We assume that value has a "values" key, which may not
    # be the case for failure-bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index + 1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate the mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])
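    # (For example, sum([[1, 2], [3]], []) == [1, 2, 3].)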

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'
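    # For example (illustrative numbers, assuming math_utils.RelativeChange
    # returns |after - before| / before): a good mean of 100.0 and a bad mean
    # of 110.0 give a regression_size of 10 (percent). When the good mean is
    # 0 and the bad mean is not, RelativeChange is NaN, so the size is
    # reported as 'zero-to-nonzero'.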

    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect culprit by seeing whether the results
    # of the culprit revision and the revision before that appear to be
    # statistically significantly different.
    confidence = cls.ConfidenceScore(
        sum([first_working_rev.value['values']], []),
        sum([last_broken_rev.value['values']], []))

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}
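

# A minimal usage sketch (not part of the original module; illustrative
# numbers only): ConfidenceScore gives a score near 100 for clearly
# separated samples and a much lower score for overlapping ones.
if __name__ == '__main__':
  print BisectResults.ConfidenceScore([10.0, 10.1, 10.2, 10.3],
                                      [20.0, 20.1, 20.2, 20.3])
  print BisectResults.ConfidenceScore([10.0, 10.4, 10.2, 10.3],
                                      [10.1, 10.5, 10.3, 10.2])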