tools/auto_bisect/math_utils.py

   1 # Copyright 2014 The Chromium Authors. All rights reserved.
   2 # Use of this source code is governed by a BSD-style license that can be
   3 # found in the LICENSE file.
   4
   5 """General statistical or mathematical functions."""
   6
   7 import math
   8
   9
  10 def TruncatedMean(data_set, truncate_proportion):
  11   """Calculates the truncated mean of a set of values.
  12
  13   Note that this isn't just the mean of the set of values with the highest
  14   and lowest values discarded; the non-discarded values are also weighted
  15   differently depending how many values are discarded.
  16
  17   NOTE: If there's not much benefit from this keeping and weighting
  18   partial values, it might be better to use a simplified truncated mean
  19   function without weighting.
  20
  21   Args:
  22     data_set: Non-empty list of values.
  23     truncate_proportion: How much of the upper and lower portions of the data
  24         set to discard, expressed as a value in the range [0, 1].
  25         Note: a value of 0.5 or greater would be meaningless
  26
  27   Returns:
  28     The truncated mean as a float.
  29
  30   Raises:
  31     TypeError: The data set was empty after discarding values.
  32   """
  33   if len(data_set) > 2:
  34     data_set = sorted(data_set)
  35
  36     discard_num_float = len(data_set) * truncate_proportion
  37     discard_num_int = int(math.floor(discard_num_float))
  38     kept_weight = len(data_set) - (discard_num_float * 2)
  39
  40     data_set = data_set[discard_num_int:len(data_set)-discard_num_int]
  41
  42     weight_left = 1.0 - (discard_num_float - discard_num_int)
  43
  44     if weight_left < 1:
  45       # If the % to discard leaves a fractional portion, need to weight those
  46       # values.
  47       unweighted_vals = data_set[1:len(data_set)-1]
  48       weighted_vals = [data_set[0], data_set[len(data_set)-1]]
  49       weighted_vals = [w * weight_left for w in weighted_vals]
  50       data_set = weighted_vals + unweighted_vals
  51   else:
  52     kept_weight = len(data_set)
  53
  54   data_sum = reduce(lambda x, y: float(x) + float(y), data_set)
  55   truncated_mean = data_sum / kept_weight
  56   return truncated_mean
  57
  58
  59 def Mean(values):
  60   """Calculates the arithmetic mean of a list of values."""
  61   return TruncatedMean(values, 0.0)
  62
  63
  64 def Variance(values):
  65   """Calculates the sample variance."""
  66   if len(values) == 1:
  67     return 0.0
  68   mean = Mean(values)
  69   differences_from_mean = [float(x) - mean for x in values]
  70   squared_differences = [float(x * x) for x in differences_from_mean]
  71   variance = sum(squared_differences) / (len(values) - 1)
  72   return variance
  73
  74
  75 def StandardDeviation(values):
  76   """Calculates the sample standard deviation of the given list of values."""
  77   return math.sqrt(Variance(values))
  78
  79
  80 def RelativeChange(before, after):
  81   """Returns the relative change of before and after, relative to before.
  82
  83   There are several different ways to define relative difference between
  84   two numbers; sometimes it is defined as relative to the smaller number,
  85   or to the mean of the two numbers. This version returns the difference
  86   relative to the first of the two numbers.
  87
  88   Args:
  89     before: A number representing an earlier value.
  90     after: Another number, representing a later value.
  91
  92   Returns:
  93     A non-negative floating point number; 0.1 represents a 10% change.
  94   """
  95   if before == after:
  96     return 0.0
  97   if before == 0:
  98     return float('nan')
  99   difference = after - before
 100   return math.fabs(difference / before)
 101
 102
 103 def PooledStandardError(work_sets):
 104   """Calculates the pooled sample standard error for a set of samples.
 105
 106   Args:
 107     work_sets: A collection of collections of numbers.
 108
 109   Returns:
 110     Pooled sample standard error.
 111   """
 112   numerator = 0.0
 113   denominator1 = 0.0
 114   denominator2 = 0.0
 115
 116   for current_set in work_sets:
 117     std_dev = StandardDeviation(current_set)
 118     numerator += (len(current_set) - 1) * std_dev ** 2
 119     denominator1 += len(current_set) - 1
 120     if len(current_set) > 0:
 121       denominator2 += 1.0 / len(current_set)
 122
 123   if denominator1 == 0:
 124     return 0.0
 125
 126   return math.sqrt(numerator / denominator1) * math.sqrt(denominator2)
 127
 128
 129 # Redefining built-in 'StandardError'
 130 # pylint: disable=W0622
 131 def StandardError(values):
 132   """Calculates the standard error of a list of values."""
 133   # NOTE: This behavior of returning 0.0 in the case of an empty list is
 134   # inconsistent with Variance and StandardDeviation above.
 135   if len(values) <= 1:
 136     return 0.0
 137   std_dev = StandardDeviation(values)
 138   return std_dev / math.sqrt(len(values))