third_party/highway/hwy/nanobenchmark.h

   1 // Copyright 2019 Google LLC
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15
  16 #ifndef HIGHWAY_HWY_NANOBENCHMARK_H_
  17 #define HIGHWAY_HWY_NANOBENCHMARK_H_
  18
  19 // Benchmarks functions of a single integer argument with realistic branch
  20 // prediction hit rates. Uses a robust estimator to summarize the measurements.
  21 // The precision is about 0.2%.
  22 //
  23 // Examples: see nanobenchmark_test.cc.
  24 //
  25 // Background: Microbenchmarks such as http://github.com/google/benchmark
  26 // can measure elapsed times on the order of a microsecond. Shorter functions
  27 // are typically measured by repeating them thousands of times and dividing
  28 // the total elapsed time by this count. Unfortunately, repetition (especially
  29 // with the same input parameter!) influences the runtime. In time-critical
  30 // code, it is reasonable to expect warm instruction/data caches and TLBs,
  31 // but a perfect record of which branches will be taken is unrealistic.
  32 // Unless the application also repeatedly invokes the measured function with
  33 // the same parameter, the benchmark is measuring something very different -
  34 // a best-case result, almost as if the parameter were made a compile-time
  35 // constant. This may lead to erroneous conclusions about branch-heavy
  36 // algorithms outperforming branch-free alternatives.
  37 //
  38 // Our approach differs in three ways. Adding fences to the timer functions
  39 // reduces variability due to instruction reordering, improving the timer
  40 // resolution to about 40 CPU cycles. However, shorter functions must still
  41 // be invoked repeatedly. For more realistic branch prediction performance,
  42 // we vary the input parameter according to a user-specified distribution.
  43 // Thus, instead of VaryInputs(Measure(Repeat(func))), we change the
  44 // loop nesting to Measure(Repeat(VaryInputs(func))). We also estimate the
  45 // central tendency of the measurement samples with the "half sample mode",
  46 // which is more robust to outliers and skewed data than the mean or median.
  47
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>   // fprintf (NANOBENCHMARK_CHECK_ALWAYS)
#include <stdlib.h>  // abort (NANOBENCHMARK_CHECK_ALWAYS)

#include "hwy/highway_export.h"
  52
  53 // Enables sanity checks that verify correct operation at the cost of
  54 // longer benchmark runs.
  55 #ifndef NANOBENCHMARK_ENABLE_CHECKS
  56 #define NANOBENCHMARK_ENABLE_CHECKS 0
  57 #endif
  58
  59 #define NANOBENCHMARK_CHECK_ALWAYS(condition)                             \
  60   while (!(condition)) {                                                  \
  61     fprintf(stderr, "Nanobenchmark check failed at line %d\n", __LINE__); \
  62     abort();                                                              \
  63   }
  64
  65 #if NANOBENCHMARK_ENABLE_CHECKS
  66 #define NANOBENCHMARK_CHECK(condition) NANOBENCHMARK_CHECK_ALWAYS(condition)
  67 #else
  68 #define NANOBENCHMARK_CHECK(condition)
  69 #endif
  70
  71 namespace hwy {
  72
// Timer-related queries: tick rate, current timestamp and timer resolution.
namespace platform {

// Returns the tick rate (ticks per second), useful for converting measurements
// expressed in ticks to seconds. "Invariant" means the tick counter frequency
// is independent of CPU throttling or sleep.
// This call may be expensive, callers should cache the result.
HWY_DLLEXPORT double InvariantTicksPerSecond();

// Returns current timestamp [in seconds] relative to an unspecified origin;
// only differences between two return values are meaningful.
// Features: monotonic (no negative elapsed time), steady (unaffected by system
// time changes), high-resolution (on the order of microseconds).
HWY_DLLEXPORT double Now();

// Returns ticks elapsed in back to back timer calls, i.e. a function of the
// timer resolution (minimum measurable difference) and overhead.
// This call is expensive, callers should cache the result.
HWY_DLLEXPORT uint64_t TimerResolution();

}  // namespace platform
  91
// Returns 1, but without the compiler knowing what the value is. Code whose
// result depends on this value therefore cannot be constant-folded away, which
// prevents the compiler from optimizing it out.
HWY_DLLEXPORT int Unpredictable1();
  95
// Input influencing the function being measured (e.g. number of bytes to copy).
using FuncInput = size_t;

// "Proof of work" returned by Func to ensure the compiler does not elide it.
using FuncOutput = uint64_t;

// Pointer to the function to measure: either 1) a captureless lambda or
// function with two arguments or 2) a lambda with capture, in which case the
// first argument is reserved for use by MeasureClosure.
// The first argument is an opaque pointer, the second is the input value.
using Func = FuncOutput (*)(const void*, FuncInput);
 106
 107 // Internal parameters that determine precision/resolution/measuring time.
 108 struct Params {
 109   // For measuring timer overhead/resolution. Used in a nested loop =>
 110   // quadratic time, acceptable because we know timer overhead is "low".
 111   // constexpr because this is used to define array bounds.
 112   static constexpr size_t kTimerSamples = 256;
 113
 114   // Best-case precision, expressed as a divisor of the timer resolution.
 115   // Larger => more calls to Func and higher precision.
 116   size_t precision_divisor = 1024;
 117
 118   // Ratio between full and subset input distribution sizes. Cannot be less
 119   // than 2; larger values increase measurement time but more faithfully
 120   // model the given input distribution.
 121   size_t subset_ratio = 2;
 122
 123   // Together with the estimated Func duration, determines how many times to
 124   // call Func before checking the sample variability. Larger values increase
 125   // measurement time, memory/cache use and precision.
 126   double seconds_per_eval = 4E-3;
 127
 128   // The minimum number of samples before estimating the central tendency.
 129   size_t min_samples_per_eval = 7;
 130
 131   // The mode is better than median for estimating the central tendency of
 132   // skewed/fat-tailed distributions, but it requires sufficient samples
 133   // relative to the width of half-ranges.
 134   size_t min_mode_samples = 64;
 135
 136   // Maximum permissible variability (= median absolute deviation / center).
 137   double target_rel_mad = 0.002;
 138
 139   // Abort after this many evals without reaching target_rel_mad. This
 140   // prevents infinite loops.
 141   size_t max_evals = 9;
 142
 143   // Whether to print additional statistics to stdout.
 144   bool verbose = true;
 145 };
 146
// Measurement result for each unique input. Measure writes one of these per
// unique input value. Members have no default initializers.
struct Result {
  // The input value this result refers to.
  FuncInput input;

  // Robust estimate (mode or median) of duration, in ticks.
  float ticks;

  // Measure of variability (median absolute deviation relative to "ticks").
  float variability;
};
 157
// Precisely measures the number of ticks elapsed when calling "func" with the
// given inputs, shuffled to ensure realistic branch prediction hit rates.
//
// "func" returns a 'proof of work' to ensure its computations are not elided.
// "arg" is passed to Func, or reserved for internal use by MeasureClosure.
//   NOTE(review): "arg" is declared as const uint8_t* whereas the first
//   parameter of Func is const void*; presumably it is forwarded unchanged -
//   confirm against the implementation.
// "inputs" is an array of "num_inputs" (not necessarily unique) arguments to
//   "func". The values should be chosen to maximize coverage of "func". This
//   represents a distribution, so a value's frequency should reflect its
//   probability in the real application. Order does not matter; for example, a
//   uniform distribution over [0, 4) could be represented as {3,0,2,1}.
// "results" receives the output, one Result per unique input.
//   NOTE(review): presumably it must have room for at least as many entries as
//   there are unique values in "inputs" - confirm against the implementation.
// "p" holds the tuning parameters; defaults to a default-constructed Params.
// Returns how many Result were written to "results": one per unique input, or
//   zero if the measurement failed (an error message goes to stderr).
HWY_DLLEXPORT size_t Measure(Func func, const uint8_t* arg,
                             const FuncInput* inputs, size_t num_inputs,
                             Result* results, const Params& p = Params());
 173
 174 // Calls operator() of the given closure (lambda function).
 175 template <class Closure>
 176 static FuncOutput CallClosure(const Closure* f, const FuncInput input) {
 177   return (*f)(input);
 178 }
 179
 180 // Same as Measure, except "closure" is typically a lambda function of
 181 // FuncInput -> FuncOutput with a capture list.
 182 template <class Closure>
 183 static inline size_t MeasureClosure(const Closure& closure,
 184                                     const FuncInput* inputs,
 185                                     const size_t num_inputs, Result* results,
 186                                     const Params& p = Params()) {
 187   return Measure(reinterpret_cast<Func>(&CallClosure<Closure>),
 188                  reinterpret_cast<const uint8_t*>(&closure), inputs, num_inputs,
 189                  results, p);
 190 }
 191
 192 }  // namespace hwy
 193
 194 #endif  // HIGHWAY_HWY_NANOBENCHMARK_H_