third_party/highway/hwy/examples/benchmark.cc

   1 // Copyright 2019 Google LLC
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15
  16 #ifndef __STDC_FORMAT_MACROS
  17 #define __STDC_FORMAT_MACROS  // before inttypes.h
  18 #endif
  19 #include <inttypes.h>  // IWYU pragma: keep
  20 #include <stdio.h>
  21 #include <stdlib.h>  // abort
  22
  23 #include <cmath>  // std::abs
  24 #include <memory>
  25 #include <numeric>  // std::iota, std::inner_product
  26
  27 #undef HWY_TARGET_INCLUDE
  28 #define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
  29 #include "hwy/foreach_target.h"  // IWYU pragma: keep
  30
  31 // Must come after foreach_target.h to avoid redefinition errors.
  32 #include "hwy/aligned_allocator.h"
  33 #include "hwy/highway.h"
  34 #include "hwy/nanobenchmark.h"
  35
  36 HWY_BEFORE_NAMESPACE();
  37 namespace hwy {
  38 namespace HWY_NAMESPACE {
  39
  40 // These templates are not found via ADL.
  41 #if HWY_TARGET != HWY_SCALAR
  42 using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
  43 #endif
  44
  45 class TwoArray {
  46  public:
  47   // Must be a multiple of the vector lane count * 8.
  48   static size_t NumItems() { return 3456; }
  49
  50   TwoArray()
  51       : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
  52     // = 1, but compiler doesn't know
  53     const float init = static_cast<float>(Unpredictable1());
  54     std::iota(a_.get(), a_.get() + NumItems(), init);
  55     std::iota(b_, b_ + NumItems(), init);
  56   }
  57
  58  protected:
  59   AlignedFreeUniquePtr<float[]> a_;
  60   float* b_;
  61 };
  62
  63 // Measures durations, verifies results, prints timings.
  64 template <class Benchmark>
  65 void RunBenchmark(const char* caption) {
  66   printf("%10s: ", caption);
  67   const size_t kNumInputs = 1;
  68   const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
  69   const FuncInput inputs[kNumInputs] = {num_items};
  70   Result results[kNumInputs];
  71
  72   Benchmark benchmark;
  73
  74   Params p;
  75   p.verbose = false;
  76   p.max_evals = 7;
  77   p.target_rel_mad = 0.002;
  78   const size_t num_results = MeasureClosure(
  79       [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
  80       kNumInputs, results, p);
  81   if (num_results != kNumInputs) {
  82     fprintf(stderr, "MeasureClosure failed.\n");
  83   }
  84
  85   benchmark.Verify(num_items);
  86
  87   for (size_t i = 0; i < num_results; ++i) {
  88     const double cycles_per_item =
  89         results[i].ticks / static_cast<double>(results[i].input);
  90     const double mad = results[i].variability * cycles_per_item;
  91     printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
  92            static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
  93   }
  94 }
  95
  96 void Intro() {
  97   const float in[16] = {1, 2, 3, 4, 5, 6};
  98   float out[16];
  99   const ScalableTag<float> d;  // largest possible vector
 100   for (size_t i = 0; i < 16; i += Lanes(d)) {
 101     const auto vec = LoadU(d, in + i);  // no alignment requirement
 102     auto result = Mul(vec, vec);
 103     result = Add(result, result);  // can update if not const
 104     StoreU(result, d, out + i);
 105   }
 106   printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
 107 }
 108
 109 // BEGINNER: dot product
 110 // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
 111 class BenchmarkDot : public TwoArray {
 112  public:
 113   BenchmarkDot() : dot_{-1.0f} {}
 114
 115   FuncOutput operator()(const size_t num_items) {
 116     const ScalableTag<float> d;
 117     const size_t N = Lanes(d);
 118     using V = decltype(Zero(d));
 119     // Compiler doesn't make independent sum* accumulators, so unroll manually.
 120     // We cannot use an array because V might be a sizeless type. For reasonable
 121     // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
 122     V sum0 = Zero(d);
 123     V sum1 = Zero(d);
 124     V sum2 = Zero(d);
 125     V sum3 = Zero(d);
 126     const float* const HWY_RESTRICT pa = &a_[0];
 127     const float* const HWY_RESTRICT pb = b_;
 128     for (size_t i = 0; i < num_items; i += 4 * N) {
 129       const auto a0 = Load(d, pa + i + 0 * N);
 130       const auto b0 = Load(d, pb + i + 0 * N);
 131       sum0 = MulAdd(a0, b0, sum0);
 132       const auto a1 = Load(d, pa + i + 1 * N);
 133       const auto b1 = Load(d, pb + i + 1 * N);
 134       sum1 = MulAdd(a1, b1, sum1);
 135       const auto a2 = Load(d, pa + i + 2 * N);
 136       const auto b2 = Load(d, pb + i + 2 * N);
 137       sum2 = MulAdd(a2, b2, sum2);
 138       const auto a3 = Load(d, pa + i + 3 * N);
 139       const auto b3 = Load(d, pb + i + 3 * N);
 140       sum3 = MulAdd(a3, b3, sum3);
 141     }
 142     // Reduction tree: sum of all accumulators by pairs into sum0.
 143     sum0 = Add(sum0, sum1);
 144     sum2 = Add(sum2, sum3);
 145     sum0 = Add(sum0, sum2);
 146     // Remember to store the result in `dot_` for verification; see `Verify`.
 147     dot_ = ReduceSum(d, sum0);
 148     // Return the result so that the benchmarking framework can ensure that the
 149     // computation is not elided by the compiler.
 150     return static_cast<FuncOutput>(dot_);
 151   }
 152   void Verify(size_t num_items) {
 153     if (dot_ == -1.0f) {
 154       fprintf(stderr, "Dot: must call Verify after benchmark");
 155       abort();
 156     }
 157
 158     const float expected =
 159         std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
 160     const float rel_err = std::abs(expected - dot_) / expected;
 161     if (rel_err > 1.1E-6f) {
 162       fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
 163               rel_err);
 164       abort();
 165     }
 166   }
 167
 168  private:
 169   float dot_;  // for Verify
 170 };
 171
 172 // INTERMEDIATE: delta coding
 173 // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
 174 struct BenchmarkDelta : public TwoArray {
 175   FuncOutput operator()(const size_t num_items) const {
 176 #if HWY_TARGET == HWY_SCALAR
 177     b_[0] = a_[0];
 178     for (size_t i = 1; i < num_items; ++i) {
 179       b_[i] = a_[i] - a_[i - 1];
 180     }
 181 #elif HWY_CAP_GE256
 182     // Larger vectors are split into 128-bit blocks, easiest to use the
 183     // unaligned load support to shift between them.
 184     const ScalableTag<float> df;
 185     const size_t N = Lanes(df);
 186     size_t i;
 187     b_[0] = a_[0];
 188     for (i = 1; i < N; ++i) {
 189       b_[i] = a_[i] - a_[i - 1];
 190     }
 191     for (; i < num_items; i += N) {
 192       const auto a = Load(df, &a_[i]);
 193       const auto shifted = LoadU(df, &a_[i - 1]);
 194       Store(a - shifted, df, &b_[i]);
 195     }
 196 #else  // 128-bit
 197     // Slightly better than unaligned loads
 198     const HWY_CAPPED(float, 4) df;
 199     const size_t N = Lanes(df);
 200     size_t i;
 201     b_[0] = a_[0];
 202     for (i = 1; i < N; ++i) {
 203       b_[i] = a_[i] - a_[i - 1];
 204     }
 205     auto prev = Load(df, &a_[0]);
 206     for (; i < num_items; i += Lanes(df)) {
 207       const auto a = Load(df, &a_[i]);
 208       const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
 209       prev = a;
 210       Store(Sub(a, shifted), df, &b_[i]);
 211     }
 212 #endif
 213     return static_cast<FuncOutput>(b_[num_items - 1]);
 214   }
 215
 216   void Verify(size_t num_items) {
 217     for (size_t i = 0; i < num_items; ++i) {
 218       const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
 219       const float err = std::abs(expected - b_[i]);
 220       if (err > 1E-6f) {
 221         fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
 222       }
 223     }
 224   }
 225 };
 226
 227 void RunBenchmarks() {
 228   Intro();
 229   printf("------------------------ %s\n", TargetName(HWY_TARGET));
 230   RunBenchmark<BenchmarkDot>("dot");
 231   RunBenchmark<BenchmarkDelta>("delta");
 232 }
 233
 234 // NOLINTNEXTLINE(google-readability-namespace-comments)
 235 }  // namespace HWY_NAMESPACE
 236 }  // namespace hwy
 237 HWY_AFTER_NAMESPACE();
 238
 239 #if HWY_ONCE
 240 namespace hwy {
 241 HWY_EXPORT(RunBenchmarks);
 242
 243 void Run() {
 244   for (int64_t target : SupportedAndGeneratedTargets()) {
 245     SetSupportedTargetsForTest(target);
 246     HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
 247   }
 248   SetSupportedTargetsForTest(0);  // Reset the mask afterwards.
 249 }
 250
 251 }  // namespace hwy
 252
 253 int main(int /*argc*/, char** /*argv*/) {
 254   hwy::Run();
 255   return 0;
 256 }
 257 #endif  // HWY_ONCE