Bug 1861709 replace AudioCallbackDriver::ThreadRunning() assertions that mean to...
[gecko.git] / third_party / highway / hwy / examples / benchmark.cc
blob003d6cb606be0701303a2624b40934cbdde0400c
1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
16 #ifndef __STDC_FORMAT_MACROS
17 #define __STDC_FORMAT_MACROS // before inttypes.h
18 #endif
19 #include <inttypes.h> // IWYU pragma: keep
20 #include <stdio.h>
21 #include <stdlib.h> // abort
23 #include <cmath> // std::abs
24 #include <memory>
25 #include <numeric> // std::iota, std::inner_product
27 #undef HWY_TARGET_INCLUDE
28 #define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
29 #include "hwy/foreach_target.h" // IWYU pragma: keep
31 // Must come after foreach_target.h to avoid redefinition errors.
32 #include "hwy/aligned_allocator.h"
33 #include "hwy/highway.h"
34 #include "hwy/nanobenchmark.h"
36 HWY_BEFORE_NAMESPACE();
37 namespace hwy {
38 namespace HWY_NAMESPACE {
40 // These templates are not found via ADL.
41 #if HWY_TARGET != HWY_SCALAR
42 using hwy::HWY_NAMESPACE::CombineShiftRightLanes;
43 #endif
45 class TwoArray {
46 public:
47 // Must be a multiple of the vector lane count * 8.
48 static size_t NumItems() { return 3456; }
50 TwoArray()
51 : a_(AllocateAligned<float>(NumItems() * 2)), b_(a_.get() + NumItems()) {
52 // = 1, but compiler doesn't know
53 const float init = static_cast<float>(Unpredictable1());
54 std::iota(a_.get(), a_.get() + NumItems(), init);
55 std::iota(b_, b_ + NumItems(), init);
58 protected:
59 AlignedFreeUniquePtr<float[]> a_;
60 float* b_;
63 // Measures durations, verifies results, prints timings.
64 template <class Benchmark>
65 void RunBenchmark(const char* caption) {
66 printf("%10s: ", caption);
67 const size_t kNumInputs = 1;
68 const size_t num_items = Benchmark::NumItems() * size_t(Unpredictable1());
69 const FuncInput inputs[kNumInputs] = {num_items};
70 Result results[kNumInputs];
72 Benchmark benchmark;
74 Params p;
75 p.verbose = false;
76 p.max_evals = 7;
77 p.target_rel_mad = 0.002;
78 const size_t num_results = MeasureClosure(
79 [&benchmark](const FuncInput input) { return benchmark(input); }, inputs,
80 kNumInputs, results, p);
81 if (num_results != kNumInputs) {
82 fprintf(stderr, "MeasureClosure failed.\n");
85 benchmark.Verify(num_items);
87 for (size_t i = 0; i < num_results; ++i) {
88 const double cycles_per_item =
89 results[i].ticks / static_cast<double>(results[i].input);
90 const double mad = results[i].variability * cycles_per_item;
91 printf("%6" PRIu64 ": %6.3f (+/- %5.3f)\n",
92 static_cast<uint64_t>(results[i].input), cycles_per_item, mad);
96 void Intro() {
97 const float in[16] = {1, 2, 3, 4, 5, 6};
98 float out[16];
99 const ScalableTag<float> d; // largest possible vector
100 for (size_t i = 0; i < 16; i += Lanes(d)) {
101 const auto vec = LoadU(d, in + i); // no alignment requirement
102 auto result = Mul(vec, vec);
103 result = Add(result, result); // can update if not const
104 StoreU(result, d, out + i);
106 printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]);
109 // BEGINNER: dot product
110 // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
111 class BenchmarkDot : public TwoArray {
112 public:
113 BenchmarkDot() : dot_{-1.0f} {}
115 FuncOutput operator()(const size_t num_items) {
116 const ScalableTag<float> d;
117 const size_t N = Lanes(d);
118 using V = decltype(Zero(d));
119 // Compiler doesn't make independent sum* accumulators, so unroll manually.
120 // We cannot use an array because V might be a sizeless type. For reasonable
121 // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
122 V sum0 = Zero(d);
123 V sum1 = Zero(d);
124 V sum2 = Zero(d);
125 V sum3 = Zero(d);
126 const float* const HWY_RESTRICT pa = &a_[0];
127 const float* const HWY_RESTRICT pb = b_;
128 for (size_t i = 0; i < num_items; i += 4 * N) {
129 const auto a0 = Load(d, pa + i + 0 * N);
130 const auto b0 = Load(d, pb + i + 0 * N);
131 sum0 = MulAdd(a0, b0, sum0);
132 const auto a1 = Load(d, pa + i + 1 * N);
133 const auto b1 = Load(d, pb + i + 1 * N);
134 sum1 = MulAdd(a1, b1, sum1);
135 const auto a2 = Load(d, pa + i + 2 * N);
136 const auto b2 = Load(d, pb + i + 2 * N);
137 sum2 = MulAdd(a2, b2, sum2);
138 const auto a3 = Load(d, pa + i + 3 * N);
139 const auto b3 = Load(d, pb + i + 3 * N);
140 sum3 = MulAdd(a3, b3, sum3);
142 // Reduction tree: sum of all accumulators by pairs into sum0.
143 sum0 = Add(sum0, sum1);
144 sum2 = Add(sum2, sum3);
145 sum0 = Add(sum0, sum2);
146 // Remember to store the result in `dot_` for verification; see `Verify`.
147 dot_ = ReduceSum(d, sum0);
148 // Return the result so that the benchmarking framework can ensure that the
149 // computation is not elided by the compiler.
150 return static_cast<FuncOutput>(dot_);
152 void Verify(size_t num_items) {
153 if (dot_ == -1.0f) {
154 fprintf(stderr, "Dot: must call Verify after benchmark");
155 abort();
158 const float expected =
159 std::inner_product(a_.get(), a_.get() + num_items, b_, 0.0f);
160 const float rel_err = std::abs(expected - dot_) / expected;
161 if (rel_err > 1.1E-6f) {
162 fprintf(stderr, "Dot: expected %e actual %e (%e)\n", expected, dot_,
163 rel_err);
164 abort();
168 private:
169 float dot_; // for Verify
172 // INTERMEDIATE: delta coding
173 // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
174 struct BenchmarkDelta : public TwoArray {
175 FuncOutput operator()(const size_t num_items) const {
176 #if HWY_TARGET == HWY_SCALAR
177 b_[0] = a_[0];
178 for (size_t i = 1; i < num_items; ++i) {
179 b_[i] = a_[i] - a_[i - 1];
181 #elif HWY_CAP_GE256
182 // Larger vectors are split into 128-bit blocks, easiest to use the
183 // unaligned load support to shift between them.
184 const ScalableTag<float> df;
185 const size_t N = Lanes(df);
186 size_t i;
187 b_[0] = a_[0];
188 for (i = 1; i < N; ++i) {
189 b_[i] = a_[i] - a_[i - 1];
191 for (; i < num_items; i += N) {
192 const auto a = Load(df, &a_[i]);
193 const auto shifted = LoadU(df, &a_[i - 1]);
194 Store(a - shifted, df, &b_[i]);
196 #else // 128-bit
197 // Slightly better than unaligned loads
198 const HWY_CAPPED(float, 4) df;
199 const size_t N = Lanes(df);
200 size_t i;
201 b_[0] = a_[0];
202 for (i = 1; i < N; ++i) {
203 b_[i] = a_[i] - a_[i - 1];
205 auto prev = Load(df, &a_[0]);
206 for (; i < num_items; i += Lanes(df)) {
207 const auto a = Load(df, &a_[i]);
208 const auto shifted = CombineShiftRightLanes<3>(df, a, prev);
209 prev = a;
210 Store(Sub(a, shifted), df, &b_[i]);
212 #endif
213 return static_cast<FuncOutput>(b_[num_items - 1]);
216 void Verify(size_t num_items) {
217 for (size_t i = 0; i < num_items; ++i) {
218 const float expected = (i == 0) ? a_[0] : a_[i] - a_[i - 1];
219 const float err = std::abs(expected - b_[i]);
220 if (err > 1E-6f) {
221 fprintf(stderr, "Delta: expected %e, actual %e\n", expected, b_[i]);
227 void RunBenchmarks() {
228 Intro();
229 printf("------------------------ %s\n", TargetName(HWY_TARGET));
230 RunBenchmark<BenchmarkDot>("dot");
231 RunBenchmark<BenchmarkDelta>("delta");
234 // NOLINTNEXTLINE(google-readability-namespace-comments)
235 } // namespace HWY_NAMESPACE
236 } // namespace hwy
237 HWY_AFTER_NAMESPACE();
239 #if HWY_ONCE
240 namespace hwy {
241 HWY_EXPORT(RunBenchmarks);
243 void Run() {
244 for (int64_t target : SupportedAndGeneratedTargets()) {
245 SetSupportedTargetsForTest(target);
246 HWY_DYNAMIC_DISPATCH(RunBenchmarks)();
248 SetSupportedTargetsForTest(0); // Reset the mask afterwards.
251 } // namespace hwy
253 int main(int /*argc*/, char** /*argv*/) {
254 hwy::Run();
255 return 0;
257 #endif // HWY_ONCE