1 // Copyright 2019 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
16 #ifndef __STDC_FORMAT_MACROS
17 #define __STDC_FORMAT_MACROS // before inttypes.h
19 #include <inttypes.h> // IWYU pragma: keep
21 #include <stdlib.h> // abort
23 #include <cmath> // std::abs
25 #include <numeric> // std::iota, std::inner_product
27 #undef HWY_TARGET_INCLUDE
28 #define HWY_TARGET_INCLUDE "hwy/examples/benchmark.cc"
29 #include "hwy/foreach_target.h" // IWYU pragma: keep
31 // Must come after foreach_target.h to avoid redefinition errors.
32 #include "hwy/aligned_allocator.h"
33 #include "hwy/highway.h"
34 #include "hwy/nanobenchmark.h"
36 HWY_BEFORE_NAMESPACE();
38 namespace HWY_NAMESPACE
{
40 // These templates are not found via ADL.
41 #if HWY_TARGET != HWY_SCALAR
42 using hwy::HWY_NAMESPACE::CombineShiftRightLanes
;
// Item count of each array (the constructor allocates twice this many floats).
// Must be a multiple of the vector lane count * 8.
static size_t NumItems() {
  constexpr size_t kItemsPerArray = 3456;
  return kItemsPerArray;
}
// Constructor member-initializer list and body. The declarator line is not
// part of this excerpt (presumably the TwoArray base class used by the
// benchmarks further down - confirm). One AllocateAligned call for
// 2 * NumItems() floats backs both arrays: a_ holds the allocation and b_ is
// set to point NumItems() elements into it, i.e. at its second half.
51 : a_(AllocateAligned
<float>(NumItems() * 2)), b_(a_
.get() + NumItems()) {
52 // = 1, but compiler doesn't know
53 const float init
= static_cast<float>(Unpredictable1());
// Fill each half with consecutive values init, init + 1, init + 2, ...
54 std::iota(a_
.get(), a_
.get() + NumItems(), init
);
55 std::iota(b_
, b_
+ NumItems(), init
);
// Holds the result of AllocateAligned from the initializer list above; b_
// (declared on a line not present in this excerpt) points into the same
// allocation.
59 AlignedFreeUniquePtr
<float[]> a_
;
63 // Measures durations, verifies results, prints timings.
// Benchmark must provide a static NumItems(), be callable with a FuncInput,
// and offer Verify(size_t); all three are used below. `caption` is printed as
// a right-aligned, 10-character-wide label in front of the timings.
64 template <class Benchmark
>
65 void RunBenchmark(const char* caption
) {
66 printf("%10s: ", caption
);
67 const size_t kNumInputs
= 1;
// Unpredictable1() is described elsewhere in this file as "= 1, but compiler
// doesn't know", so num_items equals NumItems() at runtime without being a
// compile-time constant.
68 const size_t num_items
= Benchmark::NumItems() * size_t(Unpredictable1());
69 const FuncInput inputs
[kNumInputs
] = {num_items
};
70 Result results
[kNumInputs
];
// `benchmark` and `p` are declared on lines not present in this excerpt.
// NOTE(review): target_rel_mad is presumably the relative variability at which
// the measurement loop may stop early - confirm against nanobenchmark.h.
77 p
.target_rel_mad
= 0.002;
// The lambda forwards each input to the benchmark object. The return value is
// used below as the number of valid entries in `results`.
78 const size_t num_results
= MeasureClosure(
79 [&benchmark
](const FuncInput input
) { return benchmark(input
); }, inputs
,
80 kNumInputs
, results
, p
);
81 if (num_results
!= kNumInputs
) {
82 fprintf(stderr
, "MeasureClosure failed.\n");
// Check the value the benchmark computed for this item count.
// NOTE(review): no early exit after the failure message is visible in this
// excerpt, so Verify appears to run even when measurement failed - confirm.
85 benchmark
.Verify(num_items
);
// One output row per measured input: the input size, ticks per item, and the
// spread of the measurement expressed in the same unit.
87 for (size_t i
= 0; i
< num_results
; ++i
) {
88 const double cycles_per_item
=
89 results
[i
].ticks
/ static_cast<double>(results
[i
].input
);
// variability is multiplied by cycles_per_item, i.e. it is treated as a
// relative quantity and converted to cycles per item here.
90 const double mad
= results
[i
].variability
* cycles_per_item
;
91 printf("%6" PRIu64
": %6.3f (+/- %5.3f)\n",
92 static_cast<uint64_t>(results
[i
].input
), cycles_per_item
, mad
);
// Small worked example: computes 2 * x^2 for 16 floats, processing Lanes(d)
// elements per iteration, then prints the input and result at index 2.
// The enclosing function header and the declaration of `out` are on lines not
// present in this excerpt.
// Only the first six values are listed; the remaining ten elements of `in`
// are zero-initialized.
97 const float in
[16] = {1, 2, 3, 4, 5, 6};
99 const ScalableTag
<float> d
; // largest possible vector
// NOTE(review): the loop assumes Lanes(d) divides 16; with more than 16 lanes
// the LoadU/StoreU below would reach past the end of the arrays - confirm
// this cannot happen on the supported targets.
100 for (size_t i
= 0; i
< 16; i
+= Lanes(d
)) {
101 const auto vec
= LoadU(d
, in
+ i
); // no alignment requirement
// x * x, then doubled by adding the product to itself.
102 auto result
= Mul(vec
, vec
);
103 result
= Add(result
, result
); // can update if not const
104 StoreU(result
, d
, out
+ i
);
106 printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in
[2], out
[2]);
109 // BEGINNER: dot product
110 // 0.4 cyc/float = bronze, 0.25 = silver, 0.15 = gold!
// Computes the dot product of the two arrays a_ and b_ with four independent
// vector accumulators and records it in dot_, which Verify compares against
// std::inner_product.
111 class BenchmarkDot
: public TwoArray
{
// dot_ starts at -1.0f.
// NOTE(review): presumably a "not yet computed" marker tested at the top of
// Verify (that test is not present in this excerpt) - confirm.
113 BenchmarkDot() : dot_
{-1.0f
} {}
// Returns the dot product of the first num_items elements of a_ and b_.
// The main loop advances by 4 * N with no remainder handling, so num_items
// has to be a multiple of 4 * N (N = number of float lanes in one vector).
115 FuncOutput
operator()(const size_t num_items
) {
116 const ScalableTag
<float> d
;
117 const size_t N
= Lanes(d
);
118 using V
= decltype(Zero(d
));
119 // Compiler doesn't make independent sum* accumulators, so unroll manually.
120 // We cannot use an array because V might be a sizeless type. For reasonable
121 // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
// sum0..sum3 are declared on lines not present in this excerpt (presumably of
// type V and starting at zero - confirm).
126 const float* const HWY_RESTRICT pa
= &a_
[0];
127 const float* const HWY_RESTRICT pb
= b_
;
// Each iteration consumes four vectors from each array; accumulator k only
// depends on its own previous value, keeping the four MulAdd chains separate.
128 for (size_t i
= 0; i
< num_items
; i
+= 4 * N
) {
129 const auto a0
= Load(d
, pa
+ i
+ 0 * N
);
130 const auto b0
= Load(d
, pb
+ i
+ 0 * N
);
131 sum0
= MulAdd(a0
, b0
, sum0
);
132 const auto a1
= Load(d
, pa
+ i
+ 1 * N
);
133 const auto b1
= Load(d
, pb
+ i
+ 1 * N
);
134 sum1
= MulAdd(a1
, b1
, sum1
);
135 const auto a2
= Load(d
, pa
+ i
+ 2 * N
);
136 const auto b2
= Load(d
, pb
+ i
+ 2 * N
);
137 sum2
= MulAdd(a2
, b2
, sum2
);
138 const auto a3
= Load(d
, pa
+ i
+ 3 * N
);
139 const auto b3
= Load(d
, pb
+ i
+ 3 * N
);
140 sum3
= MulAdd(a3
, b3
, sum3
);
142 // Reduction tree: sum of all accumulators by pairs into sum0.
143 sum0
= Add(sum0
, sum1
);
144 sum2
= Add(sum2
, sum3
);
145 sum0
= Add(sum0
, sum2
);
146 // Remember to store the result in `dot_` for verification; see `Verify`.
147 dot_
= ReduceSum(d
, sum0
);
148 // Return the result so that the benchmarking framework can ensure that the
149 // computation is not elided by the compiler.
150 return static_cast<FuncOutput
>(dot_
);
// Recomputes the dot product of the first num_items elements with
// std::inner_product and reports when the relative error of dot_ exceeds
// 1.1E-6f. The condition guarding the first message and whatever follows each
// message are on lines not present in this excerpt.
152 void Verify(size_t num_items
) {
// NOTE(review): this message has no trailing newline, unlike the other
// diagnostics in this file - consider appending "\n".
154 fprintf(stderr
, "Dot: must call Verify after benchmark");
158 const float expected
=
159 std::inner_product(a_
.get(), a_
.get() + num_items
, b_
, 0.0f
);
// Relative error; this divides by `expected`, so it relies on the reference
// sum being non-zero.
160 const float rel_err
= std::abs(expected
- dot_
) / expected
;
161 if (rel_err
> 1.1E-6f
) {
162 fprintf(stderr
, "Dot: expected %e actual %e (%e)\n", expected
, dot_
,
169 float dot_
; // for Verify
172 // INTERMEDIATE: delta coding
173 // 1.0 cycles/float = bronze, 0.7 = silver, 0.4 = gold!
// Writes the difference between neighbouring elements of a_ into b_:
// b_[i] = a_[i] - a_[i - 1] for i >= 1.
174 struct BenchmarkDelta
: public TwoArray
{
// Computes the deltas for the first num_items elements and returns the last
// one. Three variants of the same computation follow; only the first
// preprocessor condition is visible here, the directives separating the other
// two (and the declarations of `i` they use) are not present in this excerpt.
175 FuncOutput
operator()(const size_t num_items
) const {
176 #if HWY_TARGET == HWY_SCALAR
// Variant 1: plain scalar loop.
178 for (size_t i
= 1; i
< num_items
; ++i
) {
179 b_
[i
] = a_
[i
] - a_
[i
- 1];
// Variant 2: full-width vectors.
182 // Larger vectors are split into 128-bit blocks, easiest to use the
183 // unaligned load support to shift between them.
184 const ScalableTag
<float> df
;
185 const size_t N
= Lanes(df
);
// Scalar prologue for the first vector's worth of elements, so the vector
// loop below starts at i == N and can read a_[i - 1].
188 for (i
= 1; i
< N
; ++i
) {
189 b_
[i
] = a_
[i
] - a_
[i
- 1];
// Advances by N with no remainder handling, so num_items has to be a
// multiple of N.
191 for (; i
< num_items
; i
+= N
) {
192 const auto a
= Load(df
, &a_
[i
]);
// The "previous element" vector is read with an unaligned load one element
// earlier.
193 const auto shifted
= LoadU(df
, &a_
[i
- 1]);
// NOTE(review): uses operator- on vectors, whereas the other vector variant
// below calls Sub(a, shifted); consider Sub here too for consistency -
// confirm whether operators are available on every target this path serves.
194 Store(a
- shifted
, df
, &b_
[i
]);
// Variant 3: vectors capped at four float lanes.
197 // Slightly better than unaligned loads
198 const HWY_CAPPED(float, 4) df
;
199 const size_t N
= Lanes(df
);
// Same scalar prologue as in the previous variant.
202 for (i
= 1; i
< N
; ++i
) {
203 b_
[i
] = a_
[i
] - a_
[i
- 1];
205 auto prev
= Load(df
, &a_
[0]);
206 for (; i
< num_items
; i
+= Lanes(df
)) {
207 const auto a
= Load(df
, &a_
[i
]);
// NOTE(review): presumably yields the last lane of `prev` followed by the
// first three lanes of `a`, i.e. `a` shifted by one element - confirm against
// the Highway reference.
208 const auto shifted
= CombineShiftRightLanes
<3>(df
, a
, prev
);
// NOTE(review): no update of `prev` is visible inside this loop in this
// excerpt; confirm the full file assigns prev = a before the next iteration.
210 Store(Sub(a
, shifted
), df
, &b_
[i
]);
// Returns the final delta (reads b_[num_items - 1], so num_items must be >= 1).
213 return static_cast<FuncOutput
>(b_
[num_items
- 1]);
// Recomputes every delta in scalar code (element 0 is expected to equal a_[0])
// and prints a diagnostic for mismatches. The condition on `err` that guards
// the message is on a line not present in this excerpt.
216 void Verify(size_t num_items
) {
217 for (size_t i
= 0; i
< num_items
; ++i
) {
218 const float expected
= (i
== 0) ? a_
[0] : a_
[i
] - a_
[i
- 1];
219 const float err
= std::abs(expected
- b_
[i
]);
221 fprintf(stderr
, "Delta: expected %e, actual %e\n", expected
, b_
[i
]);
// Per-target entry point: prints a banner with the name of the target this
// translation pass was compiled for (HWY_TARGET), then runs the dot-product
// and delta benchmarks. Any statement before the banner and the closing brace
// are on lines not present in this excerpt.
227 void RunBenchmarks() {
229 printf("------------------------ %s\n", TargetName(HWY_TARGET
));
230 RunBenchmark
<BenchmarkDot
>("dot");
231 RunBenchmark
<BenchmarkDelta
>("delta");
234 // NOLINTNEXTLINE(google-readability-namespace-comments)
235 } // namespace HWY_NAMESPACE
237 HWY_AFTER_NAMESPACE();
// NOTE(review): presumably registers the per-target RunBenchmarks versions so
// that HWY_DYNAMIC_DISPATCH below can select one - confirm against the Highway
// dynamic-dispatch documentation.
241 HWY_EXPORT(RunBenchmarks
);
// Runs the benchmarks once for every entry of SupportedAndGeneratedTargets():
// each iteration passes one target to SetSupportedTargetsForTest and then
// calls RunBenchmarks through dynamic dispatch. The enclosing function's
// header is on a line not present in this excerpt.
244 for (int64_t target
: SupportedAndGeneratedTargets()) {
245 SetSupportedTargetsForTest(target
);
246 HWY_DYNAMIC_DISPATCH(RunBenchmarks
)();
248 SetSupportedTargetsForTest(0); // Reset the mask afterwards.
253 int main(int /*argc*/, char** /*argv*/) {