// Copyright 2019 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "hwy/nanobenchmark.h"

#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS  // before inttypes.h
#endif

#include <inttypes.h>  // IWYU pragma: keep
#include <stdio.h>
#include <stdlib.h>
#include <time.h>  // clock_gettime

#include <algorithm>  // std::sort, std::find_if
#include <array>
#include <chrono>  //NOLINT
#include <limits>
#include <numeric>  // std::iota
#include <random>
#include <string>
#include <utility>  // std::pair
#include <vector>

#if defined(_WIN32) || defined(_WIN64)
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#endif

#if defined(__APPLE__)
#include <mach/mach.h>
#include <mach/mach_time.h>
#endif

#if defined(__HAIKU__)
#include <OS.h>
#endif

#if HWY_ARCH_PPC && defined(__GLIBC__)
#include <sys/platform/ppc.h>  // NOLINT __ppc_get_timebase_freq
#elif HWY_ARCH_X86
#if HWY_COMPILER_MSVC
#include <intrin.h>
#else
#include <cpuid.h>  // NOLINT
#endif  // HWY_COMPILER_MSVC
#endif  // HWY_ARCH_X86
// Ticks := platform-specific timer values (CPU cycles on x86). Must be
// unsigned to guarantee wraparound on overflow.
// Platform-specific tick count; unsigned so overflow wraps (see above).
using Ticks = uint64_t;
// Start/Stop return absolute timestamps and must be placed immediately before
// and after the region to measure. We provide separate Start/Stop functions
// because they use different fences.
//
// Background: RDTSC is not 'serializing'; earlier instructions may complete
// after it, and/or later instructions may complete before it. 'Fences' ensure
// regions' elapsed times are independent of such reordering. The only
// documented unprivileged serializing instruction is CPUID, which acts as a
// full fence (no reordering across it in either direction). Unfortunately
// the latency of CPUID varies wildly (perhaps made worse by not initializing
// its EAX input). Because it cannot reliably be deducted from the region's
// elapsed time, it must not be included in the region to measure (i.e.
// between the two RDTSC).
//
// The newer RDTSCP is sometimes described as serializing, but it actually
// only serves as a half-fence with release semantics. Although all
// instructions in the region will complete before the final timestamp is
// captured, subsequent instructions may leak into the region and increase the
// elapsed time. Inserting another fence after the final RDTSCP would prevent
// such reordering without affecting the measured region.
//
// Fortunately, such a fence exists. The LFENCE instruction is only documented
// to delay later loads until earlier loads are visible. However, Intel's
// reference manual says it acts as a full fence (waiting until all earlier
// instructions have completed, and delaying later instructions until it
// completes). AMD assigns the same behavior to MFENCE.
//
// We need a fence before the initial RDTSC to prevent earlier instructions
// from leaking into the region, and arguably another after RDTSC to avoid
// region instructions from completing before the timestamp is recorded.
// When surrounded by fences, the additional RDTSCP half-fence provides no
// benefit, so the initial timestamp can be recorded via RDTSC, which has
// lower overhead than RDTSCP because it does not read TSC_AUX. In summary,
// we define Start = LFENCE/RDTSC/LFENCE; Stop = RDTSCP/LFENCE.
//
// Using Start+Start leads to higher variance and overhead than Stop+Stop.
// However, Stop+Stop includes an LFENCE in the region measurements, which
// adds a delay dependent on earlier loads. The combination of Start+Stop
// is faster than Start+Start and more consistent than Stop+Stop because
// the first LFENCE already delayed subsequent loads before the measured
// region. This combination seems not to have been considered in prior work:
// http://akaros.cs.berkeley.edu/lxr/akaros/kern/arch/x86/rdtsc_test.c
//
// Note: performance counters can measure 'exact' instructions-retired or
// (unhalted) cycle counts. The RDPMC instruction is not serializing and also
// requires fences. Unfortunately, it is not accessible on all OSes and we
// prefer to avoid kernel-mode drivers. Performance counters are also affected
// by several under/over-count errata, so we use the TSC instead.
//
// Returns a 64-bit timestamp in unit of 'ticks'; to convert to seconds,
// divide by InvariantTicksPerSecond.
124 inline Ticks
Start() {
126 #if HWY_ARCH_PPC && defined(__GLIBC__)
127 asm volatile("mfspr %0, %1" : "=r"(t
) : "i"(268));
128 #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
129 // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
130 asm volatile("mrs %0, cntvct_el0" : "=r"(t
));
131 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
139 #elif HWY_ARCH_X86_64
148 // "memory" avoids reordering. rdx = TSC >> 32.
149 // "cc" = flags modified by SHL.
150 : "rdx", "memory", "cc");
152 asm volatile("rdtime %0" : "=r"(t
));
153 #elif defined(_WIN32) || defined(_WIN64)
154 LARGE_INTEGER counter
;
155 (void)QueryPerformanceCounter(&counter
);
156 t
= counter
.QuadPart
;
157 #elif defined(__APPLE__)
158 t
= mach_absolute_time();
159 #elif defined(__HAIKU__)
160 t
= system_time_nsecs(); // since boot
163 clock_gettime(CLOCK_MONOTONIC
, &ts
);
164 t
= static_cast<Ticks
>(ts
.tv_sec
* 1000000000LL + ts
.tv_nsec
);
169 // WARNING: on x86, caller must check HasRDTSCP before using this!
170 inline Ticks
Stop() {
172 #if HWY_ARCH_PPC && defined(__GLIBC__)
173 asm volatile("mfspr %0, %1" : "=r"(t
) : "i"(268));
174 #elif HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC
175 // pmccntr_el0 is privileged but cntvct_el0 is accessible in Linux and QEMU.
176 asm volatile("mrs %0, cntvct_el0" : "=r"(t
));
177 #elif HWY_ARCH_X86 && HWY_COMPILER_MSVC
184 #elif HWY_ARCH_X86_64
185 // Use inline asm because __rdtscp generates code to store TSC_AUX (ecx).
193 // "memory" avoids reordering. rcx = TSC_AUX. rdx = TSC >> 32.
194 // "cc" = flags modified by SHL.
195 : "rcx", "rdx", "memory", "cc");
204 namespace robust_statistics
{
206 // Sorts integral values in ascending order (e.g. for Mode). About 3x faster
207 // than std::sort for input distributions with very few unique values.
209 void CountingSort(T
* values
, size_t num_values
) {
210 // Unique values and their frequency (similar to flat_map).
211 using Unique
= std::pair
<T
, int>;
212 std::vector
<Unique
> unique
;
213 for (size_t i
= 0; i
< num_values
; ++i
) {
214 const T value
= values
[i
];
216 std::find_if(unique
.begin(), unique
.end(),
217 [value
](const Unique u
) { return u
.first
== value
; });
218 if (pos
== unique
.end()) {
219 unique
.push_back(std::make_pair(value
, 1));
225 // Sort in ascending order of value (pair.first).
226 std::sort(unique
.begin(), unique
.end());
228 // Write that many copies of each unique value to the array.
229 T
* HWY_RESTRICT p
= values
;
230 for (const auto& value_count
: unique
) {
231 std::fill(p
, p
+ value_count
.second
, value_count
.first
);
232 p
+= value_count
.second
;
234 NANOBENCHMARK_CHECK(p
== values
+ num_values
);
237 // @return i in [idx_begin, idx_begin + half_count) that minimizes
238 // sorted[i + half_count] - sorted[i].
239 template <typename T
>
240 size_t MinRange(const T
* const HWY_RESTRICT sorted
, const size_t idx_begin
,
241 const size_t half_count
) {
242 T min_range
= std::numeric_limits
<T
>::max();
245 for (size_t idx
= idx_begin
; idx
< idx_begin
+ half_count
; ++idx
) {
246 NANOBENCHMARK_CHECK(sorted
[idx
] <= sorted
[idx
+ half_count
]);
247 const T range
= sorted
[idx
+ half_count
] - sorted
[idx
];
248 if (range
< min_range
) {
257 // Returns an estimate of the mode by calling MinRange on successively
258 // halved intervals. "sorted" must be in ascending order. This is the
259 // Half Sample Mode estimator proposed by Bickel in "On a fast, robust
260 // estimator of the mode", with complexity O(N log N). The mode is less
261 // affected by outliers in highly-skewed distributions than the median.
262 // The averaging operation below assumes "T" is an unsigned integer type.
263 template <typename T
>
264 T
ModeOfSorted(const T
* const HWY_RESTRICT sorted
, const size_t num_values
) {
265 size_t idx_begin
= 0;
266 size_t half_count
= num_values
/ 2;
267 while (half_count
> 1) {
268 idx_begin
= MinRange(sorted
, idx_begin
, half_count
);
272 const T x
= sorted
[idx_begin
+ 0];
273 if (half_count
== 0) {
276 NANOBENCHMARK_CHECK(half_count
== 1);
277 const T average
= (x
+ sorted
[idx_begin
+ 1] + 1) / 2;
281 // Returns the mode. Side effect: sorts "values".
282 template <typename T
>
283 T
Mode(T
* values
, const size_t num_values
) {
284 CountingSort(values
, num_values
);
285 return ModeOfSorted(values
, num_values
);
288 template <typename T
, size_t N
>
289 T
Mode(T (&values
)[N
]) {
290 return Mode(&values
[0], N
);
293 // Returns the median value. Side effect: sorts "values".
294 template <typename T
>
295 T
Median(T
* values
, const size_t num_values
) {
296 NANOBENCHMARK_CHECK(!values
->empty());
297 std::sort(values
, values
+ num_values
);
298 const size_t half
= num_values
/ 2;
299 // Odd count: return middle
300 if (num_values
% 2) {
303 // Even count: return average of middle two.
304 return (values
[half
] + values
[half
- 1] + 1) / 2;
307 // Returns a robust measure of variability.
308 template <typename T
>
309 T
MedianAbsoluteDeviation(const T
* values
, const size_t num_values
,
311 NANOBENCHMARK_CHECK(num_values
!= 0);
312 std::vector
<T
> abs_deviations
;
313 abs_deviations
.reserve(num_values
);
314 for (size_t i
= 0; i
< num_values
; ++i
) {
315 const int64_t abs
= std::abs(static_cast<int64_t>(values
[i
]) -
316 static_cast<int64_t>(median
));
317 abs_deviations
.push_back(static_cast<T
>(abs
));
319 return Median(abs_deviations
.data(), num_values
);
322 } // namespace robust_statistics
327 // Measures the actual current frequency of Ticks. We cannot rely on the nominal
328 // frequency encoded in x86 BrandString because it is misleading on M1 Rosetta,
329 // and not reported by AMD. CPUID 0x15 is also not yet widely supported. Also
330 // used on RISC-V and aarch64.
331 HWY_MAYBE_UNUSED
double MeasureNominalClockRate() {
332 double max_ticks_per_sec
= 0.0;
333 // Arbitrary, enough to ignore 2 outliers without excessive init time.
334 for (int rep
= 0; rep
< 3; ++rep
) {
335 auto time0
= std::chrono::steady_clock::now();
336 using Time
= decltype(time0
);
337 const timer::Ticks ticks0
= timer::Start();
338 const Time time_min
= time0
+ std::chrono::milliseconds(10);
343 time1
= std::chrono::steady_clock::now();
344 // Ideally this would be Stop, but that requires RDTSCP on x86. To avoid
345 // another codepath, just use Start instead. now() presumably has its own
346 // fence-like behavior.
347 ticks1
= timer::Start(); // Do not use Stop, see comment above
348 if (time1
>= time_min
) break;
351 const double dticks
= static_cast<double>(ticks1
- ticks0
);
352 std::chrono::duration
<double, std::ratio
<1>> dtime
= time1
- time0
;
353 const double ticks_per_sec
= dticks
/ dtime
.count();
354 max_ticks_per_sec
= std::max(max_ticks_per_sec
, ticks_per_sec
);
356 return max_ticks_per_sec
;
361 void Cpuid(const uint32_t level
, const uint32_t count
,
362 uint32_t* HWY_RESTRICT abcd
) {
363 #if HWY_COMPILER_MSVC
365 __cpuidex(regs
, level
, count
);
366 for (int i
= 0; i
< 4; ++i
) {
374 __cpuid_count(level
, count
, a
, b
, c
, d
);
384 Cpuid(0x80000001U
, 0, abcd
); // Extended feature flags
385 return (abcd
[3] & (1u << 27)) != 0; // RDTSCP
388 std::string
BrandString() {
389 char brand_string
[49];
390 std::array
<uint32_t, 4> abcd
;
392 // Check if brand string is supported (it is on all reasonable Intel/AMD)
393 Cpuid(0x80000000U
, 0, abcd
.data());
394 if (abcd
[0] < 0x80000004U
) {
395 return std::string();
398 for (size_t i
= 0; i
< 3; ++i
) {
399 Cpuid(static_cast<uint32_t>(0x80000002U
+ i
), 0, abcd
.data());
400 CopyBytes
<sizeof(abcd
)>(&abcd
[0], brand_string
+ i
* 16); // not same size
402 brand_string
[48] = 0;
406 #endif // HWY_ARCH_X86
410 HWY_DLLEXPORT
double InvariantTicksPerSecond() {
411 #if HWY_ARCH_PPC && defined(__GLIBC__)
412 return static_cast<double>(__ppc_get_timebase_freq());
413 #elif HWY_ARCH_X86 || HWY_ARCH_RVV || (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC)
414 // We assume the x86 TSC is invariant; it is on all recent Intel/AMD CPUs.
415 static const double freq
= MeasureNominalClockRate();
417 #elif defined(_WIN32) || defined(_WIN64)
419 (void)QueryPerformanceFrequency(&freq
);
420 return static_cast<double>(freq
.QuadPart
);
421 #elif defined(__APPLE__)
422 // https://developer.apple.com/library/mac/qa/qa1398/_index.html
423 mach_timebase_info_data_t timebase
;
424 (void)mach_timebase_info(&timebase
);
425 return static_cast<double>(timebase
.denom
) / timebase
.numer
* 1E9
;
427 return 1E9
; // Haiku and clock_gettime return nanoseconds.
431 HWY_DLLEXPORT
double Now() {
432 static const double mul
= 1.0 / InvariantTicksPerSecond();
433 return static_cast<double>(timer::Start()) * mul
;
436 HWY_DLLEXPORT
uint64_t TimerResolution() {
438 bool can_use_stop
= platform::HasRDTSCP();
440 constexpr bool can_use_stop
= true;
443 // Nested loop avoids exceeding stack/L1 capacity.
444 timer::Ticks repetitions
[Params::kTimerSamples
];
445 for (size_t rep
= 0; rep
< Params::kTimerSamples
; ++rep
) {
446 timer::Ticks samples
[Params::kTimerSamples
];
448 for (size_t i
= 0; i
< Params::kTimerSamples
; ++i
) {
449 const timer::Ticks t0
= timer::Start();
450 const timer::Ticks t1
= timer::Stop(); // we checked HasRDTSCP above
451 samples
[i
] = t1
- t0
;
454 for (size_t i
= 0; i
< Params::kTimerSamples
; ++i
) {
455 const timer::Ticks t0
= timer::Start();
456 const timer::Ticks t1
= timer::Start(); // do not use Stop, see above
457 samples
[i
] = t1
- t0
;
460 repetitions
[rep
] = robust_statistics::Mode(samples
);
462 return robust_statistics::Mode(repetitions
);
465 } // namespace platform
468 static const timer::Ticks timer_resolution
= platform::TimerResolution();
470 // Estimates the expected value of "lambda" values with a variable number of
471 // samples until the variability "rel_mad" is less than "max_rel_mad".
472 template <class Lambda
>
473 timer::Ticks
SampleUntilStable(const double max_rel_mad
, double* rel_mad
,
474 const Params
& p
, const Lambda
& lambda
) {
475 // Choose initial samples_per_eval based on a single estimated duration.
476 timer::Ticks t0
= timer::Start();
478 timer::Ticks t1
= timer::Stop(); // Caller checks HasRDTSCP
479 timer::Ticks est
= t1
- t0
;
480 static const double ticks_per_second
= platform::InvariantTicksPerSecond();
481 const size_t ticks_per_eval
=
482 static_cast<size_t>(ticks_per_second
* p
.seconds_per_eval
);
483 size_t samples_per_eval
= est
== 0
484 ? p
.min_samples_per_eval
485 : static_cast<size_t>(ticks_per_eval
/ est
);
486 samples_per_eval
= HWY_MAX(samples_per_eval
, p
.min_samples_per_eval
);
488 std::vector
<timer::Ticks
> samples
;
489 samples
.reserve(1 + samples_per_eval
);
490 samples
.push_back(est
);
492 // Percentage is too strict for tiny differences, so also allow a small
493 // absolute "median absolute deviation".
494 const timer::Ticks max_abs_mad
= (timer_resolution
+ 99) / 100;
495 *rel_mad
= 0.0; // ensure initialized
497 for (size_t eval
= 0; eval
< p
.max_evals
; ++eval
, samples_per_eval
*= 2) {
498 samples
.reserve(samples
.size() + samples_per_eval
);
499 for (size_t i
= 0; i
< samples_per_eval
; ++i
) {
502 t1
= timer::Stop(); // Caller checks HasRDTSCP
503 samples
.push_back(t1
- t0
);
506 if (samples
.size() >= p
.min_mode_samples
) {
507 est
= robust_statistics::Mode(samples
.data(), samples
.size());
509 // For "few" (depends also on the variance) samples, Median is safer.
510 est
= robust_statistics::Median(samples
.data(), samples
.size());
512 NANOBENCHMARK_CHECK(est
!= 0);
514 // Median absolute deviation (mad) is a robust measure of 'variability'.
515 const timer::Ticks abs_mad
= robust_statistics::MedianAbsoluteDeviation(
516 samples
.data(), samples
.size(), est
);
517 *rel_mad
= static_cast<double>(abs_mad
) / static_cast<double>(est
);
519 if (*rel_mad
<= max_rel_mad
|| abs_mad
<= max_abs_mad
) {
521 printf("%6" PRIu64
" samples => %5" PRIu64
" (abs_mad=%4" PRIu64
522 ", rel_mad=%4.2f%%)\n",
523 static_cast<uint64_t>(samples
.size()),
524 static_cast<uint64_t>(est
), static_cast<uint64_t>(abs_mad
),
532 printf("WARNING: rel_mad=%4.2f%% still exceeds %4.2f%% after %6" PRIu64
534 *rel_mad
* 100.0, max_rel_mad
* 100.0,
535 static_cast<uint64_t>(samples
.size()));
540 using InputVec
= std::vector
<FuncInput
>;
542 // Returns vector of unique input values.
543 InputVec
UniqueInputs(const FuncInput
* inputs
, const size_t num_inputs
) {
544 InputVec
unique(inputs
, inputs
+ num_inputs
);
545 std::sort(unique
.begin(), unique
.end());
546 unique
.erase(std::unique(unique
.begin(), unique
.end()), unique
.end());
550 // Returns how often we need to call func for sufficient precision.
551 size_t NumSkip(const Func func
, const uint8_t* arg
, const InputVec
& unique
,
553 // Min elapsed ticks for any input.
554 timer::Ticks min_duration
= ~timer::Ticks(0);
556 for (const FuncInput input
: unique
) {
558 const timer::Ticks total
= SampleUntilStable(
559 p
.target_rel_mad
, &rel_mad
, p
,
560 [func
, arg
, input
]() { PreventElision(func(arg
, input
)); });
561 min_duration
= HWY_MIN(min_duration
, total
- timer_resolution
);
564 // Number of repetitions required to reach the target resolution.
565 const size_t max_skip
= p
.precision_divisor
;
566 // Number of repetitions given the estimated duration.
567 const size_t num_skip
=
570 : static_cast<size_t>((max_skip
+ min_duration
- 1) / min_duration
);
572 printf("res=%" PRIu64
" max_skip=%" PRIu64
" min_dur=%" PRIu64
573 " num_skip=%" PRIu64
"\n",
574 static_cast<uint64_t>(timer_resolution
),
575 static_cast<uint64_t>(max_skip
), static_cast<uint64_t>(min_duration
),
576 static_cast<uint64_t>(num_skip
));
581 // Replicates inputs until we can omit "num_skip" occurrences of an input.
582 InputVec
ReplicateInputs(const FuncInput
* inputs
, const size_t num_inputs
,
583 const size_t num_unique
, const size_t num_skip
,
586 if (num_unique
== 1) {
587 full
.assign(p
.subset_ratio
* num_skip
, inputs
[0]);
591 full
.reserve(p
.subset_ratio
* num_skip
* num_inputs
);
592 for (size_t i
= 0; i
< p
.subset_ratio
* num_skip
; ++i
) {
593 full
.insert(full
.end(), inputs
, inputs
+ num_inputs
);
596 std::shuffle(full
.begin(), full
.end(), rng
);
600 // Copies the "full" to "subset" in the same order, but with "num_skip"
601 // randomly selected occurrences of "input_to_skip" removed.
602 void FillSubset(const InputVec
& full
, const FuncInput input_to_skip
,
603 const size_t num_skip
, InputVec
* subset
) {
605 static_cast<size_t>(std::count(full
.begin(), full
.end(), input_to_skip
));
606 // Generate num_skip random indices: which occurrence to skip.
607 std::vector
<uint32_t> omit(count
);
608 std::iota(omit
.begin(), omit
.end(), 0);
609 // omit[] is the same on every call, but that's OK because they identify the
610 // Nth instance of input_to_skip, so the position within full[] differs.
612 std::shuffle(omit
.begin(), omit
.end(), rng
);
613 omit
.resize(num_skip
);
614 std::sort(omit
.begin(), omit
.end());
616 uint32_t occurrence
= ~0u; // 0 after preincrement
617 size_t idx_omit
= 0; // cursor within omit[]
618 size_t idx_subset
= 0; // cursor within *subset
619 for (const FuncInput next
: full
) {
620 if (next
== input_to_skip
) {
622 // Haven't removed enough already
623 if (idx_omit
< num_skip
) {
624 // This one is up for removal
625 if (occurrence
== omit
[idx_omit
]) {
631 if (idx_subset
< subset
->size()) {
632 (*subset
)[idx_subset
++] = next
;
635 NANOBENCHMARK_CHECK(idx_subset
== subset
->size());
636 NANOBENCHMARK_CHECK(idx_omit
== omit
.size());
637 NANOBENCHMARK_CHECK(occurrence
== count
- 1);
640 // Returns total ticks elapsed for all inputs.
641 timer::Ticks
TotalDuration(const Func func
, const uint8_t* arg
,
642 const InputVec
* inputs
, const Params
& p
,
643 double* max_rel_mad
) {
645 const timer::Ticks duration
=
646 SampleUntilStable(p
.target_rel_mad
, &rel_mad
, p
, [func
, arg
, inputs
]() {
647 for (const FuncInput input
: *inputs
) {
648 PreventElision(func(arg
, input
));
651 *max_rel_mad
= HWY_MAX(*max_rel_mad
, rel_mad
);
655 // (Nearly) empty Func for measuring timer overhead/resolution.
656 HWY_NOINLINE FuncOutput
EmptyFunc(const void* /*arg*/, const FuncInput input
) {
660 // Returns overhead of accessing inputs[] and calling a function; this will
661 // be deducted from future TotalDuration return values.
662 timer::Ticks
Overhead(const uint8_t* arg
, const InputVec
* inputs
,
665 // Zero tolerance because repeatability is crucial and EmptyFunc is fast.
666 return SampleUntilStable(0.0, &rel_mad
, p
, [arg
, inputs
]() {
667 for (const FuncInput input
: *inputs
) {
668 PreventElision(EmptyFunc(arg
, input
));
675 HWY_DLLEXPORT
int Unpredictable1() { return timer::Start() != ~0ULL; }
677 HWY_DLLEXPORT
size_t Measure(const Func func
, const uint8_t* arg
,
678 const FuncInput
* inputs
, const size_t num_inputs
,
679 Result
* results
, const Params
& p
) {
680 NANOBENCHMARK_CHECK(num_inputs
!= 0);
683 if (!platform::HasRDTSCP()) {
684 fprintf(stderr
, "CPU '%s' does not support RDTSCP, skipping benchmark.\n",
685 platform::BrandString().c_str());
690 const InputVec
& unique
= UniqueInputs(inputs
, num_inputs
);
692 const size_t num_skip
= NumSkip(func
, arg
, unique
, p
); // never 0
693 if (num_skip
== 0) return 0; // NumSkip already printed error message
694 // (slightly less work on x86 to cast from signed integer)
695 const float mul
= 1.0f
/ static_cast<float>(static_cast<int>(num_skip
));
697 const InputVec
& full
=
698 ReplicateInputs(inputs
, num_inputs
, unique
.size(), num_skip
, p
);
699 InputVec
subset(full
.size() - num_skip
);
701 const timer::Ticks overhead
= Overhead(arg
, &full
, p
);
702 const timer::Ticks overhead_skip
= Overhead(arg
, &subset
, p
);
703 if (overhead
< overhead_skip
) {
704 fprintf(stderr
, "Measurement failed: overhead %" PRIu64
" < %" PRIu64
"\n",
705 static_cast<uint64_t>(overhead
),
706 static_cast<uint64_t>(overhead_skip
));
711 printf("#inputs=%5" PRIu64
",%5" PRIu64
" overhead=%5" PRIu64
",%5" PRIu64
713 static_cast<uint64_t>(full
.size()),
714 static_cast<uint64_t>(subset
.size()),
715 static_cast<uint64_t>(overhead
),
716 static_cast<uint64_t>(overhead_skip
));
719 double max_rel_mad
= 0.0;
720 const timer::Ticks total
= TotalDuration(func
, arg
, &full
, p
, &max_rel_mad
);
722 for (size_t i
= 0; i
< unique
.size(); ++i
) {
723 FillSubset(full
, unique
[i
], num_skip
, &subset
);
724 const timer::Ticks total_skip
=
725 TotalDuration(func
, arg
, &subset
, p
, &max_rel_mad
);
727 if (total
< total_skip
) {
728 fprintf(stderr
, "Measurement failed: total %" PRIu64
" < %" PRIu64
"\n",
729 static_cast<uint64_t>(total
), static_cast<uint64_t>(total_skip
));
733 const timer::Ticks duration
=
734 (total
- overhead
) - (total_skip
- overhead_skip
);
735 results
[i
].input
= unique
[i
];
736 results
[i
].ticks
= static_cast<float>(duration
) * mul
;
737 results
[i
].variability
= static_cast<float>(max_rel_mad
);
740 return unique
.size();