1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 #define LO_ARRAYSUM_SPACE AVX512
12 #include "arraysum.hxx"
14 #include <arraysumfunctorinternal.hxx>
16 #include <tools/simd.hxx>
17 #include <tools/simdsupport.hxx>
23 #ifdef LO_AVX512F_AVAILABLE
25 bool hasAVX512FCode() { return true; }
27 using namespace AVX512
;
29 /** Kahan sum with AVX512.
31 static inline void sumAVX512(__m512d
& sum
, __m512d
& err
, const __m512d
& value
)
34 __m512d t
= _mm512_add_pd(sum
, value
);
35 // Absolute value of the total sum
36 __m512d asum
= _mm512_abs_pd(sum
);
37 // Absolute value of the value to add
38 __m512d avalue
= _mm512_abs_pd(value
);
39 // Compare the absolute values sum >= value
40 __mmask8 mask
= _mm512_cmp_pd_mask(avalue
, asum
, _CMP_GE_OQ
);
41 // The following code has this form ( a - t + b)
42 // Case 1: a = sum b = value
43 // Case 2: a = value b = sum
44 __m512d a
= _mm512_mask_blend_pd(mask
, sum
, value
);
45 __m512d b
= _mm512_mask_blend_pd(mask
, value
, sum
);
46 err
= _mm512_add_pd(err
, _mm512_add_pd(_mm512_sub_pd(a
, t
), b
));
51 /** Execute Kahan sum with AVX512.
53 KahanSumSimple
executeAVX512F(size_t& i
, size_t nSize
, const double* pCurrent
)
55 // Make sure we don't fall out of bounds.
56 // This works by sums of 8 terms.
57 // So the 8'th term is i+7
58 // If we iterate until nSize won't fall out of bounds
61 // Setup sums and errors as 0
62 __m512d sum
= _mm512_setzero_pd();
63 __m512d err
= _mm512_setzero_pd();
66 for (; i
+ 7 < nSize
; i
+= 8)
69 __m512d load
= _mm512_loadu_pd(pCurrent
);
70 sumAVX512(sum
, err
, load
);
75 static_assert(sizeof(double) == 8);
78 _mm512_storeu_pd(&sums
[0], sum
);
79 _mm512_storeu_pd(&errs
[0], err
);
81 // First Kahan & pairwise summation
82 // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
83 sumNeumanierNormal(sums
[0], errs
[0], sums
[1]);
84 sumNeumanierNormal(sums
[2], errs
[2], sums
[3]);
85 sumNeumanierNormal(sums
[4], errs
[4], sums
[5]);
86 sumNeumanierNormal(sums
[6], errs
[6], sums
[7]);
87 sumNeumanierNormal(sums
[0], errs
[0], errs
[1]);
88 sumNeumanierNormal(sums
[2], errs
[2], errs
[3]);
89 sumNeumanierNormal(sums
[4], errs
[4], errs
[5]);
90 sumNeumanierNormal(sums
[6], errs
[6], errs
[7]);
92 // Second Kahan & pairwise summation
94 sumNeumanierNormal(sums
[0], errs
[0], sums
[2]);
95 sumNeumanierNormal(sums
[4], errs
[4], sums
[6]);
96 sumNeumanierNormal(sums
[0], errs
[0], errs
[2]);
97 sumNeumanierNormal(sums
[4], errs
[4], errs
[6]);
99 // Third Kahan & pairwise summation
101 sumNeumanierNormal(sums
[0], errs
[0], sums
[4]);
102 sumNeumanierNormal(sums
[0], errs
[0], errs
[4]);
104 // Return final result
105 return { sums
[0], errs
[0] };
110 #else // LO_AVX512F_AVAILABLE
112 bool hasAVX512FCode() { return false; }
114 KahanSumSimple
executeAVX512F(size_t&, size_t, const double*) { abort(); }
118 } // end namespace sc::op
120 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */