tdf#147109: Optimize ScInterpreter::ScSubstitute
[LibreOffice.git] / sc / source / core / tool / arraysumAVX512.cxx
blob6a3235a58e2e2ba1b6ac3180c6d85ff0d4b780ca
1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /*
3 * This file is part of the LibreOffice project.
5 * This Source Code Form is subject to the terms of the Mozilla Public
6 * License, v. 2.0. If a copy of the MPL was not distributed with this
7 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 */
11 #define LO_ARRAYSUM_SPACE AVX512
12 #include "arraysum.hxx"
14 #include <arraysumfunctorinternal.hxx>
16 #include <tools/simd.hxx>
17 #include <tools/simdsupport.hxx>
19 #include <stdlib.h>
21 namespace sc::op
23 #ifdef LO_AVX512F_AVAILABLE
25 bool hasAVX512FCode() { return true; }
27 using namespace AVX512;
29 /** Kahan sum with AVX512.
31 static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
33 // Temporal parameter
34 __m512d t = _mm512_add_pd(sum, value);
35 // Absolute value of the total sum
36 __m512d asum = _mm512_abs_pd(sum);
37 // Absolute value of the value to add
38 __m512d avalue = _mm512_abs_pd(value);
39 // Compare the absolute values sum >= value
40 __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ);
41 // The following code has this form ( a - t + b)
42 // Case 1: a = sum b = value
43 // Case 2: a = value b = sum
44 __m512d a = _mm512_mask_blend_pd(mask, sum, value);
45 __m512d b = _mm512_mask_blend_pd(mask, value, sum);
46 err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b));
47 // Store result
48 sum = t;
51 /** Execute Kahan sum with AVX512.
53 KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
55 // Make sure we don't fall out of bounds.
56 // This works by sums of 8 terms.
57 // So the 8'th term is i+7
58 // If we iterate until nSize won't fall out of bounds
59 if (nSize > i + 7)
61 // Setup sums and errors as 0
62 __m512d sum = _mm512_setzero_pd();
63 __m512d err = _mm512_setzero_pd();
65 // Sum the stuff
66 for (; i + 7 < nSize; i += 8)
68 // Kahan sum
69 __m512d load = _mm512_loadu_pd(pCurrent);
70 sumAVX512(sum, err, load);
71 pCurrent += 8;
74 // Store result
75 static_assert(sizeof(double) == 8);
76 double sums[8];
77 double errs[8];
78 _mm512_storeu_pd(&sums[0], sum);
79 _mm512_storeu_pd(&errs[0], err);
81 // First Kahan & pairwise summation
82 // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
83 sumNeumanierNormal(sums[0], errs[0], sums[1]);
84 sumNeumanierNormal(sums[2], errs[2], sums[3]);
85 sumNeumanierNormal(sums[4], errs[4], sums[5]);
86 sumNeumanierNormal(sums[6], errs[6], sums[7]);
87 sumNeumanierNormal(sums[0], errs[0], errs[1]);
88 sumNeumanierNormal(sums[2], errs[2], errs[3]);
89 sumNeumanierNormal(sums[4], errs[4], errs[5]);
90 sumNeumanierNormal(sums[6], errs[6], errs[7]);
92 // Second Kahan & pairwise summation
93 // 0+2 4+6 -> 0, 4
94 sumNeumanierNormal(sums[0], errs[0], sums[2]);
95 sumNeumanierNormal(sums[4], errs[4], sums[6]);
96 sumNeumanierNormal(sums[0], errs[0], errs[2]);
97 sumNeumanierNormal(sums[4], errs[4], errs[6]);
99 // Third Kahan & pairwise summation
100 // 0+4 -> 0
101 sumNeumanierNormal(sums[0], errs[0], sums[4]);
102 sumNeumanierNormal(sums[0], errs[0], errs[4]);
104 // Return final result
105 return { sums[0], errs[0] };
107 return { 0.0, 0.0 };
110 #else // LO_AVX512F_AVAILABLE
112 bool hasAVX512FCode() { return false; }
114 KahanSumSimple executeAVX512F(size_t&, size_t, const double*) { abort(); }
116 #endif
118 } // end namespace sc::op
120 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */