sc/source/core/tool/arraysumAVX512.cxx

   1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
   2 /*
   3  * This file is part of the LibreOffice project.
   4  *
   5  * This Source Code Form is subject to the terms of the Mozilla Public
   6  * License, v. 2.0. If a copy of the MPL was not distributed with this
   7  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8  *
   9  */
  10
  11 #define LO_ARRAYSUM_SPACE AVX512
  12 #include "arraysum.hxx"
  13
  14 #include <arraysumfunctorinternal.hxx>
  15
  16 #include <tools/simd.hxx>
  17 #include <tools/simdsupport.hxx>
  18
  19 #include <stdlib.h>
  20
  21 namespace sc::op
  22 {
  23 #ifdef LO_AVX512F_AVAILABLE
  24
  25 bool hasAVX512FCode() { return true; }
  26
  27 using namespace AVX512;
  28
  29 /** Kahan sum with AVX512.
  30   */
  31 static inline void sumAVX512(__m512d& sum, __m512d& err, const __m512d& value)
  32 {
  33     // Temporal parameter
  34     __m512d t = _mm512_add_pd(sum, value);
  35     // Absolute value of the total sum
  36     __m512d asum = _mm512_abs_pd(sum);
  37     // Absolute value of the value to add
  38     __m512d avalue = _mm512_abs_pd(value);
  39     // Compare the absolute values sum >= value
  40     __mmask8 mask = _mm512_cmp_pd_mask(avalue, asum, _CMP_GE_OQ);
  41     // The following code has this form ( a - t + b)
  42     // Case 1: a = sum b = value
  43     // Case 2: a = value b = sum
  44     __m512d a = _mm512_mask_blend_pd(mask, sum, value);
  45     __m512d b = _mm512_mask_blend_pd(mask, value, sum);
  46     err = _mm512_add_pd(err, _mm512_add_pd(_mm512_sub_pd(a, t), b));
  47     // Store result
  48     sum = t;
  49 }
  50
  51 /** Execute Kahan sum with AVX512.
  52   */
  53 KahanSumSimple executeAVX512F(size_t& i, size_t nSize, const double* pCurrent)
  54 {
  55     // Make sure we don't fall out of bounds.
  56     // This works by sums of 8 terms.
  57     // So the 8'th term is i+7
  58     // If we iterate until nSize won't fall out of bounds
  59     if (nSize > i + 7)
  60     {
  61         // Setup sums and errors as 0
  62         __m512d sum = _mm512_setzero_pd();
  63         __m512d err = _mm512_setzero_pd();
  64
  65         // Sum the stuff
  66         for (; i + 7 < nSize; i += 8)
  67         {
  68             // Kahan sum
  69             __m512d load = _mm512_loadu_pd(pCurrent);
  70             sumAVX512(sum, err, load);
  71             pCurrent += 8;
  72         }
  73
  74         // Store result
  75         static_assert(sizeof(double) == 8);
  76         double sums[8];
  77         double errs[8];
  78         _mm512_storeu_pd(&sums[0], sum);
  79         _mm512_storeu_pd(&errs[0], err);
  80
  81         // First Kahan & pairwise summation
  82         // 0+1 1+2 3+4 4+5 6+7 -> 0, 2, 4, 6
  83         sumNeumanierNormal(sums[0], errs[0], sums[1]);
  84         sumNeumanierNormal(sums[2], errs[2], sums[3]);
  85         sumNeumanierNormal(sums[4], errs[4], sums[5]);
  86         sumNeumanierNormal(sums[6], errs[6], sums[7]);
  87         sumNeumanierNormal(sums[0], errs[0], errs[1]);
  88         sumNeumanierNormal(sums[2], errs[2], errs[3]);
  89         sumNeumanierNormal(sums[4], errs[4], errs[5]);
  90         sumNeumanierNormal(sums[6], errs[6], errs[7]);
  91
  92         // Second Kahan & pairwise summation
  93         // 0+2 4+6 -> 0, 4
  94         sumNeumanierNormal(sums[0], errs[0], sums[2]);
  95         sumNeumanierNormal(sums[4], errs[4], sums[6]);
  96         sumNeumanierNormal(sums[0], errs[0], errs[2]);
  97         sumNeumanierNormal(sums[4], errs[4], errs[6]);
  98
  99         // Third Kahan & pairwise summation
 100         // 0+4 -> 0
 101         sumNeumanierNormal(sums[0], errs[0], sums[4]);
 102         sumNeumanierNormal(sums[0], errs[0], errs[4]);
 103
 104         // Return final result
 105         return { sums[0], errs[0] };
 106     }
 107     return { 0.0, 0.0 };
 108 }
 109
 110 #else // LO_AVX512F_AVAILABLE
 111
 112 bool hasAVX512FCode() { return false; }
 113
 114 KahanSumSimple executeAVX512F(size_t&, size_t, const double*) { abort(); }
 115
 116 #endif
 117
 118 } // end namespace sc::op
 119
 120 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */