src/gromacs/simd/impl_x86_avx_256/impl_x86_avx_256_simd_float.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD_FLOAT_H
  37 #define GMX_SIMD_IMPL_X86_AVX_256_SIMD_FLOAT_H
  38
  39 #include "config.h"
  40
  41 #include <immintrin.h>
  42
  43 #include "gromacs/utility/real.h"
  44
  45 #include "impl_x86_avx_256_common.h"
  46
  47 /****************************************************
  48  *      SINGLE PRECISION SIMD IMPLEMENTATION        *
  49  ****************************************************/
  50 #define gmx_simd_float_t           __m256 /* SIMD float type: one 256-bit AVX register */
  51 #define gmx_simd_load_f            _mm256_load_ps /* load through the aligned-load intrinsic */
  52 #define gmx_simd_load1_f           _mm256_broadcast_ss /* load one float from memory into every element */
  53 #define gmx_simd_set1_f            _mm256_set1_ps /* set every element to the same scalar value */
  54 #define gmx_simd_store_f           _mm256_store_ps /* store through the aligned-store intrinsic */
  55 #define gmx_simd_loadu_f           _mm256_loadu_ps /* unaligned counterpart of gmx_simd_load_f */
  56 #define gmx_simd_storeu_f          _mm256_storeu_ps /* unaligned counterpart of gmx_simd_store_f */
  57 #define gmx_simd_setzero_f         _mm256_setzero_ps
  58 #define gmx_simd_add_f             _mm256_add_ps
  59 #define gmx_simd_sub_f             _mm256_sub_ps
  60 #define gmx_simd_mul_f             _mm256_mul_ps
  61 #define gmx_simd_fmadd_f(a, b, c)    _mm256_add_ps(_mm256_mul_ps(a, b), c) /* a*b + c, built from a separate multiply and add */
  62 #define gmx_simd_fmsub_f(a, b, c)    _mm256_sub_ps(_mm256_mul_ps(a, b), c) /* a*b - c, built from a separate multiply and subtract */
  63 #define gmx_simd_fnmadd_f(a, b, c)   _mm256_sub_ps(c, _mm256_mul_ps(a, b)) /* c - a*b */
  64 #define gmx_simd_fnmsub_f(a, b, c)   _mm256_sub_ps(_mm256_setzero_ps(), gmx_simd_fmadd_f(a, b, c)) /* 0 - (a*b + c) */
  65 #define gmx_simd_and_f             _mm256_and_ps
  66 #define gmx_simd_andnot_f          _mm256_andnot_ps
  67 #define gmx_simd_or_f              _mm256_or_ps
  68 #define gmx_simd_xor_f             _mm256_xor_ps
  69 #define gmx_simd_rsqrt_f           _mm256_rsqrt_ps /* hardware 1/sqrt(x) estimate; NOTE(review): reduced precision, callers presumably refine it - confirm */
  70 #define gmx_simd_rcp_f             _mm256_rcp_ps /* hardware 1/x estimate; NOTE(review): reduced precision, callers presumably refine it - confirm */
  71 #define gmx_simd_fabs_f(x)         _mm256_andnot_ps(_mm256_set1_ps(GMX_FLOAT_NEGZERO), x) /* clears the bits set in GMX_FLOAT_NEGZERO (presumably only the sign bit; defined outside this file) */
  72 #define gmx_simd_fneg_f(x)         _mm256_xor_ps(x, _mm256_set1_ps(GMX_FLOAT_NEGZERO)) /* flips the bits set in GMX_FLOAT_NEGZERO (presumably only the sign bit) */
  73 #define gmx_simd_max_f             _mm256_max_ps
  74 #define gmx_simd_min_f             _mm256_min_ps
  75 #define gmx_simd_round_f(x)        _mm256_round_ps(x, _MM_FROUND_NINT) /* round to nearest integer, result stays in float format */
  76 #define gmx_simd_trunc_f(x)        _mm256_round_ps(x, _MM_FROUND_TRUNC) /* round toward zero, result stays in float format */
  77 #define gmx_simd_fraction_f(x)     _mm256_sub_ps(x, gmx_simd_trunc_f(x)) /* x - trunc(x); note that x is expanded twice */
  78 #define gmx_simd_get_exponent_f    gmx_simd_get_exponent_f_avx_256 /* helper function defined later in this file */
  79 #define gmx_simd_get_mantissa_f    gmx_simd_get_mantissa_f_avx_256 /* helper function defined later in this file */
  80 #define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_avx_256 /* helper function defined later in this file */
  81 /* Integer datatype corresponding to float: gmx_simd_fint32_t. Only load/store, set, conversion and element extraction are defined for it here. */
  82 #define gmx_simd_fint32_t          __m256i
  83 #define gmx_simd_load_fi(m)        _mm256_load_si256((__m256i const*)(m)) /* m is cast to a __m256i pointer; aligned-load intrinsic */
  84 #define gmx_simd_set1_fi           _mm256_set1_epi32
  85 #define gmx_simd_store_fi(m, x)    _mm256_store_si256((__m256i *)(m), x) /* m is cast to a __m256i pointer; aligned-store intrinsic */
  86 #define gmx_simd_loadu_fi(m)       _mm256_loadu_si256((__m256i const*)(m)) /* unaligned counterpart of gmx_simd_load_fi */
  87 #define gmx_simd_storeu_fi(m, x)   _mm256_storeu_si256((__m256i *)(m), x) /* unaligned counterpart of gmx_simd_store_fi */
  88 #define gmx_simd_setzero_fi        _mm256_setzero_si256
  89 #define gmx_simd_cvt_f2i           _mm256_cvtps_epi32 /* float -> int32 using the rounding conversion intrinsic */
  90 #define gmx_simd_cvtt_f2i          _mm256_cvttps_epi32 /* float -> int32 using the truncating conversion intrinsic */
  91 #define gmx_simd_cvt_i2f           _mm256_cvtepi32_ps /* int32 -> float */
  92 #define gmx_simd_extract_fi(x, i)   _mm_extract_epi32(_mm256_extractf128_si256(x, (i)>>2), (i)&0x3) /* (i)>>2 picks the 128-bit half, (i)&0x3 the element inside it; NOTE(review): i presumably must be a compile-time constant - confirm */
  93 /* Integer logical ops on gmx_simd_fint32_t: none are defined by this implementation */
  94 /* gmx_simd_add_fi not supported     */
  95 /* gmx_simd_sub_fi not supported     */
  96 /* gmx_simd_mul_fi not supported     */
  97 /* gmx_simd_slli_fi not supported    */
  98 /* gmx_simd_srli_fi not supported    */
  99 /* gmx_simd_and_fi not supported     */
 100 /* gmx_simd_andnot_fi not supported  */
 101 /* gmx_simd_or_fi not supported      */
 102 /* gmx_simd_xor_fi not supported     */
 103 /* Integer arithmetic ops on gmx_simd_fint32_t: none are defined by this implementation */
 104 /* gmx_simd_add_fi not supported     */
 105 /* gmx_simd_sub_fi not supported     */
 106 /* gmx_simd_mul_fi not supported     */
 107 /* Boolean & comparison operations on gmx_simd_float_t. Booleans are stored in the same register type as the floats they are derived from. */
 108 #define gmx_simd_fbool_t           __m256
 109 #define gmx_simd_cmpeq_f(a, b)      _mm256_cmp_ps(a, b, _CMP_EQ_OQ) /* a == b, using the _CMP_EQ_OQ predicate */
 110 #define gmx_simd_cmplt_f(a, b)      _mm256_cmp_ps(a, b, _CMP_LT_OQ) /* a < b, using the _CMP_LT_OQ predicate */
 111 #define gmx_simd_cmple_f(a, b)      _mm256_cmp_ps(a, b, _CMP_LE_OQ) /* a <= b, using the _CMP_LE_OQ predicate */
 112 #define gmx_simd_and_fb            _mm256_and_ps
 113 #define gmx_simd_or_fb             _mm256_or_ps
 114 #define gmx_simd_anytrue_fb        _mm256_movemask_ps /* yields the integer bit mask from movemask; nonzero is taken to mean "some element is true" */
 115 #define gmx_simd_blendzero_f       _mm256_and_ps /* bitwise AND of value and selector */
 116 #define gmx_simd_blendnotzero_f(a, sel)  _mm256_andnot_ps(sel, a) /* (~sel) & a: note the swapped argument order in the intrinsic call */
 117 #define gmx_simd_blendv_f          _mm256_blendv_ps
 118 #define gmx_simd_reduce_f          gmx_simd_reduce_f_avx_256 /* helper function defined later in this file */
 119 /* Boolean & comparison operations on gmx_simd_fint32_t: only the type itself is defined */
 120 #define gmx_simd_fibool_t          __m256i
 121 /* gmx_simd_cmpeq_fi not supported        */
 122 /* gmx_simd_cmplt_fi not supported        */
 123 /* gmx_simd_and_fib not supported         */
 124 /* gmx_simd_or_fib not supported          */
 125 /* gmx_simd_anytrue_fib not supported     */
 126 /* gmx_simd_blendzero_fi not supported    */
 127 /* gmx_simd_blendnotzero_fi not supported    */
 128 /* gmx_simd_blendv_fi not supported       */
 129 /* Conversions between different booleans: plain register casts between the float and integer boolean types */
 130 #define gmx_simd_cvt_fb2fib        _mm256_castps_si256
 131 #define gmx_simd_cvt_fib2fb        _mm256_castsi256_ps
 132
 133 /*********************************************************
 134  * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 135  *********************************************************/
 136 static gmx_inline __m256 gmx_simdcall
 137 gmx_simd_get_exponent_f_avx_256(__m256 x)
 138 {
 139     const __m256  expmask      = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
 140     const __m128i expbias      = _mm_set1_epi32(127);
 141     __m256i       iexp256;
 142     __m128i       iexp128a, iexp128b;
 143
 144     iexp256   = _mm256_castps_si256(_mm256_and_ps(x, expmask));
 145     iexp128b  = _mm256_extractf128_si256(iexp256, 0x1);
 146     iexp128a  = _mm256_castsi256_si128(iexp256);
 147     iexp128a  = _mm_srli_epi32(iexp128a, 23);
 148     iexp128b  = _mm_srli_epi32(iexp128b, 23);
 149     iexp128a  = _mm_sub_epi32(iexp128a, expbias);
 150     iexp128b  = _mm_sub_epi32(iexp128b, expbias);
 151     iexp256   = _mm256_castsi128_si256(iexp128a);
 152     iexp256   = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
 153     return _mm256_cvtepi32_ps(iexp256);
 154 }
 155
 156 static gmx_inline __m256 gmx_simdcall
 157 gmx_simd_get_mantissa_f_avx_256(__m256 x)
 158 {
 159     const __m256 mantmask   = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
 160     const __m256 one        = _mm256_set1_ps(1.0);
 161
 162     x = _mm256_and_ps(x, mantmask);
 163     return _mm256_or_ps(x, one);
 164 }
 165
 166 static gmx_inline __m256 gmx_simdcall
 167 gmx_simd_set_exponent_f_avx_256(__m256 x)
 168 {
 169     const __m128i expbias      = _mm_set1_epi32(127);
 170     __m256i       iexp256;
 171     __m128i       iexp128a, iexp128b;
 172
 173     iexp256   = _mm256_cvtps_epi32(x);
 174     iexp128b  = _mm256_extractf128_si256(iexp256, 0x1);
 175     iexp128a  = _mm256_castsi256_si128(iexp256);
 176     iexp128a  = _mm_slli_epi32(_mm_add_epi32(iexp128a, expbias), 23);
 177     iexp128b  = _mm_slli_epi32(_mm_add_epi32(iexp128b, expbias), 23);
 178     iexp256   = _mm256_castsi128_si256(iexp128a);
 179     iexp256   = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
 180     return _mm256_castsi256_ps(iexp256);
 181 }
 182
 183 static gmx_inline float gmx_simdcall
 184 gmx_simd_reduce_f_avx_256(__m256 a)
 185 {
 186     float  f;
 187
 188     __m128 a0, a1;
 189     a  = _mm256_hadd_ps(a, a);
 190     a  = _mm256_hadd_ps(a, a);
 191     a0 = _mm256_castps256_ps128(a);
 192     a1 = _mm256_extractf128_ps(a, 0x1);
 193     a0 = _mm_add_ss(a0, a1);
 194     _mm_store_ss(&f, a0);
 195     return f;
 196 }
 197
 198 #endif /* GMX_SIMD_IMPL_X86_AVX_256_SIMD_FLOAT_H */