Split SIMD implementations into 4 files
[gromacs.git] / src / gromacs / simd / impl_x86_avx_256 / impl_x86_avx_256_simd_float.h
blobf473ac3752cbbbfc05caf056143233eb0d36a7b1
1 /*
2 * This file is part of the GROMACS molecular simulation package.
4 * Copyright (c) 2014,2015, by the GROMACS development team, led by
5 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
6 * and including many others, as listed in the AUTHORS file in the
7 * top-level source directory and at http://www.gromacs.org.
9 * GROMACS is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public License
11 * as published by the Free Software Foundation; either version 2.1
12 * of the License, or (at your option) any later version.
14 * GROMACS is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with GROMACS; if not, see
21 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * If you want to redistribute modifications to GROMACS, please
25 * consider that scientific software is very special. Version
26 * control is crucial - bugs must be traceable. We will be happy to
27 * consider code for inclusion in the official distribution, but
28 * derived work must not be called official GROMACS. Details are found
29 * in the README & COPYING files - if they are missing, get the
30 * official version at http://www.gromacs.org.
32 * To help us fund GROMACS development, we humbly ask that you cite
33 * the research papers on the package. Check out http://www.gromacs.org.
36 #ifndef GMX_SIMD_IMPL_X86_AVX_256_SIMD_FLOAT_H
37 #define GMX_SIMD_IMPL_X86_AVX_256_SIMD_FLOAT_H
39 #include "config.h"
41 #include <immintrin.h>
43 #include "gromacs/utility/real.h"
45 #include "impl_x86_avx_256_common.h"
/* ------------------------------------------------------------------ *
 *  Single precision SIMD implementation on 256-bit AVX registers.    *
 *                                                                    *
 *  Generic gmx_simd_*_f names are mapped either directly onto an     *
 *  AVX intrinsic, onto a short composition of intrinsics, or onto    *
 *  one of the *_avx_256 helper functions defined in this header.     *
 * ------------------------------------------------------------------ */

/* Floating-point vector type */
#define gmx_simd_float_t                 __m256

/* Loads, stores and constants */
#define gmx_simd_load_f                  _mm256_load_ps
#define gmx_simd_loadu_f                 _mm256_loadu_ps
#define gmx_simd_load1_f                 _mm256_broadcast_ss
#define gmx_simd_store_f                 _mm256_store_ps
#define gmx_simd_storeu_f                _mm256_storeu_ps
#define gmx_simd_set1_f                  _mm256_set1_ps
#define gmx_simd_setzero_f               _mm256_setzero_ps

/* Basic arithmetic */
#define gmx_simd_add_f                   _mm256_add_ps
#define gmx_simd_sub_f                   _mm256_sub_ps
#define gmx_simd_mul_f                   _mm256_mul_ps
#define gmx_simd_max_f                   _mm256_max_ps
#define gmx_simd_min_f                   _mm256_min_ps

/* Multiply-add family, composed here from a separate multiply followed
 * by an add or subtract:
 *   fmadd  =   a*b + c       fmsub  =   a*b - c
 *   fnmadd = -(a*b) + c      fnmsub = -(a*b) - c  (as 0 - fmadd)
 */
#define gmx_simd_fmadd_f(a, b, c)        _mm256_add_ps(_mm256_mul_ps(a, b), c)
#define gmx_simd_fmsub_f(a, b, c)        _mm256_sub_ps(_mm256_mul_ps(a, b), c)
#define gmx_simd_fnmadd_f(a, b, c)       _mm256_sub_ps(c, _mm256_mul_ps(a, b))
#define gmx_simd_fnmsub_f(a, b, c)       _mm256_sub_ps(_mm256_setzero_ps(), gmx_simd_fmadd_f(a, b, c))

/* Bitwise logic on floating-point registers */
#define gmx_simd_and_f                   _mm256_and_ps
#define gmx_simd_andnot_f                _mm256_andnot_ps
#define gmx_simd_or_f                    _mm256_or_ps
#define gmx_simd_xor_f                   _mm256_xor_ps

/* Sign handling: clear or flip the bits set in GMX_FLOAT_NEGZERO */
#define gmx_simd_fabs_f(x)               _mm256_andnot_ps(_mm256_set1_ps(GMX_FLOAT_NEGZERO), x)
#define gmx_simd_fneg_f(x)               _mm256_xor_ps(x, _mm256_set1_ps(GMX_FLOAT_NEGZERO))

/* Lookup-based approximations */
#define gmx_simd_rsqrt_f                 _mm256_rsqrt_ps
#define gmx_simd_rcp_f                   _mm256_rcp_ps

/* Rounding; the fraction is x minus its truncated value */
#define gmx_simd_round_f(x)              _mm256_round_ps(x, _MM_FROUND_NINT)
#define gmx_simd_trunc_f(x)              _mm256_round_ps(x, _MM_FROUND_TRUNC)
#define gmx_simd_fraction_f(x)           _mm256_sub_ps(x, gmx_simd_trunc_f(x))

/* Exponent/mantissa manipulation and horizontal sum: helper functions */
#define gmx_simd_get_exponent_f          gmx_simd_get_exponent_f_avx_256
#define gmx_simd_get_mantissa_f          gmx_simd_get_mantissa_f_avx_256
#define gmx_simd_set_exponent_f          gmx_simd_set_exponent_f_avx_256
#define gmx_simd_reduce_f                gmx_simd_reduce_f_avx_256

/* Integer type paired with gmx_simd_float_t, with its loads/stores and
 * float<->integer conversions.
 */
#define gmx_simd_fint32_t                __m256i
#define gmx_simd_load_fi(m)              _mm256_load_si256((__m256i const*)(m))
#define gmx_simd_loadu_fi(m)             _mm256_loadu_si256((__m256i const*)(m))
#define gmx_simd_store_fi(m, x)          _mm256_store_si256((__m256i *)(m), x)
#define gmx_simd_storeu_fi(m, x)         _mm256_storeu_si256((__m256i *)(m), x)
#define gmx_simd_set1_fi                 _mm256_set1_epi32
#define gmx_simd_setzero_fi              _mm256_setzero_si256
#define gmx_simd_cvt_f2i                 _mm256_cvtps_epi32
#define gmx_simd_cvtt_f2i                _mm256_cvttps_epi32
#define gmx_simd_cvt_i2f                 _mm256_cvtepi32_ps
/* (i)>>2 picks the 128-bit half, (i)&0x3 the 32-bit element inside it */
#define gmx_simd_extract_fi(x, i)        _mm_extract_epi32(_mm256_extractf128_si256(x, (i) >> 2), (i) & 0x3)

/* Operations on gmx_simd_fint32_t that this implementation does NOT provide:
 *   shift/logical: gmx_simd_slli_fi, gmx_simd_srli_fi, gmx_simd_and_fi,
 *                  gmx_simd_andnot_fi, gmx_simd_or_fi, gmx_simd_xor_fi
 *   arithmetic:    gmx_simd_add_fi, gmx_simd_sub_fi, gmx_simd_mul_fi
 */

/* Booleans, comparisons and selection on gmx_simd_float_t.
 * A boolean is a full-width mask held in a floating-point register.
 */
#define gmx_simd_fbool_t                 __m256
#define gmx_simd_cmpeq_f(a, b)           _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define gmx_simd_cmplt_f(a, b)           _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define gmx_simd_cmple_f(a, b)           _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define gmx_simd_and_fb                  _mm256_and_ps
#define gmx_simd_or_fb                   _mm256_or_ps
#define gmx_simd_anytrue_fb              _mm256_movemask_ps
#define gmx_simd_blendzero_f             _mm256_and_ps
#define gmx_simd_blendnotzero_f(a, sel)  _mm256_andnot_ps(sel, a)
#define gmx_simd_blendv_f                _mm256_blendv_ps

/* Boolean type for gmx_simd_fint32_t. Only the type is defined; the
 * following are NOT provided by this implementation:
 *   gmx_simd_cmpeq_fi, gmx_simd_cmplt_fi, gmx_simd_and_fib, gmx_simd_or_fib,
 *   gmx_simd_anytrue_fib, gmx_simd_blendzero_fi, gmx_simd_blendnotzero_fi,
 *   gmx_simd_blendv_fi
 */
#define gmx_simd_fibool_t                __m256i

/* Conversions between the two boolean types are pure register casts */
#define gmx_simd_cvt_fb2fib              _mm256_castps_si256
#define gmx_simd_cvt_fib2fb              _mm256_castsi256_ps
133 /*********************************************************
134 * SIMD SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
135 *********************************************************/
136 static gmx_inline __m256 gmx_simdcall
137 gmx_simd_get_exponent_f_avx_256(__m256 x)
139 const __m256 expmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7F800000));
140 const __m128i expbias = _mm_set1_epi32(127);
141 __m256i iexp256;
142 __m128i iexp128a, iexp128b;
144 iexp256 = _mm256_castps_si256(_mm256_and_ps(x, expmask));
145 iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
146 iexp128a = _mm256_castsi256_si128(iexp256);
147 iexp128a = _mm_srli_epi32(iexp128a, 23);
148 iexp128b = _mm_srli_epi32(iexp128b, 23);
149 iexp128a = _mm_sub_epi32(iexp128a, expbias);
150 iexp128b = _mm_sub_epi32(iexp128b, expbias);
151 iexp256 = _mm256_castsi128_si256(iexp128a);
152 iexp256 = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
153 return _mm256_cvtepi32_ps(iexp256);
156 static gmx_inline __m256 gmx_simdcall
157 gmx_simd_get_mantissa_f_avx_256(__m256 x)
159 const __m256 mantmask = _mm256_castsi256_ps(_mm256_set1_epi32(0x007FFFFF));
160 const __m256 one = _mm256_set1_ps(1.0);
162 x = _mm256_and_ps(x, mantmask);
163 return _mm256_or_ps(x, one);
166 static gmx_inline __m256 gmx_simdcall
167 gmx_simd_set_exponent_f_avx_256(__m256 x)
169 const __m128i expbias = _mm_set1_epi32(127);
170 __m256i iexp256;
171 __m128i iexp128a, iexp128b;
173 iexp256 = _mm256_cvtps_epi32(x);
174 iexp128b = _mm256_extractf128_si256(iexp256, 0x1);
175 iexp128a = _mm256_castsi256_si128(iexp256);
176 iexp128a = _mm_slli_epi32(_mm_add_epi32(iexp128a, expbias), 23);
177 iexp128b = _mm_slli_epi32(_mm_add_epi32(iexp128b, expbias), 23);
178 iexp256 = _mm256_castsi128_si256(iexp128a);
179 iexp256 = _mm256_insertf128_si256(iexp256, iexp128b, 0x1);
180 return _mm256_castsi256_ps(iexp256);
183 static gmx_inline float gmx_simdcall
184 gmx_simd_reduce_f_avx_256(__m256 a)
186 float f;
188 __m128 a0, a1;
189 a = _mm256_hadd_ps(a, a);
190 a = _mm256_hadd_ps(a, a);
191 a0 = _mm256_castps256_ps128(a);
192 a1 = _mm256_extractf128_ps(a, 0x1);
193 a0 = _mm_add_ss(a0, a1);
194 _mm_store_ss(&f, a0);
195 return f;
198 #endif /* GMX_SIMD_IMPL_X86_AVX_256_SIMD_FLOAT_H */