Apply clang-tidy-8 readability-uppercase-literal-suffix
[gromacs.git] src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h
blob 81c3ff0ea2a668ac0aff474ea51d145a50c7b71a

/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <arm_neon.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_arm_neon_simd_float.h"

namespace gmx
{

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *      base,
                    const std::int32_t offset[],
                    SimdFloat *        v0,
                    SimdFloat *        v1,
                    SimdFloat *        v2,
                    SimdFloat *        v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);

    // Unfortunately we cannot use the beautiful Neon structured load
    // instructions since the data comes from four different memory locations.
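    // Instead, gather four rows and transpose them in registers: the two
    // vuzpq calls separate even/odd elements, and the two vtrnq calls then
    // complete a full 4x4 transpose of the rows r0..r3 loaded below:
    //   r0 = x0 y0 z0 w0            v0 = x0 x1 x2 x3
    //   r1 = x1 y1 z1 w1    ==>     v1 = y0 y1 y2 y3
    //   r2 = x2 y2 z2 w2            v2 = z0 z1 z2 z3
    //   r3 = x3 y3 z3 w3            v3 = w0 w1 w2 w3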
    float32x4x2_t t0 = vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
    float32x4x2_t t1 = vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0->simdInternal_ = t2.val[0];
    v1->simdInternal_ = t3.val[0];
    v2->simdInternal_ = t2.val[1];
    v3->simdInternal_ = t3.val[1];
}

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *      base,
                    const std::int32_t offset[],
                    SimdFloat *        v0,
                    SimdFloat *        v1)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 8 == 0);
    assert(align % 2 == 0);

    v0->simdInternal_ = vcombine_f32(vld1_f32(base + align * offset[0]),
                                     vld1_f32(base + align * offset[2]));
    v1->simdInternal_ = vcombine_f32(vld1_f32(base + align * offset[1]),
                                     vld1_f32(base + align * offset[3]));
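
    // At this point v0 = {x0 y0 x2 y2} and v1 = {x1 y1 x3 y3}; a single
    // transpose of adjacent element pairs sorts them into {x0 x1 x2 x3}
    // and {y0 y1 y2 y3}.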
    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);

    v0->simdInternal_ = tmp.val[0];
    v1->simdInternal_ = tmp.val[1];
}
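
// Best alignment to use for aligned pairs of float data.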
static const int c_simdBestPairAlignmentFloat = 2;

template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *      base,
                     const std::int32_t offset[],
                     SimdFloat *        v0,
                     SimdFloat *        v1,
                     SimdFloat *        v2)
{
    assert(std::size_t(offset) % 16 == 0);
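
    // Each vld1q_f32 below reads a full vector of four floats even though
    // only three per row are needed; the fourth transpose output
    // (t3.val[1]) is simply discarded.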
    float32x4x2_t t0 = vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
    float32x4x2_t t1 = vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0->simdInternal_ = t2.val[0];
    v1->simdInternal_ = t3.val[0];
    v2->simdInternal_ = t2.val[1];
}

template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *            base,
                       const std::int32_t offset[],
                       SimdFloat          v0,
                       SimdFloat          v1,
                       SimdFloat          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
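
    // tmp.val[0] = {x0 y0 x2 y2} and tmp.val[1] = {x1 y1 x3 y3}, so the
    // low/high halves are exactly the {x, y} pairs of the four rows; the
    // z components are then written one lane at a time.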
    vst1_f32(base + align * offset[0], vget_low_f32(tmp.val[0]));
    vst1_f32(base + align * offset[1], vget_low_f32(tmp.val[1]));
    vst1_f32(base + align * offset[2], vget_high_f32(tmp.val[0]));
    vst1_f32(base + align * offset[3], vget_high_f32(tmp.val[1]));

    vst1q_lane_f32(base + align * offset[0] + 2, v2.simdInternal_, 0);
    vst1q_lane_f32(base + align * offset[1] + 2, v2.simdInternal_, 1);
    vst1q_lane_f32(base + align * offset[2] + 2, v2.simdInternal_, 2);
    vst1q_lane_f32(base + align * offset[3] + 2, v2.simdInternal_, 3);
}

template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *            base,
                      const std::int32_t offset[],
                      SimdFloat          v0,
                      SimdFloat          v1,
                      SimdFloat          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    if (align < 4)
    {
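        // With fewer than four elements per row we must not touch memory
        // beyond each triplet, so increment {x, y} with 64-bit pair
        // load/store operations and the z component as a scalar.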
        float32x2_t   t0, t1, t2, t3;
        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

        t0 = vget_low_f32(tmp.val[0]);
        t1 = vget_low_f32(tmp.val[1]);
        t2 = vget_high_f32(tmp.val[0]);
        t3 = vget_high_f32(tmp.val[1]);

        t0 = vadd_f32(t0, vld1_f32(base + align * offset[0]));
        vst1_f32(base + align * offset[0], t0);
        base[align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);

        t1 = vadd_f32(t1, vld1_f32(base + align * offset[1]));
        vst1_f32(base + align * offset[1], t1);
        base[align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);

        t2 = vadd_f32(t2, vld1_f32(base + align * offset[2]));
        vst1_f32(base + align * offset[2], t2);
        base[align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);

        t3 = vadd_f32(t3, vld1_f32(base + align * offset[3]));
        vst1_f32(base + align * offset[3], t3);
        base[align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
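        // Transposing {v0, v1, v2} together with a zero vector yields one
        // full-width vector {x y z 0} per row, so the fourth element of
        // each row is incremented by zero (i.e. rewritten unchanged).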
        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
        float32x4_t   t4 = t2.val[0];
        float32x4_t   t5 = t3.val[0];
        float32x4_t   t6 = t2.val[1];
        float32x4_t   t7 = t3.val[1];

        vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0])));
        vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1])));
        vst1q_f32(base + align * offset[2], vaddq_f32(t6, vld1q_f32(base + align * offset[2])));
        vst1q_f32(base + align * offset[3], vaddq_f32(t7, vld1q_f32(base + align * offset[3])));
    }
}
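
// transposeScatterDecrU mirrors transposeScatterIncrU above, subtracting
// the transposed rows from memory instead of adding them.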
template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *            base,
                      const std::int32_t offset[],
                      SimdFloat          v0,
                      SimdFloat          v1,
                      SimdFloat          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    if (align < 4)
    {
        float32x2_t   t0, t1, t2, t3;
        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

        t0 = vget_low_f32(tmp.val[0]);
        t1 = vget_low_f32(tmp.val[1]);
        t2 = vget_high_f32(tmp.val[0]);
        t3 = vget_high_f32(tmp.val[1]);

        t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0);
        vst1_f32(base + align * offset[0], t0);
        base[align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);

        t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1);
        vst1_f32(base + align * offset[1], t1);
        base[align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);

        t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2);
        vst1_f32(base + align * offset[2], t2);
        base[align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);

        t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3);
        vst1_f32(base + align * offset[3], t3);
        base[align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
        float32x4_t   t4 = t2.val[0];
        float32x4_t   t5 = t3.val[0];
        float32x4_t   t6 = t2.val[1];
        float32x4_t   t7 = t3.val[1];

        vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4));
        vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5));
        vst1q_f32(base + align * offset[2], vsubq_f32(vld1q_f32(base + align * offset[2]), t6));
        vst1q_f32(base + align * offset[3], vsubq_f32(vld1q_f32(base + align * offset[3]), t7));
    }
}

static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat   scalar,
                        SimdFloat * triplets0,
                        SimdFloat * triplets1,
                        SimdFloat * triplets2)
{
    float32x2_t lo, hi;
    float32x4_t t0, t1, t2, t3;

    lo = vget_low_f32(scalar.simdInternal_);
    hi = vget_high_f32(scalar.simdInternal_);

    t0 = vdupq_lane_f32(lo, 0);
    t1 = vdupq_lane_f32(lo, 1);
    t2 = vdupq_lane_f32(hi, 0);
    t3 = vdupq_lane_f32(hi, 1);
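
    // t0..t3 hold {a a a a} .. {d d d d}; shifting adjacent pairs with
    // vextq_f32 produces the triplet expansion of scalar = {a b c d}:
    //   triplets0 = {a a a b}
    //   triplets1 = {b b c c}
    //   triplets2 = {c d d d}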
    triplets0->simdInternal_ = vextq_f32(t0, t1, 1);
    triplets1->simdInternal_ = vextq_f32(t1, t2, 2);
    triplets2->simdInternal_ = vextq_f32(t2, t3, 3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float * base,
                             SimdFInt32    offset,
                             SimdFloat *   v0,
                             SimdFloat *   v1,
                             SimdFloat *   v2,
                             SimdFloat *   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);
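
    // NEON has no gather instruction taking a vector of indices, so spill
    // the SIMD integer offsets to an aligned scratch array and reuse the
    // pointer-based transposing gather above.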
    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float * base,
                             SimdFInt32    offset,
                             SimdFloat *   v0,
                             SimdFloat *   v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float * base,
                              SimdFInt32    offset,
                              SimdFloat *   v0,
                              SimdFloat *   v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    v0->simdInternal_ = vcombine_f32(vld1_f32(base + align * ioffset[0]),
                                     vld1_f32(base + align * ioffset[2]));
    v1->simdInternal_ = vcombine_f32(vld1_f32(base + align * ioffset[1]),
                                     vld1_f32(base + align * ioffset[3]));
    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
    v0->simdInternal_ = tmp.val[0];
    v1->simdInternal_ = tmp.val[1];
}

static inline float gmx_simdcall
reduceIncr4ReturnSum(float *   m,
                     SimdFloat v0,
                     SimdFloat v1,
                     SimdFloat v2,
                     SimdFloat v3)
{
    assert(std::size_t(m) % 16 == 0);

    float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
    float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0.simdInternal_ = t2.val[0];
    v1.simdInternal_ = t3.val[0];
    v2.simdInternal_ = t2.val[1];
    v3.simdInternal_ = t3.val[1];
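
    // After the in-register 4x4 transpose, summing the four vectors leaves
    // the horizontal sum of each original input in its own lane of v0;
    // these four sums are added to m[0..3] and their total is returned.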
    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H