Replace boost::shared_ptr with std::shared_ptr
[gromacs.git] / src / gromacs / simd / impl_arm_neon_asimd / impl_arm_neon_asimd.h
blob700e87e7fc457793bfd864b37e7ea57bd36e4375
/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
39 #include <math.h>
41 #include <arm_neon.h>
/* ARM (AArch64) NEON Advanced SIMD instruction wrappers
 *
 * Please see documentation in gromacs/simd/simd.h for defines.
 */
48 /* Inherit single-precision and integer part from 32-bit arm */
49 #include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
/* Capability overrides relative to the 32-bit ARM NEON header included
 * above: this implementation provides double precision and the 32-bit
 * integer type that accompanies it. Every flag is #undef'ed first so the
 * previous definition is replaced instead of redefined.
 */
#undef  GMX_SIMD_HAVE_DOUBLE
#define GMX_SIMD_HAVE_DOUBLE                1
#undef  GMX_SIMD_HAVE_DINT32
#define GMX_SIMD_HAVE_DINT32                1
#undef  GMX_SIMD_HAVE_DINT32_EXTRACT
#define GMX_SIMD_HAVE_DINT32_EXTRACT        1
#undef  GMX_SIMD_HAVE_DINT32_LOGICAL
#define GMX_SIMD_HAVE_DINT32_LOGICAL        1
#undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
#define GMX_SIMD_HAVE_DINT32_ARITHMETICS    1

/* Implementation details: number of elements per SIMD variable */
#define GMX_SIMD_DOUBLE_WIDTH               2
#define GMX_SIMD_DINT32_WIDTH               2
/* NEON ASIMD always has FMA support, so make sure we use that for single too.
 *
 * The vfma/vfms intrinsics take the accumulator as their FIRST argument
 * (vfmaq_f32(c, b, a) = c + b*a, vfmsq_f32(c, b, a) = c - b*a), which is why
 * the macro arguments appear in reversed order in the expansions.
 */
#undef  gmx_simd_fmadd_f
#define gmx_simd_fmadd_f(a, b, c)     vfmaq_f32(c, b, a)              /*  a*b + c */
#undef  gmx_simd_fmsub_f
#define gmx_simd_fmsub_f(a, b, c)     vnegq_f32(vfmsq_f32(c, b, a))   /*  a*b - c */
#undef  gmx_simd_fnmadd_f
#define gmx_simd_fnmadd_f(a, b, c)    vfmsq_f32(c, b, a)              /* -a*b + c */
#undef  gmx_simd_fnmsub_f
#define gmx_simd_fnmsub_f(a, b, c)    vnegq_f32(vfmaq_f32(c, b, a))   /* -a*b - c */
/* The rounding instructions were actually added already in ARMv8, but most
 * compilers did not add intrinsics for them. Make sure we use them for single
 * precision too when enabling NEON Advanced SIMD.
 *
 * vrndnq_f32 rounds to nearest (ties to even); vrndq_f32 rounds toward zero.
 */
#undef  gmx_simd_round_f
#define gmx_simd_round_f(x)           vrndnq_f32(x)
#undef  gmx_simd_trunc_f
#define gmx_simd_trunc_f(x)           vrndq_f32(x)
/* NEON Advanced SIMD has a real rounding conversion instruction:
 * vcvtnq_s32_f32 converts float to signed 32-bit integer, rounding to
 * nearest, so no separate rounding step is needed.
 */
#undef  gmx_simd_cvt_f2i
#define gmx_simd_cvt_f2i(x)           vcvtnq_s32_f32(x)
/* Since we redefine rounding/conversion-with-rounding, make
 * sure we use the new operations by redefining the routine
 * to set the exponent too. The replacement helper is defined
 * further down in this file.
 */
#undef  gmx_simd_set_exponent_f
#define gmx_simd_set_exponent_f       gmx_simd_set_exponent_f_arm_neon_asimd
/* We can do more efficient reduce with vector pairwise arithmetic;
 * forward to the helper function defined further down in this file.
 */
#undef  gmx_simd_reduce_f
#define gmx_simd_reduce_f(a)          gmx_simd_reduce_f_arm_neon_asimd(a)
/* Pick the largest unsigned integer as a shortcut for any-true:
 * vmaxvq_u32 returns the maximum over all four lanes, which is
 * nonzero exactly when at least one lane has a bit set.
 */
#undef  gmx_simd_anytrue_fb
#define gmx_simd_anytrue_fb(x)        (vmaxvq_u32(x) != 0)
#undef  gmx_simd_anytrue_fib
#define gmx_simd_anytrue_fib(x)       (vmaxvq_u32(x) != 0)
/* gcc-4.8 is missing the proper vreinterpretq casts
 * for 64-bit operands. However, since these datatypes
 * are opaque to the compiler we can safely cast one
 * to the other without any conversion happening.
 */
/****************************************************
 *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 ****************************************************/

/* Data type, load/store and constants.
 * The aligned and unaligned variants map to the same intrinsics.
 */
#define gmx_simd_double_t             float64x2_t
#define gmx_simd_load_d               vld1q_f64
#define gmx_simd_load1_d              vld1q_dup_f64
#define gmx_simd_set1_d               vdupq_n_f64
#define gmx_simd_store_d              vst1q_f64
#define gmx_simd_loadu_d              vld1q_f64
#define gmx_simd_storeu_d             vst1q_f64
#define gmx_simd_setzero_d()          vdupq_n_f64(0.0)

/* Arithmetic. For the fused variants the intrinsics take the accumulator
 * first (vfmaq_f64(c, b, a) = c + b*a, vfmsq_f64(c, b, a) = c - b*a).
 */
#define gmx_simd_add_d                vaddq_f64
#define gmx_simd_sub_d                vsubq_f64
#define gmx_simd_mul_d                vmulq_f64
#define gmx_simd_fmadd_d(a, b, c)     vfmaq_f64(c, b, a)              /*  a*b + c */
#define gmx_simd_fmsub_d(a, b, c)     vnegq_f64(vfmsq_f64(c, b, a))   /*  a*b - c */
#define gmx_simd_fnmadd_d(a, b, c)    vfmsq_f64(c, b, a)              /* -a*b + c */
#define gmx_simd_fnmsub_d(a, b, c)    vnegq_f64(vfmaq_f64(c, b, a))   /* -a*b - c */

/* Bitwise logic on the floating-point bit patterns. Plain C casts are used
 * between the 64-bit vector types instead of vreinterpretq intrinsics.
 * andnot computes (~a) & b; vbicq_s64(b, a) is b & ~a.
 */
#define gmx_simd_and_d(a, b)          ((float64x2_t)(vandq_s64((int64x2_t)(a), (int64x2_t)(b))))
#define gmx_simd_andnot_d(a, b)       ((float64x2_t)(vbicq_s64((int64x2_t)(b), (int64x2_t)(a))))
#define gmx_simd_or_d(a, b)           ((float64x2_t)(vorrq_s64((int64x2_t)(a), (int64x2_t)(b))))
#define gmx_simd_xor_d(a, b)          ((float64x2_t)(veorq_s64((int64x2_t)(a), (int64x2_t)(b))))

/* Approximate 1/sqrt(x) and 1/x lookups plus one refinement step each.
 * vrsqrtsq_f64(p, q) yields (3 - p*q)/2 and vrecpsq_f64(p, q) yields 2 - p*q.
 * Note that 'lu' is expanded more than once in the iteration macros.
 */
#define gmx_simd_rsqrt_d              vrsqrteq_f64
#define gmx_simd_rsqrt_iter_d(lu, x)  vmulq_f64(lu, vrsqrtsq_f64(vmulq_f64(lu, lu), x))
#define gmx_simd_rcp_d                vrecpeq_f64
#define gmx_simd_rcp_iter_d(lu, x)    vmulq_f64(lu, vrecpsq_f64(lu, x))

/* Sign, min/max and rounding. fraction is x - trunc(x); 'x' is expanded twice. */
#define gmx_simd_fabs_d(x)            vabsq_f64(x)
#define gmx_simd_fneg_d(x)            vnegq_f64(x)
#define gmx_simd_max_d                vmaxq_f64
#define gmx_simd_min_d                vminq_f64
#define gmx_simd_round_d(x)           vrndnq_f64(x)
#define gmx_simd_trunc_d(x)           vrndq_f64(x)
#define gmx_simd_fraction_d(x)        vsubq_f64(x, gmx_simd_trunc_d(x))

/* Exponent/mantissa manipulation is done by helper functions in this file */
#define gmx_simd_get_exponent_d       gmx_simd_get_exponent_d_arm_neon_asimd
#define gmx_simd_get_mantissa_d       gmx_simd_get_mantissa_d_arm_neon_asimd
#define gmx_simd_set_exponent_d       gmx_simd_set_exponent_d_arm_neon_asimd
/* integer datatype corresponding to double: gmx_simd_dint32_t
 *
 * Two 32-bit lanes in a 64-bit vector. As for double, the aligned and
 * unaligned load/store variants use the same intrinsics.
 */
#define gmx_simd_dint32_t             int32x2_t
#define gmx_simd_load_di(m)           vld1_s32(m)
#define gmx_simd_set1_di              vdup_n_s32
#define gmx_simd_store_di(m, x)       vst1_s32(m, x)
#define gmx_simd_loadu_di(m)          vld1_s32(m)
#define gmx_simd_storeu_di(m, x)      vst1_s32(m, x)
#define gmx_simd_setzero_di()         vdup_n_s32(0)

/* double <-> int32 conversions go through 64-bit integers: convert to
 * int64 (cvtt: toward zero, cvt: to nearest) and narrow with vmovn_s64,
 * or widen with vmovl_s32 and convert to double.
 */
#define gmx_simd_cvtt_d2i(x)          vmovn_s64(vcvtq_s64_f64(x))
#define gmx_simd_cvt_d2i(x)           vmovn_s64(vcvtnq_s64_f64(x))
#define gmx_simd_cvt_i2d(x)           vcvtq_f64_s64(vmovl_s32(x))
#define gmx_simd_extract_di(x, i)     vget_lane_s32(x, i)
/* Integer logical ops on gmx_simd_dint32_t
 *
 * NOTE(review): gmx_simd_srli_di maps to vshr_n_s32, which on signed lanes
 * is an arithmetic (sign-propagating) shift, while the "srli" name suggests
 * a logical shift. The two only differ for negative inputs - confirm that
 * callers never shift negative values before relying on either behaviour.
 *
 * andnot computes (~a) & b; vbic_s32(b, a) is b & ~a.
 */
#define gmx_simd_slli_di              vshl_n_s32
#define gmx_simd_srli_di              vshr_n_s32
#define gmx_simd_and_di               vand_s32
#define gmx_simd_andnot_di(a, b)      vbic_s32(b, a)
#define gmx_simd_or_di                vorr_s32
#define gmx_simd_xor_di               veor_s32

/* Integer arithmetic ops on gmx_simd_dint32_t */
#define gmx_simd_add_di               vadd_s32
#define gmx_simd_sub_di               vsub_s32
#define gmx_simd_mul_di               vmul_s32
/* Boolean & comparison operations on gmx_simd_double_t
 *
 * A boolean is a vector of two 64-bit masks, as produced by the
 * comparison intrinsics.
 */
#define gmx_simd_dbool_t                 uint64x2_t
#define gmx_simd_cmpeq_d                 vceqq_f64
#define gmx_simd_cmplt_d                 vcltq_f64
#define gmx_simd_cmple_d                 vcleq_f64
#define gmx_simd_and_db                  vandq_u64
#define gmx_simd_or_db                   vorrq_u64

/* Any-true views the two 64-bit masks as four 32-bit words and tests
 * whether the largest of them is nonzero.
 */
#define gmx_simd_anytrue_db(x)           (vmaxvq_u32((uint32x4_t)(x)) != 0)

/* Blending: blendzero keeps a where sel is set (a & sel), blendnotzero
 * keeps a where sel is clear (a & ~sel), blendv picks b where sel is set
 * and a elsewhere.
 */
#define gmx_simd_blendzero_d(a, sel)     ((float64x2_t)(vandq_u64((uint64x2_t)(a), sel)))
#define gmx_simd_blendnotzero_d(a, sel)  ((float64x2_t)(vbicq_u64((uint64x2_t)(a), sel)))
#define gmx_simd_blendv_d(a, b, sel)     vbslq_f64(sel, b, a)

/* Horizontal sum, implemented by a helper function in this file */
#define gmx_simd_reduce_d(a)             gmx_simd_reduce_d_arm_neon_asimd(a)
/* Boolean & comparison operations on gmx_simd_dint32_t
 *
 * A boolean is a vector of two 32-bit masks. The blend macros reinterpret
 * the unsigned mask as signed so it can be combined with the integer data.
 */
#define gmx_simd_dibool_t                 uint32x2_t
#define gmx_simd_cmpeq_di                 vceq_s32
#define gmx_simd_cmplt_di                 vclt_s32
#define gmx_simd_and_dib                  vand_u32
#define gmx_simd_or_dib                   vorr_u32
#define gmx_simd_anytrue_dib(x)           (vmaxv_u32(x) != 0)
#define gmx_simd_blendzero_di(a, sel)     vand_s32(a, vreinterpret_s32_u32(sel))
#define gmx_simd_blendnotzero_di(a, sel)  vbic_s32(a, vreinterpret_s32_u32(sel))
#define gmx_simd_blendv_di(a, b, sel)     vbsl_s32(sel, b, a)
/* Conversions between different booleans
 *
 * db -> dib: saturating narrow, so an all-ones 64-bit mask becomes an
 *            all-ones 32-bit mask and zero stays zero.
 * dib -> db: zero-extend each 32-bit mask to 64 bits and OR it with a copy
 *            shifted up by 32, replicating the mask into the upper half.
 *            Note that 'x' is expanded twice.
 */
#define gmx_simd_cvt_db2dib(x)        vqmovn_u64(x)
#define gmx_simd_cvt_dib2db(x)        vorrq_u64(vmovl_u32(x), vshlq_n_u64(vmovl_u32(x), 32))
/* Float/double conversion
 *
 * gmx_simd_cvt_f2dd(f, d0, d1): widen the four floats in f into two double
 * vectors; the low pair is written through pointer d0 and the high pair
 * through pointer d1. This is a statement macro: it is wrapped in
 * do { } while (0) so it behaves as a single statement (e.g. in an unbraced
 * if/else), and the pointer arguments are parenthesised so expressions such
 * as 'p + 1' dereference correctly. 'f' is expanded twice.
 *
 * gmx_simd_cvt_dd2f(d0, d1): narrow two double vectors into one float
 * vector, d0 supplying the low pair and d1 the high pair.
 */
#define gmx_simd_cvt_f2dd(f, d0, d1) \
    do { \
        *(d0) = vcvt_f64_f32(vget_low_f32(f)); \
        *(d1) = vcvt_high_f64_f32(f); \
    } while (0)
#define gmx_simd_cvt_dd2f(d0, d1)     vcvt_high_f32_f64(vcvt_f32_f64(d0), d1)
202 /****************************************************
203 * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
204 ****************************************************/
205 static gmx_inline gmx_simd_float_t
206 gmx_simd_set_exponent_f_arm_neon_asimd(gmx_simd_float_t x)
208 int32x4_t iexp = vcvtnq_s32_f32(x);
210 iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
211 return vreinterpretq_f32_s32(iexp);
214 static gmx_inline float
215 gmx_simd_reduce_f_arm_neon_asimd(gmx_simd_float_t a)
217 a = vpaddq_f32(a, a);
218 a = vpaddq_f32(a, a);
219 return vgetq_lane_f32(a, 0);
223 /****************************************************
224 * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
225 ****************************************************/
226 static gmx_inline gmx_simd_double_t
227 gmx_simd_get_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
229 const float64x2_t expmask = (float64x2_t)( vdupq_n_s64(0x7FF0000000000000LL) );
230 int64x2_t iexp;
232 iexp = (int64x2_t)(gmx_simd_and_d(x, expmask));
233 iexp = vsubq_s64(vshrq_n_s64(iexp, 52), vdupq_n_s64(1023));
234 return vcvtq_f64_s64(iexp);
238 static gmx_inline gmx_simd_double_t
239 gmx_simd_get_mantissa_d_arm_neon_asimd(gmx_simd_double_t x)
241 const float64x2_t mantmask = (float64x2_t)( vdupq_n_s64(0x000FFFFFFFFFFFFFLL) );
242 const float64x2_t one = vdupq_n_f64(1.0);
244 /* Get mantissa */
245 x = gmx_simd_and_d(mantmask, x);
246 /* Reset zero (but correctly biased) exponent */
247 return gmx_simd_or_d(x, one);
251 static gmx_inline gmx_simd_double_t
252 gmx_simd_set_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
254 int64x2_t iexp = vcvtnq_s64_f64(x);
256 iexp = vshlq_n_s64(vaddq_s64(iexp, vdupq_n_s64(1023)), 52);
257 return (float64x2_t)(iexp);
260 static gmx_inline double
261 gmx_simd_reduce_d_arm_neon_asimd(gmx_simd_double_t a)
263 a = vpaddq_f64(a, a);
264 return vgetq_lane_f64(a, 0);
267 #endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_H */