src/gromacs/simd/impl_arm_neon_asimd/impl_arm_neon_asimd.h

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifndef GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
  37 #define GMX_SIMD_IMPL_ARM_NEON_ASIMD_H
  38
  39 #include <math.h>
  40
  41 #include <arm_neon.h>
  42
  43 /* ARM (AArch64) NEON Advanced SIMD instruction wrappers
  44  *
  45  * Please see documentation in gromacs/simd/simd.h for defines.
  46  */
  47
  48 /* Inherit single-precision and integer part from 32-bit arm */
  49 #include "gromacs/simd/impl_arm_neon/impl_arm_neon.h"
  50
  51 /* Override some capability definitions from ARM 32-bit NEON - we now have double */
  52 #undef  GMX_SIMD_HAVE_DOUBLE
  53 #define GMX_SIMD_HAVE_DOUBLE                  1
  54 #undef  GMX_SIMD_HAVE_DINT32
  55 #define GMX_SIMD_HAVE_DINT32                  1
  56 #undef  GMX_SIMD_HAVE_DINT32_EXTRACT
  57 #define GMX_SIMD_HAVE_DINT32_EXTRACT          1
  58 #undef  GMX_SIMD_HAVE_DINT32_LOGICAL
  59 #define GMX_SIMD_HAVE_DINT32_LOGICAL          1
  60 #undef  GMX_SIMD_HAVE_DINT32_ARITHMETICS
  61 #define GMX_SIMD_HAVE_DINT32_ARITHMETICS      1
  62
  63 /* Implementation details */
  64 #define GMX_SIMD_DOUBLE_WIDTH        2
  65 #define GMX_SIMD_DINT32_WIDTH        2
  66
  67 /* NEON ASIMD always has FMA support, so make sure we use that for single too. */
  68 #undef  gmx_simd_fmadd_f
  69 #define gmx_simd_fmadd_f(a, b, c)  vfmaq_f32(c, b, a)
  70 #undef  gmx_simd_fmsub_f
  71 #define gmx_simd_fmsub_f(a, b, c)  vnegq_f32(vfmsq_f32(c, b, a))
  72 #undef  gmx_simd_fnmadd_f
  73 #define gmx_simd_fnmadd_f(a, b, c) vfmsq_f32(c, b, a)
  74 #undef  gmx_simd_fnmsub_f
  75 #define gmx_simd_fnmsub_f(a, b, c) vnegq_f32(vfmaq_f32(c, b, a))
  76
  77 /* The rounding instructions were actually added already in ARMv8, but most
  78  * compilers did not add intrinsics for them. Make sure we use them for single
  79  * precision too when enabling NEON Advanced SIMD.
  80  */
  81 #undef  gmx_simd_round_f
  82 #define gmx_simd_round_f(x)        vrndnq_f32(x)
  83 #undef  gmx_simd_trunc_f
  84 #define gmx_simd_trunc_f(x)        vrndq_f32(x)
  85
  86 /* NEON Advanced SIMD has a real rounding conversion instruction */
  87 #undef  gmx_simd_cvt_f2i
  88 #define gmx_simd_cvt_f2i(x)        vcvtnq_s32_f32(x)
  89
  90 /* Since we redefine rounding/conversion-with-rounding, make
  91  * sure we use the new operations by redefining the routine
  92  * to set the exponent too.
  93  */
  94 #undef  gmx_simd_set_exponent_f
  95 #define gmx_simd_set_exponent_f    gmx_simd_set_exponent_f_arm_neon_asimd
  96
  97 /* We can do more efficient reduce with vector pairwise arithmetic */
  98 #undef  gmx_simd_reduce_f
  99 #define gmx_simd_reduce_f(a)       gmx_simd_reduce_f_arm_neon_asimd(a)
 100
 101 /* Pick the largest unsigned integer as a shortcut for any-true */
 102 #undef  gmx_simd_anytrue_fb
 103 #define gmx_simd_anytrue_fb(x)     (vmaxvq_u32(x) != 0)
 104 #undef  gmx_simd_anytrue_fib
 105 #define gmx_simd_anytrue_fib(x)    (vmaxvq_u32(x) != 0)
 106
 107 /* gcc-4.8 is missing the proper vreinterpretq casts
 108  * for 64-bit operands. However, since these datatypes
 109  * are opaque to the compiler we can safely cast one
 110  * to the other without any conversion happening.
 111  */
 112
 113 /****************************************************
 114  *      DOUBLE PRECISION SIMD IMPLEMENTATION        *
 115  ****************************************************/
 116 #define gmx_simd_double_t          float64x2_t
 117 #define gmx_simd_load_d            vld1q_f64
 118 #define gmx_simd_load1_d           vld1q_dup_f64
 119 #define gmx_simd_set1_d            vdupq_n_f64
 120 #define gmx_simd_store_d           vst1q_f64
 121 #define gmx_simd_loadu_d           vld1q_f64
 122 #define gmx_simd_storeu_d          vst1q_f64
 123 #define gmx_simd_setzero_d()       vdupq_n_f64(0.0)
 124 #define gmx_simd_add_d             vaddq_f64
 125 #define gmx_simd_sub_d             vsubq_f64
 126 #define gmx_simd_mul_d             vmulq_f64
 127 #define gmx_simd_fmadd_d(a, b, c)  vfmaq_f64(c, b, a)
 128 #define gmx_simd_fmsub_d(a, b, c)  vnegq_f64(vfmsq_f64(c, b, a))
 129 #define gmx_simd_fnmadd_d(a, b, c) vfmsq_f64(c, b, a)
 130 #define gmx_simd_fnmsub_d(a, b, c) vnegq_f64(vfmaq_f64(c, b, a))
 131 #define gmx_simd_and_d(a, b)        (float64x2_t)(vandq_s64((int64x2_t)(a), (int64x2_t)(b)))
 132 #define gmx_simd_andnot_d(a, b)     (float64x2_t)(vbicq_s64((int64x2_t)(b), (int64x2_t)(a)))
 133 #define gmx_simd_or_d(a, b)         (float64x2_t)(vorrq_s64((int64x2_t)(a), (int64x2_t)(b)))
 134 #define gmx_simd_xor_d(a, b)        (float64x2_t)(veorq_s64((int64x2_t)(a), (int64x2_t)(b)))
 135 #define gmx_simd_rsqrt_d            vrsqrteq_f64
 136 #define gmx_simd_rsqrt_iter_d(lu, x) vmulq_f64(lu, vrsqrtsq_f64(vmulq_f64(lu, lu), x))
 137 #define gmx_simd_rcp_d              vrecpeq_f64
 138 #define gmx_simd_rcp_iter_d(lu, x)   vmulq_f64(lu, vrecpsq_f64(lu, x))
 139 #define gmx_simd_fabs_d(x)         vabsq_f64(x)
 140 #define gmx_simd_fneg_d(x)         vnegq_f64(x)
 141 #define gmx_simd_max_d             vmaxq_f64
 142 #define gmx_simd_min_d             vminq_f64
 143 #define gmx_simd_round_d(x)        vrndnq_f64(x)
 144 #define gmx_simd_trunc_d(x)        vrndq_f64(x)
 145 #define gmx_simd_fraction_d(x)     vsubq_f64(x, gmx_simd_trunc_d(x))
 146 #define gmx_simd_get_exponent_d    gmx_simd_get_exponent_d_arm_neon_asimd
 147 #define gmx_simd_get_mantissa_d    gmx_simd_get_mantissa_d_arm_neon_asimd
 148 #define gmx_simd_set_exponent_d    gmx_simd_set_exponent_d_arm_neon_asimd
 149 /* integer datatype corresponding to double: gmx_simd_dint32_t */
 150 #define gmx_simd_dint32_t          int32x2_t
 151 #define gmx_simd_load_di(m)        vld1_s32(m)
 152 #define gmx_simd_set1_di           vdup_n_s32
 153 #define gmx_simd_store_di(m, x)    vst1_s32(m, x)
 154 #define gmx_simd_loadu_di(m)       vld1_s32(m)
 155 #define gmx_simd_storeu_di(m, x)   vst1_s32(m, x)
 156 #define gmx_simd_setzero_di()      vdup_n_s32(0)
 157 #define gmx_simd_cvtt_d2i(x)       vmovn_s64(vcvtq_s64_f64(x))
 158 #define gmx_simd_cvt_d2i(x)        vmovn_s64(vcvtnq_s64_f64(x))
 159 #define gmx_simd_cvt_i2d(x)        vcvtq_f64_s64(vmovl_s32(x))
 160 #define gmx_simd_extract_di(x, i)  vget_lane_s32(x, i)
 161 /* Integer logical ops on gmx_simd_dint32_t */
 162 #define gmx_simd_slli_di           vshl_n_s32
 163 #define gmx_simd_srli_di           vshr_n_s32
 164 #define gmx_simd_and_di            vand_s32
 165 #define gmx_simd_andnot_di(a, b)    vbic_s32(b, a)
 166 #define gmx_simd_or_di             vorr_s32
 167 #define gmx_simd_xor_di            veor_s32
 168 /* Integer arithmetic ops on gmx_simd_dint32_t */
 169 #define gmx_simd_add_di            vadd_s32
 170 #define gmx_simd_sub_di            vsub_s32
 171 #define gmx_simd_mul_di            vmul_s32
 172 /* Boolean & comparison operations on gmx_simd_double_t */
 173 #define gmx_simd_dbool_t           uint64x2_t
 174 #define gmx_simd_cmpeq_d           vceqq_f64
 175 #define gmx_simd_cmplt_d           vcltq_f64
 176 #define gmx_simd_cmple_d           vcleq_f64
 177 #define gmx_simd_and_db            vandq_u64
 178 #define gmx_simd_or_db             vorrq_u64
 179 #define gmx_simd_anytrue_db(x)     (vmaxvq_u32((uint32x4_t)(x)) != 0)
 180 #define gmx_simd_blendzero_d(a, sel)     (float64x2_t)(vandq_u64((uint64x2_t)(a), sel))
 181 #define gmx_simd_blendnotzero_d(a, sel)  (float64x2_t)(vbicq_u64((uint64x2_t)(a), sel))
 182 #define gmx_simd_blendv_d(a, b, sel)     vbslq_f64(sel, b, a)
 183 #define gmx_simd_reduce_d(a)       gmx_simd_reduce_d_arm_neon_asimd(a)
 184 /* Boolean & comparison operations on gmx_simd_dint32_t */
 185 #define gmx_simd_dibool_t          uint32x2_t
 186 #define gmx_simd_cmpeq_di          vceq_s32
 187 #define gmx_simd_cmplt_di          vclt_s32
 188 #define gmx_simd_and_dib           vand_u32
 189 #define gmx_simd_or_dib            vorr_u32
 190 #define gmx_simd_anytrue_dib(x)    (vmaxv_u32(x) != 0)
 191 #define gmx_simd_blendzero_di(a, sel)      vand_s32(a, vreinterpret_s32_u32(sel))
 192 #define gmx_simd_blendnotzero_di(a, sel)  vbic_s32(a, vreinterpret_s32_u32(sel))
 193 #define gmx_simd_blendv_di(a, b, sel)     vbsl_s32(sel, b, a)
 194 /* Conversions between different booleans */
 195 #define gmx_simd_cvt_db2dib(x)     vqmovn_u64(x)
 196 #define gmx_simd_cvt_dib2db(x)     vorrq_u64(vmovl_u32(x), vshlq_n_u64(vmovl_u32(x), 32))
 197
 198 /* Float/double conversion */
 199 #define gmx_simd_cvt_f2dd(f, d0, d1)  { *d0 = vcvt_f64_f32(vget_low_f32(f)); *d1 = vcvt_high_f64_f32(f); }
 200 #define gmx_simd_cvt_dd2f(d0, d1)     vcvt_high_f32_f64(vcvt_f32_f64(d0), d1)
 201
 202 /****************************************************
 203  * SINGLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 204  ****************************************************/
 205 static gmx_inline gmx_simd_float_t
 206 gmx_simd_set_exponent_f_arm_neon_asimd(gmx_simd_float_t x)
 207 {
 208     int32x4_t  iexp = vcvtnq_s32_f32(x);
 209
 210     iexp = vshlq_n_s32(vaddq_s32(iexp, vdupq_n_s32(127)), 23);
 211     return vreinterpretq_f32_s32(iexp);
 212 }
 213
 214 static gmx_inline float
 215 gmx_simd_reduce_f_arm_neon_asimd(gmx_simd_float_t a)
 216 {
 217     a = vpaddq_f32(a, a);
 218     a = vpaddq_f32(a, a);
 219     return vgetq_lane_f32(a, 0);
 220 }
 221
 222
 223 /****************************************************
 224  * DOUBLE PRECISION IMPLEMENTATION HELPER FUNCTIONS *
 225  ****************************************************/
 226 static gmx_inline gmx_simd_double_t
 227 gmx_simd_get_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
 228 {
 229     const float64x2_t expmask    = (float64x2_t)( vdupq_n_s64(0x7FF0000000000000LL) );
 230     int64x2_t         iexp;
 231
 232     iexp = (int64x2_t)(gmx_simd_and_d(x, expmask));
 233     iexp = vsubq_s64(vshrq_n_s64(iexp, 52), vdupq_n_s64(1023));
 234     return vcvtq_f64_s64(iexp);
 235 }
 236
 237
 238 static gmx_inline gmx_simd_double_t
 239 gmx_simd_get_mantissa_d_arm_neon_asimd(gmx_simd_double_t x)
 240 {
 241     const float64x2_t mantmask   = (float64x2_t)( vdupq_n_s64(0x000FFFFFFFFFFFFFLL) );
 242     const float64x2_t one        = vdupq_n_f64(1.0);
 243
 244     /* Get mantissa */
 245     x = gmx_simd_and_d(mantmask, x);
 246     /* Reset zero (but correctly biased) exponent */
 247     return gmx_simd_or_d(x, one);
 248 }
 249
 250
 251 static gmx_inline gmx_simd_double_t
 252 gmx_simd_set_exponent_d_arm_neon_asimd(gmx_simd_double_t x)
 253 {
 254     int64x2_t  iexp = vcvtnq_s64_f64(x);
 255
 256     iexp = vshlq_n_s64(vaddq_s64(iexp, vdupq_n_s64(1023)), 52);
 257     return (float64x2_t)(iexp);
 258 }
 259
 260 static gmx_inline double
 261 gmx_simd_reduce_d_arm_neon_asimd(gmx_simd_double_t a)
 262 {
 263     a = vpaddq_f64(a, a);
 264     return vgetq_lane_f64(a, 0);
 265 }
 266
 267 #endif /* GMX_SIMD_IMPL_ARM_NEON_ASIMD_H */