Apply clang-tidy-8 readability-uppercase-literal-suffix
[gromacs.git] src/gromacs/simd/impl_arm_neon/impl_arm_neon_util_float.h
blob 81c3ff0ea2a668ac0aff474ea51d145a50c7b71a

/*
 * This file is part of the GROMACS molecular simulation package.
 *
 * Copyright (c) 2014,2015,2016,2017,2018,2019, by the GROMACS development team, led by
 * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
 * and including many others, as listed in the AUTHORS file in the
 * top-level source directory and at http://www.gromacs.org.
 *
 * GROMACS is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License
 * as published by the Free Software Foundation; either version 2.1
 * of the License, or (at your option) any later version.
 *
 * GROMACS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with GROMACS; if not, see
 * http://www.gnu.org/licenses, or write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * If you want to redistribute modifications to GROMACS, please
 * consider that scientific software is very special. Version
 * control is crucial - bugs must be traceable. We will be happy to
 * consider code for inclusion in the official distribution, but
 * derived work must not be called official GROMACS. Details are found
 * in the README & COPYING files - if they are missing, get the
 * official version at http://www.gromacs.org.
 *
 * To help us fund GROMACS development, we humbly ask that you cite
 * the research papers on the package. Check out http://www.gromacs.org.
 */
#ifndef GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H
#define GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H

#include "config.h"

#include <cassert>
#include <cstddef>
#include <cstdint>

#include <arm_neon.h>

#include "gromacs/utility/basedefinitions.h"

#include "impl_arm_neon_simd_float.h"

namespace gmx
{

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *      base,
                    const std::int32_t offset[],
                    SimdFloat *        v0,
                    SimdFloat *        v1,
                    SimdFloat *        v2,
                    SimdFloat *        v3)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);

    // Unfortunately we cannot use the beautiful Neon structured load
    // instructions since the data comes from four different memory locations.
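    // Instead, gather four rows and transpose them in registers: the two
    // vuzpq calls separate even/odd elements, and the two vtrnq calls then
    // complete a full 4x4 transpose of the rows r0..r3 loaded below:
    //   r0 = x0 y0 z0 w0            v0 = x0 x1 x2 x3
    //   r1 = x1 y1 z1 w1    ==>     v1 = y0 y1 y2 y3
    //   r2 = x2 y2 z2 w2            v2 = z0 z1 z2 z3
    //   r3 = x3 y3 z3 w3            v3 = w0 w1 w2 w3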
    float32x4x2_t t0 = vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
    float32x4x2_t t1 = vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0->simdInternal_ = t2.val[0];
    v1->simdInternal_ = t3.val[0];
    v2->simdInternal_ = t2.val[1];
    v3->simdInternal_ = t3.val[1];
}

template <int align>
static inline void gmx_simdcall
gatherLoadTranspose(const float *      base,
                    const std::int32_t offset[],
                    SimdFloat *        v0,
                    SimdFloat *        v1)
{
    assert(std::size_t(offset) % 16 == 0);
    assert(std::size_t(base) % 8 == 0);
    assert(align % 2 == 0);

    v0->simdInternal_ = vcombine_f32(vld1_f32(base + align * offset[0]),
                                     vld1_f32(base + align * offset[2]));
    v1->simdInternal_ = vcombine_f32(vld1_f32(base + align * offset[1]),
                                     vld1_f32(base + align * offset[3]));
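
    // At this point v0 = {x0 y0 x2 y2} and v1 = {x1 y1 x3 y3}; a single
    // transpose of adjacent element pairs sorts them into {x0 x1 x2 x3}
    // and {y0 y1 y2 y3}.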
    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);

    v0->simdInternal_ = tmp.val[0];
    v1->simdInternal_ = tmp.val[1];
}
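
// Best alignment to use for aligned pairs of float data.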
static const int c_simdBestPairAlignmentFloat = 2;

template <int align>
static inline void gmx_simdcall
gatherLoadUTranspose(const float *      base,
                     const std::int32_t offset[],
                     SimdFloat *        v0,
                     SimdFloat *        v1,
                     SimdFloat *        v2)
{
    assert(std::size_t(offset) % 16 == 0);
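
    // Each vld1q_f32 below reads a full vector of four floats even though
    // only three per row are needed; the fourth transpose output
    // (t3.val[1]) is simply discarded.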
    float32x4x2_t t0 = vuzpq_f32(vld1q_f32(base + align * offset[0]), vld1q_f32(base + align * offset[2]));
    float32x4x2_t t1 = vuzpq_f32(vld1q_f32(base + align * offset[1]), vld1q_f32(base + align * offset[3]));
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0->simdInternal_ = t2.val[0];
    v1->simdInternal_ = t3.val[0];
    v2->simdInternal_ = t2.val[1];
}

template <int align>
static inline void gmx_simdcall
transposeScatterStoreU(float *            base,
                       const std::int32_t offset[],
                       SimdFloat          v0,
                       SimdFloat          v1,
                       SimdFloat          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);
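
    // tmp.val[0] = {x0 y0 x2 y2} and tmp.val[1] = {x1 y1 x3 y3}, so the
    // low/high halves are exactly the {x, y} pairs of the four rows; the
    // z components are then written one lane at a time.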
    vst1_f32(base + align * offset[0], vget_low_f32(tmp.val[0]));
    vst1_f32(base + align * offset[1], vget_low_f32(tmp.val[1]));
    vst1_f32(base + align * offset[2], vget_high_f32(tmp.val[0]));
    vst1_f32(base + align * offset[3], vget_high_f32(tmp.val[1]));

    vst1q_lane_f32(base + align * offset[0] + 2, v2.simdInternal_, 0);
    vst1q_lane_f32(base + align * offset[1] + 2, v2.simdInternal_, 1);
    vst1q_lane_f32(base + align * offset[2] + 2, v2.simdInternal_, 2);
    vst1q_lane_f32(base + align * offset[3] + 2, v2.simdInternal_, 3);
}

template <int align>
static inline void gmx_simdcall
transposeScatterIncrU(float *            base,
                      const std::int32_t offset[],
                      SimdFloat          v0,
                      SimdFloat          v1,
                      SimdFloat          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    if (align < 4)
    {
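        // With fewer than four elements per row we must not touch memory
        // beyond each triplet, so increment {x, y} with 64-bit pair
        // load/store operations and the z component as a scalar.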
        float32x2_t   t0, t1, t2, t3;
        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

        t0 = vget_low_f32(tmp.val[0]);
        t1 = vget_low_f32(tmp.val[1]);
        t2 = vget_high_f32(tmp.val[0]);
        t3 = vget_high_f32(tmp.val[1]);

        t0 = vadd_f32(t0, vld1_f32(base + align * offset[0]));
        vst1_f32(base + align * offset[0], t0);
        base[align * offset[0] + 2] += vgetq_lane_f32(v2.simdInternal_, 0);

        t1 = vadd_f32(t1, vld1_f32(base + align * offset[1]));
        vst1_f32(base + align * offset[1], t1);
        base[align * offset[1] + 2] += vgetq_lane_f32(v2.simdInternal_, 1);

        t2 = vadd_f32(t2, vld1_f32(base + align * offset[2]));
        vst1_f32(base + align * offset[2], t2);
        base[align * offset[2] + 2] += vgetq_lane_f32(v2.simdInternal_, 2);

        t3 = vadd_f32(t3, vld1_f32(base + align * offset[3]));
        vst1_f32(base + align * offset[3], t3);
        base[align * offset[3] + 2] += vgetq_lane_f32(v2.simdInternal_, 3);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
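        // Transposing {v0, v1, v2} together with a zero vector yields one
        // full-width vector {x y z 0} per row, so the fourth element of
        // each row is incremented by zero (i.e. rewritten unchanged).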
        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
        float32x4_t   t4 = t2.val[0];
        float32x4_t   t5 = t3.val[0];
        float32x4_t   t6 = t2.val[1];
        float32x4_t   t7 = t3.val[1];

        vst1q_f32(base + align * offset[0], vaddq_f32(t4, vld1q_f32(base + align * offset[0])));
        vst1q_f32(base + align * offset[1], vaddq_f32(t5, vld1q_f32(base + align * offset[1])));
        vst1q_f32(base + align * offset[2], vaddq_f32(t6, vld1q_f32(base + align * offset[2])));
        vst1q_f32(base + align * offset[3], vaddq_f32(t7, vld1q_f32(base + align * offset[3])));
    }
}
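
// transposeScatterDecrU mirrors transposeScatterIncrU above, subtracting
// the transposed rows from memory instead of adding them.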
template <int align>
static inline void gmx_simdcall
transposeScatterDecrU(float *            base,
                      const std::int32_t offset[],
                      SimdFloat          v0,
                      SimdFloat          v1,
                      SimdFloat          v2)
{
    assert(std::size_t(offset) % 16 == 0);

    if (align < 4)
    {
        float32x2_t   t0, t1, t2, t3;
        float32x4x2_t tmp = vtrnq_f32(v0.simdInternal_, v1.simdInternal_);

        t0 = vget_low_f32(tmp.val[0]);
        t1 = vget_low_f32(tmp.val[1]);
        t2 = vget_high_f32(tmp.val[0]);
        t3 = vget_high_f32(tmp.val[1]);

        t0 = vsub_f32(vld1_f32(base + align * offset[0]), t0);
        vst1_f32(base + align * offset[0], t0);
        base[align * offset[0] + 2] -= vgetq_lane_f32(v2.simdInternal_, 0);

        t1 = vsub_f32(vld1_f32(base + align * offset[1]), t1);
        vst1_f32(base + align * offset[1], t1);
        base[align * offset[1] + 2] -= vgetq_lane_f32(v2.simdInternal_, 1);

        t2 = vsub_f32(vld1_f32(base + align * offset[2]), t2);
        vst1_f32(base + align * offset[2], t2);
        base[align * offset[2] + 2] -= vgetq_lane_f32(v2.simdInternal_, 2);

        t3 = vsub_f32(vld1_f32(base + align * offset[3]), t3);
        vst1_f32(base + align * offset[3], t3);
        base[align * offset[3] + 2] -= vgetq_lane_f32(v2.simdInternal_, 3);
    }
    else
    {
        // Extra elements means we can use full width-4 load/store operations
        float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
        float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, vdupq_n_f32(0.0F));
        float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
        float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
        float32x4_t   t4 = t2.val[0];
        float32x4_t   t5 = t3.val[0];
        float32x4_t   t6 = t2.val[1];
        float32x4_t   t7 = t3.val[1];

        vst1q_f32(base + align * offset[0], vsubq_f32(vld1q_f32(base + align * offset[0]), t4));
        vst1q_f32(base + align * offset[1], vsubq_f32(vld1q_f32(base + align * offset[1]), t5));
        vst1q_f32(base + align * offset[2], vsubq_f32(vld1q_f32(base + align * offset[2]), t6));
        vst1q_f32(base + align * offset[3], vsubq_f32(vld1q_f32(base + align * offset[3]), t7));
    }
}

static inline void gmx_simdcall
expandScalarsToTriplets(SimdFloat   scalar,
                        SimdFloat * triplets0,
                        SimdFloat * triplets1,
                        SimdFloat * triplets2)
{
    float32x2_t lo, hi;
    float32x4_t t0, t1, t2, t3;

    lo = vget_low_f32(scalar.simdInternal_);
    hi = vget_high_f32(scalar.simdInternal_);

    t0 = vdupq_lane_f32(lo, 0);
    t1 = vdupq_lane_f32(lo, 1);
    t2 = vdupq_lane_f32(hi, 0);
    t3 = vdupq_lane_f32(hi, 1);
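
    // t0..t3 hold {a a a a} .. {d d d d}; shifting adjacent pairs with
    // vextq_f32 produces the triplet expansion of scalar = {a b c d}:
    //   triplets0 = {a a a b}
    //   triplets1 = {b b c c}
    //   triplets2 = {c d d d}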
    triplets0->simdInternal_ = vextq_f32(t0, t1, 1);
    triplets1->simdInternal_ = vextq_f32(t1, t2, 2);
    triplets2->simdInternal_ = vextq_f32(t2, t3, 3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float * base,
                             SimdFInt32    offset,
                             SimdFloat *   v0,
                             SimdFloat *   v1,
                             SimdFloat *   v2,
                             SimdFloat *   v3)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    assert(std::size_t(base) % 16 == 0);
    assert(align % 4 == 0);
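
    // NEON has no gather instruction taking a vector of indices, so spill
    // the SIMD integer offsets to an aligned scratch array and reuse the
    // pointer-based transposing gather above.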
    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1, v2, v3);
}

template <int align>
static inline void gmx_simdcall
gatherLoadBySimdIntTranspose(const float * base,
                             SimdFInt32    offset,
                             SimdFloat *   v0,
                             SimdFloat *   v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    gatherLoadTranspose<align>(base, ioffset, v0, v1);
}

template <int align>
static inline void gmx_simdcall
gatherLoadUBySimdIntTranspose(const float * base,
                              SimdFInt32    offset,
                              SimdFloat *   v0,
                              SimdFloat *   v1)
{
    alignas(GMX_SIMD_ALIGNMENT) std::int32_t ioffset[GMX_SIMD_FINT32_WIDTH];

    store(ioffset, offset);
    v0->simdInternal_ = vcombine_f32(vld1_f32(base + align * ioffset[0]),
                                     vld1_f32(base + align * ioffset[2]));
    v1->simdInternal_ = vcombine_f32(vld1_f32(base + align * ioffset[1]),
                                     vld1_f32(base + align * ioffset[3]));
    float32x4x2_t tmp = vtrnq_f32(v0->simdInternal_, v1->simdInternal_);
    v0->simdInternal_ = tmp.val[0];
    v1->simdInternal_ = tmp.val[1];
}

static inline float gmx_simdcall
reduceIncr4ReturnSum(float *   m,
                     SimdFloat v0,
                     SimdFloat v1,
                     SimdFloat v2,
                     SimdFloat v3)
{
    assert(std::size_t(m) % 16 == 0);

    float32x4x2_t t0 = vuzpq_f32(v0.simdInternal_, v2.simdInternal_);
    float32x4x2_t t1 = vuzpq_f32(v1.simdInternal_, v3.simdInternal_);
    float32x4x2_t t2 = vtrnq_f32(t0.val[0], t1.val[0]);
    float32x4x2_t t3 = vtrnq_f32(t0.val[1], t1.val[1]);
    v0.simdInternal_ = t2.val[0];
    v1.simdInternal_ = t3.val[0];
    v2.simdInternal_ = t2.val[1];
    v3.simdInternal_ = t3.val[1];
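
    // After the in-register 4x4 transpose, summing the four vectors leaves
    // the horizontal sum of each original input in its own lane of v0;
    // these four sums are added to m[0..3] and their total is returned.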
    v0 = v0 + v1;
    v2 = v2 + v3;
    v0 = v0 + v2;
    v2 = v0 + simdLoad(m);
    store(m, v2);

    return reduce(v0);
}

}      // namespace gmx

#endif // GMX_SIMD_IMPL_ARM_NEON_UTIL_FLOAT_H