mozglue/misc/SIMD_avx2.cpp
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/SIMD.h"

#include "mozilla/SSE.h"
#include "mozilla/Assertions.h"

// Restricting to x86_64 simplifies things, and we're not particularly
// worried about slightly degraded performance on 32-bit processors which
// support AVX2, as these should be quite a minority.
#if defined(MOZILLA_MAY_SUPPORT_AVX2) && defined(__x86_64__)

#  include <cstring>
#  include <immintrin.h>
#  include <stdint.h>
#  include <type_traits>

#  include "mozilla/EndianUtils.h"
namespace mozilla {

const __m256i* Cast256(uintptr_t ptr) {
  return reinterpret_cast<const __m256i*>(ptr);
}

template <typename T>
T GetAs(uintptr_t ptr) {
  return *reinterpret_cast<const T*>(ptr);
}

uintptr_t AlignDown32(uintptr_t ptr) { return ptr & ~0x1f; }

uintptr_t AlignUp32(uintptr_t ptr) { return AlignDown32(ptr + 0x1f); }
template <typename TValue>
__m128i CmpEq128(__m128i a, __m128i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2);
  if (sizeof(TValue) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
}

template <typename TValue>
__m256i CmpEq256(__m256i a, __m256i b) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 8);
  if (sizeof(TValue) == 1) {
    return _mm256_cmpeq_epi8(a, b);
  }
  if (sizeof(TValue) == 2) {
    return _mm256_cmpeq_epi16(a, b);
  }
  return _mm256_cmpeq_epi64(a, b);
}
#  if defined(__GNUC__) && !defined(__clang__)

// See the comment in SIMD.cpp above Load32BitsIntoXMM. This is just adapted
// from that workaround. Testing this, it also yields the correct instructions
// across all tested compilers.
__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  int64_t tmp;
  memcpy(&tmp, reinterpret_cast<const void*>(ptr), sizeof(tmp));
  return _mm_cvtsi64_si128(tmp);
}

#  else

__m128i Load64BitsIntoXMM(uintptr_t ptr) {
  return _mm_loadu_si64(reinterpret_cast<const __m128i*>(ptr));
}

#  endif
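
// Check four (possibly overlapping) 8-byte windows of the haystack against
// the needle. Note that only the low 8 bytes of each XMM register hold
// haystack data; the loads zero the upper half, which would spuriously
// compare equal whenever the needle is zero, so every movemask result below
// is masked with 0xff to keep just the bytes that were actually loaded.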
template <typename TValue>
const TValue* Check4x8Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                            uintptr_t c, uintptr_t d) {
  __m128i haystackA = Load64BitsIntoXMM(a);
  __m128i cmpA = CmpEq128<TValue>(needle, haystackA);
  __m128i haystackB = Load64BitsIntoXMM(b);
  __m128i cmpB = CmpEq128<TValue>(needle, haystackB);
  __m128i haystackC = Load64BitsIntoXMM(c);
  __m128i cmpC = CmpEq128<TValue>(needle, haystackC);
  __m128i haystackD = Load64BitsIntoXMM(d);
  __m128i cmpD = CmpEq128<TValue>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask & 0xff) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xff) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}
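
// As above, but checking four 32-byte windows with AVX2. The loads fill the
// whole YMM register, so no masking of the movemask results is needed. Note
// that the byte offset from __builtin_ctz always lands on an element
// boundary, since the wide compares set whole 16- or 64-bit lanes.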
template <typename TValue>
const TValue* Check4x32Bytes(__m256i needle, uintptr_t a, uintptr_t b,
                             uintptr_t c, uintptr_t d) {
  __m256i haystackA = _mm256_loadu_si256(Cast256(a));
  __m256i cmpA = CmpEq256<TValue>(needle, haystackA);
  __m256i haystackB = _mm256_loadu_si256(Cast256(b));
  __m256i cmpB = CmpEq256<TValue>(needle, haystackB);
  __m256i haystackC = _mm256_loadu_si256(Cast256(c));
  __m256i cmpC = CmpEq256<TValue>(needle, haystackC);
  __m256i haystackD = _mm256_loadu_si256(Cast256(d));
  __m256i cmpD = CmpEq256<TValue>(needle, haystackD);
  __m256i or_ab = _mm256_or_si256(cmpA, cmpB);
  __m256i or_cd = _mm256_or_si256(cmpC, cmpD);
  __m256i or_abcd = _mm256_or_si256(or_ab, or_cd);
  int orMask = _mm256_movemask_epi8(or_abcd);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm256_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpC);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm256_movemask_epi8(cmpD);
    if (cmpMask) {
      return reinterpret_cast<const TValue*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}
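
// Find |value| in |ptr[0..length)|. The search is tiered by buffer size: a
// scalar loop for tiny buffers, four overlapping 8-byte SSE2 checks below 32
// bytes, four overlapping 32-byte checks below 128 bytes, and an aligned,
// 4x-unrolled 32-byte loop with an overlapping tail for everything larger.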
template <typename TValue>
const TValue* FindInBufferAVX2(const TValue* ptr, TValue value, size_t length) {
  static_assert(sizeof(TValue) == 1 || sizeof(TValue) == 2 ||
                sizeof(TValue) == 8);
  static_assert(std::is_unsigned<TValue>::value);

  // Load our needle into a 32-byte register
  __m256i needle;
  if (sizeof(TValue) == 1) {
    needle = _mm256_set1_epi8(value);
  } else if (sizeof(TValue) == 2) {
    needle = _mm256_set1_epi16(value);
  } else {
    needle = _mm256_set1_epi64x(value);
  }

  size_t numBytes = length * sizeof(TValue);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  if (numBytes < 8 || (sizeof(TValue) == 8 && numBytes < 32)) {
    while (cur < end) {
      if (GetAs<TValue>(cur) == value) {
        return reinterpret_cast<const TValue*>(cur);
      }
      cur += sizeof(TValue);
    }
    return nullptr;
  }

  if constexpr (sizeof(TValue) != 8) {
    if (numBytes < 32) {
      __m128i needle_narrow;
      if (sizeof(TValue) == 1) {
        needle_narrow = _mm_set1_epi8(value);
      } else {
        needle_narrow = _mm_set1_epi16(value);
      }
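      // (numBytes & 16) >> 1 is 8 when numBytes >= 16 and 0 otherwise; see
      // the NOTE below for the idea. For example, with numBytes == 24 the
      // four windows are [0,8), [8,16), [8,16), [16,24): every byte is
      // checked branch-free, at the cost of one redundant compare.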
      uintptr_t a = cur;
      uintptr_t b = cur + ((numBytes & 16) >> 1);
      uintptr_t c = end - 8 - ((numBytes & 16) >> 1);
      uintptr_t d = end - 8;
      return Check4x8Bytes<TValue>(needle_narrow, a, b, c, d);
    }
  }

  if (numBytes < 128) {
    // NOTE: here and above, we have some bit fiddling which could look a
    // little weird. The important thing to note though is that it's just a
    // trick for getting the number 32 if numBytes is greater than or equal
    // to 64, and 0 otherwise. This lets us fully cover the range without any
    // branching for the cases where numBytes is in [32,64) or [64,128). We
    // get four windows from this - if numBytes >= 64, we get:
    //   [0,32), [32,64), [end-64,end-32), [end-32,end)
    // and if numBytes < 64, we get
    //   [0,32), [0,32), [end-32,end), [end-32,end)
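    // For example, with numBytes == 80, (numBytes & 64) >> 1 is 32, so the
    // windows below are [0,32), [32,64), [16,48), and [48,80): every byte is
    // covered, some of them twice.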
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 64) >> 1);
    uintptr_t c = end - 32 - ((numBytes & 64) >> 1);
    uintptr_t d = end - 32;
    return Check4x32Bytes<TValue>(needle, a, b, c, d);
  }

  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m256i haystack = _mm256_loadu_si256(Cast256(cur));
  __m256i cmp = CmpEq256<TValue>(needle, haystack);
  int cmpMask = _mm256_movemask_epi8(cmp);
  if (cmpMask) {
    return reinterpret_cast<const TValue*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp32(cur);

  uintptr_t tailStartPtr = AlignDown32(end - 96);
  uintptr_t tailEndPtr = end - 32;
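  // The final Check4x32Bytes call at the bottom covers [tailStartPtr,
  // tailStartPtr + 96) plus [end - 32, end); these windows overlap the main
  // loop's coverage and end exactly at |end|, so no load reads past the
  // buffer.
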
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 32;
    uintptr_t c = cur + 64;
    uintptr_t d = cur + 96;
    const TValue* result = Check4x32Bytes<TValue>(needle, a, b, c, d);
    if (result) {
      return result;
    }
    cur += 128;
  }

  uintptr_t a = tailStartPtr;
  uintptr_t b = tailStartPtr + 32;
  uintptr_t c = tailStartPtr + 64;
  uintptr_t d = tailEndPtr;
  return Check4x32Bytes<TValue>(needle, a, b, c, d);
}
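
// Illustrative usage (not from the original source): callers are expected to
// reach these entry points through the dispatching SIMD::memchr* wrappers,
// or to check mozilla::supports_avx2() from SSE.h themselves first:
//
//   const char buf[] = "haystack with one needle";
//   const char* hit = SIMD::memchr8AVX2(buf, 'n', sizeof(buf) - 1);
//   // |hit| points at the first 'n', or is nullptr if there is none.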
const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uvalue = static_cast<unsigned char>(value);
  const unsigned char* uresult =
      FindInBufferAVX2<unsigned char>(uptr, uvalue, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  return FindInBufferAVX2<char16_t>(ptr, value, length);
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  return FindInBufferAVX2<uint64_t>(ptr, value, length);
}

}  // namespace mozilla

#else

namespace mozilla {

const char* SIMD::memchr8AVX2(const char* ptr, char value, size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const char16_t* SIMD::memchr16AVX2(const char16_t* ptr, char16_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

const uint64_t* SIMD::memchr64AVX2(const uint64_t* ptr, uint64_t value,
                                   size_t length) {
  MOZ_RELEASE_ASSERT(false, "AVX2 not supported in this binary.");
}

}  // namespace mozilla

#endif