xpcom/string/nsUTF8UtilsSSE2.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "nscore.h"
   8 #include "nsAlgorithm.h"
   9 #include <emmintrin.h>
  10 #include <nsUTF8Utils.h>
  11
  12 void
  13 LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
  14                                       uint32_t aSourceLength)
  15 {
  16   char* dest = mDestination;
  17
  18   // Align source to a 16-byte boundary.
  19   uint32_t i = 0;
  20   uint32_t alignLen =
  21     XPCOM_MIN<uint32_t>(aSourceLength,
  22                         uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
  23   for (; i < alignLen; ++i) {
  24     dest[i] = static_cast<unsigned char>(aSource[i]);
  25   }
  26
  27   // Walk 64 bytes (four XMM registers) at a time.
  28   __m128i vectmask = _mm_set1_epi16(0x00ff);
  29   for (; aSourceLength - i > 31; i += 32) {
  30     __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
  31     source1 = _mm_and_si128(source1, vectmask);
  32
  33     __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
  34     source2 = _mm_and_si128(source2, vectmask);
  35
  36     __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
  37     source3 = _mm_and_si128(source3, vectmask);
  38
  39     __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
  40     source4 = _mm_and_si128(source4, vectmask);
  41
  42
  43     // Pack the source data.  SSE2 views this as a saturating uint16_t to
  44     // uint8_t conversion, but since we masked off the high-order byte of every
  45     // uint16_t, we're really just grabbing the low-order bytes of source1 and
  46     // source2.
  47     __m128i packed1 = _mm_packus_epi16(source1, source2);
  48     __m128i packed2 = _mm_packus_epi16(source3, source4);
  49
  50     // This store needs to be unaligned since there's no guarantee that the
  51     // alignment we did above for the source will align the destination.
  52     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      packed1);
  53     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
  54   }
  55
  56   // Finish up the rest.
  57   for (; i < aSourceLength; ++i) {
  58     dest[i] = static_cast<unsigned char>(aSource[i]);
  59   }
  60
  61   mDestination += i;
  62 }
  63
  64 void
  65 LossyConvertEncoding8to16::write_sse2(const char* aSource,
  66                                       uint32_t aSourceLength)
  67 {
  68   char16_t* dest = mDestination;
  69
  70   // Align source to a 16-byte boundary.  We choose to align source rather than
  71   // dest because we'd rather have our loads than our stores be fast. You have
  72   // to wait for a load to complete, but you can keep on moving after issuing a
  73   // store.
  74   uint32_t i = 0;
  75   uint32_t alignLen = XPCOM_MIN(aSourceLength,
  76                                 uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
  77   for (; i < alignLen; ++i) {
  78     dest[i] = static_cast<unsigned char>(aSource[i]);
  79   }
  80
  81   // Walk 32 bytes (two XMM registers) at a time.
  82   for (; aSourceLength - i > 31; i += 32) {
  83     __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
  84     __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
  85
  86     // Interleave 0s in with the bytes of source to create lo and hi.
  87     __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
  88     __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
  89     __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
  90     __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
  91
  92     // store lo and hi into dest.
  93     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i),      lo1);
  94     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8),  hi1);
  95     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
  96     _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
  97   }
  98
  99   // Finish up whatever's left.
 100   for (; i < aSourceLength; ++i) {
 101     dest[i] = static_cast<unsigned char>(aSource[i]);
 102   }
 103
 104   mDestination += i;
 105 }