/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla::gfx {
// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i LoadRemainder_SSE2(const uint8_t* aSrc,
                                                    size_t aLength) {
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(
          px,
          _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}
// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void StoreRemainder_SSE2(uint8_t* aDst,
                                                  size_t aLength,
                                                  const __m128i& aSrc) {
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
          _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}
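// Note: the "splayed" math used below expands each 8-bit channel into its own
// 16-bit lane, with R/B in one vector and G/A in another, so that the 8-bit by
// 8-bit products used for (un)premultiplying cannot overflow a lane.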
// Premultiply vector of 4 pixels using splayed math.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));
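  // The statements below closely approximate x * a / 255 per 16-bit lane
  // without a division: t = x * a + 255; t += t >> 8; the final t >> 8 is
  // folded into the recombination step (rb is shifted right by 8, while ga
  // keeps only its high bytes).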
  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}
// Premultiply vector of aAlignedRow + aRemainder pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
                                                    uint8_t*& aDst,
                                                    int32_t aAlignedRow,
                                                    int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}
// Premultiply vector of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
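  // Note that alignedRow is a byte count (4 bytes per pixel) used to advance
  // the pointers, while remainder counts whole pixels for the tail handling.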
  PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                               remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                      int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;
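  // PremultiplyChunk_SSE2 only advances the pointers past the aligned pixels,
  // so widening the gaps by 4 * remainder bytes also steps over the trailing
  // pixels handled by Load/StoreRemainder_SSE2.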
  for (int32_t height = aSize.height; height > 0; height--) {
    PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                                 remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
// Force instantiation of premultiply variants here.
template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
                                                int32_t);
template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
                                              int32_t);
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
                                             int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
                                           int32_t, IntSize);
// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than for larger alphas. We take advantage of this by
// shifting the reciprocal down by either 3 or 8 bits depending on whether
// the alpha value is less than 0x20. This is easy to then undo by multiplying
// the color component to be unpremultiplied by either 8 or 0x100,
// respectively. The 16 bit reciprocal is duplicated into both words of a
// uint32_t here to reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) \
  (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
static const uint32_t sUnpremultiplyTable_SSE2[256] = {0,
                                                       UNPREMULQ_SSE2(1),
                                                       UNPREMULQ_SSE2_2(2),
                                                       UNPREMULQ_SSE2_4(4),
                                                       UNPREMULQ_SSE2_8(8),
                                                       UNPREMULQ_SSE2_16(16),
                                                       UNPREMULQ_SSE2_32(32),
                                                       UNPREMULQ_SSE2_32(64),
                                                       UNPREMULQ_SSE2_32(96),
                                                       UNPREMULQ_SSE2_32(128),
                                                       UNPREMULQ_SSE2_32(160),
                                                       UNPREMULQ_SSE2_32(192),
                                                       UNPREMULQ_SSE2_32(224)};
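// For example, UNPREMULQ_SSE2(0x80) = 0x10001 * (0xFF0220 / (0x80 * 8))
// = 0x10001 * 0x3FC0, i.e. the reciprocal 0x3FC0 duplicated into both 16-bit
// halves. Unpremultiplying a component c at alpha 0x80 then computes
// (c * 8 * 0x3FC0) >> 16, which equals floor(c * 255 / 0x80).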
// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i UnpremultiplyVector_SSE2(
    const __m128i& aSrc) {
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // The reciprocals are duplicated in each uint32_t entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 =
      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 =
      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);
  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
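  // The compare works on whole 32-bit lanes: each lane of ga is (A << 16) | G
  // with G < 0x100, so a lane is below 0x00200000 exactly when A < 0x20.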
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if a < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);
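  // Net effect per component: (c * scale * (0xFF0220 / (a * scale))) >> 16,
  // and since 0xFF0220 / 0x10000 is just over 255, this approximates
  // c * 255 / a.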
  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}
// Unpremultiply vector of aAlignedRow + aRemainder pixels.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_SSE2(const uint8_t*& aSrc,
                                                      uint8_t*& aDst,
                                                      int32_t aAlignedRow,
                                                      int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}
// Unpremultiply vector of aLength pixels.
template <bool aSwapRB>
void UnpremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
// Force instantiation of unpremultiply variants here.
template void UnpremultiplyRow_SSE2<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpremultiplyRow_SSE2<true>(const uint8_t*, uint8_t*, int32_t);
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);
// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}
#if 0
// These specializations currently do not profile faster than the generic
// versions, so disable them for now.

// Optimized implementations for when there is no R and B swap.
template <>
MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2<false, true>(
    const __m128i& aSrc) {
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template <>
MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2<false, false>(
    const __m128i& aSrc) {
  // No swap or opaquify needed, so pass the pixels through unchanged.
  return aSrc;
}
#endif
// Swizzle vector of aAlignedRow + aRemainder pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
                                                uint8_t*& aDst,
                                                int32_t aAlignedRow,
                                                int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}
// Swizzle vector of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                  int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
// Force instantiation of swizzle variants here.
template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

}  // namespace mozilla::gfx