gfx/2d/Swizzle.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "Swizzle.h"
   8 #include "Logging.h"
   9 #include "Orientation.h"
  10 #include "Tools.h"
  11 #include "mozilla/CheckedInt.h"
  12 #include "mozilla/EndianUtils.h"
  13 #include "mozilla/UniquePtr.h"
  14
  15 #ifdef USE_SSE2
  16 #  include "mozilla/SSE.h"
  17 #endif
  18
  19 #ifdef USE_NEON
  20 #  include "mozilla/arm.h"
  21 #endif
  22
  23 #include <new>
  24
  25 namespace mozilla {
  26 namespace gfx {
  27
  28 /**
  29  * Convenience macros for dispatching to various format combinations.
  30  */
  31
  32 // Hash the formats to a relatively dense value to optimize jump table
  33 // generation. The first 6 formats in SurfaceFormat are the 32-bit BGRA variants
  34 // and are the most common formats dispatched here. Room is reserved in the
  35 // lowish bits for up to these 6 destination formats. If a destination format is
  36 // >= 6, the 6th bit is set to avoid collisions.
     //
     // Concretely the key is src * 6 + dst, plus 64 (1 << 6) when dst >= 6. Note
     // that aDstFormat is expanded twice, so the arguments should be free of side
     // effects.
  37 #define FORMAT_KEY(aSrcFormat, aDstFormat) \
  38   (int(aSrcFormat) * 6 + int(aDstFormat) + (int(int(aDstFormat) >= 6) << 6))
  39
     // Emits a `case` label for the given format pair that evaluates the supplied
     // expression and then returns true from the enclosing function.
  40 #define FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, ...) \
  41   case FORMAT_KEY(aSrcFormat, aDstFormat):            \
  42     __VA_ARGS__;                                      \
  43     return true;
  44
     // Like FORMAT_CASE_EXPR, but routes the arguments through FORMAT_CASE_CALL,
     // which each dispatching function #defines locally (and #undefs afterwards)
     // to supply its own call arguments.
  45 #define FORMAT_CASE(aSrcFormat, aDstFormat, ...) \
  46   FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, FORMAT_CASE_CALL(__VA_ARGS__))
  47
     // Emits a `case` label for the given format pair that returns the address of
     // the supplied function from the enclosing function. Used by the lookup
     // routines that hand back a row function.
  48 #define FORMAT_CASE_ROW(aSrcFormat, aDstFormat, ...) \
  49   case FORMAT_KEY(aSrcFormat, aDstFormat):           \
  50     return &__VA_ARGS__;
  51
  52 /**
  53  * Constexpr functions for analyzing format attributes in templates.
  54  */
  55
  56 // Whether B comes before R in pixel memory layout.
     // True for B8G8R8A8, B8G8R8X8 and B8G8R8. R5G6B5_UINT16 is additionally
     // treated as a BGR format, but only when MOZ_LITTLE_ENDIAN() is true.
  57 static constexpr bool IsBGRFormat(SurfaceFormat aFormat) {
  58   return aFormat == SurfaceFormat::B8G8R8A8 ||
  59 #if MOZ_LITTLE_ENDIAN()
  60          aFormat == SurfaceFormat::R5G6B5_UINT16 ||
  61 #endif
  62          aFormat == SurfaceFormat::B8G8R8X8 || aFormat == SurfaceFormat::B8G8R8;
  63 }
  64
  65 // Whether the order of B and R need to be swapped to map from src to dst.
     // True exactly when one of the two formats is BGR-ordered (per IsBGRFormat)
     // and the other is not.
  66 static constexpr bool ShouldSwapRB(SurfaceFormat aSrcFormat,
  67                                    SurfaceFormat aDstFormat) {
  68   return IsBGRFormat(aSrcFormat) != IsBGRFormat(aDstFormat);
  69 }
  70
  71 // The starting byte of the RGB components in pixel memory.
     // Returns 1 for A8R8G8B8 and X8R8G8B8, and 0 for every other format.
  72 static constexpr uint32_t RGBByteIndex(SurfaceFormat aFormat) {
  73   return aFormat == SurfaceFormat::A8R8G8B8 ||
  74                  aFormat == SurfaceFormat::X8R8G8B8
  75              ? 1
  76              : 0;
  77 }
  78
  79 // The byte of the alpha component, which just comes after RGB.
     // Computed modulo 4, so it is byte 3 when RGB starts at byte 0 and wraps
     // around to byte 0 when RGB starts at byte 1.
  80 static constexpr uint32_t AlphaByteIndex(SurfaceFormat aFormat) {
  81   return (RGBByteIndex(aFormat) + 3) % 4;
  82 }
  83
  84 // The endian-dependent bit shift to access RGB of a UINT32 pixel.
     // Little-endian: 8 * RGBByteIndex, i.e. 0 or 8.
     // Big-endian: 8 - 8 * RGBByteIndex, i.e. 8 or 0.
  85 static constexpr uint32_t RGBBitShift(SurfaceFormat aFormat) {
  86 #if MOZ_LITTLE_ENDIAN()
  87   return 8 * RGBByteIndex(aFormat);
  88 #else
  89   return 8 - 8 * RGBByteIndex(aFormat);
  90 #endif
  91 }
  92
  93 // The endian-dependent bit shift to access alpha of a UINT32 pixel.
     // Computed modulo 32: 24 when the RGB shift is 0, and 0 when the RGB shift
     // is 8.
  94 static constexpr uint32_t AlphaBitShift(SurfaceFormat aFormat) {
  95   return (RGBBitShift(aFormat) + 24) % 32;
  96 }
  97
  98 // Whether the pixel format should ignore the value of the alpha channel and
  99 // treat it as opaque.
     // True only for B8G8R8X8, R8G8B8X8 and X8R8G8B8.
 100 static constexpr bool IgnoreAlpha(SurfaceFormat aFormat) {
 101   return aFormat == SurfaceFormat::B8G8R8X8 ||
 102          aFormat == SurfaceFormat::R8G8B8X8 ||
 103          aFormat == SurfaceFormat::X8R8G8B8;
 104 }
 105
 106 // Whether to force alpha to opaque to map from src to dst.
     // True when exactly one of the two formats ignores alpha (per IgnoreAlpha);
     // false when both or neither do.
 107 static constexpr bool ShouldForceOpaque(SurfaceFormat aSrcFormat,
 108                                         SurfaceFormat aDstFormat) {
 109   return IgnoreAlpha(aSrcFormat) != IgnoreAlpha(aDstFormat);
 110 }
 111
 112 #ifdef USE_SSE2
 113 /**
 114  * SSE2 optimizations
      *
      * Forward declarations of the x86 SIMD routines, each paired with the macro
      * that turns it into a dispatch `case`. Only declarations are visible here;
      * the definitions are presumably provided by separate translation units
      * (TODO confirm). Template arguments are derived from the format pair via
      * ShouldSwapRB() and ShouldForceOpaque().
 115  */
 116
     // Whole-surface premultiply: (src, srcGap, dst, dstGap, size) as passed by
     // FORMAT_CASE_CALL in PremultiplyData.
 117 template <bool aSwapRB, bool aOpaqueAlpha>
 118 void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
 119
 120 #  define PREMULTIPLY_SSE2(aSrcFormat, aDstFormat)                     \
 121     FORMAT_CASE(aSrcFormat, aDstFormat,                                \
 122                 Premultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
 123                                  ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 124
     // Single-row premultiply, returned by address from PremultiplyRow.
 125 template <bool aSwapRB, bool aOpaqueAlpha>
 126 void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);
 127
 128 #  define PREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat)            \
 129     FORMAT_CASE_ROW(                                              \
 130         aSrcFormat, aDstFormat,                                   \
 131         PremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
 132                             ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 133
     // Whole-surface unpremultiply; there is no opaque-alpha template parameter.
 134 template <bool aSwapRB>
 135 void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
 136
 137 #  define UNPREMULTIPLY_SSE2(aSrcFormat, aDstFormat) \
 138     FORMAT_CASE(aSrcFormat, aDstFormat,              \
 139                 Unpremultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat)>)
 140
     // Single-row unpremultiply, returned by address from UnpremultiplyRow.
 141 template <bool aSwapRB>
 142 void UnpremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);
 143
 144 #  define UNPREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \
 145     FORMAT_CASE_ROW(                                     \
 146         aSrcFormat, aDstFormat,                          \
 147         UnpremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat)>)
 148
     // Whole-surface swizzle.
 149 template <bool aSwapRB, bool aOpaqueAlpha>
 150 void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
 151
 152 #  define SWIZZLE_SSE2(aSrcFormat, aDstFormat)                     \
 153     FORMAT_CASE(aSrcFormat, aDstFormat,                            \
 154                 Swizzle_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
 155                              ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 156
     // Single-row swizzle.
 157 template <bool aSwapRB, bool aOpaqueAlpha>
 158 void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t);
 159
 160 #  define SWIZZLE_ROW_SSE2(aSrcFormat, aDstFormat)            \
 161     FORMAT_CASE_ROW(                                          \
 162         aSrcFormat, aDstFormat,                               \
 163         SwizzleRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
 164                         ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 165
     // Row unpacker (SSSE3 variant); the macro below always uses
     // SurfaceFormat::R8G8B8 as the source format.
 166 template <bool aSwapRB>
 167 void UnpackRowRGB24_SSSE3(const uint8_t*, uint8_t*, int32_t);
 168
 169 #  define UNPACK_ROW_RGB_SSSE3(aDstFormat) \
 170     FORMAT_CASE_ROW(                       \
 171         SurfaceFormat::R8G8B8, aDstFormat, \
 172         UnpackRowRGB24_SSSE3<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
 173
     // Row unpacker (AVX2 variant); the macro below always uses
     // SurfaceFormat::R8G8B8 as the source format.
 174 template <bool aSwapRB>
 175 void UnpackRowRGB24_AVX2(const uint8_t*, uint8_t*, int32_t);
 176
 177 #  define UNPACK_ROW_RGB_AVX2(aDstFormat)  \
 178     FORMAT_CASE_ROW(                       \
 179         SurfaceFormat::R8G8B8, aDstFormat, \
 180         UnpackRowRGB24_AVX2<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
 181
 182 #endif
 183
 184 #ifdef USE_NEON
 185 /**
 186  * ARM NEON optimizations
      *
      * Forward declarations of the NEON routines, each paired with the macro
      * that turns it into a dispatch `case`. Only declarations are visible here;
      * the definitions are presumably provided by a separate translation unit
      * (TODO confirm). Template arguments are derived from the format pair via
      * ShouldSwapRB() and ShouldForceOpaque().
 187  */
 188
     // Whole-surface premultiply: (src, srcGap, dst, dstGap, size) as passed by
     // FORMAT_CASE_CALL in PremultiplyData.
 189 template <bool aSwapRB, bool aOpaqueAlpha>
 190 void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
 191
 192 #  define PREMULTIPLY_NEON(aSrcFormat, aDstFormat)                     \
 193     FORMAT_CASE(aSrcFormat, aDstFormat,                                \
 194                 Premultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
 195                                  ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 196
     // Single-row premultiply, returned by address from PremultiplyRow.
 197 template <bool aSwapRB, bool aOpaqueAlpha>
 198 void PremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);
 199
 200 #  define PREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat)            \
 201     FORMAT_CASE_ROW(                                              \
 202         aSrcFormat, aDstFormat,                                   \
 203         PremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
 204                             ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 205
     // Whole-surface unpremultiply; there is no opaque-alpha template parameter.
 206 template <bool aSwapRB>
 207 void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
 208
 209 #  define UNPREMULTIPLY_NEON(aSrcFormat, aDstFormat) \
 210     FORMAT_CASE(aSrcFormat, aDstFormat,              \
 211                 Unpremultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat)>)
 212
     // Single-row unpremultiply, returned by address from UnpremultiplyRow.
 213 template <bool aSwapRB>
 214 void UnpremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);
 215
 216 #  define UNPREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \
 217     FORMAT_CASE_ROW(                                     \
 218         aSrcFormat, aDstFormat,                          \
 219         UnpremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat)>)
 220
     // Whole-surface swizzle.
 221 template <bool aSwapRB, bool aOpaqueAlpha>
 222 void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);
 223
 224 #  define SWIZZLE_NEON(aSrcFormat, aDstFormat)                     \
 225     FORMAT_CASE(aSrcFormat, aDstFormat,                            \
 226                 Swizzle_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
 227                              ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 228
     // Single-row swizzle.
 229 template <bool aSwapRB, bool aOpaqueAlpha>
 230 void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t);
 231
 232 #  define SWIZZLE_ROW_NEON(aSrcFormat, aDstFormat)            \
 233     FORMAT_CASE_ROW(                                          \
 234         aSrcFormat, aDstFormat,                               \
 235         SwizzleRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
 236                         ShouldForceOpaque(aSrcFormat, aDstFormat)>)
 237
     // Row unpacker; the macro below always uses SurfaceFormat::R8G8B8 as the
     // source format.
 238 template <bool aSwapRB>
 239 void UnpackRowRGB24_NEON(const uint8_t*, uint8_t*, int32_t);
 240
 241 #  define UNPACK_ROW_RGB_NEON(aDstFormat)  \
 242     FORMAT_CASE_ROW(                       \
 243         SurfaceFormat::R8G8B8, aDstFormat, \
 244         UnpackRowRGB24_NEON<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
 245 #endif
 246
 247 /**
 248  * Premultiplying
 249  */
 250
 251 // Fallback premultiply implementation that uses splayed pixel math to reduce
 252 // the multiplications used. That is, the R and B components are isolated from
 253 // the G and A components, which then can be multiplied as if they were two
 254 // 2-component vectors. Otherwise, an approximation of divide-by-255 is used
 255 // which is faster than an actual division. These optimizations are also used
 256 // for the SSE2 and NEON implementations.
     //
     // Template parameters:
     //   aSwapRB      - exchange the R and B components while converting.
     //   aOpaqueAlpha - write 0xFF as the destination alpha instead of the source
     //                  alpha (RGB is still multiplied by the source alpha).
     //   aSrcRGBShift, aSrcAShift - bit positions of RGB and alpha within the
     //                  source pixel when it is loaded as a uint32_t.
     //   aDstRGBShift, aDstAShift - the same for the destination pixel.
     //
     // aSrc and aDst are taken by reference and are left pointing just past the
     // processed pixels. aLength is a count of 4-byte pixels. Because this is a
     // do/while loop, one pixel is always read and written before the length is
     // checked, so callers must pass aLength >= 1.
 257 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
 258           uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
 259 static void PremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
 260                                      int32_t aLength) {
 261   const uint8_t* end = aSrc + 4 * aLength;
 262   do {
 263     // Load and process 1 entire pixel at a time.
         // NOTE(review): the 32-bit load here and the store below assume aSrc and
         // aDst may be accessed as uint32_t - confirm alignment expectations with
         // callers.
 264     uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);
 265
         // A non-zero alpha shift is expected to place alpha in the top byte (see
         // AlphaBitShift), so the shift alone isolates it; otherwise alpha is the
         // low byte and is masked out.
 266     uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;
 267
 268     // Isolate the R and B components.
 269     uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
 270     // Swap the order of R and B if necessary.
 271     if (aSwapRB) {
 272       rb = (rb >> 16) | (rb << 16);
 273     }
 274     // Approximate the multiply by alpha and divide by 255 which is
 275     // essentially:
 276     // c = c*a + 255; c = (c + (c >> 8)) >> 8;
 277     // However, we omit the final >> 8 to fold it with the final shift into
 278     // place depending on desired output format.
 279     rb = rb * a + 0x00FF00FF;
 280     rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;
 281
 282     // Use same approximation as above, but G is shifted 8 bits left.
 283     // Alpha is left out and handled separately.
 284     uint32_t g = color & (0xFF00 << aSrcRGBShift);
 285     g = g * a + (0xFF00 << aSrcRGBShift);
 286     g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);
 287
 288     // The above math leaves RGB shifted left by 8 bits.
 289     // Shift them right if required for the output format.
 290     // then combine them back together to produce output pixel.
 291     // Add the alpha back on if the output format is not opaque.
 292     *reinterpret_cast<uint32_t*>(aDst) =
 293         (rb >> (8 - aDstRGBShift)) | (g >> (8 + aSrcRGBShift - aDstRGBShift)) |
 294         (aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);
 295
 296     aSrc += 4;
 297     aDst += 4;
 298   } while (aSrc < end);
 299 }
 300
     // Single-row entry point with the (src, dst, length) signature that
     // FORMAT_CASE_ROW returns by address. It forwards to
     // PremultiplyChunkFallback; the pointers are taken by value here, so the
     // caller's pointers are not advanced. aLength is a pixel count and, as in
     // the chunk helper, at least one pixel is always processed.
 301 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
 302           uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
 303 static void PremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
 304                                    int32_t aLength) {
 305   PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
 306                            aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
 307 }
 308
     // Whole-surface fallback: premultiplies aSize.height rows of aSize.width
     // pixels each. The chunk helper advances aSrc/aDst past the pixels of the
     // current row; aSrcGap/aDstGap are then added to skip the remaining bytes up
     // to the start of the next row.
 309 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
 310           uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
 311 static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
 312                                 uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
 313   for (int32_t height = aSize.height; height > 0; height--) {
 314     PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
 315                              aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
 316     aSrc += aSrcGap;
 317     aDst += aDstGap;
 318   }
 319 }
 320
     // Dispatch case that calls PremultiplyFallback for one (src, dst) format
     // pair, with all template arguments computed from the two formats.
 321 #define PREMULTIPLY_FALLBACK_CASE(aSrcFormat, aDstFormat)                     \
 322   FORMAT_CASE(                                                                \
 323       aSrcFormat, aDstFormat,                                                 \
 324       PremultiplyFallback<ShouldSwapRB(aSrcFormat, aDstFormat),               \
 325                           ShouldForceOpaque(aSrcFormat, aDstFormat),          \
 326                           RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
 327                           RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
 328
     // Expands to the fallback cases from aSrcFormat to each of the six 32-bit
     // destination formats listed below.
 329 #define PREMULTIPLY_FALLBACK(aSrcFormat)                         \
 330   PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 331   PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
 332   PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 333   PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
 334   PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
 335   PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
 336
     // Dispatch case that returns the address of PremultiplyRowFallback
     // instantiated for one (src, dst) format pair.
 337 #define PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
 338   FORMAT_CASE_ROW(aSrcFormat, aDstFormat,                                 \
 339                   PremultiplyRowFallback<                                 \
 340                       ShouldSwapRB(aSrcFormat, aDstFormat),               \
 341                       ShouldForceOpaque(aSrcFormat, aDstFormat),          \
 342                       RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
 343                       RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
 344
     // Expands to the row fallback cases from aSrcFormat to each of the six
     // 32-bit destination formats listed below.
 345 #define PREMULTIPLY_ROW_FALLBACK(aSrcFormat)                         \
 346   PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 347   PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
 348   PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 349   PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
 350   PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
 351   PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
 352
 353 // If rows are tightly packed, and the size of the total area will fit within
 354 // the precision range of a single row, then process all the data as if it was
 355 // a single row.
     //
     // "Tightly packed" here means: both strides are equal, the stride is a
     // multiple of 4, and stride / 4 equals the width. If additionally
     // width * height does not overflow int32_t, returns IntSize(width * height,
     // 1); otherwise returns aSize unchanged.
 356 static inline IntSize CollapseSize(const IntSize& aSize, int32_t aSrcStride,
 357                                    int32_t aDstStride) {
 358   if (aSrcStride == aDstStride && (aSrcStride & 3) == 0 &&
 359       aSrcStride / 4 == aSize.width) {
         // Overflow-checked area computation.
 360     CheckedInt32 area = CheckedInt32(aSize.width) * CheckedInt32(aSize.height);
 361     if (area.isValid()) {
 362       return IntSize(area.value(), 1);
 363     }
 364   }
 365   return aSize;
 366 }
 367
     // Returns the number of bytes left over in a row after its pixel data, i.e.
     // aStride - aWidth * BytesPerPixel(aFormat). Returns -1 if the used byte
     // count overflows int32_t or is negative. The result can also be negative
     // when aStride is smaller than the bytes used by a row; callers in this file
     // treat any negative result as an error.
 368 static inline int32_t GetStrideGap(int32_t aWidth, SurfaceFormat aFormat,
 369                                    int32_t aStride) {
 370   CheckedInt32 used = CheckedInt32(aWidth) * BytesPerPixel(aFormat);
 371   if (!used.isValid() || used.value() < 0) {
 372     return -1;
 373   }
 374   return aStride - used.value();
 375 }
 376
     // Premultiplies a surface's color components by alpha while converting from
     // aSrcFormat to aDstFormat.
     //
     // aSrc/aSrcStride and aDst/aDstStride describe the source and destination
     // buffers (the strides are compared against width * bytes-per-pixel), and
     // aSize is the area in pixels.
     //
     // Returns true on success, including the trivial case of an empty aSize.
     // Returns false if a stride gap is negative (stride too small for a row, or
     // the row size overflowed), or - after asserting - if no implementation
     // handles the format pair. Implementations are tried in the order SSE2,
     // NEON (each only if compiled in and supported at runtime), then the
     // portable fallback.
 377 bool PremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
 378                      SurfaceFormat aSrcFormat, uint8_t* aDst,
 379                      int32_t aDstStride, SurfaceFormat aDstFormat,
 380                      const IntSize& aSize) {
 381   if (aSize.IsEmpty()) {
 382     return true;
 383   }
       // Possibly treat the whole surface as one long row. The gaps below are
       // still computed from the original width; when the size was collapsed the
       // height is 1, so the gaps are only added after the single row is done.
 384   IntSize size = CollapseSize(aSize, aSrcStride, aDstStride);
 385   // Find gap from end of row to the start of the next row.
 386   int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
 387   int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
 388   MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
 389   if (srcGap < 0 || dstGap < 0) {
 390     return false;
 391   }
 392
     // Call shape used by every FORMAT_CASE expanded in this function.
 393 #define FORMAT_CASE_CALL(...) __VA_ARGS__(aSrc, srcGap, aDst, dstGap, size)
 394
 395 #ifdef USE_SSE2
       // Each matching case below calls the routine and returns true; unmatched
       // keys fall through to the next implementation.
 396   if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 397       PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 398       PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
 399       PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 400       PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
 401       PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 402       PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
 403       PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 404       PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
 405       default:
 406         break;
 407     }
 408 #endif
 409
 410 #ifdef USE_NEON
 411   if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 412       PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 413       PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
 414       PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 415       PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
 416       PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 417       PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
 418       PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 419       PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
 420       default:
 421         break;
 422     }
 423 #endif
 424
       // Portable fallback: three source formats, six destinations each.
 425   switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 426     PREMULTIPLY_FALLBACK(SurfaceFormat::B8G8R8A8)
 427     PREMULTIPLY_FALLBACK(SurfaceFormat::R8G8B8A8)
 428     PREMULTIPLY_FALLBACK(SurfaceFormat::A8R8G8B8)
 429     default:
 430       break;
 431   }
 432
 433 #undef FORMAT_CASE_CALL
 434
 435   MOZ_ASSERT(false, "Unsupported premultiply formats");
 436   return false;
 437 }
 438
     // Looks up a single-row premultiply routine for the given format pair.
     //
     // Returns the address of the first matching implementation, trying SSE2 and
     // NEON (each only if compiled in and supported at runtime) before the
     // portable fallback. If no implementation matches, asserts and returns
     // nullptr.
 439 SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
 440                             SurfaceFormat aDstFormat) {
 441 #ifdef USE_SSE2
       // Each matching case returns a function address directly; unmatched keys
       // fall through to the next implementation.
 442   if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 443       PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 444       PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
 445       PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 446       PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
 447       PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 448       PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
 449       PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 450       PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
 451       default:
 452         break;
 453     }
 454 #endif
 455
 456 #ifdef USE_NEON
 457   if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 458       PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 459       PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
 460       PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 461       PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
 462       PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 463       PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
 464       PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 465       PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
 466       default:
 467         break;
 468     }
 469 #endif
 470
       // Portable fallback: three source formats, six destinations each.
 471   switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 472     PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
 473     PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
 474     PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
 475     default:
 476       break;
 477   }
 478
 479   MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
 480   return nullptr;
 481 }
 482
 483 /**
 484  * Unpremultiplying
 485  */
 486
 487 // Generate a table of 8.16 fixed-point reciprocals representing 1/alpha.
     // Each entry is 0xFF00FF / alpha (0xFF00FF == 255 * 65537, i.e. roughly
     // 255 * 2^16), so (c * entry) >> 16 approximates c * 255 / alpha. The
     // UNPREMULQ_n helpers expand to n consecutive entries starting at x.
     // Entry 0 is 0, which avoids a division by zero and maps every component of
     // a zero-alpha pixel to 0.
 488 #define UNPREMULQ(x) (0xFF00FFU / (x))
 489 #define UNPREMULQ_2(x) UNPREMULQ(x), UNPREMULQ((x) + 1)
 490 #define UNPREMULQ_4(x) UNPREMULQ_2(x), UNPREMULQ_2((x) + 2)
 491 #define UNPREMULQ_8(x) UNPREMULQ_4(x), UNPREMULQ_4((x) + 4)
 492 #define UNPREMULQ_16(x) UNPREMULQ_8(x), UNPREMULQ_8((x) + 8)
 493 #define UNPREMULQ_32(x) UNPREMULQ_16(x), UNPREMULQ_16((x) + 16)
 494 static const uint32_t sUnpremultiplyTable[256] = {0,
 495                                                   UNPREMULQ(1),
 496                                                   UNPREMULQ_2(2),
 497                                                   UNPREMULQ_4(4),
 498                                                   UNPREMULQ_8(8),
 499                                                   UNPREMULQ_16(16),
 500                                                   UNPREMULQ_32(32),
 501                                                   UNPREMULQ_32(64),
 502                                                   UNPREMULQ_32(96),
 503                                                   UNPREMULQ_32(128),
 504                                                   UNPREMULQ_32(160),
 505                                                   UNPREMULQ_32(192),
 506                                                   UNPREMULQ_32(224)};
 507
 508 // Fallback unpremultiply implementation that uses 8.16 fixed-point reciprocal
 509 // math to eliminate any division by the alpha component. This optimization is
 510 // used for the SSE2 and NEON implementations, with some adaptations. This
 511 // implementation also accesses color components using individual byte accesses
 512 // as this profiles faster than accessing the pixel as a uint32_t and
 513 // shifting/masking to access components.
     //
     // Template parameters:
     //   aSwapRB - read the first and third color bytes in swapped order.
     //   aSrcRGBIndex, aSrcAIndex - byte offsets of RGB and alpha within a source
     //                  pixel.
     //   aDstRGBIndex, aDstAIndex - the same for a destination pixel.
     //
     // aSrc and aDst are taken by reference and are left pointing just past the
     // processed pixels. aLength is a count of 4-byte pixels. Because this is a
     // do/while loop, one pixel is always read and written before the length is
     // checked, so callers must pass aLength >= 1.
 514 template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
 515           uint32_t aDstRGBIndex, uint32_t aDstAIndex>
 516 static void UnpremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
 517                                        int32_t aLength) {
 518   const uint8_t* end = aSrc + 4 * aLength;
 519   do {
         // With aSwapRB the first and third color bytes exchange roles, so the
         // stores below always write them in destination order.
 520     uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
 521     uint8_t g = aSrc[aSrcRGBIndex + 1];
 522     uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
 523     uint8_t a = aSrc[aSrcAIndex];
 524
 525     // Access the 8.16 reciprocal from the table based on alpha. Multiply by
 526     // the reciprocal and shift off the fraction bits to approximate the
 527     // division by alpha.
         // The results are stored into bytes without clamping, so a component
         // larger than alpha (not a valid premultiplied pixel) is truncated to its
         // low 8 bits. Alpha 0 yields q == 0 and therefore zero color.
 528     uint32_t q = sUnpremultiplyTable[a];
 529     aDst[aDstRGBIndex + 0] = (r * q) >> 16;
 530     aDst[aDstRGBIndex + 1] = (g * q) >> 16;
 531     aDst[aDstRGBIndex + 2] = (b * q) >> 16;
         // Alpha is copied through unchanged.
 532     aDst[aDstAIndex] = a;
 533
 534     aSrc += 4;
 535     aDst += 4;
 536   } while (aSrc < end);
 537 }
 538
     // Single-row entry point with the (src, dst, length) signature that
     // FORMAT_CASE_ROW returns by address. It forwards to
     // UnpremultiplyChunkFallback; the pointers are taken by value here, so the
     // caller's pointers are not advanced. aLength is a pixel count and, as in
     // the chunk helper, at least one pixel is always processed.
 539 template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
 540           uint32_t aDstRGBIndex, uint32_t aDstAIndex>
 541 static void UnpremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
 542                                      int32_t aLength) {
 543   UnpremultiplyChunkFallback<aSwapRB, aSrcRGBIndex, aSrcAIndex, aDstRGBIndex,
 544                              aDstAIndex>(aSrc, aDst, aLength);
 545 }
 546
     // Whole-surface fallback: unpremultiplies aSize.height rows of aSize.width
     // pixels each. The chunk helper advances aSrc/aDst past the pixels of the
     // current row; aSrcGap/aDstGap are then added to skip the remaining bytes up
     // to the start of the next row.
 547 template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
 548           uint32_t aDstRGBIndex, uint32_t aDstAIndex>
 549 static void UnpremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
 550                                   uint8_t* aDst, int32_t aDstGap,
 551                                   IntSize aSize) {
 552   for (int32_t height = aSize.height; height > 0; height--) {
 553     UnpremultiplyChunkFallback<aSwapRB, aSrcRGBIndex, aSrcAIndex, aDstRGBIndex,
 554                                aDstAIndex>(aSrc, aDst, aSize.width);
 555     aSrc += aSrcGap;
 556     aDst += aDstGap;
 557   }
 558 }
 559
     // Dispatch case that calls UnpremultiplyFallback for one (src, dst) format
     // pair, with byte indices computed from the two formats.
 560 #define UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
 561   FORMAT_CASE(aSrcFormat, aDstFormat,                                   \
 562               UnpremultiplyFallback<                                    \
 563                   ShouldSwapRB(aSrcFormat, aDstFormat),                 \
 564                   RGBByteIndex(aSrcFormat), AlphaByteIndex(aSrcFormat), \
 565                   RGBByteIndex(aDstFormat), AlphaByteIndex(aDstFormat)>)
 566
     // Expands to the fallback cases from aSrcFormat to each of the three
     // alpha-carrying destination formats listed below.
 567 #define UNPREMULTIPLY_FALLBACK(aSrcFormat)                         \
 568   UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 569   UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 570   UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8)
 571
     // Dispatch case that returns the address of UnpremultiplyRowFallback
     // instantiated for one (src, dst) format pair.
 572 #define UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
 573   FORMAT_CASE_ROW(aSrcFormat, aDstFormat,                                   \
 574                   UnpremultiplyRowFallback<                                 \
 575                       ShouldSwapRB(aSrcFormat, aDstFormat),                 \
 576                       RGBByteIndex(aSrcFormat), AlphaByteIndex(aSrcFormat), \
 577                       RGBByteIndex(aDstFormat), AlphaByteIndex(aDstFormat)>)
 578
     // Expands to the row fallback cases from aSrcFormat to each of the three
     // alpha-carrying destination formats listed below.
 579 #define UNPREMULTIPLY_ROW_FALLBACK(aSrcFormat)                         \
 580   UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 581   UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 582   UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8)
 583
     // Reverses alpha premultiplication of a surface's color components while
     // converting from aSrcFormat to aDstFormat.
     //
     // aSrc/aSrcStride and aDst/aDstStride describe the source and destination
     // buffers (the strides are compared against width * bytes-per-pixel), and
     // aSize is the area in pixels.
     //
     // Returns true on success, including the trivial case of an empty aSize.
     // Returns false if a stride gap is negative (stride too small for a row, or
     // the row size overflowed), or - after asserting - if no implementation
     // handles the format pair. Implementations are tried in the order SSE2,
     // NEON (each only if compiled in and supported at runtime), then the
     // portable fallback.
 584 bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
 585                        SurfaceFormat aSrcFormat, uint8_t* aDst,
 586                        int32_t aDstStride, SurfaceFormat aDstFormat,
 587                        const IntSize& aSize) {
 588   if (aSize.IsEmpty()) {
 589     return true;
 590   }
       // Possibly treat the whole surface as one long row. The gaps below are
       // still computed from the original width; when the size was collapsed the
       // height is 1, so the gaps are only added after the single row is done.
 591   IntSize size = CollapseSize(aSize, aSrcStride, aDstStride);
 592   // Find gap from end of row to the start of the next row.
 593   int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
 594   int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
 595   MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
 596   if (srcGap < 0 || dstGap < 0) {
 597     return false;
 598   }
 599
     // Call shape used by every FORMAT_CASE expanded in this function.
 600 #define FORMAT_CASE_CALL(...) __VA_ARGS__(aSrc, srcGap, aDst, dstGap, size)
 601
 602 #ifdef USE_SSE2
       // Each matching case below calls the routine and returns true; unmatched
       // keys fall through to the next implementation.
 603   if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 604       UNPREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 605       UNPREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 606       UNPREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 607       UNPREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 608       default:
 609         break;
 610     }
 611 #endif
 612
 613 #ifdef USE_NEON
 614   if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 615       UNPREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 616       UNPREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 617       UNPREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 618       UNPREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 619       default:
 620         break;
 621     }
 622 #endif
 623
       // Portable fallback: three source formats, three destinations each.
 624   switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 625     UNPREMULTIPLY_FALLBACK(SurfaceFormat::B8G8R8A8)
 626     UNPREMULTIPLY_FALLBACK(SurfaceFormat::R8G8B8A8)
 627     UNPREMULTIPLY_FALLBACK(SurfaceFormat::A8R8G8B8)
 628     default:
 629       break;
 630   }
 631
 632 #undef FORMAT_CASE_CALL
 633
 634   MOZ_ASSERT(false, "Unsupported unpremultiply formats");
 635   return false;
 636 }
 637
     // Looks up a single-row unpremultiply routine for the given format pair.
     //
     // Returns the address of the first matching implementation, trying SSE2 and
     // NEON (each only if compiled in and supported at runtime) before the
     // portable fallback. If no implementation matches, asserts and returns
     // nullptr.
 638 SwizzleRowFn UnpremultiplyRow(SurfaceFormat aSrcFormat,
 639                               SurfaceFormat aDstFormat) {
 640 #ifdef USE_SSE2
       // Each matching case returns a function address directly; unmatched keys
       // fall through to the next implementation.
 641   if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 642       UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 643       UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 644       UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 645       UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 646       default:
 647         break;
 648     }
 649 #endif
 650
 651 #ifdef USE_NEON
 652   if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 653       UNPREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
 654       UNPREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
 655       UNPREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
 656       UNPREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
 657       default:
 658         break;
 659     }
 660 #endif
 661
       // Portable fallback: three source formats, three destinations each.
 662   switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
 663     UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
 664     UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
 665     UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
 666     default:
 667       break;
 668   }
 669
       // The message names the unpremultiply operation so that a failure here is
       // distinguishable from one in PremultiplyRow.
 670   MOZ_ASSERT_UNREACHABLE("Unsupported unpremultiply formats");
 671   return nullptr;
 672 }
 673
 674 /**
 675  * Swizzling
 676  */
 677
 678 // Fallback swizzle implementation that uses shifting and masking to reorder
 679 // pixels.
     //
     // Template parameters:
     //   aSwapRB      - exchange the R and B components.
     //   aOpaqueAlpha - force the destination alpha byte to 0xFF.
     //   aSrcRGBShift, aSrcAShift - bit positions of RGB and alpha within the
     //                  source pixel when it is loaded as a uint32_t.
     //   aDstRGBShift, aDstAShift - the same for the destination pixel.
     //
     // aSrc and aDst are taken by reference and are left pointing just past the
     // processed pixels. aLength is a count of 4-byte pixels. Because this is a
     // do/while loop, one pixel is always read and written before the length is
     // checked, so callers must pass aLength >= 1.
 680 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
 681           uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
 682 static void SwizzleChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
 683                                  int32_t aLength) {
 684   const uint8_t* end = aSrc + 4 * aLength;
 685   do {
         // NOTE(review): the 32-bit load here and the store below assume aSrc and
         // aDst may be accessed as uint32_t - confirm alignment expectations with
         // callers.
 686     uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
 687
 688     if (aSwapRB) {
 689       // Handle R and B swaps by exchanging words and masking.
           // G and A are masked out of the original value and kept in place.
 690       uint32_t rb =
 691           ((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
 692       uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
 693       rgba = rb | ga;
 694     }
 695
 696     // If src and dst shifts differ, rotate left or right to move RGB into
 697     // place, i.e. RGBA -> ARGB or ARGB -> RGBA.
         // When forcing opaque alpha, the byte rotated in is replaced by 0xFF. If
         // the shifts are equal, only the alpha byte is overwritten (when
         // requested).
 698     if (aDstRGBShift > aSrcRGBShift) {
 699       rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
 700     } else if (aSrcRGBShift > aDstRGBShift) {
 701       rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
 702     } else if (aOpaqueAlpha) {
 703       rgba |= 0xFF << aDstAShift;
 704     }
 705
 706     *reinterpret_cast<uint32_t*>(aDst) = rgba;
 707
 708     aSrc += 4;
 709     aDst += 4;
 710   } while (aSrc < end);
 711 }
 712
 713 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
 714           uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
 715 static void SwizzleRowFallback(const uint8_t* aSrc, uint8_t* aDst,
 716                                int32_t aLength) {
 717   SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
 718                        aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
 719 }
 720
 721 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
 722           uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
 723 static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 724                             int32_t aDstGap, IntSize aSize) {
 725   for (int32_t height = aSize.height; height > 0; height--) {
 726     SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
 727                          aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
 728     aSrc += aSrcGap;
 729     aDst += aDstGap;
 730   }
 731 }
 732
// Case entry for a 2D swizzle switch: dispatches the format pair to the
// SwizzleFallback instantiation whose R/B swap flag, opaque-alpha flag and
// bit shifts are derived from the two formats. Expands through FORMAT_CASE,
// so FORMAT_CASE_CALL must be defined at the point of use.
#define SWIZZLE_FALLBACK(aSrcFormat, aDstFormat)                          \
  FORMAT_CASE(                                                            \
      aSrcFormat, aDstFormat,                                             \
      SwizzleFallback<ShouldSwapRB(aSrcFormat, aDstFormat),               \
                      ShouldForceOpaque(aSrcFormat, aDstFormat),          \
                      RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                      RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)

// Case entry for a row-function switch: returns the address of the
// SwizzleRowFallback instantiation matching the format pair.
#define SWIZZLE_ROW_FALLBACK(aSrcFormat, aDstFormat)                         \
  FORMAT_CASE_ROW(                                                           \
      aSrcFormat, aDstFormat,                                                \
      SwizzleRowFallback<ShouldSwapRB(aSrcFormat, aDstFormat),               \
                         ShouldForceOpaque(aSrcFormat, aDstFormat),          \
                         RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                         RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
 748
 749 // Fast-path for matching formats.
 750 template <int32_t aBytesPerPixel>
 751 static void SwizzleRowCopy(const uint8_t* aSrc, uint8_t* aDst,
 752                            int32_t aLength) {
 753   if (aSrc != aDst) {
 754     memcpy(aDst, aSrc, aLength * aBytesPerPixel);
 755   }
 756 }
 757
 758 // Fast-path for matching formats.
 759 static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 760                         int32_t aDstGap, IntSize aSize, int32_t aBPP) {
 761   if (aSrc != aDst) {
 762     int32_t rowLength = aBPP * aSize.width;
 763     for (int32_t height = aSize.height; height > 0; height--) {
 764       memcpy(aDst, aSrc, rowLength);
 765       aSrc += rowLength + aSrcGap;
 766       aDst += rowLength + aDstGap;
 767     }
 768   }
 769 }
 770
 771 // Fast-path for conversions that swap all bytes.
 772 template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
 773 static void SwizzleChunkSwap(const uint8_t*& aSrc, uint8_t*& aDst,
 774                              int32_t aLength) {
 775   const uint8_t* end = aSrc + 4 * aLength;
 776   do {
 777     // Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
 778     uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
 779 #if MOZ_LITTLE_ENDIAN()
 780     rgba = NativeEndian::swapToBigEndian(rgba);
 781 #else
 782     rgba = NativeEndian::swapToLittleEndian(rgba);
 783 #endif
 784     if (aOpaqueAlpha) {
 785       rgba |= 0xFF << aDstAShift;
 786     }
 787     *reinterpret_cast<uint32_t*>(aDst) = rgba;
 788     aSrc += 4;
 789     aDst += 4;
 790   } while (aSrc < end);
 791 }
 792
 793 template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
 794 static void SwizzleRowSwap(const uint8_t* aSrc, uint8_t* aDst,
 795                            int32_t aLength) {
 796   SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst, aLength);
 797 }
 798
 799 template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
 800 static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 801                         int32_t aDstGap, IntSize aSize) {
 802   for (int32_t height = aSize.height; height > 0; height--) {
 803     SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst,
 804                                                            aSize.width);
 805     aSrc += aSrcGap;
 806     aDst += aDstGap;
 807   }
 808 }
 809
// Case entry for a 2D swizzle switch: dispatches the format pair to the
// SwizzleSwap instantiation whose opaque-alpha flag and alpha shifts are
// derived from the two formats. Expands through FORMAT_CASE, so
// FORMAT_CASE_CALL must be defined at the point of use.
#define SWIZZLE_SWAP(aSrcFormat, aDstFormat)                 \
  FORMAT_CASE(                                               \
      aSrcFormat, aDstFormat,                                \
      SwizzleSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
                  AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)

// Case entry for a row-function switch: returns the address of the
// SwizzleRowSwap instantiation matching the format pair.
#define SWIZZLE_ROW_SWAP(aSrcFormat, aDstFormat)                \
  FORMAT_CASE_ROW(                                              \
      aSrcFormat, aDstFormat,                                   \
      SwizzleRowSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
                     AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
 821
 822 static void SwizzleChunkSwapRGB24(const uint8_t*& aSrc, uint8_t*& aDst,
 823                                   int32_t aLength) {
 824   const uint8_t* end = aSrc + 3 * aLength;
 825   do {
 826     uint8_t r = aSrc[0];
 827     uint8_t g = aSrc[1];
 828     uint8_t b = aSrc[2];
 829     aDst[0] = b;
 830     aDst[1] = g;
 831     aDst[2] = r;
 832     aSrc += 3;
 833     aDst += 3;
 834   } while (aSrc < end);
 835 }
 836
 837 static void SwizzleRowSwapRGB24(const uint8_t* aSrc, uint8_t* aDst,
 838                                 int32_t aLength) {
 839   SwizzleChunkSwapRGB24(aSrc, aDst, aLength);
 840 }
 841
 842 static void SwizzleSwapRGB24(const uint8_t* aSrc, int32_t aSrcGap,
 843                              uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
 844   for (int32_t height = aSize.height; height > 0; height--) {
 845     SwizzleChunkSwapRGB24(aSrc, aDst, aSize.width);
 846     aSrc += aSrcGap;
 847     aDst += aDstGap;
 848   }
 849 }
 850
// Case entry for a 2D swizzle switch: dispatches the format pair to
// SwizzleSwapRGB24. Expands through FORMAT_CASE, so FORMAT_CASE_CALL must be
// defined at the point of use.
#define SWIZZLE_SWAP_RGB24(aSrcFormat, aDstFormat) \
  FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleSwapRGB24)

// Case entry for a row-function switch: returns the address of
// SwizzleRowSwapRGB24 for the format pair.
#define SWIZZLE_ROW_SWAP_RGB24(aSrcFormat, aDstFormat) \
  FORMAT_CASE_ROW(aSrcFormat, aDstFormat, SwizzleRowSwapRGB24)
 856
 857 // Fast-path for conversions that force alpha to opaque.
 858 template <uint32_t aDstAShift>
 859 static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) {
 860   const uint8_t* end = aBuffer + 4 * aLength;
 861   do {
 862     uint32_t rgba = *reinterpret_cast<const uint32_t*>(aBuffer);
 863     // Just add on the alpha bits to the source.
 864     rgba |= 0xFF << aDstAShift;
 865     *reinterpret_cast<uint32_t*>(aBuffer) = rgba;
 866     aBuffer += 4;
 867   } while (aBuffer < end);
 868 }
 869
 870 template <uint32_t aDstAShift>
 871 static void SwizzleChunkOpaqueCopy(const uint8_t*& aSrc, uint8_t* aDst,
 872                                    int32_t aLength) {
 873   const uint8_t* end = aSrc + 4 * aLength;
 874   do {
 875     uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
 876     // Just add on the alpha bits to the source.
 877     rgba |= 0xFF << aDstAShift;
 878     *reinterpret_cast<uint32_t*>(aDst) = rgba;
 879     aSrc += 4;
 880     aDst += 4;
 881   } while (aSrc < end);
 882 }
 883
 884 template <uint32_t aDstAShift>
 885 static void SwizzleRowOpaque(const uint8_t* aSrc, uint8_t* aDst,
 886                              int32_t aLength) {
 887   if (aSrc == aDst) {
 888     SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aLength);
 889   } else {
 890     SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aLength);
 891   }
 892 }
 893
 894 template <uint32_t aDstAShift>
 895 static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 896                           int32_t aDstGap, IntSize aSize) {
 897   if (aSrc == aDst) {
 898     // Modifying in-place, so just write out the alpha.
 899     for (int32_t height = aSize.height; height > 0; height--) {
 900       SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aSize.width);
 901       aDst += aDstGap;
 902     }
 903   } else {
 904     for (int32_t height = aSize.height; height > 0; height--) {
 905       SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aSize.width);
 906       aSrc += aSrcGap;
 907       aDst += aDstGap;
 908     }
 909   }
 910 }
 911
// Case entry for a 2D swizzle switch: dispatches the format pair to the
// SwizzleOpaque instantiation for the destination format's alpha shift.
// Expands through FORMAT_CASE, so FORMAT_CASE_CALL must be defined at the
// point of use.
#define SWIZZLE_OPAQUE(aSrcFormat, aDstFormat) \
  FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleOpaque<AlphaBitShift(aDstFormat)>)

// Case entry for a row-function switch: returns the address of the
// SwizzleRowOpaque instantiation for the destination format's alpha shift.
#define SWIZZLE_ROW_OPAQUE(aSrcFormat, aDstFormat) \
  FORMAT_CASE_ROW(aSrcFormat, aDstFormat,          \
                  SwizzleRowOpaque<AlphaBitShift(aDstFormat)>)
 918
 919 // Packing of 32-bit formats to RGB565.
 920 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
 921 static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 922                          int32_t aDstGap, IntSize aSize) {
 923   for (int32_t height = aSize.height; height > 0; height--) {
 924     const uint8_t* end = aSrc + 4 * aSize.width;
 925     do {
 926       uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
 927
 928       // Isolate the R, G, and B components and shift to final endian-dependent
 929       // locations.
 930       uint16_t rgb565;
 931       if (aSwapRB) {
 932         rgb565 = ((rgba & (0xF8 << aSrcRGBShift)) << (8 - aSrcRGBShift)) |
 933                  ((rgba & (0xFC00 << aSrcRGBShift)) >> (5 + aSrcRGBShift)) |
 934                  ((rgba & (0xF80000 << aSrcRGBShift)) >> (19 + aSrcRGBShift));
 935       } else {
 936         rgb565 = ((rgba & (0xF8 << aSrcRGBShift)) >> (3 + aSrcRGBShift)) |
 937                  ((rgba & (0xFC00 << aSrcRGBShift)) >> (5 + aSrcRGBShift)) |
 938                  ((rgba & (0xF80000 << aSrcRGBShift)) >> (8 + aSrcRGBShift));
 939       }
 940
 941       *reinterpret_cast<uint16_t*>(aDst) = rgb565;
 942
 943       aSrc += 4;
 944       aDst += 2;
 945     } while (aSrc < end);
 946
 947     aSrc += aSrcGap;
 948     aDst += aDstGap;
 949   }
 950 }
 951
 952 // Packing of 32-bit formats to 24-bit formats.
 953 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
 954 static void PackChunkToRGB24(const uint8_t*& aSrc, uint8_t*& aDst,
 955                              int32_t aLength) {
 956   const uint8_t* end = aSrc + 4 * aLength;
 957   do {
 958     uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
 959     uint8_t g = aSrc[aSrcRGBIndex + 1];
 960     uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
 961
 962     aDst[0] = r;
 963     aDst[1] = g;
 964     aDst[2] = b;
 965
 966     aSrc += 4;
 967     aDst += 3;
 968   } while (aSrc < end);
 969 }
 970
 971 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
 972 static void PackRowToRGB24(const uint8_t* aSrc, uint8_t* aDst,
 973                            int32_t aLength) {
 974   PackChunkToRGB24<aSwapRB, aSrcRGBShift, aSrcRGBIndex>(aSrc, aDst, aLength);
 975 }
 976
 977 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
 978 static void PackToRGB24(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 979                         int32_t aDstGap, IntSize aSize) {
 980   for (int32_t height = aSize.height; height > 0; height--) {
 981     PackChunkToRGB24<aSwapRB, aSrcRGBShift, aSrcRGBIndex>(aSrc, aDst,
 982                                                           aSize.width);
 983     aSrc += aSrcGap;
 984     aDst += aDstGap;
 985   }
 986 }
 987
// Case entry for a 2D switch: dispatches the format pair to aPackFunc
// instantiated with the R/B swap flag and the source format's RGB bit shift
// and byte index. Expands through FORMAT_CASE, so FORMAT_CASE_CALL must be
// defined at the point of use.
#define PACK_RGB_CASE(aSrcFormat, aDstFormat, aPackFunc)      \
  FORMAT_CASE(aSrcFormat, aDstFormat,                         \
              aPackFunc<ShouldSwapRB(aSrcFormat, aDstFormat), \
                        RGBBitShift(aSrcFormat), RGBByteIndex(aSrcFormat)>)

// Emits a PACK_RGB_CASE for each of the six 32-bit source formats listed
// below, all targeting aDstFormat.
#define PACK_RGB(aDstFormat, aPackFunc)                         \
  PACK_RGB_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::B8G8R8X8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::R8G8B8X8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::X8R8G8B8, aDstFormat, aPackFunc)

// Row-function counterpart of PACK_RGB_CASE: returns the address of the
// aPackFunc instantiation matching the format pair.
#define PACK_ROW_RGB_CASE(aSrcFormat, aDstFormat, aPackFunc)                   \
  FORMAT_CASE_ROW(                                                             \
      aSrcFormat, aDstFormat,                                                  \
      aPackFunc<ShouldSwapRB(aSrcFormat, aDstFormat), RGBBitShift(aSrcFormat), \
                RGBByteIndex(aSrcFormat)>)

// Emits a PACK_ROW_RGB_CASE for each of the six 32-bit source formats listed
// below, all targeting aDstFormat.
#define PACK_ROW_RGB(aDstFormat, aPackFunc)                         \
  PACK_ROW_RGB_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::B8G8R8X8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::R8G8B8X8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::X8R8G8B8, aDstFormat, aPackFunc)
1014
1015 // Packing of 32-bit formats to A8.
1016 template <uint32_t aSrcAIndex>
1017 static void PackToA8(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
1018                      int32_t aDstGap, IntSize aSize) {
1019   for (int32_t height = aSize.height; height > 0; height--) {
1020     const uint8_t* end = aSrc + 4 * aSize.width;
1021     do {
1022       *aDst++ = aSrc[aSrcAIndex];
1023       aSrc += 4;
1024     } while (aSrc < end);
1025     aSrc += aSrcGap;
1026     aDst += aDstGap;
1027   }
1028 }
1029
// Case entry for a 2D switch: dispatches the format pair to aPackFunc
// instantiated with the source format's alpha byte index. Expands through
// FORMAT_CASE, so FORMAT_CASE_CALL must be defined at the point of use.
#define PACK_ALPHA_CASE(aSrcFormat, aDstFormat, aPackFunc) \
  FORMAT_CASE(aSrcFormat, aDstFormat, aPackFunc<AlphaByteIndex(aSrcFormat)>)

// Emits a PACK_ALPHA_CASE for each of the three source formats listed below,
// all targeting aDstFormat.
#define PACK_ALPHA(aDstFormat, aPackFunc)                         \
  PACK_ALPHA_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
  PACK_ALPHA_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
  PACK_ALPHA_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc)
1037
/**
 * Expands a row of 24-bit pixels into 32-bit pixels with an opaque alpha.
 *
 * Every destination pixel is written as the four bytes (first, second, third
 * color byte, 0xFF), where the first and third color bytes are exchanged when
 * aSwapRB is set. The bytes are stored individually, so the result does not
 * depend on host endianness and aDst needs no particular alignment.
 *
 * The row is processed back to front so that aSrc and aDst may be the same
 * buffer: a destination pixel never overlaps a source pixel that has not been
 * read yet. The loop counts pixels instead of walking a source pointer below
 * aSrc, so no out-of-range pointer is ever formed.
 *
 * @param aSrc    source row, 3 bytes per pixel
 * @param aDst    destination row, 4 bytes per pixel
 * @param aLength number of pixels; nothing is written when it is <= 0
 */
template <bool aSwapRB>
void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  for (int32_t remaining = aLength; remaining > 0; --remaining) {
    const uint8_t* src = aSrc + 3 * (remaining - 1);
    uint8_t* dst = aDst + 4 * (remaining - 1);

    // Read the whole source pixel before writing: when converting in place
    // the first destination pixel overlaps its own source bytes.
    const uint8_t r = src[aSwapRB ? 2 : 0];
    const uint8_t g = src[1];
    const uint8_t b = src[aSwapRB ? 0 : 2];

    dst[0] = r;
    dst[1] = g;
    dst[2] = b;
    dst[3] = 0xFF;
  }
}

// Force instantiation of swizzle variants here.
template void UnpackRowRGB24<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpackRowRGB24<true>(const uint8_t*, uint8_t*, int32_t);
1060
// Case entry for a row-function switch: for an R8G8B8 source and the given
// destination format, returns the address of the UnpackRowRGB24 instantiation
// with the matching R/B swap flag.
#define UNPACK_ROW_RGB(aDstFormat)       \
  FORMAT_CASE_ROW(                       \
      SurfaceFormat::R8G8B8, aDstFormat, \
      UnpackRowRGB24<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
1065
// Expands a row of 24-bit pixels into 32-bit pixels with a leading opaque
// alpha byte: every destination pixel is written as the four bytes
// (0xFF, first, second, third color byte).
//
// The bytes are stored individually, so the result is the same on little- and
// big-endian hosts and aDst needs no particular alignment. (Composing a
// 32-bit word per endianness is easy to get wrong: placing the first color
// byte at bit 24 on big-endian collides with the alpha byte.)
//
// The row is processed back to front so that aSrc and aDst may be the same
// buffer: a destination pixel never overlaps a source pixel that has not been
// read yet. The loop counts pixels instead of walking a source pointer below
// aSrc, so no out-of-range pointer is ever formed. Nothing is written when
// aLength is <= 0.
static void UnpackRowRGB24_To_ARGB(const uint8_t* aSrc, uint8_t* aDst,
                                   int32_t aLength) {
  for (int32_t remaining = aLength; remaining > 0; --remaining) {
    const uint8_t* src = aSrc + 3 * (remaining - 1);
    uint8_t* dst = aDst + 4 * (remaining - 1);

    // Read the whole source pixel before writing: when converting in place
    // the first destination pixel overlaps its own source bytes.
    const uint8_t r = src[0];
    const uint8_t g = src[1];
    const uint8_t b = src[2];

    dst[0] = 0xFF;
    dst[1] = r;
    dst[2] = g;
    dst[3] = b;
  }
}
1084
// Case entry for a row-function switch: for an R8G8B8 source and the given
// destination format, returns the address of UnpackRowRGB24_To_ARGB.
#define UNPACK_ROW_RGB_TO_ARGB(aDstFormat) \
  FORMAT_CASE_ROW(SurfaceFormat::R8G8B8, aDstFormat, UnpackRowRGB24_To_ARGB)
1087
/**
 * Converts a 2D block of pixels from aSrcFormat to aDstFormat.
 *
 * The (source, destination) format pair is hashed with FORMAT_KEY and looked
 * up first among the SSE2 and NEON variants (each only when compiled in and
 * reported as supported by the CPU at runtime) and then among the portable
 * implementations. A pair that matches none of those but has identical source
 * and destination formats is handled with a plain copy.
 *
 * @param aSrc       source pixels
 * @param aSrcStride stride of the source buffer
 * @param aSrcFormat format of the source pixels
 * @param aDst       destination pixels
 * @param aDstStride stride of the destination buffer
 * @param aDstFormat format of the destination pixels
 * @param aSize      dimensions of the block, in pixels
 * @return true on success, including for an empty aSize; false if either
 *         stride yields a negative gap for the given width and format, or if
 *         the format pair is not supported.
 */
bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
                 SurfaceFormat aSrcFormat, uint8_t* aDst, int32_t aDstStride,
                 SurfaceFormat aDstFormat, const IntSize& aSize) {
  if (aSize.IsEmpty()) {
    return true;
  }
  // NOTE(review): CollapseSize is defined elsewhere; presumably it merges
  // rows into fewer, longer ones when the strides allow it - confirm. The
  // gaps below are computed from the original width, not the collapsed one.
  IntSize size = CollapseSize(aSize, aSrcStride, aDstStride);
  // Find gap from end of row to the start of the next row.
  int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
  int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
  MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
  // Checked again at runtime so that the failure is also reported when the
  // assertion above is compiled out.
  if (srcGap < 0 || dstGap < 0) {
    return false;
  }

  // Every FORMAT_CASE-based entry below expands to a case label that calls
  // the named function with these arguments and then returns true.
#define FORMAT_CASE_CALL(...) __VA_ARGS__(aSrc, srcGap, aDst, dstGap, size)

#ifdef USE_SSE2
  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

#ifdef USE_NEON
  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

  // Portable implementations, reached when no SIMD case above matched.
  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::X8R8G8B8)

    SWIZZLE_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8A8)

    SWIZZLE_SWAP_RGB24(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8)
    SWIZZLE_SWAP_RGB24(SurfaceFormat::B8G8R8, SurfaceFormat::R8G8B8)

    SWIZZLE_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_OPAQUE(SurfaceFormat::A8R8G8B8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_OPAQUE(SurfaceFormat::X8R8G8B8, SurfaceFormat::A8R8G8B8)

    PACK_RGB(SurfaceFormat::R5G6B5_UINT16, PackToRGB565)
    PACK_RGB(SurfaceFormat::B8G8R8, PackToRGB24)
    PACK_RGB(SurfaceFormat::R8G8B8, PackToRGB24)
    PACK_ALPHA(SurfaceFormat::A8, PackToA8)

    default:
      break;
  }

  if (aSrcFormat == aDstFormat) {
    // If the formats match, just do a generic copy.
    SwizzleCopy(aSrc, srcGap, aDst, dstGap, size, BytesPerPixel(aSrcFormat));
    return true;
  }

#undef FORMAT_CASE_CALL

  MOZ_ASSERT(false, "Unsupported swizzle formats");
  return false;
}
1192
1193 static bool SwizzleYFlipDataInternal(const uint8_t* aSrc, int32_t aSrcStride,
1194                                      SurfaceFormat aSrcFormat, uint8_t* aDst,
1195                                      int32_t aDstStride,
1196                                      SurfaceFormat aDstFormat,
1197                                      const IntSize& aSize,
1198                                      SwizzleRowFn aSwizzleFn) {
1199   if (!aSwizzleFn) {
1200     return false;
1201   }
1202
1203   // Guarantee our width and height are both greater than zero.
1204   if (aSize.IsEmpty()) {
1205     return true;
1206   }
1207
1208   // Unlike SwizzleData/PremultiplyData, we don't use the stride gaps directly,
1209   // but we can use it to verify that the stride is valid for our width and
1210   // format.
1211   int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
1212   int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
1213   MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
1214   if (srcGap < 0 || dstGap < 0) {
1215     return false;
1216   }
1217
1218   // Swapping/swizzling to a new buffer is trivial.
1219   if (aSrc != aDst) {
1220     const uint8_t* src = aSrc;
1221     const uint8_t* srcEnd = aSrc + aSize.height * aSrcStride;
1222     uint8_t* dst = aDst + (aSize.height - 1) * aDstStride;
1223     while (src < srcEnd) {
1224       aSwizzleFn(src, dst, aSize.width);
1225       src += aSrcStride;
1226       dst -= aDstStride;
1227     }
1228     return true;
1229   }
1230
1231   if (aSrcStride != aDstStride) {
1232     return false;
1233   }
1234
1235   // If we are swizzling in place, then we need a temporary row buffer.
1236   UniquePtr<uint8_t[]> rowBuffer(new (std::nothrow) uint8_t[aDstStride]);
1237   if (!rowBuffer) {
1238     return false;
1239   }
1240
1241   // Swizzle and swap the top and bottom rows until we meet in the middle.
1242   int32_t middleRow = aSize.height / 2;
1243   uint8_t* top = aDst;
1244   uint8_t* bottom = aDst + (aSize.height - 1) * aDstStride;
1245   for (int32_t row = 0; row < middleRow; ++row) {
1246     memcpy(rowBuffer.get(), bottom, aDstStride);
1247     aSwizzleFn(top, bottom, aSize.width);
1248     aSwizzleFn(rowBuffer.get(), top, aSize.width);
1249     top += aDstStride;
1250     bottom -= aDstStride;
1251   }
1252
1253   // If there is an odd numbered row, we haven't swizzled it yet.
1254   if (aSize.height % 2 == 1) {
1255     top = aDst + middleRow * aDstStride;
1256     aSwizzleFn(top, top, aSize.width);
1257   }
1258   return true;
1259 }
1260
1261 bool SwizzleYFlipData(const uint8_t* aSrc, int32_t aSrcStride,
1262                       SurfaceFormat aSrcFormat, uint8_t* aDst,
1263                       int32_t aDstStride, SurfaceFormat aDstFormat,
1264                       const IntSize& aSize) {
1265   return SwizzleYFlipDataInternal(aSrc, aSrcStride, aSrcFormat, aDst,
1266                                   aDstStride, aDstFormat, aSize,
1267                                   SwizzleRow(aSrcFormat, aDstFormat));
1268 }
1269
1270 bool PremultiplyYFlipData(const uint8_t* aSrc, int32_t aSrcStride,
1271                           SurfaceFormat aSrcFormat, uint8_t* aDst,
1272                           int32_t aDstStride, SurfaceFormat aDstFormat,
1273                           const IntSize& aSize) {
1274   return SwizzleYFlipDataInternal(aSrc, aSrcStride, aSrcFormat, aDst,
1275                                   aDstStride, aDstFormat, aSize,
1276                                   PremultiplyRow(aSrcFormat, aDstFormat));
1277 }
1278
/**
 * Selects the function that swizzles one row of pixels when converting from
 * aSrcFormat to aDstFormat.
 *
 * Lookup order: the AVX2, SSSE3 and SSE2 tables (all compiled under
 * USE_SSE2, each consulted only if the CPU reports support at runtime), then
 * the NEON table (under USE_NEON), then the portable implementations. A pair
 * that matches none of those but has identical source and destination
 * formats gets a plain row copy when the format has 4 or 3 bytes per pixel.
 *
 * @param aSrcFormat source pixel format
 * @param aDstFormat destination pixel format
 * @return the matching row function, or nullptr (after
 *         MOZ_ASSERT_UNREACHABLE) if the format pair is not supported
 */
SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
  if (mozilla::supports_avx2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      UNPACK_ROW_RGB_AVX2(SurfaceFormat::R8G8B8X8)
      UNPACK_ROW_RGB_AVX2(SurfaceFormat::R8G8B8A8)
      UNPACK_ROW_RGB_AVX2(SurfaceFormat::B8G8R8X8)
      UNPACK_ROW_RGB_AVX2(SurfaceFormat::B8G8R8A8)
      default:
        break;
    }

  if (mozilla::supports_ssse3()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::R8G8B8X8)
      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::R8G8B8A8)
      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::B8G8R8X8)
      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::B8G8R8A8)
      default:
        break;
    }

  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

#ifdef USE_NEON
  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      UNPACK_ROW_RGB_NEON(SurfaceFormat::R8G8B8X8)
      UNPACK_ROW_RGB_NEON(SurfaceFormat::R8G8B8A8)
      UNPACK_ROW_RGB_NEON(SurfaceFormat::B8G8R8X8)
      UNPACK_ROW_RGB_NEON(SurfaceFormat::B8G8R8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

  // Portable implementations, reached when no SIMD case above matched.
  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::X8R8G8B8)

    SWIZZLE_ROW_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::A8R8G8B8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_ROW_OPAQUE(SurfaceFormat::X8R8G8B8, SurfaceFormat::A8R8G8B8)

    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_ROW_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8A8)

    SWIZZLE_ROW_SWAP_RGB24(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8)
    SWIZZLE_ROW_SWAP_RGB24(SurfaceFormat::B8G8R8, SurfaceFormat::R8G8B8)

    UNPACK_ROW_RGB(SurfaceFormat::R8G8B8X8)
    UNPACK_ROW_RGB(SurfaceFormat::R8G8B8A8)
    UNPACK_ROW_RGB(SurfaceFormat::B8G8R8X8)
    UNPACK_ROW_RGB(SurfaceFormat::B8G8R8A8)
    UNPACK_ROW_RGB_TO_ARGB(SurfaceFormat::A8R8G8B8)
    UNPACK_ROW_RGB_TO_ARGB(SurfaceFormat::X8R8G8B8)

    PACK_ROW_RGB(SurfaceFormat::R8G8B8, PackRowToRGB24)
    PACK_ROW_RGB(SurfaceFormat::B8G8R8, PackRowToRGB24)

    default:
      break;
  }

  // Identical formats only need a copy; the copy is specialized on the pixel
  // size and exists for 4- and 3-byte pixels only.
  if (aSrcFormat == aDstFormat) {
    switch (BytesPerPixel(aSrcFormat)) {
      case 4:
        return &SwizzleRowCopy<4>;
      case 3:
        return &SwizzleRowCopy<3>;
      default:
        break;
    }
  }

  MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats");
  return nullptr;
}
1397
1398 static IntRect ReorientRowRotate0FlipFallback(const uint8_t* aSrc,
1399                                               int32_t aSrcRow, uint8_t* aDst,
1400                                               const IntSize& aDstSize,
1401                                               int32_t aDstStride) {
1402   // Reverse order of pixels in the row.
1403   const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
1404   const uint32_t* end = src + aDstSize.width;
1405   uint32_t* dst = reinterpret_cast<uint32_t*>(aDst + aSrcRow * aDstStride) +
1406                   aDstSize.width - 1;
1407   do {
1408     *dst-- = *src++;
1409   } while (src < end);
1410
1411   return IntRect(0, aSrcRow, aDstSize.width, 1);
1412 }
1413
1414 static IntRect ReorientRowRotate90FlipFallback(const uint8_t* aSrc,
1415                                                int32_t aSrcRow, uint8_t* aDst,
1416                                                const IntSize& aDstSize,
1417                                                int32_t aDstStride) {
1418   // Copy row of pixels from top to bottom, into left to right columns.
1419   const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
1420   const uint32_t* end = src + aDstSize.height;
1421   uint32_t* dst = reinterpret_cast<uint32_t*>(aDst) + aSrcRow;
1422   int32_t stride = aDstStride / sizeof(uint32_t);
1423   do {
1424     *dst = *src++;
1425     dst += stride;
1426   } while (src < end);
1427
1428   return IntRect(aSrcRow, 0, 1, aDstSize.height);
1429 }
1430
1431 static IntRect ReorientRowRotate180FlipFallback(const uint8_t* aSrc,
1432                                                 int32_t aSrcRow, uint8_t* aDst,
1433                                                 const IntSize& aDstSize,
1434                                                 int32_t aDstStride) {
1435   // Copy row of pixels from top to bottom, into bottom to top rows.
1436   uint8_t* dst = aDst + (aDstSize.height - aSrcRow - 1) * aDstStride;
1437   memcpy(dst, aSrc, aDstSize.width * sizeof(uint32_t));
1438   return IntRect(0, aDstSize.height - aSrcRow - 1, aDstSize.width, 1);
1439 }
1440
1441 static IntRect ReorientRowRotate270FlipFallback(const uint8_t* aSrc,
1442                                                 int32_t aSrcRow, uint8_t* aDst,
1443                                                 const IntSize& aDstSize,
1444                                                 int32_t aDstStride) {
1445   // Copy row of pixels in reverse order from top to bottom, into right to left
1446   // columns.
1447   const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
1448   const uint32_t* end = src + aDstSize.height;
1449   uint32_t* dst =
1450       reinterpret_cast<uint32_t*>(aDst + (aDstSize.height - 1) * aDstStride) +
1451       aDstSize.width - aSrcRow - 1;
1452   int32_t stride = aDstStride / sizeof(uint32_t);
1453   do {
1454     *dst = *src++;
1455     dst -= stride;
1456   } while (src < end);
1457
1458   return IntRect(aDstSize.width - aSrcRow - 1, 0, 1, aDstSize.height);
1459 }
1460
1461 static IntRect ReorientRowRotate0Fallback(const uint8_t* aSrc, int32_t aSrcRow,
1462                                           uint8_t* aDst,
1463                                           const IntSize& aDstSize,
1464                                           int32_t aDstStride) {
1465   // Copy row of pixels into the destination.
1466   uint8_t* dst = aDst + aSrcRow * aDstStride;
1467   memcpy(dst, aSrc, aDstSize.width * sizeof(uint32_t));
1468   return IntRect(0, aSrcRow, aDstSize.width, 1);
1469 }
1470
1471 static IntRect ReorientRowRotate90Fallback(const uint8_t* aSrc, int32_t aSrcRow,
1472                                            uint8_t* aDst,
1473                                            const IntSize& aDstSize,
1474                                            int32_t aDstStride) {
1475   // Copy row of pixels from top to bottom, into right to left columns.
1476   const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
1477   const uint32_t* end = src + aDstSize.height;
1478   uint32_t* dst =
1479       reinterpret_cast<uint32_t*>(aDst) + aDstSize.width - aSrcRow - 1;
1480   int32_t stride = aDstStride / sizeof(uint32_t);
1481   do {
1482     *dst = *src++;
1483     dst += stride;
1484   } while (src < end);
1485
1486   return IntRect(aDstSize.width - aSrcRow - 1, 0, 1, aDstSize.height);
1487 }
1488
1489 static IntRect ReorientRowRotate180Fallback(const uint8_t* aSrc,
1490                                             int32_t aSrcRow, uint8_t* aDst,
1491                                             const IntSize& aDstSize,
1492                                             int32_t aDstStride) {
1493   // Copy row of pixels in reverse order from top to bottom, into bottom to top
1494   // rows.
1495   const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
1496   const uint32_t* end = src + aDstSize.width;
1497   uint32_t* dst = reinterpret_cast<uint32_t*>(
1498                       aDst + (aDstSize.height - aSrcRow - 1) * aDstStride) +
1499                   aDstSize.width - 1;
1500   do {
1501     *dst-- = *src++;
1502   } while (src < end);
1503
1504   return IntRect(0, aDstSize.height - aSrcRow - 1, aDstSize.width, 1);
1505 }
1506
1507 static IntRect ReorientRowRotate270Fallback(const uint8_t* aSrc,
1508                                             int32_t aSrcRow, uint8_t* aDst,
1509                                             const IntSize& aDstSize,
1510                                             int32_t aDstStride) {
1511   // Copy row of pixels in reverse order from top to bottom, into left to right
1512   // column.
1513   const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
1514   const uint32_t* end = src + aDstSize.height;
1515   uint32_t* dst =
1516       reinterpret_cast<uint32_t*>(aDst + (aDstSize.height - 1) * aDstStride) +
1517       aSrcRow;
1518   int32_t stride = aDstStride / sizeof(uint32_t);
1519   do {
1520     *dst = *src++;
1521     dst -= stride;
1522   } while (src < end);
1523
1524   return IntRect(aSrcRow, 0, 1, aDstSize.height);
1525 }
1526
1527 ReorientRowFn ReorientRow(const struct image::Orientation& aOrientation) {
1528   switch (aOrientation.flip) {
1529     case image::Flip::Unflipped:
1530       switch (aOrientation.rotation) {
1531         case image::Angle::D0:
1532           return &ReorientRowRotate0Fallback;
1533         case image::Angle::D90:
1534           return &ReorientRowRotate90Fallback;
1535         case image::Angle::D180:
1536           return &ReorientRowRotate180Fallback;
1537         case image::Angle::D270:
1538           return &ReorientRowRotate270Fallback;
1539         default:
1540           break;
1541       }
1542       break;
1543     case image::Flip::Horizontal:
1544       switch (aOrientation.rotation) {
1545         case image::Angle::D0:
1546           return &ReorientRowRotate0FlipFallback;
1547         case image::Angle::D90:
1548           if (aOrientation.flipFirst) {
1549             return &ReorientRowRotate270FlipFallback;
1550           } else {
1551             return &ReorientRowRotate90FlipFallback;
1552           }
1553         case image::Angle::D180:
1554           return &ReorientRowRotate180FlipFallback;
1555         case image::Angle::D270:
1556           if (aOrientation.flipFirst) {
1557             return &ReorientRowRotate90FlipFallback;
1558           } else {
1559             return &ReorientRowRotate270FlipFallback;
1560           }
1561         default:
1562           break;
1563       }
1564       break;
1565     default:
1566       break;
1567   }
1568
1569   MOZ_ASSERT_UNREACHABLE("Unhandled orientation!");
1570   return nullptr;
1571 }
1572
1573 }  // namespace gfx
1574 }  // namespace mozilla