gfx/2d/SwizzleNEON.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "Swizzle.h"
   8
   9 #include <arm_neon.h>
  10
  11 namespace mozilla {
  12 namespace gfx {
  13
  14 // Load 1-3 pixels into a 4 pixel vector.
  15 static MOZ_ALWAYS_INLINE uint16x8_t LoadRemainder_NEON(const uint8_t* aSrc,
  16                                                        size_t aLength) {
  17   const uint32_t* src32 = reinterpret_cast<const uint32_t*>(aSrc);
  18   uint32x4_t dst32;
  19   if (aLength >= 2) {
  20     // Load first 2 pixels
  21     dst32 = vcombine_u32(vld1_u32(src32), vdup_n_u32(0));
  22     // Load third pixel
  23     if (aLength >= 3) {
  24       dst32 = vld1q_lane_u32(src32 + 2, dst32, 2);
  25     }
  26   } else {
  27     // Load single pixel
  28     dst32 = vld1q_lane_u32(src32, vdupq_n_u32(0), 0);
  29   }
  30   return vreinterpretq_u16_u32(dst32);
  31 }
  32
  33 // Store 1-3 pixels from a vector into memory without overwriting.
  34 static MOZ_ALWAYS_INLINE void StoreRemainder_NEON(uint8_t* aDst, size_t aLength,
  35                                                   const uint16x8_t& aSrc) {
  36   uint32_t* dst32 = reinterpret_cast<uint32_t*>(aDst);
  37   uint32x4_t src32 = vreinterpretq_u32_u16(aSrc);
  38   if (aLength >= 2) {
  39     // Store first 2 pixels
  40     vst1_u32(dst32, vget_low_u32(src32));
  41     // Store third pixel
  42     if (aLength >= 3) {
  43       vst1q_lane_u32(dst32 + 2, src32, 2);
  44     }
  45   } else {
  46     // Store single pixel
  47     vst1q_lane_u32(dst32, src32, 0);
  48   }
  49 }
  50
  51 // Premultiply vector of 4 pixels using splayed math.
  52 template <bool aSwapRB, bool aOpaqueAlpha>
  53 static MOZ_ALWAYS_INLINE uint16x8_t
  54 PremultiplyVector_NEON(const uint16x8_t& aSrc) {
  55   // Isolate R and B with mask.
  56   const uint16x8_t mask = vdupq_n_u16(0x00FF);
  57   uint16x8_t rb = vandq_u16(aSrc, mask);
  58   // Swap R and B if necessary.
  59   if (aSwapRB) {
  60     rb = vrev32q_u16(rb);
  61   }
  62   // Isolate G and A by shifting down to bottom of word.
  63   uint16x8_t ga = vshrq_n_u16(aSrc, 8);
  64
  65   // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  66   uint16x8_t alphas = vtrnq_u16(ga, ga).val[1];
  67
  68   // rb = rb*a + 255; rb += rb >> 8;
  69   rb = vmlaq_u16(mask, rb, alphas);
  70   rb = vsraq_n_u16(rb, rb, 8);
  71
  72   // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  73   if (!aOpaqueAlpha) {
  74     ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0x00FF0000)));
  75   }
  76   // ga = ga*a + 255; ga += ga >> 8;
  77   ga = vmlaq_u16(mask, ga, alphas);
  78   ga = vsraq_n_u16(ga, ga, 8);
  79   // If format is opaque, force output A to be 255.
  80   if (aOpaqueAlpha) {
  81     ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
  82   }
  83
  84   // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  85   return vsriq_n_u16(ga, rb, 8);
  86 }
  87
  88 template <bool aSwapRB, bool aOpaqueAlpha>
  89 static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
  90                                                     uint8_t*& aDst,
  91                                                     int32_t aAlignedRow,
  92                                                     int32_t aRemainder) {
  93   // Process all 4-pixel chunks as one vector.
  94   for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
  95     uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
  96     px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
  97     vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
  98     aSrc += 4 * 4;
  99     aDst += 4 * 4;
 100   }
 101
 102   // Handle any 1-3 remaining pixels.
 103   if (aRemainder) {
 104     uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
 105     px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
 106     StoreRemainder_NEON(aDst, aRemainder, px);
 107   }
 108 }
 109
 110 template <bool aSwapRB, bool aOpaqueAlpha>
 111 void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
 112   int32_t alignedRow = 4 * (aLength & ~3);
 113   int32_t remainder = aLength & 3;
 114   PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
 115                                                remainder);
 116 }
 117
 118 template <bool aSwapRB, bool aOpaqueAlpha>
 119 void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 120                       int32_t aDstGap, IntSize aSize) {
 121   int32_t alignedRow = 4 * (aSize.width & ~3);
 122   int32_t remainder = aSize.width & 3;
 123   // Fold remainder into stride gap.
 124   aSrcGap += 4 * remainder;
 125   aDstGap += 4 * remainder;
 126
 127   for (int32_t height = aSize.height; height > 0; height--) {
 128     PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
 129                                                  remainder);
 130     aSrc += aSrcGap;
 131     aDst += aDstGap;
 132   }
 133 }
 134
// Force instantiation of premultiply variants here.
// The template bodies are defined in this file, so each variant that must
// exist as a symbol is explicitly instantiated. All four combinations of
// <aSwapRB, aOpaqueAlpha> are emitted for both the single-row entry point and
// the strided multi-row entry point.
// NOTE(review): presumably these are referenced from another translation
// unit -- confirm against the callers.
template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
                                                int32_t);
template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
                                              int32_t);
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
                                             int32_t, IntSize);
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
                                           int32_t, IntSize);
 152
// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must
// ultimately be multiplied as an unsigned 9 bit upper part and a signed
// 15 bit lower part to cheaply multiply. Thus, the lower 15 bits of the
// reciprocal are masked off and stored in the low word. The upper 9 bits
// are masked and shifted to fit into the high word. These then get
// independently multiplied with the color component and recombined to
// provide the full reciprocal multiply.
//
// Layout of one 32-bit entry for alpha x, with q = 0xFF00FF / x:
//   bits  0..14 : q & 0x7FFF             (low 15 bits of the reciprocal)
//   bit   15    : always 0
//   bits 16..24 : (q & 0xFF8000) << 1    (upper 9 bits of the reciprocal)
//
// UNPREMULQ_NEON_N(x) expands to the N consecutive entries for alphas
// x .. x + N - 1. Entry 0 is written as a literal 0 because the macro would
// otherwise divide by zero. The initializer list below adds up to
// 1 + 1 + 2 + 4 + 8 + 16 + 7 * 32 = 256 entries, one per possible alpha.
#define UNPREMULQ_NEON(x) \
  ((((0xFF00FFU / (x)) & 0xFF8000U) << 1) | ((0xFF00FFU / (x)) & 0x7FFFU))
#define UNPREMULQ_NEON_2(x) UNPREMULQ_NEON(x), UNPREMULQ_NEON((x) + 1)
#define UNPREMULQ_NEON_4(x) UNPREMULQ_NEON_2(x), UNPREMULQ_NEON_2((x) + 2)
#define UNPREMULQ_NEON_8(x) UNPREMULQ_NEON_4(x), UNPREMULQ_NEON_4((x) + 4)
#define UNPREMULQ_NEON_16(x) UNPREMULQ_NEON_8(x), UNPREMULQ_NEON_8((x) + 8)
#define UNPREMULQ_NEON_32(x) UNPREMULQ_NEON_16(x), UNPREMULQ_NEON_16((x) + 16)
static const uint32_t sUnpremultiplyTable_NEON[256] = {0,
                                                       UNPREMULQ_NEON(1),
                                                       UNPREMULQ_NEON_2(2),
                                                       UNPREMULQ_NEON_4(4),
                                                       UNPREMULQ_NEON_8(8),
                                                       UNPREMULQ_NEON_16(16),
                                                       UNPREMULQ_NEON_32(32),
                                                       UNPREMULQ_NEON_32(64),
                                                       UNPREMULQ_NEON_32(96),
                                                       UNPREMULQ_NEON_32(128),
                                                       UNPREMULQ_NEON_32(160),
                                                       UNPREMULQ_NEON_32(192),
                                                       UNPREMULQ_NEON_32(224)};
 181
 182 // Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
 183 // that avoids doing any actual division.
 184 template <bool aSwapRB>
 185 static MOZ_ALWAYS_INLINE uint16x8_t
 186 UnpremultiplyVector_NEON(const uint16x8_t& aSrc) {
 187   // Isolate R and B with mask.
 188   uint16x8_t rb = vandq_u16(aSrc, vdupq_n_u16(0x00FF));
 189   // Swap R and B if necessary.
 190   if (aSwapRB) {
 191     rb = vrev32q_u16(rb);
 192   }
 193
 194   // Isolate G and A by shifting down to bottom of word.
 195   uint16x8_t ga = vshrq_n_u16(aSrc, 8);
 196   // Extract the alphas for the 4 pixels from the now isolated words.
 197   int a1 = vgetq_lane_u16(ga, 1);
 198   int a2 = vgetq_lane_u16(ga, 3);
 199   int a3 = vgetq_lane_u16(ga, 5);
 200   int a4 = vgetq_lane_u16(ga, 7);
 201
 202   // First load all of the interleaved low and high portions of the reciprocals
 203   // and combine them a single vector as lo1 hi1 lo2 hi2 lo3 hi3 lo4 hi4
 204   uint16x8_t q1234 = vreinterpretq_u16_u32(vld1q_lane_u32(
 205       &sUnpremultiplyTable_NEON[a4],
 206       vld1q_lane_u32(
 207           &sUnpremultiplyTable_NEON[a3],
 208           vld1q_lane_u32(
 209               &sUnpremultiplyTable_NEON[a2],
 210               vld1q_lane_u32(&sUnpremultiplyTable_NEON[a1], vdupq_n_u32(0), 0),
 211               1),
 212           2),
 213       3));
 214   // Transpose the interleaved low/high portions so that we produce
 215   // two separate duplicated vectors for the low and high portions respectively:
 216   // lo1 lo1 lo2 lo2 lo3 lo3 lo4 lo4 and hi1 hi1 hi2 hi2 hi3 hi3 hi4 hi4
 217   uint16x8x2_t q1234lohi = vtrnq_u16(q1234, q1234);
 218
 219   // VQDMULH is a signed multiply that doubles (*2) the result, then takes the
 220   // high word.  To work around the signedness and the doubling, the low
 221   // portion of the reciprocal only stores the lower 15 bits, which fits in a
 222   // signed 16 bit integer. The high 9 bit portion is effectively also doubled
 223   // by 2 as a side-effect of being shifted for storage. Thus the output scale
 224   // of doing a normal multiply by the high portion and the VQDMULH by the low
 225   // portion are both doubled and can be safely added together. The resulting
 226   // sum just needs to be halved (via VHADD) to thus cancel out the doubling.
 227   // All this combines to produce a reciprocal multiply of the form:
 228   // rb = ((rb * hi) + ((rb * lo * 2) >> 16)) / 2
 229   rb = vhaddq_u16(
 230       vmulq_u16(rb, q1234lohi.val[1]),
 231       vreinterpretq_u16_s16(vqdmulhq_s16(
 232           vreinterpretq_s16_u16(rb), vreinterpretq_s16_u16(q1234lohi.val[0]))));
 233
 234   // ga = ((ga * hi) + ((ga * lo * 2) >> 16)) / 2
 235   ga = vhaddq_u16(
 236       vmulq_u16(ga, q1234lohi.val[1]),
 237       vreinterpretq_u16_s16(vqdmulhq_s16(
 238           vreinterpretq_s16_u16(ga), vreinterpretq_s16_u16(q1234lohi.val[0]))));
 239
 240   // Combine to the final pixel with ((rb | (ga << 8)) & ~0xFF000000) | (aSrc &
 241   // 0xFF000000), which inserts back in the original alpha value unchanged.
 242   return vbslq_u16(vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)), aSrc,
 243                    vsliq_n_u16(rb, ga, 8));
 244 }
 245
 246 template <bool aSwapRB>
 247 static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_NEON(const uint8_t*& aSrc,
 248                                                       uint8_t*& aDst,
 249                                                       int32_t aAlignedRow,
 250                                                       int32_t aRemainder) {
 251   // Process all 4-pixel chunks as one vector.
 252   for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
 253     uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
 254     px = UnpremultiplyVector_NEON<aSwapRB>(px);
 255     vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
 256     aSrc += 4 * 4;
 257     aDst += 4 * 4;
 258   }
 259
 260   // Handle any 1-3 remaining pixels.
 261   if (aRemainder) {
 262     uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
 263     px = UnpremultiplyVector_NEON<aSwapRB>(px);
 264     StoreRemainder_NEON(aDst, aRemainder, px);
 265   }
 266 }
 267
 268 template <bool aSwapRB>
 269 void UnpremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst,
 270                            int32_t aLength) {
 271   int32_t alignedRow = 4 * (aLength & ~3);
 272   int32_t remainder = aLength & 3;
 273   UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
 274 }
 275
 276 template <bool aSwapRB>
 277 void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 278                         int32_t aDstGap, IntSize aSize) {
 279   int32_t alignedRow = 4 * (aSize.width & ~3);
 280   int32_t remainder = aSize.width & 3;
 281   // Fold remainder into stride gap.
 282   aSrcGap += 4 * remainder;
 283   aDstGap += 4 * remainder;
 284
 285   for (int32_t height = aSize.height; height > 0; height--) {
 286     UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
 287     aSrc += aSrcGap;
 288     aDst += aDstGap;
 289   }
 290 }
 291
// Force instantiation of unpremultiply variants here.
// The template bodies are defined in this file, so each variant that must
// exist as a symbol is explicitly instantiated. Both values of aSwapRB are
// emitted for the single-row and the strided multi-row entry points.
// NOTE(review): presumably these are referenced from another translation
// unit -- confirm against the callers.
template void UnpremultiplyRow_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpremultiplyRow_NEON<true>(const uint8_t*, uint8_t*, int32_t);
template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);
 299
 300 // Swizzle a vector of 4 pixels providing swaps and opaquifying.
 301 template <bool aSwapRB, bool aOpaqueAlpha>
 302 static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
 303   // Swap R and B, then add to G and A (forced to 255):
 304   // (((src>>16) | (src << 16)) & 0x00FF00FF) |
 305   //   ((src | 0xFF000000) & ~0x00FF00FF)
 306   return vbslq_u16(
 307       vdupq_n_u16(0x00FF), vrev32q_u16(aSrc),
 308       aOpaqueAlpha
 309           ? vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)))
 310           : aSrc);
 311 }
 312
 313 #if 0
 314 // These specializations currently do not profile faster than the generic versions,
 315 // so disable them for now.
 316
 317 // Optimized implementations for when there is no R and B swap.
 318 template<>
 319 static MOZ_ALWAYS_INLINE uint16x8_t
 320 SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
 321 {
 322   // Force alpha to 255.
 323   return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
 324 }
 325
 326 template<>
 327 static MOZ_ALWAYS_INLINE uint16x8_t
 328 SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
 329 {
 330   return aSrc;
 331 }
 332 #endif
 333
 334 template <bool aSwapRB, bool aOpaqueAlpha>
 335 static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
 336                                                 uint8_t*& aDst,
 337                                                 int32_t aAlignedRow,
 338                                                 int32_t aRemainder) {
 339   // Process all 4-pixel chunks as one vector.
 340   for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
 341     uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
 342     px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
 343     vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
 344     aSrc += 4 * 4;
 345     aDst += 4 * 4;
 346   }
 347
 348   // Handle any 1-3 remaining pixels.
 349   if (aRemainder) {
 350     uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
 351     px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
 352     StoreRemainder_NEON(aDst, aRemainder, px);
 353   }
 354 }
 355
 356 template <bool aSwapRB, bool aOpaqueAlpha>
 357 void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
 358   int32_t alignedRow = 4 * (aLength & ~3);
 359   int32_t remainder = aLength & 3;
 360   SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
 361 }
 362
 363 template <bool aSwapRB, bool aOpaqueAlpha>
 364 void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
 365                   int32_t aDstGap, IntSize aSize) {
 366   int32_t alignedRow = 4 * (aSize.width & ~3);
 367   int32_t remainder = aSize.width & 3;
 368   // Fold remainder into stride gap.
 369   aSrcGap += 4 * remainder;
 370   aDstGap += 4 * remainder;
 371
 372   for (int32_t height = aSize.height; height > 0; height--) {
 373     SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
 374     aSrc += aSrcGap;
 375     aDst += aDstGap;
 376   }
 377 }
 378
// Force instantiation of swizzle variants here.
// Only the aSwapRB == true variants are instantiated (with and without
// forced-opaque alpha); no <false, *> variant of the NEON swizzle is emitted
// from this file.
// NOTE(review): presumably the no-swap cases are served by a different
// implementation -- confirm against the callers.
template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);
 386
 387 template <bool aSwapRB>
 388 void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);
 389
 390 template <bool aSwapRB>
 391 void UnpackRowRGB24_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
 392   // Because this implementation will read an additional 4 bytes of data that
 393   // is ignored and masked over, we cannot use the accelerated version for the
 394   // last 1-5 pixels (3-15 bytes remaining) to guarantee we don't access memory
 395   // outside the buffer (we read in 16 byte chunks).
 396   if (aLength < 6) {
 397     UnpackRowRGB24<aSwapRB>(aSrc, aDst, aLength);
 398     return;
 399   }
 400
 401   // Because we are expanding, we can only process the data back to front in
 402   // case we are performing this in place.
 403   int32_t alignedRow = (aLength - 2) & ~3;
 404   int32_t remainder = aLength - alignedRow;
 405
 406   const uint8_t* src = aSrc + alignedRow * 3;
 407   uint8_t* dst = aDst + alignedRow * 4;
 408
 409   // Handle 2-5 remaining pixels.
 410   UnpackRowRGB24<aSwapRB>(src, dst, remainder);
 411
 412   uint8x8_t masklo;
 413   uint8x8_t maskhi;
 414   if (aSwapRB) {
 415     static const uint8_t masklo_data[] = {2, 1, 0, 0, 5, 4, 3, 0};
 416     static const uint8_t maskhi_data[] = {4, 3, 2, 0, 7, 6, 5, 0};
 417     masklo = vld1_u8(masklo_data);
 418     maskhi = vld1_u8(maskhi_data);
 419   } else {
 420     static const uint8_t masklo_data[] = {0, 1, 2, 0, 3, 4, 5, 0};
 421     static const uint8_t maskhi_data[] = {2, 3, 4, 0, 5, 6, 7, 0};
 422     masklo = vld1_u8(masklo_data);
 423     maskhi = vld1_u8(maskhi_data);
 424   }
 425
 426   uint8x16_t alpha = vreinterpretq_u8_u32(vdupq_n_u32(0xFF000000));
 427
 428   // Process all 4-pixel chunks as one vector.
 429   src -= 4 * 3;
 430   dst -= 4 * 4;
 431   while (src >= aSrc) {
 432     uint8x16_t px = vld1q_u8(src);
 433     // G2R2B1G1 R1B0G0R0 -> X1R1G1B1 X0R0G0B0
 434     uint8x8_t pxlo = vtbl1_u8(vget_low_u8(px), masklo);
 435     // B3G3R3B2 G2R2B1G1 -> X3R3G3B3 X2R2G2B2
 436     uint8x8_t pxhi =
 437         vtbl1_u8(vext_u8(vget_low_u8(px), vget_high_u8(px), 4), maskhi);
 438     px = vcombine_u8(pxlo, pxhi);
 439     px = vorrq_u8(px, alpha);
 440     vst1q_u8(dst, px);
 441     src -= 4 * 3;
 442     dst -= 4 * 4;
 443   }
 444 }
 445
// Force instantiation of the RGB24 unpack variants here (with and without
// the R/B swap). The template body is defined in this file, so each variant
// that must exist as a symbol is explicitly instantiated.
template void UnpackRowRGB24_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpackRowRGB24_NEON<true>(const uint8_t*, uint8_t*, int32_t);
 449
 450 }  // namespace gfx
 451 }  // namespace mozilla