gfx/2d/FilterProcessingSIMD-inl.h

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "FilterProcessing.h"
   8
   9 #include "SIMD.h"
  10 #include "SVGTurbulenceRenderer-inl.h"
  11
  12 namespace mozilla {
  13 namespace gfx {
  14
  15 template <typename u8x16_t>
  16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD(
  17     SourceSurface* aSurface) {
  18   IntSize size = aSurface->GetSize();
  19   RefPtr<DataSourceSurface> output =
  20       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
  21   if (!output) {
  22     return nullptr;
  23   }
  24
  25   RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
  26   DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ);
  27   DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE);
  28   uint8_t* inputData = inputMap.GetData();
  29   uint8_t* outputData = outputMap.GetData();
  30   int32_t inputStride = inputMap.GetStride();
  31   int32_t outputStride = outputMap.GetStride();
  32   switch (input->GetFormat()) {
  33     case SurfaceFormat::B8G8R8A8:
  34       output = input;
  35       break;
  36     case SurfaceFormat::B8G8R8X8:
  37       for (int32_t y = 0; y < size.height; y++) {
  38         for (int32_t x = 0; x < size.width; x++) {
  39           int32_t inputIndex = y * inputStride + 4 * x;
  40           int32_t outputIndex = y * outputStride + 4 * x;
  41           outputData[outputIndex + 0] = inputData[inputIndex + 0];
  42           outputData[outputIndex + 1] = inputData[inputIndex + 1];
  43           outputData[outputIndex + 2] = inputData[inputIndex + 2];
  44           outputData[outputIndex + 3] = 255;
  45         }
  46       }
  47       break;
  48     case SurfaceFormat::R8G8B8A8:
  49       for (int32_t y = 0; y < size.height; y++) {
  50         for (int32_t x = 0; x < size.width; x++) {
  51           int32_t inputIndex = y * inputStride + 4 * x;
  52           int32_t outputIndex = y * outputStride + 4 * x;
  53           outputData[outputIndex + 2] = inputData[inputIndex + 0];
  54           outputData[outputIndex + 1] = inputData[inputIndex + 1];
  55           outputData[outputIndex + 0] = inputData[inputIndex + 2];
  56           outputData[outputIndex + 3] = inputData[inputIndex + 3];
  57         }
  58       }
  59       break;
  60     case SurfaceFormat::R8G8B8X8:
  61       for (int32_t y = 0; y < size.height; y++) {
  62         for (int32_t x = 0; x < size.width; x++) {
  63           int32_t inputIndex = y * inputStride + 4 * x;
  64           int32_t outputIndex = y * outputStride + 4 * x;
  65           outputData[outputIndex + 2] = inputData[inputIndex + 0];
  66           outputData[outputIndex + 1] = inputData[inputIndex + 1];
  67           outputData[outputIndex + 0] = inputData[inputIndex + 2];
  68           outputData[outputIndex + 3] = 255;
  69         }
  70       }
  71       break;
  72     case SurfaceFormat::A8:
  73       for (int32_t y = 0; y < size.height; y++) {
  74         for (int32_t x = 0; x < size.width; x += 16) {
  75           int32_t inputIndex = y * inputStride + x;
  76           int32_t outputIndex = y * outputStride + 4 * x;
  77           u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
  78           // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
  79           // interleaving with 0000000000000000 twice.
  80           u8x16_t zero = simd::FromZero8<u8x16_t>();
  81           u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
  82           u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
  83           u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
  84           u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
  85           u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
  86           u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
  87           simd::Store8(&outputData[outputIndex], p1To4);
  88           if ((x + 4) * 4 < outputStride) {
  89             simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
  90           }
  91           if ((x + 8) * 4 < outputStride) {
  92             simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
  93           }
  94           if ((x + 12) * 4 < outputStride) {
  95             simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
  96           }
  97         }
  98       }
  99       break;
 100     default:
 101       output = nullptr;
 102       break;
 103   }
 104   return output.forget();
 105 }
 106
 107 template <typename u8x16_t>
 108 inline void ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData,
 109                               int32_t sourceStride, uint8_t* alphaData,
 110                               int32_t alphaStride) {
 111   for (int32_t y = 0; y < size.height; y++) {
 112     for (int32_t x = 0; x < size.width; x += 16) {
 113       // Process 16 pixels at a time.
 114       // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of
 115       // AAAAAAAAAAAAAAAA.
 116       int32_t sourceIndex = y * sourceStride + 4 * x;
 117       int32_t targetIndex = y * alphaStride + x;
 118
 119       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
 120       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
 121       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
 122
 123       u8x16_t bgrabgrabgrabgra1 =
 124           simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
 125       if (4 * (x + 4) < sourceStride) {
 126         bgrabgrabgrabgra2 =
 127             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
 128       }
 129       if (4 * (x + 8) < sourceStride) {
 130         bgrabgrabgrabgra3 =
 131             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
 132       }
 133       if (4 * (x + 12) < sourceStride) {
 134         bgrabgrabgrabgra4 =
 135             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
 136       }
 137
 138       u8x16_t bbggrraabbggrraa1 =
 139           simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
 140       u8x16_t bbggrraabbggrraa2 =
 141           simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
 142       u8x16_t bbggrraabbggrraa3 =
 143           simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
 144       u8x16_t bbggrraabbggrraa4 =
 145           simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
 146       u8x16_t bbbbggggrrrraaaa1 =
 147           simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
 148       u8x16_t bbbbggggrrrraaaa2 =
 149           simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
 150       u8x16_t bbbbggggrrrraaaa3 =
 151           simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
 152       u8x16_t bbbbggggrrrraaaa4 =
 153           simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
 154       u8x16_t rrrrrrrraaaaaaaa1 =
 155           simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
 156       u8x16_t rrrrrrrraaaaaaaa2 =
 157           simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
 158       u8x16_t aaaaaaaaaaaaaaaa =
 159           simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
 160
 161       simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
 162     }
 163   }
 164 }
 165
 166 // This function calculates the result color values for four pixels, but for
 167 // only two color channels - either b & r or g & a. However, the a result will
 168 // not be used.
 169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
 170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
 171 // alpha of all four pixels (and both aaaa's are the same).
 172 // blendendComponent1 and blendedComponent2 are the out parameters.
 173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
 174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
 175                                            i16x8_t dest,
 176                                            const i16x8_t& destAlpha,
 177                                            i32x4_t& blendedComponent1,
 178                                            i32x4_t& blendedComponent2) {
 179   i16x8_t x255 = simd::FromI16<i16x8_t>(255);
 180
 181   switch (aBlendMode) {
 182     case BLEND_MODE_MULTIPLY: {
 183       // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) *
 184       // dest);
 185       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
 186       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
 187       i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource =
 188           simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
 189
 190       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
 191       i16x8_t leftFactor1 = simd::InterleaveLo16(
 192           twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
 193       blendedComponent1 =
 194           simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
 195       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
 196
 197       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
 198       i16x8_t leftFactor2 = simd::InterleaveHi16(
 199           twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
 200       blendedComponent2 =
 201           simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
 202       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
 203
 204       break;
 205     }
 206
 207     case BLEND_MODE_SCREEN: {
 208       // val = 255 * (source + dest) + (0 - dest) * source;
 209       i16x8_t sourcePlusDest = simd::Add16(source, dest);
 210       i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
 211
 212       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 =
 213           simd::InterleaveLo16(x255, zeroMinusDest);
 214       i16x8_t sourcePlusDestInterleavedWithSource1 =
 215           simd::InterleaveLo16(sourcePlusDest, source);
 216       blendedComponent1 =
 217           simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1,
 218                                    sourcePlusDestInterleavedWithSource1);
 219       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
 220
 221       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 =
 222           simd::InterleaveHi16(x255, zeroMinusDest);
 223       i16x8_t sourcePlusDestInterleavedWithSource2 =
 224           simd::InterleaveHi16(sourcePlusDest, source);
 225       blendedComponent2 =
 226           simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2,
 227                                    sourcePlusDestInterleavedWithSource2);
 228       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
 229
 230       break;
 231     }
 232
 233     case BLEND_MODE_DARKEN:
 234     case BLEND_MODE_LIGHTEN: {
 235       // Darken:
 236       // val = min((255 - destAlpha) * source + 255                 * dest,
 237       //           255               * source + (255 - sourceAlpha) * dest);
 238       //
 239       // Lighten:
 240       // val = max((255 - destAlpha) * source + 255                 * dest,
 241       //           255               * source + (255 - sourceAlpha) * dest);
 242
 243       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
 244       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
 245
 246       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 =
 247           simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
 248       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 =
 249           simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
 250       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
 251       i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(
 252           twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1,
 253           sourceInterleavedWithDest1);
 254       i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(
 255           twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1,
 256           sourceInterleavedWithDest1);
 257       blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN
 258                               ? simd::Min32(product1_1, product1_2)
 259                               : simd::Max32(product1_1, product1_2);
 260       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
 261
 262       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 =
 263           simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
 264       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 =
 265           simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
 266       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
 267       i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(
 268           twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2,
 269           sourceInterleavedWithDest2);
 270       i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(
 271           twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2,
 272           sourceInterleavedWithDest2);
 273       blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN
 274                               ? simd::Min32(product2_1, product2_2)
 275                               : simd::Max32(product2_1, product2_2);
 276       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
 277
 278       break;
 279     }
 280   }
 281 }
 282
 283 // The alpha channel is subject to a different calculation than the RGB
 284 // channels, and this calculation is the same for all blend modes:
 285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
 286 template <typename i16x8_t, typename i32x4_t>
 287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,
 288                                       i16x8_t d_rrrraaaa1234) {
 289   // clang-format off
 290   // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
 291   // appropriately. The calculation is rewritten as follows:
 292   // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
 293   //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
 294   //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
 295   //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
 296   // clang-format on
 297   i16x8_t zeroInterleavedWithSourceAlpha =
 298       simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
 299   i16x8_t fiveTenInterleavedWithDestAlpha =
 300       simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
 301   i16x8_t f1 =
 302       simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
 303   i16x8_t f2 =
 304       simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
 305   return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
 306 }
 307
 308 template <typename u8x16_t, typename i16x8_t>
 309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
 310                                        i16x8_t& bbbbgggg1234,
 311                                        i16x8_t& rrrraaaa1234) {
 312   // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
 313   i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
 314   i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
 315   i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
 316   i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
 317   bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
 318   rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
 319 }
 320
 321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
 322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
 323                                         i32x4_t rrrr1234,
 324                                         const i32x4_t& aaaa1234) {
 325   // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
 326   i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
 327   i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
 328   i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
 329   i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
 330   i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
 331   i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
 332   return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
 333 }
 334
 335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
 336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1,
 337                                const DataSourceSurface::ScopedMap& aInputMap2,
 338                                const DataSourceSurface::ScopedMap& aOutputMap,
 339                                const IntSize& aSize) {
 340   uint8_t* source1Data = aInputMap1.GetData();
 341   uint8_t* source2Data = aInputMap2.GetData();
 342   uint8_t* targetData = aOutputMap.GetData();
 343   int32_t targetStride = aOutputMap.GetStride();
 344   int32_t source1Stride = aInputMap1.GetStride();
 345   int32_t source2Stride = aInputMap2.GetStride();
 346
 347   for (int32_t y = 0; y < aSize.height; y++) {
 348     for (int32_t x = 0; x < aSize.width; x += 4) {
 349       int32_t targetIndex = y * targetStride + 4 * x;
 350       int32_t source1Index = y * source1Stride + 4 * x;
 351       int32_t source2Index = y * source2Stride + 4 * x;
 352
 353       u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
 354       u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
 355
 356       // The blending calculation for the RGB channels all need access to the
 357       // alpha channel of their pixel, and the alpha calculation is different,
 358       // so it makes sense to separate by channel.
 359
 360       i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
 361       i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
 362       UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
 363       UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
 364       i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234);
 365       i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234);
 366
 367       // We only use blendedB, blendedG and blendedR.
 368       i32x4_t blendedB, blendedG, blendedR, blendedA;
 369       BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
 370           s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234,
 371           blendedB, blendedG);
 372       BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
 373           s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234,
 374           blendedR, blendedA);
 375
 376       // Throw away blendedA and overwrite it with the correct blended alpha.
 377       blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234,
 378                                                           d_rrrraaaa1234);
 379
 380       u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>(
 381           blendedB, blendedG, blendedR, blendedA);
 382       simd::Store8(&targetData[targetIndex], result1234);
 383     }
 384   }
 385 }
 386
 387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
 388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
 389     DataSourceSurface* aInput1, DataSourceSurface* aInput2) {
 390   IntSize size = aInput1->GetSize();
 391   RefPtr<DataSourceSurface> target =
 392       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
 393   if (!target) {
 394     return nullptr;
 395   }
 396
 397   DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
 398   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
 399   if (aInput1->Equals(aInput2)) {
 400     ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1,
 401                                                         outputMap, size);
 402   } else {
 403     DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
 404     ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2,
 405                                                         outputMap, size);
 406   }
 407
 408   return target.forget();
 409 }
 410
 411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
 412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
 413     DataSourceSurface* aInput1, DataSourceSurface* aInput2,
 414     BlendMode aBlendMode) {
 415   switch (aBlendMode) {
 416     case BLEND_MODE_MULTIPLY:
 417       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>(
 418           aInput1, aInput2);
 419     case BLEND_MODE_SCREEN:
 420       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>(
 421           aInput1, aInput2);
 422     case BLEND_MODE_DARKEN:
 423       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>(
 424           aInput1, aInput2);
 425     case BLEND_MODE_LIGHTEN:
 426       return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>(
 427           aInput1, aInput2);
 428     default:
 429       return nullptr;
 430   }
 431 }
 432
 433 template <MorphologyOperator Operator, typename u8x16_t>
 434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) {
 435   return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b)
 436                                                : simd::Max8(a, b);
 437 }
 438
 439 // Set every pixel to the per-component minimum or maximum of the pixels around
 440 // it that are up to aRadius pixels away from it (horizontally).
 441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
 442 inline void ApplyMorphologyHorizontal_SIMD(
 443     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
 444     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
 445   static_assert(
 446       op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
 447       "unexpected morphology operator");
 448
 449   int32_t kernelSize = aRadius + 1 + aRadius;
 450   MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
 451   MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
 452   int32_t completeKernelSizeForFourPixels = kernelSize + 3;
 453   MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
 454              completeKernelSizeForFourPixels % 4 == 2);
 455
 456   // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
 457   // the way we need them to be.
 458
 459   IntRect sourceRect = aDestRect;
 460   sourceRect.Inflate(aRadius, 0);
 461
 462   for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) {
 463     int32_t kernelStartX = aDestRect.X() - aRadius;
 464     for (int32_t x = aDestRect.X(); x < aDestRect.XMost();
 465          x += 4, kernelStartX += 4) {
 466       // We process four pixels (16 color values) at a time.
 467       // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
 468       // source values can be read beyond that because the source is extended
 469       // by aRadius pixels.
 470
 471       int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
 472       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
 473       u8x16_t m1234 = p1234;
 474
 475       for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
 476         u8x16_t p5678 =
 477             (kernelStartX + i < sourceRect.XMost())
 478                 ? simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i])
 479                 : simd::FromZero8<u8x16_t>();
 480         u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
 481         u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
 482         m1234 = Morph8<op, u8x16_t>(m1234, p2345);
 483         m1234 = Morph8<op, u8x16_t>(m1234, p3456);
 484         if (i + 2 < completeKernelSizeForFourPixels) {
 485           u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
 486           m1234 = Morph8<op, u8x16_t>(m1234, p4567);
 487           m1234 = Morph8<op, u8x16_t>(m1234, p5678);
 488         }
 489         p1234 = p5678;
 490       }
 491
 492       int32_t destIndex = y * aDestStride + 4 * x;
 493       simd::Store8(&aDestData[destIndex], m1234);
 494     }
 495   }
 496 }
 497
 498 template <typename i16x8_t, typename u8x16_t>
 499 inline void ApplyMorphologyHorizontal_SIMD(
 500     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
 501     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
 502     MorphologyOperator aOp) {
 503   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
 504     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
 505         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
 506   } else {
 507     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t,
 508                                    u8x16_t>(
 509         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
 510   }
 511 }
 512
 513 // Set every pixel to the per-component minimum or maximum of the pixels around
 514 // it that are up to aRadius pixels away from it (vertically).
 515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
 516 static void ApplyMorphologyVertical_SIMD(
 517     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
 518     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
 519   static_assert(
 520       op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
 521       "unexpected morphology operator");
 522
 523   int32_t startY = aDestRect.Y() - aRadius;
 524   int32_t endY = aDestRect.Y() + aRadius;
 525   for (int32_t y = aDestRect.Y(); y < aDestRect.YMost();
 526        y++, startY++, endY++) {
 527     for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) {
 528       int32_t sourceIndex = startY * aSourceStride + 4 * x;
 529       u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
 530       sourceIndex += aSourceStride;
 531       for (int32_t iy = startY + 1; iy <= endY;
 532            iy++, sourceIndex += aSourceStride) {
 533         u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
 534         u = Morph8<op, u8x16_t>(u, u2);
 535       }
 536
 537       int32_t destIndex = y * aDestStride + 4 * x;
 538       simd::Store8(&aDestData[destIndex], u);
 539     }
 540   }
 541 }
 542
 543 template <typename i16x8_t, typename u8x16_t>
 544 inline void ApplyMorphologyVertical_SIMD(
 545     uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
 546     int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
 547     MorphologyOperator aOp) {
 548   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
 549     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
 550         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
 551   } else {
 552     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>(
 553         aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
 554   }
 555 }
 556
 557 template <typename i32x4_t, typename i16x8_t>
 558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra,
 559                                    const i32x4_t& bias) {
 560   // int16_t p[8] == { b, g, r, a, b, g, r, a }.
 561   // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
 562   // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
 563   // int32_t bias[4] == { _B, _G, _R, _A }.
 564
 565   i32x4_t sum = bias;
 566
 567   // int16_t bg[8] = { b, g, b, g, b, g, b, g };
 568   i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p));
 569   // int32_t prodsum_bg[4] =
 570   //   { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
 571   i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
 572   sum = simd::Add32(sum, prodsum_bg);
 573
 574   // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
 575   i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p));
 576   // int32_t prodsum_ra[4] =
 577   //   { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
 578   i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
 579   sum = simd::Add32(sum, prodsum_ra);
 580
 581   // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
 582   return sum;
 583 }
 584
 585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
 586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD(
 587     DataSourceSurface* aInput, const Matrix5x4& aMatrix) {
 588   IntSize size = aInput->GetSize();
 589   RefPtr<DataSourceSurface> target =
 590       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
 591   if (!target) {
 592     return nullptr;
 593   }
 594
 595   DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ);
 596   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
 597
 598   uint8_t* sourceData = inputMap.GetData();
 599   uint8_t* targetData = outputMap.GetData();
 600   int32_t sourceStride = inputMap.GetStride();
 601   int32_t targetStride = outputMap.GetStride();
 602
 603   const int16_t factor = 128;
 604   const Float floatElementMax = INT16_MAX / factor;  // 255
 605   MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX,
 606              "badly chosen float-to-int scale");
 607
 608   const Float* floats = &aMatrix._11;
 609
 610   ptrdiff_t componentOffsets[4] = {
 611       B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G,
 612       B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A};
 613
 614   // We store the color matrix in rows_bgra in the following format:
 615   // { bB, bG, bR, bA, gB, gG, gR, gA }.
 616   // { bB, gB, bG, gG, bR, gR, bA, gA }
 617   // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
 618   // which works especially well for our use case.
 619   int16_t rows_bgra[2][8];
 620   for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
 621     for (size_t colIndex = 0; colIndex < 4; colIndex++) {
 622       const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
 623       Float clampedFloatMatrixElement = std::min(
 624           std::max(floatMatrixElement, -floatElementMax), floatElementMax);
 625       int16_t scaledIntMatrixElement =
 626           int16_t(clampedFloatMatrixElement * factor + 0.5);
 627       int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
 628       int8_t g_or_a = componentOffsets[rowIndex] % 2;
 629       int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
 630       rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] =
 631           scaledIntMatrixElement;
 632     }
 633   }
 634
 635   int32_t rowBias[4];
 636   Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
 637   for (size_t colIndex = 0; colIndex < 4; colIndex++) {
 638     size_t rowIndex = 4;
 639     const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
 640     Float clampedFloatMatrixElement =
 641         std::min(std::max(floatMatrixElement, -biasMax), biasMax);
 642     int32_t scaledIntMatrixElement =
 643         int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
 644     rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
 645   }
 646
 647   i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
 648       rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
 649       rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
 650
 651   i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
 652       rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
 653       rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
 654
 655   i32x4_t rowsBias_v =
 656       simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
 657
 658   for (int32_t y = 0; y < size.height; y++) {
 659     for (int32_t x = 0; x < size.width; x += 4) {
 660       MOZ_ASSERT(sourceStride >= 4 * (x + 4),
 661                  "need to be able to read 4 pixels at this position");
 662       MOZ_ASSERT(targetStride >= 4 * (x + 4),
 663                  "need to be able to write 4 pixels at this position");
 664       int32_t sourceIndex = y * sourceStride + 4 * x;
 665       int32_t targetIndex = y * targetStride + 4 * x;
 666
 667       // We load 4 pixels, unpack them, process them 1 pixel at a time, and
 668       // finally pack and store the 4 result pixels.
 669
 670       u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
 671
 672       // Splat needed to get each pixel twice into i16x8
 673       i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
 674       i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
 675       i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
 676       i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
 677
 678       i32x4_t result_p1 =
 679           ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
 680       i32x4_t result_p2 =
 681           ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
 682       i32x4_t result_p3 =
 683           ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
 684       i32x4_t result_p4 =
 685           ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
 686
 687       static_assert(factor == 1 << 7,
 688                     "Please adapt the calculation in the lines below for a "
 689                     "different factor.");
 690       u8x16_t result_p1234 = simd::PackAndSaturate32To8(
 691           simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2),
 692           simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
 693       simd::Store8(&targetData[targetIndex], result_p1234);
 694     }
 695   }
 696
 697   return target.forget();
 698 }
 699
 700 // source / dest: bgra bgra
 701 // sourceAlpha / destAlpha: aaaa aaaa
 702 // result: bgra bgra
 703 template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
 704 static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
 705                                          u16x8_t dest,
 706                                          const u16x8_t& destAlpha) {
 707   u16x8_t x255 = simd::FromU16<u16x8_t>(255);
 708
 709   switch (aCompositeOperator) {
 710     case COMPOSITE_OPERATOR_OVER: {
 711       // val = dest * (255 - sourceAlpha) + source * 255;
 712       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
 713
 714       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
 715       u16x8_t rightFactor1 =
 716           simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
 717       i32x4_t result1 =
 718           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
 719
 720       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
 721       u16x8_t rightFactor2 =
 722           simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
 723       i32x4_t result2 =
 724           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
 725
 726       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
 727                                           simd::FastDivideBy255(result2));
 728     }
 729
 730     case COMPOSITE_OPERATOR_IN: {
 731       // val = source * destAlpha;
 732       return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
 733     }
 734
 735     case COMPOSITE_OPERATOR_OUT: {
 736       // val = source * (255 - destAlpha);
 737       u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
 738       return simd::FastDivideBy255_16(prod);
 739     }
 740
 741     case COMPOSITE_OPERATOR_ATOP: {
 742       // val = dest * (255 - sourceAlpha) + source * destAlpha;
 743       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
 744
 745       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
 746       u16x8_t rightFactor1 =
 747           simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
 748       i32x4_t result1 =
 749           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
 750
 751       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
 752       u16x8_t rightFactor2 =
 753           simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
 754       i32x4_t result2 =
 755           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
 756
 757       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
 758                                           simd::FastDivideBy255(result2));
 759     }
 760
 761     case COMPOSITE_OPERATOR_XOR: {
 762       // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
 763       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
 764       u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
 765
 766       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
 767       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
 768                                                   twoFiftyFiveMinusDestAlpha);
 769       i32x4_t result1 =
 770           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
 771
 772       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
 773       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
 774                                                   twoFiftyFiveMinusDestAlpha);
 775       i32x4_t result2 =
 776           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
 777
 778       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
 779                                           simd::FastDivideBy255(result2));
 780     }
 781
 782     case COMPOSITE_OPERATOR_LIGHTER: {
 783       // val = dest * sourceAlpha + source * destAlpha;
 784       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
 785       u16x8_t rightFactor1 = simd::InterleaveLo16(sourceAlpha, destAlpha);
 786       i32x4_t result1 =
 787           simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
 788
 789       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
 790       u16x8_t rightFactor2 = simd::InterleaveHi16(sourceAlpha, destAlpha);
 791       i32x4_t result2 =
 792           simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
 793
 794       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
 795                                           simd::FastDivideBy255(result2));
 796     }
 797
 798     default:
 799       return simd::FromU16<u16x8_t>(0);
 800   }
 801 }
 802
 803 template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
 804 static void ApplyComposition(DataSourceSurface* aSource,
 805                              DataSourceSurface* aDest) {
 806   IntSize size = aDest->GetSize();
 807
 808   DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
 809   DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);
 810
 811   uint8_t* sourceData = input.GetData();
 812   uint8_t* destData = output.GetData();
 813   uint32_t sourceStride = input.GetStride();
 814   uint32_t destStride = output.GetStride();
 815
 816   for (int32_t y = 0; y < size.height; y++) {
 817     for (int32_t x = 0; x < size.width; x += 4) {
 818       uint32_t sourceIndex = y * sourceStride + 4 * x;
 819       uint32_t destIndex = y * destStride + 4 * x;
 820
 821       u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
 822       u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
 823
 824       u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
 825       u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
 826       u16x8_t sa12 = simd::Splat16<3, 3>(s12);
 827       u16x8_t da12 = simd::Splat16<3, 3>(d12);
 828       u16x8_t result12 =
 829           CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);
 830
 831       u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
 832       u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
 833       u16x8_t sa34 = simd::Splat16<3, 3>(s34);
 834       u16x8_t da34 = simd::Splat16<3, 3>(d34);
 835       u16x8_t result34 =
 836           CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);
 837
 838       u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
 839       simd::Store8(&destData[destIndex], result1234);
 840     }
 841   }
 842 }
 843
 844 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
 845 static void ApplyComposition_SIMD(DataSourceSurface* aSource,
 846                                   DataSourceSurface* aDest,
 847                                   CompositeOperator aOperator) {
 848   switch (aOperator) {
 849     case COMPOSITE_OPERATOR_OVER:
 850       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
 851           aSource, aDest);
 852       break;
 853     case COMPOSITE_OPERATOR_IN:
 854       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
 855           aSource, aDest);
 856       break;
 857     case COMPOSITE_OPERATOR_OUT:
 858       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
 859           aSource, aDest);
 860       break;
 861     case COMPOSITE_OPERATOR_ATOP:
 862       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
 863           aSource, aDest);
 864       break;
 865     case COMPOSITE_OPERATOR_XOR:
 866       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
 867           aSource, aDest);
 868       break;
 869     case COMPOSITE_OPERATOR_LIGHTER:
 870       ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_LIGHTER>(
 871           aSource, aDest);
 872       break;
 873     default:
 874       MOZ_CRASH("GFX: Incomplete switch");
 875   }
 876 }
 877
 878 template <typename u8x16_t>
 879 static void SeparateColorChannels_SIMD(
 880     const IntSize& size, uint8_t* sourceData, int32_t sourceStride,
 881     uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
 882     uint8_t* channel3Data, int32_t channelStride) {
 883   for (int32_t y = 0; y < size.height; y++) {
 884     for (int32_t x = 0; x < size.width; x += 16) {
 885       // Process 16 pixels at a time.
 886       int32_t sourceIndex = y * sourceStride + 4 * x;
 887       int32_t targetIndex = y * channelStride + x;
 888
 889       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
 890       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
 891       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
 892
 893       u8x16_t bgrabgrabgrabgra1 =
 894           simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
 895       if (4 * (x + 4) < sourceStride) {
 896         bgrabgrabgrabgra2 =
 897             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
 898       }
 899       if (4 * (x + 8) < sourceStride) {
 900         bgrabgrabgrabgra3 =
 901             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
 902       }
 903       if (4 * (x + 12) < sourceStride) {
 904         bgrabgrabgrabgra4 =
 905             simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
 906       }
 907
 908       u8x16_t bbggrraabbggrraa1 =
 909           simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
 910       u8x16_t bbggrraabbggrraa2 =
 911           simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
 912       u8x16_t bbggrraabbggrraa3 =
 913           simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
 914       u8x16_t bbggrraabbggrraa4 =
 915           simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
 916       u8x16_t bbbbggggrrrraaaa1 =
 917           simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
 918       u8x16_t bbbbggggrrrraaaa2 =
 919           simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
 920       u8x16_t bbbbggggrrrraaaa3 =
 921           simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
 922       u8x16_t bbbbggggrrrraaaa4 =
 923           simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
 924       u8x16_t bbbbbbbbgggggggg1 =
 925           simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
 926       u8x16_t rrrrrrrraaaaaaaa1 =
 927           simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
 928       u8x16_t bbbbbbbbgggggggg2 =
 929           simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
 930       u8x16_t rrrrrrrraaaaaaaa2 =
 931           simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
 932       u8x16_t bbbbbbbbbbbbbbbb =
 933           simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
 934       u8x16_t gggggggggggggggg =
 935           simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
 936       u8x16_t rrrrrrrrrrrrrrrr =
 937           simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
 938       u8x16_t aaaaaaaaaaaaaaaa =
 939           simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
 940
 941       simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
 942       simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
 943       simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
 944       simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
 945     }
 946   }
 947 }
 948
 949 template <typename u8x16_t>
 950 static void CombineColorChannels_SIMD(
 951     const IntSize& size, int32_t resultStride, uint8_t* resultData,
 952     int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data,
 953     uint8_t* channel2Data, uint8_t* channel3Data) {
 954   for (int32_t y = 0; y < size.height; y++) {
 955     for (int32_t x = 0; x < size.width; x += 16) {
 956       // Process 16 pixels at a time.
 957       int32_t resultIndex = y * resultStride + 4 * x;
 958       int32_t channelIndex = y * channelStride + x;
 959
 960       u8x16_t bbbbbbbbbbbbbbbb =
 961           simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
 962       u8x16_t gggggggggggggggg =
 963           simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
 964       u8x16_t rrrrrrrrrrrrrrrr =
 965           simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
 966       u8x16_t aaaaaaaaaaaaaaaa =
 967           simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
 968
 969       u8x16_t brbrbrbrbrbrbrbr1 =
 970           simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
 971       u8x16_t brbrbrbrbrbrbrbr2 =
 972           simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
 973       u8x16_t gagagagagagagaga1 =
 974           simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
 975       u8x16_t gagagagagagagaga2 =
 976           simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
 977
 978       u8x16_t bgrabgrabgrabgra1 =
 979           simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
 980       u8x16_t bgrabgrabgrabgra2 =
 981           simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
 982       u8x16_t bgrabgrabgrabgra3 =
 983           simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
 984       u8x16_t bgrabgrabgrabgra4 =
 985           simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
 986
 987       simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
 988       if (4 * (x + 4) < resultStride) {
 989         simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
 990       }
 991       if (4 * (x + 8) < resultStride) {
 992         simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
 993       }
 994       if (4 * (x + 12) < resultStride) {
 995         simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
 996       }
 997     }
 998   }
 999 }
1000
1001 template <typename i32x4_t, typename u16x8_t, typename u8x16_t>
1002 static void DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
1003                                                 uint8_t* aTargetData,
1004                                                 int32_t aTargetStride,
1005                                                 uint8_t* aSourceData,
1006                                                 int32_t aSourceStride) {
1007   const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff,
1008                                                  0, 0, 0, 0xff, 0, 0, 0, 0xff);
1009   for (int32_t y = 0; y < aSize.height; y++) {
1010     for (int32_t x = 0; x < aSize.width; x += 4) {
1011       int32_t inputIndex = y * aSourceStride + 4 * x;
1012       int32_t targetIndex = y * aTargetStride + 4 * x;
1013
1014       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1015       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1016       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1017
1018       // Multiply all components with alpha.
1019       p12 = simd::Mul16(p12, simd::Splat16<3, 3>(p12));
1020       p34 = simd::Mul16(p34, simd::Splat16<3, 3>(p34));
1021
1022       // Divide by 255 and pack.
1023       u8x16_t result = simd::PackAndSaturate16To8(
1024           simd::FastDivideBy255_16(p12), simd::FastDivideBy255_16(p34));
1025
1026       // Get the original alpha channel value back from p1234.
1027       result = simd::Pick(alphaMask, result, p1234);
1028
1029       simd::Store8(&aTargetData[targetIndex], result);
1030     }
1031   }
1032 }
1033
// Precomputed factors for unpremultiplying.
//
// The goal is to compute round(r / (alpha / 255.0f)) for arbitrary r and
// alpha in constant time. With this table,
//   (r * sAlphaFactors[alpha] + 128) >> 8
// roughly gives that result (with a maximum deviation of 1).
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
//   sAlphaFactors[0]     == 0
//
// Generated with the python expression
//   ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0)
//             for alpha in range(256))
//
// Laid out eight entries per row; the trailing comment gives the alpha range
// covered by that row.
static const uint16_t sAlphaFactors[256] = {
    0,    65280, 32640, 21760, 16320, 13056, 10880, 9326,  // 0-7
    8160, 7253,  6528,  5935,  5440,  5022,  4663,  4352,  // 8-15
    4080, 3840,  3627,  3436,  3264,  3109,  2967,  2838,  // 16-23
    2720, 2611,  2511,  2418,  2331,  2251,  2176,  2106,  // 24-31
    2040, 1978,  1920,  1865,  1813,  1764,  1718,  1674,  // 32-39
    1632, 1592,  1554,  1518,  1484,  1451,  1419,  1389,  // 40-47
    1360, 1332,  1306,  1280,  1255,  1232,  1209,  1187,  // 48-55
    1166, 1145,  1126,  1106,  1088,  1070,  1053,  1036,  // 56-63
    1020, 1004,  989,   974,   960,   946,   933,   919,   // 64-71
    907,  894,   882,   870,   859,   848,   837,   826,   // 72-79
    816,  806,   796,   787,   777,   768,   759,   750,   // 80-87
    742,  733,   725,   717,   710,   702,   694,   687,   // 88-95
    680,  673,   666,   659,   653,   646,   640,   634,   // 96-103
    628,  622,   616,   610,   604,   599,   593,   588,   // 104-111
    583,  578,   573,   568,   563,   558,   553,   549,   // 112-119
    544,  540,   535,   531,   526,   522,   518,   514,   // 120-127
    510,  506,   502,   498,   495,   491,   487,   484,   // 128-135
    480,  476,   473,   470,   466,   463,   460,   457,   // 136-143
    453,  450,   447,   444,   441,   438,   435,   432,   // 144-151
    429,  427,   424,   421,   418,   416,   413,   411,   // 152-159
    408,  405,   403,   400,   398,   396,   393,   391,   // 160-167
    389,  386,   384,   382,   380,   377,   375,   373,   // 168-175
    371,  369,   367,   365,   363,   361,   359,   357,   // 176-183
    355,  353,   351,   349,   347,   345,   344,   342,   // 184-191
    340,  338,   336,   335,   333,   331,   330,   328,   // 192-199
    326,  325,   323,   322,   320,   318,   317,   315,   // 200-207
    314,  312,   311,   309,   308,   306,   305,   304,   // 208-215
    302,  301,   299,   298,   297,   295,   294,   293,   // 216-223
    291,  290,   289,   288,   286,   285,   284,   283,   // 224-231
    281,  280,   279,   278,   277,   275,   274,   273,   // 232-239
    272,  271,   270,   269,   268,   266,   265,   264,   // 240-247
    263,  262,   261,   260,   259,   258,   257,   256};  // 248-255
1070
1071 template <typename u16x8_t, typename u8x16_t>
1072 static void DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
1073                                                   uint8_t* aTargetData,
1074                                                   int32_t aTargetStride,
1075                                                   uint8_t* aSourceData,
1076                                                   int32_t aSourceStride) {
1077   for (int32_t y = 0; y < aSize.height; y++) {
1078     for (int32_t x = 0; x < aSize.width; x += 4) {
1079       int32_t inputIndex = y * aSourceStride + 4 * x;
1080       int32_t targetIndex = y * aTargetStride + 4 * x;
1081       union {
1082         u8x16_t p1234;
1083         uint8_t u8[4][4];
1084       };
1085       p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1086
1087       // Prepare the alpha factors.
1088       uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1089       uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1090       uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1091       uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1092       u16x8_t aF12 =
1093           simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
1094       u16x8_t aF34 =
1095           simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
1096
1097       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1098       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1099
1100       // Multiply with the alpha factors, add 128 for rounding, and shift right
1101       // by 8 bits.
1102       p12 = simd::ShiftRight16<8>(
1103           simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
1104       p34 = simd::ShiftRight16<8>(
1105           simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
1106
1107       u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
1108       simd::Store8(&aTargetData[targetIndex], result);
1109     }
1110   }
1111 }
1112
1113 template <typename u16x8_t, typename u8x16_t>
1114 static void DoOpacityCalculation_SIMD(const IntSize& aSize,
1115                                       uint8_t* aTargetData,
1116                                       int32_t aTargetStride,
1117                                       uint8_t* aSourceData,
1118                                       int32_t aSourceStride, Float aOpacity) {
1119   uint8_t alphaValue = uint8_t(roundf(255.f * aOpacity));
1120   u16x8_t alphaValues =
1121       simd::FromU16<u16x8_t>(alphaValue, alphaValue, alphaValue, alphaValue,
1122                              alphaValue, alphaValue, alphaValue, alphaValue);
1123   for (int32_t y = 0; y < aSize.height; y++) {
1124     for (int32_t x = 0; x < aSize.width; x += 4) {
1125       int32_t inputIndex = y * aSourceStride + 4 * x;
1126       int32_t targetIndex = y * aTargetStride + 4 * x;
1127
1128       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1129       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1130       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1131
1132       // Multiply all components with alpha.
1133       p12 = simd::Mul16(p12, alphaValues);
1134       p34 = simd::Mul16(p34, alphaValues);
1135
1136       // Divide by 255 and pack.
1137       u8x16_t result = simd::PackAndSaturate16To8(simd::ShiftRight16<8>(p12),
1138                                                   simd::ShiftRight16<8>(p34));
1139
1140       simd::Store8(&aTargetData[targetIndex], result);
1141     }
1142   }
1143 }
1144
1145 template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
1146 static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
1147     const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
1148     int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
1149     const Rect& aTileRect) {
1150 #define RETURN_TURBULENCE(Type, Stitch)                                    \
1151   SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
1152       aBaseFrequency, aSeed, aNumOctaves, aTileRect);                      \
1153   return renderer.Render(aSize, aOffset);
1154
1155   switch (aType) {
1156     case TURBULENCE_TYPE_TURBULENCE: {
1157       if (aStitch) {
1158         RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
1159       }
1160       RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
1161     }
1162     case TURBULENCE_TYPE_FRACTAL_NOISE: {
1163       if (aStitch) {
1164         RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
1165       }
1166       RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
1167     }
1168   }
1169   return nullptr;
1170 #undef RETURN_TURBULENCE
1171 }
1172
1173 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1174 template <typename i32x4_t, typename i16x8_t>
1175 static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
1176     i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
1177   // Calculate input product: inProd = (in1 * in2) / 255.
1178   i32x4_t inProd_1, inProd_2;
1179   simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
1180   i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
1181                                                simd::FastDivideBy255(inProd_2));
1182
1183   // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
1184   i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
1185   i16x8_t inProd1AndOneTwentyEight =
1186       simd::InterleaveLo16(inProd, oneTwentyEight);
1187   i16x8_t inProd2AndOneTwentyEight =
1188       simd::InterleaveHi16(inProd, oneTwentyEight);
1189   i32x4_t inProdTimesK1PlusK4_1 =
1190       simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1191   i32x4_t inProdTimesK1PlusK4_2 =
1192       simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1193
1194   // Calculate k2 * in1 + k3 * in2
1195   i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1196   i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1197   i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1198   i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1199
1200   // Sum everything up and truncate the fractional part.
1201   i32x4_t result_1 =
1202       simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1203   i32x4_t result_2 =
1204       simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1205   return simd::PackAndSaturate32To16(result_1, result_2);
1206 }
1207
1208 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
1209 static void ApplyArithmeticCombine_SIMD(
1210     const DataSourceSurface::ScopedMap& aInputMap1,
1211     const DataSourceSurface::ScopedMap& aInputMap2,
1212     const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
1213     Float aK1, Float aK2, Float aK3, Float aK4) {
1214   uint8_t* source1Data = aInputMap1.GetData();
1215   uint8_t* source2Data = aInputMap2.GetData();
1216   uint8_t* targetData = aOutputMap.GetData();
1217   uint32_t source1Stride = aInputMap1.GetStride();
1218   uint32_t source2Stride = aInputMap2.GetStride();
1219   uint32_t targetStride = aOutputMap.GetStride();
1220
1221   // The arithmetic combine filter does the following calculation:
1222   // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1223   //
1224   // Or, with in1/2 integers between 0 and 255:
1225   // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1226   //
1227   // We want the whole calculation to happen in integer, with 16-bit factors.
1228   // So we convert our factors to fixed-point with precision 1.8.7.
1229   // K4 is premultiplied with 255, and it will be multiplied with 128 later
1230   // during the actual calculation, because premultiplying it with 255 * 128
1231   // would overflow int16.
1232
1233   i16x8_t k1 = simd::FromI16<i16x8_t>(
1234       int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1235   i16x8_t k2 = simd::FromI16<i16x8_t>(
1236       int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1237   i16x8_t k3 = simd::FromI16<i16x8_t>(
1238       int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1239   i16x8_t k4 = simd::FromI16<i16x8_t>(
1240       int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1241
1242   i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1243   i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1244
1245   for (int32_t y = 0; y < aSize.height; y++) {
1246     for (int32_t x = 0; x < aSize.width; x += 4) {
1247       uint32_t source1Index = y * source1Stride + 4 * x;
1248       uint32_t source2Index = y * source2Stride + 4 * x;
1249       uint32_t targetIndex = y * targetStride + 4 * x;
1250
1251       // Load and unpack.
1252       u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1253       u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1254       i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1255       i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1256       i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1257       i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1258
1259       // Multiply and add.
1260       i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1261           in1_12, in2_12, k1And4, k2And3);
1262       i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1263           in1_34, in2_34, k1And4, k2And3);
1264
1265       // Pack and store.
1266       simd::Store8(&targetData[targetIndex],
1267                    simd::PackAndSaturate16To8(result_12, result_34));
1268     }
1269   }
1270 }
1271
1272 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
1273 static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
1274     DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
1275     Float aK2, Float aK3, Float aK4) {
1276   IntSize size = aInput1->GetSize();
1277   RefPtr<DataSourceSurface> target =
1278       Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1279   if (!target) {
1280     return nullptr;
1281   }
1282
1283   DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
1284   DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
1285
1286   if (aInput1->Equals(aInput2)) {
1287     ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1288         inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
1289   } else {
1290     DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
1291     ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1292         inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
1293   }
1294
1295   return target.forget();
1296 }
1297
1298 }  // namespace gfx
1299 }  // namespace mozilla