no bug - Import translations from android-l10n r=release a=l10n CLOSED TREE
[gecko.git] / gfx / 2d / FilterProcessingSIMD-inl.h
blob81f30cfc9e47ccef21df670d5344cf026a980e68
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "FilterProcessing.h"
9 #include "SIMD.h"
10 #include "SVGTurbulenceRenderer-inl.h"
12 namespace mozilla {
13 namespace gfx {
15 template <typename u8x16_t>
16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD(
17 SourceSurface* aSurface) {
18 IntSize size = aSurface->GetSize();
19 RefPtr<DataSourceSurface> output =
20 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
21 if (!output) {
22 return nullptr;
25 RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
26 DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ);
27 DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE);
28 uint8_t* inputData = inputMap.GetData();
29 uint8_t* outputData = outputMap.GetData();
30 int32_t inputStride = inputMap.GetStride();
31 int32_t outputStride = outputMap.GetStride();
32 switch (input->GetFormat()) {
33 case SurfaceFormat::B8G8R8A8:
34 output = input;
35 break;
36 case SurfaceFormat::B8G8R8X8:
37 for (int32_t y = 0; y < size.height; y++) {
38 for (int32_t x = 0; x < size.width; x++) {
39 int32_t inputIndex = y * inputStride + 4 * x;
40 int32_t outputIndex = y * outputStride + 4 * x;
41 outputData[outputIndex + 0] = inputData[inputIndex + 0];
42 outputData[outputIndex + 1] = inputData[inputIndex + 1];
43 outputData[outputIndex + 2] = inputData[inputIndex + 2];
44 outputData[outputIndex + 3] = 255;
47 break;
48 case SurfaceFormat::R8G8B8A8:
49 for (int32_t y = 0; y < size.height; y++) {
50 for (int32_t x = 0; x < size.width; x++) {
51 int32_t inputIndex = y * inputStride + 4 * x;
52 int32_t outputIndex = y * outputStride + 4 * x;
53 outputData[outputIndex + 2] = inputData[inputIndex + 0];
54 outputData[outputIndex + 1] = inputData[inputIndex + 1];
55 outputData[outputIndex + 0] = inputData[inputIndex + 2];
56 outputData[outputIndex + 3] = inputData[inputIndex + 3];
59 break;
60 case SurfaceFormat::R8G8B8X8:
61 for (int32_t y = 0; y < size.height; y++) {
62 for (int32_t x = 0; x < size.width; x++) {
63 int32_t inputIndex = y * inputStride + 4 * x;
64 int32_t outputIndex = y * outputStride + 4 * x;
65 outputData[outputIndex + 2] = inputData[inputIndex + 0];
66 outputData[outputIndex + 1] = inputData[inputIndex + 1];
67 outputData[outputIndex + 0] = inputData[inputIndex + 2];
68 outputData[outputIndex + 3] = 255;
71 break;
72 case SurfaceFormat::A8:
73 for (int32_t y = 0; y < size.height; y++) {
74 for (int32_t x = 0; x < size.width; x += 16) {
75 int32_t inputIndex = y * inputStride + x;
76 int32_t outputIndex = y * outputStride + 4 * x;
77 u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
78 // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
79 // interleaving with 0000000000000000 twice.
80 u8x16_t zero = simd::FromZero8<u8x16_t>();
81 u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
82 u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
83 u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
84 u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
85 u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
86 u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
87 simd::Store8(&outputData[outputIndex], p1To4);
88 if ((x + 4) * 4 < outputStride) {
89 simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
91 if ((x + 8) * 4 < outputStride) {
92 simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
94 if ((x + 12) * 4 < outputStride) {
95 simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
99 break;
100 default:
101 output = nullptr;
102 break;
104 return output.forget();
107 template <typename u8x16_t>
108 inline void ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData,
109 int32_t sourceStride, uint8_t* alphaData,
110 int32_t alphaStride) {
111 for (int32_t y = 0; y < size.height; y++) {
112 for (int32_t x = 0; x < size.width; x += 16) {
113 // Process 16 pixels at a time.
114 // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of
115 // AAAAAAAAAAAAAAAA.
116 int32_t sourceIndex = y * sourceStride + 4 * x;
117 int32_t targetIndex = y * alphaStride + x;
119 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
120 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
121 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
123 u8x16_t bgrabgrabgrabgra1 =
124 simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
125 if (4 * (x + 4) < sourceStride) {
126 bgrabgrabgrabgra2 =
127 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
129 if (4 * (x + 8) < sourceStride) {
130 bgrabgrabgrabgra3 =
131 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
133 if (4 * (x + 12) < sourceStride) {
134 bgrabgrabgrabgra4 =
135 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
138 u8x16_t bbggrraabbggrraa1 =
139 simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
140 u8x16_t bbggrraabbggrraa2 =
141 simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
142 u8x16_t bbggrraabbggrraa3 =
143 simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
144 u8x16_t bbggrraabbggrraa4 =
145 simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
146 u8x16_t bbbbggggrrrraaaa1 =
147 simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
148 u8x16_t bbbbggggrrrraaaa2 =
149 simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
150 u8x16_t bbbbggggrrrraaaa3 =
151 simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
152 u8x16_t bbbbggggrrrraaaa4 =
153 simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
154 u8x16_t rrrrrrrraaaaaaaa1 =
155 simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
156 u8x16_t rrrrrrrraaaaaaaa2 =
157 simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
158 u8x16_t aaaaaaaaaaaaaaaa =
159 simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
161 simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
166 // This function calculates the result color values for four pixels, but for
167 // only two color channels - either b & r or g & a. However, the a result will
168 // not be used.
169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
171 // alpha of all four pixels (and both aaaa's are the same).
172 // blendendComponent1 and blendedComponent2 are the out parameters.
173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
175 i16x8_t dest,
176 const i16x8_t& destAlpha,
177 i32x4_t& blendedComponent1,
178 i32x4_t& blendedComponent2) {
179 i16x8_t x255 = simd::FromI16<i16x8_t>(255);
181 switch (aBlendMode) {
182 case BLEND_MODE_MULTIPLY: {
183 // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) *
184 // dest);
185 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
186 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
187 i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource =
188 simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
190 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
191 i16x8_t leftFactor1 = simd::InterleaveLo16(
192 twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
193 blendedComponent1 =
194 simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
195 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
197 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
198 i16x8_t leftFactor2 = simd::InterleaveHi16(
199 twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
200 blendedComponent2 =
201 simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
202 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
204 break;
207 case BLEND_MODE_SCREEN: {
208 // val = 255 * (source + dest) + (0 - dest) * source;
209 i16x8_t sourcePlusDest = simd::Add16(source, dest);
210 i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
212 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 =
213 simd::InterleaveLo16(x255, zeroMinusDest);
214 i16x8_t sourcePlusDestInterleavedWithSource1 =
215 simd::InterleaveLo16(sourcePlusDest, source);
216 blendedComponent1 =
217 simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1,
218 sourcePlusDestInterleavedWithSource1);
219 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
221 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 =
222 simd::InterleaveHi16(x255, zeroMinusDest);
223 i16x8_t sourcePlusDestInterleavedWithSource2 =
224 simd::InterleaveHi16(sourcePlusDest, source);
225 blendedComponent2 =
226 simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2,
227 sourcePlusDestInterleavedWithSource2);
228 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
230 break;
233 case BLEND_MODE_DARKEN:
234 case BLEND_MODE_LIGHTEN: {
235 // Darken:
236 // val = min((255 - destAlpha) * source + 255 * dest,
237 // 255 * source + (255 - sourceAlpha) * dest);
239 // Lighten:
240 // val = max((255 - destAlpha) * source + 255 * dest,
241 // 255 * source + (255 - sourceAlpha) * dest);
243 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
244 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
246 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 =
247 simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
248 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 =
249 simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
250 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
251 i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(
252 twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1,
253 sourceInterleavedWithDest1);
254 i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(
255 twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1,
256 sourceInterleavedWithDest1);
257 blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN
258 ? simd::Min32(product1_1, product1_2)
259 : simd::Max32(product1_1, product1_2);
260 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
262 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 =
263 simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
264 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 =
265 simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
266 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
267 i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(
268 twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2,
269 sourceInterleavedWithDest2);
270 i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(
271 twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2,
272 sourceInterleavedWithDest2);
273 blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN
274 ? simd::Min32(product2_1, product2_2)
275 : simd::Max32(product2_1, product2_2);
276 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
278 break;
283 // The alpha channel is subject to a different calculation than the RGB
284 // channels, and this calculation is the same for all blend modes:
285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
286 template <typename i16x8_t, typename i32x4_t>
287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,
288 i16x8_t d_rrrraaaa1234) {
289 // clang-format off
290 // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
291 // appropriately. The calculation is rewritten as follows:
292 // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
293 // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
294 // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
295 // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
296 // clang-format on
297 i16x8_t zeroInterleavedWithSourceAlpha =
298 simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
299 i16x8_t fiveTenInterleavedWithDestAlpha =
300 simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
301 i16x8_t f1 =
302 simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
303 i16x8_t f2 =
304 simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
305 return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
308 template <typename u8x16_t, typename i16x8_t>
309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
310 i16x8_t& bbbbgggg1234,
311 i16x8_t& rrrraaaa1234) {
312 // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
313 i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
314 i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
315 i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
316 i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
317 bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
318 rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
323 i32x4_t rrrr1234,
324 const i32x4_t& aaaa1234) {
325 // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
326 i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
327 i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
328 i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
329 i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
330 i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
331 i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
332 return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1,
337 const DataSourceSurface::ScopedMap& aInputMap2,
338 const DataSourceSurface::ScopedMap& aOutputMap,
339 const IntSize& aSize) {
340 uint8_t* source1Data = aInputMap1.GetData();
341 uint8_t* source2Data = aInputMap2.GetData();
342 uint8_t* targetData = aOutputMap.GetData();
343 int32_t targetStride = aOutputMap.GetStride();
344 int32_t source1Stride = aInputMap1.GetStride();
345 int32_t source2Stride = aInputMap2.GetStride();
347 for (int32_t y = 0; y < aSize.height; y++) {
348 for (int32_t x = 0; x < aSize.width; x += 4) {
349 int32_t targetIndex = y * targetStride + 4 * x;
350 int32_t source1Index = y * source1Stride + 4 * x;
351 int32_t source2Index = y * source2Stride + 4 * x;
353 u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
354 u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
356 // The blending calculation for the RGB channels all need access to the
357 // alpha channel of their pixel, and the alpha calculation is different,
358 // so it makes sense to separate by channel.
360 i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
361 i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
362 UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
363 UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
364 i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234);
365 i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234);
367 // We only use blendedB, blendedG and blendedR.
368 i32x4_t blendedB, blendedG, blendedR, blendedA;
369 BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
370 s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234,
371 blendedB, blendedG);
372 BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
373 s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234,
374 blendedR, blendedA);
376 // Throw away blendedA and overwrite it with the correct blended alpha.
377 blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234,
378 d_rrrraaaa1234);
380 u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>(
381 blendedB, blendedG, blendedR, blendedA);
382 simd::Store8(&targetData[targetIndex], result1234);
387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
389 DataSourceSurface* aInput1, DataSourceSurface* aInput2) {
390 IntSize size = aInput1->GetSize();
391 RefPtr<DataSourceSurface> target =
392 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
393 if (!target) {
394 return nullptr;
397 DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
398 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
399 if (aInput1->Equals(aInput2)) {
400 ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1,
401 outputMap, size);
402 } else {
403 DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
404 ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2,
405 outputMap, size);
408 return target.forget();
411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
413 DataSourceSurface* aInput1, DataSourceSurface* aInput2,
414 BlendMode aBlendMode) {
415 switch (aBlendMode) {
416 case BLEND_MODE_MULTIPLY:
417 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>(
418 aInput1, aInput2);
419 case BLEND_MODE_SCREEN:
420 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>(
421 aInput1, aInput2);
422 case BLEND_MODE_DARKEN:
423 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>(
424 aInput1, aInput2);
425 case BLEND_MODE_LIGHTEN:
426 return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>(
427 aInput1, aInput2);
428 default:
429 return nullptr;
433 template <MorphologyOperator Operator, typename u8x16_t>
434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) {
435 return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b)
436 : simd::Max8(a, b);
439 // Set every pixel to the per-component minimum or maximum of the pixels around
440 // it that are up to aRadius pixels away from it (horizontally).
441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
442 inline void ApplyMorphologyHorizontal_SIMD(
443 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
444 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
445 static_assert(
446 op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
447 "unexpected morphology operator");
449 int32_t kernelSize = aRadius + 1 + aRadius;
450 MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
451 MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
452 int32_t completeKernelSizeForFourPixels = kernelSize + 3;
453 MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
454 completeKernelSizeForFourPixels % 4 == 2);
456 // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
457 // the way we need them to be.
459 IntRect sourceRect = aDestRect;
460 sourceRect.Inflate(aRadius, 0);
462 for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) {
463 int32_t kernelStartX = aDestRect.X() - aRadius;
464 for (int32_t x = aDestRect.X(); x < aDestRect.XMost();
465 x += 4, kernelStartX += 4) {
466 // We process four pixels (16 color values) at a time.
467 // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
468 // source values can be read beyond that because the source is extended
469 // by aRadius pixels.
471 int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
472 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
473 u8x16_t m1234 = p1234;
475 for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
476 u8x16_t p5678 =
477 (kernelStartX + i < sourceRect.XMost())
478 ? simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i])
479 : simd::FromZero8<u8x16_t>();
480 u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
481 u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
482 m1234 = Morph8<op, u8x16_t>(m1234, p2345);
483 m1234 = Morph8<op, u8x16_t>(m1234, p3456);
484 if (i + 2 < completeKernelSizeForFourPixels) {
485 u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
486 m1234 = Morph8<op, u8x16_t>(m1234, p4567);
487 m1234 = Morph8<op, u8x16_t>(m1234, p5678);
489 p1234 = p5678;
492 int32_t destIndex = y * aDestStride + 4 * x;
493 simd::Store8(&aDestData[destIndex], m1234);
498 template <typename i16x8_t, typename u8x16_t>
499 inline void ApplyMorphologyHorizontal_SIMD(
500 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
501 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
502 MorphologyOperator aOp) {
503 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
504 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
505 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
506 } else {
507 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t,
508 u8x16_t>(
509 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
513 // Set every pixel to the per-component minimum or maximum of the pixels around
514 // it that are up to aRadius pixels away from it (vertically).
515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
516 static void ApplyMorphologyVertical_SIMD(
517 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
518 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
519 static_assert(
520 op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
521 "unexpected morphology operator");
523 int32_t startY = aDestRect.Y() - aRadius;
524 int32_t endY = aDestRect.Y() + aRadius;
525 for (int32_t y = aDestRect.Y(); y < aDestRect.YMost();
526 y++, startY++, endY++) {
527 for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) {
528 int32_t sourceIndex = startY * aSourceStride + 4 * x;
529 u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
530 sourceIndex += aSourceStride;
531 for (int32_t iy = startY + 1; iy <= endY;
532 iy++, sourceIndex += aSourceStride) {
533 u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
534 u = Morph8<op, u8x16_t>(u, u2);
537 int32_t destIndex = y * aDestStride + 4 * x;
538 simd::Store8(&aDestData[destIndex], u);
543 template <typename i16x8_t, typename u8x16_t>
544 inline void ApplyMorphologyVertical_SIMD(
545 uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
546 int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
547 MorphologyOperator aOp) {
548 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
549 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
550 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
551 } else {
552 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>(
553 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
557 template <typename i32x4_t, typename i16x8_t>
558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra,
559 const i32x4_t& bias) {
560 // int16_t p[8] == { b, g, r, a, b, g, r, a }.
561 // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
562 // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
563 // int32_t bias[4] == { _B, _G, _R, _A }.
565 i32x4_t sum = bias;
567 // int16_t bg[8] = { b, g, b, g, b, g, b, g };
568 i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p));
569 // int32_t prodsum_bg[4] =
570 // { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
571 i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
572 sum = simd::Add32(sum, prodsum_bg);
574 // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
575 i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p));
576 // int32_t prodsum_ra[4] =
577 // { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
578 i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
579 sum = simd::Add32(sum, prodsum_ra);
581 // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
582 return sum;
585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD(
587 DataSourceSurface* aInput, const Matrix5x4& aMatrix) {
588 IntSize size = aInput->GetSize();
589 RefPtr<DataSourceSurface> target =
590 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
591 if (!target) {
592 return nullptr;
595 DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ);
596 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
598 uint8_t* sourceData = inputMap.GetData();
599 uint8_t* targetData = outputMap.GetData();
600 int32_t sourceStride = inputMap.GetStride();
601 int32_t targetStride = outputMap.GetStride();
603 const int16_t factor = 128;
604 const Float floatElementMax = INT16_MAX / factor; // 255
605 MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX,
606 "badly chosen float-to-int scale");
608 const Float* floats = &aMatrix._11;
610 ptrdiff_t componentOffsets[4] = {
611 B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G,
612 B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A};
614 // We store the color matrix in rows_bgra in the following format:
615 // { bB, bG, bR, bA, gB, gG, gR, gA }.
616 // { bB, gB, bG, gG, bR, gR, bA, gA }
617 // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
618 // which works especially well for our use case.
619 int16_t rows_bgra[2][8];
620 for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
621 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
622 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
623 Float clampedFloatMatrixElement = std::min(
624 std::max(floatMatrixElement, -floatElementMax), floatElementMax);
625 int16_t scaledIntMatrixElement =
626 int16_t(clampedFloatMatrixElement * factor + 0.5);
627 int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
628 int8_t g_or_a = componentOffsets[rowIndex] % 2;
629 int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
630 rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] =
631 scaledIntMatrixElement;
635 int32_t rowBias[4];
636 Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
637 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
638 size_t rowIndex = 4;
639 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
640 Float clampedFloatMatrixElement =
641 std::min(std::max(floatMatrixElement, -biasMax), biasMax);
642 int32_t scaledIntMatrixElement =
643 int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
644 rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
647 i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
648 rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
649 rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
651 i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
652 rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
653 rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
655 i32x4_t rowsBias_v =
656 simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
658 for (int32_t y = 0; y < size.height; y++) {
659 for (int32_t x = 0; x < size.width; x += 4) {
660 MOZ_ASSERT(sourceStride >= 4 * (x + 4),
661 "need to be able to read 4 pixels at this position");
662 MOZ_ASSERT(targetStride >= 4 * (x + 4),
663 "need to be able to write 4 pixels at this position");
664 int32_t sourceIndex = y * sourceStride + 4 * x;
665 int32_t targetIndex = y * targetStride + 4 * x;
667 // We load 4 pixels, unpack them, process them 1 pixel at a time, and
668 // finally pack and store the 4 result pixels.
670 u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
672 // Splat needed to get each pixel twice into i16x8
673 i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
674 i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
675 i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
676 i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
678 i32x4_t result_p1 =
679 ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
680 i32x4_t result_p2 =
681 ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
682 i32x4_t result_p3 =
683 ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
684 i32x4_t result_p4 =
685 ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
687 static_assert(factor == 1 << 7,
688 "Please adapt the calculation in the lines below for a "
689 "different factor.");
690 u8x16_t result_p1234 = simd::PackAndSaturate32To8(
691 simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2),
692 simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
693 simd::Store8(&targetData[targetIndex], result_p1234);
697 return target.forget();
700 // source / dest: bgra bgra
701 // sourceAlpha / destAlpha: aaaa aaaa
702 // result: bgra bgra
703 template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
704 static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
705 u16x8_t dest,
706 const u16x8_t& destAlpha) {
707 u16x8_t x255 = simd::FromU16<u16x8_t>(255);
709 switch (aCompositeOperator) {
710 case COMPOSITE_OPERATOR_OVER: {
711 // val = dest * (255 - sourceAlpha) + source * 255;
712 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
714 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
715 u16x8_t rightFactor1 =
716 simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
717 i32x4_t result1 =
718 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
720 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
721 u16x8_t rightFactor2 =
722 simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
723 i32x4_t result2 =
724 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
726 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
727 simd::FastDivideBy255(result2));
730 case COMPOSITE_OPERATOR_IN: {
731 // val = source * destAlpha;
732 return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
735 case COMPOSITE_OPERATOR_OUT: {
736 // val = source * (255 - destAlpha);
737 u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
738 return simd::FastDivideBy255_16(prod);
741 case COMPOSITE_OPERATOR_ATOP: {
742 // val = dest * (255 - sourceAlpha) + source * destAlpha;
743 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
745 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
746 u16x8_t rightFactor1 =
747 simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
748 i32x4_t result1 =
749 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
751 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
752 u16x8_t rightFactor2 =
753 simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
754 i32x4_t result2 =
755 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
757 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
758 simd::FastDivideBy255(result2));
761 case COMPOSITE_OPERATOR_XOR: {
762 // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
763 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
764 u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
766 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
767 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
768 twoFiftyFiveMinusDestAlpha);
769 i32x4_t result1 =
770 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
772 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
773 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
774 twoFiftyFiveMinusDestAlpha);
775 i32x4_t result2 =
776 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
778 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
779 simd::FastDivideBy255(result2));
782 case COMPOSITE_OPERATOR_LIGHTER: {
783 // val = dest * sourceAlpha + source * destAlpha;
784 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
785 u16x8_t rightFactor1 = simd::InterleaveLo16(sourceAlpha, destAlpha);
786 i32x4_t result1 =
787 simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
789 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
790 u16x8_t rightFactor2 = simd::InterleaveHi16(sourceAlpha, destAlpha);
791 i32x4_t result2 =
792 simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
794 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
795 simd::FastDivideBy255(result2));
798 default:
799 return simd::FromU16<u16x8_t>(0);
803 template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
804 static void ApplyComposition(DataSourceSurface* aSource,
805 DataSourceSurface* aDest) {
806 IntSize size = aDest->GetSize();
808 DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
809 DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);
811 uint8_t* sourceData = input.GetData();
812 uint8_t* destData = output.GetData();
813 uint32_t sourceStride = input.GetStride();
814 uint32_t destStride = output.GetStride();
816 for (int32_t y = 0; y < size.height; y++) {
817 for (int32_t x = 0; x < size.width; x += 4) {
818 uint32_t sourceIndex = y * sourceStride + 4 * x;
819 uint32_t destIndex = y * destStride + 4 * x;
821 u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
822 u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
824 u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
825 u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
826 u16x8_t sa12 = simd::Splat16<3, 3>(s12);
827 u16x8_t da12 = simd::Splat16<3, 3>(d12);
828 u16x8_t result12 =
829 CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);
831 u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
832 u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
833 u16x8_t sa34 = simd::Splat16<3, 3>(s34);
834 u16x8_t da34 = simd::Splat16<3, 3>(d34);
835 u16x8_t result34 =
836 CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);
838 u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
839 simd::Store8(&destData[destIndex], result1234);
844 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
845 static void ApplyComposition_SIMD(DataSourceSurface* aSource,
846 DataSourceSurface* aDest,
847 CompositeOperator aOperator) {
848 switch (aOperator) {
849 case COMPOSITE_OPERATOR_OVER:
850 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
851 aSource, aDest);
852 break;
853 case COMPOSITE_OPERATOR_IN:
854 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
855 aSource, aDest);
856 break;
857 case COMPOSITE_OPERATOR_OUT:
858 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
859 aSource, aDest);
860 break;
861 case COMPOSITE_OPERATOR_ATOP:
862 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
863 aSource, aDest);
864 break;
865 case COMPOSITE_OPERATOR_XOR:
866 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
867 aSource, aDest);
868 break;
869 case COMPOSITE_OPERATOR_LIGHTER:
870 ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_LIGHTER>(
871 aSource, aDest);
872 break;
873 default:
874 MOZ_CRASH("GFX: Incomplete switch");
878 template <typename u8x16_t>
879 static void SeparateColorChannels_SIMD(
880 const IntSize& size, uint8_t* sourceData, int32_t sourceStride,
881 uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
882 uint8_t* channel3Data, int32_t channelStride) {
883 for (int32_t y = 0; y < size.height; y++) {
884 for (int32_t x = 0; x < size.width; x += 16) {
885 // Process 16 pixels at a time.
886 int32_t sourceIndex = y * sourceStride + 4 * x;
887 int32_t targetIndex = y * channelStride + x;
889 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
890 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
891 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
893 u8x16_t bgrabgrabgrabgra1 =
894 simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
895 if (4 * (x + 4) < sourceStride) {
896 bgrabgrabgrabgra2 =
897 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
899 if (4 * (x + 8) < sourceStride) {
900 bgrabgrabgrabgra3 =
901 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
903 if (4 * (x + 12) < sourceStride) {
904 bgrabgrabgrabgra4 =
905 simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
908 u8x16_t bbggrraabbggrraa1 =
909 simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
910 u8x16_t bbggrraabbggrraa2 =
911 simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
912 u8x16_t bbggrraabbggrraa3 =
913 simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
914 u8x16_t bbggrraabbggrraa4 =
915 simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
916 u8x16_t bbbbggggrrrraaaa1 =
917 simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
918 u8x16_t bbbbggggrrrraaaa2 =
919 simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
920 u8x16_t bbbbggggrrrraaaa3 =
921 simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
922 u8x16_t bbbbggggrrrraaaa4 =
923 simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
924 u8x16_t bbbbbbbbgggggggg1 =
925 simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
926 u8x16_t rrrrrrrraaaaaaaa1 =
927 simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
928 u8x16_t bbbbbbbbgggggggg2 =
929 simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
930 u8x16_t rrrrrrrraaaaaaaa2 =
931 simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
932 u8x16_t bbbbbbbbbbbbbbbb =
933 simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
934 u8x16_t gggggggggggggggg =
935 simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
936 u8x16_t rrrrrrrrrrrrrrrr =
937 simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
938 u8x16_t aaaaaaaaaaaaaaaa =
939 simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
941 simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
942 simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
943 simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
944 simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
949 template <typename u8x16_t>
950 static void CombineColorChannels_SIMD(
951 const IntSize& size, int32_t resultStride, uint8_t* resultData,
952 int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data,
953 uint8_t* channel2Data, uint8_t* channel3Data) {
954 for (int32_t y = 0; y < size.height; y++) {
955 for (int32_t x = 0; x < size.width; x += 16) {
956 // Process 16 pixels at a time.
957 int32_t resultIndex = y * resultStride + 4 * x;
958 int32_t channelIndex = y * channelStride + x;
960 u8x16_t bbbbbbbbbbbbbbbb =
961 simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
962 u8x16_t gggggggggggggggg =
963 simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
964 u8x16_t rrrrrrrrrrrrrrrr =
965 simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
966 u8x16_t aaaaaaaaaaaaaaaa =
967 simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
969 u8x16_t brbrbrbrbrbrbrbr1 =
970 simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
971 u8x16_t brbrbrbrbrbrbrbr2 =
972 simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
973 u8x16_t gagagagagagagaga1 =
974 simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
975 u8x16_t gagagagagagagaga2 =
976 simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
978 u8x16_t bgrabgrabgrabgra1 =
979 simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
980 u8x16_t bgrabgrabgrabgra2 =
981 simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
982 u8x16_t bgrabgrabgrabgra3 =
983 simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
984 u8x16_t bgrabgrabgrabgra4 =
985 simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
987 simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
988 if (4 * (x + 4) < resultStride) {
989 simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
991 if (4 * (x + 8) < resultStride) {
992 simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
994 if (4 * (x + 12) < resultStride) {
995 simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
1001 template <typename i32x4_t, typename u16x8_t, typename u8x16_t>
1002 static void DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
1003 uint8_t* aTargetData,
1004 int32_t aTargetStride,
1005 uint8_t* aSourceData,
1006 int32_t aSourceStride) {
1007 const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff,
1008 0, 0, 0, 0xff, 0, 0, 0, 0xff);
1009 for (int32_t y = 0; y < aSize.height; y++) {
1010 for (int32_t x = 0; x < aSize.width; x += 4) {
1011 int32_t inputIndex = y * aSourceStride + 4 * x;
1012 int32_t targetIndex = y * aTargetStride + 4 * x;
1014 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1015 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1016 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1018 // Multiply all components with alpha.
1019 p12 = simd::Mul16(p12, simd::Splat16<3, 3>(p12));
1020 p34 = simd::Mul16(p34, simd::Splat16<3, 3>(p34));
1022 // Divide by 255 and pack.
1023 u8x16_t result = simd::PackAndSaturate16To8(
1024 simd::FastDivideBy255_16(p12), simd::FastDivideBy255_16(p34));
1026 // Get the original alpha channel value back from p1234.
1027 result = simd::Pick(alphaMask, result, p1234);
1029 simd::Store8(&aTargetData[targetIndex], result);
// Precomputed 8.8 fixed-point factors used for unpremultiplying.
//
// The goal is to evaluate round(r / (alpha / 255.0f)) for arbitrary r and
// alpha in constant time. The entries are chosen such that
//
//   (r * sAlphaFactors[alpha] + 128) >> 8
//
// roughly yields that value, with a maximum deviation of 1.
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)   for alpha > 0
//   sAlphaFactors[0]     == 0
//
// The table was generated with the Python expression
//   ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0)
//             for alpha in range(256))
//
// Eight entries per row; the trailing comment gives the alpha range of the
// row.
static const uint16_t sAlphaFactors[256] = {
    0,    65280, 32640, 21760, 16320, 13056, 10880, 9326,  // 0-7
    8160, 7253,  6528,  5935,  5440,  5022,  4663,  4352,  // 8-15
    4080, 3840,  3627,  3436,  3264,  3109,  2967,  2838,  // 16-23
    2720, 2611,  2511,  2418,  2331,  2251,  2176,  2106,  // 24-31
    2040, 1978,  1920,  1865,  1813,  1764,  1718,  1674,  // 32-39
    1632, 1592,  1554,  1518,  1484,  1451,  1419,  1389,  // 40-47
    1360, 1332,  1306,  1280,  1255,  1232,  1209,  1187,  // 48-55
    1166, 1145,  1126,  1106,  1088,  1070,  1053,  1036,  // 56-63
    1020, 1004,  989,   974,   960,   946,   933,   919,   // 64-71
    907,  894,   882,   870,   859,   848,   837,   826,   // 72-79
    816,  806,   796,   787,   777,   768,   759,   750,   // 80-87
    742,  733,   725,   717,   710,   702,   694,   687,   // 88-95
    680,  673,   666,   659,   653,   646,   640,   634,   // 96-103
    628,  622,   616,   610,   604,   599,   593,   588,   // 104-111
    583,  578,   573,   568,   563,   558,   553,   549,   // 112-119
    544,  540,   535,   531,   526,   522,   518,   514,   // 120-127
    510,  506,   502,   498,   495,   491,   487,   484,   // 128-135
    480,  476,   473,   470,   466,   463,   460,   457,   // 136-143
    453,  450,   447,   444,   441,   438,   435,   432,   // 144-151
    429,  427,   424,   421,   418,   416,   413,   411,   // 152-159
    408,  405,   403,   400,   398,   396,   393,   391,   // 160-167
    389,  386,   384,   382,   380,   377,   375,   373,   // 168-175
    371,  369,   367,   365,   363,   361,   359,   357,   // 176-183
    355,  353,   351,   349,   347,   345,   344,   342,   // 184-191
    340,  338,   336,   335,   333,   331,   330,   328,   // 192-199
    326,  325,   323,   322,   320,   318,   317,   315,   // 200-207
    314,  312,   311,   309,   308,   306,   305,   304,   // 208-215
    302,  301,   299,   298,   297,   295,   294,   293,   // 216-223
    291,  290,   289,   288,   286,   285,   284,   283,   // 224-231
    281,  280,   279,   278,   277,   275,   274,   273,   // 232-239
    272,  271,   270,   269,   268,   266,   265,   264,   // 240-247
    263,  262,   261,   260,   259,   258,   257,   256};  // 248-255
1071 template <typename u16x8_t, typename u8x16_t>
1072 static void DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
1073 uint8_t* aTargetData,
1074 int32_t aTargetStride,
1075 uint8_t* aSourceData,
1076 int32_t aSourceStride) {
1077 for (int32_t y = 0; y < aSize.height; y++) {
1078 for (int32_t x = 0; x < aSize.width; x += 4) {
1079 int32_t inputIndex = y * aSourceStride + 4 * x;
1080 int32_t targetIndex = y * aTargetStride + 4 * x;
1081 union {
1082 u8x16_t p1234;
1083 uint8_t u8[4][4];
1085 p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1087 // Prepare the alpha factors.
1088 uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1089 uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1090 uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1091 uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
1092 u16x8_t aF12 =
1093 simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
1094 u16x8_t aF34 =
1095 simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
1097 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1098 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1100 // Multiply with the alpha factors, add 128 for rounding, and shift right
1101 // by 8 bits.
1102 p12 = simd::ShiftRight16<8>(
1103 simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
1104 p34 = simd::ShiftRight16<8>(
1105 simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
1107 u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
1108 simd::Store8(&aTargetData[targetIndex], result);
1113 template <typename u16x8_t, typename u8x16_t>
1114 static void DoOpacityCalculation_SIMD(const IntSize& aSize,
1115 uint8_t* aTargetData,
1116 int32_t aTargetStride,
1117 uint8_t* aSourceData,
1118 int32_t aSourceStride, Float aOpacity) {
1119 uint8_t alphaValue = uint8_t(roundf(255.f * aOpacity));
1120 u16x8_t alphaValues =
1121 simd::FromU16<u16x8_t>(alphaValue, alphaValue, alphaValue, alphaValue,
1122 alphaValue, alphaValue, alphaValue, alphaValue);
1123 for (int32_t y = 0; y < aSize.height; y++) {
1124 for (int32_t x = 0; x < aSize.width; x += 4) {
1125 int32_t inputIndex = y * aSourceStride + 4 * x;
1126 int32_t targetIndex = y * aTargetStride + 4 * x;
1128 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
1129 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
1130 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
1132 // Multiply all components with alpha.
1133 p12 = simd::Mul16(p12, alphaValues);
1134 p34 = simd::Mul16(p34, alphaValues);
1136 // Divide by 255 and pack.
1137 u8x16_t result = simd::PackAndSaturate16To8(simd::ShiftRight16<8>(p12),
1138 simd::ShiftRight16<8>(p34));
1140 simd::Store8(&aTargetData[targetIndex], result);
1145 template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
1146 static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
1147 const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
1148 int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
1149 const Rect& aTileRect) {
1150 #define RETURN_TURBULENCE(Type, Stitch) \
1151 SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
1152 aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
1153 return renderer.Render(aSize, aOffset);
1155 switch (aType) {
1156 case TURBULENCE_TYPE_TURBULENCE: {
1157 if (aStitch) {
1158 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
1160 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
1162 case TURBULENCE_TYPE_FRACTAL_NOISE: {
1163 if (aStitch) {
1164 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
1166 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
1169 return nullptr;
1170 #undef RETURN_TURBULENCE
1173 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1174 template <typename i32x4_t, typename i16x8_t>
1175 static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
1176 i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
1177 // Calculate input product: inProd = (in1 * in2) / 255.
1178 i32x4_t inProd_1, inProd_2;
1179 simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
1180 i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
1181 simd::FastDivideBy255(inProd_2));
1183 // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
1184 i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
1185 i16x8_t inProd1AndOneTwentyEight =
1186 simd::InterleaveLo16(inProd, oneTwentyEight);
1187 i16x8_t inProd2AndOneTwentyEight =
1188 simd::InterleaveHi16(inProd, oneTwentyEight);
1189 i32x4_t inProdTimesK1PlusK4_1 =
1190 simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1191 i32x4_t inProdTimesK1PlusK4_2 =
1192 simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1194 // Calculate k2 * in1 + k3 * in2
1195 i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1196 i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1197 i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1198 i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1200 // Sum everything up and truncate the fractional part.
1201 i32x4_t result_1 =
1202 simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1203 i32x4_t result_2 =
1204 simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1205 return simd::PackAndSaturate32To16(result_1, result_2);
1208 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
1209 static void ApplyArithmeticCombine_SIMD(
1210 const DataSourceSurface::ScopedMap& aInputMap1,
1211 const DataSourceSurface::ScopedMap& aInputMap2,
1212 const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
1213 Float aK1, Float aK2, Float aK3, Float aK4) {
1214 uint8_t* source1Data = aInputMap1.GetData();
1215 uint8_t* source2Data = aInputMap2.GetData();
1216 uint8_t* targetData = aOutputMap.GetData();
1217 uint32_t source1Stride = aInputMap1.GetStride();
1218 uint32_t source2Stride = aInputMap2.GetStride();
1219 uint32_t targetStride = aOutputMap.GetStride();
1221 // The arithmetic combine filter does the following calculation:
1222 // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1224 // Or, with in1/2 integers between 0 and 255:
1225 // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1227 // We want the whole calculation to happen in integer, with 16-bit factors.
1228 // So we convert our factors to fixed-point with precision 1.8.7.
1229 // K4 is premultiplied with 255, and it will be multiplied with 128 later
1230 // during the actual calculation, because premultiplying it with 255 * 128
1231 // would overflow int16.
1233 i16x8_t k1 = simd::FromI16<i16x8_t>(
1234 int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1235 i16x8_t k2 = simd::FromI16<i16x8_t>(
1236 int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1237 i16x8_t k3 = simd::FromI16<i16x8_t>(
1238 int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1239 i16x8_t k4 = simd::FromI16<i16x8_t>(
1240 int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1242 i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1243 i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1245 for (int32_t y = 0; y < aSize.height; y++) {
1246 for (int32_t x = 0; x < aSize.width; x += 4) {
1247 uint32_t source1Index = y * source1Stride + 4 * x;
1248 uint32_t source2Index = y * source2Stride + 4 * x;
1249 uint32_t targetIndex = y * targetStride + 4 * x;
1251 // Load and unpack.
1252 u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1253 u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1254 i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1255 i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1256 i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1257 i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1259 // Multiply and add.
1260 i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1261 in1_12, in2_12, k1And4, k2And3);
1262 i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
1263 in1_34, in2_34, k1And4, k2And3);
1265 // Pack and store.
1266 simd::Store8(&targetData[targetIndex],
1267 simd::PackAndSaturate16To8(result_12, result_34));
1272 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
1273 static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
1274 DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
1275 Float aK2, Float aK3, Float aK4) {
1276 IntSize size = aInput1->GetSize();
1277 RefPtr<DataSourceSurface> target =
1278 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1279 if (!target) {
1280 return nullptr;
1283 DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
1284 DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
1286 if (aInput1->Equals(aInput2)) {
1287 ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1288 inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
1289 } else {
1290 DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
1291 ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
1292 inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
1295 return target.forget();
1298 } // namespace gfx
1299 } // namespace mozilla