no bug - Import translations from android-l10n r=release a=l10n CLOSED TREE
[gecko.git] / gfx / 2d / SwizzleNEON.cpp
blob887e93d6325f55850f4d352ce8e759d60992eecf
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "Swizzle.h"
9 #include <arm_neon.h>
11 namespace mozilla {
12 namespace gfx {
// Load 1-3 pixels into a 4 pixel vector.
//
// aSrc points at aLength consecutive 32-bit pixels. Only those pixels are
// read; the remaining lanes of the returned vector are zero. Any aLength below
// 2 (including 0) takes the single-pixel path and reads 4 bytes, so callers
// must only call this with a non-zero remainder.
static MOZ_ALWAYS_INLINE uint16x8_t LoadRemainder_NEON(const uint8_t* aSrc,
                                                       size_t aLength) {
  const uint32_t* src32 = reinterpret_cast<const uint32_t*>(aSrc);
  uint32x4_t dst32;
  if (aLength >= 2) {
    // Load first 2 pixels into the low half, zeroing the high half.
    dst32 = vcombine_u32(vld1_u32(src32), vdup_n_u32(0));
    // Load third pixel into lane 2.
    if (aLength >= 3) {
      dst32 = vld1q_lane_u32(src32 + 2, dst32, 2);
    }
  } else {
    // Load single pixel into lane 0 of an otherwise zero vector.
    dst32 = vld1q_lane_u32(src32, vdupq_n_u32(0), 0);
  }
  return vreinterpretq_u16_u32(dst32);
}
// Store 1-3 pixels from a vector into memory without overwriting.
//
// Writes exactly the first aLength 32-bit lanes of aSrc to aDst, so memory
// past the remainder is left untouched. Any aLength below 2 (including 0)
// takes the single-pixel path and writes 4 bytes, so callers must only call
// this with a non-zero remainder.
static MOZ_ALWAYS_INLINE void StoreRemainder_NEON(uint8_t* aDst, size_t aLength,
                                                  const uint16x8_t& aSrc) {
  uint32_t* dst32 = reinterpret_cast<uint32_t*>(aDst);
  uint32x4_t src32 = vreinterpretq_u32_u16(aSrc);
  if (aLength >= 2) {
    // Store first 2 pixels
    vst1_u32(dst32, vget_low_u32(src32));
    // Store third pixel
    if (aLength >= 3) {
      vst1q_lane_u32(dst32 + 2, src32, 2);
    }
  } else {
    // Store single pixel
    vst1q_lane_u32(dst32, src32, 0);
  }
}
// Premultiply vector of 4 pixels using splayed math.
//
// Each 32-bit pixel is processed as two 16-bit lanes. The low byte of each
// lane (R and B) and the high byte of each lane (G and A) are separated,
// multiplied by the pixel's alpha with an approximate divide by 255, and then
// recombined.
//
// aSwapRB: swap the R and B components in the output.
// aOpaqueAlpha: force the output alpha to 255 instead of keeping the source
//               alpha.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t
PremultiplyVector_NEON(const uint16x8_t& aSrc) {
  // Isolate R and B with mask.
  const uint16x8_t mask = vdupq_n_u16(0x00FF);
  uint16x8_t rb = vandq_u16(aSrc, mask);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }
  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  uint16x8_t alphas = vtrnq_u16(ga, ga).val[1];

  // rb = rb*a + 255; rb += rb >> 8;
  rb = vmlaq_u16(mask, rb, alphas);
  rb = vsraq_n_u16(rb, rb, 8);

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0x00FF0000)));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = vmlaq_u16(mask, ga, alphas);
  ga = vsraq_n_u16(ga, ga, 8);
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  return vsriq_n_u16(ga, rb, 8);
}
// Premultiply one row: aAlignedRow bytes of whole 4-pixel vectors followed by
// aRemainder (0-3) trailing pixels.
//
// aSrc and aDst are taken by reference and are advanced past the aligned
// portion only; they are NOT advanced past the remainder pixels, so callers
// that continue to another row must account for the remainder themselves.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
                                                    uint8_t*& aDst,
                                                    int32_t aAlignedRow,
                                                    int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}
// Premultiply a single row of aLength 32-bit pixels from aSrc into aDst.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Byte size of the part of the row made of whole 4-pixel vectors.
  int32_t alignedRow = 4 * (aLength & ~3);
  // Number of leftover pixels (0-3) after the whole vectors.
  int32_t remainder = aLength & 3;
  PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                               remainder);
}
// Premultiply an aSize.width x aSize.height surface of 32-bit pixels.
//
// aSrcGap and aDstGap are the byte distances from the end of one row's pixels
// to the start of the next row in the source and destination respectively.
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                      int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap, since the chunk helper does not advance
  // the pointers past the remainder pixels.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                                 remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
135 // Force instantiation of premultiply variants here.
136 template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
137 int32_t);
138 template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
139 int32_t);
140 template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
141 int32_t);
142 template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
143 int32_t);
144 template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
145 int32_t, IntSize);
146 template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
147 int32_t, IntSize);
148 template void Premultiply_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
149 int32_t, IntSize);
150 template void Premultiply_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
151 int32_t, IntSize);
153 // This generates a table of fixed-point reciprocals representing 1/alpha
154 // similar to the fallback implementation. However, the reciprocal must
155 // ultimately be multiplied as an unsigned 9 bit upper part and a signed
156 // 15 bit lower part to cheaply multiply. Thus, the lower 15 bits of the
157 // reciprocal is stored 15 bits of the reciprocal are masked off and
158 // stored in the low word. The upper 9 bits are masked and shifted to fit
159 // into the high word. These then get independently multiplied with the
160 // color component and recombined to provide the full recriprocal multiply.
161 #define UNPREMULQ_NEON(x) \
162 ((((0xFF00FFU / (x)) & 0xFF8000U) << 1) | ((0xFF00FFU / (x)) & 0x7FFFU))
163 #define UNPREMULQ_NEON_2(x) UNPREMULQ_NEON(x), UNPREMULQ_NEON((x) + 1)
164 #define UNPREMULQ_NEON_4(x) UNPREMULQ_NEON_2(x), UNPREMULQ_NEON_2((x) + 2)
165 #define UNPREMULQ_NEON_8(x) UNPREMULQ_NEON_4(x), UNPREMULQ_NEON_4((x) + 4)
166 #define UNPREMULQ_NEON_16(x) UNPREMULQ_NEON_8(x), UNPREMULQ_NEON_8((x) + 8)
167 #define UNPREMULQ_NEON_32(x) UNPREMULQ_NEON_16(x), UNPREMULQ_NEON_16((x) + 16)
168 static const uint32_t sUnpremultiplyTable_NEON[256] = {0,
169 UNPREMULQ_NEON(1),
170 UNPREMULQ_NEON_2(2),
171 UNPREMULQ_NEON_4(4),
172 UNPREMULQ_NEON_8(8),
173 UNPREMULQ_NEON_16(16),
174 UNPREMULQ_NEON_32(32),
175 UNPREMULQ_NEON_32(64),
176 UNPREMULQ_NEON_32(96),
177 UNPREMULQ_NEON_32(128),
178 UNPREMULQ_NEON_32(160),
179 UNPREMULQ_NEON_32(192),
180 UNPREMULQ_NEON_32(224)};
// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
//
// Each pixel's alpha indexes sUnpremultiplyTable_NEON to fetch a split
// fixed-point reciprocal, which is then multiplied into the color components.
// The original alpha byte is copied to the output unchanged.
//
// aSwapRB: swap the R and B components in the output.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE uint16x8_t
UnpremultiplyVector_NEON(const uint16x8_t& aSrc) {
  // Isolate R and B with mask.
  uint16x8_t rb = vandq_u16(aSrc, vdupq_n_u16(0x00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }

  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = vgetq_lane_u16(ga, 1);
  int a2 = vgetq_lane_u16(ga, 3);
  int a3 = vgetq_lane_u16(ga, 5);
  int a4 = vgetq_lane_u16(ga, 7);

  // First load all of the interleaved low and high portions of the reciprocals
  // and combine them into a single vector as lo1 hi1 lo2 hi2 lo3 hi3 lo4 hi4.
  // Pixel N's table entry goes into 32-bit lane N-1.
  uint32x4_t recips = vdupq_n_u32(0);
  recips = vld1q_lane_u32(&sUnpremultiplyTable_NEON[a1], recips, 0);
  recips = vld1q_lane_u32(&sUnpremultiplyTable_NEON[a2], recips, 1);
  recips = vld1q_lane_u32(&sUnpremultiplyTable_NEON[a3], recips, 2);
  recips = vld1q_lane_u32(&sUnpremultiplyTable_NEON[a4], recips, 3);
  uint16x8_t q1234 = vreinterpretq_u16_u32(recips);
  // Transpose the interleaved low/high portions so that we produce
  // two separate duplicated vectors for the low and high portions respectively:
  // lo1 lo1 lo2 lo2 lo3 lo3 lo4 lo4 and hi1 hi1 hi2 hi2 hi3 hi3 hi4 hi4
  uint16x8x2_t q1234lohi = vtrnq_u16(q1234, q1234);

  // VQDMULH is a signed multiply that doubles (*2) the result, then takes the
  // high word. To work around the signedness and the doubling, the low
  // portion of the reciprocal only stores the lower 15 bits, which fits in a
  // signed 16 bit integer. The high 9 bit portion is effectively also doubled
  // by 2 as a side-effect of being shifted for storage. Thus the output scale
  // of doing a normal multiply by the high portion and the VQDMULH by the low
  // portion are both doubled and can be safely added together. The resulting
  // sum just needs to be halved (via VHADD) to thus cancel out the doubling.
  // All this combines to produce a reciprocal multiply of the form:
  // rb = ((rb * hi) + ((rb * lo * 2) >> 16)) / 2
  rb = vhaddq_u16(
      vmulq_u16(rb, q1234lohi.val[1]),
      vreinterpretq_u16_s16(vqdmulhq_s16(
          vreinterpretq_s16_u16(rb), vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // ga = ((ga * hi) + ((ga * lo * 2) >> 16)) / 2
  ga = vhaddq_u16(
      vmulq_u16(ga, q1234lohi.val[1]),
      vreinterpretq_u16_s16(vqdmulhq_s16(
          vreinterpretq_s16_u16(ga), vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // Combine to the final pixel with ((rb | (ga << 8)) & ~0xFF000000) | (aSrc &
  // 0xFF000000), which inserts back in the original alpha value unchanged.
  return vbslq_u16(vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)), aSrc,
                   vsliq_n_u16(rb, ga, 8));
}
// Unpremultiply one row: aAlignedRow bytes of whole 4-pixel vectors followed
// by aRemainder (0-3) trailing pixels.
//
// aSrc and aDst are taken by reference and are advanced past the aligned
// portion only; they are NOT advanced past the remainder pixels.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_NEON(const uint8_t*& aSrc,
                                                      uint8_t*& aDst,
                                                      int32_t aAlignedRow,
                                                      int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = UnpremultiplyVector_NEON<aSwapRB>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = UnpremultiplyVector_NEON<aSwapRB>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}
// Unpremultiply a single row of aLength 32-bit pixels from aSrc into aDst.
template <bool aSwapRB>
void UnpremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  // Byte size of the part of the row made of whole 4-pixel vectors.
  int32_t alignedRow = 4 * (aLength & ~3);
  // Number of leftover pixels (0-3) after the whole vectors.
  int32_t remainder = aLength & 3;
  UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
}
// Unpremultiply an aSize.width x aSize.height surface of 32-bit pixels.
//
// aSrcGap and aDstGap are the byte distances from the end of one row's pixels
// to the start of the next row in the source and destination respectively.
template <bool aSwapRB>
void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap, since the chunk helper does not advance
  // the pointers past the remainder pixels.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
292 // Force instantiation of unpremultiply variants here.
293 template void UnpremultiplyRow_NEON<false>(const uint8_t*, uint8_t*, int32_t);
294 template void UnpremultiplyRow_NEON<true>(const uint8_t*, uint8_t*, int32_t);
295 template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*,
296 int32_t, IntSize);
297 template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
298 int32_t, IntSize);
// Swizzle a vector of 4 pixels providing swaps and opaquifying.
//
// NOTE: this generic implementation always swaps R and B; it does not consult
// aSwapRB. Only the aSwapRB = true variants are instantiated in this file.
//
// aOpaqueAlpha: force the output alpha to 255 instead of keeping the source
//               alpha.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
  // Swap R and B, then add to G and A (forced to 255):
  // (((src>>16) | (src << 16)) & 0x00FF00FF) |
  //   ((src | 0xFF000000) & ~0x00FF00FF)
  return vbslq_u16(
      vdupq_n_u16(0x00FF), vrev32q_u16(aSrc),
      aOpaqueAlpha
          ? vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)))
          : aSrc);
}
#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
{
  // Force alpha to 255.
  return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
}

template<>
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
{
  return aSrc;
}

#endif
// Swizzle one row: aAlignedRow bytes of whole 4-pixel vectors followed by
// aRemainder (0-3) trailing pixels.
//
// aSrc and aDst are taken by reference and are advanced past the aligned
// portion only; they are NOT advanced past the remainder pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
                                                uint8_t*& aDst,
                                                int32_t aAlignedRow,
                                                int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}
// Swizzle a single row of aLength 32-bit pixels from aSrc into aDst.
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Byte size of the part of the row made of whole 4-pixel vectors.
  int32_t alignedRow = 4 * (aLength & ~3);
  // Number of leftover pixels (0-3) after the whole vectors.
  int32_t remainder = aLength & 3;
  SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
// Swizzle an aSize.width x aSize.height surface of 32-bit pixels.
//
// aSrcGap and aDstGap are the byte distances from the end of one row's pixels
// to the start of the next row in the source and destination respectively.
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                  int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap, since the chunk helper does not advance
  // the pointers past the remainder pixels.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
379 // Force instantiation of swizzle variants here.
380 template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
381 template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
382 template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
383 int32_t, IntSize);
384 template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
385 int32_t, IntSize);
// Declaration of the row unpacker that UnpackRowRGB24_NEON falls back to for
// short rows and row tails; it is only declared here, not defined in this
// file.
template <bool aSwapRB>
void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);
// Expand a row of aLength packed 3-byte pixels at aSrc into 4-byte pixels at
// aDst, setting the added fourth byte of each pixel to 0xFF.
//
// aSwapRB: reverse the order of the three source bytes within each pixel.
//
// The row is processed back to front so that the expansion also works when
// it is performed in place.
template <bool aSwapRB>
void UnpackRowRGB24_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Because this implementation will read an additional 4 bytes of data that
  // is ignored and masked over, we cannot use the accelerated version for the
  // last 1-5 pixels (3-15 bytes remaining) to guarantee we don't access memory
  // outside the buffer (we read in 16 byte chunks).
  if (aLength < 6) {
    UnpackRowRGB24<aSwapRB>(aSrc, aDst, aLength);
    return;
  }

  // Because we are expanding, we can only process the data back to front in
  // case we are performing this in place.
  int32_t alignedRow = (aLength - 2) & ~3;
  int32_t remainder = aLength - alignedRow;

  const uint8_t* src = aSrc + alignedRow * 3;
  uint8_t* dst = aDst + alignedRow * 4;

  // Handle 2-5 remaining pixels.
  UnpackRowRGB24<aSwapRB>(src, dst, remainder);

  // Byte-shuffle tables for the low and high pixel pairs of each chunk. The
  // fourth index of every pixel is a placeholder; that byte is overwritten by
  // the alpha mask below.
  uint8x8_t masklo;
  uint8x8_t maskhi;
  if (aSwapRB) {
    static const uint8_t masklo_data[] = {2, 1, 0, 0, 5, 4, 3, 0};
    static const uint8_t maskhi_data[] = {4, 3, 2, 0, 7, 6, 5, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  } else {
    static const uint8_t masklo_data[] = {0, 1, 2, 0, 3, 4, 5, 0};
    static const uint8_t maskhi_data[] = {2, 3, 4, 0, 5, 6, 7, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  }

  uint8x16_t alpha = vreinterpretq_u8_u32(vdupq_n_u32(0xFF000000));

  // Process all 4-pixel chunks as one vector, from the last chunk to the
  // first. The loop counts chunks rather than comparing pointers so that src
  // and dst are never moved to before the start of their buffers.
  for (int32_t chunks = alignedRow / 4; chunks > 0; chunks--) {
    src -= 4 * 3;
    dst -= 4 * 4;
    uint8x16_t px = vld1q_u8(src);
    // G2R2B1G1 R1B0G0R0 -> X1R1G1B1 X0R0G0B0
    uint8x8_t pxlo = vtbl1_u8(vget_low_u8(px), masklo);
    // B3G3R3B2 G2R2B1G1 -> X3R3G3B3 X2R2G2B2
    uint8x8_t pxhi =
        vtbl1_u8(vext_u8(vget_low_u8(px), vget_high_u8(px), 4), maskhi);
    px = vcombine_u8(pxlo, pxhi);
    px = vorrq_u8(px, alpha);
    vst1q_u8(dst, px);
  }
}
446 // Force instantiation of swizzle variants here.
447 template void UnpackRowRGB24_NEON<false>(const uint8_t*, uint8_t*, int32_t);
448 template void UnpackRowRGB24_NEON<true>(const uint8_t*, uint8_t*, int32_t);
450 } // namespace gfx
451 } // namespace mozilla