/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Swizzle.h"

#include <emmintrin.h>

namespace mozilla::gfx {
// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE __m128i LoadRemainder_SSE2(const uint8_t* aSrc,
                                                    size_t aLength) {
  __m128i px;
  if (aLength >= 2) {
    // Load first 2 pixels
    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
    // Load third pixel
    if (aLength >= 3) {
      px = _mm_unpacklo_epi64(
          px,
          _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
    }
  } else {
    // Load single pixel
    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
  }
  return px;
}
// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void StoreRemainder_SSE2(uint8_t* aDst,
                                                  size_t aLength,
                                                  const __m128i& aSrc) {
  if (aLength >= 2) {
    // Store first 2 pixels
    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
    // Store third pixel
    if (aLength >= 3) {
      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
          _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
    }
  } else {
    // Store single pixel
    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
  }
}
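// Note: the "splayed" math used below expands each 8-bit channel into its own
// 16-bit lane, with R/B in one vector and G/A in another, so that the 8-bit by
// 8-bit products used for (un)premultiplying cannot overflow a lane.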
// Premultiply vector of 4 pixels using splayed math.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B with mask.
  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
  __m128i rb = _mm_and_si128(mask, aSrc);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));
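  // The statements below closely approximate x * a / 255 per 16-bit lane
  // without a division: t = x * a + 255; t += t >> 8; the final t >> 8 is
  // folded into the recombination step (rb is shifted right by 8, while ga
  // keeps only its high bytes).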
  // rb = rb*a + 255; rb += rb >> 8;
  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  rb = _mm_srli_epi16(rb, 8);
  ga = _mm_andnot_si128(mask, ga);
  return _mm_or_si128(rb, ga);
}
// Premultiply vector of aAlignedRow + aRemainder pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
                                                    uint8_t*& aDst,
                                                    int32_t aAlignedRow,
                                                    int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}
// Premultiply vector of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
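  // Note that alignedRow is a byte count (4 bytes per pixel) used to advance
  // the pointers, while remainder counts whole pixels for the tail handling.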
  PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                               remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                      int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;
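  // PremultiplyChunk_SSE2 only advances the pointers past the aligned pixels,
  // so widening the gaps by 4 * remainder bytes also steps over the trailing
  // pixels handled by Load/StoreRemainder_SSE2.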
  for (int32_t height = aSize.height; height > 0; height--) {
    PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                                 remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
// Force instantiation of premultiply variants here.
template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
                                                int32_t);
template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
                                              int32_t);
template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
                                             int32_t, IntSize);
template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
                                           int32_t, IntSize);
// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must fit
// in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
// require more bits than for larger alphas. We take advantage of this by
// shifting the reciprocal down by either 3 or 8 bits depending on whether
// the alpha value is less than 0x20. This is easy to then undo by multiplying
// the color component to be unpremultiplied by either 8 or 0x100,
// respectively. The 16 bit reciprocal is duplicated into both words of a
// uint32_t here to reduce unpacking overhead.
#define UNPREMULQ_SSE2(x) \
  (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
#define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
#define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
#define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
#define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
#define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
static const uint32_t sUnpremultiplyTable_SSE2[256] = {0,
                                                       UNPREMULQ_SSE2(1),
                                                       UNPREMULQ_SSE2_2(2),
                                                       UNPREMULQ_SSE2_4(4),
                                                       UNPREMULQ_SSE2_8(8),
                                                       UNPREMULQ_SSE2_16(16),
                                                       UNPREMULQ_SSE2_32(32),
                                                       UNPREMULQ_SSE2_32(64),
                                                       UNPREMULQ_SSE2_32(96),
                                                       UNPREMULQ_SSE2_32(128),
                                                       UNPREMULQ_SSE2_32(160),
                                                       UNPREMULQ_SSE2_32(192),
                                                       UNPREMULQ_SSE2_32(224)};
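// For example, UNPREMULQ_SSE2(0x80) = 0x10001 * (0xFF0220 / (0x80 * 8))
// = 0x10001 * 0x3FC0, i.e. the reciprocal 0x3FC0 duplicated into both 16-bit
// halves. Unpremultiplying a component c at alpha 0x80 then computes
// (c * 8 * 0x3FC0) >> 16, which equals floor(c * 255 / 0x80).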
// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE __m128i UnpremultiplyVector_SSE2(
    const __m128i& aSrc) {
  // Isolate R and B with mask.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  }

  // Isolate G and A by shifting down to bottom of word.
  __m128i ga = _mm_srli_epi16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = _mm_extract_epi16(ga, 1);
  int a2 = _mm_extract_epi16(ga, 3);
  int a3 = _mm_extract_epi16(ga, 5);
  int a4 = _mm_extract_epi16(ga, 7);

  // Load the 16 bit reciprocals from the table for each alpha.
  // The reciprocals are duplicated in each uint32_t entry.
  // Unpack them to a final vector of duplicated reciprocals of
  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
  __m128i q12 =
      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
  __m128i q34 =
      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);
  // Check if the alphas are less than 0x20, so that we can undo
  // scaling of the reciprocals as appropriate.
  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
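  // The compare works on whole 32-bit lanes: each lane of ga is (A << 16) | G
  // with G < 0x100, so a lane is below 0x00200000 exactly when A < 0x20.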
  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
  // such that scale is 0x100 if a < 0x20, and 8 otherwise.
  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
  // Isolate G now so that we don't accidentally unpremultiply A.
  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));

  // Scale R, B, and G as required depending on reciprocal precision.
  rb = _mm_mullo_epi16(rb, scale);
  ga = _mm_mullo_epi16(ga, scale);

  // Multiply R, B, and G by the reciprocal, only taking the high word
  // to effectively shift right by 16.
  rb = _mm_mulhi_epu16(rb, q1234);
  ga = _mm_mulhi_epu16(ga, q1234);
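  // Net effect per component: (c * scale * (0xFF0220 / (a * scale))) >> 16,
  // and since 0xFF0220 / 0x10000 is just over 255, this approximates
  // c * 255 / a.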
  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
  // which will add back on the original alpha value unchanged.
  ga = _mm_slli_si128(ga, 1);
  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
  return _mm_or_si128(rb, ga);
}
// Unpremultiply vector of aAlignedRow + aRemainder pixels.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_SSE2(const uint8_t*& aSrc,
                                                      uint8_t*& aDst,
                                                      int32_t aAlignedRow,
                                                      int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}
// Unpremultiply vector of aLength pixels.
template <bool aSwapRB>
void UnpremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
// Force instantiation of unpremultiply variants here.
template void UnpremultiplyRow_SSE2<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpremultiplyRow_SSE2<true>(const uint8_t*, uint8_t*, int32_t);
template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);
// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2(const __m128i& aSrc) {
  // Isolate R and B.
  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
  // Swap R and B.
  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
  // Isolate G and A.
  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
  // Force alpha to 255 if necessary.
  if (aOpaqueAlpha) {
    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
  }
  // Combine everything back together.
  return _mm_or_si128(rb, ga);
}
#if 0
// These specializations currently do not profile faster than the generic
// versions, so disable them for now.

// Optimized implementations for when there is no R and B swap.
template <>
MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2<false, true>(
    const __m128i& aSrc) {
  // Force alpha to 255.
  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
}

template <>
MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2<false, false>(
    const __m128i& aSrc) {
  // No swap or opaquify needed, so pass the pixels through unchanged.
  return aSrc;
}
#endif
// Swizzle vector of aAlignedRow + aRemainder pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
                                                uint8_t*& aDst,
                                                int32_t aAlignedRow,
                                                int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_SSE2(aDst, aRemainder, px);
  }
}
// Swizzle vector of aLength pixels.
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                  int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
// Force instantiation of swizzle variants here.
template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

}  // namespace mozilla::gfx