1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Blur.h"

#include "SSEHelpers.h"
13 namespace mozilla::gfx
{
16 __m128i
Divide(__m128i aValues
, __m128i aDivisor
) {
17 const __m128i mask
= _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
21 } roundingAddition
= {{int64_t(1) << 31, int64_t(1) << 31}};
23 __m128i multiplied31
= _mm_mul_epu32(aValues
, aDivisor
);
24 __m128i multiplied42
= _mm_mul_epu32(_mm_srli_epi64(aValues
, 32), aDivisor
);
26 // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
29 _mm_srli_epi64(_mm_add_epi64(multiplied31
, roundingAddition
.m
), 32);
31 _mm_and_si128(_mm_add_epi64(multiplied42
, roundingAddition
.m
), mask
);
32 __m128i p4321
= _mm_or_si128(p_3_1
, p4_2_
);
37 __m128i
BlurFourPixels(const __m128i
& aTopLeft
, const __m128i
& aTopRight
,
38 const __m128i
& aBottomRight
, const __m128i
& aBottomLeft
,
39 const __m128i
& aDivisor
) {
40 __m128i values
= _mm_add_epi32(
41 _mm_sub_epi32(_mm_sub_epi32(aBottomRight
, aTopRight
), aBottomLeft
),
43 return Divide(values
, aDivisor
);
47 void LoadIntegralRowFromRow(uint32_t* aDest
, const uint8_t* aSource
,
48 int32_t aSourceWidth
, int32_t aLeftInflation
,
49 int32_t aRightInflation
) {
50 int32_t currentRowSum
= 0;
52 for (int x
= 0; x
< aLeftInflation
; x
++) {
53 currentRowSum
+= aSource
[0];
54 aDest
[x
] = currentRowSum
;
56 for (int x
= aLeftInflation
; x
< (aSourceWidth
+ aLeftInflation
); x
++) {
57 currentRowSum
+= aSource
[(x
- aLeftInflation
)];
58 aDest
[x
] = currentRowSum
;
60 for (int x
= (aSourceWidth
+ aLeftInflation
);
61 x
< (aSourceWidth
+ aLeftInflation
+ aRightInflation
); x
++) {
62 currentRowSum
+= aSource
[aSourceWidth
- 1];
63 aDest
[x
] = currentRowSum
;
67 // This function calculates an integral of four pixels stored in the 4
68 // 32-bit integers on aPixels. i.e. for { 30, 50, 80, 100 } this returns
69 // { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
72 __m128i
AccumulatePixelSums(__m128i aPixels
) {
73 __m128i sumPixels
= aPixels
;
74 __m128i currentPixels
= _mm_slli_si128(aPixels
, 4);
75 sumPixels
= _mm_add_epi32(sumPixels
, currentPixels
);
76 currentPixels
= _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels
);
78 return _mm_add_epi32(sumPixels
, currentPixels
);
81 MOZ_ALWAYS_INLINE
void GenerateIntegralImage_SSE2(
82 int32_t aLeftInflation
, int32_t aRightInflation
, int32_t aTopInflation
,
83 int32_t aBottomInflation
, uint32_t* aIntegralImage
,
84 size_t aIntegralImageStride
, uint8_t* aSource
, int32_t aSourceStride
,
85 const IntSize
& aSize
) {
86 MOZ_ASSERT(!(aLeftInflation
& 3));
88 uint32_t stride32bit
= aIntegralImageStride
/ 4;
90 IntSize
integralImageSize(aSize
.width
+ aLeftInflation
+ aRightInflation
,
91 aSize
.height
+ aTopInflation
+ aBottomInflation
);
93 LoadIntegralRowFromRow(aIntegralImage
, aSource
, aSize
.width
, aLeftInflation
,
96 for (int y
= 1; y
< aTopInflation
+ 1; y
++) {
97 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
98 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
99 uint32_t* intFirstRow
= aIntegralImage
;
101 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
102 __m128i firstRow
= _mm_load_si128((__m128i
*)(intFirstRow
+ x
));
103 __m128i previousRow
= _mm_load_si128((__m128i
*)(intPrevRow
+ x
));
104 _mm_store_si128((__m128i
*)(intRow
+ x
),
105 _mm_add_epi32(firstRow
, previousRow
));
109 for (int y
= aTopInflation
+ 1; y
< (aSize
.height
+ aTopInflation
); y
++) {
110 __m128i currentRowSum
= _mm_setzero_si128();
111 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
112 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
113 uint8_t* sourceRow
= aSource
+ aSourceStride
* (y
- aTopInflation
);
115 uint32_t pixel
= sourceRow
[0];
116 for (int x
= 0; x
< aLeftInflation
; x
+= 4) {
117 __m128i sumPixels
= AccumulatePixelSums(
118 _mm_shuffle_epi32(_mm_set1_epi32(pixel
), _MM_SHUFFLE(0, 0, 0, 0)));
120 sumPixels
= _mm_add_epi32(sumPixels
, currentRowSum
);
122 currentRowSum
= _mm_shuffle_epi32(sumPixels
, _MM_SHUFFLE(3, 3, 3, 3));
125 (__m128i
*)(intRow
+ x
),
126 _mm_add_epi32(sumPixels
, _mm_load_si128((__m128i
*)(intPrevRow
+ x
))));
128 for (int x
= aLeftInflation
; x
< (aSize
.width
+ aLeftInflation
); x
+= 4) {
129 uint32_t pixels
= *(uint32_t*)(sourceRow
+ (x
- aLeftInflation
));
131 // It's important to shuffle here. When we exit this loop currentRowSum
132 // has to be set to sumPixels, so that the following loop can get the
133 // correct pixel for the currentRowSum. The highest order pixel in
134 // currentRowSum could've originated from accumulation in the stride.
135 currentRowSum
= _mm_shuffle_epi32(currentRowSum
, _MM_SHUFFLE(3, 3, 3, 3));
137 __m128i sumPixels
= AccumulatePixelSums(_mm_unpacklo_epi16(
138 _mm_unpacklo_epi8(_mm_set1_epi32(pixels
), _mm_setzero_si128()),
139 _mm_setzero_si128()));
140 sumPixels
= _mm_add_epi32(sumPixels
, currentRowSum
);
142 currentRowSum
= sumPixels
;
145 (__m128i
*)(intRow
+ x
),
146 _mm_add_epi32(sumPixels
, _mm_load_si128((__m128i
*)(intPrevRow
+ x
))));
149 pixel
= sourceRow
[aSize
.width
- 1];
150 int x
= (aSize
.width
+ aLeftInflation
);
151 if ((aSize
.width
& 3)) {
152 // Deal with unaligned portion. Get the correct pixel from currentRowSum,
153 // see explanation above.
154 uint32_t intCurrentRowSum
=
155 ((uint32_t*)¤tRowSum
)[(aSize
.width
% 4) - 1];
156 for (; x
< integralImageSize
.width
; x
++) {
157 // We could be unaligned here!
160 currentRowSum
= _mm_set1_epi32(intCurrentRowSum
);
163 intCurrentRowSum
+= pixel
;
164 intRow
[x
] = intPrevRow
[x
] + intCurrentRowSum
;
167 currentRowSum
= _mm_shuffle_epi32(currentRowSum
, _MM_SHUFFLE(3, 3, 3, 3));
169 for (; x
< integralImageSize
.width
; x
+= 4) {
170 __m128i sumPixels
= AccumulatePixelSums(_mm_set1_epi32(pixel
));
172 sumPixels
= _mm_add_epi32(sumPixels
, currentRowSum
);
174 currentRowSum
= _mm_shuffle_epi32(sumPixels
, _MM_SHUFFLE(3, 3, 3, 3));
177 (__m128i
*)(intRow
+ x
),
178 _mm_add_epi32(sumPixels
, _mm_load_si128((__m128i
*)(intPrevRow
+ x
))));
182 if (aBottomInflation
) {
183 // Store the last valid row of our source image in the last row of
184 // our integral image. This will be overwritten with the correct values
185 // in the upcoming loop.
186 LoadIntegralRowFromRow(
187 aIntegralImage
+ (integralImageSize
.height
- 1) * stride32bit
,
188 aSource
+ (aSize
.height
- 1) * aSourceStride
, aSize
.width
,
189 aLeftInflation
, aRightInflation
);
191 for (int y
= aSize
.height
+ aTopInflation
; y
< integralImageSize
.height
;
193 __m128i
* intRow
= (__m128i
*)(aIntegralImage
+ (y
* stride32bit
));
194 __m128i
* intPrevRow
= (__m128i
*)(aIntegralImage
+ (y
- 1) * stride32bit
);
195 __m128i
* intLastRow
=
196 (__m128i
*)(aIntegralImage
+
197 (integralImageSize
.height
- 1) * stride32bit
);
199 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
200 _mm_store_si128(intRow
+ (x
/ 4),
201 _mm_add_epi32(_mm_load_si128(intLastRow
+ (x
/ 4)),
202 _mm_load_si128(intPrevRow
+ (x
/ 4))));
209 * Attempt to do an in-place box blur using an integral image.
211 void AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData
, int32_t aLeftLobe
,
212 int32_t aRightLobe
, int32_t aTopLobe
,
213 int32_t aBottomLobe
, uint32_t* aIntegralImage
,
214 size_t aIntegralImageStride
) const {
215 IntSize size
= GetSize();
217 MOZ_ASSERT(size
.height
> 0);
219 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
220 // looking at an integral image the value of a pixel at 'x,y' is calculated
221 // using the value of the integral image values above/below that.
224 int32_t boxSize
= (aLeftLobe
+ aRightLobe
) * (aTopLobe
+ aBottomLobe
);
226 MOZ_ASSERT(boxSize
> 0);
232 uint32_t reciprocal
= uint32_t((uint64_t(1) << 32) / boxSize
);
234 uint32_t stride32bit
= aIntegralImageStride
/ 4;
235 int32_t leftInflation
= RoundUpToMultipleOf4(aLeftLobe
).value();
237 GenerateIntegralImage_SSE2(leftInflation
, aRightLobe
, aTopLobe
, aBottomLobe
,
238 aIntegralImage
, aIntegralImageStride
, aData
,
241 __m128i divisor
= _mm_set1_epi32(reciprocal
);
243 // This points to the start of the rectangle within the IntegralImage that
244 // overlaps the surface being blurred.
245 uint32_t* innerIntegral
=
246 aIntegralImage
+ (aTopLobe
* stride32bit
) + leftInflation
;
248 IntRect skipRect
= mSkipRect
;
249 int32_t stride
= mStride
;
250 uint8_t* data
= aData
;
251 for (int32_t y
= 0; y
< size
.height
; y
++) {
252 // Not using ContainsY(y) because we do not skip y == skipRect.Y()
253 // although that may not be done on purpose
254 bool inSkipRectY
= y
> skipRect
.Y() && y
< skipRect
.YMost();
256 uint32_t* topLeftBase
=
257 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
258 uint32_t* topRightBase
=
259 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
260 uint32_t* bottomRightBase
=
262 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
263 uint32_t* bottomLeftBase
=
265 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
268 // Process 16 pixels at a time for as long as possible.
269 for (; x
<= size
.width
- 16; x
+= 16) {
270 // Not using ContainsX(x) because we do not skip x == skipRect.X()
271 // although that may not be done on purpose
272 if (inSkipRectY
&& x
> skipRect
.X() && x
< skipRect
.XMost()) {
273 x
= skipRect
.XMost() - 16;
274 // Trigger early jump on coming loop iterations, this will be reset
285 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
));
286 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
));
287 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
));
288 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
));
290 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
292 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 4));
293 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 4));
294 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 4));
295 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 4));
297 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
299 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 8));
300 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 8));
301 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 8));
302 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 8));
304 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
306 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 12));
307 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 12));
308 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 12));
309 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 12));
311 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
313 __m128i final
= _mm_packus_epi16(_mm_packs_epi32(result1
, result2
),
314 _mm_packs_epi32(result3
, result4
));
316 _mm_storeu_si128((__m128i
*)(data
+ stride
* y
+ x
), final
);
319 // Process the remaining pixels 4 bytes at a time.
320 for (; x
< size
.width
; x
+= 4) {
321 // Not using Containsx(x) because we do not skip x == skipRect.X()
322 // although that may not be done on purpose
323 if (inSkipRectY
&& x
> skipRect
.X() && x
< skipRect
.XMost()) {
324 x
= skipRect
.XMost() - 4;
325 // Trigger early jump on coming loop iterations, this will be reset
330 __m128i topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
));
331 __m128i topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
));
332 __m128i bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
));
333 __m128i bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
));
336 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
337 __m128i final
= _mm_packus_epi16(
338 _mm_packs_epi32(result
, _mm_setzero_si128()), _mm_setzero_si128());
340 *(uint32_t*)(data
+ stride
* y
+ x
) = _mm_cvtsi128_si32(final
);
345 } // namespace mozilla::gfx