/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// Divide: fixed-point multiply of four 32-bit lanes by aDivisor.
//
// Each lane of aValues is widened-multiplied by aDivisor into a 64-bit
// product, 2^31 is added as a rounding term, and the sum is shifted right by
// 32 while narrowing back to 32 bits. BoxBlur_NEON passes a divisor of
// 2^32 / boxSize, which makes this a rounded division by boxSize.
//
// NOTE(review): this view of the file has source line numbers fused onto the
// start of statements, and source line 18 is absent. That line presumably
// holds the `return` plus the call that narrows the combined uint32x4_t to
// the uint16x4_t return type; the closing brace is missing too. Confirm
// against the pristine file.
14 uint16x4_t
Divide(uint32x4_t aValues
, uint32x2_t aDivisor
) {
// 1 << 31 is one half in 32.32 fixed point: round to nearest, not truncate.
15 uint64x2_t roundingAddition
= vdupq_n_u64(int64_t(1) << 31);
// Widening multiply of the two low lanes.
16 uint64x2_t multiplied21
= vmull_u32(vget_low_u32(aValues
), aDivisor
);
// Widening multiply of the two high lanes.
17 uint64x2_t multiplied43
= vmull_u32(vget_high_u32(aValues
), aDivisor
);
// Round, keep the upper 32 bits of each product, and recombine the halves.
19 vcombine_u32(vshrn_n_u64(vaddq_u64(multiplied21
, roundingAddition
), 32),
20 vshrn_n_u64(vaddq_u64(multiplied43
, roundingAddition
), 32)));
24 uint16x4_t
BlurFourPixels(const uint32x4_t
& aTopLeft
,
25 const uint32x4_t
& aTopRight
,
26 const uint32x4_t
& aBottomRight
,
27 const uint32x4_t
& aBottomLeft
,
28 const uint32x2_t
& aDivisor
) {
29 uint32x4_t values
= vaddq_u32(
30 vsubq_u32(vsubq_u32(aBottomRight
, aTopRight
), aBottomLeft
), aTopLeft
);
31 return Divide(values
, aDivisor
);
// LoadIntegralRowFromRow: write the running (prefix) sum of one padded source
// row into aDest.
//
// The logical input row is:
//   aLeftInflation  copies of aSource[0],
//   the aSourceWidth bytes of aSource,
//   aRightInflation copies of aSource[aSourceWidth - 1].
// aDest receives aLeftInflation + aSourceWidth + aRightInflation values, each
// being the sum of all logical input pixels up to and including that column.
//
// The accumulator is unsigned so that it matches the destination element type
// and wraps on overflow instead of hitting signed-overflow undefined
// behaviour.
void LoadIntegralRowFromRow(uint32_t* aDest, const uint8_t* aSource,
                            int32_t aSourceWidth, int32_t aLeftInflation,
                            int32_t aRightInflation) {
  uint32_t currentRowSum = 0;

  // Left padding: replicate the first source pixel.
  for (int x = 0; x < aLeftInflation; x++) {
    currentRowSum += aSource[0];
    aDest[x] = currentRowSum;
  }

  // The source pixels themselves.
  for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
    currentRowSum += aSource[(x - aLeftInflation)];
    aDest[x] = currentRowSum;
  }

  // Right padding: replicate the last source pixel.
  for (int x = (aSourceWidth + aLeftInflation);
       x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
    currentRowSum += aSource[aSourceWidth - 1];
    aDest[x] = currentRowSum;
  }
}
// GenerateIntegralImage_NEON: build an integral image of the 8-bit aSource in
// aIntegralImage, padded by the given inflation amounts on each side.
//
// The integral image is (aSize.width + aLeftInflation + aRightInflation) wide
// and (aSize.height + aTopInflation + aBottomInflation) high.
// aIntegralImageStride is divided by 4 to obtain the row stride in uint32_t
// elements. Padding columns replicate the first/last pixel of the source row.
//
// NOTE(review): this view of the file has source line numbers fused onto the
// start of statements and is missing a number of source lines (closing
// braces, the declaration of `temp`, the tail of one call, parts of the
// unaligned-tail branch). The comments below describe only what is visible;
// confirm the gaps against the pristine file.
55 MOZ_ALWAYS_INLINE
void GenerateIntegralImage_NEON(
56 int32_t aLeftInflation
, int32_t aRightInflation
, int32_t aTopInflation
,
57 int32_t aBottomInflation
, uint32_t* aIntegralImage
,
58 size_t aIntegralImageStride
, uint8_t* aSource
, int32_t aSourceStride
,
59 const IntSize
& aSize
) {
// The left-inflation loops below advance x in steps of 4, so the left
// inflation must be a multiple of 4.
60 MOZ_ASSERT(!(aLeftInflation
& 3));
// Row stride of the integral image measured in uint32_t elements.
62 uint32_t stride32bit
= aIntegralImageStride
/ 4;
// Full dimensions of the integral image including all padding.
63 IntSize
integralImageSize(aSize
.width
+ aLeftInflation
+ aRightInflation
,
64 aSize
.height
+ aTopInflation
+ aBottomInflation
);
// Fill integral row 0 from source row 0.
// NOTE(review): the remainder of this call's argument list (source line 67)
// is not visible in this view.
66 LoadIntegralRowFromRow(aIntegralImage
, aSource
, aSize
.width
, aLeftInflation
,
// Rows 1..aTopInflation: each row is the previous row plus integral row 0,
// computed four lanes at a time.
69 for (int y
= 1; y
< aTopInflation
+ 1; y
++) {
70 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
71 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
72 uint32_t* intFirstRow
= aIntegralImage
;
74 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
75 uint32x4_t firstRow
= vld1q_u32(intFirstRow
+ x
);
76 uint32x4_t previousRow
= vld1q_u32(intPrevRow
+ x
);
77 vst1q_u32(intRow
+ x
, vaddq_u32(firstRow
, previousRow
));
// Rows that consume new source data. Integral row y reads source row
// (y - aTopInflation), so source rows 1..aSize.height-1 are handled here.
81 for (int y
= aTopInflation
+ 1; y
< (aSize
.height
+ aTopInflation
); y
++) {
// Running sum along the current row, carried between groups of four.
82 uint32x4_t currentRowSum
= vdupq_n_u32(0);
83 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
84 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
85 uint8_t* sourceRow
= aSource
+ aSourceStride
* (y
- aTopInflation
);
// Left padding replicates the first pixel of this source row.
87 uint32_t pixel
= sourceRow
[0];
88 for (int x
= 0; x
< aLeftInflation
; x
+= 4) {
// NOTE(review): the declaration of `temp` and the assignment of temp[0]
// (source lines 89-90) are missing from this view.
91 temp
[1] = temp
[0] + pixel
;
92 temp
[2] = temp
[1] + pixel
;
93 temp
[3] = temp
[2] + pixel
;
// Add the carry from the previous group, broadcast the new last lane as the
// next carry, then add the row above to form the integral values.
94 uint32x4_t sumPixels
= vld1q_u32(temp
);
95 sumPixels
= vaddq_u32(sumPixels
, currentRowSum
);
96 currentRowSum
= vdupq_n_u32(vgetq_lane_u32(sumPixels
, 3));
97 vst1q_u32(intRow
+ x
, vaddq_u32(sumPixels
, vld1q_u32(intPrevRow
+ x
)));
// Source pixels proper, four at a time. The last iteration reads up to
// sourceRow[(x - aLeftInflation) + 3], which lies past aSize.width when the
// width is not a multiple of 4 (presumably the caller guarantees readable
// padding; confirm).
100 for (int x
= aLeftInflation
; x
< (aSize
.width
+ aLeftInflation
); x
+= 4) {
101 // It's important to shuffle here. When we exit this loop currentRowSum
102 // has to be set to sumPixels, so that the following loop can get the
103 // correct pixel for the currentRowSum. The highest order pixel in
104 // currentRowSum could've originated from accumulation in the stride.
105 currentRowSum
= vdupq_n_u32(vgetq_lane_u32(currentRowSum
, 3));
// Prefix sum of the next four source bytes.
// NOTE(review): source lines 106-107 (presumably the `temp` declaration) are
// missing from this view.
108 temp
[0] = *(sourceRow
+ (x
- aLeftInflation
));
109 temp
[1] = temp
[0] + *(sourceRow
+ (x
- aLeftInflation
) + 1);
110 temp
[2] = temp
[1] + *(sourceRow
+ (x
- aLeftInflation
) + 2);
111 temp
[3] = temp
[2] + *(sourceRow
+ (x
- aLeftInflation
) + 3);
112 uint32x4_t sumPixels
= vld1q_u32(temp
);
113 sumPixels
= vaddq_u32(sumPixels
, currentRowSum
);
// Keep all four lanes (not just the last) so the unaligned tail below can
// pick the lane matching the real end of the row.
114 currentRowSum
= sumPixels
;
115 vst1q_u32(intRow
+ x
, vaddq_u32(sumPixels
, vld1q_u32(intPrevRow
+ x
)));
// Right padding replicates the last pixel of this source row.
118 pixel
= sourceRow
[aSize
.width
- 1];
119 int x
= (aSize
.width
+ aLeftInflation
);
120 if ((aSize
.width
& 3)) {
121 // Deal with unaligned portion. Get the correct pixel from currentRowSum,
122 // see explanation above.
// NOTE(review): `¤tRowSum` below looks like a mis-decoded `&currentRowSum`
// (the `&curren` prefix read as an HTML entity); confirm against the
// pristine file.
123 uint32_t intCurrentRowSum
=
124 ((uint32_t*)¤tRowSum
)[(aSize
.width
% 4) - 1];
// Scalar loop over the padding columns.
// NOTE(review): source lines 127-128, 130-131 and 134-135 are missing from
// this view, so the condition under which currentRowSum is re-broadcast
// (and any early exit) cannot be seen here.
125 for (; x
< integralImageSize
.width
; x
++) {
126 // We could be unaligned here!
129 currentRowSum
= vdupq_n_u32(intCurrentRowSum
);
132 intCurrentRowSum
+= pixel
;
133 intRow
[x
] = intPrevRow
[x
] + intCurrentRowSum
;
// Broadcast the last lane as the carry for the vector loop below.
// NOTE(review): the branch this statement belongs to is not visible in this
// view.
136 currentRowSum
= vdupq_n_u32(vgetq_lane_u32(currentRowSum
, 3));
// Remaining right-padding columns, four at a time.
139 for (; x
< integralImageSize
.width
; x
+= 4) {
// NOTE(review): the declaration of `temp` and the assignment of temp[0]
// (source lines 140-141) are missing from this view.
142 temp
[1] = temp
[0] + pixel
;
143 temp
[2] = temp
[1] + pixel
;
144 temp
[3] = temp
[2] + pixel
;
145 uint32x4_t sumPixels
= vld1q_u32(temp
);
146 sumPixels
= vaddq_u32(sumPixels
, currentRowSum
);
147 currentRowSum
= vdupq_n_u32(vgetq_lane_u32(sumPixels
, 3));
148 vst1q_u32(intRow
+ x
, vaddq_u32(sumPixels
, vld1q_u32(intPrevRow
+ x
)));
// Bottom padding: only needed when there is a bottom inflation.
152 if (aBottomInflation
) {
153 // Store the last valid row of our source image in the last row of
154 // our integral image. This will be overwritten with the correct values
155 // in the upcoming loop.
156 LoadIntegralRowFromRow(
157 aIntegralImage
+ (integralImageSize
.height
- 1) * stride32bit
,
158 aSource
+ (aSize
.height
- 1) * aSourceStride
, aSize
.width
,
159 aLeftInflation
, aRightInflation
);
// Each bottom-padding row is the previous row plus the row sums stored in
// the last integral row above; the last row ends up adding to itself.
// NOTE(review): the loop increment (source line 162) is missing from this
// view.
161 for (int y
= aSize
.height
+ aTopInflation
; y
< integralImageSize
.height
;
163 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
164 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
165 uint32_t* intLastRow
=
166 aIntegralImage
+ (integralImageSize
.height
- 1) * stride32bit
;
167 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
168 vst1q_u32(intRow
+ x
, vaddq_u32(vld1q_u32(intLastRow
+ x
),
169 vld1q_u32(intPrevRow
+ x
)));
/**
 * Attempt to do an in-place box blur using an integral image.
 */
// BoxBlur_NEON: box-blur the 8-bit surface at aData in place, using
// aIntegralImage as scratch storage for an integral image of the surface.
//
// Steps visible here: compute the box area and its 2^32-scaled reciprocal,
// build the integral image with GenerateIntegralImage_NEON, then for every
// output row derive the four corner base pointers of the box and write the
// blurred pixels 16 at a time, followed by a 4-at-a-time tail. Pixels inside
// mSkipRect are skipped.
//
// NOTE(review): this view of the file has source line numbers fused onto the
// start of statements and is missing many source lines (closing braces, the
// declaration of `x`, `topLeft`/`topRight`, `result`/`result1..4` and
// `final`, the skip-rect branch bodies, part of the
// GenerateIntegralImage_NEON call). The comments describe only what is
// visible; confirm the gaps against the pristine file.
178 void AlphaBoxBlur::BoxBlur_NEON(uint8_t* aData
, int32_t aLeftLobe
,
179 int32_t aRightLobe
, int32_t aTopLobe
,
180 int32_t aBottomLobe
, uint32_t* aIntegralImage
,
181 size_t aIntegralImageStride
) const {
182 IntSize size
= GetSize();
184 MOZ_ASSERT(size
.height
> 0);
186 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
187 // looking at an integral image the value of a pixel at 'x,y' is calculated
188 // using the value of the integral image values above/below that.
// NOTE(review): source lines 189-190 are missing from this view; the comment
// above suggests the left/top lobes are adjusted there. Confirm.
// Number of pixels covered by the box.
191 int32_t boxSize
= (aLeftLobe
+ aRightLobe
) * (aTopLobe
+ aBottomLobe
);
193 MOZ_ASSERT(boxSize
> 0);
// 2^32 / boxSize as a 32-bit fixed-point multiplier for Divide().
// NOTE(review): for boxSize == 1 the quotient is 2^32, which truncates to 0
// in uint32_t. Source lines 194-198 are missing from this view and may
// handle that case. Confirm.
199 uint32_t reciprocal
= uint32_t((uint64_t(1) << 32) / boxSize
);
// Row stride of the integral image measured in uint32_t elements.
200 uint32_t stride32bit
= aIntegralImageStride
/ 4;
// Left padding rounded up to a multiple of 4, as asserted by
// GenerateIntegralImage_NEON.
201 int32_t leftInflation
= RoundUpToMultipleOf4(aLeftLobe
).value();
// NOTE(review): the remaining arguments of this call (source lines 205-206)
// are not visible in this view.
203 GenerateIntegralImage_NEON(leftInflation
, aRightLobe
, aTopLobe
, aBottomLobe
,
204 aIntegralImage
, aIntegralImageStride
, aData
,
// Same reciprocal in both lanes, as required by vmull_u32 in Divide().
207 uint32x2_t divisor
= vdup_n_u32(reciprocal
);
209 // This points to the start of the rectangle within the IntegralImage that
210 // overlaps the surface being blurred.
211 uint32_t* innerIntegral
=
212 aIntegralImage
+ (aTopLobe
* stride32bit
) + leftInflation
;
// Local copies of members used inside the loops.
213 IntRect skipRect
= mSkipRect
;
214 int32_t stride
= mStride
;
215 uint8_t* data
= aData
;
217 for (int32_t y
= 0; y
< size
.height
; y
++) {
// True when this row lies strictly between the skip rect's top and bottom.
218 bool inSkipRectY
= y
> skipRect
.y
&& y
< skipRect
.YMost();
// Base pointers to the four box corners for column 0 of this row. The
// offsets are computed as ptrdiff_t because (y - aTopLobe) can be negative.
219 uint32_t* topLeftBase
=
220 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
221 uint32_t* topRightBase
=
222 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
// NOTE(review): the `innerIntegral +` line of the next two initialisers
// (source lines 224 and 227) is missing from this view.
223 uint32_t* bottomRightBase
=
225 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
226 uint32_t* bottomLeftBase
=
228 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
// NOTE(review): the declaration of `x` (source line 230) is missing from
// this view.
231 // Process 16 pixels at a time for as long as possible.
232 for (; x
<= size
.width
- 16; x
+= 16) {
// Inside the skip rect: move x so that the loop increment lands on the
// rect's right edge.
// NOTE(review): the rest of this branch (source lines 236-240) is missing
// from this view.
233 if (inSkipRectY
&& x
> skipRect
.x
&& x
< skipRect
.XMost()) {
234 x
= skipRect
.XMost() - 16;
235 // Trigger early jump on coming loop iterations, this will be reset
// NOTE(review): the declarations of topLeft and topRight (source lines
// 241-242) are missing from this view.
243 uint32x4_t bottomRight
;
244 uint32x4_t bottomLeft
;
// Pixels x .. x+3.
245 topLeft
= vld1q_u32(topLeftBase
+ x
);
246 topRight
= vld1q_u32(topRightBase
+ x
);
247 bottomRight
= vld1q_u32(bottomRightBase
+ x
);
248 bottomLeft
= vld1q_u32(bottomLeftBase
+ x
);
// NOTE(review): the variables receiving the four BlurFourPixels results
// (used below as result1..result4) are declared on lines missing from this
// view.
250 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
// Pixels x+4 .. x+7.
252 topLeft
= vld1q_u32(topLeftBase
+ x
+ 4);
253 topRight
= vld1q_u32(topRightBase
+ x
+ 4);
254 bottomRight
= vld1q_u32(bottomRightBase
+ x
+ 4);
255 bottomLeft
= vld1q_u32(bottomLeftBase
+ x
+ 4);
257 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
// Pixels x+8 .. x+11.
259 topLeft
= vld1q_u32(topLeftBase
+ x
+ 8);
260 topRight
= vld1q_u32(topRightBase
+ x
+ 8);
261 bottomRight
= vld1q_u32(bottomRightBase
+ x
+ 8);
262 bottomLeft
= vld1q_u32(bottomLeftBase
+ x
+ 8);
264 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
// Pixels x+12 .. x+15.
266 topLeft
= vld1q_u32(topLeftBase
+ x
+ 12);
267 topRight
= vld1q_u32(topRightBase
+ x
+ 12);
268 bottomRight
= vld1q_u32(bottomRightBase
+ x
+ 12);
269 bottomLeft
= vld1q_u32(bottomLeftBase
+ x
+ 12);
271 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
// Saturating-narrow the sixteen 16-bit results to bytes and store them as
// one 16-byte vector into row y of the surface.
273 uint8x8_t combine1
= vqmovn_u16(vcombine_u16(result1
, result2
));
274 uint8x8_t combine2
= vqmovn_u16(vcombine_u16(result3
, result4
));
275 uint8x16_t final
= vcombine_u8(combine1
, combine2
);
276 vst1q_u8(data
+ stride
* y
+ x
, final
);
279 // Process the remaining pixels 4 bytes at a time.
280 for (; x
< size
.width
; x
+= 4) {
// Same skip-rect handling as above, with a step of 4.
// NOTE(review): the rest of this branch (source lines 284-288) is missing
// from this view.
281 if (inSkipRectY
&& x
> skipRect
.x
&& x
< skipRect
.XMost()) {
282 x
= skipRect
.XMost() - 4;
283 // Trigger early jump on coming loop iterations, this will be reset
289 uint32x4_t topLeft
= vld1q_u32(topLeftBase
+ x
);
290 uint32x4_t topRight
= vld1q_u32(topRightBase
+ x
);
291 uint32x4_t bottomRight
= vld1q_u32(bottomRightBase
+ x
);
292 uint32x4_t bottomLeft
= vld1q_u32(bottomLeftBase
+ x
);
// NOTE(review): the declarations of `result` and `final` (source lines 293
// and 295) are missing from this view.
294 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
// Narrow the four results to bytes (upper half zero-filled) and write them
// as a single 32-bit store.
296 vreinterpret_u32_u8(vmovn_u16(vcombine_u16(result
, vdup_n_u16(0))));
297 *(uint32_t*)(data
+ stride
* y
+ x
) = vget_lane_u32(final
, 0);
303 } // namespace mozilla