no bug - Bumping Firefox l10n changesets r=release a=l10n-bump DONTBUILD CLOSED TREE
[gecko.git] / gfx / 2d / BlurNEON.cpp
blob601b6f363de66ee247c1de84901cada07ea9ad25
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "Blur.h"
8 #include <arm_neon.h>
10 namespace mozilla {
11 namespace gfx {
/**
 * Scales four 32-bit values by a 32-bit fixed-point factor.
 *
 * Each lane of aValues is multiplied by the matching lane of aDivisor into a
 * 64-bit product, 2^31 is added for rounding, and the upper 32 bits of the
 * result are kept. The four quotients are then narrowed to 16 bits with
 * saturation.
 *
 * @param aValues  Four unsigned 32-bit inputs.
 * @param aDivisor Two-lane multiplier; lane 0 / lane 1 are applied to the
 *                 even / odd lanes of each half of aValues.
 * @return The four scaled values as saturated 16-bit lanes.
 */
MOZ_ALWAYS_INLINE
uint16x4_t Divide(uint32x4_t aValues, uint32x2_t aDivisor) {
  // One half in the 32.32 fixed-point domain; added before the shift so the
  // truncating shift rounds instead.
  const uint64x2_t half = vdupq_n_u64(int64_t(1) << 31);

  // Widening multiplies: lanes 0-1 and lanes 2-3 are processed separately
  // because a 64-bit product only fits two to a register.
  const uint64x2_t lowProducts = vmull_u32(vget_low_u32(aValues), aDivisor);
  const uint64x2_t highProducts = vmull_u32(vget_high_u32(aValues), aDivisor);

  // Round, then keep the upper 32 bits of each product.
  const uint32x2_t lowQuotients =
      vshrn_n_u64(vaddq_u64(lowProducts, half), 32);
  const uint32x2_t highQuotients =
      vshrn_n_u64(vaddq_u64(highProducts, half), 32);

  return vqmovn_u32(vcombine_u32(lowQuotients, highQuotients));
}
/**
 * Computes four box sums from the corner values of an integral image and
 * scales them with Divide().
 *
 * For every lane the box sum is
 *   aBottomRight - aTopRight - aBottomLeft + aTopLeft
 * evaluated in unsigned 32-bit arithmetic.
 *
 * @param aTopLeft     Integral-image values at the top-left corners.
 * @param aTopRight    Integral-image values at the top-right corners.
 * @param aBottomRight Integral-image values at the bottom-right corners.
 * @param aBottomLeft  Integral-image values at the bottom-left corners.
 * @param aDivisor     Fixed-point factor forwarded to Divide().
 * @return The four scaled box sums as 16-bit lanes.
 */
MOZ_ALWAYS_INLINE
uint16x4_t BlurFourPixels(const uint32x4_t& aTopLeft,
                          const uint32x4_t& aTopRight,
                          const uint32x4_t& aBottomRight,
                          const uint32x4_t& aBottomLeft,
                          const uint32x2_t& aDivisor) {
  uint32x4_t boxSums = vsubq_u32(aBottomRight, aTopRight);
  boxSums = vsubq_u32(boxSums, aBottomLeft);
  boxSums = vaddq_u32(boxSums, aTopLeft);
  return Divide(boxSums, aDivisor);
}
34 MOZ_ALWAYS_INLINE
35 void LoadIntegralRowFromRow(uint32_t* aDest, const uint8_t* aSource,
36 int32_t aSourceWidth, int32_t aLeftInflation,
37 int32_t aRightInflation) {
38 int32_t currentRowSum = 0;
40 for (int x = 0; x < aLeftInflation; x++) {
41 currentRowSum += aSource[0];
42 aDest[x] = currentRowSum;
44 for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
45 currentRowSum += aSource[(x - aLeftInflation)];
46 aDest[x] = currentRowSum;
48 for (int x = (aSourceWidth + aLeftInflation);
49 x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
50 currentRowSum += aSource[aSourceWidth - 1];
51 aDest[x] = currentRowSum;
/**
 * Builds the integral image (summed-area table) of aSource, padded on all
 * four sides by repeating the edge pixels.
 *
 * Every output element is the running sum along its row plus the element
 * directly above it. The rows are produced in four stages:
 *   1. Row 0 is the padded running sum of the first source row.
 *   2. Rows 1..aTopInflation add row 0 to the row above.
 *   3. Rows up to aSize.height + aTopInflation - 1 are built from the
 *      remaining source rows.
 *   4. If aBottomInflation is non-zero, the remaining rows keep adding the
 *      padded running sum of the last source row.
 *
 * @param aLeftInflation       Padding columns on the left; must be a multiple
 *                             of 4 (asserted).
 * @param aRightInflation      Padding columns on the right.
 * @param aTopInflation        Padding rows on top.
 * @param aBottomInflation     Padding rows at the bottom.
 * @param aIntegralImage       Output buffer.
 * @param aIntegralImageStride Output row pitch; divided by 4 to step in
 *                             uint32_t elements.
 * @param aSource              8-bit source pixels.
 * @param aSourceStride        Source row pitch, in bytes.
 * @param aSize                Source dimensions.
 */
MOZ_ALWAYS_INLINE void GenerateIntegralImage_NEON(
    int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
    int32_t aBottomInflation, uint32_t* aIntegralImage,
    size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
    const IntSize& aSize) {
  // The vector loops advance four columns at a time starting at the end of
  // the left padding, so that padding has to be a whole number of vectors.
  MOZ_ASSERT(!(aLeftInflation & 3));

  uint32_t stride32bit = aIntegralImageStride / 4;
  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
                            aSize.height + aTopInflation + aBottomInflation);

  // Start of a given row of the integral image.
  const auto rowAt = [&](int aRow) {
    return aIntegralImage + (aRow * stride32bit);
  };

  // Running sum of four copies of one pixel: {p, 2p, 3p, 4p}.
  const auto repeatedPixelRamp = [](uint32_t aPixel) {
    const uint32_t ramp[4] = {aPixel, 2 * aPixel, 3 * aPixel, 4 * aPixel};
    return vld1q_u32(ramp);
  };

  // Stage 1: row 0 comes straight from the first source row.
  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation,
                         aRightInflation);

  // Stage 2: top padding. Each row is the row above plus row 0 again.
  for (int y = 1; y < aTopInflation + 1; y++) {
    uint32_t* const outRow = rowAt(y);
    uint32_t* const aboveRow = rowAt(y - 1);

    for (int x = 0; x < integralImageSize.width; x += 4) {
      const uint32x4_t first = vld1q_u32(aIntegralImage + x);
      const uint32x4_t above = vld1q_u32(aboveRow + x);
      vst1q_u32(outRow + x, vaddq_u32(first, above));
    }
  }

  // Stage 3: rows backed by source rows 1 .. aSize.height - 1.
  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    uint32_t* const outRow = rowAt(y);
    uint32_t* const aboveRow = rowAt(y - 1);
    uint8_t* const srcRow = aSource + aSourceStride * (y - aTopInflation);

    // Running row sum carried from one vector to the next.
    uint32x4_t carry = vdupq_n_u32(0);

    // Left padding: the first pixel repeated. After every vector the carry
    // is the last lane broadcast to all lanes.
    const uint32x4_t leftRamp = repeatedPixelRamp(srcRow[0]);
    for (int x = 0; x < aLeftInflation; x += 4) {
      const uint32x4_t sums = vaddq_u32(leftRamp, carry);
      carry = vdupq_n_u32(vgetq_lane_u32(sums, 3));
      vst1q_u32(outRow + x, vaddq_u32(sums, vld1q_u32(aboveRow + x)));
    }

    // Source pixels. The broadcast happens at the top of the iteration
    // rather than the bottom: on exit the carry must still hold all four
    // per-lane sums, because when the width is not a multiple of four the
    // sum that continues into the right padding is not in lane 3.
    //
    // NOTE(review): this reads four bytes per step, so when aSize.width is
    // not a multiple of 4 the last step reads up to 3 bytes past the row's
    // width. Presumably the source rows are padded accordingly -- confirm
    // against the caller.
    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
      carry = vdupq_n_u32(vgetq_lane_u32(carry, 3));

      const uint8_t* const px = srcRow + (x - aLeftInflation);
      uint32_t prefix[4];
      prefix[0] = px[0];
      prefix[1] = prefix[0] + px[1];
      prefix[2] = prefix[1] + px[2];
      prefix[3] = prefix[2] + px[3];

      carry = vaddq_u32(vld1q_u32(prefix), carry);
      vst1q_u32(outRow + x, vaddq_u32(carry, vld1q_u32(aboveRow + x)));
    }

    // Right padding: the last pixel repeated.
    const uint32_t lastPixel = srcRow[aSize.width - 1];
    int x = (aSize.width + aLeftInflation);
    if ((aSize.width & 3)) {
      // The source ended mid-vector. Pick the sum belonging to the last real
      // pixel out of the carry and continue one column at a time until the
      // column index is a multiple of four again.
      uint32_t scalarSum =
          reinterpret_cast<uint32_t*>(&carry)[(aSize.width % 4) - 1];
      while (x < integralImageSize.width && (x & 3)) {
        scalarSum += lastPixel;
        outRow[x] = aboveRow[x] + scalarSum;
        x++;
      }
      if (x < integralImageSize.width) {
        // Aligned with columns left to do: hand over to the vector loop.
        carry = vdupq_n_u32(scalarSum);
      }
    } else {
      carry = vdupq_n_u32(vgetq_lane_u32(carry, 3));
    }

    const uint32x4_t rightRamp = repeatedPixelRamp(lastPixel);
    for (; x < integralImageSize.width; x += 4) {
      const uint32x4_t sums = vaddq_u32(rightRamp, carry);
      carry = vdupq_n_u32(vgetq_lane_u32(sums, 3));
      vst1q_u32(outRow + x, vaddq_u32(sums, vld1q_u32(aboveRow + x)));
    }
  }

  // Stage 4: bottom padding.
  if (!aBottomInflation) {
    return;
  }

  // Park the running sum of the last source row in the last row of the
  // integral image. It is used as the per-row addend below and is itself
  // overwritten with its final values by the last iteration of the loop.
  uint32_t* const lastRow = rowAt(integralImageSize.height - 1);
  LoadIntegralRowFromRow(lastRow,
                         aSource + (aSize.height - 1) * aSourceStride,
                         aSize.width, aLeftInflation, aRightInflation);

  for (int y = aSize.height + aTopInflation; y < integralImageSize.height;
       y++) {
    uint32_t* const outRow = rowAt(y);
    uint32_t* const aboveRow = rowAt(y - 1);

    for (int x = 0; x < integralImageSize.width; x += 4) {
      vst1q_u32(outRow + x, vaddq_u32(vld1q_u32(lastRow + x),
                                      vld1q_u32(aboveRow + x)));
    }
  }
}
/**
 * Box-blurs aData in place with the help of an integral image.
 *
 * The integral image is generated into aIntegralImage first. Every output
 * pixel is then the box sum read from four corners of the integral image,
 * scaled by a fixed-point reciprocal of the box area. Pixels are produced 16
 * at a time while a full group of 16 fits in the row, then 4 at a time.
 * Columns strictly inside mSkipRect, on rows strictly inside it, are jumped
 * over.
 *
 * @param aData                8-bit surface, read and overwritten.
 * @param aLeftLobe            Box extent to the left of the current pixel.
 * @param aRightLobe           Box extent to the right of the current pixel.
 * @param aTopLobe             Box extent above the current pixel.
 * @param aBottomLobe          Box extent below the current pixel.
 * @param aIntegralImage       Scratch buffer for the integral image.
 * @param aIntegralImageStride Row pitch of aIntegralImage; divided by 4 to
 *                             step in uint32_t elements.
 */
void AlphaBoxBlur::BoxBlur_NEON(uint8_t* aData, int32_t aLeftLobe,
                                int32_t aRightLobe, int32_t aTopLobe,
                                int32_t aBottomLobe, uint32_t* aIntegralImage,
                                size_t aIntegralImageStride) const {
  IntSize size = GetSize();

  MOZ_ASSERT(size.height > 0);

  // The left and top lobes are widened by one so that they cover the current
  // pixel itself: a box sum taken from an integral image excludes the row and
  // column of its top/left corner.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  // A 1x1 box leaves every pixel unchanged.
  if (boxSize == 1) {
    return;
  }

  // 2^32 / boxSize; multiplying by it and keeping the upper 32 bits divides
  // by the box area.
  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
  uint32_t stride32bit = aIntegralImageStride / 4;
  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_NEON(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                             aIntegralImage, aIntegralImageStride, aData,
                             mStride, size);

  uint32x2_t divisor = vdup_n_u32(reciprocal);

  // First element of the region of the integral image that overlaps the
  // surface being blurred, i.e. past the top and left padding.
  uint32_t* innerIntegral =
      aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
  IntRect skipRect = mSkipRect;
  int32_t stride = mStride;
  uint8_t* data = aData;

  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    // Integral-image rows holding the top and bottom corners for this row.
    const ptrdiff_t topOffset = (y - aTopLobe) * ptrdiff_t(stride32bit);
    const ptrdiff_t bottomOffset = (y + aBottomLobe) * ptrdiff_t(stride32bit);

    uint32_t* topLeftBase = innerIntegral + (topOffset - aLeftLobe);
    uint32_t* topRightBase = innerIntegral + (topOffset + aRightLobe);
    uint32_t* bottomRightBase = innerIntegral + (bottomOffset + aRightLobe);
    uint32_t* bottomLeftBase = innerIntegral + (bottomOffset - aLeftLobe);

    uint8_t* const outRow = data + stride * y;

    // Blurs the four pixels starting at the given column.
    const auto blurQuad = [&](int32_t aColumn) {
      return BlurFourPixels(vld1q_u32(topLeftBase + aColumn),
                            vld1q_u32(topRightBase + aColumn),
                            vld1q_u32(bottomRightBase + aColumn),
                            vld1q_u32(bottomLeftBase + aColumn), divisor);
    };

    int32_t x = 0;

    // Groups of 16 pixels, for as long as a full group fits.
    for (; x <= size.width - 16; x += 16) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        // Land on skipRect.XMost() once the loop increment is applied.
        x = skipRect.XMost() - 16;
        // Clearing the flag makes later iterations on this row skip the
        // test quickly; it is recomputed for the next row.
        inSkipRectY = false;
        continue;
      }

      const uint16x4_t quad0 = blurQuad(x);
      const uint16x4_t quad1 = blurQuad(x + 4);
      const uint16x4_t quad2 = blurQuad(x + 8);
      const uint16x4_t quad3 = blurQuad(x + 12);

      // Narrow 16 x 16-bit to 16 x 8-bit with saturation and store.
      const uint8x8_t lowBytes = vqmovn_u16(vcombine_u16(quad0, quad1));
      const uint8x8_t highBytes = vqmovn_u16(vcombine_u16(quad2, quad3));
      vst1q_u8(outRow + x, vcombine_u8(lowBytes, highBytes));
    }

    // Whatever is left, four pixels (four bytes) at a time.
    for (; x < size.width; x += 4) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        // Land on skipRect.XMost() once the loop increment is applied.
        x = skipRect.XMost() - 4;
        // See above; the flag is recomputed for the next row.
        inSkipRectY = false;
        continue;
      }

      const uint16x4_t quad = blurQuad(x);

      // Narrow to bytes (upper half zero-filled) and store the low 32 bits,
      // i.e. the four result pixels.
      const uint8x8_t bytes = vmovn_u16(vcombine_u16(quad, vdup_n_u16(0)));
      const uint32x2_t packed = vreinterpret_u32_u8(bytes);
      *reinterpret_cast<uint32_t*>(outRow + x) = vget_lane_u32(packed, 0);
    }
  }
}
302 } // namespace gfx
303 } // namespace mozilla