Bug 1890689 accumulate input in LargerReceiverBlockSizeThanDesiredBuffering GTest...
[gecko.git] / gfx / 2d / Blur.cpp
bloba598f2c758be987cd2d2bdcef5c0c6f8451cd4f7
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
7 #include "Blur.h"
9 #include <algorithm>
10 #include <math.h>
11 #include <string.h>
13 #include "mozilla/CheckedInt.h"
14 #include "NumericTools.h"
16 #include "2D.h"
17 #include "DataSurfaceHelpers.h"
18 #include "Tools.h"
20 #ifdef USE_NEON
21 # include "mozilla/arm.h"
22 #endif
24 namespace mozilla {
25 namespace gfx {
27 /**
28 * Helper function to process each row of the box blur.
29 * It takes care of transposing the data on input or output depending
30 * on whether we intend a horizontal or vertical blur, and whether we're
31 * reading from the initial source or writing to the final destination.
32 * It allows starting or ending anywhere within the row to accomodate
33 * a skip rect.
35 template <bool aTransposeInput, bool aTransposeOutput>
36 static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput,
37 int32_t aLeftLobe, int32_t aRightLobe,
38 int32_t aWidth, int32_t aStride, int32_t aStart,
39 int32_t aEnd) {
40 // If the input or output is transposed, then we will move down a row
41 // for each step, instead of moving over a column. Since these values
42 // only depend on a template parameter, they will more easily get
43 // copy-propagated in the non-transposed case, which is why they
44 // are not passed as parameters.
45 const int32_t inputStep = aTransposeInput ? aStride : 1;
46 const int32_t outputStep = aTransposeOutput ? aStride : 1;
48 // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
49 // to the right of the current position, then average them. So this is
50 // the size of the total width of this filter.
51 const int32_t boxSize = aLeftLobe + aRightLobe + 1;
53 // Instead of dividing the pixel sum by boxSize to average, we can just
54 // compute a scale that will normalize the result so that it can be quickly
55 // shifted into the desired range.
56 const uint32_t reciprocal = (1 << 24) / boxSize;
58 // The shift would normally truncate the result, whereas we would rather
59 // prefer to round the result to the closest increment. By adding 0.5 units
60 // to the initial sum, we bias the sum so that it will be rounded by the
61 // truncation instead.
62 uint32_t alphaSum = (boxSize + 1) / 2;
64 // We process the row with a moving filter, keeping a sum (alphaSum) of
65 // boxSize pixels. As we move over a pixel, we need to add on a pixel
66 // from the right extreme of the window that moved into range, and subtract
67 // off a pixel from the left extreme of window that moved out of range.
68 // But first, we need to initialization alphaSum to the contents of
69 // the window before we can get going. If the window moves out of bounds
70 // of the row, we clamp each sample to be the closest pixel from within
71 // row bounds, so the 0th and aWidth-1th pixel.
72 int32_t initLeft = aStart - aLeftLobe;
73 if (initLeft < 0) {
74 // If the left lobe samples before the row, add in clamped samples.
75 alphaSum += -initLeft * aInput[0];
76 initLeft = 0;
78 int32_t initRight = aStart + boxSize - aLeftLobe;
79 if (initRight > aWidth) {
80 // If the right lobe samples after the row, add in clamped samples.
81 alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep];
82 initRight = aWidth;
84 // Finally, add in all the valid, non-clamped samples to fill up the
85 // rest of the window.
86 const uint8_t* src = &aInput[initLeft * inputStep];
87 const uint8_t* iterEnd = &aInput[initRight * inputStep];
89 #define INIT_ITER \
90 alphaSum += *src; \
91 src += inputStep;
93 // We unroll the per-pixel loop here substantially. The amount of work
94 // done per sample is so small that the cost of a loop condition check
95 // and a branch can substantially add to or even dominate the performance
96 // of the loop.
97 while (src + 16 * inputStep <= iterEnd) {
98 INIT_ITER;
99 INIT_ITER;
100 INIT_ITER;
101 INIT_ITER;
102 INIT_ITER;
103 INIT_ITER;
104 INIT_ITER;
105 INIT_ITER;
106 INIT_ITER;
107 INIT_ITER;
108 INIT_ITER;
109 INIT_ITER;
110 INIT_ITER;
111 INIT_ITER;
112 INIT_ITER;
113 INIT_ITER;
115 while (src < iterEnd) {
116 INIT_ITER;
119 // Now we start moving the window over the row. We will be accessing
120 // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
121 // out of bounds of the row. To avoid having to check within the inner
122 // loops if we are in bound, we instead compute the points at which
123 // we will move out of bounds of the row on the left side (splitLeft)
124 // and right side (splitRight).
125 int32_t splitLeft = std::min(std::max(aLeftLobe, aStart), aEnd);
126 int32_t splitRight =
127 std::min(std::max(aWidth - (boxSize - aLeftLobe), aStart), aEnd);
128 // If the filter window is actually large than the size of the row,
129 // there will be a middle area of overlap where the leftmost and rightmost
130 // pixel of the filter will both be outside the row. In this case, we need
131 // to invert the splits so that splitLeft <= splitRight.
132 if (boxSize > aWidth) {
133 std::swap(splitLeft, splitRight);
136 // Process all pixels up to splitLeft that would sample before the start of
137 // the row. Note that because inputStep and outputStep may not be a const 1
138 // value, it is more performant to increment pointers here for the source and
139 // destination rather than use a loop counter, since doing so would entail an
140 // expensive multiplication that significantly slows down the loop.
141 uint8_t* dst = &aOutput[aStart * outputStep];
142 iterEnd = &aOutput[splitLeft * outputStep];
143 src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep];
144 uint8_t firstVal = aInput[0];
146 #define LEFT_ITER \
147 *dst = (alphaSum * reciprocal) >> 24; \
148 alphaSum += *src - firstVal; \
149 dst += outputStep; \
150 src += inputStep;
152 while (dst + 16 * outputStep <= iterEnd) {
153 LEFT_ITER;
154 LEFT_ITER;
155 LEFT_ITER;
156 LEFT_ITER;
157 LEFT_ITER;
158 LEFT_ITER;
159 LEFT_ITER;
160 LEFT_ITER;
161 LEFT_ITER;
162 LEFT_ITER;
163 LEFT_ITER;
164 LEFT_ITER;
165 LEFT_ITER;
166 LEFT_ITER;
167 LEFT_ITER;
168 LEFT_ITER;
170 while (dst < iterEnd) {
171 LEFT_ITER;
174 // Process all pixels between splitLeft and splitRight.
175 iterEnd = &aOutput[splitRight * outputStep];
176 if (boxSize <= aWidth) {
177 // The filter window is smaller than the row size, so the leftmost and
178 // rightmost samples are both within row bounds.
179 src = &aInput[(splitLeft - aLeftLobe) * inputStep];
180 int32_t boxStep = boxSize * inputStep;
182 #define CENTER_ITER \
183 *dst = (alphaSum * reciprocal) >> 24; \
184 alphaSum += src[boxStep] - *src; \
185 dst += outputStep; \
186 src += inputStep;
188 while (dst + 16 * outputStep <= iterEnd) {
189 CENTER_ITER;
190 CENTER_ITER;
191 CENTER_ITER;
192 CENTER_ITER;
193 CENTER_ITER;
194 CENTER_ITER;
195 CENTER_ITER;
196 CENTER_ITER;
197 CENTER_ITER;
198 CENTER_ITER;
199 CENTER_ITER;
200 CENTER_ITER;
201 CENTER_ITER;
202 CENTER_ITER;
203 CENTER_ITER;
204 CENTER_ITER;
206 while (dst < iterEnd) {
207 CENTER_ITER;
209 } else {
210 // The filter window is larger than the row size, and we're in the area of
211 // split overlap. So the leftmost and rightmost samples are both out of
212 // bounds and need to be clamped. We can just precompute the difference here
213 // consequently.
214 int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0];
215 while (dst < iterEnd) {
216 *dst = (alphaSum * reciprocal) >> 24;
217 alphaSum += firstLastDiff;
218 dst += outputStep;
222 // Process all remaining pixels after splitRight that would sample after the
223 // row end.
224 iterEnd = &aOutput[aEnd * outputStep];
225 src = &aInput[(splitRight - aLeftLobe) * inputStep];
226 uint8_t lastVal = aInput[(aWidth - 1) * inputStep];
228 #define RIGHT_ITER \
229 *dst = (alphaSum * reciprocal) >> 24; \
230 alphaSum += lastVal - *src; \
231 dst += outputStep; \
232 src += inputStep;
234 while (dst + 16 * outputStep <= iterEnd) {
235 RIGHT_ITER;
236 RIGHT_ITER;
237 RIGHT_ITER;
238 RIGHT_ITER;
239 RIGHT_ITER;
240 RIGHT_ITER;
241 RIGHT_ITER;
242 RIGHT_ITER;
243 RIGHT_ITER;
244 RIGHT_ITER;
245 RIGHT_ITER;
246 RIGHT_ITER;
247 RIGHT_ITER;
248 RIGHT_ITER;
249 RIGHT_ITER;
250 RIGHT_ITER;
252 while (dst < iterEnd) {
253 RIGHT_ITER;
258 * Box blur involves looking at one pixel, and setting its value to the average
259 * of its neighbouring pixels. This is meant to provide a 3-pass approximation
260 * of a Gaussian blur.
261 * @param aTranspose Whether to transpose the buffer when reading and writing
262 * to it.
263 * @param aData The buffer to be blurred.
264 * @param aLobes The number of pixels to blend on the left and right for each of
265 * 3 passes.
266 * @param aWidth The number of columns in the buffers.
267 * @param aRows The number of rows in the buffers.
268 * @param aStride The stride of the buffer.
270 template <bool aTranspose>
271 static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth,
272 int32_t aRows, int32_t aStride, IntRect aSkipRect) {
273 if (aTranspose) {
274 std::swap(aWidth, aRows);
275 aSkipRect.Swap();
278 MOZ_ASSERT(aWidth > 0);
280 // All three passes of the box blur that approximate the Gaussian are done
281 // on each row in turn, so we only need two temporary row buffers to process
282 // each row, instead of a full-sized buffer. Data moves from the source to the
283 // first temporary, from the first temporary to the second, then from the
284 // second back to the destination. This way is more cache-friendly than
285 // processing whe whole buffer in each pass and thus yields a nice speedup.
286 uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth];
287 if (!tmpRow) {
288 return;
290 uint8_t* tmpRow2 = tmpRow + aWidth;
292 const int32_t stride = aTranspose ? 1 : aStride;
293 bool skipRectCoversWholeRow =
294 0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
296 for (int32_t y = 0; y < aRows; y++) {
297 // Check whether the skip rect intersects this row. If the skip
298 // rect covers the whole surface in this row, we can avoid
299 // this row entirely (and any others along the skip rect).
300 bool inSkipRectY = aSkipRect.ContainsY(y);
301 if (inSkipRectY && skipRectCoversWholeRow) {
302 aData += stride * (aSkipRect.YMost() - y);
303 y = aSkipRect.YMost() - 1;
304 continue;
307 // Read in data from the source transposed if necessary.
308 BoxBlurRow<aTranspose, false>(aData, tmpRow, aLobes[0][0], aLobes[0][1],
309 aWidth, aStride, 0, aWidth);
311 // For the middle pass, the data is already pre-transposed and does not need
312 // to be post-transposed yet.
313 BoxBlurRow<false, false>(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1],
314 aWidth, aStride, 0, aWidth);
316 // Write back data to the destination transposed if necessary too.
317 // Make sure not to overwrite the skip rect by only outputting to the
318 // destination before and after the skip rect, if requested.
319 int32_t skipStart =
320 inSkipRectY ? std::min(std::max(aSkipRect.X(), 0), aWidth) : aWidth;
321 int32_t skipEnd = std::max(skipStart, aSkipRect.XMost());
322 if (skipStart > 0) {
323 BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
324 aWidth, aStride, 0, skipStart);
326 if (skipEnd < aWidth) {
327 BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
328 aWidth, aStride, skipEnd, aWidth);
331 aData += stride;
334 delete[] tmpRow;
337 static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) {
338 int32_t major, minor, final;
340 /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
341 * some notes about approximating the Gaussian blur with box-blurs.
342 * The comments below are in the terminology of that page.
344 int32_t z = aRadius / 3;
345 switch (aRadius % 3) {
346 case 0:
347 // aRadius = z*3; choose d = 2*z + 1
348 major = minor = final = z;
349 break;
350 case 1:
351 // aRadius = z*3 + 1
352 // This is a tricky case since there is no value of d which will
353 // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
354 // for some integer k, then the radius will be 3*k. If d is even,
355 // i.e. d=2*k, then the radius will be 3*k - 1.
356 // So we have to choose values that don't match the standard
357 // algorithm.
358 major = z + 1;
359 minor = final = z;
360 break;
361 case 2:
362 // aRadius = z*3 + 2; choose d = 2*z + 2
363 major = final = z + 1;
364 minor = z;
365 break;
366 default:
367 // Mathematical impossibility!
368 MOZ_ASSERT(false);
369 major = minor = final = 0;
371 MOZ_ASSERT(major + minor + final == aRadius);
373 aLobes[0][0] = major;
374 aLobes[0][1] = minor;
375 aLobes[1][0] = minor;
376 aLobes[1][1] = major;
377 aLobes[2][0] = final;
378 aLobes[2][1] = final;
381 static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
382 int32_t aWidth, int32_t aRows, int32_t aStride,
383 const IntRect& aSkipRect) {
384 if (aRadius == 0) {
385 memcpy(aOutput, aInput, aStride * aRows);
386 return;
389 bool skipRectCoversWholeRow =
390 0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
391 for (int32_t y = 0; y < aRows; y++) {
392 // Check whether the skip rect intersects this row. If the skip
393 // rect covers the whole surface in this row, we can avoid
394 // this row entirely (and any others along the skip rect).
395 bool inSkipRectY = aSkipRect.ContainsY(y);
396 if (inSkipRectY && skipRectCoversWholeRow) {
397 y = aSkipRect.YMost() - 1;
398 continue;
401 for (int32_t x = 0; x < aWidth; x++) {
402 // Check whether we are within the skip rect. If so, go
403 // to the next point outside the skip rect.
404 if (inSkipRectY && aSkipRect.ContainsX(x)) {
405 x = aSkipRect.XMost();
406 if (x >= aWidth) break;
409 int32_t sMin = std::max(x - aRadius, 0);
410 int32_t sMax = std::min(x + aRadius, aWidth - 1);
411 int32_t v = 0;
412 for (int32_t s = sMin; s <= sMax; ++s) {
413 v = std::max<int32_t>(v, aInput[aStride * y + s]);
415 aOutput[aStride * y + x] = v;
420 static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
421 int32_t aWidth, int32_t aRows, int32_t aStride,
422 const IntRect& aSkipRect) {
423 if (aRadius == 0) {
424 memcpy(aOutput, aInput, aStride * aRows);
425 return;
428 bool skipRectCoversWholeColumn =
429 0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost();
430 for (int32_t x = 0; x < aWidth; x++) {
431 bool inSkipRectX = aSkipRect.ContainsX(x);
432 if (inSkipRectX && skipRectCoversWholeColumn) {
433 x = aSkipRect.XMost() - 1;
434 continue;
437 for (int32_t y = 0; y < aRows; y++) {
438 // Check whether we are within the skip rect. If so, go
439 // to the next point outside the skip rect.
440 if (inSkipRectX && aSkipRect.ContainsY(y)) {
441 y = aSkipRect.YMost();
442 if (y >= aRows) break;
445 int32_t sMin = std::max(y - aRadius, 0);
446 int32_t sMax = std::min(y + aRadius, aRows - 1);
447 int32_t v = 0;
448 for (int32_t s = sMin; s <= sMax; ++s) {
449 v = std::max<int32_t>(v, aInput[aStride * s + x]);
451 aOutput[aStride * y + x] = v;
456 CheckedInt<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) {
457 CheckedInt<int32_t> val(aVal);
459 val += 3;
460 val /= 4;
461 val *= 4;
463 return val;
466 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius,
467 const IntSize& aBlurRadius, const Rect* aDirtyRect,
468 const Rect* aSkipRect)
469 : mStride(0), mSurfaceAllocationSize(0) {
470 Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect);
473 AlphaBoxBlur::AlphaBoxBlur()
474 : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
476 void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius,
477 const IntSize& aBlurRadius, const Rect* aDirtyRect,
478 const Rect* aSkipRect) {
479 mSpreadRadius = aSpreadRadius;
480 mBlurRadius = aBlurRadius;
482 Rect rect(aRect);
483 rect.Inflate(Size(aBlurRadius + aSpreadRadius));
484 rect.RoundOut();
486 if (aDirtyRect) {
487 // If we get passed a dirty rect from layout, we can minimize the
488 // shadow size and make painting faster.
489 mHasDirtyRect = true;
490 mDirtyRect = *aDirtyRect;
491 Rect requiredBlurArea = mDirtyRect.Intersect(rect);
492 requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
493 rect = requiredBlurArea.Intersect(rect);
494 } else {
495 mHasDirtyRect = false;
498 mRect = TruncatedToInt(rect);
499 if (mRect.IsEmpty()) {
500 return;
503 if (aSkipRect) {
504 // If we get passed a skip rect, we can lower the amount of
505 // blurring/spreading we need to do. We convert it to IntRect to avoid
506 // expensive int<->float conversions if we were to use Rect instead.
507 Rect skipRect = *aSkipRect;
508 skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
509 mSkipRect = RoundedIn(skipRect);
510 mSkipRect = mSkipRect.Intersect(mRect);
511 if (mSkipRect.IsEqualInterior(mRect)) {
512 return;
515 mSkipRect -= mRect.TopLeft();
516 // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
517 // accesses later are aligned as well, see bug 1622113.
518 mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4));
519 mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4));
520 if (mSkipRect.IsEmpty()) {
521 mSkipRect = IntRect();
523 } else {
524 mSkipRect = IntRect();
527 CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.Width());
528 if (stride.isValid()) {
529 mStride = stride.value();
531 // We need to leave room for an additional 3 bytes for a potential overrun
532 // in our blurring code.
533 size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3);
534 if (size != 0) {
535 mSurfaceAllocationSize = size;
540 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX,
541 float aSigmaY)
542 : mRect(TruncatedToInt(aRect)),
544 mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
545 mStride(aStride),
546 mSurfaceAllocationSize(0),
547 mHasDirtyRect(false) {
548 IntRect intRect;
549 if (aRect.ToIntRect(&intRect)) {
550 size_t minDataSize =
551 BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height());
552 if (minDataSize != 0) {
553 mSurfaceAllocationSize = minDataSize;
558 AlphaBoxBlur::~AlphaBoxBlur() = default;
560 IntSize AlphaBoxBlur::GetSize() const {
561 IntSize size(mRect.Width(), mRect.Height());
562 return size;
565 int32_t AlphaBoxBlur::GetStride() const { return mStride; }
567 IntRect AlphaBoxBlur::GetRect() const { return mRect; }
569 Rect* AlphaBoxBlur::GetDirtyRect() {
570 if (mHasDirtyRect) {
571 return &mDirtyRect;
574 return nullptr;
577 size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
578 return mSurfaceAllocationSize;
581 void AlphaBoxBlur::Blur(uint8_t* aData) const {
582 if (!aData) {
583 return;
586 // no need to do all this if not blurring or spreading
587 if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) {
588 int32_t stride = GetStride();
590 IntSize size = GetSize();
592 if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
593 // No need to use CheckedInt here - we have validated it in the
594 // constructor.
595 size_t szB = stride * size.height;
596 uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
598 if (!tmpData) {
599 return;
602 memset(tmpData, 0, szB);
604 SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width,
605 size.height, stride, mSkipRect);
606 SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width,
607 size.height, stride, mSkipRect);
609 delete[] tmpData;
612 int32_t horizontalLobes[3][2];
613 ComputeLobes(mBlurRadius.width, horizontalLobes);
614 int32_t verticalLobes[3][2];
615 ComputeLobes(mBlurRadius.height, verticalLobes);
617 // We want to allow for some extra space on the left for alignment reasons.
618 int32_t maxLeftLobe =
619 RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
621 IntSize integralImageSize(
622 size.width + maxLeftLobe + horizontalLobes[1][1],
623 size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
625 if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
626 // Fallback to old blurring code when the surface is so large it may
627 // overflow our integral image!
628 if (mBlurRadius.width > 0) {
629 BoxBlur<false>(aData, horizontalLobes, size.width, size.height, stride,
630 mSkipRect);
632 if (mBlurRadius.height > 0) {
633 BoxBlur<true>(aData, verticalLobes, size.width, size.height, stride,
634 mSkipRect);
636 } else {
637 size_t integralImageStride =
638 GetAlignedStride<16>(integralImageSize.width, 4);
639 if (integralImageStride == 0) {
640 return;
643 // We need to leave room for an additional 12 bytes for a maximum overrun
644 // of 3 pixels in the blurring code.
645 size_t bufLen = BufferSizeFromStrideAndHeight(
646 integralImageStride, integralImageSize.height, 12);
647 if (bufLen == 0) {
648 return;
650 // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
651 // we divide by 4.
652 AlignedArray<uint32_t> integralImage((bufLen / 4) +
653 ((bufLen % 4) ? 1 : 0));
655 if (!integralImage) {
656 return;
659 #ifdef USE_SSE2
660 if (Factory::HasSSE2()) {
661 BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1],
662 verticalLobes[0][0], verticalLobes[0][1], integralImage,
663 integralImageStride);
664 BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1],
665 verticalLobes[1][0], verticalLobes[1][1], integralImage,
666 integralImageStride);
667 BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1],
668 verticalLobes[2][0], verticalLobes[2][1], integralImage,
669 integralImageStride);
670 } else
671 #endif
672 #ifdef USE_NEON
673 if (mozilla::supports_neon()) {
674 BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1],
675 verticalLobes[0][0], verticalLobes[0][1], integralImage,
676 integralImageStride);
677 BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1],
678 verticalLobes[1][0], verticalLobes[1][1], integralImage,
679 integralImageStride);
680 BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1],
681 verticalLobes[2][0], verticalLobes[2][1], integralImage,
682 integralImageStride);
683 } else
684 #endif
686 #ifdef _MIPS_ARCH_LOONGSON3A
687 BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1],
688 verticalLobes[0][0], verticalLobes[0][1], integralImage,
689 integralImageStride);
690 BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1],
691 verticalLobes[1][0], verticalLobes[1][1], integralImage,
692 integralImageStride);
693 BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1],
694 verticalLobes[2][0], verticalLobes[2][1], integralImage,
695 integralImageStride);
696 #else
697 BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1],
698 verticalLobes[0][0], verticalLobes[0][1], integralImage,
699 integralImageStride);
700 BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1],
701 verticalLobes[1][0], verticalLobes[1][1], integralImage,
702 integralImageStride);
703 BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1],
704 verticalLobes[2][0], verticalLobes[2][1], integralImage,
705 integralImageStride);
706 #endif
712 MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest,
713 const uint8_t* aSource,
714 uint32_t* aPreviousRow,
715 const uint32_t& aSourceWidth,
716 const uint32_t& aLeftInflation,
717 const uint32_t& aRightInflation) {
718 uint32_t currentRowSum = 0;
719 uint32_t pixel = aSource[0];
720 for (uint32_t x = 0; x < aLeftInflation; x++) {
721 currentRowSum += pixel;
722 *aDest++ = currentRowSum + *aPreviousRow++;
724 for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation);
725 x += 4) {
726 uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
727 #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
728 currentRowSum += (alphaValues >> 24) & 0xff;
729 *aDest++ = *aPreviousRow++ + currentRowSum;
730 currentRowSum += (alphaValues >> 16) & 0xff;
731 *aDest++ = *aPreviousRow++ + currentRowSum;
732 currentRowSum += (alphaValues >> 8) & 0xff;
733 *aDest++ = *aPreviousRow++ + currentRowSum;
734 currentRowSum += alphaValues & 0xff;
735 *aDest++ = *aPreviousRow++ + currentRowSum;
736 #else
737 currentRowSum += alphaValues & 0xff;
738 *aDest++ = *aPreviousRow++ + currentRowSum;
739 alphaValues >>= 8;
740 currentRowSum += alphaValues & 0xff;
741 *aDest++ = *aPreviousRow++ + currentRowSum;
742 alphaValues >>= 8;
743 currentRowSum += alphaValues & 0xff;
744 *aDest++ = *aPreviousRow++ + currentRowSum;
745 alphaValues >>= 8;
746 currentRowSum += alphaValues & 0xff;
747 *aDest++ = *aPreviousRow++ + currentRowSum;
748 #endif
750 pixel = aSource[aSourceWidth - 1];
751 for (uint32_t x = (aSourceWidth + aLeftInflation);
752 x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
753 currentRowSum += pixel;
754 *aDest++ = currentRowSum + *aPreviousRow++;
758 MOZ_ALWAYS_INLINE void GenerateIntegralImage_C(
759 int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
760 int32_t aBottomInflation, uint32_t* aIntegralImage,
761 size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
762 const IntSize& aSize) {
763 uint32_t stride32bit = aIntegralImageStride / 4;
765 IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
766 aSize.height + aTopInflation + aBottomInflation);
768 memset(aIntegralImage, 0, aIntegralImageStride);
770 GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width,
771 aLeftInflation, aRightInflation);
772 for (int y = 1; y < aTopInflation + 1; y++) {
773 GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
774 aIntegralImage + (y - 1) * stride32bit, aSize.width,
775 aLeftInflation, aRightInflation);
778 for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
779 GenerateIntegralRow(aIntegralImage + (y * stride32bit),
780 aSource + aSourceStride * (y - aTopInflation),
781 aIntegralImage + (y - 1) * stride32bit, aSize.width,
782 aLeftInflation, aRightInflation);
785 if (aBottomInflation) {
786 for (int y = (aSize.height + aTopInflation); y < integralImageSize.height;
787 y++) {
788 GenerateIntegralRow(aIntegralImage + (y * stride32bit),
789 aSource + ((aSize.height - 1) * aSourceStride),
790 aIntegralImage + (y - 1) * stride32bit, aSize.width,
791 aLeftInflation, aRightInflation);
797 * Attempt to do an in-place box blur using an integral image.
799 void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe,
800 int32_t aRightLobe, int32_t aTopLobe,
801 int32_t aBottomLobe, uint32_t* aIntegralImage,
802 size_t aIntegralImageStride) const {
803 IntSize size = GetSize();
805 MOZ_ASSERT(size.width > 0);
807 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
808 // looking at an integral image the value of a pixel at 'x,y' is calculated
809 // using the value of the integral image values above/below that.
810 aLeftLobe++;
811 aTopLobe++;
812 int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
814 MOZ_ASSERT(boxSize > 0);
816 if (boxSize == 1) {
817 return;
820 int32_t stride32bit = aIntegralImageStride / 4;
822 int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
824 GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
825 aIntegralImage, aIntegralImageStride, aData, mStride,
826 size);
828 uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
830 uint32_t* innerIntegral =
831 aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
833 // Storing these locally makes this about 30% faster! Presumably the compiler
834 // can't be sure we're not altering the member variables in this loop.
835 IntRect skipRect = mSkipRect;
836 uint8_t* data = aData;
837 int32_t stride = mStride;
838 for (int32_t y = 0; y < size.height; y++) {
839 // Not using ContainsY(y) because we do not skip y == skipRect.Y()
840 // although that may not be done on purpose
841 bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost();
843 uint32_t* topLeftBase =
844 innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
845 uint32_t* topRightBase =
846 innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
847 uint32_t* bottomRightBase =
848 innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
849 uint32_t* bottomLeftBase =
850 innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
852 for (int32_t x = 0; x < size.width; x++) {
853 // Not using ContainsX(x) because we do not skip x == skipRect.X()
854 // although that may not be done on purpose
855 if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) {
856 x = skipRect.XMost() - 1;
857 // Trigger early jump on coming loop iterations, this will be reset
858 // next line anyway.
859 inSkipRectY = false;
860 continue;
862 int32_t topLeft = topLeftBase[x];
863 int32_t topRight = topRightBase[x];
864 int32_t bottomRight = bottomRightBase[x];
865 int32_t bottomLeft = bottomLeftBase[x];
867 uint32_t value = bottomRight - topRight - bottomLeft;
868 value += topLeft;
870 data[stride * y + x] =
871 (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
877 * Compute the box blur size (which we're calling the blur radius) from
878 * the standard deviation.
880 * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
881 * approximating a Gaussian using box blurs. This yields quite a good
882 * approximation for a Gaussian. Then we multiply this by 1.5 since our
883 * code wants the radius of the entire triple-box-blur kernel instead of
884 * the diameter of an individual box blur. For more details, see:
885 * http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
886 * https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
888 static const Float GAUSSIAN_SCALE_FACTOR =
889 Float((3 * sqrt(2 * M_PI) / 4) * 1.5);
891 IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) {
892 IntSize size(
893 static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
894 static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));
896 return size;
899 Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) {
900 return aBlurRadius / GAUSSIAN_SCALE_FACTOR;
903 } // namespace gfx
904 } // namespace mozilla