gfx/2d/Blur.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "Blur.h"
   8
   9 #include <algorithm>
  10 #include <math.h>
  11 #include <string.h>
  12
  13 #include "mozilla/CheckedInt.h"
  14 #include "NumericTools.h"
  15
  16 #include "2D.h"
  17 #include "DataSurfaceHelpers.h"
  18 #include "Tools.h"
  19
  20 #ifdef USE_NEON
  21 #  include "mozilla/arm.h"
  22 #endif
  23
  24 namespace mozilla {
  25 namespace gfx {
  26
  27 /**
  28  * Helper function to process each row of the box blur.
  29  * It takes care of transposing the data on input or output depending
  30  * on whether we intend a horizontal or vertical blur, and whether we're
  31  * reading from the initial source or writing to the final destination.
  32  * It allows starting or ending anywhere within the row to accomodate
  33  * a skip rect.
  34  */
  35 template <bool aTransposeInput, bool aTransposeOutput>
  36 static inline void BoxBlurRow(const uint8_t* aInput, uint8_t* aOutput,
  37                               int32_t aLeftLobe, int32_t aRightLobe,
  38                               int32_t aWidth, int32_t aStride, int32_t aStart,
  39                               int32_t aEnd) {
  40   // If the input or output is transposed, then we will move down a row
  41   // for each step, instead of moving over a column. Since these values
  42   // only depend on a template parameter, they will more easily get
  43   // copy-propagated in the non-transposed case, which is why they
  44   // are not passed as parameters.
  45   const int32_t inputStep = aTransposeInput ? aStride : 1;
  46   const int32_t outputStep = aTransposeOutput ? aStride : 1;
  47
  48   // We need to sample aLeftLobe pixels to the left and aRightLobe pixels
  49   // to the right of the current position, then average them. So this is
  50   // the size of the total width of this filter.
  51   const int32_t boxSize = aLeftLobe + aRightLobe + 1;
  52
  53   // Instead of dividing the pixel sum by boxSize to average, we can just
  54   // compute a scale that will normalize the result so that it can be quickly
  55   // shifted into the desired range.
  56   const uint32_t reciprocal = (1 << 24) / boxSize;
  57
  58   // The shift would normally truncate the result, whereas we would rather
  59   // prefer to round the result to the closest increment. By adding 0.5 units
  60   // to the initial sum, we bias the sum so that it will be rounded by the
  61   // truncation instead.
  62   uint32_t alphaSum = (boxSize + 1) / 2;
  63
  64   // We process the row with a moving filter, keeping a sum (alphaSum) of
  65   // boxSize pixels. As we move over a pixel, we need to add on a pixel
  66   // from the right extreme of the window that moved into range, and subtract
  67   // off a pixel from the left extreme of window that moved out of range.
  68   // But first, we need to initialization alphaSum to the contents of
  69   // the window before we can get going. If the window moves out of bounds
  70   // of the row, we clamp each sample to be the closest pixel from within
  71   // row bounds, so the 0th and aWidth-1th pixel.
  72   int32_t initLeft = aStart - aLeftLobe;
  73   if (initLeft < 0) {
  74     // If the left lobe samples before the row, add in clamped samples.
  75     alphaSum += -initLeft * aInput[0];
  76     initLeft = 0;
  77   }
  78   int32_t initRight = aStart + boxSize - aLeftLobe;
  79   if (initRight > aWidth) {
  80     // If the right lobe samples after the row, add in clamped samples.
  81     alphaSum += (initRight - aWidth) * aInput[(aWidth - 1) * inputStep];
  82     initRight = aWidth;
  83   }
  84   // Finally, add in all the valid, non-clamped samples to fill up the
  85   // rest of the window.
  86   const uint8_t* src = &aInput[initLeft * inputStep];
  87   const uint8_t* iterEnd = &aInput[initRight * inputStep];
  88
  89 #define INIT_ITER   \
  90   alphaSum += *src; \
  91   src += inputStep;
  92
  93   // We unroll the per-pixel loop here substantially. The amount of work
  94   // done per sample is so small that the cost of a loop condition check
  95   // and a branch can substantially add to or even dominate the performance
  96   // of the loop.
  97   while (src + 16 * inputStep <= iterEnd) {
  98     INIT_ITER;
  99     INIT_ITER;
 100     INIT_ITER;
 101     INIT_ITER;
 102     INIT_ITER;
 103     INIT_ITER;
 104     INIT_ITER;
 105     INIT_ITER;
 106     INIT_ITER;
 107     INIT_ITER;
 108     INIT_ITER;
 109     INIT_ITER;
 110     INIT_ITER;
 111     INIT_ITER;
 112     INIT_ITER;
 113     INIT_ITER;
 114   }
 115   while (src < iterEnd) {
 116     INIT_ITER;
 117   }
 118
 119   // Now we start moving the window over the row. We will be accessing
 120   // pixels form aStart - aLeftLobe up to aEnd + aRightLobe, which may be
 121   // out of bounds of the row. To avoid having to check within the inner
 122   // loops if we are in bound, we instead compute the points at which
 123   // we will move out of bounds of the row on the left side (splitLeft)
 124   // and right side (splitRight).
 125   int32_t splitLeft = std::min(std::max(aLeftLobe, aStart), aEnd);
 126   int32_t splitRight =
 127       std::min(std::max(aWidth - (boxSize - aLeftLobe), aStart), aEnd);
 128   // If the filter window is actually large than the size of the row,
 129   // there will be a middle area of overlap where the leftmost and rightmost
 130   // pixel of the filter will both be outside the row. In this case, we need
 131   // to invert the splits so that splitLeft <= splitRight.
 132   if (boxSize > aWidth) {
 133     std::swap(splitLeft, splitRight);
 134   }
 135
 136   // Process all pixels up to splitLeft that would sample before the start of
 137   // the row. Note that because inputStep and outputStep may not be a const 1
 138   // value, it is more performant to increment pointers here for the source and
 139   // destination rather than use a loop counter, since doing so would entail an
 140   // expensive multiplication that significantly slows down the loop.
 141   uint8_t* dst = &aOutput[aStart * outputStep];
 142   iterEnd = &aOutput[splitLeft * outputStep];
 143   src = &aInput[(aStart + boxSize - aLeftLobe) * inputStep];
 144   uint8_t firstVal = aInput[0];
 145
 146 #define LEFT_ITER                       \
 147   *dst = (alphaSum * reciprocal) >> 24; \
 148   alphaSum += *src - firstVal;          \
 149   dst += outputStep;                    \
 150   src += inputStep;
 151
 152   while (dst + 16 * outputStep <= iterEnd) {
 153     LEFT_ITER;
 154     LEFT_ITER;
 155     LEFT_ITER;
 156     LEFT_ITER;
 157     LEFT_ITER;
 158     LEFT_ITER;
 159     LEFT_ITER;
 160     LEFT_ITER;
 161     LEFT_ITER;
 162     LEFT_ITER;
 163     LEFT_ITER;
 164     LEFT_ITER;
 165     LEFT_ITER;
 166     LEFT_ITER;
 167     LEFT_ITER;
 168     LEFT_ITER;
 169   }
 170   while (dst < iterEnd) {
 171     LEFT_ITER;
 172   }
 173
 174   // Process all pixels between splitLeft and splitRight.
 175   iterEnd = &aOutput[splitRight * outputStep];
 176   if (boxSize <= aWidth) {
 177     // The filter window is smaller than the row size, so the leftmost and
 178     // rightmost samples are both within row bounds.
 179     src = &aInput[(splitLeft - aLeftLobe) * inputStep];
 180     int32_t boxStep = boxSize * inputStep;
 181
 182 #define CENTER_ITER                     \
 183   *dst = (alphaSum * reciprocal) >> 24; \
 184   alphaSum += src[boxStep] - *src;      \
 185   dst += outputStep;                    \
 186   src += inputStep;
 187
 188     while (dst + 16 * outputStep <= iterEnd) {
 189       CENTER_ITER;
 190       CENTER_ITER;
 191       CENTER_ITER;
 192       CENTER_ITER;
 193       CENTER_ITER;
 194       CENTER_ITER;
 195       CENTER_ITER;
 196       CENTER_ITER;
 197       CENTER_ITER;
 198       CENTER_ITER;
 199       CENTER_ITER;
 200       CENTER_ITER;
 201       CENTER_ITER;
 202       CENTER_ITER;
 203       CENTER_ITER;
 204       CENTER_ITER;
 205     }
 206     while (dst < iterEnd) {
 207       CENTER_ITER;
 208     }
 209   } else {
 210     // The filter window is larger than the row size, and we're in the area of
 211     // split overlap. So the leftmost and rightmost samples are both out of
 212     // bounds and need to be clamped. We can just precompute the difference here
 213     // consequently.
 214     int32_t firstLastDiff = aInput[(aWidth - 1) * inputStep] - aInput[0];
 215     while (dst < iterEnd) {
 216       *dst = (alphaSum * reciprocal) >> 24;
 217       alphaSum += firstLastDiff;
 218       dst += outputStep;
 219     }
 220   }
 221
 222   // Process all remaining pixels after splitRight that would sample after the
 223   // row end.
 224   iterEnd = &aOutput[aEnd * outputStep];
 225   src = &aInput[(splitRight - aLeftLobe) * inputStep];
 226   uint8_t lastVal = aInput[(aWidth - 1) * inputStep];
 227
 228 #define RIGHT_ITER                      \
 229   *dst = (alphaSum * reciprocal) >> 24; \
 230   alphaSum += lastVal - *src;           \
 231   dst += outputStep;                    \
 232   src += inputStep;
 233
 234   while (dst + 16 * outputStep <= iterEnd) {
 235     RIGHT_ITER;
 236     RIGHT_ITER;
 237     RIGHT_ITER;
 238     RIGHT_ITER;
 239     RIGHT_ITER;
 240     RIGHT_ITER;
 241     RIGHT_ITER;
 242     RIGHT_ITER;
 243     RIGHT_ITER;
 244     RIGHT_ITER;
 245     RIGHT_ITER;
 246     RIGHT_ITER;
 247     RIGHT_ITER;
 248     RIGHT_ITER;
 249     RIGHT_ITER;
 250     RIGHT_ITER;
 251   }
 252   while (dst < iterEnd) {
 253     RIGHT_ITER;
 254   }
 255 }
 256
 257 /**
 258  * Box blur involves looking at one pixel, and setting its value to the average
 259  * of its neighbouring pixels. This is meant to provide a 3-pass approximation
 260  * of a Gaussian blur.
 261  * @param aTranspose Whether to transpose the buffer when reading and writing
 262  *                   to it.
 263  * @param aData The buffer to be blurred.
 264  * @param aLobes The number of pixels to blend on the left and right for each of
 265  *               3 passes.
 266  * @param aWidth The number of columns in the buffers.
 267  * @param aRows The number of rows in the buffers.
 268  * @param aStride The stride of the buffer.
 269  */
 270 template <bool aTranspose>
 271 static void BoxBlur(uint8_t* aData, const int32_t aLobes[3][2], int32_t aWidth,
 272                     int32_t aRows, int32_t aStride, IntRect aSkipRect) {
 273   if (aTranspose) {
 274     std::swap(aWidth, aRows);
 275     aSkipRect.Swap();
 276   }
 277
 278   MOZ_ASSERT(aWidth > 0);
 279
 280   // All three passes of the box blur that approximate the Gaussian are done
 281   // on each row in turn, so we only need two temporary row buffers to process
 282   // each row, instead of a full-sized buffer. Data moves from the source to the
 283   // first temporary, from the first temporary to the second, then from the
 284   // second back to the destination. This way is more cache-friendly than
 285   // processing whe whole buffer in each pass and thus yields a nice speedup.
 286   uint8_t* tmpRow = new (std::nothrow) uint8_t[2 * aWidth];
 287   if (!tmpRow) {
 288     return;
 289   }
 290   uint8_t* tmpRow2 = tmpRow + aWidth;
 291
 292   const int32_t stride = aTranspose ? 1 : aStride;
 293   bool skipRectCoversWholeRow =
 294       0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
 295
 296   for (int32_t y = 0; y < aRows; y++) {
 297     // Check whether the skip rect intersects this row. If the skip
 298     // rect covers the whole surface in this row, we can avoid
 299     // this row entirely (and any others along the skip rect).
 300     bool inSkipRectY = aSkipRect.ContainsY(y);
 301     if (inSkipRectY && skipRectCoversWholeRow) {
 302       aData += stride * (aSkipRect.YMost() - y);
 303       y = aSkipRect.YMost() - 1;
 304       continue;
 305     }
 306
 307     // Read in data from the source transposed if necessary.
 308     BoxBlurRow<aTranspose, false>(aData, tmpRow, aLobes[0][0], aLobes[0][1],
 309                                   aWidth, aStride, 0, aWidth);
 310
 311     // For the middle pass, the data is already pre-transposed and does not need
 312     // to be post-transposed yet.
 313     BoxBlurRow<false, false>(tmpRow, tmpRow2, aLobes[1][0], aLobes[1][1],
 314                              aWidth, aStride, 0, aWidth);
 315
 316     // Write back data to the destination transposed if necessary too.
 317     // Make sure not to overwrite the skip rect by only outputting to the
 318     // destination before and after the skip rect, if requested.
 319     int32_t skipStart =
 320         inSkipRectY ? std::min(std::max(aSkipRect.X(), 0), aWidth) : aWidth;
 321     int32_t skipEnd = std::max(skipStart, aSkipRect.XMost());
 322     if (skipStart > 0) {
 323       BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
 324                                     aWidth, aStride, 0, skipStart);
 325     }
 326     if (skipEnd < aWidth) {
 327       BoxBlurRow<false, aTranspose>(tmpRow2, aData, aLobes[2][0], aLobes[2][1],
 328                                     aWidth, aStride, skipEnd, aWidth);
 329     }
 330
 331     aData += stride;
 332   }
 333
 334   delete[] tmpRow;
 335 }
 336
 337 static void ComputeLobes(int32_t aRadius, int32_t aLobes[3][2]) {
 338   int32_t major, minor, final;
 339
 340   /* See http://www.w3.org/TR/SVG/filters.html#feGaussianBlur for
 341    * some notes about approximating the Gaussian blur with box-blurs.
 342    * The comments below are in the terminology of that page.
 343    */
 344   int32_t z = aRadius / 3;
 345   switch (aRadius % 3) {
 346     case 0:
 347       // aRadius = z*3; choose d = 2*z + 1
 348       major = minor = final = z;
 349       break;
 350     case 1:
 351       // aRadius = z*3 + 1
 352       // This is a tricky case since there is no value of d which will
 353       // yield a radius of exactly aRadius. If d is odd, i.e. d=2*k + 1
 354       // for some integer k, then the radius will be 3*k. If d is even,
 355       // i.e. d=2*k, then the radius will be 3*k - 1.
 356       // So we have to choose values that don't match the standard
 357       // algorithm.
 358       major = z + 1;
 359       minor = final = z;
 360       break;
 361     case 2:
 362       // aRadius = z*3 + 2; choose d = 2*z + 2
 363       major = final = z + 1;
 364       minor = z;
 365       break;
 366     default:
 367       // Mathematical impossibility!
 368       MOZ_ASSERT(false);
 369       major = minor = final = 0;
 370   }
 371   MOZ_ASSERT(major + minor + final == aRadius);
 372
 373   aLobes[0][0] = major;
 374   aLobes[0][1] = minor;
 375   aLobes[1][0] = minor;
 376   aLobes[1][1] = major;
 377   aLobes[2][0] = final;
 378   aLobes[2][1] = final;
 379 }
 380
 381 static void SpreadHorizontal(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
 382                              int32_t aWidth, int32_t aRows, int32_t aStride,
 383                              const IntRect& aSkipRect) {
 384   if (aRadius == 0) {
 385     memcpy(aOutput, aInput, aStride * aRows);
 386     return;
 387   }
 388
 389   bool skipRectCoversWholeRow =
 390       0 >= aSkipRect.X() && aWidth <= aSkipRect.XMost();
 391   for (int32_t y = 0; y < aRows; y++) {
 392     // Check whether the skip rect intersects this row. If the skip
 393     // rect covers the whole surface in this row, we can avoid
 394     // this row entirely (and any others along the skip rect).
 395     bool inSkipRectY = aSkipRect.ContainsY(y);
 396     if (inSkipRectY && skipRectCoversWholeRow) {
 397       y = aSkipRect.YMost() - 1;
 398       continue;
 399     }
 400
 401     for (int32_t x = 0; x < aWidth; x++) {
 402       // Check whether we are within the skip rect. If so, go
 403       // to the next point outside the skip rect.
 404       if (inSkipRectY && aSkipRect.ContainsX(x)) {
 405         x = aSkipRect.XMost();
 406         if (x >= aWidth) break;
 407       }
 408
 409       int32_t sMin = std::max(x - aRadius, 0);
 410       int32_t sMax = std::min(x + aRadius, aWidth - 1);
 411       int32_t v = 0;
 412       for (int32_t s = sMin; s <= sMax; ++s) {
 413         v = std::max<int32_t>(v, aInput[aStride * y + s]);
 414       }
 415       aOutput[aStride * y + x] = v;
 416     }
 417   }
 418 }
 419
 420 static void SpreadVertical(uint8_t* aInput, uint8_t* aOutput, int32_t aRadius,
 421                            int32_t aWidth, int32_t aRows, int32_t aStride,
 422                            const IntRect& aSkipRect) {
 423   if (aRadius == 0) {
 424     memcpy(aOutput, aInput, aStride * aRows);
 425     return;
 426   }
 427
 428   bool skipRectCoversWholeColumn =
 429       0 >= aSkipRect.Y() && aRows <= aSkipRect.YMost();
 430   for (int32_t x = 0; x < aWidth; x++) {
 431     bool inSkipRectX = aSkipRect.ContainsX(x);
 432     if (inSkipRectX && skipRectCoversWholeColumn) {
 433       x = aSkipRect.XMost() - 1;
 434       continue;
 435     }
 436
 437     for (int32_t y = 0; y < aRows; y++) {
 438       // Check whether we are within the skip rect. If so, go
 439       // to the next point outside the skip rect.
 440       if (inSkipRectX && aSkipRect.ContainsY(y)) {
 441         y = aSkipRect.YMost();
 442         if (y >= aRows) break;
 443       }
 444
 445       int32_t sMin = std::max(y - aRadius, 0);
 446       int32_t sMax = std::min(y + aRadius, aRows - 1);
 447       int32_t v = 0;
 448       for (int32_t s = sMin; s <= sMax; ++s) {
 449         v = std::max<int32_t>(v, aInput[aStride * s + x]);
 450       }
 451       aOutput[aStride * y + x] = v;
 452     }
 453   }
 454 }
 455
 456 CheckedInt<int32_t> AlphaBoxBlur::RoundUpToMultipleOf4(int32_t aVal) {
 457   CheckedInt<int32_t> val(aVal);
 458
 459   val += 3;
 460   val /= 4;
 461   val *= 4;
 462
 463   return val;
 464 }
 465
 466 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, const IntSize& aSpreadRadius,
 467                            const IntSize& aBlurRadius, const Rect* aDirtyRect,
 468                            const Rect* aSkipRect)
 469     : mStride(0), mSurfaceAllocationSize(0) {
 470   Init(aRect, aSpreadRadius, aBlurRadius, aDirtyRect, aSkipRect);
 471 }
 472
 473 AlphaBoxBlur::AlphaBoxBlur()
 474     : mStride(0), mSurfaceAllocationSize(0), mHasDirtyRect(false) {}
 475
 476 void AlphaBoxBlur::Init(const Rect& aRect, const IntSize& aSpreadRadius,
 477                         const IntSize& aBlurRadius, const Rect* aDirtyRect,
 478                         const Rect* aSkipRect) {
 479   mSpreadRadius = aSpreadRadius;
 480   mBlurRadius = aBlurRadius;
 481
 482   Rect rect(aRect);
 483   rect.Inflate(Size(aBlurRadius + aSpreadRadius));
 484   rect.RoundOut();
 485
 486   if (aDirtyRect) {
 487     // If we get passed a dirty rect from layout, we can minimize the
 488     // shadow size and make painting faster.
 489     mHasDirtyRect = true;
 490     mDirtyRect = *aDirtyRect;
 491     Rect requiredBlurArea = mDirtyRect.Intersect(rect);
 492     requiredBlurArea.Inflate(Size(aBlurRadius + aSpreadRadius));
 493     rect = requiredBlurArea.Intersect(rect);
 494   } else {
 495     mHasDirtyRect = false;
 496   }
 497
 498   mRect = TruncatedToInt(rect);
 499   if (mRect.IsEmpty()) {
 500     return;
 501   }
 502
 503   if (aSkipRect) {
 504     // If we get passed a skip rect, we can lower the amount of
 505     // blurring/spreading we need to do. We convert it to IntRect to avoid
 506     // expensive int<->float conversions if we were to use Rect instead.
 507     Rect skipRect = *aSkipRect;
 508     skipRect.Deflate(Size(aBlurRadius + aSpreadRadius));
 509     mSkipRect = RoundedIn(skipRect);
 510     mSkipRect = mSkipRect.Intersect(mRect);
 511     if (mSkipRect.IsEqualInterior(mRect)) {
 512       return;
 513     }
 514
 515     mSkipRect -= mRect.TopLeft();
 516     // Ensure the skip rect is 4-pixel-aligned in the x axis, so that all our
 517     // accesses later are aligned as well, see bug 1622113.
 518     mSkipRect.SetLeftEdge(RoundUpToMultiple(mSkipRect.X(), 4));
 519     mSkipRect.SetRightEdge(RoundDownToMultiple(mSkipRect.XMost(), 4));
 520     if (mSkipRect.IsEmpty()) {
 521       mSkipRect = IntRect();
 522     }
 523   } else {
 524     mSkipRect = IntRect();
 525   }
 526
 527   CheckedInt<int32_t> stride = RoundUpToMultipleOf4(mRect.Width());
 528   if (stride.isValid()) {
 529     mStride = stride.value();
 530
 531     // We need to leave room for an additional 3 bytes for a potential overrun
 532     // in our blurring code.
 533     size_t size = BufferSizeFromStrideAndHeight(mStride, mRect.Height(), 3);
 534     if (size != 0) {
 535       mSurfaceAllocationSize = size;
 536     }
 537   }
 538 }
 539
 540 AlphaBoxBlur::AlphaBoxBlur(const Rect& aRect, int32_t aStride, float aSigmaX,
 541                            float aSigmaY)
 542     : mRect(TruncatedToInt(aRect)),
 543
 544       mBlurRadius(CalculateBlurRadius(Point(aSigmaX, aSigmaY))),
 545       mStride(aStride),
 546       mSurfaceAllocationSize(0),
 547       mHasDirtyRect(false) {
 548   IntRect intRect;
 549   if (aRect.ToIntRect(&intRect)) {
 550     size_t minDataSize =
 551         BufferSizeFromStrideAndHeight(intRect.Width(), intRect.Height());
 552     if (minDataSize != 0) {
 553       mSurfaceAllocationSize = minDataSize;
 554     }
 555   }
 556 }
 557
 558 AlphaBoxBlur::~AlphaBoxBlur() = default;
 559
 560 IntSize AlphaBoxBlur::GetSize() const {
 561   IntSize size(mRect.Width(), mRect.Height());
 562   return size;
 563 }
 564
 565 int32_t AlphaBoxBlur::GetStride() const { return mStride; }
 566
 567 IntRect AlphaBoxBlur::GetRect() const { return mRect; }
 568
 569 Rect* AlphaBoxBlur::GetDirtyRect() {
 570   if (mHasDirtyRect) {
 571     return &mDirtyRect;
 572   }
 573
 574   return nullptr;
 575 }
 576
 577 size_t AlphaBoxBlur::GetSurfaceAllocationSize() const {
 578   return mSurfaceAllocationSize;
 579 }
 580
 581 void AlphaBoxBlur::Blur(uint8_t* aData) const {
 582   if (!aData) {
 583     return;
 584   }
 585
 586   // no need to do all this if not blurring or spreading
 587   if (mBlurRadius != IntSize(0, 0) || mSpreadRadius != IntSize(0, 0)) {
 588     int32_t stride = GetStride();
 589
 590     IntSize size = GetSize();
 591
 592     if (mSpreadRadius.width > 0 || mSpreadRadius.height > 0) {
 593       // No need to use CheckedInt here - we have validated it in the
 594       // constructor.
 595       size_t szB = stride * size.height;
 596       uint8_t* tmpData = new (std::nothrow) uint8_t[szB];
 597
 598       if (!tmpData) {
 599         return;
 600       }
 601
 602       memset(tmpData, 0, szB);
 603
 604       SpreadHorizontal(aData, tmpData, mSpreadRadius.width, size.width,
 605                        size.height, stride, mSkipRect);
 606       SpreadVertical(tmpData, aData, mSpreadRadius.height, size.width,
 607                      size.height, stride, mSkipRect);
 608
 609       delete[] tmpData;
 610     }
 611
 612     int32_t horizontalLobes[3][2];
 613     ComputeLobes(mBlurRadius.width, horizontalLobes);
 614     int32_t verticalLobes[3][2];
 615     ComputeLobes(mBlurRadius.height, verticalLobes);
 616
 617     // We want to allow for some extra space on the left for alignment reasons.
 618     int32_t maxLeftLobe =
 619         RoundUpToMultipleOf4(horizontalLobes[0][0] + 1).value();
 620
 621     IntSize integralImageSize(
 622         size.width + maxLeftLobe + horizontalLobes[1][1],
 623         size.height + verticalLobes[0][0] + verticalLobes[1][1] + 1);
 624
 625     if ((integralImageSize.width * integralImageSize.height) > (1 << 24)) {
 626       // Fallback to old blurring code when the surface is so large it may
 627       // overflow our integral image!
 628       if (mBlurRadius.width > 0) {
 629         BoxBlur<false>(aData, horizontalLobes, size.width, size.height, stride,
 630                        mSkipRect);
 631       }
 632       if (mBlurRadius.height > 0) {
 633         BoxBlur<true>(aData, verticalLobes, size.width, size.height, stride,
 634                       mSkipRect);
 635       }
 636     } else {
 637       size_t integralImageStride =
 638           GetAlignedStride<16>(integralImageSize.width, 4);
 639       if (integralImageStride == 0) {
 640         return;
 641       }
 642
 643       // We need to leave room for an additional 12 bytes for a maximum overrun
 644       // of 3 pixels in the blurring code.
 645       size_t bufLen = BufferSizeFromStrideAndHeight(
 646           integralImageStride, integralImageSize.height, 12);
 647       if (bufLen == 0) {
 648         return;
 649       }
 650       // bufLen is a byte count, but here we want a multiple of 32-bit ints, so
 651       // we divide by 4.
 652       AlignedArray<uint32_t> integralImage((bufLen / 4) +
 653                                            ((bufLen % 4) ? 1 : 0));
 654
 655       if (!integralImage) {
 656         return;
 657       }
 658
 659 #ifdef USE_SSE2
 660       if (Factory::HasSSE2()) {
 661         BoxBlur_SSE2(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 662                      verticalLobes[0][0], verticalLobes[0][1], integralImage,
 663                      integralImageStride);
 664         BoxBlur_SSE2(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 665                      verticalLobes[1][0], verticalLobes[1][1], integralImage,
 666                      integralImageStride);
 667         BoxBlur_SSE2(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 668                      verticalLobes[2][0], verticalLobes[2][1], integralImage,
 669                      integralImageStride);
 670       } else
 671 #endif
 672 #ifdef USE_NEON
 673           if (mozilla::supports_neon()) {
 674         BoxBlur_NEON(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 675                      verticalLobes[0][0], verticalLobes[0][1], integralImage,
 676                      integralImageStride);
 677         BoxBlur_NEON(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 678                      verticalLobes[1][0], verticalLobes[1][1], integralImage,
 679                      integralImageStride);
 680         BoxBlur_NEON(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 681                      verticalLobes[2][0], verticalLobes[2][1], integralImage,
 682                      integralImageStride);
 683       } else
 684 #endif
 685       {
 686 #ifdef _MIPS_ARCH_LOONGSON3A
 687         BoxBlur_LS3(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 688                     verticalLobes[0][0], verticalLobes[0][1], integralImage,
 689                     integralImageStride);
 690         BoxBlur_LS3(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 691                     verticalLobes[1][0], verticalLobes[1][1], integralImage,
 692                     integralImageStride);
 693         BoxBlur_LS3(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 694                     verticalLobes[2][0], verticalLobes[2][1], integralImage,
 695                     integralImageStride);
 696 #else
 697         BoxBlur_C(aData, horizontalLobes[0][0], horizontalLobes[0][1],
 698                   verticalLobes[0][0], verticalLobes[0][1], integralImage,
 699                   integralImageStride);
 700         BoxBlur_C(aData, horizontalLobes[1][0], horizontalLobes[1][1],
 701                   verticalLobes[1][0], verticalLobes[1][1], integralImage,
 702                   integralImageStride);
 703         BoxBlur_C(aData, horizontalLobes[2][0], horizontalLobes[2][1],
 704                   verticalLobes[2][0], verticalLobes[2][1], integralImage,
 705                   integralImageStride);
 706 #endif
 707       }
 708     }
 709   }
 710 }
 711
 712 MOZ_ALWAYS_INLINE void GenerateIntegralRow(uint32_t* aDest,
 713                                            const uint8_t* aSource,
 714                                            uint32_t* aPreviousRow,
 715                                            const uint32_t& aSourceWidth,
 716                                            const uint32_t& aLeftInflation,
 717                                            const uint32_t& aRightInflation) {
 718   uint32_t currentRowSum = 0;
 719   uint32_t pixel = aSource[0];
 720   for (uint32_t x = 0; x < aLeftInflation; x++) {
 721     currentRowSum += pixel;
 722     *aDest++ = currentRowSum + *aPreviousRow++;
 723   }
 724   for (uint32_t x = aLeftInflation; x < (aSourceWidth + aLeftInflation);
 725        x += 4) {
 726     uint32_t alphaValues = *(uint32_t*)(aSource + (x - aLeftInflation));
 727 #if defined WORDS_BIGENDIAN || defined IS_BIG_ENDIAN || defined __BIG_ENDIAN__
 728     currentRowSum += (alphaValues >> 24) & 0xff;
 729     *aDest++ = *aPreviousRow++ + currentRowSum;
 730     currentRowSum += (alphaValues >> 16) & 0xff;
 731     *aDest++ = *aPreviousRow++ + currentRowSum;
 732     currentRowSum += (alphaValues >> 8) & 0xff;
 733     *aDest++ = *aPreviousRow++ + currentRowSum;
 734     currentRowSum += alphaValues & 0xff;
 735     *aDest++ = *aPreviousRow++ + currentRowSum;
 736 #else
 737     currentRowSum += alphaValues & 0xff;
 738     *aDest++ = *aPreviousRow++ + currentRowSum;
 739     alphaValues >>= 8;
 740     currentRowSum += alphaValues & 0xff;
 741     *aDest++ = *aPreviousRow++ + currentRowSum;
 742     alphaValues >>= 8;
 743     currentRowSum += alphaValues & 0xff;
 744     *aDest++ = *aPreviousRow++ + currentRowSum;
 745     alphaValues >>= 8;
 746     currentRowSum += alphaValues & 0xff;
 747     *aDest++ = *aPreviousRow++ + currentRowSum;
 748 #endif
 749   }
 750   pixel = aSource[aSourceWidth - 1];
 751   for (uint32_t x = (aSourceWidth + aLeftInflation);
 752        x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
 753     currentRowSum += pixel;
 754     *aDest++ = currentRowSum + *aPreviousRow++;
 755   }
 756 }
 757
 758 MOZ_ALWAYS_INLINE void GenerateIntegralImage_C(
 759     int32_t aLeftInflation, int32_t aRightInflation, int32_t aTopInflation,
 760     int32_t aBottomInflation, uint32_t* aIntegralImage,
 761     size_t aIntegralImageStride, uint8_t* aSource, int32_t aSourceStride,
 762     const IntSize& aSize) {
 763   uint32_t stride32bit = aIntegralImageStride / 4;
 764
 765   IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
 766                             aSize.height + aTopInflation + aBottomInflation);
 767
 768   memset(aIntegralImage, 0, aIntegralImageStride);
 769
 770   GenerateIntegralRow(aIntegralImage, aSource, aIntegralImage, aSize.width,
 771                       aLeftInflation, aRightInflation);
 772   for (int y = 1; y < aTopInflation + 1; y++) {
 773     GenerateIntegralRow(aIntegralImage + (y * stride32bit), aSource,
 774                         aIntegralImage + (y - 1) * stride32bit, aSize.width,
 775                         aLeftInflation, aRightInflation);
 776   }
 777
 778   for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
 779     GenerateIntegralRow(aIntegralImage + (y * stride32bit),
 780                         aSource + aSourceStride * (y - aTopInflation),
 781                         aIntegralImage + (y - 1) * stride32bit, aSize.width,
 782                         aLeftInflation, aRightInflation);
 783   }
 784
 785   if (aBottomInflation) {
 786     for (int y = (aSize.height + aTopInflation); y < integralImageSize.height;
 787          y++) {
 788       GenerateIntegralRow(aIntegralImage + (y * stride32bit),
 789                           aSource + ((aSize.height - 1) * aSourceStride),
 790                           aIntegralImage + (y - 1) * stride32bit, aSize.width,
 791                           aLeftInflation, aRightInflation);
 792     }
 793   }
 794 }
 795
 796 /**
 797  * Attempt to do an in-place box blur using an integral image.
 798  */
 799 void AlphaBoxBlur::BoxBlur_C(uint8_t* aData, int32_t aLeftLobe,
 800                              int32_t aRightLobe, int32_t aTopLobe,
 801                              int32_t aBottomLobe, uint32_t* aIntegralImage,
 802                              size_t aIntegralImageStride) const {
 803   IntSize size = GetSize();
 804
 805   MOZ_ASSERT(size.width > 0);
 806
 807   // Our 'left' or 'top' lobe will include the current pixel. i.e. when
 808   // looking at an integral image the value of a pixel at 'x,y' is calculated
 809   // using the value of the integral image values above/below that.
 810   aLeftLobe++;
 811   aTopLobe++;
 812   int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
 813
 814   MOZ_ASSERT(boxSize > 0);
 815
 816   if (boxSize == 1) {
 817     return;
 818   }
 819
 820   int32_t stride32bit = aIntegralImageStride / 4;
 821
 822   int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
 823
 824   GenerateIntegralImage_C(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
 825                           aIntegralImage, aIntegralImageStride, aData, mStride,
 826                           size);
 827
 828   uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
 829
 830   uint32_t* innerIntegral =
 831       aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
 832
 833   // Storing these locally makes this about 30% faster! Presumably the compiler
 834   // can't be sure we're not altering the member variables in this loop.
 835   IntRect skipRect = mSkipRect;
 836   uint8_t* data = aData;
 837   int32_t stride = mStride;
 838   for (int32_t y = 0; y < size.height; y++) {
 839     // Not using ContainsY(y) because we do not skip y == skipRect.Y()
 840     // although that may not be done on purpose
 841     bool inSkipRectY = y > skipRect.Y() && y < skipRect.YMost();
 842
 843     uint32_t* topLeftBase =
 844         innerIntegral + ((y - aTopLobe) * stride32bit - aLeftLobe);
 845     uint32_t* topRightBase =
 846         innerIntegral + ((y - aTopLobe) * stride32bit + aRightLobe);
 847     uint32_t* bottomRightBase =
 848         innerIntegral + ((y + aBottomLobe) * stride32bit + aRightLobe);
 849     uint32_t* bottomLeftBase =
 850         innerIntegral + ((y + aBottomLobe) * stride32bit - aLeftLobe);
 851
 852     for (int32_t x = 0; x < size.width; x++) {
 853       // Not using ContainsX(x) because we do not skip x == skipRect.X()
 854       // although that may not be done on purpose
 855       if (inSkipRectY && x > skipRect.X() && x < skipRect.XMost()) {
 856         x = skipRect.XMost() - 1;
 857         // Trigger early jump on coming loop iterations, this will be reset
 858         // next line anyway.
 859         inSkipRectY = false;
 860         continue;
 861       }
 862       int32_t topLeft = topLeftBase[x];
 863       int32_t topRight = topRightBase[x];
 864       int32_t bottomRight = bottomRightBase[x];
 865       int32_t bottomLeft = bottomLeftBase[x];
 866
 867       uint32_t value = bottomRight - topRight - bottomLeft;
 868       value += topLeft;
 869
 870       data[stride * y + x] =
 871           (uint64_t(reciprocal) * value + (uint64_t(1) << 31)) >> 32;
 872     }
 873   }
 874 }
 875
 876 /**
 877  * Compute the box blur size (which we're calling the blur radius) from
 878  * the standard deviation.
 879  *
 880  * Much of this, the 3 * sqrt(2 * pi) / 4, is the known value for
 881  * approximating a Gaussian using box blurs.  This yields quite a good
 882  * approximation for a Gaussian.  Then we multiply this by 1.5 since our
 883  * code wants the radius of the entire triple-box-blur kernel instead of
 884  * the diameter of an individual box blur.  For more details, see:
 885  *   http://www.w3.org/TR/SVG11/filters.html#feGaussianBlurElement
 886  *   https://bugzilla.mozilla.org/show_bug.cgi?id=590039#c19
 887  */
 888 static const Float GAUSSIAN_SCALE_FACTOR =
 889     Float((3 * sqrt(2 * M_PI) / 4) * 1.5);
 890
 891 IntSize AlphaBoxBlur::CalculateBlurRadius(const Point& aStd) {
 892   IntSize size(
 893       static_cast<int32_t>(floor(aStd.x * GAUSSIAN_SCALE_FACTOR + 0.5f)),
 894       static_cast<int32_t>(floor(aStd.y * GAUSSIAN_SCALE_FACTOR + 0.5f)));
 895
 896   return size;
 897 }
 898
 899 Float AlphaBoxBlur::CalculateBlurSigma(int32_t aBlurRadius) {
 900   return aBlurRadius / GAUSSIAN_SCALE_FACTOR;
 901 }
 902
 903 }  // namespace gfx
 904 }  // namespace mozilla