1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 /* This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "Blur.h"

#include "SSEHelpers.h"
13 namespace mozilla::gfx
{
16 __m128i
Divide(__m128i aValues
, __m128i aDivisor
) {
17 const __m128i mask
= _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
21 } roundingAddition
= {{int64_t(1) << 31, int64_t(1) << 31}};
23 __m128i multiplied31
= _mm_mul_epu32(aValues
, aDivisor
);
24 __m128i multiplied42
= _mm_mul_epu32(_mm_srli_epi64(aValues
, 32), aDivisor
);
26 // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
29 _mm_srli_epi64(_mm_add_epi64(multiplied31
, roundingAddition
.m
), 32);
31 _mm_and_si128(_mm_add_epi64(multiplied42
, roundingAddition
.m
), mask
);
32 __m128i p4321
= _mm_or_si128(p_3_1
, p4_2_
);
37 __m128i
BlurFourPixels(const __m128i
& aTopLeft
, const __m128i
& aTopRight
,
38 const __m128i
& aBottomRight
, const __m128i
& aBottomLeft
,
39 const __m128i
& aDivisor
) {
40 __m128i values
= _mm_add_epi32(
41 _mm_sub_epi32(_mm_sub_epi32(aBottomRight
, aTopRight
), aBottomLeft
),
43 return Divide(values
, aDivisor
);
47 void LoadIntegralRowFromRow(uint32_t* aDest
, const uint8_t* aSource
,
48 int32_t aSourceWidth
, int32_t aLeftInflation
,
49 int32_t aRightInflation
) {
50 int32_t currentRowSum
= 0;
52 for (int x
= 0; x
< aLeftInflation
; x
++) {
53 currentRowSum
+= aSource
[0];
54 aDest
[x
] = currentRowSum
;
56 for (int x
= aLeftInflation
; x
< (aSourceWidth
+ aLeftInflation
); x
++) {
57 currentRowSum
+= aSource
[(x
- aLeftInflation
)];
58 aDest
[x
] = currentRowSum
;
60 for (int x
= (aSourceWidth
+ aLeftInflation
);
61 x
< (aSourceWidth
+ aLeftInflation
+ aRightInflation
); x
++) {
62 currentRowSum
+= aSource
[aSourceWidth
- 1];
63 aDest
[x
] = currentRowSum
;
67 // This function calculates an integral of four pixels stored in the 4
68 // 32-bit integers on aPixels. i.e. for { 30, 50, 80, 100 } this returns
69 // { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
72 __m128i
AccumulatePixelSums(__m128i aPixels
) {
73 __m128i sumPixels
= aPixels
;
74 __m128i currentPixels
= _mm_slli_si128(aPixels
, 4);
75 sumPixels
= _mm_add_epi32(sumPixels
, currentPixels
);
76 currentPixels
= _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels
);
78 return _mm_add_epi32(sumPixels
, currentPixels
);
81 MOZ_ALWAYS_INLINE
void GenerateIntegralImage_SSE2(
82 int32_t aLeftInflation
, int32_t aRightInflation
, int32_t aTopInflation
,
83 int32_t aBottomInflation
, uint32_t* aIntegralImage
,
84 size_t aIntegralImageStride
, uint8_t* aSource
, int32_t aSourceStride
,
85 const IntSize
& aSize
) {
86 MOZ_ASSERT(!(aLeftInflation
& 3));
88 uint32_t stride32bit
= aIntegralImageStride
/ 4;
90 IntSize
integralImageSize(aSize
.width
+ aLeftInflation
+ aRightInflation
,
91 aSize
.height
+ aTopInflation
+ aBottomInflation
);
93 LoadIntegralRowFromRow(aIntegralImage
, aSource
, aSize
.width
, aLeftInflation
,
96 for (int y
= 1; y
< aTopInflation
+ 1; y
++) {
97 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
98 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
99 uint32_t* intFirstRow
= aIntegralImage
;
101 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
102 __m128i firstRow
= _mm_load_si128((__m128i
*)(intFirstRow
+ x
));
103 __m128i previousRow
= _mm_load_si128((__m128i
*)(intPrevRow
+ x
));
104 _mm_store_si128((__m128i
*)(intRow
+ x
),
105 _mm_add_epi32(firstRow
, previousRow
));
109 for (int y
= aTopInflation
+ 1; y
< (aSize
.height
+ aTopInflation
); y
++) {
110 __m128i currentRowSum
= _mm_setzero_si128();
111 uint32_t* intRow
= aIntegralImage
+ (y
* stride32bit
);
112 uint32_t* intPrevRow
= aIntegralImage
+ (y
- 1) * stride32bit
;
113 uint8_t* sourceRow
= aSource
+ aSourceStride
* (y
- aTopInflation
);
115 uint32_t pixel
= sourceRow
[0];
116 for (int x
= 0; x
< aLeftInflation
; x
+= 4) {
117 __m128i sumPixels
= AccumulatePixelSums(
118 _mm_shuffle_epi32(_mm_set1_epi32(pixel
), _MM_SHUFFLE(0, 0, 0, 0)));
120 sumPixels
= _mm_add_epi32(sumPixels
, currentRowSum
);
122 currentRowSum
= _mm_shuffle_epi32(sumPixels
, _MM_SHUFFLE(3, 3, 3, 3));
125 (__m128i
*)(intRow
+ x
),
126 _mm_add_epi32(sumPixels
, _mm_load_si128((__m128i
*)(intPrevRow
+ x
))));
128 for (int x
= aLeftInflation
; x
< (aSize
.width
+ aLeftInflation
); x
+= 4) {
129 uint32_t pixels
= *(uint32_t*)(sourceRow
+ (x
- aLeftInflation
));
131 // It's important to shuffle here. When we exit this loop currentRowSum
132 // has to be set to sumPixels, so that the following loop can get the
133 // correct pixel for the currentRowSum. The highest order pixel in
134 // currentRowSum could've originated from accumulation in the stride.
135 currentRowSum
= _mm_shuffle_epi32(currentRowSum
, _MM_SHUFFLE(3, 3, 3, 3));
137 __m128i sumPixels
= AccumulatePixelSums(_mm_unpacklo_epi16(
138 _mm_unpacklo_epi8(_mm_set1_epi32(pixels
), _mm_setzero_si128()),
139 _mm_setzero_si128()));
140 sumPixels
= _mm_add_epi32(sumPixels
, currentRowSum
);
142 currentRowSum
= sumPixels
;
145 (__m128i
*)(intRow
+ x
),
146 _mm_add_epi32(sumPixels
, _mm_load_si128((__m128i
*)(intPrevRow
+ x
))));
149 pixel
= sourceRow
[aSize
.width
- 1];
150 int x
= (aSize
.width
+ aLeftInflation
);
151 if ((aSize
.width
& 3)) {
152 // Deal with unaligned portion. Get the correct pixel from currentRowSum,
153 // see explanation above.
154 uint32_t intCurrentRowSum
=
155 ((uint32_t*)¤tRowSum
)[(aSize
.width
% 4) - 1];
156 for (; x
< integralImageSize
.width
; x
++) {
157 // We could be unaligned here!
160 currentRowSum
= _mm_set1_epi32(intCurrentRowSum
);
163 intCurrentRowSum
+= pixel
;
164 intRow
[x
] = intPrevRow
[x
] + intCurrentRowSum
;
167 currentRowSum
= _mm_shuffle_epi32(currentRowSum
, _MM_SHUFFLE(3, 3, 3, 3));
169 for (; x
< integralImageSize
.width
; x
+= 4) {
170 __m128i sumPixels
= AccumulatePixelSums(_mm_set1_epi32(pixel
));
172 sumPixels
= _mm_add_epi32(sumPixels
, currentRowSum
);
174 currentRowSum
= _mm_shuffle_epi32(sumPixels
, _MM_SHUFFLE(3, 3, 3, 3));
177 (__m128i
*)(intRow
+ x
),
178 _mm_add_epi32(sumPixels
, _mm_load_si128((__m128i
*)(intPrevRow
+ x
))));
182 if (aBottomInflation
) {
183 // Store the last valid row of our source image in the last row of
184 // our integral image. This will be overwritten with the correct values
185 // in the upcoming loop.
186 LoadIntegralRowFromRow(
187 aIntegralImage
+ (integralImageSize
.height
- 1) * stride32bit
,
188 aSource
+ (aSize
.height
- 1) * aSourceStride
, aSize
.width
,
189 aLeftInflation
, aRightInflation
);
191 for (int y
= aSize
.height
+ aTopInflation
; y
< integralImageSize
.height
;
193 __m128i
* intRow
= (__m128i
*)(aIntegralImage
+ (y
* stride32bit
));
194 __m128i
* intPrevRow
= (__m128i
*)(aIntegralImage
+ (y
- 1) * stride32bit
);
195 __m128i
* intLastRow
=
196 (__m128i
*)(aIntegralImage
+
197 (integralImageSize
.height
- 1) * stride32bit
);
199 for (int x
= 0; x
< integralImageSize
.width
; x
+= 4) {
200 _mm_store_si128(intRow
+ (x
/ 4),
201 _mm_add_epi32(_mm_load_si128(intLastRow
+ (x
/ 4)),
202 _mm_load_si128(intPrevRow
+ (x
/ 4))));
209 * Attempt to do an in-place box blur using an integral image.
211 void AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData
, int32_t aLeftLobe
,
212 int32_t aRightLobe
, int32_t aTopLobe
,
213 int32_t aBottomLobe
, uint32_t* aIntegralImage
,
214 size_t aIntegralImageStride
) const {
215 IntSize size
= GetSize();
217 MOZ_ASSERT(size
.height
> 0);
219 // Our 'left' or 'top' lobe will include the current pixel. i.e. when
220 // looking at an integral image the value of a pixel at 'x,y' is calculated
221 // using the value of the integral image values above/below that.
224 int32_t boxSize
= (aLeftLobe
+ aRightLobe
) * (aTopLobe
+ aBottomLobe
);
226 MOZ_ASSERT(boxSize
> 0);
232 uint32_t reciprocal
= uint32_t((uint64_t(1) << 32) / boxSize
);
234 uint32_t stride32bit
= aIntegralImageStride
/ 4;
235 int32_t leftInflation
= RoundUpToMultipleOf4(aLeftLobe
).value();
237 GenerateIntegralImage_SSE2(leftInflation
, aRightLobe
, aTopLobe
, aBottomLobe
,
238 aIntegralImage
, aIntegralImageStride
, aData
,
241 __m128i divisor
= _mm_set1_epi32(reciprocal
);
243 // This points to the start of the rectangle within the IntegralImage that
244 // overlaps the surface being blurred.
245 uint32_t* innerIntegral
=
246 aIntegralImage
+ (aTopLobe
* stride32bit
) + leftInflation
;
248 IntRect skipRect
= mSkipRect
;
249 int32_t stride
= mStride
;
250 uint8_t* data
= aData
;
251 for (int32_t y
= 0; y
< size
.height
; y
++) {
252 // Not using ContainsY(y) because we do not skip y == skipRect.Y()
253 // although that may not be done on purpose
254 bool inSkipRectY
= y
> skipRect
.Y() && y
< skipRect
.YMost();
256 uint32_t* topLeftBase
=
257 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
258 uint32_t* topRightBase
=
259 innerIntegral
+ ((y
- aTopLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
260 uint32_t* bottomRightBase
=
262 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) + aRightLobe
);
263 uint32_t* bottomLeftBase
=
265 ((y
+ aBottomLobe
) * ptrdiff_t(stride32bit
) - aLeftLobe
);
268 // Process 16 pixels at a time for as long as possible.
269 for (; x
<= size
.width
- 16; x
+= 16) {
270 // Not using ContainsX(x) because we do not skip x == skipRect.X()
271 // although that may not be done on purpose
272 if (inSkipRectY
&& x
> skipRect
.X() && x
< skipRect
.XMost()) {
273 x
= skipRect
.XMost() - 16;
274 // Trigger early jump on coming loop iterations, this will be reset
285 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
));
286 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
));
287 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
));
288 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
));
290 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
292 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 4));
293 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 4));
294 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 4));
295 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 4));
297 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
299 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 8));
300 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 8));
301 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 8));
302 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 8));
304 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
306 topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
+ 12));
307 topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
+ 12));
308 bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
+ 12));
309 bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
+ 12));
311 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
313 __m128i final
= _mm_packus_epi16(_mm_packs_epi32(result1
, result2
),
314 _mm_packs_epi32(result3
, result4
));
316 _mm_storeu_si128((__m128i
*)(data
+ stride
* y
+ x
), final
);
319 // Process the remaining pixels 4 bytes at a time.
320 for (; x
< size
.width
; x
+= 4) {
321 // Not using Containsx(x) because we do not skip x == skipRect.X()
322 // although that may not be done on purpose
323 if (inSkipRectY
&& x
> skipRect
.X() && x
< skipRect
.XMost()) {
324 x
= skipRect
.XMost() - 4;
325 // Trigger early jump on coming loop iterations, this will be reset
330 __m128i topLeft
= loadUnaligned128((__m128i
*)(topLeftBase
+ x
));
331 __m128i topRight
= loadUnaligned128((__m128i
*)(topRightBase
+ x
));
332 __m128i bottomRight
= loadUnaligned128((__m128i
*)(bottomRightBase
+ x
));
333 __m128i bottomLeft
= loadUnaligned128((__m128i
*)(bottomLeftBase
+ x
));
336 BlurFourPixels(topLeft
, topRight
, bottomRight
, bottomLeft
, divisor
);
337 __m128i final
= _mm_packus_epi16(
338 _mm_packs_epi32(result
, _mm_setzero_si128()), _mm_setzero_si128());
340 *(uint32_t*)(data
+ stride
* y
+ x
) = _mm_cvtsi128_si32(final
);
345 } // namespace mozilla::gfx