gfx/2d/ImageScalingSSE2.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 /* This Source Code Form is subject to the terms of the Mozilla Public
   4  * License, v. 2.0. If a copy of the MPL was not distributed with this
   5  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   6
   7 #include "ImageScaling.h"
   8 #include "mozilla/Attributes.h"
   9
  10 #include "SSEHelpers.h"
  11
  12 /* The functions below use the following system for averaging 4 pixels:
  13  *
  14  * The first observation is that a half-adder is implemented as follows:
  15  * R = S + 2C or in the case of a and b (a ^ b) + ((a & b) << 1);
  16  *
   17  * This can be trivially extended to three pixels by observing that when
  18  * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
  19  * carries of the individual numbers, since the sum of 3 bits can only ever
  20  * have a carry of one.
  21  *
   22  * We then observe that the average is then ((carry << 1) + sum) >> 1, or,
   23  * assuming overflows and underflows are eliminated, carry + (sum >> 1).
  24  *
  25  * We now average our existing sum with the fourth number, so we get:
  26  * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
  27  *
  28  * We now observe that our sum has been moved into place relative to the
  29  * carry, so we can now average with the carry to get the final 4 input
  30  * average: avg = (sum2 + carry) >> 1;
  31  *
  32  * Or to reverse the proof:
  33  * avg = ((sum >> 1) + carry + d >> 1) >> 1
  34  * avg = ((a + b + c) >> 1 + d >> 1) >> 1
  35  * avg = ((a + b + c + d) >> 2)
  36  *
  37  * An additional fact used in the SSE versions is the concept that we can
  38  * trivially convert a rounded average to a truncated average:
  39  *
  40  * We have:
  41  * f(a, b) = (a + b + 1) >> 1
  42  *
  43  * And want:
  44  * g(a, b) = (a + b) >> 1
  45  *
  46  * Observe:
  47  * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
  48  *            == ~((-a - 1 + -b - 1 + 1) >> 1)
  49  *            == ~((-a - 1 + -b) >> 1)
  50  *            == ~((-(a + b) - 1) >> 1)
  51  *            == ~((~(a + b)) >> 1)
  52  *            == (a + b) >> 1
  53  *            == g(a, b)
  54  */
  55
  56 MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg) {
  57   __m128i minusone = _mm_set1_epi32(0xffffffff);
  58   return _mm_xor_si128(arg, minusone);
  59 }
  60
  61 /* We have to pass pointers here, MSVC does not allow passing more than 3
  62  * __m128i arguments on the stack. And it does not allow 16-byte aligned
  63  * stack variables. This inlines properly on MSVC 2010. It does -not- inline
  64  * with just the inline directive.
  65  */
  66 MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i* a, __m128i* b, __m128i* c,
  67                                        __m128i* d) {
  68 #define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
  69 #define shuf2 _MM_SHUFFLE(3, 1, 3, 1)
  70
  71 // This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
  72 // needs to be a compile time constant.
  73 #define shuffle_si128(arga, argb, imm)                      \
  74   _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), \
  75                                   _mm_castsi128_ps((argb)), (imm)));
  76
  77   __m128i t = shuffle_si128(*a, *b, shuf1);
  78   *b = shuffle_si128(*a, *b, shuf2);
  79   *a = t;
  80   t = shuffle_si128(*c, *d, shuf1);
  81   *d = shuffle_si128(*c, *d, shuf2);
  82   *c = t;
  83
  84 #undef shuf1
  85 #undef shuf2
  86 #undef shuffle_si128
  87
  88   __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));
  89
  90   __m128i carry =
  91       _mm_or_si128(_mm_and_si128(*a, *b),
  92                    _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));
  93
  94   sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));
  95
  96   return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
  97 }
  98
  99 MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b) {
 100   return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
 101 }
 102
 103 MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b) {
 104   __m128i t = _mm_castps_si128(_mm_shuffle_ps(
 105       _mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
 106   b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b),
 107                                       _MM_SHUFFLE(2, 0, 2, 0)));
 108   a = t;
 109
 110   return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
 111 }
 112
 113 MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c,
 114                                   uint32_t d) {
 115   uint32_t sum = a ^ b ^ c;
 116   uint32_t carry = (a & b) | (a & c) | (b & c);
 117
 118   uint32_t mask = 0xfefefefe;
 119
 120   // Not having a byte based average instruction means we should mask to avoid
 121   // underflow.
 122   sum = (((sum ^ d) & mask) >> 1) + (sum & d);
 123
 124   return (((sum ^ carry) & mask) >> 1) + (sum & carry);
 125 }
 126
 127 // Simple 2 pixel average version of the function above.
 128 MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b) {
 129   uint32_t sum = a ^ b;
 130   uint32_t carry = (a & b);
 131
 132   uint32_t mask = 0xfefefefe;
 133
 134   return ((sum & mask) >> 1) + carry;
 135 }
 136
 137 namespace mozilla::gfx {
 138
 139 void ImageHalfScaler::HalfImage2D_SSE2(uint8_t* aSource, int32_t aSourceStride,
 140                                        const IntSize& aSourceSize,
 141                                        uint8_t* aDest, uint32_t aDestStride) {
 142   const int Bpp = 4;
 143
 144   for (int y = 0; y < aSourceSize.height; y += 2) {
 145     __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
 146     int x = 0;
 147     // Run a loop depending on alignment.
 148     if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
 149         !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 150       for (; x < (aSourceSize.width - 7); x += 8) {
 151         __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 152         __m128i* lowerRow =
 153             (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 154
 155         __m128i a = _mm_load_si128(upperRow);
 156         __m128i b = _mm_load_si128(upperRow + 1);
 157         __m128i c = _mm_load_si128(lowerRow);
 158         __m128i d = _mm_load_si128(lowerRow + 1);
 159
 160         *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 161       }
 162     } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
 163       for (; x < (aSourceSize.width - 7); x += 8) {
 164         __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 165         __m128i* lowerRow =
 166             (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 167
 168         __m128i a = _mm_load_si128(upperRow);
 169         __m128i b = _mm_load_si128(upperRow + 1);
 170         __m128i c = loadUnaligned128(lowerRow);
 171         __m128i d = loadUnaligned128(lowerRow + 1);
 172
 173         *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 174       }
 175     } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 176       for (; x < (aSourceSize.width - 7); x += 8) {
 177         __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 178         __m128i* lowerRow =
 179             (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 180
 181         __m128i a = loadUnaligned128((__m128i*)upperRow);
 182         __m128i b = loadUnaligned128((__m128i*)upperRow + 1);
 183         __m128i c = _mm_load_si128((__m128i*)lowerRow);
 184         __m128i d = _mm_load_si128((__m128i*)lowerRow + 1);
 185
 186         *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 187       }
 188     } else {
 189       for (; x < (aSourceSize.width - 7); x += 8) {
 190         __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 191         __m128i* lowerRow =
 192             (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 193
 194         __m128i a = loadUnaligned128(upperRow);
 195         __m128i b = loadUnaligned128(upperRow + 1);
 196         __m128i c = loadUnaligned128(lowerRow);
 197         __m128i d = loadUnaligned128(lowerRow + 1);
 198
 199         *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 200       }
 201     }
 202
 203     uint32_t* unalignedStorage = (uint32_t*)storage;
 204     // Take care of the final pixels, we know there's an even number of pixels
 205     // in the source rectangle. We use a 2x2 'simd' implementation for this.
 206     //
 207     // Potentially we only have to do this in the last row since overflowing
 208     // 8 pixels in an earlier row would appear to be harmless as it doesn't
 209     // touch invalid memory. Even when reading and writing to the same surface.
 210     // in practice we only do this when doing an additional downscale pass, and
 211     // in this situation we have unused stride to write into harmlessly.
 212     // I do not believe the additional code complexity would be worth it though.
 213     for (; x < aSourceSize.width; x += 2) {
 214       uint8_t* upperRow = aSource + (y * aSourceStride + x * Bpp);
 215       uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);
 216
 217       *unalignedStorage++ =
 218           Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
 219                  *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
 220     }
 221   }
 222 }
 223
 224 void ImageHalfScaler::HalfImageVertical_SSE2(uint8_t* aSource,
 225                                              int32_t aSourceStride,
 226                                              const IntSize& aSourceSize,
 227                                              uint8_t* aDest,
 228                                              uint32_t aDestStride) {
 229   for (int y = 0; y < aSourceSize.height; y += 2) {
 230     __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
 231     int x = 0;
 232     // Run a loop depending on alignment.
 233     if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
 234         !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 235       for (; x < (aSourceSize.width - 3); x += 4) {
 236         uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 237         uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 238
 239         __m128i a = _mm_load_si128((__m128i*)upperRow);
 240         __m128i b = _mm_load_si128((__m128i*)lowerRow);
 241
 242         *storage++ = avg_sse2_4x2_4x1(a, b);
 243       }
 244     } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
 245       // This line doesn't align well.
 246       for (; x < (aSourceSize.width - 3); x += 4) {
 247         uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 248         uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 249
 250         __m128i a = _mm_load_si128((__m128i*)upperRow);
 251         __m128i b = loadUnaligned128((__m128i*)lowerRow);
 252
 253         *storage++ = avg_sse2_4x2_4x1(a, b);
 254       }
 255     } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 256       for (; x < (aSourceSize.width - 3); x += 4) {
 257         uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 258         uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 259
 260         __m128i a = loadUnaligned128((__m128i*)upperRow);
 261         __m128i b = _mm_load_si128((__m128i*)lowerRow);
 262
 263         *storage++ = avg_sse2_4x2_4x1(a, b);
 264       }
 265     } else {
 266       for (; x < (aSourceSize.width - 3); x += 4) {
 267         uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 268         uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 269
 270         __m128i a = loadUnaligned128((__m128i*)upperRow);
 271         __m128i b = loadUnaligned128((__m128i*)lowerRow);
 272
 273         *storage++ = avg_sse2_4x2_4x1(a, b);
 274       }
 275     }
 276
 277     uint32_t* unalignedStorage = (uint32_t*)storage;
 278     // Take care of the final pixels, we know there's an even number of pixels
 279     // in the source rectangle.
 280     //
 281     // Similar overflow considerations are valid as in the previous function.
 282     for (; x < aSourceSize.width; x++) {
 283       uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 284       uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 285
 286       *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
 287     }
 288   }
 289 }
 290
 291 void ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t* aSource,
 292                                                int32_t aSourceStride,
 293                                                const IntSize& aSourceSize,
 294                                                uint8_t* aDest,
 295                                                uint32_t aDestStride) {
 296   for (int y = 0; y < aSourceSize.height; y++) {
 297     __m128i* storage = (__m128i*)(aDest + (y * aDestStride));
 298     int x = 0;
 299     // Run a loop depending on alignment.
 300     if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
 301       for (; x < (aSourceSize.width - 7); x += 8) {
 302         __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
 303
 304         __m128i a = _mm_load_si128(pixels);
 305         __m128i b = _mm_load_si128(pixels + 1);
 306
 307         *storage++ = avg_sse2_8x1_4x1(a, b);
 308       }
 309     } else {
 310       for (; x < (aSourceSize.width - 7); x += 8) {
 311         __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
 312
 313         __m128i a = loadUnaligned128(pixels);
 314         __m128i b = loadUnaligned128(pixels + 1);
 315
 316         *storage++ = avg_sse2_8x1_4x1(a, b);
 317       }
 318     }
 319
 320     uint32_t* unalignedStorage = (uint32_t*)storage;
 321     // Take care of the final pixels, we know there's an even number of pixels
 322     // in the source rectangle.
 323     //
 324     // Similar overflow considerations are valid as in the previous function.
 325     for (; x < aSourceSize.width; x += 2) {
 326       uint32_t* pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));
 327
 328       *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
 329     }
 330   }
 331 }
 332
 333 }  // namespace mozilla::gfx