gfx/thebes/gfxAlphaRecoverySSE2.cpp

   1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
   2  * This Source Code Form is subject to the terms of the Mozilla Public
   3  * License, v. 2.0. If a copy of the MPL was not distributed with this
   4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
   5
   6 #include "gfxAlphaRecovery.h"
   7 #include "gfxImageSurface.h"
   8 #include "nsDebug.h"
   9 #include <emmintrin.h>
  10
  11 // This file should only be compiled on x86 and x64 systems.  Additionally,
  12 // you'll need to compile it with -msse2 if you're using GCC on x86.
  13
  14 #if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
  15 __declspec(align(16)) static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00,
  16                                                       0x0000ff00, 0x0000ff00};
  17 __declspec(align(16)) static uint32_t alphaMaski[] = {0xff000000, 0xff000000,
  18                                                       0xff000000, 0xff000000};
  19 #elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
  20 static uint32_t greenMaski[] __attribute__((aligned(16))) = {
  21     0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00};
  22 static uint32_t alphaMaski[] __attribute__((aligned(16))) = {
  23     0xff000000, 0xff000000, 0xff000000, 0xff000000};
  24 #elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
  25 #  pragma align 16(greenMaski, alphaMaski)
  26 static uint32_t greenMaski[] = {0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00};
  27 static uint32_t alphaMaski[] = {0xff000000, 0xff000000, 0xff000000, 0xff000000};
  28 #endif
  29
  30 bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
  31                                         const gfxImageSurface* whiteSurf) {
  32   mozilla::gfx::IntSize size = blackSurf->GetSize();
  33
  34   if (size != whiteSurf->GetSize() ||
  35       (blackSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
  36        blackSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32) ||
  37       (whiteSurf->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 &&
  38        whiteSurf->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32))
  39     return false;
  40
  41   blackSurf->Flush();
  42   whiteSurf->Flush();
  43
  44   unsigned char* blackData = blackSurf->Data();
  45   unsigned char* whiteData = whiteSurf->Data();
  46
  47   if ((NS_PTR_TO_UINT32(blackData) & 0xf) !=
  48           (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
  49       (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
  50     // Cannot keep these in alignment.
  51     return false;
  52   }
  53
  54   __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
  55   __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
  56
  57   for (int32_t i = 0; i < size.height; ++i) {
  58     int32_t j = 0;
  59     // Loop single pixels until at 4 byte alignment.
  60     while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
  61       *((uint32_t*)blackData) =
  62           RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
  63                        *reinterpret_cast<uint32_t*>(whiteData));
  64       blackData += 4;
  65       whiteData += 4;
  66       j++;
  67     }
  68     // This extra loop allows the compiler to do some more clever registry
  69     // management and makes it about 5% faster than with only the 4 pixel
  70     // at a time loop.
  71     for (; j < size.width - 8; j += 8) {
  72       __m128i black1 = _mm_load_si128((__m128i*)blackData);
  73       __m128i white1 = _mm_load_si128((__m128i*)whiteData);
  74       __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
  75       __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
  76
  77       // Execute the same instructions as described in RecoverPixel, only
  78       // using an SSE2 packed saturated subtract.
  79       white1 = _mm_subs_epu8(white1, black1);
  80       white2 = _mm_subs_epu8(white2, black2);
  81       white1 = _mm_subs_epu8(greenMask, white1);
  82       white2 = _mm_subs_epu8(greenMask, white2);
  83       // Producing the final black pixel in an XMM register and storing
  84       // that is actually faster than doing a masked store since that
  85       // does an unaligned storage. We have the black pixel in a register
  86       // anyway.
  87       black1 = _mm_andnot_si128(alphaMask, black1);
  88       black2 = _mm_andnot_si128(alphaMask, black2);
  89       white1 = _mm_slli_si128(white1, 2);
  90       white2 = _mm_slli_si128(white2, 2);
  91       white1 = _mm_and_si128(alphaMask, white1);
  92       white2 = _mm_and_si128(alphaMask, white2);
  93       black1 = _mm_or_si128(white1, black1);
  94       black2 = _mm_or_si128(white2, black2);
  95
  96       _mm_store_si128((__m128i*)blackData, black1);
  97       _mm_store_si128((__m128i*)(blackData + 16), black2);
  98       blackData += 32;
  99       whiteData += 32;
 100     }
 101     for (; j < size.width - 4; j += 4) {
 102       __m128i black = _mm_load_si128((__m128i*)blackData);
 103       __m128i white = _mm_load_si128((__m128i*)whiteData);
 104
 105       white = _mm_subs_epu8(white, black);
 106       white = _mm_subs_epu8(greenMask, white);
 107       black = _mm_andnot_si128(alphaMask, black);
 108       white = _mm_slli_si128(white, 2);
 109       white = _mm_and_si128(alphaMask, white);
 110       black = _mm_or_si128(white, black);
 111       _mm_store_si128((__m128i*)blackData, black);
 112       blackData += 16;
 113       whiteData += 16;
 114     }
 115     // Loop single pixels until we're done.
 116     while (j < size.width) {
 117       *((uint32_t*)blackData) =
 118           RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
 119                        *reinterpret_cast<uint32_t*>(whiteData));
 120       blackData += 4;
 121       whiteData += 4;
 122       j++;
 123     }
 124     blackData += blackSurf->Stride() - j * 4;
 125     whiteData += whiteSurf->Stride() - j * 4;
 126   }
 127
 128   blackSurf->MarkDirty();
 129
 130   return true;
 131 }
 132
 133 static int32_t ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0,
 134                              int32_t aStride = 1) {
 135   return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
 136 }
 137
 138 /*static*/ mozilla::gfx::IntRect gfxAlphaRecovery::AlignRectForSubimageRecovery(
 139     const mozilla::gfx::IntRect& aRect, gfxImageSurface* aSurface) {
 140   NS_ASSERTION(
 141       mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32 == aSurface->Format(),
 142       "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
 143   static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
 144   static const int32_t bpp = 4;
 145   static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
 146   //
 147   // We're going to create a subimage of the surface with size
 148   // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
 149   // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
 150   // properly aligned for SIMD.  So we want to find a rect <x',y',
 151   // w',h'> that's a superset of what needs to be redrawn but is
 152   // properly aligned.  Proper alignment is
 153   //
 154   //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
 155   //   BPP * w'             \cong BPP * sw  (mod ALIGN)
 156   //
 157   // (We assume the pixel at surface <0,0> is already ALIGN'd.)
 158   // That rect (obviously) has to fit within the surface bounds, and
 159   // we should also minimize the extra pixels redrawn only for
 160   // alignment's sake.  So we also want
 161   //
 162   //  minimize <x',y', w',h'>
 163   //   0 <= x' <= x
 164   //   0 <= y' <= y
 165   //   w <= w' <= sw
 166   //   h <= h' <= sh
 167   //
 168   // This is a messy integer non-linear programming problem, except
 169   // ... we can assume that ALIGN/BPP is a very small constant.  So,
 170   // brute force is viable.  The algorithm below will find a
 171   // solution if one exists, but isn't guaranteed to find the
 172   // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
 173   // most 64 iterations below).  In what's likely the common case,
 174   // an already-aligned rectangle, it only needs 1 iteration.
 175   //
 176   // Is this alignment worth doing?  Recovering alpha will take work
 177   // proportional to w*h (assuming alpha recovery computation isn't
 178   // memory bound).  This analysis can lead to O(w+h) extra work
 179   // (with small constants).  In exchange, we expect to shave off a
 180   // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
 181   // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
 182   // only really care about the w*h >> w+h case anyway; others
 183   // should be fast enough even with the overhead.  (Unless the cost
 184   // of repainting the expanded rect is high, but in that case
 185   // SIMD-ized alpha recovery won't make a difference so this code
 186   // shouldn't be called.)
 187   //
 188   mozilla::gfx::IntSize surfaceSize = aSurface->GetSize();
 189   const int32_t stride = bpp * surfaceSize.width;
 190   if (stride != aSurface->Stride()) {
 191     NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
 192     return aRect;
 193   }
 194
 195   const int32_t x = aRect.X(), y = aRect.Y(), w = aRect.Width(),
 196                 h = aRect.Height();
 197   const int32_t r = x + w;
 198   const int32_t sw = surfaceSize.width;
 199   const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
 200
 201   // The outer two loops below keep the rightmost (|r| above) and
 202   // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
 203   // return only a superset of the original rect.  These loops
 204   // search for an aligned top-left pixel by trying to expand <x,y>
 205   // left and up by <dx,dy> pixels, respectively.
 206   //
 207   // Then if a properly-aligned top-left pixel is found, the
 208   // innermost loop tries to find an aligned stride by moving the
 209   // rightmost pixel rightward by dr.
 210   int32_t dx, dy, dr;
 211   for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
 212     for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
 213       if (0 != ByteAlignment(kByteAlignLog2, bpp * (x - dx), y - dy, stride)) {
 214         continue;
 215       }
 216       for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
 217         if (strideAlign == ByteAlignment(kByteAlignLog2, bpp * (w + dr + dx))) {
 218           goto FOUND_SOLUTION;
 219         }
 220       }
 221     }
 222   }
 223
 224   // Didn't find a solution.
 225   return aRect;
 226
 227 FOUND_SOLUTION:
 228   mozilla::gfx::IntRect solution =
 229       mozilla::gfx::IntRect(x - dx, y - dy, w + dr + dx, h + dy);
 230   MOZ_ASSERT(
 231       mozilla::gfx::IntRect(0, 0, sw, surfaceSize.height).Contains(solution),
 232       "'Solution' extends outside surface bounds!");
 233   return solution;
 234 }