1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "gfxAlphaRecovery.h"
7 #include "gfxImageSurface.h"
11 // This file should only be compiled on x86 and x64 systems. Additionally,
12 // you'll need to compile it with -msse2 if you're using GCC on x86.
// Constant masks for the SSE2 alpha-recovery loops, one 32-bit lane per pixel.
// Each table is loaded with _mm_load_si128, which requires its operand to be
// 16-byte aligned; alignas(16) expresses that portably, so no per-compiler
// alignment syntax is needed.

// Selects the green byte (bits 8-15) of each of four packed pixels.
alignas(16) static const uint32_t greenMaski[4] = {0x0000ff00, 0x0000ff00,
                                                   0x0000ff00, 0x0000ff00};

// Selects the alpha byte (bits 24-31) of each of four packed pixels.
alignas(16) static const uint32_t alphaMaski[4] = {0xff000000, 0xff000000,
                                                   0xff000000, 0xff000000};
30 bool gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface
* blackSurf
,
31 const gfxImageSurface
* whiteSurf
) {
32 mozilla::gfx::IntSize size
= blackSurf
->GetSize();
34 if (size
!= whiteSurf
->GetSize() ||
35 (blackSurf
->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32
&&
36 blackSurf
->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32
) ||
37 (whiteSurf
->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32
&&
38 whiteSurf
->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32
))
44 unsigned char* blackData
= blackSurf
->Data();
45 unsigned char* whiteData
= whiteSurf
->Data();
47 if ((NS_PTR_TO_UINT32(blackData
) & 0xf) !=
48 (NS_PTR_TO_UINT32(whiteData
) & 0xf) ||
49 (blackSurf
->Stride() - whiteSurf
->Stride()) & 0xf) {
50 // Cannot keep these in alignment.
54 __m128i greenMask
= _mm_load_si128((__m128i
*)greenMaski
);
55 __m128i alphaMask
= _mm_load_si128((__m128i
*)alphaMaski
);
57 for (int32_t i
= 0; i
< size
.height
; ++i
) {
59 // Loop single pixels until at 4 byte alignment.
60 while (NS_PTR_TO_UINT32(blackData
) & 0xf && j
< size
.width
) {
61 *((uint32_t*)blackData
) =
62 RecoverPixel(*reinterpret_cast<uint32_t*>(blackData
),
63 *reinterpret_cast<uint32_t*>(whiteData
));
68 // This extra loop allows the compiler to do some more clever registry
69 // management and makes it about 5% faster than with only the 4 pixel
71 for (; j
< size
.width
- 8; j
+= 8) {
72 __m128i black1
= _mm_load_si128((__m128i
*)blackData
);
73 __m128i white1
= _mm_load_si128((__m128i
*)whiteData
);
74 __m128i black2
= _mm_load_si128((__m128i
*)(blackData
+ 16));
75 __m128i white2
= _mm_load_si128((__m128i
*)(whiteData
+ 16));
77 // Execute the same instructions as described in RecoverPixel, only
78 // using an SSE2 packed saturated subtract.
79 white1
= _mm_subs_epu8(white1
, black1
);
80 white2
= _mm_subs_epu8(white2
, black2
);
81 white1
= _mm_subs_epu8(greenMask
, white1
);
82 white2
= _mm_subs_epu8(greenMask
, white2
);
83 // Producing the final black pixel in an XMM register and storing
84 // that is actually faster than doing a masked store since that
85 // does an unaligned storage. We have the black pixel in a register
87 black1
= _mm_andnot_si128(alphaMask
, black1
);
88 black2
= _mm_andnot_si128(alphaMask
, black2
);
89 white1
= _mm_slli_si128(white1
, 2);
90 white2
= _mm_slli_si128(white2
, 2);
91 white1
= _mm_and_si128(alphaMask
, white1
);
92 white2
= _mm_and_si128(alphaMask
, white2
);
93 black1
= _mm_or_si128(white1
, black1
);
94 black2
= _mm_or_si128(white2
, black2
);
96 _mm_store_si128((__m128i
*)blackData
, black1
);
97 _mm_store_si128((__m128i
*)(blackData
+ 16), black2
);
101 for (; j
< size
.width
- 4; j
+= 4) {
102 __m128i black
= _mm_load_si128((__m128i
*)blackData
);
103 __m128i white
= _mm_load_si128((__m128i
*)whiteData
);
105 white
= _mm_subs_epu8(white
, black
);
106 white
= _mm_subs_epu8(greenMask
, white
);
107 black
= _mm_andnot_si128(alphaMask
, black
);
108 white
= _mm_slli_si128(white
, 2);
109 white
= _mm_and_si128(alphaMask
, white
);
110 black
= _mm_or_si128(white
, black
);
111 _mm_store_si128((__m128i
*)blackData
, black
);
115 // Loop single pixels until we're done.
116 while (j
< size
.width
) {
117 *((uint32_t*)blackData
) =
118 RecoverPixel(*reinterpret_cast<uint32_t*>(blackData
),
119 *reinterpret_cast<uint32_t*>(whiteData
));
124 blackData
+= blackSurf
->Stride() - j
* 4;
125 whiteData
+= whiteSurf
->Stride() - j
* 4;
128 blackSurf
->MarkDirty();
// Returns the low |aAlignToLog2| bits of the offset (aX + aStride * aY),
// i.e. the offset's remainder modulo 2^aAlignToLog2. A result of 0 means the
// offset is aligned to that power of two.
//
// The offset is computed in unsigned 32-bit arithmetic: wrapping modulo 2^32
// leaves the low bits unchanged, so large offsets cannot trigger signed
// overflow. |aAlignToLog2| must be in [0, 31].
static int32_t ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0,
                             int32_t aStride = 1) {
  const uint32_t mask = (uint32_t{1} << aAlignToLog2) - 1;
  const uint32_t offset =
      static_cast<uint32_t>(aX) +
      static_cast<uint32_t>(aStride) * static_cast<uint32_t>(aY);
  return static_cast<int32_t>(offset & mask);
}
138 /*static*/ mozilla::gfx::IntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(
139 const mozilla::gfx::IntRect
& aRect
, gfxImageSurface
* aSurface
) {
141 mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32
== aSurface
->Format(),
142 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
143 static const int32_t kByteAlignLog2
= GoodAlignmentLog2();
144 static const int32_t bpp
= 4;
145 static const int32_t pixPerAlign
= (1 << kByteAlignLog2
) / bpp
;
147 // We're going to create a subimage of the surface with size
148 // <sw,sh> for alpha recovery, and want a SIMD fast-path. The
149 // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
150 // properly aligned for SIMD. So we want to find a rect <x',y',
151 // w',h'> that's a superset of what needs to be redrawn but is
152 // properly aligned. Proper alignment is
154 // BPP * (x' + y' * sw) \cong 0 (mod ALIGN)
155 // BPP * w' \cong BPP * sw (mod ALIGN)
157 // (We assume the pixel at surface <0,0> is already ALIGN'd.)
158 // That rect (obviously) has to fit within the surface bounds, and
159 // we should also minimize the extra pixels redrawn only for
160 // alignment's sake. So we also want
162 // minimize <x',y', w',h'>
168 // This is a messy integer non-linear programming problem, except
169 // ... we can assume that ALIGN/BPP is a very small constant. So,
170 // brute force is viable. The algorithm below will find a
171 // solution if one exists, but isn't guaranteed to find the
172 // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
173 // most 64 iterations below). In what's likely the common case,
174 // an already-aligned rectangle, it only needs 1 iteration.
176 // Is this alignment worth doing? Recovering alpha will take work
177 // proportional to w*h (assuming alpha recovery computation isn't
178 // memory bound). This analysis can lead to O(w+h) extra work
179 // (with small constants). In exchange, we expect to shave off a
180 // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
181 // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
182 // only really care about the w*h >> w+h case anyway; others
183 // should be fast enough even with the overhead. (Unless the cost
184 // of repainting the expanded rect is high, but in that case
185 // SIMD-ized alpha recovery won't make a difference so this code
186 // shouldn't be called.)
188 mozilla::gfx::IntSize surfaceSize
= aSurface
->GetSize();
189 const int32_t stride
= bpp
* surfaceSize
.width
;
190 if (stride
!= aSurface
->Stride()) {
191 NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
195 const int32_t x
= aRect
.X(), y
= aRect
.Y(), w
= aRect
.Width(),
197 const int32_t r
= x
+ w
;
198 const int32_t sw
= surfaceSize
.width
;
199 const int32_t strideAlign
= ByteAlignment(kByteAlignLog2
, stride
);
201 // The outer two loops below keep the rightmost (|r| above) and
202 // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
203 // return only a superset of the original rect. These loops
204 // search for an aligned top-left pixel by trying to expand <x,y>
205 // left and up by <dx,dy> pixels, respectively.
207 // Then if a properly-aligned top-left pixel is found, the
208 // innermost loop tries to find an aligned stride by moving the
209 // rightmost pixel rightward by dr.
211 for (dy
= 0; (dy
< pixPerAlign
) && (y
- dy
>= 0); ++dy
) {
212 for (dx
= 0; (dx
< pixPerAlign
) && (x
- dx
>= 0); ++dx
) {
213 if (0 != ByteAlignment(kByteAlignLog2
, bpp
* (x
- dx
), y
- dy
, stride
)) {
216 for (dr
= 0; (dr
< pixPerAlign
) && (r
+ dr
<= sw
); ++dr
) {
217 if (strideAlign
== ByteAlignment(kByteAlignLog2
, bpp
* (w
+ dr
+ dx
))) {
224 // Didn't find a solution.
228 mozilla::gfx::IntRect solution
=
229 mozilla::gfx::IntRect(x
- dx
, y
- dy
, w
+ dr
+ dx
, h
+ dy
);
231 mozilla::gfx::IntRect(0, 0, sw
, surfaceSize
.height
).Contains(solution
),
232 "'Solution' extends outside surface bounds!");