1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #ifndef _GFXALPHARECOVERY_GENERIC_H_
6 #define _GFXALPHARECOVERY_GENERIC_H_
8 #include "gfxAlphaRecovery.h"
9 #include "gfxImageSurface.h"
11 #include <xsimd/xsimd.hpp>
13 template <typename Arch
>
14 bool gfxAlphaRecovery::RecoverAlphaGeneric(gfxImageSurface
* blackSurf
,
15 const gfxImageSurface
* whiteSurf
) {
16 mozilla::gfx::IntSize size
= blackSurf
->GetSize();
18 if (size
!= whiteSurf
->GetSize() ||
19 (blackSurf
->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32
&&
20 blackSurf
->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32
) ||
21 (whiteSurf
->Format() != mozilla::gfx::SurfaceFormat::A8R8G8B8_UINT32
&&
22 whiteSurf
->Format() != mozilla::gfx::SurfaceFormat::X8R8G8B8_UINT32
))
28 unsigned char* blackData
= blackSurf
->Data();
29 unsigned char* whiteData
= whiteSurf
->Data();
31 if ((NS_PTR_TO_UINT32(blackData
) & 0xf) !=
32 (NS_PTR_TO_UINT32(whiteData
) & 0xf) ||
33 (blackSurf
->Stride() - whiteSurf
->Stride()) & 0xf) {
34 // Cannot keep these in alignment.
38 alignas(Arch::alignment()) static const uint8_t greenMaski
[] = {
39 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
40 0x00, 0xff, 0x00, 0x00, 0x00, 0xff, 0x00, 0x00,
42 alignas(Arch::alignment()) static const uint8_t alphaMaski
[] = {
43 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
44 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00, 0xff,
47 using batch_type
= xsimd::batch
<uint8_t, Arch
>;
48 constexpr size_t batch_size
= batch_type::size
;
49 static_assert(batch_size
== 16);
51 batch_type greenMask
= batch_type::load_aligned(greenMaski
);
52 batch_type alphaMask
= batch_type::load_aligned(alphaMaski
);
54 for (int32_t i
= 0; i
< size
.height
; ++i
) {
56 // Loop single pixels until at 16 byte alignment.
57 while (NS_PTR_TO_UINT32(blackData
) & 0xf && j
< size
.width
) {
58 *((uint32_t*)blackData
) =
59 RecoverPixel(*reinterpret_cast<uint32_t*>(blackData
),
60 *reinterpret_cast<uint32_t*>(whiteData
));
65 // This extra loop allows the compiler to do some more clever register
66 // management and makes it about 5% faster than with only the 4 pixel
68 for (; j
< size
.width
- 8; j
+= 8) {
69 auto black1
= batch_type::load_aligned(blackData
);
70 auto white1
= batch_type::load_aligned(whiteData
);
71 auto black2
= batch_type::load_aligned(blackData
+ batch_size
);
72 auto white2
= batch_type::load_aligned(whiteData
+ batch_size
);
74 // Execute the same instructions as described in RecoverPixel, only
75 // using a packed saturated subtract (xsimd::ssub).
76 white1
= xsimd::ssub(white1
, black1
);
77 white2
= xsimd::ssub(white2
, black2
);
78 white1
= xsimd::ssub(greenMask
, white1
);
79 white2
= xsimd::ssub(greenMask
, white2
);
80 // Producing the final black pixel in an XMM register and storing
81 // that is actually faster than doing a masked store since that
82 // does an unaligned storage. We have the black pixel in a register
84 black1
= xsimd::bitwise_andnot(black1
, alphaMask
);
85 black2
= xsimd::bitwise_andnot(black2
, alphaMask
);
86 white1
= xsimd::slide_left
<2>(white1
);
87 white2
= xsimd::slide_left
<2>(white2
);
93 black1
.store_aligned(blackData
);
94 black2
.store_aligned(blackData
+ batch_size
);
95 blackData
+= 2 * batch_size
;
96 whiteData
+= 2 * batch_size
;
98 for (; j
< size
.width
- 4; j
+= 4) {
99 auto black
= batch_type::load_aligned(blackData
);
100 auto white
= batch_type::load_aligned(whiteData
);
102 white
= xsimd::ssub(white
, black
);
103 white
= xsimd::ssub(greenMask
, white
);
104 black
= xsimd::bitwise_andnot(black
, alphaMask
);
105 white
= xsimd::slide_left
<2>(white
);
108 black
.store_aligned(blackData
);
109 blackData
+= batch_size
;
110 whiteData
+= batch_size
;
112 // Loop single pixels until we're done.
113 while (j
< size
.width
) {
114 *((uint32_t*)blackData
) =
115 RecoverPixel(*reinterpret_cast<uint32_t*>(blackData
),
116 *reinterpret_cast<uint32_t*>(whiteData
));
121 blackData
+= blackSurf
->Stride() - j
* 4;
122 whiteData
+= whiteSurf
->Stride() - j
* 4;
125 blackSurf
->MarkDirty();