/*
 * This file is part of OpenTTD.
 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
 */
10 /** @file 32bpp_sse4.cpp Implementation of the SSE4 32 bpp blitter. */
14 #include "../stdafx.h"
15 #include "../zoom_func.h"
16 #include "../settings_type.h"
17 #include "32bpp_sse4.hpp"
19 /**
 * File-local instance of the SSE4 32bpp blitter factory.
 * NOTE(review): presumably constructing it is what makes the blitter known to the
 * rest of the game; confirm against the factory declaration in 32bpp_sse4.hpp.
 */
20 static FBlitter_32bppSSE4 iFBlitter_32bppSSE4
;
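/**
 * Draws a sprite to a (screen) buffer. It is templated to allow faster operation.
 *
 * @tparam mode blitter mode
 * @tparam read_mode how the source rows are read; RM_WITH_MARGIN uses the metadata stored at the start of each row
 * @tparam bt_last type of the last block of a row; BT_ODD means a single trailing pixel has to be handled
 * @param bp further blitting parameters
 * @param zoom zoom level at which we are drawing
 */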
/*
 * Inner blit routine of the SSE4 32bpp blitter, instantiated once per combination of
 * blitter mode, read mode and trailing-block type.
 * It walks bp->height rows. Within a row the pixels are processed two at a time with
 * 64-bit loads and stores, and a single remaining pixel is handled with 32-bit moves.
 * NOTE(review): several lines of this listing (braces, switch headers, pointer
 * increments, a few declarations) are not visible, so the exact nesting of the cases
 * below has to be confirmed against the complete file.
 */
29 IGNORE_UNINITIALIZED_WARNING_START
30 template <BlitterMode mode
, Blitter_32bppSSE2::ReadMode read_mode
, Blitter_32bppSSE2::BlockType bt_last
>
31 inline void Blitter_32bppSSE4::Draw(const Blitter::BlitterParams
*bp
, ZoomLevel zoom
)
/* Remap table; it is indexed with the m-channel values in the BM_COLOUR_REMAP case. */
33 const byte
* const remap
= bp
->remap
;
/* First destination element: bp->top rows of bp->pitch Colour elements down, bp->left elements in. */
34 Colour
*dst_line
= (Colour
*) bp
->dst
+ bp
->top
* bp
->pitch
+ bp
->left
;
/* Starts as the full requested width; the RM_WITH_MARGIN cases recompute it for every row. */
35 int effective_width
= bp
->width
;
37 /* Find where to start reading in the source sprite. */
38 const SpriteData
* const sd
= (const SpriteData
*) bp
->sprite
;
/* Per-zoom-level sprite description; it supplies the offsets and row sizes used below. */
39 const SpriteInfo
* const si
= &sd
->infos
[zoom
];
/* Map-value rows are sprite_width entries apart ... */
40 const MapValue
*src_mv_line
= (const MapValue
*) &sd
->data
[si
->mv_offset
] + bp
->skip_top
* si
->sprite_width
;
/* ... whereas RGBA rows are sprite_line_size bytes apart, hence the byte pointer arithmetic. */
41 const Colour
*src_rgba_line
= (const Colour
*) ((const byte
*) &sd
->data
[si
->sprite_offset
] + bp
->skip_top
* si
->sprite_line_size
);
/* Without margin metadata the left skip is applied once here; the RM_WITH_MARGIN cases instead offset the pointers per row. */
43 if (read_mode
!= RM_WITH_MARGIN
) {
44 src_rgba_line
+= bp
->skip_left
;
45 src_mv_line
+= bp
->skip_left
;
48 /* Load these variables into register before loop. */
49 const __m128i a_cm
= ALPHA_CONTROL_MASK
;
50 const __m128i pack_low_cm
= PACK_LOW_CONTROL_MASK
;
51 const __m128i briAB_cm
= BRIGHTNESS_LOW_CONTROL_MASK
;
52 const __m128i div_cleaner
= BRIGHTNESS_DIV_CLEANER
;
53 const __m128i ob_check
= OVERBRIGHT_PRESENCE_MASK
;
54 const __m128i ob_mask
= OVERBRIGHT_VALUE_MASK
;
55 const __m128i ob_cm
= OVERBRIGHT_CONTROL_MASK
;
56 const __m128i tr_nom_base
= TRANSPARENT_NOM_BASE
;
/* One iteration per row to draw. */
58 for (int y
= bp
->height
; y
!= 0; y
--) {
/* Pixel data starts META_LENGTH entries into the row; entries [0] and [1] before it are read by the RM_WITH_MARGIN cases. */
59 const Colour
*src
= src_rgba_line
+ META_LENGTH
;
60 Colour
*dst
= dst_line
;
61 const MapValue
*src_mv
= src_mv_line
;
/* Margin variant of the plain blend: metadata entry [0] moves the row start, entry [1] may shorten the row. */
66 case RM_WITH_MARGIN
: {
67 src
+= src_rgba_line
[0].data
;
68 dst
+= src_rgba_line
[0].data
;
69 const int width_diff
= si
->sprite_width
- bp
->width
;
70 effective_width
= bp
->width
- (int) src_rgba_line
[0].data
;
71 const int delta_diff
= (int) src_rgba_line
[1].data
- width_diff
;
/* Note: the reduction is rounded down to an even number here, unlike in the BM_COLOUR_REMAP case further down. */
72 const int new_width
= effective_width
- (delta_diff
& ~1);
73 effective_width
= delta_diff
> 0 ? new_width
: effective_width
;
74 if (effective_width
<= 0) break;
/* Blend the row two pixels per iteration. */
79 for (uint x
= (uint
) effective_width
/ 2; x
> 0; x
--) {
80 __m128i srcABCD
= _mm_loadl_epi64((const __m128i
*) src
);
81 __m128i dstABCD
= _mm_loadl_epi64((__m128i
*) dst
);
82 ALPHA_BLEND_2(pack_low_cm
);
83 _mm_storel_epi64((__m128i
*) dst
, srcABCD
);
/* Instantiations with BT_ODD blend one more pixel, moved through the low 32 bits of the registers. */
87 if (bt_last
== BT_ODD
) {
88 __m128i srcABCD
= _mm_cvtsi32_si128(src
->data
);
89 __m128i dstABCD
= _mm_cvtsi32_si128(dst
->data
);
90 ALPHA_BLEND_2(pack_low_cm
);
91 dst
->data
= _mm_cvtsi128_si32(srcABCD
);
96 default: NOT_REACHED();
101 case BM_COLOUR_REMAP
: {
/* Margin variant of the remap: as above, but the map-value pointer is moved along with src and dst. */
103 case RM_WITH_MARGIN
: {
104 src
+= src_rgba_line
[0].data
;
105 src_mv
+= src_rgba_line
[0].data
;
106 dst
+= src_rgba_line
[0].data
;
107 const int width_diff
= si
->sprite_width
- bp
->width
;
108 effective_width
= bp
->width
- (int) src_rgba_line
[0].data
;
109 const int delta_diff
= (int) src_rgba_line
[1].data
- width_diff
;
110 const int new_width
= effective_width
- delta_diff
;
111 effective_width
= delta_diff
> 0 ? new_width
: effective_width
;
112 if (effective_width
<= 0) break;
/* Remap and blend the row two pixels per iteration. */
117 for (uint x
= (uint
) effective_width
/ 2; x
> 0; x
--) {
118 __m128i srcABCD
= _mm_loadl_epi64((const __m128i
*) src
);
119 __m128i dstABCD
= _mm_loadl_epi64((__m128i
*) dst
);
/* Read the map values of both pixels as one 32-bit word. */
120 uint32 mvX2
= *((uint32
*) const_cast<MapValue
*>(src_mv
));
/* The low byte of each 16-bit half is the remap index (see m0 and m1 below); only remap when at least one is set. */
123 if (mvX2
& 0x00FF00FF) {
124 /* Written so the compiler uses CMOV. */
125 const Colour src0
= src
[0];
126 const uint m0
= (byte
) mvX2
;
127 const uint r0
= remap
[m0
];
/* Palette colour of the remapped index combined with the top (alpha) byte of the source pixel. */
128 const Colour c0map
= (this->LookupColourInPalette(r0
).data
& 0x00FFFFFF) | (src0
.data
& 0xFF000000);
129 Colour c0
= 0; // Use alpha of 0 to keep dst as is.
/* Remap result 0 keeps the zero colour; map index 0 takes the source pixel unchanged. */
130 c0
= r0
== 0 ? c0
: c0map
;
131 c0
= m0
!= 0 ? c0
: src0
;
132 INSR32(c0
.data
, srcABCD
, 0);
/* Same selection for the second pixel, whose map index sits 16 bits higher in mvX2. */
134 const Colour src1
= src
[1];
135 const uint m1
= (byte
) (mvX2
>> 16);
136 const uint r1
= remap
[m1
];
137 const Colour c1map
= (this->LookupColourInPalette(r1
).data
& 0x00FFFFFF) | (src1
.data
& 0xFF000000);
139 c1
= r1
== 0 ? c1
: c1map
;
140 c1
= m1
!= 0 ? c1
: src1
;
141 INSR32(c1
.data
, srcABCD
, 1);
/* Brightness is only adjusted when the high byte of either half differs from 0x80. */
143 if ((mvX2
& 0xFF00FF00) != 0x80008000) {
144 ADJUST_BRIGHTNESS_2(srcABCD
, mvX2
);
149 ALPHA_BLEND_2(pack_low_cm
);
150 _mm_storel_epi64((__m128i
*) dst
, srcABCD
);
/* An odd effective width leaves one pixel, which is remapped without the paired code above. */
156 if (effective_width
& 1) {
157 /* In case the m-channel is zero, do not remap this pixel in any way. */
/* The remap index comes from the m field; the v field is passed on as brightness. */
160 const uint r
= remap
[src_mv
->m
];
162 Colour remapped_colour
= AdjustBrightness(this->LookupColourInPalette(r
), src_mv
->v
);
164 *dst
= remapped_colour
;
/* Give the remapped colour the alpha of the source pixel and hand it to the shared single-pixel blend. */
166 remapped_colour
.a
= src
->a
;
167 srcABCD
= _mm_cvtsi32_si128(remapped_colour
.data
);
168 goto bmcr_alpha_blend_single
;
172 srcABCD
= _mm_cvtsi32_si128(src
->data
);
/* Target of the goto above: blends the single pixel held in srcABCD into dst. */
174 bmcr_alpha_blend_single
:
175 __m128i dstABCD
= _mm_cvtsi32_si128(dst
->data
);
176 ALPHA_BLEND_2(pack_low_cm
);
178 dst
->data
= _mm_cvtsi128_si32(srcABCD
);
184 default: NOT_REACHED();
/* Advance the map values by one full sprite row. */
186 src_mv_line
+= si
->sprite_width
;
190 case BM_TRANSPARENT
: {
191 /* Make the current colour a bit more black, so it looks like this image is transparent.
192 * rgb = rgb * ((256/4) * 4 - (alpha/4)) / ((256/4) * 4)
// NOTE(review): the end of the comment above is not visible in this listing; confirm against the complete file.
// Darken two destination pixels per iteration; the source only contributes its alpha.
194 for (uint x
= (uint
) bp
->width
/ 2; x
> 0; x
--) {
195 __m128i srcABCD
= _mm_loadl_epi64((const __m128i
*) src
);
196 __m128i dstABCD
= _mm_loadl_epi64((__m128i
*) dst
);
// Interleave the low bytes with zero bytes so each 8-bit channel occupies a 16-bit lane.
197 __m128i srcAB
= _mm_unpacklo_epi8(srcABCD
, _mm_setzero_si128());
198 __m128i dstAB
= _mm_unpacklo_epi8(dstABCD
, _mm_setzero_si128());
199 __m128i alphaAB
= _mm_shuffle_epi8(srcAB
, a_cm
);
200 alphaAB
= _mm_srli_epi16(alphaAB
, 2); // Reduce to 64 levels of shades so the max value fits in 16 bits.
// Multiply the destination by (tr_nom_base - alpha / 4), shift right by 8, then pack back to bytes with saturation.
201 __m128i nom
= _mm_sub_epi16(tr_nom_base
, alphaAB
);
202 dstAB
= _mm_mullo_epi16(dstAB
, nom
);
203 dstAB
= _mm_srli_epi16(dstAB
, 8);
204 dstAB
= _mm_packus_epi16(dstAB
, dstAB
);
205 _mm_storel_epi64((__m128i
*) dst
, dstAB
);
// Same computation for a single pixel, using 32-bit moves instead of 64-bit loads and stores.
// NOTE(review): the condition guarding this part is not visible in this listing.
210 __m128i srcABCD
= _mm_cvtsi32_si128(src
->data
);
211 __m128i dstABCD
= _mm_cvtsi32_si128(dst
->data
);
212 __m128i srcAB
= _mm_unpacklo_epi8(srcABCD
, _mm_setzero_si128());
213 __m128i dstAB
= _mm_unpacklo_epi8(dstABCD
, _mm_setzero_si128());
214 __m128i alphaAB
= _mm_shuffle_epi8(srcAB
, a_cm
);
215 alphaAB
= _mm_srli_epi16(alphaAB
, 2);
216 __m128i nom
= _mm_sub_epi16(tr_nom_base
, alphaAB
);
217 dstAB
= _mm_mullo_epi16(dstAB
, nom
);
218 dstAB
= _mm_srli_epi16(dstAB
, 8);
219 dstAB
= _mm_packus_epi16(dstAB
, dstAB
);
220 dst
->data
= _mm_cvtsi128_si32(dstAB
);
// Step to the next row: sprite_line_size bytes in the source, bp->pitch elements in the destination.
227 src_rgba_line
= (const Colour
*) ((const byte
*) src_rgba_line
+ si
->sprite_line_size
);
228 dst_line
+= bp
->pitch
;
231 IGNORE_UNINITIALIZED_WARNING_STOP
/**
 * Draws a sprite to a (screen) buffer. Calls adequate templated function.
 *
 * @param bp further blitting parameters
 * @param mode blitter mode
 * @param zoom zoom level at which we are drawing
 */
// Run-time entry point: picks the Draw<mode, read_mode, bt_last> instantiation that matches
// the requested mode, bp->skip_left and bp->width, and forwards bp and zoom to it.
// NOTE(review): the switch and else lines are not visible in this listing, so the nesting of
// the cases below is inferred from their order; confirm against the complete file.
240 void Blitter_32bppSSE4::Draw(Blitter::BlitterParams
*bp
, BlitterMode mode
, ZoomLevel zoom
)
// Low bit of the width, cast to BlockType and matched against BT_EVEN / BT_ODD below.
242 const BlockType bt_last
= (BlockType
) (bp
->width
& 1);
// BM_NORMAL: a non-zero left skip or a width of at most MARGIN_NORMAL_THRESHOLD selects RM_WITH_SKIP.
245 if (bp
->skip_left
!= 0 || bp
->width
<= MARGIN_NORMAL_THRESHOLD
) {
247 case BT_EVEN
: Draw
<BM_NORMAL
, RM_WITH_SKIP
, BT_EVEN
>(bp
, zoom
); return;
248 case BT_ODD
: Draw
<BM_NORMAL
, RM_WITH_SKIP
, BT_ODD
>(bp
, zoom
); return;
249 default: NOT_REACHED();
// The second pair of BM_NORMAL cases uses RM_WITH_MARGIN (presumably the else branch of the test above).
253 case BT_EVEN
: Draw
<BM_NORMAL
, RM_WITH_MARGIN
, BT_EVEN
>(bp
, zoom
); return;
254 case BT_ODD
: Draw
<BM_NORMAL
, RM_WITH_MARGIN
, BT_ODD
>(bp
, zoom
); return;
255 default: NOT_REACHED();
// Colour remapping is always instantiated with BT_NONE; only the read mode varies, here against MARGIN_REMAP_THRESHOLD.
260 case BM_COLOUR_REMAP
:
261 if (bp
->skip_left
!= 0 || bp
->width
<= MARGIN_REMAP_THRESHOLD
) {
262 Draw
<BM_COLOUR_REMAP
, RM_WITH_SKIP
, BT_NONE
>(bp
, zoom
); return;
264 Draw
<BM_COLOUR_REMAP
, RM_WITH_MARGIN
, BT_NONE
>(bp
, zoom
); return;
// Transparency has a single instantiation, with neither a read mode nor a block type.
266 case BM_TRANSPARENT
: Draw
<BM_TRANSPARENT
, RM_NONE
, BT_NONE
>(bp
, zoom
); return;
267 default: NOT_REACHED();
271 /** Same code as seen in 32bpp_sse2.cpp but some macros are not the same. */
272 inline Colour
Blitter_32bppSSE4::AdjustBrightness(Colour colour
, uint8 brightness
)
274 /* Shortcut for normal brightness. */
275 if (brightness
== DEFAULT_BRIGHTNESS
) return colour
;
277 return Blitter_32bppSSE4::ReallyAdjustBrightness(colour
, brightness
);
/*
 * Brightness adjustment for the non-default case; AdjustBrightness() returns early for
 * DEFAULT_BRIGHTNESS and calls this function otherwise.
 * The top byte of the colour is carried over unchanged (see alpha32); the other three
 * channels are recomputed.
 * NOTE(review): several statements, among them the declarations of ret, c128 and ob128,
 * are not visible in this listing; confirm the details against the complete file.
 */
280 IGNORE_UNINITIALIZED_WARNING_START
281 /* static */ Colour
Blitter_32bppSSE4::ReallyAdjustBrightness(Colour colour
, uint8 brightness
)
/* Spread the b, g and r channels over separate 16-bit lanes of one 64-bit value. */
283 uint64 c16
= colour
.b
| (uint64
) colour
.g
<< 16 | (uint64
) colour
.r
<< 32;
285 uint64 c16_ob
= c16
; // Helps out of order execution.
/* Divide all lanes in one go and keep 9 bits per lane. NOTE(review): the scaling by brightness is presumably on a line not visible here. */
286 c16
/= DEFAULT_BRIGHTNESS
;
287 c16
&= 0x01FF01FF01FFULL
;
289 /* Sum overbright (maximum for each rgb is 508, 9 bits, -255 is changed in -256 so we just have to take the 8 lower bits into account). */
/* Bit 15 of each lane is turned into a 0xFF mask that selects the low 8 bits of c16 for that lane. */
290 c16_ob
= (((c16_ob
>> (8 + 7)) & 0x0100010001ULL
) * 0xFF) & c16
;
/* Add the three masked lanes together. */
291 uint64 ob
= (uint16
) c16_ob
+ (uint16
) (c16_ob
>> 16) + (uint16
) (c16_ob
>> 32);
/* Top byte of the input; it is OR-ed back into the result unchanged. */
293 const uint32 alpha32
= colour
.data
& 0xFF000000;
297 /* Reduce overbright strength. */
/* Put the same overbright amount into three 16-bit lanes of ob128. */
300 INSR64(ob
| ob
<< 16 | ob
<< 32, ob128
, 0);
301 __m128i white
= OVERBRIGHT_VALUE_MASK
;
303 ret
= _mm_subs_epu16(white
, c128
); /* PSUBUSW, (255 - rgb) */
304 ret
= _mm_mullo_epi16(ret
, ob128
); /* PMULLW, ob*(255 - rgb) */
305 ret
= _mm_srli_epi16(ret
, 8); /* PSRLW, ob*(255 - rgb)/256 */
306 ret
= _mm_add_epi16(ret
, c128
); /* PADDW, ob*(255 - rgb)/256 + rgb */
309 ret
= _mm_packus_epi16(ret
, ret
); /* PACKUSWB, saturate and pack. */
/* Combine the preserved top byte with the low 32 bits of the packed result. */
310 return alpha32
| _mm_cvtsi128_si32(ret
);
312 IGNORE_UNINITIALIZED_WARNING_STOP
314 #endif /* WITH_SSE */