/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */

/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
#define PSHUFD_IS_FAST 0

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}
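
/* After unpack_32_1x128 each byte of the pixel sits zero-extended in its
 * own 16-bit lane (B, G, R, A in the low four lanes), which is the layout
 * every multiply/shuffle helper below operates on.
 */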

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
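
/* The and/shift/or sequence above is the usual bit-replication widening:
 * a 5-bit channel r5 becomes r8 = (r5 << 3) | (r5 >> 2), and the 6-bit
 * green becomes g8 = (g6 << 2) | (g6 >> 4), so full-intensity 565 channels
 * map to exactly 0xff rather than 0xf8 or 0xfc.
 */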

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
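
/* i.e. keep the top 5 bits of red and blue and the top 6 bits of green:
 * 0xAARRGGBB -> rrrrrggggggbbbbb, with the discarded low bits truncated.
 */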

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);

    return _mm_packs_epi32 (t0, t1);
}
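
/* The shift pair deserves a note: roughly, the madd multiplier leaves the
 * packed red/blue field scaled up by 32, and the slli/srai pair extracts
 * bits 20..5 of each 32-bit lane with sign extension. Every lane then fits
 * the signed 16-bit range, so _mm_packs_epi32 copies the bits through
 * exactly where the (SSE4.1-only) unsigned pack would.
 */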

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
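
/* _mm_movemask_epi8 gathers the top bit of all 16 bytes, so the 0x8888
 * masks select the bits for byte lanes 3, 7, 11 and 15 - the alpha byte of
 * each of the four packed ARGB pixels. is_opaque and is_transparent
 * therefore test only alpha, while is_zero tests every byte of all four
 * pixels.
 */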

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
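
/* combine1 and combine4 implement the unified-mask convention used by all
 * the *_u combiners below: when pm is non-NULL the source is multiplied by
 * the mask's expanded alpha first, otherwise it is passed through untouched.
 */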

static force_inline void
core_combine_over_u_sse2_mask (uint32_t *       pd,
                               const uint32_t*  ps,
                               const uint32_t*  pm,
                               int              w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }
}
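
/* All the combiners below share this three-phase shape: a scalar prologue
 * until pd reaches 16-byte alignment, a main loop handling four pixels per
 * iteration with aligned stores, and a scalar epilogue for the remaining
 * (w % 4) pixels.
 */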

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }

    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some alpha src is greater than respective ~alpha dst */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst  = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                                  unpack_32_1x128 (m),
                                  expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}

static force_inline uint32_t
core_combine_atop_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i m = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);
    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = expand_alpha_1x128 (d);

    s = pix_multiply_1x128 (s, m);
    m = negate_1x128 (pix_multiply_1x128 (m, sa));

    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}

static void
sse2_combine_atop_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i m = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
    __m128i sa = expand_alpha_1x128 (s);

    s = pix_multiply_1x128 (s, m);
    m = pix_multiply_1x128 (m, sa);

    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
}

static void
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_xor_ca_pixel_sse2 (uint32_t src,
                                uint32_t mask,
                                uint32_t dst)
{
    __m128i a = unpack_32_1x128 (mask);
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
                                          a, expand_alpha_1x128 (s)));
    __m128i dest      = pix_multiply_1x128 (s, a);
    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
                                                  &alpha_dst,
                                                  &dest,
                                                  &alpha_src));
}

static void
sse2_combine_xor_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_add_multiply_2x128 (
            &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_add_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
                                               unpack_32_1x128 (m)),
                           unpack_32_1x128 (d)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_src_lo, &xmm_src_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (
                _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
                _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
                                               unpack_32_1x128 (m)),
                           unpack_32_1x128 (d)));
        w--;
    }
}

static force_inline __m128i
create_mask_16_128 (uint16_t mask)
{
    return _mm_set1_epi16 (mask);
}

/* Work around a code generation bug in Sun Studio 12. */
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
# define create_mask_2x32_128(mask0, mask1)                             \
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
#else
static force_inline __m128i
create_mask_2x32_128 (uint32_t mask0,
                      uint32_t mask1)
{
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
}
#endif

static void
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            d = *dst;
            *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
                                                xmm_alpha,
                                                unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 4)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            d = *dst;
            *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
                                                xmm_alpha,
                                                unpack_32_1x128 (d)));
            w--;
        }
    }
}

static void
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst, d;
    int32_t w;
    int dst_stride;
    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            d = *dst;

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (over_1x128 (xmm_src,
                                           xmm_alpha,
                                           expand565_16_1x128 (d))));
            w--;
        }

        while (w >= 8)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst0, &xmm_dst1);
            over_2x128 (&xmm_src, &xmm_src,
                        &xmm_alpha, &xmm_alpha,
                        &xmm_dst2, &xmm_dst3);

            xmm_dst = pack_565_4x128_128 (
                &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            save_128_aligned ((__m128i*)dst, xmm_dst);

            dst += 8;
            w -= 8;
        }

        while (w--)
        {
            d = *dst;
            *dst++ = pack_565_32_16 (
                pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
                                           expand565_16_1x128 (d))));
        }
    }
}

static void
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, d;
    uint32_t *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src;
    __m128i xmm_dst;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    mmx_src = xmm_src;

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
                                   mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);
                xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;

                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
                                   mmx_dest));
            }

            pd++;
            w--;
        }
    }
}

static void
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, d;
    uint32_t *mask_line, m;
    uint32_t pack_cmp;
    int dst_stride, mask_stride;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = _mm_unpacklo_epi8 (
        create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        int w = width;
        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)pd & 15)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                    &mmx_alpha,
                                                    &mmx_mask,
                                                    &mmx_dest));
            }

            pd++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)pm);

            pack_cmp =
                _mm_movemask_epi8 (
                    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
            if (pack_cmp != 0xffff)
            {
                xmm_dst = load_128_aligned ((__m128i*)pd);

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            pd += 4;
            pm += 4;
            w -= 4;
        }

        while (w)
        {
            m = *pm++;

            if (m)
            {
                d = *pd;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *pd = pack_1x128_32 (
                    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
            }

            pd++;
            w--;
        }
    }
}

static void
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    int32_t w;
    int dst_stride, src_stride;

    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

    xmm_mask = create_mask_16_128 (mask >> 24);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            uint32_t s = *src++;

            if (s)
            {
                uint32_t d = *dst;

                __m128i ms = unpack_32_1x128 (s);
                __m128i alpha    = expand_alpha_1x128 (ms);
                __m128i dest     = xmm_mask;
                __m128i alpha_dst = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (
                    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
            }
            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);

            if (!is_zero (xmm_src))
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
                expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                    &xmm_alpha_lo, &xmm_alpha_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                               &xmm_alpha_lo, &xmm_alpha_hi,
                               &xmm_mask, &xmm_mask,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = *src++;

            if (s)
            {
                uint32_t d = *dst;

                __m128i ms = unpack_32_1x128 (s);
                __m128i alpha = expand_alpha_1x128 (ms);
                __m128i mask  = xmm_mask;
                __m128i dest  = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (
                    in_over_1x128 (&ms, &alpha, &mask, &dest));
            }

            dst++;
            w--;
        }
    }
}

static void
sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }

        while (w >= 8)
        {
            __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
            __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);

            save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));

            w -= 8;
            src += 8;
            dst += 8;
        }

        while (w)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }
    }
}

static void
sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int32_t w;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }

        while (w >= 16)
        {
            __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;

            xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
            xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
            xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
            xmm_src4 = load_128_unaligned ((__m128i*)src + 3);

            save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
            save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
            save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
            save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));

            dst += 16;
            src += 16;
            w -= 16;
        }

        while (w)
        {
            *dst++ = *src++ | 0xff000000;
            w--;
        }
    }
}
static void
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_mask, xmm_alpha;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);

    xmm_mask = create_mask_16_128 (mask >> 24);
    xmm_alpha = mask_00ff;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;

            __m128i src   = unpack_32_1x128 (s);
            __m128i alpha = xmm_alpha;
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst++ = pack_1x128_32 (
                in_over_1x128 (&src, &alpha, &mask, &dest));

            w--;
        }

        while (w >= 4)
        {
            xmm_src = _mm_or_si128 (
                load_128_unaligned ((__m128i*)src), mask_ff000000);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha, &xmm_alpha,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t s = (*src++) | 0xff000000;
            uint32_t d = *dst;

            __m128i src   = unpack_32_1x128 (s);
            __m128i alpha = xmm_alpha;
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst++ = pack_1x128_32 (
                in_over_1x128 (&src, &alpha, &mask, &dest));

            w--;
        }
    }
}
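/* over_8888_8888 (below) needs no vector code of its own: each
 * scanline is handed to sse2_combine_over_u, the generic OVER
 * combiner, which already handles alignment and the opaque/transparent
 * shortcuts (editorial note). */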
static void
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    int dst_stride, src_stride;
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    dst = dst_line;
    src = src_line;

    while (height--)
    {
        sse2_combine_over_u (imp, op, dst, src, NULL, width);

        dst += dst_stride;
        src += src_stride;
    }
}
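/* Editorial note: the helper below does a single OVER onto an r5g6b5
 * pixel by expanding the destination to 8888, compositing, and
 * repacking.  In scalar terms OVER computes approximately
 *
 *     dst = src + mult (dst, 255 - alpha (src))
 *
 * where mult () is pixman's usual rounding 8-bit multiply, i.e.
 * x * a / 255 rounded to nearest. */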
static force_inline uint16_t
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
{
    __m128i ms;

    ms = unpack_32_1x128 (src);
    return pack_565_32_16 (
        pack_1x128_32 (
            over_1x128 (
                ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
}
static void
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
                               pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Align dst on a 16-byte boundary */
        while (w &&
               ((uintptr_t)dst & 15))
        {
            s = *src++;
            d = *dst;

            *dst++ = composite_over_8888_0565pixel (s, d);
            w--;
        }

        /* It's an 8-pixel loop */
        while (w >= 8)
        {
            /* I'm loading unaligned because I'm not sure
             * about the address alignment.
             */
            xmm_src = load_128_unaligned ((__m128i*) src);
            xmm_dst = load_128_aligned ((__m128i*) dst);

            /* Unpacking */
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            /* I'm loading the next 4 pixels from memory ahead of time
             * to optimize the memory read.
             */
            xmm_src = load_128_unaligned ((__m128i*) (src + 4));

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst0, &xmm_dst1);

            /* Unpacking */
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst2, &xmm_dst3);

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
            src += 8;
        }

        while (w--)
        {
            s = *src++;
            d = *dst;

            *dst++ = composite_over_8888_0565pixel (s, d);
        }
    }
}
static void
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t d;

    __m128i xmm_src, xmm_alpha, xmm_def;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src   = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_pixel_8_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                     &mmx_alpha,
                                                     &mmx_mask,
                                                     &mmx_dest));
            }

            w--;
            dst++;
        }

        while (w >= 4)
        {
            uint32_t m;
            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_def);
            }
            else if (m)
            {
                xmm_dst = load_128_aligned ((__m128i*) dst);
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_pixel_8_1x128 (m);
                mmx_dest = unpack_32_1x128 (d);

                *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
                                                     &mmx_alpha,
                                                     &mmx_mask,
                                                     &mmx_dest));
            }

            w--;
            dst++;
        }
    }
}
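/* sse2_fill: solid fill for 8, 16 and 32 bpp.  Editorial note: the
 * filler value is first replicated to 32 bits (e.g. a 16-bit filler f
 * becomes (f & 0xffff) * 0x00010001), the destination is brought up to
 * 16-byte alignment with byte/word/dword stores, and the bulk is then
 * written in aligned 128-, 64-, 32- and 16-byte chunks before a
 * mirrored scalar tail. */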
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t                 filler)
{
    uint32_t byte_width;
    uint8_t *byte_line;

    __m128i xmm_def;

    if (bpp == 8)
    {
        uint32_t b;
        uint32_t w;

        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;

        b = filler & 0xff;
        w = (b << 8) | b;
        filler = (w << 16) | w;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;

        filler = (filler & 0xffff) * 0x00010001;
    }
    else if (bpp == 32)
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }
    else
    {
        return FALSE;
    }

    xmm_def = create_mask_2x32_128 (filler, filler);

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;
        byte_line += stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = filler;
            w -= 1;
            d += 1;
        }

        while (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 15))
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        while (w >= 128)
        {
            save_128_aligned ((__m128i*)(d),       xmm_def);
            save_128_aligned ((__m128i*)(d + 16),  xmm_def);
            save_128_aligned ((__m128i*)(d + 32),  xmm_def);
            save_128_aligned ((__m128i*)(d + 48),  xmm_def);
            save_128_aligned ((__m128i*)(d + 64),  xmm_def);
            save_128_aligned ((__m128i*)(d + 80),  xmm_def);
            save_128_aligned ((__m128i*)(d + 96),  xmm_def);
            save_128_aligned ((__m128i*)(d + 112), xmm_def);

            d += 128;
            w -= 128;
        }

        if (w >= 64)
        {
            save_128_aligned ((__m128i*)(d),       xmm_def);
            save_128_aligned ((__m128i*)(d + 16),  xmm_def);
            save_128_aligned ((__m128i*)(d + 32),  xmm_def);
            save_128_aligned ((__m128i*)(d + 48),  xmm_def);

            d += 64;
            w -= 64;
        }

        if (w >= 32)
        {
            save_128_aligned ((__m128i*)(d),      xmm_def);
            save_128_aligned ((__m128i*)(d + 16), xmm_def);

            d += 32;
            w -= 32;
        }

        if (w >= 16)
        {
            save_128_aligned ((__m128i*)(d), xmm_def);

            d += 16;
            w -= 16;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        if (w >= 1)
        {
            *(uint8_t *)d = filler;
            w -= 1;
            d += 1;
        }
    }

    return TRUE;
}
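/* src_n_8_8888: SRC of a solid color through an a8 mask.  Editorial
 * note: four mask bytes are tested at once -- 0xffffffff with an opaque
 * source stores the precomputed xmm_def directly, an all-zero mask
 * stores transparent black, and everything else goes through the
 * per-channel multiply. */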
static void
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;

    __m128i xmm_src, xmm_def;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
                   PIXMAN_FORMAT_BPP (dest_image->bits.format),
                   dest_x, dest_y, width, height, 0);
        return;
    }

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_def = create_mask_2x32_128 (src, src);
    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            uint8_t m = *mask++;

            if (m)
            {
                *dst = pack_1x128_32 (
                    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
            }
            else
            {
                *dst = 0;
            }

            w--;
            dst++;
        }

        while (w >= 4)
        {
            m = *((uint32_t*)mask);

            if (srca == 0xff && m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_def);
            }
            else if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
            }
            else
            {
                save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;

            if (m)
            {
                *dst = pack_1x128_32 (
                    pix_multiply_1x128 (
                        xmm_src, expand_pixel_8_1x128 (m)));
            }
            else
            {
                *dst = 0;
            }

            w--;
            dst++;
        }
    }
}
static void
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t m;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                mmx_dest = expand565_16_1x128 (d);

                *dst = pack_565_32_16 (
                    pack_1x128_32 (
                        in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
        }

        while (w >= 8)
        {
            xmm_dst = load_128_aligned ((__m128i*) dst);
            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst0, &xmm_dst1);
            }

            m = *((uint32_t*)mask);
            mask += 4;

            if (m)
            {
                xmm_mask = unpack_32_1x128 (m);
                xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());

                /* Unpacking */
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
        }

        while (w)
        {
            m = *mask++;

            if (m)
            {
                d = *dst;
                mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                mmx_dest = expand565_16_1x128 (d);

                *dst = pack_565_32_16 (
                    pack_1x128_32 (
                        in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
        }
    }
}
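/* Editorial note on the two "pixbuf" paths below: the pixbuf formats
 * carry non-premultiplied color with swapped red/blue order, so the
 * source is fixed up on the fly.  Fully opaque vectors only need
 * invert_colors_2x128, fully transparent ones are skipped, and the
 * mixed case goes through over_rev_non_pre_2x128, which premultiplies
 * before the usual OVER. */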
static void
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i ms;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x128 (s);

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (
                    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
            w--;
        }

        while (w >= 8)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

            /* preload next round */
            xmm_src = load_128_unaligned ((__m128i*)(src + 4));

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst0, &xmm_dst1);
            }
            else if (!zero)
            {
                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst0, &xmm_dst1);
            }

            opaque = is_opaque (xmm_src);
            zero = is_zero (xmm_src);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst2, &xmm_dst3);
            }
            else if (!zero)
            {
                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            src += 8;
            dst += 8;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            ms = unpack_32_1x128 (s);

            *dst++ = pack_565_32_16 (
                pack_1x128_32 (
                    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
            w--;
        }
    }
}
static void
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst, d;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;
    uint32_t opaque, zero;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x128_32 (
                over_rev_non_pre_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (d)));

            w--;
        }

        while (w >= 4)
        {
            xmm_src_hi = load_128_unaligned ((__m128i*)src);

            opaque = is_opaque (xmm_src_hi);
            zero = is_zero (xmm_src_hi);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

            if (opaque)
            {
                invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
                                     &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }
            else if (!zero)
            {
                xmm_dst_hi = load_128_aligned ((__m128i*)dst);

                unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

                over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
                                        &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w)
        {
            s = *src++;
            d = *dst;

            *dst++ = pack_1x128_32 (
                over_rev_non_pre_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (d)));

            w--;
        }
    }
}
static void
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst, d;
    uint32_t    *mask_line, *mask, m;
    int dst_stride, mask_stride;
    int w;
    uint32_t pack_cmp;

    __m128i xmm_src, xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;

    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);
    xmm_alpha = expand_alpha_1x128 (xmm_src);
    mmx_src = xmm_src;
    mmx_alpha = xmm_alpha;

    while (height--)
    {
        w = width;
        mask = mask_line;
        dst = dst_line;
        mask_line += mask_stride;
        dst_line += dst_stride;

        while (w && ((uintptr_t)dst & 15))
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = expand565_16_1x128 (d);

                *dst = pack_565_32_16 (
                    pack_1x128_32 (
                        in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
            mask++;
        }

        while (w >= 8)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            unpack_565_128_4x128 (xmm_dst,
                                  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

            /* preload next round */
            xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));

            if (pack_cmp != 0xffff)
            {
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst0, &xmm_dst1);
            }

            pack_cmp = _mm_movemask_epi8 (
                _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);

            if (pack_cmp != 0xffff)
            {
                in_over_2x128 (&xmm_src, &xmm_src,
                               &xmm_alpha, &xmm_alpha,
                               &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst2, &xmm_dst3);
            }

            save_128_aligned (
                (__m128i*)dst, pack_565_4x128_128 (
                    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));

            w -= 8;
            dst += 8;
            mask += 8;
        }

        while (w)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                d = *dst;
                mmx_mask = unpack_32_1x128 (m);
                mmx_dest = expand565_16_1x128 (d);

                *dst = pack_565_32_16 (
                    pack_1x128_32 (
                        in_over_1x128 (
                            &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
            }

            w--;
            dst++;
            mask++;
        }
    }
}
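/* The IN family below computes dest = srca * mask * dest on a8
 * destinations; it is pure 8-bit alpha arithmetic, processed sixteen
 * bytes per iteration with pix_multiply_2x128 (editorial note). */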
static void
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    uint32_t d, m;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    pix_multiply_1x128 (xmm_alpha,
                                        unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_mask_lo, &xmm_mask_hi,
                                &xmm_mask_lo, &xmm_mask_hi);

            pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    pix_multiply_1x128 (
                        xmm_alpha, unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
            w--;
        }
    }
}
static void
sse2_composite_in_n_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    int dst_stride;
    uint32_t d;
    uint32_t src;
    int32_t w;

    __m128i xmm_alpha;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    src = src >> 24;

    if (src == 0xff)
        return;

    if (src == 0x00)
    {
        pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
                     8, dest_x, dest_y, width, height, src);

        return;
    }

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    xmm_alpha,
                    unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            dst += 16;
            w -= 16;
        }

        while (w)
        {
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    xmm_alpha,
                    unpack_32_1x128 (d)));
            w--;
        }
    }
}
static void
sse2_composite_in_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;
    uint32_t s, d;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_src = load_128_unaligned ((__m128i*)src);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                                &xmm_dst_lo, &xmm_dst_hi,
                                &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            src += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            s = (uint32_t) *src++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
            w--;
        }
    }
}
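/* The ADD family below relies on saturating adds; in scalar terms
 *
 *     dst = MIN (dst + srca * mask, 0xff)
 *
 * which _mm_adds_epu8/_mm_adds_epu16 provide directly (editorial
 * note; the first variant also multiplies in the solid source alpha). */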
static void
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
                          pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint32_t m, d;

    __m128i xmm_alpha;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                _mm_adds_epu16 (
                    pix_multiply_1x128 (
                        xmm_alpha, unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
            w--;
        }

        while (w >= 16)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

            pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
                                &xmm_mask_lo, &xmm_mask_hi,
                                &xmm_mask_lo, &xmm_mask_hi);

            xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
            xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

            mask += 16;
            dst += 16;
            w -= 16;
        }

        while (w)
        {
            m = (uint32_t) *mask++;
            d = (uint32_t) *dst;

            *dst++ = (uint8_t) pack_1x128_32 (
                _mm_adds_epu16 (
                    pix_multiply_1x128 (
                        xmm_alpha, unpack_32_1x128 (m)),
                    unpack_32_1x128 (d)));
            w--;
        }
    }
}
static void
sse2_composite_add_n_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    int dst_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    src >>= 24;

    if (src == 0x00)
        return;

    if (src == 0xff)
    {
        pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
                     8, dest_x, dest_y, width, height, 0xff);

        return;
    }

    src = (src << 24) | (src << 16) | (src << 8) | src;
    xmm_src = _mm_set_epi32 (src, src, src, src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            *dst = (uint8_t)_mm_cvtsi128_si32 (
                _mm_adds_epu8 (
                    xmm_src,
                    _mm_cvtsi32_si128 (*dst)));

            w--;
            dst++;
        }

        while (w >= 16)
        {
            save_128_aligned (
                (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

            dst += 16;
            w -= 16;
        }

        while (w)
        {
            *dst = (uint8_t)_mm_cvtsi128_si32 (
                _mm_adds_epu8 (
                    xmm_src,
                    _mm_cvtsi32_si128 (*dst)));

            w--;
            dst++;
        }
    }
}
static void
sse2_composite_add_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Small head */
        while (w && (uintptr_t)dst & 3)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }

        sse2_combine_add_u (imp, op,
                            (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

        /* Small tail */
        dst += w & 0xfffc;
        src += w & 0xfffc;

        w &= 3;

        while (w)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }
    }
}
static void
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        sse2_combine_add_u (imp, op, dst, src, NULL, width);
    }
}
static void
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst, src;
    int dst_stride;

    __m128i xmm_src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    if (src == 0)
        return;

    if (src == ~0)
    {
        pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
                     dest_x, dest_y, width, height, ~0);

        return;
    }

    xmm_src = _mm_set_epi32 (src, src, src, src);
    while (height--)
    {
        int w = width;
        uint32_t d;

        dst = dst_line;
        dst_line += dst_stride;

        while (w && (uintptr_t)dst & 15)
        {
            d = *dst;
            *dst++ =
                _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
            w--;
        }

        while (w >= 4)
        {
            save_128_aligned
                ((__m128i*)dst,
                 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));

            dst += 4;
            w -= 4;
        }

        while (w--)
        {
            d = *dst;
            *dst++ =
                _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
                                                  _mm_cvtsi32_si128 (d)));
        }
    }
}
static void
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;

    __m128i xmm_src;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
    if (src == 0)
        return;
    xmm_src = expand_pixel_32_1x128 (src);

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && ((uintptr_t)dst & 15))
        {
            uint8_t m = *mask++;
            if (m)
            {
                *dst = pack_1x128_32
                    (_mm_adds_epu16
                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
                      unpack_32_1x128 (*dst)));
            }
            dst++;
            w--;
        }

        while (w >= 4)
        {
            uint32_t m = *(uint32_t*)mask;
            if (m)
            {
                __m128i xmm_mask_lo, xmm_mask_hi;
                __m128i xmm_dst_lo, xmm_dst_hi;

                __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
                __m128i xmm_mask =
                    _mm_unpacklo_epi8 (unpack_32_1x128(m),
                                       _mm_setzero_si128 ());

                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
                                        &xmm_mask_lo, &xmm_mask_hi);

                pix_multiply_2x128 (&xmm_src, &xmm_src,
                                    &xmm_mask_lo, &xmm_mask_hi,
                                    &xmm_mask_lo, &xmm_mask_hi);

                xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
                xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);

                save_128_aligned (
                    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            w -= 4;
            dst += 4;
            mask += 4;
        }

        while (w)
        {
            uint8_t m = *mask++;
            if (m)
            {
                *dst = pack_1x128_32
                    (_mm_adds_epu16
                     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
                      unpack_32_1x128 (*dst)));
            }
            dst++;
            w--;
        }
    }
}
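/* sse2_blt below is a straight rectangle copy for equal 16/32 bpp
 * surfaces: small head/tail loops bring the destination up to 16-byte
 * alignment around an unrolled 64-byte loop.  Loads stay unaligned
 * because the source alignment is unknown; stores require the aligned
 * destination (editorial note). */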
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dest_x,
          int                      dest_y,
          int                      width,
          int                      height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        while (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 15))
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        while (w >= 64)
        {
            __m128i xmm0, xmm1, xmm2, xmm3;

            xmm0 = load_128_unaligned ((__m128i*)(s));
            xmm1 = load_128_unaligned ((__m128i*)(s + 16));
            xmm2 = load_128_unaligned ((__m128i*)(s + 32));
            xmm3 = load_128_unaligned ((__m128i*)(s + 48));

            save_128_aligned ((__m128i*)(d),      xmm0);
            save_128_aligned ((__m128i*)(d + 16), xmm1);
            save_128_aligned ((__m128i*)(d + 32), xmm2);
            save_128_aligned ((__m128i*)(d + 48), xmm3);

            s += 64;
            d += 64;
            w -= 64;
        }

        while (w >= 16)
        {
            save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );

            w -= 16;
            d += 16;
            s += 16;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    return TRUE;
}
static void
sse2_composite_copy_area (pixman_implementation_t *imp,
                          pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    sse2_blt (imp, src_image->bits.bits,
              dest_image->bits.bits,
              src_image->bits.rowstride,
              dest_image->bits.rowstride,
              PIXMAN_FORMAT_BPP (src_image->bits.format),
              PIXMAN_FORMAT_BPP (dest_image->bits.format),
              src_x, src_y, dest_x, dest_y, width, height);
}
static void
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t     *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;
    __m128i ms;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            s = 0xff000000 | *src++;
            m = (uint32_t) *mask++;
            d = *dst;
            ms = unpack_32_1x128 (s);

            if (m != 0xff)
            {
                __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                __m128i md = unpack_32_1x128 (d);

                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
            }

            *dst++ = pack_1x128_32 (ms);
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t*) mask;
            xmm_src = _mm_or_si128 (
                load_128_unaligned ((__m128i*)src), mask_ff000000);

            if (m == 0xffffffff)
            {
                save_128_aligned ((__m128i*)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i*)dst);

                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_rev_2x128 (
                    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                               &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
                               &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            m = (uint32_t) *mask++;

            if (m)
            {
                s = 0xff000000 | *src;

                if (m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ma, md, ms;

                    d = *dst;

                    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
                    md = unpack_32_1x128 (d);
                    ms = unpack_32_1x128 (s);

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
                }
            }

            src++;
            dst++;
            w--;
        }
    }
}
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
                                 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint8_t     *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            m = *(uint32_t *) mask;

            if (m)
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (m == 0xffffffff && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (uint32_t) *mask++;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }
}
static void
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    __m128i xmm_src;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_dsta_hi, xmm_dsta_lo;
    int dst_stride;
    int32_t w;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    xmm_src = expand_pixel_32_1x128 (src);

    while (height--)
    {
        dst = dst_line;

        dst_line += dst_stride;
        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m128i tmp_lo, tmp_hi;

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);

            tmp_lo = xmm_src;
            tmp_hi = xmm_src;

            over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                        &xmm_dsta_lo, &xmm_dsta_hi,
                        &tmp_lo, &tmp_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));

            w -= 4;
            dst += 4;
        }

        while (w)
        {
            __m128i vd;

            vd = unpack_32_1x128 (*dst);

            *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
                                              xmm_src));
            w--;
            dst++;
        }
    }
}
static void
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
                                    pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *src, *src_line, s;
    uint32_t    *dst, *dst_line, d;
    uint32_t    *mask, *mask_line;
    uint32_t m;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w && (uintptr_t)dst & 15)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }

        while (w >= 4)
        {
            xmm_mask = load_128_unaligned ((__m128i*)mask);

            if (!is_transparent (xmm_mask))
            {
                xmm_src = load_128_unaligned ((__m128i*)src);

                if (is_opaque (xmm_mask) && is_opaque (xmm_src))
                {
                    save_128_aligned ((__m128i *)dst, xmm_src);
                }
                else
                {
                    xmm_dst = load_128_aligned ((__m128i *)dst);

                    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                                   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
                }
            }

            src += 4;
            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w)
        {
            uint32_t sa;

            s = *src++;
            m = (*mask++) >> 24;
            d = *dst;

            sa = s >> 24;

            if (m)
            {
                if (sa == 0xff && m == 0xff)
                {
                    *dst = s;
                }
                else
                {
                    __m128i ms, md, ma, msa;

                    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                    ms = unpack_32_1x128 (s);
                    md = unpack_32_1x128 (d);

                    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
                }
            }

            dst++;
            w--;
        }
    }
}
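/* Editorial note on the scaled (nearest-neighbour) paths below: vx
 * steps through the source in 16.16 fixed point and is wrapped with
 * src_width_fixed for NORMAL repeat; four gathered source pixels are
 * packed into one register with _mm_set_epi32 and then combined just
 * like the unscaled OVER path. */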
/* A variant of 'sse2_combine_over_u' with minor tweaks */
static force_inline void
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
                                             const uint32_t* ps,
                                             int32_t         w,
                                             pixman_fixed_t  vx,
                                             pixman_fixed_t  unit_x,
                                             pixman_fixed_t  src_width_fixed,
                                             pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp2 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp3 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp4 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);

        if (is_opaque (xmm_src_hi))
        {
            save_128_aligned ((__m128i*)pd, xmm_src_hi);
        }
        else if (!is_zero (xmm_src_hi))
        {
            xmm_dst_hi = load_128_aligned ((__m128i*) pd);

            unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

            expand_alpha_2x128 (
                xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);

            over_2x128 (&xmm_src_lo, &xmm_src_hi,
                        &xmm_alpha_lo, &xmm_alpha_hi,
                        &xmm_dst_lo, &xmm_dst_hi);

            /* rebuild the 4 pixel data and save */
            save_128_aligned ((__m128i*)pd,
                              pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        w -= 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        *pd++ = core_combine_over_u_pixel_sse2 (s, d);
        if (pm)
            pm++;

        w--;
    }
}
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
                       scaled_nearest_scanline_sse2_8888_8888_OVER,
                       uint32_t, uint32_t, NORMAL)
static force_inline void
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
                                               uint32_t *       dst,
                                               const uint32_t * src,
                                               int32_t          w,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   src_width_fixed,
                                               pixman_bool_t    zero_src)
{
    __m128i xmm_mask;
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    if (zero_src || (*mask >> 24) == 0)
        return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && (uintptr_t)dst & 15)
    {
        uint32_t s = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha     = expand_alpha_1x128 (ms);
            __m128i dest      = xmm_mask;
            __m128i alpha_dst = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
        }
        dst++;
        w--;
    }

    while (w >= 4)
    {
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp2 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp3 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp4 = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);

        if (!is_zero (xmm_src))
        {
            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned (
                (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        dst += 4;
        w -= 4;
    }

    while (w)
    {
        uint32_t s = *(src + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        if (s)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (s);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i mask  = xmm_mask;
            __m128i dest  = unpack_32_1x128 (d);

            *dst = pack_1x128_32 (
                in_over_1x128 (&ms, &alpha, &mask, &dest));
        }

        dst++;
        w--;
    }
}
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
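/* Editorial summary of the bilinear machinery below: each output pixel
 * is a weighted average of a 2x2 block of source pixels.  With wt/wb
 * the top/bottom weights from the fractional y coordinate, and wl/wr
 * the horizontal weights from the fractional part of vx, the result is
 * roughly
 *
 *     pix = (tl*wt*wl + tr*wt*wr + bl*wb*wl + br*wb*wr)
 *           >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * The macros keep x coordinates as 16.16 fixed point in xmm_x and
 * advance them by unit_x per pixel; the PSHUFD variant batches the
 * horizontal-weight computation for four pixels at a time. */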
/***********************************************************************************/

#if PSHUFD_IS_FAST

/***********************************************************/

# define BILINEAR_DECLARE_VARIABLES \
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
                                           unit_x, -unit_x, unit_x, -unit_x); \
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
                                           unit_x * 4, -unit_x * 4, \
                                           unit_x * 4, -unit_x * 4, \
                                           unit_x * 4, -unit_x * 4); \
    const __m128i xmm_zero = _mm_setzero_si128 (); \
    __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
                                   vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
                                   vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
                                   vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
    __m128i xmm_wh_state;

#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
do { \
    int phase = phase_; \
    __m128i xmm_wh, xmm_a, xmm_b; \
    /* fetch 2x2 pixel block into sse2 registers */ \
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
    vx += unit_x; \
    /* vertical interpolation */ \
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
    /* calculate horizontal weights */ \
    if (phase <= 0) \
    { \
        xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
                                        16 - BILINEAR_INTERPOLATION_BITS)); \
        xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
        phase = 0; \
    } \
    xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
                                                           phase, phase)); \
    /* horizontal interpolation */ \
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
                xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
    /* shift the result */ \
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
} while (0)

#else /************************************************************************/

# define BILINEAR_DECLARE_VARIABLES \
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
    const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
                                           unit_x, -unit_x, unit_x, -unit_x); \
    const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
                                           unit_x * 4, -unit_x * 4, \
                                           unit_x * 4, -unit_x * 4, \
                                           unit_x * 4, -unit_x * 4); \
    const __m128i xmm_zero = _mm_setzero_si128 (); \
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
                                   vx, -(vx + 1), vx, -(vx + 1))

#define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
do { \
    __m128i xmm_wh, xmm_a, xmm_b; \
    /* fetch 2x2 pixel block into sse2 registers */ \
    __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
    __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
    (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
    vx += unit_x; \
    /* vertical interpolation */ \
    xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
    xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
    xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
    /* calculate horizontal weights */ \
    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
                                        16 - BILINEAR_INTERPOLATION_BITS)); \
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
    /* horizontal interpolation */ \
    xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
    xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
    /* shift the result */ \
    pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
} while (0)

#endif /************************************************************************/

/***********************************************************************************/

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
do { \
    __m128i xmm_pix; \
    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
    xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
    xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
    pix = _mm_cvtsi128_si32 (xmm_pix); \
} while(0)

#define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix) \
do { \
    __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
    BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
    xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
    xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
    pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
} while(0)

#define BILINEAR_SKIP_ONE_PIXEL() \
do { \
    vx += unit_x; \
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
} while(0)

#define BILINEAR_SKIP_FOUR_PIXELS() \
do { \
    vx += unit_x * 4; \
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
} while(0)

/***********************************************************************************/
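/* Editorial note: the scanline functions below all follow one pattern:
 * interpolate single pixels until dst reaches 16-byte alignment, then
 * process four pixels per iteration with
 * BILINEAR_INTERPOLATE_FOUR_PIXELS, then finish the remainder one
 * pixel at a time.  The FAST_BILINEAR_MAINLOOP_COMMON macros from
 * pixman-inlines.h wrap each scanline into a full fast path with
 * COVER/PAD/NONE/NORMAL repeat handling. */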
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx_,
                                             pixman_fixed_t   unit_x_,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    while (w && ((uintptr_t)dst & 15))
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        *dst++ = pix1;
        w--;
    }

    while ((w -= 4) >= 0) {
        __m128i xmm_src;
        BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
        _mm_store_si128 ((__m128i *)dst, xmm_src);
        dst += 4;
    }

    if (w & 2)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
        *dst++ = pix1;
        *dst++ = pix2;
    }

    if (w & 1)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        *dst = pix1;
    }
}
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
                               scaled_bilinear_scanline_sse2_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)
static force_inline void
scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx_,
                                             pixman_fixed_t   unit_x_,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    while (w && ((uintptr_t)dst & 15))
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        *dst++ = pix1 | 0xFF000000;
        w--;
    }

    while ((w -= 4) >= 0) {
        __m128i xmm_src;
        BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
        _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
        dst += 4;
    }

    if (w & 2)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
        *dst++ = pix1 | 0xFF000000;
        *dst++ = pix2 | 0xFF000000;
    }

    if (w & 1)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        *dst = pix1 | 0xFF000000;
    }
}
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
                               scaled_bilinear_scanline_sse2_x888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
                               scaled_bilinear_scanline_sse2_x888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
                               scaled_bilinear_scanline_sse2_x888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)
static force_inline void
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
                                              const uint32_t * mask,
                                              const uint32_t * src_top,
                                              const uint32_t * src_bottom,
                                              int32_t          w,
                                              int              wt,
                                              int              wb,
                                              pixman_fixed_t   vx_,
                                              pixman_fixed_t   unit_x_,
                                              pixman_fixed_t   max_vx,
                                              pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;

    while (w && ((uintptr_t)dst & 15))
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

        if (pix1)
        {
            pix2 = *dst;
            *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
        }

        w--;
        dst++;
    }

    while (w >= 4)
    {
        __m128i xmm_src;
        __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
        __m128i xmm_alpha_hi, xmm_alpha_lo;

        BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

        if (!is_zero (xmm_src))
        {
            if (is_opaque (xmm_src))
            {
                save_128_aligned ((__m128i *)dst, xmm_src);
            }
            else
            {
                __m128i xmm_dst = load_128_aligned ((__m128i *)dst);

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
                over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }
        }

        w -= 4;
        dst += 4;
    }

    while (w)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

        if (pix1)
        {
            pix2 = *dst;
            *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
        }

        w--;
        dst++;
    }
}
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
                               scaled_bilinear_scanline_sse2_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
                               scaled_bilinear_scanline_sse2_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
                               scaled_bilinear_scanline_sse2_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
                               scaled_bilinear_scanline_sse2_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)
static force_inline void
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
                                                const uint8_t  * mask,
                                                const uint32_t * src_top,
                                                const uint32_t * src_bottom,
                                                int32_t          w,
                                                int              wt,
                                                int              wb,
                                                pixman_fixed_t   vx_,
                                                pixman_fixed_t   unit_x_,
                                                pixman_fixed_t   max_vx,
                                                pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2;
    uint32_t m;

    while (w && ((uintptr_t)dst & 15))
    {
        uint32_t sa;

        m = (uint32_t) *mask++;

        if (m)
        {
            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
            sa = pix1 >> 24;

            if (sa == 0xff && m == 0xff)
            {
                *dst = pix1;
            }
            else
            {
                __m128i ms, md, ma, msa;

                pix2 = *dst;
                ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                ms = unpack_32_1x128 (pix1);
                md = unpack_32_1x128 (pix2);

                msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
            }
        }
        else
        {
            BILINEAR_SKIP_ONE_PIXEL ();
        }

        w--;
        dst++;
    }

    while (w >= 4)
    {
        uint32_t sa;

        __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
        __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
        __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;

        m = *(uint32_t*)mask;

        if (m)
        {
            BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

            if (m == 0xffffffff && is_opaque (xmm_src))
            {
                save_128_aligned ((__m128i *)dst, xmm_src);
            }
            else
            {
                xmm_dst = load_128_aligned ((__m128i *)dst);

                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());

                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

                expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
                expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

                in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
                               &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);

                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
            }
        }
        else
        {
            BILINEAR_SKIP_FOUR_PIXELS ();
        }

        w -= 4;
        dst += 4;
        mask += 4;
    }

    while (w)
    {
        uint32_t sa;

        m = (uint32_t) *mask++;

        if (m)
        {
            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
            sa = pix1 >> 24;

            if (sa == 0xff && m == 0xff)
            {
                *dst = pix1;
            }
            else
            {
                __m128i ms, md, ma, msa;

                pix2 = *dst;
                ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
                ms = unpack_32_1x128 (pix1);
                md = unpack_32_1x128 (pix2);

                msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));

                *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
            }
        }
        else
        {
            BILINEAR_SKIP_ONE_PIXEL ();
        }

        w--;
        dst++;
    }
}
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
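/* The 8888_n_8888 variant uses a single solid mask value: the mask alpha is
 * broadcast once into xmm_mask and applied to every interpolated pixel. */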
static force_inline void
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
						const uint32_t * mask,
						const uint32_t * src_top,
						const uint32_t * src_bottom,
						int32_t          w,
						int              wt,
						int              wb,
						pixman_fixed_t   vx_,
						pixman_fixed_t   unit_x_,
						pixman_fixed_t   max_vx,
						pixman_bool_t    zero_src)
{
    intptr_t vx = vx_;
    intptr_t unit_x = unit_x_;
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1;
    __m128i xmm_mask;

    if (zero_src || (*mask >> 24) == 0)
	return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (pix1);
	    __m128i alpha     = expand_alpha_1x128 (ms);
	    __m128i dest      = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32
		(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}

	dst++;
	w--;
    }

    while (w >= 4)
    {
	__m128i xmm_src;

	BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);

	if (!is_zero (xmm_src))
	{
	    __m128i xmm_src_lo, xmm_src_hi;
	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	    __m128i xmm_alpha_lo, xmm_alpha_hi;

	    xmm_dst = load_128_aligned ((__m128i *)dst);

	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);

	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);

	    save_128_aligned
		((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}

	dst += 4;
	w -= 4;
    }

    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
	    uint32_t d = *dst;

	    __m128i ms = unpack_32_1x128 (pix1);
	    __m128i alpha     = expand_alpha_1x128 (ms);
	    __m128i dest      = xmm_mask;
	    __m128i alpha_dst = unpack_32_1x128 (d);

	    *dst = pack_1x128_32
		(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}

	dst++;
	w--;
    }
}
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_HAVE_SOLID_MASK)
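/* The fast path table maps (operator, source, mask, destination) format
 * combinations onto the specialized compositors defined in this file;
 * pixman consults it before falling back to the general path. */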
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
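    /* The remaining entries route scaled compositing (nearest and bilinear
     * filters) to the scanline workers defined earlier in this file. */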
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

    { PIXMAN_OP_NONE },
};
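/* Iterator fetchers: each converts one scanline of the source format to
 * a8r8g8b8 in iter->buffer and advances iter->bits by one stride, so the
 * general pipeline can consume these formats through SSE2. */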
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    while (w >= 4)
    {
	save_128_aligned (
	    (__m128i *)dst, _mm_or_si128 (
		load_128_unaligned ((__m128i *)src), ff000000));

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    return iter->buffer;
}
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    while (w >= 8)
    {
	__m128i lo, hi, s;

	s = _mm_loadu_si128 ((__m128i *)src);

	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    return iter->buffer;
}
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    while (w >= 16)
    {
	xmm0 = _mm_loadu_si128 ((__m128i *)src);

	/* Interleaving with zeros twice moves each alpha byte into the top
	 * byte of its own 32-bit lane, i.e. an 8888 pixel with r = g = b = 0. */
	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128 (), xmm0);
	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128 (), xmm0);
	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);

	_mm_store_si128 ((__m128i *)(dst +  0), xmm3);
	_mm_store_si128 ((__m128i *)(dst +  4), xmm4);
	_mm_store_si128 ((__m128i *)(dst +  8), xmm5);
	_mm_store_si128 ((__m128i *)(dst + 12), xmm6);

	dst += 16;
	src += 16;
	w -= 16;
    }

    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    return iter->buffer;
}
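/* IMAGE_FLAGS restricts these iterators to untransformed bits images whose
 * samples fully cover the clip; anything else uses the general fetchers. */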
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
static const pixman_iter_info_t sse2_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
    },
    { PIXMAN_null },
};
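/* On 32-bit x86 the incoming stack may be only 4-byte aligned;
 * __force_align_arg_pointer__ makes GCC realign it on entry so the 16-byte
 * SSE temporaries used below are safe. */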
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->iter_info = sse2_iters;

    return imp;
}
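/* Usage note (a sketch; the exact call site lives in pixman's x86 dispatch
 * code): when a runtime CPUID check reports SSE2, this implementation is
 * chained in front of the generic one, roughly
 *
 *     imp = _pixman_implementation_create_sse2 (imp);
 *
 * so operations without an SSE2 path fall through to `fallback`. */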