/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
#define PSHUFD_IS_FAST 0

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

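/* These constants are declared here but not initialized in this excerpt;
 * in the full file they are set up once, when the SSE2 implementation is
 * created (in _pixman_implementation_create_sse2 ()), using the
 * create_mask_16_128 () / create_mask_2x32_128 () helpers defined further
 * down. */
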
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

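/* Naming convention used by the helpers below (an observation, not an
 * original comment): the *_1x128 variants operate on a single __m128i of
 * unpacked 16-bit channels (one or two pixels), while the *_2x128 variants
 * operate on a lo/hi pair of such registers, i.e. the four pixels produced
 * by unpack_128_2x128 () above. */
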
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

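/* What unpack_565_to_8888 () does, per pixel, in scalar terms (assuming the
 * usual mask values, e.g. mask_red == 0x00f80000): each 5- or 6-bit field is
 * shifted to the top of its 8-bit channel and its top bits are replicated
 * into the vacated low bits, so that 0x1f expands to 0xff rather than 0xf8:
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 */
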
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}

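/* A sketch of the PMADDWD trick above, assuming the usual constants
 * (mask_565_rb == 0x00f800f8 and mask_565_pack_multiplier == 0x20000004 per
 * 32-bit lane): the multiply-add moves the blue and red fields into the
 * positions they occupy in a 565 value shifted left by 5; green, masked by
 * mask_green, already sits there; and the shift/arithmetic-shift/signed-pack
 * sequence that follows emulates the SSE4.1 _mm_packus_epi32 that plain
 * SSE2 lacks. */
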
static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

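/* Why 0x8888 works in is_opaque () and is_transparent (): PMOVMSKB returns
 * one bit per byte, and for four packed a8r8g8b8 pixels the alpha bytes are
 * bytes 3, 7, 11 and 15, which are exactly the bits selected by 0x8888.  So
 * these predicates look only at the four alpha channels, while is_zero ()
 * tests the whole register. */
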
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

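/* pix_multiply_2x128 () above (and pix_multiply_1x128 () further down)
 * performs the usual rounded "multiply two 8-bit values and divide by 255",
 * one channel per 16-bit lane, assuming mask_0080 holds 0x0080 and
 * mask_0101 holds 0x0101 in every lane.  In scalar terms:
 *
 *     t = a * b + 0x80;
 *     result = (t * 0x101) >> 16;    -* effectively (t + (t >> 8)) >> 8 *-
 *
 * which equals (a * b) / 255 rounded to nearest for all a, b in [0, 255],
 * the same identity as pixman's scalar MUL_UN8. */
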
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

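/* In Porter-Duff terms (premultiplied 8-bit channels):
 *
 *   over_2x128 ()              dst = src + dst * (255 - alpha) / 255, i.e.
 *                              the OVER operator when 'alpha' is the source
 *                              alpha;
 *   in_over_2x128 ()           (src IN mask) OVER dst, the workhorse of the
 *                              masked fast paths further down;
 *   pix_add_multiply_2x128 ()  src * alpha_dst / 255 + dst * alpha_src / 255
 *                              with a saturating add, the common shape of
 *                              the ATOP and XOR combiners below.
 */
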
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

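/* combine1 () and combine4 () are the masked source fetches used by the
 * unified-alpha combiners below: when a mask pointer is supplied, the source
 * pixel(s) are first multiplied by the mask's alpha channel; when pm is NULL
 * the source is used as is.  combine4 () also short-circuits to zero when
 * all four mask alphas are zero (is_transparent ()). */
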
573 static force_inline void
574 core_combine_over_u_sse2_mask (uint32_t * pd,
575 const uint32_t* ps,
576 const uint32_t* pm,
577 int w)
579 uint32_t s, d;
581 /* Align dst on a 16-byte boundary */
582 while (w && ((uintptr_t)pd & 15))
584 d = *pd;
585 s = combine1 (ps, pm);
587 if (s)
588 *pd = core_combine_over_u_pixel_sse2 (s, d);
589 pd++;
590 ps++;
591 pm++;
592 w--;
595 while (w >= 4)
597 __m128i mask = load_128_unaligned ((__m128i *)pm);
599 if (!is_zero (mask))
601 __m128i src;
602 __m128i src_hi, src_lo;
603 __m128i mask_hi, mask_lo;
604 __m128i alpha_hi, alpha_lo;
606 src = load_128_unaligned ((__m128i *)ps);
608 if (is_opaque (_mm_and_si128 (src, mask)))
610 save_128_aligned ((__m128i *)pd, src);
612 else
614 __m128i dst = load_128_aligned ((__m128i *)pd);
615 __m128i dst_hi, dst_lo;
617 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
618 unpack_128_2x128 (src, &src_lo, &src_hi);
620 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
621 pix_multiply_2x128 (&src_lo, &src_hi,
622 &mask_lo, &mask_hi,
623 &src_lo, &src_hi);
625 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
627 expand_alpha_2x128 (src_lo, src_hi,
628 &alpha_lo, &alpha_hi);
630 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
631 &dst_lo, &dst_hi);
633 save_128_aligned (
634 (__m128i *)pd,
635 pack_2x128_128 (dst_lo, dst_hi));
639 pm += 4;
640 ps += 4;
641 pd += 4;
642 w -= 4;
644 while (w)
646 d = *pd;
647 s = combine1 (ps, pm);
649 if (s)
650 *pd = core_combine_over_u_pixel_sse2 (s, d);
651 pd++;
652 ps++;
653 pm++;
655 w--;
659 static force_inline void
660 core_combine_over_u_sse2_no_mask (uint32_t * pd,
661 const uint32_t* ps,
662 int w)
664 uint32_t s, d;
666 /* Align dst on a 16-byte boundary */
667 while (w && ((uintptr_t)pd & 15))
669 d = *pd;
670 s = *ps;
672 if (s)
673 *pd = core_combine_over_u_pixel_sse2 (s, d);
674 pd++;
675 ps++;
676 w--;
679 while (w >= 4)
681 __m128i src;
682 __m128i src_hi, src_lo, dst_hi, dst_lo;
683 __m128i alpha_hi, alpha_lo;
685 src = load_128_unaligned ((__m128i *)ps);
687 if (!is_zero (src))
689 if (is_opaque (src))
691 save_128_aligned ((__m128i *)pd, src);
693 else
695 __m128i dst = load_128_aligned ((__m128i *)pd);
697 unpack_128_2x128 (src, &src_lo, &src_hi);
698 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
700 expand_alpha_2x128 (src_lo, src_hi,
701 &alpha_lo, &alpha_hi);
702 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
703 &dst_lo, &dst_hi);
705 save_128_aligned (
706 (__m128i *)pd,
707 pack_2x128_128 (dst_lo, dst_hi));
711 ps += 4;
712 pd += 4;
713 w -= 4;
715 while (w)
717 d = *pd;
718 s = *ps;
720 if (s)
721 *pd = core_combine_over_u_pixel_sse2 (s, d);
722 pd++;
723 ps++;
725 w--;
729 static force_inline void
730 sse2_combine_over_u (pixman_implementation_t *imp,
731 pixman_op_t op,
732 uint32_t * pd,
733 const uint32_t * ps,
734 const uint32_t * pm,
735 int w)
737 if (pm)
738 core_combine_over_u_sse2_mask (pd, ps, pm, w);
739 else
740 core_combine_over_u_sse2_no_mask (pd, ps, w);
743 static void
744 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
745 pixman_op_t op,
746 uint32_t * pd,
747 const uint32_t * ps,
748 const uint32_t * pm,
749 int w)
751 uint32_t s, d;
753 __m128i xmm_dst_lo, xmm_dst_hi;
754 __m128i xmm_src_lo, xmm_src_hi;
755 __m128i xmm_alpha_lo, xmm_alpha_hi;
757 /* Align dst on a 16-byte boundary */
758 while (w &&
759 ((uintptr_t)pd & 15))
761 d = *pd;
762 s = combine1 (ps, pm);
764 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
765 w--;
766 ps++;
767 if (pm)
768 pm++;
771 while (w >= 4)
773 /* I'm loading unaligned because I'm not sure
774 * about the address alignment. */
776 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi);
785 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786 &xmm_alpha_lo, &xmm_alpha_hi,
787 &xmm_src_lo, &xmm_src_hi);
789 /* rebuild the 4 pixel data and save */
790 save_128_aligned ((__m128i*)pd,
791 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
793 w -= 4;
794 ps += 4;
795 pd += 4;
797 if (pm)
798 pm += 4;
801 while (w)
803 d = *pd;
804 s = combine1 (ps, pm);
806 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
807 ps++;
808 w--;
809 if (pm)
810 pm++;
814 static force_inline uint32_t
815 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
817 uint32_t maska = src >> 24;
819 if (maska == 0)
821 return 0;
823 else if (maska != 0xff)
825 return pack_1x128_32 (
826 pix_multiply_1x128 (unpack_32_1x128 (dst),
827 expand_alpha_1x128 (unpack_32_1x128 (src))));
830 return dst;
833 static void
834 sse2_combine_in_u (pixman_implementation_t *imp,
835 pixman_op_t op,
836 uint32_t * pd,
837 const uint32_t * ps,
838 const uint32_t * pm,
839 int w)
841 uint32_t s, d;
843 __m128i xmm_src_lo, xmm_src_hi;
844 __m128i xmm_dst_lo, xmm_dst_hi;
846 while (w && ((uintptr_t)pd & 15))
848 s = combine1 (ps, pm);
849 d = *pd;
851 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
852 w--;
853 ps++;
854 if (pm)
855 pm++;
858 while (w >= 4)
860 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
861 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
863 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
864 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
866 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
867 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
868 &xmm_dst_lo, &xmm_dst_hi,
869 &xmm_dst_lo, &xmm_dst_hi);
871 save_128_aligned ((__m128i*)pd,
872 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 ps += 4;
875 pd += 4;
876 w -= 4;
877 if (pm)
878 pm += 4;
881 while (w)
883 s = combine1 (ps, pm);
884 d = *pd;
886 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
887 w--;
888 ps++;
889 if (pm)
890 pm++;
894 static void
895 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
896 pixman_op_t op,
897 uint32_t * pd,
898 const uint32_t * ps,
899 const uint32_t * pm,
900 int w)
902 uint32_t s, d;
904 __m128i xmm_src_lo, xmm_src_hi;
905 __m128i xmm_dst_lo, xmm_dst_hi;
907 while (w && ((uintptr_t)pd & 15))
909 s = combine1 (ps, pm);
910 d = *pd;
912 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
913 ps++;
914 w--;
915 if (pm)
916 pm++;
919 while (w >= 4)
921 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
922 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
924 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
925 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
928 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
929 &xmm_src_lo, &xmm_src_hi,
930 &xmm_dst_lo, &xmm_dst_hi);
932 save_128_aligned (
933 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
935 ps += 4;
936 pd += 4;
937 w -= 4;
938 if (pm)
939 pm += 4;
942 while (w)
944 s = combine1 (ps, pm);
945 d = *pd;
947 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
948 w--;
949 ps++;
950 if (pm)
951 pm++;
955 static void
956 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
957 pixman_op_t op,
958 uint32_t * pd,
959 const uint32_t * ps,
960 const uint32_t * pm,
961 int w)
963 while (w && ((uintptr_t)pd & 15))
965 uint32_t s = combine1 (ps, pm);
966 uint32_t d = *pd;
968 *pd++ = pack_1x128_32 (
969 pix_multiply_1x128 (
970 unpack_32_1x128 (d), negate_1x128 (
971 expand_alpha_1x128 (unpack_32_1x128 (s)))));
973 if (pm)
974 pm++;
975 ps++;
976 w--;
979 while (w >= 4)
981 __m128i xmm_src_lo, xmm_src_hi;
982 __m128i xmm_dst_lo, xmm_dst_hi;
984 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
985 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
987 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
990 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
991 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
993 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
994 &xmm_src_lo, &xmm_src_hi,
995 &xmm_dst_lo, &xmm_dst_hi);
997 save_128_aligned (
998 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1000 ps += 4;
1001 pd += 4;
1002 if (pm)
1003 pm += 4;
1005 w -= 4;
1008 while (w)
1010 uint32_t s = combine1 (ps, pm);
1011 uint32_t d = *pd;
1013 *pd++ = pack_1x128_32 (
1014 pix_multiply_1x128 (
1015 unpack_32_1x128 (d), negate_1x128 (
1016 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1017 ps++;
1018 if (pm)
1019 pm++;
1020 w--;
1024 static void
1025 sse2_combine_out_u (pixman_implementation_t *imp,
1026 pixman_op_t op,
1027 uint32_t * pd,
1028 const uint32_t * ps,
1029 const uint32_t * pm,
1030 int w)
1032 while (w && ((uintptr_t)pd & 15))
1034 uint32_t s = combine1 (ps, pm);
1035 uint32_t d = *pd;
1037 *pd++ = pack_1x128_32 (
1038 pix_multiply_1x128 (
1039 unpack_32_1x128 (s), negate_1x128 (
1040 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1041 w--;
1042 ps++;
1043 if (pm)
1044 pm++;
1047 while (w >= 4)
1049 __m128i xmm_src_lo, xmm_src_hi;
1050 __m128i xmm_dst_lo, xmm_dst_hi;
1052 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1053 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1055 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1056 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1059 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1061 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1062 &xmm_dst_lo, &xmm_dst_hi,
1063 &xmm_dst_lo, &xmm_dst_hi);
1065 save_128_aligned (
1066 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1068 ps += 4;
1069 pd += 4;
1070 w -= 4;
1071 if (pm)
1072 pm += 4;
1075 while (w)
1077 uint32_t s = combine1 (ps, pm);
1078 uint32_t d = *pd;
1080 *pd++ = pack_1x128_32 (
1081 pix_multiply_1x128 (
1082 unpack_32_1x128 (s), negate_1x128 (
1083 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1084 w--;
1085 ps++;
1086 if (pm)
1087 pm++;
1091 static force_inline uint32_t
1092 core_combine_atop_u_pixel_sse2 (uint32_t src,
1093 uint32_t dst)
1095 __m128i s = unpack_32_1x128 (src);
1096 __m128i d = unpack_32_1x128 (dst);
1098 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1099 __m128i da = expand_alpha_1x128 (d);
1101 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1104 static void
1105 sse2_combine_atop_u (pixman_implementation_t *imp,
1106 pixman_op_t op,
1107 uint32_t * pd,
1108 const uint32_t * ps,
1109 const uint32_t * pm,
1110 int w)
1112 uint32_t s, d;
1114 __m128i xmm_src_lo, xmm_src_hi;
1115 __m128i xmm_dst_lo, xmm_dst_hi;
1116 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1117 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1119 while (w && ((uintptr_t)pd & 15))
1121 s = combine1 (ps, pm);
1122 d = *pd;
1124 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1125 w--;
1126 ps++;
1127 if (pm)
1128 pm++;
1131 while (w >= 4)
1133 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1134 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1137 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1140 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1141 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1142 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1145 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1147 pix_add_multiply_2x128 (
1148 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1149 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1150 &xmm_dst_lo, &xmm_dst_hi);
1152 save_128_aligned (
1153 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1155 ps += 4;
1156 pd += 4;
1157 w -= 4;
1158 if (pm)
1159 pm += 4;
1162 while (w)
1164 s = combine1 (ps, pm);
1165 d = *pd;
1167 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1168 w--;
1169 ps++;
1170 if (pm)
1171 pm++;
1175 static force_inline uint32_t
1176 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1177 uint32_t dst)
1179 __m128i s = unpack_32_1x128 (src);
1180 __m128i d = unpack_32_1x128 (dst);
1182 __m128i sa = expand_alpha_1x128 (s);
1183 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1185 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1188 static void
1189 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1190 pixman_op_t op,
1191 uint32_t * pd,
1192 const uint32_t * ps,
1193 const uint32_t * pm,
1194 int w)
1196 uint32_t s, d;
1198 __m128i xmm_src_lo, xmm_src_hi;
1199 __m128i xmm_dst_lo, xmm_dst_hi;
1200 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1201 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1203 while (w && ((uintptr_t)pd & 15))
1205 s = combine1 (ps, pm);
1206 d = *pd;
1208 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1209 ps++;
1210 w--;
1211 if (pm)
1212 pm++;
1215 while (w >= 4)
1217 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1218 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1220 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1221 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1223 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1224 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1225 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1226 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1229 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1231 pix_add_multiply_2x128 (
1232 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1233 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1234 &xmm_dst_lo, &xmm_dst_hi);
1236 save_128_aligned (
1237 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1239 ps += 4;
1240 pd += 4;
1241 w -= 4;
1242 if (pm)
1243 pm += 4;
1246 while (w)
1248 s = combine1 (ps, pm);
1249 d = *pd;
1251 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1252 ps++;
1253 w--;
1254 if (pm)
1255 pm++;
1259 static force_inline uint32_t
1260 core_combine_xor_u_pixel_sse2 (uint32_t src,
1261 uint32_t dst)
1263 __m128i s = unpack_32_1x128 (src);
1264 __m128i d = unpack_32_1x128 (dst);
1266 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1267 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1269 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1272 static void
1273 sse2_combine_xor_u (pixman_implementation_t *imp,
1274 pixman_op_t op,
1275 uint32_t * dst,
1276 const uint32_t * src,
1277 const uint32_t * mask,
1278 int width)
1280 int w = width;
1281 uint32_t s, d;
1282 uint32_t* pd = dst;
1283 const uint32_t* ps = src;
1284 const uint32_t* pm = mask;
1286 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1287 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1288 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1289 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1291 while (w && ((uintptr_t)pd & 15))
1293 s = combine1 (ps, pm);
1294 d = *pd;
1296 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1297 w--;
1298 ps++;
1299 if (pm)
1300 pm++;
1303 while (w >= 4)
1305 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1306 xmm_dst = load_128_aligned ((__m128i*) pd);
1308 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1309 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1311 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1312 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1313 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1314 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1316 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1317 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1318 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1319 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1321 pix_add_multiply_2x128 (
1322 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1323 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1324 &xmm_dst_lo, &xmm_dst_hi);
1326 save_128_aligned (
1327 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1329 ps += 4;
1330 pd += 4;
1331 w -= 4;
1332 if (pm)
1333 pm += 4;
1336 while (w)
1338 s = combine1 (ps, pm);
1339 d = *pd;
1341 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1342 w--;
1343 ps++;
1344 if (pm)
1345 pm++;
1349 static force_inline void
1350 sse2_combine_add_u (pixman_implementation_t *imp,
1351 pixman_op_t op,
1352 uint32_t * dst,
1353 const uint32_t * src,
1354 const uint32_t * mask,
1355 int width)
1357 int w = width;
1358 uint32_t s, d;
1359 uint32_t* pd = dst;
1360 const uint32_t* ps = src;
1361 const uint32_t* pm = mask;
1363 while (w && (uintptr_t)pd & 15)
1365 s = combine1 (ps, pm);
1366 d = *pd;
1368 ps++;
1369 if (pm)
1370 pm++;
1371 *pd++ = _mm_cvtsi128_si32 (
1372 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1373 w--;
1376 while (w >= 4)
1378 __m128i s;
1380 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1382 save_128_aligned (
1383 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1385 pd += 4;
1386 ps += 4;
1387 if (pm)
1388 pm += 4;
1389 w -= 4;
1392 while (w--)
1394 s = combine1 (ps, pm);
1395 d = *pd;
1397 ps++;
1398 *pd++ = _mm_cvtsi128_si32 (
1399 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1400 if (pm)
1401 pm++;
1405 static force_inline uint32_t
1406 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1407 uint32_t dst)
1409 __m128i ms = unpack_32_1x128 (src);
1410 __m128i md = unpack_32_1x128 (dst);
1411 uint32_t sa = src >> 24;
1412 uint32_t da = ~dst >> 24;
1414 if (sa > da)
1416 ms = pix_multiply_1x128 (
1417 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1420 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1423 static void
1424 sse2_combine_saturate_u (pixman_implementation_t *imp,
1425 pixman_op_t op,
1426 uint32_t * pd,
1427 const uint32_t * ps,
1428 const uint32_t * pm,
1429 int w)
1431 uint32_t s, d;
1433 uint32_t pack_cmp;
1434 __m128i xmm_src, xmm_dst;
1436 while (w && (uintptr_t)pd & 15)
1438 s = combine1 (ps, pm);
1439 d = *pd;
1441 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1442 w--;
1443 ps++;
1444 if (pm)
1445 pm++;
1448 while (w >= 4)
1450 xmm_dst = load_128_aligned ((__m128i*)pd);
1451 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1453 pack_cmp = _mm_movemask_epi8 (
1454 _mm_cmpgt_epi32 (
1455 _mm_srli_epi32 (xmm_src, 24),
1456 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1458 /* if some source alpha is greater than the respective ~dst alpha, fall back to the per-pixel saturate path for these four pixels */
1459 if (pack_cmp)
1461 s = combine1 (ps++, pm);
1462 d = *pd;
1463 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1464 if (pm)
1465 pm++;
1467 s = combine1 (ps++, pm);
1468 d = *pd;
1469 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1470 if (pm)
1471 pm++;
1473 s = combine1 (ps++, pm);
1474 d = *pd;
1475 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1476 if (pm)
1477 pm++;
1479 s = combine1 (ps++, pm);
1480 d = *pd;
1481 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1482 if (pm)
1483 pm++;
1485 else
1487 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1489 pd += 4;
1490 ps += 4;
1491 if (pm)
1492 pm += 4;
1495 w -= 4;
1498 while (w--)
1500 s = combine1 (ps, pm);
1501 d = *pd;
1503 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1504 ps++;
1505 if (pm)
1506 pm++;
1510 static void
1511 sse2_combine_src_ca (pixman_implementation_t *imp,
1512 pixman_op_t op,
1513 uint32_t * pd,
1514 const uint32_t * ps,
1515 const uint32_t * pm,
1516 int w)
1518 uint32_t s, m;
1520 __m128i xmm_src_lo, xmm_src_hi;
1521 __m128i xmm_mask_lo, xmm_mask_hi;
1522 __m128i xmm_dst_lo, xmm_dst_hi;
1524 while (w && (uintptr_t)pd & 15)
1526 s = *ps++;
1527 m = *pm++;
1528 *pd++ = pack_1x128_32 (
1529 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1530 w--;
1533 while (w >= 4)
1535 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1536 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1538 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1539 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1541 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1542 &xmm_mask_lo, &xmm_mask_hi,
1543 &xmm_dst_lo, &xmm_dst_hi);
1545 save_128_aligned (
1546 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1548 ps += 4;
1549 pd += 4;
1550 pm += 4;
1551 w -= 4;
1554 while (w)
1556 s = *ps++;
1557 m = *pm++;
1558 *pd++ = pack_1x128_32 (
1559 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1560 w--;
1564 static force_inline uint32_t
1565 core_combine_over_ca_pixel_sse2 (uint32_t src,
1566 uint32_t mask,
1567 uint32_t dst)
1569 __m128i s = unpack_32_1x128 (src);
1570 __m128i expAlpha = expand_alpha_1x128 (s);
1571 __m128i unpk_mask = unpack_32_1x128 (mask);
1572 __m128i unpk_dst = unpack_32_1x128 (dst);
1574 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1577 static void
1578 sse2_combine_over_ca (pixman_implementation_t *imp,
1579 pixman_op_t op,
1580 uint32_t * pd,
1581 const uint32_t * ps,
1582 const uint32_t * pm,
1583 int w)
1585 uint32_t s, m, d;
1587 __m128i xmm_alpha_lo, xmm_alpha_hi;
1588 __m128i xmm_src_lo, xmm_src_hi;
1589 __m128i xmm_dst_lo, xmm_dst_hi;
1590 __m128i xmm_mask_lo, xmm_mask_hi;
1592 while (w && (uintptr_t)pd & 15)
1594 s = *ps++;
1595 m = *pm++;
1596 d = *pd;
1598 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1599 w--;
1602 while (w >= 4)
1604 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1605 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1606 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1608 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1609 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1610 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1612 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1613 &xmm_alpha_lo, &xmm_alpha_hi);
1615 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1616 &xmm_alpha_lo, &xmm_alpha_hi,
1617 &xmm_mask_lo, &xmm_mask_hi,
1618 &xmm_dst_lo, &xmm_dst_hi);
1620 save_128_aligned (
1621 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1623 ps += 4;
1624 pd += 4;
1625 pm += 4;
1626 w -= 4;
1629 while (w)
1631 s = *ps++;
1632 m = *pm++;
1633 d = *pd;
1635 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1636 w--;
1640 static force_inline uint32_t
1641 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1642 uint32_t mask,
1643 uint32_t dst)
1645 __m128i d = unpack_32_1x128 (dst);
1647 return pack_1x128_32 (
1648 over_1x128 (d, expand_alpha_1x128 (d),
1649 pix_multiply_1x128 (unpack_32_1x128 (src),
1650 unpack_32_1x128 (mask))));
1653 static void
1654 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1655 pixman_op_t op,
1656 uint32_t * pd,
1657 const uint32_t * ps,
1658 const uint32_t * pm,
1659 int w)
1661 uint32_t s, m, d;
1663 __m128i xmm_alpha_lo, xmm_alpha_hi;
1664 __m128i xmm_src_lo, xmm_src_hi;
1665 __m128i xmm_dst_lo, xmm_dst_hi;
1666 __m128i xmm_mask_lo, xmm_mask_hi;
1668 while (w && (uintptr_t)pd & 15)
1670 s = *ps++;
1671 m = *pm++;
1672 d = *pd;
1674 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1675 w--;
1678 while (w >= 4)
1680 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1681 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1682 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1684 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1685 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1686 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1688 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1689 &xmm_alpha_lo, &xmm_alpha_hi);
1690 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1691 &xmm_mask_lo, &xmm_mask_hi,
1692 &xmm_mask_lo, &xmm_mask_hi);
1694 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1695 &xmm_alpha_lo, &xmm_alpha_hi,
1696 &xmm_mask_lo, &xmm_mask_hi);
1698 save_128_aligned (
1699 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1701 ps += 4;
1702 pd += 4;
1703 pm += 4;
1704 w -= 4;
1707 while (w)
1709 s = *ps++;
1710 m = *pm++;
1711 d = *pd;
1713 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1714 w--;
1718 static void
1719 sse2_combine_in_ca (pixman_implementation_t *imp,
1720 pixman_op_t op,
1721 uint32_t * pd,
1722 const uint32_t * ps,
1723 const uint32_t * pm,
1724 int w)
1726 uint32_t s, m, d;
1728 __m128i xmm_alpha_lo, xmm_alpha_hi;
1729 __m128i xmm_src_lo, xmm_src_hi;
1730 __m128i xmm_dst_lo, xmm_dst_hi;
1731 __m128i xmm_mask_lo, xmm_mask_hi;
1733 while (w && (uintptr_t)pd & 15)
1735 s = *ps++;
1736 m = *pm++;
1737 d = *pd;
1739 *pd++ = pack_1x128_32 (
1740 pix_multiply_1x128 (
1741 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1742 expand_alpha_1x128 (unpack_32_1x128 (d))));
1744 w--;
1747 while (w >= 4)
1749 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1750 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1751 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1753 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1754 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1755 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1757 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1758 &xmm_alpha_lo, &xmm_alpha_hi);
1760 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1761 &xmm_mask_lo, &xmm_mask_hi,
1762 &xmm_dst_lo, &xmm_dst_hi);
1764 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1765 &xmm_alpha_lo, &xmm_alpha_hi,
1766 &xmm_dst_lo, &xmm_dst_hi);
1768 save_128_aligned (
1769 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1771 ps += 4;
1772 pd += 4;
1773 pm += 4;
1774 w -= 4;
1777 while (w)
1779 s = *ps++;
1780 m = *pm++;
1781 d = *pd;
1783 *pd++ = pack_1x128_32 (
1784 pix_multiply_1x128 (
1785 pix_multiply_1x128 (
1786 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1787 expand_alpha_1x128 (unpack_32_1x128 (d))));
1789 w--;
1793 static void
1794 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1795 pixman_op_t op,
1796 uint32_t * pd,
1797 const uint32_t * ps,
1798 const uint32_t * pm,
1799 int w)
1801 uint32_t s, m, d;
1803 __m128i xmm_alpha_lo, xmm_alpha_hi;
1804 __m128i xmm_src_lo, xmm_src_hi;
1805 __m128i xmm_dst_lo, xmm_dst_hi;
1806 __m128i xmm_mask_lo, xmm_mask_hi;
1808 while (w && (uintptr_t)pd & 15)
1810 s = *ps++;
1811 m = *pm++;
1812 d = *pd;
1814 *pd++ = pack_1x128_32 (
1815 pix_multiply_1x128 (
1816 unpack_32_1x128 (d),
1817 pix_multiply_1x128 (unpack_32_1x128 (m),
1818 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1819 w--;
1822 while (w >= 4)
1824 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1825 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1826 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1828 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1829 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1830 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1832 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1833 &xmm_alpha_lo, &xmm_alpha_hi);
1834 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1835 &xmm_alpha_lo, &xmm_alpha_hi,
1836 &xmm_alpha_lo, &xmm_alpha_hi);
1838 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1839 &xmm_alpha_lo, &xmm_alpha_hi,
1840 &xmm_dst_lo, &xmm_dst_hi);
1842 save_128_aligned (
1843 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1845 ps += 4;
1846 pd += 4;
1847 pm += 4;
1848 w -= 4;
1851 while (w)
1853 s = *ps++;
1854 m = *pm++;
1855 d = *pd;
1857 *pd++ = pack_1x128_32 (
1858 pix_multiply_1x128 (
1859 unpack_32_1x128 (d),
1860 pix_multiply_1x128 (unpack_32_1x128 (m),
1861 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1862 w--;
1866 static void
1867 sse2_combine_out_ca (pixman_implementation_t *imp,
1868 pixman_op_t op,
1869 uint32_t * pd,
1870 const uint32_t * ps,
1871 const uint32_t * pm,
1872 int w)
1874 uint32_t s, m, d;
1876 __m128i xmm_alpha_lo, xmm_alpha_hi;
1877 __m128i xmm_src_lo, xmm_src_hi;
1878 __m128i xmm_dst_lo, xmm_dst_hi;
1879 __m128i xmm_mask_lo, xmm_mask_hi;
1881 while (w && (uintptr_t)pd & 15)
1883 s = *ps++;
1884 m = *pm++;
1885 d = *pd;
1887 *pd++ = pack_1x128_32 (
1888 pix_multiply_1x128 (
1889 pix_multiply_1x128 (
1890 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1891 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1892 w--;
1895 while (w >= 4)
1897 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1898 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1899 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1901 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1902 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1903 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1905 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1906 &xmm_alpha_lo, &xmm_alpha_hi);
1907 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1908 &xmm_alpha_lo, &xmm_alpha_hi);
1910 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1911 &xmm_mask_lo, &xmm_mask_hi,
1912 &xmm_dst_lo, &xmm_dst_hi);
1913 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1914 &xmm_alpha_lo, &xmm_alpha_hi,
1915 &xmm_dst_lo, &xmm_dst_hi);
1917 save_128_aligned (
1918 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1920 ps += 4;
1921 pd += 4;
1922 pm += 4;
1923 w -= 4;
1926 while (w)
1928 s = *ps++;
1929 m = *pm++;
1930 d = *pd;
1932 *pd++ = pack_1x128_32 (
1933 pix_multiply_1x128 (
1934 pix_multiply_1x128 (
1935 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1936 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1938 w--;
1942 static void
1943 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1944 pixman_op_t op,
1945 uint32_t * pd,
1946 const uint32_t * ps,
1947 const uint32_t * pm,
1948 int w)
1950 uint32_t s, m, d;
1952 __m128i xmm_alpha_lo, xmm_alpha_hi;
1953 __m128i xmm_src_lo, xmm_src_hi;
1954 __m128i xmm_dst_lo, xmm_dst_hi;
1955 __m128i xmm_mask_lo, xmm_mask_hi;
1957 while (w && (uintptr_t)pd & 15)
1959 s = *ps++;
1960 m = *pm++;
1961 d = *pd;
1963 *pd++ = pack_1x128_32 (
1964 pix_multiply_1x128 (
1965 unpack_32_1x128 (d),
1966 negate_1x128 (pix_multiply_1x128 (
1967 unpack_32_1x128 (m),
1968 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1969 w--;
1972 while (w >= 4)
1974 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1975 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1976 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1979 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1980 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1983 &xmm_alpha_lo, &xmm_alpha_hi);
1985 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1986 &xmm_alpha_lo, &xmm_alpha_hi,
1987 &xmm_mask_lo, &xmm_mask_hi);
1989 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1990 &xmm_mask_lo, &xmm_mask_hi);
1992 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1993 &xmm_mask_lo, &xmm_mask_hi,
1994 &xmm_dst_lo, &xmm_dst_hi);
1996 save_128_aligned (
1997 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1999 ps += 4;
2000 pd += 4;
2001 pm += 4;
2002 w -= 4;
2005 while (w)
2007 s = *ps++;
2008 m = *pm++;
2009 d = *pd;
2011 *pd++ = pack_1x128_32 (
2012 pix_multiply_1x128 (
2013 unpack_32_1x128 (d),
2014 negate_1x128 (pix_multiply_1x128 (
2015 unpack_32_1x128 (m),
2016 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2017 w--;
2021 static force_inline uint32_t
2022 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2023 uint32_t mask,
2024 uint32_t dst)
2026 __m128i m = unpack_32_1x128 (mask);
2027 __m128i s = unpack_32_1x128 (src);
2028 __m128i d = unpack_32_1x128 (dst);
2029 __m128i sa = expand_alpha_1x128 (s);
2030 __m128i da = expand_alpha_1x128 (d);
2032 s = pix_multiply_1x128 (s, m);
2033 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2038 static void
2039 sse2_combine_atop_ca (pixman_implementation_t *imp,
2040 pixman_op_t op,
2041 uint32_t * pd,
2042 const uint32_t * ps,
2043 const uint32_t * pm,
2044 int w)
2046 uint32_t s, m, d;
2048 __m128i xmm_src_lo, xmm_src_hi;
2049 __m128i xmm_dst_lo, xmm_dst_hi;
2050 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052 __m128i xmm_mask_lo, xmm_mask_hi;
2054 while (w && (uintptr_t)pd & 15)
2056 s = *ps++;
2057 m = *pm++;
2058 d = *pd;
2060 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2061 w--;
2064 while (w >= 4)
2066 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080 &xmm_mask_lo, &xmm_mask_hi,
2081 &xmm_src_lo, &xmm_src_hi);
2082 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi);
2086 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088 pix_add_multiply_2x128 (
2089 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2090 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2091 &xmm_dst_lo, &xmm_dst_hi);
2093 save_128_aligned (
2094 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096 ps += 4;
2097 pd += 4;
2098 pm += 4;
2099 w -= 4;
2102 while (w)
2104 s = *ps++;
2105 m = *pm++;
2106 d = *pd;
2108 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2109 w--;
2113 static force_inline uint32_t
2114 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2115 uint32_t mask,
2116 uint32_t dst)
2118 __m128i m = unpack_32_1x128 (mask);
2119 __m128i s = unpack_32_1x128 (src);
2120 __m128i d = unpack_32_1x128 (dst);
2122 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2123 __m128i sa = expand_alpha_1x128 (s);
2125 s = pix_multiply_1x128 (s, m);
2126 m = pix_multiply_1x128 (m, sa);
2128 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2131 static void
2132 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2133 pixman_op_t op,
2134 uint32_t * pd,
2135 const uint32_t * ps,
2136 const uint32_t * pm,
2137 int w)
2139 uint32_t s, m, d;
2141 __m128i xmm_src_lo, xmm_src_hi;
2142 __m128i xmm_dst_lo, xmm_dst_hi;
2143 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2144 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2145 __m128i xmm_mask_lo, xmm_mask_hi;
2147 while (w && (uintptr_t)pd & 15)
2149 s = *ps++;
2150 m = *pm++;
2151 d = *pd;
2153 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2154 w--;
2157 while (w >= 4)
2159 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2160 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2161 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2164 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2165 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2168 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2169 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2170 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2173 &xmm_mask_lo, &xmm_mask_hi,
2174 &xmm_src_lo, &xmm_src_hi);
2175 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2176 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2177 &xmm_mask_lo, &xmm_mask_hi);
2179 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2180 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 pix_add_multiply_2x128 (
2183 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2184 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2185 &xmm_dst_lo, &xmm_dst_hi);
2187 save_128_aligned (
2188 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190 ps += 4;
2191 pd += 4;
2192 pm += 4;
2193 w -= 4;
2196 while (w)
2198 s = *ps++;
2199 m = *pm++;
2200 d = *pd;
2202 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2203 w--;
2207 static force_inline uint32_t
2208 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2209 uint32_t mask,
2210 uint32_t dst)
2212 __m128i a = unpack_32_1x128 (mask);
2213 __m128i s = unpack_32_1x128 (src);
2214 __m128i d = unpack_32_1x128 (dst);
2216 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2217 a, expand_alpha_1x128 (s)));
2218 __m128i dest = pix_multiply_1x128 (s, a);
2219 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2222 &alpha_dst,
2223 &dest,
2224 &alpha_src));
2227 static void
2228 sse2_combine_xor_ca (pixman_implementation_t *imp,
2229 pixman_op_t op,
2230 uint32_t * pd,
2231 const uint32_t * ps,
2232 const uint32_t * pm,
2233 int w)
2235 uint32_t s, m, d;
2237 __m128i xmm_src_lo, xmm_src_hi;
2238 __m128i xmm_dst_lo, xmm_dst_hi;
2239 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2240 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2241 __m128i xmm_mask_lo, xmm_mask_hi;
2243 while (w && (uintptr_t)pd & 15)
2245 s = *ps++;
2246 m = *pm++;
2247 d = *pd;
2249 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2250 w--;
2253 while (w >= 4)
2255 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2256 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2257 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2260 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2261 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2264 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2265 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2266 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2269 &xmm_mask_lo, &xmm_mask_hi,
2270 &xmm_src_lo, &xmm_src_hi);
2271 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2272 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2273 &xmm_mask_lo, &xmm_mask_hi);
2275 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2276 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2277 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2278 &xmm_mask_lo, &xmm_mask_hi);
2280 pix_add_multiply_2x128 (
2281 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2282 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2283 &xmm_dst_lo, &xmm_dst_hi);
2285 save_128_aligned (
2286 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288 ps += 4;
2289 pd += 4;
2290 pm += 4;
2291 w -= 4;
2294 while (w)
2296 s = *ps++;
2297 m = *pm++;
2298 d = *pd;
2300 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2301 w--;
2305 static void
2306 sse2_combine_add_ca (pixman_implementation_t *imp,
2307 pixman_op_t op,
2308 uint32_t * pd,
2309 const uint32_t * ps,
2310 const uint32_t * pm,
2311 int w)
2313 uint32_t s, m, d;
2315 __m128i xmm_src_lo, xmm_src_hi;
2316 __m128i xmm_dst_lo, xmm_dst_hi;
2317 __m128i xmm_mask_lo, xmm_mask_hi;
2319 while (w && (uintptr_t)pd & 15)
2321 s = *ps++;
2322 m = *pm++;
2323 d = *pd;
2325 *pd++ = pack_1x128_32 (
2326 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2327 unpack_32_1x128 (m)),
2328 unpack_32_1x128 (d)));
2329 w--;
2332 while (w >= 4)
2334 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2335 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2336 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2339 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2340 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2343 &xmm_mask_lo, &xmm_mask_hi,
2344 &xmm_src_lo, &xmm_src_hi);
2346 save_128_aligned (
2347 (__m128i*)pd, pack_2x128_128 (
2348 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2349 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351 ps += 4;
2352 pd += 4;
2353 pm += 4;
2354 w -= 4;
2357 while (w)
2359 s = *ps++;
2360 m = *pm++;
2361 d = *pd;
2363 *pd++ = pack_1x128_32 (
2364 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2365 unpack_32_1x128 (m)),
2366 unpack_32_1x128 (d)));
2367 w--;
2371 static force_inline __m128i
2372 create_mask_16_128 (uint16_t mask)
2374 return _mm_set1_epi16 (mask);
2377 /* Work around a code generation bug in Sun Studio 12. */
2378 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2379 # define create_mask_2x32_128(mask0, mask1) \
2380 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2381 #else
2382 static force_inline __m128i
2383 create_mask_2x32_128 (uint32_t mask0,
2384 uint32_t mask1)
2386 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 #endif
2390 static void
2391 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2392 pixman_composite_info_t *info)
2394 PIXMAN_COMPOSITE_ARGS (info);
2395 uint32_t src;
2396 uint32_t *dst_line, *dst, d;
2397 int32_t w;
2398 int dst_stride;
2399 __m128i xmm_src, xmm_alpha;
2400 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404 if (src == 0)
2405 return;
2407 PIXMAN_IMAGE_GET_LINE (
2408 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410 xmm_src = expand_pixel_32_1x128 (src);
2411 xmm_alpha = expand_alpha_1x128 (xmm_src);
2413 while (height--)
2415 dst = dst_line;
2417 dst_line += dst_stride;
2418 w = width;
2420 while (w && (uintptr_t)dst & 15)
2422 d = *dst;
2423 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2424 xmm_alpha,
2425 unpack_32_1x128 (d)));
2426 w--;
2429 while (w >= 4)
2431 xmm_dst = load_128_aligned ((__m128i*)dst);
2433 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435 over_2x128 (&xmm_src, &xmm_src,
2436 &xmm_alpha, &xmm_alpha,
2437 &xmm_dst_lo, &xmm_dst_hi);
2439 /* rebuild the 4 pixel data and save */
2440 save_128_aligned (
2441 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443 w -= 4;
2444 dst += 4;
2447 while (w)
2449 d = *dst;
2450 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2451 xmm_alpha,
2452 unpack_32_1x128 (d)));
2453 w--;
2459 static void
2460 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2461 pixman_composite_info_t *info)
2463 PIXMAN_COMPOSITE_ARGS (info);
2464 uint32_t src;
2465 uint16_t *dst_line, *dst, d;
2466 int32_t w;
2467 int dst_stride;
2468 __m128i xmm_src, xmm_alpha;
2469 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473 if (src == 0)
2474 return;
2476 PIXMAN_IMAGE_GET_LINE (
2477 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479 xmm_src = expand_pixel_32_1x128 (src);
2480 xmm_alpha = expand_alpha_1x128 (xmm_src);
2482 while (height--)
2484 dst = dst_line;
2486 dst_line += dst_stride;
2487 w = width;
2489 while (w && (uintptr_t)dst & 15)
2491 d = *dst;
2493 *dst++ = pack_565_32_16 (
2494 pack_1x128_32 (over_1x128 (xmm_src,
2495 xmm_alpha,
2496 expand565_16_1x128 (d))));
2497 w--;
2500 while (w >= 8)
2502 xmm_dst = load_128_aligned ((__m128i*)dst);
2504 unpack_565_128_4x128 (xmm_dst,
2505 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 over_2x128 (&xmm_src, &xmm_src,
2508 &xmm_alpha, &xmm_alpha,
2509 &xmm_dst0, &xmm_dst1);
2510 over_2x128 (&xmm_src, &xmm_src,
2511 &xmm_alpha, &xmm_alpha,
2512 &xmm_dst2, &xmm_dst3);
2514 xmm_dst = pack_565_4x128_128 (
2515 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517 save_128_aligned ((__m128i*)dst, xmm_dst);
2519 dst += 8;
2520 w -= 8;
2523 while (w--)
2525 d = *dst;
2526 *dst++ = pack_565_32_16 (
2527 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2528 expand565_16_1x128 (d))));
2534 static void
2535 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2536 pixman_composite_info_t *info)
2538 PIXMAN_COMPOSITE_ARGS (info);
2539 uint32_t src;
2540 uint32_t *dst_line, d;
2541 uint32_t *mask_line, m;
2542 uint32_t pack_cmp;
2543 int dst_stride, mask_stride;
2545 __m128i xmm_src;
2546 __m128i xmm_dst;
2547 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 __m128i mmx_src, mmx_mask, mmx_dest;
2551 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553 if (src == 0)
2554 return;
2556 PIXMAN_IMAGE_GET_LINE (
2557 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2558 PIXMAN_IMAGE_GET_LINE (
2559 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561 xmm_src = _mm_unpacklo_epi8 (
2562 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2563 mmx_src = xmm_src;
2565 while (height--)
2567 int w = width;
2568 const uint32_t *pm = (uint32_t *)mask_line;
2569 uint32_t *pd = (uint32_t *)dst_line;
2571 dst_line += dst_stride;
2572 mask_line += mask_stride;
2574 while (w && (uintptr_t)pd & 15)
2576 m = *pm++;
2578 if (m)
2580 d = *pd;
2582 mmx_mask = unpack_32_1x128 (m);
2583 mmx_dest = unpack_32_1x128 (d);
2585 *pd = pack_1x128_32 (
2586 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2587 mmx_dest));
2590 pd++;
2591 w--;
2594 while (w >= 4)
2596 xmm_mask = load_128_unaligned ((__m128i*)pm);
2598 pack_cmp =
2599 _mm_movemask_epi8 (
2600 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
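/* Illustrative reading of the test below: _mm_cmpeq_epi32 () sets a
 * 32-bit lane to all ones exactly when that mask pixel is zero, and
 * _mm_movemask_epi8 () gathers the top bit of each of the 16 bytes,
 * so pack_cmp == 0xffff means all four mask pixels are zero and the
 * whole block can be skipped. */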
2603 if (pack_cmp != 0xffff)
2605 xmm_dst = load_128_aligned ((__m128i*)pd);
2607 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609 pix_multiply_2x128 (&xmm_src, &xmm_src,
2610 &xmm_mask_lo, &xmm_mask_hi,
2611 &xmm_mask_lo, &xmm_mask_hi);
2612 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614 save_128_aligned (
2615 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2618 pd += 4;
2619 pm += 4;
2620 w -= 4;
2623 while (w)
2625 m = *pm++;
2627 if (m)
2629 d = *pd;
2631 mmx_mask = unpack_32_1x128 (m);
2632 mmx_dest = unpack_32_1x128 (d);
2634 *pd = pack_1x128_32 (
2635 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2636 mmx_dest));
2639 pd++;
2640 w--;
2646 static void
2647 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2648 pixman_composite_info_t *info)
2650 PIXMAN_COMPOSITE_ARGS (info);
2651 uint32_t src;
2652 uint32_t *dst_line, d;
2653 uint32_t *mask_line, m;
2654 uint32_t pack_cmp;
2655 int dst_stride, mask_stride;
2657 __m128i xmm_src, xmm_alpha;
2658 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2659 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665 if (src == 0)
2666 return;
2668 PIXMAN_IMAGE_GET_LINE (
2669 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2670 PIXMAN_IMAGE_GET_LINE (
2671 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673 xmm_src = _mm_unpacklo_epi8 (
2674 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2675 xmm_alpha = expand_alpha_1x128 (xmm_src);
2676 mmx_src = xmm_src;
2677 mmx_alpha = xmm_alpha;
2679 while (height--)
2681 int w = width;
2682 const uint32_t *pm = (uint32_t *)mask_line;
2683 uint32_t *pd = (uint32_t *)dst_line;
2685 dst_line += dst_stride;
2686 mask_line += mask_stride;
2688 while (w && (uintptr_t)pd & 15)
2690 m = *pm++;
2692 if (m)
2694 d = *pd;
2695 mmx_mask = unpack_32_1x128 (m);
2696 mmx_dest = unpack_32_1x128 (d);
2698 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2699 &mmx_alpha,
2700 &mmx_mask,
2701 &mmx_dest));
2704 pd++;
2705 w--;
2708 while (w >= 4)
2710 xmm_mask = load_128_unaligned ((__m128i*)pm);
2712 pack_cmp =
2713 _mm_movemask_epi8 (
2714 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2717 if (pack_cmp != 0xffff)
2719 xmm_dst = load_128_aligned ((__m128i*)pd);
2721 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2722 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724 in_over_2x128 (&xmm_src, &xmm_src,
2725 &xmm_alpha, &xmm_alpha,
2726 &xmm_mask_lo, &xmm_mask_hi,
2727 &xmm_dst_lo, &xmm_dst_hi);
2729 save_128_aligned (
2730 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2733 pd += 4;
2734 pm += 4;
2735 w -= 4;
2738 while (w)
2740 m = *pm++;
2742 if (m)
2744 d = *pd;
2745 mmx_mask = unpack_32_1x128 (m);
2746 mmx_dest = unpack_32_1x128 (d);
2748 *pd = pack_1x128_32 (
2749 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2752 pd++;
2753 w--;
2759 static void
2760 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2761 pixman_composite_info_t *info)
2763 PIXMAN_COMPOSITE_ARGS (info);
2764 uint32_t *dst_line, *dst;
2765 uint32_t *src_line, *src;
2766 uint32_t mask;
2767 int32_t w;
2768 int dst_stride, src_stride;
2770 __m128i xmm_mask;
2771 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2772 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2773 __m128i xmm_alpha_lo, xmm_alpha_hi;
2775 PIXMAN_IMAGE_GET_LINE (
2776 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2777 PIXMAN_IMAGE_GET_LINE (
2778 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782 xmm_mask = create_mask_16_128 (mask >> 24);
2784 while (height--)
2786 dst = dst_line;
2787 dst_line += dst_stride;
2788 src = src_line;
2789 src_line += src_stride;
2790 w = width;
2792 while (w && (uintptr_t)dst & 15)
2794 uint32_t s = *src++;
2796 if (s)
2798 uint32_t d = *dst;
2800 __m128i ms = unpack_32_1x128 (s);
2801 __m128i alpha = expand_alpha_1x128 (ms);
2802 __m128i mask = xmm_mask;
2803 __m128i dest = unpack_32_1x128 (d);
2805 *dst = pack_1x128_32 (
2806 in_over_1x128 (&ms, &alpha, &mask, &dest));
2808 dst++;
2809 w--;
2812 while (w >= 4)
2814 xmm_src = load_128_unaligned ((__m128i*)src);
2816 if (!is_zero (xmm_src))
2818 xmm_dst = load_128_aligned ((__m128i*)dst);
2820 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2821 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2822 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2823 &xmm_alpha_lo, &xmm_alpha_hi);
2825 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2826 &xmm_alpha_lo, &xmm_alpha_hi,
2827 &xmm_mask, &xmm_mask,
2828 &xmm_dst_lo, &xmm_dst_hi);
2830 save_128_aligned (
2831 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2834 dst += 4;
2835 src += 4;
2836 w -= 4;
2839 while (w)
2841 uint32_t s = *src++;
2843 if (s)
2845 uint32_t d = *dst;
2847 __m128i ms = unpack_32_1x128 (s);
2848 __m128i alpha = expand_alpha_1x128 (ms);
2849 __m128i mask = xmm_mask;
2850 __m128i dest = unpack_32_1x128 (d);
2852 *dst = pack_1x128_32 (
2853 in_over_1x128 (&ms, &alpha, &mask, &dest));
2856 dst++;
2857 w--;
2863 static void
2864 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2865 pixman_composite_info_t *info)
2867 PIXMAN_COMPOSITE_ARGS (info);
2868 uint16_t *dst_line, *dst;
2869 uint32_t *src_line, *src, s;
2870 int dst_stride, src_stride;
2871 int32_t w;
2873 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2874 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876 while (height--)
2878 dst = dst_line;
2879 dst_line += dst_stride;
2880 src = src_line;
2881 src_line += src_stride;
2882 w = width;
2884 while (w && (uintptr_t)dst & 15)
2886 s = *src++;
2887 *dst = convert_8888_to_0565 (s);
2888 dst++;
2889 w--;
2892 while (w >= 8)
2894 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2895 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899 w -= 8;
2900 src += 8;
2901 dst += 8;
2904 while (w)
2906 s = *src++;
2907 *dst = convert_8888_to_0565 (s);
2908 dst++;
2909 w--;
2914 static void
2915 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2916 pixman_composite_info_t *info)
2918 PIXMAN_COMPOSITE_ARGS (info);
2919 uint32_t *dst_line, *dst;
2920 uint32_t *src_line, *src;
2921 int32_t w;
2922 int dst_stride, src_stride;
2925 PIXMAN_IMAGE_GET_LINE (
2926 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2927 PIXMAN_IMAGE_GET_LINE (
2928 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930 while (height--)
2932 dst = dst_line;
2933 dst_line += dst_stride;
2934 src = src_line;
2935 src_line += src_stride;
2936 w = width;
2938 while (w && (uintptr_t)dst & 15)
2940 *dst++ = *src++ | 0xff000000;
2941 w--;
2944 while (w >= 16)
2946 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2949 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2950 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2951 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2954 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2955 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2956 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958 dst += 16;
2959 src += 16;
2960 w -= 16;
2963 while (w)
2965 *dst++ = *src++ | 0xff000000;
2966 w--;
2972 static void
2973 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2974 pixman_composite_info_t *info)
2976 PIXMAN_COMPOSITE_ARGS (info);
2977 uint32_t *dst_line, *dst;
2978 uint32_t *src_line, *src;
2979 uint32_t mask;
2980 int dst_stride, src_stride;
2981 int32_t w;
2983 __m128i xmm_mask, xmm_alpha;
2984 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2985 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987 PIXMAN_IMAGE_GET_LINE (
2988 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2989 PIXMAN_IMAGE_GET_LINE (
2990 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994 xmm_mask = create_mask_16_128 (mask >> 24);
2995 xmm_alpha = mask_00ff;
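/* An x888 source carries no alpha and is treated as fully opaque:
 * each pixel is OR-ed with 0xff000000 below, and the unpacked alpha
 * is simply mask_00ff, which presumably holds 0x00ff in every 16-bit
 * lane, i.e. an alpha of 255 per channel. */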
2997 while (height--)
2999 dst = dst_line;
3000 dst_line += dst_stride;
3001 src = src_line;
3002 src_line += src_stride;
3003 w = width;
3005 while (w && (uintptr_t)dst & 15)
3007 uint32_t s = (*src++) | 0xff000000;
3008 uint32_t d = *dst;
3010 __m128i src = unpack_32_1x128 (s);
3011 __m128i alpha = xmm_alpha;
3012 __m128i mask = xmm_mask;
3013 __m128i dest = unpack_32_1x128 (d);
3015 *dst++ = pack_1x128_32 (
3016 in_over_1x128 (&src, &alpha, &mask, &dest));
3018 w--;
3021 while (w >= 4)
3023 xmm_src = _mm_or_si128 (
3024 load_128_unaligned ((__m128i*)src), mask_ff000000);
3025 xmm_dst = load_128_aligned ((__m128i*)dst);
3027 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3028 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3031 &xmm_alpha, &xmm_alpha,
3032 &xmm_mask, &xmm_mask,
3033 &xmm_dst_lo, &xmm_dst_hi);
3035 save_128_aligned (
3036 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038 dst += 4;
3039 src += 4;
3040 w -= 4;
3044 while (w)
3046 uint32_t s = (*src++) | 0xff000000;
3047 uint32_t d = *dst;
3049 __m128i src = unpack_32_1x128 (s);
3050 __m128i alpha = xmm_alpha;
3051 __m128i mask = xmm_mask;
3052 __m128i dest = unpack_32_1x128 (d);
3054 *dst++ = pack_1x128_32 (
3055 in_over_1x128 (&src, &alpha, &mask, &dest));
3057 w--;
3063 static void
3064 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3065 pixman_composite_info_t *info)
3067 PIXMAN_COMPOSITE_ARGS (info);
3068 int dst_stride, src_stride;
3069 uint32_t *dst_line, *dst;
3070 uint32_t *src_line, *src;
3072 PIXMAN_IMAGE_GET_LINE (
3073 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3074 PIXMAN_IMAGE_GET_LINE (
3075 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077 dst = dst_line;
3078 src = src_line;
3080 while (height--)
3082 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084 dst += dst_stride;
3085 src += src_stride;
3089 static force_inline uint16_t
3090 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 __m128i ms;
3094 ms = unpack_32_1x128 (src);
3095 return pack_565_32_16 (
3096 pack_1x128_32 (
3097 over_1x128 (
3098 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3101 static void
3102 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3103 pixman_composite_info_t *info)
3105 PIXMAN_COMPOSITE_ARGS (info);
3106 uint16_t *dst_line, *dst, d;
3107 uint32_t *src_line, *src, s;
3108 int dst_stride, src_stride;
3109 int32_t w;
3111 __m128i xmm_alpha_lo, xmm_alpha_hi;
3112 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3113 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115 PIXMAN_IMAGE_GET_LINE (
3116 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3117 PIXMAN_IMAGE_GET_LINE (
3118 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120 while (height--)
3122 dst = dst_line;
3123 src = src_line;
3125 dst_line += dst_stride;
3126 src_line += src_stride;
3127 w = width;
3129 /* Align dst on a 16-byte boundary */
3130 while (w &&
3131 ((uintptr_t)dst & 15))
3133 s = *src++;
3134 d = *dst;
3136 *dst++ = composite_over_8888_0565pixel (s, d);
3137 w--;
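/* Once ((uintptr_t)dst & 15) == 0 the destination sits on a 16-byte
 * boundary, so the vector loop below can use aligned loads and stores
 * on dst; the source keeps using unaligned loads, since only one of
 * the two pointers can be forced into alignment this way. */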
3140 /* It's an 8-pixel loop */
3141 while (w >= 8)
3143 /* I'm loading unaligned because I'm not sure
3144 * about the address alignment. */
3146 xmm_src = load_128_unaligned ((__m128i*) src);
3147 xmm_dst = load_128_aligned ((__m128i*) dst);
3149 /* Unpacking */
3150 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3151 unpack_565_128_4x128 (xmm_dst,
3152 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3153 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3154 &xmm_alpha_lo, &xmm_alpha_hi);
3156 /* I'm loading the next 4 pixels from memory
3157 * ahead of time to optimize the memory read. */
3159 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3162 &xmm_alpha_lo, &xmm_alpha_hi,
3163 &xmm_dst0, &xmm_dst1);
3165 /* Unpacking */
3166 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3167 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3168 &xmm_alpha_lo, &xmm_alpha_hi);
3170 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3171 &xmm_alpha_lo, &xmm_alpha_hi,
3172 &xmm_dst2, &xmm_dst3);
3174 save_128_aligned (
3175 (__m128i*)dst, pack_565_4x128_128 (
3176 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178 w -= 8;
3179 dst += 8;
3180 src += 8;
3183 while (w--)
3185 s = *src++;
3186 d = *dst;
3188 *dst++ = composite_over_8888_0565pixel (s, d);
3194 static void
3195 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3196 pixman_composite_info_t *info)
3198 PIXMAN_COMPOSITE_ARGS (info);
3199 uint32_t src, srca;
3200 uint32_t *dst_line, *dst;
3201 uint8_t *mask_line, *mask;
3202 int dst_stride, mask_stride;
3203 int32_t w;
3204 uint32_t m, d;
3206 __m128i xmm_src, xmm_alpha, xmm_def;
3207 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3208 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214 srca = src >> 24;
3215 if (src == 0)
3216 return;
3218 PIXMAN_IMAGE_GET_LINE (
3219 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3220 PIXMAN_IMAGE_GET_LINE (
3221 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223 xmm_def = create_mask_2x32_128 (src, src);
3224 xmm_src = expand_pixel_32_1x128 (src);
3225 xmm_alpha = expand_alpha_1x128 (xmm_src);
3226 mmx_src = xmm_src;
3227 mmx_alpha = xmm_alpha;
3229 while (height--)
3231 dst = dst_line;
3232 dst_line += dst_stride;
3233 mask = mask_line;
3234 mask_line += mask_stride;
3235 w = width;
3237 while (w && (uintptr_t)dst & 15)
3239 uint8_t m = *mask++;
3241 if (m)
3243 d = *dst;
3244 mmx_mask = expand_pixel_8_1x128 (m);
3245 mmx_dest = unpack_32_1x128 (d);
3247 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3248 &mmx_alpha,
3249 &mmx_mask,
3250 &mmx_dest));
3253 w--;
3254 dst++;
3257 while (w >= 4)
3259 m = *((uint32_t*)mask);
3261 if (srca == 0xff && m == 0xffffffff)
3263 save_128_aligned ((__m128i*)dst, xmm_def);
3265 else if (m)
3267 xmm_dst = load_128_aligned ((__m128i*) dst);
3268 xmm_mask = unpack_32_1x128 (m);
3269 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3271 /* Unpacking */
3272 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3273 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3275 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3276 &xmm_mask_lo, &xmm_mask_hi);
3278 in_over_2x128 (&xmm_src, &xmm_src,
3279 &xmm_alpha, &xmm_alpha,
3280 &xmm_mask_lo, &xmm_mask_hi,
3281 &xmm_dst_lo, &xmm_dst_hi);
3283 save_128_aligned (
3284 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3287 w -= 4;
3288 dst += 4;
3289 mask += 4;
3292 while (w)
3294 uint8_t m = *mask++;
3296 if (m)
3298 d = *dst;
3299 mmx_mask = expand_pixel_8_1x128 (m);
3300 mmx_dest = unpack_32_1x128 (d);
3302 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3303 &mmx_alpha,
3304 &mmx_mask,
3305 &mmx_dest));
3308 w--;
3309 dst++;
3315 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3316 __attribute__((__force_align_arg_pointer__))
3317 #endif
3318 static pixman_bool_t
3319 sse2_fill (pixman_implementation_t *imp,
3320 uint32_t * bits,
3321 int stride,
3322 int bpp,
3323 int x,
3324 int y,
3325 int width,
3326 int height,
3327 uint32_t filler)
3329 uint32_t byte_width;
3330 uint8_t *byte_line;
3332 __m128i xmm_def;
3334 if (bpp == 8)
3336 uint8_t b;
3337 uint16_t w;
3339 stride = stride * (int) sizeof (uint32_t) / 1;
3340 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3341 byte_width = width;
3342 stride *= 1;
3344 b = filler & 0xff;
3345 w = (b << 8) | b;
3346 filler = (w << 16) | w;
3348 else if (bpp == 16)
3350 stride = stride * (int) sizeof (uint32_t) / 2;
3351 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3352 byte_width = 2 * width;
3353 stride *= 2;
3355 filler = (filler & 0xffff) * 0x00010001;
3357 else if (bpp == 32)
3359 stride = stride * (int) sizeof (uint32_t) / 4;
3360 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3361 byte_width = 4 * width;
3362 stride *= 4;
3364 else
3366 return FALSE;
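/* The incoming stride is in uint32_t units, so each branch above first
 * rescales it to destination elements and then to bytes.  For 8 and
 * 16 bpp the filler is also replicated into a full 32-bit word, e.g.
 * 0xab becomes 0xabababab and 0x1234 becomes 0x12341234, so the same
 * 128-bit store path can be reused for every depth. */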
3369 xmm_def = create_mask_2x32_128 (filler, filler);
3371 while (height--)
3373 int w;
3374 uint8_t *d = byte_line;
3375 byte_line += stride;
3376 w = byte_width;
3378 if (w >= 1 && ((uintptr_t)d & 1))
3380 *(uint8_t *)d = filler;
3381 w -= 1;
3382 d += 1;
3385 while (w >= 2 && ((uintptr_t)d & 3))
3387 *(uint16_t *)d = filler;
3388 w -= 2;
3389 d += 2;
3392 while (w >= 4 && ((uintptr_t)d & 15))
3394 *(uint32_t *)d = filler;
3396 w -= 4;
3397 d += 4;
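/* With d now 16-byte aligned, the bulk is written with aligned 128-bit
 * stores in descending chunk sizes (128, 64, 32, 16 bytes); whatever
 * is left under 16 bytes falls back to 4-, 2- and 1-byte stores. */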
3400 while (w >= 128)
3402 save_128_aligned ((__m128i*)(d), xmm_def);
3403 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3404 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3405 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3406 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3407 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3408 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3409 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3411 d += 128;
3412 w -= 128;
3415 if (w >= 64)
3417 save_128_aligned ((__m128i*)(d), xmm_def);
3418 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3419 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3420 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3422 d += 64;
3423 w -= 64;
3426 if (w >= 32)
3428 save_128_aligned ((__m128i*)(d), xmm_def);
3429 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3431 d += 32;
3432 w -= 32;
3435 if (w >= 16)
3437 save_128_aligned ((__m128i*)(d), xmm_def);
3439 d += 16;
3440 w -= 16;
3443 while (w >= 4)
3445 *(uint32_t *)d = filler;
3447 w -= 4;
3448 d += 4;
3451 if (w >= 2)
3453 *(uint16_t *)d = filler;
3454 w -= 2;
3455 d += 2;
3458 if (w >= 1)
3460 *(uint8_t *)d = filler;
3461 w -= 1;
3462 d += 1;
3466 return TRUE;
3469 static void
3470 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3471 pixman_composite_info_t *info)
3473 PIXMAN_COMPOSITE_ARGS (info);
3474 uint32_t src, srca;
3475 uint32_t *dst_line, *dst;
3476 uint8_t *mask_line, *mask;
3477 int dst_stride, mask_stride;
3478 int32_t w;
3479 uint32_t m;
3481 __m128i xmm_src, xmm_def;
3482 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486 srca = src >> 24;
3487 if (src == 0)
3489 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3490 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3491 dest_x, dest_y, width, height, 0);
3492 return;
3495 PIXMAN_IMAGE_GET_LINE (
3496 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3497 PIXMAN_IMAGE_GET_LINE (
3498 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500 xmm_def = create_mask_2x32_128 (src, src);
3501 xmm_src = expand_pixel_32_1x128 (src);
3503 while (height--)
3505 dst = dst_line;
3506 dst_line += dst_stride;
3507 mask = mask_line;
3508 mask_line += mask_stride;
3509 w = width;
3511 while (w && (uintptr_t)dst & 15)
3513 uint8_t m = *mask++;
3515 if (m)
3517 *dst = pack_1x128_32 (
3518 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520 else
3522 *dst = 0;
3525 w--;
3526 dst++;
3529 while (w >= 4)
3531 m = *((uint32_t*)mask);
3533 if (srca == 0xff && m == 0xffffffff)
3535 save_128_aligned ((__m128i*)dst, xmm_def);
3537 else if (m)
3539 xmm_mask = unpack_32_1x128 (m);
3540 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3542 /* Unpacking */
3543 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3545 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3546 &xmm_mask_lo, &xmm_mask_hi);
3548 pix_multiply_2x128 (&xmm_src, &xmm_src,
3549 &xmm_mask_lo, &xmm_mask_hi,
3550 &xmm_mask_lo, &xmm_mask_hi);
3552 save_128_aligned (
3553 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3555 else
3557 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3560 w -= 4;
3561 dst += 4;
3562 mask += 4;
3565 while (w)
3567 uint8_t m = *mask++;
3569 if (m)
3571 *dst = pack_1x128_32 (
3572 pix_multiply_1x128 (
3573 xmm_src, expand_pixel_8_1x128 (m)));
3575 else
3577 *dst = 0;
3580 w--;
3581 dst++;
3587 static void
3588 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3589 pixman_composite_info_t *info)
3591 PIXMAN_COMPOSITE_ARGS (info);
3592 uint32_t src;
3593 uint16_t *dst_line, *dst, d;
3594 uint8_t *mask_line, *mask;
3595 int dst_stride, mask_stride;
3596 int32_t w;
3597 uint32_t m;
3598 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600 __m128i xmm_src, xmm_alpha;
3601 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3602 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606 if (src == 0)
3607 return;
3609 PIXMAN_IMAGE_GET_LINE (
3610 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3611 PIXMAN_IMAGE_GET_LINE (
3612 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614 xmm_src = expand_pixel_32_1x128 (src);
3615 xmm_alpha = expand_alpha_1x128 (xmm_src);
3616 mmx_src = xmm_src;
3617 mmx_alpha = xmm_alpha;
3619 while (height--)
3621 dst = dst_line;
3622 dst_line += dst_stride;
3623 mask = mask_line;
3624 mask_line += mask_stride;
3625 w = width;
3627 while (w && (uintptr_t)dst & 15)
3629 m = *mask++;
3631 if (m)
3633 d = *dst;
3634 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3635 mmx_dest = expand565_16_1x128 (d);
3637 *dst = pack_565_32_16 (
3638 pack_1x128_32 (
3639 in_over_1x128 (
3640 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3643 w--;
3644 dst++;
3647 while (w >= 8)
3649 xmm_dst = load_128_aligned ((__m128i*) dst);
3650 unpack_565_128_4x128 (xmm_dst,
3651 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3653 m = *((uint32_t*)mask);
3654 mask += 4;
3656 if (m)
3658 xmm_mask = unpack_32_1x128 (m);
3659 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3661 /* Unpacking */
3662 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3664 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3665 &xmm_mask_lo, &xmm_mask_hi);
3667 in_over_2x128 (&xmm_src, &xmm_src,
3668 &xmm_alpha, &xmm_alpha,
3669 &xmm_mask_lo, &xmm_mask_hi,
3670 &xmm_dst0, &xmm_dst1);
3673 m = *((uint32_t*)mask);
3674 mask += 4;
3676 if (m)
3678 xmm_mask = unpack_32_1x128 (m);
3679 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3681 /* Unpacking */
3682 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3684 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3685 &xmm_mask_lo, &xmm_mask_hi);
3686 in_over_2x128 (&xmm_src, &xmm_src,
3687 &xmm_alpha, &xmm_alpha,
3688 &xmm_mask_lo, &xmm_mask_hi,
3689 &xmm_dst2, &xmm_dst3);
3692 save_128_aligned (
3693 (__m128i*)dst, pack_565_4x128_128 (
3694 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3696 w -= 8;
3697 dst += 8;
3700 while (w)
3702 m = *mask++;
3704 if (m)
3706 d = *dst;
3707 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3708 mmx_dest = expand565_16_1x128 (d);
3710 *dst = pack_565_32_16 (
3711 pack_1x128_32 (
3712 in_over_1x128 (
3713 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3716 w--;
3717 dst++;
3723 static void
3724 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3725 pixman_composite_info_t *info)
3727 PIXMAN_COMPOSITE_ARGS (info);
3728 uint16_t *dst_line, *dst, d;
3729 uint32_t *src_line, *src, s;
3730 int dst_stride, src_stride;
3731 int32_t w;
3732 uint32_t opaque, zero;
3734 __m128i ms;
3735 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3736 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3738 PIXMAN_IMAGE_GET_LINE (
3739 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3740 PIXMAN_IMAGE_GET_LINE (
3741 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3743 while (height--)
3745 dst = dst_line;
3746 dst_line += dst_stride;
3747 src = src_line;
3748 src_line += src_stride;
3749 w = width;
3751 while (w && (uintptr_t)dst & 15)
3753 s = *src++;
3754 d = *dst;
3756 ms = unpack_32_1x128 (s);
3758 *dst++ = pack_565_32_16 (
3759 pack_1x128_32 (
3760 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3761 w--;
3764 while (w >= 8)
3766 /* First round */
3767 xmm_src = load_128_unaligned ((__m128i*)src);
3768 xmm_dst = load_128_aligned ((__m128i*)dst);
3770 opaque = is_opaque (xmm_src);
3771 zero = is_zero (xmm_src);
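/* Fast-path classification, assuming is_opaque () tests that every
 * alpha byte is 0xff and is_zero () tests that the whole vector is
 * zero: an opaque source reduces OVER to a copy (only the channel
 * swap in invert_colors_2x128 is needed), an all-zero source leaves
 * the destination untouched, and everything else takes the full
 * over_rev_non_pre path. */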
3773 unpack_565_128_4x128 (xmm_dst,
3774 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3777 /* preload next round */
3778 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3780 if (opaque)
3782 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3783 &xmm_dst0, &xmm_dst1);
3785 else if (!zero)
3787 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3788 &xmm_dst0, &xmm_dst1);
3791 /* Second round */
3792 opaque = is_opaque (xmm_src);
3793 zero = is_zero (xmm_src);
3795 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3797 if (opaque)
3799 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3800 &xmm_dst2, &xmm_dst3);
3802 else if (!zero)
3804 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3805 &xmm_dst2, &xmm_dst3);
3808 save_128_aligned (
3809 (__m128i*)dst, pack_565_4x128_128 (
3810 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3812 w -= 8;
3813 src += 8;
3814 dst += 8;
3817 while (w)
3819 s = *src++;
3820 d = *dst;
3822 ms = unpack_32_1x128 (s);
3824 *dst++ = pack_565_32_16 (
3825 pack_1x128_32 (
3826 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3827 w--;
3833 static void
3834 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3835 pixman_composite_info_t *info)
3837 PIXMAN_COMPOSITE_ARGS (info);
3838 uint32_t *dst_line, *dst, d;
3839 uint32_t *src_line, *src, s;
3840 int dst_stride, src_stride;
3841 int32_t w;
3842 uint32_t opaque, zero;
3844 __m128i xmm_src_lo, xmm_src_hi;
3845 __m128i xmm_dst_lo, xmm_dst_hi;
3847 PIXMAN_IMAGE_GET_LINE (
3848 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3849 PIXMAN_IMAGE_GET_LINE (
3850 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3852 while (height--)
3854 dst = dst_line;
3855 dst_line += dst_stride;
3856 src = src_line;
3857 src_line += src_stride;
3858 w = width;
3860 while (w && (uintptr_t)dst & 15)
3862 s = *src++;
3863 d = *dst;
3865 *dst++ = pack_1x128_32 (
3866 over_rev_non_pre_1x128 (
3867 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3869 w--;
3872 while (w >= 4)
3874 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3876 opaque = is_opaque (xmm_src_hi);
3877 zero = is_zero (xmm_src_hi);
3879 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3881 if (opaque)
3883 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3884 &xmm_dst_lo, &xmm_dst_hi);
3886 save_128_aligned (
3887 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3889 else if (!zero)
3891 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3893 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3895 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3896 &xmm_dst_lo, &xmm_dst_hi);
3898 save_128_aligned (
3899 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3902 w -= 4;
3903 dst += 4;
3904 src += 4;
3907 while (w)
3909 s = *src++;
3910 d = *dst;
3912 *dst++ = pack_1x128_32 (
3913 over_rev_non_pre_1x128 (
3914 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3916 w--;
3922 static void
3923 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3924 pixman_composite_info_t *info)
3926 PIXMAN_COMPOSITE_ARGS (info);
3927 uint32_t src;
3928 uint16_t *dst_line, *dst, d;
3929 uint32_t *mask_line, *mask, m;
3930 int dst_stride, mask_stride;
3931 int w;
3932 uint32_t pack_cmp;
3934 __m128i xmm_src, xmm_alpha;
3935 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3936 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3938 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3940 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3942 if (src == 0)
3943 return;
3945 PIXMAN_IMAGE_GET_LINE (
3946 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3947 PIXMAN_IMAGE_GET_LINE (
3948 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3950 xmm_src = expand_pixel_32_1x128 (src);
3951 xmm_alpha = expand_alpha_1x128 (xmm_src);
3952 mmx_src = xmm_src;
3953 mmx_alpha = xmm_alpha;
3955 while (height--)
3957 w = width;
3958 mask = mask_line;
3959 dst = dst_line;
3960 mask_line += mask_stride;
3961 dst_line += dst_stride;
3963 while (w && ((uintptr_t)dst & 15))
3965 m = *(uint32_t *) mask;
3967 if (m)
3969 d = *dst;
3970 mmx_mask = unpack_32_1x128 (m);
3971 mmx_dest = expand565_16_1x128 (d);
3973 *dst = pack_565_32_16 (
3974 pack_1x128_32 (
3975 in_over_1x128 (
3976 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3979 w--;
3980 dst++;
3981 mask++;
3984 while (w >= 8)
3986 /* First round */
3987 xmm_mask = load_128_unaligned ((__m128i*)mask);
3988 xmm_dst = load_128_aligned ((__m128i*)dst);
3990 pack_cmp = _mm_movemask_epi8 (
3991 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3993 unpack_565_128_4x128 (xmm_dst,
3994 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3995 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3997 /* preload next round */
3998 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4001 if (pack_cmp != 0xffff)
4003 in_over_2x128 (&xmm_src, &xmm_src,
4004 &xmm_alpha, &xmm_alpha,
4005 &xmm_mask_lo, &xmm_mask_hi,
4006 &xmm_dst0, &xmm_dst1);
4009 /* Second round */
4010 pack_cmp = _mm_movemask_epi8 (
4011 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4013 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4015 if (pack_cmp != 0xffff)
4017 in_over_2x128 (&xmm_src, &xmm_src,
4018 &xmm_alpha, &xmm_alpha,
4019 &xmm_mask_lo, &xmm_mask_hi,
4020 &xmm_dst2, &xmm_dst3);
4023 save_128_aligned (
4024 (__m128i*)dst, pack_565_4x128_128 (
4025 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4027 w -= 8;
4028 dst += 8;
4029 mask += 8;
4032 while (w)
4034 m = *(uint32_t *) mask;
4036 if (m)
4038 d = *dst;
4039 mmx_mask = unpack_32_1x128 (m);
4040 mmx_dest = expand565_16_1x128 (d);
4042 *dst = pack_565_32_16 (
4043 pack_1x128_32 (
4044 in_over_1x128 (
4045 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4048 w--;
4049 dst++;
4050 mask++;
4056 static void
4057 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4058 pixman_composite_info_t *info)
4060 PIXMAN_COMPOSITE_ARGS (info);
4061 uint8_t *dst_line, *dst;
4062 uint8_t *mask_line, *mask;
4063 int dst_stride, mask_stride;
4064 uint32_t d, m;
4065 uint32_t src;
4066 int32_t w;
4068 __m128i xmm_alpha;
4069 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4070 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072 PIXMAN_IMAGE_GET_LINE (
4073 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4074 PIXMAN_IMAGE_GET_LINE (
4075 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4077 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4079 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4081 while (height--)
4083 dst = dst_line;
4084 dst_line += dst_stride;
4085 mask = mask_line;
4086 mask_line += mask_stride;
4087 w = width;
4089 while (w && ((uintptr_t)dst & 15))
4091 m = (uint32_t) *mask++;
4092 d = (uint32_t) *dst;
4094 *dst++ = (uint8_t) pack_1x128_32 (
4095 pix_multiply_1x128 (
4096 pix_multiply_1x128 (xmm_alpha,
4097 unpack_32_1x128 (m)),
4098 unpack_32_1x128 (d)));
4099 w--;
4102 while (w >= 16)
4104 xmm_mask = load_128_unaligned ((__m128i*)mask);
4105 xmm_dst = load_128_aligned ((__m128i*)dst);
4107 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4108 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4110 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4111 &xmm_mask_lo, &xmm_mask_hi,
4112 &xmm_mask_lo, &xmm_mask_hi);
4114 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4115 &xmm_dst_lo, &xmm_dst_hi,
4116 &xmm_dst_lo, &xmm_dst_hi);
4118 save_128_aligned (
4119 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4121 mask += 16;
4122 dst += 16;
4123 w -= 16;
4126 while (w)
4128 m = (uint32_t) *mask++;
4129 d = (uint32_t) *dst;
4131 *dst++ = (uint8_t) pack_1x128_32 (
4132 pix_multiply_1x128 (
4133 pix_multiply_1x128 (
4134 xmm_alpha, unpack_32_1x128 (m)),
4135 unpack_32_1x128 (d)));
4136 w--;
4142 static void
4143 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4144 pixman_composite_info_t *info)
4146 PIXMAN_COMPOSITE_ARGS (info);
4147 uint8_t *dst_line, *dst;
4148 int dst_stride;
4149 uint32_t d;
4150 uint32_t src;
4151 int32_t w;
4153 __m128i xmm_alpha;
4154 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4156 PIXMAN_IMAGE_GET_LINE (
4157 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4159 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4161 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163 src = src >> 24;
4165 if (src == 0xff)
4166 return;
4168 if (src == 0x00)
4170 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4171 8, dest_x, dest_y, width, height, src);
4173 return;
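/* IN with a solid source reduces to dest = dest * src.alpha / 255,
 * so an alpha of 0xff is a no-op and an alpha of 0x00 clears the
 * destination; both extremes are short-circuited above. */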
4176 while (height--)
4178 dst = dst_line;
4179 dst_line += dst_stride;
4180 w = width;
4182 while (w && ((uintptr_t)dst & 15))
4184 d = (uint32_t) *dst;
4186 *dst++ = (uint8_t) pack_1x128_32 (
4187 pix_multiply_1x128 (
4188 xmm_alpha,
4189 unpack_32_1x128 (d)));
4190 w--;
4193 while (w >= 16)
4195 xmm_dst = load_128_aligned ((__m128i*)dst);
4197 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4199 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4200 &xmm_dst_lo, &xmm_dst_hi,
4201 &xmm_dst_lo, &xmm_dst_hi);
4203 save_128_aligned (
4204 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4206 dst += 16;
4207 w -= 16;
4210 while (w)
4212 d = (uint32_t) *dst;
4214 *dst++ = (uint8_t) pack_1x128_32 (
4215 pix_multiply_1x128 (
4216 xmm_alpha,
4217 unpack_32_1x128 (d)));
4218 w--;
4224 static void
4225 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4226 pixman_composite_info_t *info)
4228 PIXMAN_COMPOSITE_ARGS (info);
4229 uint8_t *dst_line, *dst;
4230 uint8_t *src_line, *src;
4231 int src_stride, dst_stride;
4232 int32_t w;
4233 uint32_t s, d;
4235 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4236 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4238 PIXMAN_IMAGE_GET_LINE (
4239 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4240 PIXMAN_IMAGE_GET_LINE (
4241 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4243 while (height--)
4245 dst = dst_line;
4246 dst_line += dst_stride;
4247 src = src_line;
4248 src_line += src_stride;
4249 w = width;
4251 while (w && ((uintptr_t)dst & 15))
4253 s = (uint32_t) *src++;
4254 d = (uint32_t) *dst;
4256 *dst++ = (uint8_t) pack_1x128_32 (
4257 pix_multiply_1x128 (
4258 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4259 w--;
4262 while (w >= 16)
4264 xmm_src = load_128_unaligned ((__m128i*)src);
4265 xmm_dst = load_128_aligned ((__m128i*)dst);
4267 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4268 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4270 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4271 &xmm_dst_lo, &xmm_dst_hi,
4272 &xmm_dst_lo, &xmm_dst_hi);
4274 save_128_aligned (
4275 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4277 src += 16;
4278 dst += 16;
4279 w -= 16;
4282 while (w)
4284 s = (uint32_t) *src++;
4285 d = (uint32_t) *dst;
4287 *dst++ = (uint8_t) pack_1x128_32 (
4288 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4289 w--;
4295 static void
4296 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4297 pixman_composite_info_t *info)
4299 PIXMAN_COMPOSITE_ARGS (info);
4300 uint8_t *dst_line, *dst;
4301 uint8_t *mask_line, *mask;
4302 int dst_stride, mask_stride;
4303 int32_t w;
4304 uint32_t src;
4305 uint32_t m, d;
4307 __m128i xmm_alpha;
4308 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4309 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4311 PIXMAN_IMAGE_GET_LINE (
4312 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4313 PIXMAN_IMAGE_GET_LINE (
4314 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4318 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4320 while (height--)
4322 dst = dst_line;
4323 dst_line += dst_stride;
4324 mask = mask_line;
4325 mask_line += mask_stride;
4326 w = width;
4328 while (w && ((uintptr_t)dst & 15))
4330 m = (uint32_t) *mask++;
4331 d = (uint32_t) *dst;
4333 *dst++ = (uint8_t) pack_1x128_32 (
4334 _mm_adds_epu16 (
4335 pix_multiply_1x128 (
4336 xmm_alpha, unpack_32_1x128 (m)),
4337 unpack_32_1x128 (d)));
4338 w--;
4341 while (w >= 16)
4343 xmm_mask = load_128_unaligned ((__m128i*)mask);
4344 xmm_dst = load_128_aligned ((__m128i*)dst);
4346 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4347 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4349 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4350 &xmm_mask_lo, &xmm_mask_hi,
4351 &xmm_mask_lo, &xmm_mask_hi);
4353 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4354 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4356 save_128_aligned (
4357 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4359 mask += 16;
4360 dst += 16;
4361 w -= 16;
4364 while (w)
4366 m = (uint32_t) *mask++;
4367 d = (uint32_t) *dst;
4369 *dst++ = (uint8_t) pack_1x128_32 (
4370 _mm_adds_epu16 (
4371 pix_multiply_1x128 (
4372 xmm_alpha, unpack_32_1x128 (m)),
4373 unpack_32_1x128 (d)));
4375 w--;
4381 static void
4382 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4383 pixman_composite_info_t *info)
4385 PIXMAN_COMPOSITE_ARGS (info);
4386 uint8_t *dst_line, *dst;
4387 int dst_stride;
4388 int32_t w;
4389 uint32_t src;
4391 __m128i xmm_src;
4393 PIXMAN_IMAGE_GET_LINE (
4394 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4396 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4398 src >>= 24;
4400 if (src == 0x00)
4401 return;
4403 if (src == 0xff)
4405 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4406 8, dest_x, dest_y, width, height, 0xff);
4408 return;
4411 src = (src << 24) | (src << 16) | (src << 8) | src;
4412 xmm_src = _mm_set_epi32 (src, src, src, src);
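/* The 8-bit alpha is replicated into every byte (e.g. 0x7f becomes
 * 0x7f7f7f7f) and broadcast to all four lanes, so one _mm_adds_epu8 ()
 * below adds the same saturated value to 16 destination bytes. */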
4414 while (height--)
4416 dst = dst_line;
4417 dst_line += dst_stride;
4418 w = width;
4420 while (w && ((uintptr_t)dst & 15))
4422 *dst = (uint8_t)_mm_cvtsi128_si32 (
4423 _mm_adds_epu8 (
4424 xmm_src,
4425 _mm_cvtsi32_si128 (*dst)));
4427 w--;
4428 dst++;
4431 while (w >= 16)
4433 save_128_aligned (
4434 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4436 dst += 16;
4437 w -= 16;
4440 while (w)
4442 *dst = (uint8_t)_mm_cvtsi128_si32 (
4443 _mm_adds_epu8 (
4444 xmm_src,
4445 _mm_cvtsi32_si128 (*dst)));
4447 w--;
4448 dst++;
4454 static void
4455 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4456 pixman_composite_info_t *info)
4458 PIXMAN_COMPOSITE_ARGS (info);
4459 uint8_t *dst_line, *dst;
4460 uint8_t *src_line, *src;
4461 int dst_stride, src_stride;
4462 int32_t w;
4463 uint16_t t;
4465 PIXMAN_IMAGE_GET_LINE (
4466 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4467 PIXMAN_IMAGE_GET_LINE (
4468 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4470 while (height--)
4472 dst = dst_line;
4473 src = src_line;
4475 dst_line += dst_stride;
4476 src_line += src_stride;
4477 w = width;
4479 /* Small head */
4480 while (w && (uintptr_t)dst & 3)
4482 t = (*dst) + (*src++);
4483 *dst++ = t | (0 - (t >> 8));
4484 w--;
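/* The scalar form above is a branch-free saturating byte add: when the
 * 16-bit sum t overflows 8 bits, (t >> 8) is 1, (0 - (t >> 8)) is all
 * ones, and the stored byte clamps to 0xff.  The aligned middle part
 * is then handed to sse2_combine_add_u () four bytes at a time, which
 * is why the pointers advance by w & 0xfffc and only w & 3 trailing
 * bytes remain for the small tail below. */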
4487 sse2_combine_add_u (imp, op,
4488 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4490 /* Small tail */
4491 dst += w & 0xfffc;
4492 src += w & 0xfffc;
4494 w &= 3;
4496 while (w)
4498 t = (*dst) + (*src++);
4499 *dst++ = t | (0 - (t >> 8));
4500 w--;
4506 static void
4507 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4508 pixman_composite_info_t *info)
4510 PIXMAN_COMPOSITE_ARGS (info);
4511 uint32_t *dst_line, *dst;
4512 uint32_t *src_line, *src;
4513 int dst_stride, src_stride;
4515 PIXMAN_IMAGE_GET_LINE (
4516 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4517 PIXMAN_IMAGE_GET_LINE (
4518 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4520 while (height--)
4522 dst = dst_line;
4523 dst_line += dst_stride;
4524 src = src_line;
4525 src_line += src_stride;
4527 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4531 static void
4532 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4533 pixman_composite_info_t *info)
4535 PIXMAN_COMPOSITE_ARGS (info);
4536 uint32_t *dst_line, *dst, src;
4537 int dst_stride;
4539 __m128i xmm_src;
4541 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4543 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4544 if (src == 0)
4545 return;
4547 if (src == ~0)
4549 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4550 dest_x, dest_y, width, height, ~0);
4552 return;
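/* ADD with a zero source changes nothing, and adding 0xffffffff
 * saturates every channel to 0xff, so both extremes are handled above
 * without entering the per-pixel loop. */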
4555 xmm_src = _mm_set_epi32 (src, src, src, src);
4556 while (height--)
4558 int w = width;
4559 uint32_t d;
4561 dst = dst_line;
4562 dst_line += dst_stride;
4564 while (w && (uintptr_t)dst & 15)
4566 d = *dst;
4567 *dst++ =
4568 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4569 w--;
4572 while (w >= 4)
4574 save_128_aligned
4575 ((__m128i*)dst,
4576 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4578 dst += 4;
4579 w -= 4;
4582 while (w--)
4584 d = *dst;
4585 *dst++ =
4586 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4587 _mm_cvtsi32_si128 (d)));
4592 static void
4593 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4594 pixman_composite_info_t *info)
4596 PIXMAN_COMPOSITE_ARGS (info);
4597 uint32_t *dst_line, *dst;
4598 uint8_t *mask_line, *mask;
4599 int dst_stride, mask_stride;
4600 int32_t w;
4601 uint32_t src;
4603 __m128i xmm_src;
4605 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4606 if (src == 0)
4607 return;
4608 xmm_src = expand_pixel_32_1x128 (src);
4610 PIXMAN_IMAGE_GET_LINE (
4611 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4612 PIXMAN_IMAGE_GET_LINE (
4613 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4615 while (height--)
4617 dst = dst_line;
4618 dst_line += dst_stride;
4619 mask = mask_line;
4620 mask_line += mask_stride;
4621 w = width;
4623 while (w && ((uintptr_t)dst & 15))
4625 uint8_t m = *mask++;
4626 if (m)
4628 *dst = pack_1x128_32
4629 (_mm_adds_epu16
4630 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4631 unpack_32_1x128 (*dst)));
4633 dst++;
4634 w--;
4637 while (w >= 4)
4639 uint32_t m = *(uint32_t*)mask;
4640 if (m)
4642 __m128i xmm_mask_lo, xmm_mask_hi;
4643 __m128i xmm_dst_lo, xmm_dst_hi;
4645 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4646 __m128i xmm_mask =
4647 _mm_unpacklo_epi8 (unpack_32_1x128(m),
4648 _mm_setzero_si128 ());
4650 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4651 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4653 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4654 &xmm_mask_lo, &xmm_mask_hi);
4656 pix_multiply_2x128 (&xmm_src, &xmm_src,
4657 &xmm_mask_lo, &xmm_mask_hi,
4658 &xmm_mask_lo, &xmm_mask_hi);
4660 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4661 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4663 save_128_aligned (
4664 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4667 w -= 4;
4668 dst += 4;
4669 mask += 4;
4672 while (w)
4674 uint8_t m = *mask++;
4675 if (m)
4677 *dst = pack_1x128_32
4678 (_mm_adds_epu16
4679 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4680 unpack_32_1x128 (*dst)));
4682 dst++;
4683 w--;
4688 static pixman_bool_t
4689 sse2_blt (pixman_implementation_t *imp,
4690 uint32_t * src_bits,
4691 uint32_t * dst_bits,
4692 int src_stride,
4693 int dst_stride,
4694 int src_bpp,
4695 int dst_bpp,
4696 int src_x,
4697 int src_y,
4698 int dest_x,
4699 int dest_y,
4700 int width,
4701 int height)
4703 uint8_t * src_bytes;
4704 uint8_t * dst_bytes;
4705 int byte_width;
4707 if (src_bpp != dst_bpp)
4708 return FALSE;
4710 if (src_bpp == 16)
4712 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4713 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4714 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4715 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4716 byte_width = 2 * width;
4717 src_stride *= 2;
4718 dst_stride *= 2;
4720 else if (src_bpp == 32)
4722 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4723 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4724 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4725 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4726 byte_width = 4 * width;
4727 src_stride *= 4;
4728 dst_stride *= 4;
4730 else
4732 return FALSE;
4735 while (height--)
4737 int w;
4738 uint8_t *s = src_bytes;
4739 uint8_t *d = dst_bytes;
4740 src_bytes += src_stride;
4741 dst_bytes += dst_stride;
4742 w = byte_width;
4744 while (w >= 2 && ((uintptr_t)d & 3))
4746 *(uint16_t *)d = *(uint16_t *)s;
4747 w -= 2;
4748 s += 2;
4749 d += 2;
4752 while (w >= 4 && ((uintptr_t)d & 15))
4754 *(uint32_t *)d = *(uint32_t *)s;
4756 w -= 4;
4757 s += 4;
4758 d += 4;
4761 while (w >= 64)
4763 __m128i xmm0, xmm1, xmm2, xmm3;
4765 xmm0 = load_128_unaligned ((__m128i*)(s));
4766 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4767 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4768 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4770 save_128_aligned ((__m128i*)(d), xmm0);
4771 save_128_aligned ((__m128i*)(d + 16), xmm1);
4772 save_128_aligned ((__m128i*)(d + 32), xmm2);
4773 save_128_aligned ((__m128i*)(d + 48), xmm3);
4775 s += 64;
4776 d += 64;
4777 w -= 64;
4780 while (w >= 16)
4782 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4784 w -= 16;
4785 d += 16;
4786 s += 16;
4789 while (w >= 4)
4791 *(uint32_t *)d = *(uint32_t *)s;
4793 w -= 4;
4794 s += 4;
4795 d += 4;
4798 if (w >= 2)
4800 *(uint16_t *)d = *(uint16_t *)s;
4801 w -= 2;
4802 s += 2;
4803 d += 2;
4807 return TRUE;
4810 static void
4811 sse2_composite_copy_area (pixman_implementation_t *imp,
4812 pixman_composite_info_t *info)
4814 PIXMAN_COMPOSITE_ARGS (info);
4815 sse2_blt (imp, src_image->bits.bits,
4816 dest_image->bits.bits,
4817 src_image->bits.rowstride,
4818 dest_image->bits.rowstride,
4819 PIXMAN_FORMAT_BPP (src_image->bits.format),
4820 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4821 src_x, src_y, dest_x, dest_y, width, height);
4824 static void
4825 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4826 pixman_composite_info_t *info)
4828 PIXMAN_COMPOSITE_ARGS (info);
4829 uint32_t *src, *src_line, s;
4830 uint32_t *dst, *dst_line, d;
4831 uint8_t *mask, *mask_line;
4832 uint32_t m;
4833 int src_stride, mask_stride, dst_stride;
4834 int32_t w;
4835 __m128i ms;
4837 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4838 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4839 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4841 PIXMAN_IMAGE_GET_LINE (
4842 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4843 PIXMAN_IMAGE_GET_LINE (
4844 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4845 PIXMAN_IMAGE_GET_LINE (
4846 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4848 while (height--)
4850 src = src_line;
4851 src_line += src_stride;
4852 dst = dst_line;
4853 dst_line += dst_stride;
4854 mask = mask_line;
4855 mask_line += mask_stride;
4857 w = width;
4859 while (w && (uintptr_t)dst & 15)
4861 s = 0xff000000 | *src++;
4862 m = (uint32_t) *mask++;
4863 d = *dst;
4864 ms = unpack_32_1x128 (s);
4866 if (m != 0xff)
4868 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4869 __m128i md = unpack_32_1x128 (d);
4871 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4874 *dst++ = pack_1x128_32 (ms);
4875 w--;
4878 while (w >= 4)
4880 m = *(uint32_t*) mask;
4881 xmm_src = _mm_or_si128 (
4882 load_128_unaligned ((__m128i*)src), mask_ff000000);
4884 if (m == 0xffffffff)
4886 save_128_aligned ((__m128i*)dst, xmm_src);
4888 else
4890 xmm_dst = load_128_aligned ((__m128i*)dst);
4892 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4894 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4895 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4896 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4898 expand_alpha_rev_2x128 (
4899 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4901 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4902 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4903 &xmm_dst_lo, &xmm_dst_hi);
4905 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4908 src += 4;
4909 dst += 4;
4910 mask += 4;
4911 w -= 4;
4914 while (w)
4916 m = (uint32_t) *mask++;
4918 if (m)
4920 s = 0xff000000 | *src;
4922 if (m == 0xff)
4924 *dst = s;
4926 else
4928 __m128i ma, md, ms;
4930 d = *dst;
4932 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4933 md = unpack_32_1x128 (d);
4934 ms = unpack_32_1x128 (s);
4936 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4941 src++;
4942 dst++;
4943 w--;
4949 static void
4950 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4951 pixman_composite_info_t *info)
4953 PIXMAN_COMPOSITE_ARGS (info);
4954 uint32_t *src, *src_line, s;
4955 uint32_t *dst, *dst_line, d;
4956 uint8_t *mask, *mask_line;
4957 uint32_t m;
4958 int src_stride, mask_stride, dst_stride;
4959 int32_t w;
4961 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4962 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4963 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4965 PIXMAN_IMAGE_GET_LINE (
4966 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4967 PIXMAN_IMAGE_GET_LINE (
4968 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4969 PIXMAN_IMAGE_GET_LINE (
4970 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4972 while (height--)
4974 src = src_line;
4975 src_line += src_stride;
4976 dst = dst_line;
4977 dst_line += dst_stride;
4978 mask = mask_line;
4979 mask_line += mask_stride;
4981 w = width;
4983 while (w && (uintptr_t)dst & 15)
4985 uint32_t sa;
4987 s = *src++;
4988 m = (uint32_t) *mask++;
4989 d = *dst;
4991 sa = s >> 24;
4993 if (m)
4995 if (sa == 0xff && m == 0xff)
4997 *dst = s;
4999 else
5001 __m128i ms, md, ma, msa;
5003 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5004 ms = unpack_32_1x128 (s);
5005 md = unpack_32_1x128 (d);
5007 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5009 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
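/* in_over_1x128 (src, srca, mask, dst) presumably computes
 * (src IN mask) OVER dst: the source color and its alpha are first
 * scaled by the 8-bit mask, then composited over the destination.
 * The sa == 0xff && m == 0xff case above is the shortcut where this
 * collapses to a plain copy. */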
5013 dst++;
5014 w--;
5017 while (w >= 4)
5019 m = *(uint32_t *) mask;
5021 if (m)
5023 xmm_src = load_128_unaligned ((__m128i*)src);
5025 if (m == 0xffffffff && is_opaque (xmm_src))
5027 save_128_aligned ((__m128i *)dst, xmm_src);
5029 else
5031 xmm_dst = load_128_aligned ((__m128i *)dst);
5033 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5035 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5036 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5037 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5039 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5040 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5042 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5043 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5045 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5049 src += 4;
5050 dst += 4;
5051 mask += 4;
5052 w -= 4;
5055 while (w)
5057 uint32_t sa;
5059 s = *src++;
5060 m = (uint32_t) *mask++;
5061 d = *dst;
5063 sa = s >> 24;
5065 if (m)
5067 if (sa == 0xff && m == 0xff)
5069 *dst = s;
5071 else
5073 __m128i ms, md, ma, msa;
5075 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5076 ms = unpack_32_1x128 (s);
5077 md = unpack_32_1x128 (d);
5079 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5081 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5085 dst++;
5086 w--;
5092 static void
5093 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5094 pixman_composite_info_t *info)
5096 PIXMAN_COMPOSITE_ARGS (info);
5097 uint32_t src;
5098 uint32_t *dst_line, *dst;
5099 __m128i xmm_src;
5100 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5101 __m128i xmm_dsta_hi, xmm_dsta_lo;
5102 int dst_stride;
5103 int32_t w;
5105 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5107 if (src == 0)
5108 return;
5110 PIXMAN_IMAGE_GET_LINE (
5111 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5113 xmm_src = expand_pixel_32_1x128 (src);
5115 while (height--)
5117 dst = dst_line;
5119 dst_line += dst_stride;
5120 w = width;
5122 while (w && (uintptr_t)dst & 15)
5124 __m128i vd;
5126 vd = unpack_32_1x128 (*dst);
5128 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5129 xmm_src));
5130 w--;
5131 dst++;
5134 while (w >= 4)
5136 __m128i tmp_lo, tmp_hi;
5138 xmm_dst = load_128_aligned ((__m128i*)dst);
5140 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5141 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5143 tmp_lo = xmm_src;
5144 tmp_hi = xmm_src;
5146 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5147 &xmm_dsta_lo, &xmm_dsta_hi,
5148 &tmp_lo, &tmp_hi);
5150 save_128_aligned (
5151 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5153 w -= 4;
5154 dst += 4;
5157 while (w)
5159 __m128i vd;
5161 vd = unpack_32_1x128 (*dst);
5163 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5164 xmm_src));
5165 w--;
5166 dst++;
5173 static void
5174 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5175 pixman_composite_info_t *info)
5177 PIXMAN_COMPOSITE_ARGS (info);
5178 uint32_t *src, *src_line, s;
5179 uint32_t *dst, *dst_line, d;
5180 uint32_t *mask, *mask_line;
5181 uint32_t m;
5182 int src_stride, mask_stride, dst_stride;
5183 int32_t w;
5185 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5186 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5187 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5189 PIXMAN_IMAGE_GET_LINE (
5190 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5191 PIXMAN_IMAGE_GET_LINE (
5192 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5193 PIXMAN_IMAGE_GET_LINE (
5194 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5196 while (height--)
5198 src = src_line;
5199 src_line += src_stride;
5200 dst = dst_line;
5201 dst_line += dst_stride;
5202 mask = mask_line;
5203 mask_line += mask_stride;
5205 w = width;
5207 while (w && (uintptr_t)dst & 15)
5209 uint32_t sa;
5211 s = *src++;
5212 m = (*mask++) >> 24;
5213 d = *dst;
5215 sa = s >> 24;
5217 if (m)
5219 if (sa == 0xff && m == 0xff)
5221 *dst = s;
5223 else
5225 __m128i ms, md, ma, msa;
5227 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5228 ms = unpack_32_1x128 (s);
5229 md = unpack_32_1x128 (d);
5231 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5233 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5237 dst++;
5238 w--;
5241 while (w >= 4)
5243 xmm_mask = load_128_unaligned ((__m128i*)mask);
5245 if (!is_transparent (xmm_mask))
5247 xmm_src = load_128_unaligned ((__m128i*)src);
5249 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5251 save_128_aligned ((__m128i *)dst, xmm_src);
5253 else
5255 xmm_dst = load_128_aligned ((__m128i *)dst);
5257 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5258 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5259 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5261 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5262 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5264 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5265 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5267 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5271 src += 4;
5272 dst += 4;
5273 mask += 4;
5274 w -= 4;
5277 while (w)
5279 uint32_t sa;
5281 s = *src++;
5282 m = (*mask++) >> 24;
5283 d = *dst;
5285 sa = s >> 24;
5287 if (m)
5289 if (sa == 0xff && m == 0xff)
5291 *dst = s;
5293 else
5295 __m128i ms, md, ma, msa;
5297 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5298 ms = unpack_32_1x128 (s);
5299 md = unpack_32_1x128 (d);
5301 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5303 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5307 dst++;
5308 w--;
5314 /* A variant of 'sse2_combine_over_u' with minor tweaks */
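/*
 * vx is a 16.16 fixed-point source coordinate advanced by unit_x for every
 * destination pixel; pixman_fixed_to_int (vx) selects the nearest source
 * pixel, and the "while (vx >= 0) vx -= src_width_fixed" loops wrap the
 * coordinate back into the source line for repeating sources.
 */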
5315 static force_inline void
5316 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5317 const uint32_t* ps,
5318 int32_t w,
5319 pixman_fixed_t vx,
5320 pixman_fixed_t unit_x,
5321 pixman_fixed_t src_width_fixed,
5322 pixman_bool_t fully_transparent_src)
5324 uint32_t s, d;
5325 const uint32_t* pm = NULL;
5327 __m128i xmm_dst_lo, xmm_dst_hi;
5328 __m128i xmm_src_lo, xmm_src_hi;
5329 __m128i xmm_alpha_lo, xmm_alpha_hi;
5331 if (fully_transparent_src)
5332 return;
5334 /* Align dst on a 16-byte boundary */
5335 while (w && ((uintptr_t)pd & 15))
5337 d = *pd;
5338 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5339 vx += unit_x;
5340 while (vx >= 0)
5341 vx -= src_width_fixed;
5343 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5344 if (pm)
5345 pm++;
5346 w--;
5349 while (w >= 4)
5351 __m128i tmp;
5352 uint32_t tmp1, tmp2, tmp3, tmp4;
5354 tmp1 = *(ps + pixman_fixed_to_int (vx));
5355 vx += unit_x;
5356 while (vx >= 0)
5357 vx -= src_width_fixed;
5358 tmp2 = *(ps + pixman_fixed_to_int (vx));
5359 vx += unit_x;
5360 while (vx >= 0)
5361 vx -= src_width_fixed;
5362 tmp3 = *(ps + pixman_fixed_to_int (vx));
5363 vx += unit_x;
5364 while (vx >= 0)
5365 vx -= src_width_fixed;
5366 tmp4 = *(ps + pixman_fixed_to_int (vx));
5367 vx += unit_x;
5368 while (vx >= 0)
5369 vx -= src_width_fixed;
5371 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5373 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5375 if (is_opaque (xmm_src_hi))
5377 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5379 else if (!is_zero (xmm_src_hi))
5381 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5383 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5384 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5386 expand_alpha_2x128 (
5387 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5389 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5390 &xmm_alpha_lo, &xmm_alpha_hi,
5391 &xmm_dst_lo, &xmm_dst_hi);
5393 /* rebuild the 4 pixel data and save */
5394 save_128_aligned ((__m128i*)pd,
5395 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5398 w -= 4;
5399 pd += 4;
5400 if (pm)
5401 pm += 4;
5404 while (w)
5406 d = *pd;
5407 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5408 vx += unit_x;
5409 while (vx >= 0)
5410 vx -= src_width_fixed;
5412 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5413 if (pm)
5414 pm++;
5416 w--;
5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5421 scaled_nearest_scanline_sse2_8888_8888_OVER,
5422 uint32_t, uint32_t, COVER)
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5424 scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 uint32_t, uint32_t, NONE)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5427 scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 uint32_t, uint32_t, PAD)
5429 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5430 scaled_nearest_scanline_sse2_8888_8888_OVER,
5431 uint32_t, uint32_t, NORMAL)
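/*
 * Nearest-scaled OVER with a solid mask: the mask's alpha byte is expanded
 * once into xmm_mask with create_mask_16_128 (*mask >> 24) and applied with
 * in_over for every pixel; the scanline is skipped entirely when the source
 * is known to be zero or the mask alpha is 0.
 */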
5433 static force_inline void
5434 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5435 uint32_t * dst,
5436 const uint32_t * src,
5437 int32_t w,
5438 pixman_fixed_t vx,
5439 pixman_fixed_t unit_x,
5440 pixman_fixed_t src_width_fixed,
5441 pixman_bool_t zero_src)
5443 __m128i xmm_mask;
5444 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5445 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5446 __m128i xmm_alpha_lo, xmm_alpha_hi;
5448 if (zero_src || (*mask >> 24) == 0)
5449 return;
5451 xmm_mask = create_mask_16_128 (*mask >> 24);
5453 while (w && (uintptr_t)dst & 15)
5455 uint32_t s = *(src + pixman_fixed_to_int (vx));
5456 vx += unit_x;
5457 while (vx >= 0)
5458 vx -= src_width_fixed;
5460 if (s)
5462 uint32_t d = *dst;
5464 __m128i ms = unpack_32_1x128 (s);
5465 __m128i alpha = expand_alpha_1x128 (ms);
5466 __m128i dest = xmm_mask;
5467 __m128i alpha_dst = unpack_32_1x128 (d);
5469 *dst = pack_1x128_32 (
5470 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5472 dst++;
5473 w--;
5476 while (w >= 4)
5478 uint32_t tmp1, tmp2, tmp3, tmp4;
5480 tmp1 = *(src + pixman_fixed_to_int (vx));
5481 vx += unit_x;
5482 while (vx >= 0)
5483 vx -= src_width_fixed;
5484 tmp2 = *(src + pixman_fixed_to_int (vx));
5485 vx += unit_x;
5486 while (vx >= 0)
5487 vx -= src_width_fixed;
5488 tmp3 = *(src + pixman_fixed_to_int (vx));
5489 vx += unit_x;
5490 while (vx >= 0)
5491 vx -= src_width_fixed;
5492 tmp4 = *(src + pixman_fixed_to_int (vx));
5493 vx += unit_x;
5494 while (vx >= 0)
5495 vx -= src_width_fixed;
5497 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5499 if (!is_zero (xmm_src))
5501 xmm_dst = load_128_aligned ((__m128i*)dst);
5503 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5504 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5505 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5506 &xmm_alpha_lo, &xmm_alpha_hi);
5508 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5509 &xmm_alpha_lo, &xmm_alpha_hi,
5510 &xmm_mask, &xmm_mask,
5511 &xmm_dst_lo, &xmm_dst_hi);
5513 save_128_aligned (
5514 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5517 dst += 4;
5518 w -= 4;
5521 while (w)
5523 uint32_t s = *(src + pixman_fixed_to_int (vx));
5524 vx += unit_x;
5525 while (vx >= 0)
5526 vx -= src_width_fixed;
5528 if (s)
5530 uint32_t d = *dst;
5532 __m128i ms = unpack_32_1x128 (s);
5533 __m128i alpha = expand_alpha_1x128 (ms);
5534 __m128i mask = xmm_mask;
5535 __m128i dest = unpack_32_1x128 (d);
5537 *dst = pack_1x128_32 (
5538 in_over_1x128 (&ms, &alpha, &mask, &dest));
5541 dst++;
5542 w--;
5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5556 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5557 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5558 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
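/*
 * Bilinear fetching: for every destination pixel a 2x2 block of source
 * pixels is read (the top pair from src_top, the bottom pair from src_bottom
 * at vx >> 16) and blended with the vertical weights wt/wb and a horizontal
 * weight taken from the fractional part of vx.  Per channel this is roughly
 * (W = 1 << BILINEAR_INTERPOLATION_BITS, wt + wb == W, wx = fractional part
 * of vx reduced to W steps):
 *
 *     pix = ((tl * wt + bl * wb) * (W - wx) +
 *            (tr * wt + br * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS);
 *
 * The two macro sets below implement this computation.  The PSHUFD_IS_FAST
 * variant precomputes the horizontal weights for four pixels at a time and
 * extracts the pair it needs with _mm_shuffle_epi32, while the default
 * variant recomputes the weights for every pixel.
 */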
5560 #if PSHUFD_IS_FAST
5562 /***********************************************************************************/
5564 # define BILINEAR_DECLARE_VARIABLES \
5565 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5566 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5567 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5568 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5569 unit_x, -unit_x, unit_x, -unit_x); \
5570 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5571 unit_x * 4, -unit_x * 4, \
5572 unit_x * 4, -unit_x * 4, \
5573 unit_x * 4, -unit_x * 4); \
5574 const __m128i xmm_zero = _mm_setzero_si128 (); \
5575 __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
5576 vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
5577 vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
5578 vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
5579 __m128i xmm_wh_state;
5581 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
5582 do { \
5583 int phase = phase_; \
5584 __m128i xmm_wh, xmm_a, xmm_b; \
5585 /* fetch 2x2 pixel block into sse2 registers */ \
5586 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5587 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5588 vx += unit_x; \
5589 /* vertical interpolation */ \
5590 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5591 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5592 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5593 /* calculate horizontal weights */ \
5594 if (phase <= 0) \
5596 xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5597 16 - BILINEAR_INTERPOLATION_BITS)); \
5598 xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
5599 phase = 0; \
5601 xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
5602 phase, phase)); \
5603 /* horizontal interpolation */ \
5604 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5605 xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
5606 /* shift the result */ \
5607 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5608 } while (0)
5610 #else /************************************************************************/
5612 # define BILINEAR_DECLARE_VARIABLES \
5613 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5614 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5615 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5616 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5617 unit_x, -unit_x, unit_x, -unit_x); \
5618 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5619 unit_x * 4, -unit_x * 4, \
5620 unit_x * 4, -unit_x * 4, \
5621 unit_x * 4, -unit_x * 4); \
5622 const __m128i xmm_zero = _mm_setzero_si128 (); \
5623 __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
5624 vx, -(vx + 1), vx, -(vx + 1))
5626 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
5627 do { \
5628 __m128i xmm_wh, xmm_a, xmm_b; \
5629 /* fetch 2x2 pixel block into sse2 registers */ \
5630 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5631 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5632 (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
5633 vx += unit_x; \
5634 /* vertical interpolation */ \
5635 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5636 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5637 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5638 /* calculate horizontal weights */ \
5639 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5640 16 - BILINEAR_INTERPOLATION_BITS)); \
5641 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5642 /* horizontal interpolation */ \
5643 xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
5644 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
5645 /* shift the result */ \
5646 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5647 } while (0)
5649 /***********************************************************************************/
5651 #endif
5653 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5654 do { \
5655 __m128i xmm_pix; \
5656 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
5657 xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
5658 xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
5659 pix = _mm_cvtsi128_si32 (xmm_pix); \
5660 } while(0)
5662 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix) \
5663 do { \
5664 __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
5665 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
5666 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
5667 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
5668 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
5669 xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
5670 xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
5671 pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
5672 } while(0)
5674 #define BILINEAR_SKIP_ONE_PIXEL() \
5675 do { \
5676 vx += unit_x; \
5677 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5678 } while(0)
5680 #define BILINEAR_SKIP_FOUR_PIXELS() \
5681 do { \
5682 vx += unit_x * 4; \
5683 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
5684 } while(0)
5686 /***********************************************************************************/
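/*
 * All of the bilinear scanline functions below share the same skeleton:
 * interpolate single pixels until dst reaches a 16-byte boundary, process
 * four pixels per iteration with BILINEAR_INTERPOLATE_FOUR_PIXELS and
 * aligned stores, then finish the remaining pixels one at a time.
 */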
5688 static force_inline void
5689 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5690 const uint32_t * mask,
5691 const uint32_t * src_top,
5692 const uint32_t * src_bottom,
5693 int32_t w,
5694 int wt,
5695 int wb,
5696 pixman_fixed_t vx_,
5697 pixman_fixed_t unit_x_,
5698 pixman_fixed_t max_vx,
5699 pixman_bool_t zero_src)
5701 intptr_t vx = vx_;
5702 intptr_t unit_x = unit_x_;
5703 BILINEAR_DECLARE_VARIABLES;
5704 uint32_t pix1, pix2;
5706 while (w && ((uintptr_t)dst & 15))
5708 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5709 *dst++ = pix1;
5710 w--;
5713 while ((w -= 4) >= 0) {
5714 __m128i xmm_src;
5715 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5716 _mm_store_si128 ((__m128i *)dst, xmm_src);
5717 dst += 4;
5720 if (w & 2)
5722 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5724 *dst++ = pix1;
5725 *dst++ = pix2;
5728 if (w & 1)
5730 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5731 *dst = pix1;
5736 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5737 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5738 uint32_t, uint32_t, uint32_t,
5739 COVER, FLAG_NONE)
5740 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5741 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5742 uint32_t, uint32_t, uint32_t,
5743 PAD, FLAG_NONE)
5744 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5745 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5746 uint32_t, uint32_t, uint32_t,
5747 NONE, FLAG_NONE)
5748 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5749 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5750 uint32_t, uint32_t, uint32_t,
5751 NORMAL, FLAG_NONE)
5753 static force_inline void
5754 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst,
5755 const uint32_t * mask,
5756 const uint32_t * src_top,
5757 const uint32_t * src_bottom,
5758 int32_t w,
5759 int wt,
5760 int wb,
5761 pixman_fixed_t vx_,
5762 pixman_fixed_t unit_x_,
5763 pixman_fixed_t max_vx,
5764 pixman_bool_t zero_src)
5766 intptr_t vx = vx_;
5767 intptr_t unit_x = unit_x_;
5768 BILINEAR_DECLARE_VARIABLES;
5769 uint32_t pix1, pix2;
5771 while (w && ((uintptr_t)dst & 15))
5773 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5774 *dst++ = pix1 | 0xFF000000;
5775 w--;
5778 while ((w -= 4) >= 0) {
5779 __m128i xmm_src;
5780 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5781 _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5782 dst += 4;
5785 if (w & 2)
5787 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5788 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5789 *dst++ = pix1 | 0xFF000000;
5790 *dst++ = pix2 | 0xFF000000;
5793 if (w & 1)
5795 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5796 *dst = pix1 | 0xFF000000;
5800 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5801 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5802 uint32_t, uint32_t, uint32_t,
5803 COVER, FLAG_NONE)
5804 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5805 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5806 uint32_t, uint32_t, uint32_t,
5807 PAD, FLAG_NONE)
5808 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5809 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5810 uint32_t, uint32_t, uint32_t,
5811 NORMAL, FLAG_NONE)
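/*
 * Bilinear-scaled OVER: each interpolated 4-pixel block is first tested with
 * is_zero () (nothing to do) and is_opaque () (plain store) before falling
 * back to the generic over_2x128 () path.
 */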
5813 static force_inline void
5814 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5815 const uint32_t * mask,
5816 const uint32_t * src_top,
5817 const uint32_t * src_bottom,
5818 int32_t w,
5819 int wt,
5820 int wb,
5821 pixman_fixed_t vx_,
5822 pixman_fixed_t unit_x_,
5823 pixman_fixed_t max_vx,
5824 pixman_bool_t zero_src)
5826 intptr_t vx = vx_;
5827 intptr_t unit_x = unit_x_;
5828 BILINEAR_DECLARE_VARIABLES;
5829 uint32_t pix1, pix2;
5831 while (w && ((uintptr_t)dst & 15))
5833 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5835 if (pix1)
5837 pix2 = *dst;
5838 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5841 w--;
5842 dst++;
5845 while (w >= 4)
5847 __m128i xmm_src;
5848 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5849 __m128i xmm_alpha_hi, xmm_alpha_lo;
5851 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5853 if (!is_zero (xmm_src))
5855 if (is_opaque (xmm_src))
5857 save_128_aligned ((__m128i *)dst, xmm_src);
5859 else
5861 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5863 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5864 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5866 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5867 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5868 &xmm_dst_lo, &xmm_dst_hi);
5870 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5874 w -= 4;
5875 dst += 4;
5878 while (w)
5880 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5882 if (pix1)
5884 pix2 = *dst;
5885 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5888 w--;
5889 dst++;
5893 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5894 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5895 uint32_t, uint32_t, uint32_t,
5896 COVER, FLAG_NONE)
5897 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5898 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5899 uint32_t, uint32_t, uint32_t,
5900 PAD, FLAG_NONE)
5901 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5902 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5903 uint32_t, uint32_t, uint32_t,
5904 NONE, FLAG_NONE)
5905 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5906 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5907 uint32_t, uint32_t, uint32_t,
5908 NORMAL, FLAG_NONE)
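/*
 * Bilinear-scaled OVER with an a8 mask: four mask bytes are fetched at once
 * as a uint32_t.  m == 0 skips the block via BILINEAR_SKIP_FOUR_PIXELS,
 * m == 0xffffffff together with an opaque source is a plain store, and
 * everything else goes through in_over_2x128 () with the mask expanded by
 * expand_alpha_rev_2x128 ().
 */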
5910 static force_inline void
5911 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5912 const uint8_t * mask,
5913 const uint32_t * src_top,
5914 const uint32_t * src_bottom,
5915 int32_t w,
5916 int wt,
5917 int wb,
5918 pixman_fixed_t vx_,
5919 pixman_fixed_t unit_x_,
5920 pixman_fixed_t max_vx,
5921 pixman_bool_t zero_src)
5923 intptr_t vx = vx_;
5924 intptr_t unit_x = unit_x_;
5925 BILINEAR_DECLARE_VARIABLES;
5926 uint32_t pix1, pix2;
5927 uint32_t m;
5929 while (w && ((uintptr_t)dst & 15))
5931 uint32_t sa;
5933 m = (uint32_t) *mask++;
5935 if (m)
5937 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5938 sa = pix1 >> 24;
5940 if (sa == 0xff && m == 0xff)
5942 *dst = pix1;
5944 else
5946 __m128i ms, md, ma, msa;
5948 pix2 = *dst;
5949 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5950 ms = unpack_32_1x128 (pix1);
5951 md = unpack_32_1x128 (pix2);
5953 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5955 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5958 else
5960 BILINEAR_SKIP_ONE_PIXEL ();
5963 w--;
5964 dst++;
5967 while (w >= 4)
5969 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5970 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5971 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5973 m = *(uint32_t*)mask;
5975 if (m)
5977 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5979 if (m == 0xffffffff && is_opaque (xmm_src))
5981 save_128_aligned ((__m128i *)dst, xmm_src);
5983 else
5985 xmm_dst = load_128_aligned ((__m128i *)dst);
5987 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5989 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5990 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5991 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5993 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5994 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5996 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5997 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5999 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6002 else
6004 BILINEAR_SKIP_FOUR_PIXELS ();
6007 w -= 4;
6008 dst += 4;
6009 mask += 4;
6012 while (w)
6014 uint32_t sa;
6016 m = (uint32_t) *mask++;
6018 if (m)
6020 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6021 sa = pix1 >> 24;
6023 if (sa == 0xff && m == 0xff)
6025 *dst = pix1;
6027 else
6029 __m128i ms, md, ma, msa;
6031 pix2 = *dst;
6032 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6033 ms = unpack_32_1x128 (pix1);
6034 md = unpack_32_1x128 (pix2);
6036 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6038 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6041 else
6043 BILINEAR_SKIP_ONE_PIXEL ();
6046 w--;
6047 dst++;
6051 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6052 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6053 uint32_t, uint8_t, uint32_t,
6054 COVER, FLAG_HAVE_NON_SOLID_MASK)
6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6056 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6057 uint32_t, uint8_t, uint32_t,
6058 PAD, FLAG_HAVE_NON_SOLID_MASK)
6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6060 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6061 uint32_t, uint8_t, uint32_t,
6062 NONE, FLAG_HAVE_NON_SOLID_MASK)
6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6064 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6065 uint32_t, uint8_t, uint32_t,
6066 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
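/*
 * Bilinear-scaled OVER with a solid mask: as in the nearest-scaled case, the
 * mask alpha is expanded once into xmm_mask and applied with in_over; the
 * scanline returns immediately when the source is known to be zero or the
 * mask alpha is 0.
 */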
6068 static force_inline void
6069 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
6070 const uint32_t * mask,
6071 const uint32_t * src_top,
6072 const uint32_t * src_bottom,
6073 int32_t w,
6074 int wt,
6075 int wb,
6076 pixman_fixed_t vx_,
6077 pixman_fixed_t unit_x_,
6078 pixman_fixed_t max_vx,
6079 pixman_bool_t zero_src)
6081 intptr_t vx = vx_;
6082 intptr_t unit_x = unit_x_;
6083 BILINEAR_DECLARE_VARIABLES;
6084 uint32_t pix1;
6085 __m128i xmm_mask;
6087 if (zero_src || (*mask >> 24) == 0)
6088 return;
6090 xmm_mask = create_mask_16_128 (*mask >> 24);
6092 while (w && ((uintptr_t)dst & 15))
6094 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6095 if (pix1)
6097 uint32_t d = *dst;
6099 __m128i ms = unpack_32_1x128 (pix1);
6100 __m128i alpha = expand_alpha_1x128 (ms);
6101 __m128i dest = xmm_mask;
6102 __m128i alpha_dst = unpack_32_1x128 (d);
6104 *dst = pack_1x128_32
6105 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6108 dst++;
6109 w--;
6112 while (w >= 4)
6114 __m128i xmm_src;
6115 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6117 if (!is_zero (xmm_src))
6119 __m128i xmm_src_lo, xmm_src_hi;
6120 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6121 __m128i xmm_alpha_lo, xmm_alpha_hi;
6123 xmm_dst = load_128_aligned ((__m128i*)dst);
6125 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6126 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6127 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6128 &xmm_alpha_lo, &xmm_alpha_hi);
6130 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6131 &xmm_alpha_lo, &xmm_alpha_hi,
6132 &xmm_mask, &xmm_mask,
6133 &xmm_dst_lo, &xmm_dst_hi);
6135 save_128_aligned
6136 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6139 dst += 4;
6140 w -= 4;
6143 while (w)
6145 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6146 if (pix1)
6148 uint32_t d = *dst;
6150 __m128i ms = unpack_32_1x128 (pix1);
6151 __m128i alpha = expand_alpha_1x128 (ms);
6152 __m128i dest = xmm_mask;
6153 __m128i alpha_dst = unpack_32_1x128 (d);
6155 *dst = pack_1x128_32
6156 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6159 dst++;
6160 w--;
6164 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6165 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6166 uint32_t, uint32_t, uint32_t,
6167 COVER, FLAG_HAVE_SOLID_MASK)
6168 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6169 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6170 uint32_t, uint32_t, uint32_t,
6171 PAD, FLAG_HAVE_SOLID_MASK)
6172 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6173 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6174 uint32_t, uint32_t, uint32_t,
6175 NONE, FLAG_HAVE_SOLID_MASK)
6176 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6177 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6178 uint32_t, uint32_t, uint32_t,
6179 NORMAL, FLAG_HAVE_SOLID_MASK)
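/*
 * Fast path table: pixman scans it in order and uses the first entry whose
 * operator and source/mask/destination formats (and flags) match the
 * request.  The SIMPLE_NEAREST_* and SIMPLE_BILINEAR_* macros expand to
 * entries that hook up the scaled fetchers defined above; the table is
 * terminated by PIXMAN_OP_NONE.
 */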
6181 static const pixman_fast_path_t sse2_fast_paths[] =
6183 /* PIXMAN_OP_OVER */
6184 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6185 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6186 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6187 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6188 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6189 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6190 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6191 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6192 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6193 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6194 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6195 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6196 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6197 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6198 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6199 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6200 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6201 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6202 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6203 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6204 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6205 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6206 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6207 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6208 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6209 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6210 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6211 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6212 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6213 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6214 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6215 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6216 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6217 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6218 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6219 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6220 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6221 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6222 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6223 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6224 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6225 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6226 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6227 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6228 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6229 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6230 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6232 /* PIXMAN_OP_OVER_REVERSE */
6233 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6234 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6236 /* PIXMAN_OP_ADD */
6237 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6238 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6239 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6240 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6241 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6242 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6243 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6244 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6245 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6246 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6247 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6248 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6249 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6250 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6252 /* PIXMAN_OP_SRC */
6253 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6254 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6255 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6256 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6257 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6258 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6259 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6260 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6261 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6262 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6263 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6264 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6265 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6266 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6267 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6268 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6269 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6270 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6272 /* PIXMAN_OP_IN */
6273 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6274 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6275 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6277 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6278 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6279 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6280 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6282 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6283 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6284 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6285 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6287 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6288 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6289 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6290 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6291 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6292 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6294 SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6295 SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6296 SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6297 SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6298 SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6299 SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6301 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6302 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6303 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6304 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6306 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6307 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6308 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6309 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6311 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6312 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6313 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6314 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6316 { PIXMAN_OP_NONE },
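/*
 * Iterator fetchers: each converts one scanline of the source image into the
 * iterator's a8r8g8b8 buffer and advances iter->bits by one stride.
 * x8r8g8b8 only needs the alpha byte forced to 0xff, r5g6b5 is widened with
 * unpack_565_to_8888 (), and a8 is shifted into the alpha position.
 */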
6319 static uint32_t *
6320 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6322 int w = iter->width;
6323 __m128i ff000000 = mask_ff000000;
6324 uint32_t *dst = iter->buffer;
6325 uint32_t *src = (uint32_t *)iter->bits;
6327 iter->bits += iter->stride;
6329 while (w && ((uintptr_t)dst) & 0x0f)
6331 *dst++ = (*src++) | 0xff000000;
6332 w--;
6335 while (w >= 4)
6337 save_128_aligned (
6338 (__m128i *)dst, _mm_or_si128 (
6339 load_128_unaligned ((__m128i *)src), ff000000));
6341 dst += 4;
6342 src += 4;
6343 w -= 4;
6346 while (w)
6348 *dst++ = (*src++) | 0xff000000;
6349 w--;
6352 return iter->buffer;
6355 static uint32_t *
6356 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6358 int w = iter->width;
6359 uint32_t *dst = iter->buffer;
6360 uint16_t *src = (uint16_t *)iter->bits;
6361 __m128i ff000000 = mask_ff000000;
6363 iter->bits += iter->stride;
6365 while (w && ((uintptr_t)dst) & 0x0f)
6367 uint16_t s = *src++;
6369 *dst++ = convert_0565_to_8888 (s);
6370 w--;
6373 while (w >= 8)
6375 __m128i lo, hi, s;
6377 s = _mm_loadu_si128 ((__m128i *)src);
6379 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6380 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6382 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6383 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6385 dst += 8;
6386 src += 8;
6387 w -= 8;
6390 while (w)
6392 uint16_t s = *src++;
6394 *dst++ = convert_0565_to_8888 (s);
6395 w--;
6398 return iter->buffer;
6401 static uint32_t *
6402 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6404 int w = iter->width;
6405 uint32_t *dst = iter->buffer;
6406 uint8_t *src = iter->bits;
6407 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6409 iter->bits += iter->stride;
6411 while (w && (((uintptr_t)dst) & 15))
6413 *dst++ = *(src++) << 24;
6414 w--;
6417 while (w >= 16)
6419 xmm0 = _mm_loadu_si128((__m128i *)src);
6421 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6422 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6423 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6424 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6425 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6426 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6428 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6429 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6430 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6431 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6433 dst += 16;
6434 src += 16;
6435 w -= 16;
6438 while (w)
6440 *dst++ = *(src++) << 24;
6441 w--;
6444 return iter->buffer;
6447 #define IMAGE_FLAGS \
6448 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
6449 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6451 static const pixman_iter_info_t sse2_iters[] =
6453 { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
6454 _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
6456 { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
6457 _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
6459 { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
6460 _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
6462 { PIXMAN_null },
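/*
 * On 32-bit x86 the default calling convention only guarantees 4-byte stack
 * alignment, so force_align_arg_pointer is used to realign the stack before
 * running SSE2 code that may spill 16-byte values.
 */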
6465 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6466 __attribute__((__force_align_arg_pointer__))
6467 #endif
6468 pixman_implementation_t *
6469 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6471 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6473 /* SSE2 constants */
6474 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6475 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6476 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6477 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6478 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6479 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6480 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6481 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6482 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6483 mask_0080 = create_mask_16_128 (0x0080);
6484 mask_00ff = create_mask_16_128 (0x00ff);
6485 mask_0101 = create_mask_16_128 (0x0101);
6486 mask_ffff = create_mask_16_128 (0xffff);
6487 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6488 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6489 mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6490 mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6492 /* Set up function pointers */
6493 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6494 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6495 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6496 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6497 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6498 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6499 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6500 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6501 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6502 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6504 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6506 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6507 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6508 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6509 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6510 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6511 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6512 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6513 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6514 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6515 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6516 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6518 imp->blt = sse2_blt;
6519 imp->fill = sse2_fill;
6521 imp->iter_info = sse2_iters;
6523 return imp;