/*
 * Copyright © 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Luca Barbato (lu_zero@gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
static vector unsigned int mask_ff000000;
static vector unsigned int mask_red;
static vector unsigned int mask_green;
static vector unsigned int mask_blue;
static vector unsigned int mask_565_fix_rb;
static vector unsigned int mask_565_fix_g;
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
#ifdef WORDS_BIGENDIAN
    return vec_perm (pix, pix,
                     (vector unsigned char)AVV (
                         0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
                         0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
#else
    return vec_perm (pix, pix,
                     (vector unsigned char)AVV (
                         0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
                         0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
#endif
}
static force_inline vector unsigned int
splat_pixel (vector unsigned int pix)
{
    return vec_perm (pix, pix,
                     (vector unsigned char)AVV (
                         0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
                         0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
}
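/*
 * pix_multiply below multiplies two sets of four ARGB pixels channel by
 * channel.  A note on the arithmetic: after widening each byte to 16 bits,
 * the sequence
 *
 *     t = x * a + 0x80;
 *     t = t + (t >> 8);
 *     result = t >> 8;
 *
 * is the usual bit-exact approximation of x * a / 255 with rounding, the
 * same trick pixman's scalar UN8x4_MUL_UN8 macros use.
 */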
static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    hi = (vector unsigned short)
#ifdef WORDS_BIGENDIAN
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);
#else
        vec_mergeh ((vector unsigned char) p,
                    (vector unsigned char) AVV (0));
#endif

    mod = (vector unsigned short)
#ifdef WORDS_BIGENDIAN
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);
#else
        vec_mergeh ((vector unsigned char) a,
                    (vector unsigned char) AVV (0));
#endif

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack to short */
    lo = (vector unsigned short)
#ifdef WORDS_BIGENDIAN
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);
#else
        vec_mergel ((vector unsigned char) p,
                    (vector unsigned char) AVV (0));
#endif

    mod = (vector unsigned short)
#ifdef WORDS_BIGENDIAN
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);
#else
        vec_mergel ((vector unsigned char) a,
                    (vector unsigned char) AVV (0));
#endif

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}
static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    return (vector unsigned int)vec_adds ((vector unsigned char)a,
                                          (vector unsigned char)b);
}
static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
             vector unsigned int a,
             vector unsigned int y,
             vector unsigned int b)
{
    vector unsigned int t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}
static force_inline vector unsigned int
negate (vector unsigned int src)
{
    return vec_nor (src, src);
}
/* dest*~srca + src */
static force_inline vector unsigned int
over (vector unsigned int src,
      vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned char tmp = (vector unsigned char)
        pix_multiply (dest, negate (srca));

    tmp = vec_adds ((vector unsigned char)src, tmp);
    return (vector unsigned int)tmp;
}
/* in == pix_multiply */
#define in_over(src, srca, mask, dest)   \
    over (pix_multiply (src, mask),      \
          pix_multiply (srca, mask), dest)
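/*
 * in_over (src, srca, mask, dest) computes (src IN mask) OVER dest: both the
 * source and its alpha are first multiplied by the mask, then composited
 * onto the destination with over().  The component-alpha combiners and the
 * solid-source fast paths below rely on it.
 */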
#ifdef WORDS_BIGENDIAN

#define COMPUTE_SHIFT_MASK(source)			\
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source)		\
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKC(dest, source, mask)		\
    mask ## _mask = vec_lvsl (0, mask);			\
    source ## _mask = vec_lvsl (0, source);

#define LOAD_VECTOR(source)				\
do							\
{							\
    vector unsigned char tmp1, tmp2;			\
    tmp1 = (typeof(tmp1))vec_ld (0, source);		\
    tmp2 = (typeof(tmp2))vec_ld (15, source);		\
    v ## source = (typeof(v ## source))			\
        vec_perm (tmp1, tmp2, source ## _mask);		\
} while (0)

#define LOAD_VECTORS(dest, source)			\
do							\
{							\
    LOAD_VECTOR(source);				\
    v ## dest = (typeof(v ## dest))vec_ld (0, dest);	\
} while (0)

#define LOAD_VECTORSC(dest, source, mask)		\
do							\
{							\
    LOAD_VECTORS(dest, source);				\
    LOAD_VECTOR(mask);					\
} while (0)

#define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
#define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask

#else

/* Now the COMPUTE_SHIFT_{MASK, MASKS, MASKC} below are just no-op.
 * They are defined that way because little endian altivec can do unaligned
 * reads natively and have no need for constructing the permutation pattern
 * vectors.
 */
#define COMPUTE_SHIFT_MASK(source)

#define COMPUTE_SHIFT_MASKS(dest, source)

#define COMPUTE_SHIFT_MASKC(dest, source, mask)

# define LOAD_VECTOR(source)				\
    v ## source = *((typeof(v ## source)*)source);

# define LOAD_VECTORS(dest, source)			\
    LOAD_VECTOR(source);				\
    LOAD_VECTOR(dest);

# define LOAD_VECTORSC(dest, source, mask)		\
    LOAD_VECTORS(dest, source);				\
    LOAD_VECTOR(mask);

#define DECLARE_SRC_MASK_VAR
#define DECLARE_MASK_MASK_VAR

#endif /* WORDS_BIGENDIAN */
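/*
 * The LOAD_VECTOR* macros above rely on the naming convention used
 * throughout this file: for a pointer variable named "src" there is a
 * corresponding vector variable named "vsrc" (the macros paste "v" onto the
 * argument name).  On big endian targets unaligned loads are done with two
 * vec_ld plus a vec_perm using the mask computed by COMPUTE_SHIFT_MASK*;
 * on little endian targets a plain unaligned dereference is enough.
 */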
#define LOAD_VECTORSM(dest, source, mask)		\
    LOAD_VECTORSC (dest, source, mask);			\
    v ## source = pix_multiply (v ## source,		\
                                splat_alpha (v ## mask));

#define STORE_VECTOR(dest)				\
    vec_st ((vector unsigned int) v ## dest, 0, dest);
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline vector unsigned int
load_128_aligned (const uint32_t* src)
{
    return *((vector unsigned int *) src);
}
/* load 4 pixels from an unaligned address */
static force_inline vector unsigned int
load_128_unaligned (const uint32_t* src)
{
    vector unsigned int vsrc;
    DECLARE_SRC_MASK_VAR;

    COMPUTE_SHIFT_MASK (src);
    LOAD_VECTOR (src);

    return vsrc;
}
/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (uint32_t* data,
                  vector unsigned int vdata)
{
    STORE_VECTOR (data)
}
static force_inline vector unsigned int
create_mask_1x32_128 (const uint32_t *src)
{
    vector unsigned int vsrc;
    DECLARE_SRC_MASK_VAR;

    COMPUTE_SHIFT_MASK (src);
    LOAD_VECTOR (src);
    return vec_splat(vsrc, 0);
}
static force_inline vector unsigned int
create_mask_32_128 (uint32_t mask)
{
    return create_mask_1x32_128(&mask);
}
static force_inline vector unsigned int
unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
{
    vector unsigned char lo;

    /* unpack to short */
    lo = (vector unsigned char)
#ifdef WORDS_BIGENDIAN
        vec_mergel ((vector unsigned char) data2,
                    (vector unsigned char) data1);
#else
        vec_mergel ((vector unsigned char) data1,
                    (vector unsigned char) data2);
#endif

    return (vector unsigned int) lo;
}
static force_inline vector unsigned int
unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
{
    vector unsigned char hi;

    /* unpack to short */
    hi = (vector unsigned char)
#ifdef WORDS_BIGENDIAN
        vec_mergeh ((vector unsigned char) data2,
                    (vector unsigned char) data1);
#else
        vec_mergeh ((vector unsigned char) data1,
                    (vector unsigned char) data2);
#endif

    return (vector unsigned int) hi;
}
static force_inline vector unsigned int
unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
{
    vector unsigned short lo;

    lo = (vector unsigned short)
#ifdef WORDS_BIGENDIAN
        vec_mergel ((vector unsigned short) data2,
                    (vector unsigned short) data1);
#else
        vec_mergel ((vector unsigned short) data1,
                    (vector unsigned short) data2);
#endif

    return (vector unsigned int) lo;
}
static force_inline vector unsigned int
unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
{
    vector unsigned short hi;

    hi = (vector unsigned short)
#ifdef WORDS_BIGENDIAN
        vec_mergeh ((vector unsigned short) data2,
                    (vector unsigned short) data1);
#else
        vec_mergeh ((vector unsigned short) data1,
                    (vector unsigned short) data2);
#endif

    return (vector unsigned int) hi;
}
static force_inline void
unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
                  vector unsigned int* data_lo, vector unsigned int* data_hi)
{
    *data_lo = unpacklo_128_16x8(data1, data2);
    *data_hi = unpackhi_128_16x8(data1, data2);
}
static force_inline void
unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
                     vector unsigned int* data_lo, vector unsigned int* data_hi)
{
    *data_lo = unpacklo_128_8x16(data1, data2);
    *data_hi = unpackhi_128_8x16(data1, data2);
}
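/*
 * unpack_565_to_8888 below expands r5g6b5 values to x8r8g8b8.  Each field is
 * shifted into its 8-bit position (red << 8, green << 5, blue << 3) and
 * masked; the 565_fix masks then replicate the top bits of each field into
 * the freshly opened low bits, so that a full 5-bit value 0x1f expands to
 * 0xff rather than 0xf8.
 */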
static force_inline vector unsigned int
unpack_565_to_8888 (vector unsigned int lo)
{
    vector unsigned int r, g, b, rb, t;

    r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
    g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
    b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);

    rb = vec_or (r, b);
    t = vec_and (rb, mask_565_fix_rb);
    t = vec_sr (t, create_mask_32_128(5));
    rb = vec_or (rb, t);

    t = vec_and (g, mask_565_fix_g);
    t = vec_sr (t, create_mask_32_128(6));
    g = vec_or (g, t);

    return vec_or (rb, g);
}
static force_inline int
is_opaque (vector unsigned int x)
{
    uint32_t cmp_result;
    vector bool int ffs = vec_cmpeq(x, x);

    cmp_result = vec_all_eq(x, ffs);

    return (cmp_result & 0x8888) == 0x8888;
}
static force_inline int
is_zero (vector unsigned int x)
{
    uint32_t cmp_result;

    cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));

    return cmp_result == 0xffff;
}
static force_inline int
is_transparent (vector unsigned int x)
{
    uint32_t cmp_result;

    cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
    return (cmp_result & 0x8888) == 0x8888;
}
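/*
 * is_opaque, is_zero and is_transparent test four pixels at a time: all
 * alpha bytes equal to 0xff, the whole vector equal to zero, and all alpha
 * bytes equal to zero respectively.  The nearest-neighbour scaling loop
 * below uses them to skip the blend entirely for fully opaque or fully
 * transparent source blocks.
 */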
static force_inline uint32_t
core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
{
    uint32_t a = ALPHA_8 (src);

    UN8x4_MUL_UN8_ADD_UN8x4 (dst, (~a & MASK), src);

    return dst;
}
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
        UN8x4_MUL_UN8 (s, ALPHA_8 (*pm));

    return s;
}
static force_inline vector unsigned int
combine4 (const uint32_t* ps, const uint32_t* pm)
{
    vector unsigned int src, msk;

    if (pm)
    {
        msk = load_128_unaligned(pm);

        if (is_transparent(msk))
            return (vector unsigned int) AVV(0);
    }

    src = load_128_unaligned(ps);

    if (pm)
        src = pix_multiply(src, msk);

    return src;
}
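/*
 * All of the vmx_combine_* functions below follow the same pattern: a scalar
 * loop handles leading pixels until the destination pointer is 16-byte
 * aligned, the main loop then processes four pixels per iteration with the
 * vector helpers above, and a final scalar loop mops up the remaining
 * width % 4 pixels.
 */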
vmx_combine_over_u_no_mask (uint32_t * dest,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = over (vsrc, splat_alpha (vsrc), vdest);

    for (i = width % 4; --i >= 0;)
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
vmx_combine_over_u_mask (uint32_t * dest,
                         const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = over (vsrc, splat_alpha (vsrc), vdest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
vmx_combine_over_u (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,

    vmx_combine_over_u_mask (dest, src, mask, width);
    vmx_combine_over_u_no_mask (dest, src, width);
vmx_combine_over_reverse_u_no_mask (uint32_t * dest,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = over (vdest, splat_alpha (vdest), vsrc);

    for (i = width % 4; --i >= 0;)
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
vmx_combine_over_reverse_u_mask (uint32_t * dest,
                                 const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = over (vdest, splat_alpha (vdest), vsrc);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t ia = ALPHA_8 (~dest[i]);
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            const uint32_t * src,
                            const uint32_t * mask,

    vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    vmx_combine_over_reverse_u_no_mask (dest, src, width);
vmx_combine_in_u_no_mask (uint32_t * dest,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = ALPHA_8 (*dest);
        UN8x4_MUL_UN8 (s, a);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_multiply (vsrc, splat_alpha (vdest));

    for (i = width % 4; --i >= 0;)
        uint32_t a = ALPHA_8 (dest[i]);
        UN8x4_MUL_UN8 (s, a);
vmx_combine_in_u_mask (uint32_t * dest,
                       const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t a = ALPHA_8 (*dest);
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_multiply (vsrc, splat_alpha (vdest));

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t a = ALPHA_8 (dest[i]);
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);
vmx_combine_in_u (pixman_implementation_t *imp,
                  const uint32_t * src,
                  const uint32_t * mask,

    vmx_combine_in_u_mask (dest, src, mask, width);
    vmx_combine_in_u_no_mask (dest, src, width);
vmx_combine_in_reverse_u_no_mask (uint32_t * dest,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = ALPHA_8 (*src++);
        UN8x4_MUL_UN8 (d, a);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_multiply (vdest, splat_alpha (vsrc));

    for (i = width % 4; --i >= 0;)
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (src[i]);
        UN8x4_MUL_UN8 (d, a);
vmx_combine_in_reverse_u_mask (uint32_t * dest,
                               const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        UN8x4_MUL_UN8 (a, m);
        UN8x4_MUL_UN8 (d, a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_multiply (vdest, splat_alpha (vsrc));

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        UN8x4_MUL_UN8 (a, m);
        UN8x4_MUL_UN8 (d, a);
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          const uint32_t * src,
                          const uint32_t * mask,

    vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    vmx_combine_in_reverse_u_no_mask (dest, src, width);
vmx_combine_out_u_no_mask (uint32_t * dest,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = ALPHA_8 (~(*dest));
        UN8x4_MUL_UN8 (s, a);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

    for (i = width % 4; --i >= 0;)
        uint32_t a = ALPHA_8 (~dest[i]);
        UN8x4_MUL_UN8 (s, a);
vmx_combine_out_u_mask (uint32_t * dest,
                        const uint32_t *src,
                        const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t a = ALPHA_8 (~(*dest));
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t a = ALPHA_8 (~dest[i]);
        UN8x4_MUL_UN8 (s, m);
        UN8x4_MUL_UN8 (s, a);
vmx_combine_out_u (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,

    vmx_combine_out_u_mask (dest, src, mask, width);
    vmx_combine_out_u_no_mask (dest, src, width);
vmx_combine_out_reverse_u_no_mask (uint32_t * dest,
                                   const uint32_t *src,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = ALPHA_8 (~(*src++));
        UN8x4_MUL_UN8 (d, a);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t d = dest[i];
        uint32_t a = ALPHA_8 (~src[i]);
        UN8x4_MUL_UN8 (d, a);
vmx_combine_out_reverse_u_mask (uint32_t * dest,
                                const uint32_t *src,
                                const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t a = *src++;
        UN8x4_MUL_UN8 (a, m);
        UN8x4_MUL_UN8 (d, a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t d = dest[i];
        uint32_t a = src[i];
        UN8x4_MUL_UN8 (a, m);
        UN8x4_MUL_UN8 (d, a);
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           const uint32_t * src,
                           const uint32_t * mask,

    vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    vmx_combine_out_reverse_u_no_mask (dest, src, width);
vmx_combine_atop_u_no_mask (uint32_t * dest,
                            const uint32_t *src,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t s = *src++;
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             vdest, splat_alpha (negate (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        uint32_t src_ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
vmx_combine_atop_u_mask (uint32_t * dest,
                         const uint32_t *src,
                         const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t dest_a = ALPHA_8 (d);
        UN8x4_MUL_UN8 (s, m);
        src_ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             vdest, splat_alpha (negate (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_a = ALPHA_8 (d);
        UN8x4_MUL_UN8 (s, m);
        src_ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
vmx_combine_atop_u (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,

    vmx_combine_atop_u_mask (dest, src, mask, width);
    vmx_combine_atop_u_no_mask (dest, src, width);
vmx_combine_atop_reverse_u_no_mask (uint32_t * dest,
                                    const uint32_t *src,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t s = *src++;
        uint32_t src_a = ALPHA_8 (s);
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
                             vsrc, splat_alpha (negate (vdest)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_a = ALPHA_8 (s);
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
vmx_combine_atop_reverse_u_mask (uint32_t * dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8 (s, m);
        src_a = ALPHA_8 (s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_add_mul (vdest, splat_alpha (vsrc),
                             vsrc, splat_alpha (negate (vdest)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8 (s, m);
        src_a = ALPHA_8 (s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            const uint32_t * src,
                            const uint32_t * mask,

    vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    vmx_combine_atop_reverse_u_no_mask (dest, src, width);
vmx_combine_xor_u_no_mask (uint32_t * dest,
                           const uint32_t *src,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t s = *src++;
        uint32_t src_ia = ALPHA_8 (~s);
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
                             vdest, splat_alpha (negate (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t src_ia = ALPHA_8 (~s);
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
vmx_combine_xor_u_mask (uint32_t * dest,
                        const uint32_t *src,
                        const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8 (s, m);
        src_ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
                             vdest, splat_alpha (negate (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t dest_ia = ALPHA_8 (~d);
        UN8x4_MUL_UN8 (s, m);
        src_ia = ALPHA_8 (~s);
        UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
vmx_combine_xor_u (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,

    vmx_combine_xor_u_mask (dest, src, mask, width);
    vmx_combine_xor_u_no_mask (dest, src, width);
vmx_combine_add_u_no_mask (uint32_t * dest,
                           const uint32_t *src,

    vector unsigned int vdest, vsrc;
    DECLARE_SRC_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t s = *src++;
        UN8x4_ADD_UN8x4 (d, s);

    COMPUTE_SHIFT_MASKS (dest, src);
    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORS (dest, src);
        vdest = pix_add (vsrc, vdest);
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t s = src[i];
        uint32_t d = dest[i];
        UN8x4_ADD_UN8x4 (d, s);
vmx_combine_add_u_mask (uint32_t * dest,
                        const uint32_t *src,
                        const uint32_t *mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t m = ALPHA_8 (*mask++);
        uint32_t s = *src++;
        UN8x4_MUL_UN8 (s, m);
        UN8x4_ADD_UN8x4 (d, s);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSM (dest, src, mask);
        vdest = pix_add (vsrc, vdest);
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t m = ALPHA_8 (mask[i]);
        uint32_t s = src[i];
        uint32_t d = dest[i];
        UN8x4_MUL_UN8 (s, m);
        UN8x4_ADD_UN8x4 (d, s);
vmx_combine_add_u (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,

    vmx_combine_add_u_mask (dest, src, mask, width);
    vmx_combine_add_u_no_mask (dest, src, width);
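/*
 * The *_ca ("component alpha") combiners below differ from the *_u ones in
 * that the mask carries a separate 8-bit weight per colour channel, so the
 * mask pixel is applied with UN8x4_MUL_UN8x4 / pix_multiply instead of being
 * reduced to its alpha byte first.
 */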
vmx_combine_src_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        UN8x4_MUL_UN8x4 (s, a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_multiply (vsrc, vmask);
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        UN8x4_MUL_UN8x4 (s, a);
vmx_combine_over_ca (pixman_implementation_t *imp,
                     const uint32_t * src,
                     const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t sa = ALPHA_8 (s);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t * src,
                             const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t ida = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t ida = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
vmx_combine_in_ca (pixman_implementation_t *imp,
                   const uint32_t * src,
                   const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t da = ALPHA_8 (*dest);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t da = ALPHA_8 (dest[i]);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           const uint32_t * src,
                           const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t sa = ALPHA_8 (*src++);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (src[i]);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, a);
vmx_combine_out_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t da = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_multiply (
            pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t da = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (s, da);
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            const uint32_t * src,
                            const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t sa = ALPHA_8 (s);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, ~a);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_multiply (
            vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4 (d, ~a);
vmx_combine_atop_ca (pixman_implementation_t *imp,
                     const uint32_t * src,
                     const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask, vsrca;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vsrca = splat_alpha (vsrc);
        vsrc = pix_multiply (vsrc, vmask);
        vmask = pix_multiply (vmask, vsrca);
        vdest = pix_add_mul (vsrc, splat_alpha (vdest),
                             negate (vmask), vdest);
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             const uint32_t * src,
                             const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_add_mul (vdest,
                             pix_multiply (vmask, splat_alpha (vsrc)),
                             pix_multiply (vsrc, vmask),
                             negate (splat_alpha (vdest)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
vmx_combine_xor_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_add_mul (vdest,
                             negate (pix_multiply (vmask, splat_alpha (vsrc))),
                             pix_multiply (vsrc, vmask),
                             negate (splat_alpha (vdest)));
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        uint32_t sa = ALPHA_8 (s);
        uint32_t da = ALPHA_8 (~d);
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_MUL_UN8 (a, sa);
        UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
vmx_combine_add_ca (pixman_implementation_t *imp,
                    const uint32_t * src,
                    const uint32_t * mask,

    vector unsigned int vdest, vsrc, vmask;
    DECLARE_SRC_MASK_VAR;
    DECLARE_MASK_MASK_VAR;

    while (width && ((uintptr_t)dest & 15))
        uint32_t a = *mask++;
        uint32_t s = *src++;
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_ADD_UN8x4 (s, d);

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
        LOAD_VECTORSC (dest, src, mask);
        vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
        STORE_VECTOR (dest);

    for (i = width % 4; --i >= 0;)
        uint32_t a = mask[i];
        uint32_t s = src[i];
        uint32_t d = dest[i];
        UN8x4_MUL_UN8x4 (s, a);
        UN8x4_ADD_UN8x4 (s, d);
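/*
 * The vmx_composite_* functions below are whole-operation fast paths: they
 * fetch the operands with PIXMAN_IMAGE_GET_LINE (and _pixman_image_get_solid
 * for solid sources), walk the destination scanline by scanline and either
 * inline the blend or hand each scanline to one of the combiners above.
 */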
vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    int dst_stride, mask_stride;
    uint32_t m, d, s, ia;
    vector unsigned int vsrc, valpha, vmask, vdst;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = ALPHA_8(src);

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = (vector unsigned int) {src, src, src, src};
    valpha = splat_alpha(vsrc);

        const uint8_t *pm = mask_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)dst & 15)
            UN8x4_MUL_UN8 (s, m);
            UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

            m = *((uint32_t*)pm);

            if (srca == 0xff && m == 0xffffffff)
                save_128_aligned(dst, vsrc);

                vmask = splat_pixel((vector unsigned int) {m, m, m, m});

                /* dst is 16-byte aligned */
                vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));

                save_128_aligned(dst, vdst);

            UN8x4_MUL_UN8 (s, m);
            UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
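/*
 * vmx_fill handles 8, 16 and 32 bpp fills.  The fill value is replicated up
 * to 32 bits, splatted into vfiller, and stores are issued in decreasing
 * block sizes (128, 64, 32 and 16 bytes) once the destination has been
 * aligned with narrow scalar stores.
 */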
static pixman_bool_t
vmx_fill (pixman_implementation_t *imp,

    uint32_t byte_width;
    vector unsigned int vfiller;

        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);

        filler = (w << 16) | w;

        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;

        filler = (filler & 0xffff) * 0x00010001;

        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;

    vfiller = create_mask_1x32_128(&filler);

        uint8_t *d = byte_line;
        byte_line += stride;

        if (w >= 1 && ((uintptr_t)d & 1))
            *(uint8_t *)d = filler;

        while (w >= 2 && ((uintptr_t)d & 3))
            *(uint16_t *)d = filler;

        while (w >= 4 && ((uintptr_t)d & 15))
            *(uint32_t *)d = filler;

            vec_st(vfiller, 0, (uint32_t *) d);
            vec_st(vfiller, 0, (uint32_t *) d + 4);
            vec_st(vfiller, 0, (uint32_t *) d + 8);
            vec_st(vfiller, 0, (uint32_t *) d + 12);
            vec_st(vfiller, 0, (uint32_t *) d + 16);
            vec_st(vfiller, 0, (uint32_t *) d + 20);
            vec_st(vfiller, 0, (uint32_t *) d + 24);
            vec_st(vfiller, 0, (uint32_t *) d + 28);

            vec_st(vfiller, 0, (uint32_t *) d);
            vec_st(vfiller, 0, (uint32_t *) d + 4);
            vec_st(vfiller, 0, (uint32_t *) d + 8);
            vec_st(vfiller, 0, (uint32_t *) d + 12);

            vec_st(vfiller, 0, (uint32_t *) d);
            vec_st(vfiller, 0, (uint32_t *) d + 4);

            vec_st(vfiller, 0, (uint32_t *) d);

            *(uint32_t *)d = filler;

            *(uint16_t *)d = filler;

            *(uint8_t *)d = filler;
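/*
 * vmx_composite_src_x888_8888 converts x8r8g8b8 to a8r8g8b8 by OR-ing
 * 0xff000000 into every pixel, 16 pixels per iteration in the vector loop.
 */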
vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (uintptr_t)dst & 15)
            *dst++ = *src++ | 0xff000000;

            vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;

            vmx_src1 = load_128_unaligned (src);
            vmx_src2 = load_128_unaligned (src + 4);
            vmx_src3 = load_128_unaligned (src + 8);
            vmx_src4 = load_128_unaligned (src + 12);

            save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
            save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
            save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
            save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));

            *dst++ = *src++ | 0xff000000;
vmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    int i, w, dst_stride;
    vector unsigned int vdst, vsrc, via;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = (vector unsigned int){src, src, src, src};
    via = negate (splat_alpha (vsrc));
    ia = ALPHA_8 (~src);

        dst_line += dst_stride;

        while (w && ((uintptr_t)dst & 15))
            UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);

        for (i = w / 4; i > 0; i--)
            vdst = pix_multiply (load_128_aligned (dst), via);
            save_128_aligned (dst, pix_add (vsrc, vdst));

        for (i = w % 4; --i >= 0;)
            uint32_t d = dst[i];
            UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    int dst_stride, src_stride;
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

        vmx_combine_over_u (imp, op, dst, src, NULL, width);
vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, d;
    uint32_t *mask_line, m;
    int dst_stride, mask_stride;
    vector unsigned int vsrc, valpha, vmask, vdest;

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = (vector unsigned int) {src, src, src, src};
    valpha = splat_alpha(vsrc);

        const uint32_t *pm = (uint32_t *)mask_line;
        uint32_t *pd = (uint32_t *)dst_line;

        dst_line += dst_stride;
        mask_line += mask_stride;

        while (w && (uintptr_t)pd & 15)
            UN8x4_MUL_UN8x4 (s, m);
            UN8x4_MUL_UN8 (m, ia);
            UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);

            /* pm is NOT necessarily 16-byte aligned */
            vmask = load_128_unaligned (pm);

            pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0));

            /* if all bits in mask are zero, pack_cmp is not 0 */
                /* pd is 16-byte aligned */
                vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));

                save_128_aligned(pd, vdest);

            UN8x4_MUL_UN8x4 (s, m);
            UN8x4_MUL_UN8 (m, ia);
            UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
vmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        while (w && (uintptr_t)dst & 3)
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));

        vmx_combine_add_u (imp, op,
                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)

    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

        dst_line += dst_stride;
        src_line += src_stride;

        vmx_combine_add_u (imp, op, dst, src, NULL, width);
static force_inline void
scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t* pd,
                                            pixman_fixed_t unit_x,
                                            pixman_fixed_t src_width_fixed,
                                            pixman_bool_t fully_transparent_src)

    const uint32_t* pm = NULL;
    vector unsigned int vsrc, vdst;

    if (fully_transparent_src)

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx -= src_width_fixed;
        *pd++ = core_combine_over_u_pixel_vmx (s, d);

        vector unsigned int tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = *(ps + pixman_fixed_to_int (vx));
        vx -= src_width_fixed;
        tmp2 = *(ps + pixman_fixed_to_int (vx));
        vx -= src_width_fixed;
        tmp3 = *(ps + pixman_fixed_to_int (vx));
        vx -= src_width_fixed;
        tmp4 = *(ps + pixman_fixed_to_int (vx));
        vx -= src_width_fixed;

        vsrc = combine4 ((const uint32_t *) &tmp, pm);

        if (is_opaque (vsrc))
            save_128_aligned (pd, vsrc);
        else if (!is_zero (vsrc))
            vdst = over(vsrc, splat_alpha(vsrc), load_128_aligned (pd));
            save_128_aligned (pd, vdst);

        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx -= src_width_fixed;
        *pd++ = core_combine_over_u_pixel_vmx (s, d);
FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, NORMAL)
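/*
 * The scanline functions generated by FAST_NEAREST_MAINLOOP above are
 * referenced by the SIMPLE_NEAREST_FAST_PATH entries in the table below;
 * each row maps an (operator, source, mask, destination) combination to one
 * of the composite functions in this file.
 */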
static const pixman_fast_path_t vmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),

    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),

    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),

    { PIXMAN_OP_NONE },
};
vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)

    int w = iter->width;
    vector unsigned int ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
        *dst++ = (*src++) | 0xff000000;

        save_128_aligned(dst, vec_or(load_128_unaligned(src), ff000000));

        *dst++ = (*src++) | 0xff000000;

    return iter->buffer;
vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)

    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
        *dst++ = *(src++) << 24;

        vmx0 = load_128_unaligned((uint32_t *) src);

        unpack_128_2x128((vector unsigned int) AVV(0), vmx0, &vmx1, &vmx2);
        unpack_128_2x128_16((vector unsigned int) AVV(0), vmx1, &vmx3, &vmx4);
        unpack_128_2x128_16((vector unsigned int) AVV(0), vmx2, &vmx5, &vmx6);

        save_128_aligned(dst, vmx6);
        save_128_aligned((dst + 4), vmx5);
        save_128_aligned((dst + 8), vmx4);
        save_128_aligned((dst + 12), vmx3);

        *dst++ = *(src++) << 24;

    return iter->buffer;
#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
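/* Iterator table: registers the vector fetchers above (vmx_fetch_x8r8g8b8,
 * vmx_fetch_a8) for plain, untransformed bits images read through narrow
 * iterators.
 */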
static const pixman_iter_info_t vmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL },
    { PIXMAN_null },
};
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    mask_ff000000 = create_mask_32_128 (0xff000000);
    mask_red = create_mask_32_128 (0x00f80000);
    mask_green = create_mask_32_128 (0x0000fc00);
    mask_blue = create_mask_32_128 (0x000000f8);
    mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
    mask_565_fix_g = create_mask_32_128 (0x0000c000);

    /* Set up function pointers */

    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;

    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    imp->fill = vmx_fill;

    imp->iter_info = vmx_iters;

    return imp;
}