source/libs/pixman/pixman-src/pixman/pixman-vmx.c
1 /*
2 * Copyright © 2007 Luca Barbato
4 * Permission to use, copy, modify, distribute, and sell this software and its
5 * documentation for any purpose is hereby granted without fee, provided that
6 * the above copyright notice appear in all copies and that both that
7 * copyright notice and this permission notice appear in supporting
8 * documentation, and that the name of Luca Barbato not be used in advertising or
9 * publicity pertaining to distribution of the software without specific,
10 * written prior permission. Luca Barbato makes no representations about the
11 * suitability of this software for any purpose. It is provided "as is"
12 * without express or implied warranty.
14 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
15 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
16 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
17 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
18 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
19 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
20 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
21 * SOFTWARE.
23 * Author: Luca Barbato (lu_zero@gentoo.org)
25 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */
28 #ifdef HAVE_CONFIG_H
29 #include <config.h>
30 #endif
31 #include "pixman-private.h"
32 #include "pixman-combine32.h"
33 #include "pixman-inlines.h"
34 #include <altivec.h>
36 #define AVV(x...) {x}
38 static vector unsigned int mask_ff000000;
39 static vector unsigned int mask_red;
40 static vector unsigned int mask_green;
41 static vector unsigned int mask_blue;
42 static vector unsigned int mask_565_fix_rb;
43 static vector unsigned int mask_565_fix_g;
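/*
 * splat_alpha() below broadcasts the alpha byte of each a8r8g8b8 pixel into
 * all four byte lanes of that pixel, e.g. 0xAARRGGBB -> 0xAAAAAAAA for every
 * pixel in the vector.  The permute indices differ between big and little
 * endian because the byte order of each 32-bit pixel differs.
 */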
45 static force_inline vector unsigned int
46 splat_alpha (vector unsigned int pix)
48 #ifdef WORDS_BIGENDIAN
49 return vec_perm (pix, pix,
50 (vector unsigned char)AVV (
51 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
52 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
53 #else
54 return vec_perm (pix, pix,
55 (vector unsigned char)AVV (
56 0x03, 0x03, 0x03, 0x03, 0x07, 0x07, 0x07, 0x07,
57 0x0B, 0x0B, 0x0B, 0x0B, 0x0F, 0x0F, 0x0F, 0x0F));
58 #endif
61 static force_inline vector unsigned int
62 splat_pixel (vector unsigned int pix)
64 return vec_perm (pix, pix,
65 (vector unsigned char)AVV (
66 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01,
67 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03));
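/*
 * pix_multiply() multiplies two vectors of 8-bit channels and renormalizes
 * the 16-bit products back to 8 bits.  Per channel it computes, roughly,
 *
 *     t = p * a + 0x80;
 *     result = (t + (t >> 8)) >> 8;
 *
 * the usual rounded approximation of (p * a) / 255.  The pixels are widened
 * to shorts (high and low halves), multiplied with vec_mladd() and packed
 * back with vec_packsu().
 */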
70 static force_inline vector unsigned int
71 pix_multiply (vector unsigned int p, vector unsigned int a)
73 vector unsigned short hi, lo, mod;
75 /* unpack to short */
76 hi = (vector unsigned short)
77 #ifdef WORDS_BIGENDIAN
78 vec_mergeh ((vector unsigned char)AVV (0),
79 (vector unsigned char)p);
80 #else
81 vec_mergeh ((vector unsigned char) p,
82 (vector unsigned char) AVV (0));
83 #endif
85 mod = (vector unsigned short)
86 #ifdef WORDS_BIGENDIAN
87 vec_mergeh ((vector unsigned char)AVV (0),
88 (vector unsigned char)a);
89 #else
90 vec_mergeh ((vector unsigned char) a,
91 (vector unsigned char) AVV (0));
92 #endif
94 hi = vec_mladd (hi, mod, (vector unsigned short)
95 AVV (0x0080, 0x0080, 0x0080, 0x0080,
96 0x0080, 0x0080, 0x0080, 0x0080));
98 hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
100 hi = vec_sr (hi, vec_splat_u16 (8));
102 /* unpack to short */
103 lo = (vector unsigned short)
104 #ifdef WORDS_BIGENDIAN
105 vec_mergel ((vector unsigned char)AVV (0),
106 (vector unsigned char)p);
107 #else
108 vec_mergel ((vector unsigned char) p,
109 (vector unsigned char) AVV (0));
110 #endif
112 mod = (vector unsigned short)
113 #ifdef WORDS_BIGENDIAN
114 vec_mergel ((vector unsigned char)AVV (0),
115 (vector unsigned char)a);
116 #else
117 vec_mergel ((vector unsigned char) a,
118 (vector unsigned char) AVV (0));
119 #endif
121 lo = vec_mladd (lo, mod, (vector unsigned short)
122 AVV (0x0080, 0x0080, 0x0080, 0x0080,
123 0x0080, 0x0080, 0x0080, 0x0080));
125 lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
127 lo = vec_sr (lo, vec_splat_u16 (8));
129 return (vector unsigned int)vec_packsu (hi, lo);
132 static force_inline vector unsigned int
133 pix_add (vector unsigned int a, vector unsigned int b)
135 return (vector unsigned int)vec_adds ((vector unsigned char)a,
136 (vector unsigned char)b);
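/*
 * pix_add_mul() computes x * a + y * b per channel, with each product
 * renormalized by pix_multiply() and the sum saturated by pix_add();
 * it is the building block of the ATOP and XOR combiners below.
 */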
139 static force_inline vector unsigned int
140 pix_add_mul (vector unsigned int x,
141 vector unsigned int a,
142 vector unsigned int y,
143 vector unsigned int b)
145 vector unsigned int t1, t2;
147 t1 = pix_multiply (x, a);
148 t2 = pix_multiply (y, b);
150 return pix_add (t1, t2);
153 static force_inline vector unsigned int
154 negate (vector unsigned int src)
156 return vec_nor (src, src);
159 /* dest*~srca + src */
160 static force_inline vector unsigned int
161 over (vector unsigned int src,
162 vector unsigned int srca,
163 vector unsigned int dest)
165 vector unsigned char tmp = (vector unsigned char)
166 pix_multiply (dest, negate (srca));
168 tmp = vec_adds ((vector unsigned char)src, tmp);
169 return (vector unsigned int)tmp;
172 /* in == pix_multiply */
173 #define in_over(src, srca, mask, dest) \
174 over (pix_multiply (src, mask), \
175 pix_multiply (srca, mask), dest)
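/*
 * in_over() is the masked form of over(): both the source and its alpha
 * are first multiplied by the mask, then composited as
 * dest = src + dest * ~srca.
 */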
177 #ifdef WORDS_BIGENDIAN
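/*
 * On big-endian AltiVec, unaligned loads are done the classic way:
 * vec_lvsl() builds a permute control vector from the address, two aligned
 * vec_ld() loads cover the needed bytes, and vec_perm() shifts the wanted
 * 16 bytes into place (see LOAD_VECTOR below).
 */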
179 #define COMPUTE_SHIFT_MASK(source) \
180 source ## _mask = vec_lvsl (0, source);
182 #define COMPUTE_SHIFT_MASKS(dest, source) \
183 source ## _mask = vec_lvsl (0, source);
185 #define COMPUTE_SHIFT_MASKC(dest, source, mask) \
186 mask ## _mask = vec_lvsl (0, mask); \
187 source ## _mask = vec_lvsl (0, source);
189 #define LOAD_VECTOR(source) \
190 do \
192 vector unsigned char tmp1, tmp2; \
193 tmp1 = (typeof(tmp1))vec_ld (0, source); \
194 tmp2 = (typeof(tmp2))vec_ld (15, source); \
195 v ## source = (typeof(v ## source)) \
196 vec_perm (tmp1, tmp2, source ## _mask); \
197 } while (0)
199 #define LOAD_VECTORS(dest, source) \
200 do \
202 LOAD_VECTOR(source); \
203 v ## dest = (typeof(v ## dest))vec_ld (0, dest); \
204 } while (0)
206 #define LOAD_VECTORSC(dest, source, mask) \
207 do \
209 LOAD_VECTORS(dest, source); \
210 LOAD_VECTOR(mask); \
211 } while (0)
213 #define DECLARE_SRC_MASK_VAR vector unsigned char src_mask
214 #define DECLARE_MASK_MASK_VAR vector unsigned char mask_mask
216 #else
218 /* The COMPUTE_SHIFT_{MASK, MASKS, MASKC} macros below are just no-ops.
219  * They are defined that way because little-endian AltiVec can do unaligned
220  * reads natively and has no need for constructing the permutation pattern
221  * variables.
 */
223 #define COMPUTE_SHIFT_MASK(source)
225 #define COMPUTE_SHIFT_MASKS(dest, source)
227 #define COMPUTE_SHIFT_MASKC(dest, source, mask)
229 # define LOAD_VECTOR(source) \
230 v ## source = *((typeof(v ## source)*)source);
232 # define LOAD_VECTORS(dest, source) \
233 LOAD_VECTOR(source); \
234 LOAD_VECTOR(dest); \
236 # define LOAD_VECTORSC(dest, source, mask) \
237 LOAD_VECTORS(dest, source); \
238 LOAD_VECTOR(mask); \
240 #define DECLARE_SRC_MASK_VAR
241 #define DECLARE_MASK_MASK_VAR
243 #endif /* WORDS_BIGENDIAN */
245 #define LOAD_VECTORSM(dest, source, mask) \
246 LOAD_VECTORSC (dest, source, mask); \
247 v ## source = pix_multiply (v ## source, \
248 splat_alpha (v ## mask));
250 #define STORE_VECTOR(dest) \
251 vec_st ((vector unsigned int) v ## dest, 0, dest);
253 /* load 4 pixels from a 16-byte aligned address */
254 static force_inline vector unsigned int
255 load_128_aligned (const uint32_t* src)
257 return *((vector unsigned int *) src);
260 /* load 4 pixels from an unaligned address */
261 static force_inline vector unsigned int
262 load_128_unaligned (const uint32_t* src)
264 vector unsigned int vsrc;
265 DECLARE_SRC_MASK_VAR;
267 COMPUTE_SHIFT_MASK (src);
268 LOAD_VECTOR (src);
270 return vsrc;
273 /* save 4 pixels to a 16-byte aligned address */
274 static force_inline void
275 save_128_aligned (uint32_t* data,
276 vector unsigned int vdata)
278 STORE_VECTOR(data)
281 static force_inline vector unsigned int
282 create_mask_1x32_128 (const uint32_t *src)
284 vector unsigned int vsrc;
285 DECLARE_SRC_MASK_VAR;
287 COMPUTE_SHIFT_MASK (src);
288 LOAD_VECTOR (src);
289 return vec_splat(vsrc, 0);
292 static force_inline vector unsigned int
293 create_mask_32_128 (uint32_t mask)
295 return create_mask_1x32_128(&mask);
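/*
 * create_mask_32_128() broadcasts one 32-bit value to all four lanes (an
 * unaligned load followed by vec_splat()); it is used both for constant
 * masks and for the per-lane shift counts in unpack_565_to_8888().
 */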
298 static force_inline vector unsigned int
299 unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
301 vector unsigned char lo;
303 /* unpack to short */
304 lo = (vector unsigned char)
305 #ifdef WORDS_BIGENDIAN
306 vec_mergel ((vector unsigned char) data2,
307 (vector unsigned char) data1);
308 #else
309 vec_mergel ((vector unsigned char) data1,
310 (vector unsigned char) data2);
311 #endif
313 return (vector unsigned int) lo;
316 static force_inline vector unsigned int
317 unpackhi_128_16x8 (vector unsigned int data1, vector unsigned int data2)
319 vector unsigned char hi;
321 /* unpack to short */
322 hi = (vector unsigned char)
323 #ifdef WORDS_BIGENDIAN
324 vec_mergeh ((vector unsigned char) data2,
325 (vector unsigned char) data1);
326 #else
327 vec_mergeh ((vector unsigned char) data1,
328 (vector unsigned char) data2);
329 #endif
331 return (vector unsigned int) hi;
334 static force_inline vector unsigned int
335 unpacklo_128_8x16 (vector unsigned int data1, vector unsigned int data2)
337 vector unsigned short lo;
339 /* unpack to char */
340 lo = (vector unsigned short)
341 #ifdef WORDS_BIGENDIAN
342 vec_mergel ((vector unsigned short) data2,
343 (vector unsigned short) data1);
344 #else
345 vec_mergel ((vector unsigned short) data1,
346 (vector unsigned short) data2);
347 #endif
349 return (vector unsigned int) lo;
352 static force_inline vector unsigned int
353 unpackhi_128_8x16 (vector unsigned int data1, vector unsigned int data2)
355 vector unsigned short hi;
357 /* unpack to char */
358 hi = (vector unsigned short)
359 #ifdef WORDS_BIGENDIAN
360 vec_mergeh ((vector unsigned short) data2,
361 (vector unsigned short) data1);
362 #else
363 vec_mergeh ((vector unsigned short) data1,
364 (vector unsigned short) data2);
365 #endif
367 return (vector unsigned int) hi;
370 static force_inline void
371 unpack_128_2x128 (vector unsigned int data1, vector unsigned int data2,
372 vector unsigned int* data_lo, vector unsigned int* data_hi)
374 *data_lo = unpacklo_128_16x8(data1, data2);
375 *data_hi = unpackhi_128_16x8(data1, data2);
378 static force_inline void
379 unpack_128_2x128_16 (vector unsigned int data1, vector unsigned int data2,
380 vector unsigned int* data_lo, vector unsigned int* data_hi)
382 *data_lo = unpacklo_128_8x16(data1, data2);
383 *data_hi = unpackhi_128_8x16(data1, data2);
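/*
 * unpack_565_to_8888() expands four r5g6b5 pixels to x8r8g8b8.  Each field
 * is shifted into its 8-bit slot and its top bits are then replicated into
 * the freed low bits so that full-scale values stay full scale, e.g. a
 * 5-bit red r becomes (r << 3) | (r >> 2); mask_565_fix_rb and
 * mask_565_fix_g select the bits to replicate.
 */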
386 static force_inline vector unsigned int
387 unpack_565_to_8888 (vector unsigned int lo)
389 vector unsigned int r, g, b, rb, t;
391 r = vec_and (vec_sl(lo, create_mask_32_128(8)), mask_red);
392 g = vec_and (vec_sl(lo, create_mask_32_128(5)), mask_green);
393 b = vec_and (vec_sl(lo, create_mask_32_128(3)), mask_blue);
395 rb = vec_or (r, b);
396 t = vec_and (rb, mask_565_fix_rb);
397 t = vec_sr (t, create_mask_32_128(5));
398 rb = vec_or (rb, t);
400 t = vec_and (g, mask_565_fix_g);
401 t = vec_sr (t, create_mask_32_128(6));
402 g = vec_or (g, t);
404 return vec_or (rb, g);
407 static force_inline int
408 is_opaque (vector unsigned int x)
410 uint32_t cmp_result;
411 vector bool int ffs = vec_cmpeq(x, x);
413 cmp_result = vec_all_eq(x, ffs);
415 return (cmp_result & 0x8888) == 0x8888;
418 static force_inline int
419 is_zero (vector unsigned int x)
421 uint32_t cmp_result;
423 cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
425 return cmp_result == 0xffff;
428 static force_inline int
429 is_transparent (vector unsigned int x)
431 uint32_t cmp_result;
433 cmp_result = vec_all_eq(x, (vector unsigned int) AVV(0));
434 return (cmp_result & 0x8888) == 0x8888;
437 static force_inline uint32_t
438 core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
440 uint32_t a;
442 a = ALPHA_8(src);
444 if (a == 0xff)
446 return src;
448 else if (src)
450 UN8x4_MUL_UN8_ADD_UN8x4(dst, (~a & MASK), src);
453 return dst;
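/*
 * combine1() and combine4() implement the source fetch for the "unified"
 * (_u) combiners: combine1() multiplies one source pixel by the mask's
 * alpha, while combine4() loads four source pixels, returns zero early when
 * the whole mask block is transparent and otherwise multiplies the source
 * block by the mask.
 */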
456 static force_inline uint32_t
457 combine1 (const uint32_t *ps, const uint32_t *pm)
459 uint32_t s = *ps;
461 if (pm)
462 UN8x4_MUL_UN8(s, ALPHA_8(*pm));
464 return s;
467 static force_inline vector unsigned int
468 combine4 (const uint32_t* ps, const uint32_t* pm)
470 vector unsigned int src, msk;
472 if (pm)
474 msk = load_128_unaligned(pm);
476 if (is_transparent(msk))
477 return (vector unsigned int) AVV(0);
480 src = load_128_unaligned(ps);
482 if (pm)
483 src = pix_multiply(src, msk);
485 return src;
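/*
 * All vmx_combine_*() helpers below share the same structure: a scalar
 * loop handles pixels until dest reaches a 16-byte boundary, a vector loop
 * then processes 4 pixels per iteration with the LOAD_VECTORS / STORE_VECTOR
 * macros, and a scalar tail finishes the remaining (width % 4) pixels with
 * the UN8x4_* macros also used by the generic C combiners.
 */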
488 static void
489 vmx_combine_over_u_no_mask (uint32_t * dest,
490 const uint32_t *src,
491 int width)
493 int i;
494 vector unsigned int vdest, vsrc;
495 DECLARE_SRC_MASK_VAR;
497 while (width && ((uintptr_t)dest & 15))
499 uint32_t s = *src++;
500 uint32_t d = *dest;
501 uint32_t ia = ALPHA_8 (~s);
503 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
505 *dest++ = d;
506 width--;
509 COMPUTE_SHIFT_MASKS (dest, src);
511 /* printf ("%s\n",__PRETTY_FUNCTION__); */
512 for (i = width / 4; i > 0; i--)
515 LOAD_VECTORS (dest, src);
517 vdest = over (vsrc, splat_alpha (vsrc), vdest);
519 STORE_VECTOR (dest);
521 src += 4;
522 dest += 4;
525 for (i = width % 4; --i >= 0;)
527 uint32_t s = src[i];
528 uint32_t d = dest[i];
529 uint32_t ia = ALPHA_8 (~s);
531 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
533 dest[i] = d;
537 static void
538 vmx_combine_over_u_mask (uint32_t * dest,
539 const uint32_t *src,
540 const uint32_t *mask,
541 int width)
543 int i;
544 vector unsigned int vdest, vsrc, vmask;
545 DECLARE_SRC_MASK_VAR;
546 DECLARE_MASK_MASK_VAR;
548 while (width && ((uintptr_t)dest & 15))
550 uint32_t m = ALPHA_8 (*mask++);
551 uint32_t s = *src++;
552 uint32_t d = *dest;
553 uint32_t ia;
555 UN8x4_MUL_UN8 (s, m);
557 ia = ALPHA_8 (~s);
559 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
560 *dest++ = d;
561 width--;
564 COMPUTE_SHIFT_MASKC (dest, src, mask);
566 /* printf ("%s\n",__PRETTY_FUNCTION__); */
567 for (i = width / 4; i > 0; i--)
569 LOAD_VECTORSM (dest, src, mask);
571 vdest = over (vsrc, splat_alpha (vsrc), vdest);
573 STORE_VECTOR (dest);
575 src += 4;
576 dest += 4;
577 mask += 4;
580 for (i = width % 4; --i >= 0;)
582 uint32_t m = ALPHA_8 (mask[i]);
583 uint32_t s = src[i];
584 uint32_t d = dest[i];
585 uint32_t ia;
587 UN8x4_MUL_UN8 (s, m);
589 ia = ALPHA_8 (~s);
591 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
592 dest[i] = d;
596 static void
597 vmx_combine_over_u (pixman_implementation_t *imp,
598 pixman_op_t op,
599 uint32_t * dest,
600 const uint32_t * src,
601 const uint32_t * mask,
602 int width)
604 if (mask)
605 vmx_combine_over_u_mask (dest, src, mask, width);
606 else
607 vmx_combine_over_u_no_mask (dest, src, width);
610 static void
611 vmx_combine_over_reverse_u_no_mask (uint32_t * dest,
612 const uint32_t *src,
613 int width)
615 int i;
616 vector unsigned int vdest, vsrc;
617 DECLARE_SRC_MASK_VAR;
619 while (width && ((uintptr_t)dest & 15))
621 uint32_t s = *src++;
622 uint32_t d = *dest;
623 uint32_t ia = ALPHA_8 (~d);
625 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
626 *dest++ = s;
627 width--;
630 COMPUTE_SHIFT_MASKS (dest, src);
632 /* printf ("%s\n",__PRETTY_FUNCTION__); */
633 for (i = width / 4; i > 0; i--)
636 LOAD_VECTORS (dest, src);
638 vdest = over (vdest, splat_alpha (vdest), vsrc);
640 STORE_VECTOR (dest);
642 src += 4;
643 dest += 4;
646 for (i = width % 4; --i >= 0;)
648 uint32_t s = src[i];
649 uint32_t d = dest[i];
650 uint32_t ia = ALPHA_8 (~dest[i]);
652 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
653 dest[i] = s;
657 static void
658 vmx_combine_over_reverse_u_mask (uint32_t * dest,
659 const uint32_t *src,
660 const uint32_t *mask,
661 int width)
663 int i;
664 vector unsigned int vdest, vsrc, vmask;
665 DECLARE_SRC_MASK_VAR;
666 DECLARE_MASK_MASK_VAR;
668 while (width && ((uintptr_t)dest & 15))
670 uint32_t m = ALPHA_8 (*mask++);
671 uint32_t s = *src++;
672 uint32_t d = *dest;
673 uint32_t ia = ALPHA_8 (~d);
675 UN8x4_MUL_UN8 (s, m);
677 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
678 *dest++ = s;
679 width--;
682 COMPUTE_SHIFT_MASKC (dest, src, mask);
684 /* printf ("%s\n",__PRETTY_FUNCTION__); */
685 for (i = width / 4; i > 0; i--)
688 LOAD_VECTORSM (dest, src, mask);
690 vdest = over (vdest, splat_alpha (vdest), vsrc);
692 STORE_VECTOR (dest);
694 src += 4;
695 dest += 4;
696 mask += 4;
699 for (i = width % 4; --i >= 0;)
701 uint32_t m = ALPHA_8 (mask[i]);
702 uint32_t s = src[i];
703 uint32_t d = dest[i];
704 uint32_t ia = ALPHA_8 (~dest[i]);
706 UN8x4_MUL_UN8 (s, m);
708 UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
709 dest[i] = s;
713 static void
714 vmx_combine_over_reverse_u (pixman_implementation_t *imp,
715 pixman_op_t op,
716 uint32_t * dest,
717 const uint32_t * src,
718 const uint32_t * mask,
719 int width)
721 if (mask)
722 vmx_combine_over_reverse_u_mask (dest, src, mask, width);
723 else
724 vmx_combine_over_reverse_u_no_mask (dest, src, width);
727 static void
728 vmx_combine_in_u_no_mask (uint32_t * dest,
729 const uint32_t *src,
730 int width)
732 int i;
733 vector unsigned int vdest, vsrc;
734 DECLARE_SRC_MASK_VAR;
736 while (width && ((uintptr_t)dest & 15))
738 uint32_t s = *src++;
739 uint32_t a = ALPHA_8 (*dest);
741 UN8x4_MUL_UN8 (s, a);
742 *dest++ = s;
743 width--;
746 COMPUTE_SHIFT_MASKS (dest, src);
748 /* printf ("%s\n",__PRETTY_FUNCTION__); */
749 for (i = width / 4; i > 0; i--)
751 LOAD_VECTORS (dest, src);
753 vdest = pix_multiply (vsrc, splat_alpha (vdest));
755 STORE_VECTOR (dest);
757 src += 4;
758 dest += 4;
761 for (i = width % 4; --i >= 0;)
763 uint32_t s = src[i];
764 uint32_t a = ALPHA_8 (dest[i]);
766 UN8x4_MUL_UN8 (s, a);
767 dest[i] = s;
771 static void
772 vmx_combine_in_u_mask (uint32_t * dest,
773 const uint32_t *src,
774 const uint32_t *mask,
775 int width)
777 int i;
778 vector unsigned int vdest, vsrc, vmask;
779 DECLARE_SRC_MASK_VAR;
780 DECLARE_MASK_MASK_VAR;
782 while (width && ((uintptr_t)dest & 15))
784 uint32_t m = ALPHA_8 (*mask++);
785 uint32_t s = *src++;
786 uint32_t a = ALPHA_8 (*dest);
788 UN8x4_MUL_UN8 (s, m);
789 UN8x4_MUL_UN8 (s, a);
791 *dest++ = s;
792 width--;
795 COMPUTE_SHIFT_MASKC (dest, src, mask);
797 /* printf ("%s\n",__PRETTY_FUNCTION__); */
798 for (i = width / 4; i > 0; i--)
800 LOAD_VECTORSM (dest, src, mask);
802 vdest = pix_multiply (vsrc, splat_alpha (vdest));
804 STORE_VECTOR (dest);
806 src += 4;
807 dest += 4;
808 mask += 4;
811 for (i = width % 4; --i >= 0;)
813 uint32_t m = ALPHA_8 (mask[i]);
814 uint32_t s = src[i];
815 uint32_t a = ALPHA_8 (dest[i]);
817 UN8x4_MUL_UN8 (s, m);
818 UN8x4_MUL_UN8 (s, a);
820 dest[i] = s;
824 static void
825 vmx_combine_in_u (pixman_implementation_t *imp,
826 pixman_op_t op,
827 uint32_t * dest,
828 const uint32_t * src,
829 const uint32_t * mask,
830 int width)
832 if (mask)
833 vmx_combine_in_u_mask (dest, src, mask, width);
834 else
835 vmx_combine_in_u_no_mask (dest, src, width);
838 static void
839 vmx_combine_in_reverse_u_no_mask (uint32_t * dest,
840 const uint32_t *src,
841 int width)
843 int i;
844 vector unsigned int vdest, vsrc;
845 DECLARE_SRC_MASK_VAR;
847 while (width && ((uintptr_t)dest & 15))
849 uint32_t d = *dest;
850 uint32_t a = ALPHA_8 (*src++);
852 UN8x4_MUL_UN8 (d, a);
854 *dest++ = d;
855 width--;
858 COMPUTE_SHIFT_MASKS (dest, src);
860 /* printf ("%s\n",__PRETTY_FUNCTION__); */
861 for (i = width / 4; i > 0; i--)
863 LOAD_VECTORS (dest, src);
865 vdest = pix_multiply (vdest, splat_alpha (vsrc));
867 STORE_VECTOR (dest);
869 src += 4;
870 dest += 4;
873 for (i = width % 4; --i >= 0;)
875 uint32_t d = dest[i];
876 uint32_t a = ALPHA_8 (src[i]);
878 UN8x4_MUL_UN8 (d, a);
880 dest[i] = d;
884 static void
885 vmx_combine_in_reverse_u_mask (uint32_t * dest,
886 const uint32_t *src,
887 const uint32_t *mask,
888 int width)
890 int i;
891 vector unsigned int vdest, vsrc, vmask;
892 DECLARE_SRC_MASK_VAR;
893 DECLARE_MASK_MASK_VAR;
895 while (width && ((uintptr_t)dest & 15))
897 uint32_t m = ALPHA_8 (*mask++);
898 uint32_t d = *dest;
899 uint32_t a = *src++;
901 UN8x4_MUL_UN8 (a, m);
902 a = ALPHA_8 (a);
903 UN8x4_MUL_UN8 (d, a);
905 *dest++ = d;
906 width--;
909 COMPUTE_SHIFT_MASKC (dest, src, mask);
911 /* printf ("%s\n",__PRETTY_FUNCTION__); */
912 for (i = width / 4; i > 0; i--)
914 LOAD_VECTORSM (dest, src, mask);
916 vdest = pix_multiply (vdest, splat_alpha (vsrc));
918 STORE_VECTOR (dest);
920 src += 4;
921 dest += 4;
922 mask += 4;
925 for (i = width % 4; --i >= 0;)
927 uint32_t m = ALPHA_8 (mask[i]);
928 uint32_t d = dest[i];
929 uint32_t a = src[i];
931 UN8x4_MUL_UN8 (a, m);
932 a = ALPHA_8 (a);
933 UN8x4_MUL_UN8 (d, a);
935 dest[i] = d;
939 static void
940 vmx_combine_in_reverse_u (pixman_implementation_t *imp,
941 pixman_op_t op,
942 uint32_t * dest,
943 const uint32_t * src,
944 const uint32_t * mask,
945 int width)
947 if (mask)
948 vmx_combine_in_reverse_u_mask (dest, src, mask, width);
949 else
950 vmx_combine_in_reverse_u_no_mask (dest, src, width);
953 static void
954 vmx_combine_out_u_no_mask (uint32_t * dest,
955 const uint32_t *src,
956 int width)
958 int i;
959 vector unsigned int vdest, vsrc;
960 DECLARE_SRC_MASK_VAR;
962 while (width && ((uintptr_t)dest & 15))
964 uint32_t s = *src++;
965 uint32_t a = ALPHA_8 (~(*dest));
967 UN8x4_MUL_UN8 (s, a);
969 *dest++ = s;
970 width--;
973 COMPUTE_SHIFT_MASKS (dest, src);
975 /* printf ("%s\n",__PRETTY_FUNCTION__); */
976 for (i = width / 4; i > 0; i--)
978 LOAD_VECTORS (dest, src);
980 vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
982 STORE_VECTOR (dest);
984 src += 4;
985 dest += 4;
988 for (i = width % 4; --i >= 0;)
990 uint32_t s = src[i];
991 uint32_t a = ALPHA_8 (~dest[i]);
993 UN8x4_MUL_UN8 (s, a);
995 dest[i] = s;
999 static void
1000 vmx_combine_out_u_mask (uint32_t * dest,
1001 const uint32_t *src,
1002 const uint32_t *mask,
1003 int width)
1005 int i;
1006 vector unsigned int vdest, vsrc, vmask;
1007 DECLARE_SRC_MASK_VAR;
1008 DECLARE_MASK_MASK_VAR;
1010 while (width && ((uintptr_t)dest & 15))
1012 uint32_t m = ALPHA_8 (*mask++);
1013 uint32_t s = *src++;
1014 uint32_t a = ALPHA_8 (~(*dest));
1016 UN8x4_MUL_UN8 (s, m);
1017 UN8x4_MUL_UN8 (s, a);
1019 *dest++ = s;
1020 width--;
1023 COMPUTE_SHIFT_MASKC (dest, src, mask);
1025 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1026 for (i = width / 4; i > 0; i--)
1028 LOAD_VECTORSM (dest, src, mask);
1030 vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));
1032 STORE_VECTOR (dest);
1034 src += 4;
1035 dest += 4;
1036 mask += 4;
1039 for (i = width % 4; --i >= 0;)
1041 uint32_t m = ALPHA_8 (mask[i]);
1042 uint32_t s = src[i];
1043 uint32_t a = ALPHA_8 (~dest[i]);
1045 UN8x4_MUL_UN8 (s, m);
1046 UN8x4_MUL_UN8 (s, a);
1048 dest[i] = s;
1052 static void
1053 vmx_combine_out_u (pixman_implementation_t *imp,
1054 pixman_op_t op,
1055 uint32_t * dest,
1056 const uint32_t * src,
1057 const uint32_t * mask,
1058 int width)
1060 if (mask)
1061 vmx_combine_out_u_mask (dest, src, mask, width);
1062 else
1063 vmx_combine_out_u_no_mask (dest, src, width);
1066 static void
1067 vmx_combine_out_reverse_u_no_mask (uint32_t * dest,
1068 const uint32_t *src,
1069 int width)
1071 int i;
1072 vector unsigned int vdest, vsrc;
1073 DECLARE_SRC_MASK_VAR;
1075 while (width && ((uintptr_t)dest & 15))
1077 uint32_t d = *dest;
1078 uint32_t a = ALPHA_8 (~(*src++));
1080 UN8x4_MUL_UN8 (d, a);
1082 *dest++ = d;
1083 width--;
1086 COMPUTE_SHIFT_MASKS (dest, src);
1088 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1089 for (i = width / 4; i > 0; i--)
1092 LOAD_VECTORS (dest, src);
1094 vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
1096 STORE_VECTOR (dest);
1098 src += 4;
1099 dest += 4;
1102 for (i = width % 4; --i >= 0;)
1104 uint32_t d = dest[i];
1105 uint32_t a = ALPHA_8 (~src[i]);
1107 UN8x4_MUL_UN8 (d, a);
1109 dest[i] = d;
1113 static void
1114 vmx_combine_out_reverse_u_mask (uint32_t * dest,
1115 const uint32_t *src,
1116 const uint32_t *mask,
1117 int width)
1119 int i;
1120 vector unsigned int vdest, vsrc, vmask;
1121 DECLARE_SRC_MASK_VAR;
1122 DECLARE_MASK_MASK_VAR;
1124 while (width && ((uintptr_t)dest & 15))
1126 uint32_t m = ALPHA_8 (*mask++);
1127 uint32_t d = *dest;
1128 uint32_t a = *src++;
1130 UN8x4_MUL_UN8 (a, m);
1131 a = ALPHA_8 (~a);
1132 UN8x4_MUL_UN8 (d, a);
1134 *dest++ = d;
1135 width--;
1138 COMPUTE_SHIFT_MASKC (dest, src, mask);
1140 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1141 for (i = width / 4; i > 0; i--)
1143 LOAD_VECTORSM (dest, src, mask);
1145 vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));
1147 STORE_VECTOR (dest);
1149 src += 4;
1150 dest += 4;
1151 mask += 4;
1154 for (i = width % 4; --i >= 0;)
1156 uint32_t m = ALPHA_8 (mask[i]);
1157 uint32_t d = dest[i];
1158 uint32_t a = src[i];
1160 UN8x4_MUL_UN8 (a, m);
1161 a = ALPHA_8 (~a);
1162 UN8x4_MUL_UN8 (d, a);
1164 dest[i] = d;
1168 static void
1169 vmx_combine_out_reverse_u (pixman_implementation_t *imp,
1170 pixman_op_t op,
1171 uint32_t * dest,
1172 const uint32_t * src,
1173 const uint32_t * mask,
1174 int width)
1176 if (mask)
1177 vmx_combine_out_reverse_u_mask (dest, src, mask, width);
1178 else
1179 vmx_combine_out_reverse_u_no_mask (dest, src, width);
1182 static void
1183 vmx_combine_atop_u_no_mask (uint32_t * dest,
1184 const uint32_t *src,
1185 int width)
1187 int i;
1188 vector unsigned int vdest, vsrc;
1189 DECLARE_SRC_MASK_VAR;
1191 while (width && ((uintptr_t)dest & 15))
1193 uint32_t s = *src++;
1194 uint32_t d = *dest;
1195 uint32_t dest_a = ALPHA_8 (d);
1196 uint32_t src_ia = ALPHA_8 (~s);
1198 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1200 *dest++ = s;
1201 width--;
1204 COMPUTE_SHIFT_MASKS (dest, src);
1206 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1207 for (i = width / 4; i > 0; i--)
1209 LOAD_VECTORS (dest, src);
1211 vdest = pix_add_mul (vsrc, splat_alpha (vdest),
1212 vdest, splat_alpha (negate (vsrc)));
1214 STORE_VECTOR (dest);
1216 src += 4;
1217 dest += 4;
1220 for (i = width % 4; --i >= 0;)
1222 uint32_t s = src[i];
1223 uint32_t d = dest[i];
1224 uint32_t dest_a = ALPHA_8 (d);
1225 uint32_t src_ia = ALPHA_8 (~s);
1227 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1229 dest[i] = s;
1233 static void
1234 vmx_combine_atop_u_mask (uint32_t * dest,
1235 const uint32_t *src,
1236 const uint32_t *mask,
1237 int width)
1239 int i;
1240 vector unsigned int vdest, vsrc, vmask;
1241 DECLARE_SRC_MASK_VAR;
1242 DECLARE_MASK_MASK_VAR;
1244 while (width && ((uintptr_t)dest & 15))
1246 uint32_t m = ALPHA_8 (*mask++);
1247 uint32_t s = *src++;
1248 uint32_t d = *dest;
1249 uint32_t dest_a = ALPHA_8 (d);
1250 uint32_t src_ia;
1252 UN8x4_MUL_UN8 (s, m);
1254 src_ia = ALPHA_8 (~s);
1256 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1258 *dest++ = s;
1259 width--;
1262 COMPUTE_SHIFT_MASKC (dest, src, mask);
1264 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1265 for (i = width / 4; i > 0; i--)
1267 LOAD_VECTORSM (dest, src, mask);
1269 vdest = pix_add_mul (vsrc, splat_alpha (vdest),
1270 vdest, splat_alpha (negate (vsrc)));
1272 STORE_VECTOR (dest);
1274 src += 4;
1275 dest += 4;
1276 mask += 4;
1279 for (i = width % 4; --i >= 0;)
1281 uint32_t m = ALPHA_8 (mask[i]);
1282 uint32_t s = src[i];
1283 uint32_t d = dest[i];
1284 uint32_t dest_a = ALPHA_8 (d);
1285 uint32_t src_ia;
1287 UN8x4_MUL_UN8 (s, m);
1289 src_ia = ALPHA_8 (~s);
1291 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);
1293 dest[i] = s;
1297 static void
1298 vmx_combine_atop_u (pixman_implementation_t *imp,
1299 pixman_op_t op,
1300 uint32_t * dest,
1301 const uint32_t * src,
1302 const uint32_t * mask,
1303 int width)
1305 if (mask)
1306 vmx_combine_atop_u_mask (dest, src, mask, width);
1307 else
1308 vmx_combine_atop_u_no_mask (dest, src, width);
1311 static void
1312 vmx_combine_atop_reverse_u_no_mask (uint32_t * dest,
1313 const uint32_t *src,
1314 int width)
1316 int i;
1317 vector unsigned int vdest, vsrc;
1318 DECLARE_SRC_MASK_VAR;
1320 while (width && ((uintptr_t)dest & 15))
1322 uint32_t s = *src++;
1323 uint32_t d = *dest;
1324 uint32_t src_a = ALPHA_8 (s);
1325 uint32_t dest_ia = ALPHA_8 (~d);
1327 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1329 *dest++ = s;
1330 width--;
1333 COMPUTE_SHIFT_MASKS (dest, src);
1335 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1336 for (i = width / 4; i > 0; i--)
1338 LOAD_VECTORS (dest, src);
1340 vdest = pix_add_mul (vdest, splat_alpha (vsrc),
1341 vsrc, splat_alpha (negate (vdest)));
1343 STORE_VECTOR (dest);
1345 src += 4;
1346 dest += 4;
1349 for (i = width % 4; --i >= 0;)
1351 uint32_t s = src[i];
1352 uint32_t d = dest[i];
1353 uint32_t src_a = ALPHA_8 (s);
1354 uint32_t dest_ia = ALPHA_8 (~d);
1356 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1358 dest[i] = s;
1362 static void
1363 vmx_combine_atop_reverse_u_mask (uint32_t * dest,
1364 const uint32_t *src,
1365 const uint32_t *mask,
1366 int width)
1368 int i;
1369 vector unsigned int vdest, vsrc, vmask;
1370 DECLARE_SRC_MASK_VAR;
1371 DECLARE_MASK_MASK_VAR;
1373 while (width && ((uintptr_t)dest & 15))
1375 uint32_t m = ALPHA_8 (*mask++);
1376 uint32_t s = *src++;
1377 uint32_t d = *dest;
1378 uint32_t src_a;
1379 uint32_t dest_ia = ALPHA_8 (~d);
1381 UN8x4_MUL_UN8 (s, m);
1383 src_a = ALPHA_8 (s);
1385 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1387 *dest++ = s;
1388 width--;
1391 COMPUTE_SHIFT_MASKC (dest, src, mask);
1393 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1394 for (i = width / 4; i > 0; i--)
1396 LOAD_VECTORSM (dest, src, mask);
1398 vdest = pix_add_mul (vdest, splat_alpha (vsrc),
1399 vsrc, splat_alpha (negate (vdest)));
1401 STORE_VECTOR (dest);
1403 src += 4;
1404 dest += 4;
1405 mask += 4;
1408 for (i = width % 4; --i >= 0;)
1410 uint32_t m = ALPHA_8 (mask[i]);
1411 uint32_t s = src[i];
1412 uint32_t d = dest[i];
1413 uint32_t src_a;
1414 uint32_t dest_ia = ALPHA_8 (~d);
1416 UN8x4_MUL_UN8 (s, m);
1418 src_a = ALPHA_8 (s);
1420 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);
1422 dest[i] = s;
1426 static void
1427 vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
1428 pixman_op_t op,
1429 uint32_t * dest,
1430 const uint32_t * src,
1431 const uint32_t * mask,
1432 int width)
1434 if (mask)
1435 vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
1436 else
1437 vmx_combine_atop_reverse_u_no_mask (dest, src, width);
1440 static void
1441 vmx_combine_xor_u_no_mask (uint32_t * dest,
1442 const uint32_t *src,
1443 int width)
1445 int i;
1446 vector unsigned int vdest, vsrc;
1447 DECLARE_SRC_MASK_VAR;
1449 while (width && ((uintptr_t)dest & 15))
1451 uint32_t s = *src++;
1452 uint32_t d = *dest;
1453 uint32_t src_ia = ALPHA_8 (~s);
1454 uint32_t dest_ia = ALPHA_8 (~d);
1456 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1458 *dest++ = s;
1459 width--;
1462 COMPUTE_SHIFT_MASKS (dest, src);
1464 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1465 for (i = width / 4; i > 0; i--)
1467 LOAD_VECTORS (dest, src);
1469 vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
1470 vdest, splat_alpha (negate (vsrc)));
1472 STORE_VECTOR (dest);
1474 src += 4;
1475 dest += 4;
1478 for (i = width % 4; --i >= 0;)
1480 uint32_t s = src[i];
1481 uint32_t d = dest[i];
1482 uint32_t src_ia = ALPHA_8 (~s);
1483 uint32_t dest_ia = ALPHA_8 (~d);
1485 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1487 dest[i] = s;
1491 static void
1492 vmx_combine_xor_u_mask (uint32_t * dest,
1493 const uint32_t *src,
1494 const uint32_t *mask,
1495 int width)
1497 int i;
1498 vector unsigned int vdest, vsrc, vmask;
1499 DECLARE_SRC_MASK_VAR;
1500 DECLARE_MASK_MASK_VAR;
1502 while (width && ((uintptr_t)dest & 15))
1504 uint32_t m = ALPHA_8 (*mask++);
1505 uint32_t s = *src++;
1506 uint32_t d = *dest;
1507 uint32_t src_ia;
1508 uint32_t dest_ia = ALPHA_8 (~d);
1510 UN8x4_MUL_UN8 (s, m);
1512 src_ia = ALPHA_8 (~s);
1514 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1516 *dest++ = s;
1517 width--;
1520 COMPUTE_SHIFT_MASKC (dest, src, mask);
1522 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1523 for (i = width / 4; i > 0; i--)
1525 LOAD_VECTORSM (dest, src, mask);
1527 vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
1528 vdest, splat_alpha (negate (vsrc)));
1530 STORE_VECTOR (dest);
1532 src += 4;
1533 dest += 4;
1534 mask += 4;
1537 for (i = width % 4; --i >= 0;)
1539 uint32_t m = ALPHA_8 (mask[i]);
1540 uint32_t s = src[i];
1541 uint32_t d = dest[i];
1542 uint32_t src_ia;
1543 uint32_t dest_ia = ALPHA_8 (~d);
1545 UN8x4_MUL_UN8 (s, m);
1547 src_ia = ALPHA_8 (~s);
1549 UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);
1551 dest[i] = s;
1555 static void
1556 vmx_combine_xor_u (pixman_implementation_t *imp,
1557 pixman_op_t op,
1558 uint32_t * dest,
1559 const uint32_t * src,
1560 const uint32_t * mask,
1561 int width)
1563 if (mask)
1564 vmx_combine_xor_u_mask (dest, src, mask, width);
1565 else
1566 vmx_combine_xor_u_no_mask (dest, src, width);
1569 static void
1570 vmx_combine_add_u_no_mask (uint32_t * dest,
1571 const uint32_t *src,
1572 int width)
1574 int i;
1575 vector unsigned int vdest, vsrc;
1576 DECLARE_SRC_MASK_VAR;
1578 while (width && ((uintptr_t)dest & 15))
1580 uint32_t s = *src++;
1581 uint32_t d = *dest;
1583 UN8x4_ADD_UN8x4 (d, s);
1585 *dest++ = d;
1586 width--;
1589 COMPUTE_SHIFT_MASKS (dest, src);
1590 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1591 for (i = width / 4; i > 0; i--)
1593 LOAD_VECTORS (dest, src);
1595 vdest = pix_add (vsrc, vdest);
1597 STORE_VECTOR (dest);
1599 src += 4;
1600 dest += 4;
1603 for (i = width % 4; --i >= 0;)
1605 uint32_t s = src[i];
1606 uint32_t d = dest[i];
1608 UN8x4_ADD_UN8x4 (d, s);
1610 dest[i] = d;
1614 static void
1615 vmx_combine_add_u_mask (uint32_t * dest,
1616 const uint32_t *src,
1617 const uint32_t *mask,
1618 int width)
1620 int i;
1621 vector unsigned int vdest, vsrc, vmask;
1622 DECLARE_SRC_MASK_VAR;
1623 DECLARE_MASK_MASK_VAR;
1625 while (width && ((uintptr_t)dest & 15))
1627 uint32_t m = ALPHA_8 (*mask++);
1628 uint32_t s = *src++;
1629 uint32_t d = *dest;
1631 UN8x4_MUL_UN8 (s, m);
1632 UN8x4_ADD_UN8x4 (d, s);
1634 *dest++ = d;
1635 width--;
1638 COMPUTE_SHIFT_MASKC (dest, src, mask);
1640 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1641 for (i = width / 4; i > 0; i--)
1643 LOAD_VECTORSM (dest, src, mask);
1645 vdest = pix_add (vsrc, vdest);
1647 STORE_VECTOR (dest);
1649 src += 4;
1650 dest += 4;
1651 mask += 4;
1654 for (i = width % 4; --i >= 0;)
1656 uint32_t m = ALPHA_8 (mask[i]);
1657 uint32_t s = src[i];
1658 uint32_t d = dest[i];
1660 UN8x4_MUL_UN8 (s, m);
1661 UN8x4_ADD_UN8x4 (d, s);
1663 dest[i] = d;
1667 static void
1668 vmx_combine_add_u (pixman_implementation_t *imp,
1669 pixman_op_t op,
1670 uint32_t * dest,
1671 const uint32_t * src,
1672 const uint32_t * mask,
1673 int width)
1675 if (mask)
1676 vmx_combine_add_u_mask (dest, src, mask, width);
1677 else
1678 vmx_combine_add_u_no_mask (dest, src, width);
1681 static void
1682 vmx_combine_src_ca (pixman_implementation_t *imp,
1683 pixman_op_t op,
1684 uint32_t * dest,
1685 const uint32_t * src,
1686 const uint32_t * mask,
1687 int width)
1689 int i;
1690 vector unsigned int vdest, vsrc, vmask;
1691 DECLARE_SRC_MASK_VAR;
1692 DECLARE_MASK_MASK_VAR;
1694 while (width && ((uintptr_t)dest & 15))
1696 uint32_t a = *mask++;
1697 uint32_t s = *src++;
1699 UN8x4_MUL_UN8x4 (s, a);
1701 *dest++ = s;
1702 width--;
1705 COMPUTE_SHIFT_MASKC (dest, src, mask);
1707 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1708 for (i = width / 4; i > 0; i--)
1710 LOAD_VECTORSC (dest, src, mask);
1712 vdest = pix_multiply (vsrc, vmask);
1714 STORE_VECTOR (dest);
1716 mask += 4;
1717 src += 4;
1718 dest += 4;
1721 for (i = width % 4; --i >= 0;)
1723 uint32_t a = mask[i];
1724 uint32_t s = src[i];
1726 UN8x4_MUL_UN8x4 (s, a);
1728 dest[i] = s;
1732 static void
1733 vmx_combine_over_ca (pixman_implementation_t *imp,
1734 pixman_op_t op,
1735 uint32_t * dest,
1736 const uint32_t * src,
1737 const uint32_t * mask,
1738 int width)
1740 int i;
1741 vector unsigned int vdest, vsrc, vmask;
1742 DECLARE_SRC_MASK_VAR;
1743 DECLARE_MASK_MASK_VAR;
1745 while (width && ((uintptr_t)dest & 15))
1747 uint32_t a = *mask++;
1748 uint32_t s = *src++;
1749 uint32_t d = *dest;
1750 uint32_t sa = ALPHA_8 (s);
1752 UN8x4_MUL_UN8x4 (s, a);
1753 UN8x4_MUL_UN8 (a, sa);
1754 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
1756 *dest++ = d;
1757 width--;
1760 COMPUTE_SHIFT_MASKC (dest, src, mask);
1762 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1763 for (i = width / 4; i > 0; i--)
1765 LOAD_VECTORSC (dest, src, mask);
1767 vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);
1769 STORE_VECTOR (dest);
1771 mask += 4;
1772 src += 4;
1773 dest += 4;
1776 for (i = width % 4; --i >= 0;)
1778 uint32_t a = mask[i];
1779 uint32_t s = src[i];
1780 uint32_t d = dest[i];
1781 uint32_t sa = ALPHA_8 (s);
1783 UN8x4_MUL_UN8x4 (s, a);
1784 UN8x4_MUL_UN8 (a, sa);
1785 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);
1787 dest[i] = d;
1791 static void
1792 vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1793 pixman_op_t op,
1794 uint32_t * dest,
1795 const uint32_t * src,
1796 const uint32_t * mask,
1797 int width)
1799 int i;
1800 vector unsigned int vdest, vsrc, vmask;
1801 DECLARE_SRC_MASK_VAR;
1802 DECLARE_MASK_MASK_VAR;
1804 while (width && ((uintptr_t)dest & 15))
1806 uint32_t a = *mask++;
1807 uint32_t s = *src++;
1808 uint32_t d = *dest;
1809 uint32_t ida = ALPHA_8 (~d);
1811 UN8x4_MUL_UN8x4 (s, a);
1812 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
1814 *dest++ = s;
1815 width--;
1818 COMPUTE_SHIFT_MASKC (dest, src, mask);
1820 /* printf("%s\n",__PRETTY_FUNCTION__); */
1821 for (i = width / 4; i > 0; i--)
1823 LOAD_VECTORSC (dest, src, mask);
1825 vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));
1827 STORE_VECTOR (dest);
1829 mask += 4;
1830 src += 4;
1831 dest += 4;
1834 for (i = width % 4; --i >= 0;)
1836 uint32_t a = mask[i];
1837 uint32_t s = src[i];
1838 uint32_t d = dest[i];
1839 uint32_t ida = ALPHA_8 (~d);
1841 UN8x4_MUL_UN8x4 (s, a);
1842 UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);
1844 dest[i] = s;
1848 static void
1849 vmx_combine_in_ca (pixman_implementation_t *imp,
1850 pixman_op_t op,
1851 uint32_t * dest,
1852 const uint32_t * src,
1853 const uint32_t * mask,
1854 int width)
1856 int i;
1857 vector unsigned int vdest, vsrc, vmask;
1858 DECLARE_SRC_MASK_VAR;
1859 DECLARE_MASK_MASK_VAR;
1861 while (width && ((uintptr_t)dest & 15))
1863 uint32_t a = *mask++;
1864 uint32_t s = *src++;
1865 uint32_t da = ALPHA_8 (*dest);
1867 UN8x4_MUL_UN8x4 (s, a);
1868 UN8x4_MUL_UN8 (s, da);
1870 *dest++ = s;
1871 width--;
1874 COMPUTE_SHIFT_MASKC (dest, src, mask);
1876 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1877 for (i = width / 4; i > 0; i--)
1879 LOAD_VECTORSC (dest, src, mask);
1881 vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));
1883 STORE_VECTOR (dest);
1885 src += 4;
1886 dest += 4;
1887 mask += 4;
1890 for (i = width % 4; --i >= 0;)
1892 uint32_t a = mask[i];
1893 uint32_t s = src[i];
1894 uint32_t da = ALPHA_8 (dest[i]);
1896 UN8x4_MUL_UN8x4 (s, a);
1897 UN8x4_MUL_UN8 (s, da);
1899 dest[i] = s;
1903 static void
1904 vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1905 pixman_op_t op,
1906 uint32_t * dest,
1907 const uint32_t * src,
1908 const uint32_t * mask,
1909 int width)
1911 int i;
1912 vector unsigned int vdest, vsrc, vmask;
1913 DECLARE_SRC_MASK_VAR;
1914 DECLARE_MASK_MASK_VAR;
1916 while (width && ((uintptr_t)dest & 15))
1918 uint32_t a = *mask++;
1919 uint32_t d = *dest;
1920 uint32_t sa = ALPHA_8 (*src++);
1922 UN8x4_MUL_UN8 (a, sa);
1923 UN8x4_MUL_UN8x4 (d, a);
1925 *dest++ = d;
1926 width--;
1929 COMPUTE_SHIFT_MASKC (dest, src, mask);
1931 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1932 for (i = width / 4; i > 0; i--)
1935 LOAD_VECTORSC (dest, src, mask);
1937 vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));
1939 STORE_VECTOR (dest);
1941 src += 4;
1942 dest += 4;
1943 mask += 4;
1946 for (i = width % 4; --i >= 0;)
1948 uint32_t a = mask[i];
1949 uint32_t d = dest[i];
1950 uint32_t sa = ALPHA_8 (src[i]);
1952 UN8x4_MUL_UN8 (a, sa);
1953 UN8x4_MUL_UN8x4 (d, a);
1955 dest[i] = d;
1959 static void
1960 vmx_combine_out_ca (pixman_implementation_t *imp,
1961 pixman_op_t op,
1962 uint32_t * dest,
1963 const uint32_t * src,
1964 const uint32_t * mask,
1965 int width)
1967 int i;
1968 vector unsigned int vdest, vsrc, vmask;
1969 DECLARE_SRC_MASK_VAR;
1970 DECLARE_MASK_MASK_VAR;
1972 while (width && ((uintptr_t)dest & 15))
1974 uint32_t a = *mask++;
1975 uint32_t s = *src++;
1976 uint32_t d = *dest;
1977 uint32_t da = ALPHA_8 (~d);
1979 UN8x4_MUL_UN8x4 (s, a);
1980 UN8x4_MUL_UN8 (s, da);
1982 *dest++ = s;
1983 width--;
1986 COMPUTE_SHIFT_MASKC (dest, src, mask);
1988 /* printf ("%s\n",__PRETTY_FUNCTION__); */
1989 for (i = width / 4; i > 0; i--)
1991 LOAD_VECTORSC (dest, src, mask);
1993 vdest = pix_multiply (
1994 pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));
1996 STORE_VECTOR (dest);
1998 src += 4;
1999 dest += 4;
2000 mask += 4;
2003 for (i = width % 4; --i >= 0;)
2005 uint32_t a = mask[i];
2006 uint32_t s = src[i];
2007 uint32_t d = dest[i];
2008 uint32_t da = ALPHA_8 (~d);
2010 UN8x4_MUL_UN8x4 (s, a);
2011 UN8x4_MUL_UN8 (s, da);
2013 dest[i] = s;
2017 static void
2018 vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
2019 pixman_op_t op,
2020 uint32_t * dest,
2021 const uint32_t * src,
2022 const uint32_t * mask,
2023 int width)
2025 int i;
2026 vector unsigned int vdest, vsrc, vmask;
2027 DECLARE_SRC_MASK_VAR;
2028 DECLARE_MASK_MASK_VAR;
2030 while (width && ((uintptr_t)dest & 15))
2032 uint32_t a = *mask++;
2033 uint32_t s = *src++;
2034 uint32_t d = *dest;
2035 uint32_t sa = ALPHA_8 (s);
2037 UN8x4_MUL_UN8 (a, sa);
2038 UN8x4_MUL_UN8x4 (d, ~a);
2040 *dest++ = d;
2041 width--;
2044 COMPUTE_SHIFT_MASKC (dest, src, mask);
2046 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2047 for (i = width / 4; i > 0; i--)
2049 LOAD_VECTORSC (dest, src, mask);
2051 vdest = pix_multiply (
2052 vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));
2054 STORE_VECTOR (dest);
2056 src += 4;
2057 dest += 4;
2058 mask += 4;
2061 for (i = width % 4; --i >= 0;)
2063 uint32_t a = mask[i];
2064 uint32_t s = src[i];
2065 uint32_t d = dest[i];
2066 uint32_t sa = ALPHA_8 (s);
2068 UN8x4_MUL_UN8 (a, sa);
2069 UN8x4_MUL_UN8x4 (d, ~a);
2071 dest[i] = d;
2075 static void
2076 vmx_combine_atop_ca (pixman_implementation_t *imp,
2077 pixman_op_t op,
2078 uint32_t * dest,
2079 const uint32_t * src,
2080 const uint32_t * mask,
2081 int width)
2083 int i;
2084 vector unsigned int vdest, vsrc, vmask, vsrca;
2085 DECLARE_SRC_MASK_VAR;
2086 DECLARE_MASK_MASK_VAR;
2088 while (width && ((uintptr_t)dest & 15))
2090 uint32_t a = *mask++;
2091 uint32_t s = *src++;
2092 uint32_t d = *dest;
2093 uint32_t sa = ALPHA_8 (s);
2094 uint32_t da = ALPHA_8 (d);
2096 UN8x4_MUL_UN8x4 (s, a);
2097 UN8x4_MUL_UN8 (a, sa);
2098 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2100 *dest++ = d;
2101 width--;
2104 COMPUTE_SHIFT_MASKC (dest, src, mask);
2106 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2107 for (i = width / 4; i > 0; i--)
2109 LOAD_VECTORSC (dest, src, mask);
2111 vsrca = splat_alpha (vsrc);
2113 vsrc = pix_multiply (vsrc, vmask);
2114 vmask = pix_multiply (vmask, vsrca);
2116 vdest = pix_add_mul (vsrc, splat_alpha (vdest),
2117 negate (vmask), vdest);
2119 STORE_VECTOR (dest);
2121 src += 4;
2122 dest += 4;
2123 mask += 4;
2126 for (i = width % 4; --i >= 0;)
2128 uint32_t a = mask[i];
2129 uint32_t s = src[i];
2130 uint32_t d = dest[i];
2131 uint32_t sa = ALPHA_8 (s);
2132 uint32_t da = ALPHA_8 (d);
2134 UN8x4_MUL_UN8x4 (s, a);
2135 UN8x4_MUL_UN8 (a, sa);
2136 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2138 dest[i] = d;
2142 static void
2143 vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
2144 pixman_op_t op,
2145 uint32_t * dest,
2146 const uint32_t * src,
2147 const uint32_t * mask,
2148 int width)
2150 int i;
2151 vector unsigned int vdest, vsrc, vmask;
2152 DECLARE_SRC_MASK_VAR;
2153 DECLARE_MASK_MASK_VAR;
2155 while (width && ((uintptr_t)dest & 15))
2157 uint32_t a = *mask++;
2158 uint32_t s = *src++;
2159 uint32_t d = *dest;
2160 uint32_t sa = ALPHA_8 (s);
2161 uint32_t da = ALPHA_8 (~d);
2163 UN8x4_MUL_UN8x4 (s, a);
2164 UN8x4_MUL_UN8 (a, sa);
2165 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
2167 *dest++ = d;
2168 width--;
2171 COMPUTE_SHIFT_MASKC (dest, src, mask);
2173 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2174 for (i = width / 4; i > 0; i--)
2176 LOAD_VECTORSC (dest, src, mask);
2178 vdest = pix_add_mul (vdest,
2179 pix_multiply (vmask, splat_alpha (vsrc)),
2180 pix_multiply (vsrc, vmask),
2181 negate (splat_alpha (vdest)));
2183 STORE_VECTOR (dest);
2185 src += 4;
2186 dest += 4;
2187 mask += 4;
2190 for (i = width % 4; --i >= 0;)
2192 uint32_t a = mask[i];
2193 uint32_t s = src[i];
2194 uint32_t d = dest[i];
2195 uint32_t sa = ALPHA_8 (s);
2196 uint32_t da = ALPHA_8 (~d);
2198 UN8x4_MUL_UN8x4 (s, a);
2199 UN8x4_MUL_UN8 (a, sa);
2200 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);
2202 dest[i] = d;
2206 static void
2207 vmx_combine_xor_ca (pixman_implementation_t *imp,
2208 pixman_op_t op,
2209 uint32_t * dest,
2210 const uint32_t * src,
2211 const uint32_t * mask,
2212 int width)
2214 int i;
2215 vector unsigned int vdest, vsrc, vmask;
2216 DECLARE_SRC_MASK_VAR;
2217 DECLARE_MASK_MASK_VAR;
2219 while (width && ((uintptr_t)dest & 15))
2221 uint32_t a = *mask++;
2222 uint32_t s = *src++;
2223 uint32_t d = *dest;
2224 uint32_t sa = ALPHA_8 (s);
2225 uint32_t da = ALPHA_8 (~d);
2227 UN8x4_MUL_UN8x4 (s, a);
2228 UN8x4_MUL_UN8 (a, sa);
2229 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2231 *dest++ = d;
2232 width--;
2235 COMPUTE_SHIFT_MASKC (dest, src, mask);
2237 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2238 for (i = width / 4; i > 0; i--)
2240 LOAD_VECTORSC (dest, src, mask);
2242 vdest = pix_add_mul (vdest,
2243 negate (pix_multiply (vmask, splat_alpha (vsrc))),
2244 pix_multiply (vsrc, vmask),
2245 negate (splat_alpha (vdest)));
2247 STORE_VECTOR (dest);
2249 src += 4;
2250 dest += 4;
2251 mask += 4;
2254 for (i = width % 4; --i >= 0;)
2256 uint32_t a = mask[i];
2257 uint32_t s = src[i];
2258 uint32_t d = dest[i];
2259 uint32_t sa = ALPHA_8 (s);
2260 uint32_t da = ALPHA_8 (~d);
2262 UN8x4_MUL_UN8x4 (s, a);
2263 UN8x4_MUL_UN8 (a, sa);
2264 UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);
2266 dest[i] = d;
2270 static void
2271 vmx_combine_add_ca (pixman_implementation_t *imp,
2272 pixman_op_t op,
2273 uint32_t * dest,
2274 const uint32_t * src,
2275 const uint32_t * mask,
2276 int width)
2278 int i;
2279 vector unsigned int vdest, vsrc, vmask;
2280 DECLARE_SRC_MASK_VAR;
2281 DECLARE_MASK_MASK_VAR;
2283 while (width && ((uintptr_t)dest & 15))
2285 uint32_t a = *mask++;
2286 uint32_t s = *src++;
2287 uint32_t d = *dest;
2289 UN8x4_MUL_UN8x4 (s, a);
2290 UN8x4_ADD_UN8x4 (s, d);
2292 *dest++ = s;
2293 width--;
2296 COMPUTE_SHIFT_MASKC (dest, src, mask);
2298 /* printf ("%s\n",__PRETTY_FUNCTION__); */
2299 for (i = width / 4; i > 0; i--)
2301 LOAD_VECTORSC (dest, src, mask);
2303 vdest = pix_add (pix_multiply (vsrc, vmask), vdest);
2305 STORE_VECTOR (dest);
2307 src += 4;
2308 dest += 4;
2309 mask += 4;
2312 for (i = width % 4; --i >= 0;)
2314 uint32_t a = mask[i];
2315 uint32_t s = src[i];
2316 uint32_t d = dest[i];
2318 UN8x4_MUL_UN8x4 (s, a);
2319 UN8x4_ADD_UN8x4 (s, d);
2321 dest[i] = s;
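/*
 * OVER with a solid source and an a8 mask.  Zero mask bytes are skipped,
 * a run of four 0xff mask bytes with an opaque source becomes a plain store
 * of the solid colour, and everything else goes through in_over() with the
 * mask byte splatted to all four channels by splat_pixel().
 */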
2325 static void
2326 vmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
2327 pixman_composite_info_t *info)
2329 PIXMAN_COMPOSITE_ARGS (info);
2330 uint32_t src, srca;
2331 uint32_t *dst_line, *dst;
2332 uint8_t *mask_line;
2333 int dst_stride, mask_stride;
2334 int32_t w;
2335 uint32_t m, d, s, ia;
2337 vector unsigned int vsrc, valpha, vmask, vdst;
2339 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2341 srca = ALPHA_8(src);
2342 if (src == 0)
2343 return;
2345 PIXMAN_IMAGE_GET_LINE (
2346 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2347 PIXMAN_IMAGE_GET_LINE (
2348 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2350 vsrc = (vector unsigned int) {src, src, src, src};
2351 valpha = splat_alpha(vsrc);
2353 while (height--)
2355 const uint8_t *pm = mask_line;
2356 dst = dst_line;
2357 dst_line += dst_stride;
2358 mask_line += mask_stride;
2359 w = width;
2361 while (w && (uintptr_t)dst & 15)
2363 s = src;
2364 m = *pm++;
2366 if (m)
2368 d = *dst;
2369 UN8x4_MUL_UN8 (s, m);
2370 ia = ALPHA_8 (~s);
2371 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
2372 *dst = d;
2375 w--;
2376 dst++;
2379 while (w >= 4)
2381 m = *((uint32_t*)pm);
2383 if (srca == 0xff && m == 0xffffffff)
2385 save_128_aligned(dst, vsrc);
2387 else if (m)
2389 vmask = splat_pixel((vector unsigned int) {m, m, m, m});
2391 /* dst is 16-byte aligned */
2392 vdst = in_over (vsrc, valpha, vmask, load_128_aligned (dst));
2394 save_128_aligned(dst, vdst);
2397 w -= 4;
2398 dst += 4;
2399 pm += 4;
2402 while (w)
2404 s = src;
2405 m = *pm++;
2407 if (m)
2409 d = *dst;
2410 UN8x4_MUL_UN8 (s, m);
2411 ia = ALPHA_8 (~s);
2412 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
2413 *dst = d;
2416 w--;
2417 dst++;
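/*
 * vmx_fill() is the solid-fill fast path.  The 8 and 16 bpp cases first
 * widen the filler to a full 32-bit pattern; each scanline is then filled
 * with an alignment ladder: bytes/halfwords/words until the pointer is
 * 16-byte aligned, vec_st() stores in 128/64/32/16-byte chunks, and a
 * scalar tail for whatever is left.
 */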
2423 static pixman_bool_t
2424 vmx_fill (pixman_implementation_t *imp,
2425 uint32_t * bits,
2426 int stride,
2427 int bpp,
2428 int x,
2429 int y,
2430 int width,
2431 int height,
2432 uint32_t filler)
2434 uint32_t byte_width;
2435 uint8_t *byte_line;
2437 vector unsigned int vfiller;
2439 if (bpp == 8)
2441 uint8_t b;
2442 uint16_t w;
2444 stride = stride * (int) sizeof (uint32_t) / 1;
2445 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2446 byte_width = width;
2447 stride *= 1;
2449 b = filler & 0xff;
2450 w = (b << 8) | b;
2451 filler = (w << 16) | w;
2453 else if (bpp == 16)
2455 stride = stride * (int) sizeof (uint32_t) / 2;
2456 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2457 byte_width = 2 * width;
2458 stride *= 2;
2460 filler = (filler & 0xffff) * 0x00010001;
2462 else if (bpp == 32)
2464 stride = stride * (int) sizeof (uint32_t) / 4;
2465 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2466 byte_width = 4 * width;
2467 stride *= 4;
2469 else
2471 return FALSE;
2474 vfiller = create_mask_1x32_128(&filler);
2476 while (height--)
2478 int w;
2479 uint8_t *d = byte_line;
2480 byte_line += stride;
2481 w = byte_width;
2483 if (w >= 1 && ((uintptr_t)d & 1))
2485 *(uint8_t *)d = filler;
2486 w -= 1;
2487 d += 1;
2490 while (w >= 2 && ((uintptr_t)d & 3))
2492 *(uint16_t *)d = filler;
2493 w -= 2;
2494 d += 2;
2497 while (w >= 4 && ((uintptr_t)d & 15))
2499 *(uint32_t *)d = filler;
2501 w -= 4;
2502 d += 4;
2505 while (w >= 128)
2507 vec_st(vfiller, 0, (uint32_t *) d);
2508 vec_st(vfiller, 0, (uint32_t *) d + 4);
2509 vec_st(vfiller, 0, (uint32_t *) d + 8);
2510 vec_st(vfiller, 0, (uint32_t *) d + 12);
2511 vec_st(vfiller, 0, (uint32_t *) d + 16);
2512 vec_st(vfiller, 0, (uint32_t *) d + 20);
2513 vec_st(vfiller, 0, (uint32_t *) d + 24);
2514 vec_st(vfiller, 0, (uint32_t *) d + 28);
2516 d += 128;
2517 w -= 128;
2520 if (w >= 64)
2522 vec_st(vfiller, 0, (uint32_t *) d);
2523 vec_st(vfiller, 0, (uint32_t *) d + 4);
2524 vec_st(vfiller, 0, (uint32_t *) d + 8);
2525 vec_st(vfiller, 0, (uint32_t *) d + 12);
2527 d += 64;
2528 w -= 64;
2531 if (w >= 32)
2533 vec_st(vfiller, 0, (uint32_t *) d);
2534 vec_st(vfiller, 0, (uint32_t *) d + 4);
2536 d += 32;
2537 w -= 32;
2540 if (w >= 16)
2542 vec_st(vfiller, 0, (uint32_t *) d);
2544 d += 16;
2545 w -= 16;
2548 while (w >= 4)
2550 *(uint32_t *)d = filler;
2552 w -= 4;
2553 d += 4;
2556 if (w >= 2)
2558 *(uint16_t *)d = filler;
2559 w -= 2;
2560 d += 2;
2563 if (w >= 1)
2565 *(uint8_t *)d = filler;
2566 w -= 1;
2567 d += 1;
2571 return TRUE;
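/*
 * SRC fast path for x8r8g8b8 -> a8r8g8b8: the copy only needs the unused
 * alpha byte forced to 0xff, so each pixel is OR-ed with 0xff000000
 * (mask_ff000000 in the vector loop).
 */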
2574 static void
2575 vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
2576 pixman_composite_info_t *info)
2578 PIXMAN_COMPOSITE_ARGS (info);
2579 uint32_t *dst_line, *dst;
2580 uint32_t *src_line, *src;
2581 int32_t w;
2582 int dst_stride, src_stride;
2584 PIXMAN_IMAGE_GET_LINE (
2585 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2586 PIXMAN_IMAGE_GET_LINE (
2587 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2589 while (height--)
2591 dst = dst_line;
2592 dst_line += dst_stride;
2593 src = src_line;
2594 src_line += src_stride;
2595 w = width;
2597 while (w && (uintptr_t)dst & 15)
2599 *dst++ = *src++ | 0xff000000;
2600 w--;
2603 while (w >= 16)
2605 vector unsigned int vmx_src1, vmx_src2, vmx_src3, vmx_src4;
2607 vmx_src1 = load_128_unaligned (src);
2608 vmx_src2 = load_128_unaligned (src + 4);
2609 vmx_src3 = load_128_unaligned (src + 8);
2610 vmx_src4 = load_128_unaligned (src + 12);
2612 save_128_aligned (dst, vec_or (vmx_src1, mask_ff000000));
2613 save_128_aligned (dst + 4, vec_or (vmx_src2, mask_ff000000));
2614 save_128_aligned (dst + 8, vec_or (vmx_src3, mask_ff000000));
2615 save_128_aligned (dst + 12, vec_or (vmx_src4, mask_ff000000));
2617 dst += 16;
2618 src += 16;
2619 w -= 16;
2622 while (w)
2624 *dst++ = *src++ | 0xff000000;
2625 w--;
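/*
 * OVER with a solid source: dst = src + dst * (255 - src_alpha) / 255.
 * The inverse alpha is precomputed once, as the scalar ia and the vector
 * via, so the inner loops are one multiply-add per pixel.
 */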
2630 static void
2631 vmx_composite_over_n_8888 (pixman_implementation_t *imp,
2632 pixman_composite_info_t *info)
2634 PIXMAN_COMPOSITE_ARGS (info);
2635 uint32_t *dst_line, *dst;
2636 uint32_t src, ia;
2637 int i, w, dst_stride;
2638 vector unsigned int vdst, vsrc, via;
2640 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2642 if (src == 0)
2643 return;
2645 PIXMAN_IMAGE_GET_LINE (
2646 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2648 vsrc = (vector unsigned int){src, src, src, src};
2649 via = negate (splat_alpha (vsrc));
2650 ia = ALPHA_8 (~src);
2652 while (height--)
2654 dst = dst_line;
2655 dst_line += dst_stride;
2656 w = width;
2658 while (w && ((uintptr_t)dst & 15))
2660 uint32_t d = *dst;
2661 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
2662 *dst++ = d;
2663 w--;
2666 for (i = w / 4; i > 0; i--)
2668 vdst = pix_multiply (load_128_aligned (dst), via);
2669 save_128_aligned (dst, pix_add (vsrc, vdst));
2670 dst += 4;
2673 for (i = w % 4; --i >= 0;)
2675 uint32_t d = dst[i];
2676 UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
2677 dst[i] = d;
2682 static void
2683 vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
2684 pixman_composite_info_t *info)
2686 PIXMAN_COMPOSITE_ARGS (info);
2687 int dst_stride, src_stride;
2688 uint32_t *dst_line, *dst;
2689 uint32_t *src_line, *src;
2691 PIXMAN_IMAGE_GET_LINE (
2692 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2693 PIXMAN_IMAGE_GET_LINE (
2694 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2696 dst = dst_line;
2697 src = src_line;
2699 while (height--)
2701 vmx_combine_over_u (imp, op, dst, src, NULL, width);
2703 dst += dst_stride;
2704 src += src_stride;
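/*
 * Component-alpha OVER with a solid source and an a8r8g8b8 mask: per
 * channel, dst = src * mask + dst * (1 - mask * src_alpha).  The vector
 * loop skips blocks whose mask is entirely zero and otherwise uses
 * in_over() on the 16-byte aligned destination.
 */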
2708 static void
2709 vmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2710 pixman_composite_info_t *info)
2712 PIXMAN_COMPOSITE_ARGS (info);
2713 uint32_t src, ia;
2714 uint32_t *dst_line, d;
2715 uint32_t *mask_line, m;
2716 uint32_t pack_cmp;
2717 int dst_stride, mask_stride;
2719 vector unsigned int vsrc, valpha, vmask, vdest;
2721 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2723 if (src == 0)
2724 return;
2726 PIXMAN_IMAGE_GET_LINE (
2727 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2728 PIXMAN_IMAGE_GET_LINE (
2729 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2731 vsrc = (vector unsigned int) {src, src, src, src};
2732 valpha = splat_alpha(vsrc);
2733 ia = ALPHA_8 (src);
2735 while (height--)
2737 int w = width;
2738 const uint32_t *pm = (uint32_t *)mask_line;
2739 uint32_t *pd = (uint32_t *)dst_line;
2740 uint32_t s;
2742 dst_line += dst_stride;
2743 mask_line += mask_stride;
2745 while (w && (uintptr_t)pd & 15)
2747 s = src;
2748 m = *pm++;
2750 if (m)
2752 d = *pd;
2753 UN8x4_MUL_UN8x4 (s, m);
2754 UN8x4_MUL_UN8 (m, ia);
2755 m = ~m;
2756 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
2757 *pd = d;
2760 pd++;
2761 w--;
2764 while (w >= 4)
2766 /* pm is NOT necessarily 16-byte aligned */
2767 vmask = load_128_unaligned (pm);
2769 pack_cmp = vec_all_eq(vmask, (vector unsigned int) AVV(0));
2771 /* if all pixels in the mask block are zero, pack_cmp is non-zero */
2772 if (pack_cmp == 0)
2774 /* pd is 16-byte aligned */
2775 vdest = in_over (vsrc, valpha, vmask, load_128_aligned (pd));
2777 save_128_aligned(pd, vdest);
2780 pd += 4;
2781 pm += 4;
2782 w -= 4;
2785 while (w)
2787 s = src;
2788 m = *pm++;
2790 if (m)
2792 d = *pd;
2793 UN8x4_MUL_UN8x4 (s, m);
2794 UN8x4_MUL_UN8 (m, ia);
2795 m = ~m;
2796 UN8x4_MUL_UN8x4_ADD_UN8x4 (d, m, s);
2797 *pd = d;
2800 pd++;
2801 w--;
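/*
 * ADD on a8 surfaces.  The unaligned head and tail use a scalar saturating
 * add, t | (0 - (t >> 8)), which clamps the 9-bit sum to 255; the aligned
 * middle is handed to vmx_combine_add_u() four bytes at a time by treating
 * the a8 data as packed 32-bit words.
 */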
static void
vmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint16_t t;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        src = src_line;

        dst_line += dst_stride;
        src_line += src_stride;
        w = width;

        /* Small head */
        while (w && (uintptr_t)dst & 3)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }

        vmx_combine_add_u (imp, op,
                           (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);

        /* Small tail */
        dst += w & 0xfffc;
        src += w & 0xfffc;

        w &= 3;

        while (w)
        {
            t = (*dst) + (*src++);
            *dst++ = t | (0 - (t >> 8));
            w--;
        }
    }
}

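/* Saturating ADD of two 8888 images, one scanline at a time. */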
static void
vmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;

    PIXMAN_IMAGE_GET_LINE (
        src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (
        dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;

        vmx_combine_add_u (imp, op, dst, src, NULL, width);
    }
}

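/* One scanline of nearest-neighbour scaled OVER: vx steps through the
 * source in units of unit_x and wraps modulo src_width_fixed; once pd
 * is 16-byte aligned, four source pixels are gathered per iteration
 * and composited with a single vector OVER. */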
static force_inline void
scaled_nearest_scanline_vmx_8888_8888_OVER (uint32_t*       pd,
                                            const uint32_t* ps,
                                            int32_t         w,
                                            pixman_fixed_t  vx,
                                            pixman_fixed_t  unit_x,
                                            pixman_fixed_t  src_width_fixed,
                                            pixman_bool_t   fully_transparent_src)
{
    uint32_t s, d;
    const uint32_t* pm = NULL;

    vector unsigned int vsrc, vdst;

    if (fully_transparent_src)
        return;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        *pd++ = core_combine_over_u_pixel_vmx (s, d);
        if (pm)
            pm++;
        w--;
    }

    while (w >= 4)
    {
        vector unsigned int tmp;
        uint32_t tmp1, tmp2, tmp3, tmp4;

        tmp1 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp2 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp3 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;
        tmp4 = *(ps + pixman_fixed_to_int (vx));
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        tmp[0] = tmp1;
        tmp[1] = tmp2;
        tmp[2] = tmp3;
        tmp[3] = tmp4;

        vsrc = combine4 ((const uint32_t *) &tmp, pm);

        if (is_opaque (vsrc))
        {
            save_128_aligned (pd, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            vdst = over (vsrc, splat_alpha (vsrc), load_128_aligned (pd));

            save_128_aligned (pd, vdst);
        }

        w -= 4;
        pd += 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps + pixman_fixed_to_int (vx), pm);
        vx += unit_x;
        while (vx >= 0)
            vx -= src_width_fixed;

        *pd++ = core_combine_over_u_pixel_vmx (s, d);
        if (pm)
            pm++;

        w--;
    }
}

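/* Instantiate the nearest-scaling main loops around the scanline
 * function above for the COVER, NONE, PAD and NORMAL repeat modes. */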
FAST_NEAREST_MAINLOOP (vmx_8888_8888_cover_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_none_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_pad_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
                       scaled_nearest_scanline_vmx_8888_8888_OVER,
                       uint32_t, uint32_t, NORMAL)

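/* Operation/format combinations that are routed to the VMX fast paths
 * above; anything not listed falls through to the delegate. */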
static const pixman_fast_path_t vmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, vmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, vmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, vmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, vmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, vmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, vmx_composite_over_n_8888_8888_ca),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, vmx_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, vmx_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, vmx_composite_add_8888_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, vmx_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, vmx_composite_src_x888_8888),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, vmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, vmx_8888_8888),

    { PIXMAN_OP_NONE },
};

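/* Narrow-format source iterator for x8r8g8b8: copy the scanline into
 * the iterator buffer while forcing the alpha byte to 0xff, four
 * pixels per vector store. */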
static uint32_t *
vmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    vector unsigned int ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (dst, vec_or (load_128_unaligned (src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

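/* Narrow-format source iterator for a8: expand each 8-bit alpha into
 * the alpha byte of an otherwise zero a8r8g8b8 pixel, 16 pixels per
 * vector iteration. */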
static uint32_t *
vmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    vector unsigned int vmx0, vmx1, vmx2, vmx3, vmx4, vmx5, vmx6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        vmx0 = load_128_unaligned ((uint32_t *) src);

        unpack_128_2x128 ((vector unsigned int) AVV (0), vmx0, &vmx1, &vmx2);
        unpack_128_2x128_16 ((vector unsigned int) AVV (0), vmx1, &vmx3, &vmx4);
        unpack_128_2x128_16 ((vector unsigned int) AVV (0), vmx2, &vmx5, &vmx6);

        save_128_aligned (dst, vmx6);
        save_128_aligned ((dst +  4), vmx5);
        save_128_aligned ((dst +  8), vmx4);
        save_128_aligned ((dst + 12), vmx3);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}

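/* Iterators provided by this implementation.  IMAGE_FLAGS restricts
 * them to untransformed bits images whose samples cover the clip
 * region, so the fetchers can read source pixels directly. */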
#define IMAGE_FLAGS                                                     \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t vmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, vmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, vmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};

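/* Create the VMX implementation: set up the vector constants used by
 * the routines above and hook the VMX combiners, fill and iterators
 * into the implementation, delegating everything else to 'fallback'. */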
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    /* VMX constants */
    mask_ff000000 = create_mask_32_128 (0xff000000);
    mask_red = create_mask_32_128 (0x00f80000);
    mask_green = create_mask_32_128 (0x0000fc00);
    mask_blue = create_mask_32_128 (0x000000f8);
    mask_565_fix_rb = create_mask_32_128 (0x00e000e0);
    mask_565_fix_g = create_mask_32_128 (0x0000c000);

    /* Set up function pointers */

    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;

    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    imp->fill = vmx_fill;

    imp->iter_info = vmx_iters;

    return imp;
}