/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */
#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif

#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif
#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
	: "=r" (ret)
	: "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
	: "+y" (__A)
	: "y" (__B)
    );
    return __A;
}

#  define _mm_shuffle_pi16(A, N)				\
    ({								\
	__m64 ret;						\
								\
	asm ("pshufw %2, %1, %0\n\t"				\
	     : "=y" (ret)					\
	     : "y" (A), "K" ((const int8_t)N)			\
	);							\
								\
	ret;							\
    })
# endif
#endif
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)	\
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
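/* Usage note (ours, not from the original sources): _MM_SHUFFLE (d, c, b, a)
 * builds the pshufw immediate that selects source word a for result word 0,
 * b for word 1, c for word 2 and d for word 3; e.g. _MM_SHUFFLE (3, 3, 3, 3)
 * == 0xFF broadcasts the top word, which expand_alpha () below relies on. */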
/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *   ie. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */
/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(USE_ARM_IWMMXT)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif
typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif
static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};
#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
# define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
# define MC(x) c.mmx_ ## x
#endif
static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v, int s)
{
    if (s > 0)
	return _mm_slli_si64 (v, s);
    else if (s < 0)
	return _mm_srli_si64 (v, -s);
    else
	return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}
/* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
 * and maps its result to the same range.
 *
 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
 * Notation, Notation, Notation", the first of which is
 *
 *   prod (a, b) = (a * b + 128) / 255.
 *
 * By approximating the division by 255 as 257/65536 it can be replaced by a
 * multiply and a right shift. This is the implementation that we use in
 * pix_multiply(), but we _mm_mulhi_pu16() by 257 (part of SSE1 or Extended
 * 3DNow!, and unavailable at the time of the book's publication) to perform
 * the multiplication and right shift in a single operation.
 *
 *   prod (a, b) = ((a * b + 128) * 257) >> 16.
 *
 * A third way (how pix_multiply() was implemented prior to 14208344) exists
 * also that performs the multiplication by 257 with adds and shifts.
 *
 * Where temp = a * b + 128
 *
 *   prod (a, b) = (temp + (temp >> 8)) >> 8.
 */
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
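/* Illustrative scalar reference (an addition for exposition, not part of the
 * original code paths): the three formulations described above agree for
 * every a, b in [0, 255]. */
static force_inline uint8_t
scalar_mul_un8 (uint8_t a, uint8_t b)
{
    uint32_t temp = (uint32_t)a * b + 128;          /* a * b + 0.5, fixed point   */
    return (uint8_t)((temp + (temp >> 8)) >> 8);    /* == ((temp * 257) >> 16)    */
}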
static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}
static force_inline __m64
over (__m64 src, __m64 srca, __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}
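/* Scalar sketch of the operator above (a hypothetical helper of ours, for
 * exposition): per channel this is premultiplied Porter-Duff OVER,
 * result = src + dest * (255 - srca) / 255, using scalar_mul_un8 () from the
 * earlier note; the clamp mirrors the saturating _mm_adds_pu8. */
static force_inline uint8_t
scalar_over_un8 (uint8_t src, uint8_t srca, uint8_t dest)
{
    uint32_t t = src + scalar_mul_un8 (dest, 255 - srca);
    return (uint8_t)(t > 255 ? 255 : t);
}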
static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}
#else
#define in_over(src, srca, mask, dest)					\
    over (in (src, mask), pix_multiply (srca, mask), dest)
#endif
/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *(__m64 *)p;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;

    if (align == 0)
	return *p;

    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}
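/* Note (ours): the packed-struct load above is the portable idiom for
 * unaligned access; declaring the member __attribute__((packed)) makes the
 * compiler emit whatever byte-level accesses the target requires. */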
static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *p;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}
static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
	: "=f" (ret)
	: "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
	: "=m" (*dest)
	: "f" (v)
	: "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}
static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}
static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}
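/* Note (ours): pixels are in expanded 16-bit-per-channel form here, so an
 * opaque alpha of 0x00ff keeps its 0xff in byte 6; bit 6 of the pmovmskb
 * result is set exactly when that byte compared equal to all-ones. */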
static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 * 00rr00gg00bb
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
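/* Hedged scalar sketch of what expand565 () computes for one pixel (helper
 * name is ours, not pixman API): each component is widened by replicating
 * its high bits into the vacated low bits. */
static force_inline uint32_t
scalar_expand565 (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}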
/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBAARRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));

    if (full_alpha)
	alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
}
static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}
static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;

    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}
static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0 + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
	t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
	t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
	t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
	t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}
static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}
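/* For reference (our own scalar sketch, not pixman API): per pixel the
 * madd/shift dance above produces the ordinary packed-565 layout. */
static force_inline uint16_t
scalar_pack565 (uint32_t p)
{
    return (uint16_t)(((p >> 8) & 0xf800) |
                      ((p >> 5) & 0x07e0) |
                      ((p >> 3) & 0x001f));
}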
#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)	 \
    ( x = pix_multiply (x, a),	 \
      y = pix_multiply (y, b),	 \
      pix_add (x, y) )

#endif
/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
	__m64 m = load8888 (mask);

	m = expand_alpha (m);
	vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}
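/* Note (ours): combine () yields the source pixel with the mask's alpha
 * already applied (or the source unchanged when mask is NULL); every
 * mmx_combine_*_u () below starts from this value. */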
static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
	return vsrc;
    }
    else if (!is_zero (vsrc))
    {
	return over (vsrc, expand_alpha (vsrc),
		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}
static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 vsrc = combine (src, mask);

	if (is_opaque (vsrc))
	{
	    store8888 (dest, vsrc);
	}
	else if (!is_zero (vsrc))
	{
	    __m64 sa = expand_alpha (vsrc);
	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
	}

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 d, da;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	da = expand_alpha (d);
	store8888 (dest, over (d, da, s));

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a;
	__m64 x = combine (src, mask);

	a = load8888 (dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);

	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 x;
	__m64 a = combine (src, mask);

	x = load8888 (dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);
	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a;
	__m64 x = combine (src, mask);

	a = load8888 (dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);
	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 x;
	__m64 a = combine (src, mask);

	x = load8888 (dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);

	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 da, d, sia;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sia = expand_alpha (s);
	sia = negate (sia);
	da = expand_alpha (d);
	s = pix_add_mul (s, da, d, sia);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 dia, d, sa;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sa = expand_alpha (s);
	dia = expand_alpha (d);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sa);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 dia, d, sia;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sia = expand_alpha (s);
	dia = expand_alpha (d);
	sia = negate (sia);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sia);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 d;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	s = pix_add (s, d);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	uint32_t s, sa, da;
	uint32_t d = *dest;
	__m64 ms = combine (src, mask);
	__m64 md = load8888 (dest);

	store8888(&s, ms);
	da = ~d >> 24;
	sa = s >> 24;

	if (sa > da)
	{
	    uint32_t quot = DIV_UN8 (da, sa) << 24;
	    __m64 msa = load8888 (&quot);
	    msa = expand_alpha (msa);
	    ms = pix_multiply (ms, msa);
	}

	md = pix_add (md, ms);
	store8888 (dest, md);

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);

	s = pix_multiply (s, a);
	store8888 (dest, s);

	++src;
	++mask;
	++dest;
    }
    _mm_empty ();
}
static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	store8888 (dest, in_over (s, sa, a, d));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	store8888 (dest, over (d, da, in (s, a)));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	store8888 (dest, s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	d = pix_multiply (d, a);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	da = negate (da);
	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	store8888 (dest, s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_multiply (d, a);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);

	s = pix_multiply (s, a);
	d = pix_add (s, d);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

	    w--;
	    dst++;
	}

	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    v0 = over (vsrc, vsrca, v0);
	    v1 = over (vsrc, vsrca, v1);
	    v2 = over (vsrc, vsrca, v2);
	    v3 = over (vsrc, vsrca, v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint32_t *q = (uint32_t *)dst_line;

	while (twidth && (uintptr_t)q & 7)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	while (twidth >= 2)
	{
	    uint32_t m0, m1;

	    m0 = *p;
	    m1 = *(p + 1);

	    if (m0 | m1)
	    {
		__m64 dest0, dest1;
		__m64 vdest = *(__m64 *)q;

		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
		                 expand8888 (vdest, 1));

		*(__m64 *)q = pack8888 (dest0, dest1);
	    }

	    p += 2;
	    q += 2;
	    twidth -= 2;
	}

	if (twidth)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	dst_line += dst_stride;
	mask_line += mask_stride;
    }

    _mm_empty ();
}
static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 2)
	{
	    __m64 vs = ldq_u ((__m64 *)src);
	    __m64 vd = *(__m64 *)dst;
	    __m64 vsrc0 = expand8888 (vs, 0);
	    __m64 vsrc1 = expand8888 (vs, 1);

	    *(__m64 *)dst = pack8888 (
	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));

	    vd0 = pack8888 (
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = pack8888 (
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = pack8888 (
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = pack8888 (
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = pack8888 (
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = pack8888 (
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = pack8888 (
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = pack8888 (
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	while (w)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    uint8_t a;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w--)
	{
	    s = *src++;
	    a = s >> 24;

	    if (a == 0xff)
	    {
		*dst = s;
	    }
	    else if (s)
	    {
		__m64 ms, sa;

		ms = load8888 (&s);
		sa = expand_alpha (ms);
		store8888 (dst, over (ms, sa, load8888 (dst)));
	    }

	    dst++;
	}
    }
    _mm_empty ();
}
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (
		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    vsrc0 = load8888 ((src + 0));
	    vsrc1 = load8888 ((src + 1));
	    vsrc2 = load8888 ((src + 2));
	    vsrc3 = load8888 ((src + 3));

	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in_over (vsrc, vsrca,
				       expand_alpha_rev (to_m64 (m)),
				       load8888 (dst));

		store8888 (dst, vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
				 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
				 expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (dst);

		vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
		store8888 (dst, vdest);
	    }
	}
    }

    _mm_empty ();
}
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    if (bpp == 8)
    {
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
	filler = (filler & 0xff) * 0x01010101;
    }
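    /* Note (ours): multiplying the low byte by 0x01010101 (and the low 16
     * bits by 0x00010001 below) replicates the fill value across a 32-bit
     * word, so all depths can share the wide store loops further down. */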
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
	filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    __asm__ (
        "movq		%7,	%0\n"
        "movq		%7,	%1\n"
        "movq		%7,	%2\n"
        "movq		%7,	%3\n"
        "movq		%7,	%4\n"
        "movq		%7,	%5\n"
        "movq		%7,	%6\n"
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	while (w >= 64)
	{
#if defined __GNUC__ && defined USE_X86_MMX
	    __asm__ (
	        "movq	%1,	  (%0)\n"
	        "movq	%2,	 8(%0)\n"
	        "movq	%3,	16(%0)\n"
	        "movq	%4,	24(%0)\n"
	        "movq	%5,	32(%0)\n"
	        "movq	%6,	40(%0)\n"
	        "movq	%7,	48(%0)\n"
	        "movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64 *) (d + 0) = vfill;
	    *(__m64 *) (d + 8) = vfill;
	    *(__m64 *) (d + 16) = vfill;
	    *(__m64 *) (d + 24) = vfill;
	    *(__m64 *) (d + 32) = vfill;
	    *(__m64 *) (d + 40) = vfill;
	    *(__m64 *) (d + 48) = vfill;
	    *(__m64 *) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}
	if (w >= 1)
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}
    }

    return TRUE;
}
static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vdest;
	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

	    vdest = pack_4xpacked565 (vsrc0, vsrc1);

	    *(__m64 *)dst = vdest;

	    w -= 4;
	    src += 4;
	    dst += 4;
	}

	while (w)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
	          PIXMAN_FORMAT_BPP (dest_image->bits.format),
	          dest_x, dest_y, width, height, 0);
	return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

		store8888 (dst, vdest);
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 dest0, dest1;

		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }
	    else
	    {
		*(uint64_t *)dst = 0;
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (dst);

		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
		store8888 (dst, vdest);
	    }
	    else
	    {
		*dst = 0;
	    }
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    uint64_t m0, m1, m2, m3;

	    m0 = *mask;
	    m1 = *(mask + 1);
	    m2 = *(mask + 2);
	    m3 = *(mask + 3);

	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
	    {
		*(__m64 *)dst = srcsrcsrcsrc;
	    }
	    else if (m0 | m1 | m2 | m3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;
		__m64 vm0, vm1, vm2, vm3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		vm0 = to_m64 (m0);
		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

		vm1 = to_m64 (m1);
		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

		vm2 = to_m64 (m2);
		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

		vm3 = to_m64 (m3);
		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }

	    w -= 4;
	    mask += 4;
	    dst += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
				       expand565 (vd, 0));
		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    uint32_t s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		__m64 v0 = invert_colors (load8888 (&s0));
		__m64 v1 = invert_colors (load8888 (&s1));
		__m64 v2 = invert_colors (load8888 (&s2));
		__m64 v3 = invert_colors (load8888 (&s3));

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }
	    else if (s0 | s1 | s2 | s3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;

		__m64 vsrc0 = load8888 (&s0);
		__m64 vsrc1 = load8888 (&s1);
		__m64 vsrc2 = load8888 (&s2);
		__m64 vsrc3 = load8888 (&s3);

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = over_rev_non_pre (vsrc0, v0);
		v1 = over_rev_non_pre (vsrc1, v1);
		v2 = over_rev_non_pre (vsrc2, v2);
		v3 = over_rev_non_pre (vsrc3, v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 2)
	{
	    uint32_t s0, s1;
	    unsigned char a0, a1;
	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);

	    if ((a0 & a1) == 0xFF)
	    {
		d0 = invert_colors (load8888 (&s0));
		d1 = invert_colors (load8888 (&s1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (s0 | s1)
	    {
		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint16_t *q = (uint16_t *)dst_line;

	while (twidth && ((uintptr_t)q & 7))
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	while (twidth >= 4)
	{
	    uint32_t m0, m1, m2, m3;

	    m0 = *p;
	    m1 = *(p + 1);
	    m2 = *(p + 2);
	    m3 = *(p + 3);

	    if ((m0 | m1 | m2 | m3))
	    {
		__m64 vdest = *(__m64 *)q;
		__m64 v0, v1, v2, v3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
	    }
	    twidth -= 4;
	    p += 4;
	    q += 4;
	}

	while (twidth)
	{
	    uint32_t m;

	    m = *(uint32_t *)p;
	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	mask_line += mask_stride;
	dst_line += dst_stride;
    }

    _mm_empty ();
}
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	while (w--)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 3)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    uint32_t *s = (uint32_t *)src;
	    uint32_t *d = (uint32_t *)dst;

	    store8888 (d, in (load8888u (s), load8888 (d)));

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	while (w--)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)
	return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w && (uintptr_t)dst & 3)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	while (w--)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}
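	/* Note (ours): in the scalar loops, (t >> 8) is 1 exactly when d + s
	 * overflowed eight bits, so t | (0 - (t >> 8)) saturates the sum at
	 * 0xff, matching the _mm_adds_pu8 semantics of the vector loop. */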
	while (w >= 8)
	{
	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64 *)dst);
	    dst += 8;
	    src += 8;
	    w -= 8;
	}

	while (w)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t d;
    uint16_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 vsrc = ldq_u ((__m64 *)src);
	    __m64 vd0, vd1;
	    __m64 vs0, vs1;

	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

	    vd0 = _mm_adds_pu8 (vd0, vs0);
	    vd1 = _mm_adds_pu8 (vd1, vs1);

	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

	    dst += 4;
	    src += 4;
	    w -= 4;
	}

	while (w--)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	    dst++;
	    src++;
	    w--;
	}

	while (w >= 2)
	{
	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64 *)dst);
	    dst += 2;
	    src += 2;
	    w -= 2;
	}

	if (w)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	}
    }

    _mm_empty ();
}
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *src_bytes;
    uint8_t *dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = *(uint8_t *)s;
	    w -= 1;
	    s += 1;
	    d += 1;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	while (w >= 64)
	{
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
	    __asm__ (
	        "movq	  (%1),	  %%mm0\n"
	        "movq	 8(%1),	  %%mm1\n"
	        "movq	16(%1),	  %%mm2\n"
	        "movq	24(%1),	  %%mm3\n"
	        "movq	32(%1),	  %%mm4\n"
	        "movq	40(%1),	  %%mm5\n"
	        "movq	48(%1),	  %%mm6\n"
	        "movq	56(%1),	  %%mm7\n"

	        "movq	%%mm0,	  (%0)\n"
	        "movq	%%mm1,	 8(%0)\n"
	        "movq	%%mm2,	16(%0)\n"
	        "movq	%%mm3,	24(%0)\n"
	        "movq	%%mm4,	32(%0)\n"
	        "movq	%%mm5,	40(%0)\n"
	        "movq	%%mm6,	48(%0)\n"
	        "movq	%%mm7,	56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}
	while (w >= 4)
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty ();
    return TRUE;
}
static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);

    mmx_blt (imp, src_image->bits.bits,
	     dest_image->bits.bits,
	     src_image->bits.rowstride,
	     dest_image->bits.rowstride,
	     PIXMAN_FORMAT_BPP (src_image->bits.format),
	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
	     src_x, src_y, dest_x, dest_y, width, height);
}
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	while (w--)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint32_t ssrc = *src | 0xff000000;
		__m64 s = load8888 (&ssrc);

		if (m == 0xff)
		{
		    store8888 (dst, s);
		}
		else
		{
		    __m64 sa = expand_alpha (s);
		    __m64 vm = expand_alpha_rev (to_m64 (m));
		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

		    store8888 (dst, vdest);
		}
	    }

	    mask++;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
static void
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vdest = load8888 (dst);

	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));

	    w--;
	    dst++;
	}

	while (w >= 2)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 dest0 = expand8888 (vdest, 0);
	    __m64 dest1 = expand8888 (vdest, 1);

	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
	    dest1 = over (dest1, expand_alpha (dest1), vsrc);

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    __m64 vdest = load8888 (dst);

	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
	}
    }

    _mm_empty ();
}
static force_inline void
scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t       *pd,
                                            const uint32_t *ps,
                                            int32_t         w,
                                            pixman_fixed_t  vx,
                                            pixman_fixed_t  unit_x,
                                            pixman_fixed_t  src_width_fixed,
                                            pixman_bool_t   fully_transparent_src)
{
    if (fully_transparent_src)
	return;

    while (w)
    {
	__m64 d = load (pd);
	__m64 s = load (ps + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
	pd++;

	w--;
    }

    _mm_empty ();
}
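
/* vx above is a pixman_fixed_t (16.16 fixed-point) source coordinate:
 * each output pixel advances it by unit_x, and the inner while keeps it
 * wrapped for tiled (NORMAL) repeat by subtracting src_width_fixed. */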
FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, COVER)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, NONE)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, PAD)
FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
		       scaled_nearest_scanline_mmx_8888_8888_OVER,
		       uint32_t, uint32_t, NORMAL)
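
/* Each FAST_NEAREST_MAINLOOP invocation above expands into a complete
 * composite function for one repeat mode (COVER/NONE/PAD/NORMAL) that
 * walks the destination and calls the scanline routine for each row;
 * these generated functions back the SIMPLE_NEAREST_FAST_PATH entries
 * in mmx_fast_paths below. */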
static force_inline void
scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t *mask,
                                              uint32_t       *dst,
                                              const uint32_t *src,
                                              int32_t         w,
                                              pixman_fixed_t  vx,
                                              pixman_fixed_t  unit_x,
                                              pixman_fixed_t  src_width_fixed,
                                              pixman_bool_t   zero_src)
{
    __m64 mm_mask;

    if (zero_src || (*mask >> 24) == 0)
    {
	/* A workaround for https://gcc.gnu.org/PR47759 */
	_mm_empty ();
	return;
    }

    mm_mask = expand_alpha (load8888 (mask));

    while (w)
    {
	uint32_t s = *(src + pixman_fixed_to_int (vx));
	vx += unit_x;
	while (vx >= 0)
	    vx -= src_width_fixed;

	if (s)
	{
	    __m64 ms = load8888 (&s);
	    __m64 alpha = expand_alpha (ms);
	    __m64 dest = load8888 (dst);

	    store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
	}

	dst++;
	w--;
    }

    _mm_empty ();
}

FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
			      scaled_nearest_scanline_mmx_8888_n_8888_OVER,
			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)

#define BSHIFT (1 << BILINEAR_INTERPOLATION_BITS)
#define BMSK   (BSHIFT - 1)
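
/* BSHIFT is the bilinear fixed-point scale (for example 64 when
 * BILINEAR_INTERPOLATION_BITS is 6) and BMSK masks off the fractional
 * part of a coordinate expressed in that scale. */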

#define BILINEAR_DECLARE_VARIABLES					\
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);			\
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);			\
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);			\
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);		\
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);	\
    const __m64 mm_zero = _mm_setzero_si64 ();				\
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
do {										\
    /* fetch 2x2 pixel block into 2 mmx registers */				\
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
    /* vertical interpolation */						\
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
    /* calculate horizontal weights */						\
    __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
			  _mm_srli_pi16 (mm_x,					\
					 16 - BILINEAR_INTERPOLATION_BITS)));	\
    /* horizontal interpolation */						\
    __m64 p = _mm_unpacklo_pi16 (lo, hi);					\
    __m64 q = _mm_unpackhi_pi16 (lo, hi);					\
    vx += unit_x;								\
    lo = _mm_madd_pi16 (p, mm_wh);						\
    hi = _mm_madd_pi16 (q, mm_wh);						\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
    /* shift and pack the result */						\
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_packs_pi32 (lo, hi);						\
    lo = _mm_packs_pu16 (lo, lo);						\
    pix = lo;									\
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x;								\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
} while (0)
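
/* For reference only: a rough scalar sketch of the arithmetic that
 * BILINEAR_INTERPOLATE_ONE_PIXEL performs on each channel.  It is not
 * part of the build (hence #if 0); 'wl' and 'wr' stand for the
 * horizontal weights that the MMX code derives from mm_x, with
 * wt + wb == BSHIFT and wl + wr == BSHIFT. */
#if 0
static uint32_t
bilinear_channel_sketch (uint32_t tl, uint32_t tr,
			 uint32_t bl, uint32_t br,
			 int wt, int wb, int wl, int wr)
{
    /* vertical pass: blend the top row against the bottom row */
    uint32_t l = tl * wt + bl * wb;
    uint32_t r = tr * wt + br * wb;

    /* horizontal pass, then drop the 2 * BILINEAR_INTERPOLATION_BITS
     * bits of fixed-point fraction accumulated by the two passes */
    return (l * wl + r * wr) >> (BILINEAR_INTERPOLATION_BITS * 2);
}
#endif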

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
					    const uint32_t * mask,
					    const uint32_t * src_top,
					    const uint32_t * src_bottom,
					    int32_t          w,
					    int              wt,
					    int              wb,
					    pixman_fixed_t   vx,
					    pixman_fixed_t   unit_x,
					    pixman_fixed_t   max_vx,
					    pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
	store (dst, pix);
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx,
					     pixman_fixed_t   unit_x,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (!is_zero (pix1))
	{
	    pix2 = load (dst);
	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
					       const uint8_t  * mask,
					       const uint32_t * src_top,
					       const uint32_t * src_bottom,
					       int32_t          w,
					       int              wt,
					       int              wb,
					       pixman_fixed_t   vx,
					       pixman_fixed_t   unit_x,
					       pixman_fixed_t   max_vx,
					       pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	    if (m == 0xff && is_opaque (pix1))
	    {
		store (dst, pix1);
	    }
	    else
	    {
		__m64 ms, md, ma, msa;

		pix2 = load (dst);
		ma = expand_alpha_rev (to_m64 (m));
		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

		msa = expand_alpha (ms);

		store8888 (dst, (in_over (ms, msa, ma, md)));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
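
/* FLAG_HAVE_NON_SOLID_MASK tells the bilinear main loop that these
 * variants consume a genuine per-pixel a8 mask, so the generated code
 * must step through a mask scanline alongside the source. */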

static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 7)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    while (w >= 8)
    {
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    return iter->buffer;
}
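
/* The mmx_fetch_* iterator getters each convert one scanline of the
 * underlying image into a8r8g8b8 in iter->buffer: a scalar head loop
 * aligns the destination, the middle runs on whole __m64 blocks, and a
 * scalar tail finishes the remainder. */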

static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    while (w >= 4)
    {
	__m64 vsrc = ldq_u ((__m64 *)src);
	__m64 mm0, mm1;

	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

	*(__m64 *)(dst + 0) = mm0;
	*(__m64 *)(dst + 2) = mm1;

	dst += 4;
	src += 4;
	w -= 4;
    }

    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    return iter->buffer;
}

static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    while (w >= 8)
    {
	__m64 mm0 = ldq_u ((__m64 *)src);

	__m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64 (), mm0);
	__m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64 (), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    return iter->buffer;
}
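
/* In the wide loop of mmx_fetch_a8, interleaving zeroes below each a8
 * byte twice leaves the byte in the top 8 bits of its 32-bit lane,
 * which matches the scalar '*dst++ = *src++ << 24' of the head and
 * tail loops. */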

#define IMAGE_FLAGS							\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

static const pixman_iter_info_t mmx_iters[] =
{
    { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
    },
    { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
    },
    { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
      _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
    },
    { PIXMAN_null },
};

static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8),

    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888),

    { PIXMAN_OP_NONE },
};
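
/* The table above is matched in order against (op, src, mask, dest),
 * so more specific entries must precede more general ones. */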

pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->iter_info = mmx_iters;

    return imp;
}
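
/* This constructor is invoked from pixman's implementation setup when
 * the CPU advertises the corresponding SIMD feature, layering the MMX
 * fast paths and combiners on top of 'fallback'. */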

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */