[luatex.git] / source / libs / pixman / pixman-src / pixman / pixman-mmx.c
1 /*
2 * Copyright © 2004, 2005 Red Hat, Inc.
3 * Copyright © 2004 Nicholas Miell
4 * Copyright © 2005 Trolltech AS
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of Red Hat not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission. Red Hat makes no representations about the
13 * suitability of this software for any purpose. It is provided "as is"
14 * without express or implied warranty.
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23 * SOFTWARE.
25 * Author: Søren Sandmann (sandmann@redhat.com)
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
29 * Based on work by Owen Taylor
32 #ifdef HAVE_CONFIG_H
33 #include <config.h>
34 #endif
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
38 #ifdef USE_LOONGSON_MMI
39 #include <loongson-mmintrin.h>
40 #else
41 #include <mmintrin.h>
42 #endif
43 #include "pixman-private.h"
44 #include "pixman-combine32.h"
45 #include "pixman-inlines.h"
47 #ifdef VERBOSE
48 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
49 #else
50 #define CHECKPOINT()
51 #endif
53 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
54 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
55 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
56 _mm_empty (void)
60 #endif
62 #ifdef USE_X86_MMX
63 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
64 # include <xmmintrin.h>
65 # else
66 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE
67 * instructions to be generated that we don't want. Just duplicate the
68 * functions we want to use. */
69 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
70 _mm_movemask_pi8 (__m64 __A)
72 int ret;
74 asm ("pmovmskb %1, %0\n\t"
75 : "=r" (ret)
76 : "y" (__A)
79 return ret;
82 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
85 asm ("pmulhuw %1, %0\n\t"
86 : "+y" (__A)
87 : "y" (__B)
89 return __A;
92 # define _mm_shuffle_pi16(A, N) \
93 ({ \
94 __m64 ret; \
96 asm ("pshufw %2, %1, %0\n\t" \
97 : "=y" (ret) \
98 : "y" (A), "K" ((const int8_t)N) \
99 ); \
101 ret; \
103 # endif
104 #endif
106 #ifndef _MSC_VER
107 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
108 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
109 #endif
111 /* Notes about writing mmx code
113 * give memory operands as the second operand. If you give it as the
114 * first, gcc will first load it into a register, then use that
115 * register
117 * ie. use
119 * _mm_mullo_pi16 (x, mmx_constant);
121 * not
123 * _mm_mullo_pi16 (mmx_constant, x);
125 * Also try to minimize dependencies. i.e. when you need a value, try
126 * to calculate it from a value that was calculated as early as
127 * possible.
130 /* --------------- MMX primitives ------------------------------------- */
132 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be
133 * the name of the member used to access the data.
134 * If __m64 requires using mm_cvt* intrinsics functions to convert between
135 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
136 * If __m64 and uint64_t values can just be cast to each other directly,
137 * then define USE_M64_CASTS.
138 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
140 #ifdef _MSC_VER
141 # define M64_MEMBER m64_u64
142 #elif defined(__ICC)
143 # define USE_CVT_INTRINSICS
144 #elif defined(USE_LOONGSON_MMI)
145 # define USE_M64_DOUBLE
146 #elif defined(__GNUC__)
147 # define USE_M64_CASTS
148 #elif defined(__SUNPRO_C)
149 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
150 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
151 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
152 * is defined. If it is used, then the mm_cvt* intrinsics must be used.
154 # define USE_CVT_INTRINSICS
155 # else
156 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
157 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
159 # define M64_MEMBER l_
160 # endif
161 #endif
163 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
164 typedef uint64_t mmxdatafield;
165 #else
166 typedef __m64 mmxdatafield;
167 #endif
169 typedef struct
171 mmxdatafield mmx_4x00ff;
172 mmxdatafield mmx_4x0080;
173 mmxdatafield mmx_565_rgb;
174 mmxdatafield mmx_565_unpack_multiplier;
175 mmxdatafield mmx_565_pack_multiplier;
176 mmxdatafield mmx_565_r;
177 mmxdatafield mmx_565_g;
178 mmxdatafield mmx_565_b;
179 mmxdatafield mmx_packed_565_rb;
180 mmxdatafield mmx_packed_565_g;
181 mmxdatafield mmx_expand_565_g;
182 mmxdatafield mmx_expand_565_b;
183 mmxdatafield mmx_expand_565_r;
184 #ifndef USE_LOONGSON_MMI
185 mmxdatafield mmx_mask_0;
186 mmxdatafield mmx_mask_1;
187 mmxdatafield mmx_mask_2;
188 mmxdatafield mmx_mask_3;
189 #endif
190 mmxdatafield mmx_full_alpha;
191 mmxdatafield mmx_4x0101;
192 mmxdatafield mmx_ff000000;
193 } mmx_data_t;
195 #if defined(_MSC_VER)
196 # define MMXDATA_INIT(field, val) { val ## UI64 }
197 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */
198 # define MMXDATA_INIT(field, val) field = { val ## ULL }
199 #else /* mmxdatafield is an integral type */
200 # define MMXDATA_INIT(field, val) field = val ## ULL
201 #endif
203 static const mmx_data_t c =
205 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff),
206 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
207 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
208 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
209 MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
210 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
211 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
212 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
213 MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
214 MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
215 MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0),
216 MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f),
217 MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800),
218 #ifndef USE_LOONGSON_MMI
219 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
220 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
221 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff),
222 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff),
223 #endif
224 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000),
225 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101),
226 MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000),
229 #ifdef USE_CVT_INTRINSICS
230 # define MC(x) to_m64 (c.mmx_ ## x)
231 #elif defined(USE_M64_CASTS)
232 # define MC(x) ((__m64)c.mmx_ ## x)
233 #elif defined(USE_M64_DOUBLE)
234 # define MC(x) (*(__m64 *)&c.mmx_ ## x)
235 #else
236 # define MC(x) c.mmx_ ## x
237 #endif
239 static force_inline __m64
240 to_m64 (uint64_t x)
242 #ifdef USE_CVT_INTRINSICS
243 return _mm_cvtsi64_m64 (x);
244 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
245 __m64 res;
247 res.M64_MEMBER = x;
248 return res;
249 #elif defined USE_M64_DOUBLE
250 return *(__m64 *)&x;
251 #else /* USE_M64_CASTS */
252 return (__m64)x;
253 #endif
256 static force_inline uint64_t
257 to_uint64 (__m64 x)
259 #ifdef USE_CVT_INTRINSICS
260 return _mm_cvtm64_si64 (x);
261 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */
262 uint64_t res = x.M64_MEMBER;
263 return res;
264 #elif defined USE_M64_DOUBLE
265 return *(uint64_t *)&x;
266 #else /* USE_M64_CASTS */
267 return (uint64_t)x;
268 #endif
271 static force_inline __m64
272 shift (__m64 v,
273 int s)
275 if (s > 0)
276 return _mm_slli_si64 (v, s);
277 else if (s < 0)
278 return _mm_srli_si64 (v, -s);
279 else
280 return v;
283 static force_inline __m64
284 negate (__m64 mask)
286 return _mm_xor_si64 (mask, MC (4x00ff));
289 /* Computes the product of two unsigned fixed-point 8-bit values from 0 to 1
290 * and maps its result to the same range.
292 * Jim Blinn gives multiple ways to compute this in "Jim Blinn's Corner:
293 * Notation, Notation, Notation", the first of which is
295 * prod(a, b) = (a * b + 128) / 255.
297 * By approximating the division by 255 as 257/65536 it can be replaced by a
298 * multiply and a right shift. This is the implementation that we use in
299 * pix_multiply(), but we use _mm_mulhi_pu16() to multiply by 257 (part of SSE1
300 * or Extended 3DNow!, and unavailable at the time of the book's publication),
301 * which performs the multiplication and right shift in a single operation.
303 * prod(a, b) = ((a * b + 128) * 257) >> 16.
305 * A third way (how pix_multiply() was implemented prior to 14208344) exists
306 * also that performs the multiplication by 257 with adds and shifts.
308 * Where temp = a * b + 128
310 * prod(a, b) = (temp + (temp >> 8)) >> 8.
312 static force_inline __m64
313 pix_multiply (__m64 a, __m64 b)
315 __m64 res;
317 res = _mm_mullo_pi16 (a, b);
318 res = _mm_adds_pu16 (res, MC (4x0080));
319 res = _mm_mulhi_pu16 (res, MC (4x0101));
321 return res;
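/* A scalar reference sketch of the two integer formulations above: the
 * mulhi-by-257 form that pix_multiply() uses and the older add/shift form.
 * The helper names are illustrative only and not part of pixman; the block
 * is kept under "#if 0" so it does not affect the build.
 */
#if 0
static uint8_t
prod_mulhi257 (uint8_t a, uint8_t b)   /* ((a * b + 128) * 257) >> 16 */
{
    uint32_t t = (uint32_t)a * b + 128;
    return (uint8_t) ((t * 257) >> 16);
}

static uint8_t
prod_add_shift (uint8_t a, uint8_t b)  /* (temp + (temp >> 8)) >> 8 */
{
    uint32_t t = (uint32_t)a * b + 128;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* Exhaustive check that the two formulations agree for all 8-bit inputs. */
static int
prod_variants_agree (void)
{
    unsigned a, b;
    for (a = 0; a < 256; a++)
	for (b = 0; b < 256; b++)
	    if (prod_mulhi257 (a, b) != prod_add_shift (a, b))
		return 0;
    return 1;
}
#endif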
324 static force_inline __m64
325 pix_add (__m64 a, __m64 b)
327 return _mm_adds_pu8 (a, b);
330 static force_inline __m64
331 expand_alpha (__m64 pixel)
333 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
336 static force_inline __m64
337 expand_alpha_rev (__m64 pixel)
339 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
342 static force_inline __m64
343 invert_colors (__m64 pixel)
345 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
348 static force_inline __m64
349 over (__m64 src,
350 __m64 srca,
351 __m64 dest)
353 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
356 static force_inline __m64
357 over_rev_non_pre (__m64 src, __m64 dest)
359 __m64 srca = expand_alpha (src);
360 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
362 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
365 static force_inline __m64
366 in (__m64 src, __m64 mask)
368 return pix_multiply (src, mask);
371 #ifndef _MSC_VER
372 static force_inline __m64
373 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
375 return over (in (src, mask), pix_multiply (srca, mask), dest);
378 #else
380 #define in_over(src, srca, mask, dest) \
381 over (in (src, mask), pix_multiply (srca, mask), dest)
383 #endif
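/* A per-channel scalar sketch of what over() and in_over() compute for
 * premultiplied ARGB data, using the rounded multiply implemented by
 * pix_multiply() above.  The *_ref names are illustrative only and not part
 * of pixman; kept under "#if 0" so it does not affect the build.
 */
#if 0
static uint8_t
mul_un8_ref (uint8_t a, uint8_t b)          /* rounded a * b / 255 */
{
    uint32_t t = (uint32_t)a * b + 128;
    return (uint8_t) ((t + (t >> 8)) >> 8);
}

/* over(): src + (1 - srca) * dest, saturating like _mm_adds_pu8 */
static uint8_t
over_ref (uint8_t src, uint8_t srca, uint8_t dest)
{
    uint32_t t = src + mul_un8_ref (dest, 255 - srca);
    return (uint8_t) (t > 255 ? 255 : t);
}

/* in_over(): the mask scales both the source and the source alpha
 * before the over() step (component-alpha compositing).            */
static uint8_t
in_over_ref (uint8_t src, uint8_t srca, uint8_t mask, uint8_t dest)
{
    return over_ref (mul_un8_ref (src, mask),
		     mul_un8_ref (srca, mask),
		     dest);
}
#endif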
385 /* Elemental unaligned loads */
387 static force_inline __m64 ldq_u(__m64 *p)
389 #ifdef USE_X86_MMX
390 /* x86's alignment restrictions are very relaxed. */
391 return *(__m64 *)p;
392 #elif defined USE_ARM_IWMMXT
393 int align = (uintptr_t)p & 7;
394 __m64 *aligned_p;
395 if (align == 0)
396 return *p;
397 aligned_p = (__m64 *)((uintptr_t)p & ~7);
398 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
399 #else
400 struct __una_u64 { __m64 x __attribute__((packed)); };
401 const struct __una_u64 *ptr = (const struct __una_u64 *) p;
402 return (__m64) ptr->x;
403 #endif
406 static force_inline uint32_t ldl_u(const uint32_t *p)
408 #ifdef USE_X86_MMX
409 /* x86's alignment restrictions are very relaxed. */
410 return *p;
411 #else
412 struct __una_u32 { uint32_t x __attribute__((packed)); };
413 const struct __una_u32 *ptr = (const struct __una_u32 *) p;
414 return ptr->x;
415 #endif
418 static force_inline __m64
419 load (const uint32_t *v)
421 #ifdef USE_LOONGSON_MMI
422 __m64 ret;
423 asm ("lwc1 %0, %1\n\t"
424 : "=f" (ret)
425 : "m" (*v)
427 return ret;
428 #else
429 return _mm_cvtsi32_si64 (*v);
430 #endif
433 static force_inline __m64
434 load8888 (const uint32_t *v)
436 #ifdef USE_LOONGSON_MMI
437 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
438 #else
439 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
440 #endif
443 static force_inline __m64
444 load8888u (const uint32_t *v)
446 uint32_t l = ldl_u (v);
447 return load8888 (&l);
450 static force_inline __m64
451 pack8888 (__m64 lo, __m64 hi)
453 return _mm_packs_pu16 (lo, hi);
456 static force_inline void
457 store (uint32_t *dest, __m64 v)
459 #ifdef USE_LOONGSON_MMI
460 asm ("swc1 %1, %0\n\t"
461 : "=m" (*dest)
462 : "f" (v)
463 : "memory"
465 #else
466 *dest = _mm_cvtsi64_si32 (v);
467 #endif
470 static force_inline void
471 store8888 (uint32_t *dest, __m64 v)
473 v = pack8888 (v, _mm_setzero_si64 ());
474 store (dest, v);
477 static force_inline pixman_bool_t
478 is_equal (__m64 a, __m64 b)
480 #ifdef USE_LOONGSON_MMI
481 /* __m64 is double, we can compare directly. */
482 return a == b;
483 #else
484 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
485 #endif
488 static force_inline pixman_bool_t
489 is_opaque (__m64 v)
491 #ifdef USE_LOONGSON_MMI
492 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
493 #else
494 __m64 ffs = _mm_cmpeq_pi8 (v, v);
495 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); /* bit 6 is the low byte of the expanded alpha lane */
496 #endif
499 static force_inline pixman_bool_t
500 is_zero (__m64 v)
502 return is_equal (v, _mm_setzero_si64 ());
505 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
507 * 00RR00GG00BB
509 * --- Expanding 565 in the low word ---
511 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
512 * m = m & (01f0003f001f);
513 * m = m * (008404100840);
514 * m = m >> 8;
516 * Note the trick here - the top word is shifted by another nibble to
517 * avoid it bumping into the middle word
519 static force_inline __m64
520 expand565 (__m64 pixel, int pos)
522 __m64 p = pixel;
523 __m64 t1, t2;
525 /* move pixel to low 16 bit and zero the rest */
526 #ifdef USE_LOONGSON_MMI
527 p = loongson_extract_pi16 (p, pos);
528 #else
529 p = shift (shift (p, (3 - pos) * 16), -48);
530 #endif
532 t1 = shift (p, 36 - 11);
533 t2 = shift (p, 16 - 5);
535 p = _mm_or_si64 (t1, p);
536 p = _mm_or_si64 (t2, p);
537 p = _mm_and_si64 (p, MC (565_rgb));
539 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
540 return _mm_srli_pi16 (pixel, 8);
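/* A scalar sketch of the multiply trick used by expand565() above, checked
 * against the usual "replicate high bits" expansion.  The lane constants
 * 0x0084/0x0410/0x0840 are the three words of mmx_565_unpack_multiplier,
 * and the red value enters its lane pre-shifted by a nibble, as in the MMX
 * path.  Names are illustrative only and not part of pixman; kept under
 * "#if 0" so it does not affect the build.
 */
#if 0
static uint32_t
expand565_multiplier_ref (uint16_t p)       /* returns 0x00RRGGBB */
{
    uint16_t r_lane = (uint16_t) (((p >> 11) & 0x1f) << 4);  /* red, nibble-shifted */
    uint16_t g_lane = (uint16_t) ((p >> 5) & 0x3f);
    uint16_t b_lane = (uint16_t) (p & 0x1f);

    uint32_t r = ((uint32_t)r_lane * 0x0084) >> 8;   /* per-lane mullo then >> 8 */
    uint32_t g = ((uint32_t)g_lane * 0x0410) >> 8;
    uint32_t b = ((uint32_t)b_lane * 0x0840) >> 8;

    return (r << 16) | (g << 8) | b;
}

static uint32_t
expand565_replicate_ref (uint16_t p)        /* replicate high bits into low bits */
{
    uint32_t r = (p >> 11) & 0x1f, g = (p >> 5) & 0x3f, b = p & 0x1f;

    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return (r << 16) | (g << 8) | b;
}

/* Exhaustive check that both expansions agree for every 16-bit pixel. */
static int
expand565_variants_agree (void)
{
    uint32_t p;
    for (p = 0; p < 0x10000; p++)
	if (expand565_multiplier_ref ((uint16_t)p) != expand565_replicate_ref ((uint16_t)p))
	    return 0;
    return 1;
}
#endif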
543 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of
545 * AARRGGBBAARRGGBB
547 static force_inline void
548 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
550 __m64 t0, t1, alpha = _mm_setzero_si64 ();
551 __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
552 __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
553 __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
554 if (full_alpha)
555 alpha = _mm_cmpeq_pi32 (alpha, alpha);
557 /* Replicate high bits into empty low bits. */
558 r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
559 g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
560 b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
562 r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
563 g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
564 b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */
566 t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */
567 t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */
569 *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */
570 *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */
573 static force_inline __m64
574 expand8888 (__m64 in, int pos)
576 if (pos == 0)
577 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
578 else
579 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
582 static force_inline __m64
583 expandx888 (__m64 in, int pos)
585 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
588 static force_inline void
589 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
591 __m64 v0, v1;
592 expand_4xpacked565 (vin, &v0, &v1, full_alpha);
593 *vout0 = expand8888 (v0, 0);
594 *vout1 = expand8888 (v0, 1);
595 *vout2 = expand8888 (v1, 0);
596 *vout3 = expand8888 (v1, 1);
599 static force_inline __m64
600 pack_565 (__m64 pixel, __m64 target, int pos)
602 __m64 p = pixel;
603 __m64 t = target;
604 __m64 r, g, b;
606 r = _mm_and_si64 (p, MC (565_r));
607 g = _mm_and_si64 (p, MC (565_g));
608 b = _mm_and_si64 (p, MC (565_b));
610 #ifdef USE_LOONGSON_MMI
611 r = shift (r, -(32 - 8));
612 g = shift (g, -(16 - 3));
613 b = shift (b, -(0 + 3));
615 p = _mm_or_si64 (r, g);
616 p = _mm_or_si64 (p, b);
617 return loongson_insert_pi16 (t, p, pos);
618 #else
619 r = shift (r, -(32 - 8) + pos * 16);
620 g = shift (g, -(16 - 3) + pos * 16);
621 b = shift (b, -(0 + 3) + pos * 16);
623 if (pos == 0)
624 t = _mm_and_si64 (t, MC (mask_0));
625 else if (pos == 1)
626 t = _mm_and_si64 (t, MC (mask_1));
627 else if (pos == 2)
628 t = _mm_and_si64 (t, MC (mask_2));
629 else if (pos == 3)
630 t = _mm_and_si64 (t, MC (mask_3));
632 p = _mm_or_si64 (r, t);
633 p = _mm_or_si64 (g, p);
635 return _mm_or_si64 (b, p);
636 #endif
639 static force_inline __m64
640 pack_4xpacked565 (__m64 a, __m64 b)
642 __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
643 __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
645 __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
646 __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
648 __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
649 __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
651 t0 = _mm_or_si64 (t0, g0);
652 t1 = _mm_or_si64 (t1, g1);
654 t0 = shift(t0, -5);
655 #ifdef USE_ARM_IWMMXT
656 t1 = shift(t1, -5);
657 return _mm_packs_pu32 (t0, t1);
658 #else
659 t1 = shift(t1, -5 + 16);
660 return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
661 #endif
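/* A scalar sketch of the 565 packing performed by pack_4xpacked565() above:
 * the masked red/blue pair is combined with one multiply-accumulate (what
 * _mm_madd_pi16 computes per pixel), the green field is OR-ed in, and the
 * result is shifted right by 5.  Shown next to the plain shift/mask form,
 * which yields the same r5g6b5 value.  Names are illustrative only and not
 * part of pixman; kept under "#if 0" so it does not affect the build.
 */
#if 0
static uint16_t
pack565_madd_ref (uint32_t s)               /* s is an a8r8g8b8 pixel */
{
    uint32_t rb = s & 0x00f800f8;           /* top 5 bits of R and B */
    uint32_t g  = s & 0x0000fc00;           /* top 6 bits of G       */

    /* madd per pixel: high 16-bit half * 0x2000 + low 16-bit half * 0x0004 */
    uint32_t t = (rb >> 16) * 0x2000 + (rb & 0xffff) * 0x0004;

    return (uint16_t) ((t | g) >> 5);
}

static uint16_t
pack565_shift_ref (uint32_t s)              /* the usual shift/mask form */
{
    return (uint16_t) (((s >> 8) & 0xf800) |
		       ((s >> 5) & 0x07e0) |
		       ((s >> 3) & 0x001f));
}
#endif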
664 #ifndef _MSC_VER
666 static force_inline __m64
667 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
669 return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
672 static force_inline __m64
673 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
675 x = pix_multiply (x, a);
676 y = pix_multiply (y, b);
678 return pix_add (x, y);
681 #else
683 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */
685 #define pack_4x565(v0, v1, v2, v3) \
686 pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
688 #define pix_add_mul(x, a, y, b) \
689 ( x = pix_multiply (x, a), \
690 y = pix_multiply (y, b), \
691 pix_add (x, y) )
693 #endif
695 /* --------------- MMX code patch for fbcompose.c --------------------- */
697 static force_inline __m64
698 combine (const uint32_t *src, const uint32_t *mask)
700 __m64 vsrc = load8888 (src);
702 if (mask)
704 __m64 m = load8888 (mask);
706 m = expand_alpha (m);
707 vsrc = pix_multiply (vsrc, m);
710 return vsrc;
713 static force_inline __m64
714 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
716 vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
718 if (is_opaque (vsrc))
720 return vsrc;
722 else if (!is_zero (vsrc))
724 return over (vsrc, expand_alpha (vsrc),
725 _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
728 return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
731 static void
732 mmx_combine_over_u (pixman_implementation_t *imp,
733 pixman_op_t op,
734 uint32_t * dest,
735 const uint32_t * src,
736 const uint32_t * mask,
737 int width)
739 const uint32_t *end = dest + width;
741 while (dest < end)
743 __m64 vsrc = combine (src, mask);
745 if (is_opaque (vsrc))
747 store8888 (dest, vsrc);
749 else if (!is_zero (vsrc))
751 __m64 sa = expand_alpha (vsrc);
752 store8888 (dest, over (vsrc, sa, load8888 (dest)));
755 ++dest;
756 ++src;
757 if (mask)
758 ++mask;
760 _mm_empty ();
763 static void
764 mmx_combine_over_reverse_u (pixman_implementation_t *imp,
765 pixman_op_t op,
766 uint32_t * dest,
767 const uint32_t * src,
768 const uint32_t * mask,
769 int width)
771 const uint32_t *end = dest + width;
773 while (dest < end)
775 __m64 d, da;
776 __m64 s = combine (src, mask);
778 d = load8888 (dest);
779 da = expand_alpha (d);
780 store8888 (dest, over (d, da, s));
782 ++dest;
783 ++src;
784 if (mask)
785 mask++;
787 _mm_empty ();
790 static void
791 mmx_combine_in_u (pixman_implementation_t *imp,
792 pixman_op_t op,
793 uint32_t * dest,
794 const uint32_t * src,
795 const uint32_t * mask,
796 int width)
798 const uint32_t *end = dest + width;
800 while (dest < end)
802 __m64 a;
803 __m64 x = combine (src, mask);
805 a = load8888 (dest);
806 a = expand_alpha (a);
807 x = pix_multiply (x, a);
809 store8888 (dest, x);
811 ++dest;
812 ++src;
813 if (mask)
814 mask++;
816 _mm_empty ();
819 static void
820 mmx_combine_in_reverse_u (pixman_implementation_t *imp,
821 pixman_op_t op,
822 uint32_t * dest,
823 const uint32_t * src,
824 const uint32_t * mask,
825 int width)
827 const uint32_t *end = dest + width;
829 while (dest < end)
831 __m64 a = combine (src, mask);
832 __m64 x;
834 x = load8888 (dest);
835 a = expand_alpha (a);
836 x = pix_multiply (x, a);
837 store8888 (dest, x);
839 ++dest;
840 ++src;
841 if (mask)
842 mask++;
844 _mm_empty ();
847 static void
848 mmx_combine_out_u (pixman_implementation_t *imp,
849 pixman_op_t op,
850 uint32_t * dest,
851 const uint32_t * src,
852 const uint32_t * mask,
853 int width)
855 const uint32_t *end = dest + width;
857 while (dest < end)
859 __m64 a;
860 __m64 x = combine (src, mask);
862 a = load8888 (dest);
863 a = expand_alpha (a);
864 a = negate (a);
865 x = pix_multiply (x, a);
866 store8888 (dest, x);
868 ++dest;
869 ++src;
870 if (mask)
871 mask++;
873 _mm_empty ();
876 static void
877 mmx_combine_out_reverse_u (pixman_implementation_t *imp,
878 pixman_op_t op,
879 uint32_t * dest,
880 const uint32_t * src,
881 const uint32_t * mask,
882 int width)
884 const uint32_t *end = dest + width;
886 while (dest < end)
888 __m64 a = combine (src, mask);
889 __m64 x;
891 x = load8888 (dest);
892 a = expand_alpha (a);
893 a = negate (a);
894 x = pix_multiply (x, a);
896 store8888 (dest, x);
898 ++dest;
899 ++src;
900 if (mask)
901 mask++;
903 _mm_empty ();
906 static void
907 mmx_combine_atop_u (pixman_implementation_t *imp,
908 pixman_op_t op,
909 uint32_t * dest,
910 const uint32_t * src,
911 const uint32_t * mask,
912 int width)
914 const uint32_t *end = dest + width;
916 while (dest < end)
918 __m64 da, d, sia;
919 __m64 s = combine (src, mask);
921 d = load8888 (dest);
922 sia = expand_alpha (s);
923 sia = negate (sia);
924 da = expand_alpha (d);
925 s = pix_add_mul (s, da, d, sia);
926 store8888 (dest, s);
928 ++dest;
929 ++src;
930 if (mask)
931 mask++;
933 _mm_empty ();
936 static void
937 mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
938 pixman_op_t op,
939 uint32_t * dest,
940 const uint32_t * src,
941 const uint32_t * mask,
942 int width)
944 const uint32_t *end;
946 end = dest + width;
948 while (dest < end)
950 __m64 dia, d, sa;
951 __m64 s = combine (src, mask);
953 d = load8888 (dest);
954 sa = expand_alpha (s);
955 dia = expand_alpha (d);
956 dia = negate (dia);
957 s = pix_add_mul (s, dia, d, sa);
958 store8888 (dest, s);
960 ++dest;
961 ++src;
962 if (mask)
963 mask++;
965 _mm_empty ();
968 static void
969 mmx_combine_xor_u (pixman_implementation_t *imp,
970 pixman_op_t op,
971 uint32_t * dest,
972 const uint32_t * src,
973 const uint32_t * mask,
974 int width)
976 const uint32_t *end = dest + width;
978 while (dest < end)
980 __m64 dia, d, sia;
981 __m64 s = combine (src, mask);
983 d = load8888 (dest);
984 sia = expand_alpha (s);
985 dia = expand_alpha (d);
986 sia = negate (sia);
987 dia = negate (dia);
988 s = pix_add_mul (s, dia, d, sia);
989 store8888 (dest, s);
991 ++dest;
992 ++src;
993 if (mask)
994 mask++;
996 _mm_empty ();
999 static void
1000 mmx_combine_add_u (pixman_implementation_t *imp,
1001 pixman_op_t op,
1002 uint32_t * dest,
1003 const uint32_t * src,
1004 const uint32_t * mask,
1005 int width)
1007 const uint32_t *end = dest + width;
1009 while (dest < end)
1011 __m64 d;
1012 __m64 s = combine (src, mask);
1014 d = load8888 (dest);
1015 s = pix_add (s, d);
1016 store8888 (dest, s);
1018 ++dest;
1019 ++src;
1020 if (mask)
1021 mask++;
1023 _mm_empty ();
1026 static void
1027 mmx_combine_saturate_u (pixman_implementation_t *imp,
1028 pixman_op_t op,
1029 uint32_t * dest,
1030 const uint32_t * src,
1031 const uint32_t * mask,
1032 int width)
1034 const uint32_t *end = dest + width;
1036 while (dest < end)
1038 uint32_t s, sa, da;
1039 uint32_t d = *dest;
1040 __m64 ms = combine (src, mask);
1041 __m64 md = load8888 (dest);
1043 store8888(&s, ms);
1044 da = ~d >> 24;
1045 sa = s >> 24;
1047 if (sa > da)
1049 uint32_t quot = DIV_UN8 (da, sa) << 24;
1050 __m64 msa = load8888 (&quot);
1051 msa = expand_alpha (msa);
1052 ms = pix_multiply (ms, msa);
1055 md = pix_add (md, ms);
1056 store8888 (dest, md);
1058 ++src;
1059 ++dest;
1060 if (mask)
1061 mask++;
1063 _mm_empty ();
1066 static void
1067 mmx_combine_src_ca (pixman_implementation_t *imp,
1068 pixman_op_t op,
1069 uint32_t * dest,
1070 const uint32_t * src,
1071 const uint32_t * mask,
1072 int width)
1074 const uint32_t *end = src + width;
1076 while (src < end)
1078 __m64 a = load8888 (mask);
1079 __m64 s = load8888 (src);
1081 s = pix_multiply (s, a);
1082 store8888 (dest, s);
1084 ++src;
1085 ++mask;
1086 ++dest;
1088 _mm_empty ();
1091 static void
1092 mmx_combine_over_ca (pixman_implementation_t *imp,
1093 pixman_op_t op,
1094 uint32_t * dest,
1095 const uint32_t * src,
1096 const uint32_t * mask,
1097 int width)
1099 const uint32_t *end = src + width;
1101 while (src < end)
1103 __m64 a = load8888 (mask);
1104 __m64 s = load8888 (src);
1105 __m64 d = load8888 (dest);
1106 __m64 sa = expand_alpha (s);
1108 store8888 (dest, in_over (s, sa, a, d));
1110 ++src;
1111 ++dest;
1112 ++mask;
1114 _mm_empty ();
1117 static void
1118 mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
1119 pixman_op_t op,
1120 uint32_t * dest,
1121 const uint32_t * src,
1122 const uint32_t * mask,
1123 int width)
1125 const uint32_t *end = src + width;
1127 while (src < end)
1129 __m64 a = load8888 (mask);
1130 __m64 s = load8888 (src);
1131 __m64 d = load8888 (dest);
1132 __m64 da = expand_alpha (d);
1134 store8888 (dest, over (d, da, in (s, a)));
1136 ++src;
1137 ++dest;
1138 ++mask;
1140 _mm_empty ();
1143 static void
1144 mmx_combine_in_ca (pixman_implementation_t *imp,
1145 pixman_op_t op,
1146 uint32_t * dest,
1147 const uint32_t * src,
1148 const uint32_t * mask,
1149 int width)
1151 const uint32_t *end = src + width;
1153 while (src < end)
1155 __m64 a = load8888 (mask);
1156 __m64 s = load8888 (src);
1157 __m64 d = load8888 (dest);
1158 __m64 da = expand_alpha (d);
1160 s = pix_multiply (s, a);
1161 s = pix_multiply (s, da);
1162 store8888 (dest, s);
1164 ++src;
1165 ++dest;
1166 ++mask;
1168 _mm_empty ();
1171 static void
1172 mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
1173 pixman_op_t op,
1174 uint32_t * dest,
1175 const uint32_t * src,
1176 const uint32_t * mask,
1177 int width)
1179 const uint32_t *end = src + width;
1181 while (src < end)
1183 __m64 a = load8888 (mask);
1184 __m64 s = load8888 (src);
1185 __m64 d = load8888 (dest);
1186 __m64 sa = expand_alpha (s);
1188 a = pix_multiply (a, sa);
1189 d = pix_multiply (d, a);
1190 store8888 (dest, d);
1192 ++src;
1193 ++dest;
1194 ++mask;
1196 _mm_empty ();
1199 static void
1200 mmx_combine_out_ca (pixman_implementation_t *imp,
1201 pixman_op_t op,
1202 uint32_t * dest,
1203 const uint32_t * src,
1204 const uint32_t * mask,
1205 int width)
1207 const uint32_t *end = src + width;
1209 while (src < end)
1211 __m64 a = load8888 (mask);
1212 __m64 s = load8888 (src);
1213 __m64 d = load8888 (dest);
1214 __m64 da = expand_alpha (d);
1216 da = negate (da);
1217 s = pix_multiply (s, a);
1218 s = pix_multiply (s, da);
1219 store8888 (dest, s);
1221 ++src;
1222 ++dest;
1223 ++mask;
1225 _mm_empty ();
1228 static void
1229 mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
1230 pixman_op_t op,
1231 uint32_t * dest,
1232 const uint32_t * src,
1233 const uint32_t * mask,
1234 int width)
1236 const uint32_t *end = src + width;
1238 while (src < end)
1240 __m64 a = load8888 (mask);
1241 __m64 s = load8888 (src);
1242 __m64 d = load8888 (dest);
1243 __m64 sa = expand_alpha (s);
1245 a = pix_multiply (a, sa);
1246 a = negate (a);
1247 d = pix_multiply (d, a);
1248 store8888 (dest, d);
1250 ++src;
1251 ++dest;
1252 ++mask;
1254 _mm_empty ();
1257 static void
1258 mmx_combine_atop_ca (pixman_implementation_t *imp,
1259 pixman_op_t op,
1260 uint32_t * dest,
1261 const uint32_t * src,
1262 const uint32_t * mask,
1263 int width)
1265 const uint32_t *end = src + width;
1267 while (src < end)
1269 __m64 a = load8888 (mask);
1270 __m64 s = load8888 (src);
1271 __m64 d = load8888 (dest);
1272 __m64 da = expand_alpha (d);
1273 __m64 sa = expand_alpha (s);
1275 s = pix_multiply (s, a);
1276 a = pix_multiply (a, sa);
1277 a = negate (a);
1278 d = pix_add_mul (d, a, s, da);
1279 store8888 (dest, d);
1281 ++src;
1282 ++dest;
1283 ++mask;
1285 _mm_empty ();
1288 static void
1289 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
1290 pixman_op_t op,
1291 uint32_t * dest,
1292 const uint32_t * src,
1293 const uint32_t * mask,
1294 int width)
1296 const uint32_t *end = src + width;
1298 while (src < end)
1300 __m64 a = load8888 (mask);
1301 __m64 s = load8888 (src);
1302 __m64 d = load8888 (dest);
1303 __m64 da = expand_alpha (d);
1304 __m64 sa = expand_alpha (s);
1306 s = pix_multiply (s, a);
1307 a = pix_multiply (a, sa);
1308 da = negate (da);
1309 d = pix_add_mul (d, a, s, da);
1310 store8888 (dest, d);
1312 ++src;
1313 ++dest;
1314 ++mask;
1316 _mm_empty ();
1319 static void
1320 mmx_combine_xor_ca (pixman_implementation_t *imp,
1321 pixman_op_t op,
1322 uint32_t * dest,
1323 const uint32_t * src,
1324 const uint32_t * mask,
1325 int width)
1327 const uint32_t *end = src + width;
1329 while (src < end)
1331 __m64 a = load8888 (mask);
1332 __m64 s = load8888 (src);
1333 __m64 d = load8888 (dest);
1334 __m64 da = expand_alpha (d);
1335 __m64 sa = expand_alpha (s);
1337 s = pix_multiply (s, a);
1338 a = pix_multiply (a, sa);
1339 da = negate (da);
1340 a = negate (a);
1341 d = pix_add_mul (d, a, s, da);
1342 store8888 (dest, d);
1344 ++src;
1345 ++dest;
1346 ++mask;
1348 _mm_empty ();
1351 static void
1352 mmx_combine_add_ca (pixman_implementation_t *imp,
1353 pixman_op_t op,
1354 uint32_t * dest,
1355 const uint32_t * src,
1356 const uint32_t * mask,
1357 int width)
1359 const uint32_t *end = src + width;
1361 while (src < end)
1363 __m64 a = load8888 (mask);
1364 __m64 s = load8888 (src);
1365 __m64 d = load8888 (dest);
1367 s = pix_multiply (s, a);
1368 d = pix_add (s, d);
1369 store8888 (dest, d);
1371 ++src;
1372 ++dest;
1373 ++mask;
1375 _mm_empty ();
1378 /* ------------- MMX code paths called from fbpict.c -------------------- */
1380 static void
1381 mmx_composite_over_n_8888 (pixman_implementation_t *imp,
1382 pixman_composite_info_t *info)
1384 PIXMAN_COMPOSITE_ARGS (info);
1385 uint32_t src;
1386 uint32_t *dst_line, *dst;
1387 int32_t w;
1388 int dst_stride;
1389 __m64 vsrc, vsrca;
1391 CHECKPOINT ();
1393 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1395 if (src == 0)
1396 return;
1398 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1400 vsrc = load8888 (&src);
1401 vsrca = expand_alpha (vsrc);
1403 while (height--)
1405 dst = dst_line;
1406 dst_line += dst_stride;
1407 w = width;
1409 CHECKPOINT ();
1411 while (w && (uintptr_t)dst & 7)
1413 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1415 w--;
1416 dst++;
1419 while (w >= 2)
1421 __m64 vdest;
1422 __m64 dest0, dest1;
1424 vdest = *(__m64 *)dst;
1426 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
1427 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
1429 *(__m64 *)dst = pack8888 (dest0, dest1);
1431 dst += 2;
1432 w -= 2;
1435 CHECKPOINT ();
1437 if (w)
1439 store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
1443 _mm_empty ();
1446 static void
1447 mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1448 pixman_composite_info_t *info)
1450 PIXMAN_COMPOSITE_ARGS (info);
1451 uint32_t src;
1452 uint16_t *dst_line, *dst;
1453 int32_t w;
1454 int dst_stride;
1455 __m64 vsrc, vsrca;
1457 CHECKPOINT ();
1459 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1461 if (src == 0)
1462 return;
1464 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1466 vsrc = load8888 (&src);
1467 vsrca = expand_alpha (vsrc);
1469 while (height--)
1471 dst = dst_line;
1472 dst_line += dst_stride;
1473 w = width;
1475 CHECKPOINT ();
1477 while (w && (uintptr_t)dst & 7)
1479 uint64_t d = *dst;
1480 __m64 vdest = expand565 (to_m64 (d), 0);
1482 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1483 *dst = to_uint64 (vdest);
1485 w--;
1486 dst++;
1489 while (w >= 4)
1491 __m64 vdest = *(__m64 *)dst;
1492 __m64 v0, v1, v2, v3;
1494 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1496 v0 = over (vsrc, vsrca, v0);
1497 v1 = over (vsrc, vsrca, v1);
1498 v2 = over (vsrc, vsrca, v2);
1499 v3 = over (vsrc, vsrca, v3);
1501 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1503 dst += 4;
1504 w -= 4;
1507 CHECKPOINT ();
1509 while (w)
1511 uint64_t d = *dst;
1512 __m64 vdest = expand565 (to_m64 (d), 0);
1514 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1515 *dst = to_uint64 (vdest);
1517 w--;
1518 dst++;
1522 _mm_empty ();
1525 static void
1526 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1527 pixman_composite_info_t *info)
1529 PIXMAN_COMPOSITE_ARGS (info);
1530 uint32_t src;
1531 uint32_t *dst_line;
1532 uint32_t *mask_line;
1533 int dst_stride, mask_stride;
1534 __m64 vsrc, vsrca;
1536 CHECKPOINT ();
1538 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1540 if (src == 0)
1541 return;
1543 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1544 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1546 vsrc = load8888 (&src);
1547 vsrca = expand_alpha (vsrc);
1549 while (height--)
1551 int twidth = width;
1552 uint32_t *p = (uint32_t *)mask_line;
1553 uint32_t *q = (uint32_t *)dst_line;
1555 while (twidth && (uintptr_t)q & 7)
1557 uint32_t m = *(uint32_t *)p;
1559 if (m)
1561 __m64 vdest = load8888 (q);
1562 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1563 store8888 (q, vdest);
1566 twidth--;
1567 p++;
1568 q++;
1571 while (twidth >= 2)
1573 uint32_t m0, m1;
1574 m0 = *p;
1575 m1 = *(p + 1);
1577 if (m0 | m1)
1579 __m64 dest0, dest1;
1580 __m64 vdest = *(__m64 *)q;
1582 dest0 = in_over (vsrc, vsrca, load8888 (&m0),
1583 expand8888 (vdest, 0));
1584 dest1 = in_over (vsrc, vsrca, load8888 (&m1),
1585 expand8888 (vdest, 1));
1587 *(__m64 *)q = pack8888 (dest0, dest1);
1590 p += 2;
1591 q += 2;
1592 twidth -= 2;
1595 if (twidth)
1597 uint32_t m = *(uint32_t *)p;
1599 if (m)
1601 __m64 vdest = load8888 (q);
1602 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
1603 store8888 (q, vdest);
1606 twidth--;
1607 p++;
1608 q++;
1611 dst_line += dst_stride;
1612 mask_line += mask_stride;
1615 _mm_empty ();
1618 static void
1619 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1620 pixman_composite_info_t *info)
1622 PIXMAN_COMPOSITE_ARGS (info);
1623 uint32_t *dst_line, *dst;
1624 uint32_t *src_line, *src;
1625 uint32_t mask;
1626 __m64 vmask;
1627 int dst_stride, src_stride;
1628 int32_t w;
1630 CHECKPOINT ();
1632 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1633 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1635 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1636 vmask = expand_alpha (load8888 (&mask));
1638 while (height--)
1640 dst = dst_line;
1641 dst_line += dst_stride;
1642 src = src_line;
1643 src_line += src_stride;
1644 w = width;
1646 while (w && (uintptr_t)dst & 7)
1648 __m64 s = load8888 (src);
1649 __m64 d = load8888 (dst);
1651 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1653 w--;
1654 dst++;
1655 src++;
1658 while (w >= 2)
1660 __m64 vs = ldq_u ((__m64 *)src);
1661 __m64 vd = *(__m64 *)dst;
1662 __m64 vsrc0 = expand8888 (vs, 0);
1663 __m64 vsrc1 = expand8888 (vs, 1);
1665 *(__m64 *)dst = pack8888 (
1666 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1667 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1669 w -= 2;
1670 dst += 2;
1671 src += 2;
1674 if (w)
1676 __m64 s = load8888 (src);
1677 __m64 d = load8888 (dst);
1679 store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
1683 _mm_empty ();
1686 static void
1687 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1688 pixman_composite_info_t *info)
1690 PIXMAN_COMPOSITE_ARGS (info);
1691 uint32_t *dst_line, *dst;
1692 uint32_t *src_line, *src;
1693 uint32_t mask;
1694 __m64 vmask;
1695 int dst_stride, src_stride;
1696 int32_t w;
1697 __m64 srca;
1699 CHECKPOINT ();
1701 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1702 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1703 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
1705 vmask = expand_alpha (load8888 (&mask));
1706 srca = MC (4x00ff);
1708 while (height--)
1710 dst = dst_line;
1711 dst_line += dst_stride;
1712 src = src_line;
1713 src_line += src_stride;
1714 w = width;
1716 while (w && (uintptr_t)dst & 7)
1718 uint32_t ssrc = *src | 0xff000000;
1719 __m64 s = load8888 (&ssrc);
1720 __m64 d = load8888 (dst);
1722 store8888 (dst, in_over (s, srca, vmask, d));
1724 w--;
1725 dst++;
1726 src++;
1729 while (w >= 16)
1731 __m64 vd0 = *(__m64 *)(dst + 0);
1732 __m64 vd1 = *(__m64 *)(dst + 2);
1733 __m64 vd2 = *(__m64 *)(dst + 4);
1734 __m64 vd3 = *(__m64 *)(dst + 6);
1735 __m64 vd4 = *(__m64 *)(dst + 8);
1736 __m64 vd5 = *(__m64 *)(dst + 10);
1737 __m64 vd6 = *(__m64 *)(dst + 12);
1738 __m64 vd7 = *(__m64 *)(dst + 14);
1740 __m64 vs0 = ldq_u ((__m64 *)(src + 0));
1741 __m64 vs1 = ldq_u ((__m64 *)(src + 2));
1742 __m64 vs2 = ldq_u ((__m64 *)(src + 4));
1743 __m64 vs3 = ldq_u ((__m64 *)(src + 6));
1744 __m64 vs4 = ldq_u ((__m64 *)(src + 8));
1745 __m64 vs5 = ldq_u ((__m64 *)(src + 10));
1746 __m64 vs6 = ldq_u ((__m64 *)(src + 12));
1747 __m64 vs7 = ldq_u ((__m64 *)(src + 14));
1749 vd0 = pack8888 (
1750 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1751 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1753 vd1 = pack8888 (
1754 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1755 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1757 vd2 = pack8888 (
1758 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1759 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1761 vd3 = pack8888 (
1762 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1763 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1765 vd4 = pack8888 (
1766 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1767 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1769 vd5 = pack8888 (
1770 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1771 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1773 vd6 = pack8888 (
1774 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1775 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1777 vd7 = pack8888 (
1778 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1779 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1781 *(__m64 *)(dst + 0) = vd0;
1782 *(__m64 *)(dst + 2) = vd1;
1783 *(__m64 *)(dst + 4) = vd2;
1784 *(__m64 *)(dst + 6) = vd3;
1785 *(__m64 *)(dst + 8) = vd4;
1786 *(__m64 *)(dst + 10) = vd5;
1787 *(__m64 *)(dst + 12) = vd6;
1788 *(__m64 *)(dst + 14) = vd7;
1790 w -= 16;
1791 dst += 16;
1792 src += 16;
1795 while (w)
1797 uint32_t ssrc = *src | 0xff000000;
1798 __m64 s = load8888 (&ssrc);
1799 __m64 d = load8888 (dst);
1801 store8888 (dst, in_over (s, srca, vmask, d));
1803 w--;
1804 dst++;
1805 src++;
1809 _mm_empty ();
1812 static void
1813 mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1814 pixman_composite_info_t *info)
1816 PIXMAN_COMPOSITE_ARGS (info);
1817 uint32_t *dst_line, *dst;
1818 uint32_t *src_line, *src;
1819 uint32_t s;
1820 int dst_stride, src_stride;
1821 uint8_t a;
1822 int32_t w;
1824 CHECKPOINT ();
1826 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1827 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1829 while (height--)
1831 dst = dst_line;
1832 dst_line += dst_stride;
1833 src = src_line;
1834 src_line += src_stride;
1835 w = width;
1837 while (w--)
1839 s = *src++;
1840 a = s >> 24;
1842 if (a == 0xff)
1844 *dst = s;
1846 else if (s)
1848 __m64 ms, sa;
1849 ms = load8888 (&s);
1850 sa = expand_alpha (ms);
1851 store8888 (dst, over (ms, sa, load8888 (dst)));
1854 dst++;
1857 _mm_empty ();
1860 static void
1861 mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1862 pixman_composite_info_t *info)
1864 PIXMAN_COMPOSITE_ARGS (info);
1865 uint16_t *dst_line, *dst;
1866 uint32_t *src_line, *src;
1867 int dst_stride, src_stride;
1868 int32_t w;
1870 CHECKPOINT ();
1872 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1873 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1875 #if 0
1876 /* FIXME */
1877 assert (src_image->drawable == mask_image->drawable);
1878 #endif
1880 while (height--)
1882 dst = dst_line;
1883 dst_line += dst_stride;
1884 src = src_line;
1885 src_line += src_stride;
1886 w = width;
1888 CHECKPOINT ();
1890 while (w && (uintptr_t)dst & 7)
1892 __m64 vsrc = load8888 (src);
1893 uint64_t d = *dst;
1894 __m64 vdest = expand565 (to_m64 (d), 0);
1896 vdest = pack_565 (
1897 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1899 *dst = to_uint64 (vdest);
1901 w--;
1902 dst++;
1903 src++;
1906 CHECKPOINT ();
1908 while (w >= 4)
1910 __m64 vdest = *(__m64 *)dst;
1911 __m64 v0, v1, v2, v3;
1912 __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1914 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
1916 vsrc0 = load8888 ((src + 0));
1917 vsrc1 = load8888 ((src + 1));
1918 vsrc2 = load8888 ((src + 2));
1919 vsrc3 = load8888 ((src + 3));
1921 v0 = over (vsrc0, expand_alpha (vsrc0), v0);
1922 v1 = over (vsrc1, expand_alpha (vsrc1), v1);
1923 v2 = over (vsrc2, expand_alpha (vsrc2), v2);
1924 v3 = over (vsrc3, expand_alpha (vsrc3), v3);
1926 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
1928 w -= 4;
1929 dst += 4;
1930 src += 4;
1933 CHECKPOINT ();
1935 while (w)
1937 __m64 vsrc = load8888 (src);
1938 uint64_t d = *dst;
1939 __m64 vdest = expand565 (to_m64 (d), 0);
1941 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1943 *dst = to_uint64 (vdest);
1945 w--;
1946 dst++;
1947 src++;
1951 _mm_empty ();
1954 static void
1955 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1956 pixman_composite_info_t *info)
1958 PIXMAN_COMPOSITE_ARGS (info);
1959 uint32_t src, srca;
1960 uint32_t *dst_line, *dst;
1961 uint8_t *mask_line, *mask;
1962 int dst_stride, mask_stride;
1963 int32_t w;
1964 __m64 vsrc, vsrca;
1965 uint64_t srcsrc;
1967 CHECKPOINT ();
1969 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
1971 srca = src >> 24;
1972 if (src == 0)
1973 return;
1975 srcsrc = (uint64_t)src << 32 | src;
1977 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1978 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1980 vsrc = load8888 (&src);
1981 vsrca = expand_alpha (vsrc);
1983 while (height--)
1985 dst = dst_line;
1986 dst_line += dst_stride;
1987 mask = mask_line;
1988 mask_line += mask_stride;
1989 w = width;
1991 CHECKPOINT ();
1993 while (w && (uintptr_t)dst & 7)
1995 uint64_t m = *mask;
1997 if (m)
1999 __m64 vdest = in_over (vsrc, vsrca,
2000 expand_alpha_rev (to_m64 (m)),
2001 load8888 (dst));
2003 store8888 (dst, vdest);
2006 w--;
2007 mask++;
2008 dst++;
2011 CHECKPOINT ();
2013 while (w >= 2)
2015 uint64_t m0, m1;
2017 m0 = *mask;
2018 m1 = *(mask + 1);
2020 if (srca == 0xff && (m0 & m1) == 0xff)
2022 *(uint64_t *)dst = srcsrc;
2024 else if (m0 | m1)
2026 __m64 vdest;
2027 __m64 dest0, dest1;
2029 vdest = *(__m64 *)dst;
2031 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
2032 expand8888 (vdest, 0));
2033 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
2034 expand8888 (vdest, 1));
2036 *(__m64 *)dst = pack8888 (dest0, dest1);
2039 mask += 2;
2040 dst += 2;
2041 w -= 2;
2044 CHECKPOINT ();
2046 if (w)
2048 uint64_t m = *mask;
2050 if (m)
2052 __m64 vdest = load8888 (dst);
2054 vdest = in_over (
2055 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
2056 store8888 (dst, vdest);
2061 _mm_empty ();
2064 static pixman_bool_t
2065 mmx_fill (pixman_implementation_t *imp,
2066 uint32_t * bits,
2067 int stride,
2068 int bpp,
2069 int x,
2070 int y,
2071 int width,
2072 int height,
2073 uint32_t filler)
2075 uint64_t fill;
2076 __m64 vfill;
2077 uint32_t byte_width;
2078 uint8_t *byte_line;
2080 #if defined __GNUC__ && defined USE_X86_MMX
2081 __m64 v1, v2, v3, v4, v5, v6, v7;
2082 #endif
2084 if (bpp != 16 && bpp != 32 && bpp != 8)
2085 return FALSE;
2087 if (bpp == 8)
2089 stride = stride * (int) sizeof (uint32_t) / 1;
2090 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
2091 byte_width = width;
2092 stride *= 1;
2093 filler = (filler & 0xff) * 0x01010101;
2095 else if (bpp == 16)
2097 stride = stride * (int) sizeof (uint32_t) / 2;
2098 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
2099 byte_width = 2 * width;
2100 stride *= 2;
2101 filler = (filler & 0xffff) * 0x00010001;
2103 else
2105 stride = stride * (int) sizeof (uint32_t) / 4;
2106 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
2107 byte_width = 4 * width;
2108 stride *= 4;
2111 fill = ((uint64_t)filler << 32) | filler;
2112 vfill = to_m64 (fill);
2114 #if defined __GNUC__ && defined USE_X86_MMX
2115 __asm__ (
2116 "movq %7, %0\n"
2117 "movq %7, %1\n"
2118 "movq %7, %2\n"
2119 "movq %7, %3\n"
2120 "movq %7, %4\n"
2121 "movq %7, %5\n"
2122 "movq %7, %6\n"
2123 : "=&y" (v1), "=&y" (v2), "=&y" (v3),
2124 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
2125 : "y" (vfill));
2126 #endif
2128 while (height--)
2130 int w;
2131 uint8_t *d = byte_line;
2133 byte_line += stride;
2134 w = byte_width;
2136 if (w >= 1 && ((uintptr_t)d & 1))
2138 *(uint8_t *)d = (filler & 0xff);
2139 w--;
2140 d++;
2143 if (w >= 2 && ((uintptr_t)d & 3))
2145 *(uint16_t *)d = filler;
2146 w -= 2;
2147 d += 2;
2150 while (w >= 4 && ((uintptr_t)d & 7))
2152 *(uint32_t *)d = filler;
2154 w -= 4;
2155 d += 4;
2158 while (w >= 64)
2160 #if defined __GNUC__ && defined USE_X86_MMX
2161 __asm__ (
2162 "movq %1, (%0)\n"
2163 "movq %2, 8(%0)\n"
2164 "movq %3, 16(%0)\n"
2165 "movq %4, 24(%0)\n"
2166 "movq %5, 32(%0)\n"
2167 "movq %6, 40(%0)\n"
2168 "movq %7, 48(%0)\n"
2169 "movq %8, 56(%0)\n"
2171 : "r" (d),
2172 "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
2173 "y" (v4), "y" (v5), "y" (v6), "y" (v7)
2174 : "memory");
2175 #else
2176 *(__m64*) (d + 0) = vfill;
2177 *(__m64*) (d + 8) = vfill;
2178 *(__m64*) (d + 16) = vfill;
2179 *(__m64*) (d + 24) = vfill;
2180 *(__m64*) (d + 32) = vfill;
2181 *(__m64*) (d + 40) = vfill;
2182 *(__m64*) (d + 48) = vfill;
2183 *(__m64*) (d + 56) = vfill;
2184 #endif
2185 w -= 64;
2186 d += 64;
2189 while (w >= 4)
2191 *(uint32_t *)d = filler;
2193 w -= 4;
2194 d += 4;
2196 if (w >= 2)
2198 *(uint16_t *)d = filler;
2199 w -= 2;
2200 d += 2;
2202 if (w >= 1)
2204 *(uint8_t *)d = (filler & 0xff);
2205 w--;
2206 d++;
2211 _mm_empty ();
2212 return TRUE;
2215 static void
2216 mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
2217 pixman_composite_info_t *info)
2219 PIXMAN_COMPOSITE_ARGS (info);
2220 uint16_t *dst_line, *dst;
2221 uint32_t *src_line, *src, s;
2222 int dst_stride, src_stride;
2223 int32_t w;
2225 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2226 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2228 while (height--)
2230 dst = dst_line;
2231 dst_line += dst_stride;
2232 src = src_line;
2233 src_line += src_stride;
2234 w = width;
2236 while (w && (uintptr_t)dst & 7)
2238 s = *src++;
2239 *dst = convert_8888_to_0565 (s);
2240 dst++;
2241 w--;
2244 while (w >= 4)
2246 __m64 vdest;
2247 __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
2248 __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
2250 vdest = pack_4xpacked565 (vsrc0, vsrc1);
2252 *(__m64 *)dst = vdest;
2254 w -= 4;
2255 src += 4;
2256 dst += 4;
2259 while (w)
2261 s = *src++;
2262 *dst = convert_8888_to_0565 (s);
2263 dst++;
2264 w--;
2268 _mm_empty ();
2271 static void
2272 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2273 pixman_composite_info_t *info)
2275 PIXMAN_COMPOSITE_ARGS (info);
2276 uint32_t src, srca;
2277 uint32_t *dst_line, *dst;
2278 uint8_t *mask_line, *mask;
2279 int dst_stride, mask_stride;
2280 int32_t w;
2281 __m64 vsrc;
2282 uint64_t srcsrc;
2284 CHECKPOINT ();
2286 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2288 srca = src >> 24;
2289 if (src == 0)
2291 mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
2292 PIXMAN_FORMAT_BPP (dest_image->bits.format),
2293 dest_x, dest_y, width, height, 0);
2294 return;
2297 srcsrc = (uint64_t)src << 32 | src;
2299 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2300 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2302 vsrc = load8888 (&src);
2304 while (height--)
2306 dst = dst_line;
2307 dst_line += dst_stride;
2308 mask = mask_line;
2309 mask_line += mask_stride;
2310 w = width;
2312 CHECKPOINT ();
2314 while (w && (uintptr_t)dst & 7)
2316 uint64_t m = *mask;
2318 if (m)
2320 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2322 store8888 (dst, vdest);
2324 else
2326 *dst = 0;
2329 w--;
2330 mask++;
2331 dst++;
2334 CHECKPOINT ();
2336 while (w >= 2)
2338 uint64_t m0, m1;
2339 m0 = *mask;
2340 m1 = *(mask + 1);
2342 if (srca == 0xff && (m0 & m1) == 0xff)
2344 *(uint64_t *)dst = srcsrc;
2346 else if (m0 | m1)
2348 __m64 dest0, dest1;
2350 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2351 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2353 *(__m64 *)dst = pack8888 (dest0, dest1);
2355 else
2357 *(uint64_t *)dst = 0;
2360 mask += 2;
2361 dst += 2;
2362 w -= 2;
2365 CHECKPOINT ();
2367 if (w)
2369 uint64_t m = *mask;
2371 if (m)
2373 __m64 vdest = load8888 (dst);
2375 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2376 store8888 (dst, vdest);
2378 else
2380 *dst = 0;
2385 _mm_empty ();
2388 static void
2389 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2390 pixman_composite_info_t *info)
2392 PIXMAN_COMPOSITE_ARGS (info);
2393 uint32_t src, srca;
2394 uint16_t *dst_line, *dst;
2395 uint8_t *mask_line, *mask;
2396 int dst_stride, mask_stride;
2397 int32_t w;
2398 __m64 vsrc, vsrca, tmp;
2399 __m64 srcsrcsrcsrc;
2401 CHECKPOINT ();
2403 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2405 srca = src >> 24;
2406 if (src == 0)
2407 return;
2409 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2410 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2412 vsrc = load8888 (&src);
2413 vsrca = expand_alpha (vsrc);
2415 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2416 srcsrcsrcsrc = expand_alpha_rev (tmp);
2418 while (height--)
2420 dst = dst_line;
2421 dst_line += dst_stride;
2422 mask = mask_line;
2423 mask_line += mask_stride;
2424 w = width;
2426 CHECKPOINT ();
2428 while (w && (uintptr_t)dst & 7)
2430 uint64_t m = *mask;
2432 if (m)
2434 uint64_t d = *dst;
2435 __m64 vd = to_m64 (d);
2436 __m64 vdest = in_over (
2437 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2439 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2440 *dst = to_uint64 (vd);
2443 w--;
2444 mask++;
2445 dst++;
2448 CHECKPOINT ();
2450 while (w >= 4)
2452 uint64_t m0, m1, m2, m3;
2453 m0 = *mask;
2454 m1 = *(mask + 1);
2455 m2 = *(mask + 2);
2456 m3 = *(mask + 3);
2458 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2460 *(__m64 *)dst = srcsrcsrcsrc;
2462 else if (m0 | m1 | m2 | m3)
2464 __m64 vdest = *(__m64 *)dst;
2465 __m64 v0, v1, v2, v3;
2466 __m64 vm0, vm1, vm2, vm3;
2468 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2470 vm0 = to_m64 (m0);
2471 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
2473 vm1 = to_m64 (m1);
2474 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
2476 vm2 = to_m64 (m2);
2477 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
2479 vm3 = to_m64 (m3);
2480 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
2482 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2485 w -= 4;
2486 mask += 4;
2487 dst += 4;
2490 CHECKPOINT ();
2492 while (w)
2494 uint64_t m = *mask;
2496 if (m)
2498 uint64_t d = *dst;
2499 __m64 vd = to_m64 (d);
2500 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2501 expand565 (vd, 0));
2502 vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2503 *dst = to_uint64 (vd);
2506 w--;
2507 mask++;
2508 dst++;
2512 _mm_empty ();
2515 static void
2516 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
2517 pixman_composite_info_t *info)
2519 PIXMAN_COMPOSITE_ARGS (info);
2520 uint16_t *dst_line, *dst;
2521 uint32_t *src_line, *src;
2522 int dst_stride, src_stride;
2523 int32_t w;
2525 CHECKPOINT ();
2527 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2528 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2530 #if 0
2531 /* FIXME */
2532 assert (src_image->drawable == mask_image->drawable);
2533 #endif
2535 while (height--)
2537 dst = dst_line;
2538 dst_line += dst_stride;
2539 src = src_line;
2540 src_line += src_stride;
2541 w = width;
2543 CHECKPOINT ();
2545 while (w && (uintptr_t)dst & 7)
2547 __m64 vsrc = load8888 (src);
2548 uint64_t d = *dst;
2549 __m64 vdest = expand565 (to_m64 (d), 0);
2551 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2553 *dst = to_uint64 (vdest);
2555 w--;
2556 dst++;
2557 src++;
2560 CHECKPOINT ();
2562 while (w >= 4)
2564 uint32_t s0, s1, s2, s3;
2565 unsigned char a0, a1, a2, a3;
2567 s0 = *src;
2568 s1 = *(src + 1);
2569 s2 = *(src + 2);
2570 s3 = *(src + 3);
2572 a0 = (s0 >> 24);
2573 a1 = (s1 >> 24);
2574 a2 = (s2 >> 24);
2575 a3 = (s3 >> 24);
2577 if ((a0 & a1 & a2 & a3) == 0xFF)
2579 __m64 v0 = invert_colors (load8888 (&s0));
2580 __m64 v1 = invert_colors (load8888 (&s1));
2581 __m64 v2 = invert_colors (load8888 (&s2));
2582 __m64 v3 = invert_colors (load8888 (&s3));
2584 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2586 else if (s0 | s1 | s2 | s3)
2588 __m64 vdest = *(__m64 *)dst;
2589 __m64 v0, v1, v2, v3;
2591 __m64 vsrc0 = load8888 (&s0);
2592 __m64 vsrc1 = load8888 (&s1);
2593 __m64 vsrc2 = load8888 (&s2);
2594 __m64 vsrc3 = load8888 (&s3);
2596 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2598 v0 = over_rev_non_pre (vsrc0, v0);
2599 v1 = over_rev_non_pre (vsrc1, v1);
2600 v2 = over_rev_non_pre (vsrc2, v2);
2601 v3 = over_rev_non_pre (vsrc3, v3);
2603 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
2606 w -= 4;
2607 dst += 4;
2608 src += 4;
2611 CHECKPOINT ();
2613 while (w)
2615 __m64 vsrc = load8888 (src);
2616 uint64_t d = *dst;
2617 __m64 vdest = expand565 (to_m64 (d), 0);
2619 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
2621 *dst = to_uint64 (vdest);
2623 w--;
2624 dst++;
2625 src++;
2629 _mm_empty ();
2632 static void
2633 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2634 pixman_composite_info_t *info)
2636 PIXMAN_COMPOSITE_ARGS (info);
2637 uint32_t *dst_line, *dst;
2638 uint32_t *src_line, *src;
2639 int dst_stride, src_stride;
2640 int32_t w;
2642 CHECKPOINT ();
2644 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2645 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2647 #if 0
2648 /* FIXME */
2649 assert (src_image->drawable == mask_image->drawable);
2650 #endif
2652 while (height--)
2654 dst = dst_line;
2655 dst_line += dst_stride;
2656 src = src_line;
2657 src_line += src_stride;
2658 w = width;
2660 while (w && (uintptr_t)dst & 7)
2662 __m64 s = load8888 (src);
2663 __m64 d = load8888 (dst);
2665 store8888 (dst, over_rev_non_pre (s, d));
2667 w--;
2668 dst++;
2669 src++;
2672 while (w >= 2)
2674 uint32_t s0, s1;
2675 unsigned char a0, a1;
2676 __m64 d0, d1;
2678 s0 = *src;
2679 s1 = *(src + 1);
2681 a0 = (s0 >> 24);
2682 a1 = (s1 >> 24);
2684 if ((a0 & a1) == 0xFF)
2686 d0 = invert_colors (load8888 (&s0));
2687 d1 = invert_colors (load8888 (&s1));
2689 *(__m64 *)dst = pack8888 (d0, d1);
2691 else if (s0 | s1)
2693 __m64 vdest = *(__m64 *)dst;
2695 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
2696 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
2698 *(__m64 *)dst = pack8888 (d0, d1);
2701 w -= 2;
2702 dst += 2;
2703 src += 2;
2706 if (w)
2708 __m64 s = load8888 (src);
2709 __m64 d = load8888 (dst);
2711 store8888 (dst, over_rev_non_pre (s, d));
2715 _mm_empty ();
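/* Component-alpha OVER of a solid colour through an a8r8g8b8 mask onto an
 * r5g6b5 destination: each pixel is in_over (vsrc, vsrca, mask, dest), with
 * a four-pixel inner loop built on expand_4x565 () / pack_4x565 ().
 */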
2718 static void
2719 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2720 pixman_composite_info_t *info)
2722 PIXMAN_COMPOSITE_ARGS (info);
2723 uint32_t src;
2724 uint16_t *dst_line;
2725 uint32_t *mask_line;
2726 int dst_stride, mask_stride;
2727 __m64 vsrc, vsrca;
2729 CHECKPOINT ();
2731 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2733 if (src == 0)
2734 return;
2736 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2737 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2739 vsrc = load8888 (&src);
2740 vsrca = expand_alpha (vsrc);
2742 while (height--)
2744 int twidth = width;
2745 uint32_t *p = (uint32_t *)mask_line;
2746 uint16_t *q = (uint16_t *)dst_line;
2748 while (twidth && ((uintptr_t)q & 7))
2750 uint32_t m = *(uint32_t *)p;
2752 if (m)
2754 uint64_t d = *q;
2755 __m64 vdest = expand565 (to_m64 (d), 0);
2756 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2757 *q = to_uint64 (vdest);
2760 twidth--;
2761 p++;
2762 q++;
2765 while (twidth >= 4)
2767 uint32_t m0, m1, m2, m3;
2769 m0 = *p;
2770 m1 = *(p + 1);
2771 m2 = *(p + 2);
2772 m3 = *(p + 3);
2774 if ((m0 | m1 | m2 | m3))
2776 __m64 vdest = *(__m64 *)q;
2777 __m64 v0, v1, v2, v3;
2779 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
2781 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
2782 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
2783 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
2784 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
2786 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
2788 twidth -= 4;
2789 p += 4;
2790 q += 4;
2793 while (twidth)
2795 uint32_t m;
2797 m = *(uint32_t *)p;
2798 if (m)
2800 uint64_t d = *q;
2801 __m64 vdest = expand565 (to_m64 (d), 0);
2802 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
2803 *q = to_uint64 (vdest);
2806 twidth--;
2807 p++;
2808 q++;
2811 mask_line += mask_stride;
2812 dst_line += dst_stride;
2815 _mm_empty ();
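/* IN of a solid source with an a8 mask into an a8 destination:
 * dest = src.alpha * mask * dest.  The vector loop handles four mask and
 * destination bytes at a time with the packed in () helper.
 */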
2818 static void
2819 mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2820 pixman_composite_info_t *info)
2822 PIXMAN_COMPOSITE_ARGS (info);
2823 uint8_t *dst_line, *dst;
2824 uint8_t *mask_line, *mask;
2825 int dst_stride, mask_stride;
2826 int32_t w;
2827 uint32_t src;
2828 uint8_t sa;
2829 __m64 vsrc, vsrca;
2831 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2832 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2834 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2836 sa = src >> 24;
2838 vsrc = load8888 (&src);
2839 vsrca = expand_alpha (vsrc);
2841 while (height--)
2843 dst = dst_line;
2844 dst_line += dst_stride;
2845 mask = mask_line;
2846 mask_line += mask_stride;
2847 w = width;
2849 while (w && (uintptr_t)dst & 7)
2851 uint16_t tmp;
2852 uint8_t a;
2853 uint32_t m, d;
2855 a = *mask++;
2856 d = *dst;
2858 m = MUL_UN8 (sa, a, tmp);
2859 d = MUL_UN8 (m, d, tmp);
2861 *dst++ = d;
2862 w--;
2865 while (w >= 4)
2867 __m64 vmask;
2868 __m64 vdest;
2870 vmask = load8888u ((uint32_t *)mask);
2871 vdest = load8888 ((uint32_t *)dst);
2873 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
2875 dst += 4;
2876 mask += 4;
2877 w -= 4;
2880 while (w--)
2882 uint16_t tmp;
2883 uint8_t a;
2884 uint32_t m, d;
2886 a = *mask++;
2887 d = *dst;
2889 m = MUL_UN8 (sa, a, tmp);
2890 d = MUL_UN8 (m, d, tmp);
2892 *dst++ = d;
2896 _mm_empty ();
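/* IN of an a8 source into an a8 destination: dest = src * dest, four bytes
 * per iteration.
 */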
2899 static void
2900 mmx_composite_in_8_8 (pixman_implementation_t *imp,
2901 pixman_composite_info_t *info)
2903 PIXMAN_COMPOSITE_ARGS (info);
2904 uint8_t *dst_line, *dst;
2905 uint8_t *src_line, *src;
2906 int src_stride, dst_stride;
2907 int32_t w;
2909 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2910 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2912 while (height--)
2914 dst = dst_line;
2915 dst_line += dst_stride;
2916 src = src_line;
2917 src_line += src_stride;
2918 w = width;
2920 while (w && (uintptr_t)dst & 3)
2922 uint8_t s, d;
2923 uint16_t tmp;
2925 s = *src;
2926 d = *dst;
2928 *dst = MUL_UN8 (s, d, tmp);
2930 src++;
2931 dst++;
2932 w--;
2935 while (w >= 4)
2937 uint32_t *s = (uint32_t *)src;
2938 uint32_t *d = (uint32_t *)dst;
2940 store8888 (d, in (load8888u (s), load8888 (d)));
2942 w -= 4;
2943 dst += 4;
2944 src += 4;
2947 while (w--)
2949 uint8_t s, d;
2950 uint16_t tmp;
2952 s = *src;
2953 d = *dst;
2955 *dst = MUL_UN8 (s, d, tmp);
2957 src++;
2958 dst++;
2962 _mm_empty ();
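/* ADD of a solid source through an a8 mask into an a8 destination:
 * dest = dest + src.alpha * mask, using unsigned saturation.
 */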
2965 static void
2966 mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2967 pixman_composite_info_t *info)
2969 PIXMAN_COMPOSITE_ARGS (info);
2970 uint8_t *dst_line, *dst;
2971 uint8_t *mask_line, *mask;
2972 int dst_stride, mask_stride;
2973 int32_t w;
2974 uint32_t src;
2975 uint8_t sa;
2976 __m64 vsrc, vsrca;
2978 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2979 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2981 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2983 sa = src >> 24;
2985 if (src == 0)
2986 return;
2988 vsrc = load8888 (&src);
2989 vsrca = expand_alpha (vsrc);
2991 while (height--)
2993 dst = dst_line;
2994 dst_line += dst_stride;
2995 mask = mask_line;
2996 mask_line += mask_stride;
2997 w = width;
2999 while (w && (uintptr_t)dst & 3)
3001 uint16_t tmp;
3002 uint16_t a;
3003 uint32_t m, d;
3004 uint32_t r;
3006 a = *mask++;
3007 d = *dst;
3009 m = MUL_UN8 (sa, a, tmp);
3010 r = ADD_UN8 (m, d, tmp);
3012 *dst++ = r;
3013 w--;
3016 while (w >= 4)
3018 __m64 vmask;
3019 __m64 vdest;
3021 vmask = load8888u ((uint32_t *)mask);
3022 vdest = load8888 ((uint32_t *)dst);
3024 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
3026 dst += 4;
3027 mask += 4;
3028 w -= 4;
3031 while (w--)
3033 uint16_t tmp;
3034 uint16_t a;
3035 uint32_t m, d;
3036 uint32_t r;
3038 a = *mask++;
3039 d = *dst;
3041 m = MUL_UN8 (sa, a, tmp);
3042 r = ADD_UN8 (m, d, tmp);
3044 *dst++ = r;
3048 _mm_empty ();
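/* Saturating ADD of two a8 images, eight bytes per iteration with
 * _mm_adds_pu8; the unaligned edges use the scalar t | (0 - (t >> 8)) clamp.
 */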
3051 static void
3052 mmx_composite_add_8_8 (pixman_implementation_t *imp,
3053 pixman_composite_info_t *info)
3055 PIXMAN_COMPOSITE_ARGS (info);
3056 uint8_t *dst_line, *dst;
3057 uint8_t *src_line, *src;
3058 int dst_stride, src_stride;
3059 int32_t w;
3060 uint8_t s, d;
3061 uint16_t t;
3063 CHECKPOINT ();
3065 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
3066 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
3068 while (height--)
3070 dst = dst_line;
3071 dst_line += dst_stride;
3072 src = src_line;
3073 src_line += src_stride;
3074 w = width;
3076 while (w && (uintptr_t)dst & 7)
3078 s = *src;
3079 d = *dst;
3080 t = d + s;
3081 s = t | (0 - (t >> 8));
3082 *dst = s;
3084 dst++;
3085 src++;
3086 w--;
3089 while (w >= 8)
3091 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3092 dst += 8;
3093 src += 8;
3094 w -= 8;
3097 while (w)
3099 s = *src;
3100 d = *dst;
3101 t = d + s;
3102 s = t | (0 - (t >> 8));
3103 *dst = s;
3105 dst++;
3106 src++;
3107 w--;
3111 _mm_empty ();
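/* Saturating ADD of two r5g6b5 images.  The vector loop expands four packed
 * 565 pixels from each image to 8888, adds them with _mm_adds_pu8 and
 * repacks; the scalar edges convert through 8888 as well.
 */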
3114 static void
3115 mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
3116 pixman_composite_info_t *info)
3118 PIXMAN_COMPOSITE_ARGS (info);
3119 uint16_t *dst_line, *dst;
3120 uint32_t d;
3121 uint16_t *src_line, *src;
3122 uint32_t s;
3123 int dst_stride, src_stride;
3124 int32_t w;
3126 CHECKPOINT ();
3128 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
3129 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3131 while (height--)
3133 dst = dst_line;
3134 dst_line += dst_stride;
3135 src = src_line;
3136 src_line += src_stride;
3137 w = width;
3139 while (w && (uintptr_t)dst & 7)
3141 s = *src++;
3142 if (s)
3144 d = *dst;
3145 s = convert_0565_to_8888 (s);
3146 if (d)
3148 d = convert_0565_to_8888 (d);
3149 UN8x4_ADD_UN8x4 (s, d);
3151 *dst = convert_8888_to_0565 (s);
3153 dst++;
3154 w--;
3157 while (w >= 4)
3159 __m64 vdest = *(__m64 *)dst;
3160 __m64 vsrc = ldq_u ((__m64 *)src);
3161 __m64 vd0, vd1;
3162 __m64 vs0, vs1;
3164 expand_4xpacked565 (vdest, &vd0, &vd1, 0);
3165 expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
3167 vd0 = _mm_adds_pu8 (vd0, vs0);
3168 vd1 = _mm_adds_pu8 (vd1, vs1);
3170 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
3172 dst += 4;
3173 src += 4;
3174 w -= 4;
3177 while (w--)
3179 s = *src++;
3180 if (s)
3182 d = *dst;
3183 s = convert_0565_to_8888 (s);
3184 if (d)
3186 d = convert_0565_to_8888 (d);
3187 UN8x4_ADD_UN8x4 (s, d);
3189 *dst = convert_8888_to_0565 (s);
3191 dst++;
3195 _mm_empty ();
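/* Saturating ADD of two 8888 images, two pixels per 64-bit operation. */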
3198 static void
3199 mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
3200 pixman_composite_info_t *info)
3202 PIXMAN_COMPOSITE_ARGS (info);
3203 uint32_t *dst_line, *dst;
3204 uint32_t *src_line, *src;
3205 int dst_stride, src_stride;
3206 int32_t w;
3208 CHECKPOINT ();
3210 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3211 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3213 while (height--)
3215 dst = dst_line;
3216 dst_line += dst_stride;
3217 src = src_line;
3218 src_line += src_stride;
3219 w = width;
3221 while (w && (uintptr_t)dst & 7)
3223 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3224 load ((const uint32_t *)dst)));
3225 dst++;
3226 src++;
3227 w--;
3230 while (w >= 2)
3232 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
3233 dst += 2;
3234 src += 2;
3235 w -= 2;
3238 if (w)
3240 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
3241 load ((const uint32_t *)dst)));
3246 _mm_empty ();
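/* Plain rectangle copy for 16 bpp and 32 bpp images.  The destination is
 * aligned byte/halfword/word-wise first, then 64 bytes are copied per
 * iteration (through the eight MMX registers via inline assembly on x86,
 * through ldq_u () loads and plain stores elsewhere), followed by a scalar
 * tail.  Returns FALSE for unsupported depths so the caller can fall back.
 */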
3249 static pixman_bool_t
3250 mmx_blt (pixman_implementation_t *imp,
3251 uint32_t * src_bits,
3252 uint32_t * dst_bits,
3253 int src_stride,
3254 int dst_stride,
3255 int src_bpp,
3256 int dst_bpp,
3257 int src_x,
3258 int src_y,
3259 int dest_x,
3260 int dest_y,
3261 int width,
3262 int height)
3264 uint8_t * src_bytes;
3265 uint8_t * dst_bytes;
3266 int byte_width;
3268 if (src_bpp != dst_bpp)
3269 return FALSE;
3271 if (src_bpp == 16)
3273 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
3274 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
3275 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
3276 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3277 byte_width = 2 * width;
3278 src_stride *= 2;
3279 dst_stride *= 2;
3281 else if (src_bpp == 32)
3283 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
3284 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
3285 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
3286 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
3287 byte_width = 4 * width;
3288 src_stride *= 4;
3289 dst_stride *= 4;
3291 else
3293 return FALSE;
3296 while (height--)
3298 int w;
3299 uint8_t *s = src_bytes;
3300 uint8_t *d = dst_bytes;
3301 src_bytes += src_stride;
3302 dst_bytes += dst_stride;
3303 w = byte_width;
3305 if (w >= 1 && ((uintptr_t)d & 1))
3307 *(uint8_t *)d = *(uint8_t *)s;
3308 w -= 1;
3309 s += 1;
3310 d += 1;
3313 if (w >= 2 && ((uintptr_t)d & 3))
3315 *(uint16_t *)d = *(uint16_t *)s;
3316 w -= 2;
3317 s += 2;
3318 d += 2;
3321 while (w >= 4 && ((uintptr_t)d & 7))
3323 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3325 w -= 4;
3326 s += 4;
3327 d += 4;
3330 while (w >= 64)
3332 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
3333 __asm__ (
3334 "movq (%1), %%mm0\n"
3335 "movq 8(%1), %%mm1\n"
3336 "movq 16(%1), %%mm2\n"
3337 "movq 24(%1), %%mm3\n"
3338 "movq 32(%1), %%mm4\n"
3339 "movq 40(%1), %%mm5\n"
3340 "movq 48(%1), %%mm6\n"
3341 "movq 56(%1), %%mm7\n"
3343 "movq %%mm0, (%0)\n"
3344 "movq %%mm1, 8(%0)\n"
3345 "movq %%mm2, 16(%0)\n"
3346 "movq %%mm3, 24(%0)\n"
3347 "movq %%mm4, 32(%0)\n"
3348 "movq %%mm5, 40(%0)\n"
3349 "movq %%mm6, 48(%0)\n"
3350 "movq %%mm7, 56(%0)\n"
3351 :
3352 : "r" (d), "r" (s)
3353 : "memory",
3354 "%mm0", "%mm1", "%mm2", "%mm3",
3355 "%mm4", "%mm5", "%mm6", "%mm7");
3356 #else
3357 __m64 v0 = ldq_u ((__m64 *)(s + 0));
3358 __m64 v1 = ldq_u ((__m64 *)(s + 8));
3359 __m64 v2 = ldq_u ((__m64 *)(s + 16));
3360 __m64 v3 = ldq_u ((__m64 *)(s + 24));
3361 __m64 v4 = ldq_u ((__m64 *)(s + 32));
3362 __m64 v5 = ldq_u ((__m64 *)(s + 40));
3363 __m64 v6 = ldq_u ((__m64 *)(s + 48));
3364 __m64 v7 = ldq_u ((__m64 *)(s + 56));
3365 *(__m64 *)(d + 0) = v0;
3366 *(__m64 *)(d + 8) = v1;
3367 *(__m64 *)(d + 16) = v2;
3368 *(__m64 *)(d + 24) = v3;
3369 *(__m64 *)(d + 32) = v4;
3370 *(__m64 *)(d + 40) = v5;
3371 *(__m64 *)(d + 48) = v6;
3372 *(__m64 *)(d + 56) = v7;
3373 #endif
3375 w -= 64;
3376 s += 64;
3377 d += 64;
3379 while (w >= 4)
3381 *(uint32_t *)d = ldl_u ((uint32_t *)s);
3383 w -= 4;
3384 s += 4;
3385 d += 4;
3387 if (w >= 2)
3389 *(uint16_t *)d = *(uint16_t *)s;
3390 w -= 2;
3391 s += 2;
3392 d += 2;
3396 _mm_empty ();
3398 return TRUE;
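/* SRC copy implemented directly on top of mmx_blt (). */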
3401 static void
3402 mmx_composite_copy_area (pixman_implementation_t *imp,
3403 pixman_composite_info_t *info)
3405 PIXMAN_COMPOSITE_ARGS (info);
3407 mmx_blt (imp, src_image->bits.bits,
3408 dest_image->bits.bits,
3409 src_image->bits.rowstride,
3410 dest_image->bits.rowstride,
3411 PIXMAN_FORMAT_BPP (src_image->bits.format),
3412 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3413 src_x, src_y, dest_x, dest_y, width, height);
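/* OVER of an x8r8g8b8 source (alpha forced to 0xff) through an a8 mask onto
 * an 8888 destination.  A 0xff mask stores the source directly; any other
 * non-zero mask goes through in_over ().  This path is scalar, one pixel
 * per iteration.
 */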
3416 static void
3417 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3418 pixman_composite_info_t *info)
3420 PIXMAN_COMPOSITE_ARGS (info);
3421 uint32_t *src, *src_line;
3422 uint32_t *dst, *dst_line;
3423 uint8_t *mask, *mask_line;
3424 int src_stride, mask_stride, dst_stride;
3425 int32_t w;
3427 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3428 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3429 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3431 while (height--)
3433 src = src_line;
3434 src_line += src_stride;
3435 dst = dst_line;
3436 dst_line += dst_stride;
3437 mask = mask_line;
3438 mask_line += mask_stride;
3440 w = width;
3442 while (w--)
3444 uint64_t m = *mask;
3446 if (m)
3448 uint32_t ssrc = *src | 0xff000000;
3449 __m64 s = load8888 (&ssrc);
3451 if (m == 0xff)
3453 store8888 (dst, s);
3455 else
3457 __m64 sa = expand_alpha (s);
3458 __m64 vm = expand_alpha_rev (to_m64 (m));
3459 __m64 vdest = in_over (s, sa, vm, load8888 (dst));
3461 store8888 (dst, vdest);
3465 mask++;
3466 dst++;
3467 src++;
3471 _mm_empty ();
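/* OVER_REVERSE with a solid source, i.e. dest OVER src: the solid colour
 * only shows through where the destination is not already opaque.
 */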
3474 static void
3475 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
3476 pixman_composite_info_t *info)
3478 PIXMAN_COMPOSITE_ARGS (info);
3479 uint32_t src;
3480 uint32_t *dst_line, *dst;
3481 int32_t w;
3482 int dst_stride;
3483 __m64 vsrc;
3485 CHECKPOINT ();
3487 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3489 if (src == 0)
3490 return;
3492 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3494 vsrc = load8888 (&src);
3496 while (height--)
3498 dst = dst_line;
3499 dst_line += dst_stride;
3500 w = width;
3502 CHECKPOINT ();
3504 while (w && (uintptr_t)dst & 7)
3506 __m64 vdest = load8888 (dst);
3508 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3510 w--;
3511 dst++;
3514 while (w >= 2)
3516 __m64 vdest = *(__m64 *)dst;
3517 __m64 dest0 = expand8888 (vdest, 0);
3518 __m64 dest1 = expand8888 (vdest, 1);
3521 dest0 = over (dest0, expand_alpha (dest0), vsrc);
3522 dest1 = over (dest1, expand_alpha (dest1), vsrc);
3524 *(__m64 *)dst = pack8888 (dest0, dest1);
3526 dst += 2;
3527 w -= 2;
3530 CHECKPOINT ();
3532 if (w)
3534 __m64 vdest = load8888 (dst);
3536 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
3540 _mm_empty ();
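/* Scanline worker for nearest-neighbour scaled OVER.  vx steps through the
 * source in pixman fixed point, wrapping by src_width_fixed; the
 * FAST_NEAREST_MAINLOOP invocations below instantiate the
 * COVER/NONE/PAD/NORMAL variants of the surrounding compositing loop.
 */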
3543 static force_inline void
3544 scaled_nearest_scanline_mmx_8888_8888_OVER (uint32_t* pd,
3545 const uint32_t* ps,
3546 int32_t w,
3547 pixman_fixed_t vx,
3548 pixman_fixed_t unit_x,
3549 pixman_fixed_t src_width_fixed,
3550 pixman_bool_t fully_transparent_src)
3552 if (fully_transparent_src)
3553 return;
3555 while (w)
3557 __m64 d = load (pd);
3558 __m64 s = load (ps + pixman_fixed_to_int (vx));
3559 vx += unit_x;
3560 while (vx >= 0)
3561 vx -= src_width_fixed;
3563 store8888 (pd, core_combine_over_u_pixel_mmx (s, d));
3564 pd++;
3566 w--;
3569 _mm_empty ();
3572 FAST_NEAREST_MAINLOOP (mmx_8888_8888_cover_OVER,
3573 scaled_nearest_scanline_mmx_8888_8888_OVER,
3574 uint32_t, uint32_t, COVER)
3575 FAST_NEAREST_MAINLOOP (mmx_8888_8888_none_OVER,
3576 scaled_nearest_scanline_mmx_8888_8888_OVER,
3577 uint32_t, uint32_t, NONE)
3578 FAST_NEAREST_MAINLOOP (mmx_8888_8888_pad_OVER,
3579 scaled_nearest_scanline_mmx_8888_8888_OVER,
3580 uint32_t, uint32_t, PAD)
3581 FAST_NEAREST_MAINLOOP (mmx_8888_8888_normal_OVER,
3582 scaled_nearest_scanline_mmx_8888_8888_OVER,
3583 uint32_t, uint32_t, NORMAL)
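/* As above, but with a solid mask: the mask alpha is expanded once and
 * applied to every fetched source pixel through in_over ().
 */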
3585 static force_inline void
3586 scaled_nearest_scanline_mmx_8888_n_8888_OVER (const uint32_t * mask,
3587 uint32_t * dst,
3588 const uint32_t * src,
3589 int32_t w,
3590 pixman_fixed_t vx,
3591 pixman_fixed_t unit_x,
3592 pixman_fixed_t src_width_fixed,
3593 pixman_bool_t zero_src)
3595 __m64 mm_mask;
3597 if (zero_src || (*mask >> 24) == 0)
3599 /* A workaround for https://gcc.gnu.org/PR47759 */
3600 _mm_empty ();
3601 return;
3604 mm_mask = expand_alpha (load8888 (mask));
3606 while (w)
3608 uint32_t s = *(src + pixman_fixed_to_int (vx));
3609 vx += unit_x;
3610 while (vx >= 0)
3611 vx -= src_width_fixed;
3613 if (s)
3615 __m64 ms = load8888 (&s);
3616 __m64 alpha = expand_alpha (ms);
3617 __m64 dest = load8888 (dst);
3619 store8888 (dst, (in_over (ms, alpha, mm_mask, dest)));
3622 dst++;
3623 w--;
3626 _mm_empty ();
3629 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_cover_OVER,
3630 scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3631 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
3632 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_pad_OVER,
3633 scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3634 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
3635 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_none_OVER,
3636 scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3637 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
3638 FAST_NEAREST_MAINLOOP_COMMON (mmx_8888_n_8888_normal_OVER,
3639 scaled_nearest_scanline_mmx_8888_n_8888_OVER,
3640 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
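/* Bilinear scaling helpers.  BILINEAR_INTERPOLATE_ONE_PIXEL blends a 2x2
 * source block using the vertical weights wt/wb and a horizontal weight
 * taken from the fractional bits of vx; roughly, per channel,
 *
 *   pix = ((tl * wt + bl * wb) * (BSHIFT - wx) +
 *          (tr * wt + br * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * with wx = (vx >> (16 - BILINEAR_INTERPOLATION_BITS)) & BMSK.
 */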
3642 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
3643 #define BMSK (BSHIFT - 1)
3645 #define BILINEAR_DECLARE_VARIABLES \
3646 const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \
3647 const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \
3648 const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \
3649 const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \
3650 const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \
3651 const __m64 mm_zero = _mm_setzero_si64 (); \
3652 __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
3654 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
3655 do { \
3656 /* fetch 2x2 pixel block into 2 mmx registers */ \
3657 __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \
3658 __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \
3659 /* vertical interpolation */ \
3660 __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \
3661 __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \
3662 __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \
3663 __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \
3664 __m64 hi = _mm_add_pi16 (t_hi, b_hi); \
3665 __m64 lo = _mm_add_pi16 (t_lo, b_lo); \
3666 /* calculate horizontal weights */ \
3667 __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \
3668 _mm_srli_pi16 (mm_x, \
3669 16 - BILINEAR_INTERPOLATION_BITS))); \
3670 /* horizontal interpolation */ \
3671 __m64 p = _mm_unpacklo_pi16 (lo, hi); \
3672 __m64 q = _mm_unpackhi_pi16 (lo, hi); \
3673 vx += unit_x; \
3674 lo = _mm_madd_pi16 (p, mm_wh); \
3675 hi = _mm_madd_pi16 (q, mm_wh); \
3676 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3677 /* shift and pack the result */ \
3678 hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \
3679 lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \
3680 lo = _mm_packs_pi32 (lo, hi); \
3681 lo = _mm_packs_pu16 (lo, lo); \
3682 pix = lo; \
3683 } while (0)
3685 #define BILINEAR_SKIP_ONE_PIXEL() \
3686 do { \
3687 vx += unit_x; \
3688 mm_x = _mm_add_pi16 (mm_x, mm_ux); \
3689 } while(0)
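/* Bilinear SRC: each destination pixel is simply the filtered source pixel. */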
3691 static force_inline void
3692 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst,
3693 const uint32_t * mask,
3694 const uint32_t * src_top,
3695 const uint32_t * src_bottom,
3696 int32_t w,
3697 int wt,
3698 int wb,
3699 pixman_fixed_t vx,
3700 pixman_fixed_t unit_x,
3701 pixman_fixed_t max_vx,
3702 pixman_bool_t zero_src)
3704 BILINEAR_DECLARE_VARIABLES;
3705 __m64 pix;
3707 while (w--)
3709 BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
3710 store (dst, pix);
3711 dst++;
3714 _mm_empty ();
3717 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
3718 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3719 uint32_t, uint32_t, uint32_t,
3720 COVER, FLAG_NONE)
3721 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
3722 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3723 uint32_t, uint32_t, uint32_t,
3724 PAD, FLAG_NONE)
3725 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
3726 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3727 uint32_t, uint32_t, uint32_t,
3728 NONE, FLAG_NONE)
3729 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
3730 scaled_bilinear_scanline_mmx_8888_8888_SRC,
3731 uint32_t, uint32_t, uint32_t,
3732 NORMAL, FLAG_NONE)
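/* Bilinear OVER: interpolated pixels that are entirely zero are skipped;
 * everything else is composited with core_combine_over_u_pixel_mmx ().
 */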
3734 static force_inline void
3735 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst,
3736 const uint32_t * mask,
3737 const uint32_t * src_top,
3738 const uint32_t * src_bottom,
3739 int32_t w,
3740 int wt,
3741 int wb,
3742 pixman_fixed_t vx,
3743 pixman_fixed_t unit_x,
3744 pixman_fixed_t max_vx,
3745 pixman_bool_t zero_src)
3747 BILINEAR_DECLARE_VARIABLES;
3748 __m64 pix1, pix2;
3750 while (w)
3752 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3754 if (!is_zero (pix1))
3756 pix2 = load (dst);
3757 store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
3760 w--;
3761 dst++;
3764 _mm_empty ();
3767 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
3768 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3769 uint32_t, uint32_t, uint32_t,
3770 COVER, FLAG_NONE)
3771 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
3772 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3773 uint32_t, uint32_t, uint32_t,
3774 PAD, FLAG_NONE)
3775 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
3776 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3777 uint32_t, uint32_t, uint32_t,
3778 NONE, FLAG_NONE)
3779 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
3780 scaled_bilinear_scanline_mmx_8888_8888_OVER,
3781 uint32_t, uint32_t, uint32_t,
3782 NORMAL, FLAG_NONE)
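/* Bilinear OVER with an a8 mask: a zero mask skips the pixel, a 0xff mask
 * over an opaque interpolated pixel is stored as-is, and the remaining
 * cases go through in_over ().
 */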
3784 static force_inline void
3785 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst,
3786 const uint8_t * mask,
3787 const uint32_t * src_top,
3788 const uint32_t * src_bottom,
3789 int32_t w,
3790 int wt,
3791 int wb,
3792 pixman_fixed_t vx,
3793 pixman_fixed_t unit_x,
3794 pixman_fixed_t max_vx,
3795 pixman_bool_t zero_src)
3797 BILINEAR_DECLARE_VARIABLES;
3798 __m64 pix1, pix2;
3799 uint32_t m;
3801 while (w)
3803 m = (uint32_t) *mask++;
3805 if (m)
3807 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
3809 if (m == 0xff && is_opaque (pix1))
3811 store (dst, pix1);
3813 else
3815 __m64 ms, md, ma, msa;
3817 pix2 = load (dst);
3818 ma = expand_alpha_rev (to_m64 (m));
3819 ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
3820 md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
3822 msa = expand_alpha (ms);
3824 store8888 (dst, (in_over (ms, msa, ma, md)));
3827 else
3829 BILINEAR_SKIP_ONE_PIXEL ();
3832 w--;
3833 dst++;
3836 _mm_empty ();
3839 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
3840 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3841 uint32_t, uint8_t, uint32_t,
3842 COVER, FLAG_HAVE_NON_SOLID_MASK)
3843 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
3844 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3845 uint32_t, uint8_t, uint32_t,
3846 PAD, FLAG_HAVE_NON_SOLID_MASK)
3847 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
3848 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3849 uint32_t, uint8_t, uint32_t,
3850 NONE, FLAG_HAVE_NON_SOLID_MASK)
3851 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
3852 scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
3853 uint32_t, uint8_t, uint32_t,
3854 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
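/* Iterator fetchers: each converts one scanline of the source image to
 * a8r8g8b8 in iter->buffer.  This one fills in the alpha byte of x8r8g8b8
 * pixels, eight pixels per iteration.
 */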
3856 static uint32_t *
3857 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3859 int w = iter->width;
3860 uint32_t *dst = iter->buffer;
3861 uint32_t *src = (uint32_t *)iter->bits;
3863 iter->bits += iter->stride;
3865 while (w && ((uintptr_t)dst) & 7)
3867 *dst++ = (*src++) | 0xff000000;
3868 w--;
3871 while (w >= 8)
3873 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3874 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3875 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3876 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3878 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3879 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3880 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3881 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3883 dst += 8;
3884 src += 8;
3885 w -= 8;
3888 while (w)
3890 *dst++ = (*src++) | 0xff000000;
3891 w--;
3894 _mm_empty ();
3895 return iter->buffer;
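/* Scanline fetcher expanding r5g6b5 to a8r8g8b8, four pixels at a time with
 * expand_4xpacked565 ().
 */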
3898 static uint32_t *
3899 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3901 int w = iter->width;
3902 uint32_t *dst = iter->buffer;
3903 uint16_t *src = (uint16_t *)iter->bits;
3905 iter->bits += iter->stride;
3907 while (w && ((uintptr_t)dst) & 0x0f)
3909 uint16_t s = *src++;
3911 *dst++ = convert_0565_to_8888 (s);
3912 w--;
3915 while (w >= 4)
3917 __m64 vsrc = ldq_u ((__m64 *)src);
3918 __m64 mm0, mm1;
3920 expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3922 *(__m64 *)(dst + 0) = mm0;
3923 *(__m64 *)(dst + 2) = mm1;
3925 dst += 4;
3926 src += 4;
3927 w -= 4;
3930 while (w)
3932 uint16_t s = *src++;
3934 *dst++ = convert_0565_to_8888 (s);
3935 w--;
3938 _mm_empty ();
3939 return iter->buffer;
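/* Scanline fetcher expanding a8 to a8r8g8b8 (alpha in the top byte, colour
 * bits zero), eight pixels per iteration.
 */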
3942 static uint32_t *
3943 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3945 int w = iter->width;
3946 uint32_t *dst = iter->buffer;
3947 uint8_t *src = iter->bits;
3949 iter->bits += iter->stride;
3951 while (w && (((uintptr_t)dst) & 15))
3953 *dst++ = *(src++) << 24;
3954 w--;
3957 while (w >= 8)
3959 __m64 mm0 = ldq_u ((__m64 *)src);
3961 __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
3962 __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
3963 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
3964 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
3965 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
3966 __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);
3968 *(__m64 *)(dst + 0) = mm3;
3969 *(__m64 *)(dst + 2) = mm4;
3970 *(__m64 *)(dst + 4) = mm5;
3971 *(__m64 *)(dst + 6) = mm6;
3973 dst += 8;
3974 src += 8;
3975 w -= 8;
3978 while (w)
3980 *dst++ = *(src++) << 24;
3981 w--;
3984 _mm_empty ();
3985 return iter->buffer;
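/* Iterator table: the fetchers above are presumably only used for
 * untransformed bits images whose samples cover the clip, as selected by
 * IMAGE_FLAGS below.
 */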
3988 #define IMAGE_FLAGS \
3989 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
3990 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
3992 static const pixman_iter_info_t mmx_iters[] =
3994 { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
3995 _pixman_iter_init_bits_stride, mmx_fetch_x8r8g8b8, NULL
3997 { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
3998 _pixman_iter_init_bits_stride, mmx_fetch_r5g6b5, NULL
4000 { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
4001 _pixman_iter_init_bits_stride, mmx_fetch_a8, NULL
4003 { PIXMAN_null },
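/* The (operator, source, mask, destination) combinations served by the
 * fast paths above; combinations not listed here fall through to the
 * delegate implementation.
 */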
4006 static const pixman_fast_path_t mmx_fast_paths[] =
4008 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ),
4009 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ),
4010 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ),
4011 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ),
4012 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ),
4013 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ),
4014 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
4015 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
4016 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ),
4017 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
4018 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
4019 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ),
4020 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ),
4021 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ),
4022 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ),
4023 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ),
4024 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ),
4025 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ),
4026 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ),
4027 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ),
4028 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ),
4029 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ),
4030 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ),
4031 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ),
4032 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ),
4033 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ),
4034 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ),
4035 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ),
4036 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ),
4037 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ),
4038 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ),
4039 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ),
4040 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ),
4041 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ),
4042 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4043 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4045 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ),
4046 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ),
4047 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ),
4048 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ),
4049 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ),
4050 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ),
4052 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
4053 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
4055 PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ),
4056 PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ),
4057 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ),
4058 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ),
4059 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
4060 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
4062 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4063 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4064 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
4065 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
4066 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
4067 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
4068 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
4069 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ),
4070 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ),
4071 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ),
4072 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4073 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4074 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ),
4075 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ),
4076 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ),
4077 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ),
4079 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ),
4080 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ),
4082 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4083 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4084 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4085 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4087 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_n_8888 ),
4088 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_n_8888 ),
4089 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_n_8888 ),
4090 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_n_8888 ),
4092 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4093 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4094 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4095 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4096 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4097 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4099 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
4100 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
4101 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
4102 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
4104 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
4105 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
4106 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
4107 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),
4109 { PIXMAN_OP_NONE },
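/* Build the MMX implementation: register the fast-path table, the unified
 * and component-alpha 32-bit combiners, blt, fill and the iterator info
 * above.  Anything else is delegated to `fallback'.
 */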
4112 pixman_implementation_t *
4113 _pixman_implementation_create_mmx (pixman_implementation_t *fallback)
4115 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
4117 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
4118 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
4119 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
4120 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
4121 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
4122 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
4123 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
4124 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
4125 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
4126 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
4127 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
4129 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
4130 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
4131 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
4132 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
4133 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
4134 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
4135 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
4136 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
4137 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
4138 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
4139 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
4141 imp->blt = mmx_blt;
4142 imp->fill = mmx_fill;
4144 imp->iter_info = mmx_iters;
4146 return imp;
4149 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */