/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

/* PSHUFD is slow on a lot of old processors, and new processors have SSSE3 */
#define PSHUFD_IS_FAST 0

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

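/* These constants are declared here but not initialized in this excerpt;
 * in the full file they are set up once, when the SSE2 implementation is
 * created (in _pixman_implementation_create_sse2 ()), using the
 * create_mask_16_128 () / create_mask_2x32_128 () helpers defined further
 * down. */
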
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

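/* Naming convention used by the helpers below (an observation, not an
 * original comment): the *_1x128 variants operate on a single __m128i of
 * unpacked 16-bit channels (one or two pixels), while the *_2x128 variants
 * operate on a lo/hi pair of such registers, i.e. the four pixels produced
 * by unpack_128_2x128 () above. */
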
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

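/* What unpack_565_to_8888 () does, per pixel, in scalar terms (assuming the
 * usual mask values, e.g. mask_red == 0x00f80000): each 5- or 6-bit field is
 * shifted to the top of its 8-bit channel and its top bits are replicated
 * into the vacated low bits, so that 0x1f expands to 0xff rather than 0xf8:
 *
 *     r8 = (r5 << 3) | (r5 >> 2);
 *     g8 = (g6 << 2) | (g6 >> 4);
 *     b8 = (b5 << 3) | (b5 >> 2);
 */
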
static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}

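/* A sketch of the PMADDWD trick above, assuming the usual constants
 * (mask_565_rb == 0x00f800f8 and mask_565_pack_multiplier == 0x20000004 per
 * 32-bit lane): the multiply-add moves the blue and red fields into the
 * positions they occupy in a 565 value shifted left by 5; green, masked by
 * mask_green, already sits there; and the shift/arithmetic-shift/signed-pack
 * sequence that follows emulates the SSE4.1 _mm_packus_epi32 that plain
 * SSE2 lacks. */
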
static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

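/* Why 0x8888 works in is_opaque () and is_transparent (): PMOVMSKB returns
 * one bit per byte, and for four packed a8r8g8b8 pixels the alpha bytes are
 * bytes 3, 7, 11 and 15, which are exactly the bits selected by 0x8888.  So
 * these predicates look only at the four alpha channels, while is_zero ()
 * tests the whole register. */
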
static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

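/* pix_multiply_2x128 () above (and pix_multiply_1x128 () further down)
 * performs the usual rounded "multiply two 8-bit values and divide by 255",
 * one channel per 16-bit lane, assuming mask_0080 holds 0x0080 and
 * mask_0101 holds 0x0101 in every lane.  In scalar terms:
 *
 *     t = a * b + 0x80;
 *     result = (t * 0x101) >> 16;    -* effectively (t + (t >> 8)) >> 8 *-
 *
 * which equals (a * b) / 255 rounded to nearest for all a, b in [0, 255],
 * the same identity as pixman's scalar MUL_UN8. */
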
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

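/* In Porter-Duff terms (premultiplied 8-bit channels):
 *
 *   over_2x128 ()              dst = src + dst * (255 - alpha) / 255, i.e.
 *                              the OVER operator when 'alpha' is the source
 *                              alpha;
 *   in_over_2x128 ()           (src IN mask) OVER dst, the workhorse of the
 *                              masked fast paths further down;
 *   pix_add_multiply_2x128 ()  src * alpha_dst / 255 + dst * alpha_src / 255
 *                              with a saturating add, the common shape of
 *                              the ATOP and XOR combiners below.
 */
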
/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

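/* combine1 () and combine4 () are the masked source fetches used by the
 * unified-alpha combiners below: when a mask pointer is supplied, the source
 * pixel(s) are first multiplied by the mask's alpha channel; when pm is NULL
 * the source is used as is.  combine4 () also short-circuits to zero when
 * all four mask alphas are zero (is_transparent ()). */
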
573 static force_inline void
574 core_combine_over_u_sse2_mask (uint32_t * pd,
575 const uint32_t* ps,
576 const uint32_t* pm,
577 int w)
579 uint32_t s, d;
581 /* Align dst on a 16-byte boundary */
582 while (w && ((uintptr_t)pd & 15))
584 d = *pd;
585 s = combine1 (ps, pm);
587 if (s)
588 *pd = core_combine_over_u_pixel_sse2 (s, d);
589 pd++;
590 ps++;
591 pm++;
592 w--;
595 while (w >= 4)
597 __m128i mask = load_128_unaligned ((__m128i *)pm);
599 if (!is_zero (mask))
601 __m128i src;
602 __m128i src_hi, src_lo;
603 __m128i mask_hi, mask_lo;
604 __m128i alpha_hi, alpha_lo;
606 src = load_128_unaligned ((__m128i *)ps);
608 if (is_opaque (_mm_and_si128 (src, mask)))
610 save_128_aligned ((__m128i *)pd, src);
612 else
614 __m128i dst = load_128_aligned ((__m128i *)pd);
615 __m128i dst_hi, dst_lo;
617 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
618 unpack_128_2x128 (src, &src_lo, &src_hi);
620 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
621 pix_multiply_2x128 (&src_lo, &src_hi,
622 &mask_lo, &mask_hi,
623 &src_lo, &src_hi);
625 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
627 expand_alpha_2x128 (src_lo, src_hi,
628 &alpha_lo, &alpha_hi);
630 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
631 &dst_lo, &dst_hi);
633 save_128_aligned (
634 (__m128i *)pd,
635 pack_2x128_128 (dst_lo, dst_hi));
639 pm += 4;
640 ps += 4;
641 pd += 4;
642 w -= 4;
644 while (w)
646 d = *pd;
647 s = combine1 (ps, pm);
649 if (s)
650 *pd = core_combine_over_u_pixel_sse2 (s, d);
651 pd++;
652 ps++;
653 pm++;
655 w--;
659 static force_inline void
660 core_combine_over_u_sse2_no_mask (uint32_t * pd,
661 const uint32_t* ps,
662 int w)
664 uint32_t s, d;
666 /* Align dst on a 16-byte boundary */
667 while (w && ((uintptr_t)pd & 15))
669 d = *pd;
670 s = *ps;
672 if (s)
673 *pd = core_combine_over_u_pixel_sse2 (s, d);
674 pd++;
675 ps++;
676 w--;
679 while (w >= 4)
681 __m128i src;
682 __m128i src_hi, src_lo, dst_hi, dst_lo;
683 __m128i alpha_hi, alpha_lo;
685 src = load_128_unaligned ((__m128i *)ps);
687 if (!is_zero (src))
689 if (is_opaque (src))
691 save_128_aligned ((__m128i *)pd, src);
693 else
695 __m128i dst = load_128_aligned ((__m128i *)pd);
697 unpack_128_2x128 (src, &src_lo, &src_hi);
698 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
700 expand_alpha_2x128 (src_lo, src_hi,
701 &alpha_lo, &alpha_hi);
702 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
703 &dst_lo, &dst_hi);
705 save_128_aligned (
706 (__m128i *)pd,
707 pack_2x128_128 (dst_lo, dst_hi));
711 ps += 4;
712 pd += 4;
713 w -= 4;
715 while (w)
717 d = *pd;
718 s = *ps;
720 if (s)
721 *pd = core_combine_over_u_pixel_sse2 (s, d);
722 pd++;
723 ps++;
725 w--;
729 static force_inline void
730 sse2_combine_over_u (pixman_implementation_t *imp,
731 pixman_op_t op,
732 uint32_t * pd,
733 const uint32_t * ps,
734 const uint32_t * pm,
735 int w)
737 if (pm)
738 core_combine_over_u_sse2_mask (pd, ps, pm, w);
739 else
740 core_combine_over_u_sse2_no_mask (pd, ps, w);
743 static void
744 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
745 pixman_op_t op,
746 uint32_t * pd,
747 const uint32_t * ps,
748 const uint32_t * pm,
749 int w)
751 uint32_t s, d;
753 __m128i xmm_dst_lo, xmm_dst_hi;
754 __m128i xmm_src_lo, xmm_src_hi;
755 __m128i xmm_alpha_lo, xmm_alpha_hi;
757 /* Align dst on a 16-byte boundary */
758 while (w &&
759 ((uintptr_t)pd & 15))
761 d = *pd;
762 s = combine1 (ps, pm);
764 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
765 w--;
766 ps++;
767 if (pm)
768 pm++;
771 while (w >= 4)
773 /* I'm loading unaligned because I'm not sure
774 * about the address alignment. */
776 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
777 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
779 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
780 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
782 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi);
785 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
786 &xmm_alpha_lo, &xmm_alpha_hi,
787 &xmm_src_lo, &xmm_src_hi);
789 /* rebuild the 4 pixel data and save */
790 save_128_aligned ((__m128i*)pd,
791 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
793 w -= 4;
794 ps += 4;
795 pd += 4;
797 if (pm)
798 pm += 4;
801 while (w)
803 d = *pd;
804 s = combine1 (ps, pm);
806 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
807 ps++;
808 w--;
809 if (pm)
810 pm++;
814 static force_inline uint32_t
815 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
817 uint32_t maska = src >> 24;
819 if (maska == 0)
821 return 0;
823 else if (maska != 0xff)
825 return pack_1x128_32 (
826 pix_multiply_1x128 (unpack_32_1x128 (dst),
827 expand_alpha_1x128 (unpack_32_1x128 (src))));
830 return dst;
833 static void
834 sse2_combine_in_u (pixman_implementation_t *imp,
835 pixman_op_t op,
836 uint32_t * pd,
837 const uint32_t * ps,
838 const uint32_t * pm,
839 int w)
841 uint32_t s, d;
843 __m128i xmm_src_lo, xmm_src_hi;
844 __m128i xmm_dst_lo, xmm_dst_hi;
846 while (w && ((uintptr_t)pd & 15))
848 s = combine1 (ps, pm);
849 d = *pd;
851 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
852 w--;
853 ps++;
854 if (pm)
855 pm++;
858 while (w >= 4)
860 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
861 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
863 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
864 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
866 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
867 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
868 &xmm_dst_lo, &xmm_dst_hi,
869 &xmm_dst_lo, &xmm_dst_hi);
871 save_128_aligned ((__m128i*)pd,
872 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
874 ps += 4;
875 pd += 4;
876 w -= 4;
877 if (pm)
878 pm += 4;
881 while (w)
883 s = combine1 (ps, pm);
884 d = *pd;
886 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
887 w--;
888 ps++;
889 if (pm)
890 pm++;
894 static void
895 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
896 pixman_op_t op,
897 uint32_t * pd,
898 const uint32_t * ps,
899 const uint32_t * pm,
900 int w)
902 uint32_t s, d;
904 __m128i xmm_src_lo, xmm_src_hi;
905 __m128i xmm_dst_lo, xmm_dst_hi;
907 while (w && ((uintptr_t)pd & 15))
909 s = combine1 (ps, pm);
910 d = *pd;
912 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
913 ps++;
914 w--;
915 if (pm)
916 pm++;
919 while (w >= 4)
921 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
922 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
924 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
925 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
927 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
928 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
929 &xmm_src_lo, &xmm_src_hi,
930 &xmm_dst_lo, &xmm_dst_hi);
932 save_128_aligned (
933 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
935 ps += 4;
936 pd += 4;
937 w -= 4;
938 if (pm)
939 pm += 4;
942 while (w)
944 s = combine1 (ps, pm);
945 d = *pd;
947 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
948 w--;
949 ps++;
950 if (pm)
951 pm++;
955 static void
956 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
957 pixman_op_t op,
958 uint32_t * pd,
959 const uint32_t * ps,
960 const uint32_t * pm,
961 int w)
963 while (w && ((uintptr_t)pd & 15))
965 uint32_t s = combine1 (ps, pm);
966 uint32_t d = *pd;
968 *pd++ = pack_1x128_32 (
969 pix_multiply_1x128 (
970 unpack_32_1x128 (d), negate_1x128 (
971 expand_alpha_1x128 (unpack_32_1x128 (s)))));
973 if (pm)
974 pm++;
975 ps++;
976 w--;
979 while (w >= 4)
981 __m128i xmm_src_lo, xmm_src_hi;
982 __m128i xmm_dst_lo, xmm_dst_hi;
984 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
985 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
987 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
990 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
991 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
993 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
994 &xmm_src_lo, &xmm_src_hi,
995 &xmm_dst_lo, &xmm_dst_hi);
997 save_128_aligned (
998 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1000 ps += 4;
1001 pd += 4;
1002 if (pm)
1003 pm += 4;
1005 w -= 4;
1008 while (w)
1010 uint32_t s = combine1 (ps, pm);
1011 uint32_t d = *pd;
1013 *pd++ = pack_1x128_32 (
1014 pix_multiply_1x128 (
1015 unpack_32_1x128 (d), negate_1x128 (
1016 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1017 ps++;
1018 if (pm)
1019 pm++;
1020 w--;
1024 static void
1025 sse2_combine_out_u (pixman_implementation_t *imp,
1026 pixman_op_t op,
1027 uint32_t * pd,
1028 const uint32_t * ps,
1029 const uint32_t * pm,
1030 int w)
1032 while (w && ((uintptr_t)pd & 15))
1034 uint32_t s = combine1 (ps, pm);
1035 uint32_t d = *pd;
1037 *pd++ = pack_1x128_32 (
1038 pix_multiply_1x128 (
1039 unpack_32_1x128 (s), negate_1x128 (
1040 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1041 w--;
1042 ps++;
1043 if (pm)
1044 pm++;
1047 while (w >= 4)
1049 __m128i xmm_src_lo, xmm_src_hi;
1050 __m128i xmm_dst_lo, xmm_dst_hi;
1052 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1053 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1055 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1056 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1058 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1059 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1061 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1062 &xmm_dst_lo, &xmm_dst_hi,
1063 &xmm_dst_lo, &xmm_dst_hi);
1065 save_128_aligned (
1066 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1068 ps += 4;
1069 pd += 4;
1070 w -= 4;
1071 if (pm)
1072 pm += 4;
1075 while (w)
1077 uint32_t s = combine1 (ps, pm);
1078 uint32_t d = *pd;
1080 *pd++ = pack_1x128_32 (
1081 pix_multiply_1x128 (
1082 unpack_32_1x128 (s), negate_1x128 (
1083 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1084 w--;
1085 ps++;
1086 if (pm)
1087 pm++;
1091 static force_inline uint32_t
1092 core_combine_atop_u_pixel_sse2 (uint32_t src,
1093 uint32_t dst)
1095 __m128i s = unpack_32_1x128 (src);
1096 __m128i d = unpack_32_1x128 (dst);
1098 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1099 __m128i da = expand_alpha_1x128 (d);
1101 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1104 static void
1105 sse2_combine_atop_u (pixman_implementation_t *imp,
1106 pixman_op_t op,
1107 uint32_t * pd,
1108 const uint32_t * ps,
1109 const uint32_t * pm,
1110 int w)
1112 uint32_t s, d;
1114 __m128i xmm_src_lo, xmm_src_hi;
1115 __m128i xmm_dst_lo, xmm_dst_hi;
1116 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1117 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1119 while (w && ((uintptr_t)pd & 15))
1121 s = combine1 (ps, pm);
1122 d = *pd;
1124 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1125 w--;
1126 ps++;
1127 if (pm)
1128 pm++;
1131 while (w >= 4)
1133 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1134 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1136 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1137 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1139 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1140 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1141 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1142 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1144 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1145 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1147 pix_add_multiply_2x128 (
1148 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1149 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1150 &xmm_dst_lo, &xmm_dst_hi);
1152 save_128_aligned (
1153 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1155 ps += 4;
1156 pd += 4;
1157 w -= 4;
1158 if (pm)
1159 pm += 4;
1162 while (w)
1164 s = combine1 (ps, pm);
1165 d = *pd;
1167 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1168 w--;
1169 ps++;
1170 if (pm)
1171 pm++;
1175 static force_inline uint32_t
1176 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1177 uint32_t dst)
1179 __m128i s = unpack_32_1x128 (src);
1180 __m128i d = unpack_32_1x128 (dst);
1182 __m128i sa = expand_alpha_1x128 (s);
1183 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1185 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1188 static void
1189 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1190 pixman_op_t op,
1191 uint32_t * pd,
1192 const uint32_t * ps,
1193 const uint32_t * pm,
1194 int w)
1196 uint32_t s, d;
1198 __m128i xmm_src_lo, xmm_src_hi;
1199 __m128i xmm_dst_lo, xmm_dst_hi;
1200 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1201 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1203 while (w && ((uintptr_t)pd & 15))
1205 s = combine1 (ps, pm);
1206 d = *pd;
1208 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1209 ps++;
1210 w--;
1211 if (pm)
1212 pm++;
1215 while (w >= 4)
1217 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1218 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1220 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1221 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1223 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1224 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1225 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1226 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1228 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1229 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1231 pix_add_multiply_2x128 (
1232 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1233 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1234 &xmm_dst_lo, &xmm_dst_hi);
1236 save_128_aligned (
1237 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1239 ps += 4;
1240 pd += 4;
1241 w -= 4;
1242 if (pm)
1243 pm += 4;
1246 while (w)
1248 s = combine1 (ps, pm);
1249 d = *pd;
1251 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1252 ps++;
1253 w--;
1254 if (pm)
1255 pm++;
1259 static force_inline uint32_t
1260 core_combine_xor_u_pixel_sse2 (uint32_t src,
1261 uint32_t dst)
1263 __m128i s = unpack_32_1x128 (src);
1264 __m128i d = unpack_32_1x128 (dst);
1266 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1267 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1269 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1272 static void
1273 sse2_combine_xor_u (pixman_implementation_t *imp,
1274 pixman_op_t op,
1275 uint32_t * dst,
1276 const uint32_t * src,
1277 const uint32_t * mask,
1278 int width)
1280 int w = width;
1281 uint32_t s, d;
1282 uint32_t* pd = dst;
1283 const uint32_t* ps = src;
1284 const uint32_t* pm = mask;
1286 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1287 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1288 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1289 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1291 while (w && ((uintptr_t)pd & 15))
1293 s = combine1 (ps, pm);
1294 d = *pd;
1296 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1297 w--;
1298 ps++;
1299 if (pm)
1300 pm++;
1303 while (w >= 4)
1305 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1306 xmm_dst = load_128_aligned ((__m128i*) pd);
1308 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1309 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1311 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1312 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1313 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1314 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1316 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1317 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1318 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1319 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1321 pix_add_multiply_2x128 (
1322 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1323 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1324 &xmm_dst_lo, &xmm_dst_hi);
1326 save_128_aligned (
1327 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1329 ps += 4;
1330 pd += 4;
1331 w -= 4;
1332 if (pm)
1333 pm += 4;
1336 while (w)
1338 s = combine1 (ps, pm);
1339 d = *pd;
1341 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1342 w--;
1343 ps++;
1344 if (pm)
1345 pm++;
1349 static force_inline void
1350 sse2_combine_add_u (pixman_implementation_t *imp,
1351 pixman_op_t op,
1352 uint32_t * dst,
1353 const uint32_t * src,
1354 const uint32_t * mask,
1355 int width)
1357 int w = width;
1358 uint32_t s, d;
1359 uint32_t* pd = dst;
1360 const uint32_t* ps = src;
1361 const uint32_t* pm = mask;
1363 while (w && (uintptr_t)pd & 15)
1365 s = combine1 (ps, pm);
1366 d = *pd;
1368 ps++;
1369 if (pm)
1370 pm++;
1371 *pd++ = _mm_cvtsi128_si32 (
1372 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1373 w--;
1376 while (w >= 4)
1378 __m128i s;
1380 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1382 save_128_aligned (
1383 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1385 pd += 4;
1386 ps += 4;
1387 if (pm)
1388 pm += 4;
1389 w -= 4;
1392 while (w--)
1394 s = combine1 (ps, pm);
1395 d = *pd;
1397 ps++;
1398 *pd++ = _mm_cvtsi128_si32 (
1399 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1400 if (pm)
1401 pm++;
1405 static force_inline uint32_t
1406 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1407 uint32_t dst)
1409 __m128i ms = unpack_32_1x128 (src);
1410 __m128i md = unpack_32_1x128 (dst);
1411 uint32_t sa = src >> 24;
1412 uint32_t da = ~dst >> 24;
1414 if (sa > da)
1416 ms = pix_multiply_1x128 (
1417 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1420 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1423 static void
1424 sse2_combine_saturate_u (pixman_implementation_t *imp,
1425 pixman_op_t op,
1426 uint32_t * pd,
1427 const uint32_t * ps,
1428 const uint32_t * pm,
1429 int w)
1431 uint32_t s, d;
1433 uint32_t pack_cmp;
1434 __m128i xmm_src, xmm_dst;
1436 while (w && (uintptr_t)pd & 15)
1438 s = combine1 (ps, pm);
1439 d = *pd;
1441 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1442 w--;
1443 ps++;
1444 if (pm)
1445 pm++;
1448 while (w >= 4)
1450 xmm_dst = load_128_aligned ((__m128i*)pd);
1451 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1453 pack_cmp = _mm_movemask_epi8 (
1454 _mm_cmpgt_epi32 (
1455 _mm_srli_epi32 (xmm_src, 24),
1456 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1458 /* if some source alpha is greater than the respective ~dst alpha, fall back to the per-pixel saturate path for these four pixels */
1459 if (pack_cmp)
1461 s = combine1 (ps++, pm);
1462 d = *pd;
1463 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1464 if (pm)
1465 pm++;
1467 s = combine1 (ps++, pm);
1468 d = *pd;
1469 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1470 if (pm)
1471 pm++;
1473 s = combine1 (ps++, pm);
1474 d = *pd;
1475 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1476 if (pm)
1477 pm++;
1479 s = combine1 (ps++, pm);
1480 d = *pd;
1481 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1482 if (pm)
1483 pm++;
1485 else
1487 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1489 pd += 4;
1490 ps += 4;
1491 if (pm)
1492 pm += 4;
1495 w -= 4;
1498 while (w--)
1500 s = combine1 (ps, pm);
1501 d = *pd;
1503 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1504 ps++;
1505 if (pm)
1506 pm++;
1510 static void
1511 sse2_combine_src_ca (pixman_implementation_t *imp,
1512 pixman_op_t op,
1513 uint32_t * pd,
1514 const uint32_t * ps,
1515 const uint32_t * pm,
1516 int w)
1518 uint32_t s, m;
1520 __m128i xmm_src_lo, xmm_src_hi;
1521 __m128i xmm_mask_lo, xmm_mask_hi;
1522 __m128i xmm_dst_lo, xmm_dst_hi;
1524 while (w && (uintptr_t)pd & 15)
1526 s = *ps++;
1527 m = *pm++;
1528 *pd++ = pack_1x128_32 (
1529 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1530 w--;
1533 while (w >= 4)
1535 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1536 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1538 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1539 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1541 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1542 &xmm_mask_lo, &xmm_mask_hi,
1543 &xmm_dst_lo, &xmm_dst_hi);
1545 save_128_aligned (
1546 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1548 ps += 4;
1549 pd += 4;
1550 pm += 4;
1551 w -= 4;
1554 while (w)
1556 s = *ps++;
1557 m = *pm++;
1558 *pd++ = pack_1x128_32 (
1559 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1560 w--;
1564 static force_inline uint32_t
1565 core_combine_over_ca_pixel_sse2 (uint32_t src,
1566 uint32_t mask,
1567 uint32_t dst)
1569 __m128i s = unpack_32_1x128 (src);
1570 __m128i expAlpha = expand_alpha_1x128 (s);
1571 __m128i unpk_mask = unpack_32_1x128 (mask);
1572 __m128i unpk_dst = unpack_32_1x128 (dst);
1574 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1577 static void
1578 sse2_combine_over_ca (pixman_implementation_t *imp,
1579 pixman_op_t op,
1580 uint32_t * pd,
1581 const uint32_t * ps,
1582 const uint32_t * pm,
1583 int w)
1585 uint32_t s, m, d;
1587 __m128i xmm_alpha_lo, xmm_alpha_hi;
1588 __m128i xmm_src_lo, xmm_src_hi;
1589 __m128i xmm_dst_lo, xmm_dst_hi;
1590 __m128i xmm_mask_lo, xmm_mask_hi;
1592 while (w && (uintptr_t)pd & 15)
1594 s = *ps++;
1595 m = *pm++;
1596 d = *pd;
1598 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1599 w--;
1602 while (w >= 4)
1604 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1605 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1606 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1608 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1609 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1610 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1612 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1613 &xmm_alpha_lo, &xmm_alpha_hi);
1615 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1616 &xmm_alpha_lo, &xmm_alpha_hi,
1617 &xmm_mask_lo, &xmm_mask_hi,
1618 &xmm_dst_lo, &xmm_dst_hi);
1620 save_128_aligned (
1621 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1623 ps += 4;
1624 pd += 4;
1625 pm += 4;
1626 w -= 4;
1629 while (w)
1631 s = *ps++;
1632 m = *pm++;
1633 d = *pd;
1635 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1636 w--;
1640 static force_inline uint32_t
1641 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1642 uint32_t mask,
1643 uint32_t dst)
1645 __m128i d = unpack_32_1x128 (dst);
1647 return pack_1x128_32 (
1648 over_1x128 (d, expand_alpha_1x128 (d),
1649 pix_multiply_1x128 (unpack_32_1x128 (src),
1650 unpack_32_1x128 (mask))));
1653 static void
1654 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1655 pixman_op_t op,
1656 uint32_t * pd,
1657 const uint32_t * ps,
1658 const uint32_t * pm,
1659 int w)
1661 uint32_t s, m, d;
1663 __m128i xmm_alpha_lo, xmm_alpha_hi;
1664 __m128i xmm_src_lo, xmm_src_hi;
1665 __m128i xmm_dst_lo, xmm_dst_hi;
1666 __m128i xmm_mask_lo, xmm_mask_hi;
1668 while (w && (uintptr_t)pd & 15)
1670 s = *ps++;
1671 m = *pm++;
1672 d = *pd;
1674 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1675 w--;
1678 while (w >= 4)
1680 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1681 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1682 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1684 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1685 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1686 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1688 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1689 &xmm_alpha_lo, &xmm_alpha_hi);
1690 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1691 &xmm_mask_lo, &xmm_mask_hi,
1692 &xmm_mask_lo, &xmm_mask_hi);
1694 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1695 &xmm_alpha_lo, &xmm_alpha_hi,
1696 &xmm_mask_lo, &xmm_mask_hi);
1698 save_128_aligned (
1699 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1701 ps += 4;
1702 pd += 4;
1703 pm += 4;
1704 w -= 4;
1707 while (w)
1709 s = *ps++;
1710 m = *pm++;
1711 d = *pd;
1713 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1714 w--;
1718 static void
1719 sse2_combine_in_ca (pixman_implementation_t *imp,
1720 pixman_op_t op,
1721 uint32_t * pd,
1722 const uint32_t * ps,
1723 const uint32_t * pm,
1724 int w)
1726 uint32_t s, m, d;
1728 __m128i xmm_alpha_lo, xmm_alpha_hi;
1729 __m128i xmm_src_lo, xmm_src_hi;
1730 __m128i xmm_dst_lo, xmm_dst_hi;
1731 __m128i xmm_mask_lo, xmm_mask_hi;
1733 while (w && (uintptr_t)pd & 15)
1735 s = *ps++;
1736 m = *pm++;
1737 d = *pd;
1739 *pd++ = pack_1x128_32 (
1740 pix_multiply_1x128 (
1741 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1742 expand_alpha_1x128 (unpack_32_1x128 (d))));
1744 w--;
1747 while (w >= 4)
1749 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1750 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1751 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1753 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1754 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1755 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1757 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1758 &xmm_alpha_lo, &xmm_alpha_hi);
1760 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1761 &xmm_mask_lo, &xmm_mask_hi,
1762 &xmm_dst_lo, &xmm_dst_hi);
1764 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1765 &xmm_alpha_lo, &xmm_alpha_hi,
1766 &xmm_dst_lo, &xmm_dst_hi);
1768 save_128_aligned (
1769 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1771 ps += 4;
1772 pd += 4;
1773 pm += 4;
1774 w -= 4;
1777 while (w)
1779 s = *ps++;
1780 m = *pm++;
1781 d = *pd;
1783 *pd++ = pack_1x128_32 (
1784 pix_multiply_1x128 (
1785 pix_multiply_1x128 (
1786 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1787 expand_alpha_1x128 (unpack_32_1x128 (d))));
1789 w--;
1793 static void
1794 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1795 pixman_op_t op,
1796 uint32_t * pd,
1797 const uint32_t * ps,
1798 const uint32_t * pm,
1799 int w)
1801 uint32_t s, m, d;
1803 __m128i xmm_alpha_lo, xmm_alpha_hi;
1804 __m128i xmm_src_lo, xmm_src_hi;
1805 __m128i xmm_dst_lo, xmm_dst_hi;
1806 __m128i xmm_mask_lo, xmm_mask_hi;
1808 while (w && (uintptr_t)pd & 15)
1810 s = *ps++;
1811 m = *pm++;
1812 d = *pd;
1814 *pd++ = pack_1x128_32 (
1815 pix_multiply_1x128 (
1816 unpack_32_1x128 (d),
1817 pix_multiply_1x128 (unpack_32_1x128 (m),
1818 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1819 w--;
1822 while (w >= 4)
1824 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1825 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1826 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1828 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1829 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1830 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1832 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1833 &xmm_alpha_lo, &xmm_alpha_hi);
1834 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1835 &xmm_alpha_lo, &xmm_alpha_hi,
1836 &xmm_alpha_lo, &xmm_alpha_hi);
1838 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1839 &xmm_alpha_lo, &xmm_alpha_hi,
1840 &xmm_dst_lo, &xmm_dst_hi);
1842 save_128_aligned (
1843 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1845 ps += 4;
1846 pd += 4;
1847 pm += 4;
1848 w -= 4;
1851 while (w)
1853 s = *ps++;
1854 m = *pm++;
1855 d = *pd;
1857 *pd++ = pack_1x128_32 (
1858 pix_multiply_1x128 (
1859 unpack_32_1x128 (d),
1860 pix_multiply_1x128 (unpack_32_1x128 (m),
1861 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1862 w--;
1866 static void
1867 sse2_combine_out_ca (pixman_implementation_t *imp,
1868 pixman_op_t op,
1869 uint32_t * pd,
1870 const uint32_t * ps,
1871 const uint32_t * pm,
1872 int w)
1874 uint32_t s, m, d;
1876 __m128i xmm_alpha_lo, xmm_alpha_hi;
1877 __m128i xmm_src_lo, xmm_src_hi;
1878 __m128i xmm_dst_lo, xmm_dst_hi;
1879 __m128i xmm_mask_lo, xmm_mask_hi;
1881 while (w && (uintptr_t)pd & 15)
1883 s = *ps++;
1884 m = *pm++;
1885 d = *pd;
1887 *pd++ = pack_1x128_32 (
1888 pix_multiply_1x128 (
1889 pix_multiply_1x128 (
1890 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1891 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1892 w--;
1895 while (w >= 4)
1897 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1898 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1899 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1901 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1902 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1903 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1905 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1906 &xmm_alpha_lo, &xmm_alpha_hi);
1907 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1908 &xmm_alpha_lo, &xmm_alpha_hi);
1910 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1911 &xmm_mask_lo, &xmm_mask_hi,
1912 &xmm_dst_lo, &xmm_dst_hi);
1913 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1914 &xmm_alpha_lo, &xmm_alpha_hi,
1915 &xmm_dst_lo, &xmm_dst_hi);
1917 save_128_aligned (
1918 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1920 ps += 4;
1921 pd += 4;
1922 pm += 4;
1923 w -= 4;
1926 while (w)
1928 s = *ps++;
1929 m = *pm++;
1930 d = *pd;
1932 *pd++ = pack_1x128_32 (
1933 pix_multiply_1x128 (
1934 pix_multiply_1x128 (
1935 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1936 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1938 w--;
1942 static void
1943 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1944 pixman_op_t op,
1945 uint32_t * pd,
1946 const uint32_t * ps,
1947 const uint32_t * pm,
1948 int w)
1950 uint32_t s, m, d;
1952 __m128i xmm_alpha_lo, xmm_alpha_hi;
1953 __m128i xmm_src_lo, xmm_src_hi;
1954 __m128i xmm_dst_lo, xmm_dst_hi;
1955 __m128i xmm_mask_lo, xmm_mask_hi;
1957 while (w && (uintptr_t)pd & 15)
1959 s = *ps++;
1960 m = *pm++;
1961 d = *pd;
1963 *pd++ = pack_1x128_32 (
1964 pix_multiply_1x128 (
1965 unpack_32_1x128 (d),
1966 negate_1x128 (pix_multiply_1x128 (
1967 unpack_32_1x128 (m),
1968 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1969 w--;
1972 while (w >= 4)
1974 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1975 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1976 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1978 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1979 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1980 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1982 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1983 &xmm_alpha_lo, &xmm_alpha_hi);
1985 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1986 &xmm_alpha_lo, &xmm_alpha_hi,
1987 &xmm_mask_lo, &xmm_mask_hi);
1989 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1990 &xmm_mask_lo, &xmm_mask_hi);
1992 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1993 &xmm_mask_lo, &xmm_mask_hi,
1994 &xmm_dst_lo, &xmm_dst_hi);
1996 save_128_aligned (
1997 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1999 ps += 4;
2000 pd += 4;
2001 pm += 4;
2002 w -= 4;
2005 while (w)
2007 s = *ps++;
2008 m = *pm++;
2009 d = *pd;
2011 *pd++ = pack_1x128_32 (
2012 pix_multiply_1x128 (
2013 unpack_32_1x128 (d),
2014 negate_1x128 (pix_multiply_1x128 (
2015 unpack_32_1x128 (m),
2016 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2017 w--;
2021 static force_inline uint32_t
2022 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2023 uint32_t mask,
2024 uint32_t dst)
2026 __m128i m = unpack_32_1x128 (mask);
2027 __m128i s = unpack_32_1x128 (src);
2028 __m128i d = unpack_32_1x128 (dst);
2029 __m128i sa = expand_alpha_1x128 (s);
2030 __m128i da = expand_alpha_1x128 (d);
2032 s = pix_multiply_1x128 (s, m);
2033 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2035 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2038 static void
2039 sse2_combine_atop_ca (pixman_implementation_t *imp,
2040 pixman_op_t op,
2041 uint32_t * pd,
2042 const uint32_t * ps,
2043 const uint32_t * pm,
2044 int w)
2046 uint32_t s, m, d;
2048 __m128i xmm_src_lo, xmm_src_hi;
2049 __m128i xmm_dst_lo, xmm_dst_hi;
2050 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2051 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2052 __m128i xmm_mask_lo, xmm_mask_hi;
2054 while (w && (uintptr_t)pd & 15)
2056 s = *ps++;
2057 m = *pm++;
2058 d = *pd;
2060 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2061 w--;
2064 while (w >= 4)
2066 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2067 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2068 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2070 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2071 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2072 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2074 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2075 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2076 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2077 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2079 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2080 &xmm_mask_lo, &xmm_mask_hi,
2081 &xmm_src_lo, &xmm_src_hi);
2082 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2083 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2084 &xmm_mask_lo, &xmm_mask_hi);
2086 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2088 pix_add_multiply_2x128 (
2089 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2090 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2091 &xmm_dst_lo, &xmm_dst_hi);
2093 save_128_aligned (
2094 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2096 ps += 4;
2097 pd += 4;
2098 pm += 4;
2099 w -= 4;
2102 while (w)
2104 s = *ps++;
2105 m = *pm++;
2106 d = *pd;
2108 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2109 w--;
2113 static force_inline uint32_t
2114 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2115 uint32_t mask,
2116 uint32_t dst)
2118 __m128i m = unpack_32_1x128 (mask);
2119 __m128i s = unpack_32_1x128 (src);
2120 __m128i d = unpack_32_1x128 (dst);
2122 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2123 __m128i sa = expand_alpha_1x128 (s);
2125 s = pix_multiply_1x128 (s, m);
2126 m = pix_multiply_1x128 (m, sa);
2128 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2131 static void
2132 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2133 pixman_op_t op,
2134 uint32_t * pd,
2135 const uint32_t * ps,
2136 const uint32_t * pm,
2137 int w)
2139 uint32_t s, m, d;
2141 __m128i xmm_src_lo, xmm_src_hi;
2142 __m128i xmm_dst_lo, xmm_dst_hi;
2143 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2144 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2145 __m128i xmm_mask_lo, xmm_mask_hi;
2147 while (w && (uintptr_t)pd & 15)
2149 s = *ps++;
2150 m = *pm++;
2151 d = *pd;
2153 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2154 w--;
2157 while (w >= 4)
2159 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2160 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2161 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2163 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2164 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2165 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2167 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2168 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2169 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2170 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2172 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2173 &xmm_mask_lo, &xmm_mask_hi,
2174 &xmm_src_lo, &xmm_src_hi);
2175 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2176 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2177 &xmm_mask_lo, &xmm_mask_hi);
2179 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2180 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2182 pix_add_multiply_2x128 (
2183 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2184 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2185 &xmm_dst_lo, &xmm_dst_hi);
2187 save_128_aligned (
2188 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2190 ps += 4;
2191 pd += 4;
2192 pm += 4;
2193 w -= 4;
2196 while (w)
2198 s = *ps++;
2199 m = *pm++;
2200 d = *pd;
2202 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2203 w--;
2207 static force_inline uint32_t
2208 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2209 uint32_t mask,
2210 uint32_t dst)
2212 __m128i a = unpack_32_1x128 (mask);
2213 __m128i s = unpack_32_1x128 (src);
2214 __m128i d = unpack_32_1x128 (dst);
2216 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2217 a, expand_alpha_1x128 (s)));
2218 __m128i dest = pix_multiply_1x128 (s, a);
2219 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2221 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2222 &alpha_dst,
2223 &dest,
2224 &alpha_src));
2227 static void
2228 sse2_combine_xor_ca (pixman_implementation_t *imp,
2229 pixman_op_t op,
2230 uint32_t * pd,
2231 const uint32_t * ps,
2232 const uint32_t * pm,
2233 int w)
2235 uint32_t s, m, d;
2237 __m128i xmm_src_lo, xmm_src_hi;
2238 __m128i xmm_dst_lo, xmm_dst_hi;
2239 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2240 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2241 __m128i xmm_mask_lo, xmm_mask_hi;
2243 while (w && (uintptr_t)pd & 15)
2245 s = *ps++;
2246 m = *pm++;
2247 d = *pd;
2249 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2250 w--;
2253 while (w >= 4)
2255 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2256 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2257 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2259 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2260 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2261 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2263 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2264 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2265 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2266 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2268 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2269 &xmm_mask_lo, &xmm_mask_hi,
2270 &xmm_src_lo, &xmm_src_hi);
2271 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2272 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2273 &xmm_mask_lo, &xmm_mask_hi);
2275 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2276 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2277 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2278 &xmm_mask_lo, &xmm_mask_hi);
2280 pix_add_multiply_2x128 (
2281 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2282 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2283 &xmm_dst_lo, &xmm_dst_hi);
2285 save_128_aligned (
2286 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2288 ps += 4;
2289 pd += 4;
2290 pm += 4;
2291 w -= 4;
2294 while (w)
2296 s = *ps++;
2297 m = *pm++;
2298 d = *pd;
2300 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2301 w--;
2305 static void
2306 sse2_combine_add_ca (pixman_implementation_t *imp,
2307 pixman_op_t op,
2308 uint32_t * pd,
2309 const uint32_t * ps,
2310 const uint32_t * pm,
2311 int w)
2313 uint32_t s, m, d;
2315 __m128i xmm_src_lo, xmm_src_hi;
2316 __m128i xmm_dst_lo, xmm_dst_hi;
2317 __m128i xmm_mask_lo, xmm_mask_hi;
2319 while (w && (uintptr_t)pd & 15)
2321 s = *ps++;
2322 m = *pm++;
2323 d = *pd;
2325 *pd++ = pack_1x128_32 (
2326 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2327 unpack_32_1x128 (m)),
2328 unpack_32_1x128 (d)));
2329 w--;
2332 while (w >= 4)
2334 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2335 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2336 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2338 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2339 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2340 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2342 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2343 &xmm_mask_lo, &xmm_mask_hi,
2344 &xmm_src_lo, &xmm_src_hi);
2346 save_128_aligned (
2347 (__m128i*)pd, pack_2x128_128 (
2348 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2349 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2351 ps += 4;
2352 pd += 4;
2353 pm += 4;
2354 w -= 4;
2357 while (w)
2359 s = *ps++;
2360 m = *pm++;
2361 d = *pd;
2363 *pd++ = pack_1x128_32 (
2364 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2365 unpack_32_1x128 (m)),
2366 unpack_32_1x128 (d)));
2367 w--;
2371 static force_inline __m128i
2372 create_mask_16_128 (uint16_t mask)
2374 return _mm_set1_epi16 (mask);
2377 /* Work around a code generation bug in Sun Studio 12. */
2378 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2379 # define create_mask_2x32_128(mask0, mask1) \
2380 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2381 #else
2382 static force_inline __m128i
2383 create_mask_2x32_128 (uint32_t mask0,
2384 uint32_t mask1)
2386 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2388 #endif
2390 static void
2391 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2392 pixman_composite_info_t *info)
2394 PIXMAN_COMPOSITE_ARGS (info);
2395 uint32_t src;
2396 uint32_t *dst_line, *dst, d;
2397 int32_t w;
2398 int dst_stride;
2399 __m128i xmm_src, xmm_alpha;
2400 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2402 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2404 if (src == 0)
2405 return;
2407 PIXMAN_IMAGE_GET_LINE (
2408 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2410 xmm_src = expand_pixel_32_1x128 (src);
2411 xmm_alpha = expand_alpha_1x128 (xmm_src);
2413 while (height--)
2415 dst = dst_line;
2417 dst_line += dst_stride;
2418 w = width;
2420 while (w && (uintptr_t)dst & 15)
2422 d = *dst;
2423 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2424 xmm_alpha,
2425 unpack_32_1x128 (d)));
2426 w--;
2429 while (w >= 4)
2431 xmm_dst = load_128_aligned ((__m128i*)dst);
2433 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2435 over_2x128 (&xmm_src, &xmm_src,
2436 &xmm_alpha, &xmm_alpha,
2437 &xmm_dst_lo, &xmm_dst_hi);
2439 /* rebuild the 4 pixel data and save */
2440 save_128_aligned (
2441 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2443 w -= 4;
2444 dst += 4;
2447 while (w)
2449 d = *dst;
2450 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2451 xmm_alpha,
2452 unpack_32_1x128 (d)));
2453 w--;
2459 static void
2460 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2461 pixman_composite_info_t *info)
2463 PIXMAN_COMPOSITE_ARGS (info);
2464 uint32_t src;
2465 uint16_t *dst_line, *dst, d;
2466 int32_t w;
2467 int dst_stride;
2468 __m128i xmm_src, xmm_alpha;
2469 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2471 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2473 if (src == 0)
2474 return;
2476 PIXMAN_IMAGE_GET_LINE (
2477 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2479 xmm_src = expand_pixel_32_1x128 (src);
2480 xmm_alpha = expand_alpha_1x128 (xmm_src);
2482 while (height--)
2484 dst = dst_line;
2486 dst_line += dst_stride;
2487 w = width;
2489 while (w && (uintptr_t)dst & 15)
2491 d = *dst;
2493 *dst++ = pack_565_32_16 (
2494 pack_1x128_32 (over_1x128 (xmm_src,
2495 xmm_alpha,
2496 expand565_16_1x128 (d))));
2497 w--;
2500 while (w >= 8)
2502 xmm_dst = load_128_aligned ((__m128i*)dst);
2504 unpack_565_128_4x128 (xmm_dst,
2505 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2507 over_2x128 (&xmm_src, &xmm_src,
2508 &xmm_alpha, &xmm_alpha,
2509 &xmm_dst0, &xmm_dst1);
2510 over_2x128 (&xmm_src, &xmm_src,
2511 &xmm_alpha, &xmm_alpha,
2512 &xmm_dst2, &xmm_dst3);
2514 xmm_dst = pack_565_4x128_128 (
2515 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2517 save_128_aligned ((__m128i*)dst, xmm_dst);
2519 dst += 8;
2520 w -= 8;
2523 while (w--)
2525 d = *dst;
2526 *dst++ = pack_565_32_16 (
2527 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2528 expand565_16_1x128 (d))));
2534 static void
2535 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2536 pixman_composite_info_t *info)
2538 PIXMAN_COMPOSITE_ARGS (info);
2539 uint32_t src;
2540 uint32_t *dst_line, d;
2541 uint32_t *mask_line, m;
2542 uint32_t pack_cmp;
2543 int dst_stride, mask_stride;
2545 __m128i xmm_src;
2546 __m128i xmm_dst;
2547 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2549 __m128i mmx_src, mmx_mask, mmx_dest;
2551 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2553 if (src == 0)
2554 return;
2556 PIXMAN_IMAGE_GET_LINE (
2557 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2558 PIXMAN_IMAGE_GET_LINE (
2559 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2561 xmm_src = _mm_unpacklo_epi8 (
2562 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2563 mmx_src = xmm_src;
2565 while (height--)
2567 int w = width;
2568 const uint32_t *pm = (uint32_t *)mask_line;
2569 uint32_t *pd = (uint32_t *)dst_line;
2571 dst_line += dst_stride;
2572 mask_line += mask_stride;
2574 while (w && (uintptr_t)pd & 15)
2576 m = *pm++;
2578 if (m)
2580 d = *pd;
2582 mmx_mask = unpack_32_1x128 (m);
2583 mmx_dest = unpack_32_1x128 (d);
2585 *pd = pack_1x128_32 (
2586 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2587 mmx_dest));
2590 pd++;
2591 w--;
2594 while (w >= 4)
2596 xmm_mask = load_128_unaligned ((__m128i*)pm);
2598 pack_cmp =
2599 _mm_movemask_epi8 (
2600 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2602 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
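/* Illustrative reading of the test below: _mm_cmpeq_epi32 () sets a
 * 32-bit lane to all ones exactly when that mask pixel is zero, and
 * _mm_movemask_epi8 () gathers the top bit of each of the 16 bytes,
 * so pack_cmp == 0xffff means all four mask pixels are zero and the
 * whole block can be skipped. */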
2603 if (pack_cmp != 0xffff)
2605 xmm_dst = load_128_aligned ((__m128i*)pd);
2607 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2609 pix_multiply_2x128 (&xmm_src, &xmm_src,
2610 &xmm_mask_lo, &xmm_mask_hi,
2611 &xmm_mask_lo, &xmm_mask_hi);
2612 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2614 save_128_aligned (
2615 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2618 pd += 4;
2619 pm += 4;
2620 w -= 4;
2623 while (w)
2625 m = *pm++;
2627 if (m)
2629 d = *pd;
2631 mmx_mask = unpack_32_1x128 (m);
2632 mmx_dest = unpack_32_1x128 (d);
2634 *pd = pack_1x128_32 (
2635 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2636 mmx_dest));
2639 pd++;
2640 w--;
2646 static void
2647 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2648 pixman_composite_info_t *info)
2650 PIXMAN_COMPOSITE_ARGS (info);
2651 uint32_t src;
2652 uint32_t *dst_line, d;
2653 uint32_t *mask_line, m;
2654 uint32_t pack_cmp;
2655 int dst_stride, mask_stride;
2657 __m128i xmm_src, xmm_alpha;
2658 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2659 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2661 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2663 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2665 if (src == 0)
2666 return;
2668 PIXMAN_IMAGE_GET_LINE (
2669 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2670 PIXMAN_IMAGE_GET_LINE (
2671 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2673 xmm_src = _mm_unpacklo_epi8 (
2674 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2675 xmm_alpha = expand_alpha_1x128 (xmm_src);
2676 mmx_src = xmm_src;
2677 mmx_alpha = xmm_alpha;
2679 while (height--)
2681 int w = width;
2682 const uint32_t *pm = (uint32_t *)mask_line;
2683 uint32_t *pd = (uint32_t *)dst_line;
2685 dst_line += dst_stride;
2686 mask_line += mask_stride;
2688 while (w && (uintptr_t)pd & 15)
2690 m = *pm++;
2692 if (m)
2694 d = *pd;
2695 mmx_mask = unpack_32_1x128 (m);
2696 mmx_dest = unpack_32_1x128 (d);
2698 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2699 &mmx_alpha,
2700 &mmx_mask,
2701 &mmx_dest));
2704 pd++;
2705 w--;
2708 while (w >= 4)
2710 xmm_mask = load_128_unaligned ((__m128i*)pm);
2712 pack_cmp =
2713 _mm_movemask_epi8 (
2714 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2716 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2717 if (pack_cmp != 0xffff)
2719 xmm_dst = load_128_aligned ((__m128i*)pd);
2721 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2722 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2724 in_over_2x128 (&xmm_src, &xmm_src,
2725 &xmm_alpha, &xmm_alpha,
2726 &xmm_mask_lo, &xmm_mask_hi,
2727 &xmm_dst_lo, &xmm_dst_hi);
2729 save_128_aligned (
2730 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2733 pd += 4;
2734 pm += 4;
2735 w -= 4;
2738 while (w)
2740 m = *pm++;
2742 if (m)
2744 d = *pd;
2745 mmx_mask = unpack_32_1x128 (m);
2746 mmx_dest = unpack_32_1x128 (d);
2748 *pd = pack_1x128_32 (
2749 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2752 pd++;
2753 w--;
2759 static void
2760 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2761 pixman_composite_info_t *info)
2763 PIXMAN_COMPOSITE_ARGS (info);
2764 uint32_t *dst_line, *dst;
2765 uint32_t *src_line, *src;
2766 uint32_t mask;
2767 int32_t w;
2768 int dst_stride, src_stride;
2770 __m128i xmm_mask;
2771 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2772 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2773 __m128i xmm_alpha_lo, xmm_alpha_hi;
2775 PIXMAN_IMAGE_GET_LINE (
2776 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2777 PIXMAN_IMAGE_GET_LINE (
2778 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2780 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2782 xmm_mask = create_mask_16_128 (mask >> 24);
2784 while (height--)
2786 dst = dst_line;
2787 dst_line += dst_stride;
2788 src = src_line;
2789 src_line += src_stride;
2790 w = width;
2792 while (w && (uintptr_t)dst & 15)
2794 uint32_t s = *src++;
2796 if (s)
2798 uint32_t d = *dst;
2800 __m128i ms = unpack_32_1x128 (s);
2801 __m128i alpha = expand_alpha_1x128 (ms);
2802 __m128i mask = xmm_mask;
2803 __m128i dest = unpack_32_1x128 (d);
2805 *dst = pack_1x128_32 (
2806 in_over_1x128 (&ms, &alpha, &mask, &dest));
2808 dst++;
2809 w--;
2812 while (w >= 4)
2814 xmm_src = load_128_unaligned ((__m128i*)src);
2816 if (!is_zero (xmm_src))
2818 xmm_dst = load_128_aligned ((__m128i*)dst);
2820 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2821 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2822 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2823 &xmm_alpha_lo, &xmm_alpha_hi);
2825 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2826 &xmm_alpha_lo, &xmm_alpha_hi,
2827 &xmm_mask, &xmm_mask,
2828 &xmm_dst_lo, &xmm_dst_hi);
2830 save_128_aligned (
2831 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2834 dst += 4;
2835 src += 4;
2836 w -= 4;
2839 while (w)
2841 uint32_t s = *src++;
2843 if (s)
2845 uint32_t d = *dst;
2847 __m128i ms = unpack_32_1x128 (s);
2848 __m128i alpha = expand_alpha_1x128 (ms);
2849 __m128i mask = xmm_mask;
2850 __m128i dest = unpack_32_1x128 (d);
2852 *dst = pack_1x128_32 (
2853 in_over_1x128 (&ms, &alpha, &mask, &dest));
2856 dst++;
2857 w--;
2863 static void
2864 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2865 pixman_composite_info_t *info)
2867 PIXMAN_COMPOSITE_ARGS (info);
2868 uint16_t *dst_line, *dst;
2869 uint32_t *src_line, *src, s;
2870 int dst_stride, src_stride;
2871 int32_t w;
2873 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2874 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2876 while (height--)
2878 dst = dst_line;
2879 dst_line += dst_stride;
2880 src = src_line;
2881 src_line += src_stride;
2882 w = width;
2884 while (w && (uintptr_t)dst & 15)
2886 s = *src++;
2887 *dst = convert_8888_to_0565 (s);
2888 dst++;
2889 w--;
2892 while (w >= 8)
2894 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2895 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2897 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2899 w -= 8;
2900 src += 8;
2901 dst += 8;
2904 while (w)
2906 s = *src++;
2907 *dst = convert_8888_to_0565 (s);
2908 dst++;
2909 w--;
2914 static void
2915 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2916 pixman_composite_info_t *info)
2918 PIXMAN_COMPOSITE_ARGS (info);
2919 uint32_t *dst_line, *dst;
2920 uint32_t *src_line, *src;
2921 int32_t w;
2922 int dst_stride, src_stride;
2925 PIXMAN_IMAGE_GET_LINE (
2926 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2927 PIXMAN_IMAGE_GET_LINE (
2928 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2930 while (height--)
2932 dst = dst_line;
2933 dst_line += dst_stride;
2934 src = src_line;
2935 src_line += src_stride;
2936 w = width;
2938 while (w && (uintptr_t)dst & 15)
2940 *dst++ = *src++ | 0xff000000;
2941 w--;
2944 while (w >= 16)
2946 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2948 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2949 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2950 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2951 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2953 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2954 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2955 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2956 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2958 dst += 16;
2959 src += 16;
2960 w -= 16;
2963 while (w)
2965 *dst++ = *src++ | 0xff000000;
2966 w--;
2972 static void
2973 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2974 pixman_composite_info_t *info)
2976 PIXMAN_COMPOSITE_ARGS (info);
2977 uint32_t *dst_line, *dst;
2978 uint32_t *src_line, *src;
2979 uint32_t mask;
2980 int dst_stride, src_stride;
2981 int32_t w;
2983 __m128i xmm_mask, xmm_alpha;
2984 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2985 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2987 PIXMAN_IMAGE_GET_LINE (
2988 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2989 PIXMAN_IMAGE_GET_LINE (
2990 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2992 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2994 xmm_mask = create_mask_16_128 (mask >> 24);
2995 xmm_alpha = mask_00ff;
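/* An x888 source carries no alpha and is treated as fully opaque:
 * each pixel is OR-ed with 0xff000000 below, and the unpacked alpha
 * is simply mask_00ff, which presumably holds 0x00ff in every 16-bit
 * lane, i.e. an alpha of 255 per channel. */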
2997 while (height--)
2999 dst = dst_line;
3000 dst_line += dst_stride;
3001 src = src_line;
3002 src_line += src_stride;
3003 w = width;
3005 while (w && (uintptr_t)dst & 15)
3007 uint32_t s = (*src++) | 0xff000000;
3008 uint32_t d = *dst;
3010 __m128i src = unpack_32_1x128 (s);
3011 __m128i alpha = xmm_alpha;
3012 __m128i mask = xmm_mask;
3013 __m128i dest = unpack_32_1x128 (d);
3015 *dst++ = pack_1x128_32 (
3016 in_over_1x128 (&src, &alpha, &mask, &dest));
3018 w--;
3021 while (w >= 4)
3023 xmm_src = _mm_or_si128 (
3024 load_128_unaligned ((__m128i*)src), mask_ff000000);
3025 xmm_dst = load_128_aligned ((__m128i*)dst);
3027 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3028 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3030 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3031 &xmm_alpha, &xmm_alpha,
3032 &xmm_mask, &xmm_mask,
3033 &xmm_dst_lo, &xmm_dst_hi);
3035 save_128_aligned (
3036 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3038 dst += 4;
3039 src += 4;
3040 w -= 4;
3044 while (w)
3046 uint32_t s = (*src++) | 0xff000000;
3047 uint32_t d = *dst;
3049 __m128i src = unpack_32_1x128 (s);
3050 __m128i alpha = xmm_alpha;
3051 __m128i mask = xmm_mask;
3052 __m128i dest = unpack_32_1x128 (d);
3054 *dst++ = pack_1x128_32 (
3055 in_over_1x128 (&src, &alpha, &mask, &dest));
3057 w--;
3063 static void
3064 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3065 pixman_composite_info_t *info)
3067 PIXMAN_COMPOSITE_ARGS (info);
3068 int dst_stride, src_stride;
3069 uint32_t *dst_line, *dst;
3070 uint32_t *src_line, *src;
3072 PIXMAN_IMAGE_GET_LINE (
3073 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3074 PIXMAN_IMAGE_GET_LINE (
3075 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3077 dst = dst_line;
3078 src = src_line;
3080 while (height--)
3082 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3084 dst += dst_stride;
3085 src += src_stride;
3089 static force_inline uint16_t
3090 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3092 __m128i ms;
3094 ms = unpack_32_1x128 (src);
3095 return pack_565_32_16 (
3096 pack_1x128_32 (
3097 over_1x128 (
3098 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3101 static void
3102 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3103 pixman_composite_info_t *info)
3105 PIXMAN_COMPOSITE_ARGS (info);
3106 uint16_t *dst_line, *dst, d;
3107 uint32_t *src_line, *src, s;
3108 int dst_stride, src_stride;
3109 int32_t w;
3111 __m128i xmm_alpha_lo, xmm_alpha_hi;
3112 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3113 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3115 PIXMAN_IMAGE_GET_LINE (
3116 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3117 PIXMAN_IMAGE_GET_LINE (
3118 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3120 while (height--)
3122 dst = dst_line;
3123 src = src_line;
3125 dst_line += dst_stride;
3126 src_line += src_stride;
3127 w = width;
3129 /* Align dst on a 16-byte boundary */
3130 while (w &&
3131 ((uintptr_t)dst & 15))
3133 s = *src++;
3134 d = *dst;
3136 *dst++ = composite_over_8888_0565pixel (s, d);
3137 w--;
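/* Once ((uintptr_t)dst & 15) == 0 the destination sits on a 16-byte
 * boundary, so the vector loop below can use aligned loads and stores
 * on dst; the source keeps using unaligned loads, since only one of
 * the two pointers can be forced into alignment this way. */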
3140 /* It's an 8-pixel loop */
3141 while (w >= 8)
3143 /* I'm loading unaligned because I'm not sure
3144 * about the address alignment. */
3146 xmm_src = load_128_unaligned ((__m128i*) src);
3147 xmm_dst = load_128_aligned ((__m128i*) dst);
3149 /* Unpacking */
3150 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3151 unpack_565_128_4x128 (xmm_dst,
3152 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3153 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3154 &xmm_alpha_lo, &xmm_alpha_hi);
3156 /* I'm loading the next 4 pixels from memory
3157 * ahead of time to optimize the memory read. */
3159 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3161 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3162 &xmm_alpha_lo, &xmm_alpha_hi,
3163 &xmm_dst0, &xmm_dst1);
3165 /* Unpacking */
3166 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3167 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3168 &xmm_alpha_lo, &xmm_alpha_hi);
3170 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3171 &xmm_alpha_lo, &xmm_alpha_hi,
3172 &xmm_dst2, &xmm_dst3);
3174 save_128_aligned (
3175 (__m128i*)dst, pack_565_4x128_128 (
3176 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3178 w -= 8;
3179 dst += 8;
3180 src += 8;
3183 while (w--)
3185 s = *src++;
3186 d = *dst;
3188 *dst++ = composite_over_8888_0565pixel (s, d);
3194 static void
3195 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3196 pixman_composite_info_t *info)
3198 PIXMAN_COMPOSITE_ARGS (info);
3199 uint32_t src, srca;
3200 uint32_t *dst_line, *dst;
3201 uint8_t *mask_line, *mask;
3202 int dst_stride, mask_stride;
3203 int32_t w;
3204 uint32_t m, d;
3206 __m128i xmm_src, xmm_alpha, xmm_def;
3207 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3208 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3210 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3212 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3214 srca = src >> 24;
3215 if (src == 0)
3216 return;
3218 PIXMAN_IMAGE_GET_LINE (
3219 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3220 PIXMAN_IMAGE_GET_LINE (
3221 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3223 xmm_def = create_mask_2x32_128 (src, src);
3224 xmm_src = expand_pixel_32_1x128 (src);
3225 xmm_alpha = expand_alpha_1x128 (xmm_src);
3226 mmx_src = xmm_src;
3227 mmx_alpha = xmm_alpha;
3229 while (height--)
3231 dst = dst_line;
3232 dst_line += dst_stride;
3233 mask = mask_line;
3234 mask_line += mask_stride;
3235 w = width;
3237 while (w && (uintptr_t)dst & 15)
3239 uint8_t m = *mask++;
3241 if (m)
3243 d = *dst;
3244 mmx_mask = expand_pixel_8_1x128 (m);
3245 mmx_dest = unpack_32_1x128 (d);
3247 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3248 &mmx_alpha,
3249 &mmx_mask,
3250 &mmx_dest));
3253 w--;
3254 dst++;
3257 while (w >= 4)
3259 m = *((uint32_t*)mask);
3261 if (srca == 0xff && m == 0xffffffff)
3263 save_128_aligned ((__m128i*)dst, xmm_def);
3265 else if (m)
3267 xmm_dst = load_128_aligned ((__m128i*) dst);
3268 xmm_mask = unpack_32_1x128 (m);
3269 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3271 /* Unpacking */
3272 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3273 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3275 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3276 &xmm_mask_lo, &xmm_mask_hi);
3278 in_over_2x128 (&xmm_src, &xmm_src,
3279 &xmm_alpha, &xmm_alpha,
3280 &xmm_mask_lo, &xmm_mask_hi,
3281 &xmm_dst_lo, &xmm_dst_hi);
3283 save_128_aligned (
3284 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3287 w -= 4;
3288 dst += 4;
3289 mask += 4;
3292 while (w)
3294 uint8_t m = *mask++;
3296 if (m)
3298 d = *dst;
3299 mmx_mask = expand_pixel_8_1x128 (m);
3300 mmx_dest = unpack_32_1x128 (d);
3302 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3303 &mmx_alpha,
3304 &mmx_mask,
3305 &mmx_dest));
3308 w--;
3309 dst++;
3315 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3316 __attribute__((__force_align_arg_pointer__))
3317 #endif
3318 static pixman_bool_t
3319 sse2_fill (pixman_implementation_t *imp,
3320 uint32_t * bits,
3321 int stride,
3322 int bpp,
3323 int x,
3324 int y,
3325 int width,
3326 int height,
3327 uint32_t filler)
3329 uint32_t byte_width;
3330 uint8_t *byte_line;
3332 __m128i xmm_def;
3334 if (bpp == 8)
3336 uint8_t b;
3337 uint16_t w;
3339 stride = stride * (int) sizeof (uint32_t) / 1;
3340 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3341 byte_width = width;
3342 stride *= 1;
3344 b = filler & 0xff;
3345 w = (b << 8) | b;
3346 filler = (w << 16) | w;
3348 else if (bpp == 16)
3350 stride = stride * (int) sizeof (uint32_t) / 2;
3351 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3352 byte_width = 2 * width;
3353 stride *= 2;
3355 filler = (filler & 0xffff) * 0x00010001;
3357 else if (bpp == 32)
3359 stride = stride * (int) sizeof (uint32_t) / 4;
3360 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3361 byte_width = 4 * width;
3362 stride *= 4;
3364 else
3366 return FALSE;
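/* The incoming stride is in uint32_t units, so each branch above first
 * rescales it to destination elements and then to bytes.  For 8 and
 * 16 bpp the filler is also replicated into a full 32-bit word, e.g.
 * 0xab becomes 0xabababab and 0x1234 becomes 0x12341234, so the same
 * 128-bit store path can be reused for every depth. */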
3369 xmm_def = create_mask_2x32_128 (filler, filler);
3371 while (height--)
3373 int w;
3374 uint8_t *d = byte_line;
3375 byte_line += stride;
3376 w = byte_width;
3378 if (w >= 1 && ((uintptr_t)d & 1))
3380 *(uint8_t *)d = filler;
3381 w -= 1;
3382 d += 1;
3385 while (w >= 2 && ((uintptr_t)d & 3))
3387 *(uint16_t *)d = filler;
3388 w -= 2;
3389 d += 2;
3392 while (w >= 4 && ((uintptr_t)d & 15))
3394 *(uint32_t *)d = filler;
3396 w -= 4;
3397 d += 4;
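/* With d now 16-byte aligned, the bulk is written with aligned 128-bit
 * stores in descending chunk sizes (128, 64, 32, 16 bytes); whatever
 * is left under 16 bytes falls back to 4-, 2- and 1-byte stores. */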
3400 while (w >= 128)
3402 save_128_aligned ((__m128i*)(d), xmm_def);
3403 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3404 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3405 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3406 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3407 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3408 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3409 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3411 d += 128;
3412 w -= 128;
3415 if (w >= 64)
3417 save_128_aligned ((__m128i*)(d), xmm_def);
3418 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3419 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3420 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3422 d += 64;
3423 w -= 64;
3426 if (w >= 32)
3428 save_128_aligned ((__m128i*)(d), xmm_def);
3429 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3431 d += 32;
3432 w -= 32;
3435 if (w >= 16)
3437 save_128_aligned ((__m128i*)(d), xmm_def);
3439 d += 16;
3440 w -= 16;
3443 while (w >= 4)
3445 *(uint32_t *)d = filler;
3447 w -= 4;
3448 d += 4;
3451 if (w >= 2)
3453 *(uint16_t *)d = filler;
3454 w -= 2;
3455 d += 2;
3458 if (w >= 1)
3460 *(uint8_t *)d = filler;
3461 w -= 1;
3462 d += 1;
3466 return TRUE;
3469 static void
3470 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3471 pixman_composite_info_t *info)
3473 PIXMAN_COMPOSITE_ARGS (info);
3474 uint32_t src, srca;
3475 uint32_t *dst_line, *dst;
3476 uint8_t *mask_line, *mask;
3477 int dst_stride, mask_stride;
3478 int32_t w;
3479 uint32_t m;
3481 __m128i xmm_src, xmm_def;
3482 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3484 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3486 srca = src >> 24;
3487 if (src == 0)
3489 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3490 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3491 dest_x, dest_y, width, height, 0);
3492 return;
3495 PIXMAN_IMAGE_GET_LINE (
3496 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3497 PIXMAN_IMAGE_GET_LINE (
3498 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3500 xmm_def = create_mask_2x32_128 (src, src);
3501 xmm_src = expand_pixel_32_1x128 (src);
3503 while (height--)
3505 dst = dst_line;
3506 dst_line += dst_stride;
3507 mask = mask_line;
3508 mask_line += mask_stride;
3509 w = width;
3511 while (w && (uintptr_t)dst & 15)
3513 uint8_t m = *mask++;
3515 if (m)
3517 *dst = pack_1x128_32 (
3518 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3520 else
3522 *dst = 0;
3525 w--;
3526 dst++;
3529 while (w >= 4)
3531 m = *((uint32_t*)mask);
3533 if (srca == 0xff && m == 0xffffffff)
3535 save_128_aligned ((__m128i*)dst, xmm_def);
3537 else if (m)
3539 xmm_mask = unpack_32_1x128 (m);
3540 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3542 /* Unpacking */
3543 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3545 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3546 &xmm_mask_lo, &xmm_mask_hi);
3548 pix_multiply_2x128 (&xmm_src, &xmm_src,
3549 &xmm_mask_lo, &xmm_mask_hi,
3550 &xmm_mask_lo, &xmm_mask_hi);
3552 save_128_aligned (
3553 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3555 else
3557 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3560 w -= 4;
3561 dst += 4;
3562 mask += 4;
3565 while (w)
3567 uint8_t m = *mask++;
3569 if (m)
3571 *dst = pack_1x128_32 (
3572 pix_multiply_1x128 (
3573 xmm_src, expand_pixel_8_1x128 (m)));
3575 else
3577 *dst = 0;
3580 w--;
3581 dst++;
3587 static void
3588 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3589 pixman_composite_info_t *info)
3591 PIXMAN_COMPOSITE_ARGS (info);
3592 uint32_t src;
3593 uint16_t *dst_line, *dst, d;
3594 uint8_t *mask_line, *mask;
3595 int dst_stride, mask_stride;
3596 int32_t w;
3597 uint32_t m;
3598 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3600 __m128i xmm_src, xmm_alpha;
3601 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3602 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3604 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3606 if (src == 0)
3607 return;
3609 PIXMAN_IMAGE_GET_LINE (
3610 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3611 PIXMAN_IMAGE_GET_LINE (
3612 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3614 xmm_src = expand_pixel_32_1x128 (src);
3615 xmm_alpha = expand_alpha_1x128 (xmm_src);
3616 mmx_src = xmm_src;
3617 mmx_alpha = xmm_alpha;
3619 while (height--)
3621 dst = dst_line;
3622 dst_line += dst_stride;
3623 mask = mask_line;
3624 mask_line += mask_stride;
3625 w = width;
3627 while (w && (uintptr_t)dst & 15)
3629 m = *mask++;
3631 if (m)
3633 d = *dst;
3634 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3635 mmx_dest = expand565_16_1x128 (d);
3637 *dst = pack_565_32_16 (
3638 pack_1x128_32 (
3639 in_over_1x128 (
3640 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3643 w--;
3644 dst++;
3647 while (w >= 8)
3649 xmm_dst = load_128_aligned ((__m128i*) dst);
3650 unpack_565_128_4x128 (xmm_dst,
3651 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3653 m = *((uint32_t*)mask);
3654 mask += 4;
3656 if (m)
3658 xmm_mask = unpack_32_1x128 (m);
3659 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3661 /* Unpacking */
3662 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3664 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3665 &xmm_mask_lo, &xmm_mask_hi);
3667 in_over_2x128 (&xmm_src, &xmm_src,
3668 &xmm_alpha, &xmm_alpha,
3669 &xmm_mask_lo, &xmm_mask_hi,
3670 &xmm_dst0, &xmm_dst1);
3673 m = *((uint32_t*)mask);
3674 mask += 4;
3676 if (m)
3678 xmm_mask = unpack_32_1x128 (m);
3679 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3681 /* Unpacking */
3682 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3684 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3685 &xmm_mask_lo, &xmm_mask_hi);
3686 in_over_2x128 (&xmm_src, &xmm_src,
3687 &xmm_alpha, &xmm_alpha,
3688 &xmm_mask_lo, &xmm_mask_hi,
3689 &xmm_dst2, &xmm_dst3);
3692 save_128_aligned (
3693 (__m128i*)dst, pack_565_4x128_128 (
3694 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3696 w -= 8;
3697 dst += 8;
3700 while (w)
3702 m = *mask++;
3704 if (m)
3706 d = *dst;
3707 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3708 mmx_dest = expand565_16_1x128 (d);
3710 *dst = pack_565_32_16 (
3711 pack_1x128_32 (
3712 in_over_1x128 (
3713 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3716 w--;
3717 dst++;
3723 static void
3724 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3725 pixman_composite_info_t *info)
3727 PIXMAN_COMPOSITE_ARGS (info);
3728 uint16_t *dst_line, *dst, d;
3729 uint32_t *src_line, *src, s;
3730 int dst_stride, src_stride;
3731 int32_t w;
3732 uint32_t opaque, zero;
3734 __m128i ms;
3735 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3736 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3738 PIXMAN_IMAGE_GET_LINE (
3739 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3740 PIXMAN_IMAGE_GET_LINE (
3741 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3743 while (height--)
3745 dst = dst_line;
3746 dst_line += dst_stride;
3747 src = src_line;
3748 src_line += src_stride;
3749 w = width;
3751 while (w && (uintptr_t)dst & 15)
3753 s = *src++;
3754 d = *dst;
3756 ms = unpack_32_1x128 (s);
3758 *dst++ = pack_565_32_16 (
3759 pack_1x128_32 (
3760 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3761 w--;
3764 while (w >= 8)
3766 /* First round */
3767 xmm_src = load_128_unaligned ((__m128i*)src);
3768 xmm_dst = load_128_aligned ((__m128i*)dst);
3770 opaque = is_opaque (xmm_src);
3771 zero = is_zero (xmm_src);
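/* Fast-path classification, assuming is_opaque () tests that every
 * alpha byte is 0xff and is_zero () tests that the whole vector is
 * zero: an opaque source reduces OVER to a copy (only the channel
 * swap in invert_colors_2x128 is needed), an all-zero source leaves
 * the destination untouched, and everything else takes the full
 * over_rev_non_pre path. */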
3773 unpack_565_128_4x128 (xmm_dst,
3774 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3775 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3777 /* preload next round */
3778 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3780 if (opaque)
3782 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3783 &xmm_dst0, &xmm_dst1);
3785 else if (!zero)
3787 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3788 &xmm_dst0, &xmm_dst1);
3791 /* Second round */
3792 opaque = is_opaque (xmm_src);
3793 zero = is_zero (xmm_src);
3795 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3797 if (opaque)
3799 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3800 &xmm_dst2, &xmm_dst3);
3802 else if (!zero)
3804 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3805 &xmm_dst2, &xmm_dst3);
3808 save_128_aligned (
3809 (__m128i*)dst, pack_565_4x128_128 (
3810 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3812 w -= 8;
3813 src += 8;
3814 dst += 8;
3817 while (w)
3819 s = *src++;
3820 d = *dst;
3822 ms = unpack_32_1x128 (s);
3824 *dst++ = pack_565_32_16 (
3825 pack_1x128_32 (
3826 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3827 w--;
3833 static void
3834 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3835 pixman_composite_info_t *info)
3837 PIXMAN_COMPOSITE_ARGS (info);
3838 uint32_t *dst_line, *dst, d;
3839 uint32_t *src_line, *src, s;
3840 int dst_stride, src_stride;
3841 int32_t w;
3842 uint32_t opaque, zero;
3844 __m128i xmm_src_lo, xmm_src_hi;
3845 __m128i xmm_dst_lo, xmm_dst_hi;
3847 PIXMAN_IMAGE_GET_LINE (
3848 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3849 PIXMAN_IMAGE_GET_LINE (
3850 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3852 while (height--)
3854 dst = dst_line;
3855 dst_line += dst_stride;
3856 src = src_line;
3857 src_line += src_stride;
3858 w = width;
3860 while (w && (uintptr_t)dst & 15)
3862 s = *src++;
3863 d = *dst;
3865 *dst++ = pack_1x128_32 (
3866 over_rev_non_pre_1x128 (
3867 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3869 w--;
3872 while (w >= 4)
3874 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3876 opaque = is_opaque (xmm_src_hi);
3877 zero = is_zero (xmm_src_hi);
3879 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3881 if (opaque)
3883 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3884 &xmm_dst_lo, &xmm_dst_hi);
3886 save_128_aligned (
3887 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3889 else if (!zero)
3891 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3893 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3895 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3896 &xmm_dst_lo, &xmm_dst_hi);
3898 save_128_aligned (
3899 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3902 w -= 4;
3903 dst += 4;
3904 src += 4;
3907 while (w)
3909 s = *src++;
3910 d = *dst;
3912 *dst++ = pack_1x128_32 (
3913 over_rev_non_pre_1x128 (
3914 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3916 w--;
3922 static void
3923 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3924 pixman_composite_info_t *info)
3926 PIXMAN_COMPOSITE_ARGS (info);
3927 uint32_t src;
3928 uint16_t *dst_line, *dst, d;
3929 uint32_t *mask_line, *mask, m;
3930 int dst_stride, mask_stride;
3931 int w;
3932 uint32_t pack_cmp;
3934 __m128i xmm_src, xmm_alpha;
3935 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3936 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3938 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3940 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3942 if (src == 0)
3943 return;
3945 PIXMAN_IMAGE_GET_LINE (
3946 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3947 PIXMAN_IMAGE_GET_LINE (
3948 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3950 xmm_src = expand_pixel_32_1x128 (src);
3951 xmm_alpha = expand_alpha_1x128 (xmm_src);
3952 mmx_src = xmm_src;
3953 mmx_alpha = xmm_alpha;
3955 while (height--)
3957 w = width;
3958 mask = mask_line;
3959 dst = dst_line;
3960 mask_line += mask_stride;
3961 dst_line += dst_stride;
3963 while (w && ((uintptr_t)dst & 15))
3965 m = *(uint32_t *) mask;
3967 if (m)
3969 d = *dst;
3970 mmx_mask = unpack_32_1x128 (m);
3971 mmx_dest = expand565_16_1x128 (d);
3973 *dst = pack_565_32_16 (
3974 pack_1x128_32 (
3975 in_over_1x128 (
3976 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3979 w--;
3980 dst++;
3981 mask++;
3984 while (w >= 8)
3986 /* First round */
3987 xmm_mask = load_128_unaligned ((__m128i*)mask);
3988 xmm_dst = load_128_aligned ((__m128i*)dst);
3990 pack_cmp = _mm_movemask_epi8 (
3991 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3993 unpack_565_128_4x128 (xmm_dst,
3994 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3995 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3997 /* preload next round */
3998 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
4001 if (pack_cmp != 0xffff)
4003 in_over_2x128 (&xmm_src, &xmm_src,
4004 &xmm_alpha, &xmm_alpha,
4005 &xmm_mask_lo, &xmm_mask_hi,
4006 &xmm_dst0, &xmm_dst1);
4009 /* Second round */
4010 pack_cmp = _mm_movemask_epi8 (
4011 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4013 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4015 if (pack_cmp != 0xffff)
4017 in_over_2x128 (&xmm_src, &xmm_src,
4018 &xmm_alpha, &xmm_alpha,
4019 &xmm_mask_lo, &xmm_mask_hi,
4020 &xmm_dst2, &xmm_dst3);
4023 save_128_aligned (
4024 (__m128i*)dst, pack_565_4x128_128 (
4025 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4027 w -= 8;
4028 dst += 8;
4029 mask += 8;
4032 while (w)
4034 m = *(uint32_t *) mask;
4036 if (m)
4038 d = *dst;
4039 mmx_mask = unpack_32_1x128 (m);
4040 mmx_dest = expand565_16_1x128 (d);
4042 *dst = pack_565_32_16 (
4043 pack_1x128_32 (
4044 in_over_1x128 (
4045 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4048 w--;
4049 dst++;
4050 mask++;
4056 static void
4057 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4058 pixman_composite_info_t *info)
4060 PIXMAN_COMPOSITE_ARGS (info);
4061 uint8_t *dst_line, *dst;
4062 uint8_t *mask_line, *mask;
4063 int dst_stride, mask_stride;
4064 uint32_t d, m;
4065 uint32_t src;
4066 int32_t w;
4068 __m128i xmm_alpha;
4069 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4070 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4072 PIXMAN_IMAGE_GET_LINE (
4073 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4074 PIXMAN_IMAGE_GET_LINE (
4075 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4077 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4079 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4081 while (height--)
4083 dst = dst_line;
4084 dst_line += dst_stride;
4085 mask = mask_line;
4086 mask_line += mask_stride;
4087 w = width;
4089 while (w && ((uintptr_t)dst & 15))
4091 m = (uint32_t) *mask++;
4092 d = (uint32_t) *dst;
4094 *dst++ = (uint8_t) pack_1x128_32 (
4095 pix_multiply_1x128 (
4096 pix_multiply_1x128 (xmm_alpha,
4097 unpack_32_1x128 (m)),
4098 unpack_32_1x128 (d)));
4099 w--;
4102 while (w >= 16)
4104 xmm_mask = load_128_unaligned ((__m128i*)mask);
4105 xmm_dst = load_128_aligned ((__m128i*)dst);
4107 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4108 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4110 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4111 &xmm_mask_lo, &xmm_mask_hi,
4112 &xmm_mask_lo, &xmm_mask_hi);
4114 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4115 &xmm_dst_lo, &xmm_dst_hi,
4116 &xmm_dst_lo, &xmm_dst_hi);
4118 save_128_aligned (
4119 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4121 mask += 16;
4122 dst += 16;
4123 w -= 16;
4126 while (w)
4128 m = (uint32_t) *mask++;
4129 d = (uint32_t) *dst;
4131 *dst++ = (uint8_t) pack_1x128_32 (
4132 pix_multiply_1x128 (
4133 pix_multiply_1x128 (
4134 xmm_alpha, unpack_32_1x128 (m)),
4135 unpack_32_1x128 (d)));
4136 w--;
4142 static void
4143 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4144 pixman_composite_info_t *info)
4146 PIXMAN_COMPOSITE_ARGS (info);
4147 uint8_t *dst_line, *dst;
4148 int dst_stride;
4149 uint32_t d;
4150 uint32_t src;
4151 int32_t w;
4153 __m128i xmm_alpha;
4154 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4156 PIXMAN_IMAGE_GET_LINE (
4157 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4159 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4161 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4163 src = src >> 24;
4165 if (src == 0xff)
4166 return;
4168 if (src == 0x00)
4170 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4171 8, dest_x, dest_y, width, height, src);
4173 return;
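/* IN with a solid source reduces to dest = dest * src.alpha / 255,
 * so an alpha of 0xff is a no-op and an alpha of 0x00 clears the
 * destination; both extremes are short-circuited above. */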
4176 while (height--)
4178 dst = dst_line;
4179 dst_line += dst_stride;
4180 w = width;
4182 while (w && ((uintptr_t)dst & 15))
4184 d = (uint32_t) *dst;
4186 *dst++ = (uint8_t) pack_1x128_32 (
4187 pix_multiply_1x128 (
4188 xmm_alpha,
4189 unpack_32_1x128 (d)));
4190 w--;
4193 while (w >= 16)
4195 xmm_dst = load_128_aligned ((__m128i*)dst);
4197 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4199 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4200 &xmm_dst_lo, &xmm_dst_hi,
4201 &xmm_dst_lo, &xmm_dst_hi);
4203 save_128_aligned (
4204 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4206 dst += 16;
4207 w -= 16;
4210 while (w)
4212 d = (uint32_t) *dst;
4214 *dst++ = (uint8_t) pack_1x128_32 (
4215 pix_multiply_1x128 (
4216 xmm_alpha,
4217 unpack_32_1x128 (d)));
4218 w--;
4224 static void
4225 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4226 pixman_composite_info_t *info)
4228 PIXMAN_COMPOSITE_ARGS (info);
4229 uint8_t *dst_line, *dst;
4230 uint8_t *src_line, *src;
4231 int src_stride, dst_stride;
4232 int32_t w;
4233 uint32_t s, d;
4235 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4236 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4238 PIXMAN_IMAGE_GET_LINE (
4239 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4240 PIXMAN_IMAGE_GET_LINE (
4241 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4243 while (height--)
4245 dst = dst_line;
4246 dst_line += dst_stride;
4247 src = src_line;
4248 src_line += src_stride;
4249 w = width;
4251 while (w && ((uintptr_t)dst & 15))
4253 s = (uint32_t) *src++;
4254 d = (uint32_t) *dst;
4256 *dst++ = (uint8_t) pack_1x128_32 (
4257 pix_multiply_1x128 (
4258 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4259 w--;
4262 while (w >= 16)
4264 xmm_src = load_128_unaligned ((__m128i*)src);
4265 xmm_dst = load_128_aligned ((__m128i*)dst);
4267 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4268 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4270 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4271 &xmm_dst_lo, &xmm_dst_hi,
4272 &xmm_dst_lo, &xmm_dst_hi);
4274 save_128_aligned (
4275 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4277 src += 16;
4278 dst += 16;
4279 w -= 16;
4282 while (w)
4284 s = (uint32_t) *src++;
4285 d = (uint32_t) *dst;
4287 *dst++ = (uint8_t) pack_1x128_32 (
4288 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4289 w--;
4295 static void
4296 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4297 pixman_composite_info_t *info)
4299 PIXMAN_COMPOSITE_ARGS (info);
4300 uint8_t *dst_line, *dst;
4301 uint8_t *mask_line, *mask;
4302 int dst_stride, mask_stride;
4303 int32_t w;
4304 uint32_t src;
4305 uint32_t m, d;
4307 __m128i xmm_alpha;
4308 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4309 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4311 PIXMAN_IMAGE_GET_LINE (
4312 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4313 PIXMAN_IMAGE_GET_LINE (
4314 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4316 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4318 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4320 while (height--)
4322 dst = dst_line;
4323 dst_line += dst_stride;
4324 mask = mask_line;
4325 mask_line += mask_stride;
4326 w = width;
4328 while (w && ((uintptr_t)dst & 15))
4330 m = (uint32_t) *mask++;
4331 d = (uint32_t) *dst;
4333 *dst++ = (uint8_t) pack_1x128_32 (
4334 _mm_adds_epu16 (
4335 pix_multiply_1x128 (
4336 xmm_alpha, unpack_32_1x128 (m)),
4337 unpack_32_1x128 (d)));
4338 w--;
4341 while (w >= 16)
4343 xmm_mask = load_128_unaligned ((__m128i*)mask);
4344 xmm_dst = load_128_aligned ((__m128i*)dst);
4346 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4347 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4349 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4350 &xmm_mask_lo, &xmm_mask_hi,
4351 &xmm_mask_lo, &xmm_mask_hi);
4353 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4354 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4356 save_128_aligned (
4357 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4359 mask += 16;
4360 dst += 16;
4361 w -= 16;
4364 while (w)
4366 m = (uint32_t) *mask++;
4367 d = (uint32_t) *dst;
4369 *dst++ = (uint8_t) pack_1x128_32 (
4370 _mm_adds_epu16 (
4371 pix_multiply_1x128 (
4372 xmm_alpha, unpack_32_1x128 (m)),
4373 unpack_32_1x128 (d)));
4375 w--;
4381 static void
4382 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4383 pixman_composite_info_t *info)
4385 PIXMAN_COMPOSITE_ARGS (info);
4386 uint8_t *dst_line, *dst;
4387 int dst_stride;
4388 int32_t w;
4389 uint32_t src;
4391 __m128i xmm_src;
4393 PIXMAN_IMAGE_GET_LINE (
4394 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4396 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4398 src >>= 24;
4400 if (src == 0x00)
4401 return;
4403 if (src == 0xff)
4405 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4406 8, dest_x, dest_y, width, height, 0xff);
4408 return;
4411 src = (src << 24) | (src << 16) | (src << 8) | src;
4412 xmm_src = _mm_set_epi32 (src, src, src, src);
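/* The 8-bit alpha is replicated into every byte (e.g. 0x7f becomes
 * 0x7f7f7f7f) and broadcast to all four lanes, so one _mm_adds_epu8 ()
 * below adds the same saturated value to 16 destination bytes. */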
4414 while (height--)
4416 dst = dst_line;
4417 dst_line += dst_stride;
4418 w = width;
4420 while (w && ((uintptr_t)dst & 15))
4422 *dst = (uint8_t)_mm_cvtsi128_si32 (
4423 _mm_adds_epu8 (
4424 xmm_src,
4425 _mm_cvtsi32_si128 (*dst)));
4427 w--;
4428 dst++;
4431 while (w >= 16)
4433 save_128_aligned (
4434 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4436 dst += 16;
4437 w -= 16;
4440 while (w)
4442 *dst = (uint8_t)_mm_cvtsi128_si32 (
4443 _mm_adds_epu8 (
4444 xmm_src,
4445 _mm_cvtsi32_si128 (*dst)));
4447 w--;
4448 dst++;
4454 static void
4455 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4456 pixman_composite_info_t *info)
4458 PIXMAN_COMPOSITE_ARGS (info);
4459 uint8_t *dst_line, *dst;
4460 uint8_t *src_line, *src;
4461 int dst_stride, src_stride;
4462 int32_t w;
4463 uint16_t t;
4465 PIXMAN_IMAGE_GET_LINE (
4466 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4467 PIXMAN_IMAGE_GET_LINE (
4468 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4470 while (height--)
4472 dst = dst_line;
4473 src = src_line;
4475 dst_line += dst_stride;
4476 src_line += src_stride;
4477 w = width;
4479 /* Small head */
4480 while (w && (uintptr_t)dst & 3)
4482 t = (*dst) + (*src++);
4483 *dst++ = t | (0 - (t >> 8));
4484 w--;
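/* The scalar form above is a branch-free saturating byte add: when the
 * 16-bit sum t overflows 8 bits, (t >> 8) is 1, (0 - (t >> 8)) is all
 * ones, and the stored byte clamps to 0xff.  The aligned middle part
 * is then handed to sse2_combine_add_u () four bytes at a time, which
 * is why the pointers advance by w & 0xfffc and only w & 3 trailing
 * bytes remain for the small tail below. */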
4487 sse2_combine_add_u (imp, op,
4488 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4490 /* Small tail */
4491 dst += w & 0xfffc;
4492 src += w & 0xfffc;
4494 w &= 3;
4496 while (w)
4498 t = (*dst) + (*src++);
4499 *dst++ = t | (0 - (t >> 8));
4500 w--;
4506 static void
4507 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4508 pixman_composite_info_t *info)
4510 PIXMAN_COMPOSITE_ARGS (info);
4511 uint32_t *dst_line, *dst;
4512 uint32_t *src_line, *src;
4513 int dst_stride, src_stride;
4515 PIXMAN_IMAGE_GET_LINE (
4516 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4517 PIXMAN_IMAGE_GET_LINE (
4518 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4520 while (height--)
4522 dst = dst_line;
4523 dst_line += dst_stride;
4524 src = src_line;
4525 src_line += src_stride;
4527 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4531 static void
4532 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4533 pixman_composite_info_t *info)
4535 PIXMAN_COMPOSITE_ARGS (info);
4536 uint32_t *dst_line, *dst, src;
4537 int dst_stride;
4539 __m128i xmm_src;
4541 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4543 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4544 if (src == 0)
4545 return;
4547 if (src == ~0)
4549 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4550 dest_x, dest_y, width, height, ~0);
4552 return;
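/* ADD with a zero source changes nothing, and adding 0xffffffff
 * saturates every channel to 0xff, so both extremes are handled above
 * without entering the per-pixel loop. */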
4555 xmm_src = _mm_set_epi32 (src, src, src, src);
4556 while (height--)
4558 int w = width;
4559 uint32_t d;
4561 dst = dst_line;
4562 dst_line += dst_stride;
4564 while (w && (uintptr_t)dst & 15)
4566 d = *dst;
4567 *dst++ =
4568 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4569 w--;
4572 while (w >= 4)
4574 save_128_aligned
4575 ((__m128i*)dst,
4576 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4578 dst += 4;
4579 w -= 4;
4582 while (w--)
4584 d = *dst;
4585 *dst++ =
4586 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4587 _mm_cvtsi32_si128 (d)));
4592 static void
4593 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4594 pixman_composite_info_t *info)
4596 PIXMAN_COMPOSITE_ARGS (info);
4597 uint32_t *dst_line, *dst;
4598 uint8_t *mask_line, *mask;
4599 int dst_stride, mask_stride;
4600 int32_t w;
4601 uint32_t src;
4603 __m128i xmm_src;
4605 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4606 if (src == 0)
4607 return;
4608 xmm_src = expand_pixel_32_1x128 (src);
4610 PIXMAN_IMAGE_GET_LINE (
4611 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4612 PIXMAN_IMAGE_GET_LINE (
4613 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4615 while (height--)
4617 dst = dst_line;
4618 dst_line += dst_stride;
4619 mask = mask_line;
4620 mask_line += mask_stride;
4621 w = width;
4623 while (w && ((uintptr_t)dst & 15))
4625 uint8_t m = *mask++;
4626 if (m)
4628 *dst = pack_1x128_32
4629 (_mm_adds_epu16
4630 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4631 unpack_32_1x128 (*dst)));
4633 dst++;
4634 w--;
4637 while (w >= 4)
4639 uint32_t m = *(uint32_t*)mask;
4640 if (m)
4642 __m128i xmm_mask_lo, xmm_mask_hi;
4643 __m128i xmm_dst_lo, xmm_dst_hi;
4645 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4646 __m128i xmm_mask =
4647 _mm_unpacklo_epi8 (unpack_32_1x128(m),
4648 _mm_setzero_si128 ());
4650 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4651 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4653 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4654 &xmm_mask_lo, &xmm_mask_hi);
4656 pix_multiply_2x128 (&xmm_src, &xmm_src,
4657 &xmm_mask_lo, &xmm_mask_hi,
4658 &xmm_mask_lo, &xmm_mask_hi);
4660 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4661 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4663 save_128_aligned (
4664 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4667 w -= 4;
4668 dst += 4;
4669 mask += 4;
4672 while (w)
4674 uint8_t m = *mask++;
4675 if (m)
4677 *dst = pack_1x128_32
4678 (_mm_adds_epu16
4679 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4680 unpack_32_1x128 (*dst)));
4682 dst++;
4683 w--;
4688 static pixman_bool_t
4689 sse2_blt (pixman_implementation_t *imp,
4690 uint32_t * src_bits,
4691 uint32_t * dst_bits,
4692 int src_stride,
4693 int dst_stride,
4694 int src_bpp,
4695 int dst_bpp,
4696 int src_x,
4697 int src_y,
4698 int dest_x,
4699 int dest_y,
4700 int width,
4701 int height)
4703 uint8_t * src_bytes;
4704 uint8_t * dst_bytes;
4705 int byte_width;
4707 if (src_bpp != dst_bpp)
4708 return FALSE;
4710 if (src_bpp == 16)
4712 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4713 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4714 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4715 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4716 byte_width = 2 * width;
4717 src_stride *= 2;
4718 dst_stride *= 2;
4720 else if (src_bpp == 32)
4722 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4723 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4724 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4725 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4726 byte_width = 4 * width;
4727 src_stride *= 4;
4728 dst_stride *= 4;
4730 else
4732 return FALSE;
4735 while (height--)
4737 int w;
4738 uint8_t *s = src_bytes;
4739 uint8_t *d = dst_bytes;
4740 src_bytes += src_stride;
4741 dst_bytes += dst_stride;
4742 w = byte_width;
4744 while (w >= 2 && ((uintptr_t)d & 3))
4746 *(uint16_t *)d = *(uint16_t *)s;
4747 w -= 2;
4748 s += 2;
4749 d += 2;
4752 while (w >= 4 && ((uintptr_t)d & 15))
4754 *(uint32_t *)d = *(uint32_t *)s;
4756 w -= 4;
4757 s += 4;
4758 d += 4;
4761 while (w >= 64)
4763 __m128i xmm0, xmm1, xmm2, xmm3;
4765 xmm0 = load_128_unaligned ((__m128i*)(s));
4766 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4767 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4768 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4770 save_128_aligned ((__m128i*)(d), xmm0);
4771 save_128_aligned ((__m128i*)(d + 16), xmm1);
4772 save_128_aligned ((__m128i*)(d + 32), xmm2);
4773 save_128_aligned ((__m128i*)(d + 48), xmm3);
4775 s += 64;
4776 d += 64;
4777 w -= 64;
4780 while (w >= 16)
4782 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4784 w -= 16;
4785 d += 16;
4786 s += 16;
4789 while (w >= 4)
4791 *(uint32_t *)d = *(uint32_t *)s;
4793 w -= 4;
4794 s += 4;
4795 d += 4;
4798 if (w >= 2)
4800 *(uint16_t *)d = *(uint16_t *)s;
4801 w -= 2;
4802 s += 2;
4803 d += 2;
4807 return TRUE;
4810 static void
4811 sse2_composite_copy_area (pixman_implementation_t *imp,
4812 pixman_composite_info_t *info)
4814 PIXMAN_COMPOSITE_ARGS (info);
4815 sse2_blt (imp, src_image->bits.bits,
4816 dest_image->bits.bits,
4817 src_image->bits.rowstride,
4818 dest_image->bits.rowstride,
4819 PIXMAN_FORMAT_BPP (src_image->bits.format),
4820 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4821 src_x, src_y, dest_x, dest_y, width, height);
4824 static void
4825 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4826 pixman_composite_info_t *info)
4828 PIXMAN_COMPOSITE_ARGS (info);
4829 uint32_t *src, *src_line, s;
4830 uint32_t *dst, *dst_line, d;
4831 uint8_t *mask, *mask_line;
4832 uint32_t m;
4833 int src_stride, mask_stride, dst_stride;
4834 int32_t w;
4835 __m128i ms;
4837 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4838 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4839 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4841 PIXMAN_IMAGE_GET_LINE (
4842 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4843 PIXMAN_IMAGE_GET_LINE (
4844 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4845 PIXMAN_IMAGE_GET_LINE (
4846 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4848 while (height--)
4850 src = src_line;
4851 src_line += src_stride;
4852 dst = dst_line;
4853 dst_line += dst_stride;
4854 mask = mask_line;
4855 mask_line += mask_stride;
4857 w = width;
4859 while (w && (uintptr_t)dst & 15)
4861 s = 0xff000000 | *src++;
4862 m = (uint32_t) *mask++;
4863 d = *dst;
4864 ms = unpack_32_1x128 (s);
4866 if (m != 0xff)
4868 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4869 __m128i md = unpack_32_1x128 (d);
4871 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4874 *dst++ = pack_1x128_32 (ms);
4875 w--;
4878 while (w >= 4)
4880 m = *(uint32_t*) mask;
4881 xmm_src = _mm_or_si128 (
4882 load_128_unaligned ((__m128i*)src), mask_ff000000);
4884 if (m == 0xffffffff)
4886 save_128_aligned ((__m128i*)dst, xmm_src);
4888 else
4890 xmm_dst = load_128_aligned ((__m128i*)dst);
4892 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4894 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4895 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4896 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4898 expand_alpha_rev_2x128 (
4899 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4901 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4902 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4903 &xmm_dst_lo, &xmm_dst_hi);
4905 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4908 src += 4;
4909 dst += 4;
4910 mask += 4;
4911 w -= 4;
4914 while (w)
4916 m = (uint32_t) *mask++;
4918 if (m)
4920 s = 0xff000000 | *src;
4922 if (m == 0xff)
4924 *dst = s;
4926 else
4928 __m128i ma, md, ms;
4930 d = *dst;
4932 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4933 md = unpack_32_1x128 (d);
4934 ms = unpack_32_1x128 (s);
4936 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4941 src++;
4942 dst++;
4943 w--;
4949 static void
4950 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4951 pixman_composite_info_t *info)
4953 PIXMAN_COMPOSITE_ARGS (info);
4954 uint32_t *src, *src_line, s;
4955 uint32_t *dst, *dst_line, d;
4956 uint8_t *mask, *mask_line;
4957 uint32_t m;
4958 int src_stride, mask_stride, dst_stride;
4959 int32_t w;
4961 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4962 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4963 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4965 PIXMAN_IMAGE_GET_LINE (
4966 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4967 PIXMAN_IMAGE_GET_LINE (
4968 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4969 PIXMAN_IMAGE_GET_LINE (
4970 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4972 while (height--)
4974 src = src_line;
4975 src_line += src_stride;
4976 dst = dst_line;
4977 dst_line += dst_stride;
4978 mask = mask_line;
4979 mask_line += mask_stride;
4981 w = width;
4983 while (w && (uintptr_t)dst & 15)
4985 uint32_t sa;
4987 s = *src++;
4988 m = (uint32_t) *mask++;
4989 d = *dst;
4991 sa = s >> 24;
4993 if (m)
4995 if (sa == 0xff && m == 0xff)
4997 *dst = s;
4999 else
5001 __m128i ms, md, ma, msa;
5003 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5004 ms = unpack_32_1x128 (s);
5005 md = unpack_32_1x128 (d);
5007 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5009 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
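/* in_over_1x128 (src, srca, mask, dst) presumably computes
 * (src IN mask) OVER dst: the source color and its alpha are first
 * scaled by the 8-bit mask, then composited over the destination.
 * The sa == 0xff && m == 0xff case above is the shortcut where this
 * collapses to a plain copy. */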
5013 dst++;
5014 w--;
5017 while (w >= 4)
5019 m = *(uint32_t *) mask;
5021 if (m)
5023 xmm_src = load_128_unaligned ((__m128i*)src);
5025 if (m == 0xffffffff && is_opaque (xmm_src))
5027 save_128_aligned ((__m128i *)dst, xmm_src);
5029 else
5031 xmm_dst = load_128_aligned ((__m128i *)dst);
5033 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5035 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5036 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5037 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5039 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5040 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5042 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5043 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5045 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5049 src += 4;
5050 dst += 4;
5051 mask += 4;
5052 w -= 4;
5055 while (w)
5057 uint32_t sa;
5059 s = *src++;
5060 m = (uint32_t) *mask++;
5061 d = *dst;
5063 sa = s >> 24;
5065 if (m)
5067 if (sa == 0xff && m == 0xff)
5069 *dst = s;
5071 else
5073 __m128i ms, md, ma, msa;
5075 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5076 ms = unpack_32_1x128 (s);
5077 md = unpack_32_1x128 (d);
5079 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5081 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5085 dst++;
5086 w--;
5092 static void
5093 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5094 pixman_composite_info_t *info)
5096 PIXMAN_COMPOSITE_ARGS (info);
5097 uint32_t src;
5098 uint32_t *dst_line, *dst;
5099 __m128i xmm_src;
5100 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5101 __m128i xmm_dsta_hi, xmm_dsta_lo;
5102 int dst_stride;
5103 int32_t w;
5105 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5107 if (src == 0)
5108 return;
5110 PIXMAN_IMAGE_GET_LINE (
5111 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5113 xmm_src = expand_pixel_32_1x128 (src);
5115 while (height--)
5117 dst = dst_line;
5119 dst_line += dst_stride;
5120 w = width;
5122 while (w && (uintptr_t)dst & 15)
5124 __m128i vd;
5126 vd = unpack_32_1x128 (*dst);
5128 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5129 xmm_src));
5130 w--;
5131 dst++;
5134 while (w >= 4)
5136 __m128i tmp_lo, tmp_hi;
5138 xmm_dst = load_128_aligned ((__m128i*)dst);
5140 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5141 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5143 tmp_lo = xmm_src;
5144 tmp_hi = xmm_src;
5146 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5147 &xmm_dsta_lo, &xmm_dsta_hi,
5148 &tmp_lo, &tmp_hi);
5150 save_128_aligned (
5151 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5153 w -= 4;
5154 dst += 4;
5157 while (w)
5159 __m128i vd;
5161 vd = unpack_32_1x128 (*dst);
5163 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5164 xmm_src));
5165 w--;
5166 dst++;
5173 static void
5174 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5175 pixman_composite_info_t *info)
5177 PIXMAN_COMPOSITE_ARGS (info);
5178 uint32_t *src, *src_line, s;
5179 uint32_t *dst, *dst_line, d;
5180 uint32_t *mask, *mask_line;
5181 uint32_t m;
5182 int src_stride, mask_stride, dst_stride;
5183 int32_t w;
5185 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5186 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5187 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5189 PIXMAN_IMAGE_GET_LINE (
5190 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5191 PIXMAN_IMAGE_GET_LINE (
5192 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5193 PIXMAN_IMAGE_GET_LINE (
5194 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5196 while (height--)
5198 src = src_line;
5199 src_line += src_stride;
5200 dst = dst_line;
5201 dst_line += dst_stride;
5202 mask = mask_line;
5203 mask_line += mask_stride;
5205 w = width;
5207 while (w && (uintptr_t)dst & 15)
5209 uint32_t sa;
5211 s = *src++;
5212 m = (*mask++) >> 24;
5213 d = *dst;
5215 sa = s >> 24;
5217 if (m)
5219 if (sa == 0xff && m == 0xff)
5221 *dst = s;
5223 else
5225 __m128i ms, md, ma, msa;
5227 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5228 ms = unpack_32_1x128 (s);
5229 md = unpack_32_1x128 (d);
5231 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5233 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5237 dst++;
5238 w--;
5241 while (w >= 4)
5243 xmm_mask = load_128_unaligned ((__m128i*)mask);
5245 if (!is_transparent (xmm_mask))
5247 xmm_src = load_128_unaligned ((__m128i*)src);
5249 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5251 save_128_aligned ((__m128i *)dst, xmm_src);
5253 else
5255 xmm_dst = load_128_aligned ((__m128i *)dst);
5257 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5258 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5259 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5261 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5262 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5264 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5265 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5267 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5271 src += 4;
5272 dst += 4;
5273 mask += 4;
5274 w -= 4;
5277 while (w)
5279 uint32_t sa;
5281 s = *src++;
5282 m = (*mask++) >> 24;
5283 d = *dst;
5285 sa = s >> 24;
5287 if (m)
5289 if (sa == 0xff && m == 0xff)
5291 *dst = s;
5293 else
5295 __m128i ms, md, ma, msa;
5297 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5298 ms = unpack_32_1x128 (s);
5299 md = unpack_32_1x128 (d);
5301 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5303 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5307 dst++;
5308 w--;
5314 /* A variant of 'sse2_combine_over_u' with minor tweaks */
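/*
 * vx is a 16.16 fixed-point source coordinate advanced by unit_x for every
 * destination pixel; pixman_fixed_to_int (vx) selects the nearest source
 * pixel, and the "while (vx >= 0) vx -= src_width_fixed" loops wrap the
 * coordinate back into the source line for repeating sources.
 */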
5315 static force_inline void
5316 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5317 const uint32_t* ps,
5318 int32_t w,
5319 pixman_fixed_t vx,
5320 pixman_fixed_t unit_x,
5321 pixman_fixed_t src_width_fixed,
5322 pixman_bool_t fully_transparent_src)
5324 uint32_t s, d;
5325 const uint32_t* pm = NULL;
5327 __m128i xmm_dst_lo, xmm_dst_hi;
5328 __m128i xmm_src_lo, xmm_src_hi;
5329 __m128i xmm_alpha_lo, xmm_alpha_hi;
5331 if (fully_transparent_src)
5332 return;
5334 /* Align dst on a 16-byte boundary */
5335 while (w && ((uintptr_t)pd & 15))
5337 d = *pd;
5338 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5339 vx += unit_x;
5340 while (vx >= 0)
5341 vx -= src_width_fixed;
5343 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5344 if (pm)
5345 pm++;
5346 w--;
5349 while (w >= 4)
5351 __m128i tmp;
5352 uint32_t tmp1, tmp2, tmp3, tmp4;
5354 tmp1 = *(ps + pixman_fixed_to_int (vx));
5355 vx += unit_x;
5356 while (vx >= 0)
5357 vx -= src_width_fixed;
5358 tmp2 = *(ps + pixman_fixed_to_int (vx));
5359 vx += unit_x;
5360 while (vx >= 0)
5361 vx -= src_width_fixed;
5362 tmp3 = *(ps + pixman_fixed_to_int (vx));
5363 vx += unit_x;
5364 while (vx >= 0)
5365 vx -= src_width_fixed;
5366 tmp4 = *(ps + pixman_fixed_to_int (vx));
5367 vx += unit_x;
5368 while (vx >= 0)
5369 vx -= src_width_fixed;
5371 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5373 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5375 if (is_opaque (xmm_src_hi))
5377 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5379 else if (!is_zero (xmm_src_hi))
5381 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5383 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5384 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5386 expand_alpha_2x128 (
5387 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5389 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5390 &xmm_alpha_lo, &xmm_alpha_hi,
5391 &xmm_dst_lo, &xmm_dst_hi);
5393 /* rebuild the 4 pixel data and save */
5394 save_128_aligned ((__m128i*)pd,
5395 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5398 w -= 4;
5399 pd += 4;
5400 if (pm)
5401 pm += 4;
5404 while (w)
5406 d = *pd;
5407 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5408 vx += unit_x;
5409 while (vx >= 0)
5410 vx -= src_width_fixed;
5412 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5413 if (pm)
5414 pm++;
5416 w--;
5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5421 scaled_nearest_scanline_sse2_8888_8888_OVER,
5422 uint32_t, uint32_t, COVER)
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5424 scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 uint32_t, uint32_t, NONE)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5427 scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 uint32_t, uint32_t, PAD)
5429 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5430 scaled_nearest_scanline_sse2_8888_8888_OVER,
5431 uint32_t, uint32_t, NORMAL)
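/*
 * Nearest-scaled OVER with a solid mask: the mask's alpha byte is expanded
 * once into xmm_mask with create_mask_16_128 (*mask >> 24) and applied with
 * in_over for every pixel; the scanline is skipped entirely when the source
 * is known to be zero or the mask alpha is 0.
 */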
5433 static force_inline void
5434 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5435 uint32_t * dst,
5436 const uint32_t * src,
5437 int32_t w,
5438 pixman_fixed_t vx,
5439 pixman_fixed_t unit_x,
5440 pixman_fixed_t src_width_fixed,
5441 pixman_bool_t zero_src)
5443 __m128i xmm_mask;
5444 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5445 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5446 __m128i xmm_alpha_lo, xmm_alpha_hi;
5448 if (zero_src || (*mask >> 24) == 0)
5449 return;
5451 xmm_mask = create_mask_16_128 (*mask >> 24);
5453 while (w && (uintptr_t)dst & 15)
5455 uint32_t s = *(src + pixman_fixed_to_int (vx));
5456 vx += unit_x;
5457 while (vx >= 0)
5458 vx -= src_width_fixed;
5460 if (s)
5462 uint32_t d = *dst;
5464 __m128i ms = unpack_32_1x128 (s);
5465 __m128i alpha = expand_alpha_1x128 (ms);
5466 __m128i dest = xmm_mask;
5467 __m128i alpha_dst = unpack_32_1x128 (d);
5469 *dst = pack_1x128_32 (
5470 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5472 dst++;
5473 w--;
5476 while (w >= 4)
5478 uint32_t tmp1, tmp2, tmp3, tmp4;
5480 tmp1 = *(src + pixman_fixed_to_int (vx));
5481 vx += unit_x;
5482 while (vx >= 0)
5483 vx -= src_width_fixed;
5484 tmp2 = *(src + pixman_fixed_to_int (vx));
5485 vx += unit_x;
5486 while (vx >= 0)
5487 vx -= src_width_fixed;
5488 tmp3 = *(src + pixman_fixed_to_int (vx));
5489 vx += unit_x;
5490 while (vx >= 0)
5491 vx -= src_width_fixed;
5492 tmp4 = *(src + pixman_fixed_to_int (vx));
5493 vx += unit_x;
5494 while (vx >= 0)
5495 vx -= src_width_fixed;
5497 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5499 if (!is_zero (xmm_src))
5501 xmm_dst = load_128_aligned ((__m128i*)dst);
5503 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5504 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5505 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5506 &xmm_alpha_lo, &xmm_alpha_hi);
5508 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5509 &xmm_alpha_lo, &xmm_alpha_hi,
5510 &xmm_mask, &xmm_mask,
5511 &xmm_dst_lo, &xmm_dst_hi);
5513 save_128_aligned (
5514 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5517 dst += 4;
5518 w -= 4;
5521 while (w)
5523 uint32_t s = *(src + pixman_fixed_to_int (vx));
5524 vx += unit_x;
5525 while (vx >= 0)
5526 vx -= src_width_fixed;
5528 if (s)
5530 uint32_t d = *dst;
5532 __m128i ms = unpack_32_1x128 (s);
5533 __m128i alpha = expand_alpha_1x128 (ms);
5534 __m128i mask = xmm_mask;
5535 __m128i dest = unpack_32_1x128 (d);
5537 *dst = pack_1x128_32 (
5538 in_over_1x128 (&ms, &alpha, &mask, &dest));
5541 dst++;
5542 w--;
5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5556 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5557 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5558 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
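/*
 * Bilinear fetching: for every destination pixel a 2x2 block of source
 * pixels is read (the top pair from src_top, the bottom pair from src_bottom
 * at vx >> 16) and blended with the vertical weights wt/wb and a horizontal
 * weight taken from the fractional part of vx.  Per channel this is roughly
 * (W = 1 << BILINEAR_INTERPOLATION_BITS, wt + wb == W, wx = fractional part
 * of vx reduced to W steps):
 *
 *     pix = ((tl * wt + bl * wb) * (W - wx) +
 *            (tr * wt + br * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS);
 *
 * The two macro sets below implement this computation.  The PSHUFD_IS_FAST
 * variant precomputes the horizontal weights for four pixels at a time and
 * extracts the pair it needs with _mm_shuffle_epi32, while the default
 * variant recomputes the weights for every pixel.
 */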
5560 #if PSHUFD_IS_FAST
5562 /***********************************************************************************/
5564 # define BILINEAR_DECLARE_VARIABLES \
5565 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5566 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5567 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5568 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5569 unit_x, -unit_x, unit_x, -unit_x); \
5570 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5571 unit_x * 4, -unit_x * 4, \
5572 unit_x * 4, -unit_x * 4, \
5573 unit_x * 4, -unit_x * 4); \
5574 const __m128i xmm_zero = _mm_setzero_si128 (); \
5575 __m128i xmm_x = _mm_set_epi16 (vx + unit_x * 3, -(vx + 1) - unit_x * 3, \
5576 vx + unit_x * 2, -(vx + 1) - unit_x * 2, \
5577 vx + unit_x * 1, -(vx + 1) - unit_x * 1, \
5578 vx + unit_x * 0, -(vx + 1) - unit_x * 0); \
5579 __m128i xmm_wh_state;
5581 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase_) \
5582 do { \
5583 int phase = phase_; \
5584 __m128i xmm_wh, xmm_a, xmm_b; \
5585 /* fetch 2x2 pixel block into sse2 registers */ \
5586 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5587 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5588 vx += unit_x; \
5589 /* vertical interpolation */ \
5590 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5591 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5592 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5593 /* calculate horizontal weights */ \
5594 if (phase <= 0) \
5596 xmm_wh_state = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5597 16 - BILINEAR_INTERPOLATION_BITS)); \
5598 xmm_x = _mm_add_epi16 (xmm_x, (phase < 0) ? xmm_ux1 : xmm_ux4); \
5599 phase = 0; \
5601 xmm_wh = _mm_shuffle_epi32 (xmm_wh_state, _MM_SHUFFLE (phase, phase, \
5602 phase, phase)); \
5603 /* horizontal interpolation */ \
5604 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5605 xmm_a, _MM_SHUFFLE (1, 0, 3, 2)), xmm_a), xmm_wh); \
5606 /* shift the result */ \
5607 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5608 } while (0)
5610 #else /************************************************************************/
5612 # define BILINEAR_DECLARE_VARIABLES \
5613 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5614 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5615 const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5616 const __m128i xmm_ux1 = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x, \
5617 unit_x, -unit_x, unit_x, -unit_x); \
5618 const __m128i xmm_ux4 = _mm_set_epi16 (unit_x * 4, -unit_x * 4, \
5619 unit_x * 4, -unit_x * 4, \
5620 unit_x * 4, -unit_x * 4, \
5621 unit_x * 4, -unit_x * 4); \
5622 const __m128i xmm_zero = _mm_setzero_si128 (); \
5623 __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1), \
5624 vx, -(vx + 1), vx, -(vx + 1))
5626 #define BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER(pix, phase) \
5627 do { \
5628 __m128i xmm_wh, xmm_a, xmm_b; \
5629 /* fetch 2x2 pixel block into sse2 registers */ \
5630 __m128i tltr = _mm_loadl_epi64 ((__m128i *)&src_top[vx >> 16]); \
5631 __m128i blbr = _mm_loadl_epi64 ((__m128i *)&src_bottom[vx >> 16]); \
5632 (void)xmm_ux4; /* suppress warning: unused variable 'xmm_ux4' */ \
5633 vx += unit_x; \
5634 /* vertical interpolation */ \
5635 xmm_a = _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), xmm_wt); \
5636 xmm_b = _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), xmm_wb); \
5637 xmm_a = _mm_add_epi16 (xmm_a, xmm_b); \
5638 /* calculate horizontal weights */ \
5639 xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x, \
5640 16 - BILINEAR_INTERPOLATION_BITS)); \
5641 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5642 /* horizontal interpolation */ \
5643 xmm_b = _mm_unpacklo_epi64 (/* any value is fine here */ xmm_b, xmm_a); \
5644 xmm_a = _mm_madd_epi16 (_mm_unpackhi_epi16 (xmm_b, xmm_a), xmm_wh); \
5645 /* shift the result */ \
5646 pix = _mm_srli_epi32 (xmm_a, BILINEAR_INTERPOLATION_BITS * 2); \
5647 } while (0)
5649 /***********************************************************************************/
5651 #endif
5653 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5654 do { \
5655 __m128i xmm_pix; \
5656 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix, -1); \
5657 xmm_pix = _mm_packs_epi32 (xmm_pix, xmm_pix); \
5658 xmm_pix = _mm_packus_epi16 (xmm_pix, xmm_pix); \
5659 pix = _mm_cvtsi128_si32 (xmm_pix); \
5660 } while(0)
5662 #define BILINEAR_INTERPOLATE_FOUR_PIXELS(pix) \
5663 do { \
5664 __m128i xmm_pix1, xmm_pix2, xmm_pix3, xmm_pix4; \
5665 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix1, 0); \
5666 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix2, 1); \
5667 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix3, 2); \
5668 BILINEAR_INTERPOLATE_ONE_PIXEL_HELPER (xmm_pix4, 3); \
5669 xmm_pix1 = _mm_packs_epi32 (xmm_pix1, xmm_pix2); \
5670 xmm_pix3 = _mm_packs_epi32 (xmm_pix3, xmm_pix4); \
5671 pix = _mm_packus_epi16 (xmm_pix1, xmm_pix3); \
5672 } while(0)
5674 #define BILINEAR_SKIP_ONE_PIXEL() \
5675 do { \
5676 vx += unit_x; \
5677 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux1); \
5678 } while(0)
5680 #define BILINEAR_SKIP_FOUR_PIXELS() \
5681 do { \
5682 vx += unit_x * 4; \
5683 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux4); \
5684 } while(0)
5686 /***********************************************************************************/
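/*
 * All of the bilinear scanline functions below share the same skeleton:
 * interpolate single pixels until dst reaches a 16-byte boundary, process
 * four pixels per iteration with BILINEAR_INTERPOLATE_FOUR_PIXELS and
 * aligned stores, then finish the remaining pixels one at a time.
 */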
5688 static force_inline void
5689 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5690 const uint32_t * mask,
5691 const uint32_t * src_top,
5692 const uint32_t * src_bottom,
5693 int32_t w,
5694 int wt,
5695 int wb,
5696 pixman_fixed_t vx_,
5697 pixman_fixed_t unit_x_,
5698 pixman_fixed_t max_vx,
5699 pixman_bool_t zero_src)
5701 intptr_t vx = vx_;
5702 intptr_t unit_x = unit_x_;
5703 BILINEAR_DECLARE_VARIABLES;
5704 uint32_t pix1, pix2;
5706 while (w && ((uintptr_t)dst & 15))
5708 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5709 *dst++ = pix1;
5710 w--;
5713 while ((w -= 4) >= 0) {
5714 __m128i xmm_src;
5715 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5716 _mm_store_si128 ((__m128i *)dst, xmm_src);
5717 dst += 4;
5720 if (w & 2)
5722 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5724 *dst++ = pix1;
5725 *dst++ = pix2;
5728 if (w & 1)
5730 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5731 *dst = pix1;
5736 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5737 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5738 uint32_t, uint32_t, uint32_t,
5739 COVER, FLAG_NONE)
5740 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5741 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5742 uint32_t, uint32_t, uint32_t,
5743 PAD, FLAG_NONE)
5744 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5745 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5746 uint32_t, uint32_t, uint32_t,
5747 NONE, FLAG_NONE)
5748 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5749 scaled_bilinear_scanline_sse2_8888_8888_SRC,
5750 uint32_t, uint32_t, uint32_t,
5751 NORMAL, FLAG_NONE)
5753 static force_inline void
5754 scaled_bilinear_scanline_sse2_x888_8888_SRC (uint32_t * dst,
5755 const uint32_t * mask,
5756 const uint32_t * src_top,
5757 const uint32_t * src_bottom,
5758 int32_t w,
5759 int wt,
5760 int wb,
5761 pixman_fixed_t vx_,
5762 pixman_fixed_t unit_x_,
5763 pixman_fixed_t max_vx,
5764 pixman_bool_t zero_src)
5766 intptr_t vx = vx_;
5767 intptr_t unit_x = unit_x_;
5768 BILINEAR_DECLARE_VARIABLES;
5769 uint32_t pix1, pix2;
5771 while (w && ((uintptr_t)dst & 15))
5773 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5774 *dst++ = pix1 | 0xFF000000;
5775 w--;
5778 while ((w -= 4) >= 0) {
5779 __m128i xmm_src;
5780 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5781 _mm_store_si128 ((__m128i *)dst, _mm_or_si128 (xmm_src, mask_ff000000));
5782 dst += 4;
5785 if (w & 2)
5787 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5788 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5789 *dst++ = pix1 | 0xFF000000;
5790 *dst++ = pix2 | 0xFF000000;
5793 if (w & 1)
5795 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5796 *dst = pix1 | 0xFF000000;
5800 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_cover_SRC,
5801 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5802 uint32_t, uint32_t, uint32_t,
5803 COVER, FLAG_NONE)
5804 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_pad_SRC,
5805 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5806 uint32_t, uint32_t, uint32_t,
5807 PAD, FLAG_NONE)
5808 FAST_BILINEAR_MAINLOOP_COMMON (sse2_x888_8888_normal_SRC,
5809 scaled_bilinear_scanline_sse2_x888_8888_SRC,
5810 uint32_t, uint32_t, uint32_t,
5811 NORMAL, FLAG_NONE)
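/*
 * Bilinear-scaled OVER: each interpolated 4-pixel block is first tested with
 * is_zero () (nothing to do) and is_opaque () (plain store) before falling
 * back to the generic over_2x128 () path.
 */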
5813 static force_inline void
5814 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5815 const uint32_t * mask,
5816 const uint32_t * src_top,
5817 const uint32_t * src_bottom,
5818 int32_t w,
5819 int wt,
5820 int wb,
5821 pixman_fixed_t vx_,
5822 pixman_fixed_t unit_x_,
5823 pixman_fixed_t max_vx,
5824 pixman_bool_t zero_src)
5826 intptr_t vx = vx_;
5827 intptr_t unit_x = unit_x_;
5828 BILINEAR_DECLARE_VARIABLES;
5829 uint32_t pix1, pix2;
5831 while (w && ((uintptr_t)dst & 15))
5833 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5835 if (pix1)
5837 pix2 = *dst;
5838 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5841 w--;
5842 dst++;
5845 while (w >= 4)
5847 __m128i xmm_src;
5848 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5849 __m128i xmm_alpha_hi, xmm_alpha_lo;
5851 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5853 if (!is_zero (xmm_src))
5855 if (is_opaque (xmm_src))
5857 save_128_aligned ((__m128i *)dst, xmm_src);
5859 else
5861 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5863 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5864 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5866 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5867 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5868 &xmm_dst_lo, &xmm_dst_hi);
5870 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5874 w -= 4;
5875 dst += 4;
5878 while (w)
5880 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5882 if (pix1)
5884 pix2 = *dst;
5885 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5888 w--;
5889 dst++;
5893 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5894 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5895 uint32_t, uint32_t, uint32_t,
5896 COVER, FLAG_NONE)
5897 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5898 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5899 uint32_t, uint32_t, uint32_t,
5900 PAD, FLAG_NONE)
5901 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5902 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5903 uint32_t, uint32_t, uint32_t,
5904 NONE, FLAG_NONE)
5905 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5906 scaled_bilinear_scanline_sse2_8888_8888_OVER,
5907 uint32_t, uint32_t, uint32_t,
5908 NORMAL, FLAG_NONE)
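/*
 * Bilinear-scaled OVER with an a8 mask: four mask bytes are fetched at once
 * as a uint32_t.  m == 0 skips the block via BILINEAR_SKIP_FOUR_PIXELS,
 * m == 0xffffffff together with an opaque source is a plain store, and
 * everything else goes through in_over_2x128 () with the mask expanded by
 * expand_alpha_rev_2x128 ().
 */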
5910 static force_inline void
5911 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5912 const uint8_t * mask,
5913 const uint32_t * src_top,
5914 const uint32_t * src_bottom,
5915 int32_t w,
5916 int wt,
5917 int wb,
5918 pixman_fixed_t vx_,
5919 pixman_fixed_t unit_x_,
5920 pixman_fixed_t max_vx,
5921 pixman_bool_t zero_src)
5923 intptr_t vx = vx_;
5924 intptr_t unit_x = unit_x_;
5925 BILINEAR_DECLARE_VARIABLES;
5926 uint32_t pix1, pix2;
5927 uint32_t m;
5929 while (w && ((uintptr_t)dst & 15))
5931 uint32_t sa;
5933 m = (uint32_t) *mask++;
5935 if (m)
5937 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5938 sa = pix1 >> 24;
5940 if (sa == 0xff && m == 0xff)
5942 *dst = pix1;
5944 else
5946 __m128i ms, md, ma, msa;
5948 pix2 = *dst;
5949 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5950 ms = unpack_32_1x128 (pix1);
5951 md = unpack_32_1x128 (pix2);
5953 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5955 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5958 else
5960 BILINEAR_SKIP_ONE_PIXEL ();
5963 w--;
5964 dst++;
5967 while (w >= 4)
5969 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5970 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5971 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5973 m = *(uint32_t*)mask;
5975 if (m)
5977 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
5979 if (m == 0xffffffff && is_opaque (xmm_src))
5981 save_128_aligned ((__m128i *)dst, xmm_src);
5983 else
5985 xmm_dst = load_128_aligned ((__m128i *)dst);
5987 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5989 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5990 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5991 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5993 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5994 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5996 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5997 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5999 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6002 else
6004 BILINEAR_SKIP_FOUR_PIXELS ();
6007 w -= 4;
6008 dst += 4;
6009 mask += 4;
6012 while (w)
6014 uint32_t sa;
6016 m = (uint32_t) *mask++;
6018 if (m)
6020 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6021 sa = pix1 >> 24;
6023 if (sa == 0xff && m == 0xff)
6025 *dst = pix1;
6027 else
6029 __m128i ms, md, ma, msa;
6031 pix2 = *dst;
6032 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
6033 ms = unpack_32_1x128 (pix1);
6034 md = unpack_32_1x128 (pix2);
6036 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
6038 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
6041 else
6043 BILINEAR_SKIP_ONE_PIXEL ();
6046 w--;
6047 dst++;
6051 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
6052 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6053 uint32_t, uint8_t, uint32_t,
6054 COVER, FLAG_HAVE_NON_SOLID_MASK)
6055 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
6056 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6057 uint32_t, uint8_t, uint32_t,
6058 PAD, FLAG_HAVE_NON_SOLID_MASK)
6059 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
6060 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6061 uint32_t, uint8_t, uint32_t,
6062 NONE, FLAG_HAVE_NON_SOLID_MASK)
6063 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
6064 scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
6065 uint32_t, uint8_t, uint32_t,
6066 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
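/*
 * Bilinear-scaled OVER with a solid mask: as in the nearest-scaled case, the
 * mask alpha is expanded once into xmm_mask and applied with in_over; the
 * scanline returns immediately when the source is known to be zero or the
 * mask alpha is 0.
 */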
6068 static force_inline void
6069 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
6070 const uint32_t * mask,
6071 const uint32_t * src_top,
6072 const uint32_t * src_bottom,
6073 int32_t w,
6074 int wt,
6075 int wb,
6076 pixman_fixed_t vx_,
6077 pixman_fixed_t unit_x_,
6078 pixman_fixed_t max_vx,
6079 pixman_bool_t zero_src)
6081 intptr_t vx = vx_;
6082 intptr_t unit_x = unit_x_;
6083 BILINEAR_DECLARE_VARIABLES;
6084 uint32_t pix1;
6085 __m128i xmm_mask;
6087 if (zero_src || (*mask >> 24) == 0)
6088 return;
6090 xmm_mask = create_mask_16_128 (*mask >> 24);
6092 while (w && ((uintptr_t)dst & 15))
6094 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6095 if (pix1)
6097 uint32_t d = *dst;
6099 __m128i ms = unpack_32_1x128 (pix1);
6100 __m128i alpha = expand_alpha_1x128 (ms);
6101 __m128i dest = xmm_mask;
6102 __m128i alpha_dst = unpack_32_1x128 (d);
6104 *dst = pack_1x128_32
6105 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6108 dst++;
6109 w--;
6112 while (w >= 4)
6114 __m128i xmm_src;
6115 BILINEAR_INTERPOLATE_FOUR_PIXELS (xmm_src);
6117 if (!is_zero (xmm_src))
6119 __m128i xmm_src_lo, xmm_src_hi;
6120 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6121 __m128i xmm_alpha_lo, xmm_alpha_hi;
6123 xmm_dst = load_128_aligned ((__m128i*)dst);
6125 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6126 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6127 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6128 &xmm_alpha_lo, &xmm_alpha_hi);
6130 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6131 &xmm_alpha_lo, &xmm_alpha_hi,
6132 &xmm_mask, &xmm_mask,
6133 &xmm_dst_lo, &xmm_dst_hi);
6135 save_128_aligned
6136 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6139 dst += 4;
6140 w -= 4;
6143 while (w)
6145 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6146 if (pix1)
6148 uint32_t d = *dst;
6150 __m128i ms = unpack_32_1x128 (pix1);
6151 __m128i alpha = expand_alpha_1x128 (ms);
6152 __m128i dest = xmm_mask;
6153 __m128i alpha_dst = unpack_32_1x128 (d);
6155 *dst = pack_1x128_32
6156 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6159 dst++;
6160 w--;
6164 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6165 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6166 uint32_t, uint32_t, uint32_t,
6167 COVER, FLAG_HAVE_SOLID_MASK)
6168 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6169 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6170 uint32_t, uint32_t, uint32_t,
6171 PAD, FLAG_HAVE_SOLID_MASK)
6172 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6173 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6174 uint32_t, uint32_t, uint32_t,
6175 NONE, FLAG_HAVE_SOLID_MASK)
6176 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6177 scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
6178 uint32_t, uint32_t, uint32_t,
6179 NORMAL, FLAG_HAVE_SOLID_MASK)
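/*
 * Fast path table: pixman scans it in order and uses the first entry whose
 * operator and source/mask/destination formats (and flags) match the
 * request.  The SIMPLE_NEAREST_* and SIMPLE_BILINEAR_* macros expand to
 * entries that hook up the scaled fetchers defined above; the table is
 * terminated by PIXMAN_OP_NONE.
 */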
6181 static const pixman_fast_path_t sse2_fast_paths[] =
6183 /* PIXMAN_OP_OVER */
6184 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6185 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6186 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6187 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6188 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6189 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6190 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6191 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6192 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6193 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6194 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6195 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6196 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6197 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6198 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6199 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6200 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6201 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6202 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6203 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6204 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6205 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6206 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6207 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6208 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6209 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6210 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6211 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6212 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6213 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6214 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6215 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6216 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6217 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6218 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6219 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6220 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6221 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6222 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6223 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6224 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6225 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6226 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6227 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6228 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6229 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6230 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6232 /* PIXMAN_OP_OVER_REVERSE */
6233 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6234 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6236 /* PIXMAN_OP_ADD */
6237 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6238 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6239 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6240 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6241 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6242 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6243 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6244 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6245 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6246 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6247 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6248 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6249 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6250 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6252 /* PIXMAN_OP_SRC */
6253 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6254 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6255 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6256 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6257 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6258 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6259 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6260 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6261 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6262 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6263 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6264 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6265 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6266 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6267 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6268 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6269 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6270 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6272 /* PIXMAN_OP_IN */
6273 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6274 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6275 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6277 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6278 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6279 SIMPLE_NEAREST_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6280 SIMPLE_NEAREST_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6282 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6283 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6284 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6285 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6287 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6288 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6289 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6290 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6291 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6292 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6294 SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6295 SIMPLE_BILINEAR_FAST_PATH_COVER (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6296 SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6297 SIMPLE_BILINEAR_FAST_PATH_PAD (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6298 SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8r8g8b8, a8r8g8b8, sse2_x888_8888),
6299 SIMPLE_BILINEAR_FAST_PATH_NORMAL (SRC, x8b8g8r8, a8b8g8r8, sse2_x888_8888),
6301 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6302 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6303 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6304 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6306 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6307 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6308 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6309 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6311 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6312 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6313 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6314 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6316 { PIXMAN_OP_NONE },
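/*
 * Iterator fetchers: each converts one scanline of the source image into the
 * iterator's a8r8g8b8 buffer and advances iter->bits by one stride.
 * x8r8g8b8 only needs the alpha byte forced to 0xff, r5g6b5 is widened with
 * unpack_565_to_8888 (), and a8 is shifted into the alpha position.
 */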
6319 static uint32_t *
6320 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6322 int w = iter->width;
6323 __m128i ff000000 = mask_ff000000;
6324 uint32_t *dst = iter->buffer;
6325 uint32_t *src = (uint32_t *)iter->bits;
6327 iter->bits += iter->stride;
6329 while (w && ((uintptr_t)dst) & 0x0f)
6331 *dst++ = (*src++) | 0xff000000;
6332 w--;
6335 while (w >= 4)
6337 save_128_aligned (
6338 (__m128i *)dst, _mm_or_si128 (
6339 load_128_unaligned ((__m128i *)src), ff000000));
6341 dst += 4;
6342 src += 4;
6343 w -= 4;
6346 while (w)
6348 *dst++ = (*src++) | 0xff000000;
6349 w--;
6352 return iter->buffer;
6355 static uint32_t *
6356 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6358 int w = iter->width;
6359 uint32_t *dst = iter->buffer;
6360 uint16_t *src = (uint16_t *)iter->bits;
6361 __m128i ff000000 = mask_ff000000;
6363 iter->bits += iter->stride;
6365 while (w && ((uintptr_t)dst) & 0x0f)
6367 uint16_t s = *src++;
6369 *dst++ = convert_0565_to_8888 (s);
6370 w--;
6373 while (w >= 8)
6375 __m128i lo, hi, s;
6377 s = _mm_loadu_si128 ((__m128i *)src);
6379 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6380 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6382 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6383 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6385 dst += 8;
6386 src += 8;
6387 w -= 8;
6390 while (w)
6392 uint16_t s = *src++;
6394 *dst++ = convert_0565_to_8888 (s);
6395 w--;
6398 return iter->buffer;
6401 static uint32_t *
6402 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6404 int w = iter->width;
6405 uint32_t *dst = iter->buffer;
6406 uint8_t *src = iter->bits;
6407 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6409 iter->bits += iter->stride;
6411 while (w && (((uintptr_t)dst) & 15))
6413 *dst++ = *(src++) << 24;
6414 w--;
6417 while (w >= 16)
6419 xmm0 = _mm_loadu_si128((__m128i *)src);
6421 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6422 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6423 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6424 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6425 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6426 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6428 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6429 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6430 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6431 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6433 dst += 16;
6434 src += 16;
6435 w -= 16;
6438 while (w)
6440 *dst++ = *(src++) << 24;
6441 w--;
6444 return iter->buffer;
6447 #define IMAGE_FLAGS \
6448 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
6449 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6451 static const pixman_iter_info_t sse2_iters[] =
6453 { PIXMAN_x8r8g8b8, IMAGE_FLAGS, ITER_NARROW,
6454 _pixman_iter_init_bits_stride, sse2_fetch_x8r8g8b8, NULL
6456 { PIXMAN_r5g6b5, IMAGE_FLAGS, ITER_NARROW,
6457 _pixman_iter_init_bits_stride, sse2_fetch_r5g6b5, NULL
6459 { PIXMAN_a8, IMAGE_FLAGS, ITER_NARROW,
6460 _pixman_iter_init_bits_stride, sse2_fetch_a8, NULL
6462 { PIXMAN_null },
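/*
 * On 32-bit x86 the default calling convention only guarantees 4-byte stack
 * alignment, so force_align_arg_pointer is used to realign the stack before
 * running SSE2 code that may spill 16-byte values.
 */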
6465 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6466 __attribute__((__force_align_arg_pointer__))
6467 #endif
6468 pixman_implementation_t *
6469 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6471 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6473 /* SSE2 constants */
6474 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6475 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6476 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6477 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6478 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6479 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6480 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6481 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6482 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6483 mask_0080 = create_mask_16_128 (0x0080);
6484 mask_00ff = create_mask_16_128 (0x00ff);
6485 mask_0101 = create_mask_16_128 (0x0101);
6486 mask_ffff = create_mask_16_128 (0xffff);
6487 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6488 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6489 mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6490 mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6492 /* Set up function pointers */
6493 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6494 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6495 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6496 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6497 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6498 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6499 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6500 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6501 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6502 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6504 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6506 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6507 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6508 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6509 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6510 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6511 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6512 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6513 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6514 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6515 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6516 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6518 imp->blt = sse2_blt;
6519 imp->fill = sse2_fill;
6521 imp->iter_info = sse2_iters;
6523 return imp;