2 * rgb2rgb.c, Software RGB to RGB convertor
3 * pluralize by Software PAL8 to RGB convertor
4 * Software YUV to YUV convertor
5 * Software YUV to RGB convertor
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byteorder fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, mmx, ...) of this file can be used
27 * under the LGPL license.
31 #include <inttypes.h> /* for __WORDSIZE */
34 // #warning You have a misconfigured system and will probably lose performance!
35 #define __WORDSIZE MP_WORDSIZE
53 #define PREFETCH "prefetch"
54 #define PREFETCHW "prefetchw"
55 #define PAVGB "pavgusb"
56 #elif defined ( HAVE_MMX2 )
57 #define PREFETCH "prefetchnta"
58 #define PREFETCHW "prefetcht0"
65 #define PREFETCH " # nop"
66 #define PREFETCHW " # nop"
71 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
78 #define MOVNTQ "movntq"
79 #define SFENCE "sfence"
82 #define SFENCE " # nop"
85 static inline void RENAME(rgb24to32
)(const uint8_t *src
,uint8_t *dst
,long src_size
)
88 const uint8_t *s
= src
;
91 const uint8_t *mm_end
;
95 __asm
__volatile(PREFETCH
" %0"::"m"(*s
):"memory");
97 __asm
__volatile("movq %0, %%mm7"::"m"(mask32
):"memory");
102 "movd %1, %%mm0 \n\t"
103 "punpckldq 3%1, %%mm0 \n\t"
104 "movd 6%1, %%mm1 \n\t"
105 "punpckldq 9%1, %%mm1 \n\t"
106 "movd 12%1, %%mm2 \n\t"
107 "punpckldq 15%1, %%mm2 \n\t"
108 "movd 18%1, %%mm3 \n\t"
109 "punpckldq 21%1, %%mm3 \n\t"
110 "pand %%mm7, %%mm0 \n\t"
111 "pand %%mm7, %%mm1 \n\t"
112 "pand %%mm7, %%mm2 \n\t"
113 "pand %%mm7, %%mm3 \n\t"
114 MOVNTQ
" %%mm0, %0 \n\t"
115 MOVNTQ
" %%mm1, 8%0 \n\t"
116 MOVNTQ
" %%mm2, 16%0 \n\t"
124 __asm
__volatile(SFENCE:::"memory");
125 __asm
__volatile(EMMS:::"memory");
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
145 static inline void RENAME(rgb32to24
)(const uint8_t *src
,uint8_t *dst
,long src_size
)
148 const uint8_t *s
= src
;
151 const uint8_t *mm_end
;
155 __asm
__volatile(PREFETCH
" %0"::"m"(*s
):"memory");
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
200 MOVNTQ
" %%mm0, %0 \n\t"
201 MOVNTQ
" %%mm1, 8%0 \n\t"
204 :"m"(*s
),"m"(mask24l
),
205 "m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
210 __asm
__volatile(SFENCE:::"memory");
211 __asm
__volatile(EMMS:::"memory");
215 #ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
232 Original by Strepto/Astral
233 ported to gcc & bugfixed : A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32 bit C version, and and&add trick by Michael Niedermayer
237 static inline void RENAME(rgb15to16
)(const uint8_t *src
,uint8_t *dst
,long src_size
)
239 register const uint8_t* s
=src
;
240 register uint8_t* d
=dst
;
241 register const uint8_t *end
;
242 const uint8_t *mm_end
;
245 __asm
__volatile(PREFETCH
" %0"::"m"(*s
));
246 __asm
__volatile("movq %0, %%mm4"::"m"(mask15s
));
252 "movq %1, %%mm0 \n\t"
253 "movq 8%1, %%mm2 \n\t"
254 "movq %%mm0, %%mm1 \n\t"
255 "movq %%mm2, %%mm3 \n\t"
256 "pand %%mm4, %%mm0 \n\t"
257 "pand %%mm4, %%mm2 \n\t"
258 "paddw %%mm1, %%mm0 \n\t"
259 "paddw %%mm3, %%mm2 \n\t"
260 MOVNTQ
" %%mm0, %0 \n\t"
268 __asm
__volatile(SFENCE:::"memory");
269 __asm
__volatile(EMMS:::"memory");
274 register unsigned x
= *((uint32_t *)s
);
275 *((uint32_t *)d
) = (x
&0x7FFF7FFF) + (x
&0x7FE07FE0);
281 register unsigned short x
= *((uint16_t *)s
);
282 *((uint16_t *)d
) = (x
&0x7FFF) + (x
&0x7FE0);
286 static inline void RENAME(rgb16to15
)(const uint8_t *src
,uint8_t *dst
,long src_size
)
288 register const uint8_t* s
=src
;
289 register uint8_t* d
=dst
;
290 register const uint8_t *end
;
291 const uint8_t *mm_end
;
294 __asm
__volatile(PREFETCH
" %0"::"m"(*s
));
295 __asm
__volatile("movq %0, %%mm7"::"m"(mask15rg
));
296 __asm
__volatile("movq %0, %%mm6"::"m"(mask15b
));
302 "movq %1, %%mm0 \n\t"
303 "movq 8%1, %%mm2 \n\t"
304 "movq %%mm0, %%mm1 \n\t"
305 "movq %%mm2, %%mm3 \n\t"
306 "psrlq $1, %%mm0 \n\t"
307 "psrlq $1, %%mm2 \n\t"
308 "pand %%mm7, %%mm0 \n\t"
309 "pand %%mm7, %%mm2 \n\t"
310 "pand %%mm6, %%mm1 \n\t"
311 "pand %%mm6, %%mm3 \n\t"
312 "por %%mm1, %%mm0 \n\t"
313 "por %%mm3, %%mm2 \n\t"
314 MOVNTQ
" %%mm0, %0 \n\t"
322 __asm
__volatile(SFENCE:::"memory");
323 __asm
__volatile(EMMS:::"memory");
328 register uint32_t x
= *((uint32_t *)s
);
329 *((uint32_t *)d
) = ((x
>>1)&0x7FE07FE0) | (x
&0x001F001F);
335 register uint16_t x
= *((uint16_t *)s
);
336 *((uint16_t *)d
) = ((x
>>1)&0x7FE0) | (x
&0x001F);
342 static inline void RENAME(rgb32to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
344 const uint8_t *s
= src
;
347 const uint8_t *mm_end
;
349 uint16_t *d
= (uint16_t *)dst
;
353 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
361 PREFETCH
" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ
" %%mm0, (%0) \n\t"
386 : "r" (mm_end
), "m" (mask3216g
), "m" (mask3216br
), "m" (mul3216
)
389 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
391 "movq %0, %%mm7 \n\t"
392 "movq %1, %%mm6 \n\t"
393 ::"m"(red_16mask
),"m"(green_16mask
));
398 "movd %1, %%mm0 \n\t"
399 "movd 4%1, %%mm3 \n\t"
400 "punpckldq 8%1, %%mm0 \n\t"
401 "punpckldq 12%1, %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm0, %%mm2 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "movq %%mm3, %%mm5 \n\t"
406 "psrlq $3, %%mm0 \n\t"
407 "psrlq $3, %%mm3 \n\t"
408 "pand %2, %%mm0 \n\t"
409 "pand %2, %%mm3 \n\t"
410 "psrlq $5, %%mm1 \n\t"
411 "psrlq $5, %%mm4 \n\t"
412 "pand %%mm6, %%mm1 \n\t"
413 "pand %%mm6, %%mm4 \n\t"
414 "psrlq $8, %%mm2 \n\t"
415 "psrlq $8, %%mm5 \n\t"
416 "pand %%mm7, %%mm2 \n\t"
417 "pand %%mm7, %%mm5 \n\t"
418 "por %%mm1, %%mm0 \n\t"
419 "por %%mm4, %%mm3 \n\t"
420 "por %%mm2, %%mm0 \n\t"
421 "por %%mm5, %%mm3 \n\t"
422 "psllq $16, %%mm3 \n\t"
423 "por %%mm3, %%mm0 \n\t"
424 MOVNTQ
" %%mm0, %0 \n\t"
425 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
430 __asm
__volatile(SFENCE:::"memory");
431 __asm
__volatile(EMMS:::"memory");
435 register int rgb
= *(uint32_t*)s
; s
+= 4;
436 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>8);
440 static inline void RENAME(rgb32tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
442 const uint8_t *s
= src
;
445 const uint8_t *mm_end
;
447 uint16_t *d
= (uint16_t *)dst
;
450 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
452 "movq %0, %%mm7 \n\t"
453 "movq %1, %%mm6 \n\t"
454 ::"m"(red_16mask
),"m"(green_16mask
));
460 "movd %1, %%mm0 \n\t"
461 "movd 4%1, %%mm3 \n\t"
462 "punpckldq 8%1, %%mm0 \n\t"
463 "punpckldq 12%1, %%mm3 \n\t"
464 "movq %%mm0, %%mm1 \n\t"
465 "movq %%mm0, %%mm2 \n\t"
466 "movq %%mm3, %%mm4 \n\t"
467 "movq %%mm3, %%mm5 \n\t"
468 "psllq $8, %%mm0 \n\t"
469 "psllq $8, %%mm3 \n\t"
470 "pand %%mm7, %%mm0 \n\t"
471 "pand %%mm7, %%mm3 \n\t"
472 "psrlq $5, %%mm1 \n\t"
473 "psrlq $5, %%mm4 \n\t"
474 "pand %%mm6, %%mm1 \n\t"
475 "pand %%mm6, %%mm4 \n\t"
476 "psrlq $19, %%mm2 \n\t"
477 "psrlq $19, %%mm5 \n\t"
478 "pand %2, %%mm2 \n\t"
479 "pand %2, %%mm5 \n\t"
480 "por %%mm1, %%mm0 \n\t"
481 "por %%mm4, %%mm3 \n\t"
482 "por %%mm2, %%mm0 \n\t"
483 "por %%mm5, %%mm3 \n\t"
484 "psllq $16, %%mm3 \n\t"
485 "por %%mm3, %%mm0 \n\t"
486 MOVNTQ
" %%mm0, %0 \n\t"
487 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
491 __asm
__volatile(SFENCE:::"memory");
492 __asm
__volatile(EMMS:::"memory");
496 register int rgb
= *(uint32_t*)s
; s
+= 4;
497 *d
++ = ((rgb
&0xF8)<<8) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>19);
501 static inline void RENAME(rgb32to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
503 const uint8_t *s
= src
;
506 const uint8_t *mm_end
;
508 uint16_t *d
= (uint16_t *)dst
;
512 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
514 "movq %3, %%mm5 \n\t"
515 "movq %4, %%mm6 \n\t"
516 "movq %5, %%mm7 \n\t"
520 PREFETCH
" 32(%1) \n\t"
521 "movd (%1), %%mm0 \n\t"
522 "movd 4(%1), %%mm3 \n\t"
523 "punpckldq 8(%1), %%mm0 \n\t"
524 "punpckldq 12(%1), %%mm3 \n\t"
525 "movq %%mm0, %%mm1 \n\t"
526 "movq %%mm3, %%mm4 \n\t"
527 "pand %%mm6, %%mm0 \n\t"
528 "pand %%mm6, %%mm3 \n\t"
529 "pmaddwd %%mm7, %%mm0 \n\t"
530 "pmaddwd %%mm7, %%mm3 \n\t"
531 "pand %%mm5, %%mm1 \n\t"
532 "pand %%mm5, %%mm4 \n\t"
533 "por %%mm1, %%mm0 \n\t"
534 "por %%mm4, %%mm3 \n\t"
535 "psrld $6, %%mm0 \n\t"
536 "pslld $10, %%mm3 \n\t"
537 "por %%mm3, %%mm0 \n\t"
538 MOVNTQ
" %%mm0, (%0) \n\t"
545 : "r" (mm_end
), "m" (mask3215g
), "m" (mask3216br
), "m" (mul3215
)
548 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
550 "movq %0, %%mm7 \n\t"
551 "movq %1, %%mm6 \n\t"
552 ::"m"(red_15mask
),"m"(green_15mask
));
557 "movd %1, %%mm0 \n\t"
558 "movd 4%1, %%mm3 \n\t"
559 "punpckldq 8%1, %%mm0 \n\t"
560 "punpckldq 12%1, %%mm3 \n\t"
561 "movq %%mm0, %%mm1 \n\t"
562 "movq %%mm0, %%mm2 \n\t"
563 "movq %%mm3, %%mm4 \n\t"
564 "movq %%mm3, %%mm5 \n\t"
565 "psrlq $3, %%mm0 \n\t"
566 "psrlq $3, %%mm3 \n\t"
567 "pand %2, %%mm0 \n\t"
568 "pand %2, %%mm3 \n\t"
569 "psrlq $6, %%mm1 \n\t"
570 "psrlq $6, %%mm4 \n\t"
571 "pand %%mm6, %%mm1 \n\t"
572 "pand %%mm6, %%mm4 \n\t"
573 "psrlq $9, %%mm2 \n\t"
574 "psrlq $9, %%mm5 \n\t"
575 "pand %%mm7, %%mm2 \n\t"
576 "pand %%mm7, %%mm5 \n\t"
577 "por %%mm1, %%mm0 \n\t"
578 "por %%mm4, %%mm3 \n\t"
579 "por %%mm2, %%mm0 \n\t"
580 "por %%mm5, %%mm3 \n\t"
581 "psllq $16, %%mm3 \n\t"
582 "por %%mm3, %%mm0 \n\t"
583 MOVNTQ
" %%mm0, %0 \n\t"
584 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
589 __asm
__volatile(SFENCE:::"memory");
590 __asm
__volatile(EMMS:::"memory");
594 register int rgb
= *(uint32_t*)s
; s
+= 4;
595 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>9);
599 static inline void RENAME(rgb32tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
601 const uint8_t *s
= src
;
604 const uint8_t *mm_end
;
606 uint16_t *d
= (uint16_t *)dst
;
609 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask
),"m"(green_15mask
));
619 "movd %1, %%mm0 \n\t"
620 "movd 4%1, %%mm3 \n\t"
621 "punpckldq 8%1, %%mm0 \n\t"
622 "punpckldq 12%1, %%mm3 \n\t"
623 "movq %%mm0, %%mm1 \n\t"
624 "movq %%mm0, %%mm2 \n\t"
625 "movq %%mm3, %%mm4 \n\t"
626 "movq %%mm3, %%mm5 \n\t"
627 "psllq $7, %%mm0 \n\t"
628 "psllq $7, %%mm3 \n\t"
629 "pand %%mm7, %%mm0 \n\t"
630 "pand %%mm7, %%mm3 \n\t"
631 "psrlq $6, %%mm1 \n\t"
632 "psrlq $6, %%mm4 \n\t"
633 "pand %%mm6, %%mm1 \n\t"
634 "pand %%mm6, %%mm4 \n\t"
635 "psrlq $19, %%mm2 \n\t"
636 "psrlq $19, %%mm5 \n\t"
637 "pand %2, %%mm2 \n\t"
638 "pand %2, %%mm5 \n\t"
639 "por %%mm1, %%mm0 \n\t"
640 "por %%mm4, %%mm3 \n\t"
641 "por %%mm2, %%mm0 \n\t"
642 "por %%mm5, %%mm3 \n\t"
643 "psllq $16, %%mm3 \n\t"
644 "por %%mm3, %%mm0 \n\t"
645 MOVNTQ
" %%mm0, %0 \n\t"
646 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
650 __asm
__volatile(SFENCE:::"memory");
651 __asm
__volatile(EMMS:::"memory");
655 register int rgb
= *(uint32_t*)s
; s
+= 4;
656 *d
++ = ((rgb
&0xF8)<<7) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>19);
660 static inline void RENAME(rgb24to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
662 const uint8_t *s
= src
;
665 const uint8_t *mm_end
;
667 uint16_t *d
= (uint16_t *)dst
;
670 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::"m"(red_16mask
),"m"(green_16mask
));
680 "movd %1, %%mm0 \n\t"
681 "movd 3%1, %%mm3 \n\t"
682 "punpckldq 6%1, %%mm0 \n\t"
683 "punpckldq 9%1, %%mm3 \n\t"
684 "movq %%mm0, %%mm1 \n\t"
685 "movq %%mm0, %%mm2 \n\t"
686 "movq %%mm3, %%mm4 \n\t"
687 "movq %%mm3, %%mm5 \n\t"
688 "psrlq $3, %%mm0 \n\t"
689 "psrlq $3, %%mm3 \n\t"
690 "pand %2, %%mm0 \n\t"
691 "pand %2, %%mm3 \n\t"
692 "psrlq $5, %%mm1 \n\t"
693 "psrlq $5, %%mm4 \n\t"
694 "pand %%mm6, %%mm1 \n\t"
695 "pand %%mm6, %%mm4 \n\t"
696 "psrlq $8, %%mm2 \n\t"
697 "psrlq $8, %%mm5 \n\t"
698 "pand %%mm7, %%mm2 \n\t"
699 "pand %%mm7, %%mm5 \n\t"
700 "por %%mm1, %%mm0 \n\t"
701 "por %%mm4, %%mm3 \n\t"
702 "por %%mm2, %%mm0 \n\t"
703 "por %%mm5, %%mm3 \n\t"
704 "psllq $16, %%mm3 \n\t"
705 "por %%mm3, %%mm0 \n\t"
706 MOVNTQ
" %%mm0, %0 \n\t"
707 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
711 __asm
__volatile(SFENCE:::"memory");
712 __asm
__volatile(EMMS:::"memory");
719 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
723 static inline void RENAME(rgb24tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
725 const uint8_t *s
= src
;
728 const uint8_t *mm_end
;
730 uint16_t *d
= (uint16_t *)dst
;
733 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
735 "movq %0, %%mm7 \n\t"
736 "movq %1, %%mm6 \n\t"
737 ::"m"(red_16mask
),"m"(green_16mask
));
743 "movd %1, %%mm0 \n\t"
744 "movd 3%1, %%mm3 \n\t"
745 "punpckldq 6%1, %%mm0 \n\t"
746 "punpckldq 9%1, %%mm3 \n\t"
747 "movq %%mm0, %%mm1 \n\t"
748 "movq %%mm0, %%mm2 \n\t"
749 "movq %%mm3, %%mm4 \n\t"
750 "movq %%mm3, %%mm5 \n\t"
751 "psllq $8, %%mm0 \n\t"
752 "psllq $8, %%mm3 \n\t"
753 "pand %%mm7, %%mm0 \n\t"
754 "pand %%mm7, %%mm3 \n\t"
755 "psrlq $5, %%mm1 \n\t"
756 "psrlq $5, %%mm4 \n\t"
757 "pand %%mm6, %%mm1 \n\t"
758 "pand %%mm6, %%mm4 \n\t"
759 "psrlq $19, %%mm2 \n\t"
760 "psrlq $19, %%mm5 \n\t"
761 "pand %2, %%mm2 \n\t"
762 "pand %2, %%mm5 \n\t"
763 "por %%mm1, %%mm0 \n\t"
764 "por %%mm4, %%mm3 \n\t"
765 "por %%mm2, %%mm0 \n\t"
766 "por %%mm5, %%mm3 \n\t"
767 "psllq $16, %%mm3 \n\t"
768 "por %%mm3, %%mm0 \n\t"
769 MOVNTQ
" %%mm0, %0 \n\t"
770 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
774 __asm
__volatile(SFENCE:::"memory");
775 __asm
__volatile(EMMS:::"memory");
782 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
786 static inline void RENAME(rgb24to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
788 const uint8_t *s
= src
;
791 const uint8_t *mm_end
;
793 uint16_t *d
= (uint16_t *)dst
;
796 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
798 "movq %0, %%mm7 \n\t"
799 "movq %1, %%mm6 \n\t"
800 ::"m"(red_15mask
),"m"(green_15mask
));
806 "movd %1, %%mm0 \n\t"
807 "movd 3%1, %%mm3 \n\t"
808 "punpckldq 6%1, %%mm0 \n\t"
809 "punpckldq 9%1, %%mm3 \n\t"
810 "movq %%mm0, %%mm1 \n\t"
811 "movq %%mm0, %%mm2 \n\t"
812 "movq %%mm3, %%mm4 \n\t"
813 "movq %%mm3, %%mm5 \n\t"
814 "psrlq $3, %%mm0 \n\t"
815 "psrlq $3, %%mm3 \n\t"
816 "pand %2, %%mm0 \n\t"
817 "pand %2, %%mm3 \n\t"
818 "psrlq $6, %%mm1 \n\t"
819 "psrlq $6, %%mm4 \n\t"
820 "pand %%mm6, %%mm1 \n\t"
821 "pand %%mm6, %%mm4 \n\t"
822 "psrlq $9, %%mm2 \n\t"
823 "psrlq $9, %%mm5 \n\t"
824 "pand %%mm7, %%mm2 \n\t"
825 "pand %%mm7, %%mm5 \n\t"
826 "por %%mm1, %%mm0 \n\t"
827 "por %%mm4, %%mm3 \n\t"
828 "por %%mm2, %%mm0 \n\t"
829 "por %%mm5, %%mm3 \n\t"
830 "psllq $16, %%mm3 \n\t"
831 "por %%mm3, %%mm0 \n\t"
832 MOVNTQ
" %%mm0, %0 \n\t"
833 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
837 __asm
__volatile(SFENCE:::"memory");
838 __asm
__volatile(EMMS:::"memory");
845 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
849 static inline void RENAME(rgb24tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
851 const uint8_t *s
= src
;
854 const uint8_t *mm_end
;
856 uint16_t *d
= (uint16_t *)dst
;
859 __asm
__volatile(PREFETCH
" %0"::"m"(*src
):"memory");
861 "movq %0, %%mm7 \n\t"
862 "movq %1, %%mm6 \n\t"
863 ::"m"(red_15mask
),"m"(green_15mask
));
869 "movd %1, %%mm0 \n\t"
870 "movd 3%1, %%mm3 \n\t"
871 "punpckldq 6%1, %%mm0 \n\t"
872 "punpckldq 9%1, %%mm3 \n\t"
873 "movq %%mm0, %%mm1 \n\t"
874 "movq %%mm0, %%mm2 \n\t"
875 "movq %%mm3, %%mm4 \n\t"
876 "movq %%mm3, %%mm5 \n\t"
877 "psllq $7, %%mm0 \n\t"
878 "psllq $7, %%mm3 \n\t"
879 "pand %%mm7, %%mm0 \n\t"
880 "pand %%mm7, %%mm3 \n\t"
881 "psrlq $6, %%mm1 \n\t"
882 "psrlq $6, %%mm4 \n\t"
883 "pand %%mm6, %%mm1 \n\t"
884 "pand %%mm6, %%mm4 \n\t"
885 "psrlq $19, %%mm2 \n\t"
886 "psrlq $19, %%mm5 \n\t"
887 "pand %2, %%mm2 \n\t"
888 "pand %2, %%mm5 \n\t"
889 "por %%mm1, %%mm0 \n\t"
890 "por %%mm4, %%mm3 \n\t"
891 "por %%mm2, %%mm0 \n\t"
892 "por %%mm5, %%mm3 \n\t"
893 "psllq $16, %%mm3 \n\t"
894 "por %%mm3, %%mm0 \n\t"
895 MOVNTQ
" %%mm0, %0 \n\t"
896 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
900 __asm
__volatile(SFENCE:::"memory");
901 __asm
__volatile(EMMS:::"memory");
908 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
913 I use less accurate approximation here by simply left-shifting the input
914 value and filling the low order bits with zeroes. This method improves PNG
915 compression but this scheme cannot reproduce white exactly, since it does
916 not generate an all-ones maximum value; the net effect is to darken the
919 The better method should be "left bit replication":
929 | Leftmost Bits Repeated to Fill Open Bits
933 static inline void RENAME(rgb15to24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
937 const uint16_t *mm_end
;
939 uint8_t *d
= (uint8_t *)dst
;
940 const uint16_t *s
= (uint16_t *)src
;
941 end
= s
+ src_size
/2;
943 __asm
__volatile(PREFETCH
" %0"::"m"(*s
):"memory");
949 "movq %1, %%mm0 \n\t"
950 "movq %1, %%mm1 \n\t"
951 "movq %1, %%mm2 \n\t"
952 "pand %2, %%mm0 \n\t"
953 "pand %3, %%mm1 \n\t"
954 "pand %4, %%mm2 \n\t"
955 "psllq $3, %%mm0 \n\t"
956 "psrlq $2, %%mm1 \n\t"
957 "psrlq $7, %%mm2 \n\t"
958 "movq %%mm0, %%mm3 \n\t"
959 "movq %%mm1, %%mm4 \n\t"
960 "movq %%mm2, %%mm5 \n\t"
961 "punpcklwd %5, %%mm0 \n\t"
962 "punpcklwd %5, %%mm1 \n\t"
963 "punpcklwd %5, %%mm2 \n\t"
964 "punpckhwd %5, %%mm3 \n\t"
965 "punpckhwd %5, %%mm4 \n\t"
966 "punpckhwd %5, %%mm5 \n\t"
967 "psllq $8, %%mm1 \n\t"
968 "psllq $16, %%mm2 \n\t"
969 "por %%mm1, %%mm0 \n\t"
970 "por %%mm2, %%mm0 \n\t"
971 "psllq $8, %%mm4 \n\t"
972 "psllq $16, %%mm5 \n\t"
973 "por %%mm4, %%mm3 \n\t"
974 "por %%mm5, %%mm3 \n\t"
976 "movq %%mm0, %%mm6 \n\t"
977 "movq %%mm3, %%mm7 \n\t"
979 "movq 8%1, %%mm0 \n\t"
980 "movq 8%1, %%mm1 \n\t"
981 "movq 8%1, %%mm2 \n\t"
982 "pand %2, %%mm0 \n\t"
983 "pand %3, %%mm1 \n\t"
984 "pand %4, %%mm2 \n\t"
985 "psllq $3, %%mm0 \n\t"
986 "psrlq $2, %%mm1 \n\t"
987 "psrlq $7, %%mm2 \n\t"
988 "movq %%mm0, %%mm3 \n\t"
989 "movq %%mm1, %%mm4 \n\t"
990 "movq %%mm2, %%mm5 \n\t"
991 "punpcklwd %5, %%mm0 \n\t"
992 "punpcklwd %5, %%mm1 \n\t"
993 "punpcklwd %5, %%mm2 \n\t"
994 "punpckhwd %5, %%mm3 \n\t"
995 "punpckhwd %5, %%mm4 \n\t"
996 "punpckhwd %5, %%mm5 \n\t"
997 "psllq $8, %%mm1 \n\t"
998 "psllq $16, %%mm2 \n\t"
999 "por %%mm1, %%mm0 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psllq $8, %%mm4 \n\t"
1002 "psllq $16, %%mm5 \n\t"
1003 "por %%mm4, %%mm3 \n\t"
1004 "por %%mm5, %%mm3 \n\t"
1007 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
), "m"(mmx_null
)
1009 /* Borrowed 32 to 24 */
1011 "movq %%mm0, %%mm4 \n\t"
1012 "movq %%mm3, %%mm5 \n\t"
1013 "movq %%mm6, %%mm0 \n\t"
1014 "movq %%mm7, %%mm1 \n\t"
1016 "movq %%mm4, %%mm6 \n\t"
1017 "movq %%mm5, %%mm7 \n\t"
1018 "movq %%mm0, %%mm2 \n\t"
1019 "movq %%mm1, %%mm3 \n\t"
1021 "psrlq $8, %%mm2 \n\t"
1022 "psrlq $8, %%mm3 \n\t"
1023 "psrlq $8, %%mm6 \n\t"
1024 "psrlq $8, %%mm7 \n\t"
1025 "pand %2, %%mm0 \n\t"
1026 "pand %2, %%mm1 \n\t"
1027 "pand %2, %%mm4 \n\t"
1028 "pand %2, %%mm5 \n\t"
1029 "pand %3, %%mm2 \n\t"
1030 "pand %3, %%mm3 \n\t"
1031 "pand %3, %%mm6 \n\t"
1032 "pand %3, %%mm7 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "por %%mm3, %%mm1 \n\t"
1035 "por %%mm6, %%mm4 \n\t"
1036 "por %%mm7, %%mm5 \n\t"
1038 "movq %%mm1, %%mm2 \n\t"
1039 "movq %%mm4, %%mm3 \n\t"
1040 "psllq $48, %%mm2 \n\t"
1041 "psllq $32, %%mm3 \n\t"
1042 "pand %4, %%mm2 \n\t"
1043 "pand %5, %%mm3 \n\t"
1044 "por %%mm2, %%mm0 \n\t"
1045 "psrlq $16, %%mm1 \n\t"
1046 "psrlq $32, %%mm4 \n\t"
1047 "psllq $16, %%mm5 \n\t"
1048 "por %%mm3, %%mm1 \n\t"
1049 "pand %6, %%mm5 \n\t"
1050 "por %%mm5, %%mm4 \n\t"
1052 MOVNTQ
" %%mm0, %0 \n\t"
1053 MOVNTQ
" %%mm1, 8%0 \n\t"
1054 MOVNTQ
" %%mm4, 16%0"
1057 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1062 __asm
__volatile(SFENCE:::"memory");
1063 __asm
__volatile(EMMS:::"memory");
1067 register uint16_t bgr
;
1069 *d
++ = (bgr
&0x1F)<<3;
1070 *d
++ = (bgr
&0x3E0)>>2;
1071 *d
++ = (bgr
&0x7C00)>>7;
1075 static inline void RENAME(rgb16to24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1077 const uint16_t *end
;
1079 const uint16_t *mm_end
;
1081 uint8_t *d
= (uint8_t *)dst
;
1082 const uint16_t *s
= (const uint16_t *)src
;
1083 end
= s
+ src_size
/2;
1085 __asm
__volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1090 PREFETCH
" 32%1 \n\t"
1091 "movq %1, %%mm0 \n\t"
1092 "movq %1, %%mm1 \n\t"
1093 "movq %1, %%mm2 \n\t"
1094 "pand %2, %%mm0 \n\t"
1095 "pand %3, %%mm1 \n\t"
1096 "pand %4, %%mm2 \n\t"
1097 "psllq $3, %%mm0 \n\t"
1098 "psrlq $3, %%mm1 \n\t"
1099 "psrlq $8, %%mm2 \n\t"
1100 "movq %%mm0, %%mm3 \n\t"
1101 "movq %%mm1, %%mm4 \n\t"
1102 "movq %%mm2, %%mm5 \n\t"
1103 "punpcklwd %5, %%mm0 \n\t"
1104 "punpcklwd %5, %%mm1 \n\t"
1105 "punpcklwd %5, %%mm2 \n\t"
1106 "punpckhwd %5, %%mm3 \n\t"
1107 "punpckhwd %5, %%mm4 \n\t"
1108 "punpckhwd %5, %%mm5 \n\t"
1109 "psllq $8, %%mm1 \n\t"
1110 "psllq $16, %%mm2 \n\t"
1111 "por %%mm1, %%mm0 \n\t"
1112 "por %%mm2, %%mm0 \n\t"
1113 "psllq $8, %%mm4 \n\t"
1114 "psllq $16, %%mm5 \n\t"
1115 "por %%mm4, %%mm3 \n\t"
1116 "por %%mm5, %%mm3 \n\t"
1118 "movq %%mm0, %%mm6 \n\t"
1119 "movq %%mm3, %%mm7 \n\t"
1121 "movq 8%1, %%mm0 \n\t"
1122 "movq 8%1, %%mm1 \n\t"
1123 "movq 8%1, %%mm2 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %3, %%mm1 \n\t"
1126 "pand %4, %%mm2 \n\t"
1127 "psllq $3, %%mm0 \n\t"
1128 "psrlq $3, %%mm1 \n\t"
1129 "psrlq $8, %%mm2 \n\t"
1130 "movq %%mm0, %%mm3 \n\t"
1131 "movq %%mm1, %%mm4 \n\t"
1132 "movq %%mm2, %%mm5 \n\t"
1133 "punpcklwd %5, %%mm0 \n\t"
1134 "punpcklwd %5, %%mm1 \n\t"
1135 "punpcklwd %5, %%mm2 \n\t"
1136 "punpckhwd %5, %%mm3 \n\t"
1137 "punpckhwd %5, %%mm4 \n\t"
1138 "punpckhwd %5, %%mm5 \n\t"
1139 "psllq $8, %%mm1 \n\t"
1140 "psllq $16, %%mm2 \n\t"
1141 "por %%mm1, %%mm0 \n\t"
1142 "por %%mm2, %%mm0 \n\t"
1143 "psllq $8, %%mm4 \n\t"
1144 "psllq $16, %%mm5 \n\t"
1145 "por %%mm4, %%mm3 \n\t"
1146 "por %%mm5, %%mm3 \n\t"
1148 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
),"m"(mmx_null
)
1150 /* Borrowed 32 to 24 */
1152 "movq %%mm0, %%mm4 \n\t"
1153 "movq %%mm3, %%mm5 \n\t"
1154 "movq %%mm6, %%mm0 \n\t"
1155 "movq %%mm7, %%mm1 \n\t"
1157 "movq %%mm4, %%mm6 \n\t"
1158 "movq %%mm5, %%mm7 \n\t"
1159 "movq %%mm0, %%mm2 \n\t"
1160 "movq %%mm1, %%mm3 \n\t"
1162 "psrlq $8, %%mm2 \n\t"
1163 "psrlq $8, %%mm3 \n\t"
1164 "psrlq $8, %%mm6 \n\t"
1165 "psrlq $8, %%mm7 \n\t"
1166 "pand %2, %%mm0 \n\t"
1167 "pand %2, %%mm1 \n\t"
1168 "pand %2, %%mm4 \n\t"
1169 "pand %2, %%mm5 \n\t"
1170 "pand %3, %%mm2 \n\t"
1171 "pand %3, %%mm3 \n\t"
1172 "pand %3, %%mm6 \n\t"
1173 "pand %3, %%mm7 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "por %%mm3, %%mm1 \n\t"
1176 "por %%mm6, %%mm4 \n\t"
1177 "por %%mm7, %%mm5 \n\t"
1179 "movq %%mm1, %%mm2 \n\t"
1180 "movq %%mm4, %%mm3 \n\t"
1181 "psllq $48, %%mm2 \n\t"
1182 "psllq $32, %%mm3 \n\t"
1183 "pand %4, %%mm2 \n\t"
1184 "pand %5, %%mm3 \n\t"
1185 "por %%mm2, %%mm0 \n\t"
1186 "psrlq $16, %%mm1 \n\t"
1187 "psrlq $32, %%mm4 \n\t"
1188 "psllq $16, %%mm5 \n\t"
1189 "por %%mm3, %%mm1 \n\t"
1190 "pand %6, %%mm5 \n\t"
1191 "por %%mm5, %%mm4 \n\t"
1193 MOVNTQ
" %%mm0, %0 \n\t"
1194 MOVNTQ
" %%mm1, 8%0 \n\t"
1195 MOVNTQ
" %%mm4, 16%0"
1198 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1203 __asm
__volatile(SFENCE:::"memory");
1204 __asm
__volatile(EMMS:::"memory");
1208 register uint16_t bgr
;
1210 *d
++ = (bgr
&0x1F)<<3;
1211 *d
++ = (bgr
&0x7E0)>>3;
1212 *d
++ = (bgr
&0xF800)>>8;
1216 static inline void RENAME(rgb15to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1218 const uint16_t *end
;
1220 const uint16_t *mm_end
;
1222 uint8_t *d
= (uint8_t *)dst
;
1223 const uint16_t *s
= (const uint16_t *)src
;
1224 end
= s
+ src_size
/2;
1226 __asm
__volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1227 __asm
__volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1232 PREFETCH
" 32%1 \n\t"
1233 "movq %1, %%mm0 \n\t"
1234 "movq %1, %%mm1 \n\t"
1235 "movq %1, %%mm2 \n\t"
1236 "pand %2, %%mm0 \n\t"
1237 "pand %3, %%mm1 \n\t"
1238 "pand %4, %%mm2 \n\t"
1239 "psllq $3, %%mm0 \n\t"
1240 "psrlq $2, %%mm1 \n\t"
1241 "psrlq $7, %%mm2 \n\t"
1242 "movq %%mm0, %%mm3 \n\t"
1243 "movq %%mm1, %%mm4 \n\t"
1244 "movq %%mm2, %%mm5 \n\t"
1245 "punpcklwd %%mm7, %%mm0 \n\t"
1246 "punpcklwd %%mm7, %%mm1 \n\t"
1247 "punpcklwd %%mm7, %%mm2 \n\t"
1248 "punpckhwd %%mm7, %%mm3 \n\t"
1249 "punpckhwd %%mm7, %%mm4 \n\t"
1250 "punpckhwd %%mm7, %%mm5 \n\t"
1251 "psllq $8, %%mm1 \n\t"
1252 "psllq $16, %%mm2 \n\t"
1253 "por %%mm1, %%mm0 \n\t"
1254 "por %%mm2, %%mm0 \n\t"
1255 "psllq $8, %%mm4 \n\t"
1256 "psllq $16, %%mm5 \n\t"
1257 "por %%mm4, %%mm3 \n\t"
1258 "por %%mm5, %%mm3 \n\t"
1259 MOVNTQ
" %%mm0, %0 \n\t"
1260 MOVNTQ
" %%mm3, 8%0 \n\t"
1262 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
)
1267 __asm
__volatile(SFENCE:::"memory");
1268 __asm
__volatile(EMMS:::"memory");
1272 #if 0 //slightly slower on Athlon
1274 *((uint32_t*)d
)++ = ((bgr
&0x1F)<<3) + ((bgr
&0x3E0)<<6) + ((bgr
&0x7C00)<<9);
1276 register uint16_t bgr
;
1278 #ifdef WORDS_BIGENDIAN
1280 *d
++ = (bgr
&0x7C00)>>7;
1281 *d
++ = (bgr
&0x3E0)>>2;
1282 *d
++ = (bgr
&0x1F)<<3;
1284 *d
++ = (bgr
&0x1F)<<3;
1285 *d
++ = (bgr
&0x3E0)>>2;
1286 *d
++ = (bgr
&0x7C00)>>7;
1294 static inline void RENAME(rgb16to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1296 const uint16_t *end
;
1298 const uint16_t *mm_end
;
1300 uint8_t *d
= (uint8_t *)dst
;
1301 const uint16_t *s
= (uint16_t *)src
;
1302 end
= s
+ src_size
/2;
1304 __asm
__volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1305 __asm
__volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1310 PREFETCH
" 32%1 \n\t"
1311 "movq %1, %%mm0 \n\t"
1312 "movq %1, %%mm1 \n\t"
1313 "movq %1, %%mm2 \n\t"
1314 "pand %2, %%mm0 \n\t"
1315 "pand %3, %%mm1 \n\t"
1316 "pand %4, %%mm2 \n\t"
1317 "psllq $3, %%mm0 \n\t"
1318 "psrlq $3, %%mm1 \n\t"
1319 "psrlq $8, %%mm2 \n\t"
1320 "movq %%mm0, %%mm3 \n\t"
1321 "movq %%mm1, %%mm4 \n\t"
1322 "movq %%mm2, %%mm5 \n\t"
1323 "punpcklwd %%mm7, %%mm0 \n\t"
1324 "punpcklwd %%mm7, %%mm1 \n\t"
1325 "punpcklwd %%mm7, %%mm2 \n\t"
1326 "punpckhwd %%mm7, %%mm3 \n\t"
1327 "punpckhwd %%mm7, %%mm4 \n\t"
1328 "punpckhwd %%mm7, %%mm5 \n\t"
1329 "psllq $8, %%mm1 \n\t"
1330 "psllq $16, %%mm2 \n\t"
1331 "por %%mm1, %%mm0 \n\t"
1332 "por %%mm2, %%mm0 \n\t"
1333 "psllq $8, %%mm4 \n\t"
1334 "psllq $16, %%mm5 \n\t"
1335 "por %%mm4, %%mm3 \n\t"
1336 "por %%mm5, %%mm3 \n\t"
1337 MOVNTQ
" %%mm0, %0 \n\t"
1338 MOVNTQ
" %%mm3, 8%0 \n\t"
1340 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
)
1345 __asm
__volatile(SFENCE:::"memory");
1346 __asm
__volatile(EMMS:::"memory");
1350 register uint16_t bgr
;
1352 #ifdef WORDS_BIGENDIAN
1354 *d
++ = (bgr
&0xF800)>>8;
1355 *d
++ = (bgr
&0x7E0)>>3;
1356 *d
++ = (bgr
&0x1F)<<3;
1358 *d
++ = (bgr
&0x1F)<<3;
1359 *d
++ = (bgr
&0x7E0)>>3;
1360 *d
++ = (bgr
&0xF800)>>8;
1366 static inline void RENAME(rgb32tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1368 long idx
= 15 - src_size
;
1369 uint8_t *s
= (uint8_t *) src
-idx
, *d
= dst
-idx
;
1374 PREFETCH
" (%1, %0) \n\t"
1375 "movq %3, %%mm7 \n\t"
1376 "pxor %4, %%mm7 \n\t"
1377 "movq %%mm7, %%mm6 \n\t"
1378 "pxor %5, %%mm7 \n\t"
1381 PREFETCH
" 32(%1, %0) \n\t"
1382 "movq (%1, %0), %%mm0 \n\t"
1383 "movq 8(%1, %0), %%mm1 \n\t"
1385 "pshufw $177, %%mm0, %%mm3 \n\t"
1386 "pshufw $177, %%mm1, %%mm5 \n\t"
1387 "pand %%mm7, %%mm0 \n\t"
1388 "pand %%mm6, %%mm3 \n\t"
1389 "pand %%mm7, %%mm1 \n\t"
1390 "pand %%mm6, %%mm5 \n\t"
1391 "por %%mm3, %%mm0 \n\t"
1392 "por %%mm5, %%mm1 \n\t"
1394 "movq %%mm0, %%mm2 \n\t"
1395 "movq %%mm1, %%mm4 \n\t"
1396 "pand %%mm7, %%mm0 \n\t"
1397 "pand %%mm6, %%mm2 \n\t"
1398 "pand %%mm7, %%mm1 \n\t"
1399 "pand %%mm6, %%mm4 \n\t"
1400 "movq %%mm2, %%mm3 \n\t"
1401 "movq %%mm4, %%mm5 \n\t"
1402 "pslld $16, %%mm2 \n\t"
1403 "psrld $16, %%mm3 \n\t"
1404 "pslld $16, %%mm4 \n\t"
1405 "psrld $16, %%mm5 \n\t"
1406 "por %%mm2, %%mm0 \n\t"
1407 "por %%mm4, %%mm1 \n\t"
1408 "por %%mm3, %%mm0 \n\t"
1409 "por %%mm5, %%mm1 \n\t"
1411 MOVNTQ
" %%mm0, (%2, %0) \n\t"
1412 MOVNTQ
" %%mm1, 8(%2, %0) \n\t"
1419 : "r" (s
), "r" (d
), "m" (mask32b
), "m" (mask32r
), "m" (mmx_one
)
1422 for (; idx
<15; idx
+=4) {
1423 register int v
= *(uint32_t *)&s
[idx
], g
= v
& 0xff00ff00;
1425 *(uint32_t *)&d
[idx
] = (v
>>16) + g
+ (v
<<16);
1429 static inline void RENAME(rgb24tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1433 long mmx_size
= 23 - src_size
;
1435 "test %%"REG_a
", %%"REG_a
" \n\t"
1437 "movq "MANGLE(mask24r
)", %%mm5 \n\t"
1438 "movq "MANGLE(mask24g
)", %%mm6 \n\t"
1439 "movq "MANGLE(mask24b
)", %%mm7 \n\t"
1442 PREFETCH
" 32(%1, %%"REG_a
") \n\t"
1443 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1444 "movq (%1, %%"REG_a
"), %%mm1 \n\t" // BGR BGR BG
1445 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t" // R BGR BGR B
1446 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1447 "pand %%mm5, %%mm0 \n\t"
1448 "pand %%mm6, %%mm1 \n\t"
1449 "pand %%mm7, %%mm2 \n\t"
1450 "por %%mm0, %%mm1 \n\t"
1451 "por %%mm2, %%mm1 \n\t"
1452 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1453 MOVNTQ
" %%mm1, (%2, %%"REG_a
") \n\t" // RGB RGB RG
1454 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t" // R BGR BGR B
1455 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t" // GR BGR BGR
1456 "pand %%mm7, %%mm0 \n\t"
1457 "pand %%mm5, %%mm1 \n\t"
1458 "pand %%mm6, %%mm2 \n\t"
1459 "por %%mm0, %%mm1 \n\t"
1460 "por %%mm2, %%mm1 \n\t"
1461 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t" // R BGR BGR B
1462 MOVNTQ
" %%mm1, 8(%2, %%"REG_a
") \n\t" // B RGB RGB R
1463 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t" // GR BGR BGR
1464 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t" // BGR BGR BG
1465 "pand %%mm6, %%mm0 \n\t"
1466 "pand %%mm7, %%mm1 \n\t"
1467 "pand %%mm5, %%mm2 \n\t"
1468 "por %%mm0, %%mm1 \n\t"
1469 "por %%mm2, %%mm1 \n\t"
1470 MOVNTQ
" %%mm1, 16(%2, %%"REG_a
") \n\t"
1471 "add $24, %%"REG_a
" \n\t"
1475 : "r" (src
-mmx_size
), "r"(dst
-mmx_size
)
1478 __asm
__volatile(SFENCE:::"memory");
1479 __asm
__volatile(EMMS:::"memory");
1481 if (mmx_size
==23) return; //finihsed, was multiple of 8
1485 src_size
= 23-mmx_size
;
1489 for (i
=0; i
<src_size
; i
+=3)
1493 dst
[i
+ 1] = src
[i
+ 1];
1494 dst
[i
+ 2] = src
[i
+ 0];
1499 static inline void RENAME(yuvPlanartoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1500 long width
, long height
,
1501 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1504 const long chromWidth
= width
>>1;
1505 for (y
=0; y
<height
; y
++)
1508 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1510 "xor %%"REG_a
", %%"REG_a
" \n\t"
1513 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1514 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1515 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1516 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1517 "movq %%mm0, %%mm2 \n\t" // U(0)
1518 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1519 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1520 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1522 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1523 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1524 "movq %%mm3, %%mm4 \n\t" // Y(0)
1525 "movq %%mm5, %%mm6 \n\t" // Y(8)
1526 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1527 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1528 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1529 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1531 MOVNTQ
" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1532 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1533 MOVNTQ
" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1534 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1536 "add $8, %%"REG_a
" \n\t"
1537 "cmp %4, %%"REG_a
" \n\t"
1539 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1544 #if defined ARCH_ALPHA && defined HAVE_MVI
1545 #define pl2yuy2(n) \
1550 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1551 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1552 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1553 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1554 yuv1 = (u << 8) + (v << 24); \
1561 uint64_t *qdst
= (uint64_t *) dst
;
1562 uint64_t *qdst2
= (uint64_t *) (dst
+ dstStride
);
1563 const uint32_t *yc
= (uint32_t *) ysrc
;
1564 const uint32_t *yc2
= (uint32_t *) (ysrc
+ lumStride
);
1565 const uint16_t *uc
= (uint16_t*) usrc
, *vc
= (uint16_t*) vsrc
;
1566 for (i
= 0; i
< chromWidth
; i
+= 8){
1567 uint64_t y1
, y2
, yuv1
, yuv2
;
1570 asm("ldq $31,64(%0)" :: "r"(yc
));
1571 asm("ldq $31,64(%0)" :: "r"(yc2
));
1572 asm("ldq $31,64(%0)" :: "r"(uc
));
1573 asm("ldq $31,64(%0)" :: "r"(vc
));
1591 #elif __WORDSIZE >= 64
1593 uint64_t *ldst
= (uint64_t *) dst
;
1594 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1595 for (i
= 0; i
< chromWidth
; i
+= 2){
1597 k
= yc
[0] + (uc
[0] << 8) +
1598 (yc
[1] << 16) + (vc
[0] << 24);
1599 l
= yc
[2] + (uc
[1] << 8) +
1600 (yc
[3] << 16) + (vc
[1] << 24);
1601 *ldst
++ = k
+ (l
<< 32);
1608 int i
, *idst
= (int32_t *) dst
;
1609 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1610 for (i
= 0; i
< chromWidth
; i
++){
1611 #ifdef WORDS_BIGENDIAN
1612 *idst
++ = (yc
[0] << 24)+ (uc
[0] << 16) +
1613 (yc
[1] << 8) + (vc
[0] << 0);
1615 *idst
++ = yc
[0] + (uc
[0] << 8) +
1616 (yc
[1] << 16) + (vc
[0] << 24);
1624 if ((y
&(vertLumPerChroma
-1))==(vertLumPerChroma
-1) )
1626 usrc
+= chromStride
;
1627 vsrc
+= chromStride
;
1640 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1641 * this is a problem for anyone then tell me, and I will fix it).
1643 static inline void RENAME(yv12toyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1644 long width
, long height
,
1645 long lumStride
, long chromStride
, long dstStride
)
1647 //FIXME interpolate chroma
1648 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1651 static inline void RENAME(yuvPlanartouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1652 long width
, long height
,
1653 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1656 const long chromWidth
= width
>>1;
1657 for (y
=0; y
<height
; y
++)
1660 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1662 "xor %%"REG_a
", %%"REG_a
" \n\t"
1665 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1666 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1667 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1668 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1669 "movq %%mm0, %%mm2 \n\t" // U(0)
1670 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1671 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1672 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1674 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1675 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1676 "movq %%mm0, %%mm4 \n\t" // Y(0)
1677 "movq %%mm2, %%mm6 \n\t" // Y(8)
1678 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1679 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1680 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1681 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1683 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1684 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1685 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1686 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1688 "add $8, %%"REG_a
" \n\t"
1689 "cmp %4, %%"REG_a
" \n\t"
1691 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1695 //FIXME adapt the Alpha ASM code from yv12->yuy2
1697 #if __WORDSIZE >= 64
1699 uint64_t *ldst
= (uint64_t *) dst
;
1700 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1701 for (i
= 0; i
< chromWidth
; i
+= 2){
1703 k
= uc
[0] + (yc
[0] << 8) +
1704 (vc
[0] << 16) + (yc
[1] << 24);
1705 l
= uc
[1] + (yc
[2] << 8) +
1706 (vc
[1] << 16) + (yc
[3] << 24);
1707 *ldst
++ = k
+ (l
<< 32);
1714 int i
, *idst
= (int32_t *) dst
;
1715 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1716 for (i
= 0; i
< chromWidth
; i
++){
1717 #ifdef WORDS_BIGENDIAN
1718 *idst
++ = (uc
[0] << 24)+ (yc
[0] << 16) +
1719 (vc
[0] << 8) + (yc
[1] << 0);
1721 *idst
++ = uc
[0] + (yc
[0] << 8) +
1722 (vc
[0] << 16) + (yc
[1] << 24);
1730 if ((y
&(vertLumPerChroma
-1))==(vertLumPerChroma
-1) )
1732 usrc
+= chromStride
;
1733 vsrc
+= chromStride
;
1746 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1747 * this is a problem for anyone then tell me, and I will fix it).
1749 static inline void RENAME(yv12touyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1750 long width
, long height
,
1751 long lumStride
, long chromStride
, long dstStride
)
1753 //FIXME interpolate chroma
1754 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1758 * Width should be a multiple of 16.
1760 static inline void RENAME(yuv422ptoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1761 long width
, long height
,
1762 long lumStride
, long chromStride
, long dstStride
)
1764 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1768 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1769 * this is a problem for anyone then tell me, and I will fix it).
1771 static inline void RENAME(yuy2toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1772 long width
, long height
,
1773 long lumStride
, long chromStride
, long srcStride
)
1776 const long chromWidth
= width
>>1;
1777 for (y
=0; y
<height
; y
+=2)
1781 "xor %%"REG_a
", %%"REG_a
" \n\t"
1782 "pcmpeqw %%mm7, %%mm7 \n\t"
1783 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1786 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1787 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1788 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1789 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1790 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1791 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1792 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1793 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1794 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1795 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1796 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1798 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1800 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(8)
1801 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(12)
1802 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1803 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1804 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1805 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1806 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1807 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1808 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1809 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1811 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1813 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1814 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1815 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1816 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1817 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1818 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1819 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1822 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
1823 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
1825 "add $8, %%"REG_a
" \n\t"
1826 "cmp %4, %%"REG_a
" \n\t"
1828 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1829 : "memory", "%"REG_a
1836 "xor %%"REG_a
", %%"REG_a
" \n\t"
1839 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1840 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1841 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1842 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
1843 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
1844 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1845 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1846 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1847 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1848 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1849 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1851 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1852 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1854 "add $8, %%"REG_a
" \n\t"
1855 "cmp %4, %%"REG_a
" \n\t"
1858 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1859 : "memory", "%"REG_a
1863 for (i
=0; i
<chromWidth
; i
++)
1865 ydst
[2*i
+0] = src
[4*i
+0];
1866 udst
[i
] = src
[4*i
+1];
1867 ydst
[2*i
+1] = src
[4*i
+2];
1868 vdst
[i
] = src
[4*i
+3];
1873 for (i
=0; i
<chromWidth
; i
++)
1875 ydst
[2*i
+0] = src
[4*i
+0];
1876 ydst
[2*i
+1] = src
[4*i
+2];
1879 udst
+= chromStride
;
1880 vdst
+= chromStride
;
1885 asm volatile( EMMS
" \n\t"
1891 static inline void RENAME(yvu9toyv12
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
,
1892 uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1893 long width
, long height
, long lumStride
, long chromStride
)
1896 memcpy(ydst
, ysrc
, width
*height
);
1898 /* XXX: implement upscaling for U,V */
1901 static inline void RENAME(planar2x
)(const uint8_t *src
, uint8_t *dst
, long srcWidth
, long srcHeight
, long srcStride
, long dstStride
)
1908 for (x
=0; x
<srcWidth
-1; x
++){
1909 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1910 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1912 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1916 for (y
=1; y
<srcHeight
; y
++){
1917 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1918 const long mmxSize
= srcWidth
&~15;
1920 "mov %4, %%"REG_a
" \n\t"
1922 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1923 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1924 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1925 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1926 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1927 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1928 PAVGB
" %%mm0, %%mm5 \n\t"
1929 PAVGB
" %%mm0, %%mm3 \n\t"
1930 PAVGB
" %%mm0, %%mm5 \n\t"
1931 PAVGB
" %%mm0, %%mm3 \n\t"
1932 PAVGB
" %%mm1, %%mm4 \n\t"
1933 PAVGB
" %%mm1, %%mm2 \n\t"
1934 PAVGB
" %%mm1, %%mm4 \n\t"
1935 PAVGB
" %%mm1, %%mm2 \n\t"
1936 "movq %%mm5, %%mm7 \n\t"
1937 "movq %%mm4, %%mm6 \n\t"
1938 "punpcklbw %%mm3, %%mm5 \n\t"
1939 "punpckhbw %%mm3, %%mm7 \n\t"
1940 "punpcklbw %%mm2, %%mm4 \n\t"
1941 "punpckhbw %%mm2, %%mm6 \n\t"
1943 MOVNTQ
" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1944 MOVNTQ
" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1945 MOVNTQ
" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1946 MOVNTQ
" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1948 "movq %%mm5, (%2, %%"REG_a
", 2) \n\t"
1949 "movq %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1950 "movq %%mm4, (%3, %%"REG_a
", 2) \n\t"
1951 "movq %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1953 "add $8, %%"REG_a
" \n\t"
1955 :: "r" (src
+ mmxSize
), "r" (src
+ srcStride
+ mmxSize
),
1956 "r" (dst
+ mmxSize
*2), "r" (dst
+ dstStride
+ mmxSize
*2),
1962 const long mmxSize
=1;
1964 dst
[0 ]= (3*src
[0] + src
[srcStride
])>>2;
1965 dst
[dstStride
]= ( src
[0] + 3*src
[srcStride
])>>2;
1967 for (x
=mmxSize
-1; x
<srcWidth
-1; x
++){
1968 dst
[2*x
+1]= (3*src
[x
+0] + src
[x
+srcStride
+1])>>2;
1969 dst
[2*x
+dstStride
+2]= ( src
[x
+0] + 3*src
[x
+srcStride
+1])>>2;
1970 dst
[2*x
+dstStride
+1]= ( src
[x
+1] + 3*src
[x
+srcStride
])>>2;
1971 dst
[2*x
+2]= (3*src
[x
+1] + src
[x
+srcStride
])>>2;
1973 dst
[srcWidth
*2 -1 ]= (3*src
[srcWidth
-1] + src
[srcWidth
-1 + srcStride
])>>2;
1974 dst
[srcWidth
*2 -1 + dstStride
]= ( src
[srcWidth
-1] + 3*src
[srcWidth
-1 + srcStride
])>>2;
1984 for (x
=0; x
<srcWidth
-1; x
++){
1985 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1986 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1988 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1990 for (x
=0; x
<srcWidth
; x
++){
1997 asm volatile( EMMS
" \n\t"
2004 * Height should be a multiple of 2 and width should be a multiple of 16 (if
2005 * this is a problem for anyone then tell me, and I will fix it).
2006 * Chrominance data is only taken from every secound line, others are ignored.
2007 * FIXME: Write HQ version.
2009 static inline void RENAME(uyvytoyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2010 long width
, long height
,
2011 long lumStride
, long chromStride
, long srcStride
)
2014 const long chromWidth
= width
>>1;
2015 for (y
=0; y
<height
; y
+=2)
2019 "xorl %%eax, %%eax \n\t"
2020 "pcmpeqw %%mm7, %%mm7 \n\t"
2021 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2024 PREFETCH
" 64(%0, %%eax, 4) \n\t"
2025 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2026 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2027 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2028 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2029 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2030 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2031 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2032 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2033 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2034 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2036 MOVNTQ
" %%mm2, (%1, %%eax, 2) \n\t"
2038 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2039 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2040 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2041 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2042 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2043 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2044 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2045 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2046 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2047 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2049 MOVNTQ
" %%mm3, 8(%1, %%eax, 2) \n\t"
2051 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2052 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2053 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2054 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2055 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2056 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2057 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2060 MOVNTQ
" %%mm0, (%3, %%eax) \n\t"
2061 MOVNTQ
" %%mm2, (%2, %%eax) \n\t"
2063 "addl $8, %%eax \n\t"
2064 "cmpl %4, %%eax \n\t"
2066 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2074 "xorl %%eax, %%eax \n\t"
2077 PREFETCH
" 64(%0, %%eax, 4) \n\t"
2078 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2079 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2080 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2081 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2082 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2083 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2084 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2085 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2086 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2087 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2089 MOVNTQ
" %%mm0, (%1, %%eax, 2) \n\t"
2090 MOVNTQ
" %%mm2, 8(%1, %%eax, 2) \n\t"
2092 "addl $8, %%eax \n\t"
2093 "cmpl %4, %%eax \n\t"
2096 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2101 for (i
=0; i
<chromWidth
; i
++)
2103 udst
[i
] = src
[4*i
+0];
2104 ydst
[2*i
+0] = src
[4*i
+1];
2105 vdst
[i
] = src
[4*i
+2];
2106 ydst
[2*i
+1] = src
[4*i
+3];
2111 for (i
=0; i
<chromWidth
; i
++)
2113 ydst
[2*i
+0] = src
[4*i
+1];
2114 ydst
[2*i
+1] = src
[4*i
+3];
2117 udst
+= chromStride
;
2118 vdst
+= chromStride
;
2123 asm volatile( EMMS
" \n\t"
2130 * Height should be a multiple of 2 and width should be a multiple of 2 (if
2131 * this is a problem for anyone then tell me, and I will fix it).
2132 * Chrominance data is only taken from every secound line,
2133 * others are ignored in the C version.
2134 * FIXME: Write HQ version.
2136 static inline void RENAME(rgb24toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2137 long width
, long height
,
2138 long lumStride
, long chromStride
, long srcStride
)
2141 const long chromWidth
= width
>>1;
2143 for (y
=0; y
<height
-2; y
+=2)
2149 "mov %2, %%"REG_a
" \n\t"
2150 "movq "MANGLE(bgr2YCoeff
)", %%mm6 \n\t"
2151 "movq "MANGLE(w1111
)", %%mm5 \n\t"
2152 "pxor %%mm7, %%mm7 \n\t"
2153 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2156 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2157 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2158 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
2159 "punpcklbw %%mm7, %%mm0 \n\t"
2160 "punpcklbw %%mm7, %%mm1 \n\t"
2161 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
2162 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
2163 "punpcklbw %%mm7, %%mm2 \n\t"
2164 "punpcklbw %%mm7, %%mm3 \n\t"
2165 "pmaddwd %%mm6, %%mm0 \n\t"
2166 "pmaddwd %%mm6, %%mm1 \n\t"
2167 "pmaddwd %%mm6, %%mm2 \n\t"
2168 "pmaddwd %%mm6, %%mm3 \n\t"
2169 #ifndef FAST_BGR2YV12
2170 "psrad $8, %%mm0 \n\t"
2171 "psrad $8, %%mm1 \n\t"
2172 "psrad $8, %%mm2 \n\t"
2173 "psrad $8, %%mm3 \n\t"
2175 "packssdw %%mm1, %%mm0 \n\t"
2176 "packssdw %%mm3, %%mm2 \n\t"
2177 "pmaddwd %%mm5, %%mm0 \n\t"
2178 "pmaddwd %%mm5, %%mm2 \n\t"
2179 "packssdw %%mm2, %%mm0 \n\t"
2180 "psraw $7, %%mm0 \n\t"
2182 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2183 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
2184 "punpcklbw %%mm7, %%mm4 \n\t"
2185 "punpcklbw %%mm7, %%mm1 \n\t"
2186 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
2187 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
2188 "punpcklbw %%mm7, %%mm2 \n\t"
2189 "punpcklbw %%mm7, %%mm3 \n\t"
2190 "pmaddwd %%mm6, %%mm4 \n\t"
2191 "pmaddwd %%mm6, %%mm1 \n\t"
2192 "pmaddwd %%mm6, %%mm2 \n\t"
2193 "pmaddwd %%mm6, %%mm3 \n\t"
2194 #ifndef FAST_BGR2YV12
2195 "psrad $8, %%mm4 \n\t"
2196 "psrad $8, %%mm1 \n\t"
2197 "psrad $8, %%mm2 \n\t"
2198 "psrad $8, %%mm3 \n\t"
2200 "packssdw %%mm1, %%mm4 \n\t"
2201 "packssdw %%mm3, %%mm2 \n\t"
2202 "pmaddwd %%mm5, %%mm4 \n\t"
2203 "pmaddwd %%mm5, %%mm2 \n\t"
2204 "add $24, %%"REG_d
" \n\t"
2205 "packssdw %%mm2, %%mm4 \n\t"
2206 "psraw $7, %%mm4 \n\t"
2208 "packuswb %%mm4, %%mm0 \n\t"
2209 "paddusb "MANGLE(bgr2YOffset
)", %%mm0 \n\t"
2211 MOVNTQ
" %%mm0, (%1, %%"REG_a
") \n\t"
2212 "add $8, %%"REG_a
" \n\t"
2214 : : "r" (src
+width
*3), "r" (ydst
+width
), "g" (-width
)
2215 : "%"REG_a
, "%"REG_d
2222 "mov %4, %%"REG_a
" \n\t"
2223 "movq "MANGLE(w1111
)", %%mm5 \n\t"
2224 "movq "MANGLE(bgr2UCoeff
)", %%mm6 \n\t"
2225 "pxor %%mm7, %%mm7 \n\t"
2226 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2227 "add %%"REG_d
", %%"REG_d
" \n\t"
2230 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2231 PREFETCH
" 64(%1, %%"REG_d
") \n\t"
2232 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2233 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
2234 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
2235 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
2236 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
2237 PAVGB
" %%mm1, %%mm0 \n\t"
2238 PAVGB
" %%mm3, %%mm2 \n\t"
2239 "movq %%mm0, %%mm1 \n\t"
2240 "movq %%mm2, %%mm3 \n\t"
2241 "psrlq $24, %%mm0 \n\t"
2242 "psrlq $24, %%mm2 \n\t"
2243 PAVGB
" %%mm1, %%mm0 \n\t"
2244 PAVGB
" %%mm3, %%mm2 \n\t"
2245 "punpcklbw %%mm7, %%mm0 \n\t"
2246 "punpcklbw %%mm7, %%mm2 \n\t"
2248 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2249 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
2250 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
2251 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
2252 "punpcklbw %%mm7, %%mm0 \n\t"
2253 "punpcklbw %%mm7, %%mm1 \n\t"
2254 "punpcklbw %%mm7, %%mm2 \n\t"
2255 "punpcklbw %%mm7, %%mm3 \n\t"
2256 "paddw %%mm1, %%mm0 \n\t"
2257 "paddw %%mm3, %%mm2 \n\t"
2258 "paddw %%mm2, %%mm0 \n\t"
2259 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
2260 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
2261 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
2262 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
2263 "punpcklbw %%mm7, %%mm4 \n\t"
2264 "punpcklbw %%mm7, %%mm1 \n\t"
2265 "punpcklbw %%mm7, %%mm2 \n\t"
2266 "punpcklbw %%mm7, %%mm3 \n\t"
2267 "paddw %%mm1, %%mm4 \n\t"
2268 "paddw %%mm3, %%mm2 \n\t"
2269 "paddw %%mm4, %%mm2 \n\t"
2270 "psrlw $2, %%mm0 \n\t"
2271 "psrlw $2, %%mm2 \n\t"
2273 "movq "MANGLE(bgr2VCoeff
)", %%mm1 \n\t"
2274 "movq "MANGLE(bgr2VCoeff
)", %%mm3 \n\t"
2276 "pmaddwd %%mm0, %%mm1 \n\t"
2277 "pmaddwd %%mm2, %%mm3 \n\t"
2278 "pmaddwd %%mm6, %%mm0 \n\t"
2279 "pmaddwd %%mm6, %%mm2 \n\t"
2280 #ifndef FAST_BGR2YV12
2281 "psrad $8, %%mm0 \n\t"
2282 "psrad $8, %%mm1 \n\t"
2283 "psrad $8, %%mm2 \n\t"
2284 "psrad $8, %%mm3 \n\t"
2286 "packssdw %%mm2, %%mm0 \n\t"
2287 "packssdw %%mm3, %%mm1 \n\t"
2288 "pmaddwd %%mm5, %%mm0 \n\t"
2289 "pmaddwd %%mm5, %%mm1 \n\t"
2290 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2291 "psraw $7, %%mm0 \n\t"
2293 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2294 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
2295 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
2296 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
2297 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
2298 PAVGB
" %%mm1, %%mm4 \n\t"
2299 PAVGB
" %%mm3, %%mm2 \n\t"
2300 "movq %%mm4, %%mm1 \n\t"
2301 "movq %%mm2, %%mm3 \n\t"
2302 "psrlq $24, %%mm4 \n\t"
2303 "psrlq $24, %%mm2 \n\t"
2304 PAVGB
" %%mm1, %%mm4 \n\t"
2305 PAVGB
" %%mm3, %%mm2 \n\t"
2306 "punpcklbw %%mm7, %%mm4 \n\t"
2307 "punpcklbw %%mm7, %%mm2 \n\t"
2309 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2310 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
2311 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
2312 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
2313 "punpcklbw %%mm7, %%mm4 \n\t"
2314 "punpcklbw %%mm7, %%mm1 \n\t"
2315 "punpcklbw %%mm7, %%mm2 \n\t"
2316 "punpcklbw %%mm7, %%mm3 \n\t"
2317 "paddw %%mm1, %%mm4 \n\t"
2318 "paddw %%mm3, %%mm2 \n\t"
2319 "paddw %%mm2, %%mm4 \n\t"
2320 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
2321 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
2322 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
2323 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
2324 "punpcklbw %%mm7, %%mm5 \n\t"
2325 "punpcklbw %%mm7, %%mm1 \n\t"
2326 "punpcklbw %%mm7, %%mm2 \n\t"
2327 "punpcklbw %%mm7, %%mm3 \n\t"
2328 "paddw %%mm1, %%mm5 \n\t"
2329 "paddw %%mm3, %%mm2 \n\t"
2330 "paddw %%mm5, %%mm2 \n\t"
2331 "movq "MANGLE(w1111
)", %%mm5 \n\t"
2332 "psrlw $2, %%mm4 \n\t"
2333 "psrlw $2, %%mm2 \n\t"
2335 "movq "MANGLE(bgr2VCoeff
)", %%mm1 \n\t"
2336 "movq "MANGLE(bgr2VCoeff
)", %%mm3 \n\t"
2338 "pmaddwd %%mm4, %%mm1 \n\t"
2339 "pmaddwd %%mm2, %%mm3 \n\t"
2340 "pmaddwd %%mm6, %%mm4 \n\t"
2341 "pmaddwd %%mm6, %%mm2 \n\t"
2342 #ifndef FAST_BGR2YV12
2343 "psrad $8, %%mm4 \n\t"
2344 "psrad $8, %%mm1 \n\t"
2345 "psrad $8, %%mm2 \n\t"
2346 "psrad $8, %%mm3 \n\t"
2348 "packssdw %%mm2, %%mm4 \n\t"
2349 "packssdw %%mm3, %%mm1 \n\t"
2350 "pmaddwd %%mm5, %%mm4 \n\t"
2351 "pmaddwd %%mm5, %%mm1 \n\t"
2352 "add $24, %%"REG_d
" \n\t"
2353 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2354 "psraw $7, %%mm4 \n\t"
2356 "movq %%mm0, %%mm1 \n\t"
2357 "punpckldq %%mm4, %%mm0 \n\t"
2358 "punpckhdq %%mm4, %%mm1 \n\t"
2359 "packsswb %%mm1, %%mm0 \n\t"
2360 "paddb "MANGLE(bgr2UVOffset
)", %%mm0 \n\t"
2361 "movd %%mm0, (%2, %%"REG_a
") \n\t"
2362 "punpckhdq %%mm0, %%mm0 \n\t"
2363 "movd %%mm0, (%3, %%"REG_a
") \n\t"
2364 "add $4, %%"REG_a
" \n\t"
2366 : : "r" (src
+chromWidth
*6), "r" (src
+srcStride
+chromWidth
*6), "r" (udst
+chromWidth
), "r" (vdst
+chromWidth
), "g" (-chromWidth
)
2367 : "%"REG_a
, "%"REG_d
2370 udst
+= chromStride
;
2371 vdst
+= chromStride
;
2375 asm volatile( EMMS
" \n\t"
2381 for (; y
<height
; y
+=2)
2384 for (i
=0; i
<chromWidth
; i
++)
2386 unsigned int b
= src
[6*i
+0];
2387 unsigned int g
= src
[6*i
+1];
2388 unsigned int r
= src
[6*i
+2];
2390 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2391 unsigned int V
= ((RV
*r
+ GV
*g
+ BV
*b
)>>RGB2YUV_SHIFT
) + 128;
2392 unsigned int U
= ((RU
*r
+ GU
*g
+ BU
*b
)>>RGB2YUV_SHIFT
) + 128;
2402 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2408 for (i
=0; i
<chromWidth
; i
++)
2410 unsigned int b
= src
[6*i
+0];
2411 unsigned int g
= src
[6*i
+1];
2412 unsigned int r
= src
[6*i
+2];
2414 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2422 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2425 udst
+= chromStride
;
2426 vdst
+= chromStride
;
2432 void RENAME(interleaveBytes
)(uint8_t *src1
, uint8_t *src2
, uint8_t *dest
,
2433 long width
, long height
, long src1Stride
,
2434 long src2Stride
, long dstStride
){
2437 for (h
=0; h
< height
; h
++)
2444 "xor %%"REG_a
", %%"REG_a
" \n\t"
2446 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2447 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2448 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
2449 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
2450 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
2451 "punpcklbw %%xmm2, %%xmm0 \n\t"
2452 "punpckhbw %%xmm2, %%xmm1 \n\t"
2453 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
2454 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
2455 "add $16, %%"REG_a
" \n\t"
2456 "cmp %3, %%"REG_a
" \n\t"
2458 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" (width
-15)
2459 : "memory", "%"REG_a
""
2463 "xor %%"REG_a
", %%"REG_a
" \n\t"
2465 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2466 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2467 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
2468 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
2469 "movq %%mm0, %%mm1 \n\t"
2470 "movq %%mm2, %%mm3 \n\t"
2471 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
2472 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
2473 "punpcklbw %%mm4, %%mm0 \n\t"
2474 "punpckhbw %%mm4, %%mm1 \n\t"
2475 "punpcklbw %%mm5, %%mm2 \n\t"
2476 "punpckhbw %%mm5, %%mm3 \n\t"
2477 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 2) \n\t"
2478 MOVNTQ
" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
2479 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
2480 MOVNTQ
" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
2481 "add $16, %%"REG_a
" \n\t"
2482 "cmp %3, %%"REG_a
" \n\t"
2484 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" (width
-15)
2485 : "memory", "%"REG_a
2488 for (w
= (width
&(~15)); w
< width
; w
++)
2490 dest
[2*w
+0] = src1
[w
];
2491 dest
[2*w
+1] = src2
[w
];
2494 for (w
=0; w
< width
; w
++)
2496 dest
[2*w
+0] = src1
[w
];
2497 dest
[2*w
+1] = src2
[w
];
2513 static inline void RENAME(vu9_to_vu12
)(const uint8_t *src1
, const uint8_t *src2
,
2514 uint8_t *dst1
, uint8_t *dst2
,
2515 long width
, long height
,
2516 long srcStride1
, long srcStride2
,
2517 long dstStride1
, long dstStride2
)
2520 w
=width
/2; h
=height
/2;
2525 ::"m"(*(src1
+srcStride1
)),"m"(*(src2
+srcStride2
)):"memory");
2528 const uint8_t* s1
=src1
+srcStride1
*(y
>>1);
2529 uint8_t* d
=dst1
+dstStride1
*y
;
2535 PREFETCH
" 32%1 \n\t"
2536 "movq %1, %%mm0 \n\t"
2537 "movq 8%1, %%mm2 \n\t"
2538 "movq 16%1, %%mm4 \n\t"
2539 "movq 24%1, %%mm6 \n\t"
2540 "movq %%mm0, %%mm1 \n\t"
2541 "movq %%mm2, %%mm3 \n\t"
2542 "movq %%mm4, %%mm5 \n\t"
2543 "movq %%mm6, %%mm7 \n\t"
2544 "punpcklbw %%mm0, %%mm0 \n\t"
2545 "punpckhbw %%mm1, %%mm1 \n\t"
2546 "punpcklbw %%mm2, %%mm2 \n\t"
2547 "punpckhbw %%mm3, %%mm3 \n\t"
2548 "punpcklbw %%mm4, %%mm4 \n\t"
2549 "punpckhbw %%mm5, %%mm5 \n\t"
2550 "punpcklbw %%mm6, %%mm6 \n\t"
2551 "punpckhbw %%mm7, %%mm7 \n\t"
2552 MOVNTQ
" %%mm0, %0 \n\t"
2553 MOVNTQ
" %%mm1, 8%0 \n\t"
2554 MOVNTQ
" %%mm2, 16%0 \n\t"
2555 MOVNTQ
" %%mm3, 24%0 \n\t"
2556 MOVNTQ
" %%mm4, 32%0 \n\t"
2557 MOVNTQ
" %%mm5, 40%0 \n\t"
2558 MOVNTQ
" %%mm6, 48%0 \n\t"
2559 MOVNTQ
" %%mm7, 56%0"
2565 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s1
[x
];
2568 const uint8_t* s2
=src2
+srcStride2
*(y
>>1);
2569 uint8_t* d
=dst2
+dstStride2
*y
;
2575 PREFETCH
" 32%1 \n\t"
2576 "movq %1, %%mm0 \n\t"
2577 "movq 8%1, %%mm2 \n\t"
2578 "movq 16%1, %%mm4 \n\t"
2579 "movq 24%1, %%mm6 \n\t"
2580 "movq %%mm0, %%mm1 \n\t"
2581 "movq %%mm2, %%mm3 \n\t"
2582 "movq %%mm4, %%mm5 \n\t"
2583 "movq %%mm6, %%mm7 \n\t"
2584 "punpcklbw %%mm0, %%mm0 \n\t"
2585 "punpckhbw %%mm1, %%mm1 \n\t"
2586 "punpcklbw %%mm2, %%mm2 \n\t"
2587 "punpckhbw %%mm3, %%mm3 \n\t"
2588 "punpcklbw %%mm4, %%mm4 \n\t"
2589 "punpckhbw %%mm5, %%mm5 \n\t"
2590 "punpcklbw %%mm6, %%mm6 \n\t"
2591 "punpckhbw %%mm7, %%mm7 \n\t"
2592 MOVNTQ
" %%mm0, %0 \n\t"
2593 MOVNTQ
" %%mm1, 8%0 \n\t"
2594 MOVNTQ
" %%mm2, 16%0 \n\t"
2595 MOVNTQ
" %%mm3, 24%0 \n\t"
2596 MOVNTQ
" %%mm4, 32%0 \n\t"
2597 MOVNTQ
" %%mm5, 40%0 \n\t"
2598 MOVNTQ
" %%mm6, 48%0 \n\t"
2599 MOVNTQ
" %%mm7, 56%0"
2605 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s2
[x
];
2616 static inline void RENAME(yvu9_to_yuy2
)(const uint8_t *src1
, const uint8_t *src2
, const uint8_t *src3
,
2618 long width
, long height
,
2619 long srcStride1
, long srcStride2
,
2620 long srcStride3
, long dstStride
)
2623 w
=width
/2; h
=height
;
2625 const uint8_t* yp
=src1
+srcStride1
*y
;
2626 const uint8_t* up
=src2
+srcStride2
*(y
>>2);
2627 const uint8_t* vp
=src3
+srcStride3
*(y
>>2);
2628 uint8_t* d
=dst
+dstStride
*y
;
2634 PREFETCH
" 32(%1, %0) \n\t"
2635 PREFETCH
" 32(%2, %0) \n\t"
2636 PREFETCH
" 32(%3, %0) \n\t"
2637 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2638 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2639 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2640 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2641 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2642 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2643 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2644 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2645 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2646 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2648 "movq %%mm1, %%mm6 \n\t"
2649 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2650 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2651 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2652 MOVNTQ
" %%mm0, (%4, %0, 8) \n\t"
2653 MOVNTQ
" %%mm3, 8(%4, %0, 8) \n\t"
2655 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2656 "movq 8(%1, %0, 4), %%mm0 \n\t"
2657 "movq %%mm0, %%mm3 \n\t"
2658 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2659 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2660 MOVNTQ
" %%mm0, 16(%4, %0, 8) \n\t"
2661 MOVNTQ
" %%mm3, 24(%4, %0, 8) \n\t"
2663 "movq %%mm4, %%mm6 \n\t"
2664 "movq 16(%1, %0, 4), %%mm0 \n\t"
2665 "movq %%mm0, %%mm3 \n\t"
2666 "punpcklbw %%mm5, %%mm4 \n\t"
2667 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2668 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2669 MOVNTQ
" %%mm0, 32(%4, %0, 8) \n\t"
2670 MOVNTQ
" %%mm3, 40(%4, %0, 8) \n\t"
2672 "punpckhbw %%mm5, %%mm6 \n\t"
2673 "movq 24(%1, %0, 4), %%mm0 \n\t"
2674 "movq %%mm0, %%mm3 \n\t"
2675 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2676 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2677 MOVNTQ
" %%mm0, 48(%4, %0, 8) \n\t"
2678 MOVNTQ
" %%mm3, 56(%4, %0, 8) \n\t"
2681 : "r"(yp
), "r" (up
), "r"(vp
), "r"(d
)
2687 const long x2
= x
<<2;
2690 d
[8*x
+2] = yp
[x2
+1];
2692 d
[8*x
+4] = yp
[x2
+2];
2694 d
[8*x
+6] = yp
[x2
+3];
2707 static inline void RENAME(rgb2rgb_init
)(void){
2708 rgb15to16
= RENAME(rgb15to16
);
2709 rgb15to24
= RENAME(rgb15to24
);
2710 rgb15to32
= RENAME(rgb15to32
);
2711 rgb16to24
= RENAME(rgb16to24
);
2712 rgb16to32
= RENAME(rgb16to32
);
2713 rgb16to15
= RENAME(rgb16to15
);
2714 rgb24to16
= RENAME(rgb24to16
);
2715 rgb24to15
= RENAME(rgb24to15
);
2716 rgb24to32
= RENAME(rgb24to32
);
2717 rgb32to16
= RENAME(rgb32to16
);
2718 rgb32to15
= RENAME(rgb32to15
);
2719 rgb32to24
= RENAME(rgb32to24
);
2720 rgb24tobgr15
= RENAME(rgb24tobgr15
);
2721 rgb24tobgr16
= RENAME(rgb24tobgr16
);
2722 rgb24tobgr24
= RENAME(rgb24tobgr24
);
2723 rgb32tobgr32
= RENAME(rgb32tobgr32
);
2724 rgb32tobgr16
= RENAME(rgb32tobgr16
);
2725 rgb32tobgr15
= RENAME(rgb32tobgr15
);
2726 yv12toyuy2
= RENAME(yv12toyuy2
);
2727 yv12touyvy
= RENAME(yv12touyvy
);
2728 yuv422ptoyuy2
= RENAME(yuv422ptoyuy2
);
2729 yuy2toyv12
= RENAME(yuy2toyv12
);
2730 // uyvytoyv12 = RENAME(uyvytoyv12);
2731 // yvu9toyv12 = RENAME(yvu9toyv12);
2732 planar2x
= RENAME(planar2x
);
2733 rgb24toyv12
= RENAME(rgb24toyv12
);
2734 interleaveBytes
= RENAME(interleaveBytes
);
2735 vu9_to_vu12
= RENAME(vu9_to_vu12
);
2736 yvu9_to_yuy2
= RENAME(yvu9_to_yuy2
);