 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
/* CPU-variant instruction-name macros used inside the inline-asm strings below.
 * The conflicting redefinitions were originally separated by #if/#elif guards
 * (3DNow vs MMX2 vs plain MMX) that were lost in extraction — TODO confirm
 * against upstream before editing.
 * NOTE(review): the leading integers on these lines are fused original source
 * line numbers from a lossy extraction, not code. */
47 #define PREFETCH "prefetch"
48 #define PREFETCHW "prefetchw"
49 #define PAVGB "pavgusb"
51 #define PREFETCH "prefetchnta"
52 #define PREFETCHW "prefetcht0"
55 #define PREFETCH " # nop"
56 #define PREFETCHW " # nop"
60 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
67 #define MOVNTQ "movntq"
68 #define SFENCE "sfence"
71 #define SFENCE " # nop"
/* rgb24tobgr32: packed 24-bit to 32-bit conversion; the alpha byte is forced
 * on with mask32a ("por %%mm7"), 8 pixels per iteration, non-temporal stores.
 * NOTE(review): lossy extraction — braces, the loop header, and parts of the
 * asm operand lists are missing, and the leading integers are fused original
 * line numbers. Verify against upstream rgb2rgb_template.c before editing. */
74 static inline void RENAME(rgb24tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
77 const uint8_t *s
= src
;
80 const uint8_t *mm_end
;
84 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
86 __asm__
volatile("movq %0, %%mm7"::"m"(mask32a
):"memory");
92 "punpckldq 3%1, %%mm0 \n\t"
93 "movd 6%1, %%mm1 \n\t"
94 "punpckldq 9%1, %%mm1 \n\t"
95 "movd 12%1, %%mm2 \n\t"
96 "punpckldq 15%1, %%mm2 \n\t"
97 "movd 18%1, %%mm3 \n\t"
98 "punpckldq 21%1, %%mm3 \n\t"
99 "por %%mm7, %%mm0 \n\t"
100 "por %%mm7, %%mm1 \n\t"
101 "por %%mm7, %%mm2 \n\t"
102 "por %%mm7, %%mm3 \n\t"
103 MOVNTQ
" %%mm0, %0 \n\t"
104 MOVNTQ
" %%mm1, 8%0 \n\t"
105 MOVNTQ
" %%mm2, 16%0 \n\t"
113 __asm__
volatile(SFENCE:::"memory")/* flush non-temporal stores */;
114 __asm__
volatile(EMMS:::"memory")/* leave MMX state for FPU use */;
118 #ifdef WORDS_BIGENDIAN
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/* rgb32tobgr24: drops the alpha byte and repacks 8 RGB32 pixels into 24 bytes
 * of RGB24 using the mask24l/mask24h/... shift-and-merge scheme.
 * NOTE(review): lossy extraction — interior lines are missing and the leading
 * integers are fused original line numbers; do not edit without the upstream
 * file for reference. */
134 static inline void RENAME(rgb32tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
137 const uint8_t *s
= src
;
140 const uint8_t *mm_end
;
144 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
150 "movq %1, %%mm0 \n\t"
151 "movq 8%1, %%mm1 \n\t"
152 "movq 16%1, %%mm4 \n\t"
153 "movq 24%1, %%mm5 \n\t"
154 "movq %%mm0, %%mm2 \n\t"
155 "movq %%mm1, %%mm3 \n\t"
156 "movq %%mm4, %%mm6 \n\t"
157 "movq %%mm5, %%mm7 \n\t"
158 "psrlq $8, %%mm2 \n\t"
159 "psrlq $8, %%mm3 \n\t"
160 "psrlq $8, %%mm6 \n\t"
161 "psrlq $8, %%mm7 \n\t"
162 "pand %2, %%mm0 \n\t"
163 "pand %2, %%mm1 \n\t"
164 "pand %2, %%mm4 \n\t"
165 "pand %2, %%mm5 \n\t"
166 "pand %3, %%mm2 \n\t"
167 "pand %3, %%mm3 \n\t"
168 "pand %3, %%mm6 \n\t"
169 "pand %3, %%mm7 \n\t"
170 "por %%mm2, %%mm0 \n\t"
171 "por %%mm3, %%mm1 \n\t"
172 "por %%mm6, %%mm4 \n\t"
173 "por %%mm7, %%mm5 \n\t"
175 "movq %%mm1, %%mm2 \n\t"
176 "movq %%mm4, %%mm3 \n\t"
177 "psllq $48, %%mm2 \n\t"
178 "psllq $32, %%mm3 \n\t"
179 "pand %4, %%mm2 \n\t"
180 "pand %5, %%mm3 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "psrlq $16, %%mm1 \n\t"
183 "psrlq $32, %%mm4 \n\t"
184 "psllq $16, %%mm5 \n\t"
185 "por %%mm3, %%mm1 \n\t"
186 "pand %6, %%mm5 \n\t"
187 "por %%mm5, %%mm4 \n\t"
189 MOVNTQ
" %%mm0, %0 \n\t"
190 MOVNTQ
" %%mm1, 8%0 \n\t"
193 :"m"(*s
),"m"(mask24l
),
194 "m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
199 __asm__
volatile(SFENCE:::"memory");
200 __asm__
volatile(EMMS:::"memory");
204 #ifdef WORDS_BIGENDIAN
205 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
221 original by Strepto/Astral
222 ported to gcc & bugfixed: A'rpi
223 MMX2, 3DNOW optimization by Nick Kurshev
224 32-bit C version, and and&add trick by Michael Niedermayer
/* rgb15to16: 555 -> 565 using the and+add trick — (x & 0x7FFF) + (x & 0x7FE0)
 * shifts green and red up one bit without per-component shifts.
 * MMX fast path, then a 32-bit scalar tail, then a 16-bit scalar tail.
 * NOTE(review): lossy extraction — loop headers/braces are missing and the
 * leading integers are fused original line numbers. */
226 static inline void RENAME(rgb15to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
228 register const uint8_t* s
=src
;
229 register uint8_t* d
=dst
;
230 register const uint8_t *end
;
231 const uint8_t *mm_end
;
234 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
235 __asm__
volatile("movq %0, %%mm4"::"m"(mask15s
));
241 "movq %1, %%mm0 \n\t"
242 "movq 8%1, %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t"
245 "pand %%mm4, %%mm0 \n\t"
246 "pand %%mm4, %%mm2 \n\t"
247 "paddw %%mm1, %%mm0 \n\t"
248 "paddw %%mm3, %%mm2 \n\t"
249 MOVNTQ
" %%mm0, %0 \n\t"
257 __asm__
volatile(SFENCE:::"memory");
258 __asm__
volatile(EMMS:::"memory");
263 register unsigned x
= *((const uint32_t *)s
);
264 *((uint32_t *)d
) = (x
&0x7FFF7FFF) + (x
&0x7FE07FE0);
270 register unsigned short x
= *((const uint16_t *)s
);
271 *((uint16_t *)d
) = (x
&0x7FFF) + (x
&0x7FE0);
/* rgb16to15: 565 -> 555 — shift red+green right by one (masked with mask15rg)
 * and keep blue (mask15b). MMX path plus 32-bit and 16-bit scalar tails.
 * NOTE(review): lossy extraction — interior lines missing; leading integers
 * are fused original line numbers. */
275 static inline void RENAME(rgb16to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
277 register const uint8_t* s
=src
;
278 register uint8_t* d
=dst
;
279 register const uint8_t *end
;
280 const uint8_t *mm_end
;
283 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
284 __asm__
volatile("movq %0, %%mm7"::"m"(mask15rg
));
285 __asm__
volatile("movq %0, %%mm6"::"m"(mask15b
));
291 "movq %1, %%mm0 \n\t"
292 "movq 8%1, %%mm2 \n\t"
293 "movq %%mm0, %%mm1 \n\t"
294 "movq %%mm2, %%mm3 \n\t"
295 "psrlq $1, %%mm0 \n\t"
296 "psrlq $1, %%mm2 \n\t"
297 "pand %%mm7, %%mm0 \n\t"
298 "pand %%mm7, %%mm2 \n\t"
299 "pand %%mm6, %%mm1 \n\t"
300 "pand %%mm6, %%mm3 \n\t"
301 "por %%mm1, %%mm0 \n\t"
302 "por %%mm3, %%mm2 \n\t"
303 MOVNTQ
" %%mm0, %0 \n\t"
311 __asm__
volatile(SFENCE:::"memory");
312 __asm__
volatile(EMMS:::"memory");
317 register uint32_t x
= *((const uint32_t*)s
);
318 *((uint32_t *)d
) = ((x
>>1)&0x7FE07FE0) | (x
&0x001F001F);
324 register uint16_t x
= *((const uint16_t*)s
);
325 *((uint16_t *)d
) = ((x
>>1)&0x7FE0) | (x
&0x001F);
/* rgb32to16: RGB32 -> RGB565. Two MMX variants: a pmaddwd-based one (gated by
 * "#if 1", faster when multiplies are cheap) and a shift/mask one using
 * red_16mask/green_16mask/blue_16mask; scalar tail packs one pixel at a time.
 * NOTE(review): lossy extraction — loop headers, braces and parts of the asm
 * operand lists are missing; leading integers are fused original line numbers. */
331 static inline void RENAME(rgb32to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
333 const uint8_t *s
= src
;
336 const uint8_t *mm_end
;
338 uint16_t *d
= (uint16_t *)dst
;
342 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
344 "movq %3, %%mm5 \n\t"
345 "movq %4, %%mm6 \n\t"
346 "movq %5, %%mm7 \n\t"
350 PREFETCH
" 32(%1) \n\t"
351 "movd (%1), %%mm0 \n\t"
352 "movd 4(%1), %%mm3 \n\t"
353 "punpckldq 8(%1), %%mm0 \n\t"
354 "punpckldq 12(%1), %%mm3 \n\t"
355 "movq %%mm0, %%mm1 \n\t"
356 "movq %%mm3, %%mm4 \n\t"
357 "pand %%mm6, %%mm0 \n\t"
358 "pand %%mm6, %%mm3 \n\t"
359 "pmaddwd %%mm7, %%mm0 \n\t"
360 "pmaddwd %%mm7, %%mm3 \n\t"
361 "pand %%mm5, %%mm1 \n\t"
362 "pand %%mm5, %%mm4 \n\t"
363 "por %%mm1, %%mm0 \n\t"
364 "por %%mm4, %%mm3 \n\t"
365 "psrld $5, %%mm0 \n\t"
366 "pslld $11, %%mm3 \n\t"
367 "por %%mm3, %%mm0 \n\t"
368 MOVNTQ
" %%mm0, (%0) \n\t"
375 : "r" (mm_end
), "m" (mask3216g
), "m" (mask3216br
), "m" (mul3216
)
378 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
380 "movq %0, %%mm7 \n\t"
381 "movq %1, %%mm6 \n\t"
382 ::"m"(red_16mask
),"m"(green_16mask
));
387 "movd %1, %%mm0 \n\t"
388 "movd 4%1, %%mm3 \n\t"
389 "punpckldq 8%1, %%mm0 \n\t"
390 "punpckldq 12%1, %%mm3 \n\t"
391 "movq %%mm0, %%mm1 \n\t"
392 "movq %%mm0, %%mm2 \n\t"
393 "movq %%mm3, %%mm4 \n\t"
394 "movq %%mm3, %%mm5 \n\t"
395 "psrlq $3, %%mm0 \n\t"
396 "psrlq $3, %%mm3 \n\t"
397 "pand %2, %%mm0 \n\t"
398 "pand %2, %%mm3 \n\t"
399 "psrlq $5, %%mm1 \n\t"
400 "psrlq $5, %%mm4 \n\t"
401 "pand %%mm6, %%mm1 \n\t"
402 "pand %%mm6, %%mm4 \n\t"
403 "psrlq $8, %%mm2 \n\t"
404 "psrlq $8, %%mm5 \n\t"
405 "pand %%mm7, %%mm2 \n\t"
406 "pand %%mm7, %%mm5 \n\t"
407 "por %%mm1, %%mm0 \n\t"
408 "por %%mm4, %%mm3 \n\t"
409 "por %%mm2, %%mm0 \n\t"
410 "por %%mm5, %%mm3 \n\t"
411 "psllq $16, %%mm3 \n\t"
412 "por %%mm3, %%mm0 \n\t"
413 MOVNTQ
" %%mm0, %0 \n\t"
414 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
419 __asm__
volatile(SFENCE:::"memory");
420 __asm__
volatile(EMMS:::"memory");
424 register int rgb
= *(const uint32_t*)s
; s
+= 4;
425 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>8);
/* rgb32tobgr16: RGB32 -> BGR565 — same shift/mask scheme as rgb32to16 but
 * with red/blue swapped (psllq $8 into red mask, psrlq $19 into blue mask),
 * as the scalar tail shows: R<<8 | G>>5 | B>>19.
 * NOTE(review): lossy extraction — interior lines missing; leading integers
 * are fused original line numbers. */
429 static inline void RENAME(rgb32tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
431 const uint8_t *s
= src
;
434 const uint8_t *mm_end
;
436 uint16_t *d
= (uint16_t *)dst
;
439 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
441 "movq %0, %%mm7 \n\t"
442 "movq %1, %%mm6 \n\t"
443 ::"m"(red_16mask
),"m"(green_16mask
));
449 "movd %1, %%mm0 \n\t"
450 "movd 4%1, %%mm3 \n\t"
451 "punpckldq 8%1, %%mm0 \n\t"
452 "punpckldq 12%1, %%mm3 \n\t"
453 "movq %%mm0, %%mm1 \n\t"
454 "movq %%mm0, %%mm2 \n\t"
455 "movq %%mm3, %%mm4 \n\t"
456 "movq %%mm3, %%mm5 \n\t"
457 "psllq $8, %%mm0 \n\t"
458 "psllq $8, %%mm3 \n\t"
459 "pand %%mm7, %%mm0 \n\t"
460 "pand %%mm7, %%mm3 \n\t"
461 "psrlq $5, %%mm1 \n\t"
462 "psrlq $5, %%mm4 \n\t"
463 "pand %%mm6, %%mm1 \n\t"
464 "pand %%mm6, %%mm4 \n\t"
465 "psrlq $19, %%mm2 \n\t"
466 "psrlq $19, %%mm5 \n\t"
467 "pand %2, %%mm2 \n\t"
468 "pand %2, %%mm5 \n\t"
469 "por %%mm1, %%mm0 \n\t"
470 "por %%mm4, %%mm3 \n\t"
471 "por %%mm2, %%mm0 \n\t"
472 "por %%mm5, %%mm3 \n\t"
473 "psllq $16, %%mm3 \n\t"
474 "por %%mm3, %%mm0 \n\t"
475 MOVNTQ
" %%mm0, %0 \n\t"
476 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
480 __asm__
volatile(SFENCE:::"memory");
481 __asm__
volatile(EMMS:::"memory");
485 register int rgb
= *(const uint32_t*)s
; s
+= 4;
486 *d
++ = ((rgb
&0xF8)<<8) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>19);
/* rgb32to15: RGB32 -> RGB555. Mirrors rgb32to16 (pmaddwd variant + shift/mask
 * variant) but with 5-bit green: note psrld $6 / pslld $10 and the 15-bit
 * masks (mask3215g, mul3215, *_15mask). Scalar tail: B>>3 | G>>6 | R>>9.
 * NOTE(review): lossy extraction — interior lines missing; leading integers
 * are fused original line numbers. */
490 static inline void RENAME(rgb32to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
492 const uint8_t *s
= src
;
495 const uint8_t *mm_end
;
497 uint16_t *d
= (uint16_t *)dst
;
501 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
503 "movq %3, %%mm5 \n\t"
504 "movq %4, %%mm6 \n\t"
505 "movq %5, %%mm7 \n\t"
509 PREFETCH
" 32(%1) \n\t"
510 "movd (%1), %%mm0 \n\t"
511 "movd 4(%1), %%mm3 \n\t"
512 "punpckldq 8(%1), %%mm0 \n\t"
513 "punpckldq 12(%1), %%mm3 \n\t"
514 "movq %%mm0, %%mm1 \n\t"
515 "movq %%mm3, %%mm4 \n\t"
516 "pand %%mm6, %%mm0 \n\t"
517 "pand %%mm6, %%mm3 \n\t"
518 "pmaddwd %%mm7, %%mm0 \n\t"
519 "pmaddwd %%mm7, %%mm3 \n\t"
520 "pand %%mm5, %%mm1 \n\t"
521 "pand %%mm5, %%mm4 \n\t"
522 "por %%mm1, %%mm0 \n\t"
523 "por %%mm4, %%mm3 \n\t"
524 "psrld $6, %%mm0 \n\t"
525 "pslld $10, %%mm3 \n\t"
526 "por %%mm3, %%mm0 \n\t"
527 MOVNTQ
" %%mm0, (%0) \n\t"
534 : "r" (mm_end
), "m" (mask3215g
), "m" (mask3216br
), "m" (mul3215
)
537 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
539 "movq %0, %%mm7 \n\t"
540 "movq %1, %%mm6 \n\t"
541 ::"m"(red_15mask
),"m"(green_15mask
));
546 "movd %1, %%mm0 \n\t"
547 "movd 4%1, %%mm3 \n\t"
548 "punpckldq 8%1, %%mm0 \n\t"
549 "punpckldq 12%1, %%mm3 \n\t"
550 "movq %%mm0, %%mm1 \n\t"
551 "movq %%mm0, %%mm2 \n\t"
552 "movq %%mm3, %%mm4 \n\t"
553 "movq %%mm3, %%mm5 \n\t"
554 "psrlq $3, %%mm0 \n\t"
555 "psrlq $3, %%mm3 \n\t"
556 "pand %2, %%mm0 \n\t"
557 "pand %2, %%mm3 \n\t"
558 "psrlq $6, %%mm1 \n\t"
559 "psrlq $6, %%mm4 \n\t"
560 "pand %%mm6, %%mm1 \n\t"
561 "pand %%mm6, %%mm4 \n\t"
562 "psrlq $9, %%mm2 \n\t"
563 "psrlq $9, %%mm5 \n\t"
564 "pand %%mm7, %%mm2 \n\t"
565 "pand %%mm7, %%mm5 \n\t"
566 "por %%mm1, %%mm0 \n\t"
567 "por %%mm4, %%mm3 \n\t"
568 "por %%mm2, %%mm0 \n\t"
569 "por %%mm5, %%mm3 \n\t"
570 "psllq $16, %%mm3 \n\t"
571 "por %%mm3, %%mm0 \n\t"
572 MOVNTQ
" %%mm0, %0 \n\t"
573 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
578 __asm__
volatile(SFENCE:::"memory");
579 __asm__
volatile(EMMS:::"memory");
583 register int rgb
= *(const uint32_t*)s
; s
+= 4;
584 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>9);
/* rgb32tobgr15: RGB32 -> BGR555 — shift/mask scheme with red/blue swapped
 * relative to rgb32to15 (psllq $7 for red, psrlq $19 for blue), per the
 * scalar tail: R<<7 | G>>6 | B>>19.
 * NOTE(review): lossy extraction — interior lines missing; leading integers
 * are fused original line numbers. */
588 static inline void RENAME(rgb32tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
590 const uint8_t *s
= src
;
593 const uint8_t *mm_end
;
595 uint16_t *d
= (uint16_t *)dst
;
598 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
600 "movq %0, %%mm7 \n\t"
601 "movq %1, %%mm6 \n\t"
602 ::"m"(red_15mask
),"m"(green_15mask
));
608 "movd %1, %%mm0 \n\t"
609 "movd 4%1, %%mm3 \n\t"
610 "punpckldq 8%1, %%mm0 \n\t"
611 "punpckldq 12%1, %%mm3 \n\t"
612 "movq %%mm0, %%mm1 \n\t"
613 "movq %%mm0, %%mm2 \n\t"
614 "movq %%mm3, %%mm4 \n\t"
615 "movq %%mm3, %%mm5 \n\t"
616 "psllq $7, %%mm0 \n\t"
617 "psllq $7, %%mm3 \n\t"
618 "pand %%mm7, %%mm0 \n\t"
619 "pand %%mm7, %%mm3 \n\t"
620 "psrlq $6, %%mm1 \n\t"
621 "psrlq $6, %%mm4 \n\t"
622 "pand %%mm6, %%mm1 \n\t"
623 "pand %%mm6, %%mm4 \n\t"
624 "psrlq $19, %%mm2 \n\t"
625 "psrlq $19, %%mm5 \n\t"
626 "pand %2, %%mm2 \n\t"
627 "pand %2, %%mm5 \n\t"
628 "por %%mm1, %%mm0 \n\t"
629 "por %%mm4, %%mm3 \n\t"
630 "por %%mm2, %%mm0 \n\t"
631 "por %%mm5, %%mm3 \n\t"
632 "psllq $16, %%mm3 \n\t"
633 "por %%mm3, %%mm0 \n\t"
634 MOVNTQ
" %%mm0, %0 \n\t"
635 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
639 __asm__
volatile(SFENCE:::"memory");
640 __asm__
volatile(EMMS:::"memory");
644 register int rgb
= *(const uint32_t*)s
; s
+= 4;
645 *d
++ = ((rgb
&0xF8)<<7) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>19);
/* rgb24tobgr16: packed 24-bit -> BGR565. Loads 4 pixels at 3-byte strides
 * (movd %1 / 3%1 / punpckldq 6%1 / 9%1) then applies the same 565 shift/mask
 * scheme; scalar tail packs b>>3 | (g&0xFC)<<3 | (r&0xF8)<<8.
 * NOTE(review): lossy extraction — interior lines (including the tail's
 * b/g/r loads) are missing; leading integers are fused original line numbers. */
649 static inline void RENAME(rgb24tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
651 const uint8_t *s
= src
;
654 const uint8_t *mm_end
;
656 uint16_t *d
= (uint16_t *)dst
;
659 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
661 "movq %0, %%mm7 \n\t"
662 "movq %1, %%mm6 \n\t"
663 ::"m"(red_16mask
),"m"(green_16mask
));
669 "movd %1, %%mm0 \n\t"
670 "movd 3%1, %%mm3 \n\t"
671 "punpckldq 6%1, %%mm0 \n\t"
672 "punpckldq 9%1, %%mm3 \n\t"
673 "movq %%mm0, %%mm1 \n\t"
674 "movq %%mm0, %%mm2 \n\t"
675 "movq %%mm3, %%mm4 \n\t"
676 "movq %%mm3, %%mm5 \n\t"
677 "psrlq $3, %%mm0 \n\t"
678 "psrlq $3, %%mm3 \n\t"
679 "pand %2, %%mm0 \n\t"
680 "pand %2, %%mm3 \n\t"
681 "psrlq $5, %%mm1 \n\t"
682 "psrlq $5, %%mm4 \n\t"
683 "pand %%mm6, %%mm1 \n\t"
684 "pand %%mm6, %%mm4 \n\t"
685 "psrlq $8, %%mm2 \n\t"
686 "psrlq $8, %%mm5 \n\t"
687 "pand %%mm7, %%mm2 \n\t"
688 "pand %%mm7, %%mm5 \n\t"
689 "por %%mm1, %%mm0 \n\t"
690 "por %%mm4, %%mm3 \n\t"
691 "por %%mm2, %%mm0 \n\t"
692 "por %%mm5, %%mm3 \n\t"
693 "psllq $16, %%mm3 \n\t"
694 "por %%mm3, %%mm0 \n\t"
695 MOVNTQ
" %%mm0, %0 \n\t"
696 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
700 __asm__
volatile(SFENCE:::"memory");
701 __asm__
volatile(EMMS:::"memory");
708 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
/* rgb24to16: packed 24-bit -> RGB565 — like rgb24tobgr16 but with the
 * channel order flipped in the asm (psllq $8 into red mask, psrlq $19 into
 * blue mask). Scalar tail is shared in form with rgb24tobgr16; which byte
 * is b vs r comes from the (missing) tail loads — TODO confirm upstream.
 * NOTE(review): lossy extraction; leading integers are fused line numbers. */
712 static inline void RENAME(rgb24to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
714 const uint8_t *s
= src
;
717 const uint8_t *mm_end
;
719 uint16_t *d
= (uint16_t *)dst
;
722 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
724 "movq %0, %%mm7 \n\t"
725 "movq %1, %%mm6 \n\t"
726 ::"m"(red_16mask
),"m"(green_16mask
));
732 "movd %1, %%mm0 \n\t"
733 "movd 3%1, %%mm3 \n\t"
734 "punpckldq 6%1, %%mm0 \n\t"
735 "punpckldq 9%1, %%mm3 \n\t"
736 "movq %%mm0, %%mm1 \n\t"
737 "movq %%mm0, %%mm2 \n\t"
738 "movq %%mm3, %%mm4 \n\t"
739 "movq %%mm3, %%mm5 \n\t"
740 "psllq $8, %%mm0 \n\t"
741 "psllq $8, %%mm3 \n\t"
742 "pand %%mm7, %%mm0 \n\t"
743 "pand %%mm7, %%mm3 \n\t"
744 "psrlq $5, %%mm1 \n\t"
745 "psrlq $5, %%mm4 \n\t"
746 "pand %%mm6, %%mm1 \n\t"
747 "pand %%mm6, %%mm4 \n\t"
748 "psrlq $19, %%mm2 \n\t"
749 "psrlq $19, %%mm5 \n\t"
750 "pand %2, %%mm2 \n\t"
751 "pand %2, %%mm5 \n\t"
752 "por %%mm1, %%mm0 \n\t"
753 "por %%mm4, %%mm3 \n\t"
754 "por %%mm2, %%mm0 \n\t"
755 "por %%mm5, %%mm3 \n\t"
756 "psllq $16, %%mm3 \n\t"
757 "por %%mm3, %%mm0 \n\t"
758 MOVNTQ
" %%mm0, %0 \n\t"
759 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
763 __asm__
volatile(SFENCE:::"memory");
764 __asm__
volatile(EMMS:::"memory");
771 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
/* rgb24tobgr15: packed 24-bit -> BGR555 using the 15-bit masks (psrlq $3/$6/$9
 * with blue/green/red 15-bit masks). Scalar tail: b>>3 | (g&0xF8)<<2 |
 * (r&0xF8)<<7.
 * NOTE(review): lossy extraction — interior lines (including the tail's b/g/r
 * loads) are missing; leading integers are fused original line numbers. */
775 static inline void RENAME(rgb24tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
777 const uint8_t *s
= src
;
780 const uint8_t *mm_end
;
782 uint16_t *d
= (uint16_t *)dst
;
785 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
787 "movq %0, %%mm7 \n\t"
788 "movq %1, %%mm6 \n\t"
789 ::"m"(red_15mask
),"m"(green_15mask
));
795 "movd %1, %%mm0 \n\t"
796 "movd 3%1, %%mm3 \n\t"
797 "punpckldq 6%1, %%mm0 \n\t"
798 "punpckldq 9%1, %%mm3 \n\t"
799 "movq %%mm0, %%mm1 \n\t"
800 "movq %%mm0, %%mm2 \n\t"
801 "movq %%mm3, %%mm4 \n\t"
802 "movq %%mm3, %%mm5 \n\t"
803 "psrlq $3, %%mm0 \n\t"
804 "psrlq $3, %%mm3 \n\t"
805 "pand %2, %%mm0 \n\t"
806 "pand %2, %%mm3 \n\t"
807 "psrlq $6, %%mm1 \n\t"
808 "psrlq $6, %%mm4 \n\t"
809 "pand %%mm6, %%mm1 \n\t"
810 "pand %%mm6, %%mm4 \n\t"
811 "psrlq $9, %%mm2 \n\t"
812 "psrlq $9, %%mm5 \n\t"
813 "pand %%mm7, %%mm2 \n\t"
814 "pand %%mm7, %%mm5 \n\t"
815 "por %%mm1, %%mm0 \n\t"
816 "por %%mm4, %%mm3 \n\t"
817 "por %%mm2, %%mm0 \n\t"
818 "por %%mm5, %%mm3 \n\t"
819 "psllq $16, %%mm3 \n\t"
820 "por %%mm3, %%mm0 \n\t"
821 MOVNTQ
" %%mm0, %0 \n\t"
822 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
826 __asm__
volatile(SFENCE:::"memory");
827 __asm__
volatile(EMMS:::"memory");
834 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
/* rgb24to15: packed 24-bit -> RGB555 — 15-bit variant with channel order
 * flipped in the asm (psllq $7 into red mask, psrlq $19 into blue mask).
 * Scalar tail matches rgb24tobgr15 in form; the b/g/r tail loads are among
 * the lines lost in extraction — TODO confirm upstream.
 * NOTE(review): leading integers are fused original line numbers. */
838 static inline void RENAME(rgb24to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
840 const uint8_t *s
= src
;
843 const uint8_t *mm_end
;
845 uint16_t *d
= (uint16_t *)dst
;
848 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
850 "movq %0, %%mm7 \n\t"
851 "movq %1, %%mm6 \n\t"
852 ::"m"(red_15mask
),"m"(green_15mask
));
858 "movd %1, %%mm0 \n\t"
859 "movd 3%1, %%mm3 \n\t"
860 "punpckldq 6%1, %%mm0 \n\t"
861 "punpckldq 9%1, %%mm3 \n\t"
862 "movq %%mm0, %%mm1 \n\t"
863 "movq %%mm0, %%mm2 \n\t"
864 "movq %%mm3, %%mm4 \n\t"
865 "movq %%mm3, %%mm5 \n\t"
866 "psllq $7, %%mm0 \n\t"
867 "psllq $7, %%mm3 \n\t"
868 "pand %%mm7, %%mm0 \n\t"
869 "pand %%mm7, %%mm3 \n\t"
870 "psrlq $6, %%mm1 \n\t"
871 "psrlq $6, %%mm4 \n\t"
872 "pand %%mm6, %%mm1 \n\t"
873 "pand %%mm6, %%mm4 \n\t"
874 "psrlq $19, %%mm2 \n\t"
875 "psrlq $19, %%mm5 \n\t"
876 "pand %2, %%mm2 \n\t"
877 "pand %2, %%mm5 \n\t"
878 "por %%mm1, %%mm0 \n\t"
879 "por %%mm4, %%mm3 \n\t"
880 "por %%mm2, %%mm0 \n\t"
881 "por %%mm5, %%mm3 \n\t"
882 "psllq $16, %%mm3 \n\t"
883 "por %%mm3, %%mm0 \n\t"
884 MOVNTQ
" %%mm0, %0 \n\t"
885 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
889 __asm__
volatile(SFENCE:::"memory");
890 __asm__
volatile(EMMS:::"memory");
897 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
902 I use less accurate approximation here by simply left-shifting the input
903 value and filling the low order bits with zeroes. This method improves PNG
904 compression but this scheme cannot reproduce white exactly, since it does
905 not generate an all-ones maximum value; the net effect is to darken the
908 The better method should be "left bit replication":
918 | leftmost bits repeated to fill open bits
/* rgb15tobgr24: RGB555 -> packed 24-bit. First stage expands 555 fields to
 * 8-bit lanes by shift (left bit replication is deliberately NOT done — see
 * the accuracy comment above this function); second stage is the "borrowed
 * 32 to 24" repack using mask24l..mask24hhhh. Scalar tail expands one 555
 * word into three bytes.
 * NOTE(review): lossy extraction — loop headers, braces, asm statement
 * openers/closers and tail loads are missing; leading integers are fused
 * original line numbers. */
922 static inline void RENAME(rgb15tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
926 const uint16_t *mm_end
;
929 const uint16_t *s
= (const uint16_t*)src
;
930 end
= s
+ src_size
/2;
932 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
938 "movq %1, %%mm0 \n\t"
939 "movq %1, %%mm1 \n\t"
940 "movq %1, %%mm2 \n\t"
941 "pand %2, %%mm0 \n\t"
942 "pand %3, %%mm1 \n\t"
943 "pand %4, %%mm2 \n\t"
944 "psllq $3, %%mm0 \n\t"
945 "psrlq $2, %%mm1 \n\t"
946 "psrlq $7, %%mm2 \n\t"
947 "movq %%mm0, %%mm3 \n\t"
948 "movq %%mm1, %%mm4 \n\t"
949 "movq %%mm2, %%mm5 \n\t"
950 "punpcklwd %5, %%mm0 \n\t"
951 "punpcklwd %5, %%mm1 \n\t"
952 "punpcklwd %5, %%mm2 \n\t"
953 "punpckhwd %5, %%mm3 \n\t"
954 "punpckhwd %5, %%mm4 \n\t"
955 "punpckhwd %5, %%mm5 \n\t"
956 "psllq $8, %%mm1 \n\t"
957 "psllq $16, %%mm2 \n\t"
958 "por %%mm1, %%mm0 \n\t"
959 "por %%mm2, %%mm0 \n\t"
960 "psllq $8, %%mm4 \n\t"
961 "psllq $16, %%mm5 \n\t"
962 "por %%mm4, %%mm3 \n\t"
963 "por %%mm5, %%mm3 \n\t"
965 "movq %%mm0, %%mm6 \n\t"
966 "movq %%mm3, %%mm7 \n\t"
968 "movq 8%1, %%mm0 \n\t"
969 "movq 8%1, %%mm1 \n\t"
970 "movq 8%1, %%mm2 \n\t"
971 "pand %2, %%mm0 \n\t"
972 "pand %3, %%mm1 \n\t"
973 "pand %4, %%mm2 \n\t"
974 "psllq $3, %%mm0 \n\t"
975 "psrlq $2, %%mm1 \n\t"
976 "psrlq $7, %%mm2 \n\t"
977 "movq %%mm0, %%mm3 \n\t"
978 "movq %%mm1, %%mm4 \n\t"
979 "movq %%mm2, %%mm5 \n\t"
980 "punpcklwd %5, %%mm0 \n\t"
981 "punpcklwd %5, %%mm1 \n\t"
982 "punpcklwd %5, %%mm2 \n\t"
983 "punpckhwd %5, %%mm3 \n\t"
984 "punpckhwd %5, %%mm4 \n\t"
985 "punpckhwd %5, %%mm5 \n\t"
986 "psllq $8, %%mm1 \n\t"
987 "psllq $16, %%mm2 \n\t"
988 "por %%mm1, %%mm0 \n\t"
989 "por %%mm2, %%mm0 \n\t"
990 "psllq $8, %%mm4 \n\t"
991 "psllq $16, %%mm5 \n\t"
992 "por %%mm4, %%mm3 \n\t"
993 "por %%mm5, %%mm3 \n\t"
996 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
), "m"(mmx_null
)
998 /* borrowed 32 to 24 */
1000 "movq %%mm0, %%mm4 \n\t"
1001 "movq %%mm3, %%mm5 \n\t"
1002 "movq %%mm6, %%mm0 \n\t"
1003 "movq %%mm7, %%mm1 \n\t"
1005 "movq %%mm4, %%mm6 \n\t"
1006 "movq %%mm5, %%mm7 \n\t"
1007 "movq %%mm0, %%mm2 \n\t"
1008 "movq %%mm1, %%mm3 \n\t"
1010 "psrlq $8, %%mm2 \n\t"
1011 "psrlq $8, %%mm3 \n\t"
1012 "psrlq $8, %%mm6 \n\t"
1013 "psrlq $8, %%mm7 \n\t"
1014 "pand %2, %%mm0 \n\t"
1015 "pand %2, %%mm1 \n\t"
1016 "pand %2, %%mm4 \n\t"
1017 "pand %2, %%mm5 \n\t"
1018 "pand %3, %%mm2 \n\t"
1019 "pand %3, %%mm3 \n\t"
1020 "pand %3, %%mm6 \n\t"
1021 "pand %3, %%mm7 \n\t"
1022 "por %%mm2, %%mm0 \n\t"
1023 "por %%mm3, %%mm1 \n\t"
1024 "por %%mm6, %%mm4 \n\t"
1025 "por %%mm7, %%mm5 \n\t"
1027 "movq %%mm1, %%mm2 \n\t"
1028 "movq %%mm4, %%mm3 \n\t"
1029 "psllq $48, %%mm2 \n\t"
1030 "psllq $32, %%mm3 \n\t"
1031 "pand %4, %%mm2 \n\t"
1032 "pand %5, %%mm3 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psrlq $16, %%mm1 \n\t"
1035 "psrlq $32, %%mm4 \n\t"
1036 "psllq $16, %%mm5 \n\t"
1037 "por %%mm3, %%mm1 \n\t"
1038 "pand %6, %%mm5 \n\t"
1039 "por %%mm5, %%mm4 \n\t"
1041 MOVNTQ
" %%mm0, %0 \n\t"
1042 MOVNTQ
" %%mm1, 8%0 \n\t"
1043 MOVNTQ
" %%mm4, 16%0"
1046 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1051 __asm__
volatile(SFENCE:::"memory");
1052 __asm__
volatile(EMMS:::"memory");
1056 register uint16_t bgr
;
1058 *d
++ = (bgr
&0x1F)<<3;
1059 *d
++ = (bgr
&0x3E0)>>2;
1060 *d
++ = (bgr
&0x7C00)>>7;
/* rgb16tobgr24: RGB565 -> packed 24-bit — same two-stage structure as
 * rgb15tobgr24 but with 565 masks and shifts (psrlq $3 for the 6-bit green,
 * psrlq $8 for red). Scalar tail expands one 565 word into three bytes.
 * NOTE(review): lossy extraction — loop headers, braces and asm statement
 * openers/closers are missing; leading integers are fused original line
 * numbers. */
1064 static inline void RENAME(rgb16tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1066 const uint16_t *end
;
1068 const uint16_t *mm_end
;
1070 uint8_t *d
= (uint8_t *)dst
;
1071 const uint16_t *s
= (const uint16_t *)src
;
1072 end
= s
+ src_size
/2;
1074 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1079 PREFETCH
" 32%1 \n\t"
1080 "movq %1, %%mm0 \n\t"
1081 "movq %1, %%mm1 \n\t"
1082 "movq %1, %%mm2 \n\t"
1083 "pand %2, %%mm0 \n\t"
1084 "pand %3, %%mm1 \n\t"
1085 "pand %4, %%mm2 \n\t"
1086 "psllq $3, %%mm0 \n\t"
1087 "psrlq $3, %%mm1 \n\t"
1088 "psrlq $8, %%mm2 \n\t"
1089 "movq %%mm0, %%mm3 \n\t"
1090 "movq %%mm1, %%mm4 \n\t"
1091 "movq %%mm2, %%mm5 \n\t"
1092 "punpcklwd %5, %%mm0 \n\t"
1093 "punpcklwd %5, %%mm1 \n\t"
1094 "punpcklwd %5, %%mm2 \n\t"
1095 "punpckhwd %5, %%mm3 \n\t"
1096 "punpckhwd %5, %%mm4 \n\t"
1097 "punpckhwd %5, %%mm5 \n\t"
1098 "psllq $8, %%mm1 \n\t"
1099 "psllq $16, %%mm2 \n\t"
1100 "por %%mm1, %%mm0 \n\t"
1101 "por %%mm2, %%mm0 \n\t"
1102 "psllq $8, %%mm4 \n\t"
1103 "psllq $16, %%mm5 \n\t"
1104 "por %%mm4, %%mm3 \n\t"
1105 "por %%mm5, %%mm3 \n\t"
1107 "movq %%mm0, %%mm6 \n\t"
1108 "movq %%mm3, %%mm7 \n\t"
1110 "movq 8%1, %%mm0 \n\t"
1111 "movq 8%1, %%mm1 \n\t"
1112 "movq 8%1, %%mm2 \n\t"
1113 "pand %2, %%mm0 \n\t"
1114 "pand %3, %%mm1 \n\t"
1115 "pand %4, %%mm2 \n\t"
1116 "psllq $3, %%mm0 \n\t"
1117 "psrlq $3, %%mm1 \n\t"
1118 "psrlq $8, %%mm2 \n\t"
1119 "movq %%mm0, %%mm3 \n\t"
1120 "movq %%mm1, %%mm4 \n\t"
1121 "movq %%mm2, %%mm5 \n\t"
1122 "punpcklwd %5, %%mm0 \n\t"
1123 "punpcklwd %5, %%mm1 \n\t"
1124 "punpcklwd %5, %%mm2 \n\t"
1125 "punpckhwd %5, %%mm3 \n\t"
1126 "punpckhwd %5, %%mm4 \n\t"
1127 "punpckhwd %5, %%mm5 \n\t"
1128 "psllq $8, %%mm1 \n\t"
1129 "psllq $16, %%mm2 \n\t"
1130 "por %%mm1, %%mm0 \n\t"
1131 "por %%mm2, %%mm0 \n\t"
1132 "psllq $8, %%mm4 \n\t"
1133 "psllq $16, %%mm5 \n\t"
1134 "por %%mm4, %%mm3 \n\t"
1135 "por %%mm5, %%mm3 \n\t"
1137 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
),"m"(mmx_null
)
1139 /* borrowed 32 to 24 */
1141 "movq %%mm0, %%mm4 \n\t"
1142 "movq %%mm3, %%mm5 \n\t"
1143 "movq %%mm6, %%mm0 \n\t"
1144 "movq %%mm7, %%mm1 \n\t"
1146 "movq %%mm4, %%mm6 \n\t"
1147 "movq %%mm5, %%mm7 \n\t"
1148 "movq %%mm0, %%mm2 \n\t"
1149 "movq %%mm1, %%mm3 \n\t"
1151 "psrlq $8, %%mm2 \n\t"
1152 "psrlq $8, %%mm3 \n\t"
1153 "psrlq $8, %%mm6 \n\t"
1154 "psrlq $8, %%mm7 \n\t"
1155 "pand %2, %%mm0 \n\t"
1156 "pand %2, %%mm1 \n\t"
1157 "pand %2, %%mm4 \n\t"
1158 "pand %2, %%mm5 \n\t"
1159 "pand %3, %%mm2 \n\t"
1160 "pand %3, %%mm3 \n\t"
1161 "pand %3, %%mm6 \n\t"
1162 "pand %3, %%mm7 \n\t"
1163 "por %%mm2, %%mm0 \n\t"
1164 "por %%mm3, %%mm1 \n\t"
1165 "por %%mm6, %%mm4 \n\t"
1166 "por %%mm7, %%mm5 \n\t"
1168 "movq %%mm1, %%mm2 \n\t"
1169 "movq %%mm4, %%mm3 \n\t"
1170 "psllq $48, %%mm2 \n\t"
1171 "psllq $32, %%mm3 \n\t"
1172 "pand %4, %%mm2 \n\t"
1173 "pand %5, %%mm3 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "psrlq $16, %%mm1 \n\t"
1176 "psrlq $32, %%mm4 \n\t"
1177 "psllq $16, %%mm5 \n\t"
1178 "por %%mm3, %%mm1 \n\t"
1179 "pand %6, %%mm5 \n\t"
1180 "por %%mm5, %%mm4 \n\t"
1182 MOVNTQ
" %%mm0, %0 \n\t"
1183 MOVNTQ
" %%mm1, 8%0 \n\t"
1184 MOVNTQ
" %%mm4, 16%0"
1187 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1192 __asm__
volatile(SFENCE:::"memory");
1193 __asm__
volatile(EMMS:::"memory");
1197 register uint16_t bgr
;
1199 *d
++ = (bgr
&0x1F)<<3;
1200 *d
++ = (bgr
&0x7E0)>>3;
1201 *d
++ = (bgr
&0xF800)>>8;
/* PACK_RGB32: interleaves planar byte lanes (mm0=B, mm1=G, mm2=R, mm6=all-FF
 * alpha, mm7=zero) into 4 RGB32 pixels and stores them with MOVNTQ.
 * Expected register contents on entry (from the original comment): */
1206 * mm0 = 00 B3 00 B2 00 B1 00 B0
1207 * mm1 = 00 G3 00 G2 00 G1 00 G0
1208 * mm2 = 00 R3 00 R2 00 R1 00 R0
1209 * mm6 = FF FF FF FF FF FF FF FF
1210 * mm7 = 00 00 00 00 00 00 00 00
1212 #define PACK_RGB32 \
1213 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1214 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1215 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1216 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1217 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1218 "movq %%mm0, %%mm3 \n\t" \
1219 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1220 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1221 MOVNTQ" %%mm0, %0 \n\t" \
1222 MOVNTQ" %%mm3, 8%0 \n\t" \
/* rgb15to32: RGB555 -> RGB32. MMX path expands the three 5-bit fields with
 * the 15-bit masks and (presumably) finishes with PACK_RGB32 — the invocation
 * line was lost in extraction, TODO confirm. Scalar tail has big- and
 * little-endian byte orders.
 * NOTE(review): leading integers are fused original line numbers. */
1224 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1226 const uint16_t *end
;
1228 const uint16_t *mm_end
;
1231 const uint16_t *s
= (const uint16_t *)src
;
1232 end
= s
+ src_size
/2;
1234 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1235 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory")/* mm7 = 0 for PACK_RGB32 */;
1236 __asm__
volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory")/* mm6 = all-ones alpha */;
1241 PREFETCH
" 32%1 \n\t"
1242 "movq %1, %%mm0 \n\t"
1243 "movq %1, %%mm1 \n\t"
1244 "movq %1, %%mm2 \n\t"
1245 "pand %2, %%mm0 \n\t"
1246 "pand %3, %%mm1 \n\t"
1247 "pand %4, %%mm2 \n\t"
1248 "psllq $3, %%mm0 \n\t"
1249 "psrlq $2, %%mm1 \n\t"
1250 "psrlq $7, %%mm2 \n\t"
1253 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
)
1258 __asm__
volatile(SFENCE:::"memory");
1259 __asm__
volatile(EMMS:::"memory");
1263 #if 0 //slightly slower on Athlon
1265 *((uint32_t*)d
)++ = ((bgr
&0x1F)<<3) + ((bgr
&0x3E0)<<6) + ((bgr
&0x7C00)<<9);
1267 register uint16_t bgr
;
1269 #ifdef WORDS_BIGENDIAN
1271 *d
++ = (bgr
&0x7C00)>>7;
1272 *d
++ = (bgr
&0x3E0)>>2;
1273 *d
++ = (bgr
&0x1F)<<3;
1275 *d
++ = (bgr
&0x1F)<<3;
1276 *d
++ = (bgr
&0x3E0)>>2;
1277 *d
++ = (bgr
&0x7C00)>>7;
/* rgb16to32: RGB565 -> RGB32 — same structure as rgb15to32 with the 16-bit
 * masks and shifts (psrlq $3 / $8); presumably also finished by PACK_RGB32
 * (invocation lost in extraction — TODO confirm).
 * NOTE(review): leading integers are fused original line numbers. */
1285 static inline void RENAME(rgb16to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1287 const uint16_t *end
;
1289 const uint16_t *mm_end
;
1292 const uint16_t *s
= (const uint16_t*)src
;
1293 end
= s
+ src_size
/2;
1295 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1296 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1297 __asm__
volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1302 PREFETCH
" 32%1 \n\t"
1303 "movq %1, %%mm0 \n\t"
1304 "movq %1, %%mm1 \n\t"
1305 "movq %1, %%mm2 \n\t"
1306 "pand %2, %%mm0 \n\t"
1307 "pand %3, %%mm1 \n\t"
1308 "pand %4, %%mm2 \n\t"
1309 "psllq $3, %%mm0 \n\t"
1310 "psrlq $3, %%mm1 \n\t"
1311 "psrlq $8, %%mm2 \n\t"
1314 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
)
1319 __asm__
volatile(SFENCE:::"memory");
1320 __asm__
volatile(EMMS:::"memory");
1324 register uint16_t bgr
;
1326 #ifdef WORDS_BIGENDIAN
1328 *d
++ = (bgr
&0xF800)>>8;
1329 *d
++ = (bgr
&0x7E0)>>3;
1330 *d
++ = (bgr
&0x1F)<<3;
1332 *d
++ = (bgr
&0x1F)<<3;
1333 *d
++ = (bgr
&0x7E0)>>3;
1334 *d
++ = (bgr
&0xF800)>>8;
/* rgb32tobgr32: swaps R and B in 32-bit pixels, 4 at a time. The MMX2 path
 * uses pshufw; an alternative mask/shift path follows (the selecting #if was
 * lost in extraction). The negative-index trick (idx = 15 - src_size,
 * s = src - idx) lets the loop count idx up toward zero. Scalar tail:
 * (v>>16) + (v & 0xff00ff00) + (v<<16) swaps the two outer bytes.
 * NOTE(review): leading integers are fused original line numbers. */
1340 static inline void RENAME(rgb32tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1342 long idx
= 15 - src_size
;
1343 const uint8_t *s
= src
-idx
;
1344 uint8_t *d
= dst
-idx
;
1349 PREFETCH
" (%1, %0) \n\t"
1350 "movq %3, %%mm7 \n\t"
1351 "pxor %4, %%mm7 \n\t"
1352 "movq %%mm7, %%mm6 \n\t"
1353 "pxor %5, %%mm7 \n\t"
1356 PREFETCH
" 32(%1, %0) \n\t"
1357 "movq (%1, %0), %%mm0 \n\t"
1358 "movq 8(%1, %0), %%mm1 \n\t"
1360 "pshufw $177, %%mm0, %%mm3 \n\t"
1361 "pshufw $177, %%mm1, %%mm5 \n\t"
1362 "pand %%mm7, %%mm0 \n\t"
1363 "pand %%mm6, %%mm3 \n\t"
1364 "pand %%mm7, %%mm1 \n\t"
1365 "pand %%mm6, %%mm5 \n\t"
1366 "por %%mm3, %%mm0 \n\t"
1367 "por %%mm5, %%mm1 \n\t"
1369 "movq %%mm0, %%mm2 \n\t"
1370 "movq %%mm1, %%mm4 \n\t"
1371 "pand %%mm7, %%mm0 \n\t"
1372 "pand %%mm6, %%mm2 \n\t"
1373 "pand %%mm7, %%mm1 \n\t"
1374 "pand %%mm6, %%mm4 \n\t"
1375 "movq %%mm2, %%mm3 \n\t"
1376 "movq %%mm4, %%mm5 \n\t"
1377 "pslld $16, %%mm2 \n\t"
1378 "psrld $16, %%mm3 \n\t"
1379 "pslld $16, %%mm4 \n\t"
1380 "psrld $16, %%mm5 \n\t"
1381 "por %%mm2, %%mm0 \n\t"
1382 "por %%mm4, %%mm1 \n\t"
1383 "por %%mm3, %%mm0 \n\t"
1384 "por %%mm5, %%mm1 \n\t"
1386 MOVNTQ
" %%mm0, (%2, %0) \n\t"
1387 MOVNTQ
" %%mm1, 8(%2, %0) \n\t"
1394 : "r" (s
), "r" (d
), "m" (mask32b
), "m" (mask32r
), "m" (mmx_one
)
1397 for (; idx
<15; idx
+=4) {
1398 register int v
= *(const uint32_t *)&s
[idx
], g
= v
& 0xff00ff00;
1400 *(uint32_t *)&d
[idx
] = (v
>>16) + g
+ (v
<<16);
/* rgb24tobgr24: swaps R and B in packed 24-bit data, 8 pixels (24 bytes) per
 * iteration, by masking three overlapping 8-byte loads with mask24r/g/b
 * (MANGLE'd globals) and merging. Uses the same negative-counter trick as
 * rgb32tobgr32 (mmx_size = 23 - src_size). The scalar tail swaps the
 * remaining pixels bytewise (its dst[i+0] = src[i+2] line was lost in
 * extraction — TODO confirm upstream).
 * NOTE(review): leading integers are fused original line numbers. */
1404 static inline void RENAME(rgb24tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1408 long mmx_size
= 23 - src_size
;
1410 "test %%"REG_a
", %%"REG_a
" \n\t"
1412 "movq "MANGLE(mask24r
)", %%mm5 \n\t"
1413 "movq "MANGLE(mask24g
)", %%mm6 \n\t"
1414 "movq "MANGLE(mask24b
)", %%mm7 \n\t"
1417 PREFETCH
" 32(%1, %%"REG_a
") \n\t"
1418 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1419 "movq (%1, %%"REG_a
"), %%mm1 \n\t" // BGR BGR BG
1420 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t" // R BGR BGR B
1421 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1422 "pand %%mm5, %%mm0 \n\t"
1423 "pand %%mm6, %%mm1 \n\t"
1424 "pand %%mm7, %%mm2 \n\t"
1425 "por %%mm0, %%mm1 \n\t"
1426 "por %%mm2, %%mm1 \n\t"
1427 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1428 MOVNTQ
" %%mm1, (%2, %%"REG_a
") \n\t" // RGB RGB RG
1429 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t" // R BGR BGR B
1430 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t" // GR BGR BGR
1431 "pand %%mm7, %%mm0 \n\t"
1432 "pand %%mm5, %%mm1 \n\t"
1433 "pand %%mm6, %%mm2 \n\t"
1434 "por %%mm0, %%mm1 \n\t"
1435 "por %%mm2, %%mm1 \n\t"
1436 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t" // R BGR BGR B
1437 MOVNTQ
" %%mm1, 8(%2, %%"REG_a
") \n\t" // B RGB RGB R
1438 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t" // GR BGR BGR
1439 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t" // BGR BGR BG
1440 "pand %%mm6, %%mm0 \n\t"
1441 "pand %%mm7, %%mm1 \n\t"
1442 "pand %%mm5, %%mm2 \n\t"
1443 "por %%mm0, %%mm1 \n\t"
1444 "por %%mm2, %%mm1 \n\t"
1445 MOVNTQ
" %%mm1, 16(%2, %%"REG_a
") \n\t"
1446 "add $24, %%"REG_a
" \n\t"
1450 : "r" (src
-mmx_size
), "r"(dst
-mmx_size
)
1453 __asm__
volatile(SFENCE:::"memory");
1454 __asm__
volatile(EMMS:::"memory");
1456 if (mmx_size
==23) return; //finished, was multiple of 8
1460 src_size
= 23-mmx_size
;
1464 for (i
=0; i
<src_size
; i
+=3)
1468 dst
[i
+ 1] = src
[i
+ 1];
1469 dst
[i
+ 2] = src
[i
+ 0];
1474 static inline void RENAME(yuvPlanartoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1475 long width
, long height
,
1476 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1479 const long chromWidth
= width
>>1;
1480 for (y
=0; y
<height
; y
++)
1483 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1485 "xor %%"REG_a
", %%"REG_a
" \n\t"
1488 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1489 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1490 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1491 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1492 "movq %%mm0, %%mm2 \n\t" // U(0)
1493 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1494 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1495 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1497 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1498 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1499 "movq %%mm3, %%mm4 \n\t" // Y(0)
1500 "movq %%mm5, %%mm6 \n\t" // Y(8)
1501 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1502 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1503 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1504 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1506 MOVNTQ
" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1507 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1508 MOVNTQ
" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1509 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1511 "add $8, %%"REG_a
" \n\t"
1512 "cmp %4, %%"REG_a
" \n\t"
1514 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1519 #if ARCH_ALPHA && HAVE_MVI
1520 #define pl2yuy2(n) \
1525 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1526 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1527 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1528 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1529 yuv1 = (u << 8) + (v << 24); \
1536 uint64_t *qdst
= (uint64_t *) dst
;
1537 uint64_t *qdst2
= (uint64_t *) (dst
+ dstStride
);
1538 const uint32_t *yc
= (uint32_t *) ysrc
;
1539 const uint32_t *yc2
= (uint32_t *) (ysrc
+ lumStride
);
1540 const uint16_t *uc
= (uint16_t*) usrc
, *vc
= (uint16_t*) vsrc
;
1541 for (i
= 0; i
< chromWidth
; i
+= 8){
1542 uint64_t y1
, y2
, yuv1
, yuv2
;
1545 __asm__("ldq $31,64(%0)" :: "r"(yc
));
1546 __asm__("ldq $31,64(%0)" :: "r"(yc2
));
1547 __asm__("ldq $31,64(%0)" :: "r"(uc
));
1548 __asm__("ldq $31,64(%0)" :: "r"(vc
));
1566 #elif HAVE_FAST_64BIT
1568 uint64_t *ldst
= (uint64_t *) dst
;
1569 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1570 for (i
= 0; i
< chromWidth
; i
+= 2){
1572 k
= yc
[0] + (uc
[0] << 8) +
1573 (yc
[1] << 16) + (vc
[0] << 24);
1574 l
= yc
[2] + (uc
[1] << 8) +
1575 (yc
[3] << 16) + (vc
[1] << 24);
1576 *ldst
++ = k
+ (l
<< 32);
1583 int i
, *idst
= (int32_t *) dst
;
1584 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1585 for (i
= 0; i
< chromWidth
; i
++){
1586 #ifdef WORDS_BIGENDIAN
1587 *idst
++ = (yc
[0] << 24)+ (uc
[0] << 16) +
1588 (yc
[1] << 8) + (vc
[0] << 0);
1590 *idst
++ = yc
[0] + (uc
[0] << 8) +
1591 (yc
[1] << 16) + (vc
[0] << 24);
1599 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1)
1601 usrc
+= chromStride
;
1602 vsrc
+= chromStride
;
1608 __asm__( EMMS
" \n\t"
1615 * Height should be a multiple of 2 and width should be a multiple of 16.
1616 * (If this is a problem for anyone then tell me, and I will fix it.)
1618 static inline void RENAME(yv12toyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1619 long width
, long height
,
1620 long lumStride
, long chromStride
, long dstStride
)
1622 //FIXME interpolate chroma
1623 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1626 static inline void RENAME(yuvPlanartouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1627 long width
, long height
,
1628 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1631 const long chromWidth
= width
>>1;
1632 for (y
=0; y
<height
; y
++)
1635 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1637 "xor %%"REG_a
", %%"REG_a
" \n\t"
1640 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1641 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1642 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1643 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1644 "movq %%mm0, %%mm2 \n\t" // U(0)
1645 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1646 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1647 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1649 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1650 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1651 "movq %%mm0, %%mm4 \n\t" // Y(0)
1652 "movq %%mm2, %%mm6 \n\t" // Y(8)
1653 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1654 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1655 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1656 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1658 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1659 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1660 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1661 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1663 "add $8, %%"REG_a
" \n\t"
1664 "cmp %4, %%"REG_a
" \n\t"
1666 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1670 //FIXME adapt the Alpha ASM code from yv12->yuy2
1674 uint64_t *ldst
= (uint64_t *) dst
;
1675 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1676 for (i
= 0; i
< chromWidth
; i
+= 2){
1678 k
= uc
[0] + (yc
[0] << 8) +
1679 (vc
[0] << 16) + (yc
[1] << 24);
1680 l
= uc
[1] + (yc
[2] << 8) +
1681 (vc
[1] << 16) + (yc
[3] << 24);
1682 *ldst
++ = k
+ (l
<< 32);
1689 int i
, *idst
= (int32_t *) dst
;
1690 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1691 for (i
= 0; i
< chromWidth
; i
++){
1692 #ifdef WORDS_BIGENDIAN
1693 *idst
++ = (uc
[0] << 24)+ (yc
[0] << 16) +
1694 (vc
[0] << 8) + (yc
[1] << 0);
1696 *idst
++ = uc
[0] + (yc
[0] << 8) +
1697 (vc
[0] << 16) + (yc
[1] << 24);
1705 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1)
1707 usrc
+= chromStride
;
1708 vsrc
+= chromStride
;
1714 __asm__( EMMS
" \n\t"
1721 * Height should be a multiple of 2 and width should be a multiple of 16
1722 * (If this is a problem for anyone then tell me, and I will fix it.)
1724 static inline void RENAME(yv12touyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1725 long width
, long height
,
1726 long lumStride
, long chromStride
, long dstStride
)
1728 //FIXME interpolate chroma
1729 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1733 * Width should be a multiple of 16.
1735 static inline void RENAME(yuv422ptouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1736 long width
, long height
,
1737 long lumStride
, long chromStride
, long dstStride
)
1739 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1743 * Width should be a multiple of 16.
1745 static inline void RENAME(yuv422ptoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1746 long width
, long height
,
1747 long lumStride
, long chromStride
, long dstStride
)
1749 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1753 * Height should be a multiple of 2 and width should be a multiple of 16.
1754 * (If this is a problem for anyone then tell me, and I will fix it.)
1756 static inline void RENAME(yuy2toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1757 long width
, long height
,
1758 long lumStride
, long chromStride
, long srcStride
)
1761 const long chromWidth
= width
>>1;
1762 for (y
=0; y
<height
; y
+=2)
1766 "xor %%"REG_a
", %%"REG_a
" \n\t"
1767 "pcmpeqw %%mm7, %%mm7 \n\t"
1768 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1771 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1772 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1773 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1774 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1775 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1776 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1777 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1778 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1779 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1780 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1781 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1783 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1785 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(8)
1786 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(12)
1787 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1788 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1789 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1790 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1791 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1792 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1793 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1794 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1796 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1798 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1799 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1800 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1801 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1802 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1803 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1804 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1805 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1807 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
1808 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
1810 "add $8, %%"REG_a
" \n\t"
1811 "cmp %4, %%"REG_a
" \n\t"
1813 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1814 : "memory", "%"REG_a
1821 "xor %%"REG_a
", %%"REG_a
" \n\t"
1824 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1825 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1826 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1827 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
1828 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
1829 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1830 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1831 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1832 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1833 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1834 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1836 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1837 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1839 "add $8, %%"REG_a
" \n\t"
1840 "cmp %4, %%"REG_a
" \n\t"
1843 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1844 : "memory", "%"REG_a
1848 for (i
=0; i
<chromWidth
; i
++)
1850 ydst
[2*i
+0] = src
[4*i
+0];
1851 udst
[i
] = src
[4*i
+1];
1852 ydst
[2*i
+1] = src
[4*i
+2];
1853 vdst
[i
] = src
[4*i
+3];
1858 for (i
=0; i
<chromWidth
; i
++)
1860 ydst
[2*i
+0] = src
[4*i
+0];
1861 ydst
[2*i
+1] = src
[4*i
+2];
1864 udst
+= chromStride
;
1865 vdst
+= chromStride
;
1870 __asm__
volatile( EMMS
" \n\t"
1876 static inline void RENAME(yvu9toyv12
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
,
1877 uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1878 long width
, long height
, long lumStride
, long chromStride
)
1881 memcpy(ydst
, ysrc
, width
*height
);
1883 /* XXX: implement upscaling for U,V */
1886 static inline void RENAME(planar2x
)(const uint8_t *src
, uint8_t *dst
, long srcWidth
, long srcHeight
, long srcStride
, long dstStride
)
1893 for (x
=0; x
<srcWidth
-1; x
++){
1894 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1895 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1897 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1901 for (y
=1; y
<srcHeight
; y
++){
1902 #if HAVE_MMX2 || HAVE_AMD3DNOW
1903 const long mmxSize
= srcWidth
&~15;
1905 "mov %4, %%"REG_a
" \n\t"
1907 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1908 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1909 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1910 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1911 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1912 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
1913 PAVGB
" %%mm0, %%mm5 \n\t"
1914 PAVGB
" %%mm0, %%mm3 \n\t"
1915 PAVGB
" %%mm0, %%mm5 \n\t"
1916 PAVGB
" %%mm0, %%mm3 \n\t"
1917 PAVGB
" %%mm1, %%mm4 \n\t"
1918 PAVGB
" %%mm1, %%mm2 \n\t"
1919 PAVGB
" %%mm1, %%mm4 \n\t"
1920 PAVGB
" %%mm1, %%mm2 \n\t"
1921 "movq %%mm5, %%mm7 \n\t"
1922 "movq %%mm4, %%mm6 \n\t"
1923 "punpcklbw %%mm3, %%mm5 \n\t"
1924 "punpckhbw %%mm3, %%mm7 \n\t"
1925 "punpcklbw %%mm2, %%mm4 \n\t"
1926 "punpckhbw %%mm2, %%mm6 \n\t"
1928 MOVNTQ
" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1929 MOVNTQ
" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1930 MOVNTQ
" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1931 MOVNTQ
" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1933 "movq %%mm5, (%2, %%"REG_a
", 2) \n\t"
1934 "movq %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1935 "movq %%mm4, (%3, %%"REG_a
", 2) \n\t"
1936 "movq %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1938 "add $8, %%"REG_a
" \n\t"
1940 :: "r" (src
+ mmxSize
), "r" (src
+ srcStride
+ mmxSize
),
1941 "r" (dst
+ mmxSize
*2), "r" (dst
+ dstStride
+ mmxSize
*2),
1947 const long mmxSize
=1;
1949 dst
[0 ]= (3*src
[0] + src
[srcStride
])>>2;
1950 dst
[dstStride
]= ( src
[0] + 3*src
[srcStride
])>>2;
1952 for (x
=mmxSize
-1; x
<srcWidth
-1; x
++){
1953 dst
[2*x
+1]= (3*src
[x
+0] + src
[x
+srcStride
+1])>>2;
1954 dst
[2*x
+dstStride
+2]= ( src
[x
+0] + 3*src
[x
+srcStride
+1])>>2;
1955 dst
[2*x
+dstStride
+1]= ( src
[x
+1] + 3*src
[x
+srcStride
])>>2;
1956 dst
[2*x
+2]= (3*src
[x
+1] + src
[x
+srcStride
])>>2;
1958 dst
[srcWidth
*2 -1 ]= (3*src
[srcWidth
-1] + src
[srcWidth
-1 + srcStride
])>>2;
1959 dst
[srcWidth
*2 -1 + dstStride
]= ( src
[srcWidth
-1] + 3*src
[srcWidth
-1 + srcStride
])>>2;
1969 for (x
=0; x
<srcWidth
-1; x
++){
1970 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1971 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1973 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1975 for (x
=0; x
<srcWidth
; x
++){
1982 __asm__
volatile( EMMS
" \n\t"
1989 * Height should be a multiple of 2 and width should be a multiple of 16.
1990 * (If this is a problem for anyone then tell me, and I will fix it.)
1991 * Chrominance data is only taken from every second line, others are ignored.
1992 * FIXME: Write HQ version.
1994 static inline void RENAME(uyvytoyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1995 long width
, long height
,
1996 long lumStride
, long chromStride
, long srcStride
)
1999 const long chromWidth
= width
>>1;
2000 for (y
=0; y
<height
; y
+=2)
2004 "xor %%"REG_a
", %%"REG_a
" \n\t"
2005 "pcmpeqw %%mm7, %%mm7 \n\t"
2006 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2009 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2010 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // UYVY UYVY(0)
2011 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(4)
2012 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2013 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2014 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2015 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2016 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2017 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2018 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2019 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2021 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
2023 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(8)
2024 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // UYVY UYVY(12)
2025 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2026 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2027 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2028 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2029 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2030 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2031 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2032 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2034 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
2036 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2037 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2038 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2039 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2040 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2041 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2042 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2043 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2045 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
2046 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
2048 "add $8, %%"REG_a
" \n\t"
2049 "cmp %4, %%"REG_a
" \n\t"
2051 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2052 : "memory", "%"REG_a
2059 "xor %%"REG_a
", %%"REG_a
" \n\t"
2062 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2063 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
2064 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
2065 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
2066 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
2067 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2068 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2069 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2070 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2071 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2072 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2074 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
2075 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
2077 "add $8, %%"REG_a
" \n\t"
2078 "cmp %4, %%"REG_a
" \n\t"
2081 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2082 : "memory", "%"REG_a
2086 for (i
=0; i
<chromWidth
; i
++)
2088 udst
[i
] = src
[4*i
+0];
2089 ydst
[2*i
+0] = src
[4*i
+1];
2090 vdst
[i
] = src
[4*i
+2];
2091 ydst
[2*i
+1] = src
[4*i
+3];
2096 for (i
=0; i
<chromWidth
; i
++)
2098 ydst
[2*i
+0] = src
[4*i
+1];
2099 ydst
[2*i
+1] = src
[4*i
+3];
2102 udst
+= chromStride
;
2103 vdst
+= chromStride
;
2108 __asm__
volatile( EMMS
" \n\t"
2115 * Height should be a multiple of 2 and width should be a multiple of 2.
2116 * (If this is a problem for anyone then tell me, and I will fix it.)
2117 * Chrominance data is only taken from every second line,
2118 * others are ignored in the C version.
2119 * FIXME: Write HQ version.
2121 static inline void RENAME(rgb24toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2122 long width
, long height
,
2123 long lumStride
, long chromStride
, long srcStride
)
2126 const long chromWidth
= width
>>1;
2128 for (y
=0; y
<height
-2; y
+=2)
2134 "mov %2, %%"REG_a
" \n\t"
2135 "movq "MANGLE(ff_bgr2YCoeff
)", %%mm6 \n\t"
2136 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2137 "pxor %%mm7, %%mm7 \n\t"
2138 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2141 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2142 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2143 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
2144 "punpcklbw %%mm7, %%mm0 \n\t"
2145 "punpcklbw %%mm7, %%mm1 \n\t"
2146 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
2147 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
2148 "punpcklbw %%mm7, %%mm2 \n\t"
2149 "punpcklbw %%mm7, %%mm3 \n\t"
2150 "pmaddwd %%mm6, %%mm0 \n\t"
2151 "pmaddwd %%mm6, %%mm1 \n\t"
2152 "pmaddwd %%mm6, %%mm2 \n\t"
2153 "pmaddwd %%mm6, %%mm3 \n\t"
2154 #ifndef FAST_BGR2YV12
2155 "psrad $8, %%mm0 \n\t"
2156 "psrad $8, %%mm1 \n\t"
2157 "psrad $8, %%mm2 \n\t"
2158 "psrad $8, %%mm3 \n\t"
2160 "packssdw %%mm1, %%mm0 \n\t"
2161 "packssdw %%mm3, %%mm2 \n\t"
2162 "pmaddwd %%mm5, %%mm0 \n\t"
2163 "pmaddwd %%mm5, %%mm2 \n\t"
2164 "packssdw %%mm2, %%mm0 \n\t"
2165 "psraw $7, %%mm0 \n\t"
2167 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2168 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
2169 "punpcklbw %%mm7, %%mm4 \n\t"
2170 "punpcklbw %%mm7, %%mm1 \n\t"
2171 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
2172 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
2173 "punpcklbw %%mm7, %%mm2 \n\t"
2174 "punpcklbw %%mm7, %%mm3 \n\t"
2175 "pmaddwd %%mm6, %%mm4 \n\t"
2176 "pmaddwd %%mm6, %%mm1 \n\t"
2177 "pmaddwd %%mm6, %%mm2 \n\t"
2178 "pmaddwd %%mm6, %%mm3 \n\t"
2179 #ifndef FAST_BGR2YV12
2180 "psrad $8, %%mm4 \n\t"
2181 "psrad $8, %%mm1 \n\t"
2182 "psrad $8, %%mm2 \n\t"
2183 "psrad $8, %%mm3 \n\t"
2185 "packssdw %%mm1, %%mm4 \n\t"
2186 "packssdw %%mm3, %%mm2 \n\t"
2187 "pmaddwd %%mm5, %%mm4 \n\t"
2188 "pmaddwd %%mm5, %%mm2 \n\t"
2189 "add $24, %%"REG_d
" \n\t"
2190 "packssdw %%mm2, %%mm4 \n\t"
2191 "psraw $7, %%mm4 \n\t"
2193 "packuswb %%mm4, %%mm0 \n\t"
2194 "paddusb "MANGLE(ff_bgr2YOffset
)", %%mm0 \n\t"
2196 MOVNTQ
" %%mm0, (%1, %%"REG_a
") \n\t"
2197 "add $8, %%"REG_a
" \n\t"
2199 : : "r" (src
+width
*3), "r" (ydst
+width
), "g" (-width
)
2200 : "%"REG_a
, "%"REG_d
2207 "mov %4, %%"REG_a
" \n\t"
2208 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2209 "movq "MANGLE(ff_bgr2UCoeff
)", %%mm6 \n\t"
2210 "pxor %%mm7, %%mm7 \n\t"
2211 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2212 "add %%"REG_d
", %%"REG_d
" \n\t"
2215 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2216 PREFETCH
" 64(%1, %%"REG_d
") \n\t"
2217 #if HAVE_MMX2 || HAVE_AMD3DNOW
2218 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
2219 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
2220 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
2221 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
2222 PAVGB
" %%mm1, %%mm0 \n\t"
2223 PAVGB
" %%mm3, %%mm2 \n\t"
2224 "movq %%mm0, %%mm1 \n\t"
2225 "movq %%mm2, %%mm3 \n\t"
2226 "psrlq $24, %%mm0 \n\t"
2227 "psrlq $24, %%mm2 \n\t"
2228 PAVGB
" %%mm1, %%mm0 \n\t"
2229 PAVGB
" %%mm3, %%mm2 \n\t"
2230 "punpcklbw %%mm7, %%mm0 \n\t"
2231 "punpcklbw %%mm7, %%mm2 \n\t"
2233 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2234 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
2235 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
2236 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
2237 "punpcklbw %%mm7, %%mm0 \n\t"
2238 "punpcklbw %%mm7, %%mm1 \n\t"
2239 "punpcklbw %%mm7, %%mm2 \n\t"
2240 "punpcklbw %%mm7, %%mm3 \n\t"
2241 "paddw %%mm1, %%mm0 \n\t"
2242 "paddw %%mm3, %%mm2 \n\t"
2243 "paddw %%mm2, %%mm0 \n\t"
2244 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
2245 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
2246 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
2247 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
2248 "punpcklbw %%mm7, %%mm4 \n\t"
2249 "punpcklbw %%mm7, %%mm1 \n\t"
2250 "punpcklbw %%mm7, %%mm2 \n\t"
2251 "punpcklbw %%mm7, %%mm3 \n\t"
2252 "paddw %%mm1, %%mm4 \n\t"
2253 "paddw %%mm3, %%mm2 \n\t"
2254 "paddw %%mm4, %%mm2 \n\t"
2255 "psrlw $2, %%mm0 \n\t"
2256 "psrlw $2, %%mm2 \n\t"
2258 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2259 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2261 "pmaddwd %%mm0, %%mm1 \n\t"
2262 "pmaddwd %%mm2, %%mm3 \n\t"
2263 "pmaddwd %%mm6, %%mm0 \n\t"
2264 "pmaddwd %%mm6, %%mm2 \n\t"
2265 #ifndef FAST_BGR2YV12
2266 "psrad $8, %%mm0 \n\t"
2267 "psrad $8, %%mm1 \n\t"
2268 "psrad $8, %%mm2 \n\t"
2269 "psrad $8, %%mm3 \n\t"
2271 "packssdw %%mm2, %%mm0 \n\t"
2272 "packssdw %%mm3, %%mm1 \n\t"
2273 "pmaddwd %%mm5, %%mm0 \n\t"
2274 "pmaddwd %%mm5, %%mm1 \n\t"
2275 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2276 "psraw $7, %%mm0 \n\t"
2278 #if HAVE_MMX2 || HAVE_AMD3DNOW
2279 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
2280 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
2281 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
2282 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
2283 PAVGB
" %%mm1, %%mm4 \n\t"
2284 PAVGB
" %%mm3, %%mm2 \n\t"
2285 "movq %%mm4, %%mm1 \n\t"
2286 "movq %%mm2, %%mm3 \n\t"
2287 "psrlq $24, %%mm4 \n\t"
2288 "psrlq $24, %%mm2 \n\t"
2289 PAVGB
" %%mm1, %%mm4 \n\t"
2290 PAVGB
" %%mm3, %%mm2 \n\t"
2291 "punpcklbw %%mm7, %%mm4 \n\t"
2292 "punpcklbw %%mm7, %%mm2 \n\t"
2294 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2295 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
2296 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
2297 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
2298 "punpcklbw %%mm7, %%mm4 \n\t"
2299 "punpcklbw %%mm7, %%mm1 \n\t"
2300 "punpcklbw %%mm7, %%mm2 \n\t"
2301 "punpcklbw %%mm7, %%mm3 \n\t"
2302 "paddw %%mm1, %%mm4 \n\t"
2303 "paddw %%mm3, %%mm2 \n\t"
2304 "paddw %%mm2, %%mm4 \n\t"
2305 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
2306 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
2307 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
2308 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
2309 "punpcklbw %%mm7, %%mm5 \n\t"
2310 "punpcklbw %%mm7, %%mm1 \n\t"
2311 "punpcklbw %%mm7, %%mm2 \n\t"
2312 "punpcklbw %%mm7, %%mm3 \n\t"
2313 "paddw %%mm1, %%mm5 \n\t"
2314 "paddw %%mm3, %%mm2 \n\t"
2315 "paddw %%mm5, %%mm2 \n\t"
2316 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2317 "psrlw $2, %%mm4 \n\t"
2318 "psrlw $2, %%mm2 \n\t"
2320 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2321 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2323 "pmaddwd %%mm4, %%mm1 \n\t"
2324 "pmaddwd %%mm2, %%mm3 \n\t"
2325 "pmaddwd %%mm6, %%mm4 \n\t"
2326 "pmaddwd %%mm6, %%mm2 \n\t"
2327 #ifndef FAST_BGR2YV12
2328 "psrad $8, %%mm4 \n\t"
2329 "psrad $8, %%mm1 \n\t"
2330 "psrad $8, %%mm2 \n\t"
2331 "psrad $8, %%mm3 \n\t"
2333 "packssdw %%mm2, %%mm4 \n\t"
2334 "packssdw %%mm3, %%mm1 \n\t"
2335 "pmaddwd %%mm5, %%mm4 \n\t"
2336 "pmaddwd %%mm5, %%mm1 \n\t"
2337 "add $24, %%"REG_d
" \n\t"
2338 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2339 "psraw $7, %%mm4 \n\t"
2341 "movq %%mm0, %%mm1 \n\t"
2342 "punpckldq %%mm4, %%mm0 \n\t"
2343 "punpckhdq %%mm4, %%mm1 \n\t"
2344 "packsswb %%mm1, %%mm0 \n\t"
2345 "paddb "MANGLE(ff_bgr2UVOffset
)", %%mm0 \n\t"
2346 "movd %%mm0, (%2, %%"REG_a
") \n\t"
2347 "punpckhdq %%mm0, %%mm0 \n\t"
2348 "movd %%mm0, (%3, %%"REG_a
") \n\t"
2349 "add $4, %%"REG_a
" \n\t"
2351 : : "r" (src
+chromWidth
*6), "r" (src
+srcStride
+chromWidth
*6), "r" (udst
+chromWidth
), "r" (vdst
+chromWidth
), "g" (-chromWidth
)
2352 : "%"REG_a
, "%"REG_d
2355 udst
+= chromStride
;
2356 vdst
+= chromStride
;
2360 __asm__
volatile( EMMS
" \n\t"
2366 for (; y
<height
; y
+=2)
2369 for (i
=0; i
<chromWidth
; i
++)
2371 unsigned int b
= src
[6*i
+0];
2372 unsigned int g
= src
[6*i
+1];
2373 unsigned int r
= src
[6*i
+2];
2375 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2376 unsigned int V
= ((RV
*r
+ GV
*g
+ BV
*b
)>>RGB2YUV_SHIFT
) + 128;
2377 unsigned int U
= ((RU
*r
+ GU
*g
+ BU
*b
)>>RGB2YUV_SHIFT
) + 128;
2387 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2393 for (i
=0; i
<chromWidth
; i
++)
2395 unsigned int b
= src
[6*i
+0];
2396 unsigned int g
= src
[6*i
+1];
2397 unsigned int r
= src
[6*i
+2];
2399 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2407 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2410 udst
+= chromStride
;
2411 vdst
+= chromStride
;
2417 static void RENAME(interleaveBytes
)(uint8_t *src1
, uint8_t *src2
, uint8_t *dest
,
2418 long width
, long height
, long src1Stride
,
2419 long src2Stride
, long dstStride
){
2422 for (h
=0; h
< height
; h
++)
2429 "xor %%"REG_a
", %%"REG_a
" \n\t"
2431 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2432 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2433 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
2434 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
2435 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
2436 "punpcklbw %%xmm2, %%xmm0 \n\t"
2437 "punpckhbw %%xmm2, %%xmm1 \n\t"
2438 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
2439 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
2440 "add $16, %%"REG_a
" \n\t"
2441 "cmp %3, %%"REG_a
" \n\t"
2443 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" (width
-15)
2444 : "memory", "%"REG_a
""
2448 "xor %%"REG_a
", %%"REG_a
" \n\t"
2450 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2451 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2452 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
2453 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
2454 "movq %%mm0, %%mm1 \n\t"
2455 "movq %%mm2, %%mm3 \n\t"
2456 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
2457 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
2458 "punpcklbw %%mm4, %%mm0 \n\t"
2459 "punpckhbw %%mm4, %%mm1 \n\t"
2460 "punpcklbw %%mm5, %%mm2 \n\t"
2461 "punpckhbw %%mm5, %%mm3 \n\t"
2462 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 2) \n\t"
2463 MOVNTQ
" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
2464 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
2465 MOVNTQ
" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
2466 "add $16, %%"REG_a
" \n\t"
2467 "cmp %3, %%"REG_a
" \n\t"
2469 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" (width
-15)
2470 : "memory", "%"REG_a
2473 for (w
= (width
&(~15)); w
< width
; w
++)
2475 dest
[2*w
+0] = src1
[w
];
2476 dest
[2*w
+1] = src2
[w
];
2479 for (w
=0; w
< width
; w
++)
2481 dest
[2*w
+0] = src1
[w
];
2482 dest
[2*w
+1] = src2
[w
];
2498 static inline void RENAME(vu9_to_vu12
)(const uint8_t *src1
, const uint8_t *src2
,
2499 uint8_t *dst1
, uint8_t *dst2
,
2500 long width
, long height
,
2501 long srcStride1
, long srcStride2
,
2502 long dstStride1
, long dstStride2
)
2505 w
=width
/2; h
=height
/2;
2510 ::"m"(*(src1
+srcStride1
)),"m"(*(src2
+srcStride2
)):"memory");
2513 const uint8_t* s1
=src1
+srcStride1
*(y
>>1);
2514 uint8_t* d
=dst1
+dstStride1
*y
;
2520 PREFETCH
" 32%1 \n\t"
2521 "movq %1, %%mm0 \n\t"
2522 "movq 8%1, %%mm2 \n\t"
2523 "movq 16%1, %%mm4 \n\t"
2524 "movq 24%1, %%mm6 \n\t"
2525 "movq %%mm0, %%mm1 \n\t"
2526 "movq %%mm2, %%mm3 \n\t"
2527 "movq %%mm4, %%mm5 \n\t"
2528 "movq %%mm6, %%mm7 \n\t"
2529 "punpcklbw %%mm0, %%mm0 \n\t"
2530 "punpckhbw %%mm1, %%mm1 \n\t"
2531 "punpcklbw %%mm2, %%mm2 \n\t"
2532 "punpckhbw %%mm3, %%mm3 \n\t"
2533 "punpcklbw %%mm4, %%mm4 \n\t"
2534 "punpckhbw %%mm5, %%mm5 \n\t"
2535 "punpcklbw %%mm6, %%mm6 \n\t"
2536 "punpckhbw %%mm7, %%mm7 \n\t"
2537 MOVNTQ
" %%mm0, %0 \n\t"
2538 MOVNTQ
" %%mm1, 8%0 \n\t"
2539 MOVNTQ
" %%mm2, 16%0 \n\t"
2540 MOVNTQ
" %%mm3, 24%0 \n\t"
2541 MOVNTQ
" %%mm4, 32%0 \n\t"
2542 MOVNTQ
" %%mm5, 40%0 \n\t"
2543 MOVNTQ
" %%mm6, 48%0 \n\t"
2544 MOVNTQ
" %%mm7, 56%0"
2550 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s1
[x
];
2553 const uint8_t* s2
=src2
+srcStride2
*(y
>>1);
2554 uint8_t* d
=dst2
+dstStride2
*y
;
2560 PREFETCH
" 32%1 \n\t"
2561 "movq %1, %%mm0 \n\t"
2562 "movq 8%1, %%mm2 \n\t"
2563 "movq 16%1, %%mm4 \n\t"
2564 "movq 24%1, %%mm6 \n\t"
2565 "movq %%mm0, %%mm1 \n\t"
2566 "movq %%mm2, %%mm3 \n\t"
2567 "movq %%mm4, %%mm5 \n\t"
2568 "movq %%mm6, %%mm7 \n\t"
2569 "punpcklbw %%mm0, %%mm0 \n\t"
2570 "punpckhbw %%mm1, %%mm1 \n\t"
2571 "punpcklbw %%mm2, %%mm2 \n\t"
2572 "punpckhbw %%mm3, %%mm3 \n\t"
2573 "punpcklbw %%mm4, %%mm4 \n\t"
2574 "punpckhbw %%mm5, %%mm5 \n\t"
2575 "punpcklbw %%mm6, %%mm6 \n\t"
2576 "punpckhbw %%mm7, %%mm7 \n\t"
2577 MOVNTQ
" %%mm0, %0 \n\t"
2578 MOVNTQ
" %%mm1, 8%0 \n\t"
2579 MOVNTQ
" %%mm2, 16%0 \n\t"
2580 MOVNTQ
" %%mm3, 24%0 \n\t"
2581 MOVNTQ
" %%mm4, 32%0 \n\t"
2582 MOVNTQ
" %%mm5, 40%0 \n\t"
2583 MOVNTQ
" %%mm6, 48%0 \n\t"
2584 MOVNTQ
" %%mm7, 56%0"
2590 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s2
[x
];
2601 static inline void RENAME(yvu9_to_yuy2
)(const uint8_t *src1
, const uint8_t *src2
, const uint8_t *src3
,
2603 long width
, long height
,
2604 long srcStride1
, long srcStride2
,
2605 long srcStride3
, long dstStride
)
2608 w
=width
/2; h
=height
;
2610 const uint8_t* yp
=src1
+srcStride1
*y
;
2611 const uint8_t* up
=src2
+srcStride2
*(y
>>2);
2612 const uint8_t* vp
=src3
+srcStride3
*(y
>>2);
2613 uint8_t* d
=dst
+dstStride
*y
;
2619 PREFETCH
" 32(%1, %0) \n\t"
2620 PREFETCH
" 32(%2, %0) \n\t"
2621 PREFETCH
" 32(%3, %0) \n\t"
2622 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2623 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2624 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2625 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2626 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2627 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2628 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2629 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2630 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2631 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2633 "movq %%mm1, %%mm6 \n\t"
2634 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2635 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2636 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2637 MOVNTQ
" %%mm0, (%4, %0, 8) \n\t"
2638 MOVNTQ
" %%mm3, 8(%4, %0, 8) \n\t"
2640 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2641 "movq 8(%1, %0, 4), %%mm0 \n\t"
2642 "movq %%mm0, %%mm3 \n\t"
2643 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2644 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2645 MOVNTQ
" %%mm0, 16(%4, %0, 8) \n\t"
2646 MOVNTQ
" %%mm3, 24(%4, %0, 8) \n\t"
2648 "movq %%mm4, %%mm6 \n\t"
2649 "movq 16(%1, %0, 4), %%mm0 \n\t"
2650 "movq %%mm0, %%mm3 \n\t"
2651 "punpcklbw %%mm5, %%mm4 \n\t"
2652 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2653 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2654 MOVNTQ
" %%mm0, 32(%4, %0, 8) \n\t"
2655 MOVNTQ
" %%mm3, 40(%4, %0, 8) \n\t"
2657 "punpckhbw %%mm5, %%mm6 \n\t"
2658 "movq 24(%1, %0, 4), %%mm0 \n\t"
2659 "movq %%mm0, %%mm3 \n\t"
2660 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2661 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2662 MOVNTQ
" %%mm0, 48(%4, %0, 8) \n\t"
2663 MOVNTQ
" %%mm3, 56(%4, %0, 8) \n\t"
2666 : "r"(yp
), "r" (up
), "r"(vp
), "r"(d
)
2672 const long x2
= x
<<2;
2675 d
[8*x
+2] = yp
[x2
+1];
2677 d
[8*x
+4] = yp
[x2
+2];
2679 d
[8*x
+6] = yp
[x2
+3];
2692 static inline void RENAME(rgb2rgb_init
)(void){
2693 rgb15to16
= RENAME(rgb15to16
);
2694 rgb15tobgr24
= RENAME(rgb15tobgr24
);
2695 rgb15to32
= RENAME(rgb15to32
);
2696 rgb16tobgr24
= RENAME(rgb16tobgr24
);
2697 rgb16to32
= RENAME(rgb16to32
);
2698 rgb16to15
= RENAME(rgb16to15
);
2699 rgb24tobgr16
= RENAME(rgb24tobgr16
);
2700 rgb24tobgr15
= RENAME(rgb24tobgr15
);
2701 rgb24tobgr32
= RENAME(rgb24tobgr32
);
2702 rgb32to16
= RENAME(rgb32to16
);
2703 rgb32to15
= RENAME(rgb32to15
);
2704 rgb32tobgr24
= RENAME(rgb32tobgr24
);
2705 rgb24to15
= RENAME(rgb24to15
);
2706 rgb24to16
= RENAME(rgb24to16
);
2707 rgb24tobgr24
= RENAME(rgb24tobgr24
);
2708 rgb32tobgr32
= RENAME(rgb32tobgr32
);
2709 rgb32tobgr16
= RENAME(rgb32tobgr16
);
2710 rgb32tobgr15
= RENAME(rgb32tobgr15
);
2711 yv12toyuy2
= RENAME(yv12toyuy2
);
2712 yv12touyvy
= RENAME(yv12touyvy
);
2713 yuv422ptoyuy2
= RENAME(yuv422ptoyuy2
);
2714 yuv422ptouyvy
= RENAME(yuv422ptouyvy
);
2715 yuy2toyv12
= RENAME(yuy2toyv12
);
2716 // uyvytoyv12 = RENAME(uyvytoyv12);
2717 // yvu9toyv12 = RENAME(yvu9toyv12);
2718 planar2x
= RENAME(planar2x
);
2719 rgb24toyv12
= RENAME(rgb24toyv12
);
2720 interleaveBytes
= RENAME(interleaveBytes
);
2721 vu9_to_vu12
= RENAME(vu9_to_vu12
);
2722 yvu9_to_yuy2
= RENAME(yvu9_to_yuy2
);