2 * software RGB to RGB converter
3 * pluralize by software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lot of big-endian byte order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
/*
 * CPU-specific instruction-mnemonic selection for the inline asm below.
 * NOTE(review): the #if/#elif/#else lines that choose between these
 * definitions are missing from this chunk (embedded line numbers jump
 * 49->51, 52->55, 68->71), so each macro appears redefined; presumably
 * the variants are gated on 3DNow!/MMX2/plain-MMX builds — confirm
 * against the full file.
 */
47 #define PREFETCH "prefetch"
48 #define PREFETCHW "prefetchw"
49 #define PAVGB "pavgusb"
/* MMX2 variants: non-temporal / temporal prefetch hints. */
51 #define PREFETCH "prefetchnta"
52 #define PREFETCHW "prefetcht0"
/* Fallback: emit a harmless asm comment instead of a prefetch. */
55 #define PREFETCH " # nop"
56 #define PREFETCHW " # nop"
60 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/* MOVNTQ = non-temporal store (bypasses cache) when available. */
67 #define MOVNTQ "movntq"
68 #define SFENCE "sfence"
71 #define SFENCE " # nop"
/*
 * RENAME(rgb24tobgr32): expand packed 24-bit pixels to 32-bit (per the
 * function name). The MMX path loads 8 source pixels per iteration with
 * movd/punpckldq at 3-byte strides, masks them with mm7 (= mask32), and
 * stores with MOVNTQ.
 * NOTE(review): this chunk elides lines (numbers jump 86->92, 105->113);
 * the loop header, the asm statement opening, its operand list, the
 * scalar tail and the closing braces are not visible here.
 */
74 static inline void RENAME(rgb24tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
77 const uint8_t *s
= src
;
80 const uint8_t *mm_end
;
/* Prime the cache at the start of the source buffer. */
84 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
/* mm7 holds the 32-bit lane mask used by the pand instructions below. */
86 __asm__
volatile("movq %0, %%mm7"::"m"(mask32
):"memory");
92 "punpckldq 3%1, %%mm0 \n\t"
93 "movd 6%1, %%mm1 \n\t"
94 "punpckldq 9%1, %%mm1 \n\t"
95 "movd 12%1, %%mm2 \n\t"
96 "punpckldq 15%1, %%mm2 \n\t"
97 "movd 18%1, %%mm3 \n\t"
98 "punpckldq 21%1, %%mm3 \n\t"
99 "pand %%mm7, %%mm0 \n\t"
100 "pand %%mm7, %%mm1 \n\t"
101 "pand %%mm7, %%mm2 \n\t"
102 "pand %%mm7, %%mm3 \n\t"
103 MOVNTQ
" %%mm0, %0 \n\t"
104 MOVNTQ
" %%mm1, 8%0 \n\t"
105 MOVNTQ
" %%mm2, 16%0 \n\t"
/* Flush non-temporal stores and restore the FPU state for later FP code. */
113 __asm__
volatile(SFENCE:::"memory");
114 __asm__
volatile(EMMS:::"memory");
118 #ifdef WORDS_BIGENDIAN
119 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * RENAME(rgb32tobgr24): pack 32-bit pixels down to 24-bit (per the
 * function name). The MMX path processes 8 input pixels per iteration:
 * each dword is byte-shuffled via shift+mask (%2/%3 = mask24l/mask24h),
 * then the four quadwords are merged into three 24-bit-packed quadwords
 * with the mask24hh..mask24hhhh constants and stored with MOVNTQ.
 * NOTE(review): lines are elided in this chunk (144->150, 190->193);
 * the loop header, asm output operands, a third MOVNTQ store, the scalar
 * tail and closing braces are not visible here.
 */
134 static inline void RENAME(rgb32tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
137 const uint8_t *s
= src
;
140 const uint8_t *mm_end
;
144 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
150 "movq %1, %%mm0 \n\t"
151 "movq 8%1, %%mm1 \n\t"
152 "movq 16%1, %%mm4 \n\t"
153 "movq 24%1, %%mm5 \n\t"
154 "movq %%mm0, %%mm2 \n\t"
155 "movq %%mm1, %%mm3 \n\t"
156 "movq %%mm4, %%mm6 \n\t"
157 "movq %%mm5, %%mm7 \n\t"
158 "psrlq $8, %%mm2 \n\t"
159 "psrlq $8, %%mm3 \n\t"
160 "psrlq $8, %%mm6 \n\t"
161 "psrlq $8, %%mm7 \n\t"
162 "pand %2, %%mm0 \n\t"
163 "pand %2, %%mm1 \n\t"
164 "pand %2, %%mm4 \n\t"
165 "pand %2, %%mm5 \n\t"
166 "pand %3, %%mm2 \n\t"
167 "pand %3, %%mm3 \n\t"
168 "pand %3, %%mm6 \n\t"
169 "pand %3, %%mm7 \n\t"
170 "por %%mm2, %%mm0 \n\t"
171 "por %%mm3, %%mm1 \n\t"
172 "por %%mm6, %%mm4 \n\t"
173 "por %%mm7, %%mm5 \n\t"
175 "movq %%mm1, %%mm2 \n\t"
176 "movq %%mm4, %%mm3 \n\t"
177 "psllq $48, %%mm2 \n\t"
178 "psllq $32, %%mm3 \n\t"
179 "pand %4, %%mm2 \n\t"
180 "pand %5, %%mm3 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "psrlq $16, %%mm1 \n\t"
183 "psrlq $32, %%mm4 \n\t"
184 "psllq $16, %%mm5 \n\t"
185 "por %%mm3, %%mm1 \n\t"
186 "pand %6, %%mm5 \n\t"
187 "por %%mm5, %%mm4 \n\t"
189 MOVNTQ
" %%mm0, %0 \n\t"
190 MOVNTQ
" %%mm1, 8%0 \n\t"
193 :"m"(*s
),"m"(mask24l
),
194 "m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
199 __asm__
volatile(SFENCE:::"memory");
200 __asm__
volatile(EMMS:::"memory");
204 #ifdef WORDS_BIGENDIAN
205 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
221 original by Strepto/Astral
222 ported to gcc & bugfixed: A'rpi
223 MMX2, 3DNOW optimization by Nick Kurshev
224 32-bit C version, and and&add trick by Michael Niedermayer
/*
 * RENAME(rgb15to16): convert RGB555 to RGB565. The trick (visible in
 * both the MMX path and the C tails): (x & 0x7FFF) + (x & 0x7FE0)
 * keeps blue in place and doubles the upper 10 bits, i.e. shifts
 * green+red left by one — turning the 5:5:5 layout into 5:6:5 with
 * green's LSB zero.
 * NOTE(review): the loop headers around the asm and the C tails are
 * elided in this chunk (line numbers jump 249->257, 258->263).
 */
226 static inline void RENAME(rgb15to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
228 register const uint8_t* s
=src
;
229 register uint8_t* d
=dst
;
230 register const uint8_t *end
;
231 const uint8_t *mm_end
;
234 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
/* mm4 = mask15s, the "& 0x7FE07FE0" operand of the add trick. */
235 __asm__
volatile("movq %0, %%mm4"::"m"(mask15s
));
241 "movq %1, %%mm0 \n\t"
242 "movq 8%1, %%mm2 \n\t"
243 "movq %%mm0, %%mm1 \n\t"
244 "movq %%mm2, %%mm3 \n\t"
245 "pand %%mm4, %%mm0 \n\t"
246 "pand %%mm4, %%mm2 \n\t"
247 "paddw %%mm1, %%mm0 \n\t"
248 "paddw %%mm3, %%mm2 \n\t"
249 MOVNTQ
" %%mm0, %0 \n\t"
257 __asm__
volatile(SFENCE:::"memory");
258 __asm__
volatile(EMMS:::"memory");
/* C tail, 2 pixels at a time via a 32-bit load. */
263 register unsigned x
= *((const uint32_t *)s
);
264 *((uint32_t *)d
) = (x
&0x7FFF7FFF) + (x
&0x7FE07FE0);
/* Final odd pixel, 16-bit version of the same and+add trick. */
270 register unsigned short x
= *((const uint16_t *)s
);
271 *((uint16_t *)d
) = (x
&0x7FFF) + (x
&0x7FE0);
/*
 * RENAME(rgb16to15): convert RGB565 to RGB555 — the inverse of
 * rgb15to16. Both paths compute ((x>>1) & red+green-mask) | (x & blue
 * mask): red and green drop one bit of position, blue (low 5 bits)
 * stays put; green's LSB is discarded.
 * NOTE(review): loop headers and some braces are elided in this chunk
 * (line numbers jump 303->311, 312->317).
 */
275 static inline void RENAME(rgb16to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
277 register const uint8_t* s
=src
;
278 register uint8_t* d
=dst
;
279 register const uint8_t *end
;
280 const uint8_t *mm_end
;
283 __asm__
volatile(PREFETCH
" %0"::"m"(*s
));
/* mm7 = mask15rg (red+green), mm6 = mask15b (blue). */
284 __asm__
volatile("movq %0, %%mm7"::"m"(mask15rg
));
285 __asm__
volatile("movq %0, %%mm6"::"m"(mask15b
));
291 "movq %1, %%mm0 \n\t"
292 "movq 8%1, %%mm2 \n\t"
293 "movq %%mm0, %%mm1 \n\t"
294 "movq %%mm2, %%mm3 \n\t"
295 "psrlq $1, %%mm0 \n\t"
296 "psrlq $1, %%mm2 \n\t"
297 "pand %%mm7, %%mm0 \n\t"
298 "pand %%mm7, %%mm2 \n\t"
299 "pand %%mm6, %%mm1 \n\t"
300 "pand %%mm6, %%mm3 \n\t"
301 "por %%mm1, %%mm0 \n\t"
302 "por %%mm3, %%mm2 \n\t"
303 MOVNTQ
" %%mm0, %0 \n\t"
311 __asm__
volatile(SFENCE:::"memory");
312 __asm__
volatile(EMMS:::"memory");
/* C tail, 2 pixels at a time. */
317 register uint32_t x
= *((const uint32_t*)s
);
318 *((uint32_t *)d
) = ((x
>>1)&0x7FE07FE0) | (x
&0x001F001F);
/* Final odd pixel. */
324 register uint16_t x
= *((const uint16_t*)s
);
325 *((uint16_t *)d
) = ((x
>>1)&0x7FE0) | (x
&0x001F);
/*
 * RENAME(rgb32to16): convert 32-bit pixels to 16-bit 5:6:5.
 * Two MMX variants are present: a pmaddwd-based one (the "#if 1" path,
 * faster where multiplies are cheap, per the original comment) and a
 * shift+mask one using red_16mask/green_16mask/blue_16mask. The scalar
 * tail shows the channel math: b>>3 | (g&0xFC00)>>5 | (r&0xF80000)>>8.
 * NOTE(review): loop headers, the asm statement openings/closings and
 * several operand lines are elided in this chunk (e.g. 346->350,
 * 368->375, 414->419).
 */
331 static inline void RENAME(rgb32to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
333 const uint8_t *s
= src
;
336 const uint8_t *mm_end
;
338 uint16_t *d
= (uint16_t *)dst
;
342 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
/* mm5/mm6/mm7 = mask3216g / mask3216br / mul3216 (see operand list). */
344 "movq %3, %%mm5 \n\t"
345 "movq %4, %%mm6 \n\t"
346 "movq %5, %%mm7 \n\t"
350 PREFETCH
" 32(%1) \n\t"
351 "movd (%1), %%mm0 \n\t"
352 "movd 4(%1), %%mm3 \n\t"
353 "punpckldq 8(%1), %%mm0 \n\t"
354 "punpckldq 12(%1), %%mm3 \n\t"
355 "movq %%mm0, %%mm1 \n\t"
356 "movq %%mm3, %%mm4 \n\t"
357 "pand %%mm6, %%mm0 \n\t"
358 "pand %%mm6, %%mm3 \n\t"
/* pmaddwd against mul3216 positions red+blue in one step. */
359 "pmaddwd %%mm7, %%mm0 \n\t"
360 "pmaddwd %%mm7, %%mm3 \n\t"
361 "pand %%mm5, %%mm1 \n\t"
362 "pand %%mm5, %%mm4 \n\t"
363 "por %%mm1, %%mm0 \n\t"
364 "por %%mm4, %%mm3 \n\t"
365 "psrld $5, %%mm0 \n\t"
366 "pslld $11, %%mm3 \n\t"
367 "por %%mm3, %%mm0 \n\t"
368 MOVNTQ
" %%mm0, (%0) \n\t"
375 : "r" (mm_end
), "m" (mask3216g
), "m" (mask3216br
), "m" (mul3216
)
/* Alternative shift+mask variant (presumably the #else branch). */
378 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
380 "movq %0, %%mm7 \n\t"
381 "movq %1, %%mm6 \n\t"
382 ::"m"(red_16mask
),"m"(green_16mask
));
387 "movd %1, %%mm0 \n\t"
388 "movd 4%1, %%mm3 \n\t"
389 "punpckldq 8%1, %%mm0 \n\t"
390 "punpckldq 12%1, %%mm3 \n\t"
391 "movq %%mm0, %%mm1 \n\t"
392 "movq %%mm0, %%mm2 \n\t"
393 "movq %%mm3, %%mm4 \n\t"
394 "movq %%mm3, %%mm5 \n\t"
395 "psrlq $3, %%mm0 \n\t"
396 "psrlq $3, %%mm3 \n\t"
397 "pand %2, %%mm0 \n\t"
398 "pand %2, %%mm3 \n\t"
399 "psrlq $5, %%mm1 \n\t"
400 "psrlq $5, %%mm4 \n\t"
401 "pand %%mm6, %%mm1 \n\t"
402 "pand %%mm6, %%mm4 \n\t"
403 "psrlq $8, %%mm2 \n\t"
404 "psrlq $8, %%mm5 \n\t"
405 "pand %%mm7, %%mm2 \n\t"
406 "pand %%mm7, %%mm5 \n\t"
407 "por %%mm1, %%mm0 \n\t"
408 "por %%mm4, %%mm3 \n\t"
409 "por %%mm2, %%mm0 \n\t"
410 "por %%mm5, %%mm3 \n\t"
411 "psllq $16, %%mm3 \n\t"
412 "por %%mm3, %%mm0 \n\t"
413 MOVNTQ
" %%mm0, %0 \n\t"
414 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
419 __asm__
volatile(SFENCE:::"memory");
420 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel -> one 5:6:5 word. */
424 register int rgb
= *(const uint32_t*)s
; s
+= 4;
425 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>8);
/*
 * RENAME(rgb32tobgr16): convert 32-bit pixels to 16-bit 5:6:5 with red
 * and blue swapped relative to rgb32to16 — the scalar tail shows
 * (rgb&0xF8)<<8 for the low byte and (rgb&0xF80000)>>19 for the high,
 * i.e. the channel order is mirrored.
 * NOTE(review): loop headers and the asm statement boundaries are
 * elided in this chunk (443->449, 476->480).
 */
429 static inline void RENAME(rgb32tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
431 const uint8_t *s
= src
;
434 const uint8_t *mm_end
;
436 uint16_t *d
= (uint16_t *)dst
;
439 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
/* mm7 = red_16mask, mm6 = green_16mask. */
441 "movq %0, %%mm7 \n\t"
442 "movq %1, %%mm6 \n\t"
443 ::"m"(red_16mask
),"m"(green_16mask
));
449 "movd %1, %%mm0 \n\t"
450 "movd 4%1, %%mm3 \n\t"
451 "punpckldq 8%1, %%mm0 \n\t"
452 "punpckldq 12%1, %%mm3 \n\t"
453 "movq %%mm0, %%mm1 \n\t"
454 "movq %%mm0, %%mm2 \n\t"
455 "movq %%mm3, %%mm4 \n\t"
456 "movq %%mm3, %%mm5 \n\t"
/* psllq $8 (vs psrlq $3 in rgb32to16) realises the R/B swap. */
457 "psllq $8, %%mm0 \n\t"
458 "psllq $8, %%mm3 \n\t"
459 "pand %%mm7, %%mm0 \n\t"
460 "pand %%mm7, %%mm3 \n\t"
461 "psrlq $5, %%mm1 \n\t"
462 "psrlq $5, %%mm4 \n\t"
463 "pand %%mm6, %%mm1 \n\t"
464 "pand %%mm6, %%mm4 \n\t"
465 "psrlq $19, %%mm2 \n\t"
466 "psrlq $19, %%mm5 \n\t"
467 "pand %2, %%mm2 \n\t"
468 "pand %2, %%mm5 \n\t"
469 "por %%mm1, %%mm0 \n\t"
470 "por %%mm4, %%mm3 \n\t"
471 "por %%mm2, %%mm0 \n\t"
472 "por %%mm5, %%mm3 \n\t"
473 "psllq $16, %%mm3 \n\t"
474 "por %%mm3, %%mm0 \n\t"
475 MOVNTQ
" %%mm0, %0 \n\t"
476 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
480 __asm__
volatile(SFENCE:::"memory");
481 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: channel-swapped 5:6:5 packing. */
485 register int rgb
= *(const uint32_t*)s
; s
+= 4;
486 *d
++ = ((rgb
&0xF8)<<8) + ((rgb
&0xFC00)>>5) + ((rgb
&0xF80000)>>19);
/*
 * RENAME(rgb32to15): convert 32-bit pixels to 15-bit 5:5:5. Mirrors
 * rgb32to16 but with 5-bit green: the pmaddwd variant shifts by 6/10
 * (vs 5/11) and the mask variant uses red_15mask/green_15mask/
 * blue_15mask with shifts 3/6/9. Scalar tail:
 * b>>3 | (g&0xF800)>>6 | (r&0xF80000)>>9.
 * NOTE(review): loop headers and asm statement boundaries are elided
 * in this chunk (505->509, 527->534, 573->578).
 */
490 static inline void RENAME(rgb32to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
492 const uint8_t *s
= src
;
495 const uint8_t *mm_end
;
497 uint16_t *d
= (uint16_t *)dst
;
501 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
/* mm5/mm6/mm7 = mask3215g / mask3216br / mul3215 (see operand list). */
503 "movq %3, %%mm5 \n\t"
504 "movq %4, %%mm6 \n\t"
505 "movq %5, %%mm7 \n\t"
509 PREFETCH
" 32(%1) \n\t"
510 "movd (%1), %%mm0 \n\t"
511 "movd 4(%1), %%mm3 \n\t"
512 "punpckldq 8(%1), %%mm0 \n\t"
513 "punpckldq 12(%1), %%mm3 \n\t"
514 "movq %%mm0, %%mm1 \n\t"
515 "movq %%mm3, %%mm4 \n\t"
516 "pand %%mm6, %%mm0 \n\t"
517 "pand %%mm6, %%mm3 \n\t"
518 "pmaddwd %%mm7, %%mm0 \n\t"
519 "pmaddwd %%mm7, %%mm3 \n\t"
520 "pand %%mm5, %%mm1 \n\t"
521 "pand %%mm5, %%mm4 \n\t"
522 "por %%mm1, %%mm0 \n\t"
523 "por %%mm4, %%mm3 \n\t"
524 "psrld $6, %%mm0 \n\t"
525 "pslld $10, %%mm3 \n\t"
526 "por %%mm3, %%mm0 \n\t"
527 MOVNTQ
" %%mm0, (%0) \n\t"
534 : "r" (mm_end
), "m" (mask3215g
), "m" (mask3216br
), "m" (mul3215
)
/* Alternative shift+mask variant (presumably the #else branch). */
537 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
539 "movq %0, %%mm7 \n\t"
540 "movq %1, %%mm6 \n\t"
541 ::"m"(red_15mask
),"m"(green_15mask
));
546 "movd %1, %%mm0 \n\t"
547 "movd 4%1, %%mm3 \n\t"
548 "punpckldq 8%1, %%mm0 \n\t"
549 "punpckldq 12%1, %%mm3 \n\t"
550 "movq %%mm0, %%mm1 \n\t"
551 "movq %%mm0, %%mm2 \n\t"
552 "movq %%mm3, %%mm4 \n\t"
553 "movq %%mm3, %%mm5 \n\t"
554 "psrlq $3, %%mm0 \n\t"
555 "psrlq $3, %%mm3 \n\t"
556 "pand %2, %%mm0 \n\t"
557 "pand %2, %%mm3 \n\t"
558 "psrlq $6, %%mm1 \n\t"
559 "psrlq $6, %%mm4 \n\t"
560 "pand %%mm6, %%mm1 \n\t"
561 "pand %%mm6, %%mm4 \n\t"
562 "psrlq $9, %%mm2 \n\t"
563 "psrlq $9, %%mm5 \n\t"
564 "pand %%mm7, %%mm2 \n\t"
565 "pand %%mm7, %%mm5 \n\t"
566 "por %%mm1, %%mm0 \n\t"
567 "por %%mm4, %%mm3 \n\t"
568 "por %%mm2, %%mm0 \n\t"
569 "por %%mm5, %%mm3 \n\t"
570 "psllq $16, %%mm3 \n\t"
571 "por %%mm3, %%mm0 \n\t"
572 MOVNTQ
" %%mm0, %0 \n\t"
573 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
578 __asm__
volatile(SFENCE:::"memory");
579 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel -> one 5:5:5 word. */
583 register int rgb
= *(const uint32_t*)s
; s
+= 4;
584 *d
++ = ((rgb
&0xFF)>>3) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>9);
/*
 * RENAME(rgb32tobgr15): convert 32-bit pixels to 15-bit 5:5:5 with red
 * and blue swapped — scalar tail: (rgb&0xF8)<<7 | (g&0xF800)>>6 |
 * (rgb&0xF80000)>>19. MMX path is the 15-bit analogue of rgb32tobgr16
 * (psllq $7 instead of $8 for the low channel).
 * NOTE(review): loop headers and asm statement boundaries are elided
 * in this chunk (602->608, 635->639).
 */
588 static inline void RENAME(rgb32tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
590 const uint8_t *s
= src
;
593 const uint8_t *mm_end
;
595 uint16_t *d
= (uint16_t *)dst
;
598 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
600 "movq %0, %%mm7 \n\t"
601 "movq %1, %%mm6 \n\t"
602 ::"m"(red_15mask
),"m"(green_15mask
));
608 "movd %1, %%mm0 \n\t"
609 "movd 4%1, %%mm3 \n\t"
610 "punpckldq 8%1, %%mm0 \n\t"
611 "punpckldq 12%1, %%mm3 \n\t"
612 "movq %%mm0, %%mm1 \n\t"
613 "movq %%mm0, %%mm2 \n\t"
614 "movq %%mm3, %%mm4 \n\t"
615 "movq %%mm3, %%mm5 \n\t"
616 "psllq $7, %%mm0 \n\t"
617 "psllq $7, %%mm3 \n\t"
618 "pand %%mm7, %%mm0 \n\t"
619 "pand %%mm7, %%mm3 \n\t"
620 "psrlq $6, %%mm1 \n\t"
621 "psrlq $6, %%mm4 \n\t"
622 "pand %%mm6, %%mm1 \n\t"
623 "pand %%mm6, %%mm4 \n\t"
624 "psrlq $19, %%mm2 \n\t"
625 "psrlq $19, %%mm5 \n\t"
626 "pand %2, %%mm2 \n\t"
627 "pand %2, %%mm5 \n\t"
628 "por %%mm1, %%mm0 \n\t"
629 "por %%mm4, %%mm3 \n\t"
630 "por %%mm2, %%mm0 \n\t"
631 "por %%mm5, %%mm3 \n\t"
632 "psllq $16, %%mm3 \n\t"
633 "por %%mm3, %%mm0 \n\t"
634 MOVNTQ
" %%mm0, %0 \n\t"
635 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
639 __asm__
volatile(SFENCE:::"memory");
640 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: channel-swapped 5:5:5 packing. */
644 register int rgb
= *(const uint32_t*)s
; s
+= 4;
645 *d
++ = ((rgb
&0xF8)<<7) + ((rgb
&0xF800)>>6) + ((rgb
&0xF80000)>>19);
/*
 * RENAME(rgb24tobgr16): pack 24-bit pixels into 16-bit 5:6:5.
 * The MMX path loads pixels at 3-byte strides (movd/punpckldq with
 * offsets 0,3,6,9), then performs the same shift+mask packing as the
 * 32-bit converters. The visible scalar tail uses separate b/g/r bytes:
 * b>>3 | (g&0xFC)<<3 | (r&0xF8)<<8 (the b/g/r loads are elided here).
 * NOTE(review): loop headers, asm boundaries and the tail's byte loads
 * are missing from this chunk (663->669, 701->708).
 */
649 static inline void RENAME(rgb24tobgr16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
651 const uint8_t *s
= src
;
654 const uint8_t *mm_end
;
656 uint16_t *d
= (uint16_t *)dst
;
659 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
661 "movq %0, %%mm7 \n\t"
662 "movq %1, %%mm6 \n\t"
663 ::"m"(red_16mask
),"m"(green_16mask
));
669 "movd %1, %%mm0 \n\t"
670 "movd 3%1, %%mm3 \n\t"
671 "punpckldq 6%1, %%mm0 \n\t"
672 "punpckldq 9%1, %%mm3 \n\t"
673 "movq %%mm0, %%mm1 \n\t"
674 "movq %%mm0, %%mm2 \n\t"
675 "movq %%mm3, %%mm4 \n\t"
676 "movq %%mm3, %%mm5 \n\t"
677 "psrlq $3, %%mm0 \n\t"
678 "psrlq $3, %%mm3 \n\t"
679 "pand %2, %%mm0 \n\t"
680 "pand %2, %%mm3 \n\t"
681 "psrlq $5, %%mm1 \n\t"
682 "psrlq $5, %%mm4 \n\t"
683 "pand %%mm6, %%mm1 \n\t"
684 "pand %%mm6, %%mm4 \n\t"
685 "psrlq $8, %%mm2 \n\t"
686 "psrlq $8, %%mm5 \n\t"
687 "pand %%mm7, %%mm2 \n\t"
688 "pand %%mm7, %%mm5 \n\t"
689 "por %%mm1, %%mm0 \n\t"
690 "por %%mm4, %%mm3 \n\t"
691 "por %%mm2, %%mm0 \n\t"
692 "por %%mm5, %%mm3 \n\t"
693 "psllq $16, %%mm3 \n\t"
694 "por %%mm3, %%mm0 \n\t"
695 MOVNTQ
" %%mm0, %0 \n\t"
696 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
700 __asm__
volatile(SFENCE:::"memory");
701 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: pack one b,g,r byte triple into 5:6:5. */
708 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
/*
 * RENAME(rgb24to16): pack 24-bit pixels into 16-bit 5:6:5 with the
 * opposite channel order to rgb24tobgr16 — the MMX path uses psllq $8
 * and psrlq $19 where the bgr variant used psrlq $3 and $8. The scalar
 * tail is textually identical to rgb24tobgr16's; presumably the b/g/r
 * byte loads (elided here) differ in order — confirm against the full
 * file.
 * NOTE(review): loop headers, asm boundaries and the tail's byte loads
 * are missing from this chunk (726->732, 764->771).
 */
712 static inline void RENAME(rgb24to16
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
714 const uint8_t *s
= src
;
717 const uint8_t *mm_end
;
719 uint16_t *d
= (uint16_t *)dst
;
722 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
724 "movq %0, %%mm7 \n\t"
725 "movq %1, %%mm6 \n\t"
726 ::"m"(red_16mask
),"m"(green_16mask
));
732 "movd %1, %%mm0 \n\t"
733 "movd 3%1, %%mm3 \n\t"
734 "punpckldq 6%1, %%mm0 \n\t"
735 "punpckldq 9%1, %%mm3 \n\t"
736 "movq %%mm0, %%mm1 \n\t"
737 "movq %%mm0, %%mm2 \n\t"
738 "movq %%mm3, %%mm4 \n\t"
739 "movq %%mm3, %%mm5 \n\t"
740 "psllq $8, %%mm0 \n\t"
741 "psllq $8, %%mm3 \n\t"
742 "pand %%mm7, %%mm0 \n\t"
743 "pand %%mm7, %%mm3 \n\t"
744 "psrlq $5, %%mm1 \n\t"
745 "psrlq $5, %%mm4 \n\t"
746 "pand %%mm6, %%mm1 \n\t"
747 "pand %%mm6, %%mm4 \n\t"
748 "psrlq $19, %%mm2 \n\t"
749 "psrlq $19, %%mm5 \n\t"
750 "pand %2, %%mm2 \n\t"
751 "pand %2, %%mm5 \n\t"
752 "por %%mm1, %%mm0 \n\t"
753 "por %%mm4, %%mm3 \n\t"
754 "por %%mm2, %%mm0 \n\t"
755 "por %%mm5, %%mm3 \n\t"
756 "psllq $16, %%mm3 \n\t"
757 "por %%mm3, %%mm0 \n\t"
758 MOVNTQ
" %%mm0, %0 \n\t"
759 :"=m"(*d
):"m"(*s
),"m"(blue_16mask
):"memory");
763 __asm__
volatile(SFENCE:::"memory");
764 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: pack one b,g,r byte triple into 5:6:5. */
771 *d
++ = (b
>>3) | ((g
&0xFC)<<3) | ((r
&0xF8)<<8);
/*
 * RENAME(rgb24tobgr15): pack 24-bit pixels into 15-bit 5:5:5.
 * Same structure as rgb24tobgr16 but with the 15-bit masks and shifts
 * (psrlq $3/$6/$9). Scalar tail: b>>3 | (g&0xF8)<<2 | (r&0xF8)<<7.
 * NOTE(review): loop headers, asm boundaries and the tail's b/g/r byte
 * loads are elided in this chunk (789->795, 827->834).
 */
775 static inline void RENAME(rgb24tobgr15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
777 const uint8_t *s
= src
;
780 const uint8_t *mm_end
;
782 uint16_t *d
= (uint16_t *)dst
;
785 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
787 "movq %0, %%mm7 \n\t"
788 "movq %1, %%mm6 \n\t"
789 ::"m"(red_15mask
),"m"(green_15mask
));
795 "movd %1, %%mm0 \n\t"
796 "movd 3%1, %%mm3 \n\t"
797 "punpckldq 6%1, %%mm0 \n\t"
798 "punpckldq 9%1, %%mm3 \n\t"
799 "movq %%mm0, %%mm1 \n\t"
800 "movq %%mm0, %%mm2 \n\t"
801 "movq %%mm3, %%mm4 \n\t"
802 "movq %%mm3, %%mm5 \n\t"
803 "psrlq $3, %%mm0 \n\t"
804 "psrlq $3, %%mm3 \n\t"
805 "pand %2, %%mm0 \n\t"
806 "pand %2, %%mm3 \n\t"
807 "psrlq $6, %%mm1 \n\t"
808 "psrlq $6, %%mm4 \n\t"
809 "pand %%mm6, %%mm1 \n\t"
810 "pand %%mm6, %%mm4 \n\t"
811 "psrlq $9, %%mm2 \n\t"
812 "psrlq $9, %%mm5 \n\t"
813 "pand %%mm7, %%mm2 \n\t"
814 "pand %%mm7, %%mm5 \n\t"
815 "por %%mm1, %%mm0 \n\t"
816 "por %%mm4, %%mm3 \n\t"
817 "por %%mm2, %%mm0 \n\t"
818 "por %%mm5, %%mm3 \n\t"
819 "psllq $16, %%mm3 \n\t"
820 "por %%mm3, %%mm0 \n\t"
821 MOVNTQ
" %%mm0, %0 \n\t"
822 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
826 __asm__
volatile(SFENCE:::"memory");
827 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: pack one b,g,r byte triple into 5:5:5. */
834 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
/*
 * RENAME(rgb24to15): pack 24-bit pixels into 15-bit 5:5:5 with the
 * opposite channel order to rgb24tobgr15 (psllq $7 / psrlq $19 instead
 * of psrlq $3 / $9). Scalar tail is textually identical to
 * rgb24tobgr15's; presumably the elided b/g/r byte loads differ in
 * order — confirm against the full file.
 * NOTE(review): loop headers, asm boundaries and the tail's byte loads
 * are missing from this chunk (852->858, 890->897).
 */
838 static inline void RENAME(rgb24to15
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
840 const uint8_t *s
= src
;
843 const uint8_t *mm_end
;
845 uint16_t *d
= (uint16_t *)dst
;
848 __asm__
volatile(PREFETCH
" %0"::"m"(*src
):"memory");
850 "movq %0, %%mm7 \n\t"
851 "movq %1, %%mm6 \n\t"
852 ::"m"(red_15mask
),"m"(green_15mask
));
858 "movd %1, %%mm0 \n\t"
859 "movd 3%1, %%mm3 \n\t"
860 "punpckldq 6%1, %%mm0 \n\t"
861 "punpckldq 9%1, %%mm3 \n\t"
862 "movq %%mm0, %%mm1 \n\t"
863 "movq %%mm0, %%mm2 \n\t"
864 "movq %%mm3, %%mm4 \n\t"
865 "movq %%mm3, %%mm5 \n\t"
866 "psllq $7, %%mm0 \n\t"
867 "psllq $7, %%mm3 \n\t"
868 "pand %%mm7, %%mm0 \n\t"
869 "pand %%mm7, %%mm3 \n\t"
870 "psrlq $6, %%mm1 \n\t"
871 "psrlq $6, %%mm4 \n\t"
872 "pand %%mm6, %%mm1 \n\t"
873 "pand %%mm6, %%mm4 \n\t"
874 "psrlq $19, %%mm2 \n\t"
875 "psrlq $19, %%mm5 \n\t"
876 "pand %2, %%mm2 \n\t"
877 "pand %2, %%mm5 \n\t"
878 "por %%mm1, %%mm0 \n\t"
879 "por %%mm4, %%mm3 \n\t"
880 "por %%mm2, %%mm0 \n\t"
881 "por %%mm5, %%mm3 \n\t"
882 "psllq $16, %%mm3 \n\t"
883 "por %%mm3, %%mm0 \n\t"
884 MOVNTQ
" %%mm0, %0 \n\t"
885 :"=m"(*d
):"m"(*s
),"m"(blue_15mask
):"memory");
889 __asm__
volatile(SFENCE:::"memory");
890 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: pack one b,g,r byte triple into 5:5:5. */
897 *d
++ = (b
>>3) | ((g
&0xF8)<<2) | ((r
&0xF8)<<7);
902 I use less accurate approximation here by simply left-shifting the input
903 value and filling the low order bits with zeroes. This method improves PNG
904 compression but this scheme cannot reproduce white exactly, since it does
905 not generate an all-ones maximum value; the net effect is to darken the
908 The better method should be "left bit replication":
918 | leftmost bits repeated to fill open bits
/*
 * RENAME(rgb15tobgr24): expand 15-bit 5:5:5 pixels to 24-bit. Each
 * channel is isolated with mask15b/mask15g/mask15r, shifted into byte
 * position (<<3, >>2, >>7 — simple zero-fill expansion, per the
 * "left-shifting ... filling the low order bits with zeroes" comment
 * above this function in the file), widened to 32-bit lanes with
 * punpcklwd/punpckhwd against mmx_null, and recombined. A second asm
 * statement then repacks the 32-bit intermediates to 24-bit using the
 * same mask24* sequence as rgb32tobgr24 ("borrowed 32 to 24").
 * NOTE(review): loop headers, asm statement boundaries, output
 * operands and parts of the scalar tail (the bgr load and loop) are
 * elided in this chunk (932->938, 993->996, 1046->1051).
 */
922 static inline void RENAME(rgb15tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
926 const uint16_t *mm_end
;
929 const uint16_t *s
= (const uint16_t*)src
;
930 end
= s
+ src_size
/2;
932 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
/* First half: expand 4 pixels from %1. */
938 "movq %1, %%mm0 \n\t"
939 "movq %1, %%mm1 \n\t"
940 "movq %1, %%mm2 \n\t"
941 "pand %2, %%mm0 \n\t"
942 "pand %3, %%mm1 \n\t"
943 "pand %4, %%mm2 \n\t"
944 "psllq $3, %%mm0 \n\t"
945 "psrlq $2, %%mm1 \n\t"
946 "psrlq $7, %%mm2 \n\t"
947 "movq %%mm0, %%mm3 \n\t"
948 "movq %%mm1, %%mm4 \n\t"
949 "movq %%mm2, %%mm5 \n\t"
950 "punpcklwd %5, %%mm0 \n\t"
951 "punpcklwd %5, %%mm1 \n\t"
952 "punpcklwd %5, %%mm2 \n\t"
953 "punpckhwd %5, %%mm3 \n\t"
954 "punpckhwd %5, %%mm4 \n\t"
955 "punpckhwd %5, %%mm5 \n\t"
956 "psllq $8, %%mm1 \n\t"
957 "psllq $16, %%mm2 \n\t"
958 "por %%mm1, %%mm0 \n\t"
959 "por %%mm2, %%mm0 \n\t"
960 "psllq $8, %%mm4 \n\t"
961 "psllq $16, %%mm5 \n\t"
962 "por %%mm4, %%mm3 \n\t"
963 "por %%mm5, %%mm3 \n\t"
/* Park the first half in mm6/mm7 while the second half is expanded. */
965 "movq %%mm0, %%mm6 \n\t"
966 "movq %%mm3, %%mm7 \n\t"
968 "movq 8%1, %%mm0 \n\t"
969 "movq 8%1, %%mm1 \n\t"
970 "movq 8%1, %%mm2 \n\t"
971 "pand %2, %%mm0 \n\t"
972 "pand %3, %%mm1 \n\t"
973 "pand %4, %%mm2 \n\t"
974 "psllq $3, %%mm0 \n\t"
975 "psrlq $2, %%mm1 \n\t"
976 "psrlq $7, %%mm2 \n\t"
977 "movq %%mm0, %%mm3 \n\t"
978 "movq %%mm1, %%mm4 \n\t"
979 "movq %%mm2, %%mm5 \n\t"
980 "punpcklwd %5, %%mm0 \n\t"
981 "punpcklwd %5, %%mm1 \n\t"
982 "punpcklwd %5, %%mm2 \n\t"
983 "punpckhwd %5, %%mm3 \n\t"
984 "punpckhwd %5, %%mm4 \n\t"
985 "punpckhwd %5, %%mm5 \n\t"
986 "psllq $8, %%mm1 \n\t"
987 "psllq $16, %%mm2 \n\t"
988 "por %%mm1, %%mm0 \n\t"
989 "por %%mm2, %%mm0 \n\t"
990 "psllq $8, %%mm4 \n\t"
991 "psllq $16, %%mm5 \n\t"
992 "por %%mm4, %%mm3 \n\t"
993 "por %%mm5, %%mm3 \n\t"
996 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
), "m"(mmx_null
)
998 /* borrowed 32 to 24 */
1000 "movq %%mm0, %%mm4 \n\t"
1001 "movq %%mm3, %%mm5 \n\t"
1002 "movq %%mm6, %%mm0 \n\t"
1003 "movq %%mm7, %%mm1 \n\t"
1005 "movq %%mm4, %%mm6 \n\t"
1006 "movq %%mm5, %%mm7 \n\t"
1007 "movq %%mm0, %%mm2 \n\t"
1008 "movq %%mm1, %%mm3 \n\t"
1010 "psrlq $8, %%mm2 \n\t"
1011 "psrlq $8, %%mm3 \n\t"
1012 "psrlq $8, %%mm6 \n\t"
1013 "psrlq $8, %%mm7 \n\t"
1014 "pand %2, %%mm0 \n\t"
1015 "pand %2, %%mm1 \n\t"
1016 "pand %2, %%mm4 \n\t"
1017 "pand %2, %%mm5 \n\t"
1018 "pand %3, %%mm2 \n\t"
1019 "pand %3, %%mm3 \n\t"
1020 "pand %3, %%mm6 \n\t"
1021 "pand %3, %%mm7 \n\t"
1022 "por %%mm2, %%mm0 \n\t"
1023 "por %%mm3, %%mm1 \n\t"
1024 "por %%mm6, %%mm4 \n\t"
1025 "por %%mm7, %%mm5 \n\t"
1027 "movq %%mm1, %%mm2 \n\t"
1028 "movq %%mm4, %%mm3 \n\t"
1029 "psllq $48, %%mm2 \n\t"
1030 "psllq $32, %%mm3 \n\t"
1031 "pand %4, %%mm2 \n\t"
1032 "pand %5, %%mm3 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "psrlq $16, %%mm1 \n\t"
1035 "psrlq $32, %%mm4 \n\t"
1036 "psllq $16, %%mm5 \n\t"
1037 "por %%mm3, %%mm1 \n\t"
1038 "pand %6, %%mm5 \n\t"
1039 "por %%mm5, %%mm4 \n\t"
1041 MOVNTQ
" %%mm0, %0 \n\t"
1042 MOVNTQ
" %%mm1, 8%0 \n\t"
1043 MOVNTQ
" %%mm4, 16%0"
1046 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1051 __asm__
volatile(SFENCE:::"memory");
1052 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: expand one 5:5:5 word to three bytes. */
1056 register uint16_t bgr
;
1058 *d
++ = (bgr
&0x1F)<<3;
1059 *d
++ = (bgr
&0x3E0)>>2;
1060 *d
++ = (bgr
&0x7C00)>>7;
/*
 * RENAME(rgb16tobgr24): expand 16-bit 5:6:5 pixels to 24-bit.
 * Identical structure to rgb15tobgr24 but with mask16b/mask16g/mask16r
 * and green shifted by >>3 / red by >>8 (6-bit green field). The second
 * asm statement repacks the widened 32-bit intermediates to 24-bit via
 * the shared mask24* sequence ("borrowed 32 to 24").
 * NOTE(review): loop headers, asm statement boundaries, output operand
 * lists and parts of the scalar tail are elided in this chunk
 * (1074->1079, 1135->1137, 1187->1192).
 */
1064 static inline void RENAME(rgb16tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1066 const uint16_t *end
;
1068 const uint16_t *mm_end
;
1070 uint8_t *d
= (uint8_t *)dst
;
1071 const uint16_t *s
= (const uint16_t *)src
;
1072 end
= s
+ src_size
/2;
1074 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
1079 PREFETCH
" 32%1 \n\t"
1080 "movq %1, %%mm0 \n\t"
1081 "movq %1, %%mm1 \n\t"
1082 "movq %1, %%mm2 \n\t"
1083 "pand %2, %%mm0 \n\t"
1084 "pand %3, %%mm1 \n\t"
1085 "pand %4, %%mm2 \n\t"
1086 "psllq $3, %%mm0 \n\t"
1087 "psrlq $3, %%mm1 \n\t"
1088 "psrlq $8, %%mm2 \n\t"
1089 "movq %%mm0, %%mm3 \n\t"
1090 "movq %%mm1, %%mm4 \n\t"
1091 "movq %%mm2, %%mm5 \n\t"
1092 "punpcklwd %5, %%mm0 \n\t"
1093 "punpcklwd %5, %%mm1 \n\t"
1094 "punpcklwd %5, %%mm2 \n\t"
1095 "punpckhwd %5, %%mm3 \n\t"
1096 "punpckhwd %5, %%mm4 \n\t"
1097 "punpckhwd %5, %%mm5 \n\t"
1098 "psllq $8, %%mm1 \n\t"
1099 "psllq $16, %%mm2 \n\t"
1100 "por %%mm1, %%mm0 \n\t"
1101 "por %%mm2, %%mm0 \n\t"
1102 "psllq $8, %%mm4 \n\t"
1103 "psllq $16, %%mm5 \n\t"
1104 "por %%mm4, %%mm3 \n\t"
1105 "por %%mm5, %%mm3 \n\t"
/* Park the first half in mm6/mm7 while the second half is expanded. */
1107 "movq %%mm0, %%mm6 \n\t"
1108 "movq %%mm3, %%mm7 \n\t"
1110 "movq 8%1, %%mm0 \n\t"
1111 "movq 8%1, %%mm1 \n\t"
1112 "movq 8%1, %%mm2 \n\t"
1113 "pand %2, %%mm0 \n\t"
1114 "pand %3, %%mm1 \n\t"
1115 "pand %4, %%mm2 \n\t"
1116 "psllq $3, %%mm0 \n\t"
1117 "psrlq $3, %%mm1 \n\t"
1118 "psrlq $8, %%mm2 \n\t"
1119 "movq %%mm0, %%mm3 \n\t"
1120 "movq %%mm1, %%mm4 \n\t"
1121 "movq %%mm2, %%mm5 \n\t"
1122 "punpcklwd %5, %%mm0 \n\t"
1123 "punpcklwd %5, %%mm1 \n\t"
1124 "punpcklwd %5, %%mm2 \n\t"
1125 "punpckhwd %5, %%mm3 \n\t"
1126 "punpckhwd %5, %%mm4 \n\t"
1127 "punpckhwd %5, %%mm5 \n\t"
1128 "psllq $8, %%mm1 \n\t"
1129 "psllq $16, %%mm2 \n\t"
1130 "por %%mm1, %%mm0 \n\t"
1131 "por %%mm2, %%mm0 \n\t"
1132 "psllq $8, %%mm4 \n\t"
1133 "psllq $16, %%mm5 \n\t"
1134 "por %%mm4, %%mm3 \n\t"
1135 "por %%mm5, %%mm3 \n\t"
1137 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
),"m"(mmx_null
)
1139 /* borrowed 32 to 24 */
1141 "movq %%mm0, %%mm4 \n\t"
1142 "movq %%mm3, %%mm5 \n\t"
1143 "movq %%mm6, %%mm0 \n\t"
1144 "movq %%mm7, %%mm1 \n\t"
1146 "movq %%mm4, %%mm6 \n\t"
1147 "movq %%mm5, %%mm7 \n\t"
1148 "movq %%mm0, %%mm2 \n\t"
1149 "movq %%mm1, %%mm3 \n\t"
1151 "psrlq $8, %%mm2 \n\t"
1152 "psrlq $8, %%mm3 \n\t"
1153 "psrlq $8, %%mm6 \n\t"
1154 "psrlq $8, %%mm7 \n\t"
1155 "pand %2, %%mm0 \n\t"
1156 "pand %2, %%mm1 \n\t"
1157 "pand %2, %%mm4 \n\t"
1158 "pand %2, %%mm5 \n\t"
1159 "pand %3, %%mm2 \n\t"
1160 "pand %3, %%mm3 \n\t"
1161 "pand %3, %%mm6 \n\t"
1162 "pand %3, %%mm7 \n\t"
1163 "por %%mm2, %%mm0 \n\t"
1164 "por %%mm3, %%mm1 \n\t"
1165 "por %%mm6, %%mm4 \n\t"
1166 "por %%mm7, %%mm5 \n\t"
1168 "movq %%mm1, %%mm2 \n\t"
1169 "movq %%mm4, %%mm3 \n\t"
1170 "psllq $48, %%mm2 \n\t"
1171 "psllq $32, %%mm3 \n\t"
1172 "pand %4, %%mm2 \n\t"
1173 "pand %5, %%mm3 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "psrlq $16, %%mm1 \n\t"
1176 "psrlq $32, %%mm4 \n\t"
1177 "psllq $16, %%mm5 \n\t"
1178 "por %%mm3, %%mm1 \n\t"
1179 "pand %6, %%mm5 \n\t"
1180 "por %%mm5, %%mm4 \n\t"
1182 MOVNTQ
" %%mm0, %0 \n\t"
1183 MOVNTQ
" %%mm1, 8%0 \n\t"
1184 MOVNTQ
" %%mm4, 16%0"
1187 :"m"(*s
),"m"(mask24l
),"m"(mask24h
),"m"(mask24hh
),"m"(mask24hhh
),"m"(mask24hhhh
)
1192 __asm__
volatile(SFENCE:::"memory");
1193 __asm__
volatile(EMMS:::"memory");
/* Scalar tail: expand one 5:6:5 word to three bytes. */
1197 register uint16_t bgr
;
1199 *d
++ = (bgr
&0x1F)<<3;
1200 *d
++ = (bgr
&0x7E0)>>3;
1201 *d
++ = (bgr
&0xF800)>>8;
/*
 * RENAME(rgb15to32): expand 15-bit 5:5:5 pixels to 32-bit. Like
 * rgb15tobgr24's first stage (mask, shift, widen via punpck*wd against
 * mm7 which is zeroed with pxor), but the widened quadwords are stored
 * directly — each pixel becomes 4 bytes. The scalar tail has both
 * big- and little-endian byte orders under #ifdef WORDS_BIGENDIAN.
 * NOTE(review): loop headers, the asm output operand list and parts
 * of the tail (the bgr load, d pointer setup, and the byte skipped on
 * the big-endian path) are elided in this chunk (1216->1221,
 * 1251->1256, 1263->1265).
 */
1205 static inline void RENAME(rgb15to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1207 const uint16_t *end
;
1209 const uint16_t *mm_end
;
1212 const uint16_t *s
= (const uint16_t *)src
;
1213 end
= s
+ src_size
/2;
1215 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
/* mm7 = 0; used as the zero half when widening words to dwords. */
1216 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1221 PREFETCH
" 32%1 \n\t"
1222 "movq %1, %%mm0 \n\t"
1223 "movq %1, %%mm1 \n\t"
1224 "movq %1, %%mm2 \n\t"
1225 "pand %2, %%mm0 \n\t"
1226 "pand %3, %%mm1 \n\t"
1227 "pand %4, %%mm2 \n\t"
1228 "psllq $3, %%mm0 \n\t"
1229 "psrlq $2, %%mm1 \n\t"
1230 "psrlq $7, %%mm2 \n\t"
1231 "movq %%mm0, %%mm3 \n\t"
1232 "movq %%mm1, %%mm4 \n\t"
1233 "movq %%mm2, %%mm5 \n\t"
1234 "punpcklwd %%mm7, %%mm0 \n\t"
1235 "punpcklwd %%mm7, %%mm1 \n\t"
1236 "punpcklwd %%mm7, %%mm2 \n\t"
1237 "punpckhwd %%mm7, %%mm3 \n\t"
1238 "punpckhwd %%mm7, %%mm4 \n\t"
1239 "punpckhwd %%mm7, %%mm5 \n\t"
1240 "psllq $8, %%mm1 \n\t"
1241 "psllq $16, %%mm2 \n\t"
1242 "por %%mm1, %%mm0 \n\t"
1243 "por %%mm2, %%mm0 \n\t"
1244 "psllq $8, %%mm4 \n\t"
1245 "psllq $16, %%mm5 \n\t"
1246 "por %%mm4, %%mm3 \n\t"
1247 "por %%mm5, %%mm3 \n\t"
1248 MOVNTQ
" %%mm0, %0 \n\t"
1249 MOVNTQ
" %%mm3, 8%0 \n\t"
1251 :"m"(*s
),"m"(mask15b
),"m"(mask15g
),"m"(mask15r
)
1256 __asm__
volatile(SFENCE:::"memory");
1257 __asm__
volatile(EMMS:::"memory");
1261 #if 0 //slightly slower on Athlon
1263 *((uint32_t*)d
)++ = ((bgr
&0x1F)<<3) + ((bgr
&0x3E0)<<6) + ((bgr
&0x7C00)<<9);
1265 register uint16_t bgr
;
/* Byte order of the 3 written channels depends on target endianness. */
1267 #ifdef WORDS_BIGENDIAN
1269 *d
++ = (bgr
&0x7C00)>>7;
1270 *d
++ = (bgr
&0x3E0)>>2;
1271 *d
++ = (bgr
&0x1F)<<3;
1273 *d
++ = (bgr
&0x1F)<<3;
1274 *d
++ = (bgr
&0x3E0)>>2;
1275 *d
++ = (bgr
&0x7C00)>>7;
/*
 * RENAME(rgb16to32): expand 16-bit 5:6:5 pixels to 32-bit — the 565
 * analogue of rgb15to32 (green >>3 and red >>8 instead of >>2/>>7,
 * masks mask16b/mask16g/mask16r). Scalar tail again has both endian
 * orders under #ifdef WORDS_BIGENDIAN.
 * NOTE(review): loop headers, asm output operands and parts of the
 * tail are elided in this chunk (1294->1299, 1329->1334, 1339->1341).
 */
1283 static inline void RENAME(rgb16to32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1285 const uint16_t *end
;
1287 const uint16_t *mm_end
;
1290 const uint16_t *s
= (const uint16_t*)src
;
1291 end
= s
+ src_size
/2;
1293 __asm__
volatile(PREFETCH
" %0"::"m"(*s
):"memory");
/* mm7 = 0; zero half for the word->dword widening below. */
1294 __asm__
volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1299 PREFETCH
" 32%1 \n\t"
1300 "movq %1, %%mm0 \n\t"
1301 "movq %1, %%mm1 \n\t"
1302 "movq %1, %%mm2 \n\t"
1303 "pand %2, %%mm0 \n\t"
1304 "pand %3, %%mm1 \n\t"
1305 "pand %4, %%mm2 \n\t"
1306 "psllq $3, %%mm0 \n\t"
1307 "psrlq $3, %%mm1 \n\t"
1308 "psrlq $8, %%mm2 \n\t"
1309 "movq %%mm0, %%mm3 \n\t"
1310 "movq %%mm1, %%mm4 \n\t"
1311 "movq %%mm2, %%mm5 \n\t"
1312 "punpcklwd %%mm7, %%mm0 \n\t"
1313 "punpcklwd %%mm7, %%mm1 \n\t"
1314 "punpcklwd %%mm7, %%mm2 \n\t"
1315 "punpckhwd %%mm7, %%mm3 \n\t"
1316 "punpckhwd %%mm7, %%mm4 \n\t"
1317 "punpckhwd %%mm7, %%mm5 \n\t"
1318 "psllq $8, %%mm1 \n\t"
1319 "psllq $16, %%mm2 \n\t"
1320 "por %%mm1, %%mm0 \n\t"
1321 "por %%mm2, %%mm0 \n\t"
1322 "psllq $8, %%mm4 \n\t"
1323 "psllq $16, %%mm5 \n\t"
1324 "por %%mm4, %%mm3 \n\t"
1325 "por %%mm5, %%mm3 \n\t"
1326 MOVNTQ
" %%mm0, %0 \n\t"
1327 MOVNTQ
" %%mm3, 8%0 \n\t"
1329 :"m"(*s
),"m"(mask16b
),"m"(mask16g
),"m"(mask16r
)
1334 __asm__
volatile(SFENCE:::"memory");
1335 __asm__
volatile(EMMS:::"memory");
1339 register uint16_t bgr
;
/* Byte order of the 3 written channels depends on target endianness. */
1341 #ifdef WORDS_BIGENDIAN
1343 *d
++ = (bgr
&0xF800)>>8;
1344 *d
++ = (bgr
&0x7E0)>>3;
1345 *d
++ = (bgr
&0x1F)<<3;
1347 *d
++ = (bgr
&0x1F)<<3;
1348 *d
++ = (bgr
&0x7E0)>>3;
1349 *d
++ = (bgr
&0xF800)>>8;
/*
 * RENAME(rgb32tobgr32): swap the R and B bytes of 32-bit pixels in
 * place order. Uses the negative-index trick: idx starts at
 * 15 - src_size and the pointers are pre-biased by -idx so the asm can
 * count idx up toward zero. The visible asm uses pshufw (MMX2) to
 * byte-swap within words plus mask/shift recombination; the C tail
 * does (v>>16) + (v & 0xff00ff00) + (v<<16) per pixel.
 * NOTE(review): the asm statement opening, loop label/branch lines and
 * some closing lines are elided in this chunk (1368->1371, 1402->1409).
 */
1355 static inline void RENAME(rgb32tobgr32
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1357 long idx
= 15 - src_size
;
1358 const uint8_t *s
= src
-idx
;
1359 uint8_t *d
= dst
-idx
;
1364 PREFETCH
" (%1, %0) \n\t"
/* Build the two byte-select masks in mm6/mm7 from mask32b/mask32r. */
1365 "movq %3, %%mm7 \n\t"
1366 "pxor %4, %%mm7 \n\t"
1367 "movq %%mm7, %%mm6 \n\t"
1368 "pxor %5, %%mm7 \n\t"
1371 PREFETCH
" 32(%1, %0) \n\t"
1372 "movq (%1, %0), %%mm0 \n\t"
1373 "movq 8(%1, %0), %%mm1 \n\t"
/* pshufw $177 swaps the 16-bit halves within each dword (MMX2 path). */
1375 "pshufw $177, %%mm0, %%mm3 \n\t"
1376 "pshufw $177, %%mm1, %%mm5 \n\t"
1377 "pand %%mm7, %%mm0 \n\t"
1378 "pand %%mm6, %%mm3 \n\t"
1379 "pand %%mm7, %%mm1 \n\t"
1380 "pand %%mm6, %%mm5 \n\t"
1381 "por %%mm3, %%mm0 \n\t"
1382 "por %%mm5, %%mm1 \n\t"
1384 "movq %%mm0, %%mm2 \n\t"
1385 "movq %%mm1, %%mm4 \n\t"
1386 "pand %%mm7, %%mm0 \n\t"
1387 "pand %%mm6, %%mm2 \n\t"
1388 "pand %%mm7, %%mm1 \n\t"
1389 "pand %%mm6, %%mm4 \n\t"
1390 "movq %%mm2, %%mm3 \n\t"
1391 "movq %%mm4, %%mm5 \n\t"
1392 "pslld $16, %%mm2 \n\t"
1393 "psrld $16, %%mm3 \n\t"
1394 "pslld $16, %%mm4 \n\t"
1395 "psrld $16, %%mm5 \n\t"
1396 "por %%mm2, %%mm0 \n\t"
1397 "por %%mm4, %%mm1 \n\t"
1398 "por %%mm3, %%mm0 \n\t"
1399 "por %%mm5, %%mm1 \n\t"
1401 MOVNTQ
" %%mm0, (%2, %0) \n\t"
1402 MOVNTQ
" %%mm1, 8(%2, %0) \n\t"
1409 : "r" (s
), "r" (d
), "m" (mask32b
), "m" (mask32r
), "m" (mmx_one
)
/* C tail: R/B swap one 32-bit pixel at a time until idx reaches 15. */
1412 for (; idx
<15; idx
+=4) {
1413 register int v
= *(const uint32_t *)&s
[idx
], g
= v
& 0xff00ff00;
1415 *(uint32_t *)&d
[idx
] = (v
>>16) + g
+ (v
<<16);
/*
 * RENAME(rgb24tobgr24): swap R and B of packed 24-bit pixels. The asm
 * processes 8 pixels (24 bytes) per iteration using REG_a as a counter
 * that runs from -(multiple of 24) up to 0 (mmx_size = 23 - src_size,
 * pointers pre-biased by -mmx_size); overlapping loads at offsets
 * 0/2/6/8/10/14/16/18 plus mask24r/mask24g/mask24b reassemble the
 * swapped triples, stored with MOVNTQ. The C tail then handles the
 * remaining < 8 pixels byte-by-byte.
 * NOTE(review): the asm statement opening, branch/label lines, the
 * clobber list, and parts of the C tail (the dst[i+0]/src[i+2]
 * assignment, loop body braces) are elided in this chunk (1423->1425,
 * 1461->1465, 1479->1483).
 */
1419 static inline void RENAME(rgb24tobgr24
)(const uint8_t *src
, uint8_t *dst
, long src_size
)
1423 long mmx_size
= 23 - src_size
;
/* Skip the asm loop entirely when there is nothing for it to do. */
1425 "test %%"REG_a
", %%"REG_a
" \n\t"
1427 "movq "MANGLE(mask24r
)", %%mm5 \n\t"
1428 "movq "MANGLE(mask24g
)", %%mm6 \n\t"
1429 "movq "MANGLE(mask24b
)", %%mm7 \n\t"
1432 PREFETCH
" 32(%1, %%"REG_a
") \n\t"
1433 "movq (%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1434 "movq (%1, %%"REG_a
"), %%mm1 \n\t" // BGR BGR BG
1435 "movq 2(%1, %%"REG_a
"), %%mm2 \n\t" // R BGR BGR B
1436 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1437 "pand %%mm5, %%mm0 \n\t"
1438 "pand %%mm6, %%mm1 \n\t"
1439 "pand %%mm7, %%mm2 \n\t"
1440 "por %%mm0, %%mm1 \n\t"
1441 "por %%mm2, %%mm1 \n\t"
1442 "movq 6(%1, %%"REG_a
"), %%mm0 \n\t" // BGR BGR BG
1443 MOVNTQ
" %%mm1, (%2, %%"REG_a
") \n\t" // RGB RGB RG
1444 "movq 8(%1, %%"REG_a
"), %%mm1 \n\t" // R BGR BGR B
1445 "movq 10(%1, %%"REG_a
"), %%mm2 \n\t" // GR BGR BGR
1446 "pand %%mm7, %%mm0 \n\t"
1447 "pand %%mm5, %%mm1 \n\t"
1448 "pand %%mm6, %%mm2 \n\t"
1449 "por %%mm0, %%mm1 \n\t"
1450 "por %%mm2, %%mm1 \n\t"
1451 "movq 14(%1, %%"REG_a
"), %%mm0 \n\t" // R BGR BGR B
1452 MOVNTQ
" %%mm1, 8(%2, %%"REG_a
") \n\t" // B RGB RGB R
1453 "movq 16(%1, %%"REG_a
"), %%mm1 \n\t" // GR BGR BGR
1454 "movq 18(%1, %%"REG_a
"), %%mm2 \n\t" // BGR BGR BG
1455 "pand %%mm6, %%mm0 \n\t"
1456 "pand %%mm7, %%mm1 \n\t"
1457 "pand %%mm5, %%mm2 \n\t"
1458 "por %%mm0, %%mm1 \n\t"
1459 "por %%mm2, %%mm1 \n\t"
1460 MOVNTQ
" %%mm1, 16(%2, %%"REG_a
") \n\t"
1461 "add $24, %%"REG_a
" \n\t"
1465 : "r" (src
-mmx_size
), "r"(dst
-mmx_size
)
1468 __asm__
volatile(SFENCE:::"memory");
1469 __asm__
volatile(EMMS:::"memory");
1471 if (mmx_size
==23) return; //finished, was multiple of 8
/* Redirect src/dst past the asm-converted part; tail length < 24. */
1475 src_size
= 23-mmx_size
;
1479 for (i
=0; i
<src_size
; i
+=3)
/* Per-triple swap: middle byte kept, outer two exchanged. */
1483 dst
[i
+ 1] = src
[i
+ 1];
1484 dst
[i
+ 2] = src
[i
+ 0];
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * yuvPlanartoyuy2: interleave planar Y, U, V into packed YUYV (YUY2).
 * Processes width/2 chroma samples per row; chroma pointers advance only
 * every vertLumPerChroma luma lines. Paths: MMX (punpck interleave +
 * non-temporal stores), Alpha MVI, 64-bit scalar, and 32-bit scalar.
 */
1489 static inline void RENAME(yuvPlanartoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1490 long width
, long height
,
1491 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1494 const long chromWidth
= width
>>1;
1495 for (y
=0; y
<height
; y
++)
1498 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
/* MMX path: build UVUV pairs, then interleave with Y and stream out 32B */
1500 "xor %%"REG_a
", %%"REG_a
" \n\t"
1503 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1504 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1505 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1506 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1507 "movq %%mm0, %%mm2 \n\t" // U(0)
1508 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1509 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1510 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1512 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1513 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
1514 "movq %%mm3, %%mm4 \n\t" // Y(0)
1515 "movq %%mm5, %%mm6 \n\t" // Y(8)
1516 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1517 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1518 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1519 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1521 MOVNTQ
" %%mm3, (%0, %%"REG_a
", 4) \n\t"
1522 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1523 MOVNTQ
" %%mm5, 16(%0, %%"REG_a
", 4) \n\t"
1524 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1526 "add $8, %%"REG_a
" \n\t"
1527 "cmp %4, %%"REG_a
" \n\t"
1529 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
/* Alpha MVI path: unpkbw/unpkbl widen bytes, software prefetch via ldq $31 */
1534 #if ARCH_ALPHA && HAVE_MVI
1535 #define pl2yuy2(n) \
1540 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1541 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1542 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1543 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1544 yuv1 = (u << 8) + (v << 24); \
1551 uint64_t *qdst
= (uint64_t *) dst
;
1552 uint64_t *qdst2
= (uint64_t *) (dst
+ dstStride
);
1553 const uint32_t *yc
= (uint32_t *) ysrc
;
1554 const uint32_t *yc2
= (uint32_t *) (ysrc
+ lumStride
);
1555 const uint16_t *uc
= (uint16_t*) usrc
, *vc
= (uint16_t*) vsrc
;
1556 for (i
= 0; i
< chromWidth
; i
+= 8){
1557 uint64_t y1
, y2
, yuv1
, yuv2
;
1560 __asm__("ldq $31,64(%0)" :: "r"(yc
));
1561 __asm__("ldq $31,64(%0)" :: "r"(yc2
));
1562 __asm__("ldq $31,64(%0)" :: "r"(uc
));
1563 __asm__("ldq $31,64(%0)" :: "r"(vc
));
/* generic 64-bit scalar path: pack two YUYV pixels per 64-bit store */
1581 #elif HAVE_FAST_64BIT
1583 uint64_t *ldst
= (uint64_t *) dst
;
1584 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1585 for (i
= 0; i
< chromWidth
; i
+= 2){
1587 k
= yc
[0] + (uc
[0] << 8) +
1588 (yc
[1] << 16) + (vc
[0] << 24);
1589 l
= yc
[2] + (uc
[1] << 8) +
1590 (yc
[3] << 16) + (vc
[1] << 24);
1591 *ldst
++ = k
+ (l
<< 32);
/* 32-bit scalar path: one YUYV pixel pair per 32-bit store, both endians */
1598 int i
, *idst
= (int32_t *) dst
;
1599 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1600 for (i
= 0; i
< chromWidth
; i
++){
1601 #ifdef WORDS_BIGENDIAN
1602 *idst
++ = (yc
[0] << 24)+ (uc
[0] << 16) +
1603 (yc
[1] << 8) + (vc
[0] << 0);
1605 *idst
++ = yc
[0] + (uc
[0] << 8) +
1606 (yc
[1] << 16) + (vc
[0] << 24);
/* advance chroma only once per vertLumPerChroma luma rows
 * (vertLumPerChroma must be a power of two for this mask test) */
1614 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1)
1616 usrc
+= chromStride
;
1617 vsrc
+= chromStride
;
1623 __asm__( EMMS
" \n\t"
1630 * Height should be a multiple of 2 and width should be a multiple of 16.
1631 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * NOTE(review): garbled extraction — preserved byte-for-byte, comments only.
 * yv12toyuy2: YV12 (4:2:0 planar) -> packed YUY2; thin wrapper that calls
 * yuvPlanartoyuy2 with vertLumPerChroma = 2 (one chroma row per two luma rows).
 */
1633 static inline void RENAME(yv12toyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1634 long width
, long height
,
1635 long lumStride
, long chromStride
, long dstStride
)
1637 //FIXME interpolate chroma
1638 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * yuvPlanartouyvy: same as yuvPlanartoyuy2 but emits UYVY byte order
 * (chroma byte first): the punpck operands are reversed so U/V land in the
 * even bytes. MMX fast path plus 64-bit and 32-bit scalar fallbacks.
 */
1641 static inline void RENAME(yuvPlanartouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1642 long width
, long height
,
1643 long lumStride
, long chromStride
, long dstStride
, long vertLumPerChroma
)
1646 const long chromWidth
= width
>>1;
1647 for (y
=0; y
<height
; y
++)
1650 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1652 "xor %%"REG_a
", %%"REG_a
" \n\t"
1655 PREFETCH
" 32(%1, %%"REG_a
", 2) \n\t"
1656 PREFETCH
" 32(%2, %%"REG_a
") \n\t"
1657 PREFETCH
" 32(%3, %%"REG_a
") \n\t"
1658 "movq (%2, %%"REG_a
"), %%mm0 \n\t" // U(0)
1659 "movq %%mm0, %%mm2 \n\t" // U(0)
1660 "movq (%3, %%"REG_a
"), %%mm1 \n\t" // V(0)
1661 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1662 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1664 "movq (%1, %%"REG_a
",2), %%mm3 \n\t" // Y(0)
1665 "movq 8(%1, %%"REG_a
",2), %%mm5 \n\t" // Y(8)
/* note: UV regs are copied/interleaved the opposite way round vs the
 * YUY2 variant, producing UYVY ordering */
1666 "movq %%mm0, %%mm4 \n\t" // Y(0)
1667 "movq %%mm2, %%mm6 \n\t" // Y(8)
1668 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1669 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1670 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1671 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1673 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 4) \n\t"
1674 MOVNTQ
" %%mm4, 8(%0, %%"REG_a
", 4) \n\t"
1675 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 4) \n\t"
1676 MOVNTQ
" %%mm6, 24(%0, %%"REG_a
", 4) \n\t"
1678 "add $8, %%"REG_a
" \n\t"
1679 "cmp %4, %%"REG_a
" \n\t"
1681 ::"r"(dst
), "r"(ysrc
), "r"(usrc
), "r"(vsrc
), "g" (chromWidth
)
1685 //FIXME adapt the Alpha ASM code from yv12->yuy2
/* 64-bit scalar path: two UYVY pixels per 64-bit store */
1689 uint64_t *ldst
= (uint64_t *) dst
;
1690 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1691 for (i
= 0; i
< chromWidth
; i
+= 2){
1693 k
= uc
[0] + (yc
[0] << 8) +
1694 (vc
[0] << 16) + (yc
[1] << 24);
1695 l
= uc
[1] + (yc
[2] << 8) +
1696 (vc
[1] << 16) + (yc
[3] << 24);
1697 *ldst
++ = k
+ (l
<< 32);
/* 32-bit scalar path, both endians */
1704 int i
, *idst
= (int32_t *) dst
;
1705 const uint8_t *yc
= ysrc
, *uc
= usrc
, *vc
= vsrc
;
1706 for (i
= 0; i
< chromWidth
; i
++){
1707 #ifdef WORDS_BIGENDIAN
1708 *idst
++ = (uc
[0] << 24)+ (yc
[0] << 16) +
1709 (vc
[0] << 8) + (yc
[1] << 0);
1711 *idst
++ = uc
[0] + (yc
[0] << 8) +
1712 (vc
[0] << 16) + (yc
[1] << 24);
/* chroma rows advance once per vertLumPerChroma luma rows (power of two) */
1720 if ((y
&(vertLumPerChroma
-1)) == vertLumPerChroma
-1)
1722 usrc
+= chromStride
;
1723 vsrc
+= chromStride
;
1729 __asm__( EMMS
" \n\t"
1736 * Height should be a multiple of 2 and width should be a multiple of 16
1737 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * NOTE(review): garbled extraction — preserved byte-for-byte, comments only.
 * yv12touyvy: YV12 (4:2:0 planar) -> packed UYVY; wrapper around
 * yuvPlanartouyvy with vertLumPerChroma = 2.
 */
1739 static inline void RENAME(yv12touyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1740 long width
, long height
,
1741 long lumStride
, long chromStride
, long dstStride
)
1743 //FIXME interpolate chroma
1744 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 2);
1748 * Width should be a multiple of 16.
/*
 * NOTE(review): garbled extraction — preserved byte-for-byte, comments only.
 * yuv422ptouyvy: 4:2:2 planar -> packed UYVY; wrapper around
 * yuvPlanartouyvy with vertLumPerChroma = 1 (one chroma row per luma row).
 */
1750 static inline void RENAME(yuv422ptouyvy
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1751 long width
, long height
,
1752 long lumStride
, long chromStride
, long dstStride
)
1754 RENAME(yuvPlanartouyvy
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1758 * Width should be a multiple of 16.
/*
 * NOTE(review): garbled extraction — preserved byte-for-byte, comments only.
 * yuv422ptoyuy2: 4:2:2 planar -> packed YUY2; wrapper around
 * yuvPlanartoyuy2 with vertLumPerChroma = 1.
 */
1760 static inline void RENAME(yuv422ptoyuy2
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
, uint8_t *dst
,
1761 long width
, long height
,
1762 long lumStride
, long chromStride
, long dstStride
)
1764 RENAME(yuvPlanartoyuy2
)(ysrc
, usrc
, vsrc
, dst
, width
, height
, lumStride
, chromStride
, dstStride
, 1);
1768 * Height should be a multiple of 2 and width should be a multiple of 16.
1769 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * yuy2toyv12: packed YUY2 -> planar YV12. Rows are processed in pairs:
 * the even row splits Y and U/V (chroma taken from this line only), the
 * odd row extracts Y alone. mm7 holds the 0x00FF word mask used to
 * separate the Y (low) bytes from the U/V (high) bytes.
 */
1771 static inline void RENAME(yuy2toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1772 long width
, long height
,
1773 long lumStride
, long chromStride
, long srcStride
)
1776 const long chromWidth
= width
>>1;
1777 for (y
=0; y
<height
; y
+=2)
/* even row: split 16 YUYV pixels into 16 Y + 8 U + 8 V per iteration */
1781 "xor %%"REG_a
", %%"REG_a
" \n\t"
1782 "pcmpeqw %%mm7, %%mm7 \n\t"
1783 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1786 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1787 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1788 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1789 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1790 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1791 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1792 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1793 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1794 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1795 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1796 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1798 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1800 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(8)
1801 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(12)
1802 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1803 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1804 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1805 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1806 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1807 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1808 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1809 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1811 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
/* split the interleaved UV words into separate U and V planes */
1813 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1814 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1815 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1816 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1817 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1818 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1819 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1822 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
1823 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
1825 "add $8, %%"REG_a
" \n\t"
1826 "cmp %4, %%"REG_a
" \n\t"
1828 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1829 : "memory", "%"REG_a
/* odd row: keep only the Y bytes (chroma of this line is discarded) */
1836 "xor %%"REG_a
", %%"REG_a
" \n\t"
1839 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
1840 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
1841 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
1842 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
1843 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
1844 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1845 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1846 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1847 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1848 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1849 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1851 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1852 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1854 "add $8, %%"REG_a
" \n\t"
1855 "cmp %4, %%"REG_a
" \n\t"
1858 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
1859 : "memory", "%"REG_a
/* scalar fallback, even row: demux YUYV into Y/U/V */
1863 for (i
=0; i
<chromWidth
; i
++)
1865 ydst
[2*i
+0] = src
[4*i
+0];
1866 udst
[i
] = src
[4*i
+1];
1867 ydst
[2*i
+1] = src
[4*i
+2];
1868 vdst
[i
] = src
[4*i
+3];
/* scalar fallback, odd row: Y only */
1873 for (i
=0; i
<chromWidth
; i
++)
1875 ydst
[2*i
+0] = src
[4*i
+0];
1876 ydst
[2*i
+1] = src
[4*i
+2];
1879 udst
+= chromStride
;
1880 vdst
+= chromStride
;
1885 __asm__
volatile( EMMS
" \n\t"
/*
 * NOTE(review): garbled extraction — preserved byte-for-byte, comments only.
 * yvu9toyv12: YVU9 -> YV12. Only the luma plane is copied verbatim here;
 * the 4x->2x chroma upscaling is explicitly unimplemented (see XXX below),
 * so udst/vdst handling is absent from this visible region.
 */
1891 static inline void RENAME(yvu9toyv12
)(const uint8_t *ysrc
, const uint8_t *usrc
, const uint8_t *vsrc
,
1892 uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
1893 long width
, long height
, long lumStride
, long chromStride
)
/* NOTE(review): memcpy of width*height assumes lumStride == width —
 * TODO confirm strides are tight for callers of this path */
1896 memcpy(ydst
, ysrc
, width
*height
);
1898 /* XXX: implement upscaling for U,V */
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * planar2x: upscale one plane by 2x horizontally and vertically with
 * bilinear 3:1 weighting. First/last rows and columns are special-cased;
 * interior rows use an MMX path (PAVGB) or a scalar path computing
 * (3*a + b) >> 2 blends.
 */
1901 static inline void RENAME(planar2x
)(const uint8_t *src
, uint8_t *dst
, long srcWidth
, long srcHeight
, long srcStride
, long dstStride
)
/* first row: horizontal 3:1 blends only */
1908 for (x
=0; x
<srcWidth
-1; x
++){
1909 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1910 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1912 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1916 for (y
=1; y
<srcHeight
; y
++){
1917 #if HAVE_MMX2 || HAVE_AMD3DNOW
1918 const long mmxSize
= srcWidth
&~15;
1920 "mov %4, %%"REG_a
" \n\t"
1922 "movq (%0, %%"REG_a
"), %%mm0 \n\t"
1923 "movq (%1, %%"REG_a
"), %%mm1 \n\t"
1924 "movq 1(%0, %%"REG_a
"), %%mm2 \n\t"
1925 "movq 1(%1, %%"REG_a
"), %%mm3 \n\t"
1926 "movq -1(%0, %%"REG_a
"), %%mm4 \n\t"
1927 "movq -1(%1, %%"REG_a
"), %%mm5 \n\t"
/* NOTE(review): each PAVGB pair is applied twice — averaging twice with
 * the same register yields the 3:1 weighting; looks intentional, confirm
 * against upstream before "deduplicating" */
1928 PAVGB
" %%mm0, %%mm5 \n\t"
1929 PAVGB
" %%mm0, %%mm3 \n\t"
1930 PAVGB
" %%mm0, %%mm5 \n\t"
1931 PAVGB
" %%mm0, %%mm3 \n\t"
1932 PAVGB
" %%mm1, %%mm4 \n\t"
1933 PAVGB
" %%mm1, %%mm2 \n\t"
1934 PAVGB
" %%mm1, %%mm4 \n\t"
1935 PAVGB
" %%mm1, %%mm2 \n\t"
1936 "movq %%mm5, %%mm7 \n\t"
1937 "movq %%mm4, %%mm6 \n\t"
1938 "punpcklbw %%mm3, %%mm5 \n\t"
1939 "punpckhbw %%mm3, %%mm7 \n\t"
1940 "punpcklbw %%mm2, %%mm4 \n\t"
1941 "punpckhbw %%mm2, %%mm6 \n\t"
1943 MOVNTQ
" %%mm5, (%2, %%"REG_a
", 2) \n\t"
1944 MOVNTQ
" %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1945 MOVNTQ
" %%mm4, (%3, %%"REG_a
", 2) \n\t"
1946 MOVNTQ
" %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1948 "movq %%mm5, (%2, %%"REG_a
", 2) \n\t"
1949 "movq %%mm7, 8(%2, %%"REG_a
", 2) \n\t"
1950 "movq %%mm4, (%3, %%"REG_a
", 2) \n\t"
1951 "movq %%mm6, 8(%3, %%"REG_a
", 2) \n\t"
1953 "add $8, %%"REG_a
" \n\t"
/* asm walks a negative index, so all pointers are pre-biased by mmxSize */
1955 :: "r" (src
+ mmxSize
), "r" (src
+ srcStride
+ mmxSize
),
1956 "r" (dst
+ mmxSize
*2), "r" (dst
+ dstStride
+ mmxSize
*2),
/* non-MMX build: mmxSize=1 so the scalar loop covers the whole row */
1962 const long mmxSize
=1;
1964 dst
[0 ]= (3*src
[0] + src
[srcStride
])>>2;
1965 dst
[dstStride
]= ( src
[0] + 3*src
[srcStride
])>>2;
/* scalar interior: diagonal 3:1 blends for the four output pixels */
1967 for (x
=mmxSize
-1; x
<srcWidth
-1; x
++){
1968 dst
[2*x
+1]= (3*src
[x
+0] + src
[x
+srcStride
+1])>>2;
1969 dst
[2*x
+dstStride
+2]= ( src
[x
+0] + 3*src
[x
+srcStride
+1])>>2;
1970 dst
[2*x
+dstStride
+1]= ( src
[x
+1] + 3*src
[x
+srcStride
])>>2;
1971 dst
[2*x
+2]= (3*src
[x
+1] + src
[x
+srcStride
])>>2;
1973 dst
[srcWidth
*2 -1 ]= (3*src
[srcWidth
-1] + src
[srcWidth
-1 + srcStride
])>>2;
1974 dst
[srcWidth
*2 -1 + dstStride
]= ( src
[srcWidth
-1] + 3*src
[srcWidth
-1 + srcStride
])>>2;
/* last row: horizontal blends only, mirroring the first row */
1984 for (x
=0; x
<srcWidth
-1; x
++){
1985 dst
[2*x
+1]= (3*src
[x
] + src
[x
+1])>>2;
1986 dst
[2*x
+2]= ( src
[x
] + 3*src
[x
+1])>>2;
1988 dst
[2*srcWidth
-1]= src
[srcWidth
-1];
1990 for (x
=0; x
<srcWidth
; x
++){
1997 __asm__
volatile( EMMS
" \n\t"
2004 * Height should be a multiple of 2 and width should be a multiple of 16.
2005 * (If this is a problem for anyone then tell me, and I will fix it.)
2006 * Chrominance data is only taken from every second line, others are ignored.
2007 * FIXME: Write HQ version.
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * uyvytoyv12: packed UYVY -> planar YV12. Mirror image of yuy2toyv12:
 * here Y occupies the HIGH byte of each word, so the roles of the
 * pand(0x00FF mask) and psrlw $8 operations are swapped. Chroma is taken
 * from even rows only; odd rows extract Y alone.
 */
2009 static inline void RENAME(uyvytoyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2010 long width
, long height
,
2011 long lumStride
, long chromStride
, long srcStride
)
2014 const long chromWidth
= width
>>1;
2015 for (y
=0; y
<height
; y
+=2)
/* even row: split 16 UYVY pixels into 16 Y + 8 U + 8 V per iteration */
2019 "xor %%"REG_a
", %%"REG_a
" \n\t"
2020 "pcmpeqw %%mm7, %%mm7 \n\t"
2021 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2024 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2025 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // UYVY UYVY(0)
2026 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(4)
2027 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2028 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2029 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2030 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2031 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2032 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2033 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2034 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2036 MOVNTQ
" %%mm2, (%1, %%"REG_a
", 2) \n\t"
2038 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t" // UYVY UYVY(8)
2039 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t" // UYVY UYVY(12)
2040 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2041 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2042 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2043 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2044 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2045 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2046 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2047 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2049 MOVNTQ
" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
/* split interleaved UV words into the separate U and V planes */
2051 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2052 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2053 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2054 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2055 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2056 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2057 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2060 MOVNTQ
" %%mm0, (%3, %%"REG_a
") \n\t"
2061 MOVNTQ
" %%mm2, (%2, %%"REG_a
") \n\t"
2063 "add $8, %%"REG_a
" \n\t"
2064 "cmp %4, %%"REG_a
" \n\t"
2066 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2067 : "memory", "%"REG_a
/* odd row: Y only (Y is the high byte, hence psrlw not pand) */
2074 "xor %%"REG_a
", %%"REG_a
" \n\t"
2077 PREFETCH
" 64(%0, %%"REG_a
", 4) \n\t"
2078 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t" // YUYV YUYV(0)
2079 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t" // YUYV YUYV(4)
2080 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t" // YUYV YUYV(8)
2081 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t" // YUYV YUYV(12)
2082 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2083 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2084 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2085 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2086 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2087 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2089 MOVNTQ
" %%mm0, (%1, %%"REG_a
", 2) \n\t"
2090 MOVNTQ
" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
2092 "add $8, %%"REG_a
" \n\t"
2093 "cmp %4, %%"REG_a
" \n\t"
2096 ::"r"(src
), "r"(ydst
), "r"(udst
), "r"(vdst
), "g" (chromWidth
)
2097 : "memory", "%"REG_a
/* scalar fallback, even row */
2101 for (i
=0; i
<chromWidth
; i
++)
2103 udst
[i
] = src
[4*i
+0];
2104 ydst
[2*i
+0] = src
[4*i
+1];
2105 vdst
[i
] = src
[4*i
+2];
2106 ydst
[2*i
+1] = src
[4*i
+3];
/* scalar fallback, odd row: Y only */
2111 for (i
=0; i
<chromWidth
; i
++)
2113 ydst
[2*i
+0] = src
[4*i
+1];
2114 ydst
[2*i
+1] = src
[4*i
+3];
2117 udst
+= chromStride
;
2118 vdst
+= chromStride
;
2123 __asm__
volatile( EMMS
" \n\t"
2130 * Height should be a multiple of 2 and width should be a multiple of 2.
2131 * (If this is a problem for anyone then tell me, and I will fix it.)
2132 * Chrominance data is only taken from every second line,
2133 * others are ignored in the C version.
2134 * FIXME: Write HQ version.
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * rgb24toyv12: packed 24-bit RGB/BGR -> planar YV12. The MMX path computes
 * Y via pmaddwd dot products with ff_bgr2YCoeff for every row, and U/V via
 * ff_bgr2UCoeff/ff_bgr2VCoeff on 2x2 averaged pixels (chroma from row
 * pairs only). Scalar fallback uses the RY/GY/BY etc. integer coefficients
 * with RGB2YUV_SHIFT. Main loops stop at height-2; the scalar tail loop
 * below finishes the remaining rows.
 */
2136 static inline void RENAME(rgb24toyv12
)(const uint8_t *src
, uint8_t *ydst
, uint8_t *udst
, uint8_t *vdst
,
2137 long width
, long height
,
2138 long lumStride
, long chromStride
, long srcStride
)
2141 const long chromWidth
= width
>>1;
2143 for (y
=0; y
<height
-2; y
+=2)
/* ---- luma row: 8 Y values per iteration via 4-wide pmaddwd ---- */
2149 "mov %2, %%"REG_a
" \n\t"
2150 "movq "MANGLE(ff_bgr2YCoeff
)", %%mm6 \n\t"
2151 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2152 "pxor %%mm7, %%mm7 \n\t"
/* REG_d = 3*REG_a: byte offset into the 3-bytes-per-pixel source */
2153 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2156 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2157 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2158 "movd 3(%0, %%"REG_d
"), %%mm1 \n\t"
2159 "punpcklbw %%mm7, %%mm0 \n\t"
2160 "punpcklbw %%mm7, %%mm1 \n\t"
2161 "movd 6(%0, %%"REG_d
"), %%mm2 \n\t"
2162 "movd 9(%0, %%"REG_d
"), %%mm3 \n\t"
2163 "punpcklbw %%mm7, %%mm2 \n\t"
2164 "punpcklbw %%mm7, %%mm3 \n\t"
2165 "pmaddwd %%mm6, %%mm0 \n\t"
2166 "pmaddwd %%mm6, %%mm1 \n\t"
2167 "pmaddwd %%mm6, %%mm2 \n\t"
2168 "pmaddwd %%mm6, %%mm3 \n\t"
2169 #ifndef FAST_BGR2YV12
2170 "psrad $8, %%mm0 \n\t"
2171 "psrad $8, %%mm1 \n\t"
2172 "psrad $8, %%mm2 \n\t"
2173 "psrad $8, %%mm3 \n\t"
2175 "packssdw %%mm1, %%mm0 \n\t"
2176 "packssdw %%mm3, %%mm2 \n\t"
2177 "pmaddwd %%mm5, %%mm0 \n\t"
2178 "pmaddwd %%mm5, %%mm2 \n\t"
2179 "packssdw %%mm2, %%mm0 \n\t"
2180 "psraw $7, %%mm0 \n\t"
2182 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2183 "movd 15(%0, %%"REG_d
"), %%mm1 \n\t"
2184 "punpcklbw %%mm7, %%mm4 \n\t"
2185 "punpcklbw %%mm7, %%mm1 \n\t"
2186 "movd 18(%0, %%"REG_d
"), %%mm2 \n\t"
2187 "movd 21(%0, %%"REG_d
"), %%mm3 \n\t"
2188 "punpcklbw %%mm7, %%mm2 \n\t"
2189 "punpcklbw %%mm7, %%mm3 \n\t"
2190 "pmaddwd %%mm6, %%mm4 \n\t"
2191 "pmaddwd %%mm6, %%mm1 \n\t"
2192 "pmaddwd %%mm6, %%mm2 \n\t"
2193 "pmaddwd %%mm6, %%mm3 \n\t"
2194 #ifndef FAST_BGR2YV12
2195 "psrad $8, %%mm4 \n\t"
2196 "psrad $8, %%mm1 \n\t"
2197 "psrad $8, %%mm2 \n\t"
2198 "psrad $8, %%mm3 \n\t"
2200 "packssdw %%mm1, %%mm4 \n\t"
2201 "packssdw %%mm3, %%mm2 \n\t"
2202 "pmaddwd %%mm5, %%mm4 \n\t"
2203 "pmaddwd %%mm5, %%mm2 \n\t"
2204 "add $24, %%"REG_d
" \n\t"
2205 "packssdw %%mm2, %%mm4 \n\t"
2206 "psraw $7, %%mm4 \n\t"
2208 "packuswb %%mm4, %%mm0 \n\t"
2209 "paddusb "MANGLE(ff_bgr2YOffset
)", %%mm0 \n\t"
2211 MOVNTQ
" %%mm0, (%1, %%"REG_a
") \n\t"
2212 "add $8, %%"REG_a
" \n\t"
/* pointers biased to row end; REG_a counts up from -width */
2214 : : "r" (src
+width
*3), "r" (ydst
+width
), "g" (-width
)
2215 : "%"REG_a
, "%"REG_d
/* ---- chroma row pair: average 2x2 blocks, then U/V dot products ---- */
2222 "mov %4, %%"REG_a
" \n\t"
2223 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2224 "movq "MANGLE(ff_bgr2UCoeff
)", %%mm6 \n\t"
2225 "pxor %%mm7, %%mm7 \n\t"
2226 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
2227 "add %%"REG_d
", %%"REG_d
" \n\t"
2230 PREFETCH
" 64(%0, %%"REG_d
") \n\t"
2231 PREFETCH
" 64(%1, %%"REG_d
") \n\t"
2232 #if HAVE_MMX2 || HAVE_AMD3DNOW
/* fast 2x2 average using PAVGB across the two source rows */
2233 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
2234 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
2235 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
2236 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
2237 PAVGB
" %%mm1, %%mm0 \n\t"
2238 PAVGB
" %%mm3, %%mm2 \n\t"
2239 "movq %%mm0, %%mm1 \n\t"
2240 "movq %%mm2, %%mm3 \n\t"
2241 "psrlq $24, %%mm0 \n\t"
2242 "psrlq $24, %%mm2 \n\t"
2243 PAVGB
" %%mm1, %%mm0 \n\t"
2244 PAVGB
" %%mm3, %%mm2 \n\t"
2245 "punpcklbw %%mm7, %%mm0 \n\t"
2246 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX 2x2 average: widen to words, add, shift by 2 */
2248 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
2249 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
2250 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
2251 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
2252 "punpcklbw %%mm7, %%mm0 \n\t"
2253 "punpcklbw %%mm7, %%mm1 \n\t"
2254 "punpcklbw %%mm7, %%mm2 \n\t"
2255 "punpcklbw %%mm7, %%mm3 \n\t"
2256 "paddw %%mm1, %%mm0 \n\t"
2257 "paddw %%mm3, %%mm2 \n\t"
2258 "paddw %%mm2, %%mm0 \n\t"
2259 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
2260 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
2261 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
2262 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
2263 "punpcklbw %%mm7, %%mm4 \n\t"
2264 "punpcklbw %%mm7, %%mm1 \n\t"
2265 "punpcklbw %%mm7, %%mm2 \n\t"
2266 "punpcklbw %%mm7, %%mm3 \n\t"
2267 "paddw %%mm1, %%mm4 \n\t"
2268 "paddw %%mm3, %%mm2 \n\t"
2269 "paddw %%mm4, %%mm2 \n\t"
2270 "psrlw $2, %%mm0 \n\t"
2271 "psrlw $2, %%mm2 \n\t"
2273 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2274 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2276 "pmaddwd %%mm0, %%mm1 \n\t"
2277 "pmaddwd %%mm2, %%mm3 \n\t"
2278 "pmaddwd %%mm6, %%mm0 \n\t"
2279 "pmaddwd %%mm6, %%mm2 \n\t"
2280 #ifndef FAST_BGR2YV12
2281 "psrad $8, %%mm0 \n\t"
2282 "psrad $8, %%mm1 \n\t"
2283 "psrad $8, %%mm2 \n\t"
2284 "psrad $8, %%mm3 \n\t"
2286 "packssdw %%mm2, %%mm0 \n\t"
2287 "packssdw %%mm3, %%mm1 \n\t"
2288 "pmaddwd %%mm5, %%mm0 \n\t"
2289 "pmaddwd %%mm5, %%mm1 \n\t"
2290 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2291 "psraw $7, %%mm0 \n\t"
2293 #if HAVE_MMX2 || HAVE_AMD3DNOW
2294 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
2295 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
2296 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
2297 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
2298 PAVGB
" %%mm1, %%mm4 \n\t"
2299 PAVGB
" %%mm3, %%mm2 \n\t"
2300 "movq %%mm4, %%mm1 \n\t"
2301 "movq %%mm2, %%mm3 \n\t"
2302 "psrlq $24, %%mm4 \n\t"
2303 "psrlq $24, %%mm2 \n\t"
2304 PAVGB
" %%mm1, %%mm4 \n\t"
2305 PAVGB
" %%mm3, %%mm2 \n\t"
2306 "punpcklbw %%mm7, %%mm4 \n\t"
2307 "punpcklbw %%mm7, %%mm2 \n\t"
2309 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
2310 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
2311 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
2312 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
2313 "punpcklbw %%mm7, %%mm4 \n\t"
2314 "punpcklbw %%mm7, %%mm1 \n\t"
2315 "punpcklbw %%mm7, %%mm2 \n\t"
2316 "punpcklbw %%mm7, %%mm3 \n\t"
2317 "paddw %%mm1, %%mm4 \n\t"
2318 "paddw %%mm3, %%mm2 \n\t"
2319 "paddw %%mm2, %%mm4 \n\t"
2320 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
2321 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
2322 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
2323 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
2324 "punpcklbw %%mm7, %%mm5 \n\t"
2325 "punpcklbw %%mm7, %%mm1 \n\t"
2326 "punpcklbw %%mm7, %%mm2 \n\t"
2327 "punpcklbw %%mm7, %%mm3 \n\t"
2328 "paddw %%mm1, %%mm5 \n\t"
2329 "paddw %%mm3, %%mm2 \n\t"
2330 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above as a scratch register — reload the w1111 constant */
2331 "movq "MANGLE(ff_w1111
)", %%mm5 \n\t"
2332 "psrlw $2, %%mm4 \n\t"
2333 "psrlw $2, %%mm2 \n\t"
2335 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm1 \n\t"
2336 "movq "MANGLE(ff_bgr2VCoeff
)", %%mm3 \n\t"
2338 "pmaddwd %%mm4, %%mm1 \n\t"
2339 "pmaddwd %%mm2, %%mm3 \n\t"
2340 "pmaddwd %%mm6, %%mm4 \n\t"
2341 "pmaddwd %%mm6, %%mm2 \n\t"
2342 #ifndef FAST_BGR2YV12
2343 "psrad $8, %%mm4 \n\t"
2344 "psrad $8, %%mm1 \n\t"
2345 "psrad $8, %%mm2 \n\t"
2346 "psrad $8, %%mm3 \n\t"
2348 "packssdw %%mm2, %%mm4 \n\t"
2349 "packssdw %%mm3, %%mm1 \n\t"
2350 "pmaddwd %%mm5, %%mm4 \n\t"
2351 "pmaddwd %%mm5, %%mm1 \n\t"
2352 "add $24, %%"REG_d
" \n\t"
2353 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2354 "psraw $7, %%mm4 \n\t"
2356 "movq %%mm0, %%mm1 \n\t"
2357 "punpckldq %%mm4, %%mm0 \n\t"
2358 "punpckhdq %%mm4, %%mm1 \n\t"
2359 "packsswb %%mm1, %%mm0 \n\t"
2360 "paddb "MANGLE(ff_bgr2UVOffset
)", %%mm0 \n\t"
2361 "movd %%mm0, (%2, %%"REG_a
") \n\t"
2362 "punpckhdq %%mm0, %%mm0 \n\t"
2363 "movd %%mm0, (%3, %%"REG_a
") \n\t"
2364 "add $4, %%"REG_a
" \n\t"
2366 : : "r" (src
+chromWidth
*6), "r" (src
+srcStride
+chromWidth
*6), "r" (udst
+chromWidth
), "r" (vdst
+chromWidth
), "g" (-chromWidth
)
2367 : "%"REG_a
, "%"REG_d
2370 udst
+= chromStride
;
2371 vdst
+= chromStride
;
2375 __asm__
volatile( EMMS
" \n\t"
/* ---- scalar tail: remaining rows (and full frame in non-MMX builds) ---- */
2381 for (; y
<height
; y
+=2)
2384 for (i
=0; i
<chromWidth
; i
++)
2386 unsigned int b
= src
[6*i
+0];
2387 unsigned int g
= src
[6*i
+1];
2388 unsigned int r
= src
[6*i
+2];
2390 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2391 unsigned int V
= ((RV
*r
+ GV
*g
+ BV
*b
)>>RGB2YUV_SHIFT
) + 128;
2392 unsigned int U
= ((RU
*r
+ GU
*g
+ BU
*b
)>>RGB2YUV_SHIFT
) + 128;
2402 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
/* second row of the pair: luma only */
2408 for (i
=0; i
<chromWidth
; i
++)
2410 unsigned int b
= src
[6*i
+0];
2411 unsigned int g
= src
[6*i
+1];
2412 unsigned int r
= src
[6*i
+2];
2414 unsigned int Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2422 Y
= ((RY
*r
+ GY
*g
+ BY
*b
)>>RGB2YUV_SHIFT
) + 16;
2425 udst
+= chromStride
;
2426 vdst
+= chromStride
;
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * interleaveBytes: merge two byte planes into one interleaved plane
 * (dest[2i]=src1[i], dest[2i+1]=src2[i]) row by row. SSE2 path (movdqa +
 * movntdq, 16 bytes/iteration), MMX path (punpck, 16 bytes/iteration),
 * scalar tail for the last width&15 bytes, and a pure-C fallback.
 * NOTE(review): the movdqa loads presumably require 16-byte-aligned
 * src1/src2 rows — confirm caller guarantees before relying on SSE2 path.
 */
2432 static void RENAME(interleaveBytes
)(uint8_t *src1
, uint8_t *src2
, uint8_t *dest
,
2433 long width
, long height
, long src1Stride
,
2434 long src2Stride
, long dstStride
){
2437 for (h
=0; h
< height
; h
++)
/* SSE2 path */
2444 "xor %%"REG_a
", %%"REG_a
" \n\t"
2446 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2447 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2448 "movdqa (%1, %%"REG_a
"), %%xmm0 \n\t"
2449 "movdqa (%1, %%"REG_a
"), %%xmm1 \n\t"
2450 "movdqa (%2, %%"REG_a
"), %%xmm2 \n\t"
2451 "punpcklbw %%xmm2, %%xmm0 \n\t"
2452 "punpckhbw %%xmm2, %%xmm1 \n\t"
2453 "movntdq %%xmm0, (%0, %%"REG_a
", 2) \n\t"
2454 "movntdq %%xmm1, 16(%0, %%"REG_a
", 2) \n\t"
2455 "add $16, %%"REG_a
" \n\t"
2456 "cmp %3, %%"REG_a
" \n\t"
2458 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" (width
-15)
2459 : "memory", "%"REG_a
""
/* MMX path */
2463 "xor %%"REG_a
", %%"REG_a
" \n\t"
2465 PREFETCH
" 64(%1, %%"REG_a
") \n\t"
2466 PREFETCH
" 64(%2, %%"REG_a
") \n\t"
2467 "movq (%1, %%"REG_a
"), %%mm0 \n\t"
2468 "movq 8(%1, %%"REG_a
"), %%mm2 \n\t"
2469 "movq %%mm0, %%mm1 \n\t"
2470 "movq %%mm2, %%mm3 \n\t"
2471 "movq (%2, %%"REG_a
"), %%mm4 \n\t"
2472 "movq 8(%2, %%"REG_a
"), %%mm5 \n\t"
2473 "punpcklbw %%mm4, %%mm0 \n\t"
2474 "punpckhbw %%mm4, %%mm1 \n\t"
2475 "punpcklbw %%mm5, %%mm2 \n\t"
2476 "punpckhbw %%mm5, %%mm3 \n\t"
2477 MOVNTQ
" %%mm0, (%0, %%"REG_a
", 2) \n\t"
2478 MOVNTQ
" %%mm1, 8(%0, %%"REG_a
", 2) \n\t"
2479 MOVNTQ
" %%mm2, 16(%0, %%"REG_a
", 2) \n\t"
2480 MOVNTQ
" %%mm3, 24(%0, %%"REG_a
", 2) \n\t"
2481 "add $16, %%"REG_a
" \n\t"
2482 "cmp %3, %%"REG_a
" \n\t"
2484 ::"r"(dest
), "r"(src1
), "r"(src2
), "r" (width
-15)
2485 : "memory", "%"REG_a
/* scalar tail after the SIMD loop */
2488 for (w
= (width
&(~15)); w
< width
; w
++)
2490 dest
[2*w
+0] = src1
[w
];
2491 dest
[2*w
+1] = src2
[w
];
/* pure-C fallback for the whole row */
2494 for (w
=0; w
< width
; w
++)
2496 dest
[2*w
+0] = src1
[w
];
2497 dest
[2*w
+1] = src2
[w
];
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * vu9_to_vu12: double two chroma planes horizontally (each source byte is
 * written twice) and vertically (each source row read for two output rows
 * via y>>1). MMX path widens 32 bytes/iteration with self-punpck;
 * scalar tail duplicates the remaining bytes.
 */
2513 static inline void RENAME(vu9_to_vu12
)(const uint8_t *src1
, const uint8_t *src2
,
2514 uint8_t *dst1
, uint8_t *dst2
,
2515 long width
, long height
,
2516 long srcStride1
, long srcStride2
,
2517 long dstStride1
, long dstStride2
)
2520 w
=width
/2; h
=height
/2;
2525 ::"m"(*(src1
+srcStride1
)),"m"(*(src2
+srcStride2
)):"memory");
/* first plane: each output row y reads source row y>>1 */
2528 const uint8_t* s1
=src1
+srcStride1
*(y
>>1);
2529 uint8_t* d
=dst1
+dstStride1
*y
;
2535 PREFETCH
" 32%1 \n\t"
2536 "movq %1, %%mm0 \n\t"
2537 "movq 8%1, %%mm2 \n\t"
2538 "movq 16%1, %%mm4 \n\t"
2539 "movq 24%1, %%mm6 \n\t"
2540 "movq %%mm0, %%mm1 \n\t"
2541 "movq %%mm2, %%mm3 \n\t"
2542 "movq %%mm4, %%mm5 \n\t"
2543 "movq %%mm6, %%mm7 \n\t"
/* punpck a register with itself duplicates every byte: AB -> AABB */
2544 "punpcklbw %%mm0, %%mm0 \n\t"
2545 "punpckhbw %%mm1, %%mm1 \n\t"
2546 "punpcklbw %%mm2, %%mm2 \n\t"
2547 "punpckhbw %%mm3, %%mm3 \n\t"
2548 "punpcklbw %%mm4, %%mm4 \n\t"
2549 "punpckhbw %%mm5, %%mm5 \n\t"
2550 "punpcklbw %%mm6, %%mm6 \n\t"
2551 "punpckhbw %%mm7, %%mm7 \n\t"
2552 MOVNTQ
" %%mm0, %0 \n\t"
2553 MOVNTQ
" %%mm1, 8%0 \n\t"
2554 MOVNTQ
" %%mm2, 16%0 \n\t"
2555 MOVNTQ
" %%mm3, 24%0 \n\t"
2556 MOVNTQ
" %%mm4, 32%0 \n\t"
2557 MOVNTQ
" %%mm5, 40%0 \n\t"
2558 MOVNTQ
" %%mm6, 48%0 \n\t"
2559 MOVNTQ
" %%mm7, 56%0"
2565 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s1
[x
];
/* second plane: identical processing with src2/dst2 */
2568 const uint8_t* s2
=src2
+srcStride2
*(y
>>1);
2569 uint8_t* d
=dst2
+dstStride2
*y
;
2575 PREFETCH
" 32%1 \n\t"
2576 "movq %1, %%mm0 \n\t"
2577 "movq 8%1, %%mm2 \n\t"
2578 "movq 16%1, %%mm4 \n\t"
2579 "movq 24%1, %%mm6 \n\t"
2580 "movq %%mm0, %%mm1 \n\t"
2581 "movq %%mm2, %%mm3 \n\t"
2582 "movq %%mm4, %%mm5 \n\t"
2583 "movq %%mm6, %%mm7 \n\t"
2584 "punpcklbw %%mm0, %%mm0 \n\t"
2585 "punpckhbw %%mm1, %%mm1 \n\t"
2586 "punpcklbw %%mm2, %%mm2 \n\t"
2587 "punpckhbw %%mm3, %%mm3 \n\t"
2588 "punpcklbw %%mm4, %%mm4 \n\t"
2589 "punpckhbw %%mm5, %%mm5 \n\t"
2590 "punpcklbw %%mm6, %%mm6 \n\t"
2591 "punpckhbw %%mm7, %%mm7 \n\t"
2592 MOVNTQ
" %%mm0, %0 \n\t"
2593 MOVNTQ
" %%mm1, 8%0 \n\t"
2594 MOVNTQ
" %%mm2, 16%0 \n\t"
2595 MOVNTQ
" %%mm3, 24%0 \n\t"
2596 MOVNTQ
" %%mm4, 32%0 \n\t"
2597 MOVNTQ
" %%mm5, 40%0 \n\t"
2598 MOVNTQ
" %%mm6, 48%0 \n\t"
2599 MOVNTQ
" %%mm7, 56%0"
2605 for (;x
<w
;x
++) d
[2*x
]=d
[2*x
+1]=s2
[x
];
/*
 * NOTE(review): garbled extraction — original line numbers fused into the
 * text, some lines missing; code preserved byte-for-byte, comments only.
 *
 * yvu9_to_yuy2: interleave a full-resolution Y plane (src1) with 4x
 * horizontally/vertically subsampled U (src2) and V (src3) planes into
 * packed YUYV. Each chroma sample is duplicated across 4 luma samples
 * (punpcklbw with self), chroma rows advance every 4 output rows (y>>2).
 * MMX path emits 64 output bytes per iteration; scalar tail visible below.
 */
2616 static inline void RENAME(yvu9_to_yuy2
)(const uint8_t *src1
, const uint8_t *src2
, const uint8_t *src3
,
2618 long width
, long height
,
2619 long srcStride1
, long srcStride2
,
2620 long srcStride3
, long dstStride
)
2623 w
=width
/2; h
=height
;
2625 const uint8_t* yp
=src1
+srcStride1
*y
;
2626 const uint8_t* up
=src2
+srcStride2
*(y
>>2);
2627 const uint8_t* vp
=src3
+srcStride3
*(y
>>2);
2628 uint8_t* d
=dst
+dstStride
*y
;
2634 PREFETCH
" 32(%1, %0) \n\t"
2635 PREFETCH
" 32(%2, %0) \n\t"
2636 PREFETCH
" 32(%3, %0) \n\t"
2637 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2638 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2639 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2640 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2641 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2642 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2643 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2644 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2645 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2646 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2648 "movq %%mm1, %%mm6 \n\t"
2649 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2650 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2651 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2652 MOVNTQ
" %%mm0, (%4, %0, 8) \n\t"
2653 MOVNTQ
" %%mm3, 8(%4, %0, 8) \n\t"
2655 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2656 "movq 8(%1, %0, 4), %%mm0 \n\t"
2657 "movq %%mm0, %%mm3 \n\t"
2658 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2659 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2660 MOVNTQ
" %%mm0, 16(%4, %0, 8) \n\t"
2661 MOVNTQ
" %%mm3, 24(%4, %0, 8) \n\t"
2663 "movq %%mm4, %%mm6 \n\t"
2664 "movq 16(%1, %0, 4), %%mm0 \n\t"
2665 "movq %%mm0, %%mm3 \n\t"
2666 "punpcklbw %%mm5, %%mm4 \n\t"
2667 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2668 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2669 MOVNTQ
" %%mm0, 32(%4, %0, 8) \n\t"
2670 MOVNTQ
" %%mm3, 40(%4, %0, 8) \n\t"
2672 "punpckhbw %%mm5, %%mm6 \n\t"
2673 "movq 24(%1, %0, 4), %%mm0 \n\t"
2674 "movq %%mm0, %%mm3 \n\t"
2675 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2676 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2677 MOVNTQ
" %%mm0, 48(%4, %0, 8) \n\t"
2678 MOVNTQ
" %%mm3, 56(%4, %0, 8) \n\t"
2681 : "r"(yp
), "r" (up
), "r"(vp
), "r"(d
)
/* scalar tail: 4 luma samples per chroma sample (even-offset stores
 * for U/V appear lost in extraction — confirm against upstream) */
2687 const long x2
= x
<<2;
2690 d
[8*x
+2] = yp
[x2
+1];
2692 d
[8*x
+4] = yp
[x2
+2];
2694 d
[8*x
+6] = yp
[x2
+3];
2707 static inline void RENAME(rgb2rgb_init
)(void){
2708 rgb15to16
= RENAME(rgb15to16
);
2709 rgb15tobgr24
= RENAME(rgb15tobgr24
);
2710 rgb15to32
= RENAME(rgb15to32
);
2711 rgb16tobgr24
= RENAME(rgb16tobgr24
);
2712 rgb16to32
= RENAME(rgb16to32
);
2713 rgb16to15
= RENAME(rgb16to15
);
2714 rgb24tobgr16
= RENAME(rgb24tobgr16
);
2715 rgb24tobgr15
= RENAME(rgb24tobgr15
);
2716 rgb24tobgr32
= RENAME(rgb24tobgr32
);
2717 rgb32to16
= RENAME(rgb32to16
);
2718 rgb32to15
= RENAME(rgb32to15
);
2719 rgb32tobgr24
= RENAME(rgb32tobgr24
);
2720 rgb24to15
= RENAME(rgb24to15
);
2721 rgb24to16
= RENAME(rgb24to16
);
2722 rgb24tobgr24
= RENAME(rgb24tobgr24
);
2723 rgb32tobgr32
= RENAME(rgb32tobgr32
);
2724 rgb32tobgr16
= RENAME(rgb32tobgr16
);
2725 rgb32tobgr15
= RENAME(rgb32tobgr15
);
2726 yv12toyuy2
= RENAME(yv12toyuy2
);
2727 yv12touyvy
= RENAME(yv12touyvy
);
2728 yuv422ptoyuy2
= RENAME(yuv422ptoyuy2
);
2729 yuv422ptouyvy
= RENAME(yuv422ptouyvy
);
2730 yuy2toyv12
= RENAME(yuy2toyv12
);
2731 // uyvytoyv12 = RENAME(uyvytoyv12);
2732 // yvu9toyv12 = RENAME(yvu9toyv12);
2733 planar2x
= RENAME(planar2x
);
2734 rgb24toyv12
= RENAME(rgb24toyv12
);
2735 interleaveBytes
= RENAME(interleaveBytes
);
2736 vu9_to_vu12
= RENAME(vu9_to_vu12
);
2737 yvu9_to_yuy2
= RENAME(yvu9_to_yuy2
);