typo fix
[mplayer/greg.git] / libswscale / rgb2rgb_template.c
blob5daf089a71e52a187992db0a9e633ddaf879ed90
1 /*
2 * rgb2rgb.c, Software RGB to RGB converter
3 * pluralize by Software PAL8 to RGB converter
4 * Software YUV to YUV converter
5 * Software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte-order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, mmx, ...) of this file can be used
27 * under the LGPL license.
30 #include <stddef.h>
31 #include <inttypes.h> /* for __WORDSIZE */
/* Provide __WORDSIZE when the system headers did not define it. MP_WORDSIZE is
 * not defined in this chunk - presumably it comes from the build configuration. */
33 #ifndef __WORDSIZE
34 // #warning You have a misconfigured system and will probably lose performance!
35 #define __WORDSIZE MP_WORDSIZE
36 #endif
/* Remove any earlier definitions of the helper macros that are (re)defined below. */
38 #undef PREFETCH
39 #undef MOVNTQ
40 #undef EMMS
41 #undef SFENCE
42 #undef MMREG_SIZE
43 #undef PREFETCHW
44 #undef PAVGB
/* MMREG_SIZE is 16 when HAVE_SSE2 is defined and 8 otherwise
 * (presumably the SIMD register size in bytes - TODO confirm). */
46 #ifdef HAVE_SSE2
47 #define MMREG_SIZE 16
48 #else
49 #define MMREG_SIZE 8
50 #endif
/* Instruction mnemonics, as string fragments pasted into the inline asm below.
 * 3DNow! and MMX2 builds get real prefetch/average instructions; otherwise
 * PREFETCH/PREFETCHW expand to "#"-prefixed text (intended to be an assembler
 * comment, i.e. no instruction) and PAVGB is left undefined. */
52 #ifdef HAVE_3DNOW
53 #define PREFETCH "prefetch"
54 #define PREFETCHW "prefetchw"
55 #define PAVGB "pavgusb"
56 #elif defined ( HAVE_MMX2 )
57 #define PREFETCH "prefetchnta"
58 #define PREFETCHW "prefetcht0"
59 #define PAVGB "pavgb"
60 #else
61 #ifdef __APPLE__
62 #define PREFETCH "#"
63 #define PREFETCHW "#"
64 #else
65 #define PREFETCH " # nop"
66 #define PREFETCHW " # nop"
67 #endif
68 #endif
/* EMMS: instruction issued after each MMX loop in this file. */
70 #ifdef HAVE_3DNOW
71 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
72 #define EMMS "femms"
73 #else
74 #define EMMS "emms"
75 #endif
/* MOVNTQ/SFENCE: with MMX2 the stores use movntq and are followed by sfence;
 * without it MOVNTQ falls back to a plain movq and SFENCE to " # nop". */
77 #ifdef HAVE_MMX2
78 #define MOVNTQ "movntq"
79 #define SFENCE "sfence"
80 #else
81 #define MOVNTQ "movq"
82 #define SFENCE " # nop"
83 #endif
85 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
87 uint8_t *dest = dst;
88 const uint8_t *s = src;
89 const uint8_t *end;
90 #ifdef HAVE_MMX
91 const uint8_t *mm_end;
92 #endif
93 end = s + src_size;
94 #ifdef HAVE_MMX
95 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
96 mm_end = end - 23;
97 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
98 while (s < mm_end)
100 __asm __volatile(
101 PREFETCH" 32%1 \n\t"
102 "movd %1, %%mm0 \n\t"
103 "punpckldq 3%1, %%mm0 \n\t"
104 "movd 6%1, %%mm1 \n\t"
105 "punpckldq 9%1, %%mm1 \n\t"
106 "movd 12%1, %%mm2 \n\t"
107 "punpckldq 15%1, %%mm2 \n\t"
108 "movd 18%1, %%mm3 \n\t"
109 "punpckldq 21%1, %%mm3 \n\t"
110 "pand %%mm7, %%mm0 \n\t"
111 "pand %%mm7, %%mm1 \n\t"
112 "pand %%mm7, %%mm2 \n\t"
113 "pand %%mm7, %%mm3 \n\t"
114 MOVNTQ" %%mm0, %0 \n\t"
115 MOVNTQ" %%mm1, 8%0 \n\t"
116 MOVNTQ" %%mm2, 16%0 \n\t"
117 MOVNTQ" %%mm3, 24%0"
118 :"=m"(*dest)
119 :"m"(*s)
120 :"memory");
121 dest += 32;
122 s += 24;
124 __asm __volatile(SFENCE:::"memory");
125 __asm __volatile(EMMS:::"memory");
126 #endif
127 while (s < end)
129 #ifdef WORDS_BIGENDIAN
130 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
131 *dest++ = 0;
132 *dest++ = s[2];
133 *dest++ = s[1];
134 *dest++ = s[0];
135 s+=3;
136 #else
137 *dest++ = *s++;
138 *dest++ = *s++;
139 *dest++ = *s++;
140 *dest++ = 0;
141 #endif
/*
 * Convert packed 32-bit pixels to packed 24-bit pixels.
 * src_size is the number of source bytes; every 4 source bytes yield 3
 * destination bytes. The scalar path drops the 4th byte of each pixel, or,
 * with WORDS_BIGENDIAN, drops the 1st byte and stores the other three reversed.
 */
145 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
147 uint8_t *dest = dst;
148 const uint8_t *s = src;
149 const uint8_t *end;
150 #ifdef HAVE_MMX
151 const uint8_t *mm_end;
152 #endif
153 end = s + src_size;
154 #ifdef HAVE_MMX
155 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop reads 32 bytes per iteration, so it needs 32 bytes left. */
156 mm_end = end - 31;
157 while (s < mm_end)
159 __asm __volatile(
160 PREFETCH" 32%1 \n\t"
/* Load 8 pixels into mm0, mm1, mm4, mm5 and keep copies shifted right by 8 bits. */
161 "movq %1, %%mm0 \n\t"
162 "movq 8%1, %%mm1 \n\t"
163 "movq 16%1, %%mm4 \n\t"
164 "movq 24%1, %%mm5 \n\t"
165 "movq %%mm0, %%mm2 \n\t"
166 "movq %%mm1, %%mm3 \n\t"
167 "movq %%mm4, %%mm6 \n\t"
168 "movq %%mm5, %%mm7 \n\t"
169 "psrlq $8, %%mm2 \n\t"
170 "psrlq $8, %%mm3 \n\t"
171 "psrlq $8, %%mm6 \n\t"
172 "psrlq $8, %%mm7 \n\t"
/* Combine the unshifted part (mask24l) with the shifted part (mask24h);
   both masks are defined elsewhere. */
173 "pand %2, %%mm0 \n\t"
174 "pand %2, %%mm1 \n\t"
175 "pand %2, %%mm4 \n\t"
176 "pand %2, %%mm5 \n\t"
177 "pand %3, %%mm2 \n\t"
178 "pand %3, %%mm3 \n\t"
179 "pand %3, %%mm6 \n\t"
180 "pand %3, %%mm7 \n\t"
181 "por %%mm2, %%mm0 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "por %%mm6, %%mm4 \n\t"
184 "por %%mm7, %%mm5 \n\t"
/* Shift and merge the four registers into three for the 24-byte store. */
186 "movq %%mm1, %%mm2 \n\t"
187 "movq %%mm4, %%mm3 \n\t"
188 "psllq $48, %%mm2 \n\t"
189 "psllq $32, %%mm3 \n\t"
190 "pand %4, %%mm2 \n\t"
191 "pand %5, %%mm3 \n\t"
192 "por %%mm2, %%mm0 \n\t"
193 "psrlq $16, %%mm1 \n\t"
194 "psrlq $32, %%mm4 \n\t"
195 "psllq $16, %%mm5 \n\t"
196 "por %%mm3, %%mm1 \n\t"
197 "pand %6, %%mm5 \n\t"
198 "por %%mm5, %%mm4 \n\t"
200 MOVNTQ" %%mm0, %0 \n\t"
201 MOVNTQ" %%mm1, 8%0 \n\t"
202 MOVNTQ" %%mm4, 16%0"
203 :"=m"(*dest)
204 :"m"(*s),"m"(mask24l),
205 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
206 :"memory");
/* 32 source bytes consumed, 24 destination bytes produced. */
207 dest += 24;
208 s += 32;
210 __asm __volatile(SFENCE:::"memory");
211 __asm __volatile(EMMS:::"memory");
212 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
213 while (s < end)
215 #ifdef WORDS_BIGENDIAN
216 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
217 s++;
218 dest[2] = *s++;
219 dest[1] = *s++;
220 dest[0] = *s++;
221 dest += 3;
222 #else
223 *dest++ = *s++;
224 *dest++ = *s++;
225 *dest++ = *s++;
226 s++;
227 #endif
232 Original by Strepto/Astral
233 ported to gcc & bugfixed : A'rpi
234 MMX2, 3DNOW optimization by Nick Kurshev
235 32 bit C version, and and&add trick by Michael Niedermayer
237 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
239 register const uint8_t* s=src;
240 register uint8_t* d=dst;
241 register const uint8_t *end;
242 const uint8_t *mm_end;
243 end = s + src_size;
244 #ifdef HAVE_MMX
245 __asm __volatile(PREFETCH" %0"::"m"(*s));
246 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
247 mm_end = end - 15;
248 while (s<mm_end)
250 __asm __volatile(
251 PREFETCH" 32%1 \n\t"
252 "movq %1, %%mm0 \n\t"
253 "movq 8%1, %%mm2 \n\t"
254 "movq %%mm0, %%mm1 \n\t"
255 "movq %%mm2, %%mm3 \n\t"
256 "pand %%mm4, %%mm0 \n\t"
257 "pand %%mm4, %%mm2 \n\t"
258 "paddw %%mm1, %%mm0 \n\t"
259 "paddw %%mm3, %%mm2 \n\t"
260 MOVNTQ" %%mm0, %0 \n\t"
261 MOVNTQ" %%mm2, 8%0"
262 :"=m"(*d)
263 :"m"(*s)
265 d+=16;
266 s+=16;
268 __asm __volatile(SFENCE:::"memory");
269 __asm __volatile(EMMS:::"memory");
270 #endif
271 mm_end = end - 3;
272 while (s < mm_end)
274 register unsigned x= *((uint32_t *)s);
275 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
276 d+=4;
277 s+=4;
279 if (s < end)
281 register unsigned short x= *((uint16_t *)s);
282 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
286 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
288 register const uint8_t* s=src;
289 register uint8_t* d=dst;
290 register const uint8_t *end;
291 const uint8_t *mm_end;
292 end = s + src_size;
293 #ifdef HAVE_MMX
294 __asm __volatile(PREFETCH" %0"::"m"(*s));
295 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
296 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
297 mm_end = end - 15;
298 while (s<mm_end)
300 __asm __volatile(
301 PREFETCH" 32%1 \n\t"
302 "movq %1, %%mm0 \n\t"
303 "movq 8%1, %%mm2 \n\t"
304 "movq %%mm0, %%mm1 \n\t"
305 "movq %%mm2, %%mm3 \n\t"
306 "psrlq $1, %%mm0 \n\t"
307 "psrlq $1, %%mm2 \n\t"
308 "pand %%mm7, %%mm0 \n\t"
309 "pand %%mm7, %%mm2 \n\t"
310 "pand %%mm6, %%mm1 \n\t"
311 "pand %%mm6, %%mm3 \n\t"
312 "por %%mm1, %%mm0 \n\t"
313 "por %%mm3, %%mm2 \n\t"
314 MOVNTQ" %%mm0, %0 \n\t"
315 MOVNTQ" %%mm2, 8%0"
316 :"=m"(*d)
317 :"m"(*s)
319 d+=16;
320 s+=16;
322 __asm __volatile(SFENCE:::"memory");
323 __asm __volatile(EMMS:::"memory");
324 #endif
325 mm_end = end - 3;
326 while (s < mm_end)
328 register uint32_t x= *((uint32_t *)s);
329 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
330 s+=4;
331 d+=4;
333 if (s < end)
335 register uint16_t x= *((uint16_t *)s);
336 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
337 s+=2;
338 d+=2;
342 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
344 const uint8_t *s = src;
345 const uint8_t *end;
346 #ifdef HAVE_MMX
347 const uint8_t *mm_end;
348 #endif
349 uint16_t *d = (uint16_t *)dst;
350 end = s + src_size;
351 #ifdef HAVE_MMX
352 mm_end = end - 15;
353 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
354 asm volatile(
355 "movq %3, %%mm5 \n\t"
356 "movq %4, %%mm6 \n\t"
357 "movq %5, %%mm7 \n\t"
358 "jmp 2f \n\t"
359 ASMALIGN(4)
360 "1: \n\t"
361 PREFETCH" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ" %%mm0, (%0) \n\t"
380 "add $16, %1 \n\t"
381 "add $8, %0 \n\t"
382 "2: \n\t"
383 "cmp %2, %1 \n\t"
384 " jb 1b \n\t"
385 : "+r" (d), "+r"(s)
386 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
388 #else
389 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
390 __asm __volatile(
391 "movq %0, %%mm7 \n\t"
392 "movq %1, %%mm6 \n\t"
393 ::"m"(red_16mask),"m"(green_16mask));
394 while (s < mm_end)
396 __asm __volatile(
397 PREFETCH" 32%1 \n\t"
398 "movd %1, %%mm0 \n\t"
399 "movd 4%1, %%mm3 \n\t"
400 "punpckldq 8%1, %%mm0 \n\t"
401 "punpckldq 12%1, %%mm3 \n\t"
402 "movq %%mm0, %%mm1 \n\t"
403 "movq %%mm0, %%mm2 \n\t"
404 "movq %%mm3, %%mm4 \n\t"
405 "movq %%mm3, %%mm5 \n\t"
406 "psrlq $3, %%mm0 \n\t"
407 "psrlq $3, %%mm3 \n\t"
408 "pand %2, %%mm0 \n\t"
409 "pand %2, %%mm3 \n\t"
410 "psrlq $5, %%mm1 \n\t"
411 "psrlq $5, %%mm4 \n\t"
412 "pand %%mm6, %%mm1 \n\t"
413 "pand %%mm6, %%mm4 \n\t"
414 "psrlq $8, %%mm2 \n\t"
415 "psrlq $8, %%mm5 \n\t"
416 "pand %%mm7, %%mm2 \n\t"
417 "pand %%mm7, %%mm5 \n\t"
418 "por %%mm1, %%mm0 \n\t"
419 "por %%mm4, %%mm3 \n\t"
420 "por %%mm2, %%mm0 \n\t"
421 "por %%mm5, %%mm3 \n\t"
422 "psllq $16, %%mm3 \n\t"
423 "por %%mm3, %%mm0 \n\t"
424 MOVNTQ" %%mm0, %0 \n\t"
425 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
426 d += 4;
427 s += 16;
429 #endif
430 __asm __volatile(SFENCE:::"memory");
431 __asm __volatile(EMMS:::"memory");
432 #endif
433 while (s < end)
435 register int rgb = *(uint32_t*)s; s += 4;
436 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * Convert packed 32-bit pixels to 16-bit pixels with the outer channels swapped.
 * src_size is the number of source bytes (4 per pixel); 2 bytes are written per pixel.
 * Scalar path: bits 3..7 of the 32-bit word go to bits 11..15, bits 10..15 to
 * bits 5..10 and bits 19..23 to bits 0..4 of the result.
 */
440 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
442 const uint8_t *s = src;
443 const uint8_t *end;
444 #ifdef HAVE_MMX
445 const uint8_t *mm_end;
446 #endif
447 uint16_t *d = (uint16_t *)dst;
448 end = s + src_size;
449 #ifdef HAVE_MMX
450 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 / mm6 hold red_16mask / green_16mask (defined elsewhere) for the whole loop. */
451 __asm __volatile(
452 "movq %0, %%mm7 \n\t"
453 "movq %1, %%mm6 \n\t"
454 ::"m"(red_16mask),"m"(green_16mask));
/* Each iteration reads 16 source bytes (4 pixels). */
455 mm_end = end - 15;
456 while (s < mm_end)
458 __asm __volatile(
459 PREFETCH" 32%1 \n\t"
/* Pixels 0 and 2 are gathered in mm0, pixels 1 and 3 in mm3. */
460 "movd %1, %%mm0 \n\t"
461 "movd 4%1, %%mm3 \n\t"
462 "punpckldq 8%1, %%mm0 \n\t"
463 "punpckldq 12%1, %%mm3 \n\t"
464 "movq %%mm0, %%mm1 \n\t"
465 "movq %%mm0, %%mm2 \n\t"
466 "movq %%mm3, %%mm4 \n\t"
467 "movq %%mm3, %%mm5 \n\t"
/* Three shifted and masked copies per register, one per output field. */
468 "psllq $8, %%mm0 \n\t"
469 "psllq $8, %%mm3 \n\t"
470 "pand %%mm7, %%mm0 \n\t"
471 "pand %%mm7, %%mm3 \n\t"
472 "psrlq $5, %%mm1 \n\t"
473 "psrlq $5, %%mm4 \n\t"
474 "pand %%mm6, %%mm1 \n\t"
475 "pand %%mm6, %%mm4 \n\t"
476 "psrlq $19, %%mm2 \n\t"
477 "psrlq $19, %%mm5 \n\t"
478 "pand %2, %%mm2 \n\t"
479 "pand %2, %%mm5 \n\t"
480 "por %%mm1, %%mm0 \n\t"
481 "por %%mm4, %%mm3 \n\t"
482 "por %%mm2, %%mm0 \n\t"
483 "por %%mm5, %%mm3 \n\t"
/* Interleave: mm3's results are moved into the upper 16 bits of each dword. */
484 "psllq $16, %%mm3 \n\t"
485 "por %%mm3, %%mm0 \n\t"
486 MOVNTQ" %%mm0, %0 \n\t"
487 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* d is a uint16_t pointer: 4 output pixels per 16 source bytes. */
488 d += 4;
489 s += 16;
491 __asm __volatile(SFENCE:::"memory");
492 __asm __volatile(EMMS:::"memory");
493 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
494 while (s < end)
496 register int rgb = *(uint32_t*)s; s += 4;
497 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
501 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
503 const uint8_t *s = src;
504 const uint8_t *end;
505 #ifdef HAVE_MMX
506 const uint8_t *mm_end;
507 #endif
508 uint16_t *d = (uint16_t *)dst;
509 end = s + src_size;
510 #ifdef HAVE_MMX
511 mm_end = end - 15;
512 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
513 asm volatile(
514 "movq %3, %%mm5 \n\t"
515 "movq %4, %%mm6 \n\t"
516 "movq %5, %%mm7 \n\t"
517 "jmp 2f \n\t"
518 ASMALIGN(4)
519 "1: \n\t"
520 PREFETCH" 32(%1) \n\t"
521 "movd (%1), %%mm0 \n\t"
522 "movd 4(%1), %%mm3 \n\t"
523 "punpckldq 8(%1), %%mm0 \n\t"
524 "punpckldq 12(%1), %%mm3 \n\t"
525 "movq %%mm0, %%mm1 \n\t"
526 "movq %%mm3, %%mm4 \n\t"
527 "pand %%mm6, %%mm0 \n\t"
528 "pand %%mm6, %%mm3 \n\t"
529 "pmaddwd %%mm7, %%mm0 \n\t"
530 "pmaddwd %%mm7, %%mm3 \n\t"
531 "pand %%mm5, %%mm1 \n\t"
532 "pand %%mm5, %%mm4 \n\t"
533 "por %%mm1, %%mm0 \n\t"
534 "por %%mm4, %%mm3 \n\t"
535 "psrld $6, %%mm0 \n\t"
536 "pslld $10, %%mm3 \n\t"
537 "por %%mm3, %%mm0 \n\t"
538 MOVNTQ" %%mm0, (%0) \n\t"
539 "add $16, %1 \n\t"
540 "add $8, %0 \n\t"
541 "2: \n\t"
542 "cmp %2, %1 \n\t"
543 " jb 1b \n\t"
544 : "+r" (d), "+r"(s)
545 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
547 #else
548 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
549 __asm __volatile(
550 "movq %0, %%mm7 \n\t"
551 "movq %1, %%mm6 \n\t"
552 ::"m"(red_15mask),"m"(green_15mask));
553 while (s < mm_end)
555 __asm __volatile(
556 PREFETCH" 32%1 \n\t"
557 "movd %1, %%mm0 \n\t"
558 "movd 4%1, %%mm3 \n\t"
559 "punpckldq 8%1, %%mm0 \n\t"
560 "punpckldq 12%1, %%mm3 \n\t"
561 "movq %%mm0, %%mm1 \n\t"
562 "movq %%mm0, %%mm2 \n\t"
563 "movq %%mm3, %%mm4 \n\t"
564 "movq %%mm3, %%mm5 \n\t"
565 "psrlq $3, %%mm0 \n\t"
566 "psrlq $3, %%mm3 \n\t"
567 "pand %2, %%mm0 \n\t"
568 "pand %2, %%mm3 \n\t"
569 "psrlq $6, %%mm1 \n\t"
570 "psrlq $6, %%mm4 \n\t"
571 "pand %%mm6, %%mm1 \n\t"
572 "pand %%mm6, %%mm4 \n\t"
573 "psrlq $9, %%mm2 \n\t"
574 "psrlq $9, %%mm5 \n\t"
575 "pand %%mm7, %%mm2 \n\t"
576 "pand %%mm7, %%mm5 \n\t"
577 "por %%mm1, %%mm0 \n\t"
578 "por %%mm4, %%mm3 \n\t"
579 "por %%mm2, %%mm0 \n\t"
580 "por %%mm5, %%mm3 \n\t"
581 "psllq $16, %%mm3 \n\t"
582 "por %%mm3, %%mm0 \n\t"
583 MOVNTQ" %%mm0, %0 \n\t"
584 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
585 d += 4;
586 s += 16;
588 #endif
589 __asm __volatile(SFENCE:::"memory");
590 __asm __volatile(EMMS:::"memory");
591 #endif
592 while (s < end)
594 register int rgb = *(uint32_t*)s; s += 4;
595 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * Convert packed 32-bit pixels to 15-bit pixels with the outer channels swapped.
 * src_size is the number of source bytes (4 per pixel); 2 bytes are written per pixel.
 * Scalar path: bits 3..7 of the 32-bit word go to bits 10..14, bits 11..15 to
 * bits 5..9 and bits 19..23 to bits 0..4 of the result.
 */
599 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
601 const uint8_t *s = src;
602 const uint8_t *end;
603 #ifdef HAVE_MMX
604 const uint8_t *mm_end;
605 #endif
606 uint16_t *d = (uint16_t *)dst;
607 end = s + src_size;
608 #ifdef HAVE_MMX
609 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 / mm6 hold red_15mask / green_15mask (defined elsewhere) for the whole loop. */
610 __asm __volatile(
611 "movq %0, %%mm7 \n\t"
612 "movq %1, %%mm6 \n\t"
613 ::"m"(red_15mask),"m"(green_15mask));
/* Each iteration reads 16 source bytes (4 pixels). */
614 mm_end = end - 15;
615 while (s < mm_end)
617 __asm __volatile(
618 PREFETCH" 32%1 \n\t"
/* Pixels 0 and 2 are gathered in mm0, pixels 1 and 3 in mm3. */
619 "movd %1, %%mm0 \n\t"
620 "movd 4%1, %%mm3 \n\t"
621 "punpckldq 8%1, %%mm0 \n\t"
622 "punpckldq 12%1, %%mm3 \n\t"
623 "movq %%mm0, %%mm1 \n\t"
624 "movq %%mm0, %%mm2 \n\t"
625 "movq %%mm3, %%mm4 \n\t"
626 "movq %%mm3, %%mm5 \n\t"
/* Three shifted and masked copies per register, one per output field. */
627 "psllq $7, %%mm0 \n\t"
628 "psllq $7, %%mm3 \n\t"
629 "pand %%mm7, %%mm0 \n\t"
630 "pand %%mm7, %%mm3 \n\t"
631 "psrlq $6, %%mm1 \n\t"
632 "psrlq $6, %%mm4 \n\t"
633 "pand %%mm6, %%mm1 \n\t"
634 "pand %%mm6, %%mm4 \n\t"
635 "psrlq $19, %%mm2 \n\t"
636 "psrlq $19, %%mm5 \n\t"
637 "pand %2, %%mm2 \n\t"
638 "pand %2, %%mm5 \n\t"
639 "por %%mm1, %%mm0 \n\t"
640 "por %%mm4, %%mm3 \n\t"
641 "por %%mm2, %%mm0 \n\t"
642 "por %%mm5, %%mm3 \n\t"
/* Interleave: mm3's results are moved into the upper 16 bits of each dword. */
643 "psllq $16, %%mm3 \n\t"
644 "por %%mm3, %%mm0 \n\t"
645 MOVNTQ" %%mm0, %0 \n\t"
646 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* d is a uint16_t pointer: 4 output pixels per 16 source bytes. */
647 d += 4;
648 s += 16;
650 __asm __volatile(SFENCE:::"memory");
651 __asm __volatile(EMMS:::"memory");
652 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
653 while (s < end)
655 register int rgb = *(uint32_t*)s; s += 4;
656 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
660 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
662 const uint8_t *s = src;
663 const uint8_t *end;
664 #ifdef HAVE_MMX
665 const uint8_t *mm_end;
666 #endif
667 uint16_t *d = (uint16_t *)dst;
668 end = s + src_size;
669 #ifdef HAVE_MMX
670 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
671 __asm __volatile(
672 "movq %0, %%mm7 \n\t"
673 "movq %1, %%mm6 \n\t"
674 ::"m"(red_16mask),"m"(green_16mask));
675 mm_end = end - 11;
676 while (s < mm_end)
678 __asm __volatile(
679 PREFETCH" 32%1 \n\t"
680 "movd %1, %%mm0 \n\t"
681 "movd 3%1, %%mm3 \n\t"
682 "punpckldq 6%1, %%mm0 \n\t"
683 "punpckldq 9%1, %%mm3 \n\t"
684 "movq %%mm0, %%mm1 \n\t"
685 "movq %%mm0, %%mm2 \n\t"
686 "movq %%mm3, %%mm4 \n\t"
687 "movq %%mm3, %%mm5 \n\t"
688 "psrlq $3, %%mm0 \n\t"
689 "psrlq $3, %%mm3 \n\t"
690 "pand %2, %%mm0 \n\t"
691 "pand %2, %%mm3 \n\t"
692 "psrlq $5, %%mm1 \n\t"
693 "psrlq $5, %%mm4 \n\t"
694 "pand %%mm6, %%mm1 \n\t"
695 "pand %%mm6, %%mm4 \n\t"
696 "psrlq $8, %%mm2 \n\t"
697 "psrlq $8, %%mm5 \n\t"
698 "pand %%mm7, %%mm2 \n\t"
699 "pand %%mm7, %%mm5 \n\t"
700 "por %%mm1, %%mm0 \n\t"
701 "por %%mm4, %%mm3 \n\t"
702 "por %%mm2, %%mm0 \n\t"
703 "por %%mm5, %%mm3 \n\t"
704 "psllq $16, %%mm3 \n\t"
705 "por %%mm3, %%mm0 \n\t"
706 MOVNTQ" %%mm0, %0 \n\t"
707 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
708 d += 4;
709 s += 12;
711 __asm __volatile(SFENCE:::"memory");
712 __asm __volatile(EMMS:::"memory");
713 #endif
714 while (s < end)
716 const int b = *s++;
717 const int g = *s++;
718 const int r = *s++;
719 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * Convert packed 24-bit pixels to 16-bit pixels with the outer channels swapped.
 * src_size is the number of source bytes (3 per pixel); 2 bytes are written per pixel.
 * Scalar path: the first source byte ends up in bits 11..15, the second in
 * bits 5..10 and the third in bits 0..4 of the result.
 */
723 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
725 const uint8_t *s = src;
726 const uint8_t *end;
727 #ifdef HAVE_MMX
728 const uint8_t *mm_end;
729 #endif
730 uint16_t *d = (uint16_t *)dst;
731 end = s + src_size;
732 #ifdef HAVE_MMX
733 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 / mm6 hold red_16mask / green_16mask (defined elsewhere) for the whole loop. */
734 __asm __volatile(
735 "movq %0, %%mm7 \n\t"
736 "movq %1, %%mm6 \n\t"
737 ::"m"(red_16mask),"m"(green_16mask));
/* An iteration consumes 12 bytes; the loop only runs while at least 16 remain. */
738 mm_end = end - 15;
739 while (s < mm_end)
741 __asm __volatile(
742 PREFETCH" 32%1 \n\t"
/* Four 4-byte loads at 3-byte pixel offsets: pixels 0/2 in mm0, 1/3 in mm3. */
743 "movd %1, %%mm0 \n\t"
744 "movd 3%1, %%mm3 \n\t"
745 "punpckldq 6%1, %%mm0 \n\t"
746 "punpckldq 9%1, %%mm3 \n\t"
747 "movq %%mm0, %%mm1 \n\t"
748 "movq %%mm0, %%mm2 \n\t"
749 "movq %%mm3, %%mm4 \n\t"
750 "movq %%mm3, %%mm5 \n\t"
/* Three shifted and masked copies per register, one per output field. */
751 "psllq $8, %%mm0 \n\t"
752 "psllq $8, %%mm3 \n\t"
753 "pand %%mm7, %%mm0 \n\t"
754 "pand %%mm7, %%mm3 \n\t"
755 "psrlq $5, %%mm1 \n\t"
756 "psrlq $5, %%mm4 \n\t"
757 "pand %%mm6, %%mm1 \n\t"
758 "pand %%mm6, %%mm4 \n\t"
759 "psrlq $19, %%mm2 \n\t"
760 "psrlq $19, %%mm5 \n\t"
761 "pand %2, %%mm2 \n\t"
762 "pand %2, %%mm5 \n\t"
763 "por %%mm1, %%mm0 \n\t"
764 "por %%mm4, %%mm3 \n\t"
765 "por %%mm2, %%mm0 \n\t"
766 "por %%mm5, %%mm3 \n\t"
/* Interleave: mm3's results are moved into the upper 16 bits of each dword. */
767 "psllq $16, %%mm3 \n\t"
768 "por %%mm3, %%mm0 \n\t"
769 MOVNTQ" %%mm0, %0 \n\t"
770 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* d is a uint16_t pointer: 4 output pixels per 12 source bytes. */
771 d += 4;
772 s += 12;
774 __asm __volatile(SFENCE:::"memory");
775 __asm __volatile(EMMS:::"memory");
776 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
777 while (s < end)
779 const int r = *s++;
780 const int g = *s++;
781 const int b = *s++;
782 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
786 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
788 const uint8_t *s = src;
789 const uint8_t *end;
790 #ifdef HAVE_MMX
791 const uint8_t *mm_end;
792 #endif
793 uint16_t *d = (uint16_t *)dst;
794 end = s + src_size;
795 #ifdef HAVE_MMX
796 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
797 __asm __volatile(
798 "movq %0, %%mm7 \n\t"
799 "movq %1, %%mm6 \n\t"
800 ::"m"(red_15mask),"m"(green_15mask));
801 mm_end = end - 11;
802 while (s < mm_end)
804 __asm __volatile(
805 PREFETCH" 32%1 \n\t"
806 "movd %1, %%mm0 \n\t"
807 "movd 3%1, %%mm3 \n\t"
808 "punpckldq 6%1, %%mm0 \n\t"
809 "punpckldq 9%1, %%mm3 \n\t"
810 "movq %%mm0, %%mm1 \n\t"
811 "movq %%mm0, %%mm2 \n\t"
812 "movq %%mm3, %%mm4 \n\t"
813 "movq %%mm3, %%mm5 \n\t"
814 "psrlq $3, %%mm0 \n\t"
815 "psrlq $3, %%mm3 \n\t"
816 "pand %2, %%mm0 \n\t"
817 "pand %2, %%mm3 \n\t"
818 "psrlq $6, %%mm1 \n\t"
819 "psrlq $6, %%mm4 \n\t"
820 "pand %%mm6, %%mm1 \n\t"
821 "pand %%mm6, %%mm4 \n\t"
822 "psrlq $9, %%mm2 \n\t"
823 "psrlq $9, %%mm5 \n\t"
824 "pand %%mm7, %%mm2 \n\t"
825 "pand %%mm7, %%mm5 \n\t"
826 "por %%mm1, %%mm0 \n\t"
827 "por %%mm4, %%mm3 \n\t"
828 "por %%mm2, %%mm0 \n\t"
829 "por %%mm5, %%mm3 \n\t"
830 "psllq $16, %%mm3 \n\t"
831 "por %%mm3, %%mm0 \n\t"
832 MOVNTQ" %%mm0, %0 \n\t"
833 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
834 d += 4;
835 s += 12;
837 __asm __volatile(SFENCE:::"memory");
838 __asm __volatile(EMMS:::"memory");
839 #endif
840 while (s < end)
842 const int b = *s++;
843 const int g = *s++;
844 const int r = *s++;
845 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * Convert packed 24-bit pixels to 15-bit pixels with the outer channels swapped.
 * src_size is the number of source bytes (3 per pixel); 2 bytes are written per pixel.
 * Scalar path: the first source byte ends up in bits 10..14, the second in
 * bits 5..9 and the third in bits 0..4 of the result.
 */
849 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
851 const uint8_t *s = src;
852 const uint8_t *end;
853 #ifdef HAVE_MMX
854 const uint8_t *mm_end;
855 #endif
856 uint16_t *d = (uint16_t *)dst;
857 end = s + src_size;
858 #ifdef HAVE_MMX
859 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 / mm6 hold red_15mask / green_15mask (defined elsewhere) for the whole loop. */
860 __asm __volatile(
861 "movq %0, %%mm7 \n\t"
862 "movq %1, %%mm6 \n\t"
863 ::"m"(red_15mask),"m"(green_15mask));
/* An iteration consumes 12 bytes; the loop only runs while at least 16 remain. */
864 mm_end = end - 15;
865 while (s < mm_end)
867 __asm __volatile(
868 PREFETCH" 32%1 \n\t"
/* Four 4-byte loads at 3-byte pixel offsets: pixels 0/2 in mm0, 1/3 in mm3. */
869 "movd %1, %%mm0 \n\t"
870 "movd 3%1, %%mm3 \n\t"
871 "punpckldq 6%1, %%mm0 \n\t"
872 "punpckldq 9%1, %%mm3 \n\t"
873 "movq %%mm0, %%mm1 \n\t"
874 "movq %%mm0, %%mm2 \n\t"
875 "movq %%mm3, %%mm4 \n\t"
876 "movq %%mm3, %%mm5 \n\t"
/* Three shifted and masked copies per register, one per output field. */
877 "psllq $7, %%mm0 \n\t"
878 "psllq $7, %%mm3 \n\t"
879 "pand %%mm7, %%mm0 \n\t"
880 "pand %%mm7, %%mm3 \n\t"
881 "psrlq $6, %%mm1 \n\t"
882 "psrlq $6, %%mm4 \n\t"
883 "pand %%mm6, %%mm1 \n\t"
884 "pand %%mm6, %%mm4 \n\t"
885 "psrlq $19, %%mm2 \n\t"
886 "psrlq $19, %%mm5 \n\t"
887 "pand %2, %%mm2 \n\t"
888 "pand %2, %%mm5 \n\t"
889 "por %%mm1, %%mm0 \n\t"
890 "por %%mm4, %%mm3 \n\t"
891 "por %%mm2, %%mm0 \n\t"
892 "por %%mm5, %%mm3 \n\t"
/* Interleave: mm3's results are moved into the upper 16 bits of each dword. */
893 "psllq $16, %%mm3 \n\t"
894 "por %%mm3, %%mm0 \n\t"
895 MOVNTQ" %%mm0, %0 \n\t"
896 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* d is a uint16_t pointer: 4 output pixels per 12 source bytes. */
897 d += 4;
898 s += 12;
900 __asm __volatile(SFENCE:::"memory");
901 __asm __volatile(EMMS:::"memory");
902 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
903 while (s < end)
905 const int r = *s++;
906 const int g = *s++;
907 const int b = *s++;
908 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
913 I use a less accurate approximation here: the input value is simply
914 left-shifted and the low-order bits are filled with zeroes. This method improves PNG
915 compression, but it cannot reproduce white exactly, since it does
916 not generate an all-ones maximum value; the net effect is to darken the
917 image slightly.
919 A better method would be "left bit replication":
921 4 3 2 1 0
922 ---------
923 1 1 0 1 1
925 7 6 5 4 3 2 1 0
926 ----------------
927 1 1 0 1 1 1 1 0
928 |=======| |===|
929 | Leftmost Bits Repeated to Fill Open Bits
931 Original Bits
/*
 * Convert 15-bit pixels (one 16-bit word each) to packed 24-bit pixels.
 * src_size is the number of source bytes; 3 bytes are written per source word.
 * Scalar path: the output bytes are (w & 0x1F) << 3, (w & 0x3E0) >> 2 and
 * (w & 0x7C00) >> 7, i.e. each 5-bit field is left-aligned in its byte and
 * the low 3 bits are zero.
 */
933 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
935 const uint16_t *end;
936 #ifdef HAVE_MMX
937 const uint16_t *mm_end;
938 #endif
939 uint8_t *d = (uint8_t *)dst;
/* NOTE(review): this cast drops the const qualifier of src before it is re-added. */
940 const uint16_t *s = (uint16_t *)src;
/* end is a uint16_t pointer, hence src_size/2 elements. */
941 end = s + src_size/2;
942 #ifdef HAVE_MMX
943 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop handles 8 pixels (16 source bytes) per iteration. */
944 mm_end = end - 7;
945 while (s < mm_end)
/* First asm statement: expand 8 pixels into registers only. Pixels 0-3 end
   up in mm6/mm7 and pixels 4-7 in mm0/mm3; nothing is stored to *d here. */
947 __asm __volatile(
948 PREFETCH" 32%1 \n\t"
949 "movq %1, %%mm0 \n\t"
950 "movq %1, %%mm1 \n\t"
951 "movq %1, %%mm2 \n\t"
/* Isolate the three fields with mask15b / mask15g / mask15r and align each to a byte. */
952 "pand %2, %%mm0 \n\t"
953 "pand %3, %%mm1 \n\t"
954 "pand %4, %%mm2 \n\t"
955 "psllq $3, %%mm0 \n\t"
956 "psrlq $2, %%mm1 \n\t"
957 "psrlq $7, %%mm2 \n\t"
958 "movq %%mm0, %%mm3 \n\t"
959 "movq %%mm1, %%mm4 \n\t"
960 "movq %%mm2, %%mm5 \n\t"
/* Widen each 16-bit lane to 32 bits by interleaving with mmx_null. */
961 "punpcklwd %5, %%mm0 \n\t"
962 "punpcklwd %5, %%mm1 \n\t"
963 "punpcklwd %5, %%mm2 \n\t"
964 "punpckhwd %5, %%mm3 \n\t"
965 "punpckhwd %5, %%mm4 \n\t"
966 "punpckhwd %5, %%mm5 \n\t"
967 "psllq $8, %%mm1 \n\t"
968 "psllq $16, %%mm2 \n\t"
969 "por %%mm1, %%mm0 \n\t"
970 "por %%mm2, %%mm0 \n\t"
971 "psllq $8, %%mm4 \n\t"
972 "psllq $16, %%mm5 \n\t"
973 "por %%mm4, %%mm3 \n\t"
974 "por %%mm5, %%mm3 \n\t"
/* Park the first four expanded pixels in mm6/mm7. */
976 "movq %%mm0, %%mm6 \n\t"
977 "movq %%mm3, %%mm7 \n\t"
/* Same expansion for the second group of four pixels at offset 8. */
979 "movq 8%1, %%mm0 \n\t"
980 "movq 8%1, %%mm1 \n\t"
981 "movq 8%1, %%mm2 \n\t"
982 "pand %2, %%mm0 \n\t"
983 "pand %3, %%mm1 \n\t"
984 "pand %4, %%mm2 \n\t"
985 "psllq $3, %%mm0 \n\t"
986 "psrlq $2, %%mm1 \n\t"
987 "psrlq $7, %%mm2 \n\t"
988 "movq %%mm0, %%mm3 \n\t"
989 "movq %%mm1, %%mm4 \n\t"
990 "movq %%mm2, %%mm5 \n\t"
991 "punpcklwd %5, %%mm0 \n\t"
992 "punpcklwd %5, %%mm1 \n\t"
993 "punpcklwd %5, %%mm2 \n\t"
994 "punpckhwd %5, %%mm3 \n\t"
995 "punpckhwd %5, %%mm4 \n\t"
996 "punpckhwd %5, %%mm5 \n\t"
997 "psllq $8, %%mm1 \n\t"
998 "psllq $16, %%mm2 \n\t"
999 "por %%mm1, %%mm0 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psllq $8, %%mm4 \n\t"
1002 "psllq $16, %%mm5 \n\t"
1003 "por %%mm4, %%mm3 \n\t"
1004 "por %%mm5, %%mm3 \n\t"
1006 :"=m"(*d)
1007 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1008 :"memory");
/* Second asm statement: packs the register contents left by the statement
   above (mm0, mm3, mm6, mm7) into 24 bytes and stores them. It relies on
   the MMX registers being untouched between the two statements. */
1009 /* Borrowed 32 to 24 */
1010 __asm __volatile(
1011 "movq %%mm0, %%mm4 \n\t"
1012 "movq %%mm3, %%mm5 \n\t"
1013 "movq %%mm6, %%mm0 \n\t"
1014 "movq %%mm7, %%mm1 \n\t"
1016 "movq %%mm4, %%mm6 \n\t"
1017 "movq %%mm5, %%mm7 \n\t"
1018 "movq %%mm0, %%mm2 \n\t"
1019 "movq %%mm1, %%mm3 \n\t"
1021 "psrlq $8, %%mm2 \n\t"
1022 "psrlq $8, %%mm3 \n\t"
1023 "psrlq $8, %%mm6 \n\t"
1024 "psrlq $8, %%mm7 \n\t"
1025 "pand %2, %%mm0 \n\t"
1026 "pand %2, %%mm1 \n\t"
1027 "pand %2, %%mm4 \n\t"
1028 "pand %2, %%mm5 \n\t"
1029 "pand %3, %%mm2 \n\t"
1030 "pand %3, %%mm3 \n\t"
1031 "pand %3, %%mm6 \n\t"
1032 "pand %3, %%mm7 \n\t"
1033 "por %%mm2, %%mm0 \n\t"
1034 "por %%mm3, %%mm1 \n\t"
1035 "por %%mm6, %%mm4 \n\t"
1036 "por %%mm7, %%mm5 \n\t"
1038 "movq %%mm1, %%mm2 \n\t"
1039 "movq %%mm4, %%mm3 \n\t"
1040 "psllq $48, %%mm2 \n\t"
1041 "psllq $32, %%mm3 \n\t"
1042 "pand %4, %%mm2 \n\t"
1043 "pand %5, %%mm3 \n\t"
1044 "por %%mm2, %%mm0 \n\t"
1045 "psrlq $16, %%mm1 \n\t"
1046 "psrlq $32, %%mm4 \n\t"
1047 "psllq $16, %%mm5 \n\t"
1048 "por %%mm3, %%mm1 \n\t"
1049 "pand %6, %%mm5 \n\t"
1050 "por %%mm5, %%mm4 \n\t"
1052 MOVNTQ" %%mm0, %0 \n\t"
1053 MOVNTQ" %%mm1, 8%0 \n\t"
1054 MOVNTQ" %%mm4, 16%0"
1056 :"=m"(*d)
1057 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1058 :"memory");
/* 24 output bytes written, 8 source words consumed. */
1059 d += 24;
1060 s += 8;
1062 __asm __volatile(SFENCE:::"memory");
1063 __asm __volatile(EMMS:::"memory");
1064 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
1065 while (s < end)
1067 register uint16_t bgr;
1068 bgr = *s++;
1069 *d++ = (bgr&0x1F)<<3;
1070 *d++ = (bgr&0x3E0)>>2;
1071 *d++ = (bgr&0x7C00)>>7;
/*
 * Convert 16-bit pixels (one 16-bit word each) to packed 24-bit pixels.
 * src_size is the number of source bytes; 3 bytes are written per source word.
 * Scalar path: the output bytes are (w & 0x1F) << 3, (w & 0x7E0) >> 3 and
 * (w & 0xF800) >> 8, i.e. each field is left-aligned in its byte and the
 * remaining low bits are zero.
 */
1075 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1077 const uint16_t *end;
1078 #ifdef HAVE_MMX
1079 const uint16_t *mm_end;
1080 #endif
1081 uint8_t *d = (uint8_t *)dst;
1082 const uint16_t *s = (const uint16_t *)src;
/* end is a uint16_t pointer, hence src_size/2 elements. */
1083 end = s + src_size/2;
1084 #ifdef HAVE_MMX
1085 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop handles 8 pixels (16 source bytes) per iteration. */
1086 mm_end = end - 7;
1087 while (s < mm_end)
/* First asm statement: expand 8 pixels into registers only. Pixels 0-3 end
   up in mm6/mm7 and pixels 4-7 in mm0/mm3; nothing is stored to *d here. */
1089 __asm __volatile(
1090 PREFETCH" 32%1 \n\t"
1091 "movq %1, %%mm0 \n\t"
1092 "movq %1, %%mm1 \n\t"
1093 "movq %1, %%mm2 \n\t"
/* Isolate the three fields with mask16b / mask16g / mask16r and align each to a byte. */
1094 "pand %2, %%mm0 \n\t"
1095 "pand %3, %%mm1 \n\t"
1096 "pand %4, %%mm2 \n\t"
1097 "psllq $3, %%mm0 \n\t"
1098 "psrlq $3, %%mm1 \n\t"
1099 "psrlq $8, %%mm2 \n\t"
1100 "movq %%mm0, %%mm3 \n\t"
1101 "movq %%mm1, %%mm4 \n\t"
1102 "movq %%mm2, %%mm5 \n\t"
/* Widen each 16-bit lane to 32 bits by interleaving with mmx_null. */
1103 "punpcklwd %5, %%mm0 \n\t"
1104 "punpcklwd %5, %%mm1 \n\t"
1105 "punpcklwd %5, %%mm2 \n\t"
1106 "punpckhwd %5, %%mm3 \n\t"
1107 "punpckhwd %5, %%mm4 \n\t"
1108 "punpckhwd %5, %%mm5 \n\t"
1109 "psllq $8, %%mm1 \n\t"
1110 "psllq $16, %%mm2 \n\t"
1111 "por %%mm1, %%mm0 \n\t"
1112 "por %%mm2, %%mm0 \n\t"
1113 "psllq $8, %%mm4 \n\t"
1114 "psllq $16, %%mm5 \n\t"
1115 "por %%mm4, %%mm3 \n\t"
1116 "por %%mm5, %%mm3 \n\t"
/* Park the first four expanded pixels in mm6/mm7. */
1118 "movq %%mm0, %%mm6 \n\t"
1119 "movq %%mm3, %%mm7 \n\t"
/* Same expansion for the second group of four pixels at offset 8. */
1121 "movq 8%1, %%mm0 \n\t"
1122 "movq 8%1, %%mm1 \n\t"
1123 "movq 8%1, %%mm2 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %3, %%mm1 \n\t"
1126 "pand %4, %%mm2 \n\t"
1127 "psllq $3, %%mm0 \n\t"
1128 "psrlq $3, %%mm1 \n\t"
1129 "psrlq $8, %%mm2 \n\t"
1130 "movq %%mm0, %%mm3 \n\t"
1131 "movq %%mm1, %%mm4 \n\t"
1132 "movq %%mm2, %%mm5 \n\t"
1133 "punpcklwd %5, %%mm0 \n\t"
1134 "punpcklwd %5, %%mm1 \n\t"
1135 "punpcklwd %5, %%mm2 \n\t"
1136 "punpckhwd %5, %%mm3 \n\t"
1137 "punpckhwd %5, %%mm4 \n\t"
1138 "punpckhwd %5, %%mm5 \n\t"
1139 "psllq $8, %%mm1 \n\t"
1140 "psllq $16, %%mm2 \n\t"
1141 "por %%mm1, %%mm0 \n\t"
1142 "por %%mm2, %%mm0 \n\t"
1143 "psllq $8, %%mm4 \n\t"
1144 "psllq $16, %%mm5 \n\t"
1145 "por %%mm4, %%mm3 \n\t"
1146 "por %%mm5, %%mm3 \n\t"
1147 :"=m"(*d)
1148 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1149 :"memory");
/* Second asm statement: packs the register contents left by the statement
   above (mm0, mm3, mm6, mm7) into 24 bytes and stores them. It relies on
   the MMX registers being untouched between the two statements. */
1150 /* Borrowed 32 to 24 */
1151 __asm __volatile(
1152 "movq %%mm0, %%mm4 \n\t"
1153 "movq %%mm3, %%mm5 \n\t"
1154 "movq %%mm6, %%mm0 \n\t"
1155 "movq %%mm7, %%mm1 \n\t"
1157 "movq %%mm4, %%mm6 \n\t"
1158 "movq %%mm5, %%mm7 \n\t"
1159 "movq %%mm0, %%mm2 \n\t"
1160 "movq %%mm1, %%mm3 \n\t"
1162 "psrlq $8, %%mm2 \n\t"
1163 "psrlq $8, %%mm3 \n\t"
1164 "psrlq $8, %%mm6 \n\t"
1165 "psrlq $8, %%mm7 \n\t"
1166 "pand %2, %%mm0 \n\t"
1167 "pand %2, %%mm1 \n\t"
1168 "pand %2, %%mm4 \n\t"
1169 "pand %2, %%mm5 \n\t"
1170 "pand %3, %%mm2 \n\t"
1171 "pand %3, %%mm3 \n\t"
1172 "pand %3, %%mm6 \n\t"
1173 "pand %3, %%mm7 \n\t"
1174 "por %%mm2, %%mm0 \n\t"
1175 "por %%mm3, %%mm1 \n\t"
1176 "por %%mm6, %%mm4 \n\t"
1177 "por %%mm7, %%mm5 \n\t"
1179 "movq %%mm1, %%mm2 \n\t"
1180 "movq %%mm4, %%mm3 \n\t"
1181 "psllq $48, %%mm2 \n\t"
1182 "psllq $32, %%mm3 \n\t"
1183 "pand %4, %%mm2 \n\t"
1184 "pand %5, %%mm3 \n\t"
1185 "por %%mm2, %%mm0 \n\t"
1186 "psrlq $16, %%mm1 \n\t"
1187 "psrlq $32, %%mm4 \n\t"
1188 "psllq $16, %%mm5 \n\t"
1189 "por %%mm3, %%mm1 \n\t"
1190 "pand %6, %%mm5 \n\t"
1191 "por %%mm5, %%mm4 \n\t"
1193 MOVNTQ" %%mm0, %0 \n\t"
1194 MOVNTQ" %%mm1, 8%0 \n\t"
1195 MOVNTQ" %%mm4, 16%0"
1197 :"=m"(*d)
1198 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1199 :"memory");
/* 24 output bytes written, 8 source words consumed. */
1200 d += 24;
1201 s += 8;
1203 __asm __volatile(SFENCE:::"memory");
1204 __asm __volatile(EMMS:::"memory");
1205 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
1206 while (s < end)
1208 register uint16_t bgr;
1209 bgr = *s++;
1210 *d++ = (bgr&0x1F)<<3;
1211 *d++ = (bgr&0x7E0)>>3;
1212 *d++ = (bgr&0xF800)>>8;
/*
 * Convert 15-bit pixels (one 16-bit word each) to packed 32-bit pixels.
 * src_size is the number of source bytes; 4 bytes are written per source word.
 * Scalar path: three bytes carry (w & 0x1F) << 3, (w & 0x3E0) >> 2 and
 * (w & 0x7C00) >> 7, and the fourth byte is 0. With WORDS_BIGENDIAN the zero
 * byte comes first and the other three are stored in reverse order.
 */
1216 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1218 const uint16_t *end;
1219 #ifdef HAVE_MMX
1220 const uint16_t *mm_end;
1221 #endif
1222 uint8_t *d = (uint8_t *)dst;
1223 const uint16_t *s = (const uint16_t *)src;
/* end is a uint16_t pointer, hence src_size/2 elements. */
1224 end = s + src_size/2;
1225 #ifdef HAVE_MMX
1226 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0; used below as the zero operand of the unpack instructions. */
1227 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
/* The MMX loop handles 4 pixels (8 source bytes) per iteration. */
1228 mm_end = end - 3;
1229 while (s < mm_end)
1231 __asm __volatile(
1232 PREFETCH" 32%1 \n\t"
1233 "movq %1, %%mm0 \n\t"
1234 "movq %1, %%mm1 \n\t"
1235 "movq %1, %%mm2 \n\t"
/* Isolate the three fields with mask15b / mask15g / mask15r and align each to a byte. */
1236 "pand %2, %%mm0 \n\t"
1237 "pand %3, %%mm1 \n\t"
1238 "pand %4, %%mm2 \n\t"
1239 "psllq $3, %%mm0 \n\t"
1240 "psrlq $2, %%mm1 \n\t"
1241 "psrlq $7, %%mm2 \n\t"
1242 "movq %%mm0, %%mm3 \n\t"
1243 "movq %%mm1, %%mm4 \n\t"
1244 "movq %%mm2, %%mm5 \n\t"
/* Widen each 16-bit lane to 32 bits (low pair in mm0-2, high pair in mm3-5). */
1245 "punpcklwd %%mm7, %%mm0 \n\t"
1246 "punpcklwd %%mm7, %%mm1 \n\t"
1247 "punpcklwd %%mm7, %%mm2 \n\t"
1248 "punpckhwd %%mm7, %%mm3 \n\t"
1249 "punpckhwd %%mm7, %%mm4 \n\t"
1250 "punpckhwd %%mm7, %%mm5 \n\t"
/* Move the second and third field to byte 1 and byte 2 and merge. */
1251 "psllq $8, %%mm1 \n\t"
1252 "psllq $16, %%mm2 \n\t"
1253 "por %%mm1, %%mm0 \n\t"
1254 "por %%mm2, %%mm0 \n\t"
1255 "psllq $8, %%mm4 \n\t"
1256 "psllq $16, %%mm5 \n\t"
1257 "por %%mm4, %%mm3 \n\t"
1258 "por %%mm5, %%mm3 \n\t"
1259 MOVNTQ" %%mm0, %0 \n\t"
1260 MOVNTQ" %%mm3, 8%0 \n\t"
1261 :"=m"(*d)
1262 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1263 :"memory");
/* 16 output bytes written, 4 source words consumed. */
1264 d += 16;
1265 s += 4;
1267 __asm __volatile(SFENCE:::"memory");
1268 __asm __volatile(EMMS:::"memory");
1269 #endif
/* Scalar path: handles everything without MMX, otherwise the remaining tail. */
1270 while (s < end)
/* The #if 0 branch is a disabled single 32-bit store variant. */
1272 #if 0 //slightly slower on Athlon
1273 int bgr= *s++;
1274 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1275 #else
1276 register uint16_t bgr;
1277 bgr = *s++;
1278 #ifdef WORDS_BIGENDIAN
1279 *d++ = 0;
1280 *d++ = (bgr&0x7C00)>>7;
1281 *d++ = (bgr&0x3E0)>>2;
1282 *d++ = (bgr&0x1F)<<3;
1283 #else
1284 *d++ = (bgr&0x1F)<<3;
1285 *d++ = (bgr&0x3E0)>>2;
1286 *d++ = (bgr&0x7C00)>>7;
1287 *d++ = 0;
1288 #endif
1290 #endif
1294 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1296 const uint16_t *end;
1297 #ifdef HAVE_MMX
1298 const uint16_t *mm_end;
1299 #endif
1300 uint8_t *d = (uint8_t *)dst;
1301 const uint16_t *s = (uint16_t *)src;
1302 end = s + src_size/2;
1303 #ifdef HAVE_MMX
1304 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1305 __asm __volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1306 mm_end = end - 3;
1307 while (s < mm_end)
1309 __asm __volatile(
1310 PREFETCH" 32%1 \n\t"
1311 "movq %1, %%mm0 \n\t"
1312 "movq %1, %%mm1 \n\t"
1313 "movq %1, %%mm2 \n\t"
1314 "pand %2, %%mm0 \n\t"
1315 "pand %3, %%mm1 \n\t"
1316 "pand %4, %%mm2 \n\t"
1317 "psllq $3, %%mm0 \n\t"
1318 "psrlq $3, %%mm1 \n\t"
1319 "psrlq $8, %%mm2 \n\t"
1320 "movq %%mm0, %%mm3 \n\t"
1321 "movq %%mm1, %%mm4 \n\t"
1322 "movq %%mm2, %%mm5 \n\t"
1323 "punpcklwd %%mm7, %%mm0 \n\t"
1324 "punpcklwd %%mm7, %%mm1 \n\t"
1325 "punpcklwd %%mm7, %%mm2 \n\t"
1326 "punpckhwd %%mm7, %%mm3 \n\t"
1327 "punpckhwd %%mm7, %%mm4 \n\t"
1328 "punpckhwd %%mm7, %%mm5 \n\t"
1329 "psllq $8, %%mm1 \n\t"
1330 "psllq $16, %%mm2 \n\t"
1331 "por %%mm1, %%mm0 \n\t"
1332 "por %%mm2, %%mm0 \n\t"
1333 "psllq $8, %%mm4 \n\t"
1334 "psllq $16, %%mm5 \n\t"
1335 "por %%mm4, %%mm3 \n\t"
1336 "por %%mm5, %%mm3 \n\t"
1337 MOVNTQ" %%mm0, %0 \n\t"
1338 MOVNTQ" %%mm3, 8%0 \n\t"
1339 :"=m"(*d)
1340 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1341 :"memory");
1342 d += 16;
1343 s += 4;
1345 __asm __volatile(SFENCE:::"memory");
1346 __asm __volatile(EMMS:::"memory");
1347 #endif
1348 while (s < end)
1350 register uint16_t bgr;
1351 bgr = *s++;
1352 #ifdef WORDS_BIGENDIAN
1353 *d++ = 0;
1354 *d++ = (bgr&0xF800)>>8;
1355 *d++ = (bgr&0x7E0)>>3;
1356 *d++ = (bgr&0x1F)<<3;
1357 #else
1358 *d++ = (bgr&0x1F)<<3;
1359 *d++ = (bgr&0x7E0)>>3;
1360 *d++ = (bgr&0xF800)>>8;
1361 *d++ = 0;
1362 #endif
1366 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1368 long idx = 15 - src_size;
1369 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1370 #ifdef HAVE_MMX
1371 __asm __volatile(
1372 "test %0, %0 \n\t"
1373 "jns 2f \n\t"
1374 PREFETCH" (%1, %0) \n\t"
1375 "movq %3, %%mm7 \n\t"
1376 "pxor %4, %%mm7 \n\t"
1377 "movq %%mm7, %%mm6 \n\t"
1378 "pxor %5, %%mm7 \n\t"
1379 ASMALIGN(4)
1380 "1: \n\t"
1381 PREFETCH" 32(%1, %0) \n\t"
1382 "movq (%1, %0), %%mm0 \n\t"
1383 "movq 8(%1, %0), %%mm1 \n\t"
1384 # ifdef HAVE_MMX2
1385 "pshufw $177, %%mm0, %%mm3 \n\t"
1386 "pshufw $177, %%mm1, %%mm5 \n\t"
1387 "pand %%mm7, %%mm0 \n\t"
1388 "pand %%mm6, %%mm3 \n\t"
1389 "pand %%mm7, %%mm1 \n\t"
1390 "pand %%mm6, %%mm5 \n\t"
1391 "por %%mm3, %%mm0 \n\t"
1392 "por %%mm5, %%mm1 \n\t"
1393 # else
1394 "movq %%mm0, %%mm2 \n\t"
1395 "movq %%mm1, %%mm4 \n\t"
1396 "pand %%mm7, %%mm0 \n\t"
1397 "pand %%mm6, %%mm2 \n\t"
1398 "pand %%mm7, %%mm1 \n\t"
1399 "pand %%mm6, %%mm4 \n\t"
1400 "movq %%mm2, %%mm3 \n\t"
1401 "movq %%mm4, %%mm5 \n\t"
1402 "pslld $16, %%mm2 \n\t"
1403 "psrld $16, %%mm3 \n\t"
1404 "pslld $16, %%mm4 \n\t"
1405 "psrld $16, %%mm5 \n\t"
1406 "por %%mm2, %%mm0 \n\t"
1407 "por %%mm4, %%mm1 \n\t"
1408 "por %%mm3, %%mm0 \n\t"
1409 "por %%mm5, %%mm1 \n\t"
1410 # endif
1411 MOVNTQ" %%mm0, (%2, %0) \n\t"
1412 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1413 "add $16, %0 \n\t"
1414 "js 1b \n\t"
1415 SFENCE" \n\t"
1416 EMMS" \n\t"
1417 "2: \n\t"
1418 : "+&r"(idx)
1419 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1420 : "memory");
1421 #endif
1422 for (; idx<15; idx+=4) {
1423 register int v = *(uint32_t *)&s[idx], g = v & 0xff00ff00;
1424 v &= 0xff00ff;
1425 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1429 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1431 unsigned i;
1432 #ifdef HAVE_MMX
1433 long mmx_size= 23 - src_size;
1434 asm volatile (
1435 "test %%"REG_a", %%"REG_a" \n\t"
1436 "jns 2f \n\t"
1437 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1438 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1439 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1440 ASMALIGN(4)
1441 "1: \n\t"
1442 PREFETCH" 32(%1, %%"REG_a") \n\t"
1443 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1444 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1445 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1446 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1447 "pand %%mm5, %%mm0 \n\t"
1448 "pand %%mm6, %%mm1 \n\t"
1449 "pand %%mm7, %%mm2 \n\t"
1450 "por %%mm0, %%mm1 \n\t"
1451 "por %%mm2, %%mm1 \n\t"
1452 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1453 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1454 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1455 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1456 "pand %%mm7, %%mm0 \n\t"
1457 "pand %%mm5, %%mm1 \n\t"
1458 "pand %%mm6, %%mm2 \n\t"
1459 "por %%mm0, %%mm1 \n\t"
1460 "por %%mm2, %%mm1 \n\t"
1461 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1462 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1463 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1464 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1465 "pand %%mm6, %%mm0 \n\t"
1466 "pand %%mm7, %%mm1 \n\t"
1467 "pand %%mm5, %%mm2 \n\t"
1468 "por %%mm0, %%mm1 \n\t"
1469 "por %%mm2, %%mm1 \n\t"
1470 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1471 "add $24, %%"REG_a" \n\t"
1472 " js 1b \n\t"
1473 "2: \n\t"
1474 : "+a" (mmx_size)
1475 : "r" (src-mmx_size), "r"(dst-mmx_size)
1478 __asm __volatile(SFENCE:::"memory");
1479 __asm __volatile(EMMS:::"memory");
1481 if (mmx_size==23) return; //finihsed, was multiple of 8
1483 src+= src_size;
1484 dst+= src_size;
1485 src_size= 23-mmx_size;
1486 src-= src_size;
1487 dst-= src_size;
1488 #endif
1489 for (i=0; i<src_size; i+=3)
1491 register uint8_t x;
1492 x = src[i + 2];
1493 dst[i + 1] = src[i + 1];
1494 dst[i + 2] = src[i + 0];
1495 dst[i + 0] = x;
1499 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1500 long width, long height,
1501 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1503 long y;
1504 const long chromWidth= width>>1;
1505 for (y=0; y<height; y++)
1507 #ifdef HAVE_MMX
1508 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1509 asm volatile(
1510 "xor %%"REG_a", %%"REG_a" \n\t"
1511 ASMALIGN(4)
1512 "1: \n\t"
1513 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1514 PREFETCH" 32(%2, %%"REG_a") \n\t"
1515 PREFETCH" 32(%3, %%"REG_a") \n\t"
1516 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1517 "movq %%mm0, %%mm2 \n\t" // U(0)
1518 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1519 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1520 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1522 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1523 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1524 "movq %%mm3, %%mm4 \n\t" // Y(0)
1525 "movq %%mm5, %%mm6 \n\t" // Y(8)
1526 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1527 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1528 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1529 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1531 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1532 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1533 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1534 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1536 "add $8, %%"REG_a" \n\t"
1537 "cmp %4, %%"REG_a" \n\t"
1538 " jb 1b \n\t"
1539 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1540 : "%"REG_a
1542 #else
1544 #if defined ARCH_ALPHA && defined HAVE_MVI
1545 #define pl2yuy2(n) \
1546 y1 = yc[n]; \
1547 y2 = yc2[n]; \
1548 u = uc[n]; \
1549 v = vc[n]; \
1550 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1551 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1552 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1553 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1554 yuv1 = (u << 8) + (v << 24); \
1555 yuv2 = yuv1 + y2; \
1556 yuv1 += y1; \
1557 qdst[n] = yuv1; \
1558 qdst2[n] = yuv2;
1560 int i;
1561 uint64_t *qdst = (uint64_t *) dst;
1562 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1563 const uint32_t *yc = (uint32_t *) ysrc;
1564 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1565 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1566 for (i = 0; i < chromWidth; i += 8){
1567 uint64_t y1, y2, yuv1, yuv2;
1568 uint64_t u, v;
1569 /* Prefetch */
1570 asm("ldq $31,64(%0)" :: "r"(yc));
1571 asm("ldq $31,64(%0)" :: "r"(yc2));
1572 asm("ldq $31,64(%0)" :: "r"(uc));
1573 asm("ldq $31,64(%0)" :: "r"(vc));
1575 pl2yuy2(0);
1576 pl2yuy2(1);
1577 pl2yuy2(2);
1578 pl2yuy2(3);
1580 yc += 4;
1581 yc2 += 4;
1582 uc += 4;
1583 vc += 4;
1584 qdst += 4;
1585 qdst2 += 4;
1587 y++;
1588 ysrc += lumStride;
1589 dst += dstStride;
1591 #elif __WORDSIZE >= 64
1592 int i;
1593 uint64_t *ldst = (uint64_t *) dst;
1594 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1595 for (i = 0; i < chromWidth; i += 2){
1596 uint64_t k, l;
1597 k = yc[0] + (uc[0] << 8) +
1598 (yc[1] << 16) + (vc[0] << 24);
1599 l = yc[2] + (uc[1] << 8) +
1600 (yc[3] << 16) + (vc[1] << 24);
1601 *ldst++ = k + (l << 32);
1602 yc += 4;
1603 uc += 2;
1604 vc += 2;
1607 #else
1608 int i, *idst = (int32_t *) dst;
1609 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1610 for (i = 0; i < chromWidth; i++){
1611 #ifdef WORDS_BIGENDIAN
1612 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1613 (yc[1] << 8) + (vc[0] << 0);
1614 #else
1615 *idst++ = yc[0] + (uc[0] << 8) +
1616 (yc[1] << 16) + (vc[0] << 24);
1617 #endif
1618 yc += 2;
1619 uc++;
1620 vc++;
1622 #endif
1623 #endif
1624 if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1626 usrc += chromStride;
1627 vsrc += chromStride;
1629 ysrc += lumStride;
1630 dst += dstStride;
1632 #ifdef HAVE_MMX
1633 asm( EMMS" \n\t"
1634 SFENCE" \n\t"
1635 :::"memory");
1636 #endif
1640 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1641 * this is a problem for anyone then tell me, and I will fix it).
/*
 * Thin wrapper: forwards every argument unchanged to yuvPlanartoyuy2 and
 * passes 2 as vertLumPerChroma, so the chroma pointers there advance after
 * every second luma line.
 */
1643 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1644 long width, long height,
1645 long lumStride, long chromStride, long dstStride)
1647 //FIXME interpolate chroma
1648 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1651 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1652 long width, long height,
1653 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1655 long y;
1656 const long chromWidth= width>>1;
1657 for (y=0; y<height; y++)
1659 #ifdef HAVE_MMX
1660 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1661 asm volatile(
1662 "xor %%"REG_a", %%"REG_a" \n\t"
1663 ASMALIGN(4)
1664 "1: \n\t"
1665 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1666 PREFETCH" 32(%2, %%"REG_a") \n\t"
1667 PREFETCH" 32(%3, %%"REG_a") \n\t"
1668 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1669 "movq %%mm0, %%mm2 \n\t" // U(0)
1670 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1671 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1672 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1674 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1675 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1676 "movq %%mm0, %%mm4 \n\t" // Y(0)
1677 "movq %%mm2, %%mm6 \n\t" // Y(8)
1678 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1679 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1680 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1681 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1683 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1684 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1685 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1686 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1688 "add $8, %%"REG_a" \n\t"
1689 "cmp %4, %%"REG_a" \n\t"
1690 " jb 1b \n\t"
1691 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1692 : "%"REG_a
1694 #else
1695 //FIXME adapt the Alpha ASM code from yv12->yuy2
1697 #if __WORDSIZE >= 64
1698 int i;
1699 uint64_t *ldst = (uint64_t *) dst;
1700 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1701 for (i = 0; i < chromWidth; i += 2){
1702 uint64_t k, l;
1703 k = uc[0] + (yc[0] << 8) +
1704 (vc[0] << 16) + (yc[1] << 24);
1705 l = uc[1] + (yc[2] << 8) +
1706 (vc[1] << 16) + (yc[3] << 24);
1707 *ldst++ = k + (l << 32);
1708 yc += 4;
1709 uc += 2;
1710 vc += 2;
1713 #else
1714 int i, *idst = (int32_t *) dst;
1715 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1716 for (i = 0; i < chromWidth; i++){
1717 #ifdef WORDS_BIGENDIAN
1718 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1719 (vc[0] << 8) + (yc[1] << 0);
1720 #else
1721 *idst++ = uc[0] + (yc[0] << 8) +
1722 (vc[0] << 16) + (yc[1] << 24);
1723 #endif
1724 yc += 2;
1725 uc++;
1726 vc++;
1728 #endif
1729 #endif
1730 if ((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1732 usrc += chromStride;
1733 vsrc += chromStride;
1735 ysrc += lumStride;
1736 dst += dstStride;
1738 #ifdef HAVE_MMX
1739 asm( EMMS" \n\t"
1740 SFENCE" \n\t"
1741 :::"memory");
1742 #endif
1746 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1747 * this is a problem for anyone then tell me, and I will fix it).
/*
 * Thin wrapper: forwards every argument unchanged to yuvPlanartouyvy and
 * passes 2 as vertLumPerChroma, so the chroma pointers there advance after
 * every second luma line.
 */
1749 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1750 long width, long height,
1751 long lumStride, long chromStride, long dstStride)
1753 //FIXME interpolate chroma
1754 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1758 * Width should be a multiple of 16.
/*
 * Thin wrapper: forwards every argument unchanged to yuvPlanartoyuy2 and
 * passes 1 as vertLumPerChroma, so the chroma pointers there advance after
 * every luma line.
 */
1760 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1761 long width, long height,
1762 long lumStride, long chromStride, long dstStride)
1764 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1768 * Height should be a multiple of 2 and width should be a multiple of 16 (if
1769 * this is a problem for anyone then tell me, and I will fix it).
/*
 * Split packed data (bytes ordered Y0 U Y1 V) into separate Y, U and V planes.
 *
 * src         - packed input, srcStride bytes between lines
 * ydst        - luma output, lumStride bytes between lines
 * udst, vdst  - chroma outputs, chromStride bytes between lines
 * width       - luma samples per line; width>>1 chroma samples are written
 * height      - number of input lines; two lines are consumed per iteration
 *
 * Per iteration the first line yields luma and chroma, the second line yields
 * luma only (its chroma bytes are not read into any output), and the chroma
 * pointers advance once.
 */
1771 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1772 long width, long height,
1773 long lumStride, long chromStride, long srcStride)
1775 long y;
1776 const long chromWidth= width>>1;
1777 for (y=0; y<height; y+=2)
1779 #ifdef HAVE_MMX
/* first line of the pair: extract Y, U and V; 8 chroma samples per loop pass */
1780 asm volatile(
1781 "xor %%"REG_a", %%"REG_a" \n\t"
1782 "pcmpeqw %%mm7, %%mm7 \n\t"
1783 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1784 ASMALIGN(4)
1785 "1: \n\t"
1786 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1787 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1788 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1789 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1790 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1791 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1792 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1793 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1794 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1795 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1796 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1798 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1800 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1801 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1802 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1803 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1804 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1805 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1806 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1807 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1808 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1809 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1811 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1813 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1814 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1815 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1816 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1817 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1818 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1819 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1822 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1823 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1825 "add $8, %%"REG_a" \n\t"
1826 "cmp %4, %%"REG_a" \n\t"
1827 " jb 1b \n\t"
1828 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1829 : "memory", "%"REG_a
1832 ydst += lumStride;
1833 src += srcStride;
/* second line of the pair: luma only.  This block does not set mm7 itself;
   it relies on the 00FF word mask still being there from the block above. */
1835 asm volatile(
1836 "xor %%"REG_a", %%"REG_a" \n\t"
1837 ASMALIGN(4)
1838 "1: \n\t"
1839 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1840 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1841 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1842 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1843 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1844 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1845 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1846 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1847 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1848 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1849 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1851 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1852 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1854 "add $8, %%"REG_a" \n\t"
1855 "cmp %4, %%"REG_a" \n\t"
1856 " jb 1b \n\t"
1858 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1859 : "memory", "%"REG_a
1861 #else
1862 long i;
/* first line of the pair: every 4 input bytes give 2 luma and 1 U / 1 V sample */
1863 for (i=0; i<chromWidth; i++)
1865 ydst[2*i+0] = src[4*i+0];
1866 udst[i] = src[4*i+1];
1867 ydst[2*i+1] = src[4*i+2];
1868 vdst[i] = src[4*i+3];
1870 ydst += lumStride;
1871 src += srcStride;
/* second line of the pair: luma only */
1873 for (i=0; i<chromWidth; i++)
1875 ydst[2*i+0] = src[4*i+0];
1876 ydst[2*i+1] = src[4*i+2];
1878 #endif
1879 udst += chromStride;
1880 vdst += chromStride;
1881 ydst += lumStride;
1882 src += srcStride;
1884 #ifdef HAVE_MMX
1885 asm volatile( EMMS" \n\t"
1886 SFENCE" \n\t"
1887 :::"memory");
1888 #endif
/*
 * Copies the luma plane only: width*height bytes from ysrc to ydst in a
 * single memcpy.  usrc, vsrc, udst, vdst, lumStride and chromStride are not
 * used, so nothing is written to the chroma outputs.
 * NOTE(review): a single memcpy of width*height bytes presumably assumes the
 * luma lines are contiguous (stride equal to width) - confirm with callers.
 */
1891 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1892 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1893 long width, long height, long lumStride, long chromStride)
1895 /* Y Plane */
1896 memcpy(ydst, ysrc, width*height);
1898 /* XXX: implement upscaling for U,V */
/*
 * Upscale one plane by 2 in both directions: srcWidth x srcHeight samples in,
 * 2*srcWidth x 2*srcHeight samples out.
 *
 * The first and the last output line are interpolated horizontally only, from
 * the first and the last source line, with 3:1 / 1:3 weights; their first and
 * last samples are plain copies.  Every pass of the y loop reads two adjacent
 * source lines (src and src+srcStride) and writes two output lines (dst and
 * dst+dstStride), again with 3:1 / 1:3 weights.
 *
 * NOTE(review): in the MMX2/3DNow build the asm body runs once before its
 * sign test and loads from offset -1, and when srcWidth < 16 (mmxSize == 0)
 * the scalar loop starts at x = -1, touching src[-1] and dst[-1] - confirm
 * that callers never pass such widths or provide the slack.
 */
1901 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1903 long x,y;
1905 dst[0]= src[0];
1907 // first line
1908 for (x=0; x<srcWidth-1; x++){
1909 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1910 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1912 dst[2*srcWidth-1]= src[srcWidth-1];
1914 dst+= dstStride;
1916 for (y=1; y<srcHeight; y++){
1917 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* columns below mmxSize are produced by the asm loop, the rest by the C loop */
1918 const long mmxSize= srcWidth&~15;
1919 asm volatile(
1920 "mov %4, %%"REG_a" \n\t"
1921 "1: \n\t"
1922 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1923 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1924 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1925 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1926 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1927 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* averaging twice with the same register weights that register about 3:1 */
1928 PAVGB" %%mm0, %%mm5 \n\t"
1929 PAVGB" %%mm0, %%mm3 \n\t"
1930 PAVGB" %%mm0, %%mm5 \n\t"
1931 PAVGB" %%mm0, %%mm3 \n\t"
1932 PAVGB" %%mm1, %%mm4 \n\t"
1933 PAVGB" %%mm1, %%mm2 \n\t"
1934 PAVGB" %%mm1, %%mm4 \n\t"
1935 PAVGB" %%mm1, %%mm2 \n\t"
1936 "movq %%mm5, %%mm7 \n\t"
1937 "movq %%mm4, %%mm6 \n\t"
1938 "punpcklbw %%mm3, %%mm5 \n\t"
1939 "punpckhbw %%mm3, %%mm7 \n\t"
1940 "punpcklbw %%mm2, %%mm4 \n\t"
1941 "punpckhbw %%mm2, %%mm6 \n\t"
1942 #if 1
1943 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1944 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1945 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1946 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1947 #else
1948 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1949 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1950 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1951 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1952 #endif
1953 "add $8, %%"REG_a" \n\t"
1954 " js 1b \n\t"
1955 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1956 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1957 "g" (-mmxSize)
1958 : "%"REG_a
1961 #else
/* without the asm loop the C loop below starts at x = mmxSize-1 = 0 */
1962 const long mmxSize=1;
1963 #endif
/* left edge of both output lines: vertical interpolation only */
1964 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1965 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1967 for (x=mmxSize-1; x<srcWidth-1; x++){
1968 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1969 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1970 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1971 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* right edge of both output lines: vertical interpolation only */
1973 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1974 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1976 dst+=dstStride*2;
1977 src+=srcStride;
1980 // last line
1981 #if 1
1982 dst[0]= src[0];
1984 for (x=0; x<srcWidth-1; x++){
1985 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1986 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1988 dst[2*srcWidth-1]= src[srcWidth-1];
1989 #else
1990 for (x=0; x<srcWidth; x++){
1991 dst[2*x+0]=
1992 dst[2*x+1]= src[x];
1994 #endif
1996 #ifdef HAVE_MMX
1997 asm volatile( EMMS" \n\t"
1998 SFENCE" \n\t"
1999 :::"memory");
2000 #endif
2004 * Height should be a multiple of 2 and width should be a multiple of 16 (if
2005 * this is a problem for anyone then tell me, and I will fix it).
2006 * Chrominance data is only taken from every second line, others are ignored.
2007 * FIXME: Write HQ version.
2009 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2010 long width, long height,
2011 long lumStride, long chromStride, long srcStride)
2013 long y;
2014 const long chromWidth= width>>1;
2015 for (y=0; y<height; y+=2)
2017 #ifdef HAVE_MMX
2018 asm volatile(
2019 "xorl %%eax, %%eax \n\t"
2020 "pcmpeqw %%mm7, %%mm7 \n\t"
2021 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2022 ASMALIGN(4)
2023 "1: \n\t"
2024 PREFETCH" 64(%0, %%eax, 4) \n\t"
2025 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2026 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2027 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2028 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2029 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2030 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2031 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2032 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2033 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2034 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2036 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2038 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2039 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2040 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2041 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2042 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2043 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2044 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2045 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2046 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2047 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2049 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2051 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2052 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2053 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2054 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2055 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2056 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2057 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2060 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2061 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2063 "addl $8, %%eax \n\t"
2064 "cmpl %4, %%eax \n\t"
2065 " jb 1b \n\t"
2066 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2067 : "memory", "%eax"
2070 ydst += lumStride;
2071 src += srcStride;
2073 asm volatile(
2074 "xorl %%eax, %%eax \n\t"
2075 ASMALIGN(4)
2076 "1: \n\t"
2077 PREFETCH" 64(%0, %%eax, 4) \n\t"
2078 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
2079 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
2080 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
2081 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2082 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2083 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2084 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2085 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2086 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2087 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2089 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2090 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2092 "addl $8, %%eax \n\t"
2093 "cmpl %4, %%eax \n\t"
2094 " jb 1b \n\t"
2096 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2097 : "memory", "%eax"
2099 #else
2100 long i;
2101 for (i=0; i<chromWidth; i++)
2103 udst[i] = src[4*i+0];
2104 ydst[2*i+0] = src[4*i+1];
2105 vdst[i] = src[4*i+2];
2106 ydst[2*i+1] = src[4*i+3];
2108 ydst += lumStride;
2109 src += srcStride;
2111 for (i=0; i<chromWidth; i++)
2113 ydst[2*i+0] = src[4*i+1];
2114 ydst[2*i+1] = src[4*i+3];
2116 #endif
2117 udst += chromStride;
2118 vdst += chromStride;
2119 ydst += lumStride;
2120 src += srcStride;
2122 #ifdef HAVE_MMX
2123 asm volatile( EMMS" \n\t"
2124 SFENCE" \n\t"
2125 :::"memory");
2126 #endif
2130 * Height should be a multiple of 2 and width should be a multiple of 2 (if
2131 * this is a problem for anyone then tell me, and I will fix it).
2132 * Chrominance data is only taken from every second line,
2133 * others are ignored in the C version.
2134 * FIXME: Write HQ version.
2136 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2137 long width, long height,
2138 long lumStride, long chromStride, long srcStride)
2140 long y;
2141 const long chromWidth= width>>1;
2142 #ifdef HAVE_MMX
2143 for (y=0; y<height-2; y+=2)
2145 long i;
2146 for (i=0; i<2; i++)
2148 asm volatile(
2149 "mov %2, %%"REG_a" \n\t"
2150 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2151 "movq "MANGLE(w1111)", %%mm5 \n\t"
2152 "pxor %%mm7, %%mm7 \n\t"
2153 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2154 ASMALIGN(4)
2155 "1: \n\t"
2156 PREFETCH" 64(%0, %%"REG_d") \n\t"
2157 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2158 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2159 "punpcklbw %%mm7, %%mm0 \n\t"
2160 "punpcklbw %%mm7, %%mm1 \n\t"
2161 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2162 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2163 "punpcklbw %%mm7, %%mm2 \n\t"
2164 "punpcklbw %%mm7, %%mm3 \n\t"
2165 "pmaddwd %%mm6, %%mm0 \n\t"
2166 "pmaddwd %%mm6, %%mm1 \n\t"
2167 "pmaddwd %%mm6, %%mm2 \n\t"
2168 "pmaddwd %%mm6, %%mm3 \n\t"
2169 #ifndef FAST_BGR2YV12
2170 "psrad $8, %%mm0 \n\t"
2171 "psrad $8, %%mm1 \n\t"
2172 "psrad $8, %%mm2 \n\t"
2173 "psrad $8, %%mm3 \n\t"
2174 #endif
2175 "packssdw %%mm1, %%mm0 \n\t"
2176 "packssdw %%mm3, %%mm2 \n\t"
2177 "pmaddwd %%mm5, %%mm0 \n\t"
2178 "pmaddwd %%mm5, %%mm2 \n\t"
2179 "packssdw %%mm2, %%mm0 \n\t"
2180 "psraw $7, %%mm0 \n\t"
2182 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2183 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2184 "punpcklbw %%mm7, %%mm4 \n\t"
2185 "punpcklbw %%mm7, %%mm1 \n\t"
2186 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2187 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2188 "punpcklbw %%mm7, %%mm2 \n\t"
2189 "punpcklbw %%mm7, %%mm3 \n\t"
2190 "pmaddwd %%mm6, %%mm4 \n\t"
2191 "pmaddwd %%mm6, %%mm1 \n\t"
2192 "pmaddwd %%mm6, %%mm2 \n\t"
2193 "pmaddwd %%mm6, %%mm3 \n\t"
2194 #ifndef FAST_BGR2YV12
2195 "psrad $8, %%mm4 \n\t"
2196 "psrad $8, %%mm1 \n\t"
2197 "psrad $8, %%mm2 \n\t"
2198 "psrad $8, %%mm3 \n\t"
2199 #endif
2200 "packssdw %%mm1, %%mm4 \n\t"
2201 "packssdw %%mm3, %%mm2 \n\t"
2202 "pmaddwd %%mm5, %%mm4 \n\t"
2203 "pmaddwd %%mm5, %%mm2 \n\t"
2204 "add $24, %%"REG_d" \n\t"
2205 "packssdw %%mm2, %%mm4 \n\t"
2206 "psraw $7, %%mm4 \n\t"
2208 "packuswb %%mm4, %%mm0 \n\t"
2209 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2211 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2212 "add $8, %%"REG_a" \n\t"
2213 " js 1b \n\t"
2214 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2215 : "%"REG_a, "%"REG_d
2217 ydst += lumStride;
2218 src += srcStride;
2220 src -= srcStride*2;
2221 asm volatile(
2222 "mov %4, %%"REG_a" \n\t"
2223 "movq "MANGLE(w1111)", %%mm5 \n\t"
2224 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2225 "pxor %%mm7, %%mm7 \n\t"
2226 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2227 "add %%"REG_d", %%"REG_d" \n\t"
2228 ASMALIGN(4)
2229 "1: \n\t"
2230 PREFETCH" 64(%0, %%"REG_d") \n\t"
2231 PREFETCH" 64(%1, %%"REG_d") \n\t"
2232 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2233 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2234 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2235 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2236 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2237 PAVGB" %%mm1, %%mm0 \n\t"
2238 PAVGB" %%mm3, %%mm2 \n\t"
2239 "movq %%mm0, %%mm1 \n\t"
2240 "movq %%mm2, %%mm3 \n\t"
2241 "psrlq $24, %%mm0 \n\t"
2242 "psrlq $24, %%mm2 \n\t"
2243 PAVGB" %%mm1, %%mm0 \n\t"
2244 PAVGB" %%mm3, %%mm2 \n\t"
2245 "punpcklbw %%mm7, %%mm0 \n\t"
2246 "punpcklbw %%mm7, %%mm2 \n\t"
2247 #else
2248 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2249 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2250 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2251 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2252 "punpcklbw %%mm7, %%mm0 \n\t"
2253 "punpcklbw %%mm7, %%mm1 \n\t"
2254 "punpcklbw %%mm7, %%mm2 \n\t"
2255 "punpcklbw %%mm7, %%mm3 \n\t"
2256 "paddw %%mm1, %%mm0 \n\t"
2257 "paddw %%mm3, %%mm2 \n\t"
2258 "paddw %%mm2, %%mm0 \n\t"
2259 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2260 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2261 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2262 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2263 "punpcklbw %%mm7, %%mm4 \n\t"
2264 "punpcklbw %%mm7, %%mm1 \n\t"
2265 "punpcklbw %%mm7, %%mm2 \n\t"
2266 "punpcklbw %%mm7, %%mm3 \n\t"
2267 "paddw %%mm1, %%mm4 \n\t"
2268 "paddw %%mm3, %%mm2 \n\t"
2269 "paddw %%mm4, %%mm2 \n\t"
2270 "psrlw $2, %%mm0 \n\t"
2271 "psrlw $2, %%mm2 \n\t"
2272 #endif
2273 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2274 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2276 "pmaddwd %%mm0, %%mm1 \n\t"
2277 "pmaddwd %%mm2, %%mm3 \n\t"
2278 "pmaddwd %%mm6, %%mm0 \n\t"
2279 "pmaddwd %%mm6, %%mm2 \n\t"
2280 #ifndef FAST_BGR2YV12
2281 "psrad $8, %%mm0 \n\t"
2282 "psrad $8, %%mm1 \n\t"
2283 "psrad $8, %%mm2 \n\t"
2284 "psrad $8, %%mm3 \n\t"
2285 #endif
2286 "packssdw %%mm2, %%mm0 \n\t"
2287 "packssdw %%mm3, %%mm1 \n\t"
2288 "pmaddwd %%mm5, %%mm0 \n\t"
2289 "pmaddwd %%mm5, %%mm1 \n\t"
2290 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2291 "psraw $7, %%mm0 \n\t"
2293 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2294 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2295 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2296 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2297 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2298 PAVGB" %%mm1, %%mm4 \n\t"
2299 PAVGB" %%mm3, %%mm2 \n\t"
2300 "movq %%mm4, %%mm1 \n\t"
2301 "movq %%mm2, %%mm3 \n\t"
2302 "psrlq $24, %%mm4 \n\t"
2303 "psrlq $24, %%mm2 \n\t"
2304 PAVGB" %%mm1, %%mm4 \n\t"
2305 PAVGB" %%mm3, %%mm2 \n\t"
2306 "punpcklbw %%mm7, %%mm4 \n\t"
2307 "punpcklbw %%mm7, %%mm2 \n\t"
2308 #else
2309 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2310 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2311 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2312 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2313 "punpcklbw %%mm7, %%mm4 \n\t"
2314 "punpcklbw %%mm7, %%mm1 \n\t"
2315 "punpcklbw %%mm7, %%mm2 \n\t"
2316 "punpcklbw %%mm7, %%mm3 \n\t"
2317 "paddw %%mm1, %%mm4 \n\t"
2318 "paddw %%mm3, %%mm2 \n\t"
2319 "paddw %%mm2, %%mm4 \n\t"
2320 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2321 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2322 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2323 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2324 "punpcklbw %%mm7, %%mm5 \n\t"
2325 "punpcklbw %%mm7, %%mm1 \n\t"
2326 "punpcklbw %%mm7, %%mm2 \n\t"
2327 "punpcklbw %%mm7, %%mm3 \n\t"
2328 "paddw %%mm1, %%mm5 \n\t"
2329 "paddw %%mm3, %%mm2 \n\t"
2330 "paddw %%mm5, %%mm2 \n\t"
2331 "movq "MANGLE(w1111)", %%mm5 \n\t"
2332 "psrlw $2, %%mm4 \n\t"
2333 "psrlw $2, %%mm2 \n\t"
2334 #endif
2335 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2336 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2338 "pmaddwd %%mm4, %%mm1 \n\t"
2339 "pmaddwd %%mm2, %%mm3 \n\t"
2340 "pmaddwd %%mm6, %%mm4 \n\t"
2341 "pmaddwd %%mm6, %%mm2 \n\t"
2342 #ifndef FAST_BGR2YV12
2343 "psrad $8, %%mm4 \n\t"
2344 "psrad $8, %%mm1 \n\t"
2345 "psrad $8, %%mm2 \n\t"
2346 "psrad $8, %%mm3 \n\t"
2347 #endif
2348 "packssdw %%mm2, %%mm4 \n\t"
2349 "packssdw %%mm3, %%mm1 \n\t"
2350 "pmaddwd %%mm5, %%mm4 \n\t"
2351 "pmaddwd %%mm5, %%mm1 \n\t"
2352 "add $24, %%"REG_d" \n\t"
2353 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2354 "psraw $7, %%mm4 \n\t"
2356 "movq %%mm0, %%mm1 \n\t"
2357 "punpckldq %%mm4, %%mm0 \n\t"
2358 "punpckhdq %%mm4, %%mm1 \n\t"
2359 "packsswb %%mm1, %%mm0 \n\t"
2360 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2361 "movd %%mm0, (%2, %%"REG_a") \n\t"
2362 "punpckhdq %%mm0, %%mm0 \n\t"
2363 "movd %%mm0, (%3, %%"REG_a") \n\t"
2364 "add $4, %%"REG_a" \n\t"
2365 " js 1b \n\t"
2366 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2367 : "%"REG_a, "%"REG_d
2370 udst += chromStride;
2371 vdst += chromStride;
2372 src += srcStride*2;
2375 asm volatile( EMMS" \n\t"
2376 SFENCE" \n\t"
2377 :::"memory");
2378 #else
2379 y=0;
2380 #endif
2381 for (; y<height; y+=2)
2383 long i;
2384 for (i=0; i<chromWidth; i++)
2386 unsigned int b = src[6*i+0];
2387 unsigned int g = src[6*i+1];
2388 unsigned int r = src[6*i+2];
2390 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2391 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2392 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2394 udst[i] = U;
2395 vdst[i] = V;
2396 ydst[2*i] = Y;
2398 b = src[6*i+3];
2399 g = src[6*i+4];
2400 r = src[6*i+5];
2402 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2403 ydst[2*i+1] = Y;
2405 ydst += lumStride;
2406 src += srcStride;
2408 for (i=0; i<chromWidth; i++)
2410 unsigned int b = src[6*i+0];
2411 unsigned int g = src[6*i+1];
2412 unsigned int r = src[6*i+2];
2414 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2416 ydst[2*i] = Y;
2418 b = src[6*i+3];
2419 g = src[6*i+4];
2420 r = src[6*i+5];
2422 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2423 ydst[2*i+1] = Y;
2425 udst += chromStride;
2426 vdst += chromStride;
2427 ydst += lumStride;
2428 src += srcStride;
2432 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2433 long width, long height, long src1Stride,
2434 long src2Stride, long dstStride){
2435 long h;
2437 for (h=0; h < height; h++)
2439 long w;
2441 #ifdef HAVE_MMX
2442 #ifdef HAVE_SSE2
2443 asm(
2444 "xor %%"REG_a", %%"REG_a" \n\t"
2445 "1: \n\t"
2446 PREFETCH" 64(%1, %%"REG_a") \n\t"
2447 PREFETCH" 64(%2, %%"REG_a") \n\t"
2448 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2449 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2450 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2451 "punpcklbw %%xmm2, %%xmm0 \n\t"
2452 "punpckhbw %%xmm2, %%xmm1 \n\t"
2453 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2454 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2455 "add $16, %%"REG_a" \n\t"
2456 "cmp %3, %%"REG_a" \n\t"
2457 " jb 1b \n\t"
2458 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2459 : "memory", "%"REG_a""
2461 #else
2462 asm(
2463 "xor %%"REG_a", %%"REG_a" \n\t"
2464 "1: \n\t"
2465 PREFETCH" 64(%1, %%"REG_a") \n\t"
2466 PREFETCH" 64(%2, %%"REG_a") \n\t"
2467 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2468 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2469 "movq %%mm0, %%mm1 \n\t"
2470 "movq %%mm2, %%mm3 \n\t"
2471 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2472 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2473 "punpcklbw %%mm4, %%mm0 \n\t"
2474 "punpckhbw %%mm4, %%mm1 \n\t"
2475 "punpcklbw %%mm5, %%mm2 \n\t"
2476 "punpckhbw %%mm5, %%mm3 \n\t"
2477 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2478 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2479 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2480 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2481 "add $16, %%"REG_a" \n\t"
2482 "cmp %3, %%"REG_a" \n\t"
2483 " jb 1b \n\t"
2484 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2485 : "memory", "%"REG_a
2487 #endif
2488 for (w= (width&(~15)); w < width; w++)
2490 dest[2*w+0] = src1[w];
2491 dest[2*w+1] = src2[w];
2493 #else
2494 for (w=0; w < width; w++)
2496 dest[2*w+0] = src1[w];
2497 dest[2*w+1] = src2[w];
2499 #endif
2500 dest += dstStride;
2501 src1 += src1Stride;
2502 src2 += src2Stride;
2504 #ifdef HAVE_MMX
2505 asm(
2506 EMMS" \n\t"
2507 SFENCE" \n\t"
2508 ::: "memory"
2510 #endif
2513 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2514 uint8_t *dst1, uint8_t *dst2,
2515 long width, long height,
2516 long srcStride1, long srcStride2,
2517 long dstStride1, long dstStride2)
2519 long y,x,w,h;
2520 w=width/2; h=height/2;
2521 #ifdef HAVE_MMX
2522 asm volatile(
2523 PREFETCH" %0 \n\t"
2524 PREFETCH" %1 \n\t"
2525 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2526 #endif
2527 for (y=0;y<h;y++){
2528 const uint8_t* s1=src1+srcStride1*(y>>1);
2529 uint8_t* d=dst1+dstStride1*y;
2530 x=0;
2531 #ifdef HAVE_MMX
2532 for (;x<w-31;x+=32)
2534 asm volatile(
2535 PREFETCH" 32%1 \n\t"
2536 "movq %1, %%mm0 \n\t"
2537 "movq 8%1, %%mm2 \n\t"
2538 "movq 16%1, %%mm4 \n\t"
2539 "movq 24%1, %%mm6 \n\t"
2540 "movq %%mm0, %%mm1 \n\t"
2541 "movq %%mm2, %%mm3 \n\t"
2542 "movq %%mm4, %%mm5 \n\t"
2543 "movq %%mm6, %%mm7 \n\t"
2544 "punpcklbw %%mm0, %%mm0 \n\t"
2545 "punpckhbw %%mm1, %%mm1 \n\t"
2546 "punpcklbw %%mm2, %%mm2 \n\t"
2547 "punpckhbw %%mm3, %%mm3 \n\t"
2548 "punpcklbw %%mm4, %%mm4 \n\t"
2549 "punpckhbw %%mm5, %%mm5 \n\t"
2550 "punpcklbw %%mm6, %%mm6 \n\t"
2551 "punpckhbw %%mm7, %%mm7 \n\t"
2552 MOVNTQ" %%mm0, %0 \n\t"
2553 MOVNTQ" %%mm1, 8%0 \n\t"
2554 MOVNTQ" %%mm2, 16%0 \n\t"
2555 MOVNTQ" %%mm3, 24%0 \n\t"
2556 MOVNTQ" %%mm4, 32%0 \n\t"
2557 MOVNTQ" %%mm5, 40%0 \n\t"
2558 MOVNTQ" %%mm6, 48%0 \n\t"
2559 MOVNTQ" %%mm7, 56%0"
2560 :"=m"(d[2*x])
2561 :"m"(s1[x])
2562 :"memory");
2564 #endif
2565 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2567 for (y=0;y<h;y++){
2568 const uint8_t* s2=src2+srcStride2*(y>>1);
2569 uint8_t* d=dst2+dstStride2*y;
2570 x=0;
2571 #ifdef HAVE_MMX
2572 for (;x<w-31;x+=32)
2574 asm volatile(
2575 PREFETCH" 32%1 \n\t"
2576 "movq %1, %%mm0 \n\t"
2577 "movq 8%1, %%mm2 \n\t"
2578 "movq 16%1, %%mm4 \n\t"
2579 "movq 24%1, %%mm6 \n\t"
2580 "movq %%mm0, %%mm1 \n\t"
2581 "movq %%mm2, %%mm3 \n\t"
2582 "movq %%mm4, %%mm5 \n\t"
2583 "movq %%mm6, %%mm7 \n\t"
2584 "punpcklbw %%mm0, %%mm0 \n\t"
2585 "punpckhbw %%mm1, %%mm1 \n\t"
2586 "punpcklbw %%mm2, %%mm2 \n\t"
2587 "punpckhbw %%mm3, %%mm3 \n\t"
2588 "punpcklbw %%mm4, %%mm4 \n\t"
2589 "punpckhbw %%mm5, %%mm5 \n\t"
2590 "punpcklbw %%mm6, %%mm6 \n\t"
2591 "punpckhbw %%mm7, %%mm7 \n\t"
2592 MOVNTQ" %%mm0, %0 \n\t"
2593 MOVNTQ" %%mm1, 8%0 \n\t"
2594 MOVNTQ" %%mm2, 16%0 \n\t"
2595 MOVNTQ" %%mm3, 24%0 \n\t"
2596 MOVNTQ" %%mm4, 32%0 \n\t"
2597 MOVNTQ" %%mm5, 40%0 \n\t"
2598 MOVNTQ" %%mm6, 48%0 \n\t"
2599 MOVNTQ" %%mm7, 56%0"
2600 :"=m"(d[2*x])
2601 :"m"(s2[x])
2602 :"memory");
2604 #endif
2605 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2607 #ifdef HAVE_MMX
2608 asm(
2609 EMMS" \n\t"
2610 SFENCE" \n\t"
2611 ::: "memory"
2613 #endif
2616 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2617 uint8_t *dst,
2618 long width, long height,
2619 long srcStride1, long srcStride2,
2620 long srcStride3, long dstStride)
2622 long y,x,w,h;
2623 w=width/2; h=height;
2624 for (y=0;y<h;y++){
2625 const uint8_t* yp=src1+srcStride1*y;
2626 const uint8_t* up=src2+srcStride2*(y>>2);
2627 const uint8_t* vp=src3+srcStride3*(y>>2);
2628 uint8_t* d=dst+dstStride*y;
2629 x=0;
2630 #ifdef HAVE_MMX
2631 for (;x<w-7;x+=8)
2633 asm volatile(
2634 PREFETCH" 32(%1, %0) \n\t"
2635 PREFETCH" 32(%2, %0) \n\t"
2636 PREFETCH" 32(%3, %0) \n\t"
2637 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2638 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2639 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2640 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2641 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2642 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2643 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2644 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2645 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2646 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2648 "movq %%mm1, %%mm6 \n\t"
2649 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2650 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2651 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2652 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2653 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2655 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2656 "movq 8(%1, %0, 4), %%mm0 \n\t"
2657 "movq %%mm0, %%mm3 \n\t"
2658 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2659 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2660 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2661 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2663 "movq %%mm4, %%mm6 \n\t"
2664 "movq 16(%1, %0, 4), %%mm0 \n\t"
2665 "movq %%mm0, %%mm3 \n\t"
2666 "punpcklbw %%mm5, %%mm4 \n\t"
2667 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2668 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2669 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2670 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2672 "punpckhbw %%mm5, %%mm6 \n\t"
2673 "movq 24(%1, %0, 4), %%mm0 \n\t"
2674 "movq %%mm0, %%mm3 \n\t"
2675 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2676 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2677 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2678 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2680 : "+r" (x)
2681 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2682 :"memory");
2684 #endif
2685 for (; x<w; x++)
2687 const long x2 = x<<2;
2688 d[8*x+0] = yp[x2];
2689 d[8*x+1] = up[x];
2690 d[8*x+2] = yp[x2+1];
2691 d[8*x+3] = vp[x];
2692 d[8*x+4] = yp[x2+2];
2693 d[8*x+5] = up[x];
2694 d[8*x+6] = yp[x2+3];
2695 d[8*x+7] = vp[x];
2698 #ifdef HAVE_MMX
2699 asm(
2700 EMMS" \n\t"
2701 SFENCE" \n\t"
2702 ::: "memory"
2704 #endif
2707 static inline void RENAME(rgb2rgb_init)(void){
2708 rgb15to16 = RENAME(rgb15to16);
2709 rgb15to24 = RENAME(rgb15to24);
2710 rgb15to32 = RENAME(rgb15to32);
2711 rgb16to24 = RENAME(rgb16to24);
2712 rgb16to32 = RENAME(rgb16to32);
2713 rgb16to15 = RENAME(rgb16to15);
2714 rgb24to16 = RENAME(rgb24to16);
2715 rgb24to15 = RENAME(rgb24to15);
2716 rgb24to32 = RENAME(rgb24to32);
2717 rgb32to16 = RENAME(rgb32to16);
2718 rgb32to15 = RENAME(rgb32to15);
2719 rgb32to24 = RENAME(rgb32to24);
2720 rgb24tobgr15 = RENAME(rgb24tobgr15);
2721 rgb24tobgr16 = RENAME(rgb24tobgr16);
2722 rgb24tobgr24 = RENAME(rgb24tobgr24);
2723 rgb32tobgr32 = RENAME(rgb32tobgr32);
2724 rgb32tobgr16 = RENAME(rgb32tobgr16);
2725 rgb32tobgr15 = RENAME(rgb32tobgr15);
2726 yv12toyuy2 = RENAME(yv12toyuy2);
2727 yv12touyvy = RENAME(yv12touyvy);
2728 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2729 yuy2toyv12 = RENAME(yuy2toyv12);
2730 // uyvytoyv12 = RENAME(uyvytoyv12);
2731 // yvu9toyv12 = RENAME(yvu9toyv12);
2732 planar2x = RENAME(planar2x);
2733 rgb24toyv12 = RENAME(rgb24toyv12);
2734 interleaveBytes = RENAME(interleaveBytes);
2735 vu9_to_vu12 = RENAME(vu9_to_vu12);
2736 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);