/*
 * software RGB to RGB converter
 * software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
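
/* Note: with MMX2, MOVNTQ expands to movntq, a non-temporal store that
 * bypasses the cache; the SFENCE issued after each conversion loop makes
 * those stores globally visible. Without MMX2 both degrade to a plain
 * movq and a nop, so the code below can use them unconditionally. */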

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s+=3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}

/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
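
/*
 * The "and&add" trick used by rgb15to16 below, spelled out: a 15-bit pixel
 * is 0RRRRRGGGGGBBBBB, and (x&0x7FFF) + (x&0x7FE0) adds the red+green
 * field to itself, i.e. shifts it left by one, yielding RRRRRGGGGG0BBBBB,
 * the 16-bit layout (the new green LSB is zero). Masking bit 15 first
 * keeps the carry out of red inside the pixel, which is what lets the C
 * version process two pixels per 32-bit word with no cross talk.
 */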
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
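
/* A minimal sketch of the "left bit replication" described above (not used
 * by the converters below): widening a 5-bit channel to 8 bits by repeating
 * its top bits maps 0x1F to 0xFF, so white stays white. */
#if 0
static inline uint8_t expand5to8(uint8_t v5)
{
    return (uint8_t)((v5 << 3) | (v5 >> 2)); /* top 3 bits refill the low 3 */
}
#endif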
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t" \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif

#endif
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
# if HAVE_MMX2
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
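    /* Note: the loop below biases src/dst by mmx_size and counts REG_a up
     * from the negative value 23-src_size toward zero, so a single "js"
     * both advances and terminates the loop; the scalar tail afterwards
     * handles the last up-to-23 bytes. */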
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
        );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
#define pl2yuy2(n)                  \
    y1 = yc[n];                     \
    y2 = yc2[n];                    \
    u = uc[n];                      \
    v = vc[n];                      \
    __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
    __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
    __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));   \
    __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));   \
    yuv1 = (u << 8) + (v << 24);    \
    yuv2 = yuv1 + y2;               \
    yuv1 += y1;                     \
    qdst[n]  = yuv1;                \
    qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
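
/*
 * Usage sketch (illustrative only, assuming the plain C instantiation of
 * this template, i.e. RENAME(yv12toyuy2) expanding to yv12toyuy2_C):
 * packing one YV12 frame into a YUY2 buffer could look like
 *
 *     yv12toyuy2_C(y_plane, u_plane, v_plane, out,
 *                  width, height,
 *                  y_stride, uv_stride, 2*width);
 *
 * with width a multiple of 16 and height a multiple of 2 as required
 * above, and 2*width being the YUY2 stride of a tightly packed buffer.
 */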

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}
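
#if 0
/* Hypothetical sketch of the missing U,V upscaling noted above (illustrative
 * only, not part of the original converter): YVU9 chroma is subsampled 4x4
 * against luma while YV12 chroma is 2x2, so each source chroma sample would
 * be replicated into a 2x2 destination block. The srcStride/dstStride
 * parameters and the helper name are assumptions; the real converter would
 * also need the source chroma stride passed in. */
static inline void upscale_chroma_2x_nn(const uint8_t *src, uint8_t *dst,
                                        long dstWidth, long dstHeight,
                                        long srcStride, long dstStride)
{
    long x, y;
    for (y=0; y<dstHeight; y++)
        for (x=0; x<dstWidth; x++)
            dst[y*dstStride + x] = src[(y>>1)*srcStride + (x>>1)];
}
#endif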

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++){
#if HAVE_MMX2 || HAVE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const x86_reg mmxSize=1;
#endif
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}
1987 * Height should be a multiple of 2 and width should be a multiple of 16.
1988 * (If this is a problem for anyone then tell me, and I will fix it.)
1989 * Chrominance data is only taken from every second line, others are ignored.
1990 * FIXME: Write HQ version.
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        __asm__ volatile(
            "xor       %%"REG_a", %%"REG_a"         \n\t"
            "pcmpeqw   %%mm7, %%mm7                 \n\t"
            "psrlw     $8, %%mm7                    \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                                     \n\t"
            PREFETCH"  64(%0, %%"REG_a", 4)         \n\t"
            "movq      (%0, %%"REG_a", 4), %%mm0    \n\t" // UYVY UYVY(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
            "movq      %%mm0, %%mm2                 \n\t" // UYVY UYVY(0)
            "movq      %%mm1, %%mm3                 \n\t" // UYVY UYVY(4)
            "pand      %%mm7, %%mm0                 \n\t" // U0V0 U0V0(0)
            "pand      %%mm7, %%mm1                 \n\t" // U0V0 U0V0(4)
            "psrlw     $8, %%mm2                    \n\t" // Y0Y0 Y0Y0(0)
            "psrlw     $8, %%mm3                    \n\t" // Y0Y0 Y0Y0(4)
            "packuswb  %%mm1, %%mm0                 \n\t" // UVUV UVUV(0)
            "packuswb  %%mm3, %%mm2                 \n\t" // YYYY YYYY(0)

            MOVNTQ"    %%mm2, (%1, %%"REG_a", 2)    \n\t"

            "movq      16(%0, %%"REG_a", 4), %%mm1  \n\t" // UYVY UYVY(8)
            "movq      24(%0, %%"REG_a", 4), %%mm2  \n\t" // UYVY UYVY(12)
            "movq      %%mm1, %%mm3                 \n\t" // UYVY UYVY(8)
            "movq      %%mm2, %%mm4                 \n\t" // UYVY UYVY(12)
            "pand      %%mm7, %%mm1                 \n\t" // U0V0 U0V0(8)
            "pand      %%mm7, %%mm2                 \n\t" // U0V0 U0V0(12)
            "psrlw     $8, %%mm3                    \n\t" // Y0Y0 Y0Y0(8)
            "psrlw     $8, %%mm4                    \n\t" // Y0Y0 Y0Y0(12)
            "packuswb  %%mm2, %%mm1                 \n\t" // UVUV UVUV(8)
            "packuswb  %%mm4, %%mm3                 \n\t" // YYYY YYYY(8)

            MOVNTQ"    %%mm3, 8(%1, %%"REG_a", 2)   \n\t"

            "movq      %%mm0, %%mm2                 \n\t" // UVUV UVUV(0)
            "movq      %%mm1, %%mm3                 \n\t" // UVUV UVUV(8)
            "psrlw     $8, %%mm0                    \n\t" // V0V0 V0V0(0)
            "psrlw     $8, %%mm1                    \n\t" // V0V0 V0V0(8)
            "pand      %%mm7, %%mm2                 \n\t" // U0U0 U0U0(0)
            "pand      %%mm7, %%mm3                 \n\t" // U0U0 U0U0(8)
            "packuswb  %%mm1, %%mm0                 \n\t" // VVVV VVVV(0)
            "packuswb  %%mm3, %%mm2                 \n\t" // UUUU UUUU(0)

            MOVNTQ"    %%mm0, (%3, %%"REG_a")       \n\t"
            MOVNTQ"    %%mm2, (%2, %%"REG_a")       \n\t"

            "add       $8, %%"REG_a"                \n\t"
            "cmp       %4, %%"REG_a"                \n\t"
            " jb       1b                           \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor       %%"REG_a", %%"REG_a"         \n\t"
            ASMALIGN(4)
            "1:                                     \n\t"
            PREFETCH"  64(%0, %%"REG_a", 4)         \n\t"
            "movq      (%0, %%"REG_a", 4), %%mm0    \n\t" // YUYV YUYV(0)
            "movq      8(%0, %%"REG_a", 4), %%mm1   \n\t" // YUYV YUYV(4)
            "movq      16(%0, %%"REG_a", 4), %%mm2  \n\t" // YUYV YUYV(8)
            "movq      24(%0, %%"REG_a", 4), %%mm3  \n\t" // YUYV YUYV(12)
            "psrlw     $8, %%mm0                    \n\t" // Y0Y0 Y0Y0(0)
            "psrlw     $8, %%mm1                    \n\t" // Y0Y0 Y0Y0(4)
            "psrlw     $8, %%mm2                    \n\t" // Y0Y0 Y0Y0(8)
            "psrlw     $8, %%mm3                    \n\t" // Y0Y0 Y0Y0(12)
            "packuswb  %%mm1, %%mm0                 \n\t" // YYYY YYYY(0)
            "packuswb  %%mm3, %%mm2                 \n\t" // YYYY YYYY(8)

            MOVNTQ"    %%mm0, (%1, %%"REG_a", 2)    \n\t"
            MOVNTQ"    %%mm2, 8(%1, %%"REG_a", 2)   \n\t"

            "add       $8, %%"REG_a"                \n\t"
            "cmp       %4, %%"REG_a"                \n\t"
            " jb       1b                           \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        for (i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    __asm__ volatile( EMMS"       \n\t"
                      SFENCE"     \n\t"
                      :::"memory");
#endif
}
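
/* Usage sketch (illustrative; buffer names are hypothetical): deinterleave one
 * UYVY frame into YV12-style planes via the public uyvytoyv12 pointer.
 * Strides are in bytes: a UYVY row is 2*width bytes, and the 4:2:0 chroma
 * planes are width/2 by height/2. */
#if 0
static void uyvy_frame_to_yv12_sketch(const uint8_t *uyvy, long w, long h,
                                      uint8_t *y, uint8_t *u, uint8_t *v)
{
    uyvytoyv12(uyvy, y, u, v, w, h, w, w/2, 2*w);
}
#endif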
/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
#if HAVE_MMX
    for (y=0; y<height-2; y+=2)
    {
        long i;
        for (i=0; i<2; i++)
        {
            __asm__ volatile(
                "mov       %2, %%"REG_a"                    \n\t"
                "movq      "MANGLE(ff_bgr2YCoeff)", %%mm6   \n\t"
                "movq      "MANGLE(ff_w1111)", %%mm5        \n\t"
                "pxor      %%mm7, %%mm7                     \n\t"
                "lea       (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
                ASMALIGN(4)
                "1:                                         \n\t"
                PREFETCH"  64(%0, %%"REG_d")                \n\t"
                "movd      (%0, %%"REG_d"), %%mm0           \n\t"
                "movd      3(%0, %%"REG_d"), %%mm1          \n\t"
                "punpcklbw %%mm7, %%mm0                     \n\t"
                "punpcklbw %%mm7, %%mm1                     \n\t"
                "movd      6(%0, %%"REG_d"), %%mm2          \n\t"
                "movd      9(%0, %%"REG_d"), %%mm3          \n\t"
                "punpcklbw %%mm7, %%mm2                     \n\t"
                "punpcklbw %%mm7, %%mm3                     \n\t"
                "pmaddwd   %%mm6, %%mm0                     \n\t"
                "pmaddwd   %%mm6, %%mm1                     \n\t"
                "pmaddwd   %%mm6, %%mm2                     \n\t"
                "pmaddwd   %%mm6, %%mm3                     \n\t"
#ifndef FAST_BGR2YV12
                "psrad     $8, %%mm0                        \n\t"
                "psrad     $8, %%mm1                        \n\t"
                "psrad     $8, %%mm2                        \n\t"
                "psrad     $8, %%mm3                        \n\t"
#endif
                "packssdw  %%mm1, %%mm0                     \n\t"
                "packssdw  %%mm3, %%mm2                     \n\t"
                "pmaddwd   %%mm5, %%mm0                     \n\t"
                "pmaddwd   %%mm5, %%mm2                     \n\t"
                "packssdw  %%mm2, %%mm0                     \n\t"
                "psraw     $7, %%mm0                        \n\t"

                "movd      12(%0, %%"REG_d"), %%mm4         \n\t"
                "movd      15(%0, %%"REG_d"), %%mm1         \n\t"
                "punpcklbw %%mm7, %%mm4                     \n\t"
                "punpcklbw %%mm7, %%mm1                     \n\t"
                "movd      18(%0, %%"REG_d"), %%mm2         \n\t"
                "movd      21(%0, %%"REG_d"), %%mm3         \n\t"
                "punpcklbw %%mm7, %%mm2                     \n\t"
                "punpcklbw %%mm7, %%mm3                     \n\t"
                "pmaddwd   %%mm6, %%mm4                     \n\t"
                "pmaddwd   %%mm6, %%mm1                     \n\t"
                "pmaddwd   %%mm6, %%mm2                     \n\t"
                "pmaddwd   %%mm6, %%mm3                     \n\t"
#ifndef FAST_BGR2YV12
                "psrad     $8, %%mm4                        \n\t"
                "psrad     $8, %%mm1                        \n\t"
                "psrad     $8, %%mm2                        \n\t"
                "psrad     $8, %%mm3                        \n\t"
#endif
                "packssdw  %%mm1, %%mm4                     \n\t"
                "packssdw  %%mm3, %%mm2                     \n\t"
                "pmaddwd   %%mm5, %%mm4                     \n\t"
                "pmaddwd   %%mm5, %%mm2                     \n\t"
                "add       $24, %%"REG_d"                   \n\t"
                "packssdw  %%mm2, %%mm4                     \n\t"
                "psraw     $7, %%mm4                        \n\t"

                "packuswb  %%mm4, %%mm0                     \n\t"
                "paddusb   "MANGLE(ff_bgr2YOffset)", %%mm0  \n\t"

                MOVNTQ"    %%mm0, (%1, %%"REG_a")           \n\t"
                "add       $8, %%"REG_a"                    \n\t"
                " js       1b                               \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov       %4, %%"REG_a"                    \n\t"
            "movq      "MANGLE(ff_w1111)", %%mm5        \n\t"
            "movq      "MANGLE(ff_bgr2UCoeff)", %%mm6   \n\t"
            "pxor      %%mm7, %%mm7                     \n\t"
            "lea       (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add       %%"REG_d", %%"REG_d"             \n\t"
            ASMALIGN(4)
            "1:                                         \n\t"
            PREFETCH"  64(%0, %%"REG_d")                \n\t"
            PREFETCH"  64(%1, %%"REG_d")                \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
            "movq      (%0, %%"REG_d"), %%mm0           \n\t"
            "movq      (%1, %%"REG_d"), %%mm1           \n\t"
            "movq      6(%0, %%"REG_d"), %%mm2          \n\t"
            "movq      6(%1, %%"REG_d"), %%mm3          \n\t"
            PAVGB"     %%mm1, %%mm0                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "movq      %%mm0, %%mm1                     \n\t"
            "movq      %%mm2, %%mm3                     \n\t"
            "psrlq     $24, %%mm0                       \n\t"
            "psrlq     $24, %%mm2                       \n\t"
            PAVGB"     %%mm1, %%mm0                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm0                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
#else
            "movd      (%0, %%"REG_d"), %%mm0           \n\t"
            "movd      (%1, %%"REG_d"), %%mm1           \n\t"
            "movd      3(%0, %%"REG_d"), %%mm2          \n\t"
            "movd      3(%1, %%"REG_d"), %%mm3          \n\t"
            "punpcklbw %%mm7, %%mm0                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm0                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm2, %%mm0                     \n\t"
            "movd      6(%0, %%"REG_d"), %%mm4          \n\t"
            "movd      6(%1, %%"REG_d"), %%mm1          \n\t"
            "movd      9(%0, %%"REG_d"), %%mm2          \n\t"
            "movd      9(%1, %%"REG_d"), %%mm3          \n\t"
            "punpcklbw %%mm7, %%mm4                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm4                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm4, %%mm2                     \n\t"
            "psrlw     $2, %%mm0                        \n\t"
            "psrlw     $2, %%mm2                        \n\t"
#endif
            "movq      "MANGLE(ff_bgr2VCoeff)", %%mm1   \n\t"
            "movq      "MANGLE(ff_bgr2VCoeff)", %%mm3   \n\t"

            "pmaddwd   %%mm0, %%mm1                     \n\t"
            "pmaddwd   %%mm2, %%mm3                     \n\t"
            "pmaddwd   %%mm6, %%mm0                     \n\t"
            "pmaddwd   %%mm6, %%mm2                     \n\t"
#ifndef FAST_BGR2YV12
            "psrad     $8, %%mm0                        \n\t"
            "psrad     $8, %%mm1                        \n\t"
            "psrad     $8, %%mm2                        \n\t"
            "psrad     $8, %%mm3                        \n\t"
#endif
            "packssdw  %%mm2, %%mm0                     \n\t"
            "packssdw  %%mm3, %%mm1                     \n\t"
            "pmaddwd   %%mm5, %%mm0                     \n\t"
            "pmaddwd   %%mm5, %%mm1                     \n\t"
            "packssdw  %%mm1, %%mm0                     \n\t" // V1 V0 U1 U0
            "psraw     $7, %%mm0                        \n\t"

#if HAVE_MMX2 || HAVE_AMD3DNOW
            "movq      12(%0, %%"REG_d"), %%mm4         \n\t"
            "movq      12(%1, %%"REG_d"), %%mm1         \n\t"
            "movq      18(%0, %%"REG_d"), %%mm2         \n\t"
            "movq      18(%1, %%"REG_d"), %%mm3         \n\t"
            PAVGB"     %%mm1, %%mm4                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "movq      %%mm4, %%mm1                     \n\t"
            "movq      %%mm2, %%mm3                     \n\t"
            "psrlq     $24, %%mm4                       \n\t"
            "psrlq     $24, %%mm2                       \n\t"
            PAVGB"     %%mm1, %%mm4                     \n\t"
            PAVGB"     %%mm3, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm4                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
#else
            "movd      12(%0, %%"REG_d"), %%mm4         \n\t"
            "movd      12(%1, %%"REG_d"), %%mm1         \n\t"
            "movd      15(%0, %%"REG_d"), %%mm2         \n\t"
            "movd      15(%1, %%"REG_d"), %%mm3         \n\t"
            "punpcklbw %%mm7, %%mm4                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm4                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm2, %%mm4                     \n\t"
            "movd      18(%0, %%"REG_d"), %%mm5         \n\t"
            "movd      18(%1, %%"REG_d"), %%mm1         \n\t"
            "movd      21(%0, %%"REG_d"), %%mm2         \n\t"
            "movd      21(%1, %%"REG_d"), %%mm3         \n\t"
            "punpcklbw %%mm7, %%mm5                     \n\t"
            "punpcklbw %%mm7, %%mm1                     \n\t"
            "punpcklbw %%mm7, %%mm2                     \n\t"
            "punpcklbw %%mm7, %%mm3                     \n\t"
            "paddw     %%mm1, %%mm5                     \n\t"
            "paddw     %%mm3, %%mm2                     \n\t"
            "paddw     %%mm5, %%mm2                     \n\t"
            "movq      "MANGLE(ff_w1111)", %%mm5        \n\t"
            "psrlw     $2, %%mm4                        \n\t"
            "psrlw     $2, %%mm2                        \n\t"
#endif
            "movq      "MANGLE(ff_bgr2VCoeff)", %%mm1   \n\t"
            "movq      "MANGLE(ff_bgr2VCoeff)", %%mm3   \n\t"

            "pmaddwd   %%mm4, %%mm1                     \n\t"
            "pmaddwd   %%mm2, %%mm3                     \n\t"
            "pmaddwd   %%mm6, %%mm4                     \n\t"
            "pmaddwd   %%mm6, %%mm2                     \n\t"
#ifndef FAST_BGR2YV12
            "psrad     $8, %%mm4                        \n\t"
            "psrad     $8, %%mm1                        \n\t"
            "psrad     $8, %%mm2                        \n\t"
            "psrad     $8, %%mm3                        \n\t"
#endif
            "packssdw  %%mm2, %%mm4                     \n\t"
            "packssdw  %%mm3, %%mm1                     \n\t"
            "pmaddwd   %%mm5, %%mm4                     \n\t"
            "pmaddwd   %%mm5, %%mm1                     \n\t"
            "add       $24, %%"REG_d"                   \n\t"
            "packssdw  %%mm1, %%mm4                     \n\t" // V3 V2 U3 U2
            "psraw     $7, %%mm4                        \n\t"

            "movq      %%mm0, %%mm1                     \n\t"
            "punpckldq %%mm4, %%mm0                     \n\t"
            "punpckhdq %%mm4, %%mm1                     \n\t"
            "packsswb  %%mm1, %%mm0                     \n\t"
            "paddb     "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd      %%mm0, (%2, %%"REG_a")           \n\t"
            "punpckhdq %%mm0, %%mm0                     \n\t"
            "movd      %%mm0, (%3, %%"REG_a")           \n\t"
            "add       $4, %%"REG_a"                    \n\t"
            " js       1b                               \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }
    __asm__ volatile( EMMS"       \n\t"
                      SFENCE"     \n\t"
                      :::"memory");
#else
    y=0;
#endif
    for (; y<height; y+=2)
    {
        long i;
        for (i=0; i<chromWidth; i++)
        {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i]   = U;
            vdst[i]   = V;
            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
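
/* Worked example (illustrative, not part of the original file): assuming the
 * usual BT.601 limited-range constants defined elsewhere in rgb2rgb --
 * RGB2YUV_SHIFT 8 with RY 66, GY 129, BY 25 -- a white pixel r=g=b=255 gives
 *   Y = ((66*255 + 129*255 + 25*255)>>8) + 16 = 219 + 16 = 235,
 * the expected peak of the 16..235 luma range.  A scalar sketch: */
#if 0
static unsigned int rgb_to_y_sketch(unsigned int r, unsigned int g, unsigned int b)
{
    /* assumed coefficient values; the real RY/GY/BY come from rgb2rgb.c */
    enum { SHIFT_ = 8, RY_ = 66, GY_ = 129, BY_ = 25 };
    return ((RY_*r + GY_*g + BY_*b) >> SHIFT_) + 16;
}
#endif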
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride){
    long h;

    for (h=0; h < height; h++)
    {
        long w;

#if HAVE_MMX
#if HAVE_SSE2
        // note: movdqa expects 16-byte-aligned rows; both loads from %1 fetch
        // the same 16 bytes so the low and high halves can be interleaved
        // with %2 separately
        __asm__(
            "xor       %%"REG_a", %%"REG_a"         \n\t"
            "1:                                     \n\t"
            PREFETCH"  64(%1, %%"REG_a")            \n\t"
            PREFETCH"  64(%2, %%"REG_a")            \n\t"
            "movdqa    (%1, %%"REG_a"), %%xmm0      \n\t"
            "movdqa    (%1, %%"REG_a"), %%xmm1      \n\t"
            "movdqa    (%2, %%"REG_a"), %%xmm2      \n\t"
            "punpcklbw %%xmm2, %%xmm0               \n\t"
            "punpckhbw %%xmm2, %%xmm1               \n\t"
            "movntdq   %%xmm0, (%0, %%"REG_a", 2)   \n\t"
            "movntdq   %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add       $16, %%"REG_a"               \n\t"
            "cmp       %3, %%"REG_a"                \n\t"
            " jb       1b                           \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor       %%"REG_a", %%"REG_a"         \n\t"
            "1:                                     \n\t"
            PREFETCH"  64(%1, %%"REG_a")            \n\t"
            PREFETCH"  64(%2, %%"REG_a")            \n\t"
            "movq      (%1, %%"REG_a"), %%mm0       \n\t"
            "movq      8(%1, %%"REG_a"), %%mm2      \n\t"
            "movq      %%mm0, %%mm1                 \n\t"
            "movq      %%mm2, %%mm3                 \n\t"
            "movq      (%2, %%"REG_a"), %%mm4       \n\t"
            "movq      8(%2, %%"REG_a"), %%mm5      \n\t"
            "punpcklbw %%mm4, %%mm0                 \n\t"
            "punpckhbw %%mm4, %%mm1                 \n\t"
            "punpcklbw %%mm5, %%mm2                 \n\t"
            "punpckhbw %%mm5, %%mm3                 \n\t"
            MOVNTQ"    %%mm0, (%0, %%"REG_a", 2)    \n\t"
            MOVNTQ"    %%mm1, 8(%0, %%"REG_a", 2)   \n\t"
            MOVNTQ"    %%mm2, 16(%0, %%"REG_a", 2)  \n\t"
            MOVNTQ"    %%mm3, 24(%0, %%"REG_a", 2)  \n\t"
            "add       $16, %%"REG_a"               \n\t"
            "cmp       %3, %%"REG_a"                \n\t"
            " jb       1b                           \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        for (w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for (w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
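
/* Usage sketch (illustrative; names are hypothetical): interleaveBytes() zips
 * two byte planes into one, e.g. building an NV12-style packed UV plane from
 * separate U and V planes. */
#if 0
static void pack_uv_sketch(uint8_t *u, uint8_t *v, uint8_t *uv, long w, long h)
{
    /* each output line is u[0],v[0],u[1],v[1],... hence the 2*w dst stride */
    interleaveBytes(u, v, uv, w, h, w, w, 2*w);
}
#endif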
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for (y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32)
        {
            __asm__ volatile(
                PREFETCH"  32%1         \n\t"
                "movq      %1, %%mm0    \n\t"
                "movq      8%1, %%mm2   \n\t"
                "movq      16%1, %%mm4  \n\t"
                "movq      24%1, %%mm6  \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0, %0    \n\t"
                MOVNTQ"    %%mm1, 8%0   \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32)
        {
            __asm__ volatile(
                PREFETCH"  32%1         \n\t"
                "movq      %1, %%mm0    \n\t"
                "movq      8%1, %%mm2   \n\t"
                "movq      16%1, %%mm4  \n\t"
                "movq      24%1, %%mm6  \n\t"
                "movq      %%mm0, %%mm1 \n\t"
                "movq      %%mm2, %%mm3 \n\t"
                "movq      %%mm4, %%mm5 \n\t"
                "movq      %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ"    %%mm0, %0    \n\t"
                MOVNTQ"    %%mm1, 8%0   \n\t"
                MOVNTQ"    %%mm2, 16%0  \n\t"
                MOVNTQ"    %%mm3, 24%0  \n\t"
                MOVNTQ"    %%mm4, 32%0  \n\t"
                MOVNTQ"    %%mm5, 40%0  \n\t"
                MOVNTQ"    %%mm6, 48%0  \n\t"
                MOVNTQ"    %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
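
/* Note (explanatory, not part of the original file): "punpcklbw %%mm0, %%mm0"
 * interleaves a register with itself, so every source byte appears twice.
 * That is the SIMD form of the scalar tail d[2*x]=d[2*x+1]=s1[x] above, and
 * together with the (y>>1) source-row selection it doubles each chroma plane
 * both horizontally and vertically. */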
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if HAVE_MMX
        for (;x<w-7;x+=8)
        {
            __asm__ volatile(
                PREFETCH"  32(%1, %0)           \n\t"
                PREFETCH"  32(%2, %0)           \n\t"
                PREFETCH"  32(%3, %0)           \n\t"
                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq      (%2, %0), %%mm1      \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq      (%3, %0), %%mm2      \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq      %%mm0, %%mm3         \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq      %%mm1, %%mm4         \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq      %%mm2, %%mm5         \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1         \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2         \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4         \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5         \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq      %%mm1, %%mm6         \n\t"
                "punpcklbw %%mm2, %%mm1         \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0         \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3         \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ"    %%mm0, (%4, %0, 8)   \n\t"
                MOVNTQ"    %%mm3, 8(%4, %0, 8)  \n\t"

                "punpckhbw %%mm2, %%mm6         \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq      8(%1, %0, 4), %%mm0  \n\t"
                "movq      %%mm0, %%mm3         \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ"    %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ"    %%mm3, 24(%4, %0, 8) \n\t"

                "movq      %%mm4, %%mm6         \n\t"
                "movq      16(%1, %0, 4), %%mm0 \n\t"
                "movq      %%mm0, %%mm3         \n\t"
                "punpcklbw %%mm5, %%mm4         \n\t"
                "punpcklbw %%mm4, %%mm0         \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3         \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ"    %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ"    %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6         \n\t"
                "movq      24(%1, %0, 4), %%mm0 \n\t"
                "movq      %%mm0, %%mm3         \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ"    %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ"    %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        for (; x<w; x++)
        {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16){
        count += 15;
        __asm__ volatile(
            "pcmpeqw  %%mm7, %%mm7          \n\t"
            "psrlw    $8, %%mm7             \n\t"
            "1:                             \n\t"
            "movq     -30(%1, %0, 2), %%mm0 \n\t"
            "movq     -22(%1, %0, 2), %%mm1 \n\t"
            "movq     -14(%1, %0, 2), %%mm2 \n\t"
            "movq     -6(%1, %0, 2), %%mm3  \n\t"
            "pand     %%mm7, %%mm0          \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm2          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            MOVNTQ"   %%mm0,-15(%2, %0)     \n\t"
            MOVNTQ"   %%mm2,- 7(%2, %0)     \n\t"
            "add      $16, %0               \n\t"
            " js      1b                    \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    while(count<0){
        dst[count]= src[2*count];
        count++;
    }
}
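
/* Note on the loop shape (explanatory, not part of the original file): the
 * extractors point dst/src at the end of their buffers and run a negative
 * index up to zero, so one register serves as both counter and offset and the
 * loop condition is a plain sign test (" js 1b").  A scalar model: */
#if 0
static void extract_even_sketch(const uint8_t *src, uint8_t *dst, long n)
{
    long count = n;
    dst +=   count;     /* points one past the last output once count hits 0 */
    src += 2*count;
    count = -count;
    while (count < 0) { /* walks count from -n up to 0 */
        dst[count] = src[2*count];
        count++;
    }
}
#endif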
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw  %%mm7, %%mm7          \n\t"
            "psrlw    $8, %%mm7             \n\t"
            "1:                             \n\t"
            "movq     -28(%1, %0, 4), %%mm0 \n\t"
            "movq     -20(%1, %0, 4), %%mm1 \n\t"
            "movq     -12(%1, %0, 4), %%mm2 \n\t"
            "movq     -4(%1, %0, 4), %%mm3  \n\t"
            "pand     %%mm7, %%mm0          \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm2          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq     %%mm0, %%mm1          \n\t"
            "movq     %%mm2, %%mm3          \n\t"
            "psrlw    $8, %%mm0             \n\t"
            "psrlw    $8, %%mm2             \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ"   %%mm0,- 7(%3, %0)     \n\t"
            MOVNTQ"   %%mm1,- 7(%2, %0)     \n\t"
            "add      $8, %0                \n\t"
            " js      1b                    \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw  %%mm7, %%mm7          \n\t"
            "psrlw    $8, %%mm7             \n\t"
            "1:                             \n\t"
            "movq     -28(%1, %0, 4), %%mm0 \n\t"
            "movq     -20(%1, %0, 4), %%mm1 \n\t"
            "movq     -12(%1, %0, 4), %%mm2 \n\t"
            "movq     -4(%1, %0, 4), %%mm3  \n\t"
            PAVGB"    -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB"    -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB"    -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB"    - 4(%2, %0, 4), %%mm3 \n\t"
            "pand     %%mm7, %%mm0          \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm2          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq     %%mm0, %%mm1          \n\t"
            "movq     %%mm2, %%mm3          \n\t"
            "psrlw    $8, %%mm0             \n\t"
            "psrlw    $8, %%mm2             \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ"   %%mm0,- 7(%4, %0)     \n\t"
            MOVNTQ"   %%mm1,- 7(%3, %0)     \n\t"
            "add      $8, %0                \n\t"
            " js      1b                    \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0){
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw  %%mm7, %%mm7          \n\t"
            "psrlw    $8, %%mm7             \n\t"
            "1:                             \n\t"
            "movq     -28(%1, %0, 4), %%mm0 \n\t"
            "movq     -20(%1, %0, 4), %%mm1 \n\t"
            "movq     -12(%1, %0, 4), %%mm2 \n\t"
            "movq     -4(%1, %0, 4), %%mm3  \n\t"
            "psrlw    $8, %%mm0             \n\t"
            "psrlw    $8, %%mm1             \n\t"
            "psrlw    $8, %%mm2             \n\t"
            "psrlw    $8, %%mm3             \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq     %%mm0, %%mm1          \n\t"
            "movq     %%mm2, %%mm3          \n\t"
            "psrlw    $8, %%mm0             \n\t"
            "psrlw    $8, %%mm2             \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ"   %%mm0,- 7(%3, %0)     \n\t"
            MOVNTQ"   %%mm1,- 7(%2, %0)     \n\t"
            "add      $8, %0                \n\t"
            " js      1b                    \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw  %%mm7, %%mm7          \n\t"
            "psrlw    $8, %%mm7             \n\t"
            "1:                             \n\t"
            "movq     -28(%1, %0, 4), %%mm0 \n\t"
            "movq     -20(%1, %0, 4), %%mm1 \n\t"
            "movq     -12(%1, %0, 4), %%mm2 \n\t"
            "movq     -4(%1, %0, 4), %%mm3  \n\t"
            PAVGB"    -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB"    -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB"    -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB"    - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw    $8, %%mm0             \n\t"
            "psrlw    $8, %%mm1             \n\t"
            "psrlw    $8, %%mm2             \n\t"
            "psrlw    $8, %%mm3             \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq     %%mm0, %%mm1          \n\t"
            "movq     %%mm2, %%mm3          \n\t"
            "psrlw    $8, %%mm0             \n\t"
            "psrlw    $8, %%mm2             \n\t"
            "pand     %%mm7, %%mm1          \n\t"
            "pand     %%mm7, %%mm3          \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ"   %%mm0,- 7(%4, %0)     \n\t"
            MOVNTQ"   %%mm1,- 7(%3, %0)     \n\t"
            "add      $8, %0                \n\t"
            " js      1b                    \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0){
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1); // ceil(width/2)

    for (y=0; y<height; y++){
        RENAME(extract_even)(src, ydst, width);
        if(y&1){
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);

    for (y=0; y<height; y++){
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);

    for (y=0; y<height; y++){
        RENAME(extract_even)(src+1, ydst, width);
        if(y&1){
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);

    for (y=0; y<height; y++){
        RENAME(extract_even)(src+1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
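
/* Summary of the four wrappers above (explanatory, not part of the original
 * file): in YUYV the luma bytes sit at even offsets and the chroma at odd
 * ones; UYVY is the reverse, hence the src+1 and the even/odd extractor
 * choice.  The ...420 variants average chroma over line pairs, the ...422
 * variants take it from every line.  A hypothetical call splitting a YUYV
 * frame into 4:2:0 planes: */
#if 0
static void yuyv_to_planes_sketch(const uint8_t *yuyv, long w, long h,
                                  uint8_t *y, uint8_t *u, uint8_t *v)
{
    /* chroma planes are ceil(w/2) wide and h/2 tall */
    yuyvtoyuv420(y, u, v, yuyv, w, h, w, -((-w)>>1), 2*w);
}
#endif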
static inline void RENAME(rgb2rgb_init)(void){
    rgb15to16       = RENAME(rgb15to16);
    rgb15tobgr24    = RENAME(rgb15tobgr24);
    rgb15to32       = RENAME(rgb15to32);
    rgb16tobgr24    = RENAME(rgb16tobgr24);
    rgb16to32       = RENAME(rgb16to32);
    rgb16to15       = RENAME(rgb16to15);
    rgb24tobgr16    = RENAME(rgb24tobgr16);
    rgb24tobgr15    = RENAME(rgb24tobgr15);
    rgb24tobgr32    = RENAME(rgb24tobgr32);
    rgb32to16       = RENAME(rgb32to16);
    rgb32to15       = RENAME(rgb32to15);
    rgb32tobgr24    = RENAME(rgb32tobgr24);
    rgb24to15       = RENAME(rgb24to15);
    rgb24to16       = RENAME(rgb24to16);
    rgb24tobgr24    = RENAME(rgb24tobgr24);
    rgb32tobgr32    = RENAME(rgb32tobgr32);
    rgb32tobgr16    = RENAME(rgb32tobgr16);
    rgb32tobgr15    = RENAME(rgb32tobgr15);
    yv12toyuy2      = RENAME(yv12toyuy2);
    yv12touyvy      = RENAME(yv12touyvy);
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
    yuy2toyv12      = RENAME(yuy2toyv12);
//    yvu9toyv12      = RENAME(yvu9toyv12);
    planar2x        = RENAME(planar2x);
    rgb24toyv12     = RENAME(rgb24toyv12);
    interleaveBytes = RENAME(interleaveBytes);
    vu9_to_vu12     = RENAME(vu9_to_vu12);
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);

    uyvytoyuv420    = RENAME(uyvytoyuv420);
    uyvytoyuv422    = RENAME(uyvytoyuv422);
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
}
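
/* Instantiation note (explanatory sketch, not part of the original file): a
 * flavour of this template is produced by the including file, which defines
 * RENAME to append a CPU suffix before each inclusion -- roughly, in
 * rgb2rgb.c:
 *
 *     #undef RENAME
 *     #define RENAME(a) a ## _MMX
 *     #include "rgb2rgb_template.c"
 *
 * so the rgb2rgb_init above becomes rgb2rgb_init_MMX and every function gets
 * an _MMX-suffixed name; the runtime CPU check then calls the matching init
 * to fill in the public function pointers. */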