/*
 * software RGB to RGB converter
 * software PAL8 (palette) to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */

#include <stddef.h>

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#if HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
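
/* Note: on MMX2, MOVNTQ is a non-temporal store that bypasses the cache, and
 * SFENCE makes those stores globally visible before the buffer is consumed;
 * on plain MMX both fall back to an ordinary movq and a no-op. */
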
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "punpckldq 3%1, %%mm0 \n\t"
            "movd 6%1, %%mm1 \n\t"
            "punpckldq 9%1, %%mm1 \n\t"
            "movd 12%1, %%mm2 \n\t"
            "punpckldq 15%1, %%mm2 \n\t"
            "movd 18%1, %%mm3 \n\t"
            "punpckldq 21%1, %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm2, 16%0 \n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
        *dest++ = 255;
        *dest++ = s[2];
        *dest++ = s[1];
        *dest++ = s[0];
        s += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 16%1, %%mm4 \n\t"
            "movq 24%1, %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#ifdef WORDS_BIGENDIAN
        /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
        s++;
        dest[2] = *s++;
        dest[1] = *s++;
        dest[0] = *s++;
        dest += 3;
#else
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
#endif
    }
}

/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
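    /* Scalar tail using the and&add trick: adding (x & 0x7FE0) to x doubles
     * the R and G fields, i.e. shifts bits 5-14 up by one (exactly the
     * 15->16-bit layout change) while blue (bits 0-4) stays in place; the
     * 0x7FFF mask first clears the unused top bit of each source pixel. */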
    mm_end = end - 3;
    while (s < mm_end)
    {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end)
    {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while (s < mm_end)
    {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end)
    {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
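    /* In this variant, B and R are isolated with mask3216br and then moved
     * into their packed 565 positions by a single multiply-add (pmaddwd with
     * mul3216) instead of the separate shift/mask passes used in the #else
     * branch below. */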
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    mm_end = end - 15;
#if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );
#else
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 4%1, %%mm3 \n\t"
            "punpckldq 8%1, %%mm0 \n\t"
            "punpckldq 12%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#if HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movd %1, %%mm0 \n\t"
            "movd 3%1, %%mm3 \n\t"
            "punpckldq 6%1, %%mm0 \n\t"
            "punpckldq 9%1, %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, %0 \n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
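
/* A minimal illustration of the left bit replication described above, for a
 * 5-bit field; this helper (expand5to8) is hypothetical and not used by the
 * converters below, which keep the cheaper shift-only scheme: */
#if 0
static inline uint8_t expand5to8(uint8_t v)
{
    /* shift the 5 bits to the top of the byte, then repeat the 3 leftmost
     * bits in the freed low bits: 0x1F -> 0xFF, 0x00 -> 0x00 */
    return (uint8_t)((v << 3) | (v >> 2));
}
#endif
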
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8%1, %%mm0 \n\t"
            "movq 8%1, %%mm1 \n\t"
            "movq 8%1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm3 \n\t"
            "psrlq $8, %%mm6 \n\t"
            "psrlq $8, %%mm7 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm1 \n\t"
            "pand %2, %%mm4 \n\t"
            "pand %2, %%mm5 \n\t"
            "pand %3, %%mm2 \n\t"
            "pand %3, %%mm3 \n\t"
            "pand %3, %%mm6 \n\t"
            "pand %3, %%mm7 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "por %%mm6, %%mm4 \n\t"
            "por %%mm7, %%mm5 \n\t"

            "movq %%mm1, %%mm2 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "psllq $48, %%mm2 \n\t"
            "psllq $32, %%mm3 \n\t"
            "pand %4, %%mm2 \n\t"
            "pand %5, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psrlq $16, %%mm1 \n\t"
            "psrlq $32, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm3, %%mm1 \n\t"
            "pand %6, %%mm5 \n\t"
            "por %%mm5, %%mm4 \n\t"

            MOVNTQ" %%mm0, %0 \n\t"
            MOVNTQ" %%mm1, 8%0 \n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm3, 8%0 \n\t"

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $2, %%mm1 \n\t"
            "psrlq $7, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
#if 0 //slightly slower on Athlon
        int bgr= *s++;
        *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
#else
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 255;
#endif

#endif
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    const uint16_t *end;
#if HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
#if HAVE_MMX
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end)
    {
        __asm__ volatile(
            PREFETCH" 32%1 \n\t"
            "movq %1, %%mm0 \n\t"
            "movq %1, %%mm1 \n\t"
            "movq %1, %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $3, %%mm0 \n\t"
            "psrlq $3, %%mm1 \n\t"
            "psrlq $8, %%mm2 \n\t"
            PACK_RGB32
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#endif
    while (s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
#ifdef WORDS_BIGENDIAN
        *d++ = 255;
        *d++ = (bgr&0xF800)>>8;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0x1F)<<3;
#else
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
#endif
    }
}

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
#if HAVE_MMX
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
#if HAVE_MMX2
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
#endif
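    /* Scalar tail: v holds the isolated B and R bytes (word & 0x00FF00FF)
     * and g keeps G and A in place; (v>>16) + (v<<16) swaps the two isolated
     * bytes, exchanging R and B. */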
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
{
    unsigned i;
#if HAVE_MMX
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ASMALIGN(4)
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for (i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else

#if ARCH_ALPHA && HAVE_MVI
#define pl2yuy2(n)                  \
        y1 = yc[n];                 \
        y2 = yc2[n];                \
        u = uc[n];                  \
        v = vc[n];                  \
        __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
        __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
        __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u));   \
        __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v));   \
        yuv1 = (u << 8) + (v << 24);\
        yuv2 = yuv1 + y2;           \
        yuv1 += y1;                 \
        qdst[n]  = yuv1;            \
        qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for (i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            __asm__("ldq $31,64(%0)" :: "r"(yc));
            __asm__("ldq $31,64(%0)" :: "r"(yc2));
            __asm__("ldq $31,64(%0)" :: "r"(uc));
            __asm__("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc    += 4;
            yc2   += 4;
            uc    += 4;
            vc    += 4;
            qdst  += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
                      (yc[1] << 8) + (vc[0] << 0);
#else
            *idst++ = yc[0] + (uc[0] << 8) +
                      (yc[1] << 16) + (vc[0] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           long width, long height,
                                           long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++)
    {
#if HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
#else
//FIXME adapt the Alpha ASM code from yv12->yuy2

#if HAVE_FAST_64BIT
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for (i = 0; i < chromWidth; i++){
#ifdef WORDS_BIGENDIAN
            *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
                      (vc[0] << 8) + (yc[1] << 0);
#else
            *idst++ = uc[0] + (yc[0] << 8) +
                      (vc[0] << 16) + (yc[1] << 24);
#endif
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1)
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#if HAVE_MMX
    __asm__( EMMS" \n\t"
             SFENCE" \n\t"
             :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      long width, long height,
                                      long lumStride, long chromStride, long dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         long width, long height,
                                         long lumStride, long chromStride, long dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ASMALIGN(4)
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height, long lumStride, long chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
{
    long x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++){
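        // Each output sample is a 3:1 weighted blend of the two nearest
        // source samples (bilinear interpolation at quarter-sample phase);
        // the MMX2/3DNow path below approximates that blend with two chained
        // PAVGB averages per output.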
#if HAVE_MMX2 || HAVE_AMD3DNOW
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "1: \n\t"
            "movq (%0, %%"REG_a"), %%mm0 \n\t"
            "movq (%1, %%"REG_a"), %%mm1 \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#else
            "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
            "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
            "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
            "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
#endif
            "add $8, %%"REG_a" \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );
#else
        const x86_reg mmxSize=1;
#endif
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        for (x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for (x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#if HAVE_MMX
    __asm__ volatile( EMMS" \n\t"
                      SFENCE" \n\t"
                      :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      long width, long height,
                                      long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2)
    {
#if HAVE_MMX
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            "pcmpeqw %%mm7, %%mm7               \n\t"
            "psrlw $8, %%mm7                    \n\t" // FF,00,FF,00...
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)      \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0     \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1    \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2                  \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3                  \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0                  \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1                  \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2                    \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3                    \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0              \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2              \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2)   \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3                  \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4                  \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1                  \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2                  \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3                    \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4                    \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1              \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3              \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)  \n\t"

            "movq %%mm0, %%mm2                  \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3                  \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                    \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                    \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2                  \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3                  \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0              \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2              \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a")      \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a")      \n\t"

            "add $8, %%"REG_a"                  \n\t"
            "cmp %4, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)      \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0     \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1    \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2   \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3   \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0                    \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1                    \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2                    \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3                    \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0              \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2              \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)  \n\t"

            "add $8, %%"REG_a"                  \n\t"
            "cmp %4, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
#else
        long i;
        for (i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#if HAVE_MMX
    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#endif
}
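/*
 * Illustrative call of the function above (buffer names are assumed):
 * splitting a width x height UYVY frame into caller-allocated YV12 planes.
 *
 *     RENAME(uyvytoyv12)(uyvy, y_plane, u_plane, v_plane,
 *                        width, height,
 *                        width,      // lumStride: 1 byte of Y per pixel
 *                        width/2,    // chromStride: chroma subsampled 2x2
 *                        width*2);   // srcStride: UYVY is 2 bytes per pixel
 */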
/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       long width, long height,
                                       long lumStride, long chromStride, long srcStride)
{
    long y;
    const x86_reg chromWidth= width>>1;
#if HAVE_MMX
    for (y=0; y<height-2; y+=2)
    {
        long i;
        for (i=0; i<2; i++)
        {
            __asm__ volatile(
                "mov %2, %%"REG_a"                  \n\t"
                "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
                "pxor %%mm7, %%mm7                  \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
                ASMALIGN(4)
                "1:                                 \n\t"
                PREFETCH" 64(%0, %%"REG_d")         \n\t"
                "movd (%0, %%"REG_d"), %%mm0        \n\t"
                "movd 3(%0, %%"REG_d"), %%mm1       \n\t"
                "punpcklbw %%mm7, %%mm0             \n\t"
                "punpcklbw %%mm7, %%mm1             \n\t"
                "movd 6(%0, %%"REG_d"), %%mm2       \n\t"
                "movd 9(%0, %%"REG_d"), %%mm3       \n\t"
                "punpcklbw %%mm7, %%mm2             \n\t"
                "punpcklbw %%mm7, %%mm3             \n\t"
                "pmaddwd %%mm6, %%mm0               \n\t"
                "pmaddwd %%mm6, %%mm1               \n\t"
                "pmaddwd %%mm6, %%mm2               \n\t"
                "pmaddwd %%mm6, %%mm3               \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0                    \n\t"
                "psrad $8, %%mm1                    \n\t"
                "psrad $8, %%mm2                    \n\t"
                "psrad $8, %%mm3                    \n\t"
#endif
                "packssdw %%mm1, %%mm0              \n\t"
                "packssdw %%mm3, %%mm2              \n\t"
                "pmaddwd %%mm5, %%mm0               \n\t"
                "pmaddwd %%mm5, %%mm2               \n\t"
                "packssdw %%mm2, %%mm0              \n\t"
                "psraw $7, %%mm0                    \n\t"

                "movd 12(%0, %%"REG_d"), %%mm4      \n\t"
                "movd 15(%0, %%"REG_d"), %%mm1      \n\t"
                "punpcklbw %%mm7, %%mm4             \n\t"
                "punpcklbw %%mm7, %%mm1             \n\t"
                "movd 18(%0, %%"REG_d"), %%mm2      \n\t"
                "movd 21(%0, %%"REG_d"), %%mm3      \n\t"
                "punpcklbw %%mm7, %%mm2             \n\t"
                "punpcklbw %%mm7, %%mm3             \n\t"
                "pmaddwd %%mm6, %%mm4               \n\t"
                "pmaddwd %%mm6, %%mm1               \n\t"
                "pmaddwd %%mm6, %%mm2               \n\t"
                "pmaddwd %%mm6, %%mm3               \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                    \n\t"
                "psrad $8, %%mm1                    \n\t"
                "psrad $8, %%mm2                    \n\t"
                "psrad $8, %%mm3                    \n\t"
#endif
                "packssdw %%mm1, %%mm4              \n\t"
                "packssdw %%mm3, %%mm2              \n\t"
                "pmaddwd %%mm5, %%mm4               \n\t"
                "pmaddwd %%mm5, %%mm2               \n\t"
                "add $24, %%"REG_d"                 \n\t"
                "packssdw %%mm2, %%mm4              \n\t"
                "psraw $7, %%mm4                    \n\t"

                "packuswb %%mm4, %%mm0              \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"REG_a")      \n\t"
                "add $8, %%"REG_a"                  \n\t"
                " js 1b                             \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"REG_a"                  \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
            "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7                  \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d"           \n\t"
            ASMALIGN(4)
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_d")         \n\t"
            PREFETCH" 64(%1, %%"REG_d")         \n\t"
#if HAVE_MMX2 || HAVE_AMD3DNOW
            "movq (%0, %%"REG_d"), %%mm0        \n\t"
            "movq (%1, %%"REG_d"), %%mm1        \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2       \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3       \n\t"
            PAVGB" %%mm1, %%mm0                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm0, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm0                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm0                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm0             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
            "movd (%0, %%"REG_d"), %%mm0        \n\t"
            "movd (%1, %%"REG_d"), %%mm1        \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2       \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm0             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm0                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm2, %%mm0                 \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4       \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1       \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2       \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm4                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm4, %%mm2                 \n\t"
            "psrlw $2, %%mm0                    \n\t"
            "psrlw $2, %%mm2                    \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1               \n\t"
            "pmaddwd %%mm2, %%mm3               \n\t"
            "pmaddwd %%mm6, %%mm0               \n\t"
            "pmaddwd %%mm6, %%mm2               \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0                    \n\t"
            "psrad $8, %%mm1                    \n\t"
            "psrad $8, %%mm2                    \n\t"
            "psrad $8, %%mm3                    \n\t"
#endif
            "packssdw %%mm2, %%mm0              \n\t"
            "packssdw %%mm3, %%mm1              \n\t"
            "pmaddwd %%mm5, %%mm0               \n\t"
            "pmaddwd %%mm5, %%mm1               \n\t"
            "packssdw %%mm1, %%mm0              \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0                    \n\t"

#if HAVE_MMX2 || HAVE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2      \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3      \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm4, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm4                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2      \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm4                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm2, %%mm4                 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5      \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1      \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2      \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm5             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm5                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm5, %%mm2                 \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
            "psrlw $2, %%mm4                    \n\t"
            "psrlw $2, %%mm2                    \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1               \n\t"
            "pmaddwd %%mm2, %%mm3               \n\t"
            "pmaddwd %%mm6, %%mm4               \n\t"
            "pmaddwd %%mm6, %%mm2               \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4                    \n\t"
            "psrad $8, %%mm1                    \n\t"
            "psrad $8, %%mm2                    \n\t"
            "psrad $8, %%mm3                    \n\t"
#endif
            "packssdw %%mm2, %%mm4              \n\t"
            "packssdw %%mm3, %%mm1              \n\t"
            "pmaddwd %%mm5, %%mm4               \n\t"
            "pmaddwd %%mm5, %%mm1               \n\t"
            "add $24, %%"REG_d"                 \n\t"
            "packssdw %%mm1, %%mm4              \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4                    \n\t"

            "movq %%mm0, %%mm1                  \n\t"
            "punpckldq %%mm4, %%mm0             \n\t"
            "punpckhdq %%mm4, %%mm1             \n\t"
            "packsswb %%mm1, %%mm0              \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a")        \n\t"
            "punpckhdq %%mm0, %%mm0             \n\t"
            "movd %%mm0, (%3, %%"REG_a")        \n\t"
            "add $4, %%"REG_a"                  \n\t"
            " js 1b                             \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS"       \n\t"
                     SFENCE"     \n\t"
                     :::"memory");
#else
    y=0;
#endif
    for (; y<height; y+=2)
    {
        long i;
        for (i=0; i<chromWidth; i++)
        {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i]   = U;
            vdst[i]   = V;
            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;

        for (i=0; i<chromWidth; i++)
        {
            unsigned int b = src[6*i+0];
            unsigned int g = src[6*i+1];
            unsigned int r = src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i] = Y;

            b = src[6*i+3];
            g = src[6*i+4];
            r = src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
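/*
 * The scalar fallback above is plain fixed-point colorspace conversion:
 * RY/GY/BY and friends are the BT.601 matrix coefficients pre-scaled by
 * 2^RGB2YUV_SHIFT, so a single right shift stands in for a floating-point
 * multiply.  With the usual 15-bit scaling this maps black (r=g=b=0) to
 * Y = 16 exactly and white (r=g=b=255) to the top of the studio-swing luma
 * range (about 235), with U and V centered on 128.
 */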
static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                                    long width, long height, long src1Stride,
                                    long src2Stride, long dstStride){
    long h;

    for (h=0; h < height; h++)
    {
        long w;

#if HAVE_MMX
#if HAVE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%1, %%"REG_a")         \n\t"
            PREFETCH" 64(%2, %%"REG_a")         \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0     \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1     \n\t" // same 16 bytes again; low/high halves are unpacked separately
            "movdqa (%2, %%"REG_a"), %%xmm2     \n\t"
            "punpcklbw %%xmm2, %%xmm0           \n\t"
            "punpckhbw %%xmm2, %%xmm1           \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a"                 \n\t"
            "cmp %3, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a"           \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%1, %%"REG_a")         \n\t"
            PREFETCH" 64(%2, %%"REG_a")         \n\t"
            "movq (%1, %%"REG_a"), %%mm0        \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2       \n\t"
            "movq %%mm0, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "movq (%2, %%"REG_a"), %%mm4        \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5       \n\t"
            "punpcklbw %%mm4, %%mm0             \n\t"
            "punpckhbw %%mm4, %%mm1             \n\t"
            "punpcklbw %%mm5, %%mm2             \n\t"
            "punpckhbw %%mm5, %%mm3             \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2)   \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)  \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a"                 \n\t"
            "cmp %3, %%"REG_a"                  \n\t"
            " jb 1b                             \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        for (w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for (w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
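/*
 * interleaveBytes zips two byte planes into one: dest[2*w] comes from src1
 * and dest[2*w+1] from src2 on every row.  Sketch of a typical use (buffer
 * names assumed), building an NV12-style interleaved UV plane from planar
 * U and V:
 *
 *     RENAME(interleaveBytes)(u_plane, v_plane, uv_plane,
 *                             width/2, height/2,  // chroma plane size
 *                             width/2, width/2,   // source strides
 *                             width);             // dest stride, 2 bytes/sample
 */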
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       long width, long height,
                                       long srcStride1, long srcStride2,
                                       long dstStride1, long dstStride2)
{
    x86_reg y;
    long x,w,h;
    w=width/2; h=height/2;
#if HAVE_MMX
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for (y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32)
        {
            __asm__ volatile(
                PREFETCH" 32%1          \n\t"
                "movq %1, %%mm0         \n\t"
                "movq 8%1, %%mm2        \n\t"
                "movq 16%1, %%mm4       \n\t"
                "movq 24%1, %%mm6       \n\t"
                "movq %%mm0, %%mm1      \n\t"
                "movq %%mm2, %%mm3      \n\t"
                "movq %%mm4, %%mm5      \n\t"
                "movq %%mm6, %%mm7      \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0       \n\t"
                MOVNTQ" %%mm1, 8%0      \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#if HAVE_MMX
        for (;x<w-31;x+=32)
        {
            __asm__ volatile(
                PREFETCH" 32%1          \n\t"
                "movq %1, %%mm0         \n\t"
                "movq 8%1, %%mm2        \n\t"
                "movq 16%1, %%mm4       \n\t"
                "movq 24%1, %%mm6       \n\t"
                "movq %%mm0, %%mm1      \n\t"
                "movq %%mm2, %%mm3      \n\t"
                "movq %%mm4, %%mm5      \n\t"
                "movq %%mm6, %%mm7      \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, %0       \n\t"
                MOVNTQ" %%mm1, 8%0      \n\t"
                MOVNTQ" %%mm2, 16%0     \n\t"
                MOVNTQ" %%mm3, 24%0     \n\t"
                MOVNTQ" %%mm4, 32%0     \n\t"
                MOVNTQ" %%mm5, 40%0     \n\t"
                MOVNTQ" %%mm6, 48%0     \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
#endif
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
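/*
 * The punpcklbw/punpckhbw pairs above unpack each register against itself,
 * which duplicates every byte ("abcdefgh" becomes "aabbccdd" / "eeffgghh").
 * That is exactly the horizontal pixel doubling the scalar tail performs
 * with d[2*x] = d[2*x+1] = s1[x], just 32 pixels per iteration.
 */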
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        long width, long height,
                                        long srcStride1, long srcStride2,
                                        long srcStride3, long dstStride)
{
    x86_reg x;
    long y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#if HAVE_MMX
        for (;x<w-7;x+=8)
        {
            __asm__ volatile(
                PREFETCH" 32(%1, %0)            \n\t"
                PREFETCH" 32(%2, %0)            \n\t"
                PREFETCH" 32(%3, %0)            \n\t"
                "movq (%1, %0, 4), %%mm0        \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1           \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2           \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3              \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4              \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5              \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1         \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2         \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4         \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5         \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6              \n\t"
                "punpcklbw %%mm2, %%mm1         \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0         \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3         \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8)      \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8)     \n\t"

                "punpckhbw %%mm2, %%mm6         \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0       \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8)    \n\t"

                "movq %%mm4, %%mm6              \n\t"
                "movq 16(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm5, %%mm4         \n\t"
                "punpcklbw %%mm4, %%mm0         \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3         \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8)    \n\t"

                "punpckhbw %%mm5, %%mm6         \n\t"
                "movq 24(%1, %0, 4), %%mm0      \n\t"
                "movq %%mm0, %%mm3              \n\t"
                "punpcklbw %%mm6, %%mm0         \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3         \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8)    \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8)    \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
#endif
        for (; x<w; x++)
        {
            const long x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}
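/*
 * The scalar tail above doubles as documentation of the packed YUY2 byte
 * order: each 4-byte group is Y0 U Y1 V.  Because YVU9 chroma is subsampled
 * 4x4, one U/V pair is reused across four luma samples, hence the y>>2 row
 * selection and the x2 = x<<2 luma indexing.
 */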
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst +=   count;
    src += 2*count;
    count= - count;

#if HAVE_MMX
    if(count <= -16){
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t"
            "1:                             \n\t"
            "movq -30(%1, %0, 2), %%mm0     \n\t"
            "movq -22(%1, %0, 2), %%mm1     \n\t"
            "movq -14(%1, %0, 2), %%mm2     \n\t"
            "movq  -6(%1, %0, 2), %%mm3     \n\t"
            "pand %%mm7, %%mm0              \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm2              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            MOVNTQ" %%mm0,-15(%2, %0)       \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0)       \n\t"
            "add $16, %0                    \n\t"
            " js 1b                         \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
#endif
    while(count<0){
        dst[count]= src[2*count];
        count++;
    }
}
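/*
 * extract_even above and the extract_*2 variants below share one loop
 * idiom: advance the pointers past the end of the data, negate count, and
 * walk a negative index up toward zero.  The asm loop condition then comes
 * for free from the sign flag (" js 1b") with no separate compare, and the
 * C tail reuses the same negative index for the leftover samples.
 */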
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t"
            "1:                             \n\t"
            "movq -28(%1, %0, 4), %%mm0     \n\t"
            "movq -20(%1, %0, 4), %%mm1     \n\t"
            "movq -12(%1, %0, 4), %%mm2     \n\t"
            "movq  -4(%1, %0, 4), %%mm3     \n\t"
            "pand %%mm7, %%mm0              \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm2              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlw $8, %%mm0                \n\t"
            "psrlw $8, %%mm2                \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0)       \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0)       \n\t"
            "add $8, %0                     \n\t"
            " js 1b                         \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}

static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t"
            "1:                             \n\t"
            "movq -28(%1, %0, 4), %%mm0     \n\t"
            "movq -20(%1, %0, 4), %%mm1     \n\t"
            "movq -12(%1, %0, 4), %%mm2     \n\t"
            "movq  -4(%1, %0, 4), %%mm3     \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0    \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1    \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2    \n\t"
            PAVGB"  -4(%2, %0, 4), %%mm3    \n\t"
            "pand %%mm7, %%mm0              \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm2              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlw $8, %%mm0                \n\t"
            "psrlw $8, %%mm2                \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0)       \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0)       \n\t"
            "add $8, %0                     \n\t"
            " js 1b                         \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0){
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+=   count;
    dst1+=   count;
    src += 4*count;
    count= - count;
#if HAVE_MMX
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t"
            "1:                             \n\t"
            "movq -28(%1, %0, 4), %%mm0     \n\t"
            "movq -20(%1, %0, 4), %%mm1     \n\t"
            "movq -12(%1, %0, 4), %%mm2     \n\t"
            "movq  -4(%1, %0, 4), %%mm3     \n\t"
            "psrlw $8, %%mm0                \n\t"
            "psrlw $8, %%mm1                \n\t"
            "psrlw $8, %%mm2                \n\t"
            "psrlw $8, %%mm3                \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlw $8, %%mm0                \n\t"
            "psrlw $8, %%mm2                \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0)       \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0)       \n\t"
            "add $8, %0                     \n\t"
            " js 1b                         \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src++;
    while(count<0){
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}

static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 +=   count;
    dst1 +=   count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8){
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7           \n\t"
            "psrlw $8, %%mm7                \n\t"
            "1:                             \n\t"
            "movq -28(%1, %0, 4), %%mm0     \n\t"
            "movq -20(%1, %0, 4), %%mm1     \n\t"
            "movq -12(%1, %0, 4), %%mm2     \n\t"
            "movq  -4(%1, %0, 4), %%mm3     \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0    \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1    \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2    \n\t"
            PAVGB"  -4(%2, %0, 4), %%mm3    \n\t"
            "psrlw $8, %%mm0                \n\t"
            "psrlw $8, %%mm1                \n\t"
            "psrlw $8, %%mm2                \n\t"
            "psrlw $8, %%mm3                \n\t"
            "packuswb %%mm1, %%mm0          \n\t"
            "packuswb %%mm3, %%mm2          \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlw $8, %%mm0                \n\t"
            "psrlw $8, %%mm2                \n\t"
            "pand %%mm7, %%mm1              \n\t"
            "pand %%mm7, %%mm3              \n\t"
            "packuswb %%mm2, %%mm0          \n\t"
            "packuswb %%mm3, %%mm1          \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0)       \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0)       \n\t"
            "add $8, %0                     \n\t"
            " js 1b                         \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0){
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);
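    /* Note: -((-width)>>1) rounds width/2 upward, so an odd width still gets
       a chroma sample for its final pixel (plain width>>1 would round down).
       The three functions below use the same idiom. */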

    for (y=0; y<height; y++){
        RENAME(extract_even)(src, ydst, width);
        if(y&1){
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}

static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);

    for (y=0; y<height; y++){
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}

static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);

    for (y=0; y<height; y++){
        RENAME(extract_even)(src+1, ydst, width);
        if(y&1){
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}

static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 long width, long height,
                                 long lumStride, long chromStride, long srcStride)
{
    long y;
    const long chromWidth= -((-width)>>1);

    for (y=0; y<height; y++){
        RENAME(extract_even)(src+1, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
#if HAVE_MMX
    __asm__(
        EMMS"       \n\t"
        SFENCE"     \n\t"
        ::: "memory"
    );
#endif
}

static inline void RENAME(rgb2rgb_init)(void){
    rgb15to16       = RENAME(rgb15to16);
    rgb15tobgr24    = RENAME(rgb15tobgr24);
    rgb15to32       = RENAME(rgb15to32);
    rgb16tobgr24    = RENAME(rgb16tobgr24);
    rgb16to32       = RENAME(rgb16to32);
    rgb16to15       = RENAME(rgb16to15);
    rgb24tobgr16    = RENAME(rgb24tobgr16);
    rgb24tobgr15    = RENAME(rgb24tobgr15);
    rgb24tobgr32    = RENAME(rgb24tobgr32);
    rgb32to16       = RENAME(rgb32to16);
    rgb32to15       = RENAME(rgb32to15);
    rgb32tobgr24    = RENAME(rgb32tobgr24);
    rgb24to15       = RENAME(rgb24to15);
    rgb24to16       = RENAME(rgb24to16);
    rgb24tobgr24    = RENAME(rgb24tobgr24);
    rgb32tobgr32    = RENAME(rgb32tobgr32);
    rgb32tobgr16    = RENAME(rgb32tobgr16);
    rgb32tobgr15    = RENAME(rgb32tobgr15);
    yv12toyuy2      = RENAME(yv12toyuy2);
    yv12touyvy      = RENAME(yv12touyvy);
    yuv422ptoyuy2   = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy   = RENAME(yuv422ptouyvy);
    yuy2toyv12      = RENAME(yuy2toyv12);
//  yvu9toyv12      = RENAME(yvu9toyv12);
    planar2x        = RENAME(planar2x);
    rgb24toyv12     = RENAME(rgb24toyv12);
    interleaveBytes = RENAME(interleaveBytes);
    vu9_to_vu12     = RENAME(vu9_to_vu12);
    yvu9_to_yuy2    = RENAME(yvu9_to_yuy2);

    uyvytoyuv420    = RENAME(uyvytoyuv420);
    uyvytoyuv422    = RENAME(uyvytoyuv422);
    yuyvtoyuv420    = RENAME(yuyvtoyuv420);
    yuyvtoyuv422    = RENAME(yuyvtoyuv422);
}
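/*
 * rgb2rgb_init is the per-CPU dispatch point: this template is compiled once
 * per instruction-set flavour with RENAME() appending a suffix, and the
 * runtime CPU detection picks which variant's init to run.  A sketch of the
 * pattern (suffix and flag names assumed for illustration):
 *
 *     #undef  RENAME
 *     #define RENAME(a) a ## _MMX2
 *     #include "rgb2rgb_template.c"  // defines rgb2rgb_init_MMX2(), ...
 *
 *     if (cpu_flags & SWS_CPU_CAPS_MMX2)
 *         rgb2rgb_init_MMX2();       // function pointers now hit MMX2 code
 */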