/*
 * rgb2rgb.c, Software RGB to RGB converter
 *            plus Software PAL8 to RGB converter
 *            Software YUV to YUV converter
 *            Software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at) (under GPL)
 */
#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */

#ifndef __WORDSIZE
// #warning You have a misconfigured system and will probably lose performance!
#define __WORDSIZE MP_WORDSIZE
#endif
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef MMREG_SIZE
#undef PREFETCHW
#undef PAVGB

#ifdef HAVE_SSE2
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif
#ifdef HAVE_3DNOW
#define PREFETCH "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB "pavgusb"
#elif defined (HAVE_MMX2)
#define PREFETCH "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_3DNOW
/* On a K6 femms is faster than emms. On a K7 femms is mapped directly onto emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif

/* Expands 24-bit pixels to 32-bit by appending a zero fourth byte per pixel. */
static inline void RENAME(rgb24to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "punpckldq 3%1, %%mm0\n\t"
            "movd 6%1, %%mm1\n\t"
            "punpckldq 9%1, %%mm1\n\t"
            "movd 12%1, %%mm2\n\t"
            "punpckldq 15%1, %%mm2\n\t"
            "movd 18%1, %%mm3\n\t"
            "punpckldq 21%1, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm1\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 0;
    }
}

/* Packs 32-bit pixels down to 24-bit by dropping the fourth byte of each pixel. */
static inline void RENAME(rgb32to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm5\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"
            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"
            :"=m"(*dest)
            :"m"(*s),"m"(mask24l),
             "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
 Original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and the and&add trick by Michael Niedermayer
*/
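/* Note (added): a worked example of the and&add trick used by rgb15to16 below,
   kept for illustration only. A 15-bit pixel is 0RRRRRGGGGGBBBBB; adding the
   red+green field to the whole value shifts those bits left by one while blue
   stays put:

       (x & 0x7FFF) + (x & 0x7FE0)  ->  RRRRRGGGGG0BBBBB

   e.g. x = 0x7FFF (15-bit white) gives 0x7FFF + 0x7FE0 = 0xFFDF, i.e. 565 white
   with the low green bit zero. A single add replaces a shift/mask/or sequence,
   and it vectorizes: two pixels per 32-bit word, eight per MMX register. */
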
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "pand %%mm4, %%mm0\n\t"
            "pand %%mm4, %%mm2\n\t"
            "paddw %%mm1, %%mm0\n\t"
            "paddw %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register unsigned x= *((uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if(s < end)
    {
        register unsigned short x= *((uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s));
    __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "psrlq $1, %%mm0\n\t"
            "psrlq $1, %%mm2\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm3\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm3, %%mm2\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm2, 8%0"
            :"=m"(*d)
            :"m"(*s)
            );
        d+=16;
        s+=16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    mm_end = end - 3;
    while(s < mm_end)
    {
        register uint32_t x= *((uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if(s < end)
    {
        register uint16_t x= *((uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
        s+=2;
        d+=2;
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
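    /* Note (added): a rough sketch of why the multiply path can win. With the
       blue and red fields isolated in the two 16-bit halves of each 32-bit
       lane, one pmaddwd against the constants in mul3216 scales and sums both
       fields at once, so the shared psrld/pslld below can drop them onto their
       565 positions; green is merged in separately via mask3216g and por. This
       trades two shift+mask pairs per pixel for a single multiply-add. */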
    asm volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "addl $16, %1 \n\t"
        "addl $8, %0 \n\t"
        "cmpl %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
        );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        // read one 32-bit pixel; the old "*((uint32_t*)s)++" cast-as-lvalue
        // GCC extension is replaced by an explicit read-and-advance
        const int src= *(const uint32_t*)s; s += 4;
        *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
        // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src= *(const uint32_t*)s; s += 4;
        *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    mm_end = end - 15;
#if 1 // is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
    asm volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "addl $16, %1 \n\t"
        "addl $8, %0 \n\t"
        "cmpl %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
        );
#else
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
#endif
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src= *(const uint32_t*)s; s += 4;
        *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int src= *(const uint32_t*)s; s += 4;
        *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $8, %%mm0\n\t"
            "psllq $8, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $5, %%mm1\n\t"
            "psrlq $5, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psrlq $3, %%mm0\n\t"
            "psrlq $3, %%mm3\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $9, %%mm2\n\t"
            "psrlq $9, %%mm5\n\t"
            "pand %%mm7, %%mm2\n\t"
            "pand %%mm7, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int b= *s++;
        const int g= *s++;
        const int r= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
#ifdef HAVE_MMX
    const uint8_t *mm_end;
#endif
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq %0, %%mm7\n\t"
        "movq %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd %1, %%mm0\n\t"
            "movd 3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm3, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "psllq $7, %%mm0\n\t"
            "psllq $7, %%mm3\n\t"
            "pand %%mm7, %%mm0\n\t"
            "pand %%mm7, %%mm3\n\t"
            "psrlq $6, %%mm1\n\t"
            "psrlq $6, %%mm4\n\t"
            "pand %%mm6, %%mm1\n\t"
            "pand %%mm6, %%mm4\n\t"
            "psrlq $19, %%mm2\n\t"
            "psrlq $19, %%mm5\n\t"
            "pand %2, %%mm2\n\t"
            "pand %2, %%mm5\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm5, %%mm3\n\t"
            "psllq $16, %%mm3\n\t"
            "por %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        const int r= *s++;
        const int g= *s++;
        const int b= *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

/*
  I use here a less accurate approximation: simply left-shift the input value
  and fill the low-order bits with zeroes. This method improves PNG
  compression, but the scheme cannot reproduce white exactly, since it does not
  generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method would be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3  2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
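/* Note (added): a minimal scalar sketch of that left bit replication, shown
   for illustration only (the converters below keep the cheaper
   shift-and-zero-fill):

       uint8_t v8 = (uint8_t)((v5 << 3) | (v5 >> 2));   // expand 5 -> 8 bits

   This maps 0x00 -> 0x00 and 0x1F -> 0xFF, so full white stays full white. */
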
static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
    }
}

static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"

            "movq %%mm0, %%mm6\n\t"
            "movq %%mm3, %%mm7\n\t"

            "movq 8%1, %%mm0\n\t"
            "movq 8%1, %%mm1\n\t"
            "movq 8%1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %5, %%mm0\n\t"
            "punpcklwd %5, %%mm1\n\t"
            "punpcklwd %5, %%mm2\n\t"
            "punpckhwd %5, %%mm3\n\t"
            "punpckhwd %5, %%mm4\n\t"
            "punpckhwd %5, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
            :"memory");
        /* Borrowed 32 to 24 */
        __asm __volatile(
            "movq %%mm0, %%mm4\n\t"
            "movq %%mm3, %%mm5\n\t"
            "movq %%mm6, %%mm0\n\t"
            "movq %%mm7, %%mm1\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq %%mm5, %%mm7\n\t"
            "movq %%mm0, %%mm2\n\t"
            "movq %%mm1, %%mm3\n\t"

            "psrlq $8, %%mm2\n\t"
            "psrlq $8, %%mm3\n\t"
            "psrlq $8, %%mm6\n\t"
            "psrlq $8, %%mm7\n\t"
            "pand %2, %%mm0\n\t"
            "pand %2, %%mm1\n\t"
            "pand %2, %%mm4\n\t"
            "pand %2, %%mm5\n\t"
            "pand %3, %%mm2\n\t"
            "pand %3, %%mm3\n\t"
            "pand %3, %%mm6\n\t"
            "pand %3, %%mm7\n\t"
            "por %%mm2, %%mm0\n\t"
            "por %%mm3, %%mm1\n\t"
            "por %%mm6, %%mm4\n\t"
            "por %%mm7, %%mm5\n\t"

            "movq %%mm1, %%mm2\n\t"
            "movq %%mm4, %%mm3\n\t"
            "psllq $48, %%mm2\n\t"
            "psllq $32, %%mm3\n\t"
            "pand %4, %%mm2\n\t"
            "pand %5, %%mm3\n\t"
            "por %%mm2, %%mm0\n\t"
            "psrlq $16, %%mm1\n\t"
            "psrlq $32, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm3, %%mm1\n\t"
            "pand %6, %%mm5\n\t"
            "por %%mm5, %%mm4\n\t"

            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm4, 16%0"

            :"=m"(*d)
            :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
    }
}

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $2, %%mm1\n\t"
            "psrlq $7, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
#if 0 // slightly slower on Athlon
        int bgr= *s++;
        *(uint32_t*)d = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
        d += 4;
#else
        // FIXME: this is very likely wrong for big-endian (and the following converters, too)
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x3E0)>>2;
        *d++ = (bgr&0x7C00)>>7;
        *d++ = 0;
#endif
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint16_t *end;
#ifdef HAVE_MMX
    const uint16_t *mm_end;
#endif
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
#ifdef HAVE_MMX
    __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
    mm_end = end - 3;
    while(s < mm_end)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq %1, %%mm1\n\t"
            "movq %1, %%mm2\n\t"
            "pand %2, %%mm0\n\t"
            "pand %3, %%mm1\n\t"
            "pand %4, %%mm2\n\t"
            "psllq $3, %%mm0\n\t"
            "psrlq $3, %%mm1\n\t"
            "psrlq $8, %%mm2\n\t"
            "movq %%mm0, %%mm3\n\t"
            "movq %%mm1, %%mm4\n\t"
            "movq %%mm2, %%mm5\n\t"
            "punpcklwd %%mm7, %%mm0\n\t"
            "punpcklwd %%mm7, %%mm1\n\t"
            "punpcklwd %%mm7, %%mm2\n\t"
            "punpckhwd %%mm7, %%mm3\n\t"
            "punpckhwd %%mm7, %%mm4\n\t"
            "punpckhwd %%mm7, %%mm5\n\t"
            "psllq $8, %%mm1\n\t"
            "psllq $16, %%mm2\n\t"
            "por %%mm1, %%mm0\n\t"
            "por %%mm2, %%mm0\n\t"
            "psllq $8, %%mm4\n\t"
            "psllq $16, %%mm5\n\t"
            "por %%mm4, %%mm3\n\t"
            "por %%mm5, %%mm3\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm3, 8%0\n\t"
            :"=m"(*d)
            :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    while(s < end)
    {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 0;
    }
}

static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
#ifdef HAVE_MMX
/* TODO: unroll this loop */
    asm volatile (
        "xorl %%eax, %%eax \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%0, %%eax) \n\t"
        "movq (%0, %%eax), %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm0, %%mm2 \n\t"
        "pslld $16, %%mm0 \n\t"
        "psrld $16, %%mm1 \n\t"
        "pand "MANGLE(mask32r)", %%mm0 \n\t"
        "pand "MANGLE(mask32g)", %%mm2 \n\t"
        "pand "MANGLE(mask32b)", %%mm1 \n\t"
        "por %%mm0, %%mm2 \n\t"
        "por %%mm1, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%eax) \n\t"
        "addl $8, %%eax \n\t"
        "cmpl %2, %%eax \n\t"
        " jb 1b \n\t"
        :: "r" (src), "r"(dst), "r" (src_size-7)
        : "%eax"
        );

    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#else
    unsigned i;
    unsigned num_pixels = src_size >> 2;
    for(i=0; i<num_pixels; i++)
    {
#ifdef WORDS_BIGENDIAN
        dst[4*i + 1] = src[4*i + 3];
        dst[4*i + 2] = src[4*i + 2];
        dst[4*i + 3] = src[4*i + 1];
#else
        dst[4*i + 0] = src[4*i + 2];
        dst[4*i + 1] = src[4*i + 1];
        dst[4*i + 2] = src[4*i + 0];
#endif
    }
#endif
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    unsigned i;
#ifdef HAVE_MMX
    int mmx_size= 23 - src_size;
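    /* Note (added): the loop below runs its byte offset in %%eax from the
       negative value mmx_size up toward zero; src and dst are passed in
       pre-biased by -mmx_size, so the same register serves as both array
       index and termination test (" js 1b" loops while it is negative). */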
    asm volatile (
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".balign 16 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%eax) \n\t"
        "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
        "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
        "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
        "addl $24, %%eax \n\t"
        " js 1b \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
        );
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");

    if(mmx_size==23) return; // finished, was a multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
#endif
    for(i=0; i<src_size; i+=3)
    {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           unsigned int width, unsigned int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%eax"
            );
#else

#if defined ARCH_ALPHA && defined HAVE_MVI
#define pl2yuy2(n)                                  \
        y1 = yc[n];                                 \
        y2 = yc2[n];                                \
        u = uc[n];                                  \
        v = vc[n];                                  \
        asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));  \
        asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));  \
        asm("unpkbl %1, %0" : "=r"(u) : "r"(u));    \
        asm("unpkbl %1, %0" : "=r"(v) : "r"(v));    \
        yuv1 = (u << 8) + (v << 24);                \
        yuv2 = yuv1 + y2;                           \
        yuv1 += y1;                                 \
        qdst[n] = yuv1;                             \
        qdst2[n] = yuv2;

        int i;
        uint64_t *qdst = (uint64_t *) dst;
        uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
        const uint32_t *yc = (uint32_t *) ysrc;
        const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
        const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
        for(i = 0; i < chromWidth; i += 8){
            uint64_t y1, y2, yuv1, yuv2;
            uint64_t u, v;
            /* Prefetch */
            asm("ldq $31,64(%0)" :: "r"(yc));
            asm("ldq $31,64(%0)" :: "r"(yc2));
            asm("ldq $31,64(%0)" :: "r"(uc));
            asm("ldq $31,64(%0)" :: "r"(vc));

            pl2yuy2(0);
            pl2yuy2(1);
            pl2yuy2(2);
            pl2yuy2(3);

            yc += 4;
            yc2 += 4;
            uc += 4;
            vc += 4;
            qdst += 4;
            qdst2 += 4;
        }
        y++;
        ysrc += lumStride;
        dst += dstStride;

#elif __WORDSIZE >= 64
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = yc[0] + (uc[0] << 8) +
                (yc[1] << 16) + (vc[0] << 24);
            l = yc[2] + (uc[1] << 8) +
                (yc[3] << 16) + (vc[1] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = yc[0] + (uc[0] << 8) +
                      (yc[1] << 16) + (vc[0] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if((y&(vertLumPerChroma-1)) == (vertLumPerChroma-1))
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      unsigned int width, unsigned int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           unsigned int width, unsigned int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y++)
    {
#ifdef HAVE_MMX
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%eax, 2) \n\t"
            PREFETCH" 32(%2, %%eax) \n\t"
            PREFETCH" 32(%3, %%eax) \n\t"
            "movq (%2, %%eax), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%eax), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
            "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)

            MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%eax"
            );
#else
        //FIXME adapt the Alpha asm code from yv12->yuy2

#if __WORDSIZE >= 64
        int i;
        uint64_t *ldst = (uint64_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i += 2){
            uint64_t k, l;
            k = uc[0] + (yc[0] << 8) +
                (vc[0] << 16) + (yc[1] << 24);
            l = uc[1] + (yc[2] << 8) +
                (vc[1] << 16) + (yc[3] << 24);
            *ldst++ = k + (l << 32);
            yc += 4;
            uc += 2;
            vc += 2;
        }

#else
        int i, *idst = (int32_t *) dst;
        const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
        for(i = 0; i < chromWidth; i++){
            *idst++ = uc[0] + (yc[0] << 8) +
                      (vc[0] << 16) + (yc[1] << 24);
            yc += 2;
            uc++;
            vc++;
        }
#endif
#endif
        if((y&(vertLumPerChroma-1)) == (vertLumPerChroma-1))
        {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
#ifdef HAVE_MMX
    asm( EMMS" \n\t"
         SFENCE" \n\t"
         :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      unsigned int width, unsigned int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         unsigned int width, unsigned int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      int lumStride, int chromStride, int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax"
            );

        ydst += lumStride;
        src  += srcStride;

        // second line: luma only (%%mm7 still holds the FF,00 mask from above)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax"
            );
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            udst[i]     = src[4*i+1];
            ydst[2*i+1] = src[4*i+2];
            vdst[i]     = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+0];
            ydst[2*i+1] = src[4*i+2];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}

static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
                                      uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height, int lumStride, int chromStride)
{
    /* Y Plane */
    memcpy(ydst, ysrc, width*height);

    /* XXX: implement upscaling for U,V */
}

static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for(x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

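    /* Note (added): each interpolated sample in this function is a 3:1
       weighted average of its two nearest source samples,
       dst = (3*near + far)>>2; the MMX2/3DNOW path inside the loop below
       approximates the same weights with two successive PAVGB steps,
       avg(avg(far, near), near). */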
    for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
        const int mmxSize= srcWidth&~15;
        asm volatile(
            "movl %4, %%eax \n\t"
            "1: \n\t"
            "movq (%0, %%eax), %%mm0 \n\t"
            "movq (%1, %%eax), %%mm1 \n\t"
            "movq 1(%0, %%eax), %%mm2 \n\t"
            "movq 1(%1, %%eax), %%mm3 \n\t"
            "movq -1(%0, %%eax), %%mm4 \n\t"
            "movq -1(%1, %%eax), %%mm5 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm0, %%mm5 \n\t"
            PAVGB" %%mm0, %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm1, %%mm2 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "punpcklbw %%mm3, %%mm5 \n\t"
            "punpckhbw %%mm3, %%mm7 \n\t"
            "punpcklbw %%mm2, %%mm4 \n\t"
            "punpckhbw %%mm2, %%mm6 \n\t"
#if 1
            MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
            MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
            MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
            MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
#else
            "movq %%mm5, (%2, %%eax, 2) \n\t"
            "movq %%mm7, 8(%2, %%eax, 2) \n\t"
            "movq %%mm4, (%3, %%eax, 2) \n\t"
            "movq %%mm6, 8(%3, %%eax, 2) \n\t"
#endif
            "addl $8, %%eax \n\t"
            " js 1b \n\t"
            :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%eax"
            );
#else
        const int mmxSize=1;
#endif
        dst[0        ]= (3*src[0] +   src[srcStride])>>2;
        dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;

        for(x=mmxSize-1; x<srcWidth-1; x++){
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
#if 1
    dst[0]= src[0];

    for(x=0; x<srcWidth-1; x++){
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];
#else
    for(x=0; x<srcWidth; x++){
        dst[2*x+0]=
        dst[2*x+1]= src[x];
    }
#endif

#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line; the others are ignored. FIXME: write a HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      unsigned int width, unsigned int height,
                                      int lumStride, int chromStride, int srcStride)
{
    unsigned y;
    const unsigned chromWidth= width>>1;
    for(y=0; y<height; y+=2)
    {
#ifdef HAVE_MMX
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"

            "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%eax) \n\t"
            MOVNTQ" %%mm2, (%2, %%eax) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax"
            );

        ydst += lumStride;
        src  += srcStride;

        // second line: extract luma only (high bytes of UYVY)
        asm volatile(
            "xorl %%eax, %%eax \n\t"
            ".balign 16 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%eax, 4) \n\t"
            "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"

            "addl $8, %%eax \n\t"
            "cmpl %4, %%eax \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%eax"
            );
#else
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            udst[i]     = src[4*i+0];
            ydst[2*i+0] = src[4*i+1];
            vdst[i]     = src[4*i+2];
            ydst[2*i+1] = src[4*i+3];
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            ydst[2*i+0] = src[4*i+1];
            ydst[2*i+1] = src[4*i+3];
        }
#endif
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
#ifdef HAVE_MMX
    asm volatile( EMMS" \n\t"
                  SFENCE" \n\t"
                  :::"memory");
#endif
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 2 (if this is a
 * problem for anyone then tell me, and I'll fix it).
 * Chrominance data is only taken from every second line in the C version; the others are ignored. FIXME: write a HQ version.
 */
2053 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2054 unsigned int width, unsigned int height,
2055 int lumStride, int chromStride, int srcStride)
2057 unsigned y;
2058 const unsigned chromWidth= width>>1;
2059 #ifdef HAVE_MMX
2060 for(y=0; y<height-2; y+=2)
2062 unsigned i;
2063 for(i=0; i<2; i++)
2065 asm volatile(
2066 "movl %2, %%eax \n\t"
2067 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2068 "movq "MANGLE(w1111)", %%mm5 \n\t"
2069 "pxor %%mm7, %%mm7 \n\t"
2070 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2071 ".balign 16 \n\t"
2072 "1: \n\t"
2073 PREFETCH" 64(%0, %%ebx) \n\t"
2074 "movd (%0, %%ebx), %%mm0 \n\t"
2075 "movd 3(%0, %%ebx), %%mm1 \n\t"
2076 "punpcklbw %%mm7, %%mm0 \n\t"
2077 "punpcklbw %%mm7, %%mm1 \n\t"
2078 "movd 6(%0, %%ebx), %%mm2 \n\t"
2079 "movd 9(%0, %%ebx), %%mm3 \n\t"
2080 "punpcklbw %%mm7, %%mm2 \n\t"
2081 "punpcklbw %%mm7, %%mm3 \n\t"
2082 "pmaddwd %%mm6, %%mm0 \n\t"
2083 "pmaddwd %%mm6, %%mm1 \n\t"
2084 "pmaddwd %%mm6, %%mm2 \n\t"
2085 "pmaddwd %%mm6, %%mm3 \n\t"
2086 #ifndef FAST_BGR2YV12
2087 "psrad $8, %%mm0 \n\t"
2088 "psrad $8, %%mm1 \n\t"
2089 "psrad $8, %%mm2 \n\t"
2090 "psrad $8, %%mm3 \n\t"
2091 #endif
2092 "packssdw %%mm1, %%mm0 \n\t"
2093 "packssdw %%mm3, %%mm2 \n\t"
2094 "pmaddwd %%mm5, %%mm0 \n\t"
2095 "pmaddwd %%mm5, %%mm2 \n\t"
2096 "packssdw %%mm2, %%mm0 \n\t"
2097 "psraw $7, %%mm0 \n\t"
2099 "movd 12(%0, %%ebx), %%mm4 \n\t"
2100 "movd 15(%0, %%ebx), %%mm1 \n\t"
2101 "punpcklbw %%mm7, %%mm4 \n\t"
2102 "punpcklbw %%mm7, %%mm1 \n\t"
2103 "movd 18(%0, %%ebx), %%mm2 \n\t"
2104 "movd 21(%0, %%ebx), %%mm3 \n\t"
2105 "punpcklbw %%mm7, %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm3 \n\t"
2107 "pmaddwd %%mm6, %%mm4 \n\t"
2108 "pmaddwd %%mm6, %%mm1 \n\t"
2109 "pmaddwd %%mm6, %%mm2 \n\t"
2110 "pmaddwd %%mm6, %%mm3 \n\t"
2111 #ifndef FAST_BGR2YV12
2112 "psrad $8, %%mm4 \n\t"
2113 "psrad $8, %%mm1 \n\t"
2114 "psrad $8, %%mm2 \n\t"
2115 "psrad $8, %%mm3 \n\t"
2116 #endif
2117 "packssdw %%mm1, %%mm4 \n\t"
2118 "packssdw %%mm3, %%mm2 \n\t"
2119 "pmaddwd %%mm5, %%mm4 \n\t"
2120 "pmaddwd %%mm5, %%mm2 \n\t"
2121 "addl $24, %%ebx \n\t"
2122 "packssdw %%mm2, %%mm4 \n\t"
2123 "psraw $7, %%mm4 \n\t"
2125 "packuswb %%mm4, %%mm0 \n\t"
2126 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2128 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2129 "addl $8, %%eax \n\t"
2130 " js 1b \n\t"
2131 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2132 : "%eax", "%ebx"
2134 ydst += lumStride;
2135 src += srcStride;
        src -= srcStride*2;
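        /*
         * Chroma pass: U and V are taken from a 2x2 box of source pixels.
         * With MMX2/3DNow the averaging uses PAVGB (first vertically across
         * the two lines, then horizontally via the psrlq $24 shift); the
         * plain-MMX fallback sums the four samples with paddw and divides
         * by 4 with psrlw $2.
         */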
        asm volatile(
            "movl %4, %%eax                 \n\t"
            "movq "MANGLE(w1111)", %%mm5    \n\t"
            "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7              \n\t"
            "leal (%%eax, %%eax, 2), %%ebx  \n\t"
            "addl %%ebx, %%ebx              \n\t"
            ".balign 16                     \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%0, %%ebx)         \n\t"
            PREFETCH" 64(%1, %%ebx)         \n\t"
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq (%0, %%ebx), %%mm0        \n\t"
            "movq (%1, %%ebx), %%mm1        \n\t"
            "movq 6(%0, %%ebx), %%mm2       \n\t"
            "movq 6(%1, %%ebx), %%mm3       \n\t"
            PAVGB" %%mm1, %%mm0             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlq $24, %%mm0               \n\t"
            "psrlq $24, %%mm2               \n\t"
            PAVGB" %%mm1, %%mm0             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
#else
            "movd (%0, %%ebx), %%mm0        \n\t"
            "movd (%1, %%ebx), %%mm1        \n\t"
            "movd 3(%0, %%ebx), %%mm2       \n\t"
            "movd 3(%1, %%ebx), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm0         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm0             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm2, %%mm0             \n\t"
            "movd 6(%0, %%ebx), %%mm4       \n\t"
            "movd 6(%1, %%ebx), %%mm1       \n\t"
            "movd 9(%0, %%ebx), %%mm2       \n\t"
            "movd 9(%1, %%ebx), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm4             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm4, %%mm2             \n\t"
            "psrlw $2, %%mm0                \n\t"
            "psrlw $2, %%mm2                \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "pmaddwd %%mm6, %%mm0           \n\t"
            "pmaddwd %%mm6, %%mm2           \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0                \n\t"
            "psrad $8, %%mm1                \n\t"
            "psrad $8, %%mm2                \n\t"
            "psrad $8, %%mm3                \n\t"
#endif
            "packssdw %%mm2, %%mm0          \n\t"
            "packssdw %%mm3, %%mm1          \n\t"
            "pmaddwd %%mm5, %%mm0           \n\t"
            "pmaddwd %%mm5, %%mm1           \n\t"
            "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0                \n\t"

#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
            "movq 12(%0, %%ebx), %%mm4      \n\t"
            "movq 12(%1, %%ebx), %%mm1      \n\t"
            "movq 18(%0, %%ebx), %%mm2      \n\t"
            "movq 18(%1, %%ebx), %%mm3      \n\t"
            PAVGB" %%mm1, %%mm4             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "movq %%mm4, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "psrlq $24, %%mm4               \n\t"
            "psrlq $24, %%mm2               \n\t"
            PAVGB" %%mm1, %%mm4             \n\t"
            PAVGB" %%mm3, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
#else
            "movd 12(%0, %%ebx), %%mm4      \n\t"
            "movd 12(%1, %%ebx), %%mm1      \n\t"
            "movd 15(%0, %%ebx), %%mm2      \n\t"
            "movd 15(%1, %%ebx), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm4         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm4             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm2, %%mm4             \n\t"
            "movd 18(%0, %%ebx), %%mm5      \n\t"
            "movd 18(%1, %%ebx), %%mm1      \n\t"
            "movd 21(%0, %%ebx), %%mm2      \n\t"
            "movd 21(%1, %%ebx), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm5         \n\t"
            "punpcklbw %%mm7, %%mm1         \n\t"
            "punpcklbw %%mm7, %%mm2         \n\t"
            "punpcklbw %%mm7, %%mm3         \n\t"
            "paddw %%mm1, %%mm5             \n\t"
            "paddw %%mm3, %%mm2             \n\t"
            "paddw %%mm5, %%mm2             \n\t"
            "movq "MANGLE(w1111)", %%mm5    \n\t"
            "psrlw $2, %%mm4                \n\t"
            "psrlw $2, %%mm2                \n\t"
#endif
            "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1           \n\t"
            "pmaddwd %%mm2, %%mm3           \n\t"
            "pmaddwd %%mm6, %%mm4           \n\t"
            "pmaddwd %%mm6, %%mm2           \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4                \n\t"
            "psrad $8, %%mm1                \n\t"
            "psrad $8, %%mm2                \n\t"
            "psrad $8, %%mm3                \n\t"
#endif
            "packssdw %%mm2, %%mm4          \n\t"
            "packssdw %%mm3, %%mm1          \n\t"
            "pmaddwd %%mm5, %%mm4           \n\t"
            "pmaddwd %%mm5, %%mm1           \n\t"
            "addl $24, %%ebx                \n\t"
            "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4                \n\t"

            "movq %%mm0, %%mm1              \n\t"
            "punpckldq %%mm4, %%mm0         \n\t"
            "punpckhdq %%mm4, %%mm1         \n\t"
            "packsswb %%mm1, %%mm0          \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"

            "movd %%mm0, (%2, %%eax)        \n\t"
            "punpckhdq %%mm0, %%mm0         \n\t"
            "movd %%mm0, (%3, %%eax)        \n\t"
            "addl $4, %%eax                 \n\t"
            " js 1b                         \n\t"
            : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
            : "%eax", "%ebx"
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    asm volatile(   EMMS"   \n\t"
                    SFENCE" \n\t"
                    :::"memory");
#else
    y=0;
#endif
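    /*
     * C fallback, also used to finish the last two rows of the MMX build
     * (its loop stops at height-2): plain fixed-point conversion using the
     * integer RGB->YUV coefficients (RY, GY, BY, ...) and RGB2YUV_SHIFT
     * defined alongside this template.
     */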
    for(; y<height; y+=2)
    {
        unsigned i;
        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
            unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;

            udst[i]     = U;
            vdst[i]     = V;
            ydst[2*i]   = Y;

            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        ydst += lumStride;
        src  += srcStride;

        for(i=0; i<chromWidth; i++)
        {
            unsigned int b= src[6*i+0];
            unsigned int g= src[6*i+1];
            unsigned int r= src[6*i+2];

            unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;

            ydst[2*i]   = Y;

            b= src[6*i+3];
            g= src[6*i+4];
            r= src[6*i+5];

            Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
            ydst[2*i+1] = Y;
        }
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
}
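/*
 * interleaveBytes: weave two 8-bit planes together byte by byte on every
 * line (dest[2*w] = src1[w], dest[2*w+1] = src2[w]), e.g. to build a packed
 * UV plane from separate U and V planes.
 */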
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
            unsigned width, unsigned height, int src1Stride,
            int src2Stride, int dstStride){
    unsigned h;

    for(h=0; h < height; h++)
    {
        unsigned w;

#ifdef HAVE_MMX
#ifdef HAVE_SSE2
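        /*
         * Note: movdqa/movntdq require 16-byte alignment, so this SSE2 path
         * implicitly assumes src1, src2, dest and all three strides are
         * 16-byte aligned.
         */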
        asm(
            "xorl %%eax, %%eax              \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%1, %%eax)         \n\t"
            PREFETCH" 64(%2, %%eax)         \n\t"
            "movdqa (%1, %%eax), %%xmm0     \n\t"
            "movdqa (%1, %%eax), %%xmm1     \n\t"
            "movdqa (%2, %%eax), %%xmm2     \n\t"
            "punpcklbw %%xmm2, %%xmm0       \n\t"
            "punpckhbw %%xmm2, %%xmm1       \n\t"
            "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
            "addl $16, %%eax                \n\t"
            "cmpl %3, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#else
        asm(
            "xorl %%eax, %%eax              \n\t"
            "1:                             \n\t"
            PREFETCH" 64(%1, %%eax)         \n\t"
            PREFETCH" 64(%2, %%eax)         \n\t"
            "movq (%1, %%eax), %%mm0        \n\t"
            "movq 8(%1, %%eax), %%mm2       \n\t"
            "movq %%mm0, %%mm1              \n\t"
            "movq %%mm2, %%mm3              \n\t"
            "movq (%2, %%eax), %%mm4        \n\t"
            "movq 8(%2, %%eax), %%mm5       \n\t"
            "punpcklbw %%mm4, %%mm0         \n\t"
            "punpckhbw %%mm4, %%mm1         \n\t"
            "punpcklbw %%mm5, %%mm2         \n\t"
            "punpckhbw %%mm5, %%mm3         \n\t"
            MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
            MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
            MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
            "addl $16, %%eax                \n\t"
            "cmpl %3, %%eax                 \n\t"
            " jb 1b                         \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
            : "memory", "%eax"
        );
#endif
        for(w= (width&(~15)); w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#else
        for(w=0; w < width; w++)
        {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
#endif
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
#ifdef HAVE_MMX
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
        );
#endif
}
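/*
 * vu9_to_vu12: upsample two chroma planes from 4x4 subsampling (YVU9) to
 * 2x2 subsampling (YV12): every source byte is doubled horizontally and
 * every source line is emitted twice.
 */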
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
            uint8_t *dst1, uint8_t *dst2,
            unsigned width, unsigned height,
            int srcStride1, int srcStride2,
            int dstStride1, int dstStride2)
{
    unsigned int y,x,h;
    int w;
    w=width/2; h=height/2;
#ifdef HAVE_MMX
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    for(y=0;y<h;y++){
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
#ifdef HAVE_MMX
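        /*
         * punpcklbw/punpckhbw of a register with itself duplicates each
         * byte, giving the 2x horizontal upscale for 32 source bytes per
         * iteration.
         */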
        for(;x<w-31;x+=32)
        {
            asm volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm6\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "movq %%mm6, %%mm7\n\t"
            "punpcklbw %%mm0, %%mm0\n\t"
            "punpckhbw %%mm1, %%mm1\n\t"
            "punpcklbw %%mm2, %%mm2\n\t"
            "punpckhbw %%mm3, %%mm3\n\t"
            "punpcklbw %%mm4, %%mm4\n\t"
            "punpckhbw %%mm5, %%mm5\n\t"
            "punpcklbw %%mm6, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm7\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            MOVNTQ" %%mm4, 32%0\n\t"
            MOVNTQ" %%mm5, 40%0\n\t"
            MOVNTQ" %%mm6, 48%0\n\t"
            MOVNTQ" %%mm7, 56%0"
            :"=m"(d[2*x])
            :"m"(s1[x])
            :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for(y=0;y<h;y++){
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
#ifdef HAVE_MMX
        for(;x<w-31;x+=32)
        {
            asm volatile(
            PREFETCH" 32%1\n\t"
            "movq %1, %%mm0\n\t"
            "movq 8%1, %%mm2\n\t"
            "movq 16%1, %%mm4\n\t"
            "movq 24%1, %%mm6\n\t"
            "movq %%mm0, %%mm1\n\t"
            "movq %%mm2, %%mm3\n\t"
            "movq %%mm4, %%mm5\n\t"
            "movq %%mm6, %%mm7\n\t"
            "punpcklbw %%mm0, %%mm0\n\t"
            "punpckhbw %%mm1, %%mm1\n\t"
            "punpcklbw %%mm2, %%mm2\n\t"
            "punpckhbw %%mm3, %%mm3\n\t"
            "punpcklbw %%mm4, %%mm4\n\t"
            "punpckhbw %%mm5, %%mm5\n\t"
            "punpcklbw %%mm6, %%mm6\n\t"
            "punpckhbw %%mm7, %%mm7\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            MOVNTQ" %%mm1, 8%0\n\t"
            MOVNTQ" %%mm2, 16%0\n\t"
            MOVNTQ" %%mm3, 24%0\n\t"
            MOVNTQ" %%mm4, 32%0\n\t"
            MOVNTQ" %%mm5, 40%0\n\t"
            MOVNTQ" %%mm6, 48%0\n\t"
            MOVNTQ" %%mm7, 56%0"
            :"=m"(d[2*x])
            :"m"(s2[x])
            :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
#ifdef HAVE_MMX
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
        );
#endif
}
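/*
 * yvu9_to_yuy2: pack planar YVU9 (chroma subsampled 4x4) into interleaved
 * YUY2; each U/V sample is reused for four horizontally adjacent pixels and
 * for four consecutive lines (srcStride*(y>>2)).
 */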
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
            uint8_t *dst,
            unsigned width, unsigned height,
            int srcStride1, int srcStride2,
            int srcStride3, int dstStride)
{
    unsigned y,x,w,h;
    w=width/2; h=height;
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
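        /*
         * MMX loop: each iteration builds 32 output pixels (64 YUY2 bytes)
         * from 32 luma samples plus 8 U and 8 V samples, spreading the
         * chroma with punpcklbw/punpckhbw.
         */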
#ifdef HAVE_MMX
        for(;x<w-7;x+=8)
        {
            asm volatile(
            PREFETCH" 32(%1, %0)\n\t"
            PREFETCH" 32(%2, %0)\n\t"
            PREFETCH" 32(%3, %0)\n\t"
            "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
            "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
            "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
            "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
            "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
            "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
            "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
            "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
            "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
            "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

            "movq %%mm1, %%mm6\n\t"
            "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1 */
            "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0 */
            "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1 */
            MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
            MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"

            "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3 */
            "movq 8(%1, %0, 4), %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2 */
            "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3 */
            MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
            MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"

            "movq %%mm4, %%mm6\n\t"
            "movq 16(%1, %0, 4), %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm5, %%mm4\n\t"
            "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4 */
            "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5 */
            MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
            MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"

            "punpckhbw %%mm5, %%mm6\n\t"
            "movq 24(%1, %0, 4), %%mm0\n\t"
            "movq %%mm0, %%mm3\n\t"
            "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6 */
            "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7 */
            MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
            MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"

            : "+r" (x)
            : "r"(yp), "r" (up), "r"(vp), "r"(d)
            :"memory");
        }
#endif
        for(; x<w; x++)
        {
            const int x2= x<<2;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    asm(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
        );
#endif