/*
 * generic alpha renderers for all YUV modes and RGB depths
 * Optimized by Nick and Michael.
 *
 * This file is part of MPlayer.
 *
 * MPlayer is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * MPlayer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with MPlayer; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */
#if HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif HAVE_MMX2
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#else
#define PREFETCH  " # nop"
#define PREFETCHW " # nop"
#endif

#if HAVE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS      "femms"
#else
#define EMMS      "emms"
#endif
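/*
 * All renderers below share the same per-pixel blend, spelled out in their
 * C fallback paths:
 *
 *     if (srca[x]) dstbase[x] = ((dstbase[x] * srca[x]) >> 8) + src[x];
 *
 * i.e. the destination is scaled by srca/256 and the OSD value src is added
 * on top; srca[x] == 0 leaves the destination pixel untouched. The MMX paths
 * evaluate the same expression for several pixels per iteration.
 */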
static inline void RENAME(vo_draw_alpha_yv12)(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride){
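    /* w, h: size of the OSD region; src holds the OSD values, srca the
     * per-pixel alpha factors (0 = leave destination unchanged); srcstride
     * is the line stride shared by src and srca, dstbase/dststride describe
     * the destination plane. */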
#if defined(FAST_OSD) && !HAVE_MMX
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm4\n\t"
        "movq %%mm5, %%mm7\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
        "movq %0, %%mm0\n\t" // dstbase
        "movq %%mm0, %%mm1\n\t"
        "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
        "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y
        "movq %1, %%mm2\n\t" //srca HGFEDCBA
        "paddb %%mm7, %%mm2\n\t"
        "movq %%mm2, %%mm3\n\t"
        "pand %%mm4, %%mm2\n\t" //0G0E0C0A
        "psrlw $8, %%mm3\n\t" //0H0F0D0B
        "pmullw %%mm2, %%mm0\n\t"
        "pmullw %%mm3, %%mm1\n\t"
        "psrlw $8, %%mm0\n\t"
        "pand %%mm5, %%mm1\n\t"
        "por %%mm1, %%mm0\n\t"
        :: "m" (dstbase[x]), "m" (srca[x]), "m" (src[x])
            if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
            if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
            if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
    __asm__ volatile(EMMS:::"memory");
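/*
 * YUY2 (packed YUYV): luma bytes sit at even offsets, chroma at odd ones.
 * Luma is blended exactly like the YV12 case above; chroma is faded towards
 * the neutral value 128 by the same srca factor (see the C fallback below).
 */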
static inline void RENAME(vo_draw_alpha_yuy2)(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride){
#if defined(FAST_OSD) && !HAVE_MMX
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm6\n\t"
        "movq %%mm5, %%mm4\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
        ::"m"(*dstbase),"m"(*srca),"m"(*src));
        "orl %%eax, %%eax\n\t"
        "movq %0, %%mm0\n\t" // dstbase
        "movq %%mm0, %%mm1\n\t"
        "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
        "movd %%eax, %%mm2\n\t" //srca 0000DCBA
        "paddb %%mm6, %%mm2\n\t"
        "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
        "pmullw %%mm2, %%mm0\n\t"
        "psrlw $8, %%mm0\n\t"
        "pand %%mm5, %%mm1\n\t" //U0V0U0V0
        "movd %2, %%mm2\n\t" //src 0000DCBA
        "punpcklbw %%mm7, %%mm2\n\t" //src 0D0C0B0A
        "por %%mm1, %%mm0\n\t"
        "paddb %%mm2, %%mm0\n\t"
        :: "m" (dstbase[x*2]), "m" (srca[x]), "m" (src[x])
            if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
            if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
            dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
            dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
    __asm__ volatile(EMMS:::"memory");
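/*
 * UYVY: same packing as YUY2 with the byte order swapped, i.e. luma at odd
 * offsets and chroma at even ones. Only scalar code is provided here; there
 * is no MMX path for this layout.
 */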
static inline void RENAME(vo_draw_alpha_uyvy)(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride){
#if defined(FAST_OSD)
            if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
            if(srca[2*x+1]) dstbase[4*x+0]=src[2*x+1];
            dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
            dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
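/*
 * RGB24/BGR24: three bytes per pixel, each channel blended against the same
 * greyscale OSD value. The MMX path works on two pixels at a time, merging
 * the result back via the mask24hl/mask24lh constants; plain x86 and
 * portable C versions follow as fallbacks.
 */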
static inline void RENAME(vo_draw_alpha_rgb24)(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride){
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
        register unsigned char *dst = dstbase;
#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
        ::"m"(*dst),"m"(*srca),"m"(*src):"memory");
        if(srca[x] || srca[x+1])
        "movq %0, %%mm0\n\t" // dstbase
        "movq %%mm0, %%mm1\n\t"
        "movq %%mm0, %%mm5\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpckhbw %%mm7, %%mm1\n\t"
        "movd %1, %%mm2\n\t" // srca ABCD0000
        "paddb %%mm6, %%mm2\n\t"
        "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
        "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
        "psrlq $8, %%mm2\n\t" // srca AAABBBB0
        "movq %%mm2, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0B
        "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B00
        "pmullw %%mm2, %%mm0\n\t"
        "pmullw %%mm3, %%mm1\n\t"
        "psrlw $8, %%mm0\n\t"
        "psrlw $8, %%mm1\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "movd %2, %%mm2\n\t" // src ABCD0000
        "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
        "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
        "psrlq $8, %%mm2\n\t" // src AAABBBB0
        "paddb %%mm2, %%mm0\n\t"
        "por %%mm0, %%mm5\n\t"
        :: "m" (dst[0]), "m" (srca[x]), "m" (src[x]), "m"(mask24hl), "m"(mask24lh));
        "movzbl (%0), %%ecx\n\t"
        "movzbl 1(%0), %%eax\n\t"

        "imull %1, %%ecx\n\t"
        "imull %1, %%eax\n\t"

        "addl %2, %%ecx\n\t"
        "addl %2, %%eax\n\t"

        "movb %%ch, (%0)\n\t"
        "movb %%ah, 1(%0)\n\t"

        "movzbl 2(%0), %%eax\n\t"
        "imull %1, %%eax\n\t"
        "addl %2, %%eax\n\t"
        "movb %%ah, 2(%0)\n\t"
        "r" ((unsigned)srca[x]),
        "r" (((unsigned)src[x])<<8)
#endif /* !HAVE_MMX */
#else /* non-x86 arch or x86_64 with MMX disabled */
            dst[0]=dst[1]=dst[2]=src[x];

            dst[0]=((dst[0]*srca[x])>>8)+src[x];
            dst[1]=((dst[1]*srca[x])>>8)+src[x];
            dst[2]=((dst[2]*srca[x])>>8)+src[x];
#endif /* arch_x86 */
    __asm__ volatile(EMMS:::"memory");
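/*
 * RGB32/BGR32: four bytes per pixel, blended like RGB24 but on naturally
 * aligned 32-bit pixels. The 3DNow!-flavoured MMX variant below works on
 * two pixels per iteration; the non-3DNow! variant works on four.
 */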
static inline void RENAME(vo_draw_alpha_rgb32)(int w, int h, unsigned char* src, unsigned char *srca, int srcstride, unsigned char* dstbase, int dststride){
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm6, %%mm6\n\t" // F..F
#else /* HAVE_AMD3DNOW */
        "pxor %%mm7, %%mm7\n\t"
        "pcmpeqb %%mm5, %%mm5\n\t" // F..F
        "movq %%mm5, %%mm4\n\t"
        "psllw $8, %%mm5\n\t" //FF00FF00FF00
        "psrlw $8, %%mm4\n\t" //00FF00FF00FF
#endif /* HAVE_AMD3DNOW */
#endif /* HAVE_MMX */
#if ARCH_X86 && (!ARCH_X86_64 || HAVE_MMX)
        ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
        if(srca[x] || srca[x+1])
        "movq %0, %%mm0\n\t" // dstbase
        "movq %%mm0, %%mm1\n\t"
        "punpcklbw %%mm7, %%mm0\n\t"
        "punpckhbw %%mm7, %%mm1\n\t"
        "movd %1, %%mm2\n\t" // srca ABCD0000
        "paddb %%mm6, %%mm2\n\t"
        "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
        "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
        "movq %%mm2, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
        "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
        "pmullw %%mm2, %%mm0\n\t"
        "pmullw %%mm3, %%mm1\n\t"
        "psrlw $8, %%mm0\n\t"
        "psrlw $8, %%mm1\n\t"
        "packuswb %%mm1, %%mm0\n\t"
        "movd %2, %%mm2\n\t" // src ABCD0000
        "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
        "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
        "paddb %%mm2, %%mm0\n\t"
        :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]));
#else // this path is faster on Intel CPUs
        ::"m"(*dstbase),"m"(*srca),"m"(*src):"memory");
        "orl %%eax, %%eax\n\t"
        "movq %0, %%mm0\n\t" // dstbase
        "movq %%mm0, %%mm1\n\t"
        "pand %%mm4, %%mm0\n\t" //0R0B0R0B
        "psrlw $8, %%mm1\n\t" //0?0G0?0G
        "movd %%eax, %%mm2\n\t" //srca 0000DCBA
        "paddb %3, %%mm2\n\t"
        "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA
        "movq %%mm2, %%mm3\n\t"
        "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A
        "pmullw %%mm2, %%mm0\n\t"
        "pmullw %%mm2, %%mm1\n\t"
        "psrlw $8, %%mm0\n\t"
        "pand %%mm5, %%mm1\n\t"
        "por %%mm1, %%mm0\n\t"
        "movd %2, %%mm2\n\t" //src 0000DCBA
        "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA
        "movq %%mm2, %%mm6\n\t"
        "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA
        "paddb %%mm2, %%mm0\n\t"
        "movq %%mm0, %0\n\t"

        "movq 8%0, %%mm0\n\t" // dstbase
        "movq %%mm0, %%mm1\n\t"
        "pand %%mm4, %%mm0\n\t" //0R0B0R0B
        "psrlw $8, %%mm1\n\t" //0?0G0?0G
        "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C
        "pmullw %%mm3, %%mm0\n\t"
        "pmullw %%mm3, %%mm1\n\t"
        "psrlw $8, %%mm0\n\t"
        "pand %%mm5, %%mm1\n\t"
        "por %%mm1, %%mm0\n\t"
        "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC
        "paddb %%mm6, %%mm0\n\t"
        "movq %%mm0, 8%0\n\t"
        :: "m" (dstbase[4*x]), "m" (srca[x]), "m" (src[x]), "m" (bFF)
        "movzbl (%0), %%ecx\n\t"
        "movzbl 1(%0), %%eax\n\t"
        "movzbl 2(%0), %%edx\n\t"

        "imull %1, %%ecx\n\t"
        "imull %1, %%eax\n\t"
        "imull %1, %%edx\n\t"

        "addl %2, %%ecx\n\t"
        "addl %2, %%eax\n\t"
        "addl %2, %%edx\n\t"

        "movb %%ch, (%0)\n\t"
        "movb %%ah, 1(%0)\n\t"
        "movb %%dh, 2(%0)\n\t"
        :"r" (&dstbase[4*x]),
        "r" ((unsigned)srca[x]),
        "r" (((unsigned)src[x])<<8)
        :"%eax", "%ecx", "%edx"
#endif /* HAVE_MMX */
#else /* non-x86 arch or x86_64 with MMX disabled */
            dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];

            dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
            dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
            dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
#endif /* arch_x86 */
    __asm__ volatile(EMMS:::"memory");