// Generic alpha renderers for all YUV modes and RGB depths.
// Optimized by Nick and Michael.
// Code from Michael Niedermayer (michaelni@gmx.at) is under GPL.
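
// All renderers below implement the same per-pixel operation, spelled out
// in the plain C fallback paths: dst = ((dst*srca)>>8) + src, where an
// alpha byte of 0 marks a fully transparent pixel that is left untouched.
// A minimal scalar sketch of that operation (disabled; the standalone
// function and its name are illustrative, not part of this file):
#if 0
static void draw_alpha_scalar_sketch(int w, int h,
                                     unsigned char *src, unsigned char *srca,
                                     int srcstride, unsigned char *dstbase,
                                     int dststride)
{
    int x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            if (srca[x]) // 0 == fully transparent: skip
                dstbase[x] = ((dstbase[x] * srca[x]) >> 8) + src[x];
        src     += srcstride;
        srca    += srcstride;
        dstbase += dststride;
    }
}
#endif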
#ifdef HAVE_3DNOW
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined (HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif
#ifdef HAVE_3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
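// MMX registers alias the x87 floating-point register stack, so every
// renderer below executes EMMS (femms on 3DNow!) before returning, making
// the FPU usable again for the caller.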
static inline void RENAME(vo_draw_alpha_yv12)(int w, int h, unsigned char *src, unsigned char *srca, int srcstride, unsigned char *dstbase, int dststride){
#if defined(FAST_OSD) && !defined(HAVE_MMX)
37 "pcmpeqb %%mm5, %%mm5\n\t" // F..F
38 "movq %%mm5, %%mm4\n\t"
39 "movq %%mm5, %%mm7\n\t"
40 "psllw $8, %%mm5\n\t" //FF00FF00FF00
41 "psrlw $8, %%mm4\n\t" //00FF00FF00FF
51 ::"m"(*dstbase
),"m"(*srca
),"m"(*src
):"memory");
60 "movq %0, %%mm0\n\t" // dstbase
61 "movq %%mm0, %%mm1\n\t"
62 "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
63 "psrlw $8, %%mm1\n\t" //0Y0Y0Y0Y
64 "movq %1, %%mm2\n\t" //srca HGFEDCBA
65 "paddb %%mm7, %%mm2\n\t"
66 "movq %%mm2, %%mm3\n\t"
67 "pand %%mm4, %%mm2\n\t" //0G0E0C0A
68 "psrlw $8, %%mm3\n\t" //0H0F0D0B
69 "pmullw %%mm2, %%mm0\n\t"
70 "pmullw %%mm3, %%mm1\n\t"
72 "pand %%mm5, %%mm1\n\t"
73 "por %%mm1, %%mm0\n\t"
77 :: "m" (dstbase
[x
]), "m" (srca
[x
]), "m" (src
[x
])
#ifdef FAST_OSD
if(srca[2*x+0]) dstbase[2*x+0]=src[2*x+0];
if(srca[2*x+1]) dstbase[2*x+1]=src[2*x+1];
#else
if(srca[x]) dstbase[x]=((dstbase[x]*srca[x])>>8)+src[x];
#endif
asm volatile(EMMS:::"memory");
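
// A disabled sketch of how a caller might hand an OSD bitmap to the
// renderer above; every name below (example_draw_osd, the glyph buffers
// and their dimensions) is hypothetical, not part of this file.
#if 0
void example_draw_osd(unsigned char *image, int image_stride,
                      unsigned char *glyph, unsigned char *glyph_alpha,
                      int glyph_w, int glyph_h, int glyph_stride)
{
    RENAME(vo_draw_alpha_yv12)(glyph_w, glyph_h, glyph, glyph_alpha,
                               glyph_stride, image, image_stride);
}
#endif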
static inline void RENAME(vo_draw_alpha_yuy2)(int w, int h, unsigned char *src, unsigned char *srca, int srcstride, unsigned char *dstbase, int dststride){
#if defined(FAST_OSD) && !defined(HAVE_MMX)
107 "pxor %%mm7, %%mm7\n\t"
108 "pcmpeqb %%mm5, %%mm5\n\t" // F..F
109 "movq %%mm5, %%mm6\n\t"
110 "movq %%mm5, %%mm4\n\t"
111 "psllw $8, %%mm5\n\t" //FF00FF00FF00
112 "psrlw $8, %%mm4\n\t" //00FF00FF00FF
122 ::"m"(*dstbase
),"m"(*srca
),"m"(*src
));
126 "orl %%eax, %%eax\n\t"
131 "movq %0, %%mm0\n\t" // dstbase
132 "movq %%mm0, %%mm1\n\t"
133 "pand %%mm4, %%mm0\n\t" //0Y0Y0Y0Y
134 "movd %%eax, %%mm2\n\t" //srca 0000DCBA
135 "paddb %%mm6, %%mm2\n\t"
136 "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
137 "pmullw %%mm2, %%mm0\n\t"
138 "psrlw $8, %%mm0\n\t"
139 "pand %%mm5, %%mm1\n\t" //U0V0U0V0
140 "movd %2, %%mm2\n\t" //src 0000DCBA
141 "punpcklbw %%mm7, %%mm2\n\t" //srca 0D0C0B0A
142 "por %%mm1, %%mm0\n\t"
143 "paddb %%mm2, %%mm0\n\t"
146 :: "m" (dstbase
[x
*2]), "m" (srca
[x
]), "m" (src
[x
])
#ifdef FAST_OSD
if(srca[2*x+0]) dstbase[4*x+0]=src[2*x+0];
if(srca[2*x+1]) dstbase[4*x+2]=src[2*x+1];
#else
dstbase[2*x]=((dstbase[2*x]*srca[x])>>8)+src[x];
dstbase[2*x+1]=((((signed)dstbase[2*x+1]-128)*srca[x])>>8)+128;
#endif
asm volatile(EMMS:::"memory");
static inline void RENAME(vo_draw_alpha_uyvy)(int w, int h, unsigned char *src, unsigned char *srca, int srcstride, unsigned char *dstbase, int dststride){
#if defined(FAST_OSD)
#ifdef FAST_OSD
if(srca[2*x+0]) dstbase[4*x+2]=src[2*x+0];
if(srca[2*x+1]) dstbase[4*x+0]=src[2*x+1];
#else
dstbase[2*x+1]=((dstbase[2*x+1]*srca[x])>>8)+src[x];
dstbase[2*x]=((((signed)dstbase[2*x]-128)*srca[x])>>8)+128;
#endif
static inline void RENAME(vo_draw_alpha_rgb24)(int w, int h, unsigned char *src, unsigned char *srca, int srcstride, unsigned char *dstbase, int dststride){
200 "pxor %%mm7, %%mm7\n\t"
201 "pcmpeqb %%mm6, %%mm6\n\t" // F..F
register unsigned char *dst = dstbase;
#if defined(ARCH_X86) && (!defined(ARCH_X86_64) || defined(HAVE_MMX))
213 ::"m"(*dst
),"m"(*srca
),"m"(*src
):"memory");
if(srca[x] || srca[x+1])
220 "movq %0, %%mm0\n\t" // dstbase
221 "movq %%mm0, %%mm1\n\t"
222 "movq %%mm0, %%mm5\n\t"
223 "punpcklbw %%mm7, %%mm0\n\t"
224 "punpckhbw %%mm7, %%mm1\n\t"
225 "movd %1, %%mm2\n\t" // srca ABCD0000
226 "paddb %%mm6, %%mm2\n\t"
227 "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
228 "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
229 "psrlq $8, %%mm2\n\t" // srca AAABBBB0
230 "movq %%mm2, %%mm3\n\t"
231 "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0B
232 "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B00
233 "pmullw %%mm2, %%mm0\n\t"
234 "pmullw %%mm3, %%mm1\n\t"
235 "psrlw $8, %%mm0\n\t"
236 "psrlw $8, %%mm1\n\t"
237 "packuswb %%mm1, %%mm0\n\t"
238 "movd %2, %%mm2 \n\t" // src ABCD0000
239 "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
240 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
241 "psrlq $8, %%mm2\n\t" // src AAABBBB0
242 "paddb %%mm2, %%mm0\n\t"
245 "por %%mm0, %%mm5\n\t"
247 :: "m" (dst
[0]), "m" (srca
[x
]), "m" (src
[x
]), "m"(mask24hl
), "m"(mask24lh
));
254 "movzbl (%0), %%ecx\n\t"
255 "movzbl 1(%0), %%eax\n\t"
257 "imull %1, %%ecx\n\t"
258 "imull %1, %%eax\n\t"
263 "movb %%ch, (%0)\n\t"
264 "movb %%ah, 1(%0)\n\t"
266 "movzbl 2(%0), %%eax\n\t"
267 "imull %1, %%eax\n\t"
269 "movb %%ah, 2(%0)\n\t"
272 "r" ((unsigned)srca
[x
]),
273 "r" (((unsigned)src
[x
])<<8)
#endif /* !HAVE_MMX */
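/* The scalar path above folds the C fallback's ((dst*srca)>>8)+src into a
 * single store per channel: the src operand is passed in pre-shifted left
 * by 8 (see the "r" (((unsigned)src[x])<<8) constraint), so after it is
 * added to the dst*srca product, the high byte of the 16-bit result
 * (%%ah/%%ch) already equals the blended value. */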
#else /* non-x86 arch or x86_64 with MMX disabled */
#ifdef FAST_OSD
dst[0]=dst[1]=dst[2]=src[x];
#else
dst[0]=((dst[0]*srca[x])>>8)+src[x];
dst[1]=((dst[1]*srca[x])>>8)+src[x];
dst[2]=((dst[2]*srca[x])>>8)+src[x];
#endif
#endif /* arch_x86 */
asm volatile(EMMS:::"memory");
static inline void RENAME(vo_draw_alpha_rgb32)(int w, int h, unsigned char *src, unsigned char *srca, int srcstride, unsigned char *dstbase, int dststride){
#ifdef WORDS_BIGENDIAN
"pxor %%mm7, %%mm7\n\t"
"pcmpeqb %%mm6, %%mm6\n\t" // F..F
#else /* HAVE_3DNOW */
"pxor %%mm7, %%mm7\n\t"
"pcmpeqb %%mm5, %%mm5\n\t" // F..F
"movq %%mm5, %%mm4\n\t"
"psllw $8, %%mm5\n\t" //FF00FF00FF00FF00
"psrlw $8, %%mm4\n\t" //00FF00FF00FF00FF
#endif /* HAVE_3DNOW */
#endif /* HAVE_MMX */
#if defined(ARCH_X86) && (!defined(ARCH_X86_64) || defined(HAVE_MMX))
334 ::"m"(*dstbase
),"m"(*srca
),"m"(*src
):"memory");
if(srca[x] || srca[x+1])
341 "movq %0, %%mm0\n\t" // dstbase
342 "movq %%mm0, %%mm1\n\t"
343 "punpcklbw %%mm7, %%mm0\n\t"
344 "punpckhbw %%mm7, %%mm1\n\t"
345 "movd %1, %%mm2\n\t" // srca ABCD0000
346 "paddb %%mm6, %%mm2\n\t"
347 "punpcklbw %%mm2, %%mm2\n\t" // srca AABBCCDD
348 "punpcklbw %%mm2, %%mm2\n\t" // srca AAAABBBB
349 "movq %%mm2, %%mm3\n\t"
350 "punpcklbw %%mm7, %%mm2\n\t" // srca 0A0A0A0A
351 "punpckhbw %%mm7, %%mm3\n\t" // srca 0B0B0B0B
352 "pmullw %%mm2, %%mm0\n\t"
353 "pmullw %%mm3, %%mm1\n\t"
354 "psrlw $8, %%mm0\n\t"
355 "psrlw $8, %%mm1\n\t"
356 "packuswb %%mm1, %%mm0\n\t"
357 "movd %2, %%mm2 \n\t" // src ABCD0000
358 "punpcklbw %%mm2, %%mm2\n\t" // src AABBCCDD
359 "punpcklbw %%mm2, %%mm2\n\t" // src AAAABBBB
360 "paddb %%mm2, %%mm0\n\t"
362 :: "m" (dstbase
[4*x
]), "m" (srca
[x
]), "m" (src
[x
]));
#else // this path is faster on Intel CPUs
369 ::"m"(*dstbase
),"m"(*srca
),"m"(*src
):"memory");
373 "orl %%eax, %%eax\n\t"
378 "movq %0, %%mm0\n\t" // dstbase
379 "movq %%mm0, %%mm1\n\t"
380 "pand %%mm4, %%mm0\n\t" //0R0B0R0B
381 "psrlw $8, %%mm1\n\t" //0?0G0?0G
382 "movd %%eax, %%mm2\n\t" //srca 0000DCBA
383 "paddb "MANGLE(bFF
)", %%mm2\n\t"
384 "punpcklbw %%mm2, %%mm2\n\t" //srca DDCCBBAA
385 "movq %%mm2, %%mm3\n\t"
386 "punpcklbw %%mm7, %%mm2\n\t" //srca 0B0B0A0A
387 "pmullw %%mm2, %%mm0\n\t"
388 "pmullw %%mm2, %%mm1\n\t"
389 "psrlw $8, %%mm0\n\t"
390 "pand %%mm5, %%mm1\n\t"
391 "por %%mm1, %%mm0\n\t"
392 "movd %2, %%mm2 \n\t" //src 0000DCBA
393 "punpcklbw %%mm2, %%mm2\n\t" //src DDCCBBAA
394 "movq %%mm2, %%mm6\n\t"
395 "punpcklbw %%mm2, %%mm2\n\t" //src BBBBAAAA
396 "paddb %%mm2, %%mm0\n\t"
399 "movq 8%0, %%mm0\n\t" // dstbase
400 "movq %%mm0, %%mm1\n\t"
401 "pand %%mm4, %%mm0\n\t" //0R0B0R0B
402 "psrlw $8, %%mm1\n\t" //0?0G0?0G
403 "punpckhbw %%mm7, %%mm3\n\t" //srca 0D0D0C0C
404 "pmullw %%mm3, %%mm0\n\t"
405 "pmullw %%mm3, %%mm1\n\t"
406 "psrlw $8, %%mm0\n\t"
407 "pand %%mm5, %%mm1\n\t"
408 "por %%mm1, %%mm0\n\t"
409 "punpckhbw %%mm6, %%mm6\n\t" //src DDDDCCCC
410 "paddb %%mm6, %%mm0\n\t"
411 "movq %%mm0, 8%0\n\t"
413 :: "m" (dstbase
[4*x
]), "m" (srca
[x
]), "m" (src
[x
])
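/* This path blends four RGB32 pixels (16 bytes) per iteration: the first
 * movq handles pixels A and B, the second (at offset 8%0) pixels C and D,
 * reusing the high alpha lanes prepared in %%mm3 and the source bytes
 * saved in %%mm6. MANGLE(bFF) is the same all-0xFF byte constant the
 * other paths keep in a register. */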
421 "movzbl (%0), %%ecx\n\t"
422 "movzbl 1(%0), %%eax\n\t"
423 "movzbl 2(%0), %%edx\n\t"
425 "imull %1, %%ecx\n\t"
426 "imull %1, %%eax\n\t"
427 "imull %1, %%edx\n\t"
433 "movb %%ch, (%0)\n\t"
434 "movb %%ah, 1(%0)\n\t"
435 "movb %%dh, 2(%0)\n\t"
438 :"r" (&dstbase
[4*x
]),
439 "r" ((unsigned)srca
[x
]),
440 "r" (((unsigned)src
[x
])<<8)
441 :"%eax", "%ecx", "%edx"
#endif /* HAVE_MMX */
#else /* non-x86 arch or x86_64 with MMX disabled */
#ifdef FAST_OSD
dstbase[4*x+0]=dstbase[4*x+1]=dstbase[4*x+2]=src[x];
#else
dstbase[4*x+0]=((dstbase[4*x+0]*srca[x])>>8)+src[x];
dstbase[4*x+1]=((dstbase[4*x+1]*srca[x])>>8)+src[x];
dstbase[4*x+2]=((dstbase[4*x+2]*srca[x])>>8)+src[x];
#endif
#endif /* arch_x86 */
asm volatile(EMMS:::"memory");
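
// How this template is meant to be consumed (a disabled sketch, not part
// of this file): the including source defines RENAME once per CPU variant
// and re-includes the template, giving each renderer a per-variant symbol.
// The file name and suffixes below are illustrative assumptions.
#if 0
#define RENAME(a) a ## _C
#include "osd_template.c"
#undef RENAME

#define HAVE_MMX
#define RENAME(a) a ## _MMX
#include "osd_template.c"
#undef RENAME
#endif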