1 // Generic alpha renderers for all YUV modes and RGB depths.
2 // These are "reference implementations", should be optimized later (MMX, etc)
3 // Templating Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
6 //#define FAST_OSD_TABLE
12 #include "cpudetect.h"
15 #define CAN_COMPILE_X86_ASM
18 #ifdef CAN_COMPILE_X86_ASM
// 64-bit, 8-byte-aligned bit masks.
//   bFF      - all 64 bits set
//   mask24lh - only the top 16 bits set
//   mask24hl - only the low 48 bits set (bitwise complement of mask24lh)
// All three use uint64_t so the operand width is explicit and the sibling
// declarations agree with each other.
// NOTE(review): none of these is referenced in the visible part of this file;
// presumably they are consumed by the code pulled in from osd_template.c -
// confirm before changing alignment or values.
static const uint64_t bFF      __attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL;
static const uint64_t mask24lh __attribute__((aligned(8))) = 0xFFFF000000000000ULL;
static const uint64_t mask24hl __attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL;
24 //Note: we have C, X86-nommx, MMX, MMX2, 3DNOW versions; there's no 3DNOW+MMX2 one
26 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
30 #ifdef CAN_COMPILE_X86_ASM
32 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
36 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
40 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
43 #endif //CAN_COMPILE_X86_ASM
49 #ifndef CAN_COMPILE_X86_ASM
55 #define RENAME(a) a ## _C
56 #include "osd_template.c"
67 #define RENAME(a) a ## _X86
68 #include "osd_template.c"
77 #define RENAME(a) a ## _MMX
78 #include "osd_template.c"
87 #define RENAME(a) a ## _MMX2
88 #include "osd_template.c"
97 #define RENAME(a) a ## _3DNow
98 #include "osd_template.c"
101 #endif //CAN_COMPILE_X86_ASM
// Public entry point for the yv12 alpha renderer: forwards every argument,
// unchanged, to exactly one suffixed implementation (_MMX2, _3DNow, _MMX,
// _X86 or _C; presumably the variants generated by the RENAME() includes of
// osd_template.c).
//
// Selection:
//   RUNTIME_CPUDETECT + CAN_COMPILE_X86_ASM : chosen at run time from
//       gCpuCaps, fastest first (MMX2, 3DNow, MMX, plain x86).
//   RUNTIME_CPUDETECT without x86 asm       : C version.
//   otherwise                               : fixed at compile time by
//       HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX / ARCH_X86, else the C version.
//
// NOTE(review): the listing this block was restored from had lost the leading
// `if`, the bare `else` / `#else` lines, the closing `#endif`s and the closing
// brace. They are re-inserted as forced by the surviving `else if` / `#elif`
// lines; the field name gCpuCaps.hasMMX2 is inferred from hasMMX / has3DNow -
// confirm against cpudetect.h.
void vo_draw_alpha_yv12(int w, int h, unsigned char *src, unsigned char *srca,
                        int srcstride, unsigned char *dstbase, int dststride)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed / fastest first
    if (gCpuCaps.hasMMX2) {
        vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.has3DNow) {
        vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.hasMMX) {
        vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
    } else {
        vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
    }
#else
    vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    vo_draw_alpha_yv12_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_3DNOW)
    vo_draw_alpha_yv12_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_MMX)
    vo_draw_alpha_yv12_MMX(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined(ARCH_X86)
    vo_draw_alpha_yv12_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
    vo_draw_alpha_yv12_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#endif //!RUNTIME_CPUDETECT
}
// Public entry point for the yuy2 alpha renderer: forwards every argument,
// unchanged, to exactly one suffixed implementation (_MMX2, _3DNow, _MMX,
// _X86 or _C; presumably the variants generated by the RENAME() includes of
// osd_template.c).
//
// Selection:
//   RUNTIME_CPUDETECT + CAN_COMPILE_X86_ASM : chosen at run time from
//       gCpuCaps, fastest first (MMX2, 3DNow, MMX, plain x86).
//   RUNTIME_CPUDETECT without x86 asm       : C version.
//   otherwise                               : fixed at compile time by
//       HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX / ARCH_X86, else the C version.
//
// NOTE(review): the listing this block was restored from had lost the leading
// `if`, the bare `else` / `#else` lines, the closing `#endif`s and the closing
// brace. They are re-inserted as forced by the surviving `else if` / `#elif`
// lines; the field name gCpuCaps.hasMMX2 is inferred from hasMMX / has3DNow -
// confirm against cpudetect.h.
void vo_draw_alpha_yuy2(int w, int h, unsigned char *src, unsigned char *srca,
                        int srcstride, unsigned char *dstbase, int dststride)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed / fastest first
    if (gCpuCaps.hasMMX2) {
        vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.has3DNow) {
        vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.hasMMX) {
        vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
    } else {
        vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
    }
#else
    vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    vo_draw_alpha_yuy2_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_3DNOW)
    vo_draw_alpha_yuy2_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_MMX)
    vo_draw_alpha_yuy2_MMX(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined(ARCH_X86)
    vo_draw_alpha_yuy2_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
    vo_draw_alpha_yuy2_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#endif //!RUNTIME_CPUDETECT
}
// Public entry point for the uyvy alpha renderer: forwards every argument,
// unchanged, to exactly one suffixed implementation (_MMX2, _3DNow, _MMX,
// _X86 or _C; presumably the variants generated by the RENAME() includes of
// osd_template.c).
//
// Selection:
//   RUNTIME_CPUDETECT + CAN_COMPILE_X86_ASM : chosen at run time from
//       gCpuCaps, fastest first (MMX2, 3DNow, MMX, plain x86).
//   RUNTIME_CPUDETECT without x86 asm       : C version.
//   otherwise                               : fixed at compile time by
//       HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX / ARCH_X86, else the C version.
//
// NOTE(review): the listing this block was restored from had lost the leading
// `if`, the bare `else` / `#else` lines, the closing `#endif`s and the closing
// brace. They are re-inserted as forced by the surviving `else if` / `#elif`
// lines; the field name gCpuCaps.hasMMX2 is inferred from hasMMX / has3DNow -
// confirm against cpudetect.h.
void vo_draw_alpha_uyvy(int w, int h, unsigned char *src, unsigned char *srca,
                        int srcstride, unsigned char *dstbase, int dststride)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed / fastest first
    if (gCpuCaps.hasMMX2) {
        vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.has3DNow) {
        vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.hasMMX) {
        vo_draw_alpha_uyvy_MMX(w, h, src, srca, srcstride, dstbase, dststride);
    } else {
        vo_draw_alpha_uyvy_X86(w, h, src, srca, srcstride, dstbase, dststride);
    }
#else
    vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    vo_draw_alpha_uyvy_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_3DNOW)
    vo_draw_alpha_uyvy_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_MMX)
    vo_draw_alpha_uyvy_MMX(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined(ARCH_X86)
    vo_draw_alpha_uyvy_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
    vo_draw_alpha_uyvy_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#endif //!RUNTIME_CPUDETECT
}
// Public entry point for the rgb24 alpha renderer: forwards every argument,
// unchanged, to exactly one suffixed implementation (_MMX2, _3DNow, _MMX,
// _X86 or _C; presumably the variants generated by the RENAME() includes of
// osd_template.c).
//
// Selection:
//   RUNTIME_CPUDETECT + CAN_COMPILE_X86_ASM : chosen at run time from
//       gCpuCaps, fastest first (MMX2, 3DNow, MMX, plain x86).
//   RUNTIME_CPUDETECT without x86 asm       : C version.
//   otherwise                               : fixed at compile time by
//       HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX / ARCH_X86, else the C version.
//
// NOTE(review): the listing this block was restored from had lost the leading
// `if`, the bare `else` / `#else` lines, the closing `#endif`s and the closing
// brace. They are re-inserted as forced by the surviving `else if` / `#elif`
// lines; the field name gCpuCaps.hasMMX2 is inferred from hasMMX / has3DNow -
// confirm against cpudetect.h.
void vo_draw_alpha_rgb24(int w, int h, unsigned char *src, unsigned char *srca,
                         int srcstride, unsigned char *dstbase, int dststride)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed / fastest first
    if (gCpuCaps.hasMMX2) {
        vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.has3DNow) {
        vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.hasMMX) {
        vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
    } else {
        vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
    }
#else
    vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    vo_draw_alpha_rgb24_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_3DNOW)
    vo_draw_alpha_rgb24_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_MMX)
    vo_draw_alpha_rgb24_MMX(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined(ARCH_X86)
    vo_draw_alpha_rgb24_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
    vo_draw_alpha_rgb24_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#endif //!RUNTIME_CPUDETECT
}
// Public entry point for the rgb32 alpha renderer: forwards every argument,
// unchanged, to exactly one suffixed implementation (_MMX2, _3DNow, _MMX,
// _X86 or _C; presumably the variants generated by the RENAME() includes of
// osd_template.c).
//
// Selection:
//   RUNTIME_CPUDETECT + CAN_COMPILE_X86_ASM : chosen at run time from
//       gCpuCaps, fastest first (MMX2, 3DNow, MMX, plain x86).
//   RUNTIME_CPUDETECT without x86 asm       : C version.
//   otherwise                               : fixed at compile time by
//       HAVE_MMX2 / HAVE_3DNOW / HAVE_MMX / ARCH_X86, else the C version.
//
// NOTE(review): the listing this block was restored from had lost the leading
// `if`, the bare `else` / `#else` lines, the closing `#endif`s and the closing
// brace. They are re-inserted as forced by the surviving `else if` / `#elif`
// lines; the field name gCpuCaps.hasMMX2 is inferred from hasMMX / has3DNow -
// confirm against cpudetect.h.
void vo_draw_alpha_rgb32(int w, int h, unsigned char *src, unsigned char *srca,
                         int srcstride, unsigned char *dstbase, int dststride)
{
#ifdef RUNTIME_CPUDETECT
#ifdef CAN_COMPILE_X86_ASM
    // ordered by speed / fastest first
    if (gCpuCaps.hasMMX2) {
        vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.has3DNow) {
        vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
    } else if (gCpuCaps.hasMMX) {
        vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
    } else {
        vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
    }
#else
    vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#else //RUNTIME_CPUDETECT
#ifdef HAVE_MMX2
    vo_draw_alpha_rgb32_MMX2(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_3DNOW)
    vo_draw_alpha_rgb32_3DNow(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined (HAVE_MMX)
    vo_draw_alpha_rgb32_MMX(w, h, src, srca, srcstride, dstbase, dststride);
#elif defined(ARCH_X86)
    vo_draw_alpha_rgb32_X86(w, h, src, srca, srcstride, dstbase, dststride);
#else
    vo_draw_alpha_rgb32_C(w, h, src, srca, srcstride, dstbase, dststride);
#endif
#endif //!RUNTIME_CPUDETECT
}
253 #ifdef FAST_OSD_TABLE
254 static unsigned short fast_osd_15bpp_table
[256];
255 static unsigned short fast_osd_16bpp_table
[256];
258 void vo_draw_alpha_init(void){
259 #ifdef FAST_OSD_TABLE
262 fast_osd_15bpp_table
[i
]=((i
>>3)<<10)|((i
>>3)<<5)|(i
>>3);
263 fast_osd_16bpp_table
[i
]=((i
>>3)<<11)|((i
>>2)<<5)|(i
>>3);
266 //FIXME the optimized stuff is a lie for 15/16bpp as they aren't optimized yet
267 if( mp_msg_test(MSGT_OSD
,MSGL_V
) )
269 #ifdef RUNTIME_CPUDETECT
270 #ifdef CAN_COMPILE_X86_ASM
271 // ordered per speed fasterst first
273 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
274 else if(gCpuCaps
.has3DNow
)
275 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
276 else if(gCpuCaps
.hasMMX
)
277 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX Optimized OnScreenDisplay\n");
279 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using X86 Optimized OnScreenDisplay\n");
281 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using Unoptimized OnScreenDisplay\n");
283 #else //RUNTIME_CPUDETECT
285 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
286 #elif defined (HAVE_3DNOW)
287 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
288 #elif defined (HAVE_MMX)
289 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX Optimized OnScreenDisplay\n");
290 #elif defined(ARCH_X86)
291 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using X86 Optimized OnScreenDisplay\n");
293 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using Unoptimized OnScreenDisplay\n");
295 #endif //!RUNTIME_CPUDETECT
// Blend an 8-bit OSD bitmap onto a 15 bpp destination (5 bits per channel at
// bit offsets 0, 5 and 10; r/g/b below are just the names the code gives them).
//
//   w, h       - number of pixels per row / number of rows processed
//   src        - bitmap values, one byte per pixel; advanced by srcstride per row
//   srca       - per-pixel multiplier for the existing destination value, one
//                byte per pixel; advanced by srcstride per row
//   dstbase    - first destination row, accessed as unsigned short pixels
//   dststride  - distance between destination rows, in bytes
//
// Default path, per channel c: c = (((c * srca[x]) >> 5) + src[x]) >> 3,
// i.e. the channel is scaled by srca, src is added in 8-bit precision and the
// result is reduced back to 5 bits. No clamping is done, so callers must
// supply src/srca pairs whose sum stays within 8 bits.
// Fast path: the destination is simply overwritten with the grey level of
// src[x] (via fast_osd_15bpp_table when FAST_OSD_TABLE is defined).
//
// NOTE(review): the listing this block was restored from contained only the
// per-pixel statements. The row/column loops, the stride advances, the
// `if (srca[x])` skip of zero-alpha pixels and the name of the outer selector
// macro (FAST_OSD) are reconstructed - verify against the upstream file.
void vo_draw_alpha_rgb15(int w, int h, unsigned char *src, unsigned char *srca,
                         int srcstride, unsigned char *dstbase, int dststride)
{
    int y;
    for (y = 0; y < h; y++) {
        register unsigned short *dst = (unsigned short *) dstbase;
        register int x;
        for (x = 0; x < w; x++) {
            if (srca[x]) {
#ifdef FAST_OSD
#ifdef FAST_OSD_TABLE
                dst[x] = fast_osd_15bpp_table[src[x]];
#else
                register unsigned int a = src[x] >> 3;
                dst[x] = (a << 10) | (a << 5) | a;
#endif
#else
                unsigned char r = dst[x] & 0x1F;
                unsigned char g = (dst[x] >> 5) & 0x1F;
                unsigned char b = (dst[x] >> 10) & 0x1F;
                r = (((r * srca[x]) >> 5) + src[x]) >> 3;
                g = (((g * srca[x]) >> 5) + src[x]) >> 3;
                b = (((b * srca[x]) >> 5) + src[x]) >> 3;
                dst[x] = (b << 10) | (g << 5) | r;
#endif
            }
        }
        src += srcstride;
        srca += srcstride;
        dstbase += dststride;
    }
}
// Blend an 8-bit OSD bitmap onto a 16 bpp destination (5-bit field at bit 0,
// 6-bit field at bit 5, 5-bit field at bit 11; r/g/b below are just the names
// the code gives them).
//
//   w, h       - number of pixels per row / number of rows processed
//   src        - bitmap values, one byte per pixel; advanced by srcstride per row
//   srca       - per-pixel multiplier for the existing destination value, one
//                byte per pixel; advanced by srcstride per row
//   dstbase    - first destination row, accessed as unsigned short pixels
//   dststride  - distance between destination rows, in bytes
//
// Default path: 5-bit channels use c = (((c * srca[x]) >> 5) + src[x]) >> 3,
// the 6-bit channel uses c = (((c * srca[x]) >> 6) + src[x]) >> 2. No
// clamping is done, so callers must supply src/srca pairs whose sum stays
// within 8 bits.
// Fast path: the destination is simply overwritten with the grey level of
// src[x] (via fast_osd_16bpp_table when FAST_OSD_TABLE is defined).
//
// NOTE(review): the listing this block was restored from contained only the
// per-pixel statements. The row/column loops, the stride advances, the
// `if (srca[x])` skip of zero-alpha pixels and the name of the outer selector
// macro (FAST_OSD) are reconstructed - verify against the upstream file.
void vo_draw_alpha_rgb16(int w, int h, unsigned char *src, unsigned char *srca,
                         int srcstride, unsigned char *dstbase, int dststride)
{
    int y;
    for (y = 0; y < h; y++) {
        register unsigned short *dst = (unsigned short *) dstbase;
        register int x;
        for (x = 0; x < w; x++) {
            if (srca[x]) {
#ifdef FAST_OSD
#ifdef FAST_OSD_TABLE
                dst[x] = fast_osd_16bpp_table[src[x]];
#else
                dst[x] = ((src[x] >> 3) << 11) | ((src[x] >> 2) << 5) | (src[x] >> 3);
#endif
#else
                unsigned char r = dst[x] & 0x1F;
                unsigned char g = (dst[x] >> 5) & 0x3F;
                unsigned char b = (dst[x] >> 11) & 0x1F;
                r = (((r * srca[x]) >> 5) + src[x]) >> 3;
                g = (((g * srca[x]) >> 6) + src[x]) >> 2;
                b = (((b * srca[x]) >> 5) + src[x]) >> 3;
                dst[x] = (b << 11) | (g << 5) | r;
#endif
            }
        }
        src += srcstride;
        srca += srcstride;
        dstbase += dststride;
    }
}