1 // Generic alpha renderers for all YUV modes and RGB depths.
2 // These are "reference implementations", should be optimized later (MMX, etc)
3 // Templating Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
6 //#define FAST_OSD_TABLE
12 #include "cpudetect.h"
// Feature switch tested by the #ifdef / #ifndef CAN_COMPILE_X86_ASM sections of this file.
// NOTE(review): the embedded line numbers jump (12 -> 15 -> 18), so any condition guarding
// this define is not visible in this listing.
15 #define CAN_COMPILE_X86_ASM
18 #ifdef CAN_COMPILE_X86_ASM
// 64-bit constants, each aligned to 8 bytes. None of them is referenced in the visible part
// of this file; presumably they are consumed by the code in osd_template.c -- TODO confirm.
// bFF: all 64 bits set.
19 static const uint64_t bFF
__attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL
;
// mask24lh: only the top 16 bits set.
20 static const unsigned long long mask24lh
__attribute__((aligned(8))) = 0xFFFF000000000000ULL
;
// mask24hl: only the low 48 bits set (the bitwise complement of mask24lh).
21 static const unsigned long long mask24hl
__attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL
;
24 //Note: we have C, X86-nommx, MMX, MMX2 and 3DNOW versions; there's no 3DNOW+MMX2 one
// Template instantiation section: "osd_template.c" is included once per variant, each time
// with RENAME() redefined so that names passed through it get a different suffix
// (_C, _X86, _MMX, _MMX2, _3DNow).
// NOTE(review): the embedded line numbers have gaps (26 -> 30 -> 32 ...), so the bodies of the
// #if blocks below and the other macro definitions preceding each include are not visible here.
26 #if !HAVE_MMX || defined (RUNTIME_CPUDETECT)
30 #ifdef CAN_COMPILE_X86_ASM
32 #if (HAVE_MMX && !HAVE_AMD3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
36 #if HAVE_MMX2 || defined (RUNTIME_CPUDETECT)
40 #if (HAVE_AMD3DNOW && !HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
43 #endif //CAN_COMPILE_X86_ASM
50 #define HAVE_AMD3DNOW 0
52 #ifndef CAN_COMPILE_X86_ASM
// _C variant, HAVE_AMD3DNOW forced to 0.
60 #define HAVE_AMD3DNOW 0
61 #define RENAME(a) a ## _C
62 #include "osd_template.c"
// _X86 variant, HAVE_AMD3DNOW forced to 0.
75 #define HAVE_AMD3DNOW 0
76 #define RENAME(a) a ## _X86
77 #include "osd_template.c"
// _MMX variant, HAVE_AMD3DNOW forced to 0.
88 #define HAVE_AMD3DNOW 0
89 #define RENAME(a) a ## _MMX
90 #include "osd_template.c"
// _MMX2 variant, HAVE_AMD3DNOW forced to 0.
101 #define HAVE_AMD3DNOW 0
102 #define RENAME(a) a ## _MMX2
103 #include "osd_template.c"
// _3DNow variant: the only instantiation with HAVE_AMD3DNOW set to 1.
114 #define HAVE_AMD3DNOW 1
115 #define RENAME(a) a ## _3DNow
116 #include "osd_template.c"
119 #endif //CAN_COMPILE_X86_ASM
// vo_draw_alpha_yv12: public entry point of the alpha renderer for the yv12 format.
// The visible body does no pixel work itself: it passes all seven arguments, unchanged and in
// the same order, to one of the suffixed variants (_MMX2, _3DNow, _MMX, _X86, _C).
//   w, h               - presumably the size of the area to draw (TODO confirm)
//   src, srca          - presumably the source bitmap and its alpha plane, both using srcstride
//   dstbase, dststride - presumably the destination buffer and its stride
// With RUNTIME_CPUDETECT defined, the variant is picked at run time from gCpuCaps flags.
// NOTE(review): the embedded line numbers skip values inside this function (e.g. 124 -> 126,
// 130 -> 132), so the leading `if`, the final `else` and the compile-time selection
// directives are not visible in this listing.
121 void vo_draw_alpha_yv12(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
122 #ifdef RUNTIME_CPUDETECT
123 #ifdef CAN_COMPILE_X86_ASM
124 // ordered by speed / fastest first
126 vo_draw_alpha_yv12_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
127 else if(gCpuCaps
.has3DNow
)
128 vo_draw_alpha_yv12_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
129 else if(gCpuCaps
.hasMMX
)
130 vo_draw_alpha_yv12_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
132 vo_draw_alpha_yv12_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
134 vo_draw_alpha_yv12_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Build without RUNTIME_CPUDETECT: presumably exactly one of the calls below is kept by
// preprocessor conditions that are not shown in this listing.
136 #else //RUNTIME_CPUDETECT
138 vo_draw_alpha_yv12_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
140 vo_draw_alpha_yv12_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
142 vo_draw_alpha_yv12_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
144 vo_draw_alpha_yv12_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
146 vo_draw_alpha_yv12_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
148 #endif //!RUNTIME_CPUDETECT
// vo_draw_alpha_yuy2: public entry point of the alpha renderer for the yuy2 format.
// The visible body does no pixel work itself: it passes all seven arguments, unchanged and in
// the same order, to one of the suffixed variants (_MMX2, _3DNow, _MMX, _X86, _C).
//   w, h               - presumably the size of the area to draw (TODO confirm)
//   src, srca          - presumably the source bitmap and its alpha plane, both using srcstride
//   dstbase, dststride - presumably the destination buffer and its stride
// With RUNTIME_CPUDETECT defined, the variant is picked at run time from gCpuCaps flags.
// NOTE(review): the embedded line numbers skip values inside this function (e.g. 154 -> 156,
// 160 -> 162), so the leading `if`, the final `else` and the compile-time selection
// directives are not visible in this listing.
151 void vo_draw_alpha_yuy2(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
152 #ifdef RUNTIME_CPUDETECT
153 #ifdef CAN_COMPILE_X86_ASM
154 // ordered by speed / fastest first
156 vo_draw_alpha_yuy2_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
157 else if(gCpuCaps
.has3DNow
)
158 vo_draw_alpha_yuy2_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
159 else if(gCpuCaps
.hasMMX
)
160 vo_draw_alpha_yuy2_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
162 vo_draw_alpha_yuy2_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
164 vo_draw_alpha_yuy2_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Build without RUNTIME_CPUDETECT: presumably exactly one of the calls below is kept by
// preprocessor conditions that are not shown in this listing.
166 #else //RUNTIME_CPUDETECT
168 vo_draw_alpha_yuy2_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
170 vo_draw_alpha_yuy2_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
172 vo_draw_alpha_yuy2_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
174 vo_draw_alpha_yuy2_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
176 vo_draw_alpha_yuy2_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
178 #endif //!RUNTIME_CPUDETECT
// vo_draw_alpha_uyvy: public entry point of the alpha renderer for the uyvy format.
// The visible body does no pixel work itself: it passes all seven arguments, unchanged and in
// the same order, to one of the suffixed variants (_MMX2, _3DNow, _MMX, _X86, _C).
//   w, h               - presumably the size of the area to draw (TODO confirm)
//   src, srca          - presumably the source bitmap and its alpha plane, both using srcstride
//   dstbase, dststride - presumably the destination buffer and its stride
// With RUNTIME_CPUDETECT defined, the variant is picked at run time from gCpuCaps flags.
// NOTE(review): the embedded line numbers skip values inside this function (e.g. 184 -> 186,
// 190 -> 192), so the leading `if`, the final `else` and the compile-time selection
// directives are not visible in this listing.
181 void vo_draw_alpha_uyvy(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
182 #ifdef RUNTIME_CPUDETECT
183 #ifdef CAN_COMPILE_X86_ASM
184 // ordered by speed / fastest first
186 vo_draw_alpha_uyvy_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
187 else if(gCpuCaps
.has3DNow
)
188 vo_draw_alpha_uyvy_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
189 else if(gCpuCaps
.hasMMX
)
190 vo_draw_alpha_uyvy_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
192 vo_draw_alpha_uyvy_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
194 vo_draw_alpha_uyvy_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Build without RUNTIME_CPUDETECT: presumably exactly one of the calls below is kept by
// preprocessor conditions that are not shown in this listing.
196 #else //RUNTIME_CPUDETECT
198 vo_draw_alpha_uyvy_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
200 vo_draw_alpha_uyvy_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
202 vo_draw_alpha_uyvy_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
204 vo_draw_alpha_uyvy_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
206 vo_draw_alpha_uyvy_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
208 #endif //!RUNTIME_CPUDETECT
// vo_draw_alpha_rgb24: public entry point of the alpha renderer for the rgb24 format.
// The visible body does no pixel work itself: it passes all seven arguments, unchanged and in
// the same order, to one of the suffixed variants (_MMX2, _3DNow, _MMX, _X86, _C).
//   w, h               - presumably the size of the area to draw (TODO confirm)
//   src, srca          - presumably the source bitmap and its alpha plane, both using srcstride
//   dstbase, dststride - presumably the destination buffer and its stride
// With RUNTIME_CPUDETECT defined, the variant is picked at run time from gCpuCaps flags.
// NOTE(review): the embedded line numbers skip values inside this function (e.g. 214 -> 216,
// 220 -> 222), so the leading `if`, the final `else` and the compile-time selection
// directives are not visible in this listing.
211 void vo_draw_alpha_rgb24(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
212 #ifdef RUNTIME_CPUDETECT
213 #ifdef CAN_COMPILE_X86_ASM
214 // ordered by speed / fastest first
216 vo_draw_alpha_rgb24_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
217 else if(gCpuCaps
.has3DNow
)
218 vo_draw_alpha_rgb24_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
219 else if(gCpuCaps
.hasMMX
)
220 vo_draw_alpha_rgb24_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
222 vo_draw_alpha_rgb24_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
224 vo_draw_alpha_rgb24_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Build without RUNTIME_CPUDETECT: presumably exactly one of the calls below is kept by
// preprocessor conditions that are not shown in this listing.
226 #else //RUNTIME_CPUDETECT
228 vo_draw_alpha_rgb24_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
230 vo_draw_alpha_rgb24_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
232 vo_draw_alpha_rgb24_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
234 vo_draw_alpha_rgb24_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
236 vo_draw_alpha_rgb24_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
238 #endif //!RUNTIME_CPUDETECT
// vo_draw_alpha_rgb32: public entry point of the alpha renderer for the rgb32 format.
// The visible body does no pixel work itself: it passes all seven arguments, unchanged and in
// the same order, to one of the suffixed variants (_MMX2, _3DNow, _MMX, _X86, _C).
//   w, h               - presumably the size of the area to draw (TODO confirm)
//   src, srca          - presumably the source bitmap and its alpha plane, both using srcstride
//   dstbase, dststride - presumably the destination buffer and its stride
// With RUNTIME_CPUDETECT defined, the variant is picked at run time from gCpuCaps flags.
// NOTE(review): the embedded line numbers skip values inside this function (e.g. 244 -> 246,
// 250 -> 252), so the leading `if`, the final `else` and the compile-time selection
// directives are not visible in this listing.
241 void vo_draw_alpha_rgb32(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
242 #ifdef RUNTIME_CPUDETECT
243 #ifdef CAN_COMPILE_X86_ASM
244 // ordered by speed / fastest first
246 vo_draw_alpha_rgb32_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
247 else if(gCpuCaps
.has3DNow
)
248 vo_draw_alpha_rgb32_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
249 else if(gCpuCaps
.hasMMX
)
250 vo_draw_alpha_rgb32_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
252 vo_draw_alpha_rgb32_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
254 vo_draw_alpha_rgb32_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Build without RUNTIME_CPUDETECT: presumably exactly one of the calls below is kept by
// preprocessor conditions that are not shown in this listing.
256 #else //RUNTIME_CPUDETECT
258 vo_draw_alpha_rgb32_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
260 vo_draw_alpha_rgb32_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
262 vo_draw_alpha_rgb32_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
264 vo_draw_alpha_rgb32_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
266 vo_draw_alpha_rgb32_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
268 #endif //!RUNTIME_CPUDETECT
// Optional lookup tables, compiled only when FAST_OSD_TABLE is defined.
// Each has 256 entries of one 16-bit value. They are filled in vo_draw_alpha_init and read
// with src[x] as the index in vo_draw_alpha_rgb15 / vo_draw_alpha_rgb16.
271 #ifdef FAST_OSD_TABLE
272 static unsigned short fast_osd_15bpp_table
[256];
273 static unsigned short fast_osd_16bpp_table
[256];
// vo_draw_alpha_init: one-time setup for the OSD alpha renderers.
// When FAST_OSD_TABLE is defined it fills the two 15/16 bpp lookup tables; afterwards, if
// mp_msg_test(MSGT_OSD, MSGL_V) is true, it logs which implementation is in use.
// NOTE(review): the embedded line numbers have gaps (277 -> 280, 281 -> 284, 285 -> 287 ...),
// so the loop driving `i`, the first `if` of the run-time chain and the compile-time
// selection directives are not visible in this listing.
276 void vo_draw_alpha_init(void){
277 #ifdef FAST_OSD_TABLE
// 15 bpp entry: the top 5 bits of i placed in the three 5-bit fields at bits 10, 5 and 0.
280 fast_osd_15bpp_table
[i
]=((i
>>3)<<10)|((i
>>3)<<5)|(i
>>3);
// 16 bpp entry: top 5 bits of i at bit 11, top 6 bits at bit 5, top 5 bits at bit 0.
281 fast_osd_16bpp_table
[i
]=((i
>>3)<<11)|((i
>>2)<<5)|(i
>>3);
284 //FIXME the optimized stuff is a lie for 15/16bpp as they aren't optimized yet
// Only report the chosen implementation when the MSGT_OSD / MSGL_V message test passes.
285 if( mp_msg_test(MSGT_OSD
,MSGL_V
) )
287 #ifdef RUNTIME_CPUDETECT
288 #ifdef CAN_COMPILE_X86_ASM
289 // ordered by speed, fastest first
291 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
292 else if(gCpuCaps
.has3DNow
)
293 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
294 else if(gCpuCaps
.hasMMX
)
295 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX Optimized OnScreenDisplay\n");
297 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using X86 Optimized OnScreenDisplay\n");
299 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using Unoptimized OnScreenDisplay\n");
// Build without RUNTIME_CPUDETECT: presumably exactly one of the messages below is kept by
// preprocessor conditions that are not shown in this listing.
301 #else //RUNTIME_CPUDETECT
303 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
305 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
307 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX Optimized OnScreenDisplay\n");
309 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using X86 Optimized OnScreenDisplay\n");
311 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using Unoptimized OnScreenDisplay\n");
313 #endif //!RUNTIME_CPUDETECT
// vo_draw_alpha_rgb15: alpha renderer for 15 bpp destinations (three 5-bit fields per pixel).
// Unlike the yv12/yuy2/uyvy/rgb24/rgb32 entry points, the visible body works on pixels
// directly instead of forwarding to a suffixed variant.
//   src[x]  - 8-bit source value for pixel x
//   srca[x] - 8-bit factor the existing destination fields are multiplied by
//   dstbase - destination, accessed as unsigned short pixels
//   w, h, srcstride, dststride - not used in the visible lines; presumably they drive the
//                                row/column loops -- TODO confirm
// NOTE(review): the embedded line numbers have gaps (320 -> 325, 326 -> 328, 329 -> 332 ...),
// so the loops over x and the rows, and the conditions choosing between the overwrite path
// and the blend path, are not visible in this listing.
317 void vo_draw_alpha_rgb15(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
// View the destination as 16-bit pixels.
320 register unsigned short *dst
= (unsigned short*) dstbase
;
// Overwrite path, table form: the pixel is replaced by the table entry for src[x].
325 #ifdef FAST_OSD_TABLE
326 dst
[x
]=fast_osd_15bpp_table
[src
[x
]];
// Overwrite path, computed form (presumably the non-table alternative): the top 5 bits of
// src[x] are written into all three 5-bit fields.
328 register unsigned int a
=src
[x
]>>3;
329 dst
[x
]=(a
<<10)|(a
<<5)|a
;
// Blend path: unpack the three 5-bit fields of the existing pixel.
332 unsigned char r
=dst
[x
]&0x1F;
333 unsigned char g
=(dst
[x
]>>5)&0x1F;
334 unsigned char b
=(dst
[x
]>>10)&0x1F;
// Per field: scale by srca[x] (>>5), add src[x], then >>3 to bring the sum back to 5 bits.
335 r
=(((r
*srca
[x
])>>5)+src
[x
])>>3;
336 g
=(((g
*srca
[x
])>>5)+src
[x
])>>3;
337 b
=(((b
*srca
[x
])>>5)+src
[x
])>>3;
// Repack the fields at bits 10, 5 and 0.
338 dst
[x
]=(b
<<10)|(g
<<5)|r
;
// vo_draw_alpha_rgb16: alpha renderer for 16 bpp destinations (5-, 6- and 5-bit fields).
// Unlike the yv12/yuy2/uyvy/rgb24/rgb32 entry points, the visible body works on pixels
// directly instead of forwarding to a suffixed variant.
//   src[x]  - 8-bit source value for pixel x
//   srca[x] - 8-bit factor the existing destination fields are multiplied by
//   dstbase - destination, accessed as unsigned short pixels
//   w, h, srcstride, dststride - not used in the visible lines; presumably they drive the
//                                row/column loops -- TODO confirm
// NOTE(review): the embedded line numbers have gaps (352 -> 357, 358 -> 360, 360 -> 363 ...),
// so the loops over x and the rows, and the conditions choosing between the overwrite path
// and the blend path, are not visible in this listing.
349 void vo_draw_alpha_rgb16(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
// View the destination as 16-bit pixels.
352 register unsigned short *dst
= (unsigned short*) dstbase
;
// Overwrite path, table form: the pixel is replaced by the table entry for src[x].
357 #ifdef FAST_OSD_TABLE
358 dst
[x
]=fast_osd_16bpp_table
[src
[x
]];
// Overwrite path, computed form (presumably the non-table alternative): top 5 bits of src[x]
// at bit 11, top 6 bits at bit 5, top 5 bits at bit 0.
360 dst
[x
]=((src
[x
]>>3)<<11)|((src
[x
]>>2)<<5)|(src
[x
]>>3);
// Blend path: unpack the 5-bit, 6-bit and 5-bit fields of the existing pixel.
363 unsigned char r
=dst
[x
]&0x1F;
364 unsigned char g
=(dst
[x
]>>5)&0x3F;
365 unsigned char b
=(dst
[x
]>>11)&0x1F;
// Per field: scale by srca[x], add src[x], then shift back to the field width. The 6-bit
// middle field uses >>6 and >>2 where the 5-bit fields use >>5 and >>3.
366 r
=(((r
*srca
[x
])>>5)+src
[x
])>>3;
367 g
=(((g
*srca
[x
])>>6)+src
[x
])>>2;
368 b
=(((b
*srca
[x
])>>5)+src
[x
])>>3;
// Repack the fields at bits 11, 5 and 0.
369 dst
[x
]=(b
<<11)|(g
<<5)|r
;