1 // Generic alpha renderers for all YUV modes and RGB depths.
2 // These are "reference implementations", should be optimized later (MMX, etc)
3 // Templating Code from Michael Niedermayer (michaelni@gmx.at) is under GPL
6 //#define FAST_OSD_TABLE
12 #include "cpudetect.h"
// Feature macro tested by the #ifdef blocks in this file to decide whether the
// x86 assembly variants are available.
// NOTE(review): the define looks unconditional here, but numbered lines around
// it are absent from this listing, so an enclosing guard may exist -- confirm.
16 #define CAN_COMPILE_X86_ASM
19 #ifdef CAN_COMPILE_X86_ASM
// 64-bit constant with every bit set, aligned to 8 bytes.
// attribute_used is a project macro that is not defined in this listing;
// presumably it keeps the otherwise unreferenced symbol alive -- TODO confirm.
20 static const uint64_t bFF attribute_used
__attribute__((aligned(8))) = 0xFFFFFFFFFFFFFFFFULL
;
// 8-byte-aligned 64-bit mask with only the top 16 bits set.
21 static const unsigned long long mask24lh
__attribute__((aligned(8))) = 0xFFFF000000000000ULL
;
// 8-byte-aligned 64-bit mask with only the low 48 bits set (the bitwise
// complement of mask24lh).
// NOTE(review): none of the three constants is referenced in this listing;
// presumably the assembly in osd_template.c uses them -- confirm.
22 static const unsigned long long mask24hl
__attribute__((aligned(8))) = 0x0000FFFFFFFFFFFFULL
;
25 //Note: we have C, X86-nommx, MMX, MMX2 and 3DNOW versions; there is no 3DNOW+MMX2 one
// Build-configuration section: a series of preprocessor tests followed by
// repeated inclusion of osd_template.c under different RENAME definitions.
// NOTE(review): the numbered lines between the directives below are absent from
// this listing (whatever each #if guards and the matching #endif lines), so the
// guarded content cannot be read off here -- confirm against the complete file.

// True when HAVE_MMX is not defined, or when RUNTIME_CPUDETECT is defined.
27 #if !defined (HAVE_MMX) || defined (RUNTIME_CPUDETECT)
31 #ifdef CAN_COMPILE_X86_ASM
// True when HAVE_MMX is defined without HAVE_3DNOW and HAVE_MMX2, or when
// RUNTIME_CPUDETECT is defined.
33 #if (defined (HAVE_MMX) && !defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
// True when HAVE_MMX2 is defined, or when RUNTIME_CPUDETECT is defined.
37 #if defined (HAVE_MMX2) || defined (RUNTIME_CPUDETECT)
// True when HAVE_3DNOW is defined without HAVE_MMX2, or when RUNTIME_CPUDETECT
// is defined.
41 #if (defined (HAVE_3DNOW) && !defined (HAVE_MMX2)) || defined (RUNTIME_CPUDETECT)
44 #endif //CAN_COMPILE_X86_ASM
50 #ifndef CAN_COMPILE_X86_ASM
// Each RENAME definition appends a variant suffix to its argument by token
// pasting, and osd_template.c is included after each definition.
// Presumably the template wraps its function names in RENAME() so that every
// inclusion produces one suffixed copy (the dispatchers in this file call
// names ending in _C, _X86, _MMX, _MMX2 and _3DNow) -- confirm in osd_template.c.
56 #define RENAME(a) a ## _C
57 #include "osd_template.c"
68 #define RENAME(a) a ## _X86
69 #include "osd_template.c"
78 #define RENAME(a) a ## _MMX
79 #include "osd_template.c"
88 #define RENAME(a) a ## _MMX2
89 #include "osd_template.c"
98 #define RENAME(a) a ## _3DNow
99 #include "osd_template.c"
102 #endif //CAN_COMPILE_X86_ASM
// Dispatcher for the yv12 alpha renderer: forwards all seven arguments,
// unchanged and in the same order, to one of the suffixed variants
// (_MMX2, _3DNow, _MMX, _X86 or _C).
// With RUNTIME_CPUDETECT the choice is made at run time from gCpuCaps flags;
// otherwise it is fixed at compile time by HAVE_3DNOW / HAVE_MMX / ARCH_X86.
// The parameter meanings are not visible in this file; presumably w/h give the
// size of the region, src/srca the bitmap and its alpha plane, dstbase the
// destination, and the strides the per-row steps -- TODO confirm in osd_template.c.
// NOTE(review): several numbered lines are absent from this listing (the
// condition guarding the first MMX2 call, the else/#else branches, the #endif
// lines and the closing brace), so the control flow shown here is incomplete.
104 void vo_draw_alpha_yv12(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
105 #ifdef RUNTIME_CPUDETECT
106 #ifdef CAN_COMPILE_X86_ASM
107 // ordered by speed / fastest first
109 vo_draw_alpha_yv12_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
110 else if(gCpuCaps
.has3DNow
)
111 vo_draw_alpha_yv12_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
112 else if(gCpuCaps
.hasMMX
)
113 vo_draw_alpha_yv12_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// The two calls below have no visible guard; presumably they are the final
// else branch and the non-x86 fallback -- confirm.
115 vo_draw_alpha_yv12_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
117 vo_draw_alpha_yv12_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Compile-time selection when RUNTIME_CPUDETECT is not defined.
119 #else //RUNTIME_CPUDETECT
121 vo_draw_alpha_yv12_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
122 #elif defined (HAVE_3DNOW)
123 vo_draw_alpha_yv12_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
124 #elif defined (HAVE_MMX)
125 vo_draw_alpha_yv12_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
126 #elif defined(ARCH_X86)
127 vo_draw_alpha_yv12_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
129 vo_draw_alpha_yv12_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
131 #endif //!RUNTIME_CPUDETECT
// Dispatcher for the yuy2 alpha renderer: forwards all seven arguments,
// unchanged and in the same order, to one of the suffixed variants
// (_MMX2, _3DNow, _MMX, _X86 or _C).
// With RUNTIME_CPUDETECT the choice is made at run time from gCpuCaps flags;
// otherwise it is fixed at compile time by HAVE_3DNOW / HAVE_MMX / ARCH_X86.
// The parameter meanings are not visible in this file; presumably w/h give the
// size of the region, src/srca the bitmap and its alpha plane, dstbase the
// destination, and the strides the per-row steps -- TODO confirm in osd_template.c.
// NOTE(review): several numbered lines are absent from this listing (the
// condition guarding the first MMX2 call, the else/#else branches, the #endif
// lines and the closing brace), so the control flow shown here is incomplete.
134 void vo_draw_alpha_yuy2(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
135 #ifdef RUNTIME_CPUDETECT
136 #ifdef CAN_COMPILE_X86_ASM
137 // ordered by speed / fastest first
139 vo_draw_alpha_yuy2_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
140 else if(gCpuCaps
.has3DNow
)
141 vo_draw_alpha_yuy2_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
142 else if(gCpuCaps
.hasMMX
)
143 vo_draw_alpha_yuy2_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// The two calls below have no visible guard; presumably they are the final
// else branch and the non-x86 fallback -- confirm.
145 vo_draw_alpha_yuy2_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
147 vo_draw_alpha_yuy2_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Compile-time selection when RUNTIME_CPUDETECT is not defined.
149 #else //RUNTIME_CPUDETECT
151 vo_draw_alpha_yuy2_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
152 #elif defined (HAVE_3DNOW)
153 vo_draw_alpha_yuy2_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
154 #elif defined (HAVE_MMX)
155 vo_draw_alpha_yuy2_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
156 #elif defined(ARCH_X86)
157 vo_draw_alpha_yuy2_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
159 vo_draw_alpha_yuy2_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
161 #endif //!RUNTIME_CPUDETECT
// Dispatcher for the uyvy alpha renderer: forwards all seven arguments,
// unchanged and in the same order, to one of the suffixed variants
// (_MMX2, _3DNow, _MMX, _X86 or _C).
// With RUNTIME_CPUDETECT the choice is made at run time from gCpuCaps flags;
// otherwise it is fixed at compile time by HAVE_3DNOW / HAVE_MMX / ARCH_X86.
// The parameter meanings are not visible in this file; presumably w/h give the
// size of the region, src/srca the bitmap and its alpha plane, dstbase the
// destination, and the strides the per-row steps -- TODO confirm in osd_template.c.
// NOTE(review): several numbered lines are absent from this listing (the
// condition guarding the first MMX2 call, the else/#else branches, the #endif
// lines and the closing brace), so the control flow shown here is incomplete.
164 void vo_draw_alpha_uyvy(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
165 #ifdef RUNTIME_CPUDETECT
166 #ifdef CAN_COMPILE_X86_ASM
167 // ordered by speed / fastest first
169 vo_draw_alpha_uyvy_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
170 else if(gCpuCaps
.has3DNow
)
171 vo_draw_alpha_uyvy_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
172 else if(gCpuCaps
.hasMMX
)
173 vo_draw_alpha_uyvy_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// The two calls below have no visible guard; presumably they are the final
// else branch and the non-x86 fallback -- confirm.
175 vo_draw_alpha_uyvy_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
177 vo_draw_alpha_uyvy_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Compile-time selection when RUNTIME_CPUDETECT is not defined.
179 #else //RUNTIME_CPUDETECT
181 vo_draw_alpha_uyvy_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
182 #elif defined (HAVE_3DNOW)
183 vo_draw_alpha_uyvy_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
184 #elif defined (HAVE_MMX)
185 vo_draw_alpha_uyvy_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
186 #elif defined(ARCH_X86)
187 vo_draw_alpha_uyvy_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
189 vo_draw_alpha_uyvy_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
191 #endif //!RUNTIME_CPUDETECT
// Dispatcher for the rgb24 alpha renderer: forwards all seven arguments,
// unchanged and in the same order, to one of the suffixed variants
// (_MMX2, _3DNow, _MMX, _X86 or _C).
// With RUNTIME_CPUDETECT the choice is made at run time from gCpuCaps flags;
// otherwise it is fixed at compile time by HAVE_3DNOW / HAVE_MMX / ARCH_X86.
// The parameter meanings are not visible in this file; presumably w/h give the
// size of the region, src/srca the bitmap and its alpha plane, dstbase the
// destination, and the strides the per-row steps -- TODO confirm in osd_template.c.
// NOTE(review): several numbered lines are absent from this listing (the
// condition guarding the first MMX2 call, the else/#else branches, the #endif
// lines and the closing brace), so the control flow shown here is incomplete.
194 void vo_draw_alpha_rgb24(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
195 #ifdef RUNTIME_CPUDETECT
196 #ifdef CAN_COMPILE_X86_ASM
197 // ordered by speed / fastest first
199 vo_draw_alpha_rgb24_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
200 else if(gCpuCaps
.has3DNow
)
201 vo_draw_alpha_rgb24_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
202 else if(gCpuCaps
.hasMMX
)
203 vo_draw_alpha_rgb24_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// The two calls below have no visible guard; presumably they are the final
// else branch and the non-x86 fallback -- confirm.
205 vo_draw_alpha_rgb24_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
207 vo_draw_alpha_rgb24_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Compile-time selection when RUNTIME_CPUDETECT is not defined.
209 #else //RUNTIME_CPUDETECT
211 vo_draw_alpha_rgb24_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
212 #elif defined (HAVE_3DNOW)
213 vo_draw_alpha_rgb24_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
214 #elif defined (HAVE_MMX)
215 vo_draw_alpha_rgb24_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
216 #elif defined(ARCH_X86)
217 vo_draw_alpha_rgb24_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
219 vo_draw_alpha_rgb24_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
221 #endif //!RUNTIME_CPUDETECT
// Dispatcher for the rgb32 alpha renderer: forwards all seven arguments,
// unchanged and in the same order, to one of the suffixed variants
// (_MMX2, _3DNow, _MMX, _X86 or _C).
// With RUNTIME_CPUDETECT the choice is made at run time from gCpuCaps flags;
// otherwise it is fixed at compile time by HAVE_3DNOW / HAVE_MMX / ARCH_X86.
// The parameter meanings are not visible in this file; presumably w/h give the
// size of the region, src/srca the bitmap and its alpha plane, dstbase the
// destination, and the strides the per-row steps -- TODO confirm in osd_template.c.
// NOTE(review): several numbered lines are absent from this listing (the
// condition guarding the first MMX2 call, the else/#else branches, the #endif
// lines and the closing brace), so the control flow shown here is incomplete.
224 void vo_draw_alpha_rgb32(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
225 #ifdef RUNTIME_CPUDETECT
226 #ifdef CAN_COMPILE_X86_ASM
227 // ordered by speed / fastest first
229 vo_draw_alpha_rgb32_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
230 else if(gCpuCaps
.has3DNow
)
231 vo_draw_alpha_rgb32_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
232 else if(gCpuCaps
.hasMMX
)
233 vo_draw_alpha_rgb32_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// The two calls below have no visible guard; presumably they are the final
// else branch and the non-x86 fallback -- confirm.
235 vo_draw_alpha_rgb32_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
237 vo_draw_alpha_rgb32_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
// Compile-time selection when RUNTIME_CPUDETECT is not defined.
239 #else //RUNTIME_CPUDETECT
241 vo_draw_alpha_rgb32_MMX2(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
242 #elif defined (HAVE_3DNOW)
243 vo_draw_alpha_rgb32_3DNow(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
244 #elif defined (HAVE_MMX)
245 vo_draw_alpha_rgb32_MMX(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
246 #elif defined(ARCH_X86)
247 vo_draw_alpha_rgb32_X86(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
249 vo_draw_alpha_rgb32_C(w
, h
, src
, srca
, srcstride
, dstbase
, dststride
);
251 #endif //!RUNTIME_CPUDETECT
// 256-entry lookup tables, compiled only when FAST_OSD_TABLE is defined (the
// define near the top of this file is commented out). They are written in
// vo_draw_alpha_init and read in vo_draw_alpha_rgb15 / vo_draw_alpha_rgb16
// with src[x] as the index.
254 #ifdef FAST_OSD_TABLE
255 static unsigned short fast_osd_15bpp_table
[256];
256 static unsigned short fast_osd_16bpp_table
[256];
// Initialisation routine for the OSD renderers: fills the fast lookup tables
// when FAST_OSD_TABLE is defined, then reports through mp_msg which renderer
// variant is in use. Takes no arguments and returns nothing.
// NOTE(review): several numbered lines are absent from this listing (the
// declaration of and loop over i, the first condition of the if/else chain,
// the else/#else branches, the #endif lines and the closing brace), so the
// control flow shown here is incomplete.
259 void vo_draw_alpha_init(void){
260 #ifdef FAST_OSD_TABLE
// 15bpp entry: i>>3 is written at bit offsets 10, 5 and 0, i.e. the same
// 5-bit value in all three fields. Presumably i runs over 0..255, matching
// the 256-entry table -- the loop itself is not visible here.
263 fast_osd_15bpp_table
[i
]=((i
>>3)<<10)|((i
>>3)<<5)|(i
>>3);
// 16bpp entry: i>>3 at bit offsets 11 and 0, and i>>2 (one more bit) in the
// middle field at offset 5.
264 fast_osd_16bpp_table
[i
]=((i
>>3)<<11)|((i
>>2)<<5)|(i
>>3);
267 //FIXME the optimized stuff is a lie for 15/16bpp as they aren't optimized yet
// The messages below appear to be gated by this verbosity test.
268 if( mp_msg_test(MSGT_OSD
,MSGL_V
) )
270 #ifdef RUNTIME_CPUDETECT
271 #ifdef CAN_COMPILE_X86_ASM
272 // ordered by speed, fastest first
274 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
275 else if(gCpuCaps
.has3DNow
)
276 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
277 else if(gCpuCaps
.hasMMX
)
278 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX Optimized OnScreenDisplay\n");
// The two messages below have no visible guard; presumably they are the final
// else branch and the non-x86 fallback -- confirm.
280 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using X86 Optimized OnScreenDisplay\n");
282 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using Unoptimized OnScreenDisplay\n");
// Compile-time selection when RUNTIME_CPUDETECT is not defined.
284 #else //RUNTIME_CPUDETECT
286 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit MMX2) Optimized OnScreenDisplay\n");
287 #elif defined (HAVE_3DNOW)
288 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX (with tiny bit 3DNow) Optimized OnScreenDisplay\n");
289 #elif defined (HAVE_MMX)
290 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using MMX Optimized OnScreenDisplay\n");
291 #elif defined(ARCH_X86)
292 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using X86 Optimized OnScreenDisplay\n");
294 mp_msg(MSGT_OSD
,MSGL_INFO
,"Using Unoptimized OnScreenDisplay\n");
296 #endif //!RUNTIME_CPUDETECT
// Alpha renderer for 15-bit destinations (three 5-bit fields at bit offsets
// 0, 5 and 10 of an unsigned short). Plain C only; this function has no
// suffixed variants or dispatch.
// Three per-pixel forms appear below: a table lookup, a direct replication of
// src[x]>>3 into all three fields, and a blend of the existing destination
// with src[x] weighted by srca[x].
// NOTE(review): the loop headers over x (and presumably y), the conditionals
// choosing between the three forms, the pointer advances and the closing brace
// are absent from this listing -- confirm against the complete file.
300 void vo_draw_alpha_rgb15(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
// Destination bytes are accessed as 16-bit pixels.
303 register unsigned short *dst
= (unsigned short*) dstbase
;
308 #ifdef FAST_OSD_TABLE
// Table form: the pixel is replaced by the precomputed entry for src[x].
309 dst
[x
]=fast_osd_15bpp_table
[src
[x
]];
// Direct form: src[x]>>3 is copied into all three 5-bit fields.
311 register unsigned int a
=src
[x
]>>3;
312 dst
[x
]=(a
<<10)|(a
<<5)|a
;
// Blend form: unpack the three 5-bit fields of the current pixel.
315 unsigned char r
=dst
[x
]&0x1F;
316 unsigned char g
=(dst
[x
]>>5)&0x1F;
317 unsigned char b
=(dst
[x
]>>10)&0x1F;
// Each field is multiplied by srca[x] and shifted right by 5, src[x] is
// added, and the sum is shifted right by 3.
// NOTE(review): nothing here clamps the result to 5 bits; if
// ((c*srca[x])>>5)+src[x] can exceed 255, the packed value below spills into
// the neighbouring field. Presumably the caller prepares src/srca so that
// this cannot happen -- confirm.
318 r
=(((r
*srca
[x
])>>5)+src
[x
])>>3;
319 g
=(((g
*srca
[x
])>>5)+src
[x
])>>3;
320 b
=(((b
*srca
[x
])>>5)+src
[x
])>>3;
// Repack the three fields into the destination pixel.
321 dst
[x
]=(b
<<10)|(g
<<5)|r
;
// Alpha renderer for 16-bit destinations (5-bit field at bit offset 0, 6-bit
// field at offset 5, 5-bit field at offset 11 of an unsigned short). Plain C
// only; this function has no suffixed variants or dispatch.
// Three per-pixel forms appear below: a table lookup, a direct replication of
// src[x] into all three fields, and a blend of the existing destination with
// src[x] weighted by srca[x].
// NOTE(review): the loop headers over x (and presumably y), the conditionals
// choosing between the three forms, the pointer advances and the closing brace
// are absent from this listing -- confirm against the complete file.
332 void vo_draw_alpha_rgb16(int w
,int h
, unsigned char* src
, unsigned char *srca
, int srcstride
, unsigned char* dstbase
,int dststride
){
// Destination bytes are accessed as 16-bit pixels.
335 register unsigned short *dst
= (unsigned short*) dstbase
;
340 #ifdef FAST_OSD_TABLE
// Table form: the pixel is replaced by the precomputed entry for src[x].
341 dst
[x
]=fast_osd_16bpp_table
[src
[x
]];
// Direct form: src[x]>>3 goes into the two 5-bit fields and src[x]>>2 into
// the 6-bit middle field.
343 dst
[x
]=((src
[x
]>>3)<<11)|((src
[x
]>>2)<<5)|(src
[x
]>>3);
// Blend form: unpack the 5-, 6- and 5-bit fields of the current pixel.
346 unsigned char r
=dst
[x
]&0x1F;
347 unsigned char g
=(dst
[x
]>>5)&0x3F;
348 unsigned char b
=(dst
[x
]>>11)&0x1F;
// Each field is multiplied by srca[x] and shifted right (by 5 for the 5-bit
// fields, by 6 for the 6-bit field), src[x] is added, and the sum is shifted
// right by 3 (or by 2 for the 6-bit field).
// NOTE(review): nothing here clamps the results to their field widths; if the
// sum before the final shift can exceed 255, the packed value below spills
// into the neighbouring field. Presumably the caller prepares src/srca so
// that this cannot happen -- confirm.
349 r
=(((r
*srca
[x
])>>5)+src
[x
])>>3;
350 g
=(((g
*srca
[x
])>>6)+src
[x
])>>2;
351 b
=(((b
*srca
[x
])>>5)+src
[x
])>>3;
// Repack the three fields into the destination pixel.
352 dst
[x
]=(b
<<11)|(g
<<5)|r
;