/*
  aclib - advanced C library ;)
  This file contains functions which improve and expand the standard C library.
*/

#if !HAVE_SSE2
/*
   The P3 processor has only one SSE decoder, so it can execute only 1 SSE insn
   per cpu clock, but it has 3 MMX decoders (including the load/store unit)
   and executes 3 MMX insns per cpu clock.
   The P4 processor has some chances, but after reading:
   http://www.emulators.com/pentium4.htm
   I have doubts. Anyway, an SSE2 version of this code could be written better.
*/
#undef HAVE_SSE
#define HAVE_SSE 0
#endif
/*
 This part of the code was taken from Linux 2.4.3 and slightly modified for
the MMX, MMX2 and SSE instruction sets. I did this because Linux uses
page-aligned blocks, while MPlayer works with weakly ordered data, so the
original sources could not speed it up. Only using PREFETCHNTA and MOVNTQ
together has an effect!

From the IA-32 Intel Architecture Software Developer's Manual, Volume 1,
Order Number 245470:
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
Non-temporal data is written to memory with Write-Combining semantics.

The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store unit when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
But we should use PREFETCHNTA: it fetches non-temporal data into a location
close to the processor, minimizing cache pollution.

The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.

The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly-ordered data and procedures that consume that
data.

If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
*/
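
/*
   For reference, the PREFETCHNTA / MOVNTPS / SFENCE pattern described above
   can also be expressed with SSE intrinsics.  The sketch below is
   illustrative only (it is never compiled and not used by this file); the
   function name is hypothetical, and it assumes dst is 16-byte aligned and
   n is a multiple of 64.
*/
#if 0
#include <stddef.h>
#include <xmmintrin.h>

static void stream_copy_sketch(void *dst, const void *src, size_t n)
{
    float *d = (float *)dst;
    const float *s = (const float *)src;
    size_t off;
    for (off = 0; off < n; off += 64) {
        __m128 a, b, c, e;
        /* prefetch the source ~320 bytes ahead with a non-temporal hint */
        _mm_prefetch((const char *)s + 320, _MM_HINT_NTA);
        a = _mm_loadu_ps(s +  0);
        b = _mm_loadu_ps(s +  4);
        c = _mm_loadu_ps(s +  8);
        e = _mm_loadu_ps(s + 12);
        _mm_stream_ps(d +  0, a);   /* MOVNTPS: non-temporal store */
        _mm_stream_ps(d +  4, b);
        _mm_stream_ps(d +  8, c);
        _mm_stream_ps(d + 12, e);
        s += 16;
        d += 16;
    }
    /* SFENCE: make the weakly-ordered stores globally visible */
    _mm_sfence();
}
#endif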
//  3dnow memcpy support from kernel 2.4.2
//  by Pontscho/fresh!mindworkz

#undef HAVE_ONLY_MMX1
#if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE
/*  means: MMX v.1. Note: Since we added alignment of the destination, it
    speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus
    the standard (non-MMX-optimized) version.
    Note: on K6-2+ it speeds up memory copying by up to 25% and
          on K7 and P3 by about 500% (5 times). */
#define HAVE_ONLY_MMX1
#endif

#undef HAVE_K6_2PLUS
#if !HAVE_MMX2 && HAVE_AMD3DNOW
#define HAVE_K6_2PLUS
#endif

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ volatile(\
        "rep; movsb"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
/* It's the most portable way to notify the compiler */\
/* that edi, esi and ecx are clobbered in the asm block. */\
/* Thanks to A'rpi for the hint!!! */\
        :"0" (to), "1" (from),"2" (n)\
        : "memory");\
}
#undef MMREG_SIZE
#if HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 64 //8
#endif

#undef PREFETCH
#undef EMMS

#if HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif HAVE_AMD3DNOW
#define PREFETCH  "prefetch"
#else
#define PREFETCH " # nop"
#endif
/* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
#if HAVE_AMD3DNOW
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#undef MOVNTQ
#if HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#undef MIN_LEN
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800  /* 2K blocks */
#else
#define MIN_LEN 0x40   /* 64-byte blocks */
#endif
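
/*
   The copy routines below all follow the same shape: copy a small head so
   that the destination becomes aligned, move whole 64-byte blocks in the
   main loop, then copy the remaining tail.  A plain-C sketch of that shape
   (illustrative only, never compiled, without prefetch or non-temporal
   stores; the function name is hypothetical) looks like this:
*/
#if 0
static void *copy_shape_sketch(void *to, const void *from, size_t len)
{
    void *retval = to;
    if (len >= MIN_LEN) {
        size_t delta, i;
        unsigned char *d;
        const unsigned char *s;
        int k;
        /* head: advance to an MMREG_SIZE-aligned destination */
        delta = (unsigned long)to & (MMREG_SIZE - 1);
        if (delta) {
            delta = MMREG_SIZE - delta;
            len -= delta;
            small_memcpy(to, from, delta);   /* also advances to/from */
        }
        /* body: whole 64-byte blocks */
        i = len >> 6;
        len &= 63;
        for (; i > 0; i--) {
            d = (unsigned char *)to;
            s = (const unsigned char *)from;
            /* the real code replaces this byte loop with MMX/SSE
               non-temporal stores */
            for (k = 0; k < 64; k++)
                d[k] = s[k];
            to   = d + 64;
            from = s + 64;
        }
    }
    /* tail: whatever is left (< 64 bytes) */
    if (len)
        small_memcpy(to, from, len);
    return retval;
}
#endif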
static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;
#ifdef STATISTICS
    {
        static int freq[33];
        static int t=0;
        int i;
        for(i=0; len>(1<<i); i++);
        freq[i]++;
        t++;
        if(1024*1024*1024 % t == 0)
            for(i=0; i<32; i++)
                printf("freq < %8d %4d\n", 1<<i, freq[i]);
    }
#endif
#ifndef HAVE_ONLY_MMX1
        /* PREFETCH has an effect even for the MOVSB instruction ;) */
        __asm__ volatile (
                PREFETCH" (%0)\n"
                PREFETCH" 64(%0)\n"
                PREFETCH" 128(%0)\n"
                PREFETCH" 192(%0)\n"
                PREFETCH" 256(%0)\n"
                : : "r" (from) );
#endif
        if(len >= MIN_LEN)
        {
          register unsigned long int delta;
          /* Align destination to MMREG_SIZE boundary */
          delta = ((unsigned long int)to)&(MMREG_SIZE-1);
          if(delta)
          {
            delta=MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
          }
          i = len >> 6; /* len/64 */
          len&=63;
        /*
           This algorithm is most effective when the code reads and writes
           blocks the size of a cache line. The cache line size is
           processor-dependent, but it will be at least 32 bytes on any
           processor. It would also be better for the number of read/write
           instructions to be a multiple of the number of the processor's
           decoders, but that is not always possible.
        */
#if HAVE_SSE /* Only P3 (maybe Cyrix3) */
        if(((unsigned long)from) & 15)
        /* if SRC is misaligned */
        for(; i>0; i--)
        {
                __asm__ volatile (
                PREFETCH" 320(%0)\n"
                "movups (%0), %%xmm0\n"
                "movups 16(%0), %%xmm1\n"
                "movups 32(%0), %%xmm2\n"
                "movups 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *) from)+64;
                to=((unsigned char *)to)+64;
        }
        else
        /*
           Only if SRC is aligned on a 16-byte boundary.
           This allows the use of movaps instead of movups; movaps requires
           the data to be aligned, otherwise a general-protection exception
           (#GP) is generated.
        */
        for(; i>0; i--)
        {
                __asm__ volatile (
                PREFETCH" 320(%0)\n"
                "movaps (%0), %%xmm0\n"
                "movaps 16(%0), %%xmm1\n"
                "movaps 32(%0), %%xmm2\n"
                "movaps 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
#else
        // Align destination at BLOCK_SIZE boundary
        for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
        {
                __asm__ volatile (
#ifndef HAVE_ONLY_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }

//      printf(" %d %d\n", (int)from&1023, (int)to&1023);
        // Pure Assembly cuz gcc is a bit unpredictable ;)
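        /* Block-prefetch scheme: the first inner loop (label 1) touches every
           32 bytes of one BLOCK_SIZE chunk of the source with dummy movl loads
           so that the chunk is pulled into the cache; the second loop (label 2)
           then streams that chunk to the destination with MOVNTQ; the optional
           CONFUSION_FACTOR loop issues a few extra dummy reads; finally the
           outer loop advances both pointers by BLOCK_SIZE and repeats while at
           least BLOCK_SIZE/64 blocks remain.  BLOCK_SIZE and CONFUSION_FACTOR
           are expected to be defined by the file that includes this template. */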
        if(i>=BLOCK_SIZE/64)
                __asm__ volatile(
                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        ASMALIGN(4)
                        "1:                     \n\t"
                                "movl (%0, %%"REG_a"), %%ebx    \n\t"
                                "movl 32(%0, %%"REG_a"), %%ebx  \n\t"
                                "movl 64(%0, %%"REG_a"), %%ebx  \n\t"
                                "movl 96(%0, %%"REG_a"), %%ebx  \n\t"
                                "add $128, %%"REG_a"            \n\t"
                                "cmp %3, %%"REG_a"              \n\t"
                                " jb 1b                         \n\t"

                        "xor %%"REG_a", %%"REG_a"       \n\t"

                        ASMALIGN(4)
                        "2:                     \n\t"
                                "movq (%0, %%"REG_a"), %%mm0\n"
                                "movq 8(%0, %%"REG_a"), %%mm1\n"
                                "movq 16(%0, %%"REG_a"), %%mm2\n"
                                "movq 24(%0, %%"REG_a"), %%mm3\n"
                                "movq 32(%0, %%"REG_a"), %%mm4\n"
                                "movq 40(%0, %%"REG_a"), %%mm5\n"
                                "movq 48(%0, %%"REG_a"), %%mm6\n"
                                "movq 56(%0, %%"REG_a"), %%mm7\n"
                                MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
                                MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
                                MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
                                MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
                                MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
                                MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
                                MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
                                MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
                                "add $64, %%"REG_a"             \n\t"
                                "cmp %3, %%"REG_a"              \n\t"
                                "jb 2b                          \n\t"

#if CONFUSION_FACTOR > 0
                        // a few percent speedup on out-of-order executing CPUs
                        "mov %5, %%"REG_a"              \n\t"
                        "2:                     \n\t"
                                "movl (%0), %%ebx       \n\t"
                                "movl (%0), %%ebx       \n\t"
                                "movl (%0), %%ebx       \n\t"
                                "movl (%0), %%ebx       \n\t"
                                "dec %%"REG_a"          \n\t"
                                " jnz 2b                \n\t"
#endif

                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        "add %3, %0             \n\t"
                        "add %3, %1             \n\t"
                        "sub %4, %2             \n\t"
                        "cmp %4, %2             \n\t"
                        " jae 1b                \n\t"
                        : "+r" (from), "+r" (to), "+r" (i)
                        : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR)
                        : "%"REG_a, "%ebx"
                );

        for(; i>0; i--)
        {
                __asm__ volatile (
#ifndef HAVE_ONLY_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }

#endif /* Have SSE */
#if HAVE_MMX2
                /* since movntq is weakly-ordered, a "sfence"
                 * is needed to become ordered again. */
                __asm__ volatile ("sfence":::"memory");
#endif
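                /* The MMX registers alias the x87 FPU register stack, so
                   EMMS/FEMMS must be issued before any FPU code runs again. */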
#if !HAVE_SSE
                /* enable FPU use again */
                __asm__ volatile (EMMS:::"memory");
#endif
        }
        /*
         *      Now do the tail of the block
         */
        if(len) small_memcpy(to, from, len);
        return retval;
}

/**
 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
 */
static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;
#ifdef STATISTICS
    {
        static int freq[33];
        static int t=0;
        int i;
        for(i=0; len>(1<<i); i++);
        freq[i]++;
        t++;
        if(1024*1024*1024 % t == 0)
            for(i=0; i<32; i++)
                printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
    }
#endif
        if(len >= MIN_LEN)
        {
          register unsigned long int delta;
          /* Align destination to an 8-byte boundary */
          delta = ((unsigned long int)to)&7;
          if(delta)
          {
            delta=8-delta;
            len -= delta;
            small_memcpy(to, from, delta);
          }
          i = len >> 6; /* len/64 */
          len &= 63;
        /*
           This algorithm is most effective when the code reads and writes
           blocks the size of a cache line. The cache line size is
           processor-dependent, but it will be at least 32 bytes on any
           processor. It would also be better for the number of read/write
           instructions to be a multiple of the number of the processor's
           decoders, but that is not always possible.
        */
        for(; i>0; i--)
        {
                __asm__ volatile (
                PREFETCH" 320(%0)\n"
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
#if HAVE_MMX2
                /* since movntq is weakly-ordered, a "sfence"
                 * is needed to become ordered again. */
                __asm__ volatile ("sfence":::"memory");
#endif
                /* enable FPU use again */
                __asm__ volatile (EMMS:::"memory");
        }
        /*
         *      Now do the tail of the block
         */
        if(len) small_memcpy(to, from, len);
        return retval;
}