/*
  aclib - advanced C library ;)
  This file contains functions which improve and expand the standard C library.
*/
#ifndef HAVE_SSE2
/*
   The P3 processor has only one SSE decoder, so it can execute only 1 SSE
   instruction per CPU clock, but it has 3 MMX decoders (including the
   load/store unit) and can execute 3 MMX instructions per CPU clock.
   The P4 has some chances, but after reading
   http://www.emulators.com/pentium4.htm
   I have doubts. In any case, an SSE2 version of this code could be written
   better.
*/
#undef HAVE_SSE
#endif
/*
 This part of the code was taken by me from Linux-2.4.3 and slightly modified
 for the MMX, MMX2 and SSE instruction sets. I have done it since Linux uses
 page-aligned blocks, whereas MPlayer uses weakly ordered data, so the original
 sources could not speed it up. Only using PREFETCHNTA and MOVNTQ together has
 an effect!

 From the IA-32 Intel Architecture Software Developer's Manual Volume 1,
 Order Number 245470:
 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

 Data referenced by a program can be temporal (data will be used again) or
 non-temporal (data will be referenced once and not reused in the immediate
 future). To make efficient use of the processor's caches, it is generally
 desirable to cache temporal data and not cache non-temporal data. Overloading
 the processor's caches with non-temporal data is sometimes referred to as
 "polluting the caches".
 The non-temporal data is written to memory with Write-Combining semantics.

 The PREFETCHh instructions permit a program to load data into the processor
 at a suggested cache level, so that it is closer to the processor's load and
 store unit when it is needed. If the data is already present in a level of
 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
 will not result in any data movement.
 But we should use PREFETCHNTA: it fetches non-temporal data into a location
 close to the processor, minimizing cache pollution.

 The MOVNTQ (store quadword using non-temporal hint) instruction stores
 packed integer data from an MMX register to memory, using a non-temporal hint.
 The MOVNTPS (store packed single-precision floating-point values using
 non-temporal hint) instruction stores packed floating-point data from an
 XMM register to memory, using a non-temporal hint.

 The SFENCE (Store Fence) instruction controls write ordering by creating a
 fence for memory store operations. This instruction guarantees that the results
 of every store instruction that precedes the store fence in program order are
 globally visible before any store instruction that follows the fence. The
 SFENCE instruction provides an efficient way of ensuring ordering between
 procedures that produce weakly ordered data and procedures that consume that
 data.

 If you have questions, please contact me: Nick Kurshev: nickols_k@mail.ru.
*/
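/*
 Added illustrative sketch (not from the original source, and excluded from
 the build): the same "prefetch + non-temporal store + store fence" pattern
 expressed with SSE2 compiler intrinsics instead of inline assembly. It
 assumes SSE2 is available, that the destination is 16-byte aligned, and that
 the length is a multiple of 64; the function name is hypothetical.
*/
#if 0
#include <stddef.h>     /* size_t */
#include <xmmintrin.h>  /* _mm_prefetch, _MM_HINT_NTA, _mm_sfence */
#include <emmintrin.h>  /* _mm_loadu_si128, _mm_stream_si128 */

static void nt_copy_sketch(void *dst, const void *src, size_t n)
{
    unsigned char *d = dst;
    const unsigned char *s = src;
    size_t i;
    for (i = 0; i < n; i += 64) {
        /* hint: pull data ahead of the loop without polluting the caches */
        _mm_prefetch((const char *)(s + i) + 320, _MM_HINT_NTA);
        /* unaligned loads, cache-bypassing (non-temporal) stores */
        _mm_stream_si128((__m128i *)(d + i),      _mm_loadu_si128((const __m128i *)(s + i)));
        _mm_stream_si128((__m128i *)(d + i + 16), _mm_loadu_si128((const __m128i *)(s + i + 16)));
        _mm_stream_si128((__m128i *)(d + i + 32), _mm_loadu_si128((const __m128i *)(s + i + 32)));
        _mm_stream_si128((__m128i *)(d + i + 48), _mm_loadu_si128((const __m128i *)(s + i + 48)));
    }
    /* make the weakly ordered stores globally visible before returning */
    _mm_sfence();
}
#endif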
//  3dnow memcpy support from kernel 2.4.2
//  by Pontscho/fresh!mindworkz
#undef HAVE_ONLY_MMX1
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
/*  means: MMX v.1. Note: since we added alignment of the destination, it speeds
    up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
    standard (non-MMX-optimized) version.
    Note: on K6-2+ it speeds up memory copying by up to 25% and
          on K7 and P3 by about 500% (5 times). */
#define HAVE_ONLY_MMX1
#endif
#undef HAVE_K6_2PLUS
#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
#define HAVE_K6_2PLUS
#endif
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
    "rep; movsb"\
    :"=&D"(to), "=&S"(from), "=&c"(dummy)\
/* It's the most portable way to notify the compiler */\
/* that edi, esi and ecx are clobbered in the asm block. */\
/* Thanks to A'rpi for the hint!!! */\
    :"0" (to), "1" (from),"2" (n)\
    : "memory");\
}
#undef MMREG_SIZE
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 64 //8
#endif
#undef PREFETCH
#undef EMMS

#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH  "prefetch"
#else
#define PREFETCH " # nop"
#endif

/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#ifdef HAVE_3DNOW
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#undef MIN_LEN
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800  /* 2K blocks */
#else
#define MIN_LEN 0x40  /* 64-byte blocks */
#endif
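/*
 Added note: with, for example, HAVE_MMX2 defined and HAVE_SSE undefined, the
 selections above resolve to PREFETCH = "prefetchnta", MOVNTQ = "movntq",
 EMMS = "emms" ("femms" if HAVE_3DNOW is also defined), MMREG_SIZE = 64 and
 MIN_LEN = 0x40, i.e. the loops below prefetch with a non-temporal hint and
 store with cache-bypassing quadword writes.
*/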
static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
{
        void *retval;
        size_t i;
        retval = to;
#ifdef STATISTICS
        {
                static int freq[33];
                static int t=0;
                int i;
                for(i=0; len>(1<<i); i++);
                freq[i]++;
                t++;
                if(1024*1024*1024 % t == 0)
                        for(i=0; i<32; i++)
                                printf("freq < %8d %4d\n", 1<<i, freq[i]);
        }
#endif
#ifndef HAVE_ONLY_MMX1
        /* PREFETCH has an effect even for the MOVSB instruction ;) */
        __asm__ __volatile__ (
                PREFETCH" (%0)\n"
                PREFETCH" 64(%0)\n"
                PREFETCH" 128(%0)\n"
                PREFETCH" 192(%0)\n"
                PREFETCH" 256(%0)\n"
                : : "r" (from) );
#endif
        if(len >= MIN_LEN)
        {
          register unsigned long int delta;
          /* Align the destination to an MMREG_SIZE boundary */
          delta = ((unsigned long int)to)&(MMREG_SIZE-1);
          if(delta)
          {
            delta=MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
          }
          i = len >> 6; /* len/64 */
          len&=63;
        /*
           This algorithm is most effective when the code sequentially
           reads and writes blocks which have the size of a cache line.
           The size of a cache line is processor-dependent.
           It will, however, be a minimum of 32 bytes on any processor.
           It would be better if the number of read and write instructions
           were a multiple of the number of the processor's decoders,
           but that is not always possible.
        */
#ifdef HAVE_SSE /* Only P3 (maybe Cyrix3) */
        if(((unsigned long)from) & 15)
        /* if SRC is misaligned */
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movups (%0), %%xmm0\n"
                "movups 16(%0), %%xmm1\n"
                "movups 32(%0), %%xmm2\n"
                "movups 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *) from)+64;
                to=((unsigned char *)to)+64;
        }
        else
        /*
           Only if SRC is aligned on a 16-byte boundary.
           This allows using movaps instead of movups; movaps requires the data
           to be aligned, otherwise a general-protection exception (#GP) is
           generated.
        */
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movaps (%0), %%xmm0\n"
                "movaps 16(%0), %%xmm1\n"
                "movaps 32(%0), %%xmm2\n"
                "movaps 48(%0), %%xmm3\n"
                "movntps %%xmm0, (%1)\n"
                "movntps %%xmm1, 16(%1)\n"
                "movntps %%xmm2, 32(%1)\n"
                "movntps %%xmm3, 48(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
#else
        // Align destination at BLOCK_SIZE boundary
        for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
        {
                __asm__ __volatile__ (
#ifndef HAVE_ONLY_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
//      printf(" %d %d\n", (int)from&1023, (int)to&1023);
        // Pure Assembly cuz gcc is a bit unpredictable ;)
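        /*
         Added explanatory note: the assembly block below processes the
         remaining data in BLOCK_SIZE chunks using two phases per chunk.
         Loop 1 reads one 32-bit word from every 32 bytes of the next
         BLOCK_SIZE bytes into %ebx (the values are discarded); these normal
         cached loads pull the whole chunk into the cache ahead of the copy.
         Loop 2 then copies the chunk 64 bytes at a time with MOVNTQ, i.e.
         with cache-bypassing stores. Operands: %0 = from, %1 = to,
         %2 = i (all updated), %3 = BLOCK_SIZE, %4 = BLOCK_SIZE/64,
         %5 = CONFUSION_FACTOR.
        */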
        if(i>=BLOCK_SIZE/64)
                asm volatile(
                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        ASMALIGN(4)
                        "1:                     \n\t"
                                "movl (%0, %%"REG_a"), %%ebx    \n\t"
                                "movl 32(%0, %%"REG_a"), %%ebx  \n\t"
                                "movl 64(%0, %%"REG_a"), %%ebx  \n\t"
                                "movl 96(%0, %%"REG_a"), %%ebx  \n\t"
                                "add $128, %%"REG_a"            \n\t"
                                "cmp %3, %%"REG_a"              \n\t"
                                " jb 1b                         \n\t"

                        "xor %%"REG_a", %%"REG_a"       \n\t"

                        ASMALIGN(4)
                        "2:                     \n\t"
                        "movq (%0, %%"REG_a"), %%mm0\n"
                        "movq 8(%0, %%"REG_a"), %%mm1\n"
                        "movq 16(%0, %%"REG_a"), %%mm2\n"
                        "movq 24(%0, %%"REG_a"), %%mm3\n"
                        "movq 32(%0, %%"REG_a"), %%mm4\n"
                        "movq 40(%0, %%"REG_a"), %%mm5\n"
                        "movq 48(%0, %%"REG_a"), %%mm6\n"
                        "movq 56(%0, %%"REG_a"), %%mm7\n"
                        MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
                        MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
                        MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
                        "add $64, %%"REG_a"             \n\t"
                        "cmp %3, %%"REG_a"              \n\t"
                        "jb 2b                          \n\t"

#if CONFUSION_FACTOR > 0
                        // a few percent speedup on out-of-order executing CPUs
                        "mov %5, %%"REG_a"              \n\t"
                        "2:                     \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "movl (%0), %%ebx       \n\t"
                        "dec %%"REG_a"          \n\t"
                        " jnz 2b                \n\t"
#endif

                        "xor %%"REG_a", %%"REG_a"       \n\t"
                        "add %3, %0             \n\t"
                        "add %3, %1             \n\t"
                        "sub %4, %2             \n\t"
                        "cmp %4, %2             \n\t"
                        " jae 1b                \n\t"
                        : "+r" (from), "+r" (to), "+r" (i)
                        : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR)
                        : "%"REG_a, "%ebx"
                );
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
#ifndef HAVE_ONLY_MMX1
                PREFETCH" 320(%0)\n"
#endif
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }

#endif /* Have SSE */
#ifdef HAVE_MMX2
                /* since movntq is weakly ordered, an "sfence"
                 * is needed to become ordered again. */
                __asm__ __volatile__ ("sfence":::"memory");
#endif
#ifndef HAVE_SSE
                /* enable the use of the FPU again */
                __asm__ __volatile__ (EMMS:::"memory");
#endif
        }
        /*
         *      Now do the tail of the block
         */
        if(len) small_memcpy(to, from, len);
        return retval;
}
/**
 * Special copy routine for mem -> agp/pci copies (based upon fast_memcpy).
 */
static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
{
        void *retval;
        size_t i;
        retval = to;
#ifdef STATISTICS
        {
                static int freq[33];
                static int t=0;
                int i;
                for(i=0; len>(1<<i); i++);
                freq[i]++;
                t++;
                if(1024*1024*1024 % t == 0)
                        for(i=0; i<32; i++)
                                printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
        }
#endif
        if(len >= MIN_LEN)
        {
          register unsigned long int delta;
          /* Align the destination to an 8-byte boundary */
          delta = ((unsigned long int)to)&7;
          if(delta)
          {
            delta=8-delta;
            len -= delta;
            small_memcpy(to, from, delta);
          }
          i = len >> 6; /* len/64 */
          len &= 63;
        /*
           This algorithm is most effective when the code sequentially
           reads and writes blocks which have the size of a cache line.
           The size of a cache line is processor-dependent.
           It will, however, be a minimum of 32 bytes on any processor.
           It would be better if the number of read and write instructions
           were a multiple of the number of the processor's decoders,
           but that is not always possible.
        */
        for(; i>0; i--)
        {
                __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
                from=((const unsigned char *)from)+64;
                to=((unsigned char *)to)+64;
        }
#ifdef HAVE_MMX2
                /* since movntq is weakly ordered, an "sfence"
                 * is needed to become ordered again. */
                __asm__ __volatile__ ("sfence":::"memory");
#endif
                /* enable the use of the FPU again */
                __asm__ __volatile__ (EMMS:::"memory");
        }
        /*
         *      Now do the tail of the block
         */
        if(len) small_memcpy(to, from, len);
        return retval;
}
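/*
 Added usage sketch (not part of the original file): this template is meant to
 be #included with RENAME() and the HAVE_* feature macros defined by the
 including file; the exact macro values and the variant name below are
 assumptions for illustration only.
*/
#if 0
#define HAVE_MMX
#define HAVE_MMX2
#define RENAME(a) a ## _MMX2
#include "aclib_template.c"

/* ...later, pick the generated variant at runtime, e.g.: */
void *(*mem_cpy)(void *, const void *, size_t) = fast_memcpy_MMX2;
#endif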