aclib - advanced C library ;)
This file contains functions which improve and expand the standard C library.
The P3 processor has only one SSE decoder, so it can execute only 1 SSE
instruction per CPU clock, but it has 3 MMX decoders (including the load/store
unit) and executes 3 MMX instructions per CPU clock.
The P4 processor might have a chance, but after reading
http://www.emulators.com/pentium4.htm
I have my doubts. In any case, an SSE2 version of this code could be written better.
This part of the code was taken by me from Linux 2.4.3 and slightly modified for
the MMX, MMX2 and SSE instruction sets. I did this because Linux uses page-aligned
blocks, but MPlayer uses weakly ordered data, and the original sources could not
speed it up. Only using PREFETCHNTA and MOVNTQ together has an effect!
From IA-32 Intel Architecture Software Developer's Manual Volume 1,
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
The non-temporal data is written to memory with Write-Combining semantics.
The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store unit when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
But we should use PREFETCHNTA: it fetches non-temporal data into a location
close to the processor, minimizing cache pollution.
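For illustration only (not part of the original code), the same hint can be written
with the SSE intrinsic _mm_prefetch from <xmmintrin.h>; the helper name and its
parameters below are made up:

#include <stddef.h>
#include <xmmintrin.h>

/* Issue one non-temporal prefetch hint per 64-byte chunk of src. */
static void prefetch_nta_sketch(const char *src, size_t n)
{
    size_t off;
    for (off = 0; off < n; off += 64)
        _mm_prefetch(src + off, _MM_HINT_NTA);
}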
The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.
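As an illustration (again not from the original source), both stores are also
available as intrinsics; the helper and its parameter names are made up:

#include <mmintrin.h>   /* __m64, _mm_empty (EMMS) */
#include <xmmintrin.h>  /* __m128, _mm_stream_pi (MOVNTQ), _mm_stream_ps (MOVNTPS) */

/* vec_dst must be 16-byte aligned for MOVNTPS. */
static void nt_store_sketch(__m64 *qword_dst, __m64 qword,
                            float *vec_dst, __m128 vec)
{
    _mm_stream_pi(qword_dst, qword); /* MOVNTQ: non-temporal 8-byte store    */
    _mm_stream_ps(vec_dst, vec);     /* MOVNTPS: non-temporal 16-byte store  */
    _mm_empty();                     /* EMMS: clear MMX state before FPU use */
}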
The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly-ordered data and procedures that consume that
data.
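As a minimal sketch of how these instructions fit together (illustration only,
assuming 16-byte-aligned pointers, a length that is a multiple of 16 bytes, and
made-up names):

#include <stddef.h>
#include <xmmintrin.h>

static void nt_copy_sketch(void *to, const void *from, size_t len)
{
    float *d = (float *)to;
    const float *s = (const float *)from;
    size_t i;
    for (i = 0; i < len / 16; i++) {
        /* PREFETCHNTA a little ahead; prefetching past the end is harmless. */
        _mm_prefetch((const char *)(s + 4 * i) + 320, _MM_HINT_NTA);
        /* aligned load (MOVAPS) + non-temporal store (MOVNTPS) */
        _mm_stream_ps(d + 4 * i, _mm_load_ps(s + 4 * i));
    }
    _mm_sfence(); /* make the weakly-ordered stores globally visible */
}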
If you have questions, please contact me: Nick Kurshev <nickols_k@mail.ru>.
// 3dnow memcpy support from kernel 2.4.2
// by Pontscho/fresh!mindworkz
#if defined(HAVE_MMX) && !defined(HAVE_MMX2) && !defined(HAVE_3DNOW) && !defined(HAVE_SSE)
/* means: MMX v.1. Note: since we added alignment of the destination, it speeds
   up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
   standard (non-MMX-optimized) version.
   Note: on K6-2+ it speeds up memory copying by up to 25% and
   on K7 and P3 by about 500% (5 times). */
#define HAVE_ONLY_MMX1
#if !defined( HAVE_MMX2) && defined( HAVE_3DNOW)
/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register unsigned long int dummy;\
__asm__ __volatile__(\
    "rep; movsb"\
    :"=&D"(to), "=&S"(from), "=&c"(dummy)\
/* It's the most portable way to notify the compiler */\
/* that edi, esi and ecx are clobbered in the asm block. */\
/* Thanks to A'rpi for the hint!!! */\
    :"0" (to), "1" (from), "2" (n)\
    : "memory");\
}
#define MMREG_SIZE 64 //8
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH "/nop"
#endif
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define MOVNTQ "movntq"
#define MOVNTQ "movq"
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
static void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
    for(i=0; len>(1<<i); i++);
    if(1024*1024*1024 % t == 0)
        printf("freq < %8d %4d\n", 1<<i, freq[i]);
#ifndef HAVE_ONLY_MMX1
    /* PREFETCH has an effect even for the MOVSB instruction ;) */
    __asm__ __volatile__ (
    register unsigned long int delta;
    /* Align destination to MMREG_SIZE boundary */
    delta = ((unsigned long int)to)&(MMREG_SIZE-1);
    if(delta)
    {
        delta = MMREG_SIZE - delta;
        small_memcpy(to, from, delta);
        len -= delta;
    }
    i = len >> 6; /* len/64 */
    /*
       This algorithm is most effective when the code sequentially
       reads and writes blocks the size of a cache line.
       The cache line size is processor-dependent.
       It will, however, be a minimum of 32 bytes on any processor.
       It would be better for the number of load and store instructions
       to be a multiple of the number of the processor's decoders,
       but that is not always possible.
    */
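    /* Illustrative sketch, not part of the original code: the non-SSE (MMX/3DNow)
       path further below copies BLOCK_SIZE bytes per outer iteration in two passes,
       roughly equivalent to this plain C (assuming BLOCK_SIZE is a multiple of 64):

         while (i >= BLOCK_SIZE/64) {
             const char *s = from; char *d = to; size_t off;
             // pass 1: read one dword per 32 bytes so the whole block lands in the cache
             for (off = 0; off < BLOCK_SIZE; off += 32)
                 (void)*(volatile const int *)(s + off);
             // pass 2: stream the block back out 64 bytes at a time, bypassing the cache
             for (off = 0; off < BLOCK_SIZE; off += 64)
                 memcpy(d + off, s + off, 64);   // done below with movq loads + MOVNTQ stores
             from = s + BLOCK_SIZE; to = d + BLOCK_SIZE; i -= BLOCK_SIZE/64;
         }
    */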
#ifdef HAVE_SSE /* Only P3 (may be Cyrix3) */
    if(((unsigned long)from) & 15)
    /* if SRC is misaligned */
    __asm__ __volatile__ (
    "movups (%0), %%xmm0\n"
    "movups 16(%0), %%xmm1\n"
    "movups 32(%0), %%xmm2\n"
    "movups 48(%0), %%xmm3\n"
    "movntps %%xmm0, (%1)\n"
    "movntps %%xmm1, 16(%1)\n"
    "movntps %%xmm2, 32(%1)\n"
    "movntps %%xmm3, 48(%1)\n"
    :: "r" (from), "r" (to) : "memory");
    from=((const unsigned char *) from)+64;
    to=((unsigned char *)to)+64;
    /*
       Only if SRC is aligned on a 16-byte boundary.
       This allows the use of movaps instead of movups, which requires the data
       to be aligned, otherwise a general-protection exception (#GP) is generated.
    */
    __asm__ __volatile__ (
    "movaps (%0), %%xmm0\n"
    "movaps 16(%0), %%xmm1\n"
    "movaps 32(%0), %%xmm2\n"
    "movaps 48(%0), %%xmm3\n"
    "movntps %%xmm0, (%1)\n"
    "movntps %%xmm1, 16(%1)\n"
    "movntps %%xmm2, 32(%1)\n"
    "movntps %%xmm3, 48(%1)\n"
    :: "r" (from), "r" (to) : "memory");
    from=((const unsigned char *)from)+64;
    to=((unsigned char *)to)+64;
    // Align destination at BLOCK_SIZE boundary
    for(; ((int)to & (BLOCK_SIZE-1)) && i>0; i--)
    __asm__ __volatile__ (
#ifndef HAVE_ONLY_MMX1
    PREFETCH" 320(%0)\n"
#endif
    "movq (%0), %%mm0\n"
    "movq 8(%0), %%mm1\n"
237 "movq 16(%0), %%mm2\n"
238 "movq 24(%0), %%mm3\n"
239 "movq 32(%0), %%mm4\n"
240 "movq 40(%0), %%mm5\n"
241 "movq 48(%0), %%mm6\n"
242 "movq 56(%0), %%mm7\n"
243 MOVNTQ
" %%mm0, (%1)\n"
244 MOVNTQ
" %%mm1, 8(%1)\n"
245 MOVNTQ
" %%mm2, 16(%1)\n"
246 MOVNTQ
" %%mm3, 24(%1)\n"
247 MOVNTQ
" %%mm4, 32(%1)\n"
248 MOVNTQ
" %%mm5, 40(%1)\n"
249 MOVNTQ
" %%mm6, 48(%1)\n"
250 MOVNTQ
" %%mm7, 56(%1)\n"
251 :: "r" (from
), "r" (to
) : "memory");
252 from
=((const unsigned char *)from
)+64;
253 to
=((unsigned char *)to
)+64;
    // printf(" %d %d\n", (int)from&1023, (int)to&1023);
    // Pure Assembly cuz gcc is a bit unpredictable ;)
260 "xor %%"REG_a
", %%"REG_a
" \n\t"
263 "movl (%0, %%"REG_a
"), %%ebx \n\t"
264 "movl 32(%0, %%"REG_a
"), %%ebx \n\t"
265 "movl 64(%0, %%"REG_a
"), %%ebx \n\t"
266 "movl 96(%0, %%"REG_a
"), %%ebx \n\t"
267 "add $128, %%"REG_a
" \n\t"
268 "cmp %3, %%"REG_a
" \n\t"
271 "xor %%"REG_a
", %%"REG_a
" \n\t"
275 "movq (%0, %%"REG_a
"), %%mm0\n"
276 "movq 8(%0, %%"REG_a
"), %%mm1\n"
277 "movq 16(%0, %%"REG_a
"), %%mm2\n"
278 "movq 24(%0, %%"REG_a
"), %%mm3\n"
279 "movq 32(%0, %%"REG_a
"), %%mm4\n"
280 "movq 40(%0, %%"REG_a
"), %%mm5\n"
281 "movq 48(%0, %%"REG_a
"), %%mm6\n"
282 "movq 56(%0, %%"REG_a
"), %%mm7\n"
        MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
        MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
        MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
        MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
        MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
        MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
        MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
        MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
291 "add $64, %%"REG_a
" \n\t"
292 "cmp %3, %%"REG_a
" \n\t"
#if CONFUSION_FACTOR > 0
        // a few percent speedup on out of order executing CPUs
        "mov %5, %%"REG_a"          \n\t"
        "2:                         \n\t"
        "movl (%0), %%ebx           \n\t"
        "movl (%0), %%ebx           \n\t"
        "movl (%0), %%ebx           \n\t"
        "movl (%0), %%ebx           \n\t"
        "dec %%"REG_a"              \n\t"
        " jnz 2b                    \n\t"
#endif
307 "xor %%"REG_a
", %%"REG_a
" \n\t"
313 : "+r" (from
), "+r" (to
), "+r" (i
)
314 : "r" ((long)BLOCK_SIZE
), "i" (BLOCK_SIZE
/64), "i" ((long)CONFUSION_FACTOR
)
    __asm__ __volatile__ (
#ifndef HAVE_ONLY_MMX1
    PREFETCH" 320(%0)\n"
#endif
    "movq (%0), %%mm0\n"
    "movq 8(%0), %%mm1\n"
326 "movq 16(%0), %%mm2\n"
327 "movq 24(%0), %%mm3\n"
328 "movq 32(%0), %%mm4\n"
329 "movq 40(%0), %%mm5\n"
330 "movq 48(%0), %%mm6\n"
331 "movq 56(%0), %%mm7\n"
332 MOVNTQ
" %%mm0, (%1)\n"
333 MOVNTQ
" %%mm1, 8(%1)\n"
334 MOVNTQ
" %%mm2, 16(%1)\n"
335 MOVNTQ
" %%mm3, 24(%1)\n"
336 MOVNTQ
" %%mm4, 32(%1)\n"
337 MOVNTQ
" %%mm5, 40(%1)\n"
338 MOVNTQ
" %%mm6, 48(%1)\n"
339 MOVNTQ
" %%mm7, 56(%1)\n"
340 :: "r" (from
), "r" (to
) : "memory");
341 from
=((const unsigned char *)from
)+64;
342 to
=((unsigned char *)to
)+64;
#endif /* Have SSE */
    /* since movntq is weakly-ordered, a "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
    /* enables the FPU to be used again */
    __asm__ __volatile__ (EMMS:::"memory");
    /* Now do the tail of the block */
    if(len) small_memcpy(to, from, len);
/* special copy routine for mem -> agp/pci copy (based upon fast_memcpy) */
static void * RENAME(mem2agpcpy)(void * to, const void * from, size_t len)
    for(i=0; len>(1<<i); i++);
    if(1024*1024*1024 % t == 0)
        printf("mem2agp freq < %8d %4d\n", 1<<i, freq[i]);
    register unsigned long int delta;
    /* Align destination to MMREG_SIZE boundary */
    delta = ((unsigned long int)to)&7;
    if(delta)
    {
        delta = 8 - delta;
        small_memcpy(to, from, delta);
        len -= delta;
    }
    i = len >> 6; /* len/64 */
    /*
       This algorithm is most effective when the code sequentially
       reads and writes blocks the size of a cache line.
       The cache line size is processor-dependent.
       It will, however, be a minimum of 32 bytes on any processor.
       It would be better for the number of load and store instructions
       to be a multiple of the number of the processor's decoders,
       but that is not always possible.
    */
    __asm__ __volatile__ (
    PREFETCH" 320(%0)\n"
    "movq (%0), %%mm0\n"
    "movq 8(%0), %%mm1\n"
    "movq 16(%0), %%mm2\n"
    "movq 24(%0), %%mm3\n"
    "movq 32(%0), %%mm4\n"
    "movq 40(%0), %%mm5\n"
    "movq 48(%0), %%mm6\n"
    "movq 56(%0), %%mm7\n"
    MOVNTQ" %%mm0, (%1)\n"
    MOVNTQ" %%mm1, 8(%1)\n"
    MOVNTQ" %%mm2, 16(%1)\n"
    MOVNTQ" %%mm3, 24(%1)\n"
    MOVNTQ" %%mm4, 32(%1)\n"
    MOVNTQ" %%mm5, 40(%1)\n"
    MOVNTQ" %%mm6, 48(%1)\n"
    MOVNTQ" %%mm7, 56(%1)\n"
    :: "r" (from), "r" (to) : "memory");
    from=((const unsigned char *)from)+64;
    to=((unsigned char *)to)+64;
    /* since movntq is weakly-ordered, a "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
    /* enables the FPU to be used again */
    __asm__ __volatile__ (EMMS:::"memory");
    /* Now do the tail of the block */
    if(len) small_memcpy(to, from, len);