2 aclib - advanced C library ;)
3 This file contains functions which improve and expand standard C-library
8 P3 processor has only one SSE decoder so can execute only 1 sse insn per
9 cpu clock, but it has 3 mmx decoders (include load/store unit)
10 and executes 3 mmx insns per cpu clock.
11 P4 processor has some chances, but after reading:
12 http://www.emulators.com/pentium4.htm
13 I have doubts. Anyway SSE2 version of this code can be written better.
21 This part of code was taken by me from Linux-2.4.3 and slightly modified
22 for MMX, MMX2, SSE instruction set. I have done it since linux uses page aligned
23 blocks but mplayer uses weakly ordered data and the original sources cannot
24 speed them up. Only using PREFETCHNTA and MOVNTQ together has an effect!
26 From IA-32 Intel Architecture Software Developer's Manual Volume 1,
29 "10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"
31 Data referenced by a program can be temporal (data will be used again) or
32 non-temporal (data will be referenced once and not reused in the immediate
33 future). To make efficient use of the processor's caches, it is generally
34 desirable to cache temporal data and not cache non-temporal data. Overloading
35 the processor's caches with non-temporal data is sometimes referred to as
36 "polluting the caches".
37 The non-temporal data is written to memory with Write-Combining semantics.
39 The PREFETCHh instruction permits a program to load data into the processor
40 at a suggested cache level, so that it is closer to the processor's load and
41 store unit when it is needed. If the data is already present in a level of
42 the cache hierarchy that is closer to the processor, the PREFETCHh instruction
43 will not result in any data movement.
44 But we should use PREFETCHNTA: it fetches non-temporal data into a location
45 close to the processor, minimizing cache pollution.
47 The MOVNTQ (store quadword using non-temporal hint) instruction stores
48 packed integer data from an MMX register to memory, using a non-temporal hint.
49 The MOVNTPS (store packed single-precision floating-point values using
50 non-temporal hint) instruction stores packed floating-point data from an
51 XMM register to memory, using a non-temporal hint.
53 The SFENCE (Store Fence) instruction controls write ordering by creating a
54 fence for memory store operations. This instruction guarantees that the results
55 of every store instruction that precedes the store fence in program order are
56 globally visible before any store instruction that follows the fence. The
57 SFENCE instruction provides an efficient way of ensuring ordering between
58 procedures that produce weakly-ordered data and procedures that consume that
61 If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
64 // 3dnow memcpy support from kernel 2.4.2
65 // by Pontscho/fresh!mindworkz
69 #if HAVE_MMX && !HAVE_MMX2 && !HAVE_AMD3DNOW && !HAVE_SSE
70 /* means: mmx v.1. Note: Since we added alignment of the destination, this
71 speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus
72 the standard (non-MMX-optimized) version.
73 Note: on K6-2+ it speeds up memory copying by up to 25% and
74 on K7 and P3 by about 500% (5 times). */
75 #define HAVE_ONLY_MMX1
80 #if !HAVE_MMX2 && HAVE_AMD3DNOW
84 /* for small memory blocks (<256 bytes) this version is faster */
/* NOTE(review): the asm instruction itself (a "rep; movsb" in the original
   source) is missing from this excerpt -- only the constraint lists remain.
   The constraints bind to -> %edi ("=&D"/"0"), from -> %esi ("=&S"/"1") and
   n -> %ecx ("=&c"/"2"), the classic movsb setup.  Because `to` and `from`
   are also outputs, the macro writes the advanced pointers back into its
   arguments -- callers must not assume they are left untouched. */
85 #define small_memcpy(to,from,n)\
87 register unsigned long int dummy;\
90 :"=&D"(to), "=&S"(from), "=&c"(dummy)\
91 /* It's most portable way to notify compiler */\
92 /* that edi, esi and ecx are clobbered in asm block. */\
93 /* Thanks to A'rpi for hint!!! */\
94 :"0" (to), "1" (from),"2" (n)\
100 #define MMREG_SIZE 16
102 #define MMREG_SIZE 64 //8
109 #define PREFETCH "prefetchnta"
111 #define PREFETCH "prefetch"
113 #define PREFETCH " # nop"
116 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
125 #define MOVNTQ "movntq"
127 #define MOVNTQ "movq"
131 #ifdef HAVE_ONLY_MMX1
132 #define MIN_LEN 0x800 /* 2K blocks */
134 #define MIN_LEN 0x40 /* 64-byte blocks */
/*
 * RENAME(fast_memcpy) -- CPU-flavour-specific memcpy replacement
 * (flavour selected by the HAVE_MMX/MMX2/AMD3DNOW/SSE template macros).
 * Copies `len` bytes from `from` to `to`; presumably returns `to` like
 * standard memcpy (the return statement is not visible in this excerpt).
 *
 * NOTE(review): this excerpt is missing interior lines -- the original
 * file's line numbers fused onto each line are discontinuous, so asm
 * statement heads, braces and some control flow are absent.  Code below
 * is preserved byte-for-byte; only comments were added.
 */
137 static void * RENAME(fast_memcpy
)(void * to
, const void * from
, size_t len
)
/* NOTE(review): fragment of optional benchmarking code -- buckets `len`
 * into a power-of-two histogram `freq` and periodically prints it
 * (presumably guarded by a statistics #ifdef not visible here). */
147 for(i
=0; len
>(1<<i
); i
++);
150 if(1024*1024*1024 % t
== 0)
152 printf("freq < %8d %4d\n", 1<<i
, freq
[i
]);
155 #ifndef HAVE_ONLY_MMX1
156 /* PREFETCH has effect even for MOVSB instruction ;) */
167 register unsigned long int delta
;
168 /* Align destination to MMREG_SIZE -boundary */
169 delta
= ((unsigned long int)to
)&(MMREG_SIZE
-1);
172 delta
=MMREG_SIZE
-delta
;
/* Copy the first `delta` bytes bytewise so the main loop stores aligned. */
174 small_memcpy(to
, from
, delta
);
176 i
= len
>> 6; /* len/64 */
179 This algorithm is top effective when the code consequently
180 reads and writes blocks which have size of cache line.
181 Size of cache line is processor-dependent.
182 It will, however, be a minimum of 32 bytes on any processors.
183 It would be better to have a number of instructions which
184 perform reading and writing to be multiple to a number of
185 processor's decoders, but it's not always possible.
/* Main copy loops: 64 bytes per iteration.  The SSE path uses
 * movups/movaps loads with movntps non-temporal stores; the MMX path
 * uses movq loads with MOVNTQ stores (movntq where available, a plain
 * movq otherwise -- see the MOVNTQ #define above). */
187 #if HAVE_SSE /* Only P3 (may be Cyrix3) */
188 if(((unsigned long)from
) & 15)
189 /* if SRC is misaligned */
/* Unaligned source: movups tolerates any alignment. */
194 "movups (%0), %%xmm0\n"
195 "movups 16(%0), %%xmm1\n"
196 "movups 32(%0), %%xmm2\n"
197 "movups 48(%0), %%xmm3\n"
198 "movntps %%xmm0, (%1)\n"
199 "movntps %%xmm1, 16(%1)\n"
200 "movntps %%xmm2, 32(%1)\n"
201 "movntps %%xmm3, 48(%1)\n"
202 :: "r" (from
), "r" (to
) : "memory");
203 from
=((const unsigned char *) from
)+64;
204 to
=((unsigned char *)to
)+64;
208 Only if SRC is aligned on 16-byte boundary.
209 It allows using movaps instead of movups; movaps requires the data
210 to be aligned, otherwise a general-protection exception (#GP) is generated.
216 "movaps (%0), %%xmm0\n"
217 "movaps 16(%0), %%xmm1\n"
218 "movaps 32(%0), %%xmm2\n"
219 "movaps 48(%0), %%xmm3\n"
220 "movntps %%xmm0, (%1)\n"
221 "movntps %%xmm1, 16(%1)\n"
222 "movntps %%xmm2, 32(%1)\n"
223 "movntps %%xmm3, 48(%1)\n"
224 :: "r" (from
), "r" (to
) : "memory");
225 from
=((const unsigned char *)from
)+64;
226 to
=((unsigned char *)to
)+64;
229 // Align destination at BLOCK_SIZE boundary
230 for(; ((int)to
& (BLOCK_SIZE
-1)) && i
>0; i
--)
233 #ifndef HAVE_ONLY_MMX1
/* One 64-byte chunk through mm0-mm7 until `to` hits a BLOCK_SIZE boundary. */
237 "movq 8(%0), %%mm1\n"
238 "movq 16(%0), %%mm2\n"
239 "movq 24(%0), %%mm3\n"
240 "movq 32(%0), %%mm4\n"
241 "movq 40(%0), %%mm5\n"
242 "movq 48(%0), %%mm6\n"
243 "movq 56(%0), %%mm7\n"
244 MOVNTQ
" %%mm0, (%1)\n"
245 MOVNTQ
" %%mm1, 8(%1)\n"
246 MOVNTQ
" %%mm2, 16(%1)\n"
247 MOVNTQ
" %%mm3, 24(%1)\n"
248 MOVNTQ
" %%mm4, 32(%1)\n"
249 MOVNTQ
" %%mm5, 40(%1)\n"
250 MOVNTQ
" %%mm6, 48(%1)\n"
251 MOVNTQ
" %%mm7, 56(%1)\n"
252 :: "r" (from
), "r" (to
) : "memory");
253 from
=((const unsigned char *)from
)+64;
254 to
=((unsigned char *)to
)+64;
257 // printf(" %d %d\n", (int)from&1023, (int)to&1023);
258 // Pure Assembly cuz gcc is a bit unpredictable ;)
/* Zero the index register REG_a (eax/rax -- TODO confirm REG_a definition)
 * used by both passes of the block loop below. */
261 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Pass 1: dummy loads into %ebx, one dword per 32-byte cache line,
 * stepping 128 bytes per iteration until %3 (BLOCK_SIZE) is reached --
 * this pre-reads the whole block into the cache before copying. */
264 "movl (%0, %%"REG_a
"), %%ebx \n\t"
265 "movl 32(%0, %%"REG_a
"), %%ebx \n\t"
266 "movl 64(%0, %%"REG_a
"), %%ebx \n\t"
267 "movl 96(%0, %%"REG_a
"), %%ebx \n\t"
268 "add $128, %%"REG_a
" \n\t"
269 "cmp %3, %%"REG_a
" \n\t"
272 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* Pass 2: copy the now-cached block, 64 bytes per iteration through
 * mm0-mm7, storing with MOVNTQ (non-temporal where supported) so the
 * destination bypasses the cache. */
276 "movq (%0, %%"REG_a
"), %%mm0\n"
277 "movq 8(%0, %%"REG_a
"), %%mm1\n"
278 "movq 16(%0, %%"REG_a
"), %%mm2\n"
279 "movq 24(%0, %%"REG_a
"), %%mm3\n"
280 "movq 32(%0, %%"REG_a
"), %%mm4\n"
281 "movq 40(%0, %%"REG_a
"), %%mm5\n"
282 "movq 48(%0, %%"REG_a
"), %%mm6\n"
283 "movq 56(%0, %%"REG_a
"), %%mm7\n"
284 MOVNTQ
" %%mm0, (%1, %%"REG_a
")\n"
285 MOVNTQ
" %%mm1, 8(%1, %%"REG_a
")\n"
286 MOVNTQ
" %%mm2, 16(%1, %%"REG_a
")\n"
287 MOVNTQ
" %%mm3, 24(%1, %%"REG_a
")\n"
288 MOVNTQ
" %%mm4, 32(%1, %%"REG_a
")\n"
289 MOVNTQ
" %%mm5, 40(%1, %%"REG_a
")\n"
290 MOVNTQ
" %%mm6, 48(%1, %%"REG_a
")\n"
291 MOVNTQ
" %%mm7, 56(%1, %%"REG_a
")\n"
292 "add $64, %%"REG_a
" \n\t"
293 "cmp %3, %%"REG_a
" \n\t"
296 #if CONFUSION_FACTOR > 0
297 // a few percent speedup on out of order executing CPUs
298 "mov %5, %%"REG_a
" \n\t"
300 "movl (%0), %%ebx \n\t"
301 "movl (%0), %%ebx \n\t"
302 "movl (%0), %%ebx \n\t"
303 "movl (%0), %%ebx \n\t"
308 "xor %%"REG_a
", %%"REG_a
" \n\t"
/* from/to/i are read-write operands ("+r"): the asm advances them
 * in place.  %3 = BLOCK_SIZE, %4 = BLOCK_SIZE/64, %5 = CONFUSION_FACTOR. */
314 : "+r" (from
), "+r" (to
), "+r" (i
)
315 : "r" ((long)BLOCK_SIZE
), "i" (BLOCK_SIZE
/64), "i" ((long)CONFUSION_FACTOR
)
/* Copy any remaining 64-byte chunks that did not fill a whole
 * BLOCK_SIZE block (same 64-byte MMX copy body as above). */
322 #ifndef HAVE_ONLY_MMX1
326 "movq 8(%0), %%mm1\n"
327 "movq 16(%0), %%mm2\n"
328 "movq 24(%0), %%mm3\n"
329 "movq 32(%0), %%mm4\n"
330 "movq 40(%0), %%mm5\n"
331 "movq 48(%0), %%mm6\n"
332 "movq 56(%0), %%mm7\n"
333 MOVNTQ
" %%mm0, (%1)\n"
334 MOVNTQ
" %%mm1, 8(%1)\n"
335 MOVNTQ
" %%mm2, 16(%1)\n"
336 MOVNTQ
" %%mm3, 24(%1)\n"
337 MOVNTQ
" %%mm4, 32(%1)\n"
338 MOVNTQ
" %%mm5, 40(%1)\n"
339 MOVNTQ
" %%mm6, 48(%1)\n"
340 MOVNTQ
" %%mm7, 56(%1)\n"
341 :: "r" (from
), "r" (to
) : "memory");
342 from
=((const unsigned char *)from
)+64;
343 to
=((unsigned char *)to
)+64;
346 #endif /* Have SSE */
348 /* since movntq is weakly-ordered, a "sfence"
349 * is needed to become ordered again. */
350 __asm__
volatile ("sfence":::"memory");
353 /* EMMS/FEMMS: empty the MMX state so FPU code can run again */
354 __asm__
volatile (EMMS:::"memory");
358 * Now do the tail of the block
/* Copy the remaining len%64 tail bytewise with small_memcpy. */
360 if(len
) small_memcpy(to
, from
, len
);
365 * special copy routine for mem -> agp/pci copy (based upon fast_memcpy)
367 static void * RENAME(mem2agpcpy
)(void * to
, const void * from
, size_t len
)
377 for(i
=0; len
>(1<<i
); i
++);
380 if(1024*1024*1024 % t
== 0)
382 printf("mem2agp freq < %8d %4d\n", 1<<i
, freq
[i
]);
387 register unsigned long int delta
;
388 /* Align destination to MMREG_SIZE -boundary */
389 delta
= ((unsigned long int)to
)&7;
394 small_memcpy(to
, from
, delta
);
396 i
= len
>> 6; /* len/64 */
399 This algorithm is top effective when the code consequently
400 reads and writes blocks which have size of cache line.
401 Size of cache line is processor-dependent.
402 It will, however, be a minimum of 32 bytes on any processors.
403 It would be better to have a number of instructions which
404 perform reading and writing to be multiple to a number of
405 processor's decoders, but it's not always possible.
412 "movq 8(%0), %%mm1\n"
413 "movq 16(%0), %%mm2\n"
414 "movq 24(%0), %%mm3\n"
415 "movq 32(%0), %%mm4\n"
416 "movq 40(%0), %%mm5\n"
417 "movq 48(%0), %%mm6\n"
418 "movq 56(%0), %%mm7\n"
419 MOVNTQ
" %%mm0, (%1)\n"
420 MOVNTQ
" %%mm1, 8(%1)\n"
421 MOVNTQ
" %%mm2, 16(%1)\n"
422 MOVNTQ
" %%mm3, 24(%1)\n"
423 MOVNTQ
" %%mm4, 32(%1)\n"
424 MOVNTQ
" %%mm5, 40(%1)\n"
425 MOVNTQ
" %%mm6, 48(%1)\n"
426 MOVNTQ
" %%mm7, 56(%1)\n"
427 :: "r" (from
), "r" (to
) : "memory");
428 from
=((const unsigned char *)from
)+64;
429 to
=((unsigned char *)to
)+64;
432 /* since movntq is weakly-ordered, a "sfence"
433 * is needed to become ordered again. */
434 __asm__
volatile ("sfence":::"memory");
436 /* enables to use FPU */
437 __asm__
volatile (EMMS:::"memory");
440 * Now do the tail of the block
442 if(len
) small_memcpy(to
, from
, len
);