5 * x86_64: MOVNTQ vs MOVQ, MOVNTDQ vs MOVDQ[A/U], PREFETCH[x]
7 * NT stands for 'non-temporal', which basically means
8 * 'bypass L1 cache on write'. Write bandwidth is
9 * effectively reduced to the L2 cache bandwidth but
10 * the L1 cache will not be wiped out by the copy.
12 * DO NOT MIX 'nt' and standard writes! Your performance
15 * PREFETCH[NTA,T0,T1,T2]
17 * These instructions prefetch a cache line (typically
18 * 128 bytes). 'NT' means 'non-temporal', which bypasses
19 * the L1 cache if the data is not already in the L1
20 * cache. HOWEVER, using PREFETCHNTA can put a slow memory
21 * op in the cpu's memory request queue if a L1 or L2
22 * miss occurs, and it can stall an L1-cache-hit access
23 * for a small but noticeable period of time, so it is
24 * a good idea not to put a memory op just after a
25 * prefetchnta instruction.
27 * You can get better L2 bandwidth using prefetchnta but
28 * it will not be much more than prefetcht0 and
29 * 'prefetcht0' will give you better cache-miss
32 * The prefetch has to be done far enough ahead to do
33 * some good, but it only has a significant effect when
34 * it is able to move data from L2 to L1. Prefetching
35 * from main memory does not have a significant effect
36 * during a copy or zeroing operation because main
37 * memory bandwidth is already saturated.
39 * $DragonFly: src/test/sysperf/memcpy.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
102 movl 12+16(%esp),%ecx
145 movl 12+16(%esp),%ecx
175 movl 12+16(%esp),%ecx
187 prefetchnta 128(%esi)
217 movl 12+16(%esp),%ecx
248 movl 12+16(%esp),%ecx
254 movdqa 16(%esi),%xmm1
255 movdqa 32(%esi),%xmm2
256 movdqa 48(%esi),%xmm3
257 movdqa 64(%esi),%xmm4
258 movdqa 80(%esi),%xmm5
259 movdqa 96(%esi),%xmm6
260 movdqa 112(%esi),%xmm7
264 movntdq %xmm1,16(%edi)
265 movntdq %xmm2,32(%edi)
266 movntdq %xmm3,48(%edi)
267 movntdq %xmm4,64(%edi)
268 movntdq %xmm5,80(%edi)
269 movntdq %xmm6,96(%edi)
270 movntdq %xmm7,112(%edi)