[dragonfly/netmp.git] test/sysperf/memcpy.S
	/*
	 * memcpy.S
	 *
	 * AMD64: MOVNTQ vs MOVQ, MOVNTDQ vs MOVDQ[A/U], PREFETCH[x]
	 *
	 *	NT stands for 'non-temporal', which basically means
	 *	'bypass L1 cache on write'.  Write bandwidth is
	 *	effectively reduced to the L2 cache bandwidth but
	 *	the L1 cache will not be wiped out by the copy.
	 *
	 *	DO NOT MIX 'nt' and standard writes!  Your performance
	 *	will go poof.
	 *
	 * PREFETCH[NTA,T0,T1,T2]
	 *
	 *	These instructions prefetch a cache line (typically
	 *	128 bytes).  'NT' means 'non-temporal', which bypasses
	 *	the L1 cache if the data is not already in the L1
	 *	cache.  HOWEVER, using PREFETCHNTA can put a slow memory
	 *	op in the cpu's memory request queue if an L1 or L2
	 *	miss occurs, and it can stall an L1-cache-hit access
	 *	for a small but noticeable period of time, so it is
	 *	a good idea not to put a memory op just after a
	 *	prefetchnta instruction.
	 *
	 *	You can get better L2 bandwidth using prefetchnta but
	 *	it will not be much more than prefetcht0, and
	 *	'prefetcht0' will give you better cache-miss
	 *	bandwidth.
	 *
	 *	The prefetch has to be done far enough ahead to do
	 *	some good, but it only has a significant effect when
	 *	it is able to move data from L2 to L1.  Prefetching
	 *	from main memory does not have a significant effect
	 *	during a copy or zeroing operation because main
	 *	memory bandwidth is already saturated.
	 *
	 * $DragonFly: src/test/sysperf/memcpy.S,v 1.1 2004/04/29 16:14:53 dillon Exp $
	 */
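
	/*
	 * Inferred calling convention (the C driver is not part of
	 * this file; this is deduced from the 4/8/12(%esp) argument
	 * loads below):
	 *
	 *	void	docopyN(const void *src, void *dst, size_t bytes);
	 *
	 * The unrolled loops (docopy3..docopy7) subtract their block
	 * size from the count and test for zero, so 'bytes' must be a
	 * nonzero multiple of each routine's block size.
	 */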
	.text
	.globl	docopy1
	.globl	docopy2
	.globl	docopy3
	.globl	docopy4
	.globl	docopy5
	.globl	docopy6
	.globl	docopy7
	.globl	fpcleanup
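
	/*
	 * docopy1: forward copy via the classic 'rep movsl' string
	 * idiom, 4 bytes per iteration.  The byte count in the third
	 * argument is converted to a dword count with 'shrl $2'.
	 */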
	.p2align 4,0x90
docopy1:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	shrl	$2,%ecx
	cld
	rep
	movsl
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
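
	/*
	 * docopy2: reverse copy via 'std' + 'rep movsl'.  Both
	 * pointers are first advanced to the end of their buffers.
	 * Note that as written the start pointers sit one dword past
	 * the final element and the direction flag is left set on
	 * return; as a pure bandwidth test that is presumably
	 * acceptable here.
	 */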
	.p2align 4,0x90
docopy2:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	addl	%ecx,%esi
	addl	%ecx,%edi
	shrl	$2,%ecx
	std
	rep
	movsl
	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
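
	/*
	 * docopy3: 32-bytes-per-iteration integer copy with loads and
	 * stores software-pipelined across three registers, plus a
	 * prefetcht0 96 bytes ahead of the read pointer.  Count
	 * assumed to be a multiple of 32.
	 */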
	.p2align 4,0x90
docopy3:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx

	.p2align 4,0x90
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%edx
	movl	%eax,(%edi)
	movl	12(%esi),%eax
	movl	%ebx,4(%edi)
	movl	16(%esi),%ebx
	movl	%edx,8(%edi)
	movl	20(%esi),%edx
	movl	%eax,12(%edi)
	movl	24(%esi),%eax
	movl	%ebx,16(%edi)
	movl	28(%esi),%ebx
	movl	%edx,20(%edi)
	prefetcht0 96(%esi)
	subl	$32,%ecx
	movl	%eax,24(%edi)
	addl	$32,%esi
	movl	%ebx,28(%edi)
	addl	$32,%edi

	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
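
	/*
	 * docopy4: minimal 8-bytes-per-iteration integer copy with a
	 * prefetcht0 ahead of the read pointer.  Count assumed to be
	 * a multiple of 8.
	 */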
	.p2align 4,0x90
docopy4:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx

	.p2align 4,0x90
1:
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	addl	$8,%esi
	prefetcht0 64(%esi)
	subl	$8,%ecx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	addl	$8,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
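
	/*
	 * docopy5: 64-bytes-per-iteration MMX copy using movq loads
	 * and ordinary (cached) movq stores, with a prefetchnta 128
	 * bytes ahead.  Count assumed to be a multiple of 64.  MMX
	 * state is clobbered; the driver is presumably expected to
	 * call fpcleanup when done.
	 */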
	.p2align 4,0x90
docopy5:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx

	.p2align 4,0x90
1:
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	movq	16(%esi),%mm2
	movq	24(%esi),%mm3
	movq	32(%esi),%mm4
	movq	40(%esi),%mm5
	movq	48(%esi),%mm6
	movq	56(%esi),%mm7
	prefetchnta 128(%esi)
	subl	$64,%ecx
	addl	$64,%esi
	movq	%mm0,(%edi)
	movq	%mm1,8(%edi)
	movq	%mm2,16(%edi)
	movq	%mm3,24(%edi)
	movq	%mm4,32(%edi)
	movq	%mm5,40(%edi)
	movq	%mm6,48(%edi)
	movq	%mm7,56(%edi)
	addl	$64,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
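
	/*
	 * docopy6: 16-bytes-per-iteration MMX copy using movntq
	 * non-temporal stores, which bypass the caches on write (see
	 * the header comment).  Count assumed to be a multiple of 16.
	 */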
	.p2align 4,0x90
docopy6:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	movl	$16,%eax

	.p2align 4,0x90
1:
	prefetcht0 96(%esi)
	subl	%eax,%ecx
	movq	(%esi),%mm0
	movq	8(%esi),%mm1
	addl	%eax,%esi
	movntq	%mm0,(%edi)
	movntq	%mm1,8(%edi)
	addl	%eax,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
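
	/*
	 * docopy7: 128-bytes-per-iteration SSE2 copy using aligned
	 * movdqa loads and movntdq non-temporal stores.  Both buffers
	 * must be 16-byte aligned and the count a multiple of 128.
	 */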
	.p2align 4,0x90
docopy7:
	pushl	%esi
	pushl	%edi
	pushl	%ecx
	pushl	%ebx

	movl	4+16(%esp),%esi
	movl	8+16(%esp),%edi
	movl	12+16(%esp),%ecx
	movl	$128,%eax

	.p2align 4,0x90
1:
	movdqa	(%esi),%xmm0
	movdqa	16(%esi),%xmm1
	movdqa	32(%esi),%xmm2
	movdqa	48(%esi),%xmm3
	movdqa	64(%esi),%xmm4
	movdqa	80(%esi),%xmm5
	movdqa	96(%esi),%xmm6
	movdqa	112(%esi),%xmm7
	subl	%eax,%ecx
	addl	%eax,%esi
	movntdq	%xmm0,(%edi)
	movntdq	%xmm1,16(%edi)
	movntdq	%xmm2,32(%edi)
	movntdq	%xmm3,48(%edi)
	movntdq	%xmm4,64(%edi)
	movntdq	%xmm5,80(%edi)
	movntdq	%xmm6,96(%edi)
	movntdq	%xmm7,112(%edi)
	addl	%eax,%edi
	testl	%ecx,%ecx
	jnz	1b

	popl	%ebx
	popl	%ecx
	popl	%edi
	popl	%esi
	ret
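
	/*
	 * fpcleanup: reinitialize the x87/MMX unit after the MMX and
	 * SSE copies above.  fninit is the no-wait form of finit; a
	 * plain 'emms' would be the usual way to leave MMX mode, but
	 * fninit resets the whole FP state.
	 */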
	.p2align 4,0x90
fpcleanup:
	fninit
	ret