/*
 * Copyright (C) 2019-2020, Advanced Micro Devices, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdlib.h>

#undef memcpy

void * mymemcpy(void *dest, const void *src, size_t size)
{
	asm goto (
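	/* Copy the C arguments into the registers the hand-written routine */
	/* expects (src in %rsi, dest in %rdi, size in %rdx) and keep dest */
	/* in %rax, mirroring memcpy's convention of returning dest. */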
28 "movq %0, %%rsi\n\t"
29 "movq %1, %%rdi\n\t"
30 "movq %2, %%rdx\n\t"
31 "movq %%rdi, %%rax\n\t"
32 "cmp $32, %%rdx\n\t"
33 "jb less_vec\n\t"
34 "cmp $(32 * 2), %%rdx\n\t"
35 "ja more_2x_vec\n\t"
36 "vmovdqu (%%rsi), %%ymm0\n\t"
37 "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
38 "vmovdqu %%ymm0, (%%rdi)\n\t"
39 "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
40 "vzeroupper\n\t"
41 "jmp %l[done]\n\t"
42 "less_vec:\n\t"
43 /* Less than 1 VEC. */
44 "cmpb $32, %%dl\n\t"
45 "jae between_32_63\n\t"
46 "cmpb $16, %%dl\n\t"
47 "jae between_16_31\n\t"
48 "cmpb $8, %%dl\n\t"
49 "jae between_8_15\n\t"
50 "cmpb $4, %%dl\n\t"
51 "jae between_4_7\n\t"
52 "cmpb $1, %%dl\n\t"
53 "ja between_2_3\n\t"
54 "jb 1f\n\t"
55 "movzbl (%%rsi), %%ecx\n\t"
56 "movb %%cl, (%%rdi)\n\t"
57 "1:\n\t"
58 "jmp %l[done]\n\t"
59 "between_32_63:\n\t"
60 /* From 32 to 63. No branch when size == 32. */
61 "vmovdqu (%%rsi), %%ymm0\n\t"
62 "vmovdqu -32(%%rsi,%%rdx), %%ymm1\n\t"
63 "vmovdqu %%ymm0, (%%rdi)\n\t"
64 "vmovdqu %%ymm1, -32(%%rdi,%%rdx)\n\t"
65 "vzeroupper\n\t"
66 "jmp %l[done]\n\t"
67 /* From 16 to 31. No branch when size == 16. */
68 "between_16_31:\n\t"
69 "vmovdqu (%%rsi), %%xmm0\n\t"
70 "vmovdqu -16(%%rsi,%%rdx), %%xmm1\n\t"
71 "vmovdqu %%xmm0, (%%rdi)\n\t"
72 "vmovdqu %%xmm1, -16(%%rdi,%%rdx)\n\t"
73 "jmp %l[done]\n\t"
74 "between_8_15:\n\t"
75 /* From 8 to 15. No branch when size == 8. */
76 "movq -8(%%rsi,%%rdx), %%rcx\n\t"
77 "movq (%%rsi), %%rsi\n\t"
78 "movq %%rcx, -8(%%rdi,%%rdx)\n\t"
79 "movq %%rsi, (%%rdi)\n\t"
80 "jmp %l[done]\n\t"
81 "between_4_7:\n\t"
82 /* From 4 to 7. No branch when size == 4. */
83 "movl -4(%%rsi,%%rdx), %%ecx\n\t"
84 "movl (%%rsi), %%esi\n\t"
85 "movl %%ecx, -4(%%rdi,%%rdx)\n\t"
86 "movl %%esi, (%%rdi)\n\t"
87 "jmp %l[done]\n\t"
88 "between_2_3:\n\t"
89 /* From 2 to 3. No branch when size == 2. */
90 "movzwl -2(%%rsi,%%rdx), %%ecx\n\t"
91 "movzwl (%%rsi), %%esi\n\t"
92 "movw %%cx, -2(%%rdi,%%rdx)\n\t"
93 "movw %%si, (%%rdi)\n\t"
94 "jmp %l[done]\n\t"
95 "more_2x_vec:\n\t"
96 /* More than 2 * VEC and there may be overlap between destination */
97 /* and source. */
98 "cmpq $(32 * 8), %%rdx\n\t"
99 "ja more_8x_vec\n\t"
100 "cmpq $(32 * 4), %%rdx\n\t"
101 "jb last_4x_vec\n\t"
	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
	"vmovdqu (%%rsi), %%ymm0\n\t"
	"vmovdqu 32(%%rsi), %%ymm1\n\t"
	"vmovdqu (32 * 2)(%%rsi), %%ymm2\n\t"
	"vmovdqu (32 * 3)(%%rsi), %%ymm3\n\t"
	"vmovdqu -32(%%rsi,%%rdx), %%ymm4\n\t"
	"vmovdqu -(32 * 2)(%%rsi,%%rdx), %%ymm5\n\t"
	"vmovdqu -(32 * 3)(%%rsi,%%rdx), %%ymm6\n\t"
	"vmovdqu -(32 * 4)(%%rsi,%%rdx), %%ymm7\n\t"
	"vmovdqu %%ymm0, (%%rdi)\n\t"
	"vmovdqu %%ymm1, 32(%%rdi)\n\t"
	"vmovdqu %%ymm2, (32 * 2)(%%rdi)\n\t"
	"vmovdqu %%ymm3, (32 * 3)(%%rdi)\n\t"
	"vmovdqu %%ymm4, -32(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm5, -(32 * 2)(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm6, -(32 * 3)(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm7, -(32 * 4)(%%rdi,%%rdx)\n\t"
	"vzeroupper\n\t"
	"jmp %l[done]\n\t"
	"last_4x_vec:\n\t"
	/* Copy from 2 * VEC to 4 * VEC. */
	"vmovdqu (%%rsi), %%ymm0\n\t"
	"vmovdqu 32(%%rsi), %%ymm1\n\t"
	"vmovdqu -32(%%rsi,%%rdx), %%ymm2\n\t"
	"vmovdqu -(32 * 2)(%%rsi,%%rdx), %%ymm3\n\t"
	"vmovdqu %%ymm0, (%%rdi)\n\t"
	"vmovdqu %%ymm1, 32(%%rdi)\n\t"
	"vmovdqu %%ymm2, -32(%%rdi,%%rdx)\n\t"
	"vmovdqu %%ymm3, -(32 * 2)(%%rdi,%%rdx)\n\t"
	"vzeroupper\n\t"
	"nop:\n\t"
	"jmp %l[done]\n\t"
	"more_8x_vec:\n\t"
	"cmpq %%rsi, %%rdi\n\t"
	"ja more_8x_vec_backward\n\t"
	/* Source == destination is less common. */
	"je nop\n\t"
	/* Load the first VEC and last 4 * VEC to support overlapping addresses. */
	"vmovdqu (%%rsi), %%ymm4\n\t"
	"vmovdqu -32(%%rsi, %%rdx), %%ymm5\n\t"
	"vmovdqu -(32 * 2)(%%rsi, %%rdx), %%ymm6\n\t"
	"vmovdqu -(32 * 3)(%%rsi, %%rdx), %%ymm7\n\t"
	"vmovdqu -(32 * 4)(%%rsi, %%rdx), %%ymm8\n\t"
	/* Save start and stop of the destination buffer. */
	"movq %%rdi, %%r11\n\t"
	"leaq -32(%%rdi, %%rdx), %%rcx\n\t"
	/* Align destination for aligned stores in the loop. Compute */
	/* how much destination is misaligned. */
	"movq %%rdi, %%r8\n\t"
	"andq $(32 - 1), %%r8\n\t"
	/* Get the negative of offset for alignment. */
	"subq $32, %%r8\n\t"
	/* Adjust source. */
	"subq %%r8, %%rsi\n\t"
	/* Adjust destination which should be aligned now. */
	"subq %%r8, %%rdi\n\t"
	/* Adjust length. */
	"addq %%r8, %%rdx\n\t"
	/* Check non-temporal store threshold. */
	"cmpq $(1024*1024), %%rdx\n\t"
	"ja large_forward\n\t"
163 "loop_4x_vec_forward:\n\t"
164 /* Copy 4 * VEC a time forward. */
165 "vmovdqu (%%rsi), %%ymm0\n\t"
166 "vmovdqu 32(%%rsi), %%ymm1\n\t"
167 "vmovdqu (32 * 2)(%%rsi), %%ymm2\n\t"
168 "vmovdqu (32 * 3)(%%rsi), %%ymm3\n\t"
169 "addq $(32 * 4), %%rsi\n\t"
170 "subq $(32 * 4), %%rdx\n\t"
171 "vmovdqa %%ymm0, (%%rdi)\n\t"
172 "vmovdqa %%ymm1, 32(%%rdi)\n\t"
173 "vmovdqa %%ymm2, (32 * 2)(%%rdi)\n\t"
174 "vmovdqa %%ymm3, (32 * 3)(%%rdi)\n\t"
175 "addq $(32 * 4), %%rdi\n\t"
176 "cmpq $(32 * 4), %%rdx\n\t"
177 "ja loop_4x_vec_forward\n\t"
178 /* Store the last 4 * VEC. */
179 "vmovdqu %%ymm5, (%%rcx)\n\t"
180 "vmovdqu %%ymm6, -32(%%rcx)\n\t"
181 "vmovdqu %%ymm7, -(32 * 2)(%%rcx)\n\t"
182 "vmovdqu %%ymm8, -(32 * 3)(%%rcx)\n\t"
183 /* Store the first VEC. */
184 "vmovdqu %%ymm4, (%%r11)\n\t"
185 "vzeroupper\n\t"
186 "jmp %l[done]\n\t"
187 "more_8x_vec_backward:\n\t"
188 /* Load the first 4*VEC and last VEC to support overlapping addresses.*/
189 "vmovdqu (%%rsi), %%ymm4\n\t"
190 "vmovdqu 32(%%rsi), %%ymm5\n\t"
191 "vmovdqu (32 * 2)(%%rsi), %%ymm6\n\t"
192 "vmovdqu (32 * 3)(%%rsi), %%ymm7\n\t"
193 "vmovdqu -32(%%rsi,%%rdx), %%ymm8\n\t"
194 /* Save stop of the destination buffer. */
195 "leaq -32(%%rdi, %%rdx), %%r11\n\t"
196 /* Align destination end for aligned stores in the loop. Compute */
197 /* how much destination end is misaligned. */
198 "leaq -32(%%rsi, %%rdx), %%rcx\n\t"
199 "movq %%r11, %%r9\n\t"
200 "movq %%r11, %%r8\n\t"
201 "andq $(32 - 1), %%r8\n\t"
202 /* Adjust source. */
203 "subq %%r8, %%rcx\n\t"
204 /* Adjust the end of destination which should be aligned now. */
205 "subq %%r8, %%r9\n\t"
206 /* Adjust length. */
207 "subq %%r8, %%rdx\n\t"
208 /* Check non-temporal store threshold. */
209 "cmpq $(1024*1024), %%rdx\n\t"
210 "ja large_backward\n\t"
211 "loop_4x_vec_backward:\n\t"
212 /* Copy 4 * VEC a time backward. */
213 "vmovdqu (%%rcx), %%ymm0\n\t"
214 "vmovdqu -32(%%rcx), %%ymm1\n\t"
215 "vmovdqu -(32 * 2)(%%rcx), %%ymm2\n\t"
216 "vmovdqu -(32 * 3)(%%rcx), %%ymm3\n\t"
217 "subq $(32 * 4), %%rcx\n\t"
218 "subq $(32 * 4), %%rdx\n\t"
219 "vmovdqa %%ymm0, (%%r9)\n\t"
220 "vmovdqa %%ymm1, -32(%%r9)\n\t"
221 "vmovdqa %%ymm2, -(32 * 2)(%%r9)\n\t"
222 "vmovdqa %%ymm3, -(32 * 3)(%%r9)\n\t"
223 "subq $(32 * 4), %%r9\n\t"
224 "cmpq $(32 * 4), %%rdx\n\t"
225 "ja loop_4x_vec_backward\n\t"
226 /* Store the first 4 * VEC. */
227 "vmovdqu %%ymm4, (%%rdi)\n\t"
228 "vmovdqu %%ymm5, 32(%%rdi)\n\t"
229 "vmovdqu %%ymm6, (32 * 2)(%%rdi)\n\t"
230 "vmovdqu %%ymm7, (32 * 3)(%%rdi)\n\t"
231 /* Store the last VEC. */
232 "vmovdqu %%ymm8, (%%r11)\n\t"
233 "vzeroupper\n\t"
234 "jmp %l[done]\n\t"
236 "large_forward:\n\t"
237 /* Don't use non-temporal store if there is overlap between */
238 /* destination and source since destination may be in cache */
239 /* when source is loaded. */
240 "leaq (%%rdi, %%rdx), %%r10\n\t"
241 "cmpq %%r10, %%rsi\n\t"
242 "jb loop_4x_vec_forward\n\t"
243 "loop_large_forward:\n\t"
244 /* Copy 4 * VEC a time forward with non-temporal stores. */
245 "prefetcht0 (32*4*2)(%%rsi)\n\t"
246 "prefetcht0 (32*4*2 + 64)(%%rsi)\n\t"
247 "prefetcht0 (32*4*3)(%%rsi)\n\t"
248 "prefetcht0 (32*4*3 + 64)(%%rsi)\n\t"
249 "vmovdqu (%%rsi), %%ymm0\n\t"
250 "vmovdqu 32(%%rsi), %%ymm1\n\t"
251 "vmovdqu (32 * 2)(%%rsi), %%ymm2\n\t"
252 "vmovdqu (32 * 3)(%%rsi), %%ymm3\n\t"
253 "addq $(32*4), %%rsi\n\t"
254 "subq $(32*4), %%rdx\n\t"
255 "vmovntdq %%ymm0, (%%rdi)\n\t"
256 "vmovntdq %%ymm1, 32(%%rdi)\n\t"
257 "vmovntdq %%ymm2, (32 * 2)(%%rdi)\n\t"
258 "vmovntdq %%ymm3, (32 * 3)(%%rdi)\n\t"
259 "addq $(32*4), %%rdi\n\t"
260 "cmpq $(32*4), %%rdx\n\t"
261 "ja loop_large_forward\n\t"
262 "sfence\n\t"
	/* Store the last 4 * VEC. */
	"vmovdqu %%ymm5, (%%rcx)\n\t"
	"vmovdqu %%ymm6, -32(%%rcx)\n\t"
	"vmovdqu %%ymm7, -(32 * 2)(%%rcx)\n\t"
	"vmovdqu %%ymm8, -(32 * 3)(%%rcx)\n\t"
	/* Store the first VEC. */
	"vmovdqu %%ymm4, (%%r11)\n\t"
	"vzeroupper\n\t"
	"jmp %l[done]\n\t"
	"large_backward:\n\t"
	/* Don't use non-temporal store if there is overlap between */
	/* destination and source since destination may be in cache */
	/* when source is loaded. */
	"leaq (%%rcx, %%rdx), %%r10\n\t"
	"cmpq %%r10, %%r9\n\t"
	"jb loop_4x_vec_backward\n\t"
	"loop_large_backward:\n\t"
	/* Copy 4 * VEC at a time backward with non-temporal stores. */
	"prefetcht0 (-32 * 4 * 2)(%%rcx)\n\t"
	"prefetcht0 (-32 * 4 * 2 - 64)(%%rcx)\n\t"
	"prefetcht0 (-32 * 4 * 3)(%%rcx)\n\t"
	"prefetcht0 (-32 * 4 * 3 - 64)(%%rcx)\n\t"
	"vmovdqu (%%rcx), %%ymm0\n\t"
	"vmovdqu -32(%%rcx), %%ymm1\n\t"
	"vmovdqu -(32 * 2)(%%rcx), %%ymm2\n\t"
	"vmovdqu -(32 * 3)(%%rcx), %%ymm3\n\t"
	"subq $(32*4), %%rcx\n\t"
	"subq $(32*4), %%rdx\n\t"
	"vmovntdq %%ymm0, (%%r9)\n\t"
	"vmovntdq %%ymm1, -32(%%r9)\n\t"
	"vmovntdq %%ymm2, -(32 * 2)(%%r9)\n\t"
	"vmovntdq %%ymm3, -(32 * 3)(%%r9)\n\t"
	"subq $(32 * 4), %%r9\n\t"
	"cmpq $(32 * 4), %%rdx\n\t"
	"ja loop_large_backward\n\t"
	"sfence\n\t"
	/* Store the first 4 * VEC. */
	"vmovdqu %%ymm4, (%%rdi)\n\t"
	"vmovdqu %%ymm5, 32(%%rdi)\n\t"
	"vmovdqu %%ymm6, (32 * 2)(%%rdi)\n\t"
	"vmovdqu %%ymm7, (32 * 3)(%%rdi)\n\t"
	/* Store the last VEC. */
	"vmovdqu %%ymm8, (%%r11)\n\t"
	"vzeroupper\n\t"
	"jmp %l[done]"
309 : "r"(src), "r"(dest), "r"(size)
310 : "rax", "rcx", "rdx", "rdi", "rsi", "r8", "r9", "r10", "r11", "r12", "ymm0",
311 "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7", "ymm8", "memory"
312 : done
done:
	return dest;
}
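
/*
 * A minimal, hypothetical self-check (not part of the original test harness):
 * it assumes an AVX2-capable CPU and simply compares mymemcpy against the
 * libc memcpy for a handful of sizes. Build with something like
 *   cc -mavx2 -DSTANDALONE_TEST -o amd-test amd.c
 */
#ifdef STANDALONE_TEST
#include <stdio.h>
#include <string.h>

int main(void)
{
	static unsigned char src[8192], got[8192], want[8192];
	size_t sizes[] = { 0, 1, 3, 7, 15, 31, 32, 63, 100, 257, 4096, 8192 };
	int fail = 0;

	/* Fill the source with a simple non-repeating pattern. */
	for (size_t i = 0; i < sizeof src; i++)
		src[i] = (unsigned char)(i * 131 + 7);

	for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++) {
		memset(got, 0xaa, sizeof got);
		memset(want, 0xaa, sizeof want);
		mymemcpy(got, src, sizes[i]);
		memcpy(want, src, sizes[i]);
		/* Compare whole buffers so out-of-bounds writes also show up. */
		if (memcmp(got, want, sizeof got)) {
			printf("size %zu: MISMATCH\n", sizes[i]);
			fail = 1;
		}
	}
	puts(fail ? "FAIL" : "all sizes ok");
	return fail;
}
#endif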