2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
/* NOTE(review): this excerpt is missing many original lines and each
   visible line carries a leftover line number from extraction -- the
   numbers are artifacts, not code.  What is visible appears to be the
   preamble of glibc's AVX unaligned memcpy/mempcpy/memmove multiarch
   implementation; confirm against the complete file.  */
/* Tail of the preprocessor guard selecting when this implementation is
   built (the opening "#if" line is not visible in this excerpt).  */
23 || defined USE_AS_MEMMOVE \
24 || !defined USE_MULTIARCH)
/* Project assembler-syntax helpers (macro/comment conventions).  */
26 #include "asm-syntax.h"
/* Entry-point names for the AVX unaligned variants.  The *_CHK names
   are the fortified (__*_chk) aliases that take an extra destination
   buffer-size argument.  */
28 # define MEMCPY __memcpy_avx_unaligned
29 # define MEMCPY_CHK __memcpy_chk_avx_unaligned
30 # define MEMPCPY __mempcpy_avx_unaligned
31 # define MEMPCPY_CHK __mempcpy_chk_avx_unaligned
/* Allocatable + executable code section dedicated to the AVX build.  */
34 .section .text.avx,"ax",@progbits
/* NOTE(review): the ENTRY macros, size comparisons, branch targets and
   returns between these fragments are missing from this excerpt; the
   leading number on each line is an extraction artifact.  Comments
   describe only the visible instructions.  */
35 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
/* Fortified (__chk) entry guard: jump to __chk_fail when the supplied
   destination buffer size is below the copy length.  The cmp feeding
   this jb is among the missing lines.  */
38 jb HIDDEN_JUMPTARGET (__chk_fail)
48 #if !defined USE_AS_BCOPY
51 jb HIDDEN_JUMPTARGET (__chk_fail)
/* Medium copy (presumably 129..256 bytes -- the size dispatch is not
   visible): load the first 0x80 bytes (xmm1..xmm7 visible; the xmm0
   load is among the missing lines) and the last 0x80 bytes relative to
   the end pointer rcx = src + len, THEN store everything.  Performing
   all loads before any store keeps the copy correct for overlapping
   buffers.  */
68 lea (%rsi, %rdx), %rcx
69 vmovdqu 0x10(%rsi), %xmm1
70 vmovdqu 0x20(%rsi), %xmm2
71 vmovdqu 0x30(%rsi), %xmm3
72 vmovdqu 0x40(%rsi), %xmm4
73 vmovdqu 0x50(%rsi), %xmm5
74 vmovdqu 0x60(%rsi), %xmm6
75 vmovdqu 0x70(%rsi), %xmm7
76 vmovdqu -0x80(%rcx), %xmm8
77 vmovdqu -0x70(%rcx), %xmm9
78 vmovdqu -0x60(%rcx), %xmm10
79 vmovdqu -0x50(%rcx), %xmm11
80 vmovdqu -0x40(%rcx), %xmm12
81 vmovdqu -0x30(%rcx), %xmm13
82 vmovdqu -0x20(%rcx), %xmm14
83 vmovdqu -0x10(%rcx), %xmm15
/* rdx now becomes the destination end pointer (dst + len), so the
   tail stores below can use negative displacements from it.  */
84 lea (%rdi, %rdx), %rdx
86 vmovdqu %xmm1, 0x10(%rdi)
87 vmovdqu %xmm2, 0x20(%rdi)
88 vmovdqu %xmm3, 0x30(%rdi)
89 vmovdqu %xmm4, 0x40(%rdi)
90 vmovdqu %xmm5, 0x50(%rdi)
91 vmovdqu %xmm6, 0x60(%rdi)
92 vmovdqu %xmm7, 0x70(%rdi)
93 vmovdqu %xmm8, -0x80(%rdx)
94 vmovdqu %xmm9, -0x70(%rdx)
95 vmovdqu %xmm10, -0x60(%rdx)
96 vmovdqu %xmm11, -0x50(%rdx)
97 vmovdqu %xmm12, -0x40(%rdx)
98 vmovdqu %xmm13, -0x30(%rdx)
99 vmovdqu %xmm14, -0x20(%rdx)
100 vmovdqu %xmm15, -0x10(%rdx)
/* Same head+tail technique for a smaller size class (first 0x40 and
   last 0x40 bytes, so presumably 65..128 bytes).  */
106 vmovdqu (%rsi), %xmm0
107 lea (%rsi, %rdx), %rcx
108 vmovdqu 0x10(%rsi), %xmm1
109 vmovdqu 0x20(%rsi), %xmm2
110 lea (%rdi, %rdx), %rdx
111 vmovdqu 0x30(%rsi), %xmm3
112 vmovdqu -0x40(%rcx), %xmm4
113 vmovdqu -0x30(%rcx), %xmm5
114 vmovdqu -0x20(%rcx), %xmm6
115 vmovdqu -0x10(%rcx), %xmm7
116 vmovdqu %xmm0, (%rdi)
117 vmovdqu %xmm1, 0x10(%rdi)
118 vmovdqu %xmm2, 0x20(%rdi)
119 vmovdqu %xmm3, 0x30(%rdi)
120 vmovdqu %xmm4, -0x40(%rdx)
121 vmovdqu %xmm5, -0x30(%rdx)
122 vmovdqu %xmm6, -0x20(%rdx)
123 vmovdqu %xmm7, -0x10(%rdx)
/* First 0x20 and last 0x20 bytes (presumably 33..64 bytes).  */
130 vmovdqu (%rsi), %xmm0
131 vmovdqu 0x10(%rsi), %xmm1
132 vmovdqu -0x20(%rsi, %rdx), %xmm6
133 vmovdqu -0x10(%rsi, %rdx), %xmm7
134 vmovdqu %xmm0, (%rdi)
135 vmovdqu %xmm1, 0x10(%rdi)
136 vmovdqu %xmm6, -0x20(%rdi, %rdx)
137 vmovdqu %xmm7, -0x10(%rdi, %rdx)
/* First and last 0x10 bytes (presumably 17..32 bytes).  */
142 vmovdqu (%rsi), %xmm0
143 vmovdqu -0x10(%rsi, %rdx), %xmm7
144 vmovdqu %xmm0, (%rdi)
145 vmovdqu %xmm7, -0x10(%rdi, %rdx)
/* Scalar tails: last qword / dword / word copied via rcx.  The
   matching first-qword/dword/word loads and stores are among the
   missing lines.  */
152 movq -0x08(%rsi, %rdx), %rcx
155 movq %rcx, -0x08(%rdi, %rdx)
162 mov -0x04(%rsi, %rdx), %ecx
165 mov %ecx, -0x04(%rdi, %rdx)
171 mov -0x02(%rsi, %rdx), %cx
174 mov %cx, -0x02(%rdi, %rdx)
/* Forward path for large copies.  NOTE(review): the memmove overlap
   dispatch, loop counters and several label definitions are among the
   lines missing from this excerpt; leading numbers are extraction
   artifacts.  */
186 #ifdef USE_AS_MEMMOVE
/* Copies at or above some threshold (cmp not visible) take the
   movsb/streaming path below.  */
193 jae L(gobble_data_movsb)
/* Preload the LAST 0x80 bytes (xmm5..xmm12, via rcx = src + len) and
   the FIRST 0x20 bytes (ymm4) before entering the main loop, so the
   head/tail can be stored correctly after the loop -- this keeps the
   copy safe for overlapping buffers.  */
195 lea (%rsi, %rdx), %rcx
197 vmovdqu -0x80(%rcx), %xmm5
198 vmovdqu -0x70(%rcx), %xmm6
202 vmovdqu -0x60(%rcx), %xmm7
203 vmovdqu -0x50(%rcx), %xmm8
206 vmovdqu -0x40(%rcx), %xmm9
207 vmovdqu -0x30(%rcx), %xmm10
209 vmovdqu -0x20(%rcx), %xmm11
210 vmovdqu -0x10(%rcx), %xmm12
211 vmovdqu (%rsi), %ymm4
/* 128-bytes-per-iteration loop: unaligned 32-byte loads, ALIGNED
   stores (vmovdqa) -- the destination has presumably been 32-byte
   aligned by code not visible here; confirm in the full file.  */
215 vmovdqu (%rsi), %ymm0
216 vmovdqu 0x20(%rsi), %ymm1
217 vmovdqu 0x40(%rsi), %ymm2
218 vmovdqu 0x60(%rsi), %ymm3
220 vmovdqa %ymm0, (%rdi)
221 vmovdqa %ymm1, 0x20(%rdi)
222 vmovdqa %ymm2, 0x40(%rdi)
223 vmovdqa %ymm3, 0x60(%rdi)
/* NOTE(review): the target is spelled "goble_128_loop" (single b).
   Its label definition is not visible here; this matches only if the
   definition uses the same spelling -- the upstream glibc file does.  */
226 jae L(goble_128_loop)
/* Flush the saved head (r10 presumably holds the head store address
   -- not visible here) and the saved 0x80-byte tail; rdx here is
   dst + len.  */
229 vmovdqu %ymm4, (%r10)
231 vmovdqu %xmm5, -0x80(%rdx)
232 vmovdqu %xmm6, -0x70(%rdx)
233 vmovdqu %xmm7, -0x60(%rdx)
234 vmovdqu %xmm8, -0x50(%rdx)
235 vmovdqu %xmm9, -0x40(%rdx)
236 vmovdqu %xmm10, -0x30(%rdx)
237 vmovdqu %xmm11, -0x20(%rdx)
238 vmovdqu %xmm12, -0x10(%rdx)
/* Huge-copy dispatch: threshold is half the shared (last-level)
   cache, either a compile-time constant or the value probed at
   startup.  */
243 L(gobble_data_movsb):
244 #ifdef SHARED_CACHE_SIZE_HALF
245 mov $SHARED_CACHE_SIZE_HALF, %rcx
247 mov __x86_shared_cache_size_half(%rip), %rcx
251 jae L(gobble_big_data_fwd)
/* Streaming forward path: again preload head (ymm4) and 0x80-byte
   tail (xmm5..xmm12) before the loop.  -0x80(%rsi,%rdx) is the same
   address as -0x80(%rcx).  */
257 L(gobble_big_data_fwd):
258 lea (%rsi, %rdx), %rcx
259 vmovdqu (%rsi), %ymm4
260 vmovdqu -0x80(%rsi,%rdx), %xmm5
261 vmovdqu -0x70(%rcx), %xmm6
262 vmovdqu -0x60(%rcx), %xmm7
263 vmovdqu -0x50(%rcx), %xmm8
264 vmovdqu -0x40(%rcx), %xmm9
265 vmovdqu -0x30(%rcx), %xmm10
266 vmovdqu -0x20(%rcx), %xmm11
267 vmovdqu -0x10(%rcx), %xmm12
/* rcx becomes the destination end pointer for the tail stores.  */
275 lea (%rdi, %rdx), %rcx
/* 128 B/iteration streaming loop: prefetch the source well ahead with
   prefetchnta and store with non-temporal vmovntdq so a huge copy does
   not evict the whole cache.  */
277 L(gobble_mem_fwd_loop):
278 prefetchnta 0x1c0(%rsi)
279 prefetchnta 0x280(%rsi)
280 vmovdqu (%rsi), %ymm0
281 vmovdqu 0x20(%rsi), %ymm1
282 vmovdqu 0x40(%rsi), %ymm2
283 vmovdqu 0x60(%rsi), %ymm3
285 vmovntdq %ymm0, (%rdi)
286 vmovntdq %ymm1, 0x20(%rdi)
287 vmovntdq %ymm2, 0x40(%rdi)
288 vmovntdq %ymm3, 0x60(%rdi)
/* Pointer updates and the loop-bound cmp are among the missing lines.
   NOTE(review): an sfence after the vmovntdq loop is expected but not
   visible in this excerpt -- confirm in the full file.  */
291 jb L(gobble_mem_fwd_loop)
/* Store the saved tail relative to rcx = dst + len.  */
295 vmovdqu %xmm5, -0x80(%rcx)
296 vmovdqu %xmm6, -0x70(%rcx)
297 vmovdqu %xmm7, -0x60(%rcx)
298 vmovdqu %xmm8, -0x50(%rcx)
299 vmovdqu %xmm9, -0x40(%rcx)
300 vmovdqu %xmm10, -0x30(%rcx)
301 vmovdqu %xmm11, -0x20(%rcx)
302 vmovdqu %xmm12, -0x10(%rcx)
/* Backward path for large overlapping copies (dst above src), copying
   from the end of the buffers toward the start.  NOTE(review): the
   pointer setup moving rsi/rdi to the buffer ends, the loop counters,
   and rax's initialization are among the lines missing from this
   excerpt; leading numbers are extraction artifacts.  */
305 #ifdef USE_AS_MEMMOVE
/* Threshold: half the shared (last-level) cache size, compile-time or
   runtime value.  */
308 #ifdef SHARED_CACHE_SIZE_HALF
309 mov $SHARED_CACHE_SIZE_HALF, %rcx
311 mov __x86_shared_cache_size_half(%rip), %rcx
/* Preload the FIRST 0x80 bytes of the source (xmm5..xmm12) before the
   backward loop can overwrite them; they are flushed at the end via
   rax, which presumably holds the original destination (the memcpy
   return value) -- not visible here, confirm in the full file.  */
314 vmovdqu (%rsi), %xmm5
315 vmovdqu 0x10(%rsi), %xmm6
317 vmovdqu 0x20(%rsi), %xmm7
318 vmovdqu 0x30(%rsi), %xmm8
/* r10 = rdi - 0x20: target address for the saved ymm4 chunk below
   (rdi here presumably already points at the destination end).  */
319 lea -0x20(%rdi), %r10
321 vmovdqu 0x40(%rsi), %xmm9
322 vmovdqu 0x50(%rsi), %xmm10
324 vmovdqu 0x60(%rsi), %xmm11
325 vmovdqu 0x70(%rsi), %xmm12
/* Save the last 0x20 source bytes (rsi presumably points at the
   source end here).  */
328 vmovdqu -0x20(%rsi), %ymm4
/* Copies larger than the LLC threshold use the streaming loop.  */
332 ja L(gobble_big_data_bwd)
/* Cache-friendly backward loop, 128 B per iteration: unaligned loads,
   aligned vmovdqa stores (destination presumably 32-byte aligned by
   code not visible here), walking rsi/rdi down by 0x80 each pass.  */
334 L(gobble_mem_bwd_llc):
335 vmovdqu -0x20(%rsi), %ymm0
336 vmovdqu -0x40(%rsi), %ymm1
337 vmovdqu -0x60(%rsi), %ymm2
338 vmovdqu -0x80(%rsi), %ymm3
339 lea -0x80(%rsi), %rsi
340 vmovdqa %ymm0, -0x20(%rdi)
341 vmovdqa %ymm1, -0x40(%rdi)
342 vmovdqa %ymm2, -0x60(%rdi)
343 vmovdqa %ymm3, -0x80(%rdi)
344 lea -0x80(%rdi), %rdi
/* Loop-bound cmp is among the missing lines.  */
346 jb L(gobble_mem_bwd_llc)
/* Flush the saved tail chunk and the saved 0x80-byte head (rax
   presumably = original dst).  */
347 vmovdqu %ymm4, (%r10)
349 vmovdqu %xmm5, (%rax)
350 vmovdqu %xmm6, 0x10(%rax)
351 vmovdqu %xmm7, 0x20(%rax)
352 vmovdqu %xmm8, 0x30(%rax)
353 vmovdqu %xmm9, 0x40(%rax)
354 vmovdqu %xmm10, 0x50(%rax)
355 vmovdqu %xmm11, 0x60(%rax)
356 vmovdqu %xmm12, 0x70(%rax)
/* Streaming backward loop for copies larger than half the LLC:
   prefetchnta behind the read cursor, non-temporal vmovntdq stores to
   avoid cache pollution.  Same head/tail flush afterwards.
   NOTE(review): an sfence after the vmovntdq loop is expected but not
   visible in this excerpt -- confirm in the full file.  */
360 L(gobble_big_data_bwd):
362 L(gobble_mem_bwd_loop):
363 prefetchnta -0x1c0(%rsi)
364 prefetchnta -0x280(%rsi)
365 vmovdqu -0x20(%rsi), %ymm0
366 vmovdqu -0x40(%rsi), %ymm1
367 vmovdqu -0x60(%rsi), %ymm2
368 vmovdqu -0x80(%rsi), %ymm3
369 lea -0x80(%rsi), %rsi
370 vmovntdq %ymm0, -0x20(%rdi)
371 vmovntdq %ymm1, -0x40(%rdi)
372 vmovntdq %ymm2, -0x60(%rdi)
373 vmovntdq %ymm3, -0x80(%rdi)
374 lea -0x80(%rdi), %rdi
376 jb L(gobble_mem_bwd_loop)
378 vmovdqu %ymm4, (%r10)
380 vmovdqu %xmm5, (%rax)
381 vmovdqu %xmm6, 0x10(%rax)
382 vmovdqu %xmm7, 0x20(%rax)
383 vmovdqu %xmm8, 0x30(%rax)
384 vmovdqu %xmm9, 0x40(%rax)
385 vmovdqu %xmm10, 0x50(%rax)
386 vmovdqu %xmm11, 0x60(%rax)
387 vmovdqu %xmm12, 0x70(%rax)