   Copyright (C) 2014-2015 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
/* Continuation of the build guard; the opening "#if" line(s) are not
   visible in this chunk -- confirm against the full file.  */
	|| defined USE_AS_MEMMOVE \
	|| !defined USE_MULTIARCH)

#include "asm-syntax.h"

/* Names for the multiarch variant of memcpy and its fortify wrapper.  */
# define MEMCPY		__memcpy_avx_unaligned
# define MEMCPY_CHK	__memcpy_chk_avx_unaligned

	.section .text.avx,"ax",@progbits
#if !defined USE_AS_BCOPY
	/* __memcpy_chk fragment: branch to __chk_fail when the object size
	   check fails.  The ENTRY (MEMCPY_CHK) marker and the preceding
	   cmp are on lines missing from this chunk -- presumably it
	   compares the destination buffer size against the copy length;
	   verify against the full file.  */
	jb	HIDDEN_JUMPTARGET (__chk_fail)
52 lea (%rsi, %rdx), %rcx
53 vmovdqu 0x10(%rsi), %xmm1
54 vmovdqu 0x20(%rsi), %xmm2
55 vmovdqu 0x30(%rsi), %xmm3
56 vmovdqu 0x40(%rsi), %xmm4
57 vmovdqu 0x50(%rsi), %xmm5
58 vmovdqu 0x60(%rsi), %xmm6
59 vmovdqu 0x70(%rsi), %xmm7
60 vmovdqu -0x80(%rcx), %xmm8
61 vmovdqu -0x70(%rcx), %xmm9
62 vmovdqu -0x60(%rcx), %xmm10
63 vmovdqu -0x50(%rcx), %xmm11
64 vmovdqu -0x40(%rcx), %xmm12
65 vmovdqu -0x30(%rcx), %xmm13
66 vmovdqu -0x20(%rcx), %xmm14
67 vmovdqu -0x10(%rcx), %xmm15
68 lea (%rdi, %rdx), %rdx
70 vmovdqu %xmm1, 0x10(%rdi)
71 vmovdqu %xmm2, 0x20(%rdi)
72 vmovdqu %xmm3, 0x30(%rdi)
73 vmovdqu %xmm4, 0x40(%rdi)
74 vmovdqu %xmm5, 0x50(%rdi)
75 vmovdqu %xmm6, 0x60(%rdi)
76 vmovdqu %xmm7, 0x70(%rdi)
77 vmovdqu %xmm8, -0x80(%rdx)
78 vmovdqu %xmm9, -0x70(%rdx)
79 vmovdqu %xmm10, -0x60(%rdx)
80 vmovdqu %xmm11, -0x50(%rdx)
81 vmovdqu %xmm12, -0x40(%rdx)
82 vmovdqu %xmm13, -0x30(%rdx)
83 vmovdqu %xmm14, -0x20(%rdx)
84 vmovdqu %xmm15, -0x10(%rdx)
91 lea (%rsi, %rdx), %rcx
92 vmovdqu 0x10(%rsi), %xmm1
93 vmovdqu 0x20(%rsi), %xmm2
94 lea (%rdi, %rdx), %rdx
95 vmovdqu 0x30(%rsi), %xmm3
96 vmovdqu -0x40(%rcx), %xmm4
97 vmovdqu -0x30(%rcx), %xmm5
98 vmovdqu -0x20(%rcx), %xmm6
99 vmovdqu -0x10(%rcx), %xmm7
100 vmovdqu %xmm0, (%rdi)
101 vmovdqu %xmm1, 0x10(%rdi)
102 vmovdqu %xmm2, 0x20(%rdi)
103 vmovdqu %xmm3, 0x30(%rdi)
104 vmovdqu %xmm4, -0x40(%rdx)
105 vmovdqu %xmm5, -0x30(%rdx)
106 vmovdqu %xmm6, -0x20(%rdx)
107 vmovdqu %xmm7, -0x10(%rdx)
114 vmovdqu (%rsi), %xmm0
115 vmovdqu 0x10(%rsi), %xmm1
116 vmovdqu -0x20(%rsi, %rdx), %xmm6
117 vmovdqu -0x10(%rsi, %rdx), %xmm7
118 vmovdqu %xmm0, (%rdi)
119 vmovdqu %xmm1, 0x10(%rdi)
120 vmovdqu %xmm6, -0x20(%rdi, %rdx)
121 vmovdqu %xmm7, -0x10(%rdi, %rdx)
126 vmovdqu (%rsi), %xmm0
127 vmovdqu -0x10(%rsi, %rdx), %xmm7
128 vmovdqu %xmm0, (%rdi)
129 vmovdqu %xmm7, -0x10(%rdi, %rdx)
136 movq -0x08(%rsi, %rdx), %rcx
139 movq %rcx, -0x08(%rdi, %rdx)
146 mov -0x04(%rsi, %rdx), %ecx
149 mov %ecx, -0x04(%rdi, %rdx)
155 mov -0x02(%rsi, %rdx), %cx
158 mov %cx, -0x02(%rdi, %rdx)
#ifdef USE_AS_MEMMOVE
	/* Forward large-copy path.  The overlap test and size compare that
	   feed this branch are on lines missing from this chunk.  */
	jae	L(gobble_data_movsb)		/* Very large copies: take the movsb/NT path.  */
	lea	(%rsi, %rdx), %rcx		/* rcx = end of source.  */
	/* Preload the last 0x80 source bytes into xmm5..xmm12 and the
	   first 0x20 bytes into ymm4, so they survive the loop below and
	   can be stored last (makes the copy overlap-tolerant).  */
	vmovdqu	-0x80(%rcx), %xmm5
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	vmovdqu	(%rsi), %ymm4
	/* Main 128-bytes-per-iteration loop with aligned stores.  The
	   L(goble_128_loop) label definition (note the upstream spelling)
	   and the pointer-advance/counter lines are missing from this
	   chunk -- do not rename the label without the full file.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	vmovdqa	%ymm0, (%rdi)			/* Aligned stores: rdi presumably 32-byte aligned here -- confirm.  */
	vmovdqa	%ymm1, 0x20(%rdi)
	vmovdqa	%ymm2, 0x40(%rdi)
	vmovdqa	%ymm3, 0x60(%rdi)
	jae	L(goble_128_loop)
	vmovdqu	%ymm4, (%r10)			/* r10 presumably holds the saved original dst -- confirm.  */
	vmovdqu	%xmm5, -0x80(%rdx)		/* Store the preloaded tail; rdx presumably = dst end here.  */
	vmovdqu	%xmm6, -0x70(%rdx)
	vmovdqu	%xmm7, -0x60(%rdx)
	vmovdqu	%xmm8, -0x50(%rdx)
	vmovdqu	%xmm9, -0x40(%rdx)
	vmovdqu	%xmm10, -0x30(%rdx)
	vmovdqu	%xmm11, -0x20(%rdx)
	vmovdqu	%xmm12, -0x10(%rdx)
L(gobble_data_movsb):
	/* Choose between rep-movsb and the non-temporal forward loop by
	   comparing the length against half the shared cache size.  */
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
	/* The #else/#endif and the compare feeding the branch below are
	   on lines missing from this chunk.  */
	mov	__x86_shared_cache_size_half(%rip), %rcx
	jae	L(gobble_big_data_fwd)
	/* (The rep-movsb fallback lines are missing from this chunk.)  */
L(gobble_big_data_fwd):
	lea	(%rsi, %rdx), %rcx		/* rcx = end of source.  */
	vmovdqu	(%rsi), %ymm4			/* Preload head and tail, stored after the loop.  */
	vmovdqu	-0x80(%rsi,%rdx), %xmm5		/* Same address as -0x80(%rcx).  */
	vmovdqu	-0x70(%rcx), %xmm6
	vmovdqu	-0x60(%rcx), %xmm7
	vmovdqu	-0x50(%rcx), %xmm8
	vmovdqu	-0x40(%rcx), %xmm9
	vmovdqu	-0x30(%rcx), %xmm10
	vmovdqu	-0x20(%rcx), %xmm11
	vmovdqu	-0x10(%rcx), %xmm12
	lea	(%rdi, %rdx), %rcx		/* Repurpose rcx = end of destination.  */
L(gobble_mem_fwd_loop):
	/* 128-bytes-per-iteration non-temporal copy; prefetch well ahead
	   of the current read position.  */
	prefetchnta 0x1c0(%rsi)
	prefetchnta 0x280(%rsi)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	0x40(%rsi), %ymm2
	vmovdqu	0x60(%rsi), %ymm3
	/* (The rsi advance line is missing from this chunk.)  */
	vmovntdq %ymm0, (%rdi)			/* Non-temporal stores bypass the cache.  */
	vmovntdq %ymm1, 0x20(%rdi)
	vmovntdq %ymm2, 0x40(%rdi)
	vmovntdq %ymm3, 0x60(%rdi)
	/* (The rdi advance / counter lines are missing from this chunk;
	   an sfence presumably follows the loop before the plain stores
	   below -- confirm against the full file.)  */
	jb	L(gobble_mem_fwd_loop)
	vmovdqu	%xmm5, -0x80(%rcx)		/* Store the preloaded tail at dst end.  */
	vmovdqu	%xmm6, -0x70(%rcx)
	vmovdqu	%xmm7, -0x60(%rcx)
	vmovdqu	%xmm8, -0x50(%rcx)
	vmovdqu	%xmm9, -0x40(%rcx)
	vmovdqu	%xmm10, -0x30(%rcx)
	vmovdqu	%xmm11, -0x20(%rcx)
	vmovdqu	%xmm12, -0x10(%rcx)
#ifdef USE_AS_MEMMOVE
	/* Backward large-copy path (memmove with overlapping buffers).
	   Several lines -- the enclosing label, #else/#endif, the pointer
	   setup moving rsi/rdi to the buffer ends, and the compare feeding
	   the branch below -- are missing from this chunk.  */
#ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %rcx
	mov	__x86_shared_cache_size_half(%rip), %rcx
	/* Preload the first 0x80 destination-overlapping source bytes into
	   xmm5..xmm12; they are stored last, after the backward loop.  */
	vmovdqu	(%rsi), %xmm5
	vmovdqu	0x10(%rsi), %xmm6
	vmovdqu	0x20(%rsi), %xmm7
	vmovdqu	0x30(%rsi), %xmm8
	lea	-0x20(%rdi), %r10		/* r10 = where the preloaded ymm4 tail goes -- rdi presumably points at dst end here; confirm.  */
	vmovdqu	0x40(%rsi), %xmm9
	vmovdqu	0x50(%rsi), %xmm10
	vmovdqu	0x60(%rsi), %xmm11
	vmovdqu	0x70(%rsi), %xmm12
	vmovdqu	-0x20(%rsi), %ymm4		/* Preload the last 0x20 bytes (rsi presumably = src end).  */
	ja	L(gobble_big_data_bwd)		/* Larger than LLC half: non-temporal path.  */
L(gobble_mem_bwd_llc):
	/* Backward 128-bytes-per-iteration loop with aligned stores.  */
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi		/* Walk the source downward.  */
	vmovdqa	%ymm0, -0x20(%rdi)		/* Aligned stores: rdi presumably 32-byte aligned -- confirm.  */
	vmovdqa	%ymm1, -0x40(%rdi)
	vmovdqa	%ymm2, -0x60(%rdi)
	vmovdqa	%ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi		/* Walk the destination downward.  */
	/* (The loop counter update is missing from this chunk.)  */
	jb	L(gobble_mem_bwd_llc)
	vmovdqu	%ymm4, (%r10)			/* Store the preloaded tail...  */
	vmovdqu	%xmm5, (%rax)			/* ...and head; rax presumably holds the original dst (return value) -- confirm.  */
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)
L(gobble_big_data_bwd):
	/* Backward non-temporal path for copies larger than half the
	   shared cache; same structure as the LLC loop above but with
	   cache-bypassing stores.  (Any pointer fixup between the label
	   and the loop is on lines missing from this chunk.)  */
L(gobble_mem_bwd_loop):
	prefetchnta -0x1c0(%rsi)		/* Prefetch below the current read position.  */
	prefetchnta -0x280(%rsi)
	vmovdqu	-0x20(%rsi), %ymm0
	vmovdqu	-0x40(%rsi), %ymm1
	vmovdqu	-0x60(%rsi), %ymm2
	vmovdqu	-0x80(%rsi), %ymm3
	lea	-0x80(%rsi), %rsi		/* Walk the source downward.  */
	vmovntdq %ymm0, -0x20(%rdi)		/* Non-temporal stores bypass the cache.  */
	vmovntdq %ymm1, -0x40(%rdi)
	vmovntdq %ymm2, -0x60(%rdi)
	vmovntdq %ymm3, -0x80(%rdi)
	lea	-0x80(%rdi), %rdi		/* Walk the destination downward.  */
	/* (The loop counter update is missing from this chunk; an sfence
	   presumably follows the loop before the plain stores below --
	   confirm against the full file.)  */
	jb	L(gobble_mem_bwd_loop)
	vmovdqu	%ymm4, (%r10)			/* Preloaded tail (see the backward setup).  */
	vmovdqu	%xmm5, (%rax)			/* Preloaded head; rax presumably = original dst -- confirm.  */
	vmovdqu	%xmm6, 0x10(%rax)
	vmovdqu	%xmm7, 0x20(%rax)
	vmovdqu	%xmm8, 0x30(%rax)
	vmovdqu	%xmm9, 0x40(%rax)
	vmovdqu	%xmm10, 0x50(%rax)
	vmovdqu	%xmm11, 0x60(%rax)
	vmovdqu	%xmm12, 0x70(%rax)