/* Optimized memcpy for x86-64.
   Copyright (C) 2007-2014 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
#include <sysdep.h>
#include "asm-syntax.h"
/* Stack slots in the red-zone.  */
#ifdef USE_AS_MEMPCPY
# define RETVAL	(0)
#else
# define RETVAL	(-8)
# if defined SHARED && !defined USE_MULTIARCH && !defined NOT_IN_libc
#  define memcpy	__memcpy
#  undef libc_hidden_builtin_def
#  define libc_hidden_builtin_def(name) \
	.globl __GI_memcpy; __GI_memcpy = __memcpy
# endif
#endif
#define SAVE0	(RETVAL - 8)
#define SAVE1	(SAVE0 - 8)
#define SAVE2	(SAVE1 - 8)
#define SAVE3	(SAVE2 - 8)
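
/* A note on the red zone (spelling out an ABI fact, not new behavior):
   the x86-64 System V ABI reserves the 128 bytes below %rsp for leaf
   functions, so this routine can spill registers there without moving
   the stack pointer.  With RETVAL == -8 (the non-mempcpy case) the
   slots used below work out to:

	RETVAL(%rsp) = -8(%rsp)		saved return value
	SAVE0(%rsp)  = -16(%rsp)	spill slot for %r14
	SAVE1(%rsp)  = -24(%rsp)	spill slot for %r13
	SAVE2(%rsp)  = -32(%rsp)	spill slot for %r12
	SAVE3(%rsp)  = -40(%rsp)	spill slot for %rbx  */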
	.text

#if defined PIC && !defined NOT_IN_libc
ENTRY_CHK (__memcpy_chk)

	cmpq	%rdx, %rcx
	jb	HIDDEN_JUMPTARGET (__chk_fail)

END_CHK (__memcpy_chk)
#endif
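
/* For reference, the check above implements the usual fortified-copy
   semantics: %rcx carries the destination object size and %rdx the copy
   length, so cmpq/jb aborts when the buffer is too small and otherwise
   falls through into memcpy.  A rough C sketch of those semantics (not
   the actual definition):

	void *__memcpy_chk (void *dst, const void *src, size_t len,
			    size_t dstlen)
	{
	  if (dstlen < len)
	    __chk_fail ();
	  return memcpy (dst, src, len);
	}
 */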
ENTRY(memcpy)				/* (void *, const void*, size_t) */

/* Handle tiny blocks.  */

L(1try):				/* up to 32B */
	cmpq	$32, %rdx
#ifndef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* save return value */
#endif
	jbe	L(1)

L(1):					/* 1-byte once */

L(1a):					/* 2-byte once */

L(1b):					/* 4-byte once */

L(1c):					/* 8-byte once */

L(1d):					/* 16-byte loop */
#ifdef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* return value */
#endif
	retq				/* exit */

#ifndef USE_AS_MEMPCPY
	movq	%rax, RETVAL(%rsp)	/* save return value */
#endif
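
/* The tiny-block path avoids a byte loop by testing each low bit of the
   length once: bit 0 copies one byte, bit 1 two bytes, and so on, with
   only the 16-byte step looping.  A hedged C sketch of the idea (the
   helper name is illustrative; each fixed-size memcpy call compiles to
   a single load/store pair):

	static void copy_tiny (char *d, const char *s, size_t n)
	{
	  if (n & 1) { *d = *s; d += 1; s += 1; }
	  if (n & 2) { memcpy (d, s, 2); d += 2; s += 2; }
	  if (n & 4) { memcpy (d, s, 4); d += 4; s += 4; }
	  if (n & 8) { memcpy (d, s, 8); d += 8; s += 8; }
	  for (n &= ~(size_t) 15; n != 0; n -= 16)
	    { memcpy (d, s, 16); d += 16; s += 16; }
	}
 */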
/* Align to the natural word size.  */

	movl	%esi, %ecx		/* align by source */
	andl	$7, %ecx		/* offset into an 8-byte word */
	jz	L(alignafter)		/* already aligned */

L(align):				/* align */
	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */

L(alignloop):				/* 1-byte alignment loop */
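
/* The alignment loop copies single bytes until %rsi reaches an 8-byte
   boundary.  Roughly, in C (the n -= head step is exactly the
   leaq -8(%rcx, %rdx) above, with ofs = src & 7 held in %rcx):

	size_t ofs = (uintptr_t) s & 7;
	if (ofs != 0)
	  {
	    size_t head = 8 - ofs;
	    n -= head;
	    while (head-- != 0)
	      *d++ = *s++;
	  }
 */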
/* Handle mid-sized blocks.  */

L(32try):				/* up to 1KB */

L(32):					/* 32-byte loop */

	jz	L(32skip)		/* help out smaller blocks */

L(32skip):
	andl	$31, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax		/* return value */
#else
	movq	RETVAL(%rsp), %rax	/* restore return value */
	jnz	L(1)
#endif
	retq				/* exit */
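
/* The 32-byte loop moves four 8-byte words per iteration; "help out
   smaller blocks" refers to the larger paths below jumping back here to
   mop up their sub-32-byte remainders.  A hedged C sketch of the data
   movement only (it ignores the real loop's instruction scheduling):

	while (n >= 32)
	  {
	    uint64_t a, b, c, e;
	    memcpy (&a, s,      8);
	    memcpy (&b, s + 8,  8);
	    memcpy (&c, s + 16, 8);
	    memcpy (&e, s + 24, 8);
	    memcpy (d,      &a, 8);
	    memcpy (d + 8,  &b, 8);
	    memcpy (d + 16, &c, 8);
	    memcpy (d + 24, &e, 8);
	    s += 32;  d += 32;  n -= 32;
	  }
 */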
/* In order to minimize code-size in RTLD, algorithms specific for
   larger blocks are excluded when building for RTLD (ld.so).  */
/* Handle blocks smaller than 1/2 L1.  */

L(fasttry):				/* first 1/2 L1 */
#ifndef NOT_IN_libc			/* only up to this algorithm outside of libc.so */
	mov	__x86_data_cache_size_half(%rip), %R11_LP
	cmpq	%rdx, %r11		/* calculate the smaller of */
	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
#endif

L(fast):				/* good ol' MOVS */

	subq	%r11, %rdx		/* check for more */

	andl	$7, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(1)

	movq	%rdi, %rax		/* return value */
#else
	movq	RETVAL(%rsp), %rax	/* restore return value */
	jnz	L(1)
#endif
	retq				/* exit */
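
/* L(fast) is a REP MOVSQ copy: %rcx gets the byte count (capped at half
   the L1 size via %r11 above) divided by 8 to count quadwords, and the
   string-move instruction does the rest.  A hedged C rendering of what
   the REP MOVSQ primitive itself computes:

	static void rep_movsq (uint64_t *d, const uint64_t *s, size_t words)
	{
	  while (words-- != 0)
	    *d++ = *s++;
	}

   Capping each round at half the L1 keeps the working set cache-resident
   while larger copies loop back for another round.  */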
#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */
/* Handle large blocks smaller than 1/2 L2.  */

L(pretry):				/* first 1/2 L2 */
	mov	__x86_shared_cache_size_half(%rip), %R8_LP
	cmpq	%rdx, %r8		/* calculate the lesser of */
	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */

L(pre):					/* 64-byte with prefetching */

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)
	movq	%rbx, SAVE3(%rsp)
	cfi_rel_offset (%rbx, SAVE3)

	cmpl	$0, __x86_prefetchw(%rip)
	jz	L(preloop)		/* check if PREFETCHW OK */
/* ... when PREFETCHW is available (less cache-probe traffic in MP systems).  */

L(prewloop):				/* cache-line in state M */

	prefetcht0	 0 + 896(%rsi)
	prefetcht0	64 + 896(%rsi)

	prefetchw	896 - 64(%rdi)
	prefetchw	896 -  0(%rdi)
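
/* Both streams are prefetched about 896 bytes (14 cache lines) ahead of
   the current position.  PREFETCHW requests the destination line in an
   already-writable state, so the later stores need no second ownership
   transaction; on the read side plain PREFETCHT0 suffices.  A hedged C
   rendering with the GCC builtin (second argument 1 = write intent):

	__builtin_prefetch (s + 896, 0, 3);
	__builtin_prefetch (d + 896, 1, 3);
 */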
/* ... when PREFETCHW is not available.  */

L(preloop):				/* cache-line in state E */

	prefetcht0	896 +  0(%rsi)
	prefetcht0	896 + 64(%rsi)

	prefetcht0	896 - 64(%rdi)
	prefetcht0	896 -  0(%rdi)
	movq	SAVE3(%rsp), %rbx
	cfi_restore (%rbx)
	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

	subq	%r8, %rdx		/* check for more */

	andl	$63, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(32)

	movq	%rdi, %rax		/* return value */
#else
	movq	RETVAL(%rsp), %rax	/* restore return value */
	jnz	L(32)
#endif
	retq				/* exit */
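
/* Taken together, the paths form a size ladder.  A hedged C sketch of
   the overall dispatch (helper names are illustrative; the thresholds
   come from the L(1try)/L(32try) comments and the __x86_* tunables
   loaded above):

	if      (n <= 32)      copy_tiny (d, s, n);
	else if (n <= 1024)    copy_32byte_loop (d, s, n);
	else if (n <= l1_half) copy_rep_movsq (d, s, n);
	else if (n <= l2_half) copy_prefetched (d, s, n);
	else                   copy_nontemporal (d, s, n);
 */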
/* Handle huge blocks.  */

L(NT):					/* non-temporal 128-byte */

	movq	%r14, SAVE0(%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1(%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2(%rsp)
	cfi_rel_offset (%r12, SAVE2)
	prefetchnta	768(%rsi)
	prefetchnta	832(%rsi)

	movntiq	%r9,   16(%rdi)
	movntiq	%r10,  24(%rdi)
	movntiq	%r11,  32(%rdi)
	movntiq	%r12,  40(%rdi)
	movntiq	%r13,  48(%rdi)
	movntiq	%r14,  56(%rdi)

	movntiq	%rax,  64(%rdi)
	movntiq	%r8,   72(%rdi)
	movntiq	%r9,   80(%rdi)
	movntiq	%r10,  88(%rdi)
	movntiq	%r11,  96(%rdi)
	movntiq	%r12, 104(%rdi)
	movntiq	%r13, 112(%rdi)
	movntiq	%r14, 120(%rdi)
	sfence				/* serialize memory stores */

	movq	SAVE2(%rsp), %r12
	cfi_restore (%r12)
	movq	SAVE1(%rsp), %r13
	cfi_restore (%r13)
	movq	SAVE0(%rsp), %r14
	cfi_restore (%r14)

	andl	$127, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY
	jnz	L(32)

	movq	%rdi, %rax		/* return value */
#else
	movq	RETVAL(%rsp), %rax	/* restore return value */
	jnz	L(32)
#endif
	retq				/* exit */
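
/* MOVNTI stores write around the cache, so a huge copy does not evict
   the working set; because such stores are weakly ordered, the single
   SFENCE above is what makes them globally visible before any later
   store.  A hedged C rendering of the primitive pair, using the
   standard SSE2 intrinsics:

	#include <emmintrin.h>

	static void nt_store_word (long long *dst, long long val)
	{
	  _mm_stream_si64 (dst, val);
	}

	static void nt_done (void)
	{
	  _mm_sfence ();
	}

   Note the loop above fences once after all 128-byte iterations, not
   once per store.  */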
#endif /* !NOT_IN_libc */

END(memcpy)

#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)

# if defined SHARED && !defined USE_MULTIARCH && !defined NOT_IN_libc
#  undef memcpy
#  include <shlib-compat.h>
versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
# endif
#endif
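
/* Note on the version set: GLIBC_2_14 is the release that gave memcpy
   its own symbol version, so newly linked binaries resolve to
   memcpy@@GLIBC_2.14 while older binaries keep binding to the
   compatibility version; versioned_symbol ultimately expands to the
   assembler's .symver directive to that effect.  */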