/* Optimized memcpy for x86-64.
   Copyright (C) 2007 Free Software Foundation, Inc.
   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
#include <sysdep.h>
#include "asm-syntax.h"
/* Stack slots in the red-zone.  */
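/* (Note: the x86-64 System V ABI guarantees a 128-byte "red zone" below
   %rsp that a leaf function like this one may use freely.  RETVAL and the
   SAVE* slots below live there, so registers can be spilled without any
   stack-pointer adjustment or push/pop.)  */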
#define SAVE0	(RETVAL - 8)
#define SAVE1	(SAVE0 - 8)
#define SAVE2	(SAVE1 - 8)
#define SAVE3	(SAVE2 - 8)
#if defined PIC && !defined NOT_IN_libc

	jb	HIDDEN_JUMPTARGET (__chk_fail)
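/* (Note: with _FORTIFY_SOURCE, __memcpy_chk receives the size of the
   destination object in %rcx; when it is below the copy length in %rdx,
   the jump above aborts through __chk_fail instead of overflowing the
   buffer.)  */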
ENTRY(memcpy)				/* (void *, const void*, size_t) */
/* Handle tiny blocks.  */

L(1try):				/* up to 32B */
#ifndef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* save return value */

L(1):					/* 1-byte once */

L(1a):					/* 2-byte once */

L(1b):					/* 4-byte once */

L(1c):					/* 8-byte once */

L(1d):					/* 16-byte loop */
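/* (Note: for blocks of up to 32 bytes, each label above copies one unit
   of its size when the corresponding low bit of the count is set, so at
   most one 1-, 2-, 4- and 8-byte move plus the 16-byte loop cover any
   tiny length without a general copy loop.)  */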
#ifdef USE_AS_MEMPCPY
	movq	%rdi, %rax		/* return value */

#ifndef USE_AS_MEMPCPY
	movq	%rax, RETVAL (%rsp)	/* save return value */
/* Align to the natural word size.  */

	movl	%esi, %ecx		/* align by source */

	jz	L(alignafter)		/* already aligned */

L(align):				/* align */
	leaq	-8 (%rcx, %rdx), %rdx	/* calculate remaining bytes */

L(alignloop):				/* 1-byte alignment loop */
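/* (Note: the loop below moves one byte at a time until the source
   reaches an 8-byte boundary; the LEA above already deducted the bytes
   this loop copies, 8 minus the misalignment in %rcx, from the remaining
   count.)  */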
/* Loop to handle mid-sized blocks.  */

L(32try):				/* up to 1KB */

L(32):					/* 32-byte loop */
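/* (Note: each pass of this loop copies 32 bytes as a run of quadword
   loads followed by the matching quadword stores, letting the loads run
   ahead of the stores instead of strictly alternating them.)  */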
	jz	L(32skip)		/* help out smaller blocks */

	andl	$31, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY

	movq	RETVAL (%rsp), %rax	/* restore return value */
/* In order to minimize code size in RTLD, the algorithms specific to
   larger blocks are excluded when building for RTLD.  */
/* Handle large blocks smaller than 1/2 L1.  */

L(fasttry):				/* first 1/2 L1 */
#ifndef NOT_IN_libc			/* only up to this algorithm outside of libc.so */
	movq	__x86_64_core_cache_size_half (%rip), %r11
	cmpq	%rdx, %r11		/* calculate the smaller of */
	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
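/* (Note: CMP followed by CMOVA is a branchless minimum: CMOVA replaces
   %r11 with %rdx only when %r11 is above %rdx, leaving
   %r11 = min(remaining bytes, half the L1 data cache).)  */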
L(fast):				/* good ol' MOVS */
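/* (Note: REP MOVSQ copies %rcx quadwords from (%rsi) to (%rdi),
   advancing both pointers as it goes; for blocks that stay within half
   the L1 the microcoded string move is the cheapest way to shift the
   bulk of the data.)  */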
	subq	%r11, %rdx		/* check for more */

	andl	$7, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY

	movq	RETVAL (%rsp), %rax	/* restore return value */
#ifndef NOT_IN_libc			/* none of the algorithms below for RTLD */

/* Handle large blocks smaller than 1/2 L2.  */

L(pretry):				/* first 1/2 L2 */
	movq	__x86_64_shared_cache_size_half (%rip), %r8
	cmpq	%rdx, %r8		/* calculate the lesser of */
	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */

L(pre):					/* 64-byte with prefetching */
	movq	%r14, SAVE0 (%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1 (%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2 (%rsp)
	cfi_rel_offset (%r12, SAVE2)
	movq	%rbx, SAVE3 (%rsp)
	cfi_rel_offset (%rbx, SAVE3)
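/* (Note: %r14, %r13, %r12 and %rbx are callee-saved, so they are
   spilled into red-zone slots before the unrolled loop borrows them;
   each cfi_rel_offset records the slot so unwinders can restore the
   register across this frame.)  */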
	cmpl	$0, __x86_64_prefetchw (%rip)
	jz	L(preloop)		/* check if PREFETCHW OK */

/* ... when PREFETCHW is available (less cache-probe traffic in MP systems).  */
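/* (Note: PREFETCHW fetches the destination line with intent to write,
   installing it directly in Modified state, so the later stores need not
   issue a separate read-for-ownership probe to other caches.)  */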
L(prewloop):				/* cache-line in state M */

	prefetcht0 0 + 896 (%rsi)
	prefetcht0 64 + 896 (%rsi)

	prefetchw 896 - 64 (%rdi)
	prefetchw 896 - 0 (%rdi)
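/* (Note: 896 bytes is 14 cache lines, so both streams are prefetched
   14 lines ahead of the loop, intended to be far enough for the data to
   arrive from memory before the loop consumes it.)  */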
/* ... when PREFETCHW is not available.  */

L(preloop):				/* cache-line in state E */

	prefetcht0 896 + 0 (%rsi)
	prefetcht0 896 + 64 (%rsi)

	prefetcht0 896 - 64 (%rdi)
	prefetcht0 896 - 0 (%rdi)
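/* (Note: without PREFETCHW the destination is prefetched with
   PREFETCHT0, which only brings the line in for reading; each store must
   then upgrade it to Modified state, costing extra coherence traffic on
   multiprocessor systems.)  */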
	movq	SAVE3 (%rsp), %rbx
	movq	SAVE2 (%rsp), %r12
	movq	SAVE1 (%rsp), %r13
	movq	SAVE0 (%rsp), %r14
	subq	%r8, %rdx		/* check for more */

	andl	$63, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY

	movq	RETVAL (%rsp), %rax	/* restore return value */

/* Loop to handle huge blocks.  */

L(NT):					/* non-temporal 128-byte */
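/* (Note: blocks larger than half the L2 would evict the whole cache if
   copied through it, so this path uses non-temporal stores that bypass
   the cache hierarchy and drain out through the write-combining
   buffers.)  */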
	movq	%r14, SAVE0 (%rsp)
	cfi_rel_offset (%r14, SAVE0)
	movq	%r13, SAVE1 (%rsp)
	cfi_rel_offset (%r13, SAVE1)
	movq	%r12, SAVE2 (%rsp)
	cfi_rel_offset (%r12, SAVE2)

	prefetchnta 768 (%rsi)
	prefetchnta 832 (%rsi)
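/* (Note: PREFETCHNTA pulls the source in with a non-temporal hint,
   here 768 and 832 bytes, i.e. 12 and 13 cache lines, ahead, so the
   source passes through with minimal displacement of the existing
   working set.)  */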
	movntiq	%r8, 8 (%rdi)
	movntiq	%r9, 16 (%rdi)
	movntiq	%r10, 24 (%rdi)
	movntiq	%r11, 32 (%rdi)
	movntiq	%r12, 40 (%rdi)
	movntiq	%r13, 48 (%rdi)
	movntiq	%r14, 56 (%rdi)

	movq	104 (%rsi), %r12
	movq	112 (%rsi), %r13
	movq	120 (%rsi), %r14

	movntiq	%rax, 64 (%rdi)
	movntiq	%r8, 72 (%rdi)
	movntiq	%r9, 80 (%rdi)
	movntiq	%r10, 88 (%rdi)
	movntiq	%r11, 96 (%rdi)
	movntiq	%r12, 104 (%rdi)
	movntiq	%r13, 112 (%rdi)
	movntiq	%r14, 120 (%rdi)

	leaq	128 (%rsi), %rsi
	leaq	128 (%rdi), %rdi
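/* (Note: each pass streams 128 bytes through eight general-purpose
   registers, %rax and %r8-%r14, used twice per iteration; the MOVNTI
   stores write the destination without first reading its cache lines,
   avoiding the read-for-ownership ordinary stores would incur.)  */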
	sfence				/* serialize memory stores */
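/* (Note: MOVNTI stores are weakly ordered; the SFENCE above drains the
   write-combining buffers and guarantees the streamed data is ordered
   before any store the caller performs after memcpy returns.)  */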
	movq	SAVE2 (%rsp), %r12
	movq	SAVE1 (%rsp), %r13
	movq	SAVE0 (%rsp), %r14

	andl	$127, %edx		/* check for left overs */
#ifdef USE_AS_MEMPCPY

	movq	RETVAL (%rsp), %rax	/* restore return value */

#endif /* !NOT_IN_libc */
#ifndef USE_AS_MEMPCPY
libc_hidden_builtin_def (memcpy)
#endif