1 /* memset/bzero -- set memory area to CH/0
2 Optimized version for x86-64.
3 Copyright (C) 2002 Free Software Foundation, Inc.
4 This file is part of the GNU C Library.
5 Contributed by Andreas Jaeger <aj@suse.de>.
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, write to the Free
19 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
23 #include "asm-syntax.h"
27 /* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */
/* BZERO_P is 1 when `memset' is already defined as a macro at this
   point (see the BEWARE note: that means this file is being built as
   bzero), and 0 otherwise.  It is set with #ifdef instead of being
   defined as `(defined memset)': a `defined' operator that only appears
   as the result of macro expansion inside #if has undefined behavior in
   ISO C, and GCC diagnoses it with -Wexpansion-to-defined.  As a
   consequence `memset' is tested once, here, rather than at each use
   of BZERO_P.  */
#ifdef memset
# define BZERO_P 1
#else
# define BZERO_P 0
#endif
30 /* This is somewhat experimental and could be made dependent on the cache
37 mov %rsi,%rdx /* Adjust parameter: copy %rsi into %rdx, the register checked as the length below. */
38 xorq %rsi,%rsi /* Fill with 0s: clear %rsi, which is copied into the store pattern %r8 below. */
40 cmp $0x7,%rdx /* Check for small length: compare the length in %rdx against 7. */
41 mov %rdi,%rcx /* Work on a copy of the pointer in %rcx; %rdi is kept and loaded into %rax as the result at the end. */
45 mov %rsi,%r8 /* Just copy 0: %r8 is the 64-bit value written by the stores below. */
47 /* Populate 8 bit data to full 64-bit.  NOTE(review): only the constant load is visible in this excerpt; the step that combines it with the fill byte is not shown. */
48 movabs $0x0101010101010101,%r8 /* 0x01 in each of the eight bytes of %r8. */
52 test $0x7,%edi /* Check for alignment: the low three address bits are all zero when the pointer is 8-byte aligned. */
56 1: /* Align ptr to 8 bytes. */
63 2: /* Check for really large regions. */
71 3: /* Store 64 bytes. */
84 4: /* Store the final bytes. */
90 5: /* First in chunks of 8 bytes. */
100 8: /* And finally as single bytes (up to 7). */
109 /* Load result (only if used as memset). */
110 mov %rdi,%rax /* The start address of the destination is the result. */
115 11: /* Store 64 bytes without polluting the cache (movnti is a non-temporal store). */
116 /* We could use movntdq %xmm0,(%rcx) here to speed up large
117 cases further, but let's not use XMM registers. */
120 movnti %r8,0x10(%rcx) /* 8-byte store of the pattern at offset 0x10; NOTE(review): stores to offsets 0x0 and 0x8 are not visible in this excerpt. */
121 movnti %r8,0x18(%rcx)
122 movnti %r8,0x20(%rcx)
123 movnti %r8,0x28(%rcx)
124 movnti %r8,0x30(%rcx)
125 movnti %r8,0x38(%rcx) /* Last quadword of the 64-byte chunk at (%rcx). */