1 /* memset/bzero with unaligned store and rep stosb
2 Copyright (C) 2016-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
19 /* memset is implemented as:
20 1. Use overlapping store to avoid branch.
21 2. If size is less than VEC, use integer register stores.
22 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
23 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
24 5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
25 4 VEC stores and store 4 * VEC at a time until done. */
/* Default-macro section: each #ifndef supplies a fallback definition when
   the including file has not already provided one.
   NOTE(review): the matching #endif lines (and the #if/#else guards that
   select between alternative definitions) are not visible in this extract
   -- confirm against the complete file before editing.  */
29 #ifndef MEMSET_CHK_SYMBOL
/* By default the __memset_chk entry uses the same symbol naming scheme as
   the plain memset entry.  */
30 # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
33 #ifndef WMEMSET_CHK_SYMBOL
/* Likewise for the wide-character (__wmemset_chk) variant.  */
34 # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
/* vzeroupper clears the upper YMM/ZMM state, avoiding AVX-to-SSE
   transition penalties when returning to SSE code.  */
39 # define VZEROUPPER vzeroupper
45 #ifndef VZEROUPPER_SHORT_RETURN
47 # define VZEROUPPER_SHORT_RETURN vzeroupper
/* NOTE(review): this alternative expands to the `rep' prefix (i.e. the
   following `ret' becomes `rep ret'); the #if/#else choosing between the
   two definitions is not visible in this extract.  */
49 # define VZEROUPPER_SHORT_RETURN rep
61 /* Threshold to use Enhanced REP STOSB. Since there is overhead to set
62 up REP STOSB operation, REP STOSB isn't faster on short data. The
63 memset micro benchmark in glibc shows that 2KB is the approximate
64 value above which REP STOSB becomes faster on processors with
65 Enhanced REP STOSB. Since the stored value is fixed, larger register
66 size has minimal impact on threshold. */
67 #ifndef REP_STOSB_THRESHOLD
68 # define REP_STOSB_THRESHOLD 2048
/* Including files must define SECTION to choose the text section.  */
72 # error SECTION is not defined!
75 .section SECTION(.text),"ax",@progbits
76 #if VEC_SIZE == 16 && IS_IN (libc)
/* __bzero(dst, n): set up memset's internal register convention
   (rax = return value = dst, rdx = byte count) and join the shared
   memset code path.
   NOTE(review): the ENTRY(__bzero) line, the instruction that zero-fills
   VEC(0), and the END line are not visible in this extract -- confirm
   against the complete file.  */
78 movq %rdi, %rax /* Set return value. */
79 movq %rsi, %rdx /* Set n. */
81 jmp L(entry_from_bzero)
/* bzero is a weak alias so another definition can override it.  */
83 weak_alias (__bzero, bzero)
/* __wmemset_chk: buffer-overflow-checked wide memset entry; branches to
   __chk_fail on failure.
   NOTE(review): the compare that sets the flags consumed by `jb' is not
   visible in this extract -- presumably it checks the destination-length
   argument against n; confirm against the complete file.  */
88 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
90 jb HIDDEN_JUMPTARGET (__chk_fail)
91 END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
/* __wmemset(dst, wc, n): duplicate the wide character from esi into
   VEC(0), set the return value from rdi (per the macro's name), then
   join the common memset path.
   NOTE(review): the conversion of n from a wide-character count to a
   byte count is not visible in this extract -- confirm against the
   complete file.  */
94 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
96 WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
97 jmp L(entry_from_bzero)
98 END (WMEMSET_SYMBOL (__wmemset, unaligned))
101 #if defined SHARED && IS_IN (libc)
/* __memset_chk: checked memset entry; branches to __chk_fail on failure.
   NOTE(review): the compare feeding `jb' is not visible in this
   extract.  */
102 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
104 jb HIDDEN_JUMPTARGET (__chk_fail)
105 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
/* memset, unaligned variant: broadcast the fill byte into VEC(0) and set
   the return value (per the macro's name), then handle sizes in
   [VEC_SIZE, 2*VEC_SIZE] with two possibly-overlapping unaligned stores,
   avoiding any branch on the exact size.
   NOTE(review): the L(entry_from_bzero) label, the branch taken when
   n > 2*VEC_SIZE, and the VZEROUPPER/ret epilogue are not visible in
   this extract -- confirm against the complete file.  */
108 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
109 MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
113 cmpq $(VEC_SIZE * 2), %rdx
115 /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
/* Tail store covers [dst+n-VEC_SIZE, dst+n); head store covers
   [dst, dst+VEC_SIZE).  Overlap is harmless since both write the same
   value.  */
116 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
117 VMOVU %VEC(0), (%rdi)
120 #if defined USE_MULTIARCH && IS_IN (libc)
121 END (MEMSET_SYMBOL (__memset, unaligned))
/* __memset_chk_erms: checked entry for the plain REP STOSB variant.
   NOTE(review): the compare feeding `jb' is not visible in this
   extract.  */
124 ENTRY (__memset_chk_erms)
126 jb HIDDEN_JUMPTARGET (__chk_fail)
127 END (__memset_chk_erms)
/* memset, erms variant: performs the whole operation with REP STOSB.
   __memset_erms exists only for benchmarking REP STOSB in isolation;
   a hidden symbol is also emitted so debuggers can resolve the code.
   NOTE(review): the actual rep stosb body (loading rcx/al, issuing
   rep stosb, and the ret), and the instruction guarded by the
   vzeroupper comment, are not visible in this extract -- confirm
   against the complete file.  */
129 /* Only used to measure performance of REP STOSB. */
130 ENTRY (__memset_erms)
132 /* Provide a hidden symbol to debugger. */
133 .hidden MEMSET_SYMBOL (__memset, erms)
134 ENTRY (MEMSET_SYMBOL (__memset, erms))
137 /* Issue vzeroupper before rep stosb. */
148 END (MEMSET_SYMBOL (__memset, erms))
151 # if defined SHARED && IS_IN (libc)
/* __memset_chk, unaligned_erms variant: branches to __chk_fail on
   failure.  NOTE(review): the compare feeding `jb' is not visible in
   this extract.  */
152 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
154 jb HIDDEN_JUMPTARGET (__chk_fail)
155 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
/* memset, unaligned_erms variant: like the unaligned variant, but sizes
   above REP_STOSB_THRESHOLD are handed to REP STOSB (see the compare at
   original line 171).  Register roles on entry: rdi = dst, esi = fill
   byte, rdx = n; rax = return value (set by the VDUP macro below).
   NOTE(review): this extract is missing many lines (branch targets, the
   rep stosb path, the size-class dispatch for n < VEC_SIZE, and the
   loop's compare/branch) -- all hedged notes below must be confirmed
   against the complete file.  */
158 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
159 MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
162 cmpq $(VEC_SIZE * 2), %rdx
163 ja L(stosb_more_2x_vec)
164 /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
/* Overlapping tail + head stores; overlap is harmless for memset.  */
165 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
166 VMOVU %VEC(0), (%rdi)
/* n > 2*VEC_SIZE: first decide between REP STOSB and vector stores.
   NOTE(review): the branch consuming the flags of the next compare
   (to the rep stosb path) is not visible in this extract.  */
170 L(stosb_more_2x_vec):
171 cmpq $REP_STOSB_THRESHOLD, %rdx
/* Below threshold: choose between the 4-store case (n <= 4*VEC_SIZE)
   and the aligned loop.  NOTE(review): the branch for this compare is
   not visible in this extract.  */
175 cmpq $(VEC_SIZE * 4), %rdx
/* 2*VEC < n <= 4*VEC: two head stores and two tail stores, again
   relying on harmless overlap instead of branching on the size.  */
177 VMOVU %VEC(0), (%rdi)
178 VMOVU %VEC(0), VEC_SIZE(%rdi)
179 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
180 VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
/* n > 4*VEC: rcx = (dst + 4*VEC_SIZE) rounded down to a 4*VEC_SIZE
   boundary -- the first aligned loop address.  Eight unaligned stores
   (interleaved with the address computation to hide latency) cover the
   unaligned head [dst, dst+4*VEC) and tail [dst+n-4*VEC, dst+n).  */
186 leaq (VEC_SIZE * 4)(%rdi), %rcx
187 VMOVU %VEC(0), (%rdi)
188 andq $-(VEC_SIZE * 4), %rcx
189 VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
190 VMOVU %VEC(0), VEC_SIZE(%rdi)
191 VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
192 VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
193 VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
194 VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
195 VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
/* Round rdx down to a 4*VEC_SIZE boundary to form the loop end.
   NOTE(review): an instruction converting rdx from a count to an end
   address (and the compare/branch entering the loop) appears to be
   missing from this extract -- confirm against the complete file.  */
197 andq $-(VEC_SIZE * 4), %rdx
/* Main loop: four aligned stores per iteration, advancing rcx by
   4*VEC_SIZE.  NOTE(review): the loop label and the compare/branch that
   repeats the loop are not visible in this extract.  */
201 VMOVA %VEC(0), (%rcx)
202 VMOVA %VEC(0), VEC_SIZE(%rcx)
203 VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
204 VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
205 addq $(VEC_SIZE * 4), %rcx
/* Expands to either `vzeroupper' or a `rep' prefix for the following
   return (see the macro defaults above).  */
208 VZEROUPPER_SHORT_RETURN
/* Small-size paths (n < VEC_SIZE).  Only three vector widths are
   supported.  NOTE(review): the size-class dispatch branches, and the
   instruction that loads the fill pattern into rcx for the scalar
   stores below, are not visible in this extract.  */
211 /* Less than 1 VEC. */
212 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
213 # error Unsupported VEC_SIZE!
/* Each case below writes an overlapping tail + head pair sized to the
   bucket, so no exact-size branch is needed within a bucket.  */
236 /* From 32 to 63. No branch when size == 32. */
238 vmovdqu %ymm0, -32(%rdi,%rdx)
239 vmovdqu %ymm0, (%rdi)
244 /* From 16 to 31. No branch when size == 16. */
246 vmovdqu %xmm0, -16(%rdi,%rdx)
247 vmovdqu %xmm0, (%rdi)
251 /* From 8 to 15. No branch when size == 8. */
253 movq %rcx, -8(%rdi,%rdx)
258 /* From 4 to 7. No branch when size == 4. */
259 movl %ecx, -4(%rdi,%rdx)
264 /* From 2 to 3. No branch when size == 2. */
265 movw %cx, -2(%rdi,%rdx)
269 END (MEMSET_SYMBOL (__memset, unaligned_erms))