/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef VEC_SIZE
#  include "x86-avx-vecs.h"
# endif

# ifndef STRCPY
#  define STRCPY	__strcpy_avx2
# endif

/* Use movsb in page cross case to save code size.  */
# define USE_MOVSB_IN_PAGE_CROSS	1

# ifdef USE_AS_WCSCPY
#  define VPCMPEQ	vpcmpeqd
#  define VPMIN	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMIN	vpminub
#  define CHAR_SIZE	1
# endif

# define PAGE_SIZE	4096

# ifdef USE_AS_STPCPY
#  define END_REG	rax
# else
#  define END_REG	rdi, %rdx
# endif

# ifdef USE_AS_STRCAT
#  define PAGE_ALIGN_REG	ecx
# else
#  define PAGE_ALIGN_REG	eax
# endif

# define VZERO	VMM(7)
# define VZERO_128	VMM_128(7)
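
/* All variants locate the null terminator the same way: VPCMPEQ
   compares a vector of the string against VZERO (all zeros) and
   vpmovmskb converts the result to a bitmask in which each set bit
   marks a zero CHAR within that vector.  */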

	.section SECTION(.text), "ax", @progbits
ENTRY(STRCPY)
	vpxor	%VZERO_128, %VZERO_128, %VZERO_128

# ifdef USE_AS_STRCAT
	movq	%rdi, %rax
#  include "strcat-strlen-avx2.h.S"
# endif
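
	/* If rsi is within VEC_SIZE of the end of a page, a full vector
	   load could fault past the end of the string, so take the
	   page-cross path instead.  */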
	movl	%esi, %PAGE_ALIGN_REG
	andl	$(PAGE_SIZE - 1), %PAGE_ALIGN_REG
	cmpl	$(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
	ja	L(page_cross)
L(page_cross_continue):
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
# endif
	VMOVU	(%rsi), %VMM(0)
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx

	testl	%ecx, %ecx
	jz	L(more_1x_vec)
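
	/* A null CHAR was found in the first VEC; rdx below becomes its
	   byte index (the string length), which selects one of the
	   fixed-size copy blocks.  */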
	/* No longer need ymm registers so just vzeroupper so it doesn't
	   need to be duplicated at each return statement.  */
	vzeroupper

	xorl	%edx, %edx
	bsfl	%ecx, %edx
# ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
# endif

	/* Use mask bits in rcx to detect which copy we need. If the low
	   mask is zero then there must be a bit set in the upper half.
	   I.e if ecx != 0 and cx == 0, then match must be upper 16
	   bits so we use L(copy_16_31).  */
	testw	%cx, %cx
	jz	L(copy_16_31)

	testb	%cl, %cl
	jz	L(copy_8_15)
# ifdef USE_AS_WCSCPY
	vmovd	%xmm0, (%rdi)
	movl	$0, (%END_REG)
	ret
# else
	testb	$0x7, %cl
	jz	L(copy_4_7)

	testl	%edx, %edx
	jz	L(set_null_term)
	vmovd	%xmm0, %ecx
	movw	%cx, (%rdi)

	.p2align 4,, 2
L(set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 12
L(copy_4_7):
	movl	-3(%rsi, %rdx), %ecx
	vmovd	%xmm0, (%rdi)
	movl	%ecx, -3(%END_REG)
	ret
# endif
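
	/* Lengths 16..31: copy with two overlapping 16-byte moves, one
	   from the start of the string and one ending at the null
	   terminator.  */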

	.p2align 4,, 10
L(copy_16_31):
	VMOVU	-(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -(16 - CHAR_SIZE)(%END_REG)
	ret

	.p2align 4,, 10
L(copy_8_15):
# ifdef USE_AS_WCSCPY
	movl	-(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
# else
	movq	-(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
# endif
	vmovq	%xmm0, (%rdi)
	movq	%rcx, -(8 - CHAR_SIZE)(%END_REG)
	ret
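
	/* More than one VEC of CHARs: rounding rsi up to one byte below
	   the next VEC_SIZE boundary (orq below) makes every +1
	   displaced load aligned, so it cannot fault past the
	   terminator.  */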

	.p2align 4,, 8
L(more_1x_vec):
# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rdi)
# endif
	subq	%rsi, %rdi
	orq	$(VEC_SIZE - 1), %rsi
	addq	%rsi, %rdi
	VMOVA	1(%rsi), %VMM(1)

	/* Try and order stores after as many loads as is reasonable to
	   avoid potential false dependencies.  */
# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	VMOVU	%VMM(0), (%rax)
# endif
	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)

	VMOVA	(VEC_SIZE + 1)(%rsi), %VMM(2)
	VMOVU	%VMM(1), 1(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)

	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE + 1)(%rdi)

	VPCMPEQ	%VMM(3), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)

	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
	VPCMPEQ	%VMM(4), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %edx
	testl	%edx, %edx
	jnz	L(ret_vec_x4)

	VMOVU	%VMM(4), (VEC_SIZE * 3 + 1)(%rdi)

	/* Subtract rsi from rdi before aligning. Adding back rsi will
	   get proper rdi (dst) for new src.  */
	subq	%rsi, %rdi
	incq	%rsi
	orq	$(VEC_SIZE * 4 - 1), %rsi
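
	/* rsi now sits one byte below a (VEC_SIZE * 4)-aligned boundary,
	   so the +1 displaced loads below are all VEC_SIZE-aligned.  */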

	/* Do first half of loop ahead of time so loop can just start by
	   storing.  */
	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %edx
	addq	%rsi, %rdi

	testl	%edx, %edx
	jnz	L(loop_4x_done)

	.p2align 4,, 11
L(loop_4x_vec):
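	/* Main 4x unrolled loop: store the four VECs already checked in
	   the previous iteration, load the next four, and fold them with
	   VPMIN so a single compare against zero tests all four at once
	   (the unsigned minimum is zero iff some CHAR is zero).  */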
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
	subq	$(VEC_SIZE * -4), %rsi
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
	VMOVU	%VMM(3), (VEC_SIZE * 3 + 1)(%rdi)

	VMOVA	(VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 1)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPMIN	%VMM(4), %VMM(6), %VMM(6)
	VPCMPEQ	%VMM(6), %VZERO, %VMM(6)

	vpmovmskb %VMM(6), %edx
	subq	$(VEC_SIZE * -4), %rdi
	testl	%edx, %edx
	jz	L(loop_4x_vec)

L(loop_4x_done):
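	/* One of the four pending VECs contains the null terminator;
	   re-check them individually, storing each full VEC that
	   precedes the match.  */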
	VPCMPEQ	%VMM(0), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x1)
	VMOVU	%VMM(0), (VEC_SIZE * 0 + 1)(%rdi)

	VPCMPEQ	%VMM(1), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x2)
	VMOVU	%VMM(1), (VEC_SIZE * 1 + 1)(%rdi)

	VPCMPEQ	%VMM(2), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	testl	%ecx, %ecx
	jnz	L(ret_vec_x3)
	VMOVU	%VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
L(ret_vec_x4):
	bsfl	%edx, %edx
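	/* Copy the final partial VEC with a load/store pair that ends
	   just past the null terminator, overlapping data already
	   stored.  */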
	VMOVU	((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x1):
	bsfl	%ecx, %ecx
	VMOVU	(1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	1(%rcx, %rdi), %rax
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4,, 8
L(ret_vec_x2):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 8
L(ret_vec_x3):
	bsfl	%ecx, %ecx
	VMOVU	((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
	VMOVU	%VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
	leaq	(VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4,, 4
L(page_cross):
	movq	%rsi, %rcx
	andq	$(VEC_SIZE * -1), %rcx
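
	/* Check the containing aligned VEC for a null, then shift the
	   mask right by the source's misalignment so bit 0 lines up
	   with the first real CHAR at rsi.  */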
	VPCMPEQ	(%rcx), %VZERO, %VMM(6)
	vpmovmskb %VMM(6), %ecx
	shrxl	%esi, %ecx, %ecx
# if USE_MOVSB_IN_PAGE_CROSS
	/* Optimizing more aggressively for space as this is very cold
	   code. This saves 2x cache lines.  */

	/* This adds once to the later result which will get correct
	   copy bounds. NB: this can never zero-out a non-zero RCX as
	   to be in the page cross case rsi cannot be aligned and we
	   already right-shift rcx by the misalignment.  */
	shll	$CHAR_SIZE, %ecx
	jz	L(page_cross_continue)
	bsfl	%ecx, %ecx
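	/* ecx is now the copy length in bytes including the null
	   terminator; rep movsb keeps this cold path small.  */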
#  if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif
	rep	movsb
#  ifdef USE_AS_STPCPY
	leaq	-CHAR_SIZE(%rdi), %rax
#  endif

	VZEROUPPER_RETURN

# else
	testl	%ecx, %ecx
	jz	L(page_cross_continue)

	/* Traditional copy case, essentially same as used in non-page-
	   cross case but since we can't reuse VMM(0) we need twice as
	   many loads from rsi.  */
#  ifndef USE_AS_STRCAT
	xorl	%edx, %edx
#  endif
	bsfl	%ecx, %edx
#  ifdef USE_AS_STPCPY
	leaq	(%rdi, %rdx), %rax
#  elif !defined USE_AS_STRCAT
	movq	%rdi, %rax
#  endif

	/* vzeroupper early to avoid duplicating at each return.  */
	vzeroupper

	testw	%cx, %cx
	jz	L(page_cross_copy_16_31)

	testb	%cl, %cl
	jz	L(page_cross_copy_8_15)

	testb	$0x7, %cl
	jz	L(page_cross_copy_4_7)

	testl	%edx, %edx
	jz	L(page_cross_set_null_term)
	movzwl	(%rsi), %ecx
	movw	%cx, (%rdi)
L(page_cross_set_null_term):
	movb	$0, (%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_4_7):
	movl	(%rsi), %ecx
	movl	-3(%rsi, %rdx), %esi
	movl	%ecx, (%rdi)
	movl	%esi, -3(%END_REG)
	ret

	.p2align 4,, 4
L(page_cross_copy_8_15):
	movq	(%rsi), %rcx
	movq	-7(%rsi, %rdx), %rsi
	movq	%rcx, (%rdi)
	movq	%rsi, -7(%END_REG)
	ret

	.p2align 4,, 3
L(page_cross_copy_16_31):
	VMOVU	(%rsi), %xmm0
	VMOVU	-15(%rsi, %rdx), %xmm1
	VMOVU	%xmm0, (%rdi)
	VMOVU	%xmm1, -15(%END_REG)
	ret
# endif

END(STRCPY)
#endif