/* {wcs|str}ncat with 256/512-bit EVEX.
   Copyright (C) 2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)
/* Use evex-masked stores for small sizes.  Turned off at the
   moment.  */
# define USE_EVEX_MASKED_STORE 0
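/* NB: an EVEX masked store such as `VMOVU_MASK %VMM(0), (%rdi){%k1}'
   writes only the CHARs selected by the mask register, so a copy
   shorter than VEC_SIZE needs no scalar tail loop.  */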
# include "x86-evex256-vecs.h"

# define STRNCAT __strncat_evex

# define VMOVU_MASK vmovdqu32
# define VPMIN vpminud
# define VPTESTN vptestnmd
# define VPTEST vptestmd
# define VPCMPEQ vpcmpeqd
# define REP_MOVS rep movsd
# define VMASK_REG VR10
# define FIND_FIRST_ONE(src, dst) movl $CHAR_PER_VEC, %dst; bsf %src, %dst
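/* NB: bsf leaves its destination unchanged when the source is zero
   (documented behavior on AMD, de facto on Intel), so preloading
   CHAR_PER_VEC into %dst makes FIND_FIRST_ONE yield CHAR_PER_VEC
   when no zero-CHAR bit is set.  */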
# define USE_WIDE_CHAR

# define VMOVU_MASK vmovdqu8
# define VPMIN vpminub
# define VPTESTN vptestnmb
# define VPTEST vptestmb
# define VPCMPEQ vpcmpeqb
# define REP_MOVS rep movsb
# define VMASK_REG VRCX
# define FIND_FIRST_ONE(src, dst) tzcnt %src, %dst
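/* NB: tzcnt of a zero source yields the operand width in bits, which
   equals CHAR_PER_VEC for the byte variant, so both FIND_FIRST_ONE
   definitions agree on the no-match value.  */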
# include "strncpy-or-cat-overflow-def.h"

# include "reg-macros.h"

# define VZERO_128 VMM_128(7)

# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

	.section SECTION(.text), "ax", @progbits
	/* NB: It's safe to filter out zero-length strings WITHOUT
	   setting the null terminator.  The destination MUST already be
	   a null-terminated string, so essentially the work is already
	   done.  */
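	/* For example, `strncat (dst, src, 0)' appends nothing, and dst
	   already ends in a null-CHAR, so returning immediately is
	   correct.  */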
# include "strcat-strlen-evex.h.S"

	andl	$(PAGE_SIZE - 1), %ecx
	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
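	/* %ecx now holds the source's page offset; if it is above
	   PAGE_SIZE - VEC_SIZE, an unaligned VEC-sized load from %rsi
	   would touch the following page, so take the page-cross
	   path.  */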
L(page_cross_continue):
	VMOVU	(%rsi), %VMM(0)
	VPTESTN	%VMM(0), %VMM(0), %k0
	/* If USE_EVEX_MASKED_STORE is enabled then we just handle
	   lengths <= CHAR_PER_VEC with masked instructions (which have
	   potential for dramatically bad perf if dst splits a page and
	   is not in the TLB).  */
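	/* Such a masked store can take a slow microcode assist when the
	   destination splits a page whose second half is not resident
	   in the TLB, which is why USE_EVEX_MASKED_STORE defaults to 0
	   above.  */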
# if USE_EVEX_MASKED_STORE
	FIND_FIRST_ONE (VRCX, VR8)

	VMOVU_MASK %VMM(0), (%rdi){%k1}

	bzhi	%VRDX, %VRCX, %VRCX
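	/* bzhi zeroes the match bits at positions >= the length in
	   %VRDX, so a zero-CHAR beyond the length bound is ignored.  */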
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
	VMOVU_MASK %VMM(0), (%rdi){%k1}

	/* tzcnt for strncat and `movl $CHAR_PER_VEC, %VRCX; bsf
	   %VMASK_REG, %VRCX` for wcsncat.  */
	FIND_FIRST_ONE (VMASK_REG, VRCX)

	/* If there were no zero-CHARs (rcx was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl	$CHAR_PER_VEC, %ecx

	cmpl	$(32 / CHAR_SIZE), %edx

	cmpl	$(16 / CHAR_SIZE), %edx

	cmpl	$(8 / CHAR_SIZE), %edx

# ifdef USE_AS_WCSCPY
	vmovd	%VMM_128(0), (%rdi)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)

	MOVCHAR	$0, (%rdi, %rdx)

	VMOVU	-(32)(%rsi, %rdx, CHAR_SIZE), %VMM_256(1)
	VMOVU	%VMM_256(0), (%rdi)
	VMOVU	%VMM_256(1), -(32)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)

	/* Use xmm1 explicitly here as it won't require a `vzeroupper'
	   and will save code size.  */
	vmovdqu	-(16)(%rsi, %rdx, CHAR_SIZE), %xmm1
	VMOVU	%VMM_128(0), (%rdi)
	vmovdqu	%xmm1, -(16)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)
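	/* (A VEX.128 load like the vmovdqu above zeroes the upper bits
	   of the full vector register, so no dirty upper YMM state is
	   left that would require a vzeroupper before returning.)  */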
	movq	-(8)(%rsi, %rdx, CHAR_SIZE), %rcx
	vmovq	%VMM_128(0), (%rdi)
	movq	%rcx, -(8)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)

# ifndef USE_AS_WCSCPY
	movl	-(4)(%rsi, %rdx, CHAR_SIZE), %ecx
	vmovd	%VMM_128(0), (%rdi)
	movl	%ecx, -(4)(%rdi, %rdx, CHAR_SIZE)
	MOVCHAR	$0, (%rdi, %rdx, CHAR_SIZE)

# ifdef USE_AS_WCSCPY

	VMOVU	%VMM(0), (%rdi)
	/* We are going to align rsi here so we will need to be able to
	   re-adjust rdi/rdx afterwards.  NB: We filtered out huge
	   lengths so rsi + rdx * CHAR_SIZE cannot overflow.  */
	leaq	(VEC_SIZE * -1)(%rsi, %rdx, CHAR_SIZE), %rdx
	andq	$-(VEC_SIZE), %rsi
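	/* %rdx is now a pointer one VEC before the end of the copy
	   rather than a length, so it stays valid after %rsi is rounded
	   down to a VEC boundary.  */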
# ifdef USE_AS_WCSCPY

	/* Will need this regardless.  */
	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0

	cmpq	$(CHAR_PER_VEC * 2), %rdx

	FIND_FIRST_ONE (VMASK_REG, VRCX)
	jbe	L(ret_vec_x1_len)

	/* If there were no zero-CHARs (rcx was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl	$CHAR_PER_VEC, %ecx
	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0

	addl	$-CHAR_PER_VEC, %edx
	bzhi	%VRDX, %VRCX, %VR8

	VMOVU	(VEC_SIZE * 2 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE * 2 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)

	VMOVU	(VEC_SIZE -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)

	addl	$-(CHAR_PER_VEC * 4), %edx
	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(1)
	VPTESTN	%VMM(1), %VMM(1), %k0

	subq	$-(VEC_SIZE * 4), %rsi
	subq	$-(VEC_SIZE * 4), %rdi
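	/* `subq $-(VEC_SIZE * 4)' instead of `addq $(VEC_SIZE * 4)':
	   with VEC_SIZE == 32 the constant -128 fits in a sign-extended
	   imm8 while +128 would need a 4-byte immediate, saving code
	   size.  */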
	cmpl	$(CHAR_PER_VEC * 2), %edx

# ifdef USE_AS_WCSCPY

	bsf	%VMASK_REG, %VRCX

	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(2)
	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
	VPTESTN	%VMM(2), %VMM(2), %k0

	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(3)
	VMOVU	%VMM(2), (VEC_SIZE * 2)(%rdi)
	VPTESTN	%VMM(3), %VMM(3), %k0

	cmpq	$(CHAR_PER_VEC * 4), %rdx
	/* Adjust length before going to L(ret_vec_x3_len) or
	   L(ret_vec_x3).  */
	addl	$(CHAR_PER_VEC * -2), %edx

	FIND_FIRST_ONE (VMASK_REG, VRCX)
	jbe	L(ret_vec_x3_len)
	/* If there were no zero-CHARs (rcx was zero before
	   FIND_FIRST_ONE), then ecx will be $CHAR_PER_VEC.  */
	cmpl	$CHAR_PER_VEC, %ecx

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0

	addl	$-CHAR_PER_VEC, %edx
	bzhi	%VRDX, %VRCX, %VR8

	VMOVU	(VEC_SIZE * 4 -(VEC_SIZE))(%rsi, %rdx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE * 4 -(VEC_SIZE))(%rdi, %rdx, CHAR_SIZE)

	VMOVU	(VEC_SIZE * 3 -(VEC_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	MOVCHAR	$0, (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE)
	VMOVU	%VMM(0), (VEC_SIZE * 3 -(VEC_SIZE))(%rdi, %rcx, CHAR_SIZE)
# ifdef USE_AS_WCSCPY

	bsf	%VMASK_REG, %VRCX

	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(4)
	VMOVU	%VMM(3), (VEC_SIZE * 3)(%rdi)
	VPTESTN	%VMM(4), %VMM(4), %k0

	VMOVU	%VMM(4), (VEC_SIZE * 4)(%rdi)

	/* Check if we are near the end before aligning.  */
	cmpq	$(CHAR_PER_VEC * 8), %rdx

	/* Add rsi to rdx (length) before aligning rsi.  NB: Since we
	   filtered out huge lengths this cannot overflow.  */
# ifdef USE_AS_WCSCPY
	leaq	(%rsi, %rdx, CHAR_SIZE), %rdx
	/* Subtract rsi from rdi before aligning (adding the aligned rsi
	   back then yields the correct rdi).  */
	subq	$-(VEC_SIZE * 5), %rsi
	andq	$(VEC_SIZE * -4), %rsi
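	/* %rdi now holds (dst - src), so the loop stores below address
	   the destination as (%rdi, %rsi) and only %rsi has to be
	   advanced each iteration.  */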
	/* Load first half of the loop before entry.  */
	VMOVA	(VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4
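	/* VPMIN of two VECs has a zero CHAR exactly where either input
	   does, so two VPTESTNs cover all four VECs when checking for
	   the zero-CHAR terminator.  */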
	/* Offset rsi by VEC_SIZE so that we can jump to
	   L(loop_last_4x_vec).  */
	addq	$-(VEC_SIZE), %rsi

	/* Store loop end in r9.  */
	leaq	-(VEC_SIZE * 5)(%rdx), %r9

	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi, %rsi)
	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi, %rsi)
	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi, %rsi)
	VMOVU	%VMM(3), (VEC_SIZE * 4 + 0)(%rdi, %rsi)

	subq	$(VEC_SIZE * -4), %rsi

	jbe	L(loop_last_4x_vec)

	VMOVA	(VEC_SIZE * 1 + 0)(%rsi), %VMM(0)
	VMOVA	(VEC_SIZE * 2 + 0)(%rsi), %VMM(1)
	VMOVA	(VEC_SIZE * 3 + 0)(%rsi), %VMM(2)
	VMOVA	(VEC_SIZE * 4 + 0)(%rsi), %VMM(3)

	VPMIN	%VMM(0), %VMM(1), %VMM(4)
	VPMIN	%VMM(2), %VMM(3), %VMM(6)
	VPTESTN	%VMM(4), %VMM(4), %k2
	VPTESTN	%VMM(6), %VMM(6), %k4

	VPTESTN	%VMM(0), %VMM(0), %k0

	/* Restore rdi (dst).  */
	/* L(ret_vec_x1) expects rcx to hold the position of the
	   zero-CHAR, so compute it with bsf.  */
	VMOVU	%VMM(0), (VEC_SIZE * 1 + 0)(%rdi)

	VMOVU	%VMM(1), (VEC_SIZE * 2 + 0)(%rdi)

	VPTESTN	%VMM(2), %VMM(2), %k0

	VMOVU	%VMM(2), (VEC_SIZE * 3 + 0)(%rdi)

	VMOVU	((VEC_SIZE * 4)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx, CHAR_SIZE), %VMM(0)
	VMOVU	%VMM(0), ((VEC_SIZE * 4 + 0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx, CHAR_SIZE)

	andq	$(VEC_SIZE * -1), %r8
	VPCMPEQ	(%r8), %VZERO, %k0

# ifdef USE_AS_WCSCPY

	andl	$(CHAR_PER_VEC - 1), %ecx
	shrx	%VRCX, %VR9, %VRCX

	shrx	%VRSI, %VRCX, %VRCX
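	/* The shrx discards mask bits for CHARs that lie before the
	   misaligned start of the source within the aligned VEC, so
	   only real matches remain.  */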
	andl	$(VEC_SIZE - 1), %r8d
# ifdef USE_AS_WCSCPY

	jbe	L(page_cross_small)
	/* Optimizing more for space as this is very cold code.  This
	   saves 2x cache lines.  */

	/* This adds one to the later result, which will give the
	   correct copy bounds.  NB: this can never zero out a non-zero
	   RCX: to be in the page-cross case rsi cannot be aligned, and
	   we have already right-shifted rcx by the misalignment.  */
	jz	L(page_cross_continue)

	jz	L(page_cross_setz)

# ifdef USE_AS_WCSCPY