1 /* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
2 Copyright (C) 2022-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
/* NOTE(review): this listing is a sampled excerpt — the leading number
   on every line below is the original source line number, and gaps in
   that numbering mean lines are missing from this view (including
   several #ifdef/#else/#endif guards), so the text does not
   preprocess/assemble as-is.  Code left byte-identical; only review
   comments added.  */
/* Build this variant only at ISA level 4 (EVEX-capable targets).  */
22 #if ISA_SHOULD_BUILD (4)
25 # include "x86-evex256-vecs.h"
/* Exported symbol name for this ifunc variant.  */
30 # define STRNLEN __strnlen_evex
/* Dword (wchar_t) flavors of the compare/test/min primitives.
   Presumably selected by a missing "#ifdef USE_AS_WCSLEN" guard —
   TODO confirm against the full source (the numbering jumps 30→34
   and 38→42 around the two groups).  */
34 # define VPCMPEQ vpcmpeqd
35 # define VPCMPNEQ vpcmpneqd
36 # define VPTESTN vptestnmd
37 # define VPTEST vptestmd
38 # define VPMINU vpminud
/* Byte (char) flavors — presumably the matching "#else" branch.  */
42 # define VPCMPEQ vpcmpeqb
43 # define VPCMPNEQ vpcmpneqb
44 # define VPTESTN vptestnmb
45 # define VPTEST vptestmb
46 # define VPMINU vpminub
/* Width used by the mask/GPR helper macros below equals the vector
   byte width for this 256-bit build.  */
49 # define REG_WIDTH VEC_SIZE
/* Number of characters processed per vector register.  */
52 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
54 # include "reg-macros.h"
/* SUB_SHORT: when a counter can never exceed 32 chars/VEC an 8-bit
   immediate subtract suffices (smaller encoding); otherwise use the
   32-bit form.  The "#else"/"#endif" lines are not visible in this
   excerpt (numbering jumps 57→59).  */
56 # if CHAR_PER_VEC == 32
57 # define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8)
59 # define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32)
/* Character offset of the fall-through return path after the main
   loop: 3 VECs for the 64-chars-per-VEC (ZMM) build, 2 VECs for the
   32-chars-per-VEC (YMM) build.  "#else"/"#endif" again missing.  */
64 # if CHAR_PER_VEC == 64
65 # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
67 # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
/* 128-bit view of vector register 0; zeroing it via the 128-bit alias
   zeroes the full register without a longer EVEX encoding.  */
71 # define XZERO VMM_128(0)
73 # define PAGE_SIZE 4096
75 .section SECTION(.text), "ax", @progbits
/* size_t __strnlen_evex (const char *s, size_t maxlen)
   (dword/wcsnlen flavor when the d-suffix macros are selected).
   SysV AMD64: rdi = s (all loads below are rdi-relative),
   rsi = maxlen in characters (appears in the length arithmetic),
   result accumulated into rax.
   NOTE(review): the embedded numbers are original source line numbers;
   the gaps show that many instructions (zero-length check, kmov/bsf
   mask extraction, the actual returns, #else/#endif lines, comment
   terminators) are missing from this excerpt.  Comments below describe
   only the visible code and are hedged where context is absent.  */
76 ENTRY_P2ALIGN (STRNLEN, 6)
77 /* Check zero length. */
81 /* Clear the upper 32 bits. */
/* Zero VMM(0) via its 128-bit alias (XZERO) — this is the all-zero
   vector the VPCMPEQ null-char compares use.  The eax value tested
   next is presumably the low bits of the src pointer (the mov that
   produces it is among the missing lines — TODO confirm).  */
86 vpxorq %XZERO, %XZERO, %XZERO
87 andl $(PAGE_SIZE - 1), %eax
/* If the start address is within VEC_SIZE of the end of its page, an
   unaligned VEC_SIZE load could touch the next page and fault; take
   the cross-page slow path in that case.  */
88 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
89 ja L(cross_page_boundary)
91 /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
/* Compare the first (unaligned) vector against zero into mask k0.
   %VZERO is presumably an alias for the zeroed VMM(0) defined on a
   missing line — TODO confirm.  */
93 VPCMPEQ (%rdi), %VZERO, %k0
98 /* If src (rcx) is zero, bsf does not change the result. NB:
99 Must use 64-bit bsf here so that upper bits of len are not
102 /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
103 CHAR) and rsi must be > CHAR_PER_VEC. */
104 cmpq $CHAR_PER_VEC, %rax
106 /* Check if first match in bounds. */
112 # if CHAR_PER_VEC != 32
120 /* Aligned more for strnlen compares remaining length vs 2 *
121 CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
122 going to the loop. */
/* Entry point rejoined by the cross-page slow path once it is safe to
   continue with aligned full-vector loads.  */
125 L(cross_page_continue):
126 /* Compute number of words checked after aligning. */
127 # ifdef USE_AS_WCSLEN
128 /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
/* wcslen branch: align src down to VEC_SIZE, then fold alignment and
   maxlen into rax in character units (avoids a CHAR_SIZE * rsi
   overflow per the comment above).  The #else for the byte variant
   follows at original lines 136-137.  */
131 andq $(VEC_SIZE * -1), %rdi
134 leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
136 leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax
137 andq $(VEC_SIZE * -1), %rdi
/* Check the next aligned vector, then branch by remaining length
   (rax, in chars) toward the 2x-VEC tail or onward.  */
142 VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
144 cmpq $(CHAR_PER_VEC * 2), %rax
/* Tail: at most 2 vectors of data remain in bounds.  */
147 L(last_2x_vec_or_less):
150 jnz L(last_vec_check)
152 /* Check the end of data. */
/* 8-bit or 32-bit subtract depending on build (see SUB_SHORT).  */
153 SUB_SHORT (CHAR_PER_VEC, rax)
155 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
159 /* Best place for LAST_VEC_CHECK if ZMM. */
/* NOTE(review): sibling return paths use the explicit "leal" suffix
   (original lines 228/237/248/257); bare "lea" here assembles
   identically since GAS infers the 32-bit size from %eax, but it is
   inconsistent with the file's style.  */
164 lea (%rsi, %rdx), %eax
168 # if CHAR_PER_VEC == 32
/* Tail: at most 4 vectors remain.  Advance rdi by 4 VECs (subq of a
   negative immediate — smaller encoding than addq) and drop 4 VECs
   from the remaining-length counter.  */
177 L(last_4x_vec_or_less):
178 addl $(CHAR_PER_VEC * -4), %eax
179 VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
180 subq $(VEC_SIZE * -4), %rdi
181 cmpl $(CHAR_PER_VEC * 2), %eax
182 jbe L(last_2x_vec_or_less)
186 /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
187 rechecking bounds. */
194 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
199 cmpq $(CHAR_PER_VEC * 4), %rax
203 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
205 addl $(CHAR_PER_VEC * -2), %eax
207 jnz L(last_vec_check)
209 subl $(CHAR_PER_VEC), %eax
212 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
216 jnz L(last_vec_check)
224 /* If VEC_SIZE == 64 we can fit logic for full return label in
225 spare bytes before next cache line. */
/* Return paths: length = chars already counted (rsi) + index of the
   null char within the matching vector (rdx, presumably set by a
   missing tzcnt/bsf — TODO confirm), plus the vector offset.  */
228 leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
232 addl $CHAR_PER_VEC, %esi
237 leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
244 /* If VEC_SIZE == 64 we can fit logic for full return label in
245 spare bytes before next cache line. */
248 leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
252 addl $CHAR_PER_VEC, %esi
257 leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
262 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
267 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
272 /* Check if at last VEC_SIZE * 4 length before aligning for the
274 cmpq $(CHAR_PER_VEC * 8), %rax
275 jbe L(last_4x_vec_or_less)
278 /* Compute number of words checked after aligning. */
279 # ifdef USE_AS_WCSLEN
280 /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
282 leaq (VEC_SIZE * -3)(%rdi), %rdx
284 leaq (VEC_SIZE * -3)(%rdi, %rax), %rax
287 subq $(VEC_SIZE * -1), %rdi
289 /* Align data to VEC_SIZE * 4. */
291 /* Saves code size. No evex512 processor has partial register
292 stalls. If that change this can be replaced with `andq
293 $-(VEC_SIZE * 4), %rdi`. */
296 andq $-(VEC_SIZE * 4), %rdi
299 # ifdef USE_AS_WCSLEN
306 /* Compare 4 * VEC at a time forward. */
/* Main loop body: load VEC1 and VEC3, VPMINU each with the following
   vector so that a zero character in EITHER of the pair makes the min
   zero; VPTESTN (test-not-mask) then sets k0/k2 bits exactly where
   the combined pair has a zero — 4 vectors scanned with 2 mask
   tests.  Loop-control branches fall in the missing lines.  */
309 VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1)
310 VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
311 VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3)
312 VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
313 VPTESTN %VMM(2), %VMM(2), %k0
314 VPTESTN %VMM(4), %VMM(4), %k2
315 subq $-(VEC_SIZE * 4), %rdi
316 /* Break if at end of length. */
317 subq $(CHAR_PER_VEC * 4), %rax
/* After the loop: retest the individual (un-minimized) vectors to
   locate which of the 4 holds the first zero.  */
328 VPTESTN %VMM(1), %VMM(1), %k1
337 VPTESTN %VMM(3), %VMM(3), %k0
339 /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
340 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
341 individually, for VEC_SIZE == 32 we combine them in a single
343 # if CHAR_PER_VEC == 64
349 /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
/* YMM build: shift the 2nd mask up by CHAR_PER_VEC (<= 32) so both
   32-bit masks fit in one 64-bit GPR and a single bit-scan finds the
   first zero across both vectors.  */
353 salq $CHAR_PER_VEC, %rdx
357 /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
360 leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
365 /* Handle last 4x VEC after loop. All VECs have been loaded. */
369 jnz L(loop_last_4x_vec)
374 # if CHAR_PER_VEC == 64
375 /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
376 need return label for it. */
380 leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
389 addq $CHAR_PER_VEC, %rsi
392 leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
/* Slow path for a start address within VEC_SIZE of a page end: align
   the pointer (rcx, presumably a copy of rdi made on a missing line —
   TODO confirm) DOWN to VEC_SIZE so the full-vector load stays inside
   the page, compare the whole aligned vector, then shift the result
   mask right by the misalignment (shrx below) to discard the bits
   that precede the real start of the string.  */
399 L(cross_page_boundary):
400 /* Align data to VEC_SIZE. */
402 andq $-VEC_SIZE, %rcx
403 VPCMPEQ (%rcx), %VZERO, %k0
406 # ifdef USE_AS_WCSLEN
/* Misalignment in characters (wcslen branch; byte-variant twin of
   this andl appears at original line 413).  */
408 andl $(CHAR_PER_VEC - 1), %eax
410 shrx %VRAX, %VRCX, %VRCX
413 andl $(CHAR_PER_VEC - 1), %eax
417 ja L(cross_page_continue)