1 /* strlen optimized with SSE2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
21 /* ISA level >= 2 for both strlen and wcslen. wcslen uses `pminud`
22 which is SSE4.1. strlen doesn't have an ISA level == 2
23 implementation so the SSE2 implementation must be built with ISA
25 # if ISA_SHOULD_BUILD (2)
30 # define STRLEN __strlen_sse2
35 # define PCMPEQ pcmpeqd
36 # define SHIFT_RETURN shrq $2, %rax
39 # define PCMPEQ pcmpeqb
47 /* Long-lived registers in strlen(s), strnlen(s, n) are:
51 %r10 (s+n) & (~(64-1))
56 .section SECTION(.text),"ax",@progbits
59 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
61 PCMPEQ (%rax), %xmm0; \
62 PCMPEQ 16(%rax), %xmm1; \
63 PCMPEQ 32(%rax), %xmm2; \
64 PCMPEQ 48(%rax), %xmm3; \
65 pmovmskb %xmm0, %esi; \
66 pmovmskb %xmm1, %edx; \
67 pmovmskb %xmm2, %r8d; \
68 pmovmskb %xmm3, %ecx; \
77 /* Do not read anything when n==0. */
84 /* Check for overflow from maxlen * sizeof(wchar_t). If it would
85 overflow the only way this program doesn't have undefined behavior
86 is if there is a null terminator in valid memory so wcslen will
94 /* Initialize long lived registers. */
108 /* Offsets 4032-4047 will be aligned down to 4032 and thus fit into the page. */
110 /* We cannot unify this branching as it would be ~6 cycles slower. */
114 /* Test if end is among first 64 bytes. */
115 # define STRNLEN_PROLOG \
122 # define STRNLEN_PROLOG andq $-64, %rax;
125 /* Ignore bits in mask that come before start of string. */
126 # define PROLOG(lab) \
141 /* Test first 16 bytes unaligned. */
147 bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
152 /* Same as FIND_ZERO except we do not check first 16 bytes. */
154 PCMPEQ 16(%rax), %xmm1
155 PCMPEQ 32(%rax), %xmm2
156 PCMPEQ 48(%rax), %xmm3
167 /* When no zero byte is found xmm1-3 are zero so we do not have to
178 /* We must do this check to correctly handle strnlen (s, -1). */
202 PMINU 16(%rax), %xmm0
203 PMINU 32(%rax), %xmm0
204 PMINU 48(%rax), %xmm0
214 je L(first) /* Do not read when end is at page boundary. */
239 /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
243 movdqa 64(%rax), %xmm0
244 PMINU 80(%rax), %xmm0
245 PMINU 96(%rax), %xmm0
246 PMINU 112(%rax), %xmm0
255 PMINU 16(%rax), %xmm0
256 PMINU 32(%rax), %xmm0
257 PMINU 48(%rax), %xmm0