1 /* strrchr/wcsrchr optimized with AVX2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
/* NOTE(review): this listing is an elided excerpt — the embedded original
   line numbers have gaps, so many instructions (the ENTRY/END markers, the
   `test`/`tzcnt`/`lzcnt` instructions that set flags before the conditional
   jumps below, and several labels) are missing from view.  The comments
   added here describe only the instructions that are visible; do not treat
   this fragment as assemblable on its own.  */
21 #if ISA_SHOULD_BUILD (3)
26 # define STRRCHR __strrchr_avx2
/* For wcsrchr, operate on 32-bit (dword) elements; for strrchr, on bytes.  */
29 # ifdef USE_AS_WCSRCHR
30 # define VPBROADCAST vpbroadcastd
31 # define VPCMPEQ vpcmpeqd
32 # define VPMIN vpminud
35 # define VPBROADCAST vpbroadcastb
36 # define VPCMPEQ vpcmpeqb
37 # define VPMIN vpminub
42 # define VZEROUPPER vzeroupper
46 # define SECTION(p) p##.avx
50 # define PAGE_SIZE 4096
52 .section SECTION(.text), "ax", @progbits
/* Register roles for the whole function: %ymm7 = search CHAR broadcast to
   every lane; %ymm0 = all-zero, used to detect the null terminator.  */
56 /* Broadcast CHAR to YMM7. */
57 VPBROADCAST %xmm7, %ymm7
58 vpxor %xmm0, %xmm0, %xmm0
60 /* Shift here instead of `andl` to save code size (saves a fetch
/* Compares the page offset of the string pointer so the first full-VEC
   unaligned load cannot cross a page boundary.  */
63 cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
66 L(page_cross_continue):
68 /* Check end of string match. */
69 VPCMPEQ %ymm1, %ymm0, %ymm6
74 /* Only check match with search CHAR if needed. */
75 VPCMPEQ %ymm1, %ymm7, %ymm1
77 /* Check if match before first zero. */
83 /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
84 search CHAR is zero we are correct. Either way `andq
85 -CHAR_SIZE, %rax` gets the correct result. */
86 # ifdef USE_AS_WCSRCHR
87 andq $-CHAR_SIZE, %rax
91 ZERO_UPPER_VEC_REGISTERS_RETURN
93 /* Returns for first vec x1/x2 have hard coded backward search
94 path for earlier matches. */
97 VPCMPEQ %ymm2, %ymm7, %ymm6
101 jnz L(first_vec_x1_return)
104 L(first_vec_x0_test):
105 VPCMPEQ %ymm1, %ymm7, %ymm6
106 vpmovmskb %ymm6, %eax
111 # ifdef USE_AS_WCSRCHR
112 andq $-CHAR_SIZE, %rax
118 L(first_vec_x0_x1_test):
119 VPCMPEQ %ymm2, %ymm7, %ymm6
120 vpmovmskb %ymm6, %eax
121 /* Check ymm2 for search CHAR match. If no match then check ymm1
124 jz L(first_vec_x0_test)
126 L(first_vec_x1_return):
/* +1 compensates for the earlier `orq $(VEC_SIZE - 1), %rdi` /
   1-offset loads used on this path — TODO confirm against elided lines.  */
128 leaq 1(%rdi, %rax), %rax
129 # ifdef USE_AS_WCSRCHR
130 andq $-CHAR_SIZE, %rax
137 VPCMPEQ %ymm3, %ymm7, %ymm6
138 vpmovmskb %ymm6, %eax
140 /* If no in-range search CHAR match in ymm3 then need to check
141 ymm1/ymm2 for an earlier match (we delay checking search
142 CHAR matches until needed). */
144 jz L(first_vec_x0_x1_test)
146 leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
147 # ifdef USE_AS_WCSRCHR
148 andq $-CHAR_SIZE, %rax
155 /* Save original pointer if match was in VEC 0. */
159 orq $(VEC_SIZE - 1), %rdi
160 vmovdqu 1(%rdi), %ymm2
161 VPCMPEQ %ymm2, %ymm0, %ymm6
162 vpmovmskb %ymm6, %ecx
166 vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
167 VPCMPEQ %ymm3, %ymm0, %ymm6
168 vpmovmskb %ymm6, %ecx
172 /* Save pointer again before realigning. */
174 addq $(VEC_SIZE + 1), %rdi
175 andq $-(VEC_SIZE * 2), %rdi
177 L(first_aligned_loop):
178 /* Do 2x VEC at a time. Any more and the cost of finding the
179 match outweighs loop benefit. */
180 vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
181 vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
/* ymm6/ymm10 = search-CHAR matches in VEC0/VEC1; ymm5 = their union.
   VPMIN trick: the unsigned per-lane minimum of the two vectors has a
   zero lane iff either input has one, so one compare of ymm8 against
   zero (ymm0) detects the null terminator in both VECs at once.  */
183 VPCMPEQ %ymm4, %ymm7, %ymm6
184 VPMIN %ymm4, %ymm5, %ymm8
185 VPCMPEQ %ymm5, %ymm7, %ymm10
186 vpor %ymm6, %ymm10, %ymm5
187 VPCMPEQ %ymm8, %ymm0, %ymm8
188 vpor %ymm5, %ymm8, %ymm9
190 vpmovmskb %ymm9, %eax
191 addq $(VEC_SIZE * 2), %rdi
192 /* No zero or search CHAR. */
194 jz L(first_aligned_loop)
196 /* If no zero CHAR then go to second loop (this allows us to
197 throw away all prior work). */
198 vpmovmskb %ymm8, %ecx
200 jz L(second_aligned_loop_prep)
202 /* Search char could be zero so we need to get the true match.
204 vpmovmskb %ymm5, %eax
206 jnz L(first_aligned_loop_return)
206 /* fall through: only the terminator matched here, so the answer is
   in the earlier VECs (ymm2/ymm3) checked below.  */
209 L(first_vec_x1_or_x2):
210 VPCMPEQ %ymm3, %ymm7, %ymm3
211 VPCMPEQ %ymm2, %ymm7, %ymm2
212 vpmovmskb %ymm3, %eax
213 vpmovmskb %ymm2, %edx
214 /* Use add for macro-fusion. */
216 jz L(first_vec_x0_test)
217 /* NB: We could move this shift to before the branch and save a
218 bit of code size / performance on the fall through. The
219 branch leads to the null case which generally seems hotter
220 than char in first 3x VEC. */
224 leaq 1(%rsi, %rax), %rax
225 # ifdef USE_AS_WCSRCHR
226 andq $-CHAR_SIZE, %rax
231 L(first_aligned_loop_return):
232 VPCMPEQ %ymm4, %ymm0, %ymm4
233 vpmovmskb %ymm4, %edx
237 vpmovmskb %ymm10, %eax
238 vpmovmskb %ymm6, %edx
243 jz L(first_vec_x1_or_x2)
246 leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
247 # ifdef USE_AS_WCSRCHR
248 andq $-CHAR_SIZE, %rax
252 /* Search char cannot be zero. */
254 L(second_aligned_loop_set_furthest_match):
255 /* Save VEC and pointer from most recent match. */
256 L(second_aligned_loop_prep):
/* ymm3 caches the match vector of the furthest match seen so far
   (%rsi presumably caches the matching pointer — elided; verify).  */
259 vmovdqu %ymm10, %ymm3
262 L(second_aligned_loop):
263 /* Search 2x VEC at a time. */
264 vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
265 vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
/* Same scheme as the first loop: ymm5 = CHAR matches in either VEC,
   ymm1 = zero-terminator detection via the VPMIN trick.  */
267 VPCMPEQ %ymm4, %ymm7, %ymm6
268 VPMIN %ymm4, %ymm5, %ymm1
269 VPCMPEQ %ymm5, %ymm7, %ymm10
270 vpor %ymm6, %ymm10, %ymm5
271 VPCMPEQ %ymm1, %ymm0, %ymm1
272 vpor %ymm5, %ymm1, %ymm9
274 vpmovmskb %ymm9, %eax
275 addq $(VEC_SIZE * 2), %rdi
277 jz L(second_aligned_loop)
278 vpmovmskb %ymm1, %ecx
280 jz L(second_aligned_loop_set_furthest_match)
281 vpmovmskb %ymm5, %eax
283 jnz L(return_new_match)
285 /* This is the hot path. We know CHAR is inbounds and that
286 ymm3/ymm2 have latest match. */
289 vpmovmskb %ymm3, %eax
290 vpmovmskb %ymm2, %edx
294 /* Search char cannot be zero so safe to just use lea for
296 leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
299 /* Last iteration also potentially has a match. */
302 VPCMPEQ %ymm4, %ymm0, %ymm4
303 vpmovmskb %ymm4, %edx
307 vpmovmskb %ymm10, %eax
308 vpmovmskb %ymm6, %edx
313 jz L(return_old_match)
315 /* Search char cannot be zero so safe to just use lea for
317 leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
/* Page-cross path: align the load down to VEC_SIZE so it stays within
   the page, then shift the result masks right by the misalignment
   (low bits of %edi) to discard lanes that precede the string start.  */
323 andq $-VEC_SIZE, %rsi
324 vmovdqu (%rsi), %ymm1
325 VPCMPEQ %ymm1, %ymm0, %ymm6
326 vpmovmskb %ymm6, %ecx
327 /* Shift out zero CHAR matches that are before the beginning of
329 shrxl %edi, %ecx, %ecx
331 jz L(page_cross_continue)
332 VPCMPEQ %ymm1, %ymm7, %ymm1
333 vpmovmskb %ymm1, %eax
335 /* Shift out search CHAR matches that are before the beginning of
337 shrxl %edi, %eax, %eax
339 /* Check if any search CHAR match in range. */
344 # ifdef USE_AS_WCSRCHR
345 andq $-CHAR_SIZE, %rax