1 /* memchr/wmemchr optimized with AVX2.
2 Copyright (C) 2017-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
22 #if ISA_SHOULD_BUILD (3)
25 # define MEMCHR __memchr_avx2
28 # ifdef USE_AS_WMEMCHR
29 # define VPCMPEQ vpcmpeqd
30 # define VPBROADCAST vpbroadcastd
33 # define VPCMPEQ vpcmpeqb
34 # define VPBROADCAST vpbroadcastb
38 # ifdef USE_AS_RAWMEMCHR
39 # define ERAW_PTR_REG ecx
40 # define RRAW_PTR_REG rcx
41 # define ALGN_PTR_REG rdi
43 # define ERAW_PTR_REG edi
44 # define RRAW_PTR_REG rdi
45 # define ALGN_PTR_REG rcx
49 # define VZEROUPPER vzeroupper
53 # define SECTION(p) p##.avx
57 # define PAGE_SIZE 4096
58 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
60 .section SECTION(.text),"ax",@progbits
61 ENTRY_P2ALIGN (MEMCHR, 5)
62 # ifndef USE_AS_RAWMEMCHR
63 /* Check for zero length. */
65 /* Clear upper bits. */
72 /* Broadcast CHAR to YMMMATCH. */
74 VPBROADCAST %xmm0, %ymm0
75 /* Check if we may cross page boundary with one vector load. */
77 andl $(PAGE_SIZE - 1), %eax
78 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
79 ja L(cross_page_boundary)
81 /* Check the first VEC_SIZE bytes. */
82 VPCMPEQ (%rdi), %ymm0, %ymm1
84 # ifndef USE_AS_RAWMEMCHR
85 /* If length < CHAR_PER_VEC handle special. */
86 cmpq $CHAR_PER_VEC, %rdx
94 ZERO_UPPER_VEC_REGISTERS_RETURN
97 # ifndef USE_AS_RAWMEMCHR
100 /* Check if first match was before length. */
102 # ifdef USE_AS_WMEMCHR
103 /* NB: Multiply length by 4 to get byte count. */
107 /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch
108 block. Branching here as opposed to cmovcc is not that costly. Common
109 usage of memchr is to check if the return was NULL (if string was
110 known to contain CHAR user would use rawmemchr). This branch will be
111 highly correlated with the user branch and can be used by most
112 modern branch predictors to predict the user branch. */
125 # ifndef USE_AS_RAWMEMCHR
126 /* First in aligning bytes here. */
134 addq $(VEC_SIZE + 1), %rdi
141 addq $(VEC_SIZE * 2 + 1), %rdi
149 addq $(VEC_SIZE * 3 + 1), %rdi
155 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
156 since data is only aligned to VEC_SIZE. */
158 # ifndef USE_AS_RAWMEMCHR
159 L(cross_page_continue):
160 /* Align data to VEC_SIZE - 1. */
163 orq $(VEC_SIZE - 1), %rdi
164 /* esi is for adjusting length to see if near the end. */
165 leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
166 # ifdef USE_AS_WMEMCHR
167 /* NB: Divide bytes by 4 to get the wchar_t count. */
171 orq $(VEC_SIZE - 1), %rdi
172 L(cross_page_continue):
174 /* Load first VEC regardless. */
175 VPCMPEQ 1(%rdi), %ymm0, %ymm1
176 vpmovmskb %ymm1, %eax
177 # ifndef USE_AS_RAWMEMCHR
178 /* Adjust length. If near end handle specially. */
180 jbe L(last_4x_vec_or_less)
185 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
186 vpmovmskb %ymm1, %eax
190 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
191 vpmovmskb %ymm1, %eax
195 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
196 vpmovmskb %ymm1, %eax
200 # ifndef USE_AS_RAWMEMCHR
201 /* Check if at last VEC_SIZE * 4 length. */
202 subq $(CHAR_PER_VEC * 4), %rdx
203 jbe L(last_4x_vec_or_less_cmpeq)
204 /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust length. */
208 orq $(VEC_SIZE * 4 - 1), %rdi
209 andl $(VEC_SIZE * 4 - 1), %ecx
210 # ifdef USE_AS_WMEMCHR
211 /* NB: Divide bytes by 4 to get the wchar_t count. */
216 /* Align data to VEC_SIZE * 4 - 1 for loop. */
218 orq $(VEC_SIZE * 4 - 1), %rdi
221 /* Compare 4 * VEC at a time forward. */
224 VPCMPEQ 1(%rdi), %ymm0, %ymm1
225 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
226 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
227 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
228 vpor %ymm1, %ymm2, %ymm5
229 vpor %ymm3, %ymm4, %ymm6
230 vpor %ymm5, %ymm6, %ymm5
232 vpmovmskb %ymm5, %ecx
233 # ifdef USE_AS_RAWMEMCHR
234 subq $-(VEC_SIZE * 4), %rdi
239 jnz L(loop_4x_vec_end)
241 subq $-(VEC_SIZE * 4), %rdi
243 subq $(CHAR_PER_VEC * 4), %rdx
246 /* Fall through into less than 4 remaining vectors of length case. */
248 VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
249 vpmovmskb %ymm1, %eax
251 L(last_4x_vec_or_less):
252 # ifdef USE_AS_WMEMCHR
253 /* NB: Multiply length by 4 to get byte count. */
256 /* Check if first VEC contained match. */
258 jnz L(first_vec_x1_check)
260 /* If remaining length > VEC_SIZE * 2. */
261 addl $(VEC_SIZE * 2), %edx
265 /* If remaining length < VEC_SIZE. */
269 /* Check VEC2 and compare any match with remaining length. */
270 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
271 vpmovmskb %ymm1, %eax
275 addq $(VEC_SIZE + 1), %rdi
283 /* rawmemchr will fall through into this if match was found in the loop. */
286 vpmovmskb %ymm1, %eax
288 jnz L(last_vec_x1_return)
290 vpmovmskb %ymm2, %eax
292 jnz L(last_vec_x2_return)
294 vpmovmskb %ymm3, %eax
295 /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
299 # ifdef USE_AS_RAWMEMCHR
300 subq $(VEC_SIZE * 2 - 1), %rdi
302 subq $-(VEC_SIZE * 2 + 1), %rdi
306 # ifndef USE_AS_RAWMEMCHR
309 L(first_vec_x1_check):
312 subl $-(VEC_SIZE * 4), %edx
313 /* Check if match within remaining length. */
326 L(last_vec_x1_return):
328 # ifdef USE_AS_RAWMEMCHR
329 subq $(VEC_SIZE * 4 - 1), %rdi
337 L(last_vec_x2_return):
339 # ifdef USE_AS_RAWMEMCHR
340 subq $(VEC_SIZE * 3 - 1), %rdi
342 subq $-(VEC_SIZE + 1), %rdi
347 # ifndef USE_AS_RAWMEMCHR
349 L(last_4x_vec_or_less_cmpeq):
350 VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
351 vpmovmskb %ymm1, %eax
352 # ifdef USE_AS_WMEMCHR
353 /* NB: Multiply length by 4 to get byte count. */
356 subq $-(VEC_SIZE * 4), %rdi
357 /* Check first VEC regardless. */
359 jnz L(first_vec_x1_check)
361 /* If remaining length <= CHAR_PER_VEC * 2. */
362 addl $(VEC_SIZE * 2), %edx
366 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
367 vpmovmskb %ymm1, %eax
369 jnz L(last_vec_x2_return)
371 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
372 vpmovmskb %ymm1, %eax
374 /* Create mask for possible matches within remaining length. */
376 bzhiq %rdx, %rcx, %rcx
378 /* Test matches in data against length match. */
382 /* If remaining length <= VEC_SIZE * 3 (note this is after
383 remaining length was found to be > VEC_SIZE * 2). */
387 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
388 vpmovmskb %ymm1, %eax
389 /* Shift remaining length mask for last VEC. */
394 addq $(VEC_SIZE * 3 + 1), %rdi
402 subq $-(VEC_SIZE * 2 + 1), %rdi
408 L(cross_page_boundary):
409 /* Save pointer before aligning as its original value is necessary for
410 computing the return address if a byte is found, or for adjusting the
411 length if it is not and this is memchr. */
413 /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
414 and rdi for rawmemchr. */
415 orq $(VEC_SIZE - 1), %ALGN_PTR_REG
416 VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
417 vpmovmskb %ymm1, %eax
418 # ifndef USE_AS_RAWMEMCHR
419 /* Calculate length until end of page (length checked for a match). */
420 leaq 1(%ALGN_PTR_REG), %rsi
421 subq %RRAW_PTR_REG, %rsi
422 # ifdef USE_AS_WMEMCHR
423 /* NB: Divide bytes by 4 to get wchar_t count. */
427 /* Remove the leading bytes. */
428 sarxl %ERAW_PTR_REG, %eax, %eax
429 # ifndef USE_AS_RAWMEMCHR
430 /* Check the end of data. */
435 jz L(cross_page_continue)
437 addq %RRAW_PTR_REG, %rax