/* strchr/strchrnul optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
# ifndef STRCHR
#  define STRCHR	__strchr_evex
# endif

# define VMOVU	vmovdqu64
# define VMOVA	vmovdqa64

# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMP		vpcmpd
#  define VPTESTN	vptestnmd
#  define VPMINU	vpminud
#  define CHAR_REG	esi
#  define SHIFT_REG	ecx
#  define CHAR_SIZE	4
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMP		vpcmpb
#  define VPTESTN	vptestnmb
#  define VPMINU	vpminub
#  define CHAR_REG	sil
#  define SHIFT_REG	edx
#  define CHAR_SIZE	1
# endif

# define XMMZERO	xmm16
# define YMMZERO	ymm16
# define YMM0		ymm17
# define YMM1		ymm18
# define YMM2		ymm19
# define YMM3		ymm20
# define YMM4		ymm21
# define YMM5		ymm22
# define YMM6		ymm23
# define YMM7		ymm24

# define VEC_SIZE	32
# define PAGE_SIZE	4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
	.section .text.evex,"ax",@progbits
ENTRY_P2ALIGN (STRCHR, 5)
	/* Broadcast CHAR to YMM0.  */
	VPBROADCAST	%esi, %YMM0
	movl	%edi, %eax
	andl	$(PAGE_SIZE - 1), %eax
	/* Check if we cross page boundary with one vector load.
	   Otherwise it is safe to use an unaligned load.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
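	/* NB: eax above holds rdi's offset within its page; a
	   VEC_SIZE-byte load from rdi stays inside the page exactly
	   when that offset is at most PAGE_SIZE - VEC_SIZE.  E.g. with
	   PAGE_SIZE 4096 and VEC_SIZE 32, an offset of 4066 would read
	   2 bytes from the next, possibly unmapped, page.  */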
	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null bytes.  */
	VMOVU	(%rdi), %YMM1

	/* Leaves only CHARs matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
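	/* Illustrative scalar sketch of the xor + min idiom above (not
	   part of the build): for each char c of YMM1,
	       t = c ^ CHAR;     -- t is 0 iff c == CHAR
	       t = min (t, c);   -- t is 0 iff c == CHAR or c == 0
	   where min is unsigned, so one zero-test of YMM2 finds both
	   CHAR and the null terminator at once.  */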
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	/* NB: Use a branch instead of cmovcc here.  The expectation is
	   that with strchr the user will branch based on the input
	   being null.  Since this branch will be 100% predictive of
	   that user branch, a branch miss here should save what would
	   otherwise be a branch miss in the user code.  Otherwise,
	   using a branch 1) saves code size and 2) is faster in highly
	   predictable environments.  */
	jne	L(zero)
# endif
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of
	   bytes.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
	ret
L(first_vec_x4):
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	kmovd	%k1, %ecx
	/* bzhil will not be 0 if first match was null.  */
	bzhil	%eax, %ecx, %ecx
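	/* Worked example: if the null-match mask is ecx = 0b0100 and
	   the first CHAR match is at index eax = 3, bzhil keeps bits
	   0..2 of ecx, giving 0b100 != 0: a null terminator precedes
	   the first CHAR match, so NULL must be returned.  */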
	jne	L(zero)
# else
	/* Combine CHAR and null matches.  */
	kord	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
	ret
# ifndef USE_AS_STRCHRNUL
L(zero):
	xorl	%eax, %eax
	ret
# endif
L(first_vec_x1):
	/* Use bsf here to save 1 byte of code, keeping the block in a
	   single fetch block.  eax is guaranteed non-zero.  */
	bsfl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
	/* Check to see if first match was CHAR (k0) or null (k1).  */
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
	kmovd	%k1, %ecx
	/* bzhil will not be 0 if first match was null.  */
	bzhil	%eax, %ecx, %ecx
	jne	L(zero)
# else
	/* Combine CHAR and null matches.  */
	kord	%k0, %k1, %k0
	kmovd	%k0, %eax
	tzcntl	%eax, %eax
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(first_vec_x3):
	/* Use bsf here to save 1 byte of code, keeping the block in a
	   single fetch block.  eax is guaranteed non-zero.  */
	bsfl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Found CHAR or the null byte.  */
	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
	ret
L(aligned_more):
	/* Align data to VEC_SIZE.  */
	andq	$-VEC_SIZE, %rdi
L(cross_page_continue):
	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  Use two alternating
	   methods for checking VEC to balance latency and port
	   contention.  */
	/* This method has higher latency but has better port
	   distribution.  */
	VMOVA	(VEC_SIZE)(%rdi), %YMM1
	/* Leaves only CHARs matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	/* This method has lower latency but has worse port
	   distribution.  */
	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
	/* Each bit in K0 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMM1, %YMM0, %k0
	/* Each bit in K1 represents a null byte in YMM1.  */
	VPTESTN	%YMM1, %YMM1, %k1
	kortestd	%k0, %k1
	jnz	L(first_vec_x2)
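	/* NB: kortestd sets ZF from (k0 | k1) == 0, so the single jnz
	   above covers both the CHAR matches (k0) and the null matches
	   (k1); which match came first is only sorted out at the
	   branch target.  */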
	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
	/* Leaves only CHARs matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
	/* Each bit in K0 represents a CHAR in YMM1.  */
	VPCMP	$0, %YMM1, %YMM0, %k0
	/* Each bit in K1 represents a null byte in YMM1.  */
	VPTESTN	%YMM1, %YMM1, %k1
	kortestd	%k0, %k1
	jnz	L(first_vec_x4)
	/* Align data to VEC_SIZE * 4 for the loop.  */
	addq	$VEC_SIZE, %rdi
	andq	$-(VEC_SIZE * 4), %rdi
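	/* NB: Rounding rdi down may make the first loop iteration
	   re-check bytes that were already checked above; this is
	   harmless since those bytes are known to contain neither CHAR
	   nor a null byte.  */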
L(loop_4x_vec):
	/* Check 4x VEC at a time.  No penalty to imm32 offset with evex
	   encoding.  */
	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
	   zero.  */
	vpxorq	%YMM1, %YMM0, %YMM5
	/* For YMM2 and YMM4 compare not-equal to CHAR and store the
	   result in a k register.  It is possible to save either 1 or
	   2 instructions by using the not-equal compare for YMM1 as
	   well, or for both YMM1 and YMM3, but the bottleneck on port
	   5 makes it not worth it.  */
	VPCMP	$4, %YMM0, %YMM2, %k2
	vpxorq	%YMM3, %YMM0, %YMM7
	VPCMP	$4, %YMM0, %YMM4, %k4
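	/* NB: With EVEX vpcmp the immediate selects the predicate: $0
	   is equal (used above for the single-vector checks) and $4 is
	   not-equal, so k2/k4 have a 1 for every char that does NOT
	   match esi.  */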
	/* Use min to select all zeros from either the xor or the end
	   of the string.  */
	VPMINU	%YMM1, %YMM5, %YMM1
	VPMINU	%YMM3, %YMM7, %YMM3
	/* Use min + zero-masking to select the zeros.  k2 and k4 have
	   0 at positions that matched CHAR, so the zero-mask writes
	   zero into the corresponding destination chars in YMM2 /
	   YMM4.  */
	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
	VPMINU	%YMM3, %YMM4, %YMM4
	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
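	/* After this reduction YMM4 contains a zero char iff any of
	   the four vectors held CHAR or a null byte: the masked mins
	   fold YMM1/YMM2 and then YMM2/YMM4 together, while the
	   zero-masks inject zeros for the CHAR matches recorded in k2
	   and k4.  */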
	VPTESTN	%YMM4, %YMM4, %k1
	kmovd	%k1, %ecx
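	/* NB: The subq of -(VEC_SIZE * 4) below is equivalent to
	   adding VEC_SIZE * 4, but -128 fits in a sign-extended imm8
	   while +128 would require an imm32, saving 3 code bytes.  */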
	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)
	VPTESTN	%YMM1, %YMM1, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	VPTESTN	%YMM3, %YMM3, %k0
	kmovd	%k0, %eax
	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
# ifdef USE_AS_WCSCHR
	sall	$8, %ecx
	orl	%ecx, %eax
	tzcntl	%eax, %eax
# else
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Check if match was CHAR or null.  */
	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
	ret

# ifndef USE_AS_STRCHRNUL
L(zero_end):
	xorl	%eax, %eax
	ret
# endif
L(last_vec_x1):
	bsfl	%eax, %eax
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of
	   bytes.  */
	leaq	(%rdi, %rax, CHAR_SIZE), %rax
# else
	addq	%rdi, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp	(%rax), %CHAR_REG
	jne	L(zero_end)
# endif
	ret
L(last_vec_x2):
	bsfl	%eax, %eax
# ifndef USE_AS_STRCHRNUL
	/* Check if match was null.  */
	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
	jne	L(zero_end)
# endif
	/* NB: Multiply by the size of the char type (1 or 4) to get
	   the number of bytes.  */
	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
	ret
	/* Cold case for crossing page with first load.  */
L(cross_page_boundary):
	movq	%rdi, %rdx
	/* Align rdi.  */
	andq	$-VEC_SIZE, %rdi
	VMOVA	(%rdi), %YMM1
	/* Leaves only CHARs matching esi as 0.  */
	vpxorq	%YMM1, %YMM0, %YMM2
	VPMINU	%YMM2, %YMM1, %YMM2
	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
	VPTESTN	%YMM2, %YMM2, %k0
	kmovd	%k0, %eax
	/* Remove the leading bits.  */
# ifdef USE_AS_WCSCHR
	movl	%edx, %SHIFT_REG
	/* NB: Divide shift count by 4 since each bit in the mask
	   represents 4 bytes.  */
	sarl	$2, %SHIFT_REG
	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
# endif
	sarxl	%SHIFT_REG, %eax, %eax
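	/* Worked example (strchr case, VEC_SIZE == 32): for a string
	   starting at rdi % 32 == 5, the aligned load covers 5 bytes
	   that precede the string.  sarx masks its shift count to the
	   low 5 bits, so SHIFT_REG = edx (the original pointer) yields
	   a count of 5 and the 5 stale low mask bits are shifted
	   out.  */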
	/* If eax is zero continue.  */
	testl	%eax, %eax
	jz	L(cross_page_continue)
	bsfl	%eax, %eax
# ifdef USE_AS_WCSCHR
	/* NB: Multiply wchar_t count by 4 to get the number of
	   bytes.  */
	leaq	(%rdx, %rax, CHAR_SIZE), %rax
# else
	addq	%rdx, %rax
# endif
# ifndef USE_AS_STRCHRNUL
	/* Check to see if match was CHAR or null.  */
	cmp	(%rax), %CHAR_REG
	jne	L(zero_end)
# endif
	ret

END (STRCHR)