/* memchr/wmemchr optimized with 256-bit EVEX instructions.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)

# include "x86-evex256-vecs.h"

# define MEMCHR __memchr_evex
# ifdef USE_AS_WMEMCHR
#  define PC_SHIFT_GPR rcx
#  define VPTESTN vptestnmd
#  define VPBROADCAST vpbroadcastd
#  define VPMINU vpminud

#  define VPCMPEQ vpcmpeqd

#  define USE_WIDE_CHAR
# else
#  define PC_SHIFT_GPR rdi
#  define VPTESTN vptestnmb
#  define VPBROADCAST vpbroadcastb
#  define VPMINU vpminub

#  define VPCMPEQ vpcmpeqb
# endif
# include "reg-macros.h"
/* If not in an RTM region and VEC_SIZE != 64 (VEC_SIZE == 64
   doesn't have a VEX encoding), use VEX encoding in the loop so we
   can use vpcmpeqb + vptern, which is more efficient than the EVEX
   encoding.  */
# if defined USE_IN_RTM || VEC_SIZE == 64
#  undef COND_VZEROUPPER
#  undef VZEROUPPER_RETURN

#  define COND_VZEROUPPER
#  define VZEROUPPER_RETURN ret

#  define USE_TERN_IN_LOOP 0
# else
#  define USE_TERN_IN_LOOP 1

#  define VZEROUPPER vzeroupper
# endif
# if USE_TERN_IN_LOOP
/* The resulting bitmask from vpmovmskb has 4 bits set for each
   matching wchar, so we don't want to multiply the resulting
   index.  */
#  define TERN_CHAR_MULT 1

#  ifdef USE_AS_WMEMCHR
#   define TEST_END() inc %VRCX
#  else
#   define TEST_END() add %rdx, %rcx
#  endif
# else
#  define TERN_CHAR_MULT CHAR_SIZE
#  define TEST_END() KORTEST %k2, %k3
# endif
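
/* A rough C sketch (not part of the build) of what TERN_CHAR_MULT
   accounts for.  In the vptern loop the match mask comes from
   vpmovmskb, which yields one bit per byte, so a matching wchar sets
   4 bits and the tzcnt of the mask is already a byte offset.  In the
   kortest loop the mask has one bit per char, so the index must be
   scaled by CHAR_SIZE:

        static inline char *
        match_addr (char *base, unsigned long mask, unsigned long mult)
        {
          // mult is TERN_CHAR_MULT: 1 for a vpmovmskb mask,
          // CHAR_SIZE for a k-mask.
          return base + (unsigned long) __builtin_ctzl (mask) * mult;
        }
   */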
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
#  ifndef USE_AS_WMEMCHR
#   define GPR_X0_IS_RET 1
#  else
#   define GPR_X0_IS_RET 0
#  endif
# else
#  define GPR_X0_IS_RET 0
# endif
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

# if CHAR_PER_VEC == 64
#  define LAST_VEC_OFFSET (VEC_SIZE * 3)
# else
#  define LAST_VEC_OFFSET (VEC_SIZE * 2)
# endif
# if CHAR_PER_VEC >= 32
#  define MASK_GPR(...) VGPR(__VA_ARGS__)
# elif CHAR_PER_VEC == 16
#  define MASK_GPR(reg) VGPR_SZ(reg, 16)
# else
#  define MASK_GPR(reg) VGPR_SZ(reg, 8)
# endif
# define VMATCH VMM(0)
# define VMATCH_LO VMM_lo(0)

# define PAGE_SIZE 4096
        .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCHR, 6)
        /* Check for zero length.  */
        test %RDX_LP, %RDX_LP
        /* Clear the upper 32 bits.  */

        VPBROADCAST %esi, %VMATCH

        /* Check if we may cross a page boundary with one vector
           load.  */

        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
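
        /* A sketch of the check above in C (illustrative only): a
           single VEC_SIZE load from `s' stays within one page iff
           the page offset of `s' is <= PAGE_SIZE - VEC_SIZE:

                if (((uintptr_t) s & (PAGE_SIZE - 1))
                    > PAGE_SIZE - VEC_SIZE)
                  goto page_cross;  // load could touch the next page
           */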
        VPCMPEQ (%rdi), %VMATCH, %k0
# ifndef USE_AS_WMEMCHR
        /* If rcx is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
           already a dependency between rcx and rsi so no worries
           about a false dependency here.  */

        /* If rdx <= rsi then either 1) rcx was non-zero (there was a
           match) but it was out of bounds or 2) rcx was zero and rdx
           was <= VEC_SIZE so we are done scanning.  */

        /* NB: Use a branch to return zero/non-zero.  Common usage
           will branch on the result of the function (if the return
           is null/non-null).  This branch can be used to predict the
           ensuing one so there is no reason to extend the
           data-dependency with cmovcc.  */

        /* If rcx is zero then len must be > CHAR_PER_VEC; otherwise,
           since we already tested len vs. tzcnt(rcx) (in rsi), we
           are good to return this match.  */

        leaq (%rdi, %rsi), %rax
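
        /* A scalar sketch of this fast path (illustrative; `mask' is
           the compare mask in rcx with one bit per byte, and tzcnt
           of a zero VEC_SIZE-bit mask yields CHAR_PER_VEC):

                unsigned long pos = tzcnt (mask); // CHAR_PER_VEC if 0
                if (len <= pos)
                  return NULL;  // match out of bounds or nothing left
                if (mask != 0)
                  return (char *) s + pos;
                // else: no match yet and len > VEC, keep scanning
           */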
        /* We can't use the `tzcnt' trick for wmemchr because
           CHAR_SIZE > 1: if rcx is zero, tzcnt of the full-width
           register != CHAR_PER_VEC.  */
        cmpq $CHAR_PER_VEC, %rdx

        leaq (%rdi, %rax, CHAR_SIZE), %rax
        /* Only fits in the first cache line for VEC_SIZE == 32.  */

# ifdef USE_AS_WMEMCHR
        /* For wmemchr we still need to test whether there was a
           match in the first VEC.  Use bsf here so we can reuse
           L(first_vec_x0_ret).  */

        jnz L(first_vec_x0_ret)
L(page_cross_continue):
# ifdef USE_AS_WMEMCHR
        /* We can't use the end of the buffer to re-calculate the
           length for wmemchr as len * CHAR_SIZE may overflow.  */
        leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
        andq $(VEC_SIZE * -1), %rdi

# else
        leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax
        andq $(VEC_SIZE * -1), %rdi
# endif
        /* rax contains the remaining length - 1.  The -1 is so we
           can get imm8 encoding in a few additional places, saving
           code size.  */
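
        /* A sketch of the bookkeeping above for the byte case
           (illustrative):

                rax = (uintptr_t) buf + len - (VEC_SIZE + 1);
                rdi = (uintptr_t) buf & -(uintptr_t) VEC_SIZE;
                rax -= rdi;  // (buf % VEC_SIZE) + len - VEC_SIZE - 1

           i.e. rax ends up one less than the number of chars left to
           check past the first aligned VEC.  wmemchr derives the
           same value relative to rdi instead because buf + len *
           CHAR_SIZE could overflow.  */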
        /* Needed regardless of the remaining length.  */
        VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
        /* We cannot fold the above `sub %rdi, %rax' with the `cmp
           $(CHAR_PER_VEC * 2), %rax' because it's possible for a
           very large length to overflow and cause the subtract to
           carry despite the length being above CHAR_PER_VEC * 2.  */
        cmpq $(CHAR_PER_VEC * 2 - 1), %rax
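
        /* A worked example of why the fold is unsafe (hypothetical
           values, VEC_SIZE == 32):

                uintptr_t buf = 0xffffffffffffff00ull;
                size_t len = 0x200;    // well above CHAR_PER_VEC * 2
                uintptr_t rax = buf + len - (VEC_SIZE + 1); // -> 0xdf
                uintptr_t rdi = buf & -(uintptr_t) VEC_SIZE;
                rax -= rdi;    // 0x1df, but the subtract borrowed

           The result is still correct mod 2^64, so a separate cmp on
           the value works while the sub's own flags would not.  */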
        jnz L(first_vec_x1_check)
        /* Check the end of the data.  NB: use 8-bit operations to
           save code size.  We no longer need the full width of eax
           and will perform a write-only operation over eax so there
           will be no partial-register stalls.  */
        subb $(CHAR_PER_VEC * 1 - 1), %al

        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
# ifdef USE_AS_WMEMCHR
        /* For wmemchr we can't take advantage of tzcnt(0) ==
           VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */

        /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We fall
           through to L(zero_0) for VEC_SIZE == 64 here as there is
           not enough space before the next cache line to fit the
           `lea' for the return.  */

        ja L(first_vec_x2_ret)
        leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax

L(first_vec_x1_check):

        leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax

        /* Fits at the end of the cache line here for VEC_SIZE ==
           32.  */

        leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax

        /* Fits at the end of the cache line here for VEC_SIZE ==
           64.  */

        leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
        /* Length > VEC_SIZE * 2 so check the first 2x VEC before
           rechecking the length.  */

        /* Already computed matches for the first VEC in rdx.  */

        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0

        /* Needed regardless of the next length check.  */
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0

        /* Check if we are near the end.  */
        cmpq $(CHAR_PER_VEC * 4 - 1), %rax

        jnz L(first_vec_x3_check)
        /* Use 8-bit instructions to save code size.  We won't use
           the full width of eax again and will perform a write-only
           operation over eax so there are no worries about
           partial-register stalls.  */
        subb $(CHAR_PER_VEC * 3), %al

        VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
# ifdef USE_AS_WMEMCHR
        /* For wmemchr we can't take advantage of tzcnt(0) ==
           VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */

        jae L(first_vec_x4_ret)

        /* Fits at the end of the cache line here for VEC_SIZE == 64.
           For VEC_SIZE == 32 we put the return label at the end of
           the next cache line.  */
        leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
        /* Place L(first_vec_x4_ret) here as we can't fit it in the
           same cache line as where it is called from, so we might as
           well save code size by reusing the return of
           L(first_vec_x4).  */

        leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
L(first_vec_x3_check):
        /* Need to adjust the remaining length before checking.  */
        addb $-(CHAR_PER_VEC * 2), %al

        leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax

        leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
# if !USE_TERN_IN_LOOP

        VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0

        subq $-(VEC_SIZE * 5), %rdi
        subq $(CHAR_PER_VEC * 8), %rax
# ifdef USE_AS_WMEMCHR

        /* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi'.  No EVEX
           processor has partial-register stalls (all have merging
           uops).  If that changes, this can be removed.  */

        andq $-(VEC_SIZE * 4), %rdi
# ifdef USE_AS_WMEMCHR
# if USE_TERN_IN_LOOP
        /* Copy VMATCH to the low ymm so we can use vpcmpeq, which is
           not encodable with the EVEX-only registers.  NB: this is
           VEC_SIZE == 32 only as there is no way to encode vpcmpeq
           with zmm0-15.  */
        vmovdqa64 %VMATCH, %VMATCH_LO
        /* Two versions of the loop.  One that does not require
           vzeroupper by not using ymm0-15 and one that does require
           vzeroupper because it uses ymm0-15.  The reason ymm0-15 is
           used at all is that there is no EVEX encoding of vpcmpeq,
           and with vpcmpeq this loop can be performed more
           efficiently.  The non-vzeroupper version is safe for RTM
           while the vzeroupper version should be preferred if RTM is
           not supported.  Which loop version we use is determined by
           USE_TERN_IN_LOOP.  */
# if USE_TERN_IN_LOOP
        /* Since vptern can only take 3x vectors, it is fastest to do
           one VEC separately with EVEX vpcmp.  */
#  ifdef USE_AS_WMEMCHR
        /* vptern can only accept masks for epi32/epi64, so we can
           only save an instruction by using a not-equals mask on
           vptern with wmemchr.  */
        VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
#  else
        VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
#  endif
        /* Compare 3x with vpcmpeq and or them all together with
           vptern.  */
        VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
#  ifdef USE_AS_WMEMCHR
        /* This takes the not of the or between VEC_lo(2), VEC_lo(3)
           and VEC_lo(4) as well as combining the result from VEC(0)
           with the zero mask.  */
        vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
        vpmovmskb %VMM_lo(4), %VRCX
#  else
        /* 254 is the mask for oring VEC_lo(2), VEC_lo(3) and
           VEC_lo(4) into VEC_lo(4).  */
        vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
        vpmovmskb %VMM_lo(4), %VRCX
#  endif
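
        /* How the vpternlogd immediates work (illustrative C): each
           result bit is a lookup into the imm8 indexed by the three
           source bits.  imm8 == 254 (0b11111110) is 0 only when
           a == b == c == 0, i.e. a | b | c; imm8 == 1 is 1 only when
           a == b == c == 0, i.e. ~(a | b | c), combined above with
           the {%k1}{z} zero-mask:

                static inline unsigned
                ternlog_bit (unsigned imm8, unsigned a, unsigned b,
                             unsigned c)
                {
                  return (imm8 >> ((a << 2) | (b << 1) | c)) & 1;
                }
           */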
# else
        /* Loop version that uses EVEX encoding.  */
        VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
        vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
        vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
        VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
        VPTESTN %VMM(3), %VMM(3), %k2
# endif
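
        /* A per-position sketch of the test above (byte case,
           illustrative):

                t1 = s[i + VEC_SIZE] ^ c;      // vpxorq: 0 iff match
                t2 = s[i + 2 * VEC_SIZE] ^ c;
                m = (s[i] != c) ? min (t1, t2) // VPMINU {%k1}{z}
                                : 0;

           so m == 0 iff one of the first 3x VEC matched at that
           position; VPTESTN collects the zero lanes into k2 and
           TEST_END () kortests k2 with k3 (matches in the 4th
           VEC).  */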
        subq $-(VEC_SIZE * 4), %rdi

        subq $(CHAR_PER_VEC * 4), %rax
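        /* NB: the `subq $-(VEC_SIZE * 4), %rdi' above rather than
           `addq $(VEC_SIZE * 4), %rdi': -128 fits in an imm8 while
           +128 needs an imm32, saving code size in the hot loop.  */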
        /* COND_VZEROUPPER is vzeroupper if we used the VEX encoded
           loop.  */

        /* For CHAR_PER_VEC == 64 we don't need to mask as we use
           8-bit instructions on eax from here on out.  */
# if CHAR_PER_VEC != 64
        andl $(CHAR_PER_VEC * 4 - 1), %eax
# endif
        VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
        subq $(VEC_SIZE * 1), %rdi

        cmpb $(CHAR_PER_VEC * 2 - 1), %al

        jnz L(last_vec_x1_novzero)

        VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0

        jnz L(last_vec_x2_novzero)

        VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0

        jnz L(first_vec_x3_check)

        subb $(CHAR_PER_VEC * 3), %al
        jae L(last_vec_check)
# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
L(last_vec_x2_novzero):

L(last_vec_x1_novzero):

        leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
# endif
# if CHAR_PER_VEC == 64
        /* Since we can't combine the last 2x VEC when CHAR_PER_VEC
           == 64, it needs a separate return label.  */
L(last_vec_x2_novzero):

        leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP

# if USE_TERN_IN_LOOP
        vpmovmskb %VMM_lo(2), %VRDX
# else
        VPTESTN %VMM(2), %VMM(2), %k1
# endif

# if USE_TERN_IN_LOOP
        vpmovmskb %VMM_lo(3), %VRDX
# endif

        /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
           (only if we used the VEX encoded loop).  */
        /* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
           CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
           CHAR_PER_VEC <= 32 we can combine the results from the 2x
           VEC in a single GPR.  */
# if CHAR_PER_VEC == 64
#  if USE_TERN_IN_LOOP
#   error "Unsupported"
#  endif

        /* If CHAR_PER_VEC == 64 we can't combine the last two
           VEC.  */
# else
        /* CHAR_PER_VEC <= 32 so we can combine the results from the
           last 2x VEC.  */

#  if !USE_TERN_IN_LOOP

        salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx
#  endif
#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x2_novzero):
#  endif

        leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax

#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
L(last_vec_x1_novzero):
#  endif

        leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
        bsf %VGPR(GPR_X0), %VGPR(GPR_X0)

        leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax
        /* Need to preserve eax to compute the inbound bytes we are
           checking.  */
# ifdef USE_AS_WMEMCHR

        VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0

# ifdef USE_AS_WMEMCHR
        /* NB: Divide by CHAR_SIZE to shift out the out-of-bounds
           bytes.  */

        andl $(CHAR_PER_VEC - 1), %ecx
# endif
        shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
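
        /* A sketch of the page-cross probe (illustrative; `cmp_mask'
           is a stand-in for the VPCMPEQ + KMOV pair and returns one
           bit per char):

                const char *last_vec = (const char *)
                  (((uintptr_t) s & -(uintptr_t) PAGE_SIZE)
                   + (PAGE_SIZE - VEC_SIZE));
                unsigned int mask = cmp_mask (last_vec, c);
                // drop match bits that precede the buffer start
                mask >>= ((const char *) s - last_vec) / CHAR_SIZE;
           */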
# ifdef USE_AS_WMEMCHR

        /* Mask the lower bits from ecx (negative eax) to get the
           bytes till the next VEC.  */
        andl $(CHAR_PER_VEC - 1), %ecx

        /* Check if the VEC is entirely contained in the remainder of
           the page.  */

        jbe L(page_cross_ret)

        /* Length crosses the page, so if rax is zero (no matches),
           continue.  */

        jz L(page_cross_continue)
        /* If rdx > rcx then any match here must be in
           [buf, buf + len].  */
# ifdef USE_AS_WMEMCHR
        leaq (%rdi, %rax, CHAR_SIZE), %rax
        /* Search is entirely contained in the page-cross case.  */
# ifdef USE_AS_WMEMCHR

        jz L(page_cross_zero)

        jbe L(page_cross_zero)

# ifdef USE_AS_WMEMCHR
        leaq (%rdi, %rax, CHAR_SIZE), %rax
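
        /* A sketch of the page-cross bounds handling (illustrative;
           `to_vec_end' is the number of chars from s to the next VEC
           boundary, held in rcx above):

                if (len <= to_vec_end)  // search ends inside this VEC
                  {
                    if (mask == 0 || tzcnt (mask) >= len)
                      return NULL;
                    return s + tzcnt (mask) * CHAR_SIZE;
                  }
                if (mask == 0)          // crosses: keep scanning
                  goto page_cross_continue;
                // any match is below to_vec_end < len, so in bounds
                return s + tzcnt (mask) * CHAR_SIZE;
           */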