1 /* rawmemchr optimized with 256-bit EVEX instructions.
2 Copyright (C) 2022-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
22 #if ISA_SHOULD_BUILD (4)
25 # include "x86-evex256-vecs.h"
29 # define RAWMEMCHR __rawmemchr_evex
33 # define PC_SHIFT_GPR rdi
34 # define REG_WIDTH VEC_SIZE
35 # define VPTESTN vptestnmb
36 # define VPBROADCAST vpbroadcastb
37 # define VPMINU vpminub
39 # define VPCMPEQ vpcmpeqb
42 # include "reg-macros.h"
44 /* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no
45 VEX encoding), use VEX encoding in the loop so we can use
46 vpcmpeqb + vptern, which is more efficient than the EVEX alternative. */
48 # if defined USE_IN_RTM || VEC_SIZE == 64
49 # undef COND_VZEROUPPER
50 # undef VZEROUPPER_RETURN
54 # define COND_VZEROUPPER
55 # define VZEROUPPER_RETURN ret
58 # define USE_TERN_IN_LOOP 0
60 # define USE_TERN_IN_LOOP 1
62 # define VZEROUPPER vzeroupper
65 # define CHAR_PER_VEC VEC_SIZE
/* Choose which of the first_vec_x2 / first_vec_x3 labels, together
   with its matching offset (CHAR_PER_VEC * 2 or CHAR_PER_VEC * 3),
   is used under the FALLTHROUGH_RETURN_* names and which under the
   TAIL_RETURN_* names.  The pairing is swapped between the
   CHAR_PER_VEC == 64 build and every other build.  */
67 # if CHAR_PER_VEC == 64
69 # define TAIL_RETURN_LBL first_vec_x2
70 # define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2)
72 # define FALLTHROUGH_RETURN_LBL first_vec_x3
73 # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3)
75 # else /* !(CHAR_PER_VEC == 64) */
77 # define TAIL_RETURN_LBL first_vec_x3
78 # define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3)
80 # define FALLTHROUGH_RETURN_LBL first_vec_x2
81 # define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2)
82 # endif /* !(CHAR_PER_VEC == 64) */
/* Vector register that receives the broadcast of the search
   character at function entry.  */
85 # define VMATCH VMM(0)
/* VMM_lo view of register index 0; when USE_TERN_IN_LOOP is set
   VMATCH is copied into it before the main loop.  */
86 # define VMATCH_LO VMM_lo(0)
/* Page size used by the entry check for whether the first vector
   load may cross a page boundary.  */
88 # define PAGE_SIZE 4096
90 .section SECTION(.text), "ax", @progbits
91 ENTRY_P2ALIGN (RAWMEMCHR, 6)
92 VPBROADCAST %esi, %VMATCH
93 /* Check if we may cross page boundary with one vector load. */
95 andl $(PAGE_SIZE - 1), %eax
96 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
99 VPCMPEQ (%rdi), %VMATCH, %k0
112 leaq (VEC_SIZE * 4)(%rdi, %rax), %rax
115 /* For VEC_SIZE == 32 we can fit this in aligning bytes so might
116 as well place it more locally. For VEC_SIZE == 64 we reuse
117 return code at the end of loop's return. */
120 L(FALLTHROUGH_RETURN_LBL):
122 leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
128 /* eax has lower page-offset bits of rdi so xor will zero them out. */
131 VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
134 /* Shift out out-of-bounds matches. */
135 shrx %VRDI, %VRAX, %VRAX
141 L(page_cross_continue):
143 andq $(VEC_SIZE * -1), %rdi
145 VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
150 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
155 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
160 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
165 subq $-(VEC_SIZE * 1), %rdi
167 /* Saves code size. No evex512 processor has partial register
168 stalls. If that changes, this can be replaced with `andq
169 $-(VEC_SIZE * 4), %rdi`. */
172 andq $-(VEC_SIZE * 4), %rdi
175 # if USE_TERN_IN_LOOP
176 /* copy VMATCH to low ymm so we can use vpcmpeq which is not
177 encodable with EVEX registers. NB: this is VEC_SIZE == 32
178 only as there is no way to encode vpcmpeq with zmm0-15. */
179 vmovdqa64 %VMATCH, %VMATCH_LO
184 /* Two versions of the loop. One that does not require
185 vzeroupper by not using ymm0-15 and another that does
186 require vzeroupper because it uses ymm0-15. The reason why
187 ymm0-15 is used at all is because there is no EVEX encoding
188 vpcmpeq and with vpcmpeq this loop can be performed more
189 efficiently. The non-vzeroupper version is safe for RTM
190 while the vzeroupper version should be preferred if RTM is
191 not supported. Which loop version we use is determined by USE_TERN_IN_LOOP. */
194 # if USE_TERN_IN_LOOP
195 /* Since vptern can only take 3x vectors, it is fastest to do 1 vec
196 separately with EVEX vpcmp. */
197 VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
198 /* Compare 3x with vpcmpeq and OR them all together with vptern. */
201 VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
202 subq $(VEC_SIZE * -4), %rdi
203 VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
204 VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
206 /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into VEC_lo(4). */
208 vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
209 vpmovmskb %VMM_lo(4), %VRCX
213 /* NB: rax has match from first VEC and rcx has matches from
214 VEC 2-4. If rax is non-zero we will return that match. If
215 rax is zero adding won't disturb the bits in rcx. */
218 /* Loop version that uses EVEX encoding. */
219 VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
220 vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
221 vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
222 VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
223 VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
224 VPTESTN %VMM(3), %VMM(3), %k2
225 subq $(VEC_SIZE * -4), %rdi
230 # if USE_TERN_IN_LOOP
239 # if USE_TERN_IN_LOOP
240 vpmovmskb %VMM_lo(2), %VRAX
242 VPTESTN %VMM(2), %VMM(2), %k1
249 # if USE_TERN_IN_LOOP
250 vpmovmskb %VMM_lo(3), %VRAX
255 /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
256 (only if used VEX encoded loop). */
259 /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
260 returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
261 individually, for VEC_SIZE == 32 we combine them in a single register. */
263 # if CHAR_PER_VEC == 64
264 # if USE_TERN_IN_LOOP
265 # error "Unsupported"
269 /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */
273 L(FALLTHROUGH_RETURN_LBL):
275 /* CHAR_PER_VEC <= 32 so we can combine the results from the last two VEC. */
277 # if !USE_TERN_IN_LOOP
280 salq $CHAR_PER_VEC, %rcx
284 leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
290 leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
298 leaq (VEC_SIZE * 1)(%rdi, %rax), %rax