/* memrchr optimized with SSE2.
   Copyright (C) 2017-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
#include <isa-level.h>

/* MINIMUM_X86_ISA_LEVEL <= 2 because there is no V2 implementation,
   so we need this to build for ISA V2 builds.  */
#if ISA_SHOULD_BUILD (2)
# define MEMRCHR __memrchr_sse2

# include <sysdep.h>

/* VEC_SIZE is the SSE2 vector width; the shift-count comment below
   relies on VEC_SIZE=16.  */
# define VEC_SIZE 16
# define PAGE_SIZE 4096
ENTRY_P2ALIGN(MEMRCHR, 6)
        /* Clear upper bits of the length: on __ILP32__ (x32) targets the
           size_t argument occupies only the low 32 bits of rdx.  */
        /* Get end pointer.  */
        leaq (%rdx, %rdi), %rcx

        punpcklbw %xmm0, %xmm0
        punpcklwd %xmm0, %xmm0
        pshufd $0, %xmm0, %xmm0
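        /* The three shuffles above broadcast the search byte (the low byte
           of %xmm0) to all 16 lanes: punpcklbw doubles it to a word,
           punpcklwd to a dword, and pshufd $0 replicates that dword across
           the register. Illustrative C sketch, not part of the build
           (equivalent to _mm_set1_epi8):

             uint8_t vec[16];
             for (int i = 0; i < 16; i++)
               vec[i] = (uint8_t) ch;
        */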
        /* Check if we can load 1x VEC without crossing a page.  */
        testl $(PAGE_SIZE - VEC_SIZE), %ecx
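        /* The masked test is zero exactly when the end pointer is within
           VEC_SIZE bytes of a page boundary, i.e. when the 16-byte load
           ending at rcx might touch the preceding page (conservatively it
           also fires when rcx is exactly page aligned, which is safe).
           Illustrative C sketch with a hypothetical helper name:

             static inline int
             may_cross_page (uintptr_t end)
             {
               return (end & (PAGE_SIZE - VEC_SIZE)) == 0;
             }
        */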
        /* NB: This load happens regardless of whether rdx (len) is zero.
           Since it doesn't cross a page and the standard guarantees that
           any valid pointer has at least one valid byte, this load must be
           safe. For the entire history of the x86 memrchr implementation
           this has been possible, so no code "should" be relying on a
           zero-length check before this load. The zero-length check is
           moved to the page-cross case because it is 1) pretty cold and
           2) including it would push the hot case (len <= VEC_SIZE) onto
           two cache lines.  */
        movups -(VEC_SIZE)(%rcx), %xmm1
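        /* movups rather than movaps: rcx is the raw end pointer here and
           may be unaligned. The movaps loads further down are only issued
           after rcx has been rounded down to a VEC_SIZE boundary.  */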
        /* Zero-flag set if eax (src) is zero. Destination unchanged if src
           is zero.  */
        /* Check if the CHAR match is in bounds. Need to truly zero `eax`
           here if it is out of bounds.  */
        /* Since we subtracted VEC_SIZE from rdx earlier, we can just add
           the match offset to the base pointer (rdi) to form the return
           value.  */
        leaq -(VEC_SIZE)(%rcx, %rax), %rax
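        /* Pattern used throughout the file: pcmpeqb/pmovmskb yield a
           16-bit mask whose bit i is set iff byte i of the VEC equals
           CHAR, and bsr picks the highest set bit, i.e. the last match.
           Rough C equivalent of this return path (illustrative sketch
           only, using SSE2 intrinsics; vec_base stands for rcx - VEC_SIZE):

             unsigned mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (vec, vchar));
             if (mask != 0)
               return vec_base + (31 - __builtin_clz (mask));
        */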
        /* Align rcx (pointer to string).  */
        andq $-VEC_SIZE, %rcx
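        /* andq with $-VEC_SIZE clears the low four bits, rounding the end
           pointer down to a 16-byte boundary so that subsequent movaps
           loads are aligned. C sketch:

             p = (char *) ((uintptr_t) p & -(uintptr_t) VEC_SIZE);
        */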
        /* NB: We could consistently save 1 byte in this pattern with
           `movaps %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason
           against it is that it adds more frontend uops (even if the moves
           can be eliminated) and, some percentage of the time, actual
           backend uops.  */
        movaps -(VEC_SIZE)(%rcx), %xmm1
        cmpq $(VEC_SIZE * 2), %rdx
        jbe L(ret_vec_x0_test)

        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        /* Don't align. Otherwise we lose the 2-byte encoding of the jump
           to L(page_cross), which causes the hot path (length <= VEC_SIZE)
           to span multiple cache lines. Naturally aligned % 16 to
           8 bytes.  */
L(page_cross):
        /* Zero length check.  */

        andq $-(VEC_SIZE), %r8
        /* Shift out the negative alignment (because we are starting from
           the end pointer and working backwards).  */

        /* 32-bit shift, but VEC_SIZE=16, so we need to mask the shift
           count explicitly.  */
        andl $(VEC_SIZE - 1), %ecx
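        /* Page-cross sketch (hedged; the load and mask fixup around this
           point are summarized, not quoted): read the aligned VEC that
           contains the last byte, then shift the match mask so that bits
           for bytes at or beyond the end pointer fall outside the 16 valid
           mask bits. Roughly, in C:

             unsigned shift = (0 - end) & (VEC_SIZE - 1);
             mask = (mask << shift) & 0xffff;
        */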
        leaq (%rdi, %rdx), %rcx
        /* Fits in the aligning bytes.  */
        leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        movaps -(VEC_SIZE * 3)(%rcx), %xmm1

        subq $(VEC_SIZE * 4), %rdx
        addl $(VEC_SIZE), %edx
        jle L(ret_vec_x2_test)
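        /* The subq/addl pair nets to rdx -= VEC_SIZE * 3, with the flags
           for the jle coming from the 32-bit add: the branch is taken when
           the remaining length ends within this third VEC, in which case
           any match must still be bounds-checked.  */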
        movaps -(VEC_SIZE * 4)(%rcx), %xmm1

        subl $(VEC_SIZE), %edx
        leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
        movaps -(VEC_SIZE * 4)(%rcx), %xmm1

        addq $-(VEC_SIZE * 4), %rcx
        cmpq $(VEC_SIZE * 4), %rdx
        /* Offset everything by 4x VEC_SIZE here to save a few bytes at
           the end, keeping the code from spilling onto the next cache
           line.  */
        addq $(VEC_SIZE * 4 - 1), %rcx
        andq $-(VEC_SIZE * 4), %rcx
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        andq $-(VEC_SIZE * 4), %rdx
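        /* Loop setup sketch: the current position is rounded up and the
           stop position rounded down to 4x-VEC (64-byte) boundaries, so
           the main loop walks whole aligned 64-byte blocks and terminates
           on a plain pointer comparison. Roughly, with illustrative names:

             pos  = (pos + 4 * VEC_SIZE - 1) & -(4UL * VEC_SIZE);
             stop = (base + 4 * VEC_SIZE) & -(4UL * VEC_SIZE);
             while (pos != stop)
               { ...; pos -= 4 * VEC_SIZE; }
        */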
        movaps (VEC_SIZE * -1)(%rcx), %xmm1
        movaps (VEC_SIZE * -2)(%rcx), %xmm2
        movaps (VEC_SIZE * -3)(%rcx), %xmm3
        movaps (VEC_SIZE * -4)(%rcx), %xmm4
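        /* Each of the four VECs is compared against CHAR and the compare
           results are OR'd together, so a single pmovmskb/test decides
           whether any of the 64 bytes matched. Sketch with SSE2 intrinsics
           (illustrative only):

             __m128i m01 = _mm_or_si128 (_mm_cmpeq_epi8 (v1, vchar),
                                         _mm_cmpeq_epi8 (v2, vchar));
             __m128i m23 = _mm_or_si128 (_mm_cmpeq_epi8 (v3, vchar),
                                         _mm_cmpeq_epi8 (v4, vchar));
             if (_mm_movemask_epi8 (_mm_or_si128 (m01, m23)) != 0)
               goto loop_end;
        */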
        addq $-(VEC_SIZE * 4), %rcx
        /* Ends up being 1-byte nop.  */
        movaps -(VEC_SIZE)(%rcx), %xmm1
        cmpl $(VEC_SIZE * 2), %edx
        movaps -(VEC_SIZE * 2)(%rcx), %xmm1
        movaps -(VEC_SIZE * 3)(%rcx), %xmm1

        subl $(VEC_SIZE * 3), %edx
        /* Ends up being 1-byte nop.  */
        /* Combine the last 2 VEC matches. If ecx (VEC3) is zero (no CHAR
           in VEC3) then it won't affect the result in esi (VEC4). If ecx
           is non-zero then there is a CHAR in VEC3 and bsrq will use that
           position.  */
        leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
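        /* Packing the two 16-bit masks into one 64-bit value lets a single
           bsrq find the last match across both VECs: VEC3's mask sits in
           the high bits, so any VEC3 match outranks VEC4. Illustrative C:

             uint64_t both = ((uint64_t) mask3 << 32) | mask4;
             if (both != 0)
               idx = 63 - __builtin_clzll (both);
        */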
        leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
        /* Used by L(last_4x_vec). In the same cache line. This is just
           spare aligning bytes.  */

        /* 2 bytes from the next cache line.  */
END(MEMRCHR)
#endif