1 /* Optimized memrchr with sse2
2 Copyright (C) 2011-2023 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
/* Unwind-info bookkeeping to accompany a 4-byte push of REG: grows the
   recorded CFA offset by 4 and records REG with a relative offset of 0.
   NOTE(review): assumes cfi_adjust_cfa_offset / cfi_rel_offset wrap the
   assembler directives of the same names -- their definitions are not
   in this file.  */
23 # define CFI_PUSH(REG) \
24 cfi_adjust_cfa_offset (4); \
25 cfi_rel_offset (REG, 0)
/* Unwind-info bookkeeping to accompany a 4-byte pop: shrinks the
   recorded CFA offset by 4.
   NOTE(review): the last line below ends in a continuation backslash,
   so the macro body extends onto whatever line follows it -- confirm
   that the following line is the intended final statement of this
   macro.  */
27 # define CFI_POP(REG) \
28 cfi_adjust_cfa_offset (-4); \
/* PUSH and POP emit the 32-bit stack instruction immediately followed
   by the matching CFI_PUSH / CFI_POP annotation, so every stack
   adjustment made through them is also described in the unwind info.  */
31 # define PUSH(REG) pushl REG; CFI_PUSH (REG)
32 # define POP(REG) popl REG; CFI_POP (REG)
/* MEMCHR expands to __memrchr_sse2_bsf wherever it is used.
   NOTE(review): presumably this names the function's entry symbol;
   confirm against the use site.  */
39 # define MEMCHR __memrchr_sse2_bsf
/* Load 32 bits from the stack slot STR2(%esp) into the low dword of
   %xmm1 (presumably the byte to search for -- STR2 is defined
   elsewhere; confirm).  */
44 movd STR2(%esp), %xmm1
/* Each self-interleave doubles the run of copies of the lowest byte:
   after the first the low word is b0:b0, after the second the low
   dword is b0:b0:b0:b0.  */
50 punpcklbw %xmm1, %xmm1
52 punpcklbw %xmm1, %xmm1
/* Replicate dword 0 into all four dwords, so every one of the 16 byte
   lanes of %xmm1 now holds b0.  */
55 pshufd $0, %xmm1, %xmm1
58 /* Check if there is a match. */
74 /* Loop start on aligned string. */
/* movdqa requires a 16-byte-aligned address.  This group reads the
   three upper 16-byte slots of the 64 bytes at %ecx, highest offset
   first.  */
79 movdqa 48(%ecx), %xmm0
85 movdqa 32(%ecx), %xmm2
91 movdqa 16(%ecx), %xmm3
/* The same three slots again, in the same descending order.  */
107 movdqa 48(%ecx), %xmm0
113 movdqa 32(%ecx), %xmm2
119 movdqa 16(%ecx), %xmm3
/* Here the slots are read in ascending order, each into its own
   register (%xmm2, %xmm3, %xmm4).  */
148 movdqa 16(%ecx), %xmm2
149 movdqa 32(%ecx), %xmm3
150 movdqa 48(%ecx), %xmm4
173 movdqa 16(%ecx), %xmm2
/* pcmpeqb writes its per-byte equality mask (0xff where equal, 0x00
   otherwise) into %xmm1, replacing the pattern previously held there.
   The memory operand of this SSE form must be 16-byte aligned.  */
176 pcmpeqb (%ecx), %xmm1
/* Descending loads of the slots at 48, 32 and 16, then a compare of
   the slot at offset 0 directly against %xmm1 (which again overwrites
   %xmm1 with the mask).  */
194 movdqa 48(%ecx), %xmm0
200 movdqa 32(%ecx), %xmm2
206 movdqa 16(%ecx), %xmm3
214 pcmpeqb (%ecx), %xmm1
/* Slot at 48 is loaded into %xmm0; slot at 32 is compared in place
   against %xmm1, overwriting %xmm1 with the mask.  */
223 movdqa 48(%ecx), %xmm0
231 pcmpeqb 32(%ecx), %xmm1
/* Each lea computes %eax = %eax + %ecx + N for N = 16, 32, 48 without
   touching the flags.
   NOTE(review): presumably %ecx is the block base and %eax a byte index
   within the 16-byte slot at offset N, giving the match address --
   confirm against the instructions that set %eax.  */
247 lea 16(%eax, %ecx), %eax
253 lea 32(%eax, %ecx), %eax
259 lea 48(%eax, %ecx), %eax
/* Same sums as above with base and index registers swapped in the
   addressing mode; the computed value is identical.  */
277 lea 16(%ecx, %eax), %eax
286 lea 32(%ecx, %eax), %eax
295 lea 48(%ecx, %eax), %eax
/* Branch target of the jz in the short-length path.  */
304 L(length_less16_offset0):
/* Compare the 16 bytes at (%eax) with %xmm1 byte by byte; the equality
   mask replaces %xmm1.  The memory operand must be 16-byte aligned.  */
306 pcmpeqb (%eax), %xmm1
/* Spread the lowest byte of %xmm1 across all 16 byte lanes: two
   self-interleaves fill the low dword with that byte, then pshufd $0
   copies the low dword into every dword.  */
325 punpcklbw %xmm1, %xmm1
327 punpcklbw %xmm1, %xmm1
331 pshufd $0, %xmm1, %xmm1
/* Taken when ZF is set.
   NOTE(review): none of the SSE instructions above modify EFLAGS, so
   the flags come from an instruction elsewhere; confirm which.  */
333 jz L(length_less16_offset0)
/* Unsigned "above" test (CF = 0 and ZF = 0) selecting the part2 path.  */
341 ja L(length_less16_part2)
/* Compare the 16 bytes at (%eax) against the broadcast pattern; the
   mask overwrites %xmm1.  */
343 pcmpeqb (%eax), %xmm1
/* Reached from the ja in the short-length path.  */
366 L(length_less16_part2):
/* Aligned load of the 16 bytes at %eax + 16 into %xmm2.  */
367 movdqa 16(%eax), %xmm2
/* Taken when ZF is clear.
   NOTE(review): presumably a non-zero match mask for the upper slot;
   the flag-setting instruction should be confirmed.  */
381 jnz L(length_less16_part2_return)
/* Otherwise compare the slot at offset 0; the equality mask replaces
   %xmm1.  */
383 pcmpeqb (%eax), %xmm1
/* Target of the jnz in L(length_less16_part2).  */
401 L(length_less16_part2_return):
/* %eax = %eax + %edi + 16.
   NOTE(review): presumably %edi holds a byte index within the slot at
   offset 16, making %eax the match address -- confirm against the
   instruction that sets %edi.  */
403 lea 16(%eax, %edi), %eax