2 Copyright (C) 2009 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 #include <ifunc-defines.h>
24 /* Define multiple versions only for the definition in libc. */
/* Resolver for strchr.  The .type directive marks the symbol as a GNU
   indirect function, so the address this code leaves in %rax is the
   implementation that gets used for strchr.  */
28 .type strchr, @gnu_indirect_function
/* Compare the 'kind' field of __cpu_features with 0.  The call below
   presumably runs only while that field is still 0, i.e. before the CPU
   feature data has been filled in -- confirm against the branch that
   guards it.  */
29 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
31 call __init_cpu_features
/* Start with the SSE2 version as the result.  */
32 1: leaq __strchr_sse2(%rip), %rax
/* Test bit 20 of the stored CPUID leaf 1 ECX word; in CPUID that bit is
   the SSE4.2 feature flag.  */
33 testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
/* Replace the result with the SSE4.2 version; presumably skipped when
   the bit tested above is clear.  */
35 leaq __strchr_sse42(%rip), %rax
41 This implementation uses SSE4 instructions to compare up to 16 bytes
42 at a time looking for the first occurrence of the character c in the
45 char *strchr (const char *s, int c);
49 | _SIDD_CMP_EQUAL_ANY
50 | _SIDD_LEAST_SIGNIFICANT
51 on pcmpistri to compare xmm/mem128
53 0 1 2 3 4 5 6 7 8 9 A B C D E F
54 X X X X X X X X X X X X X X X X
58 0 1 2 3 4 5 6 7 8 9 A B C D E F
59 C C C C C C C C C C C C C C C C
61 to find out if the first 16-byte data element has a byte C and the
62 offset of the first such byte. There are 3 cases:
64 1. The first 16-byte data element has the byte C at the offset X.
65 2. The first 16-byte data element has EOS and doesn't have the byte C.
66 3. The first 16-byte data element is valid and doesn't have the byte C.
68 Here is the table of ECX, CFlag, ZFlag and SFlag for the 3 cases:
70 case ECX CFlag ZFlag SFlag
75 We exit from the loop for cases 1 and 2 with jbe, which branches
76 when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
/* __strchr_sse42: the SSE4.2 version of strchr.  It is emitted into its
   own section, .text.sse4.2, rather than plain .text.  */
79 .section .text.sse4.2,"ax",@progbits
81 .type __strchr_sse42, @function
94 /* Handle unaligned string. */
100 /* Find where the NUL terminator is. */
102 /* Check if there is a match. */
104 /* Remove the leading bytes. */
108 je L(unaligned_no_match)
109 /* Check which byte is a match. */
111 /* Is there a NUL terminator? */
113 je L(unaligned_match)
116 /* Return a null pointer if the NUL terminator comes first. */
123 L(unaligned_no_match):
127 /* Loop start on aligned string. */
/* Each pcmpistri below compares the 16 bytes at (%r8) with %xmm1 and
   leaves the resulting index in %ecx.  The control byte 0x2 selects
   signed-byte elements, the "equal any" aggregation and the least
   significant matching index.  The same instruction appears four times,
   presumably as a 4x unrolled loop with %r8 advanced in between --
   confirm against the instructions separating them.  */
131 pcmpistri $0x2, (%r8), %xmm1
134 pcmpistri $0x2, (%r8), %xmm1
137 pcmpistri $0x2, (%r8), %xmm1
140 pcmpistri $0x2, (%r8), %xmm1
/* Result pointer: address of the current 16-byte element (%r8) plus the
   byte offset in %rcx.  */
154 leaq (%r8,%rcx), %rax
157 .size __strchr_sse42, .-__strchr_sse42
/* Override the entry/exit macros before including ../strchr.S so that the
   symbol they emit is __strchr_sse2 regardless of the name passed in.
   Presumably ../strchr.S wraps its code in these macros -- confirm
   against that file.  */
161 # define ENTRY(name) \
162 .type __strchr_sse2, @function; \
164 __strchr_sse2: cfi_startproc; \
168 cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
169 # undef libc_hidden_builtin_def
170 /* It doesn't make sense to send libc-internal strchr calls through a PLT.
171 The speedup we get from using SSE4.2 instructions is likely eaten away
172 by the indirect call in the PLT. */
/* Hence the hidden alias __GI_strchr is defined as a global symbol equal
   to __strchr_sse2, whatever name is passed to the macro.  */
173 # define libc_hidden_builtin_def(name) \
174 .globl __GI_strchr; __GI_strchr = __strchr_sse2
177 #include "../strchr.S"