2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include <init-arch.h>
23 /* Define multiple versions only for the definition in libc. */
/* IFUNC resolver for strchr: the symbol is typed as a GNU indirect
   function and the instructions below load the address of one of
   __strchr_sse2, __strchr_sse42 or __strchr_sse2_no_bsf into %rax.
   NOTE(review): the conditional branches and the return that connect
   these instructions are not visible in this listing; confirm the
   exact control flow before relying on the notes below.  */
27 .type strchr, @gnu_indirect_function
/* Test the "kind" field of __cpu_features against zero.  Presumably a
   zero value means the CPU features have not been initialized yet and
   guards the call to __init_cpu_features -- TODO confirm.  */
28 cmpl $0, __cpu_features+KIND_OFFSET(%rip)
30 call __init_cpu_features
/* Candidate result: the SSE2 version.  */
31 1: leaq __strchr_sse2(%rip), %rax
/* Check the SSE4.2 bit in the stored CPUID data.  */
32 testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
/* Candidate result: the SSE4.2 version.  */
34 leaq __strchr_sse42(%rip), %rax
/* Check the Slow_BSF bit in the stored feature data.  */
36 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
/* Candidate result: the SSE2 version that, going by its name, avoids
   the BSF instruction.  */
38 leaq __strchr_sse2_no_bsf(%rip), %rax
44 This implementation uses SSE4 instructions to compare up to 16 bytes
45 at a time looking for the first occurrence of the character c in the
48 char *strchr (const char *s, int c);
52 | _SIDD_CMP_EQUAL_ANY
53 | _SIDD_LEAST_SIGNIFICANT
54 on pcmpistri to compare xmm/mem128
56 0 1 2 3 4 5 6 7 8 9 A B C D E F
57 X X X X X X X X X X X X X X X X
61 0 1 2 3 4 5 6 7 8 9 A B C D E F
62 C C C C C C C C C C C C C C C C
64 to find out if the first 16-byte data element has a byte C and the
65 offset of the first such byte. There are 3 cases:
67 1. The first 16-byte data element has the byte C at the offset X.
68 2. The first 16-byte data element has EOS and doesn't have the byte C.
69 3. The first 16-byte data element is valid and doesn't have the byte C.
71 Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
73 case ECX CFlag ZFlag SFlag
78 We exit from the loop for cases 1 and 2 with jbe which branches
79 when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
/* __strchr_sse42: the SSE4.2 variant, placed in its own
   .text.sse4.2 section.
   NOTE(review): most instructions of this function are not visible
   in this listing; the notes below describe only the lines shown.  */
82 .section .text.sse4.2,"ax",@progbits
84 .type __strchr_sse42, @function
98 /* Handle an unaligned string. */
103 /* Find where the terminating NUL byte (EOS) is. */
105 /* Check if there is a match. */
107 /* Remove the leading bytes. */
111 je L(unaligned_no_match)
112 /* Check which byte is a match. */
114 /* Is there a NUL byte (EOS)? */
116 je L(unaligned_match)
119 /* Return a null pointer if the NUL byte comes before the match. */
126 L(unaligned_no_match):
130 /* Loop start on aligned string. */
/* Each pcmpistri below compares the 16 bytes at (%r8) with %xmm1
   using control byte 0x2.  The instruction leaves its index result in
   %ecx and sets the flags described in the table in the comment
   above.  The same compare appears four times; presumably the loop is
   unrolled, with the pointer advanced and the flags tested between
   the compares -- those instructions are not shown, TODO confirm.  */
134 pcmpistri $0x2, (%r8), %xmm1
137 pcmpistri $0x2, (%r8), %xmm1
140 pcmpistri $0x2, (%r8), %xmm1
143 pcmpistri $0x2, (%r8), %xmm1
/* %rax = %r8 + %rcx: the address of the current 16-byte element plus
   the offset reported by pcmpistri.  */
157 leaq (%r8,%rcx), %rax
/* Record the size of __strchr_sse42 in the symbol table.  */
160 .size __strchr_sse42, .-__strchr_sse42
/* Redefine the entry/exit boilerplate so that the code pulled in by
   the #include at the end of this block is emitted under the fixed
   name __strchr_sse2, whatever name is passed to the macro.
   NOTE(review): the lines between the visible macro fragments are not
   shown in this listing; confirm the full ENTRY/END definitions.  */
164 # define ENTRY(name) \
165 .type __strchr_sse2, @function; \
167 __strchr_sse2: cfi_startproc; \
171 cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
172 # undef libc_hidden_builtin_def
173 /* It doesn't make sense to send libc-internal strchr calls through a PLT.
174 The speedup we get from using SSE4.2 instructions is likely eaten away
175 by the indirect call in the PLT. Instead, the definition below makes
    __GI_strchr a global alias of __strchr_sse2. */
176 # define libc_hidden_builtin_def(name) \
177 .globl __GI_strchr; __GI_strchr = __strchr_sse2
/* Include ../strchr.S with the macro overrides above in effect.  */
180 #include "../strchr.S"