Replace FSF snail mail address with URLs.
[glibc.git] / sysdeps / x86_64 / multiarch / strstr.c
blobb1b41397c0faa3f8ca789ab630f24dd337342a04
1 /* strstr with SSE4.2 intrinsics
2 Copyright (C) 2009, 2010, 2011 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
20 #include <nmmintrin.h>
21 #include "varshift.h"
23 #ifndef STRSTR_SSE42
24 # define STRSTR_SSE42 __strstr_sse42
25 #endif
27 #ifdef USE_AS_STRCASESTR
28 # include <ctype.h>
29 # include <locale/localeinfo.h>
31 # define LOADBYTE(C) tolower (C)
32 # define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
33 #else
34 # define LOADBYTE(C) (C)
35 # define CMPBYTE(C1, C2) ((C1) == (C2))
36 #endif
38 /* We use 0xc ordered-compare:
39 _SIDD_UBYTE_OPS
40 | _SIDD_CMP_EQUAL_ORDERED
41 | _SIDD_LEAST_SIGNIFICANT
42 on pcmpistri to do the scanning and string comparison requirements of
43 sub-string match. In the scanning phase, we process Cflag and ECX
44 index to locate the first fragment match; once the first fragment
45 match position has been identified, we do comparison of subsequent
46 string fragments until we can conclude a false or true match; when
47 concluding a false match, we may need to repeat the scanning process
48 from the next relevant offset in the target string.
50 In the scanning phase we have 4 cases:
51 case ECX CFlag ZFlag SFlag
52 1 16 0 0 0
53 2a 16 0 0 1
54 2b 16 0 1 0
55 2c 16 0 1 1
57 1. No ordered-comparison match, both 16B fragments are valid, so
58 continue to next fragment.
59 2. No ordered-comparison match, there is EOS in either fragment,
60 2a. Zflg = 0, Sflg = 1, we continue
61 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
62 2c. Zflg = 1, Sflg = 1, length determines match or no match
64 In the string comparison phase, the 1st fragment match is fixed up
65 to produce ECX = 0. Subsequent fragment compare of nonzero index
66 and no match conclude a false match.
68 case ECX CFlag ZFlag SFlag
69 3 X 1 0 0/1
70 4a 0 1 0 0
71 4b 0 1 0 1
72 4c 0 < X 1 0 0/1
73 5 16 0 1 0
75 3. An initial ordered-comparison fragment match, we fix up to do
76 subsequent string comparison
77 4a. Continuation of fragment comparison of a string compare.
78 4b. EOS reached in the reference string, we conclude true match and
79 return
80 4c. String compare failed if index is nonzero, we need to go back to
81 scanning
82 5. failed string compare, go back to scanning
85 /* Simple replacement of movdqu to address 4KB boundary cross issue.
86 If EOS occurs within less than 16B before 4KB boundary, we don't
87 cross to next page. */
89 static inline __m128i
90 __m128i_strloadu (const unsigned char * p, __m128i zero)
92 if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
94 size_t offset = ((size_t) p & (16 - 1));
95 __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
96 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
97 if ((bmsk >> offset) != 0)
98 return __m128i_shift_right (a, offset);
100 return _mm_loadu_si128 ((__m128i *) p);
103 #if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
105 /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
106 locale and other which have single-byte letters only in the ASCII
107 range. */
108 static inline __m128i
109 __m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
110 __m128i uchigh, __m128i lcqword)
112 __m128i frag = __m128i_strloadu (p, zero);
114 /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
115 __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
116 /* Compare if bytes are > 'A' - 1. */
117 __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
118 /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
119 __m128i mask = _mm_and_si128 (r2, r1);
120 /* Apply lowercase bit 6 mask for above mask bytes == ff. */
121 return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
124 #endif
/* Derive a Knuth-Morris-Pratt style forward shift from a fully
   populated 16-byte fragment of the reference string: a conservative
   distance the scan of the target string may advance after a failed
   candidate match without skipping a possible occurrence.  Only used
   when the reference string is at least 16 bytes long.  */
static int
__inline__ __attribute__ ((__always_inline__))
KMP16Bovrlap (__m128i s2)
{
  /* Broadcast byte 0 of s2 into all 16 lanes.  */
  __m128i dup = _mm_unpacklo_epi8 (s2, s2);
  __m128i head = _mm_unpacklo_epi8 (dup, dup);
  head = _mm_shuffle_epi32 (head, 0);

  /* Compare against s2 shifted down one byte: bit i of the mask means
     s2[i + 1] == s2[0] (bit 15 compares the shifted-in zero byte).  */
  __m128i tail = _mm_srli_si128 (s2, sizeof (char));
  int eqmask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (tail, head));

  /* _BitScanForward (&first, eqmask); FIRST is undefined when EQMASK
     is zero, but in that case it is never read below.  */
  int first;
  __asm ("bsfl %[eqmask], %[first]"
	 : [first] "=r" (first) : [eqmask] "r" (eqmask));

  if (!eqmask)
    /* Byte 0 never recurs within the next 15 bytes: a failed match
       cannot restart inside this fragment, shift the full 16.  */
    return 16;
  if (eqmask == 0x7fff)
    /* All 16 bytes are identical: only a shift of one is safe.  */
    return 1;
  if (!first)
    /* There are at least two distinct chars in s2.  If bytes 0 and 1
       are identical and the distinct value lies farther down, the next
       full compare can restart no earlier than byte 3.  */
    return 3;
  /* Byte 1 differs from byte 0: restart one past the first position
     where byte 0 recurs.  */
  return first + 1;
}
/* strstr (or strcasestr when USE_AS_STRCASESTR is defined) built on
   the SSE4.2 _mm_cmpistr* intrinsics with imm8 mode 0x0c: unsigned
   bytes, equal-ordered aggregation, least-significant index.  Each
   pcmpistri reports the first offset in the s1 fragment at which a
   prefix of the s2 fragment matches.  Returns a pointer to the first
   occurrence of s2 within s1, or NULL when there is none.
   NB: groups of _mm_cmpistr{c,z,i,s} calls with identical arguments
   each read a different flag of the same comparison; the compiler is
   expected to fuse them into a single pcmpistri.  */
char *
__attribute__ ((section (".text.sse4.2")))
STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
{
  /* s1 is advanced in place below; alias it as p1 for symmetry.  */
#define p1 s1
  const unsigned char *p2 = s2;

#ifndef STRCASESTR_NONASCII
  /* Empty s2 matches trivially at the start of s1.  */
  if (__builtin_expect (p2[0] == '\0', 0))
    return (char *) p1;

  /* Empty s1 cannot contain a non-empty s2.  */
  if (__builtin_expect (p1[0] == '\0', 0))
    return NULL;

  /* Check if p1 is 1 byte long; then only a 1-byte s2 can match.  */
  if (__builtin_expect (p1[1] == '\0', 0))
    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
#endif

#ifdef USE_AS_STRCASESTR
# ifndef STRCASESTR_NONASCII
  /* Locales with single-byte non-ASCII case pairs cannot use the
     ASCII-only lowercasing below; dispatch to the generic variant.  */
  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
			!= 0, 0))
    return __strcasestr_sse42_nonascii (s1, s2);

  /* Constants for __m128i_strloadu_tolower: 'A' - 1 (0x40),
     'Z' + 1 (0x5b) and the ASCII lowercase bit (0x20).  */
  const __m128i uclow = _mm_set1_epi8 (0x40);
  const __m128i uchigh = _mm_set1_epi8 (0x5b);
  const __m128i lcqword = _mm_set1_epi8 (0x20);
  const __m128i zero = _mm_setzero_si128 ();
# define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
# else
#  define strloadu __m128i_strloadu_tolower
#  define zero _mm_setzero_si128 ()
# endif
#else
# define strloadu(p) __m128i_strloadu (p, zero)
  const __m128i zero = _mm_setzero_si128 ();
#endif

  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
  __m128i frag1 = strloadu (p1);

  __m128i frag2;
  if (p2[1] != '\0')
    /* p2 is > 1 byte long.  */
    frag2 = strloadu (p2);
  else
    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);

  /* Unsigned bytes, equal order; all four intrinsics read flags of the
     same comparison: Cflag (partial match found), Zflag (EOS in
     frag1), index of the match, Sflag (EOS in frag2).  */
  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);

  /* Fast path: s2 fits in one fragment (Sflag) and a candidate match
     starts in the first fragment of s1 (Cflag).  Bitwise & is
     intentional: both values are 0/1.  */
  if (cmp_s & cmp_c)
    {
      /* Length of s2 = index of its first NUL byte.  */
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
      int len;
      __asm ("bsfl %[bmsk], %[len]"
	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
      p1 += cmp;
      if ((len + cmp) <= 16)
	/* The whole of s2 matched inside the loaded fragment.  */
	return (char *) p1;

      /* Candidate match straddles the fragment end; re-check from the
	 candidate position.  Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      if ((len + cmp) <= 16)
	return (char *) p1 + cmp;
    }

  if (cmp_s)
    {
      /* s2 is shorter than 16 bytes but no full match yet: advance p1
	 to each successive candidate offset and retry.  */
      while (!cmp_z)
	{
	  p1 += cmp;
	  /* Load up to 16 bytes of fragment.  */
	  frag1 = strloadu (p1);
	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
	     once already, this time cmp will be zero and we can exit.  */
	  if ((!cmp) & cmp_c)
	    break;
	}

      if (!cmp_c)
	return NULL;

      /* Since s2 is less than 16 bytes, cmp_c is a definitive
	 determination of full match.  */
      return (char *) p1 + cmp;
    }

  /* General case, s2 is at least 16 bytes or more.
     First, the common case of false-match at first byte of p2.
     pt remembers the candidate match start in s1; kmp_fwd caches the
     KMP overlap of s2's first fragment (computed lazily below).  */
  const unsigned char *pt = NULL;
  int kmp_fwd = 0;
re_trace:
  /* Scanning phase: walk s1 fragment by fragment until a partial
     match (Cflag) appears or s1 ends.  */
  while (!cmp_c)
    {
      /* frag1 has null.  */
      if (cmp_z)
	return NULL;

      /* frag1 has no null, advance 16 bytes.  */
      p1 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      /* Unsigned bytes, equal order, is there a partial match?  */
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
    }

  /* Next, handle initial positive match as first byte of p2.  We have
     a partial fragment match, make full determination until we reach
     the end of s2.  */
  if (!cmp)
    {
      if (cmp_z)
	/* Fragment matched through EOS of s1: full match.  */
	return (char *) p1;

      /* Full-fragment match at offset 0: record the candidate start
	 and move both strings to their next fragments.  */
      pt = p1;
      p1 += 16;
      p2 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag2 = strloadu (p2);
    }
  else
    {
      /* Partial match at non-zero offset: realign p1 to the candidate
	 start so the next compare yields index 0 on success.  */
      p1 += cmp;
      pt = p1;
    }

  /* Load up to 16 bytes of fragment.  */
  frag1 = strloadu (p1);

  /* Unsigned bytes, equal order, does frag2 have a null?  */
  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);

  /* String-compare phase: fragments keep matching fully (cmp == 0)
     with no EOS in either string; advance both in lockstep.  */
  while (!(cmp | cmp_z | cmp_s))
    {
      p1 += 16;
      p2 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag2 = strloadu (p2);
      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      /* Unsigned bytes, equal order, does frag2 have a null?  */
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
    }

  /* Full determination yielded a false result, retrace s1 to next
     starting position.
	Zflg	1	0	1	0/1
	Sflg	0	1	1	0/1
	cmp	na	0	0	>0
	action	done	done	continue  continue if s2 < s1
		false	match		  retrace s1 else false	 */

  if (cmp_s & !cmp)
    /* EOS of s2 reached with every fragment fully matched: match.  */
    return (char *) pt;
  if (cmp_z)
    {
      if (!cmp_s)
	/* EOS of s1 reached before s2 was exhausted: no match.  */
	return NULL;

      /* Handle both zero and sign flag set and s1 is shorter in
	 length.  Compare remaining lengths via first-NUL indices.  */
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
      int len;
      int len1;
      __asm ("bsfl %[bmsk], %[len]"
	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
      __asm ("bsfl %[bmsk1], %[len1]"
	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
      if (len >= len1)
	return NULL;
    }
  else if (!cmp)
    return (char *) pt;

  /* Otherwise, we have to retrace and continue.  Default of multiple
     paths that need to retrace from next byte in s1.  Reset p2/frag2
     to the start of s2.  */
  p2 = s2;
  frag2 = strloadu (p2);

  /* Compute the KMP overlap of s2's first fragment once, lazily.  */
  if (!kmp_fwd)
    kmp_fwd = KMP16Bovrlap (frag2);

  /* KMP algorithm predicted overlap needs to be corrected for
     partial fragment compare: never skip past the failing index.  */
  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);

  /* Since s2 is at least 16 bytes long, we're certain there is no
     match when s1 is already exhausted.  */
  if (p1[0] == '\0')
    return NULL;

  /* Load up to 16 bytes of fragment.  */
  frag1 = strloadu (p1);

  /* Unsigned bytes, equal order, is there a partial match?  */
  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  goto re_trace;
}