Speed up x86-64 strcasestr a bit more.
[glibc.git] / sysdeps / x86_64 / multiarch / strstr.c
/* strstr with SSE4.2 intrinsics
   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
#include <nmmintrin.h>

#ifndef STRSTR_SSE42
# define STRSTR_SSE42 __strstr_sse42
#endif

#ifdef USE_AS_STRCASESTR
# include <ctype.h>
# include <locale/localeinfo.h>

# define LOADBYTE(C) tolower (C)
# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
#else
# define LOADBYTE(C) (C)
# define CMPBYTE(C1, C2) ((C1) == (C2))
#endif
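
/* For example, in the strcasestr build CMPBYTE ('A', 'a') is true,
   while in the plain strstr build it is not.  */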
/* We use the 0x0c ordered-compare mode:
	_SIDD_UBYTE_OPS
	| _SIDD_CMP_EQUAL_ORDERED
	| _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to do the scanning and string comparison required for
   sub-string matching.  In the scanning phase, we process the CFlag
   and the ECX index to locate the first fragment match; once the first
   fragment match position has been identified, we compare subsequent
   string fragments until we can conclude a false or true match; when
   concluding a false match, we may need to repeat the scanning process
   from the next relevant offset in the target string.

   In the scanning phase we have 4 cases:
     case	ECX	CFlag	ZFlag	SFlag
      1		16	  0	  0	  0
      2a	16	  0	  0	  1
      2b	16	  0	  1	  0
      2c	16	  0	  1	  1

   1. No ordered-comparison match, both 16B fragments are valid, so
      continue to the next fragment.
   2. No ordered-comparison match, there is EOS in either fragment:
      2a. ZFlag = 0, SFlag = 1, we continue.
      2b. ZFlag = 1, SFlag = 0, we conclude no match and return.
      2c. ZFlag = 1, SFlag = 1, the lengths determine match or no match.

   In the string comparison phase, the 1st fragment match is fixed up
   to produce ECX = 0.  A subsequent fragment compare with a nonzero
   index and no match concludes a false match.

     case	ECX	CFlag	ZFlag	SFlag
      3		 X	  1	  0	 0/1
      4a	 0	  1	  0	  0
      4b	 0	  1	  0	  1
      4c     0 < X	  1	  0	 0/1
      5		16	  0	  1	  0

   3. An initial ordered-comparison fragment match; we fix up to do
      subsequent string comparison.
   4a. Continuation of fragment comparison of a string compare.
   4b. EOS reached in the reference string; we conclude a true match
       and return.
   4c. String compare failed if the index is nonzero; we need to go
       back to scanning.
   5. Failed string compare; go back to scanning.  */
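
/* Illustrative example (not from the original code): with the needle
   fragment frag2 = "bcd" followed by nulls and the haystack fragment
   frag1 = "aaabcdaaaaaaaaaa" (16 valid bytes, no null), pcmpistri in
   the 0x0c mode yields ECX = 3 (the least significant offset where an
   ordered match of "bcd" begins), CFlag = 1, ZFlag = 0 and SFlag = 1
   (the needle fragment contains a null).  With
   frag1 = "aaaaaaaaaaaaaaaa" instead, there is no ordered match:
   ECX = 16 and CFlag = 0, which is case 2a above.  */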
/* Fix-up to remove the unneeded data of a 16B aligned load.
   Parameters:
     value: 16B of data loaded from a 16B aligned address.
     offset: Offset of the target data address relative to the 16B
	     aligned load address.  */
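
/* For instance (illustration only): if P = A + 3 for a 16B aligned
   address A, a 16B load from A picks up 3 bytes that precede P;
   __m128i_shift_right (value, 3) discards them, leaving the bytes
   starting at P in the low positions.  The switch below is needed
   because _mm_srli_si128 requires an immediate shift count.  */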
static __inline__ __m128i
__m128i_shift_right (__m128i value, int offset)
{
  switch (offset)
    {
    case 1:
      value = _mm_srli_si128 (value, 1);
      break;
    case 2:
      value = _mm_srli_si128 (value, 2);
      break;
    case 3:
      value = _mm_srli_si128 (value, 3);
      break;
    case 4:
      value = _mm_srli_si128 (value, 4);
      break;
    case 5:
      value = _mm_srli_si128 (value, 5);
      break;
    case 6:
      value = _mm_srli_si128 (value, 6);
      break;
    case 7:
      value = _mm_srli_si128 (value, 7);
      break;
    case 8:
      value = _mm_srli_si128 (value, 8);
      break;
    case 9:
      value = _mm_srli_si128 (value, 9);
      break;
    case 10:
      value = _mm_srli_si128 (value, 10);
      break;
    case 11:
      value = _mm_srli_si128 (value, 11);
      break;
    case 12:
      value = _mm_srli_si128 (value, 12);
      break;
    case 13:
      value = _mm_srli_si128 (value, 13);
      break;
    case 14:
      value = _mm_srli_si128 (value, 14);
      break;
    case 15:
      value = _mm_srli_si128 (value, 15);
      break;
    }
  return value;
}
/* Simple replacement for movdqu to address the 4KB boundary crossing
   issue.  If EOS occurs within the last 16B before a 4KB boundary, we
   don't cross into the next page.  */
static inline __m128i
__m128i_strloadu (const unsigned char *p)
{
  int offset = ((size_t) p & (16 - 1));

  if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
    {
      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
      __m128i zero = _mm_setzero_si128 ();
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
      if ((bmsk >> offset) != 0)
	return __m128i_shift_right (a, offset);
    }
  return _mm_loadu_si128 ((__m128i *) p);
}
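
/* Example of the guard above (hypothetical address): for a pointer p
   with (p & 0xfff) == 0xff8, offset = 8 and 0xff8 > 0xff0, so the 16B
   load is taken from the aligned address p - 8 instead of p.  If the
   aligned 16 bytes already contain the terminating null at or beyond
   p (i.e. bmsk >> offset is nonzero), the unneeded low 8 bytes are
   shifted out and the next page is never touched; otherwise the
   string provably extends past the boundary and the unaligned load is
   safe.  */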
#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII

/* Similar to __m128i_strloadu.  Convert to lower case for the POSIX/C
   locale.  */
static inline __m128i
__m128i_strloadu_tolower (const unsigned char *p, __m128i rangeuc,
			  __m128i u2ldelta)
{
  __m128i frag = __m128i_strloadu (p);

#define UCLOW 0x4040404040404040ULL
#define UCHIGH 0x5a5a5a5a5a5a5a5aULL
#define LCQWORD 0x2020202020202020ULL
  /* Compare if 'Z' > bytes.  Inverted way to get a mask for byte <= 'Z'.  */
  __m128i r2 = _mm_cmpgt_epi8 (_mm_set1_epi64x (UCHIGH), frag);
  /* Compare if bytes are > 'A' - 1.  */
  __m128i r1 = _mm_cmpgt_epi8 (frag, _mm_set1_epi64x (UCLOW));
  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
  __m128i mask = _mm_and_si128 (r2, r1);
  /* Apply the lowercase bit mask to the bytes selected above.  */
  return _mm_or_si128 (frag, _mm_and_si128 (mask, _mm_set1_epi64x (LCQWORD)));
}
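
/* The trick above relies on the ASCII layout (an assumption of this
   non-multibyte path): an uppercase letter differs from its lowercase
   counterpart only in the 0x20 bit, e.g. 'A' = 0x41 and 'a' = 0x61,
   so OR-ing 0x20 into the bytes selected by the mask lowercases them
   while leaving every other byte unchanged.  */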
#endif

/* Calculate the Knuth-Morris-Pratt (KMP) string searching algorithm
   overlap for a fully populated 16B vector.
   Input parameter: the first 16 bytes loaded from the reference string
   of a strstr function.
   We don't use the KMP algorithm if the reference string is less than
   16B.  */
static int
__inline__ __attribute__ ((__always_inline__))
KMP16Bovrlap (__m128i s2)
{
  __m128i b = _mm_unpacklo_epi8 (s2, s2);
  __m128i a = _mm_unpacklo_epi8 (b, b);
  a = _mm_shuffle_epi32 (a, 0);
  b = _mm_srli_si128 (s2, sizeof (char));
  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));

  /* _BitScanForward(&k1, bmsk); */
  int k1;
  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
  if (!bmsk)
    return 16;
  else if (bmsk == 0x7fff)
    return 1;
  else if (!k1)
    {
      /* There are at least two distinct chars in s2.  If byte 0 and 1
	 are identical and the distinct value lies farther down, we can
	 deduce that the next byte offset to restart a full compare is
	 no earlier than byte 3.  */
      return 3;
    }
  else
    {
      /* Byte 1 is not degenerated to byte 0.  */
      return k1 + 1;
    }
}
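
/* Illustrative examples (not from the original code): for s2 beginning
   "abcabc..." the first byte 'a' recurs at offset 3, so bmsk has its
   first set bit at k1 = 2 and KMP16Bovrlap returns 3: after a failed
   match at pt, the next candidate can start no earlier than pt + 3.
   For 16 identical leading bytes ("aaaa...") bmsk == 0x7fff and the
   shift is only 1; if the first byte never recurs in the next 15
   bytes, the whole fragment can be skipped (return 16).  */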
char *
__attribute__ ((section (".text.sse4.2")))
STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
{
#define p1 s1
  const unsigned char *p2 = s2;

#ifndef STRCASESTR_NONASCII
  if (__builtin_expect (p2[0] == '\0', 0))
    return (char *) p1;

  if (__builtin_expect (p1[0] == '\0', 0))
    return NULL;

  /* Check if p1 is 1 byte long.  */
  if (__builtin_expect (p1[1] == '\0', 0))
    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
#endif
#ifdef USE_AS_STRCASESTR
# ifndef STRCASESTR_NONASCII
  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
			!= 0, 0))
    return __strcasestr_sse42_nonascii (s1, s2);

  const __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
  const __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
#  define strloadu(p) __m128i_strloadu_tolower (p, rangeuc, u2ldelta)
# else
#  define strloadu __m128i_strloadu_tolower
# endif
#else
# define strloadu __m128i_strloadu
#endif
  /* p1 is > 1 byte long.  Load up to 16 bytes of fragment.  */
  __m128i frag1 = strloadu (p1);

  __m128i frag2;
  if (p2[1] != '\0')
    /* p2 is > 1 byte long.  */
    frag2 = strloadu (p2);
  else
    frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0);

  /* Unsigned bytes, equal order, does frag2 have a null?  */
  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
  if (cmp_s & cmp_c)
    {
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2,
						    _mm_setzero_si128 ()));
      int len;
      __asm ("bsfl %[bmsk], %[len]"
	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
      p1 += cmp;
      if ((len + cmp) <= 16)
	return (char *) p1;

      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      if ((len + cmp) <= 16)
	return (char *) p1 + cmp;
    }
  if (cmp_s)
    {
      /* Adjust addr for 16B alignment in the ensuing loop.  */
      while (!cmp_z)
	{
	  p1 += cmp;
	  /* Load up to 16 bytes of fragment.  */
	  frag1 = strloadu (p1);
	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
	  /* Because s2 < 16 bytes and we adjusted p1 by a nonzero cmp
	     once already, this time cmp will be zero and we can exit.  */
	  if ((!cmp) & cmp_c)
	    break;
	}

      if (!cmp_c)
	return NULL;

      /* Since s2 is less than 16 bytes, cmp_c is a definitive
	 determination of a full match.  */
      return (char *) p1 + cmp;
    }
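
/* Worked example for the short-needle path above (illustration only):
   s1 = 20 'x' bytes followed by "abc", s2 = "abc".  The first fragment
   of s1 produces no match (cmp = 16, cmp_c = 0, cmp_z = 0), so the
   loop advances p1 by 16; the second fragment "xxxxabc" matches at
   index 4 with cmp_z = 1, the loop ends, and p1 + cmp points at the
   occurrence of "abc".  */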
  /* General case, s2 is at least 16 bytes or more.
     First, the common case of a false match at the first byte of p2.  */
  const unsigned char *pt = NULL;
  int kmp_fwd = 0;
re_trace:
  while (!cmp_c)
    {
      /* frag1 has a null.  */
      if (cmp_z)
	return NULL;

      /* frag1 has no null; advance 16 bytes.  */
      p1 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      /* Unsigned bytes, equal order, is there a partial match?  */
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
    }
  /* Next, handle an initial positive match as the first byte of p2.
     We have a partial fragment match; make a full determination until
     we reach the end of s2.  */
  if (!cmp)
    {
      if (cmp_z)
	return (char *) p1;

      pt = p1;
      p1 += 16;
      p2 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag2 = strloadu (p2);
    }
  else
    {
      /* Adjust 16B alignment.  */
      p1 += cmp;
      pt = p1;
    }

  /* Load up to 16 bytes of fragment.  */
  frag1 = strloadu (p1);

  /* Unsigned bytes, equal order, does frag2 have a null?  */
  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
  while (!(cmp | cmp_z | cmp_s))
    {
      p1 += 16;
      p2 += 16;
      /* Load up to 16 bytes of fragment.  */
      frag2 = strloadu (p2);
      /* Load up to 16 bytes of fragment.  */
      frag1 = strloadu (p1);
      /* Unsigned bytes, equal order, does frag2 have a null?  */
      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
    }
  /* Full determination yielded a false result; retrace s1 to the next
     starting position.
	Zflg	  1	  0	  1	   0/1
	Sflg	  0	  1	  1	   0/1
	cmp	  na	  0	  0	   >0
	action	done;	done;	continue;  continue if s2 < s1,
		false	match	retrace s1 else false  */
  if (cmp_s & !cmp)
    return (char *) pt;

  if (cmp_z)
    {
      if (!cmp_s)
	return NULL;

      /* Handle both the zero and the sign flag set and s1 shorter in
	 length.  */
      __m128i zero = _mm_setzero_si128 ();
      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
      int len;
      int len1;
      __asm ("bsfl %[bmsk], %[len]"
	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
      __asm ("bsfl %[bmsk1], %[len1]"
	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
      if (len >= len1)
	return NULL;
    }
  else if (!cmp)
    return (char *) pt;
  /* Otherwise, we have to retrace and continue.  Default of the
     multiple paths that need to retrace from the next byte in s1.  */
  p2 = s2;
  frag2 = strloadu (p2);

  if (!kmp_fwd)
    kmp_fwd = KMP16Bovrlap (frag2);

  /* The overlap predicted by the KMP algorithm needs to be corrected
     for a partial fragment compare.  */
  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);

  /* Since s2 is at least 16 bytes long, we're certain there is no
     match.  */
  if (p1[0] == '\0')
    return NULL;

  /* Load up to 16 bytes of fragment.  */
  frag1 = strloadu (p1);

  /* Unsigned bytes, equal order, is there a partial match?  */
  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
  goto re_trace;
}