Cleanup Changelog files
[glibc.git] / sysdeps / x86_64 / multiarch / strstr.c
blob76d5ad16dff9a5540d870698881c0672b5563dec
1 /* strstr with SSE4.2 intrinsics
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <nmmintrin.h>
23 #ifndef STRSTR_SSE42
24 # define STRSTR_SSE42 __strstr_sse42
25 #endif
27 #ifdef USE_AS_STRCASESTR
28 # include <ctype.h>
29 # include <locale/localeinfo.h>
31 # define LOADBYTE(C) tolower (C)
32 # define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
33 #else
34 # define LOADBYTE(C) (C)
35 # define CMPBYTE(C1, C2) ((C1) == (C2))
36 #endif
38 /* We use 0xe ordered-compare:
39 _SIDD_SBYTE_OPS
40 | _SIDD_CMP_EQUAL_ORDER
41 | _SIDD_LEAST_SIGNIFICANT
42 on pcmpistri to do the scanning and string comparsion requirements of
43 sub-string match. In the scanning phase, we process Cflag and ECX
44 index to locate the first fragment match; once the first fragment
45 match position has been identified, we do comparison of subsequent
46 string fragments until we can conclude false or true match; whe
47 n concluding a false match, we may need to repeat scanning process
48 from next relevant offset in the target string.
50 In the scanning phase we have 4 cases:
51 case ECX CFlag ZFlag SFlag
52 1 16 0 0 0
53 2a 16 0 0 1
54 2b 16 0 1 0
55 2c 16 0 1 1
57 1. No ordered-comparison match, both 16B fragments are valid, so
58 continue to next fragment.
59 2. No ordered-comparison match, there is EOS in either fragment,
60 2a. Zflg = 0, Sflg = 1, we continue
61 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
62 2c. Zflg = 1, sflg = 1, lenth determine match or no match
64 In the string comparison phase, the 1st fragment match is fixed up
65 to produce ECX = 0. Subsequent fragment compare of nonzero index
66 and no match conclude a false match.
68 case ECX CFlag ZFlag SFlag
69 3 X 1 0 0/1
70 4a 0 1 0 0
71 4b 0 1 0 1
72 4c 0 < X 1 0 0/1
73 5 16 0 1 0
75 3. An initial ordered-comparison fragment match, we fix up to do
76 subsequent string comparison
77 4a. Continuation of fragment comparison of a string compare.
78 4b. EOS reached in the reference string, we conclude true match and
79 return
80 4c. String compare failed if index is nonzero, we need to go back to
81 scanning
82 5. failed string compare, go back to scanning
85 /* Fix-up of removal of unneeded data due to 16B aligned load
86 parameters:
87 value: 16B data loaded from 16B aligned address.
88 offset: Offset of target data address relative to 16B aligned load
89 address.
92 static __inline__ __m128i
93 __m128i_shift_right (__m128i value, int offset)
95 switch (offset)
97 case 1:
98 value = _mm_srli_si128 (value, 1);
99 break;
100 case 2:
101 value = _mm_srli_si128 (value, 2);
102 break;
103 case 3:
104 value = _mm_srli_si128 (value, 3);
105 break;
106 case 4:
107 value = _mm_srli_si128 (value, 4);
108 break;
109 case 5:
110 value = _mm_srli_si128 (value, 5);
111 break;
112 case 6:
113 value = _mm_srli_si128 (value, 6);
114 break;
115 case 7:
116 value = _mm_srli_si128 (value, 7);
117 break;
118 case 8:
119 value = _mm_srli_si128 (value, 8);
120 break;
121 case 9:
122 value = _mm_srli_si128 (value, 9);
123 break;
124 case 10:
125 value = _mm_srli_si128 (value, 10);
126 break;
127 case 11:
128 value = _mm_srli_si128 (value, 11);
129 break;
130 case 12:
131 value = _mm_srli_si128 (value, 12);
132 break;
133 case 13:
134 value = _mm_srli_si128 (value, 13);
135 break;
136 case 14:
137 value = _mm_srli_si128 (value, 14);
138 break;
139 case 15:
140 value = _mm_srli_si128 (value, 15);
141 break;
143 return value;
146 /* Simple replacement of movdqu to address 4KB boundary cross issue.
147 If EOS occurs within less than 16B before 4KB boundary, we don't
148 cross to next page. */
150 static __m128i
151 __attribute__ ((section (".text.sse4.2")))
152 __m128i_strloadu (const unsigned char * p)
154 int offset = ((size_t) p & (16 - 1));
156 if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
158 __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
159 __m128i zero = _mm_setzero_si128 ();
160 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
161 if ((bmsk >> offset) != 0)
162 return __m128i_shift_right (a, offset);
164 return _mm_loadu_si128 ((__m128i *) p);
167 #ifdef USE_AS_STRCASESTR
169 /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
170 locale. */
172 static __m128i
173 __attribute__ ((section (".text.sse4.2")))
174 __m128i_strloadu_tolower_posix (const unsigned char * p)
176 __m128i frag = __m128i_strloadu (p);
178 /* Convert frag to lower case for POSIX/C locale. */
179 __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
180 __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
181 __m128i mask1 = _mm_cmpistrm (rangeuc, frag, 0x44);
182 __m128i mask2 = _mm_blendv_epi8 (u2ldelta, frag, mask1);
183 mask2 = _mm_sub_epi8 (mask2, u2ldelta);
184 return _mm_blendv_epi8 (frag, mask2, mask1);
187 /* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
188 locale. */
190 static __m128i
191 __attribute__ ((section (".text.sse4.2")))
192 __m128i_strloadu_tolower (const unsigned char * p)
194 union
196 char b[16];
197 __m128i x;
198 } u;
200 for (int i = 0; i < 16; i++)
201 if (p[i] == 0)
203 u.b[i] = 0;
204 break;
206 else
207 u.b[i] = tolower (p[i]);
209 return u.x;
211 #endif
213 /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
214 algorithm) overlap for a fully populated 16B vector.
215 Input parameter: 1st 16Byte loaded from the reference string of a
216 strstr function.
217 We don't use KMP algorithm if reference string is less than 16B.
220 static int
221 __inline__ __attribute__ ((__always_inline__,))
222 KMP16Bovrlap (__m128i s2)
224 __m128i b = _mm_unpacklo_epi8 (s2, s2);
225 __m128i a = _mm_unpacklo_epi8 (b, b);
226 a = _mm_shuffle_epi32 (a, 0);
227 b = _mm_srli_si128 (s2, sizeof (char));
228 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
230 /* _BitScanForward(&k1, bmsk); */
231 int k1;
232 __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
233 if (!bmsk)
234 return 16;
235 else if (bmsk == 0x7fff)
236 return 1;
237 else if (!k1)
239 /* There are al least two ditinct char in s2. If byte 0 and 1 are
240 idential and the distinct value lies farther down, we can deduce
241 the next byte offset to restart full compare is least no earlier
242 than byte 3. */
243 return 3;
245 else
247 /* Byte 1 is not degenerated to byte 0. */
248 return k1 + 1;
252 char *
253 __attribute__ ((section (".text.sse4.2")))
254 STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
256 #define p1 s1
257 const unsigned char *p2 = s2;
259 if (p2[0] == '\0')
260 return (char *) p1;
262 if (p1[0] == '\0')
263 return NULL;
265 /* Check if p1 length is 1 byte long. */
266 if (p1[1] == '\0')
267 return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
269 #ifdef USE_AS_STRCASESTR
270 __m128i (*strloadu) (const unsigned char *);
272 if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE) == 0)
273 strloadu = __m128i_strloadu_tolower_posix;
274 else
275 strloadu = __m128i_strloadu_tolower;
276 #else
277 # define strloadu __m128i_strloadu
278 #endif
280 /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
281 __m128i frag1 = strloadu (p1);
283 __m128i frag2;
284 if (p2[1] != '\0')
285 /* p2 is > 1 byte long. */
286 frag2 = strloadu (p2);
287 else
288 frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0);
290 /* Unsigned bytes, equal order, does frag2 has null? */
291 int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
292 int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
293 int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
294 int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
295 if (cmp_s & cmp_c)
297 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2,
298 _mm_setzero_si128 ()));
299 int len;
300 __asm ("bsfl %[bmsk], %[len]"
301 : [len] "=r" (len) : [bmsk] "r" (bmsk));
302 p1 += cmp;
303 if ((len + cmp) <= 16)
304 return (char *) p1;
306 /* Load up to 16 bytes of fragment. */
307 frag1 = strloadu (p1);
308 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
309 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
310 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
311 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
312 if ((len + cmp) <= 16)
313 return (char *) p1 + cmp;
316 if (cmp_s)
318 /* Adjust addr for 16B alginment in ensuing loop. */
319 while (!cmp_z)
321 p1 += cmp;
322 /* Load up to 16 bytes of fragment. */
323 frag1 = strloadu (p1);
324 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
325 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
326 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
327 /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
328 once already, this time cmp will be zero and we can exit. */
329 if ((!cmp) & cmp_c)
330 break;
333 if (!cmp_c)
334 return NULL;
336 /* Since s2 is less than 16 bytes, com_c is definitive
337 determination of full match. */
338 return (char *) p1 + cmp;
341 /* General case, s2 is at least 16 bytes or more.
342 First, the common case of false-match at first byte of p2. */
343 const unsigned char *pt = NULL;
344 int kmp_fwd = 0;
345 re_trace:
346 while (!cmp_c)
348 /* frag1 has null. */
349 if (cmp_z)
350 return NULL;
352 /* frag 1 has no null, advance 16 bytes. */
353 p1 += 16;
354 /* Load up to 16 bytes of fragment. */
355 frag1 = strloadu (p1);
356 /* Unsigned bytes, equal order, is there a partial match? */
357 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
358 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
359 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
362 /* Next, handle initial positive match as first byte of p2. We have
363 a partial fragment match, make full determination until we reached
364 end of s2. */
365 if (!cmp)
367 if (cmp_z)
368 return (char *) p1;
370 pt = p1;
371 p1 += 16;
372 p2 += 16;
373 /* Load up to 16 bytes of fragment. */
374 frag2 = strloadu (p2);
376 else
378 /* Adjust 16B alignment. */
379 p1 += cmp;
380 pt = p1;
383 /* Load up to 16 bytes of fragment. */
384 frag1 = strloadu (p1);
386 /* Unsigned bytes, equal order, does frag2 has null? */
387 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
388 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
389 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
390 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
391 while (!(cmp | cmp_z | cmp_s))
393 p1 += 16;
394 p2 += 16;
395 /* Load up to 16 bytes of fragment. */
396 frag2 = strloadu (p2);
397 /* Load up to 16 bytes of fragment. */
398 frag1 = strloadu (p1);
399 /* Unsigned bytes, equal order, does frag2 has null? */
400 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
401 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
402 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
403 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
406 /* Full determination yielded a false result, retrace s1 to next
407 starting position.
408 Zflg 1 0 1 0/1
409 Sflg 0 1 1 0/1
410 cmp na 0 0 >0
411 action done done continue continue if s2 < s1
412 false match retrace s1 else false
415 if (cmp_s & !cmp)
416 return (char *) pt;
417 if (cmp_z)
419 if (!cmp_s)
420 return NULL;
422 /* Handle both zero and sign flag set and s1 is shorter in
423 length. */
424 __m128i zero = _mm_setzero_si128 ();
425 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
426 int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
427 int len;
428 int len1;
429 __asm ("bsfl %[bmsk], %[len]"
430 : [len] "=r" (len) : [bmsk] "r" (bmsk));
431 __asm ("bsfl %[bmsk1], %[len1]"
432 : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
433 if (len >= len1)
434 return NULL;
436 else if (!cmp)
437 return (char *) pt;
439 /* Otherwise, we have to retrace and continue. Default of multiple
440 paths that need to retrace from next byte in s1. */
441 p2 = s2;
442 frag2 = strloadu (p2);
444 if (!kmp_fwd)
445 kmp_fwd = KMP16Bovrlap (frag2);
447 /* KMP algorithm predicted overlap needs to be corrected for
448 partial fragment compare. */
449 p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
451 /* Since s2 is at least 16 bytes long, we're certain there is no
452 match. */
453 if (p1[0] == '\0')
454 return NULL;
456 /* Load up to 16 bytes of fragment. */
457 frag1 = strloadu (p1);
459 /* Unsigned bytes, equal order, is there a partial match? */
460 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
461 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
462 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
463 goto re_trace;