1 /* strstr with SSE4.2 intrinsics
2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 #include <nmmintrin.h>
24 # define STRSTR_SSE42 __strstr_sse42
27 #ifdef USE_AS_STRCASESTR
29 # include <locale/localeinfo.h>
31 # define LOADBYTE(C) tolower (C)
32 # define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
34 # define LOADBYTE(C) (C)
35 # define CMPBYTE(C1, C2) ((C1) == (C2))
38 /* We use 0xe ordered-compare:
40 | _SIDD_CMP_EQUAL_ORDER
41 | _SIDD_LEAST_SIGNIFICANT
42 on pcmpistri to do the scanning and string comparsion requirements of
43 sub-string match. In the scanning phase, we process Cflag and ECX
44 index to locate the first fragment match; once the first fragment
45 match position has been identified, we do comparison of subsequent
46 string fragments until we can conclude false or true match; whe
47 n concluding a false match, we may need to repeat scanning process
48 from next relevant offset in the target string.
50 In the scanning phase we have 4 cases:
51 case ECX CFlag ZFlag SFlag
57 1. No ordered-comparison match, both 16B fragments are valid, so
58 continue to next fragment.
59 2. No ordered-comparison match, there is EOS in either fragment,
60 2a. Zflg = 0, Sflg = 1, we continue
61 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
62 2c. Zflg = 1, sflg = 1, lenth determine match or no match
64 In the string comparison phase, the 1st fragment match is fixed up
65 to produce ECX = 0. Subsequent fragment compare of nonzero index
66 and no match conclude a false match.
68 case ECX CFlag ZFlag SFlag
75 3. An initial ordered-comparison fragment match, we fix up to do
76 subsequent string comparison
77 4a. Continuation of fragment comparison of a string compare.
78 4b. EOS reached in the reference string, we conclude true match and
80 4c. String compare failed if index is nonzero, we need to go back to
82 5. failed string compare, go back to scanning
85 /* Fix-up of removal of unneeded data due to 16B aligned load
87 value: 16B data loaded from 16B aligned address.
88 offset: Offset of target data address relative to 16B aligned load
92 static __inline__ __m128i
93 __m128i_shift_right (__m128i value
, int offset
)
98 value
= _mm_srli_si128 (value
, 1);
101 value
= _mm_srli_si128 (value
, 2);
104 value
= _mm_srli_si128 (value
, 3);
107 value
= _mm_srli_si128 (value
, 4);
110 value
= _mm_srli_si128 (value
, 5);
113 value
= _mm_srli_si128 (value
, 6);
116 value
= _mm_srli_si128 (value
, 7);
119 value
= _mm_srli_si128 (value
, 8);
122 value
= _mm_srli_si128 (value
, 9);
125 value
= _mm_srli_si128 (value
, 10);
128 value
= _mm_srli_si128 (value
, 11);
131 value
= _mm_srli_si128 (value
, 12);
134 value
= _mm_srli_si128 (value
, 13);
137 value
= _mm_srli_si128 (value
, 14);
140 value
= _mm_srli_si128 (value
, 15);
146 /* Simple replacement of movdqu to address 4KB boundary cross issue.
147 If EOS occurs within less than 16B before 4KB boundary, we don't
148 cross to next page. */
150 static inline __m128i
151 __m128i_strloadu (const unsigned char * p
)
153 int offset
= ((size_t) p
& (16 - 1));
155 if (offset
&& (int) ((size_t) p
& 0xfff) > 0xff0)
157 __m128i a
= _mm_load_si128 ((__m128i
*) (p
- offset
));
158 __m128i zero
= _mm_setzero_si128 ();
159 int bmsk
= _mm_movemask_epi8 (_mm_cmpeq_epi8 (a
, zero
));
160 if ((bmsk
>> offset
) != 0)
161 return __m128i_shift_right (a
, offset
);
163 return _mm_loadu_si128 ((__m128i
*) p
);
166 #if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
168 /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
170 static inline __m128i
171 __m128i_strloadu_tolower (const unsigned char * p
)
173 __m128i frag
= __m128i_strloadu (p
);
175 /* Convert frag to lower case for POSIX/C locale. */
176 __m128i rangeuc
= _mm_set_epi64x (0x0, 0x5a41);
177 __m128i u2ldelta
= _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
178 __m128i mask1
= _mm_cmpistrm (rangeuc
, frag
, 0x44);
179 __m128i mask2
= _mm_blendv_epi8 (u2ldelta
, frag
, mask1
);
180 mask2
= _mm_sub_epi8 (mask2
, u2ldelta
);
181 return _mm_blendv_epi8 (frag
, mask2
, mask1
);
186 /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
187 algorithm) overlap for a fully populated 16B vector.
188 Input parameter: 1st 16Byte loaded from the reference string of a
190 We don't use KMP algorithm if reference string is less than 16B. */
192 __inline__
__attribute__ ((__always_inline__
,))
193 KMP16Bovrlap (__m128i s2
)
195 __m128i b
= _mm_unpacklo_epi8 (s2
, s2
);
196 __m128i a
= _mm_unpacklo_epi8 (b
, b
);
197 a
= _mm_shuffle_epi32 (a
, 0);
198 b
= _mm_srli_si128 (s2
, sizeof (char));
199 int bmsk
= _mm_movemask_epi8 (_mm_cmpeq_epi8 (b
, a
));
201 /* _BitScanForward(&k1, bmsk); */
203 __asm ("bsfl %[bmsk], %[k1]" : [k1
] "=r" (k1
) : [bmsk
] "r" (bmsk
));
206 else if (bmsk
== 0x7fff)
210 /* There are al least two distinct chars in s2. If byte 0 and 1 are
211 idential and the distinct value lies farther down, we can deduce
212 the next byte offset to restart full compare is least no earlier
218 /* Byte 1 is not degenerated to byte 0. */
224 __attribute__ ((section (".text.sse4.2")))
225 STRSTR_SSE42 (const unsigned char *s1
, const unsigned char *s2
)
228 const unsigned char *p2
= s2
;
230 #ifndef STRCASESTR_NONASCII
231 if (__builtin_expect (p2
[0] == '\0', 0))
234 if (__builtin_expect (p1
[0] == '\0', 0))
237 /* Check if p1 length is 1 byte long. */
238 if (__builtin_expect (p1
[1] == '\0', 0))
239 return p2
[1] == '\0' && CMPBYTE (p1
[0], p2
[0]) ? (char *) p1
: NULL
;
242 #ifdef USE_AS_STRCASESTR
243 # ifndef STRCASESTR_NONASCII
244 if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE
, _NL_CTYPE_NONASCII_CASE
)
246 return __strcasestr_sse42_nonascii (s1
, s2
);
249 # define strloadu __m128i_strloadu_tolower
251 # define strloadu __m128i_strloadu
254 /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
255 __m128i frag1
= strloadu (p1
);
259 /* p2 is > 1 byte long. */
260 frag2
= strloadu (p2
);
262 frag2
= _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2
[0]), 0);
264 /* Unsigned bytes, equal order, does frag2 has null? */
265 int cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
266 int cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);
267 int cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
268 int cmp_s
= _mm_cmpistrs (frag2
, frag1
, 0x0c);
271 int bmsk
= _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2
,
272 _mm_setzero_si128 ()));
274 __asm ("bsfl %[bmsk], %[len]"
275 : [len
] "=r" (len
) : [bmsk
] "r" (bmsk
));
277 if ((len
+ cmp
) <= 16)
280 /* Load up to 16 bytes of fragment. */
281 frag1
= strloadu (p1
);
282 cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
283 cmp_s
= _mm_cmpistrs (frag2
, frag1
, 0x0c);
284 cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);
285 cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
286 if ((len
+ cmp
) <= 16)
287 return (char *) p1
+ cmp
;
292 /* Adjust addr for 16B alginment in ensuing loop. */
296 /* Load up to 16 bytes of fragment. */
297 frag1
= strloadu (p1
);
298 cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
299 cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
300 cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);
301 /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
302 once already, this time cmp will be zero and we can exit. */
310 /* Since s2 is less than 16 bytes, com_c is definitive
311 determination of full match. */
312 return (char *) p1
+ cmp
;
315 /* General case, s2 is at least 16 bytes or more.
316 First, the common case of false-match at first byte of p2. */
317 const unsigned char *pt
= NULL
;
322 /* frag1 has null. */
326 /* frag 1 has no null, advance 16 bytes. */
328 /* Load up to 16 bytes of fragment. */
329 frag1
= strloadu (p1
);
330 /* Unsigned bytes, equal order, is there a partial match? */
331 cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
332 cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
333 cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);
336 /* Next, handle initial positive match as first byte of p2. We have
337 a partial fragment match, make full determination until we reached
347 /* Load up to 16 bytes of fragment. */
348 frag2
= strloadu (p2
);
352 /* Adjust 16B alignment. */
357 /* Load up to 16 bytes of fragment. */
358 frag1
= strloadu (p1
);
360 /* Unsigned bytes, equal order, does frag2 has null? */
361 cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
362 cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);
363 cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
364 cmp_s
= _mm_cmpistrs (frag2
, frag1
, 0x0c);
365 while (!(cmp
| cmp_z
| cmp_s
))
369 /* Load up to 16 bytes of fragment. */
370 frag2
= strloadu (p2
);
371 /* Load up to 16 bytes of fragment. */
372 frag1
= strloadu (p1
);
373 /* Unsigned bytes, equal order, does frag2 has null? */
374 cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
375 cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);
376 cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
377 cmp_s
= _mm_cmpistrs (frag2
, frag1
, 0x0c);
380 /* Full determination yielded a false result, retrace s1 to next
385 action done done continue continue if s2 < s1
386 false match retrace s1 else false
396 /* Handle both zero and sign flag set and s1 is shorter in
398 __m128i zero
= _mm_setzero_si128 ();
399 int bmsk
= _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero
, frag2
));
400 int bmsk1
= _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero
, frag1
));
403 __asm ("bsfl %[bmsk], %[len]"
404 : [len
] "=r" (len
) : [bmsk
] "r" (bmsk
));
405 __asm ("bsfl %[bmsk1], %[len1]"
406 : [len1
] "=r" (len1
) : [bmsk1
] "r" (bmsk1
));
413 /* Otherwise, we have to retrace and continue. Default of multiple
414 paths that need to retrace from next byte in s1. */
416 frag2
= strloadu (p2
);
419 kmp_fwd
= KMP16Bovrlap (frag2
);
421 /* KMP algorithm predicted overlap needs to be corrected for
422 partial fragment compare. */
423 p1
= pt
+ (kmp_fwd
> cmp
? cmp
: kmp_fwd
);
425 /* Since s2 is at least 16 bytes long, we're certain there is no
430 /* Load up to 16 bytes of fragment. */
431 frag1
= strloadu (p1
);
433 /* Unsigned bytes, equal order, is there a partial match? */
434 cmp_c
= _mm_cmpistrc (frag2
, frag1
, 0x0c);
435 cmp
= _mm_cmpistri (frag2
, frag1
, 0x0c);
436 cmp_z
= _mm_cmpistrz (frag2
, frag1
, 0x0c);