Work around old buggy program which cannot cope with memcpy semantics.
[glibc.git] / sysdeps / x86_64 / multiarch / strstr.c
blobb408b752fa460988790eea85046fa1cd7e3e14b0
1 /* strstr with SSE4.2 intrinsics
2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <nmmintrin.h>
22 #include "varshift.h"
24 #ifndef STRSTR_SSE42
25 # define STRSTR_SSE42 __strstr_sse42
26 #endif
28 #ifdef USE_AS_STRCASESTR
29 # include <ctype.h>
30 # include <locale/localeinfo.h>
32 # define LOADBYTE(C) tolower (C)
33 # define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
34 #else
35 # define LOADBYTE(C) (C)
36 # define CMPBYTE(C1, C2) ((C1) == (C2))
37 #endif
39 /* We use 0xe ordered-compare:
40 _SIDD_SBYTE_OPS
41 | _SIDD_CMP_EQUAL_ORDER
42 | _SIDD_LEAST_SIGNIFICANT
43 on pcmpistri to do the scanning and string comparsion requirements of
44 sub-string match. In the scanning phase, we process Cflag and ECX
45 index to locate the first fragment match; once the first fragment
46 match position has been identified, we do comparison of subsequent
47 string fragments until we can conclude false or true match; whe
48 n concluding a false match, we may need to repeat scanning process
49 from next relevant offset in the target string.
51 In the scanning phase we have 4 cases:
52 case ECX CFlag ZFlag SFlag
53 1 16 0 0 0
54 2a 16 0 0 1
55 2b 16 0 1 0
56 2c 16 0 1 1
58 1. No ordered-comparison match, both 16B fragments are valid, so
59 continue to next fragment.
60 2. No ordered-comparison match, there is EOS in either fragment,
61 2a. Zflg = 0, Sflg = 1, we continue
62 2b. Zflg = 1, Sflg = 0, we conclude no match and return.
63 2c. Zflg = 1, sflg = 1, lenth determine match or no match
65 In the string comparison phase, the 1st fragment match is fixed up
66 to produce ECX = 0. Subsequent fragment compare of nonzero index
67 and no match conclude a false match.
69 case ECX CFlag ZFlag SFlag
70 3 X 1 0 0/1
71 4a 0 1 0 0
72 4b 0 1 0 1
73 4c 0 < X 1 0 0/1
74 5 16 0 1 0
76 3. An initial ordered-comparison fragment match, we fix up to do
77 subsequent string comparison
78 4a. Continuation of fragment comparison of a string compare.
79 4b. EOS reached in the reference string, we conclude true match and
80 return
81 4c. String compare failed if index is nonzero, we need to go back to
82 scanning
83 5. failed string compare, go back to scanning
86 /* Simple replacement of movdqu to address 4KB boundary cross issue.
87 If EOS occurs within less than 16B before 4KB boundary, we don't
88 cross to next page. */
90 static inline __m128i
91 __m128i_strloadu (const unsigned char * p)
93 int offset = ((size_t) p & (16 - 1));
95 if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
97 __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
98 __m128i zero = _mm_setzero_si128 ();
99 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
100 if ((bmsk >> offset) != 0)
101 return __m128i_shift_right (a, offset);
103 return _mm_loadu_si128 ((__m128i *) p);
106 #if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
108 /* Similar to __m128i_strloadu. Convert to lower case for POSIX/C
109 locale. */
110 static inline __m128i
111 __m128i_strloadu_tolower (const unsigned char *p, __m128i rangeuc,
112 __m128i u2ldelta)
114 __m128i frag = __m128i_strloadu (p);
116 #define UCLOW 0x4040404040404040ULL
117 #define UCHIGH 0x5b5b5b5b5b5b5b5bULL
118 #define LCQWORD 0x2020202020202020ULL
119 /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'. */
120 __m128i r2 = _mm_cmpgt_epi8 (_mm_set1_epi64x (UCHIGH), frag);
121 /* Compare if bytes are > 'A' - 1. */
122 __m128i r1 = _mm_cmpgt_epi8 (frag, _mm_set1_epi64x (UCLOW));
123 /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1. */
124 __m128i mask = _mm_and_si128 (r2, r1);
125 /* Apply lowercase bit 6 mask for above mask bytes == ff. */
126 return _mm_or_si128 (frag, _mm_and_si128 (mask, _mm_set1_epi64x (LCQWORD)));
129 #endif
131 /* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
132 algorithm) overlap for a fully populated 16B vector.
133 Input parameter: 1st 16Byte loaded from the reference string of a
134 strstr function.
135 We don't use KMP algorithm if reference string is less than 16B. */
136 static int
137 __inline__ __attribute__ ((__always_inline__,))
138 KMP16Bovrlap (__m128i s2)
140 __m128i b = _mm_unpacklo_epi8 (s2, s2);
141 __m128i a = _mm_unpacklo_epi8 (b, b);
142 a = _mm_shuffle_epi32 (a, 0);
143 b = _mm_srli_si128 (s2, sizeof (char));
144 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
146 /* _BitScanForward(&k1, bmsk); */
147 int k1;
148 __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
149 if (!bmsk)
150 return 16;
151 else if (bmsk == 0x7fff)
152 return 1;
153 else if (!k1)
155 /* There are al least two distinct chars in s2. If byte 0 and 1 are
156 idential and the distinct value lies farther down, we can deduce
157 the next byte offset to restart full compare is least no earlier
158 than byte 3. */
159 return 3;
161 else
163 /* Byte 1 is not degenerated to byte 0. */
164 return k1 + 1;
168 char *
169 __attribute__ ((section (".text.sse4.2")))
170 STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
172 #define p1 s1
173 const unsigned char *p2 = s2;
175 #ifndef STRCASESTR_NONASCII
176 if (__builtin_expect (p2[0] == '\0', 0))
177 return (char *) p1;
179 if (__builtin_expect (p1[0] == '\0', 0))
180 return NULL;
182 /* Check if p1 length is 1 byte long. */
183 if (__builtin_expect (p1[1] == '\0', 0))
184 return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
185 #endif
187 #ifdef USE_AS_STRCASESTR
188 # ifndef STRCASESTR_NONASCII
189 if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
190 != 0, 0))
191 return __strcasestr_sse42_nonascii (s1, s2);
193 const __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
194 const __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
195 # define strloadu(p) __m128i_strloadu_tolower (p, rangeuc, u2ldelta)
196 # else
197 # define strloadu __m128i_strloadu_tolower
198 # endif
199 #else
200 # define strloadu __m128i_strloadu
201 #endif
203 /* p1 > 1 byte long. Load up to 16 bytes of fragment. */
204 __m128i frag1 = strloadu (p1);
206 __m128i frag2;
207 if (p2[1] != '\0')
208 /* p2 is > 1 byte long. */
209 frag2 = strloadu (p2);
210 else
211 frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0);
213 /* Unsigned bytes, equal order, does frag2 has null? */
214 int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
215 int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
216 int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
217 int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
218 if (cmp_s & cmp_c)
220 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2,
221 _mm_setzero_si128 ()));
222 int len;
223 __asm ("bsfl %[bmsk], %[len]"
224 : [len] "=r" (len) : [bmsk] "r" (bmsk));
225 p1 += cmp;
226 if ((len + cmp) <= 16)
227 return (char *) p1;
229 /* Load up to 16 bytes of fragment. */
230 frag1 = strloadu (p1);
231 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
232 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
233 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
234 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
235 if ((len + cmp) <= 16)
236 return (char *) p1 + cmp;
239 if (cmp_s)
241 /* Adjust addr for 16B alginment in ensuing loop. */
242 while (!cmp_z)
244 p1 += cmp;
245 /* Load up to 16 bytes of fragment. */
246 frag1 = strloadu (p1);
247 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
248 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
249 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
250 /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
251 once already, this time cmp will be zero and we can exit. */
252 if ((!cmp) & cmp_c)
253 break;
256 if (!cmp_c)
257 return NULL;
259 /* Since s2 is less than 16 bytes, com_c is definitive
260 determination of full match. */
261 return (char *) p1 + cmp;
264 /* General case, s2 is at least 16 bytes or more.
265 First, the common case of false-match at first byte of p2. */
266 const unsigned char *pt = NULL;
267 int kmp_fwd = 0;
268 re_trace:
269 while (!cmp_c)
271 /* frag1 has null. */
272 if (cmp_z)
273 return NULL;
275 /* frag 1 has no null, advance 16 bytes. */
276 p1 += 16;
277 /* Load up to 16 bytes of fragment. */
278 frag1 = strloadu (p1);
279 /* Unsigned bytes, equal order, is there a partial match? */
280 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
281 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
282 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
285 /* Next, handle initial positive match as first byte of p2. We have
286 a partial fragment match, make full determination until we reached
287 end of s2. */
288 if (!cmp)
290 if (cmp_z)
291 return (char *) p1;
293 pt = p1;
294 p1 += 16;
295 p2 += 16;
296 /* Load up to 16 bytes of fragment. */
297 frag2 = strloadu (p2);
299 else
301 /* Adjust 16B alignment. */
302 p1 += cmp;
303 pt = p1;
306 /* Load up to 16 bytes of fragment. */
307 frag1 = strloadu (p1);
309 /* Unsigned bytes, equal order, does frag2 has null? */
310 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
311 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
312 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
313 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
314 while (!(cmp | cmp_z | cmp_s))
316 p1 += 16;
317 p2 += 16;
318 /* Load up to 16 bytes of fragment. */
319 frag2 = strloadu (p2);
320 /* Load up to 16 bytes of fragment. */
321 frag1 = strloadu (p1);
322 /* Unsigned bytes, equal order, does frag2 has null? */
323 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
324 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
325 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
326 cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
329 /* Full determination yielded a false result, retrace s1 to next
330 starting position.
331 Zflg 1 0 1 0/1
332 Sflg 0 1 1 0/1
333 cmp na 0 0 >0
334 action done done continue continue if s2 < s1
335 false match retrace s1 else false
338 if (cmp_s & !cmp)
339 return (char *) pt;
340 if (cmp_z)
342 if (!cmp_s)
343 return NULL;
345 /* Handle both zero and sign flag set and s1 is shorter in
346 length. */
347 __m128i zero = _mm_setzero_si128 ();
348 int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
349 int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
350 int len;
351 int len1;
352 __asm ("bsfl %[bmsk], %[len]"
353 : [len] "=r" (len) : [bmsk] "r" (bmsk));
354 __asm ("bsfl %[bmsk1], %[len1]"
355 : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
356 if (len >= len1)
357 return NULL;
359 else if (!cmp)
360 return (char *) pt;
362 /* Otherwise, we have to retrace and continue. Default of multiple
363 paths that need to retrace from next byte in s1. */
364 p2 = s2;
365 frag2 = strloadu (p2);
367 if (!kmp_fwd)
368 kmp_fwd = KMP16Bovrlap (frag2);
370 /* KMP algorithm predicted overlap needs to be corrected for
371 partial fragment compare. */
372 p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
374 /* Since s2 is at least 16 bytes long, we're certain there is no
375 match. */
376 if (p1[0] == '\0')
377 return NULL;
379 /* Load up to 16 bytes of fragment. */
380 frag1 = strloadu (p1);
382 /* Unsigned bytes, equal order, is there a partial match? */
383 cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
384 cmp = _mm_cmpistri (frag2, frag1, 0x0c);
385 cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
386 goto re_trace;