Cleanup _IO_wfile_seekoff.
[glibc/pb-stable.git] / sysdeps / x86_64 / multiarch / strcspn-c.c
blobdaeebe1bf58022faa5ac4da98e7fd3d1f6ba2977
1 /* strcspn with SSE4.2 intrinsics
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <nmmintrin.h>
22 #include <string.h>
24 /* We use 0x2:
25 _SIDD_SBYTE_OPS
26 | _SIDD_CMP_EQUAL_ANY
27 | _SIDD_POSITIVE_POLARITY
28 | _SIDD_LEAST_SIGNIFICANT
29 on pcmpistri to compare xmm/mem128
31 0 1 2 3 4 5 6 7 8 9 A B C D E F
32 X X X X X X X X X X X X X X X X
34 against xmm
36 0 1 2 3 4 5 6 7 8 9 A B C D E F
37 A A A A A A A A A A A A A A A A
39 to find out if the first 16byte data element has any byte A and
40 the offset of the first byte. There are 3 cases:
42 1. The first 16byte data element has the byte A at the offset X.
43 2. The first 16byte data element has EOS and doesn't have the byte A.
44 3. The first 16byte data element is valid and doesn't have the byte A.
46 Here is the table of case number, ECX, CFlag, ZFlag and SFlag (columns in that order) for the 3 cases:
48 1 X 1 0/1 0
49 2 16 0 1 0
50 3 16 0 0 0
52 We exit from the loop for cases 1 and 2 with jbe which branches
53 when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
54 X for case 1. */
/* Names of the fallback implementation and of the function defined in
   this file.  A file that includes this one may define both macros
   first to build the code under different names.  */
#ifndef STRCSPN_SSE2
# define STRCSPN_SSE2 __strcspn_sse2
# define STRCSPN_SSE42 __strcspn_sse42
#endif

/* The same body is built in two flavours.  With USE_AS_STRPBRK the
   function returns a char * and RETURN hands back VAL1; otherwise it
   returns a size_t and RETURN hands back VAL2.  */
#ifdef USE_AS_STRPBRK
# define RETURN(val1, val2) return val1
#else
# define RETURN(val1, val2) return val2
#endif

/* Fallback used when the accept string A is longer than 16 bytes and
   therefore does not fit into a single XMM register.  */
extern
#ifdef USE_AS_STRPBRK
char *
#else
size_t
#endif
STRCSPN_SSE2 (const char *, const char *);

/* Shift VALUE right by OFFSET bytes, filling the vacated high bytes
   with zeros.  _mm_srli_si128 takes its count as an immediate, so a
   run-time count has to be dispatched through a switch.  An OFFSET
   outside 1..15 returns VALUE unchanged; the callers below only pass
   1..15.  */
static inline __m128i
__attribute__ ((section (".text.sse4.2")))
shift_right_bytes (__m128i value, int offset)
{
  switch (offset)
    {
    case 1:
      return _mm_srli_si128 (value, 1);
    case 2:
      return _mm_srli_si128 (value, 2);
    case 3:
      return _mm_srli_si128 (value, 3);
    case 4:
      return _mm_srli_si128 (value, 4);
    case 5:
      return _mm_srli_si128 (value, 5);
    case 6:
      return _mm_srli_si128 (value, 6);
    case 7:
      return _mm_srli_si128 (value, 7);
    case 8:
      return _mm_srli_si128 (value, 8);
    case 9:
      return _mm_srli_si128 (value, 9);
    case 10:
      return _mm_srli_si128 (value, 10);
    case 11:
      return _mm_srli_si128 (value, 11);
    case 12:
      return _mm_srli_si128 (value, 12);
    case 13:
      return _mm_srli_si128 (value, 13);
    case 14:
      return _mm_srli_si128 (value, 14);
    case 15:
      return _mm_srli_si128 (value, 15);
    default:
      return value;
    }
}

/* Scan S for the first byte that also occurs in A.

   Built as strcspn (default) this returns the number of leading bytes
   of S that do not occur in A.  Built as strpbrk (USE_AS_STRPBRK) it
   returns a pointer to the first byte of S that occurs in A, or NULL
   if there is none.

   A is loaded once into an XMM register and S is then examined 16
   bytes at a time.  If A is longer than 16 bytes the work is handed
   to STRCSPN_SSE2.

   Immediates used with the pcmpistr* intrinsics:
     0x2   signed bytes, "equal any", positive polarity, least
	   significant index (see the comment at the top of the file).
     0x3a  a register compared with itself so that the returned index
	   is the position of its first NUL byte, or 16 if it has
	   none.  */
#ifdef USE_AS_STRPBRK
char *
#else
size_t
#endif
__attribute__ ((section (".text.sse4.2")))
STRCSPN_SSE42 (const char *s, const char *a)
{
  if (*a == 0)
    RETURN (NULL, strlen (s));

  const char *aligned;
  __m128i mask;
  int offset = (int) ((size_t) a & 15);
  if (offset != 0)
    {
      /* A is not 16-byte aligned.  Load the aligned block that
         contains its start and shift the leading bytes away.  */
      aligned = (const char *) ((size_t) a & -16L);
      __m128i mask0 = _mm_load_si128 ((const __m128i *) aligned);

      mask = shift_right_bytes (mask0, offset);

      /* Find where the NUL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16 - offset)
        {
          /* There is no NUL terminator in the first block; the zero
             that was found is the fill shifted in above.  Look at the
             following aligned block.  */
          __m128i mask1
            = _mm_load_si128 ((const __m128i *) (aligned + 16));
          int index = _mm_cmpistri (mask1, mask1, 0x3a);
          length += index;

          /* Don't use SSE4.2 if the length of A > 16.  */
          if (length > 16)
            return STRCSPN_SSE2 (s, a);

          if (index != 0)
            /* A continues into the second block.  Its first 16 bytes
               lie entirely within the two blocks loaded above, so an
               unaligned load yields the same value as combining mask0
               and mask1 with palignr.  */
            mask = _mm_loadu_si128 ((const __m128i *) a);
        }
    }
  else
    {
      /* A is aligned.  */
      mask = _mm_load_si128 ((const __m128i *) a);

      /* Find where the NUL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16)
        {
          /* There is no NUL terminator.  Don't use SSE4.2 if the
             length of A > 16.  */
          if (a[16] != 0)
            return STRCSPN_SSE2 (s, a);
        }
    }

  offset = (int) ((size_t) s & 15);
  if (offset != 0)
    {
      /* Check partial string: the bytes of S that precede the first
         16-byte boundary.  */
      aligned = (const char *) ((size_t) s & -16L);
      __m128i value = _mm_load_si128 ((const __m128i *) aligned);

      value = shift_right_bytes (value, offset);

      int length = _mm_cmpistri (mask, value, 0x2);
      /* No need to check ZFlag since ZFlag is always 1.  */
      int cflag = _mm_cmpistrc (mask, value, 0x2);
      if (cflag)
        RETURN ((char *) (s + length), length);
      /* Find where the NUL terminator is.  An index below
         16 - OFFSET is a real terminator of S rather than the zero
         fill from the shift.  */
      int index = _mm_cmpistri (value, value, 0x3a);
      if (index < 16 - offset)
        RETURN (NULL, index);
      aligned += 16;
    }
  else
    aligned = s;

  /* ALIGNED is now 16-byte aligned; examine S one block at a time.  */
  while (1)
    {
      __m128i value = _mm_load_si128 ((const __m128i *) aligned);
      int index = _mm_cmpistri (mask, value, 0x2);
      int cflag = _mm_cmpistrc (mask, value, 0x2);
      int zflag = _mm_cmpistrz (mask, value, 0x2);
      if (cflag)
        /* A byte of A was found at position INDEX of this block.  */
        RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
      if (zflag)
        /* S ends in this block without containing a byte of A.  */
        RETURN (NULL,
                /* Find where the NUL terminator is.  */
                (size_t) (aligned + _mm_cmpistri (value, value, 0x3a) - s));
      aligned += 16;
    }
}