Add x86 32-bit SSE4.2 string functions.
[glibc.git] / sysdeps / x86_64 / multiarch / strspn-c.c
blobbe9e8ac0a85961091834e1d8a5a427ffb98f82e7
1 /* strspn with SSE4.2 intrinsics
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <nmmintrin.h>
22 #include <string.h>
24 /* We use 0x12:
25 _SIDD_SBYTE_OPS
26 | _SIDD_CMP_EQUAL_ANY
27 | _SIDD_NEGATIVE_POLARITY
28 | _SIDD_LEAST_SIGNIFICANT
29 on pcmpistri to compare xmm/mem128
31 0 1 2 3 4 5 6 7 8 9 A B C D E F
32 X X X X X X X X X X X X X X X X
34 against xmm
36 0 1 2 3 4 5 6 7 8 9 A B C D E F
37 A A A A A A A A A A A A A A A A
39 to find out if the first 16-byte data element has any non-A byte and
40 the offset of the first such byte. There are 2 cases:
42 1. The first 16-byte data element has a non-A byte, including
43 EOS, at the offset X.
44 2. The first 16-byte data element is valid and doesn't have a non-A
45 byte.
47 Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
49 case ECX CFlag ZFlag SFlag
50 1 X 1 0/1 0
51 2 16 0 0 0
53 We exit from the loop for case 1. */
/* Baseline implementation, defined elsewhere.  It is used as the
   fallback when the accept set A is longer than 16 bytes and therefore
   does not fit in a single XMM register.  */
extern size_t __strspn_sse2 (const char *, const char *);

/* Return V shifted right by N bytes, filling the vacated high bytes
   with zeros.  N is expected to be in the range 1..15; any other value
   returns V unchanged.  _mm_srli_si128 needs its byte count as a
   compile-time constant, hence the dispatch on N.  */
static inline __m128i
__attribute__ ((target ("sse4.2")))
strspn_shift_right (__m128i v, int n)
{
#define STRSPN_SHIFT_CASE(k) case k: return _mm_srli_si128 (v, k)
  switch (n)
    {
    STRSPN_SHIFT_CASE (1);
    STRSPN_SHIFT_CASE (2);
    STRSPN_SHIFT_CASE (3);
    STRSPN_SHIFT_CASE (4);
    STRSPN_SHIFT_CASE (5);
    STRSPN_SHIFT_CASE (6);
    STRSPN_SHIFT_CASE (7);
    STRSPN_SHIFT_CASE (8);
    STRSPN_SHIFT_CASE (9);
    STRSPN_SHIFT_CASE (10);
    STRSPN_SHIFT_CASE (11);
    STRSPN_SHIFT_CASE (12);
    STRSPN_SHIFT_CASE (13);
    STRSPN_SHIFT_CASE (14);
    STRSPN_SHIFT_CASE (15);
    default:
      return v;
    }
#undef STRSPN_SHIFT_CASE
}

/* strspn using SSE4.2 string instructions.

   S is the string to scan and A is the set of accepted bytes; both are
   NUL-terminated.  Returns the length of the initial part of S that
   consists only of bytes found in A.

   The accept set is packed into one XMM register, so this routine only
   handles A of at most 16 bytes; longer sets are handed to
   __strspn_sse2.  All 16-byte loads from S are aligned, so a load never
   straddles a 16-byte boundary.

   The target attribute lets the SSE4.2 intrinsics compile even when the
   translation unit itself is not built with -msse4.2.  */
size_t
__attribute__ ((section (".text.sse4.2"), target ("sse4.2")))
__strspn_sse42 (const char *s, const char *a)
{
  /* An empty accept set matches nothing.  */
  if (*a == 0)
    return 0;

  const char *aligned;
  __m128i mask;
  int offset = (int) ((size_t) a & 15);
  if (offset != 0)
    {
      /* A is not 16-byte aligned: load the aligned chunk that contains
         its start and drop the OFFSET bytes that precede it.  */
      aligned = (const char *) ((size_t) a & -16L);
      __m128i mask0 = _mm_load_si128 ((const __m128i *) aligned);
      mask = strspn_shift_right (mask0, offset);

      /* Find where the NUL terminator is (0x3a: equal-each, masked
         negative polarity, least significant index).  */
      int length = _mm_cmpistri (mask, mask, 0x3a);
      if (length == 16 - offset)
        {
          /* There is no NUL terminator in the first chunk, so A
             continues into the next aligned chunk.  */
          __m128i mask1
            = _mm_load_si128 ((const __m128i *) (aligned + 16));
          int index = _mm_cmpistri (mask1, mask1, 0x3a);
          length += index;

          /* Don't use SSE4.2 if the length of A > 16.  */
          if (length > 16)
            return __strspn_sse2 (s, a);

          /* Part of A lives in the second chunk.  Both aligned chunks
             covering A[0..15] have just been loaded, so one unaligned
             load of those 16 bytes is safe and yields the same value
             as merging mask0 and mask1 by hand.  */
          if (index != 0)
            mask = _mm_loadu_si128 ((const __m128i *) a);
        }
    }
  else
    {
      /* A is aligned.  */
      mask = _mm_load_si128 ((const __m128i *) a);

      /* Find where the NUL terminator is.  */
      int length = _mm_cmpistri (mask, mask, 0x3a);

      /* There is no NUL terminator in the first 16 bytes.  Don't use
         SSE4.2 if the length of A > 16.  */
      if (length == 16 && a[16] != 0)
        return __strspn_sse2 (s, a);
    }

  offset = (int) ((size_t) s & 15);
  if (offset != 0)
    {
      /* S is not aligned: check the partial first chunk, with the bytes
         that precede S shifted out.  */
      aligned = (const char *) ((size_t) s & -16L);
      __m128i value = _mm_load_si128 ((const __m128i *) aligned);
      value = strspn_shift_right (value, offset);

      /* 0x12 reports the index of the first byte of VALUE that is not
         in MASK.  No need to check CFlag since it is always 1: the
         zero bytes shifted in at the top end VALUE inside the
         register.  */
      int length = _mm_cmpistri (mask, value, 0x12);
      if (length < 16 - offset)
        return length;

      /* Find where the NUL terminator is; stop if S ends inside this
         partial chunk.  */
      int index = _mm_cmpistri (value, value, 0x3a);
      if (index < 16 - offset)
        return length;

      aligned += 16;
    }
  else
    aligned = s;

  /* Scan the rest of S one aligned 16-byte chunk at a time.  CFlag is
     set as soon as a chunk holds a byte outside A or the terminating
     NUL; INDEX is then the position of that byte within the chunk.  */
  for (;;)
    {
      __m128i value = _mm_load_si128 ((const __m128i *) aligned);
      int index = _mm_cmpistri (mask, value, 0x12);
      int cflag = _mm_cmpistrc (mask, value, 0x12);
      if (cflag)
        return (size_t) (aligned + index - s);
      aligned += 16;
    }
}