1 /* strspn with SSE4.2 intrinsics
2 Copyright (C) 2009 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
21 #include <nmmintrin.h>
27 | _SIDD_NEGATIVE_POLARITY
28 | _SIDD_LEAST_SIGNIFICANT
29 on pcmpistri to compare xmm/mem128
31 0 1 2 3 4 5 6 7 8 9 A B C D E F
32 X X X X X X X X X X X X X X X X
36 0 1 2 3 4 5 6 7 8 9 A B C D E F
37 A A A A A A A A A A A A A A A A
39 to find out if the first 16-byte data element has any non-A byte and
40 the offset of the first such byte. There are 2 cases:
42 1. The first 16-byte data element has a non-A byte, including EOS, at the offset X.
44 2. The first 16-byte data element is valid and doesn't have a non-A byte.
47 Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
49 case ECX CFlag ZFlag SFlag
53 We exit from the loop for case 1. */
/* Declared here, defined elsewhere.  __strspn_sse42 below hands both of
   its arguments to this routine and returns its result when the accept
   string A is longer than 16 bytes, i.e. too long for the SSE4.2 path.  */
55 extern size_t __strspn_sse2 (const char *, const char *);
59 __attribute__ ((section (".text.sse4.2")))
60 __strspn_sse42 (const char *s
, const char *a
)
67 int offset
= (int) ((size_t) a
& 15);
71 aligned
= (const char *) ((size_t) a
& -16L);
72 __m128i mask0
= _mm_load_si128 ((__m128i
*) aligned
);
77 mask
= _mm_srli_si128 (mask0
, 1);
80 mask
= _mm_srli_si128 (mask0
, 2);
83 mask
= _mm_srli_si128 (mask0
, 3);
86 mask
= _mm_srli_si128 (mask0
, 4);
89 mask
= _mm_srli_si128 (mask0
, 5);
92 mask
= _mm_srli_si128 (mask0
, 6);
95 mask
= _mm_srli_si128 (mask0
, 7);
98 mask
= _mm_srli_si128 (mask0
, 8);
101 mask
= _mm_srli_si128 (mask0
, 9);
104 mask
= _mm_srli_si128 (mask0
, 10);
107 mask
= _mm_srli_si128 (mask0
, 11);
110 mask
= _mm_srli_si128 (mask0
, 12);
113 mask
= _mm_srli_si128 (mask0
, 13);
116 mask
= _mm_srli_si128 (mask0
, 14);
119 mask
= _mm_srli_si128 (mask0
, 15);
123 /* Find where the NUL terminator is. */
124 int length
= _mm_cmpistri (mask
, mask
, 0x3a);
125 if (length
== 16 - offset
)
127 /* There is no NUL terminator. */
128 __m128i mask1
= _mm_load_si128 ((__m128i
*) (aligned
+ 16));
129 int index
= _mm_cmpistri (mask1
, mask1
, 0x3a);
132 /* Don't use SSE4.2 if the length of A > 16. */
134 return __strspn_sse2 (s
, a
);
138 /* Combine mask0 and mask1. */
142 mask
= _mm_alignr_epi8 (mask1
, mask0
, 1);
145 mask
= _mm_alignr_epi8 (mask1
, mask0
, 2);
148 mask
= _mm_alignr_epi8 (mask1
, mask0
, 3);
151 mask
= _mm_alignr_epi8 (mask1
, mask0
, 4);
154 mask
= _mm_alignr_epi8 (mask1
, mask0
, 5);
157 mask
= _mm_alignr_epi8 (mask1
, mask0
, 6);
160 mask
= _mm_alignr_epi8 (mask1
, mask0
, 7);
163 mask
= _mm_alignr_epi8 (mask1
, mask0
, 8);
166 mask
= _mm_alignr_epi8 (mask1
, mask0
, 9);
169 mask
= _mm_alignr_epi8 (mask1
, mask0
, 10);
172 mask
= _mm_alignr_epi8 (mask1
, mask0
, 11);
175 mask
= _mm_alignr_epi8 (mask1
, mask0
, 12);
178 mask
= _mm_alignr_epi8 (mask1
, mask0
, 13);
181 mask
= _mm_alignr_epi8 (mask1
, mask0
, 14);
184 mask
= _mm_alignr_epi8 (mask1
, mask0
, 15);
193 mask
= _mm_load_si128 ((__m128i
*) a
);
195 /* Find where the NUL terminator is. */
196 int length
= _mm_cmpistri (mask
, mask
, 0x3a);
199 /* There is no NUL terminator. Don't use SSE4.2 if the length of A > 16. */
202 return __strspn_sse2 (s
, a
);
206 offset
= (int) ((size_t) s
& 15);
209 /* Check partial string. */
210 aligned
= (const char *) ((size_t) s
& -16L);
211 __m128i value
= _mm_load_si128 ((__m128i
*) aligned
);
216 value
= _mm_srli_si128 (value
, 1);
219 value
= _mm_srli_si128 (value
, 2);
222 value
= _mm_srli_si128 (value
, 3);
225 value
= _mm_srli_si128 (value
, 4);
228 value
= _mm_srli_si128 (value
, 5);
231 value
= _mm_srli_si128 (value
, 6);
234 value
= _mm_srli_si128 (value
, 7);
237 value
= _mm_srli_si128 (value
, 8);
240 value
= _mm_srli_si128 (value
, 9);
243 value
= _mm_srli_si128 (value
, 10);
246 value
= _mm_srli_si128 (value
, 11);
249 value
= _mm_srli_si128 (value
, 12);
252 value
= _mm_srli_si128 (value
, 13);
255 value
= _mm_srli_si128 (value
, 14);
258 value
= _mm_srli_si128 (value
, 15);
262 int length
= _mm_cmpistri (mask
, value
, 0x12);
263 /* No need to check CFlag since it is always 1. */
264 if (length
< 16 - offset
)
266 /* Find where the NUL terminator is. */
267 int index
= _mm_cmpistri (value
, value
, 0x3a);
268 if (index
< 16 - offset
)
277 __m128i value
= _mm_load_si128 ((__m128i
*) aligned
);
278 int index
= _mm_cmpistri (mask
, value
, 0x12);
279 int cflag
= _mm_cmpistrc (mask
, value
, 0x12);
281 return (size_t) (aligned
+ index
- s
);