Add missing copyright year updated and pretty printing.
[glibc.git] / sysdeps / x86_64 / multiarch / strspn-c.c
blob6faa259fd7089f76a1a21000d2437c3f13381563
1 /* strspn with SSE4.2 intrinsics
2 Copyright (C) 2009, 2010 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
21 #include <nmmintrin.h>
22 #include <string.h>
23 #include "varshift.h"
25 /* We use 0x12:
26 _SIDD_SBYTE_OPS
27 | _SIDD_CMP_EQUAL_ANY
28 | _SIDD_NEGATIVE_POLARITY
29 | _SIDD_LEAST_SIGNIFICANT
30 on pcmpistri to compare xmm/mem128
32 0 1 2 3 4 5 6 7 8 9 A B C D E F
33 X X X X X X X X X X X X X X X X
35 against xmm
37 0 1 2 3 4 5 6 7 8 9 A B C D E F
38 A A A A A A A A A A A A A A A A
40 to find out if the first 16-byte data element has any non-A byte and,
41 if so, the offset of the first such byte. There are 2 cases:
43 1. The first 16byte data element has the non-A byte, including
44 EOS, at the offset X.
45 2. The first 16byte data element is valid and doesn't have the non-A
46 byte.
48 Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
50 case ECX CFlag ZFlag SFlag
51 1 X 1 0/1 0
52 2 16 0 0 0
54 We exit from the loop for case 1. */
/* Fallback strspn implementation, defined elsewhere.  __strspn_sse42
   returns its result unchanged when the accept string A is longer than
   16 bytes and therefore does not fit in a single 128-bit mask.  */
56 extern size_t __strspn_sse2 (const char *, const char *);
/* strspn using the SSE4.2 string-compare intrinsics.

   S is the string to scan and A is the set of accepted bytes.  The
   return value is the number of leading bytes of S that all occur in
   A.  If A is empty the result is 0.  If A is longer than 16 bytes the
   call is forwarded to __strspn_sse2.

   The function works in three stages:
     1. Build MASK, a 128-bit vector holding the bytes of A.
     2. If S is not 16-byte aligned, examine the bytes of S up to the
	next 16-byte boundary.
     3. Scan S in aligned 16-byte blocks until a block contains a byte
	that is not in A (the terminating NUL counts as such a byte).

   Two immediate modes are passed to _mm_cmpistri/_mm_cmpistrc:
     0x12  signed bytes, equal-any, negative polarity, least
	   significant index: index of the first byte of the second
	   operand that is not in the first operand.
     0x3a  signed bytes, equal-each, masked negative polarity, least
	   significant index: with both operands the same vector this
	   is the index of its first NUL byte, or 16 if it has none.  */
59 size_t
60 __attribute__ ((section (".text.sse4.2")))
61 __strspn_sse42 (const char *s, const char *a)
/* An empty accept set matches nothing.  */
63 if (*a == 0)
64 return 0;
66 const char *aligned;
67 __m128i mask;
/* OFFSET is the position of A inside its enclosing 16-byte block.  */
68 int offset = (int) ((size_t) a & 15);
69 if (offset != 0)
71 /* Load masks. */
/* Round A down to a 16-byte boundary so that the aligned load below
   stays inside the block that contains A[0].  */
72 aligned = (const char *) ((size_t) a & -16L);
73 __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
/* NOTE(review): __m128i_shift_right comes from varshift.h and is not
   visible here; presumably it moves byte OFFSET to position 0 and
   zero-fills the top, which is what the "16 - offset" test below
   relies on -- confirm against varshift.h.  */
75 mask = __m128i_shift_right (mask0, offset);
77 /* Find where the NUL terminator is. */
78 int length = _mm_cmpistri (mask, mask, 0x3a);
/* LENGTH == 16 - OFFSET means every byte of A taken from the first
   block was non-NUL, so A continues into the next block.  */
79 if (length == 16 - offset)
81 /* There is no NUL terminator. */
82 __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
/* INDEX is how many bytes of A lie in the second block (16 if that
   block has no NUL either).  */
83 int index = _mm_cmpistri (mask1, mask1, 0x3a);
84 length += index;
86 /* Don't use SSE4.2 if the length of A > 16. */
87 if (length > 16)
88 return __strspn_sse2 (s, a);
/* INDEX == 0 means A ends exactly at the block boundary, so the
   shifted MASK already holds all of A and no reload is needed.  */
90 if (index != 0)
92 /* Combine mask0 and mask1. We could play games with
93 palignr, but frankly this data should be in L1 now
94 so do the merge via an unaligned load. */
95 mask = _mm_loadu_si128 ((__m128i *) a);
99 else
101 /* A is aligned. */
102 mask = _mm_load_si128 ((__m128i *) a);
104 /* Find where the NUL terminator is. */
105 int length = _mm_cmpistri (mask, mask, 0x3a);
106 if (length == 16)
108 /* There is no NULL terminator. Don't use SSE4.2 if the length
109 of A > 16. */
/* All 16 loaded bytes are non-NUL; A fits in MASK only if the very
   next byte is its terminator.  */
110 if (a[16] != 0)
111 return __strspn_sse2 (s, a);
/* From here on OFFSET is the position of S inside its 16-byte block.  */
115 offset = (int) ((size_t) s & 15);
116 if (offset != 0)
118 /* Check partial string. */
119 aligned = (const char *) ((size_t) s & -16L);
120 __m128i value = _mm_load_si128 ((__m128i *) aligned);
/* Bring S[0] to position 0 (see the NOTE(review) on
   __m128i_shift_right above).  */
122 value = __m128i_shift_right (value, offset);
/* LENGTH is the index of the first byte of VALUE that is not in MASK.  */
124 int length = _mm_cmpistri (mask, value, 0x12);
125 /* No need to check CFlag since it is always 1. */
/* A mismatch before the block boundary ends the accepted prefix.  */
126 if (length < 16 - offset)
127 return length;
128 /* Find where the NUL terminator is. */
129 int index = _mm_cmpistri (value, value, 0x3a);
/* S terminates before the block boundary, so there is nothing more
   to scan.  */
130 if (index < 16 - offset)
131 return length;
/* Every byte up to the boundary was accepted; continue with the next
   aligned block of S.  */
132 aligned += 16;
134 else
135 aligned = s;
/* Main loop: one aligned 16-byte block of S per iteration.  */
137 while (1)
139 __m128i value = _mm_load_si128 ((__m128i *) aligned);
/* INDEX and CFLAG are the index result and carry flag of the same
   0x12 comparison.  Per the table in the comment at the top of the
   file, the flag is set when this block contains a byte that is not
   in A (including the end of S) and INDEX is that byte's offset.  */
140 int index = _mm_cmpistri (mask, value, 0x12);
141 int cflag = _mm_cmpistrc (mask, value, 0x12);
142 if (cflag)
/* Distance from the start of S to the first rejected byte.  */
143 return (size_t) (aligned + index - s);
144 aligned += 16;