1 /* Copyright (C) 1995-2013 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRCOLL __strcoll_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define WEIGHT_H "../locale/weight.h"
39 #define CONCAT(a,b) CONCAT1(a,b)
40 #define CONCAT1(a,b) a##b
42 #include "../locale/localeinfo.h"
44 /* Track status while looking for sequences in a string. */
47 int len
; /* Length of the current sequence. */
48 int val
; /* Position of the sequence relative to the
49 previous non-ignored sequence. */
50 size_t idxnow
; /* Current index in sequences. */
51 size_t idxmax
; /* Maximum index in sequences. */
52 size_t idxcnt
; /* Current count of indices. */
53 size_t backw
; /* Current Backward sequence index. */
54 size_t backw_stop
; /* Index where the backward sequences stop. */
55 const USTRING_TYPE
*us
; /* The string. */
56 int32_t *idxarr
; /* Array to cache weight indices. */
57 unsigned char *rulearr
; /* Array to cache rules. */
60 /* Get next sequence. The weight indices are cached, so we don't need to
61 traverse the string. */
63 get_next_seq_cached (coll_seq
*seq
, int nrules
, int pass
,
64 const unsigned char *rulesets
,
65 const USTRING_TYPE
*weights
)
67 int val
= seq
->val
= 0;
69 size_t backw_stop
= seq
->backw_stop
;
70 size_t backw
= seq
->backw
;
71 size_t idxcnt
= seq
->idxcnt
;
72 size_t idxmax
= seq
->idxmax
;
73 size_t idxnow
= seq
->idxnow
;
74 unsigned char *rulearr
= seq
->rulearr
;
75 int32_t *idxarr
= seq
->idxarr
;
80 if (backw_stop
!= ~0ul)
82 /* There is something pushed. */
83 if (backw
== backw_stop
)
85 /* The last pushed character was handled. Continue
86 with forward characters. */
94 /* Nothing any more. The backward sequence
95 ended with the last sequence in the string. */
107 while (idxcnt
< idxmax
)
109 if ((rulesets
[rulearr
[idxcnt
] * nrules
+ pass
]
110 & sort_backward
) == 0)
111 /* No more backward characters to push. */
116 if (backw_stop
== idxcnt
)
118 /* No sequence at all or just one. */
119 if (idxcnt
== idxmax
)
120 /* Note that LEN is still zero. */
127 /* We pushed backward sequences. */
128 idxnow
= backw
= idxcnt
- 1;
130 len
= weights
[idxarr
[idxnow
]++];
133 /* Update the structure. */
136 seq
->backw_stop
= backw_stop
;
138 seq
->idxcnt
= idxcnt
;
139 seq
->idxnow
= idxnow
;
142 /* Get next sequence. Traverse the string as required. */
144 get_next_seq (coll_seq
*seq
, int nrules
, const unsigned char *rulesets
,
145 const USTRING_TYPE
*weights
, const int32_t *table
,
146 const USTRING_TYPE
*extra
, const int32_t *indirect
)
149 int val
= seq
->val
= 0;
151 size_t backw_stop
= seq
->backw_stop
;
152 size_t backw
= seq
->backw
;
153 size_t idxcnt
= seq
->idxcnt
;
154 size_t idxmax
= seq
->idxmax
;
155 size_t idxnow
= seq
->idxnow
;
156 unsigned char *rulearr
= seq
->rulearr
;
157 int32_t *idxarr
= seq
->idxarr
;
158 const USTRING_TYPE
*us
= seq
->us
;
163 if (backw_stop
!= ~0ul)
165 /* The is something pushed. */
166 if (backw
== backw_stop
)
168 /* The last pushed character was handled. Continue
169 with forward characters. */
176 /* Nothing any more. The backward sequence ended with
177 the last sequence in the string. Note that LEN
188 while (*us
!= L('\0'))
190 int32_t tmp
= findidx (&us
, -1);
191 rulearr
[idxmax
] = tmp
>> 24;
192 idxarr
[idxmax
] = tmp
& 0xffffff;
195 if ((rulesets
[rulearr
[idxcnt
] * nrules
]
196 & sort_backward
) == 0)
197 /* No more backward characters to push. */
202 if (backw_stop
>= idxcnt
)
204 /* No sequence at all or just one. */
205 if (idxcnt
== idxmax
|| backw_stop
> idxcnt
)
206 /* Note that LEN is still zero. */
213 /* We pushed backward sequences. */
214 idxnow
= backw
= idxcnt
- 1;
216 len
= weights
[idxarr
[idxnow
]++];
219 /* Update the structure. */
222 seq
->backw_stop
= backw_stop
;
224 seq
->idxcnt
= idxcnt
;
225 seq
->idxmax
= idxmax
;
226 seq
->idxnow
= idxnow
;
230 /* Compare two sequences. */
232 do_compare (coll_seq
*seq1
, coll_seq
*seq2
, int position
,
233 const USTRING_TYPE
*weights
)
235 int seq1len
= seq1
->len
;
236 int seq2len
= seq2
->len
;
237 int val1
= seq1
->val
;
238 int val2
= seq2
->val
;
239 int32_t *idx1arr
= seq1
->idxarr
;
240 int32_t *idx2arr
= seq2
->idxarr
;
241 int idx1now
= seq1
->idxnow
;
242 int idx2now
= seq2
->idxnow
;
245 /* Test for position if necessary. */
246 if (position
&& val1
!= val2
)
248 result
= val1
- val2
;
252 /* Compare the two sequences. */
255 if (weights
[idx1arr
[idx1now
]] != weights
[idx2arr
[idx2now
]])
257 /* The sequences differ. */
258 result
= weights
[idx1arr
[idx1now
]] - weights
[idx2arr
[idx2now
]];
262 /* Increment the offsets. */
269 while (seq1len
> 0 && seq2len
> 0);
271 if (position
&& seq1len
!= seq2len
)
272 result
= seq1len
- seq2len
;
281 STRCOLL (const STRING_TYPE
*s1
, const STRING_TYPE
*s2
, __locale_t l
)
283 struct __locale_data
*current
= l
->__locales
[LC_COLLATE
];
284 uint_fast32_t nrules
= current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_NRULES
)].word
;
285 /* We don't assign the following values right away since it might be
286 unnecessary in case there are no rules. */
287 const unsigned char *rulesets
;
288 const int32_t *table
;
289 const USTRING_TYPE
*weights
;
290 const USTRING_TYPE
*extra
;
291 const int32_t *indirect
;
294 return STRCMP (s1
, s2
);
296 rulesets
= (const unsigned char *)
297 current
->values
[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS
)].string
;
298 table
= (const int32_t *)
299 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE
,SUFFIX
))].string
;
300 weights
= (const USTRING_TYPE
*)
301 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT
,SUFFIX
))].string
;
302 extra
= (const USTRING_TYPE
*)
303 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA
,SUFFIX
))].string
;
304 indirect
= (const int32_t *)
305 current
->values
[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT
,SUFFIX
))].string
;
307 assert (((uintptr_t) table
) % __alignof__ (table
[0]) == 0);
308 assert (((uintptr_t) weights
) % __alignof__ (weights
[0]) == 0);
309 assert (((uintptr_t) extra
) % __alignof__ (extra
[0]) == 0);
310 assert (((uintptr_t) indirect
) % __alignof__ (indirect
[0]) == 0);
312 /* We need this a few times. */
313 size_t s1len
= STRLEN (s1
);
314 size_t s2len
= STRLEN (s2
);
316 /* Catch empty strings. */
317 if (__glibc_unlikely (s1len
== 0) || __glibc_unlikely (s2len
== 0))
318 return (s1len
!= 0) - (s2len
!= 0);
320 /* Perform the first pass over the string and while doing this find
321 and store the weights for each character. Since we want this to
322 be as fast as possible we are using `alloca' to store the temporary
323 values. But since there is no limit on the length of the string
324 we have to use `malloc' if the string is too long. We should be
325 very conservative here.
327 Please note that the localedef programs makes sure that `position'
328 is not used at the first level. */
331 bool use_malloc
= false;
334 memset (&seq1
, 0, sizeof (seq1
));
337 /* We need the elements of the strings as unsigned values since they
338 are used as indices. */
339 seq1
.us
= (const USTRING_TYPE
*) s1
;
340 seq2
.us
= (const USTRING_TYPE
*) s2
;
342 if (! __libc_use_alloca ((s1len
+ s2len
) * (sizeof (int32_t) + 1)))
344 seq1
.idxarr
= (int32_t *) malloc ((s1len
+ s2len
) * (sizeof (int32_t) + 1));
345 seq2
.idxarr
= &seq1
.idxarr
[s1len
];
346 seq1
.rulearr
= (unsigned char *) &seq2
.idxarr
[s2len
];
347 seq2
.rulearr
= &seq1
.rulearr
[s1len
];
349 if (seq1
.idxarr
== NULL
)
350 /* No memory. Well, go with the stack then.
352 XXX Once this implementation is stable we will handle this
353 differently. Instead of precomputing the indices we will
354 do this in time. This means, though, that this happens for
362 seq1
.idxarr
= (int32_t *) alloca (s1len
* sizeof (int32_t));
363 seq2
.idxarr
= (int32_t *) alloca (s2len
* sizeof (int32_t));
364 seq1
.rulearr
= (unsigned char *) alloca (s1len
);
365 seq2
.rulearr
= (unsigned char *) alloca (s2len
);
370 /* Cache values in the first pass and if needed, use them in subsequent
372 for (int pass
= 0; pass
< nrules
; ++pass
)
375 seq1
.backw_stop
= ~0ul;
378 seq2
.backw_stop
= ~0ul;
381 /* We assume that if a rule has defined `position' in one section
382 this is true for all of them. */
383 int position
= rulesets
[seq1
.rulearr
[0] * nrules
+ pass
] & sort_position
;
389 get_next_seq (&seq1
, nrules
, rulesets
, weights
, table
, extra
,
391 get_next_seq (&seq2
, nrules
, rulesets
, weights
, table
, extra
,
396 get_next_seq_cached (&seq1
, nrules
, pass
, rulesets
, weights
);
397 get_next_seq_cached (&seq2
, nrules
, pass
, rulesets
, weights
);
400 /* See whether any or both strings are empty. */
401 if (seq1
.len
== 0 || seq2
.len
== 0)
403 if (seq1
.len
== seq2
.len
)
404 /* Both ended. So far so good, both strings are equal
408 /* This means one string is shorter than the other. Find out
409 which one and return an appropriate value. */
410 result
= seq1
.len
== 0 ? -1 : 1;
411 goto free_and_return
;
414 result
= do_compare (&seq1
, &seq2
, position
, weights
);
416 goto free_and_return
;
420 /* Free the memory if needed. */
427 libc_hidden_def (STRCOLL
)
429 #ifndef WIDE_CHAR_VERSION
430 weak_alias (__strcoll_l
, strcoll_l
)