Require autoconf 2.69
[glibc.git] / string / strcoll_l.c
blobd4f42a32e552afc41c57182f3bda01061b53aaae
1 /* Copyright (C) 1995-2014 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <sys/param.h>
29 #ifndef STRING_TYPE
30 # define STRING_TYPE char
31 # define USTRING_TYPE unsigned char
32 # define STRCOLL __strcoll_l
33 # define STRCMP strcmp
34 # define STRLEN strlen
35 # define WEIGHT_H "../locale/weight.h"
36 # define SUFFIX MB
37 # define L(arg) arg
38 #endif
40 #define CONCAT(a,b) CONCAT1(a,b)
41 #define CONCAT1(a,b) a##b
43 #include "../locale/localeinfo.h"
44 #include WEIGHT_H
46 /* Track status while looking for sequences in a string. */
47 typedef struct
49 int len; /* Length of the current sequence. */
50 size_t val; /* Position of the sequence relative to the
51 previous non-ignored sequence. */
52 size_t idxnow; /* Current index in sequences. */
53 size_t idxmax; /* Maximum index in sequences. */
54 size_t idxcnt; /* Current count of indices. */
55 size_t backw; /* Current Backward sequence index. */
56 size_t backw_stop; /* Index where the backward sequences stop. */
57 const USTRING_TYPE *us; /* The string. */
58 int32_t *idxarr; /* Array to cache weight indices. */
59 unsigned char *rulearr; /* Array to cache rules. */
60 unsigned char rule; /* Saved rule for the first sequence. */
61 int32_t idx; /* Index to weight of the current sequence. */
62 int32_t save_idx; /* Save looked up index of a forward
63 sequence after the last backward
64 sequence. */
65 const USTRING_TYPE *back_us; /* Beginning of the backward sequence. */
66 } coll_seq;
68 /* Get next sequence. The weight indices are cached, so we don't need to
69 traverse the string. */
70 static void
71 get_next_seq_cached (coll_seq *seq, int nrules, int pass,
72 const unsigned char *rulesets,
73 const USTRING_TYPE *weights)
75 size_t val = seq->val = 0;
76 int len = seq->len;
77 size_t backw_stop = seq->backw_stop;
78 size_t backw = seq->backw;
79 size_t idxcnt = seq->idxcnt;
80 size_t idxmax = seq->idxmax;
81 size_t idxnow = seq->idxnow;
82 unsigned char *rulearr = seq->rulearr;
83 int32_t *idxarr = seq->idxarr;
85 while (len == 0)
87 ++val;
88 if (backw_stop != ~0ul)
90 /* There is something pushed. */
91 if (backw == backw_stop)
93 /* The last pushed character was handled. Continue
94 with forward characters. */
95 if (idxcnt < idxmax)
97 idxnow = idxcnt;
98 backw_stop = ~0ul;
100 else
102 /* Nothing any more. The backward sequence
103 ended with the last sequence in the string. */
104 idxnow = ~0ul;
105 break;
108 else
109 idxnow = --backw;
111 else
113 backw_stop = idxcnt;
115 while (idxcnt < idxmax)
117 if ((rulesets[rulearr[idxcnt] * nrules + pass]
118 & sort_backward) == 0)
119 /* No more backward characters to push. */
120 break;
121 ++idxcnt;
124 if (backw_stop == idxcnt)
126 /* No sequence at all or just one. */
127 if (idxcnt == idxmax)
128 /* Note that LEN is still zero. */
129 break;
131 backw_stop = ~0ul;
132 idxnow = idxcnt++;
134 else
135 /* We pushed backward sequences. */
136 idxnow = backw = idxcnt - 1;
138 len = weights[idxarr[idxnow]++];
141 /* Update the structure. */
142 seq->val = val;
143 seq->len = len;
144 seq->backw_stop = backw_stop;
145 seq->backw = backw;
146 seq->idxcnt = idxcnt;
147 seq->idxnow = idxnow;
150 /* Get next sequence. Traverse the string as required. */
151 static void
152 get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
153 const USTRING_TYPE *weights, const int32_t *table,
154 const USTRING_TYPE *extra, const int32_t *indirect)
156 size_t val = seq->val = 0;
157 int len = seq->len;
158 size_t backw_stop = seq->backw_stop;
159 size_t backw = seq->backw;
160 size_t idxcnt = seq->idxcnt;
161 size_t idxmax = seq->idxmax;
162 size_t idxnow = seq->idxnow;
163 unsigned char *rulearr = seq->rulearr;
164 int32_t *idxarr = seq->idxarr;
165 const USTRING_TYPE *us = seq->us;
167 while (len == 0)
169 ++val;
170 if (backw_stop != ~0ul)
172 /* There is something pushed. */
173 if (backw == backw_stop)
175 /* The last pushed character was handled. Continue
176 with forward characters. */
177 if (idxcnt < idxmax)
179 idxnow = idxcnt;
180 backw_stop = ~0ul;
182 else
183 /* Nothing any more. The backward sequence ended with
184 the last sequence in the string. Note that LEN
185 is still zero. */
186 break;
188 else
189 idxnow = --backw;
191 else
193 backw_stop = idxmax;
195 while (*us != L('\0'))
197 int32_t tmp = findidx (table, indirect, extra, &us, -1);
198 rulearr[idxmax] = tmp >> 24;
199 idxarr[idxmax] = tmp & 0xffffff;
200 idxcnt = idxmax++;
202 if ((rulesets[rulearr[idxcnt] * nrules]
203 & sort_backward) == 0)
204 /* No more backward characters to push. */
205 break;
206 ++idxcnt;
209 if (backw_stop >= idxcnt)
211 /* No sequence at all or just one. */
212 if (idxcnt == idxmax || backw_stop > idxcnt)
213 /* Note that LEN is still zero. */
214 break;
216 backw_stop = ~0ul;
217 idxnow = idxcnt;
219 else
220 /* We pushed backward sequences. */
221 idxnow = backw = idxcnt - 1;
223 len = weights[idxarr[idxnow]++];
226 /* Update the structure. */
227 seq->val = val;
228 seq->len = len;
229 seq->backw_stop = backw_stop;
230 seq->backw = backw;
231 seq->idxcnt = idxcnt;
232 seq->idxmax = idxmax;
233 seq->idxnow = idxnow;
234 seq->us = us;
237 /* Get next sequence. Traverse the string as required. This function does not
238 set or use any index or rule cache. */
239 static void
240 get_next_seq_nocache (coll_seq *seq, int nrules, const unsigned char *rulesets,
241 const USTRING_TYPE *weights, const int32_t *table,
242 const USTRING_TYPE *extra, const int32_t *indirect,
243 int pass)
245 size_t val = seq->val = 0;
246 int len = seq->len;
247 size_t backw_stop = seq->backw_stop;
248 size_t backw = seq->backw;
249 size_t idxcnt = seq->idxcnt;
250 size_t idxmax = seq->idxmax;
251 int32_t idx = seq->idx;
252 const USTRING_TYPE *us = seq->us;
254 while (len == 0)
256 ++val;
257 if (backw_stop != ~0ul)
259 /* There is something pushed. */
260 if (backw == backw_stop)
262 /* The last pushed character was handled. Continue
263 with forward characters. */
264 if (idxcnt < idxmax)
266 idx = seq->save_idx;
267 backw_stop = ~0ul;
269 else
271 /* Nothing anymore. The backward sequence ended with
272 the last sequence in the string. Note that len is
273 still zero. */
274 idx = 0;
275 break;
278 else
280 /* XXX Traverse BACKW sequences from the beginning of
281 BACKW_STOP to get the next sequence. Is ther a quicker way
282 to do this? */
283 size_t i = backw_stop;
284 us = seq->back_us;
285 while (i < backw)
287 int32_t tmp = findidx (table, indirect, extra, &us, -1);
288 idx = tmp & 0xffffff;
289 i++;
291 --backw;
292 us = seq->us;
295 else
297 backw_stop = idxmax;
298 int32_t prev_idx = idx;
300 while (*us != L('\0'))
302 int32_t tmp = findidx (table, indirect, extra, &us, -1);
303 unsigned char rule = tmp >> 24;
304 prev_idx = idx;
305 idx = tmp & 0xffffff;
306 idxcnt = idxmax++;
308 /* Save the rule for the first sequence. */
309 if (__glibc_unlikely (idxcnt == 0))
310 seq->rule = rule;
312 if ((rulesets[rule * nrules + pass]
313 & sort_backward) == 0)
314 /* No more backward characters to push. */
315 break;
316 ++idxcnt;
319 if (backw_stop >= idxcnt)
321 /* No sequence at all or just one. */
322 if (idxcnt == idxmax || backw_stop > idxcnt)
323 /* Note that len is still zero. */
324 break;
326 backw_stop = ~0ul;
328 else
330 /* We pushed backward sequences. If the stream ended with the
331 backward sequence, then we process the last sequence we
332 found. Otherwise we process the sequence before the last
333 one since the last one was a forward sequence. */
334 seq->back_us = seq->us;
335 seq->us = us;
336 backw = idxcnt;
337 if (idxmax > idxcnt)
339 backw--;
340 seq->save_idx = idx;
341 idx = prev_idx;
343 if (backw > backw_stop)
344 backw--;
348 len = weights[idx++];
349 /* Skip over indices of previous levels. */
350 for (int i = 0; i < pass; i++)
352 idx += len;
353 len = weights[idx];
354 idx++;
358 /* Update the structure. */
359 seq->val = val;
360 seq->len = len;
361 seq->backw_stop = backw_stop;
362 seq->backw = backw;
363 seq->idxcnt = idxcnt;
364 seq->idxmax = idxmax;
365 seq->us = us;
366 seq->idx = idx;
369 /* Compare two sequences. This version does not use the index and rules
370 cache. */
371 static int
372 do_compare_nocache (coll_seq *seq1, coll_seq *seq2, int position,
373 const USTRING_TYPE *weights)
375 int seq1len = seq1->len;
376 int seq2len = seq2->len;
377 size_t val1 = seq1->val;
378 size_t val2 = seq2->val;
379 int idx1 = seq1->idx;
380 int idx2 = seq2->idx;
381 int result = 0;
383 /* Test for position if necessary. */
384 if (position && val1 != val2)
386 result = val1 > val2 ? 1 : -1;
387 goto out;
390 /* Compare the two sequences. */
393 if (weights[idx1] != weights[idx2])
395 /* The sequences differ. */
396 result = weights[idx1] - weights[idx2];
397 goto out;
400 /* Increment the offsets. */
401 ++idx1;
402 ++idx2;
404 --seq1len;
405 --seq2len;
407 while (seq1len > 0 && seq2len > 0);
409 if (position && seq1len != seq2len)
410 result = seq1len - seq2len;
412 out:
413 seq1->len = seq1len;
414 seq2->len = seq2len;
415 seq1->idx = idx1;
416 seq2->idx = idx2;
417 return result;
420 /* Compare two sequences using the index cache. */
421 static int
422 do_compare (coll_seq *seq1, coll_seq *seq2, int position,
423 const USTRING_TYPE *weights)
425 int seq1len = seq1->len;
426 int seq2len = seq2->len;
427 size_t val1 = seq1->val;
428 size_t val2 = seq2->val;
429 int32_t *idx1arr = seq1->idxarr;
430 int32_t *idx2arr = seq2->idxarr;
431 int idx1now = seq1->idxnow;
432 int idx2now = seq2->idxnow;
433 int result = 0;
435 /* Test for position if necessary. */
436 if (position && val1 != val2)
438 result = val1 > val2 ? 1 : -1;
439 goto out;
442 /* Compare the two sequences. */
445 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
447 /* The sequences differ. */
448 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
449 goto out;
452 /* Increment the offsets. */
453 ++idx1arr[idx1now];
454 ++idx2arr[idx2now];
456 --seq1len;
457 --seq2len;
459 while (seq1len > 0 && seq2len > 0);
461 if (position && seq1len != seq2len)
462 result = seq1len - seq2len;
464 out:
465 seq1->len = seq1len;
466 seq2->len = seq2len;
467 return result;
471 STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
473 struct __locale_data *current = l->__locales[LC_COLLATE];
474 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
475 /* We don't assign the following values right away since it might be
476 unnecessary in case there are no rules. */
477 const unsigned char *rulesets;
478 const int32_t *table;
479 const USTRING_TYPE *weights;
480 const USTRING_TYPE *extra;
481 const int32_t *indirect;
483 if (nrules == 0)
484 return STRCMP (s1, s2);
486 rulesets = (const unsigned char *)
487 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
488 table = (const int32_t *)
489 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
490 weights = (const USTRING_TYPE *)
491 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
492 extra = (const USTRING_TYPE *)
493 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
494 indirect = (const int32_t *)
495 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
497 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
498 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
499 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
500 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
502 /* We need this a few times. */
503 size_t s1len = STRLEN (s1);
504 size_t s2len = STRLEN (s2);
506 /* Catch empty strings. */
507 if (__glibc_unlikely (s1len == 0) || __glibc_unlikely (s2len == 0))
508 return (s1len != 0) - (s2len != 0);
510 /* Perform the first pass over the string and while doing this find
511 and store the weights for each character. Since we want this to
512 be as fast as possible we are using `alloca' to store the temporary
513 values. But since there is no limit on the length of the string
514 we have to use `malloc' if the string is too long. We should be
515 very conservative here.
517 Please note that the localedef programs makes sure that `position'
518 is not used at the first level. */
520 coll_seq seq1, seq2;
521 bool use_malloc = false;
522 int result = 0;
524 memset (&seq1, 0, sizeof (seq1));
525 seq2 = seq1;
527 size_t size_max = SIZE_MAX / (sizeof (int32_t) + 1);
529 if (MIN (s1len, s2len) > size_max
530 || MAX (s1len, s2len) > size_max - MIN (s1len, s2len))
532 /* If the strings are long enough to cause overflow in the size request,
533 then skip the allocation and proceed with the non-cached routines. */
535 else if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
537 seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
539 /* If we failed to allocate memory, we leave everything as NULL so that
540 we use the nocache version of traversal and comparison functions. */
541 if (seq1.idxarr != NULL)
543 seq2.idxarr = &seq1.idxarr[s1len];
544 seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len];
545 seq2.rulearr = &seq1.rulearr[s1len];
546 use_malloc = true;
549 else
551 seq1.idxarr = (int32_t *) alloca (s1len * sizeof (int32_t));
552 seq2.idxarr = (int32_t *) alloca (s2len * sizeof (int32_t));
553 seq1.rulearr = (unsigned char *) alloca (s1len);
554 seq2.rulearr = (unsigned char *) alloca (s2len);
557 int rule = 0;
559 /* Cache values in the first pass and if needed, use them in subsequent
560 passes. */
561 for (int pass = 0; pass < nrules; ++pass)
563 seq1.idxcnt = 0;
564 seq1.idx = 0;
565 seq2.idx = 0;
566 seq1.backw_stop = ~0ul;
567 seq1.backw = ~0ul;
568 seq2.idxcnt = 0;
569 seq2.backw_stop = ~0ul;
570 seq2.backw = ~0ul;
572 /* We need the elements of the strings as unsigned values since they
573 are used as indices. */
574 seq1.us = (const USTRING_TYPE *) s1;
575 seq2.us = (const USTRING_TYPE *) s2;
577 /* We assume that if a rule has defined `position' in one section
578 this is true for all of them. */
579 int position = rulesets[rule * nrules + pass] & sort_position;
581 while (1)
583 if (__glibc_unlikely (seq1.idxarr == NULL))
585 get_next_seq_nocache (&seq1, nrules, rulesets, weights, table,
586 extra, indirect, pass);
587 get_next_seq_nocache (&seq2, nrules, rulesets, weights, table,
588 extra, indirect, pass);
590 else if (pass == 0)
592 get_next_seq (&seq1, nrules, rulesets, weights, table, extra,
593 indirect);
594 get_next_seq (&seq2, nrules, rulesets, weights, table, extra,
595 indirect);
597 else
599 get_next_seq_cached (&seq1, nrules, pass, rulesets, weights);
600 get_next_seq_cached (&seq2, nrules, pass, rulesets, weights);
603 /* See whether any or both strings are empty. */
604 if (seq1.len == 0 || seq2.len == 0)
606 if (seq1.len == seq2.len)
607 /* Both ended. So far so good, both strings are equal
608 at this level. */
609 break;
611 /* This means one string is shorter than the other. Find out
612 which one and return an appropriate value. */
613 result = seq1.len == 0 ? -1 : 1;
614 goto free_and_return;
617 if (__glibc_unlikely (seq1.idxarr == NULL))
618 result = do_compare_nocache (&seq1, &seq2, position, weights);
619 else
620 result = do_compare (&seq1, &seq2, position, weights);
621 if (result != 0)
622 goto free_and_return;
625 if (__glibc_likely (seq1.rulearr != NULL))
626 rule = seq1.rulearr[0];
627 else
628 rule = seq1.rule;
631 /* Free the memory if needed. */
632 free_and_return:
633 if (use_malloc)
634 free (seq1.idxarr);
636 return result;
638 libc_hidden_def (STRCOLL)
640 #ifndef WIDE_CHAR_VERSION
641 weak_alias (__strcoll_l, strcoll_l)
642 #endif