Add localedef --big-endian and --little-endian options.
[glibc.git] / string / strcoll_l.c
blob4ee101a118a15194f8ca0e3cbdc53e2d5aa483f8
1 /* Copyright (C) 1995-2013 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRCOLL __strcoll_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define WEIGHT_H "../locale/weight.h"
35 # define SUFFIX MB
36 # define L(arg) arg
37 #endif
39 #define CONCAT(a,b) CONCAT1(a,b)
40 #define CONCAT1(a,b) a##b
42 #include "../locale/localeinfo.h"
44 /* Track status while looking for sequences in a string. */
45 typedef struct
47 int len; /* Length of the current sequence. */
48 size_t val; /* Position of the sequence relative to the
49 previous non-ignored sequence. */
50 size_t idxnow; /* Current index in sequences. */
51 size_t idxmax; /* Maximum index in sequences. */
52 size_t idxcnt; /* Current count of indices. */
53 size_t backw; /* Current Backward sequence index. */
54 size_t backw_stop; /* Index where the backward sequences stop. */
55 const USTRING_TYPE *us; /* The string. */
56 int32_t *idxarr; /* Array to cache weight indices. */
57 unsigned char *rulearr; /* Array to cache rules. */
58 unsigned char rule; /* Saved rule for the first sequence. */
59 int32_t idx; /* Index to weight of the current sequence. */
60 int32_t save_idx; /* Save looked up index of a forward
61 sequence after the last backward
62 sequence. */
63 const USTRING_TYPE *back_us; /* Beginning of the backward sequence. */
64 } coll_seq;
66 /* Get next sequence. The weight indices are cached, so we don't need to
67 traverse the string. */
68 static void
69 get_next_seq_cached (coll_seq *seq, int nrules, int pass,
70 const unsigned char *rulesets,
71 const USTRING_TYPE *weights)
73 size_t val = seq->val = 0;
74 int len = seq->len;
75 size_t backw_stop = seq->backw_stop;
76 size_t backw = seq->backw;
77 size_t idxcnt = seq->idxcnt;
78 size_t idxmax = seq->idxmax;
79 size_t idxnow = seq->idxnow;
80 unsigned char *rulearr = seq->rulearr;
81 int32_t *idxarr = seq->idxarr;
83 while (len == 0)
85 ++val;
86 if (backw_stop != ~0ul)
88 /* There is something pushed. */
89 if (backw == backw_stop)
91 /* The last pushed character was handled. Continue
92 with forward characters. */
93 if (idxcnt < idxmax)
95 idxnow = idxcnt;
96 backw_stop = ~0ul;
98 else
100 /* Nothing any more. The backward sequence
101 ended with the last sequence in the string. */
102 idxnow = ~0ul;
103 break;
106 else
107 idxnow = --backw;
109 else
111 backw_stop = idxcnt;
113 while (idxcnt < idxmax)
115 if ((rulesets[rulearr[idxcnt] * nrules + pass]
116 & sort_backward) == 0)
117 /* No more backward characters to push. */
118 break;
119 ++idxcnt;
122 if (backw_stop == idxcnt)
124 /* No sequence at all or just one. */
125 if (idxcnt == idxmax)
126 /* Note that LEN is still zero. */
127 break;
129 backw_stop = ~0ul;
130 idxnow = idxcnt++;
132 else
133 /* We pushed backward sequences. */
134 idxnow = backw = idxcnt - 1;
136 len = weights[idxarr[idxnow]++];
139 /* Update the structure. */
140 seq->val = val;
141 seq->len = len;
142 seq->backw_stop = backw_stop;
143 seq->backw = backw;
144 seq->idxcnt = idxcnt;
145 seq->idxnow = idxnow;
148 /* Get next sequence. Traverse the string as required. */
149 static void
150 get_next_seq (coll_seq *seq, int nrules, const unsigned char *rulesets,
151 const USTRING_TYPE *weights, const int32_t *table,
152 const USTRING_TYPE *extra, const int32_t *indirect)
154 #include WEIGHT_H
155 size_t val = seq->val = 0;
156 int len = seq->len;
157 size_t backw_stop = seq->backw_stop;
158 size_t backw = seq->backw;
159 size_t idxcnt = seq->idxcnt;
160 size_t idxmax = seq->idxmax;
161 size_t idxnow = seq->idxnow;
162 unsigned char *rulearr = seq->rulearr;
163 int32_t *idxarr = seq->idxarr;
164 const USTRING_TYPE *us = seq->us;
166 while (len == 0)
168 ++val;
169 if (backw_stop != ~0ul)
171 /* There is something pushed. */
172 if (backw == backw_stop)
174 /* The last pushed character was handled. Continue
175 with forward characters. */
176 if (idxcnt < idxmax)
178 idxnow = idxcnt;
179 backw_stop = ~0ul;
181 else
182 /* Nothing any more. The backward sequence ended with
183 the last sequence in the string. Note that LEN
184 is still zero. */
185 break;
187 else
188 idxnow = --backw;
190 else
192 backw_stop = idxmax;
194 while (*us != L('\0'))
196 int32_t tmp = findidx (&us, -1);
197 rulearr[idxmax] = tmp >> 24;
198 idxarr[idxmax] = tmp & 0xffffff;
199 idxcnt = idxmax++;
201 if ((rulesets[rulearr[idxcnt] * nrules]
202 & sort_backward) == 0)
203 /* No more backward characters to push. */
204 break;
205 ++idxcnt;
208 if (backw_stop >= idxcnt)
210 /* No sequence at all or just one. */
211 if (idxcnt == idxmax || backw_stop > idxcnt)
212 /* Note that LEN is still zero. */
213 break;
215 backw_stop = ~0ul;
216 idxnow = idxcnt;
218 else
219 /* We pushed backward sequences. */
220 idxnow = backw = idxcnt - 1;
222 len = weights[idxarr[idxnow]++];
225 /* Update the structure. */
226 seq->val = val;
227 seq->len = len;
228 seq->backw_stop = backw_stop;
229 seq->backw = backw;
230 seq->idxcnt = idxcnt;
231 seq->idxmax = idxmax;
232 seq->idxnow = idxnow;
233 seq->us = us;
236 /* Get next sequence. Traverse the string as required. This function does not
237 set or use any index or rule cache. */
238 static void
239 get_next_seq_nocache (coll_seq *seq, int nrules, const unsigned char *rulesets,
240 const USTRING_TYPE *weights, const int32_t *table,
241 const USTRING_TYPE *extra, const int32_t *indirect,
242 int pass)
244 #include WEIGHT_H
245 size_t val = seq->val = 0;
246 int len = seq->len;
247 size_t backw_stop = seq->backw_stop;
248 size_t backw = seq->backw;
249 size_t idxcnt = seq->idxcnt;
250 size_t idxmax = seq->idxmax;
251 int32_t idx = seq->idx;
252 const USTRING_TYPE *us = seq->us;
254 while (len == 0)
256 ++val;
257 if (backw_stop != ~0ul)
259 /* There is something pushed. */
260 if (backw == backw_stop)
262 /* The last pushed character was handled. Continue
263 with forward characters. */
264 if (idxcnt < idxmax)
266 idx = seq->save_idx;
267 backw_stop = ~0ul;
269 else
271 /* Nothing anymore. The backward sequence ended with
272 the last sequence in the string. Note that len is
273 still zero. */
274 idx = 0;
275 break;
278 else
280 /* XXX Traverse BACKW sequences from the beginning of
281 BACKW_STOP to get the next sequence. Is ther a quicker way
282 to do this? */
283 size_t i = backw_stop;
284 us = seq->back_us;
285 while (i < backw)
287 int32_t tmp = findidx (&us, -1);
288 idx = tmp & 0xffffff;
289 i++;
291 --backw;
292 us = seq->us;
295 else
297 backw_stop = idxmax;
298 int32_t prev_idx = idx;
300 while (*us != L('\0'))
302 int32_t tmp = findidx (&us, -1);
303 unsigned char rule = tmp >> 24;
304 prev_idx = idx;
305 idx = tmp & 0xffffff;
306 idxcnt = idxmax++;
308 /* Save the rule for the first sequence. */
309 if (__glibc_unlikely (idxcnt == 0))
310 seq->rule = rule;
312 if ((rulesets[rule * nrules + pass]
313 & sort_backward) == 0)
314 /* No more backward characters to push. */
315 break;
316 ++idxcnt;
319 if (backw_stop >= idxcnt)
321 /* No sequence at all or just one. */
322 if (idxcnt == idxmax || backw_stop > idxcnt)
323 /* Note that len is still zero. */
324 break;
326 backw_stop = ~0ul;
328 else
330 /* We pushed backward sequences. If the stream ended with the
331 backward sequence, then we process the last sequence we
332 found. Otherwise we process the sequence before the last
333 one since the last one was a forward sequence. */
334 seq->back_us = seq->us;
335 seq->us = us;
336 backw = idxcnt;
337 if (idxmax > idxcnt)
339 backw--;
340 seq->save_idx = idx;
341 idx = prev_idx;
343 if (backw > backw_stop)
344 backw--;
348 len = weights[idx++];
349 /* Skip over indices of previous levels. */
350 for (int i = 0; i < pass; i++)
352 idx += len;
353 len = weights[idx];
354 idx++;
358 /* Update the structure. */
359 seq->val = val;
360 seq->len = len;
361 seq->backw_stop = backw_stop;
362 seq->backw = backw;
363 seq->idxcnt = idxcnt;
364 seq->idxmax = idxmax;
365 seq->us = us;
366 seq->idx = idx;
369 /* Compare two sequences. This version does not use the index and rules
370 cache. */
371 static int
372 do_compare_nocache (coll_seq *seq1, coll_seq *seq2, int position,
373 const USTRING_TYPE *weights)
375 int seq1len = seq1->len;
376 int seq2len = seq2->len;
377 size_t val1 = seq1->val;
378 size_t val2 = seq2->val;
379 int idx1 = seq1->idx;
380 int idx2 = seq2->idx;
381 int result = 0;
383 /* Test for position if necessary. */
384 if (position && val1 != val2)
386 result = val1 > val2 ? 1 : -1;
387 goto out;
390 /* Compare the two sequences. */
393 if (weights[idx1] != weights[idx2])
395 /* The sequences differ. */
396 result = weights[idx1] - weights[idx2];
397 goto out;
400 /* Increment the offsets. */
401 ++idx1;
402 ++idx2;
404 --seq1len;
405 --seq2len;
407 while (seq1len > 0 && seq2len > 0);
409 if (position && seq1len != seq2len)
410 result = seq1len - seq2len;
412 out:
413 seq1->len = seq1len;
414 seq2->len = seq2len;
415 seq1->idx = idx1;
416 seq2->idx = idx2;
417 return result;
420 /* Compare two sequences using the index cache. */
421 static int
422 do_compare (coll_seq *seq1, coll_seq *seq2, int position,
423 const USTRING_TYPE *weights)
425 int seq1len = seq1->len;
426 int seq2len = seq2->len;
427 size_t val1 = seq1->val;
428 size_t val2 = seq2->val;
429 int32_t *idx1arr = seq1->idxarr;
430 int32_t *idx2arr = seq2->idxarr;
431 int idx1now = seq1->idxnow;
432 int idx2now = seq2->idxnow;
433 int result = 0;
435 /* Test for position if necessary. */
436 if (position && val1 != val2)
438 result = val1 > val2 ? 1 : -1;
439 goto out;
442 /* Compare the two sequences. */
445 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
447 /* The sequences differ. */
448 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
449 goto out;
452 /* Increment the offsets. */
453 ++idx1arr[idx1now];
454 ++idx2arr[idx2now];
456 --seq1len;
457 --seq2len;
459 while (seq1len > 0 && seq2len > 0);
461 if (position && seq1len != seq2len)
462 result = seq1len - seq2len;
464 out:
465 seq1->len = seq1len;
466 seq2->len = seq2len;
467 return result;
471 STRCOLL (const STRING_TYPE *s1, const STRING_TYPE *s2, __locale_t l)
473 struct __locale_data *current = l->__locales[LC_COLLATE];
474 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
475 /* We don't assign the following values right away since it might be
476 unnecessary in case there are no rules. */
477 const unsigned char *rulesets;
478 const int32_t *table;
479 const USTRING_TYPE *weights;
480 const USTRING_TYPE *extra;
481 const int32_t *indirect;
483 if (nrules == 0)
484 return STRCMP (s1, s2);
486 rulesets = (const unsigned char *)
487 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
488 table = (const int32_t *)
489 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
490 weights = (const USTRING_TYPE *)
491 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
492 extra = (const USTRING_TYPE *)
493 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
494 indirect = (const int32_t *)
495 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
497 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
498 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
499 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
500 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
502 /* We need this a few times. */
503 size_t s1len = STRLEN (s1);
504 size_t s2len = STRLEN (s2);
506 /* Catch empty strings. */
507 if (__glibc_unlikely (s1len == 0) || __glibc_unlikely (s2len == 0))
508 return (s1len != 0) - (s2len != 0);
510 /* Perform the first pass over the string and while doing this find
511 and store the weights for each character. Since we want this to
512 be as fast as possible we are using `alloca' to store the temporary
513 values. But since there is no limit on the length of the string
514 we have to use `malloc' if the string is too long. We should be
515 very conservative here.
517 Please note that the localedef programs makes sure that `position'
518 is not used at the first level. */
520 coll_seq seq1, seq2;
521 bool use_malloc = false;
522 int result = 0;
524 memset (&seq1, 0, sizeof (seq1));
525 seq2 = seq1;
527 size_t size_max = SIZE_MAX / (sizeof (int32_t) + 1);
529 if (MIN (s1len, s2len) > size_max
530 || MAX (s1len, s2len) > size_max - MIN (s1len, s2len))
532 /* If the strings are long enough to cause overflow in the size request,
533 then skip the allocation and proceed with the non-cached routines. */
535 else if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
537 seq1.idxarr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
539 /* If we failed to allocate memory, we leave everything as NULL so that
540 we use the nocache version of traversal and comparison functions. */
541 if (seq1.idxarr != NULL)
543 seq2.idxarr = &seq1.idxarr[s1len];
544 seq1.rulearr = (unsigned char *) &seq2.idxarr[s2len];
545 seq2.rulearr = &seq1.rulearr[s1len];
546 use_malloc = true;
549 else
551 seq1.idxarr = (int32_t *) alloca (s1len * sizeof (int32_t));
552 seq2.idxarr = (int32_t *) alloca (s2len * sizeof (int32_t));
553 seq1.rulearr = (unsigned char *) alloca (s1len);
554 seq2.rulearr = (unsigned char *) alloca (s2len);
557 int rule = 0;
559 /* Cache values in the first pass and if needed, use them in subsequent
560 passes. */
561 for (int pass = 0; pass < nrules; ++pass)
563 seq1.idxcnt = 0;
564 seq1.idx = 0;
565 seq2.idx = 0;
566 seq1.backw_stop = ~0ul;
567 seq1.backw = ~0ul;
568 seq2.idxcnt = 0;
569 seq2.backw_stop = ~0ul;
570 seq2.backw = ~0ul;
572 /* We need the elements of the strings as unsigned values since they
573 are used as indices. */
574 seq1.us = (const USTRING_TYPE *) s1;
575 seq2.us = (const USTRING_TYPE *) s2;
577 /* We assume that if a rule has defined `position' in one section
578 this is true for all of them. */
579 int position = rulesets[rule * nrules + pass] & sort_position;
581 while (1)
583 if (__glibc_unlikely (seq1.idxarr == NULL))
585 get_next_seq_nocache (&seq1, nrules, rulesets, weights, table,
586 extra, indirect, pass);
587 get_next_seq_nocache (&seq2, nrules, rulesets, weights, table,
588 extra, indirect, pass);
590 else if (pass == 0)
592 get_next_seq (&seq1, nrules, rulesets, weights, table, extra,
593 indirect);
594 get_next_seq (&seq2, nrules, rulesets, weights, table, extra,
595 indirect);
597 else
599 get_next_seq_cached (&seq1, nrules, pass, rulesets, weights);
600 get_next_seq_cached (&seq2, nrules, pass, rulesets, weights);
603 /* See whether any or both strings are empty. */
604 if (seq1.len == 0 || seq2.len == 0)
606 if (seq1.len == seq2.len)
607 /* Both ended. So far so good, both strings are equal
608 at this level. */
609 break;
611 /* This means one string is shorter than the other. Find out
612 which one and return an appropriate value. */
613 result = seq1.len == 0 ? -1 : 1;
614 goto free_and_return;
617 if (__glibc_unlikely (seq1.idxarr == NULL))
618 result = do_compare_nocache (&seq1, &seq2, position, weights);
619 else
620 result = do_compare (&seq1, &seq2, position, weights);
621 if (result != 0)
622 goto free_and_return;
625 if (__glibc_likely (seq1.rulearr != NULL))
626 rule = seq1.rulearr[0];
627 else
628 rule = seq1.rule;
631 /* Free the memory if needed. */
632 free_and_return:
633 if (use_malloc)
634 free (seq1.idxarr);
636 return result;
638 libc_hidden_def (STRCOLL)
640 #ifndef WIDE_CHAR_VERSION
641 weak_alias (__strcoll_l, strcoll_l)
642 #endif