Set LC_ALL=C before sed.
[glibc.git] / string / strcoll_l.c
blobecda08fb89c855a7d0474ca9b379fdded1b1d505
1 /* Copyright (C) 1995-2013 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # define STRCOLL __strcoll_l
32 # define STRCMP strcmp
33 # define STRLEN strlen
34 # define WEIGHT_H "../locale/weight.h"
35 # define SUFFIX MB
36 # define L(arg) arg
37 #endif
39 #define CONCAT(a,b) CONCAT1(a,b)
40 #define CONCAT1(a,b) a##b
42 #include "../locale/localeinfo.h"
44 int
45 STRCOLL (s1, s2, l)
46 const STRING_TYPE *s1;
47 const STRING_TYPE *s2;
48 __locale_t l;
50 struct __locale_data *current = l->__locales[LC_COLLATE];
51 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
52 /* We don't assign the following values right away since it might be
53 unnecessary in case there are no rules. */
54 const unsigned char *rulesets;
55 const int32_t *table;
56 const USTRING_TYPE *weights;
57 const USTRING_TYPE *extra;
58 const int32_t *indirect;
59 uint_fast32_t pass;
60 int result = 0;
61 const USTRING_TYPE *us1;
62 const USTRING_TYPE *us2;
63 size_t s1len;
64 size_t s2len;
65 int32_t *idx1arr;
66 int32_t *idx2arr;
67 unsigned char *rule1arr;
68 unsigned char *rule2arr;
69 size_t idx1max;
70 size_t idx2max;
71 size_t idx1cnt;
72 size_t idx2cnt;
73 size_t idx1now;
74 size_t idx2now;
75 size_t backw1_stop;
76 size_t backw2_stop;
77 size_t backw1;
78 size_t backw2;
79 int val1;
80 int val2;
81 int position;
82 int seq1len;
83 int seq2len;
84 int use_malloc;
86 #include WEIGHT_H
88 if (nrules == 0)
89 return STRCMP (s1, s2);
91 rulesets = (const unsigned char *)
92 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
93 table = (const int32_t *)
94 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
95 weights = (const USTRING_TYPE *)
96 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
97 extra = (const USTRING_TYPE *)
98 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
99 indirect = (const int32_t *)
100 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
101 use_malloc = 0;
103 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
104 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
105 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
106 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
108 /* We need this a few times. */
109 s1len = STRLEN (s1);
110 s2len = STRLEN (s2);
112 /* Catch empty strings. */
113 if (__builtin_expect (s1len == 0, 0) || __builtin_expect (s2len == 0, 0))
114 return (s1len != 0) - (s2len != 0);
116 /* We need the elements of the strings as unsigned values since they
117 are used as indeces. */
118 us1 = (const USTRING_TYPE *) s1;
119 us2 = (const USTRING_TYPE *) s2;
121 /* Perform the first pass over the string and while doing this find
122 and store the weights for each character. Since we want this to
123 be as fast as possible we are using `alloca' to store the temporary
124 values. But since there is no limit on the length of the string
125 we have to use `malloc' if the string is too long. We should be
126 very conservative here.
128 Please note that the localedef programs makes sure that `position'
129 is not used at the first level. */
130 if (! __libc_use_alloca ((s1len + s2len) * (sizeof (int32_t) + 1)))
132 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
133 idx2arr = &idx1arr[s1len];
134 rule1arr = (unsigned char *) &idx2arr[s2len];
135 rule2arr = &rule1arr[s1len];
137 if (idx1arr == NULL)
138 /* No memory. Well, go with the stack then.
140 XXX Once this implementation is stable we will handle this
141 differently. Instead of precomputing the indeces we will
142 do this in time. This means, though, that this happens for
143 every pass again. */
144 goto try_stack;
145 use_malloc = 1;
147 else
149 try_stack:
150 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
151 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
152 rule1arr = (unsigned char *) alloca (s1len);
153 rule2arr = (unsigned char *) alloca (s2len);
156 idx1cnt = 0;
157 idx2cnt = 0;
158 idx1max = 0;
159 idx2max = 0;
160 idx1now = 0;
161 idx2now = 0;
162 backw1_stop = ~0ul;
163 backw2_stop = ~0ul;
164 backw1 = ~0ul;
165 backw2 = ~0ul;
166 seq1len = 0;
167 seq2len = 0;
168 position = rulesets[0] & sort_position;
169 while (1)
171 val1 = 0;
172 val2 = 0;
174 /* Get the next non-IGNOREd element for string `s1'. */
175 if (seq1len == 0)
178 ++val1;
180 if (backw1_stop != ~0ul)
182 /* The is something pushed. */
183 if (backw1 == backw1_stop)
185 /* The last pushed character was handled. Continue
186 with forward characters. */
187 if (idx1cnt < idx1max)
189 idx1now = idx1cnt;
190 backw1_stop = ~0ul;
192 else
193 /* Nothing anymore. The backward sequence ended with
194 the last sequence in the string. Note that seq1len
195 is still zero. */
196 break;
198 else
199 idx1now = --backw1;
201 else
203 backw1_stop = idx1max;
205 while (*us1 != L('\0'))
207 int32_t tmp = findidx (&us1, -1);
208 rule1arr[idx1max] = tmp >> 24;
209 idx1arr[idx1max] = tmp & 0xffffff;
210 idx1cnt = idx1max++;
212 if ((rulesets[rule1arr[idx1cnt] * nrules]
213 & sort_backward) == 0)
214 /* No more backward characters to push. */
215 break;
216 ++idx1cnt;
219 if (backw1_stop >= idx1cnt)
221 /* No sequence at all or just one. */
222 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
223 /* Note that seq1len is still zero. */
224 break;
226 backw1_stop = ~0ul;
227 idx1now = idx1cnt;
229 else
230 /* We pushed backward sequences. */
231 idx1now = backw1 = idx1cnt - 1;
234 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
236 /* And the same for string `s2'. */
237 if (seq2len == 0)
240 ++val2;
242 if (backw2_stop != ~0ul)
244 /* The is something pushed. */
245 if (backw2 == backw2_stop)
247 /* The last pushed character was handled. Continue
248 with forward characters. */
249 if (idx2cnt < idx2max)
251 idx2now = idx2cnt;
252 backw2_stop = ~0ul;
254 else
255 /* Nothing anymore. The backward sequence ended with
256 the last sequence in the string. Note that seq2len
257 is still zero. */
258 break;
260 else
261 idx2now = --backw2;
263 else
265 backw2_stop = idx2max;
267 while (*us2 != L('\0'))
269 int32_t tmp = findidx (&us2, -1);
270 rule2arr[idx2max] = tmp >> 24;
271 idx2arr[idx2max] = tmp & 0xffffff;
272 idx2cnt = idx2max++;
274 if ((rulesets[rule2arr[idx2cnt] * nrules]
275 & sort_backward) == 0)
276 /* No more backward characters to push. */
277 break;
278 ++idx2cnt;
281 if (backw2_stop >= idx2cnt)
283 /* No sequence at all or just one. */
284 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
285 /* Note that seq1len is still zero. */
286 break;
288 backw2_stop = ~0ul;
289 idx2now = idx2cnt;
291 else
292 /* We pushed backward sequences. */
293 idx2now = backw2 = idx2cnt - 1;
296 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
298 /* See whether any or both strings are empty. */
299 if (seq1len == 0 || seq2len == 0)
301 if (seq1len == seq2len)
302 /* Both ended. So far so good, both strings are equal at the
303 first level. */
304 break;
306 /* This means one string is shorter than the other. Find out
307 which one and return an appropriate value. */
308 result = seq1len == 0 ? -1 : 1;
309 goto free_and_return;
312 /* Test for position if necessary. */
313 if (position && val1 != val2)
315 result = val1 - val2;
316 goto free_and_return;
319 /* Compare the two sequences. */
322 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
324 /* The sequences differ. */
325 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
326 goto free_and_return;
329 /* Increment the offsets. */
330 ++idx1arr[idx1now];
331 ++idx2arr[idx2now];
333 --seq1len;
334 --seq2len;
336 while (seq1len > 0 && seq2len > 0);
338 if (position && seq1len != seq2len)
340 result = seq1len - seq2len;
341 goto free_and_return;
345 /* Now the remaining passes over the weights. We now use the
346 indeces we found before. */
347 for (pass = 1; pass < nrules; ++pass)
349 /* We assume that if a rule has defined `position' in one section
350 this is true for all of them. */
351 idx1cnt = 0;
352 idx2cnt = 0;
353 backw1_stop = ~0ul;
354 backw2_stop = ~0ul;
355 backw1 = ~0ul;
356 backw2 = ~0ul;
357 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
359 while (1)
361 val1 = 0;
362 val2 = 0;
364 /* Get the next non-IGNOREd element for string `s1'. */
365 if (seq1len == 0)
368 ++val1;
370 if (backw1_stop != ~0ul)
372 /* The is something pushed. */
373 if (backw1 == backw1_stop)
375 /* The last pushed character was handled. Continue
376 with forward characters. */
377 if (idx1cnt < idx1max)
379 idx1now = idx1cnt;
380 backw1_stop = ~0ul;
382 else
384 /* Nothing anymore. The backward sequence
385 ended with the last sequence in the string. */
386 idx1now = ~0ul;
387 break;
390 else
391 idx1now = --backw1;
393 else
395 backw1_stop = idx1cnt;
397 while (idx1cnt < idx1max)
399 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
400 & sort_backward) == 0)
401 /* No more backward characters to push. */
402 break;
403 ++idx1cnt;
406 if (backw1_stop == idx1cnt)
408 /* No sequence at all or just one. */
409 if (idx1cnt == idx1max)
410 /* Note that seq1len is still zero. */
411 break;
413 backw1_stop = ~0ul;
414 idx1now = idx1cnt++;
416 else
417 /* We pushed backward sequences. */
418 idx1now = backw1 = idx1cnt - 1;
421 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
423 /* And the same for string `s2'. */
424 if (seq2len == 0)
427 ++val2;
429 if (backw2_stop != ~0ul)
431 /* The is something pushed. */
432 if (backw2 == backw2_stop)
434 /* The last pushed character was handled. Continue
435 with forward characters. */
436 if (idx2cnt < idx2max)
438 idx2now = idx2cnt;
439 backw2_stop = ~0ul;
441 else
443 /* Nothing anymore. The backward sequence
444 ended with the last sequence in the string. */
445 idx2now = ~0ul;
446 break;
449 else
450 idx2now = --backw2;
452 else
454 backw2_stop = idx2cnt;
456 while (idx2cnt < idx2max)
458 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
459 & sort_backward) == 0)
460 /* No more backward characters to push. */
461 break;
462 ++idx2cnt;
465 if (backw2_stop == idx2cnt)
467 /* No sequence at all or just one. */
468 if (idx2cnt == idx2max)
469 /* Note that seq2len is still zero. */
470 break;
472 backw2_stop = ~0ul;
473 idx2now = idx2cnt++;
475 else
476 /* We pushed backward sequences. */
477 idx2now = backw2 = idx2cnt - 1;
480 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
482 /* See whether any or both strings are empty. */
483 if (seq1len == 0 || seq2len == 0)
485 if (seq1len == seq2len)
486 /* Both ended. So far so good, both strings are equal
487 at this level. */
488 break;
490 /* This means one string is shorter than the other. Find out
491 which one and return an appropriate value. */
492 result = seq1len == 0 ? -1 : 1;
493 goto free_and_return;
496 /* Test for position if necessary. */
497 if (position && val1 != val2)
499 result = val1 - val2;
500 goto free_and_return;
503 /* Compare the two sequences. */
506 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
508 /* The sequences differ. */
509 result = (weights[idx1arr[idx1now]]
510 - weights[idx2arr[idx2now]]);
511 goto free_and_return;
514 /* Increment the offsets. */
515 ++idx1arr[idx1now];
516 ++idx2arr[idx2now];
518 --seq1len;
519 --seq2len;
521 while (seq1len > 0 && seq2len > 0);
523 if (position && seq1len != seq2len)
525 result = seq1len - seq2len;
526 goto free_and_return;
531 /* Free the memory if needed. */
532 free_and_return:
533 if (use_malloc)
534 free (idx1arr);
536 return result;
538 libc_hidden_def (STRCOLL)
540 #ifndef WIDE_CHAR_VERSION
541 weak_alias (__strcoll_l, strcoll_l)
542 #endif