Update.
[glibc.git] / string / strcoll.c
blobce8ec04435c039642f5a18e5618d9b50fa057141
1 /* Copyright (C) 1995,96,97,98,99,2000,2001 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
28 #ifndef STRING_TYPE
29 # define STRING_TYPE char
30 # define USTRING_TYPE unsigned char
31 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
32 # define STRCOLL __strcoll_l
33 # else
34 # define STRCOLL strcoll
35 # endif
36 # define STRCMP strcmp
37 # define STRLEN strlen
38 # define WEIGHT_H "../locale/weight.h"
39 # define SUFFIX MB
40 # define L(arg) arg
41 #endif
43 #define CONCAT(a,b) CONCAT1(a,b)
44 #define CONCAT1(a,b) a##b
46 #include "../locale/localeinfo.h"
48 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
49 int
50 STRCOLL (s1, s2)
51 const STRING_TYPE *s1;
52 const STRING_TYPE *s2;
53 #else
54 int
55 STRCOLL (s1, s2, l)
56 const STRING_TYPE *s1;
57 const STRING_TYPE *s2;
58 __locale_t l;
59 #endif
61 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
62 struct locale_data *current = l->__locales[LC_COLLATE];
63 uint_fast32_t nrules = *((const uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
64 #else
65 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
66 #endif
67 /* We don't assign the following values right away since it might be
68 unnecessary in case there are no rules. */
69 const unsigned char *rulesets;
70 const int32_t *table;
71 const USTRING_TYPE *weights;
72 const USTRING_TYPE *extra;
73 const int32_t *indirect;
74 uint_fast32_t pass;
75 int result = 0;
76 const USTRING_TYPE *us1;
77 const USTRING_TYPE *us2;
78 size_t s1len;
79 size_t s2len;
80 int32_t *idx1arr;
81 int32_t *idx2arr;
82 unsigned char *rule1arr;
83 unsigned char *rule2arr;
84 size_t idx1max;
85 size_t idx2max;
86 size_t idx1cnt;
87 size_t idx2cnt;
88 size_t idx1now;
89 size_t idx2now;
90 size_t backw1_stop;
91 size_t backw2_stop;
92 size_t backw1;
93 size_t backw2;
94 int val1;
95 int val2;
96 int position;
97 int seq1len;
98 int seq2len;
99 int use_malloc;
101 #include WEIGHT_H
103 if (nrules == 0)
104 return STRCMP (s1, s2);
106 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
107 rulesets = (const unsigned char *)
108 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
109 table = (const int32_t *)
110 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
111 weights = (const USTRING_TYPE *)
112 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
113 extra = (const USTRING_TYPE *)
114 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
115 indirect = (const int32_t *)
116 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
117 #else
118 rulesets = (const unsigned char *)
119 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
120 table = (const int32_t *)
121 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
122 weights = (const USTRING_TYPE *)
123 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
124 extra = (const USTRING_TYPE *)
125 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
126 indirect = (const int32_t *)
127 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
128 #endif
129 use_malloc = 0;
131 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
132 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
133 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
134 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
136 /* We need this a few times. */
137 s1len = STRLEN (s1);
138 s2len = STRLEN (s2);
140 /* We need the elements of the strings as unsigned values since they
141 are used as indeces. */
142 us1 = (const USTRING_TYPE *) s1;
143 us2 = (const USTRING_TYPE *) s2;
145 /* Perform the first pass over the string and while doing this find
146 and store the weights for each character. Since we want this to
147 be as fast as possible we are using `alloca' to store the temporary
148 values. But since there is no limit on the length of the string
149 we have to use `malloc' if the string is too long. We should be
150 very conservative here.
152 Please note that the localedef programs makes sure that `position'
153 is not used at the first level. */
154 if (s1len + s2len >= 16384)
156 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
157 idx2arr = &idx1arr[s1len];
158 rule1arr = (unsigned char *) &idx2arr[s2len];
159 rule2arr = &rule1arr[s1len];
161 if (idx1arr == NULL)
162 /* No memory. Well, go with the stack then.
164 XXX Once this implementation is stable we will handle this
165 differently. Instead of precomputing the indeces we will
166 do this in time. This means, though, that this happens for
167 every pass again. */
168 goto try_stack;
169 use_malloc = 1;
171 else
173 try_stack:
174 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
175 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
176 rule1arr = (unsigned char *) alloca (s1len);
177 rule2arr = (unsigned char *) alloca (s2len);
180 idx1cnt = 0;
181 idx2cnt = 0;
182 idx1max = 0;
183 idx2max = 0;
184 idx1now = 0;
185 idx2now = 0;
186 backw1_stop = ~0ul;
187 backw2_stop = ~0ul;
188 backw1 = ~0ul;
189 backw2 = ~0ul;
190 seq1len = 0;
191 seq2len = 0;
192 position = rulesets[0] & sort_position;
193 while (1)
195 val1 = 0;
196 val2 = 0;
198 /* Get the next non-IGNOREd element for string `s1'. */
199 if (seq1len == 0)
202 ++val1;
204 if (backw1_stop != ~0ul)
206 /* The is something pushed. */
207 if (backw1 == backw1_stop)
209 /* The last pushed character was handled. Continue
210 with forward characters. */
211 if (idx1cnt < idx1max)
212 idx1now = idx1cnt;
213 else
214 /* Nothing anymore. The backward sequence ended with
215 the last sequence in the string. Note that seq1len
216 is still zero. */
217 break;
219 else
220 idx1now = --backw1;
222 else
224 backw1_stop = idx1max;
226 while (*us1 != L('\0'))
228 int32_t tmp = findidx (&us1);
229 rule1arr[idx1max] = tmp >> 24;
230 idx1arr[idx1max] = tmp & 0xffffff;
231 idx1cnt = idx1max++;
233 if ((rulesets[rule1arr[idx1cnt] * nrules]
234 & sort_backward) == 0)
235 /* No more backward characters to push. */
236 break;
237 ++idx1cnt;
240 if (backw1_stop >= idx1cnt)
242 /* No sequence at all or just one. */
243 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
244 /* Note that seq1len is still zero. */
245 break;
247 backw1_stop = ~0ul;
248 idx1now = idx1cnt;
250 else
251 /* We pushed backward sequences. */
252 idx1now = backw1 = idx1cnt - 1;
255 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
257 /* And the same for string `s2'. */
258 if (seq2len == 0)
261 ++val2;
263 if (backw2_stop != ~0ul)
265 /* The is something pushed. */
266 if (backw2 == backw2_stop)
268 /* The last pushed character was handled. Continue
269 with forward characters. */
270 if (idx2cnt < idx2max)
271 idx2now = idx2cnt;
272 else
273 /* Nothing anymore. The backward sequence ended with
274 the last sequence in the string. Note that seq2len
275 is still zero. */
276 break;
278 else
279 idx2now = --backw2;
281 else
283 backw2_stop = idx2max;
285 while (*us2 != L('\0'))
287 int32_t tmp = findidx (&us2);
288 rule2arr[idx2max] = tmp >> 24;
289 idx2arr[idx2max] = tmp & 0xffffff;
290 idx2cnt = idx2max++;
292 if ((rulesets[rule2arr[idx2cnt] * nrules]
293 & sort_backward) == 0)
294 /* No more backward characters to push. */
295 break;
296 ++idx2cnt;
299 if (backw2_stop >= idx2cnt)
301 /* No sequence at all or just one. */
302 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
303 /* Note that seq1len is still zero. */
304 break;
306 backw2_stop = ~0ul;
307 idx2now = idx2cnt;
309 else
310 /* We pushed backward sequences. */
311 idx2now = backw2 = idx2cnt - 1;
314 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
316 /* See whether any or both strings are empty. */
317 if (seq1len == 0 || seq2len == 0)
319 if (seq1len == seq2len)
320 /* Both ended. So far so good, both strings are equal at the
321 first level. */
322 break;
324 /* This means one string is shorter than the other. Find out
325 which one and return an appropriate value. */
326 result = seq1len == 0 ? -1 : 1;
327 goto free_and_return;
330 /* Test for position if necessary. */
331 if (position && val1 != val2)
333 result = val1 - val2;
334 goto free_and_return;
337 /* Compare the two sequences. */
340 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
342 /* The sequences differ. */
343 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
344 goto free_and_return;
347 /* Increment the offsets. */
348 ++idx1arr[idx1now];
349 ++idx2arr[idx2now];
351 --seq1len;
352 --seq2len;
354 while (seq1len > 0 && seq2len > 0);
356 if (position && seq1len != seq2len)
358 result = seq1len - seq2len;
359 goto free_and_return;
363 /* Now the remaining passes over the weights. We now use the
364 indeces we found before. */
365 for (pass = 1; pass < nrules; ++pass)
367 /* We assume that if a rule has defined `position' in one section
368 this is true for all of them. */
369 idx1cnt = 0;
370 idx2cnt = 0;
371 backw1_stop = ~0ul;
372 backw2_stop = ~0ul;
373 backw1 = ~0ul;
374 backw2 = ~0ul;
375 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
377 while (1)
379 val1 = 0;
380 val2 = 0;
382 /* Get the next non-IGNOREd element for string `s1'. */
383 if (seq1len == 0)
386 ++val1;
388 if (backw1_stop != ~0ul)
390 /* The is something pushed. */
391 if (backw1 == backw1_stop)
393 /* The last pushed character was handled. Continue
394 with forward characters. */
395 if (idx1cnt < idx1max)
396 idx1now = idx1cnt;
397 else
399 /* Nothing anymore. The backward sequence
400 ended with the last sequence in the string. */
401 idx1now = ~0ul;
402 break;
405 else
406 idx1now = --backw1;
408 else
410 backw1_stop = idx1cnt;
412 while (idx1cnt < idx1max)
414 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
415 & sort_backward) == 0)
416 /* No more backward characters to push. */
417 break;
418 ++idx1cnt;
421 if (backw1_stop == idx1cnt)
423 /* No sequence at all or just one. */
424 if (idx1cnt == idx1max)
425 /* Note that seq1len is still zero. */
426 break;
428 backw1_stop = ~0ul;
429 idx1now = idx1cnt++;
431 else
432 /* We pushed backward sequences. */
433 idx1now = backw1 = idx1cnt - 1;
436 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
438 /* And the same for string `s2'. */
439 if (seq2len == 0)
442 ++val2;
444 if (backw2_stop != ~0ul)
446 /* The is something pushed. */
447 if (backw2 == backw2_stop)
449 /* The last pushed character was handled. Continue
450 with forward characters. */
451 if (idx2cnt < idx2max)
452 idx2now = idx2cnt;
453 else
455 /* Nothing anymore. The backward sequence
456 ended with the last sequence in the string. */
457 idx2now = ~0ul;
458 break;
461 else
462 idx2now = --backw2;
464 else
466 backw2_stop = idx2cnt;
468 while (idx2cnt < idx2max)
470 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
471 & sort_backward) == 0)
472 /* No more backward characters to push. */
473 break;
474 ++idx2cnt;
477 if (backw2_stop == idx2cnt)
479 /* No sequence at all or just one. */
480 if (idx2cnt == idx2max)
481 /* Note that seq2len is still zero. */
482 break;
484 backw2_stop = ~0ul;
485 idx2now = idx2cnt++;
487 else
488 /* We pushed backward sequences. */
489 idx2now = backw2 = idx2cnt - 1;
492 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
494 /* See whether any or both strings are empty. */
495 if (seq1len == 0 || seq2len == 0)
497 if (seq1len == seq2len)
498 /* Both ended. So far so good, both strings are equal
499 at this level. */
500 break;
502 /* This means one string is shorter than the other. Find out
503 which one and return an appropriate value. */
504 result = seq1len == 0 ? -1 : 1;
505 goto free_and_return;
508 /* Test for position if necessary. */
509 if (position && val1 != val2)
511 result = val1 - val2;
512 goto free_and_return;
515 /* Compare the two sequences. */
518 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
520 /* The sequences differ. */
521 result = (weights[idx1arr[idx1now]]
522 - weights[idx2arr[idx2now]]);
523 goto free_and_return;
526 /* Increment the offsets. */
527 ++idx1arr[idx1now];
528 ++idx2arr[idx2now];
530 --seq1len;
531 --seq2len;
533 while (seq1len > 0 && seq2len > 0);
535 if (position && seq1len != seq2len)
537 result = seq1len - seq2len;
538 goto free_and_return;
543 /* Free the memory if needed. */
544 free_and_return:
545 if (use_malloc)
546 free (idx1arr);
548 return result;