Update.
[glibc.git] / string / strcoll.c
blob32d91244215a3863e6db8b7cfced6ac707df7a62
1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #ifndef STRING_TYPE
28 # define STRING_TYPE char
29 # define USTRING_TYPE unsigned char
30 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
31 # define STRCOLL __strcoll_l
32 # else
33 # define STRCOLL strcoll
34 # endif
35 # define STRCMP strcmp
36 # define STRLEN strlen
37 # define WEIGHT_H "../locale/weight.h"
38 # define SUFFIX MB
39 # define L(arg) arg
40 #endif
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
45 #include "../locale/localeinfo.h"
47 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
48 int
49 STRCOLL (s1, s2)
50 const STRING_TYPE *s1;
51 const STRING_TYPE *s2;
52 #else
53 int
54 STRCOLL (s1, s2, l)
55 const STRING_TYPE *s1;
56 const STRING_TYPE *s2;
57 __locale_t l;
58 #endif
60 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
61 struct locale_data *current = l->__locales[LC_COLLATE];
62 uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
63 #else
64 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
65 #endif
66 /* We don't assign the following values right away since it might be
67 unnecessary in case there are no rules. */
68 const unsigned char *rulesets;
69 const int32_t *table;
70 const USTRING_TYPE *weights;
71 const USTRING_TYPE *extra;
72 const int32_t *indirect;
73 uint_fast32_t pass;
74 int result = 0;
75 const USTRING_TYPE *us1;
76 const USTRING_TYPE *us2;
77 size_t s1len;
78 size_t s2len;
79 int32_t *idx1arr;
80 int32_t *idx2arr;
81 unsigned char *rule1arr;
82 unsigned char *rule2arr;
83 size_t idx1max;
84 size_t idx2max;
85 size_t idx1cnt;
86 size_t idx2cnt;
87 size_t idx1now;
88 size_t idx2now;
89 size_t backw1_stop;
90 size_t backw2_stop;
91 size_t backw1;
92 size_t backw2;
93 int val1;
94 int val2;
95 int position;
96 int seq1len;
97 int seq2len;
98 int use_malloc;
99 #ifdef WIDE_CHAR_VERSION
100 size_t size;
101 size_t layers;
102 const wint_t *names;
103 #endif
105 #include WEIGHT_H
107 if (nrules == 0)
108 return STRCMP (s1, s2);
110 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
111 rulesets = (const unsigned char *)
112 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
113 table = (const int32_t *)
114 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
115 weights = (const USTRING_TYPE *)
116 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
117 extra = (const USTRING_TYPE *)
118 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
119 indirect = (const int32_t *)
120 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
121 # ifdef WIDE_CHAR_VERSION
122 names = (const wint_t *)
123 current->values[_NL_ITEM_INDEX (_NL_COLLATE_NAMES)].string;
124 size = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word;
125 layers = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word;
126 # endif
127 #else
128 rulesets = (const unsigned char *)
129 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
130 table = (const int32_t *)
131 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
132 weights = (const USTRING_TYPE *)
133 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
134 extra = (const USTRING_TYPE *)
135 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
136 indirect = (const int32_t *)
137 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
138 # ifdef WIDE_CHAR_VERSION
139 names = (const wint_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_NAMES);
140 size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE);
141 layers = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS);
142 # endif
143 #endif
144 use_malloc = 0;
146 /* We need this a few times. */
147 s1len = STRLEN (s1);
148 s2len = STRLEN (s2);
150 /* We need the elements of the strings as unsigned values since they
151 are used as indeces. */
152 us1 = (const USTRING_TYPE *) s1;
153 us2 = (const USTRING_TYPE *) s2;
155 /* Perform the first pass over the string and while doing this find
156 and store the weights for each character. Since we want this to
157 be as fast as possible we are using `alloca' to store the temporary
158 values. But since there is no limit on the length of the string
159 we have to use `malloc' if the string is too long. We should be
160 very conservative here.
162 Please note that the localedef programs makes sure that `position'
163 is not used at the first level. */
164 if (s1len + s2len >= 16384)
166 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
167 idx2arr = &idx1arr[s2len];
168 rule1arr = (unsigned char *) &idx2arr[s2len];
169 rule2arr = &rule1arr[s1len];
171 if (idx1arr == NULL)
172 /* No memory. Well, go with the stack then.
174 XXX Once this implementation is stable we will handle this
175 differently. Instead of precomputing the indeces we will
176 do this in time. This means, though, that this happens for
177 every pass again. */
178 goto try_stack;
179 use_malloc = 1;
181 else
183 try_stack:
184 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
185 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
186 rule1arr = (unsigned char *) alloca (s2len);
187 rule2arr = (unsigned char *) alloca (s2len);
190 idx1cnt = 0;
191 idx2cnt = 0;
192 idx1max = 0;
193 idx2max = 0;
194 idx1now = 0;
195 idx2now = 0;
196 backw1_stop = ~0ul;
197 backw2_stop = ~0ul;
198 backw1 = ~0ul;
199 backw2 = ~0ul;
200 seq1len = 0;
201 seq2len = 0;
202 position = rulesets[0] & sort_position;
203 while (1)
205 val1 = 0;
206 val2 = 0;
208 /* Get the next non-IGNOREd element for string `s1'. */
209 if (seq1len == 0)
212 ++val1;
214 if (backw1_stop != ~0ul)
216 /* The is something pushed. */
217 if (backw1 == backw1_stop)
219 /* The last pushed character was handled. Continue
220 with forward characters. */
221 if (idx1cnt < idx1max)
222 idx1now = idx1cnt;
223 else
224 /* Nothing anymore. The backward sequence ended with
225 the last sequence in the string. Note that seq1len
226 is still zero. */
227 break;
229 else
230 idx1now = --backw1;
232 else
234 backw1_stop = idx1max;
236 while (*us1 != L('\0'))
238 int32_t tmp = findidx (&us1);
239 rule1arr[idx1max] = tmp >> 24;
240 idx1arr[idx1max] = tmp & 0xffffff;
241 idx1cnt = idx1max++;
243 if ((rulesets[rule1arr[idx1cnt] * nrules]
244 & sort_backward) == 0)
245 /* No more backward characters to push. */
246 break;
247 ++idx1cnt;
250 if (backw1_stop >= idx1cnt)
252 /* No sequence at all or just one. */
253 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
254 /* Note that seq1len is still zero. */
255 break;
257 backw1_stop = ~0ul;
258 idx1now = idx1cnt;
260 else
261 /* We pushed backward sequences. */
262 idx1now = backw1 = idx1cnt - 1;
265 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
267 /* And the same for string `s2'. */
268 if (seq2len == 0)
271 ++val2;
273 if (backw2_stop != ~0ul)
275 /* The is something pushed. */
276 if (backw2 == backw2_stop)
278 /* The last pushed character was handled. Continue
279 with forward characters. */
280 if (idx2cnt < idx2max)
281 idx2now = idx2cnt;
282 else
283 /* Nothing anymore. The backward sequence ended with
284 the last sequence in the string. Note that seq2len
285 is still zero. */
286 break;
288 else
289 idx2now = --backw2;
291 else
293 backw2_stop = idx2max;
295 while (*us2 != L('\0'))
297 int32_t tmp = findidx (&us2);
298 rule2arr[idx2max] = tmp >> 24;
299 idx2arr[idx2max] = tmp & 0xffffff;
300 idx2cnt = idx2max++;
302 if ((rulesets[rule2arr[idx2cnt] * nrules]
303 & sort_backward) == 0)
304 /* No more backward characters to push. */
305 break;
306 ++idx2cnt;
309 if (backw2_stop >= idx2cnt)
311 /* No sequence at all or just one. */
312 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
313 /* Note that seq1len is still zero. */
314 break;
316 backw2_stop = ~0ul;
317 idx2now = idx2cnt;
319 else
320 /* We pushed backward sequences. */
321 idx2now = backw2 = idx2cnt - 1;
324 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
326 /* See whether any or both strings are empty. */
327 if (seq1len == 0 || seq2len == 0)
329 if (seq1len == seq2len)
330 /* Both ended. So far so good, both strings are equal at the
331 first level. */
332 break;
334 /* This means one string is shorter than the other. Find out
335 which one and return an appropriate value. */
336 result = seq1len == 0 ? -1 : 1;
337 goto free_and_return;
340 /* Test for position if necessary. */
341 if (position && val1 != val2)
343 result = val1 - val2;
344 goto free_and_return;
347 /* Compare the two sequences. */
350 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
352 /* The sequences differ. */
353 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
354 goto free_and_return;
357 /* Increment the offsets. */
358 ++idx1arr[idx1now];
359 ++idx2arr[idx2now];
361 --seq1len;
362 --seq2len;
364 while (seq1len > 0 && seq2len > 0);
366 if (position && seq1len != seq2len)
368 result = seq1len - seq2len;
369 goto free_and_return;
373 /* Now the remaining passes over the weights. We now use the
374 indeces we found before. */
375 for (pass = 1; pass < nrules; ++pass)
377 /* We assume that if a rule has defined `position' in one section
378 this is true for all of them. */
379 idx1cnt = 0;
380 idx2cnt = 0;
381 backw1_stop = ~0ul;
382 backw2_stop = ~0ul;
383 backw1 = ~0ul;
384 backw2 = ~0ul;
385 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
387 while (1)
389 val1 = 0;
390 val2 = 0;
392 /* Get the next non-IGNOREd element for string `s1'. */
393 if (seq1len == 0)
396 ++val1;
398 if (backw1_stop != ~0ul)
400 /* The is something pushed. */
401 if (backw1 == backw1_stop)
403 /* The last pushed character was handled. Continue
404 with forward characters. */
405 if (idx1cnt < idx1max)
406 idx1now = idx1cnt;
407 else
409 /* Nothing anymore. The backward sequence
410 ended with the last sequence in the string. */
411 idx1now = ~0ul;
412 break;
415 else
416 idx1now = --backw1;
418 else
420 backw1_stop = idx1cnt;
422 while (idx1cnt < idx1max)
424 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
425 & sort_backward) == 0)
426 /* No more backward characters to push. */
427 break;
428 ++idx1cnt;
431 if (backw1_stop == idx1cnt)
433 /* No sequence at all or just one. */
434 if (idx1cnt == idx1max)
435 /* Note that seq2len is still zero. */
436 break;
438 backw1_stop = ~0ul;
439 idx1now = idx1cnt++;
441 else
442 /* We pushed backward sequences. */
443 idx1now = backw1 = idx1cnt - 1;
446 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
448 /* And the same for string `s2'. */
449 if (seq2len == 0)
452 ++val2;
454 if (backw2_stop != ~0ul)
456 /* The is something pushed. */
457 if (backw2 == backw2_stop)
459 /* The last pushed character was handled. Continue
460 with forward characters. */
461 if (idx2cnt < idx2max)
462 idx2now = idx2cnt;
463 else
465 /* Nothing anymore. The backward sequence
466 ended with the last sequence in the string. */
467 idx2now = ~0ul;
468 break;
471 else
472 idx2now = --backw2;
474 else
476 backw2_stop = idx2cnt;
478 while (idx2cnt < idx2max)
480 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
481 & sort_backward) == 0)
482 /* No more backward characters to push. */
483 break;
484 ++idx2cnt;
487 if (backw2_stop == idx2cnt)
489 /* No sequence at all or just one. */
490 if (idx2cnt == idx2max)
491 /* Note that seq2len is still zero. */
492 break;
494 backw2_stop = ~0ul;
495 idx2now = idx2cnt++;
497 else
498 /* We pushed backward sequences. */
499 idx2now = backw2 = idx2cnt - 1;
502 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
504 /* See whether any or both strings are empty. */
505 if (seq1len == 0 || seq2len == 0)
507 if (seq1len == seq2len)
508 /* Both ended. So far so good, both strings are equal
509 at this level. */
510 break;
512 /* This means one string is shorter than the other. Find out
513 which one and return an appropriate value. */
514 result = seq1len == 0 ? -1 : 1;
515 goto free_and_return;
518 /* Test for position if necessary. */
519 if (position && val1 != val2)
521 result = val1 - val2;
522 goto free_and_return;
525 /* Compare the two sequences. */
528 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
530 /* The sequences differ. */
531 result = (weights[idx1arr[idx1now]]
532 - weights[idx2arr[idx2now]]);
533 goto free_and_return;
536 /* Increment the offsets. */
537 ++idx1arr[idx1now];
538 ++idx2arr[idx2now];
540 --seq1len;
541 --seq2len;
543 while (seq1len > 0 && seq2len > 0);
545 if (position && seq1len != seq2len)
547 result = seq1len - seq2len;
548 goto free_and_return;
553 /* Free the memory if needed. */
554 free_and_return:
555 if (use_malloc)
556 free (idx1arr);
558 return result;