Update.
[glibc.git] / string / strcoll.c
blob0df5752e50c52b2eab6356135651772a00f34ee0
1 /* Copyright (C) 1995, 96, 97, 98, 99, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #ifndef STRING_TYPE
28 # define STRING_TYPE char
29 # define USTRING_TYPE unsigned char
30 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
31 # define STRCOLL __strcoll_l
32 # else
33 # define STRCOLL strcoll
34 # endif
35 # define STRCMP strcmp
36 # define STRLEN strlen
37 # define WEIGHT_H "../locale/weight.h"
38 # define SUFFIX MB
39 # define L(arg) arg
40 #endif
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
45 #include "../locale/localeinfo.h"
47 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
48 int
49 STRCOLL (s1, s2)
50 const STRING_TYPE *s1;
51 const STRING_TYPE *s2;
52 #else
53 int
54 STRCOLL (s1, s2, l)
55 const STRING_TYPE *s1;
56 const STRING_TYPE *s2;
57 __locale_t l;
58 #endif
60 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
61 struct locale_data *current = l->__locales[LC_COLLATE];
62 uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
63 #else
64 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
65 #endif
66 /* We don't assign the following values right away since it might be
67 unnecessary in case there are no rules. */
68 const unsigned char *rulesets;
69 const int32_t *table;
70 const USTRING_TYPE *weights;
71 const USTRING_TYPE *extra;
72 const int32_t *indirect;
73 uint_fast32_t pass;
74 int result = 0;
75 const USTRING_TYPE *us1;
76 const USTRING_TYPE *us2;
77 size_t s1len;
78 size_t s2len;
79 int32_t *idx1arr;
80 int32_t *idx2arr;
81 unsigned char *rule1arr;
82 unsigned char *rule2arr;
83 size_t idx1max;
84 size_t idx2max;
85 size_t idx1cnt;
86 size_t idx2cnt;
87 size_t idx1now;
88 size_t idx2now;
89 size_t backw1_stop;
90 size_t backw2_stop;
91 size_t backw1;
92 size_t backw2;
93 int val1;
94 int val2;
95 int position;
96 int seq1len;
97 int seq2len;
98 int use_malloc;
99 #ifdef WIDE_CHAR_VERSION
100 size_t size;
101 size_t layers;
102 const wint_t *names;
103 #endif
105 #include WEIGHT_H
107 if (nrules == 0)
108 return STRCMP (s1, s2);
110 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
111 rulesets = (const unsigned char *)
112 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
113 table = (const int32_t *)
114 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
115 weights = (const USTRING_TYPE *)
116 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
117 extra = (const USTRING_TYPE *)
118 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
119 indirect = (const int32_t *)
120 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
121 # ifdef WIDE_CHAR_VERSION
122 names = (const wint_t *)
123 current->values[_NL_ITEM_INDEX (_NL_COLLATE_NAMES)].string;
124 size = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].word;
125 layers = current->values[_NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].word;
126 # endif
127 #else
128 rulesets = (const unsigned char *)
129 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
130 table = (const int32_t *)
131 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
132 weights = (const USTRING_TYPE *)
133 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
134 extra = (const USTRING_TYPE *)
135 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
136 indirect = (const int32_t *)
137 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
138 # ifdef WIDE_CHAR_VERSION
139 names = (const wint_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_NAMES);
140 size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_SIZE);
141 layers = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_HASH_LAYERS);
142 # endif
143 #endif
144 use_malloc = 0;
146 assert (((uintptr_t) table) % sizeof (table[0]) == 0);
147 assert (((uintptr_t) weights) % sizeof (weights[0]) == 0);
148 assert (((uintptr_t) weights) % sizeof (weights[0]) == 0);
149 assert (((uintptr_t) extra) % sizeof (extra[0]) == 0);
150 assert (((uintptr_t) indirect) % sizeof (indirect[0]) == 0);
151 #ifdef WIDE_CHAR_VERSION
152 assert (((uintptr_t) names) % sizeof (names[0]) == 0);
153 #endif
155 /* We need this a few times. */
156 s1len = STRLEN (s1);
157 s2len = STRLEN (s2);
159 /* We need the elements of the strings as unsigned values since they
160 are used as indeces. */
161 us1 = (const USTRING_TYPE *) s1;
162 us2 = (const USTRING_TYPE *) s2;
164 /* Perform the first pass over the string and while doing this find
165 and store the weights for each character. Since we want this to
166 be as fast as possible we are using `alloca' to store the temporary
167 values. But since there is no limit on the length of the string
168 we have to use `malloc' if the string is too long. We should be
169 very conservative here.
171 Please note that the localedef programs makes sure that `position'
172 is not used at the first level. */
173 if (s1len + s2len >= 16384)
175 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
176 idx2arr = &idx1arr[s2len];
177 rule1arr = (unsigned char *) &idx2arr[s2len];
178 rule2arr = &rule1arr[s1len];
180 if (idx1arr == NULL)
181 /* No memory. Well, go with the stack then.
183 XXX Once this implementation is stable we will handle this
184 differently. Instead of precomputing the indeces we will
185 do this in time. This means, though, that this happens for
186 every pass again. */
187 goto try_stack;
188 use_malloc = 1;
190 else
192 try_stack:
193 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
194 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
195 rule1arr = (unsigned char *) alloca (s2len);
196 rule2arr = (unsigned char *) alloca (s2len);
199 idx1cnt = 0;
200 idx2cnt = 0;
201 idx1max = 0;
202 idx2max = 0;
203 idx1now = 0;
204 idx2now = 0;
205 backw1_stop = ~0ul;
206 backw2_stop = ~0ul;
207 backw1 = ~0ul;
208 backw2 = ~0ul;
209 seq1len = 0;
210 seq2len = 0;
211 position = rulesets[0] & sort_position;
212 while (1)
214 val1 = 0;
215 val2 = 0;
217 /* Get the next non-IGNOREd element for string `s1'. */
218 if (seq1len == 0)
221 ++val1;
223 if (backw1_stop != ~0ul)
225 /* The is something pushed. */
226 if (backw1 == backw1_stop)
228 /* The last pushed character was handled. Continue
229 with forward characters. */
230 if (idx1cnt < idx1max)
231 idx1now = idx1cnt;
232 else
233 /* Nothing anymore. The backward sequence ended with
234 the last sequence in the string. Note that seq1len
235 is still zero. */
236 break;
238 else
239 idx1now = --backw1;
241 else
243 backw1_stop = idx1max;
245 while (*us1 != L('\0'))
247 int32_t tmp = findidx (&us1);
248 rule1arr[idx1max] = tmp >> 24;
249 idx1arr[idx1max] = tmp & 0xffffff;
250 idx1cnt = idx1max++;
252 if ((rulesets[rule1arr[idx1cnt] * nrules]
253 & sort_backward) == 0)
254 /* No more backward characters to push. */
255 break;
256 ++idx1cnt;
259 if (backw1_stop >= idx1cnt)
261 /* No sequence at all or just one. */
262 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
263 /* Note that seq1len is still zero. */
264 break;
266 backw1_stop = ~0ul;
267 idx1now = idx1cnt;
269 else
270 /* We pushed backward sequences. */
271 idx1now = backw1 = idx1cnt - 1;
274 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
276 /* And the same for string `s2'. */
277 if (seq2len == 0)
280 ++val2;
282 if (backw2_stop != ~0ul)
284 /* The is something pushed. */
285 if (backw2 == backw2_stop)
287 /* The last pushed character was handled. Continue
288 with forward characters. */
289 if (idx2cnt < idx2max)
290 idx2now = idx2cnt;
291 else
292 /* Nothing anymore. The backward sequence ended with
293 the last sequence in the string. Note that seq2len
294 is still zero. */
295 break;
297 else
298 idx2now = --backw2;
300 else
302 backw2_stop = idx2max;
304 while (*us2 != L('\0'))
306 int32_t tmp = findidx (&us2);
307 rule2arr[idx2max] = tmp >> 24;
308 idx2arr[idx2max] = tmp & 0xffffff;
309 idx2cnt = idx2max++;
311 if ((rulesets[rule2arr[idx2cnt] * nrules]
312 & sort_backward) == 0)
313 /* No more backward characters to push. */
314 break;
315 ++idx2cnt;
318 if (backw2_stop >= idx2cnt)
320 /* No sequence at all or just one. */
321 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
322 /* Note that seq1len is still zero. */
323 break;
325 backw2_stop = ~0ul;
326 idx2now = idx2cnt;
328 else
329 /* We pushed backward sequences. */
330 idx2now = backw2 = idx2cnt - 1;
333 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
335 /* See whether any or both strings are empty. */
336 if (seq1len == 0 || seq2len == 0)
338 if (seq1len == seq2len)
339 /* Both ended. So far so good, both strings are equal at the
340 first level. */
341 break;
343 /* This means one string is shorter than the other. Find out
344 which one and return an appropriate value. */
345 result = seq1len == 0 ? -1 : 1;
346 goto free_and_return;
349 /* Test for position if necessary. */
350 if (position && val1 != val2)
352 result = val1 - val2;
353 goto free_and_return;
356 /* Compare the two sequences. */
359 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
361 /* The sequences differ. */
362 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
363 goto free_and_return;
366 /* Increment the offsets. */
367 ++idx1arr[idx1now];
368 ++idx2arr[idx2now];
370 --seq1len;
371 --seq2len;
373 while (seq1len > 0 && seq2len > 0);
375 if (position && seq1len != seq2len)
377 result = seq1len - seq2len;
378 goto free_and_return;
382 /* Now the remaining passes over the weights. We now use the
383 indeces we found before. */
384 for (pass = 1; pass < nrules; ++pass)
386 /* We assume that if a rule has defined `position' in one section
387 this is true for all of them. */
388 idx1cnt = 0;
389 idx2cnt = 0;
390 backw1_stop = ~0ul;
391 backw2_stop = ~0ul;
392 backw1 = ~0ul;
393 backw2 = ~0ul;
394 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
396 while (1)
398 val1 = 0;
399 val2 = 0;
401 /* Get the next non-IGNOREd element for string `s1'. */
402 if (seq1len == 0)
405 ++val1;
407 if (backw1_stop != ~0ul)
409 /* The is something pushed. */
410 if (backw1 == backw1_stop)
412 /* The last pushed character was handled. Continue
413 with forward characters. */
414 if (idx1cnt < idx1max)
415 idx1now = idx1cnt;
416 else
418 /* Nothing anymore. The backward sequence
419 ended with the last sequence in the string. */
420 idx1now = ~0ul;
421 break;
424 else
425 idx1now = --backw1;
427 else
429 backw1_stop = idx1cnt;
431 while (idx1cnt < idx1max)
433 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
434 & sort_backward) == 0)
435 /* No more backward characters to push. */
436 break;
437 ++idx1cnt;
440 if (backw1_stop == idx1cnt)
442 /* No sequence at all or just one. */
443 if (idx1cnt == idx1max)
444 /* Note that seq2len is still zero. */
445 break;
447 backw1_stop = ~0ul;
448 idx1now = idx1cnt++;
450 else
451 /* We pushed backward sequences. */
452 idx1now = backw1 = idx1cnt - 1;
455 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
457 /* And the same for string `s2'. */
458 if (seq2len == 0)
461 ++val2;
463 if (backw2_stop != ~0ul)
465 /* The is something pushed. */
466 if (backw2 == backw2_stop)
468 /* The last pushed character was handled. Continue
469 with forward characters. */
470 if (idx2cnt < idx2max)
471 idx2now = idx2cnt;
472 else
474 /* Nothing anymore. The backward sequence
475 ended with the last sequence in the string. */
476 idx2now = ~0ul;
477 break;
480 else
481 idx2now = --backw2;
483 else
485 backw2_stop = idx2cnt;
487 while (idx2cnt < idx2max)
489 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
490 & sort_backward) == 0)
491 /* No more backward characters to push. */
492 break;
493 ++idx2cnt;
496 if (backw2_stop == idx2cnt)
498 /* No sequence at all or just one. */
499 if (idx2cnt == idx2max)
500 /* Note that seq2len is still zero. */
501 break;
503 backw2_stop = ~0ul;
504 idx2now = idx2cnt++;
506 else
507 /* We pushed backward sequences. */
508 idx2now = backw2 = idx2cnt - 1;
511 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
513 /* See whether any or both strings are empty. */
514 if (seq1len == 0 || seq2len == 0)
516 if (seq1len == seq2len)
517 /* Both ended. So far so good, both strings are equal
518 at this level. */
519 break;
521 /* This means one string is shorter than the other. Find out
522 which one and return an appropriate value. */
523 result = seq1len == 0 ? -1 : 1;
524 goto free_and_return;
527 /* Test for position if necessary. */
528 if (position && val1 != val2)
530 result = val1 - val2;
531 goto free_and_return;
534 /* Compare the two sequences. */
537 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
539 /* The sequences differ. */
540 result = (weights[idx1arr[idx1now]]
541 - weights[idx2arr[idx2now]]);
542 goto free_and_return;
545 /* Increment the offsets. */
546 ++idx1arr[idx1now];
547 ++idx2arr[idx2now];
549 --seq1len;
550 --seq2len;
552 while (seq1len > 0 && seq2len > 0);
554 if (position && seq1len != seq2len)
556 result = seq1len - seq2len;
557 goto free_and_return;
562 /* Free the memory if needed. */
563 free_and_return:
564 if (use_malloc)
565 free (idx1arr);
567 return result;