Update.
[glibc.git] / string / strcoll.c
blob49725e1a6921b740415fb34dc8135f4461e862f2
1 /* Copyright (C) 1995, 96, 97, 98, 99, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #include <assert.h>
21 #include <langinfo.h>
22 #include <stddef.h>
23 #include <stdint.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #ifndef STRING_TYPE
28 # define STRING_TYPE char
29 # define USTRING_TYPE unsigned char
30 # ifdef USE_IN_EXTENDED_LOCALE_MODEL
31 # define STRCOLL __strcoll_l
32 # else
33 # define STRCOLL strcoll
34 # endif
35 # define STRCMP strcmp
36 # define STRLEN strlen
37 # define WEIGHT_H "../locale/weight.h"
38 # define SUFFIX MB
39 # define L(arg) arg
40 #endif
42 #define CONCAT(a,b) CONCAT1(a,b)
43 #define CONCAT1(a,b) a##b
45 #include "../locale/localeinfo.h"
47 #ifndef USE_IN_EXTENDED_LOCALE_MODEL
48 int
49 STRCOLL (s1, s2)
50 const STRING_TYPE *s1;
51 const STRING_TYPE *s2;
52 #else
53 int
54 STRCOLL (s1, s2, l)
55 const STRING_TYPE *s1;
56 const STRING_TYPE *s2;
57 __locale_t l;
58 #endif
60 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
61 struct locale_data *current = l->__locales[LC_COLLATE];
62 uint_fast32_t nrules = *((uint32_t *) current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].string);
63 #else
64 uint_fast32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
65 #endif
66 /* We don't assign the following values right away since it might be
67 unnecessary in case there are no rules. */
68 const unsigned char *rulesets;
69 const int32_t *table;
70 const USTRING_TYPE *weights;
71 const USTRING_TYPE *extra;
72 const int32_t *indirect;
73 uint_fast32_t pass;
74 int result = 0;
75 const USTRING_TYPE *us1;
76 const USTRING_TYPE *us2;
77 size_t s1len;
78 size_t s2len;
79 int32_t *idx1arr;
80 int32_t *idx2arr;
81 unsigned char *rule1arr;
82 unsigned char *rule2arr;
83 size_t idx1max;
84 size_t idx2max;
85 size_t idx1cnt;
86 size_t idx2cnt;
87 size_t idx1now;
88 size_t idx2now;
89 size_t backw1_stop;
90 size_t backw2_stop;
91 size_t backw1;
92 size_t backw2;
93 int val1;
94 int val2;
95 int position;
96 int seq1len;
97 int seq2len;
98 int use_malloc;
100 #include WEIGHT_H
102 if (nrules == 0)
103 return STRCMP (s1, s2);
105 #ifdef USE_IN_EXTENDED_LOCALE_MODEL
106 rulesets = (const unsigned char *)
107 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
108 table = (const int32_t *)
109 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
110 weights = (const USTRING_TYPE *)
111 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
112 extra = (const USTRING_TYPE *)
113 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
114 indirect = (const int32_t *)
115 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
116 #else
117 rulesets = (const unsigned char *)
118 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_RULESETS);
119 table = (const int32_t *)
120 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_TABLE,SUFFIX));
121 weights = (const USTRING_TYPE *)
122 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_WEIGHT,SUFFIX));
123 extra = (const USTRING_TYPE *)
124 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_EXTRA,SUFFIX));
125 indirect = (const int32_t *)
126 _NL_CURRENT (LC_COLLATE, CONCAT(_NL_COLLATE_INDIRECT,SUFFIX));
127 #endif
128 use_malloc = 0;
130 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
131 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
132 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
133 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
135 /* We need this a few times. */
136 s1len = STRLEN (s1);
137 s2len = STRLEN (s2);
139 /* We need the elements of the strings as unsigned values since they
140 are used as indeces. */
141 us1 = (const USTRING_TYPE *) s1;
142 us2 = (const USTRING_TYPE *) s2;
144 /* Perform the first pass over the string and while doing this find
145 and store the weights for each character. Since we want this to
146 be as fast as possible we are using `alloca' to store the temporary
147 values. But since there is no limit on the length of the string
148 we have to use `malloc' if the string is too long. We should be
149 very conservative here.
151 Please note that the localedef programs makes sure that `position'
152 is not used at the first level. */
153 if (s1len + s2len >= 16384)
155 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
156 idx2arr = &idx1arr[s2len];
157 rule1arr = (unsigned char *) &idx2arr[s2len];
158 rule2arr = &rule1arr[s1len];
160 if (idx1arr == NULL)
161 /* No memory. Well, go with the stack then.
163 XXX Once this implementation is stable we will handle this
164 differently. Instead of precomputing the indeces we will
165 do this in time. This means, though, that this happens for
166 every pass again. */
167 goto try_stack;
168 use_malloc = 1;
170 else
172 try_stack:
173 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
174 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
175 rule1arr = (unsigned char *) alloca (s2len);
176 rule2arr = (unsigned char *) alloca (s2len);
179 idx1cnt = 0;
180 idx2cnt = 0;
181 idx1max = 0;
182 idx2max = 0;
183 idx1now = 0;
184 idx2now = 0;
185 backw1_stop = ~0ul;
186 backw2_stop = ~0ul;
187 backw1 = ~0ul;
188 backw2 = ~0ul;
189 seq1len = 0;
190 seq2len = 0;
191 position = rulesets[0] & sort_position;
192 while (1)
194 val1 = 0;
195 val2 = 0;
197 /* Get the next non-IGNOREd element for string `s1'. */
198 if (seq1len == 0)
201 ++val1;
203 if (backw1_stop != ~0ul)
205 /* The is something pushed. */
206 if (backw1 == backw1_stop)
208 /* The last pushed character was handled. Continue
209 with forward characters. */
210 if (idx1cnt < idx1max)
211 idx1now = idx1cnt;
212 else
213 /* Nothing anymore. The backward sequence ended with
214 the last sequence in the string. Note that seq1len
215 is still zero. */
216 break;
218 else
219 idx1now = --backw1;
221 else
223 backw1_stop = idx1max;
225 while (*us1 != L('\0'))
227 int32_t tmp = findidx (&us1);
228 rule1arr[idx1max] = tmp >> 24;
229 idx1arr[idx1max] = tmp & 0xffffff;
230 idx1cnt = idx1max++;
232 if ((rulesets[rule1arr[idx1cnt] * nrules]
233 & sort_backward) == 0)
234 /* No more backward characters to push. */
235 break;
236 ++idx1cnt;
239 if (backw1_stop >= idx1cnt)
241 /* No sequence at all or just one. */
242 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
243 /* Note that seq1len is still zero. */
244 break;
246 backw1_stop = ~0ul;
247 idx1now = idx1cnt;
249 else
250 /* We pushed backward sequences. */
251 idx1now = backw1 = idx1cnt - 1;
254 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
256 /* And the same for string `s2'. */
257 if (seq2len == 0)
260 ++val2;
262 if (backw2_stop != ~0ul)
264 /* The is something pushed. */
265 if (backw2 == backw2_stop)
267 /* The last pushed character was handled. Continue
268 with forward characters. */
269 if (idx2cnt < idx2max)
270 idx2now = idx2cnt;
271 else
272 /* Nothing anymore. The backward sequence ended with
273 the last sequence in the string. Note that seq2len
274 is still zero. */
275 break;
277 else
278 idx2now = --backw2;
280 else
282 backw2_stop = idx2max;
284 while (*us2 != L('\0'))
286 int32_t tmp = findidx (&us2);
287 rule2arr[idx2max] = tmp >> 24;
288 idx2arr[idx2max] = tmp & 0xffffff;
289 idx2cnt = idx2max++;
291 if ((rulesets[rule2arr[idx2cnt] * nrules]
292 & sort_backward) == 0)
293 /* No more backward characters to push. */
294 break;
295 ++idx2cnt;
298 if (backw2_stop >= idx2cnt)
300 /* No sequence at all or just one. */
301 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
302 /* Note that seq1len is still zero. */
303 break;
305 backw2_stop = ~0ul;
306 idx2now = idx2cnt;
308 else
309 /* We pushed backward sequences. */
310 idx2now = backw2 = idx2cnt - 1;
313 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
315 /* See whether any or both strings are empty. */
316 if (seq1len == 0 || seq2len == 0)
318 if (seq1len == seq2len)
319 /* Both ended. So far so good, both strings are equal at the
320 first level. */
321 break;
323 /* This means one string is shorter than the other. Find out
324 which one and return an appropriate value. */
325 result = seq1len == 0 ? -1 : 1;
326 goto free_and_return;
329 /* Test for position if necessary. */
330 if (position && val1 != val2)
332 result = val1 - val2;
333 goto free_and_return;
336 /* Compare the two sequences. */
339 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
341 /* The sequences differ. */
342 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
343 goto free_and_return;
346 /* Increment the offsets. */
347 ++idx1arr[idx1now];
348 ++idx2arr[idx2now];
350 --seq1len;
351 --seq2len;
353 while (seq1len > 0 && seq2len > 0);
355 if (position && seq1len != seq2len)
357 result = seq1len - seq2len;
358 goto free_and_return;
362 /* Now the remaining passes over the weights. We now use the
363 indeces we found before. */
364 for (pass = 1; pass < nrules; ++pass)
366 /* We assume that if a rule has defined `position' in one section
367 this is true for all of them. */
368 idx1cnt = 0;
369 idx2cnt = 0;
370 backw1_stop = ~0ul;
371 backw2_stop = ~0ul;
372 backw1 = ~0ul;
373 backw2 = ~0ul;
374 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
376 while (1)
378 val1 = 0;
379 val2 = 0;
381 /* Get the next non-IGNOREd element for string `s1'. */
382 if (seq1len == 0)
385 ++val1;
387 if (backw1_stop != ~0ul)
389 /* The is something pushed. */
390 if (backw1 == backw1_stop)
392 /* The last pushed character was handled. Continue
393 with forward characters. */
394 if (idx1cnt < idx1max)
395 idx1now = idx1cnt;
396 else
398 /* Nothing anymore. The backward sequence
399 ended with the last sequence in the string. */
400 idx1now = ~0ul;
401 break;
404 else
405 idx1now = --backw1;
407 else
409 backw1_stop = idx1cnt;
411 while (idx1cnt < idx1max)
413 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
414 & sort_backward) == 0)
415 /* No more backward characters to push. */
416 break;
417 ++idx1cnt;
420 if (backw1_stop == idx1cnt)
422 /* No sequence at all or just one. */
423 if (idx1cnt == idx1max)
424 /* Note that seq2len is still zero. */
425 break;
427 backw1_stop = ~0ul;
428 idx1now = idx1cnt++;
430 else
431 /* We pushed backward sequences. */
432 idx1now = backw1 = idx1cnt - 1;
435 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
437 /* And the same for string `s2'. */
438 if (seq2len == 0)
441 ++val2;
443 if (backw2_stop != ~0ul)
445 /* The is something pushed. */
446 if (backw2 == backw2_stop)
448 /* The last pushed character was handled. Continue
449 with forward characters. */
450 if (idx2cnt < idx2max)
451 idx2now = idx2cnt;
452 else
454 /* Nothing anymore. The backward sequence
455 ended with the last sequence in the string. */
456 idx2now = ~0ul;
457 break;
460 else
461 idx2now = --backw2;
463 else
465 backw2_stop = idx2cnt;
467 while (idx2cnt < idx2max)
469 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
470 & sort_backward) == 0)
471 /* No more backward characters to push. */
472 break;
473 ++idx2cnt;
476 if (backw2_stop == idx2cnt)
478 /* No sequence at all or just one. */
479 if (idx2cnt == idx2max)
480 /* Note that seq2len is still zero. */
481 break;
483 backw2_stop = ~0ul;
484 idx2now = idx2cnt++;
486 else
487 /* We pushed backward sequences. */
488 idx2now = backw2 = idx2cnt - 1;
491 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
493 /* See whether any or both strings are empty. */
494 if (seq1len == 0 || seq2len == 0)
496 if (seq1len == seq2len)
497 /* Both ended. So far so good, both strings are equal
498 at this level. */
499 break;
501 /* This means one string is shorter than the other. Find out
502 which one and return an appropriate value. */
503 result = seq1len == 0 ? -1 : 1;
504 goto free_and_return;
507 /* Test for position if necessary. */
508 if (position && val1 != val2)
510 result = val1 - val2;
511 goto free_and_return;
514 /* Compare the two sequences. */
517 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
519 /* The sequences differ. */
520 result = (weights[idx1arr[idx1now]]
521 - weights[idx2arr[idx2now]]);
522 goto free_and_return;
525 /* Increment the offsets. */
526 ++idx1arr[idx1now];
527 ++idx2arr[idx2now];
529 --seq1len;
530 --seq2len;
532 while (seq1len > 0 && seq2len > 0);
534 if (position && seq1len != seq2len)
536 result = seq1len - seq2len;
537 goto free_and_return;
542 /* Free the memory if needed. */
543 free_and_return:
544 if (use_malloc)
545 free (idx1arr);
547 return result;