2007-06-09 Ulrich Drepper <drepper@redhat.com>
[glibc.git] / string / strcoll_l.c
blobc46921dcc9f35d412e05ce5a538e65b0eaeebb5f
1 /* Copyright (C) 1995,96,97,2002, 2004 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Written by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
21 #include <assert.h>
22 #include <langinfo.h>
23 #include <locale.h>
24 #include <stddef.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27 #include <string.h>
29 #ifndef STRING_TYPE
30 # define STRING_TYPE char
31 # define USTRING_TYPE unsigned char
32 # define STRCOLL __strcoll_l
33 # define STRCMP strcmp
34 # define STRLEN strlen
35 # define WEIGHT_H "../locale/weight.h"
36 # define SUFFIX MB
37 # define L(arg) arg
38 #endif
40 #define CONCAT(a,b) CONCAT1(a,b)
41 #define CONCAT1(a,b) a##b
43 #include "../locale/localeinfo.h"
45 int
46 STRCOLL (s1, s2, l)
47 const STRING_TYPE *s1;
48 const STRING_TYPE *s2;
49 __locale_t l;
51 struct locale_data *current = l->__locales[LC_COLLATE];
52 uint_fast32_t nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
53 /* We don't assign the following values right away since it might be
54 unnecessary in case there are no rules. */
55 const unsigned char *rulesets;
56 const int32_t *table;
57 const USTRING_TYPE *weights;
58 const USTRING_TYPE *extra;
59 const int32_t *indirect;
60 uint_fast32_t pass;
61 int result = 0;
62 const USTRING_TYPE *us1;
63 const USTRING_TYPE *us2;
64 size_t s1len;
65 size_t s2len;
66 int32_t *idx1arr;
67 int32_t *idx2arr;
68 unsigned char *rule1arr;
69 unsigned char *rule2arr;
70 size_t idx1max;
71 size_t idx2max;
72 size_t idx1cnt;
73 size_t idx2cnt;
74 size_t idx1now;
75 size_t idx2now;
76 size_t backw1_stop;
77 size_t backw2_stop;
78 size_t backw1;
79 size_t backw2;
80 int val1;
81 int val2;
82 int position;
83 int seq1len;
84 int seq2len;
85 int use_malloc;
87 #include WEIGHT_H
89 if (nrules == 0)
90 return STRCMP (s1, s2);
92 rulesets = (const unsigned char *)
93 current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
94 table = (const int32_t *)
95 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
96 weights = (const USTRING_TYPE *)
97 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
98 extra = (const USTRING_TYPE *)
99 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
100 indirect = (const int32_t *)
101 current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
102 use_malloc = 0;
104 assert (((uintptr_t) table) % __alignof__ (table[0]) == 0);
105 assert (((uintptr_t) weights) % __alignof__ (weights[0]) == 0);
106 assert (((uintptr_t) extra) % __alignof__ (extra[0]) == 0);
107 assert (((uintptr_t) indirect) % __alignof__ (indirect[0]) == 0);
109 /* We need this a few times. */
110 s1len = STRLEN (s1);
111 s2len = STRLEN (s2);
113 /* Catch empty strings. */
114 if (__builtin_expect (s1len == 0, 0) || __builtin_expect (s2len == 0, 0))
115 return (s1len != 0) - (s2len != 0);
117 /* We need the elements of the strings as unsigned values since they
118 are used as indeces. */
119 us1 = (const USTRING_TYPE *) s1;
120 us2 = (const USTRING_TYPE *) s2;
122 /* Perform the first pass over the string and while doing this find
123 and store the weights for each character. Since we want this to
124 be as fast as possible we are using `alloca' to store the temporary
125 values. But since there is no limit on the length of the string
126 we have to use `malloc' if the string is too long. We should be
127 very conservative here.
129 Please note that the localedef programs makes sure that `position'
130 is not used at the first level. */
131 if (! __libc_use_alloca (s1len + s2len))
133 idx1arr = (int32_t *) malloc ((s1len + s2len) * (sizeof (int32_t) + 1));
134 idx2arr = &idx1arr[s1len];
135 rule1arr = (unsigned char *) &idx2arr[s2len];
136 rule2arr = &rule1arr[s1len];
138 if (idx1arr == NULL)
139 /* No memory. Well, go with the stack then.
141 XXX Once this implementation is stable we will handle this
142 differently. Instead of precomputing the indeces we will
143 do this in time. This means, though, that this happens for
144 every pass again. */
145 goto try_stack;
146 use_malloc = 1;
148 else
150 try_stack:
151 idx1arr = (int32_t *) alloca (s1len * sizeof (int32_t));
152 idx2arr = (int32_t *) alloca (s2len * sizeof (int32_t));
153 rule1arr = (unsigned char *) alloca (s1len);
154 rule2arr = (unsigned char *) alloca (s2len);
157 idx1cnt = 0;
158 idx2cnt = 0;
159 idx1max = 0;
160 idx2max = 0;
161 idx1now = 0;
162 idx2now = 0;
163 backw1_stop = ~0ul;
164 backw2_stop = ~0ul;
165 backw1 = ~0ul;
166 backw2 = ~0ul;
167 seq1len = 0;
168 seq2len = 0;
169 position = rulesets[0] & sort_position;
170 while (1)
172 val1 = 0;
173 val2 = 0;
175 /* Get the next non-IGNOREd element for string `s1'. */
176 if (seq1len == 0)
179 ++val1;
181 if (backw1_stop != ~0ul)
183 /* The is something pushed. */
184 if (backw1 == backw1_stop)
186 /* The last pushed character was handled. Continue
187 with forward characters. */
188 if (idx1cnt < idx1max)
189 idx1now = idx1cnt;
190 else
191 /* Nothing anymore. The backward sequence ended with
192 the last sequence in the string. Note that seq1len
193 is still zero. */
194 break;
196 else
197 idx1now = --backw1;
199 else
201 backw1_stop = idx1max;
203 while (*us1 != L('\0'))
205 int32_t tmp = findidx (&us1);
206 rule1arr[idx1max] = tmp >> 24;
207 idx1arr[idx1max] = tmp & 0xffffff;
208 idx1cnt = idx1max++;
210 if ((rulesets[rule1arr[idx1cnt] * nrules]
211 & sort_backward) == 0)
212 /* No more backward characters to push. */
213 break;
214 ++idx1cnt;
217 if (backw1_stop >= idx1cnt)
219 /* No sequence at all or just one. */
220 if (idx1cnt == idx1max || backw1_stop > idx1cnt)
221 /* Note that seq1len is still zero. */
222 break;
224 backw1_stop = ~0ul;
225 idx1now = idx1cnt;
227 else
228 /* We pushed backward sequences. */
229 idx1now = backw1 = idx1cnt - 1;
232 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
234 /* And the same for string `s2'. */
235 if (seq2len == 0)
238 ++val2;
240 if (backw2_stop != ~0ul)
242 /* The is something pushed. */
243 if (backw2 == backw2_stop)
245 /* The last pushed character was handled. Continue
246 with forward characters. */
247 if (idx2cnt < idx2max)
248 idx2now = idx2cnt;
249 else
250 /* Nothing anymore. The backward sequence ended with
251 the last sequence in the string. Note that seq2len
252 is still zero. */
253 break;
255 else
256 idx2now = --backw2;
258 else
260 backw2_stop = idx2max;
262 while (*us2 != L('\0'))
264 int32_t tmp = findidx (&us2);
265 rule2arr[idx2max] = tmp >> 24;
266 idx2arr[idx2max] = tmp & 0xffffff;
267 idx2cnt = idx2max++;
269 if ((rulesets[rule2arr[idx2cnt] * nrules]
270 & sort_backward) == 0)
271 /* No more backward characters to push. */
272 break;
273 ++idx2cnt;
276 if (backw2_stop >= idx2cnt)
278 /* No sequence at all or just one. */
279 if (idx2cnt == idx2max || backw2_stop > idx2cnt)
280 /* Note that seq1len is still zero. */
281 break;
283 backw2_stop = ~0ul;
284 idx2now = idx2cnt;
286 else
287 /* We pushed backward sequences. */
288 idx2now = backw2 = idx2cnt - 1;
291 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
293 /* See whether any or both strings are empty. */
294 if (seq1len == 0 || seq2len == 0)
296 if (seq1len == seq2len)
297 /* Both ended. So far so good, both strings are equal at the
298 first level. */
299 break;
301 /* This means one string is shorter than the other. Find out
302 which one and return an appropriate value. */
303 result = seq1len == 0 ? -1 : 1;
304 goto free_and_return;
307 /* Test for position if necessary. */
308 if (position && val1 != val2)
310 result = val1 - val2;
311 goto free_and_return;
314 /* Compare the two sequences. */
317 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
319 /* The sequences differ. */
320 result = weights[idx1arr[idx1now]] - weights[idx2arr[idx2now]];
321 goto free_and_return;
324 /* Increment the offsets. */
325 ++idx1arr[idx1now];
326 ++idx2arr[idx2now];
328 --seq1len;
329 --seq2len;
331 while (seq1len > 0 && seq2len > 0);
333 if (position && seq1len != seq2len)
335 result = seq1len - seq2len;
336 goto free_and_return;
340 /* Now the remaining passes over the weights. We now use the
341 indeces we found before. */
342 for (pass = 1; pass < nrules; ++pass)
344 /* We assume that if a rule has defined `position' in one section
345 this is true for all of them. */
346 idx1cnt = 0;
347 idx2cnt = 0;
348 backw1_stop = ~0ul;
349 backw2_stop = ~0ul;
350 backw1 = ~0ul;
351 backw2 = ~0ul;
352 position = rulesets[rule1arr[0] * nrules + pass] & sort_position;
354 while (1)
356 val1 = 0;
357 val2 = 0;
359 /* Get the next non-IGNOREd element for string `s1'. */
360 if (seq1len == 0)
363 ++val1;
365 if (backw1_stop != ~0ul)
367 /* The is something pushed. */
368 if (backw1 == backw1_stop)
370 /* The last pushed character was handled. Continue
371 with forward characters. */
372 if (idx1cnt < idx1max)
373 idx1now = idx1cnt;
374 else
376 /* Nothing anymore. The backward sequence
377 ended with the last sequence in the string. */
378 idx1now = ~0ul;
379 break;
382 else
383 idx1now = --backw1;
385 else
387 backw1_stop = idx1cnt;
389 while (idx1cnt < idx1max)
391 if ((rulesets[rule1arr[idx1cnt] * nrules + pass]
392 & sort_backward) == 0)
393 /* No more backward characters to push. */
394 break;
395 ++idx1cnt;
398 if (backw1_stop == idx1cnt)
400 /* No sequence at all or just one. */
401 if (idx1cnt == idx1max)
402 /* Note that seq1len is still zero. */
403 break;
405 backw1_stop = ~0ul;
406 idx1now = idx1cnt++;
408 else
409 /* We pushed backward sequences. */
410 idx1now = backw1 = idx1cnt - 1;
413 while ((seq1len = weights[idx1arr[idx1now]++]) == 0);
415 /* And the same for string `s2'. */
416 if (seq2len == 0)
419 ++val2;
421 if (backw2_stop != ~0ul)
423 /* The is something pushed. */
424 if (backw2 == backw2_stop)
426 /* The last pushed character was handled. Continue
427 with forward characters. */
428 if (idx2cnt < idx2max)
429 idx2now = idx2cnt;
430 else
432 /* Nothing anymore. The backward sequence
433 ended with the last sequence in the string. */
434 idx2now = ~0ul;
435 break;
438 else
439 idx2now = --backw2;
441 else
443 backw2_stop = idx2cnt;
445 while (idx2cnt < idx2max)
447 if ((rulesets[rule2arr[idx2cnt] * nrules + pass]
448 & sort_backward) == 0)
449 /* No more backward characters to push. */
450 break;
451 ++idx2cnt;
454 if (backw2_stop == idx2cnt)
456 /* No sequence at all or just one. */
457 if (idx2cnt == idx2max)
458 /* Note that seq2len is still zero. */
459 break;
461 backw2_stop = ~0ul;
462 idx2now = idx2cnt++;
464 else
465 /* We pushed backward sequences. */
466 idx2now = backw2 = idx2cnt - 1;
469 while ((seq2len = weights[idx2arr[idx2now]++]) == 0);
471 /* See whether any or both strings are empty. */
472 if (seq1len == 0 || seq2len == 0)
474 if (seq1len == seq2len)
475 /* Both ended. So far so good, both strings are equal
476 at this level. */
477 break;
479 /* This means one string is shorter than the other. Find out
480 which one and return an appropriate value. */
481 result = seq1len == 0 ? -1 : 1;
482 goto free_and_return;
485 /* Test for position if necessary. */
486 if (position && val1 != val2)
488 result = val1 - val2;
489 goto free_and_return;
492 /* Compare the two sequences. */
495 if (weights[idx1arr[idx1now]] != weights[idx2arr[idx2now]])
497 /* The sequences differ. */
498 result = (weights[idx1arr[idx1now]]
499 - weights[idx2arr[idx2now]]);
500 goto free_and_return;
503 /* Increment the offsets. */
504 ++idx1arr[idx1now];
505 ++idx2arr[idx2now];
507 --seq1len;
508 --seq2len;
510 while (seq1len > 0 && seq2len > 0);
512 if (position && seq1len != seq2len)
514 result = seq1len - seq2len;
515 goto free_and_return;
520 /* Free the memory if needed. */
521 free_and_return:
522 if (use_malloc)
523 free (idx1arr);
525 return result;
527 libc_hidden_def (STRCOLL)
529 #ifndef WIDE_CHAR_VERSION
530 weak_alias (__strcoll_l, strcoll_l)
531 #endif