update from main archive 961113
[glibc.git] / locale / programs / ld-collate.c
blobc8741e83cb82964498d62713b7507a17a5eaad1c
1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <obstack.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "localeinfo.h"
34 #include "locales.h"
35 #include "simple-hash.h"
36 #include "stringtrans.h"
37 #include "strlen-hash.h"
39 /* Uncomment the following line in the production version. */
40 /* #define NDEBUG 1 */
41 #include <assert.h>
44 #define MAX(a, b) ((a) > (b) ? (a) : (b))
46 #define SWAPU32(w) \
47 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
50 /* What kind of symbols get defined? */
51 enum coll_symbol
53 undefined,
54 ellipsis,
55 character,
56 element,
57 symbol
61 typedef struct patch_t
63 const char *fname;
64 size_t lineno;
65 const char *token;
66 union
68 unsigned int *pos;
69 size_t idx;
70 } where;
71 struct patch_t *next;
72 } patch_t;
75 typedef struct element_t
77 const wchar_t *name;
78 unsigned int this_weight;
80 struct element_t *next;
82 unsigned int *ordering;
83 size_t ordering_len;
84 } element_t;
87 /* The real definition of the struct for the LC_COLLATE locale. */
88 struct locale_collate_t
90 /* Collate symbol table. Simple mapping to number. */
91 hash_table symbols;
93 /* The collation elements. */
94 hash_table elements;
95 struct obstack element_mem;
97 /* The result table. */
98 hash_table result;
100 /* Sorting rules given in order_start line. */
101 u_int32_t nrules;
102 u_int32_t nrules_max;
103 enum coll_sort_rule *rules;
105 /* Used while recognizing symbol composed of multiple tokens
106 (collating-element). */
107 const char *combine_token;
108 size_t combine_token_len;
110 /* How many sorting order specifications so far. */
111 unsigned int order_cnt;
113 /* Was lastline ellipsis? */
114 int was_ellipsis;
115 /* Value of last entry if was character. */
116 wchar_t last_char;
117 /* Current element. */
118 element_t *current_element;
119 /* What kind of symbol is current element. */
120 enum coll_symbol kind;
122 /* While collecting the weigths we need some temporary space. */
123 unsigned int current_order;
124 int *weight_cnt;
125 unsigned int weight_idx;
126 unsigned int *weight;
127 size_t nweight;
128 size_t nweight_max;
130 /* Patch lists. */
131 patch_t *current_patch;
132 patch_t *all_patches;
134 /* Room for the UNDEFINED information. */
135 element_t undefined;
136 unsigned int undefined_len;
140 /* Be verbose? Defined in localedef.c. */
141 extern int verbose;
144 void *xmalloc (size_t __n);
145 void *xrealloc (void *__p, size_t __n);
148 #define obstack_chunk_alloc xmalloc
149 #define obstack_chunk_free free
152 void
153 collate_startup (struct linereader *lr, struct localedef_t *locale,
154 struct charset_t *charset)
156 struct locale_collate_t *collate;
158 /* It is important that we always use UCS4 encoding for strings now. */
159 encoding_method = ENC_UCS4;
161 /* Allocate the needed room. */
162 locale->categories[LC_COLLATE].collate = collate =
163 (struct locale_collate_t *) xmalloc (sizeof (struct locale_collate_t));
165 /* Allocate hash table for collating elements. */
166 if (init_hash (&collate->elements, 512))
167 error (4, 0, _("memory exhausted"));
168 collate->combine_token = NULL;
169 obstack_init (&collate->element_mem);
171 /* Allocate hash table for collating elements. */
172 if (init_hash (&collate->symbols, 64))
173 error (4, 0, _("memory exhausted"));
175 /* Allocate hash table for result. */
176 if (init_hash (&collate->result, 512))
177 error (4, 0, _("memory exhausted"));
179 collate->nrules = 0;
180 collate->nrules_max = 10;
181 collate->rules
182 = (enum coll_sort_rule *) xmalloc (collate->nrules_max
183 * sizeof (enum coll_sort_rule));
185 collate->order_cnt = 1; /* The smallest weight is 2. */
187 collate->was_ellipsis = 0;
188 collate->last_char = L'\0'; /* 0 because leading ellipsis is allowed. */
190 collate->all_patches = NULL;
192 /* This tells us no UNDEFINED entry was found until now. */
193 collate->undefined.this_weight = 0;
195 lr->translate_strings = 0;
199 void
200 collate_finish (struct localedef_t *locale, struct charset_t *charset)
202 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
203 patch_t *patch;
204 size_t cnt;
206 /* Patch the constructed table so that forward references are
207 correctly filled. */
208 for (patch = collate->all_patches; patch != NULL; patch = patch->next)
210 wchar_t wch;
211 size_t toklen = strlen (patch->token);
212 void *ptmp;
213 unsigned int value = 0;
215 wch = charset_find_value (charset, patch->token, toklen);
216 if (wch != ILLEGAL_CHAR_VALUE)
218 element_t *runp;
220 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
221 (void *) &runp) < 0)
222 runp = NULL;
223 for (; runp != NULL; runp = runp->next)
224 if (runp->name[0] == wch && runp->name[1] == L'\0')
225 break;
227 value = runp == NULL ? 0 : runp->this_weight;
229 else if (find_entry (&collate->elements, patch->token, toklen, &ptmp)
230 >= 0)
232 value = ((element_t *) ptmp)->this_weight;
234 else if (find_entry (&collate->symbols, patch->token, toklen, &ptmp)
235 >= 0)
237 value = (unsigned long int) ptmp;
239 else
240 value = 0;
242 if (value == 0)
243 error_at_line (0, 0, patch->fname, patch->lineno,
244 _("no weight defined for symbol `%s'"), patch->token);
245 else
246 *patch->where.pos = value;
249 /* If no definition for UNDEFINED is given, all characters in the
250 given charset must be specified. */
251 if (collate->undefined.ordering == NULL)
253 /**************************************************************\
254 |* XXX We should test whether really an unspecified character *|
255 |* exists before giving the message. *|
256 \**************************************************************/
257 u_int32_t weight;
259 error (0, 0, _("no definition of `UNDEFINED'"));
261 collate->undefined.ordering_len = collate->nrules;
262 weight = ++collate->order_cnt;
264 for (cnt = 0; cnt < collate->nrules; ++cnt)
266 u_int32_t one = 1;
267 obstack_grow (&collate->element_mem, &one, sizeof (one));
270 for (cnt = 0; cnt < collate->nrules; ++cnt)
271 obstack_grow (&collate->element_mem, &weight, sizeof (weight));
273 collate->undefined.ordering = obstack_finish (&collate->element_mem);
276 collate->undefined_len = 2; /* For the name: 1 x wchar_t + L'\0'. */
277 for (cnt = 0; cnt < collate->nrules; ++cnt)
278 collate->undefined_len += 1 + collate->undefined.ordering[cnt];
283 void
284 collate_output (struct localedef_t *locale, struct charset_t *charset,
285 const char *output_path)
287 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
288 u_int32_t table_size, table_best, level_best, sum_best;
289 void *last;
290 element_t *pelem;
291 wchar_t *name;
292 size_t len;
293 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
294 struct iovec iov[2 + nelems];
295 struct locale_file data;
296 u_int32_t idx[nelems];
297 struct obstack non_simple;
298 struct obstack string_pool;
299 size_t cnt, entry_size;
300 u_int32_t undefined_offset = UINT_MAX;
301 u_int32_t *table, *extra, *table2, *extra2;
302 size_t extra_len;
303 u_int32_t element_hash_tab_size;
304 u_int32_t *element_hash_tab;
305 u_int32_t *element_hash_tab_ob;
306 u_int32_t element_string_pool_size;
307 char *element_string_pool;
308 u_int32_t element_value_size;
309 wchar_t *element_value;
310 wchar_t *element_value_ob;
311 u_int32_t symbols_hash_tab_size;
312 u_int32_t *symbols_hash_tab;
313 u_int32_t *symbols_hash_tab_ob;
314 u_int32_t symbols_string_pool_size;
315 char *symbols_string_pool;
316 u_int32_t symbols_class_size;
317 u_int32_t *symbols_class;
318 u_int32_t *symbols_class_ob;
319 hash_table *hash_tab;
320 unsigned int dummy_weights[collate->nrules + 1];
322 sum_best = UINT_MAX;
323 table_best = 0xffff;
324 level_best = 0xffff;
326 /* Compute table size. */
327 fputs (_("\
328 Computing table size for collation information might take a while..."),
329 stderr);
330 for (table_size = 256; table_size < sum_best; ++table_size)
332 size_t hits[table_size];
333 unsigned int worst = 1;
334 size_t cnt;
336 last = NULL;
338 for (cnt = 0; cnt < 256; ++cnt)
339 hits[cnt] = 1;
340 memset (&hits[256], '\0', sizeof (hits) - 256 * sizeof (size_t));
342 while (iterate_table (&collate->result, &last, (const void **) &name,
343 &len, (void **) &pelem) >= 0)
344 if (pelem->ordering != NULL && pelem->name[0] > 0xff)
345 if (++hits[(unsigned int) pelem->name[0] % table_size] > worst)
347 worst = hits[(unsigned int) pelem->name[0] % table_size];
348 if (table_size * worst > sum_best)
349 break;
352 if (table_size * worst < sum_best)
354 sum_best = table_size * worst;
355 table_best = table_size;
356 level_best = worst;
359 assert (table_best != 0xffff || level_best != 0xffff);
360 fputs (_(" done\n"), stderr);
362 obstack_init (&non_simple);
363 obstack_init (&string_pool);
365 data.magic = LIMAGIC (LC_COLLATE);
366 data.n = nelems;
367 iov[0].iov_base = (void *) &data;
368 iov[0].iov_len = sizeof (data);
370 iov[1].iov_base = (void *) idx;
371 iov[1].iov_len = sizeof (idx);
373 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_base = &collate->nrules;
374 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_len = sizeof (u_int32_t);
376 table = (u_int32_t *) alloca (collate->nrules * sizeof (u_int32_t));
377 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_base = table;
378 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_len
379 = collate->nrules * sizeof (u_int32_t);
380 /* Another trick here. Describing the collation method needs only a
381 few bits (3, to be exact). But the binary file should be
382 accessible by maschines with both endianesses and so we store both
383 information in the same word. */
384 for (cnt = 0; cnt < collate->nrules; ++cnt)
385 table[cnt] = collate->rules[cnt] | SWAPU32 (collate->rules[cnt]);
387 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_base = &table_best;
388 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_len = sizeof (u_int32_t);
390 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_base = &level_best;
391 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_len
392 = sizeof (u_int32_t);
394 entry_size = 1 + MAX (collate->nrules, 2);
396 table = (u_int32_t *) alloca (table_best * level_best * entry_size
397 * sizeof (table[0]));
398 memset (table, '\0', table_best * level_best * entry_size
399 * sizeof (table[0]));
402 /* Macros for inserting in output table. */
403 #define ADD_VALUE(expr) \
404 do { \
405 u_int32_t to_write = (u_int32_t) expr; \
406 obstack_grow (&non_simple, &to_write, sizeof (to_write)); \
407 } while (0)
409 #define ADD_ELEMENT(pelem, len) \
410 do { \
411 size_t cnt, idx; \
413 ADD_VALUE (len); \
415 wlen = wcslen (pelem->name); \
416 obstack_grow (&non_simple, pelem->name, (wlen + 1) * sizeof (u_int32_t)); \
418 idx = collate->nrules; \
419 for (cnt = 0; cnt < collate->nrules; ++cnt) \
421 size_t disp; \
423 ADD_VALUE (pelem->ordering[cnt]); \
424 for (disp = 0; disp < pelem->ordering[cnt]; ++disp) \
425 ADD_VALUE (pelem->ordering[idx++]); \
427 } while (0)
429 #define ADD_FORWARD(pelem) \
430 do { \
431 /* We leave a reference in the main table and put all \
432 information in the table for the extended entries. */ \
433 element_t *runp; \
434 element_t *has_simple = NULL; \
435 size_t wlen; \
437 table[(level * table_best + slot) * entry_size + 1] \
438 = FORWARD_CHAR; \
439 table[(level * table_best + slot) * entry_size + 2] \
440 = obstack_object_size (&non_simple) / sizeof (u_int32_t); \
442 /* Here we have to construct the non-simple table entry. First \
443 compute the total length of this entry. */ \
444 for (runp = (pelem); runp != NULL; runp = runp->next) \
445 if (runp->ordering != NULL) \
447 u_int32_t value; \
448 size_t cnt; \
450 value = 1 + wcslen (runp->name) + 1; \
452 for (cnt = 0; cnt < collate->nrules; ++cnt) \
453 /* We have to take care for entries without ordering \
454 information. While reading them they get inserted in the \
455 table and later not removed when something goes wrong with \
456 reading its weights. */ \
458 value += 1 + runp->ordering[cnt]; \
460 if (runp->name[1] == L'\0') \
461 has_simple = runp; \
464 ADD_ELEMENT (runp, value); \
467 if (has_simple == NULL) \
469 size_t idx, cnt; \
471 ADD_VALUE (collate->undefined_len + 1); \
473 /* Add the name. */ \
474 ADD_VALUE ((pelem)->name[0]); \
475 ADD_VALUE (0); \
477 idx = collate->nrules; \
478 for (cnt = 0; cnt < collate->nrules; ++cnt) \
480 size_t disp; \
482 ADD_VALUE (collate->undefined.ordering[cnt]); \
483 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp) \
485 if ((wchar_t) collate->undefined.ordering[idx] \
486 == ELLIPSIS_CHAR) \
487 ADD_VALUE ((pelem)->name[0]); \
488 else \
489 ADD_VALUE (collate->undefined.ordering[idx++]); \
490 ++idx; \
494 } while (0)
498 /* Fill the table now. First we look for all the characters which
499 fit into one single byte. This speeds up the 8-bit string
500 functions. */
501 last = NULL;
502 while (iterate_table (&collate->result, &last, (const void **) &name,
503 &len, (void **) &pelem) >= 0)
504 if (pelem->name[0] <= 0xff)
506 /* We have a single byte name. Now we must distinguish
507 between entries in simple form (i.e., only one value per
508 weight and no collation element starting with the same
509 character) and those which are not. */
510 size_t slot = ((size_t) pelem->name[0]);
511 const size_t level = 0;
513 table[slot * entry_size] = pelem->name[0];
515 if (pelem->name[1] == L'\0' && pelem->next == NULL
516 && pelem->ordering_len == collate->nrules)
518 /* Yes, we have a simple one. Lucky us. */
519 size_t cnt;
521 for (cnt = 0; cnt < collate->nrules; ++cnt)
522 table[slot * entry_size + 1 + cnt]
523 = pelem->ordering[collate->nrules + cnt];
525 else
526 ADD_FORWARD (pelem);
529 /* Now check for missing single byte entries. If one exist we fill
530 with the UNDEFINED entry. */
531 for (cnt = 0; cnt < 256; ++cnt)
532 /* The first weight is never 0 for existing entries. */
533 if (table[cnt * entry_size + 1] == 0)
535 /* We have to fill in the information from the UNDEFINED
536 entry. */
537 table[cnt * entry_size] = (u_int32_t) cnt;
539 if (collate->undefined.ordering_len == collate->nrules)
541 size_t inner;
543 for (inner = 0; inner < collate->nrules; ++inner)
544 if ((wchar_t)collate->undefined.ordering[collate->nrules + inner]
545 == ELLIPSIS_CHAR)
546 table[cnt * entry_size + 1 + inner] = cnt;
547 else
548 table[cnt * entry_size + 1 + inner]
549 = collate->undefined.ordering[collate->nrules + inner];
551 else
553 if (undefined_offset != UINT_MAX)
555 table[cnt * entry_size + 1] = FORWARD_CHAR;
556 table[cnt * entry_size + 2] = undefined_offset;
558 else
560 const size_t slot = cnt;
561 const size_t level = 0;
563 ADD_FORWARD (&collate->undefined);
564 undefined_offset = table[cnt * entry_size + 2];
569 /* Now we are ready for inserting the whole rest. */
570 last = NULL;
571 while (iterate_table (&collate->result, &last, (const void **) &name,
572 &len, (void **) &pelem) >= 0)
573 if (pelem->name[0] > 0xff)
575 /* Find the position. */
576 size_t slot = ((size_t) pelem->name[0]) % table_best;
577 size_t level = 0;
579 while (table[(level * table_best + slot) * entry_size + 1] != 0)
580 ++level;
581 assert (level < level_best);
583 if (pelem->name[1] == L'\0' && pelem->next == NULL
584 && pelem->ordering_len == collate->nrules)
586 /* Again a simple entry. */
587 size_t inner;
589 for (inner = 0; inner < collate->nrules; ++inner)
590 table[(level * table_best + slot) * entry_size + 1 + inner]
591 = pelem->ordering[collate->nrules + inner];
593 else
594 ADD_FORWARD (pelem);
597 /* Add the UNDEFINED entry. */
599 /* Here we have to construct the non-simple table entry. */
600 size_t idx, cnt;
602 undefined_offset = obstack_object_size (&non_simple);
604 idx = collate->nrules;
605 for (cnt = 0; cnt < collate->nrules; ++cnt)
607 size_t disp;
609 ADD_VALUE (collate->undefined.ordering[cnt]);
610 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp)
611 ADD_VALUE (collate->undefined.ordering[idx++]);
615 /* Finish the extra block. */
616 extra_len = obstack_object_size (&non_simple);
617 extra = (u_int32_t *) obstack_finish (&non_simple);
618 assert ((extra_len % sizeof (u_int32_t)) == 0);
620 /* Now we have to build the two array for the other byte ordering. */
621 table2 = (u_int32_t *) alloca (table_best * level_best * entry_size
622 * sizeof (table[0]));
623 extra2 = (u_int32_t *) alloca (extra_len);
625 for (cnt = 0; cnt < table_best * level_best * entry_size; ++cnt)
626 table2[cnt] = SWAPU32 (table[cnt]);
628 for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
629 extra2[cnt] = SWAPU32 (extra2[cnt]);
631 /* We need a simple hashing table to get a collation-element->chars
632 mapping. We again use internal hasing using a secondary hashing
633 function.
635 Each string has an associate hashing value V, computed by a
636 fixed function. To locate the string we use open addressing with
637 double hashing. The first index will be V % M, where M is the
638 size of the hashing table. If no entry is found, iterating with
639 a second, independent hashing function takes place. This second
640 value will be 1 + V % (M - 2). The approximate number of probes
641 will be
643 for unsuccessful search: (1 - N / M) ^ -1
644 for successful search: - (N / M) ^ -1 * ln (1 - N / M)
646 where N is the number of keys.
648 If we now choose M to be the next prime bigger than 4 / 3 * N,
649 we get the values 4 and 1.85 resp. Because unsuccesful searches
650 are unlikely this is a good value. Formulas: [Knuth, The Art of
651 Computer Programming, Volume 3, Sorting and Searching, 1973,
652 Addison Wesley] */
653 if (collate->elements.filled == 0)
655 /* We don't need any element table since there are no collating
656 elements. */
657 element_hash_tab_size = 0;
658 element_hash_tab = NULL;
659 element_hash_tab_ob = NULL;
660 element_string_pool_size = 0;
661 element_string_pool = NULL;
662 element_value_size = 0;
663 element_value = NULL;
664 element_value_ob = NULL;
666 else
668 void *ptr; /* Running pointer. */
669 const char *key; /* Key for current bucket. */
670 size_t keylen; /* Length of key data. */
671 const element_t *data; /* Data, i.e., the character sequence. */
673 element_hash_tab_size = next_prime ((collate->elements.filled * 4) / 3);
674 if (element_hash_tab_size < 7)
675 /* We need a minimum to make the following code work. */
676 element_hash_tab_size = 7;
678 element_hash_tab = obstack_alloc (&non_simple, (2 * element_hash_tab_size
679 * sizeof (u_int32_t)));
680 memset (element_hash_tab, '\377', (2 * element_hash_tab_size
681 * sizeof (u_int32_t)));
683 ptr = NULL;
684 while (iterate_table (&collate->elements, &ptr, (const void **) &key,
685 &keylen, (void **) &data) == 0)
687 size_t hash_val = hash_string (key, keylen);
688 size_t idx = hash_val % element_hash_tab_size;
690 if (element_hash_tab[2 * idx] != (~((u_int32_t) 0)))
692 /* We need the second hashing function. */
693 size_t c = 1 + (hash_val % (element_hash_tab_size - 2));
696 if (idx >= element_hash_tab_size - c)
697 idx -= element_hash_tab_size - c;
698 else
699 idx += c;
700 while (element_hash_tab[2 * idx] != (~((u_int32_t) 0)));
703 element_hash_tab[2 * idx] = obstack_object_size (&non_simple);
704 element_hash_tab[2 * idx + 1] = (obstack_object_size (&string_pool)
705 / sizeof (wchar_t));
707 obstack_grow0 (&non_simple, key, keylen);
708 obstack_grow (&string_pool, data->name,
709 (wcslen (data->name) + 1) * sizeof (wchar_t));
712 if (obstack_object_size (&non_simple) % 4 != 0)
713 obstack_blank (&non_simple,
714 4 - (obstack_object_size (&non_simple) % 4));
715 element_string_pool_size = obstack_object_size (&non_simple);
716 element_string_pool = obstack_finish (&non_simple);
718 element_value_size = obstack_object_size (&string_pool);
719 element_value = obstack_finish (&string_pool);
721 /* Create the tables for the other byte order. */
722 element_hash_tab_ob = obstack_alloc (&non_simple,
723 (2 * element_hash_tab_size
724 * sizeof (u_int32_t)));
725 for (cnt = 0; cnt < 2 * element_hash_tab_size; ++cnt)
726 element_hash_tab_ob[cnt] = SWAPU32 (element_hash_tab[cnt]);
728 element_value_ob = obstack_alloc (&string_pool, element_value_size);
729 if (sizeof (wchar_t) != 4)
731 fputs ("sizeof (wchar_t) != 4 currently not handled", stderr);
732 abort ();
734 for (cnt = 0; cnt < element_value_size / 4; ++cnt)
735 element_value_ob[cnt] = SWAPU32 (element_value[cnt]);
738 /* Store collation elements as map to collation class. There are
739 three kinds of symbols:
740 - simple characters
741 - collation elements
742 - collation symbols
743 We need to make a table which lets the user to access the primary
744 weight based on the symbol string. */
745 symbols_hash_tab_size = next_prime ((4 * (charset->char_table.filled
746 + collate->elements.filled
747 + collate->symbols.filled)) / 3);
748 symbols_hash_tab = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
749 * sizeof (u_int32_t)));
750 memset (symbols_hash_tab, '\377', (2 * symbols_hash_tab_size
751 * sizeof (u_int32_t)));
753 /* Now fill the array. First the symbols from the character set,
754 then the collation elements and last the collation symbols. */
755 hash_tab = &charset->char_table;
756 while (1)
758 void *ptr; /* Running pointer. */
759 const char *key; /* Key for current bucket. */
760 size_t keylen; /* Length of key data. */
761 void *data; /* Data. */
763 ptr = NULL;
764 while (iterate_table (hash_tab, &ptr, (const void **) &key,
765 &keylen, (void **) &data) == 0)
767 size_t hash_val;
768 size_t idx;
769 u_int32_t word;
770 unsigned int *weights;
772 if (hash_tab == &charset->char_table
773 || hash_tab == &collate->elements)
775 element_t *lastp, *firstp;
776 wchar_t dummy_name[2];
777 const wchar_t *name;
778 size_t name_len;
780 if (hash_tab == &charset->char_table)
782 dummy_name[0] = (wchar_t) ((unsigned long int) data);
783 dummy_name[1] = L'\0';
784 name = dummy_name;
785 name_len = sizeof (wchar_t);
787 else
789 element_t *elemp = (element_t *) data;
790 name = elemp->name;
791 name_len = wcslen (name) * sizeof (wchar_t);
794 /* First check whether this character is used at all. */
795 if (find_entry (&collate->result, name, name_len,
796 (void *) &firstp) < 0)
797 /* The symbol is not directly mentioned in the collation.
798 I.e., we use the value for UNDEFINED. */
799 lastp = &collate->undefined;
800 else
802 /* The entry for the simple character is always found at
803 the end. */
804 lastp = firstp;
805 while (lastp->next != NULL && wcscmp (name, lastp->name))
806 lastp = lastp->next;
809 weights = lastp->ordering;
811 else
813 dummy_weights[0] = 1;
814 dummy_weights[collate->nrules]
815 = (unsigned int) ((unsigned long int) data);
817 weights = dummy_weights;
820 /* In LASTP->ordering we now have the collation class.
821 Determine the place in the hashing table next. */
822 hash_val = hash_string (key, keylen);
823 idx = hash_val % symbols_hash_tab_size;
825 if (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)))
827 /* We need the second hashing function. */
828 size_t c = 1 + (hash_val % (symbols_hash_tab_size - 2));
831 if (idx >= symbols_hash_tab_size - c)
832 idx -= symbols_hash_tab_size - c;
833 else
834 idx += c;
835 while (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)));
838 symbols_hash_tab[2 * idx] = obstack_object_size (&string_pool);
839 symbols_hash_tab[2 * idx + 1] = (obstack_object_size (&non_simple)
840 / sizeof (u_int32_t));
842 obstack_grow0 (&string_pool, key, keylen);
843 /* Adding the first weight looks complicated. We have to deal
844 with the kind it is stored and with the fact that original
845 form uses `unsigned int's while we need `u_int32_t' here. */
846 word = weights[0];
847 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
848 for (cnt = 0; cnt < weights[0]; ++cnt)
850 word = weights[collate->nrules + cnt];
851 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
855 if (hash_tab == &charset->char_table)
856 hash_tab = &collate->elements;
857 else if (hash_tab == &collate->elements)
858 hash_tab = &collate->symbols;
859 else
860 break;
863 /* Now we have the complete tables. */
864 if (obstack_object_size (&string_pool) % 4 != 0)
865 obstack_blank (&non_simple, 4 - (obstack_object_size (&string_pool) % 4));
866 symbols_string_pool_size = obstack_object_size (&string_pool);
867 symbols_string_pool = obstack_finish (&string_pool);
869 symbols_class_size = obstack_object_size (&non_simple);
870 symbols_class = obstack_finish (&non_simple);
872 /* Generate tables with other byte order. */
873 symbols_hash_tab_ob = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
874 * sizeof (u_int32_t)));
875 for (cnt = 0; cnt < 2 * symbols_hash_tab_size; ++cnt)
876 symbols_hash_tab_ob[cnt] = SWAPU32 (symbols_hash_tab[cnt]);
878 symbols_class_ob = obstack_alloc (&non_simple, symbols_class_size);
879 for (cnt = 0; cnt < symbols_class_size / 4; ++cnt)
880 symbols_class_ob[cnt] = SWAPU32 (symbols_class[cnt]);
883 /* Store table adresses and lengths. */
884 #if __BYTE_ORDER == __BIG_ENDIAN
885 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
886 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
887 = table_best * level_best * entry_size * sizeof (table[0]);
889 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table2;
890 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
891 = table_best * level_best * entry_size * sizeof (table[0]);
893 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra;
894 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
896 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra2;
897 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
898 #else
899 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table2;
900 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
901 = table_best * level_best * entry_size * sizeof (table[0]);
903 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table;
904 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
905 = table_best * level_best * entry_size * sizeof (table[0]);
907 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra2;
908 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
910 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra;
911 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
912 #endif
914 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
915 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
918 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_base
919 = &element_hash_tab_size;
920 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_len
921 = sizeof (u_int32_t);
923 #if __BYTE_ORDER == __BIG_ENDIAN
924 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
925 = element_hash_tab;
926 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
927 = 2 * element_hash_tab_size * sizeof (u_int32_t);
929 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
930 = element_hash_tab_ob;
931 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
932 = 2 * element_hash_tab_size * sizeof (u_int32_t);
933 #else
934 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
935 = element_hash_tab;
936 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
937 = 2 * element_hash_tab_size * sizeof (u_int32_t);
939 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
940 = element_hash_tab_ob;
941 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
942 = 2 * element_hash_tab_size * sizeof (u_int32_t);
943 #endif
945 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_base
946 = element_string_pool;
947 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_len
948 = element_string_pool_size;
950 #if __BYTE_ORDER == __BIG_ENDIAN
951 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
952 = element_value;
953 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
954 = element_value_size;
956 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
957 = element_value_ob;
958 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
959 = element_value_size;
960 #else
961 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
962 = element_value;
963 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
964 = element_value_size;
966 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
967 = element_value_ob;
968 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
969 = element_value_size;
970 #endif
972 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_base
973 = &symbols_hash_tab_size;
974 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_len
975 = sizeof (u_int32_t);
977 #if __BYTE_ORDER == __BIG_ENDIAN
978 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
979 = symbols_hash_tab;
980 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
981 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
983 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
984 = symbols_hash_tab_ob;
985 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
986 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
987 #else
988 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
989 = symbols_hash_tab;
990 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
991 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
993 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
994 = symbols_hash_tab_ob;
995 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
996 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
997 #endif
999 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_base
1000 = symbols_string_pool;
1001 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_len
1002 = symbols_string_pool_size;
1004 #if __BYTE_ORDER == __BIG_ENDIAN
1005 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1006 = symbols_class;
1007 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1008 = symbols_class_size;
1010 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1011 = symbols_class_ob;
1012 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1013 = symbols_class_size;
1014 #else
1015 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1016 = symbols_class;
1017 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1018 = symbols_class_size;
1020 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1021 = symbols_class_ob;
1022 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1023 = symbols_class_size;
1024 #endif
1026 /* Update idx array. */
1027 idx[0] = iov[0].iov_len + iov[1].iov_len;
1028 for (cnt = 1; cnt < nelems; ++cnt)
1029 idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
1031 write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
1033 obstack_free (&non_simple, NULL);
1034 obstack_free (&string_pool, NULL);
1038 void
1039 collate_element_to (struct linereader *lr, struct localedef_t *locale,
1040 struct token *code, struct charset_t *charset)
1042 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1043 unsigned int value;
1044 void *not_used;
1046 if (collate->combine_token != NULL)
1048 free ((void *) collate->combine_token);
1049 collate->combine_token = NULL;
1052 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1053 if ((wchar_t) value != ILLEGAL_CHAR_VALUE)
1055 lr_error (lr, _("symbol for multicharacter collating element "
1056 "`%.*s' duplicates symbolic name in charset"),
1057 (int) code->val.str.len, code->val.str.start);
1058 return;
1061 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1062 &not_used) >= 0)
1064 lr_error (lr, _("symbol for multicharacter collating element "
1065 "`%.*s' duplicates other element definition"),
1066 (int) code->val.str.len, code->val.str.start);
1067 return;
1070 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1071 &not_used) >= 0)
1073 lr_error (lr, _("symbol for multicharacter collating element "
1074 "`%.*s' duplicates symbol definition"),
1075 (int) code->val.str.len, code->val.str.start);
1076 return;
1079 collate->combine_token = code->val.str.start;
1080 collate->combine_token_len = code->val.str.len;
1084 void
1085 collate_element_from (struct linereader *lr, struct localedef_t *locale,
1086 struct token *code, struct charset_t *charset)
1088 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1089 element_t *elemp, *runp;
1091 /* CODE is a string. */
1092 elemp = (element_t *) obstack_alloc (&collate->element_mem,
1093 sizeof (element_t));
1095 /* We have to translate the string. It may contain <...> character
1096 names. */
1097 elemp->name = (wchar_t *) translate_string (code->val.str.start, charset);
1098 elemp->this_weight = 0;
1099 elemp->ordering = NULL;
1100 elemp->ordering_len = 0;
1102 free (code->val.str.start);
1104 if (elemp->name == NULL)
1106 /* At least one character in the string is not defined. We simply
1107 do nothing. */
1108 if (verbose)
1109 lr_error (lr, _("\
1110 `from' string in collation element declaration contains unknown character"));
1111 return;
1114 if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
1116 lr_error (lr, _("illegal collation element"));
1117 return;
1120 /* The entries in the linked lists of RESULT are sorting in
1121 descending order. The order is important for the `strcoll' and
1122 `wcscoll' functions. */
1123 if (find_entry (&collate->result, elemp->name, sizeof (wchar_t),
1124 (void *) &runp) >= 0)
1126 /* We already have an entry with this key. Check whether it is
1127 identical. */
1128 element_t *prevp = NULL;
1129 int cmpres;
1133 cmpres = wcscmp (elemp->name, runp->name);
1134 if (cmpres <= 0)
1135 break;
1136 prevp = runp;
1138 while ((runp = runp->next) != NULL);
1140 if (cmpres == 0)
1141 lr_error (lr, _("duplicate collating element definition"));
1142 else
1144 elemp->next = runp;
1145 if (prevp == NULL)
1147 if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
1148 elemp) < 0)
1149 error (EXIT_FAILURE, 0, _("\
1150 error while inserting collation element into hash table"));
1152 else
1153 prevp->next = elemp;
1156 else
1158 elemp->next = NULL;
1159 if (insert_entry (&collate->result, elemp->name, sizeof (wchar_t), elemp)
1160 < 0)
1161 error (EXIT_FAILURE, errno, _("error while inserting to hash table"));
1164 if (insert_entry (&collate->elements, collate->combine_token,
1165 collate->combine_token_len, (void *) elemp) < 0)
1166 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1167 strerror (errno));
1171 void
1172 collate_symbol (struct linereader *lr, struct localedef_t *locale,
1173 struct token *code, struct charset_t *charset)
1175 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1176 wchar_t value;
1177 void *not_used;
1179 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1180 if (value != ILLEGAL_CHAR_VALUE)
1182 lr_error (lr, _("symbol for multicharacter collating element "
1183 "`%.*s' duplicates symbolic name in charset"),
1184 (int) code->val.str.len, code->val.str.start);
1185 return;
1188 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1189 &not_used) >= 0)
1191 lr_error (lr, _("symbol for multicharacter collating element "
1192 "`%.*s' duplicates element definition"),
1193 (int) code->val.str.len, code->val.str.start);
1194 return;
1197 if (find_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1198 &not_used) >= 0)
1200 lr_error (lr, _("symbol for multicharacter collating element "
1201 "`%.*s' duplicates other symbol definition"),
1202 (int) code->val.str.len, code->val.str.start);
1203 return;
1206 if (insert_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1207 (void *) 0) < 0)
1208 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1209 strerror (errno));
1213 void
1214 collate_new_order (struct linereader *lr, struct localedef_t *locale,
1215 enum coll_sort_rule sort_rule)
1217 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1219 if (collate->nrules >= collate->nrules_max)
1221 collate->nrules_max *= 2;
1222 collate->rules
1223 = (enum coll_sort_rule *) xrealloc (collate->rules,
1224 collate->nrules_max
1225 * sizeof (enum coll_sort_rule));
1228 collate->rules[collate->nrules++] = sort_rule;
1232 void
1233 collate_build_arrays (struct linereader *lr, struct localedef_t *locale)
1235 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1237 collate->rules
1238 = (enum coll_sort_rule *) xrealloc (collate->rules,
1239 collate->nrules
1240 * sizeof (enum coll_sort_rule));
1242 /* Allocate arrays for temporary weights. */
1243 collate->weight_cnt = (int *) xmalloc (collate->nrules * sizeof (int));
1245 /* Choose arbitrary start value for table size. */
1246 collate->nweight_max = 5 * collate->nrules;
1247 collate->weight = (int *) xmalloc (collate->nweight_max * sizeof (int));
1252 collate_order_elem (struct linereader *lr, struct localedef_t *locale,
1253 struct token *code, struct charset_t *charset)
1255 const wchar_t zero = L'\0';
1256 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1257 int result = 0;
1258 wchar_t value;
1259 void *tmp;
1260 unsigned int i;
1262 switch (code->tok)
1264 case tok_bsymbol:
1265 /* We have a string to find in one of the three hashing tables. */
1266 value = charset_find_value (charset, code->val.str.start,
1267 code->val.str.len);
1268 if (value != ILLEGAL_CHAR_VALUE)
1270 element_t *lastp, *firstp;
1272 collate->kind = character;
1274 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1275 (void *) &firstp) < 0)
1276 firstp = lastp = NULL;
1277 else
1279 /* The entry for the simple character is always found at
1280 the end. */
1281 lastp = firstp;
1282 while (lastp->next != NULL)
1283 lastp = lastp->next;
1285 if (lastp->name[0] == value && lastp->name[1] == L'\0')
1287 lr_error (lr, _("duplicate definition for character `%.*s'"),
1288 (int) code->val.str.len, code->val.str.start);
1289 lr_ignore_rest (lr, 0);
1290 result = -1;
1291 break;
1295 collate->current_element
1296 = (element_t *) obstack_alloc (&collate->element_mem,
1297 sizeof (element_t));
1299 obstack_grow (&collate->element_mem, &value, sizeof (value));
1300 obstack_grow (&collate->element_mem, &zero, sizeof (zero));
1302 collate->current_element->name =
1303 (const wchar_t *) obstack_finish (&collate->element_mem);
1305 collate->current_element->this_weight = ++collate->order_cnt;
1307 collate->current_element->next = NULL;
1309 if (firstp == NULL)
1311 if (insert_entry (&collate->result, &value, sizeof (wchar_t),
1312 (void *) collate->current_element) < 0)
1314 lr_error (lr, _("cannot insert collation element `%.*s'"),
1315 (int) code->val.str.len, code->val.str.start);
1316 exit (4);
1319 else
1320 lastp->next = collate->current_element;
1322 else if (find_entry (&collate->elements, code->val.str.start,
1323 code->val.str.len, &tmp) >= 0)
1325 collate->current_element = (element_t *) tmp;
1327 if (collate->current_element->this_weight != 0)
1329 lr_error (lr, _("\
1330 collation element `%.*s' appears more than once: ignore line"),
1331 (int) code->val.str.len, code->val.str.start);
1332 lr_ignore_rest (lr, 0);
1333 result = -1;
1334 break;
1337 collate->kind = element;
1338 collate->current_element->this_weight = ++collate->order_cnt;
1340 else if (find_entry (&collate->symbols, code->val.str.start,
1341 code->val.str.len, &tmp) >= 0)
1343 unsigned int order = ++collate->order_cnt;
1345 if ((unsigned long int) tmp != 0ul)
1347 lr_error (lr, _("\
1348 collation symbol `%.*s' appears more than once: ignore line"),
1349 (int) code->val.str.len, code->val.str.start);
1350 lr_ignore_rest (lr, 0);
1351 result = -1;
1352 break;
1355 collate->kind = symbol;
1357 if (set_entry (&collate->symbols, code->val.str.start,
1358 code->val.str.len, (void *) order) < 0)
1360 lr_error (lr, _("cannot process order specification"));
1361 exit (4);
1364 else
1366 if (verbose)
1367 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1368 (int) code->val.str.len, code->val.str.start);
1369 lr_ignore_rest (lr, 0);
1371 result = -1;
1373 break;
1375 case tok_undefined:
1376 collate->kind = undefined;
1377 collate->current_element = &collate->undefined;
1378 break;
1380 case tok_ellipsis:
1381 if (collate->was_ellipsis)
1383 lr_error (lr, _("\
1384 two lines in a row containing `...' are not allowed"));
1385 result = -1;
1387 else if (collate->kind != character)
1389 /* An ellipsis requires the previous line to be an
1390 character definition. */
1391 lr_error (lr, _("\
1392 line before ellipsis does not contain definition for character constant"));
1393 lr_ignore_rest (lr, 0);
1394 result = -1;
1396 else
1397 collate->kind = ellipsis;
1398 break;
1400 default:
1401 assert (! "illegal token in `collate_order_elem'");
1404 /* Now it's time to handle the ellipsis in the previous line. We do
1405 this only when the last line contained an definition for a
1406 character, the current line also defines an character, the
1407 character code for the later is bigger than the former. */
1408 if (collate->was_ellipsis)
1410 if (collate->kind != character)
1412 lr_error (lr, _("\
1413 line after ellipsis must contain character definition"));
1414 lr_ignore_rest (lr, 0);
1415 result = -1;
1417 else if (collate->last_char > value)
1419 lr_error (lr, _("end point of ellipsis range is bigger then start"));
1420 lr_ignore_rest (lr, 0);
1421 result = -1;
1423 else
1425 /* We can fill the arrays with the information we need. */
1426 wchar_t name[2];
1427 unsigned int *data;
1428 size_t *ptr;
1429 size_t cnt;
1431 name[0] = collate->last_char + 1;
1432 name[1] = L'\0';
1434 data = (unsigned int *) alloca ((collate->nrules + collate->nweight)
1435 * sizeof (unsigned int));
1436 ptr = (size_t *) alloca (collate->nrules * sizeof (size_t));
1438 if (data == NULL || ptr == NULL)
1439 error (4, 0, _("memory exhausted"));
1441 /* Prepare data. Because the characters covered by an
1442 ellipsis all have equal values we prepare the data once
1443 and only change the variable number (if there are any).
1444 PTR[...] will point to the entries which will have to be
1445 fixed during the output loop. */
1446 for (cnt = 0; cnt < collate->nrules; ++cnt)
1448 data[cnt] = collate->weight_cnt[cnt];
1449 ptr[cnt] = (cnt == 0
1450 ? collate->nweight
1451 : ptr[cnt - 1] + collate->weight_cnt[cnt - 1]);
1454 for (cnt = 0; cnt < collate->nweight; ++cnt)
1455 data[collate->nrules + cnt] = collate->weight[cnt];
1457 for (cnt = 0; cnt < collate->nrules; ++cnt)
1458 if ((wchar_t) data[ptr[cnt]] != ELLIPSIS_CHAR)
1459 ptr[cnt] = 0;
1461 while (name[0] <= value)
1463 element_t *pelem;
1465 pelem = (element_t *) obstack_alloc (&collate->element_mem,
1466 sizeof (element_t));
1467 if (pelem == NULL)
1468 error (4, 0, _("memory exhausted"));
1470 pelem->name
1471 = (const wchar_t *) obstack_copy (&collate->element_mem,
1472 name, 2 * sizeof (wchar_t));
1473 pelem->this_weight = ++collate->order_cnt;
1475 pelem->ordering_len = collate->nweight;
1476 pelem->ordering
1477 = (unsigned int *) obstack_copy (&collate->element_mem, data,
1478 (collate->nrules
1479 * pelem->ordering_len)
1480 * sizeof (unsigned int));
1482 /* `...' weights need to be adjusted. */
1483 for (cnt = 0; cnt < collate->nrules; ++cnt)
1484 if (ptr[cnt] != 0)
1485 pelem->ordering[ptr[cnt]] = pelem->this_weight;
1487 /* Insert new entry into result table. */
1488 if (find_entry (&collate->result, name, sizeof (wchar_t),
1489 (void *) &pelem->next) >= 0)
1491 if (set_entry (&collate->result, name, sizeof (wchar_t),
1492 (void *) pelem->next) < 0)
1493 error (4, 0, _("cannot insert into result table"));
1495 else
1496 if (insert_entry (&collate->result, name, sizeof (wchar_t),
1497 (void *) pelem->next) < 0)
1498 error (4, 0, _("cannot insert into result table"));
1500 /* Increment counter. */
1501 ++name[0];
1506 /* Reset counters for weights. */
1507 collate->weight_idx = 0;
1508 collate->nweight = 0;
1509 for (i = 0; i < collate->nrules; ++i)
1510 collate->weight_cnt[i] = 0;
1511 collate->current_patch = NULL;
1513 return result;
1518 collate_weight_bsymbol (struct linereader *lr, struct localedef_t *locale,
1519 struct token *code, struct charset_t *charset)
1521 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1522 unsigned int here_weight;
1523 wchar_t value;
1524 void *tmp;
1526 assert (code->tok == tok_bsymbol);
1528 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1529 if (value != ILLEGAL_CHAR_VALUE)
1531 element_t *runp;
1533 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1534 (void *)&runp) < 0)
1535 runp = NULL;
1537 while (runp != NULL
1538 && (runp->name[0] != value || runp->name[1] != L'\0'))
1539 runp = runp->next;
1541 here_weight = runp == NULL ? 0 : runp->this_weight;
1543 else if (find_entry (&collate->elements, code->val.str.start,
1544 code->val.str.len, &tmp) >= 0)
1546 element_t *runp = (element_t *) tmp;
1548 here_weight = runp->this_weight;
1550 else if (find_entry (&collate->symbols, code->val.str.start,
1551 code->val.str.len, &tmp) >= 0)
1553 here_weight = (unsigned int) tmp;
1555 else
1557 if (verbose)
1558 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1559 (int) code->val.str.len, code->val.str.start);
1560 lr_ignore_rest (lr, 0);
1561 return -1;
1564 /* When we currently work on a collation symbol we do not expect any
1565 weight. */
1566 if (collate->kind == symbol)
1568 lr_error (lr, _("\
1569 specification of sorting weight for collation symbol does not make sense"));
1570 lr_ignore_rest (lr, 0);
1571 return -1;
1574 /* Add to the current collection of weights. */
1575 if (collate->nweight >= collate->nweight_max)
1577 collate->nweight_max *= 2;
1578 collate->weight = (unsigned int *) xrealloc (collate->weight,
1579 collate->nweight_max);
1582 /* If the weight is currently not known, we remember to patch the
1583 resulting tables. */
1584 if (here_weight == 0)
1586 patch_t *newp;
1588 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1589 sizeof (patch_t));
1590 newp->fname = lr->fname;
1591 newp->lineno = lr->lineno;
1592 newp->token = (const char *) obstack_copy0 (&collate->element_mem,
1593 code->val.str.start,
1594 code->val.str.len);
1595 newp->where.idx = collate->nweight++;
1596 newp->next = collate->current_patch;
1597 collate->current_patch = newp;
1599 else
1600 collate->weight[collate->nweight++] = here_weight;
1601 ++collate->weight_cnt[collate->weight_idx];
1603 return 0;
1608 collate_next_weight (struct linereader *lr, struct localedef_t *locale)
1610 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1612 if (collate->kind == symbol)
1614 lr_error (lr, _("\
1615 specification of sorting weight for collation symbol does not make sense"));
1616 lr_ignore_rest (lr, 0);
1617 return -1;
1620 ++collate->weight_idx;
1621 if (collate->weight_idx >= collate->nrules)
1623 lr_error (lr, _("too many weights"));
1624 lr_ignore_rest (lr, 0);
1625 return -1;
1628 return 0;
1633 collate_simple_weight (struct linereader *lr, struct localedef_t *locale,
1634 struct token *code, struct charset_t *charset)
1636 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1637 unsigned int value = 0;
1639 /* There current tokens can be `IGNORE', `...', or a string. */
1640 switch (code->tok)
1642 case tok_ignore:
1643 /* This token is allowed in all situations. */
1644 value = IGNORE_CHAR;
1645 break;
1647 case tok_ellipsis:
1648 /* The ellipsis is only allowed for the `...' or `UNDEFINED'
1649 entry. */
1650 if (collate->kind != ellipsis && collate->kind != undefined)
1652 lr_error (lr, _("\
1653 `...' must only be used in `...' and `UNDEFINED' entries"));
1654 lr_ignore_rest (lr, 0);
1655 return -1;
1657 value = ELLIPSIS_CHAR;
1658 break;
1660 case tok_string:
1661 /* This can become difficult. We have to get the weights which
1662 correspind the the single wide chars in the string. But some
1663 of the `chars' might not be real characters, but collation
1664 elements or symbols. And so the string decoder might have
1665 signaled errors. The string at this point is not translated.
1666 I.e., all <...> sequences are still there. */
1668 char *runp = code->val.str.start;
1669 void *tmp;
1671 while (*runp != '\0')
1673 char *startp = (char *) runp;
1674 char *putp = (char *) runp;
1675 wchar_t wch;
1677 /* Lookup weight for char and store it. */
1678 if (*runp == '<')
1680 while (*++runp != '\0' && *runp != '>')
1682 if (*runp == lr->escape_char)
1683 if (*++runp == '\0')
1685 lr_error (lr, _("unterminated weight name"));
1686 lr_ignore_rest (lr, 0);
1687 return -1;
1689 *putp++ = *runp;
1691 if (*runp == '>')
1692 ++runp;
1694 if (putp == startp)
1696 lr_error (lr, _("empty weight name: line ignored"));
1697 lr_ignore_rest (lr, 0);
1698 return -1;
1701 wch = charset_find_value (charset, startp, putp - startp);
1702 if (wch != ILLEGAL_CHAR_VALUE)
1704 element_t *pelem;
1706 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
1707 (void *)&pelem) < 0)
1708 pelem = NULL;
1710 while (pelem != NULL
1711 && (pelem->name[0] != wch
1712 || pelem->name[1] != L'\0'))
1713 pelem = pelem->next;
1715 value = pelem == NULL ? 0 : pelem->this_weight;
1717 else if (find_entry (&collate->elements, startp, putp - startp,
1718 &tmp) >= 0)
1720 element_t *pelem = (element_t *) tmp;
1722 value = pelem->this_weight;
1724 else if (find_entry (&collate->symbols, startp, putp - startp,
1725 &tmp) >= 0)
1727 value = (unsigned int) tmp;
1729 else
1731 if (verbose)
1732 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1733 (int) (putp - startp), startp);
1734 lr_ignore_rest (lr, 0);
1735 return -1;
1738 else
1740 element_t *wp;
1741 wchar_t wch;
1743 if (*runp == lr->escape_char)
1745 static const char digits[] = "0123456789abcdef";
1746 const char *dp;
1747 int base;
1749 ++runp;
1750 if (tolower (*runp) == 'x')
1752 ++runp;
1753 base = 16;
1755 else if (tolower (*runp) == 'd')
1757 ++runp;
1758 base = 10;
1760 else
1761 base = 8;
1763 dp = strchr (digits, tolower (*runp));
1764 if (dp == NULL || (dp - digits) >= base)
1766 illegal_char:
1767 lr_error (lr, _("\
1768 illegal character constant in string"));
1769 lr_ignore_rest (lr, 0);
1770 return -1;
1772 wch = dp - digits;
1773 ++runp;
1775 dp = strchr (digits, tolower (*runp));
1776 if (dp == NULL || (dp - digits) >= base)
1777 goto illegal_char;
1778 wch *= base;
1779 wch += dp - digits;
1780 ++runp;
1782 if (base != 16)
1784 dp = strchr (digits, tolower (*runp));
1785 if (dp != NULL && (dp - digits < base))
1787 wch *= base;
1788 wch += dp - digits;
1789 ++runp;
1793 else
1794 wch = (wchar_t) *runp++;
1796 /* Lookup the weight for WCH. */
1797 if (find_entry (&collate->result, &wch, sizeof (wch),
1798 (void *)&wp) < 0)
1799 wp = NULL;
1801 while (wp != NULL
1802 && (wp->name[0] != wch || wp->name[1] != L'\0'))
1803 wp = wp->next;
1805 value = wp == NULL ? 0 : wp->this_weight;
1807 /* To get the correct name for the error message. */
1808 putp = runp;
1810 /**************************************************\
1811 |* I know here is something wrong. Characters in *|
1812 |* the string which are not in the <...> form *|
1813 |* cannot be declared forward for now!!! *|
1814 \**************************************************/
1817 /* Store in weight array. */
1818 if (collate->nweight >= collate->nweight_max)
1820 collate->nweight_max *= 2;
1821 collate->weight
1822 = (unsigned int *) xrealloc (collate->weight,
1823 collate->nweight_max);
1826 if (value == 0)
1828 patch_t *newp;
1830 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1831 sizeof (patch_t));
1832 newp->fname = lr->fname;
1833 newp->lineno = lr->lineno;
1834 newp->token
1835 = (const char *) obstack_copy0 (&collate->element_mem,
1836 startp, putp - startp);
1837 newp->where.idx = collate->nweight++;
1838 newp->next = collate->current_patch;
1839 collate->current_patch = newp;
1841 else
1842 collate->weight[collate->nweight++] = value;
1843 ++collate->weight_cnt[collate->weight_idx];
1846 return 0;
1848 default:
1849 assert (! "should not happen");
1853 if (collate->nweight >= collate->nweight_max)
1855 collate->nweight_max *= 2;
1856 collate->weight = (unsigned int *) xrealloc (collate->weight,
1857 collate->nweight_max);
1860 collate->weight[collate->nweight++] = value;
1861 ++collate->weight_cnt[collate->weight_idx];
1863 return 0;
1867 void
1868 collate_end_weight (struct linereader *lr, struct localedef_t *locale)
1870 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1871 element_t *pelem = collate->current_element;
1873 if (collate->kind == symbol)
1875 /* We don't have to do anything. */
1876 collate->was_ellipsis = 0;
1877 return;
1880 if (collate->kind == ellipsis)
1882 /* Before the next line is processed the ellipsis is handled. */
1883 collate->was_ellipsis = 1;
1884 return;
1887 assert (collate->kind == character || collate->kind == element
1888 || collate->kind == undefined);
1890 /* Fill in the missing weights. */
1891 while (++collate->weight_idx < collate->nrules)
1893 collate->weight[collate->nweight++] = pelem->this_weight;
1894 ++collate->weight_cnt[collate->weight_idx];
1897 /* Now we know how many ordering weights the current
1898 character/element has. Allocate room in the element structure
1899 and copy information. */
1900 pelem->ordering_len = collate->nweight;
1902 /* First we write an array with the number of values for each
1903 weight. */
1904 obstack_grow (&collate->element_mem, collate->weight_cnt,
1905 collate->nrules * sizeof (unsigned int));
1907 /* Now the weights itselves. */
1908 obstack_grow (&collate->element_mem, collate->weight,
1909 collate->nweight * sizeof (unsigned int));
1911 /* Get result. */
1912 pelem->ordering = obstack_finish (&collate->element_mem);
1914 /* Now we handle the "patches". */
1915 while (collate->current_patch != NULL)
1917 patch_t *this_patch;
1919 this_patch = collate->current_patch;
1921 this_patch->where.pos = &pelem->ordering[collate->nrules
1922 + this_patch->where.idx];
1924 collate->current_patch = this_patch->next;
1925 this_patch->next = collate->all_patches;
1926 collate->all_patches = this_patch;
1929 /* Set information for next round. */
1930 collate->was_ellipsis = 0;
1931 if (collate->kind != undefined)
1932 collate->last_char = pelem->name[0];