Update.
[glibc.git] / locale / programs / ld-collate.c
bloba92ff1154a7afd7f9e9205db409a35a5115ebc39
1 /* Copyright (C) 1995, 1996, 1997, 1998 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <obstack.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "localeinfo.h"
34 #include "locales.h"
35 #include "simple-hash.h"
36 #include "stringtrans.h"
37 #include "strlen-hash.h"
39 /* Uncomment the following line in the production version. */
40 /* #define NDEBUG 1 */
41 #include <assert.h>
44 #define MAX(a, b) ((a) > (b) ? (a) : (b))
46 #define SWAPU32(w) \
47 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
50 /* What kind of symbols get defined? */
51 enum coll_symbol
53 undefined,
54 ellipsis,
55 character,
56 element,
57 symbol
61 typedef struct patch_t
63 const char *fname;
64 size_t lineno;
65 const char *token;
66 union
68 unsigned int *pos;
69 size_t idx;
70 } where;
71 struct patch_t *next;
72 } patch_t;
75 typedef struct element_t
77 const wchar_t *name;
78 unsigned int this_weight;
80 struct element_t *next;
82 unsigned int *ordering;
83 size_t ordering_len;
84 } element_t;
87 /* The real definition of the struct for the LC_COLLATE locale. */
88 struct locale_collate_t
90 /* Collate symbol table. Simple mapping to number. */
91 hash_table symbols;
93 /* The collation elements. */
94 hash_table elements;
95 struct obstack element_mem;
97 /* The result table. */
98 hash_table result;
100 /* Sorting rules given in order_start line. */
101 u_int32_t nrules;
102 u_int32_t nrules_max;
103 enum coll_sort_rule *rules;
105 /* Used while recognizing symbol composed of multiple tokens
106 (collating-element). */
107 const char *combine_token;
108 size_t combine_token_len;
110 /* How many sorting order specifications so far. */
111 unsigned int order_cnt;
113 /* Was lastline ellipsis? */
114 int was_ellipsis;
115 /* Value of last entry if was character. */
116 wchar_t last_char;
117 /* Current element. */
118 element_t *current_element;
119 /* What kind of symbol is current element. */
120 enum coll_symbol kind;
122 /* While collecting the weights we need some temporary space. */
123 unsigned int current_order;
124 int *weight_cnt;
125 unsigned int weight_idx;
126 unsigned int *weight;
127 size_t nweight;
128 size_t nweight_max;
130 /* Patch lists. */
131 patch_t *current_patch;
132 patch_t *all_patches;
134 /* Room for the UNDEFINED information. */
135 element_t undefined;
136 unsigned int undefined_len;
140 /* Be verbose? Defined in localedef.c. */
141 extern int verbose;
144 void *xmalloc (size_t __n);
145 void *xrealloc (void *__p, size_t __n);
148 #define obstack_chunk_alloc malloc
149 #define obstack_chunk_free free
152 void
153 collate_startup (struct linereader *lr, struct localedef_t *locale,
154 struct charset_t *charset)
156 struct locale_collate_t *collate;
158 /* It is important that we always use UCS4 encoding for strings now. */
159 encoding_method = ENC_UCS4;
161 /* Allocate the needed room. */
162 locale->categories[LC_COLLATE].collate = collate =
163 (struct locale_collate_t *) xmalloc (sizeof (struct locale_collate_t));
165 /* Allocate hash table for collating elements. */
166 if (init_hash (&collate->elements, 512))
167 error (4, 0, _("memory exhausted"));
168 collate->combine_token = NULL;
169 obstack_init (&collate->element_mem);
171 /* Allocate hash table for collating elements. */
172 if (init_hash (&collate->symbols, 64))
173 error (4, 0, _("memory exhausted"));
175 /* Allocate hash table for result. */
176 if (init_hash (&collate->result, 512))
177 error (4, 0, _("memory exhausted"));
179 collate->nrules = 0;
180 collate->nrules_max = 10;
181 collate->rules
182 = (enum coll_sort_rule *) xmalloc (collate->nrules_max
183 * sizeof (enum coll_sort_rule));
185 collate->order_cnt = 1; /* The smallest weight is 2. */
187 collate->was_ellipsis = 0;
188 collate->last_char = L'\0'; /* 0 because leading ellipsis is allowed. */
190 collate->all_patches = NULL;
192 /* This tells us no UNDEFINED entry was found until now. */
193 memset (&collate->undefined, '\0', sizeof (collate->undefined));
195 lr->translate_strings = 0;
199 void
200 collate_finish (struct localedef_t *locale, struct charset_t *charset)
202 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
203 patch_t *patch;
204 size_t cnt;
206 /* Patch the constructed table so that forward references are
207 correctly filled. */
208 for (patch = collate->all_patches; patch != NULL; patch = patch->next)
210 wchar_t wch;
211 size_t toklen = strlen (patch->token);
212 void *ptmp;
213 unsigned int value = 0;
215 wch = charset_find_value (&charset->char_table, patch->token, toklen);
216 if (wch != ILLEGAL_CHAR_VALUE)
218 element_t *runp;
220 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
221 (void *) &runp) < 0)
222 runp = NULL;
223 for (; runp != NULL; runp = runp->next)
224 if (runp->name[0] == wch && runp->name[1] == L'\0')
225 break;
227 value = runp == NULL ? 0 : runp->this_weight;
229 else if (find_entry (&collate->elements, patch->token, toklen, &ptmp)
230 >= 0)
232 value = ((element_t *) ptmp)->this_weight;
234 else if (find_entry (&collate->symbols, patch->token, toklen, &ptmp)
235 >= 0)
237 value = (unsigned long int) ptmp;
239 else
240 value = 0;
242 if (value == 0)
244 if (!be_quiet)
245 error_at_line (0, 0, patch->fname, patch->lineno,
246 _("no weight defined for symbol `%s'"),
247 patch->token);
249 else
250 *patch->where.pos = value;
253 /* If no definition for UNDEFINED is given, all characters in the
254 given charset must be specified. */
255 if (collate->undefined.ordering == NULL)
257 /**************************************************************\
258 |* XXX We should test whether really an unspecified character *|
259 |* exists before giving the message. *|
260 \**************************************************************/
261 u_int32_t weight;
263 if (!be_quiet)
264 error (0, 0, _("no definition of `UNDEFINED'"));
266 collate->undefined.ordering_len = collate->nrules;
267 weight = ++collate->order_cnt;
269 for (cnt = 0; cnt < collate->nrules; ++cnt)
271 u_int32_t one = 1;
272 obstack_grow (&collate->element_mem, &one, sizeof (one));
275 for (cnt = 0; cnt < collate->nrules; ++cnt)
276 obstack_grow (&collate->element_mem, &weight, sizeof (weight));
278 collate->undefined.ordering = obstack_finish (&collate->element_mem);
281 collate->undefined_len = 2; /* For the name: 1 x wchar_t + L'\0'. */
282 for (cnt = 0; cnt < collate->nrules; ++cnt)
283 collate->undefined_len += 1 + collate->undefined.ordering[cnt];
288 void
289 collate_output (struct localedef_t *locale, struct charset_t *charset,
290 const char *output_path)
292 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
293 u_int32_t table_size, table_best, level_best, sum_best;
294 void *last;
295 element_t *pelem;
296 wchar_t *name;
297 size_t len;
298 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
299 struct iovec iov[2 + nelems];
300 struct locale_file data;
301 u_int32_t idx[nelems];
302 struct obstack non_simple;
303 struct obstack string_pool;
304 size_t cnt, entry_size;
305 u_int32_t undefined_offset = UINT_MAX;
306 u_int32_t *table, *extra, *table2, *extra2;
307 size_t extra_len;
308 u_int32_t element_hash_tab_size;
309 u_int32_t *element_hash_tab;
310 u_int32_t *element_hash_tab_ob;
311 u_int32_t element_string_pool_size;
312 char *element_string_pool;
313 u_int32_t element_value_size;
314 wchar_t *element_value;
315 wchar_t *element_value_ob;
316 u_int32_t symbols_hash_tab_size;
317 u_int32_t *symbols_hash_tab;
318 u_int32_t *symbols_hash_tab_ob;
319 u_int32_t symbols_string_pool_size;
320 char *symbols_string_pool;
321 u_int32_t symbols_class_size;
322 u_int32_t *symbols_class;
323 u_int32_t *symbols_class_ob;
324 hash_table *hash_tab;
325 unsigned int dummy_weights[collate->nrules + 1];
327 sum_best = UINT_MAX;
328 table_best = 0xffff;
329 level_best = 0xffff;
331 /* Compute table size. */
332 if (!be_quiet)
333 fputs (_("\
334 Computing table size for collation information might take a while..."),
335 stderr);
336 for (table_size = 256; table_size < sum_best; ++table_size)
338 size_t hits[table_size];
339 unsigned int worst = 1;
340 size_t cnt;
342 last = NULL;
344 for (cnt = 0; cnt < 256; ++cnt)
345 hits[cnt] = 1;
346 memset (&hits[256], '\0', sizeof (hits) - 256 * sizeof (size_t));
348 while (iterate_table (&collate->result, &last, (const void **) &name,
349 &len, (void **) &pelem) >= 0)
350 if (pelem->ordering != NULL && pelem->name[0] > 0xff)
351 if (++hits[(unsigned int) pelem->name[0] % table_size] > worst)
353 worst = hits[(unsigned int) pelem->name[0] % table_size];
354 if (table_size * worst > sum_best)
355 break;
358 if (table_size * worst < sum_best)
360 sum_best = table_size * worst;
361 table_best = table_size;
362 level_best = worst;
365 assert (table_best != 0xffff || level_best != 0xffff);
366 if (!be_quiet)
367 fputs (_(" done\n"), stderr);
369 obstack_init (&non_simple);
370 obstack_init (&string_pool);
372 data.magic = LIMAGIC (LC_COLLATE);
373 data.n = nelems;
374 iov[0].iov_base = (void *) &data;
375 iov[0].iov_len = sizeof (data);
377 iov[1].iov_base = (void *) idx;
378 iov[1].iov_len = sizeof (idx);
380 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_base = &collate->nrules;
381 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_len = sizeof (u_int32_t);
383 table = (u_int32_t *) alloca (collate->nrules * sizeof (u_int32_t));
384 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_base = table;
385 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_len
386 = collate->nrules * sizeof (u_int32_t);
387 /* Another trick here. Describing the collation method needs only a
388 few bits (3, to be exact). But the binary file should be
389 accessible by machines with both endianesses and so we store both
390 forms in the same word. */
391 for (cnt = 0; cnt < collate->nrules; ++cnt)
392 table[cnt] = collate->rules[cnt] | SWAPU32 (collate->rules[cnt]);
394 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_base = &table_best;
395 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_len = sizeof (u_int32_t);
397 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_base = &level_best;
398 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_len
399 = sizeof (u_int32_t);
401 entry_size = 1 + MAX (collate->nrules, 2);
403 table = (u_int32_t *) alloca (table_best * level_best * entry_size
404 * sizeof (table[0]));
405 memset (table, '\0', table_best * level_best * entry_size
406 * sizeof (table[0]));
409 /* Macros for inserting in output table. */
410 #define ADD_VALUE(expr) \
411 do { \
412 u_int32_t to_write = (u_int32_t) expr; \
413 obstack_grow (&non_simple, &to_write, sizeof (to_write)); \
414 } while (0)
416 #define ADD_ELEMENT(pelem, len) \
417 do { \
418 size_t cnt, idx; \
420 ADD_VALUE (len); \
422 wlen = wcslen (pelem->name); \
423 obstack_grow (&non_simple, pelem->name, (wlen + 1) * sizeof (u_int32_t)); \
425 idx = collate->nrules; \
426 for (cnt = 0; cnt < collate->nrules; ++cnt) \
428 size_t disp; \
430 ADD_VALUE (pelem->ordering[cnt]); \
431 for (disp = 0; disp < pelem->ordering[cnt]; ++disp) \
432 ADD_VALUE (pelem->ordering[idx++]); \
434 } while (0)
436 #define ADD_FORWARD(pelem) \
437 do { \
438 /* We leave a reference in the main table and put all \
439 information in the table for the extended entries. */ \
440 element_t *runp; \
441 element_t *has_simple = NULL; \
442 size_t wlen; \
444 table[(level * table_best + slot) * entry_size + 1] \
445 = FORWARD_CHAR; \
446 table[(level * table_best + slot) * entry_size + 2] \
447 = obstack_object_size (&non_simple) / sizeof (u_int32_t); \
449 /* Here we have to construct the non-simple table entry. First \
450 compute the total length of this entry. */ \
451 for (runp = (pelem); runp != NULL; runp = runp->next) \
452 if (runp->ordering != NULL) \
454 u_int32_t value; \
455 size_t cnt; \
457 value = 1 + wcslen (runp->name) + 1; \
459 for (cnt = 0; cnt < collate->nrules; ++cnt) \
460 /* We have to take care for entries without ordering \
461 information. While reading them they get inserted in the \
462 table and later not removed when something goes wrong with \
463 reading its weights. */ \
464 value += 1 + runp->ordering[cnt]; \
466 if (runp->name[1] == L'\0') \
467 has_simple = runp; \
469 ADD_ELEMENT (runp, value); \
472 if (has_simple == NULL) \
474 size_t idx, cnt; \
476 ADD_VALUE (collate->undefined_len + 1); \
478 /* Add the name. */ \
479 ADD_VALUE ((pelem)->name[0]); \
480 ADD_VALUE (0); \
482 idx = collate->nrules; \
483 for (cnt = 0; cnt < collate->nrules; ++cnt) \
485 size_t disp; \
487 ADD_VALUE (collate->undefined.ordering[cnt]); \
488 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp) \
490 if ((wchar_t) collate->undefined.ordering[idx] \
491 == ELLIPSIS_CHAR) \
492 ADD_VALUE ((pelem)->name[0]); \
493 else \
494 ADD_VALUE (collate->undefined.ordering[idx++]); \
495 ++idx; \
499 } while (0)
503 /* Fill the table now. First we look for all the characters which
504 fit into one single byte. This speeds up the 8-bit string
505 functions. */
506 last = NULL;
507 while (iterate_table (&collate->result, &last, (const void **) &name,
508 &len, (void **) &pelem) >= 0)
509 if (pelem->name[0] <= 0xff)
511 /* We have a single byte name. Now we must distinguish
512 between entries in simple form (i.e., only one value per
513 weight and no collation element starting with the same
514 character) and those which are not. */
515 size_t slot = ((size_t) pelem->name[0]);
516 const size_t level = 0;
518 table[slot * entry_size] = pelem->name[0];
520 if (pelem->name[1] == L'\0' && pelem->next == NULL
521 && pelem->ordering_len == collate->nrules)
523 /* Yes, we have a simple one. Lucky us. */
524 size_t cnt;
526 for (cnt = 0; cnt < collate->nrules; ++cnt)
527 table[slot * entry_size + 1 + cnt]
528 = pelem->ordering[collate->nrules + cnt];
530 else
531 ADD_FORWARD (pelem);
534 /* Now check for missing single byte entries. If one exist we fill
535 with the UNDEFINED entry. */
536 for (cnt = 0; cnt < 256; ++cnt)
537 /* The first weight is never 0 for existing entries. */
538 if (table[cnt * entry_size + 1] == 0)
540 /* We have to fill in the information from the UNDEFINED
541 entry. */
542 table[cnt * entry_size] = (u_int32_t) cnt;
544 if (collate->undefined.ordering_len == collate->nrules)
546 size_t inner;
548 for (inner = 0; inner < collate->nrules; ++inner)
549 if ((wchar_t)collate->undefined.ordering[collate->nrules + inner]
550 == ELLIPSIS_CHAR)
551 table[cnt * entry_size + 1 + inner] = cnt;
552 else
553 table[cnt * entry_size + 1 + inner]
554 = collate->undefined.ordering[collate->nrules + inner];
556 else
558 if (undefined_offset != UINT_MAX)
560 table[cnt * entry_size + 1] = FORWARD_CHAR;
561 table[cnt * entry_size + 2] = undefined_offset;
563 else
565 const size_t slot = cnt;
566 const size_t level = 0;
568 ADD_FORWARD (&collate->undefined);
569 undefined_offset = table[cnt * entry_size + 2];
574 /* Now we are ready for inserting the whole rest. */
575 last = NULL;
576 while (iterate_table (&collate->result, &last, (const void **) &name,
577 &len, (void **) &pelem) >= 0)
578 if (pelem->name[0] > 0xff)
580 /* Find the position. */
581 size_t slot = ((size_t) pelem->name[0]) % table_best;
582 size_t level = 0;
584 while (table[(level * table_best + slot) * entry_size + 1] != 0)
585 ++level;
586 assert (level < level_best);
588 if (pelem->name[1] == L'\0' && pelem->next == NULL
589 && pelem->ordering_len == collate->nrules)
591 /* Again a simple entry. */
592 size_t inner;
594 for (inner = 0; inner < collate->nrules; ++inner)
595 table[(level * table_best + slot) * entry_size + 1 + inner]
596 = pelem->ordering[collate->nrules + inner];
598 else
599 ADD_FORWARD (pelem);
602 /* Add the UNDEFINED entry. */
604 /* Here we have to construct the non-simple table entry. */
605 size_t idx, cnt;
607 undefined_offset = obstack_object_size (&non_simple);
609 idx = collate->nrules;
610 for (cnt = 0; cnt < collate->nrules; ++cnt)
612 size_t disp;
614 ADD_VALUE (collate->undefined.ordering[cnt]);
615 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp)
616 ADD_VALUE (collate->undefined.ordering[idx++]);
620 /* Finish the extra block. */
621 extra_len = obstack_object_size (&non_simple);
622 extra = (u_int32_t *) obstack_finish (&non_simple);
623 assert ((extra_len % sizeof (u_int32_t)) == 0);
625 /* Now we have to build the two array for the other byte ordering. */
626 table2 = (u_int32_t *) alloca (table_best * level_best * entry_size
627 * sizeof (table[0]));
628 extra2 = (u_int32_t *) alloca (extra_len);
630 for (cnt = 0; cnt < table_best * level_best * entry_size; ++cnt)
631 table2[cnt] = SWAPU32 (table[cnt]);
633 for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
634 extra2[cnt] = SWAPU32 (extra2[cnt]);
636 /* We need a simple hashing table to get a collation-element->chars
637 mapping. We again use internal hashing using a secondary hashing
638 function.
640 Each string has an associate hashing value V, computed by a
641 fixed function. To locate the string we use open addressing with
642 double hashing. The first index will be V % M, where M is the
643 size of the hashing table. If no entry is found, iterating with
644 a second, independent hashing function takes place. This second
645 value will be 1 + V % (M - 2). The approximate number of probes
646 will be
648 for unsuccessful search: (1 - N / M) ^ -1
649 for successful search: - (N / M) ^ -1 * ln (1 - N / M)
651 where N is the number of keys.
653 If we now choose M to be the next prime bigger than 4 / 3 * N,
654 we get the values 4 and 1.85 resp. Because unsuccessful searches
655 are unlikely this is a good value. Formulas: [Knuth, The Art of
656 Computer Programming, Volume 3, Sorting and Searching, 1973,
657 Addison Wesley] */
658 if (collate->elements.filled == 0)
660 /* We don't need any element table since there are no collating
661 elements. */
662 element_hash_tab_size = 0;
663 element_hash_tab = NULL;
664 element_hash_tab_ob = NULL;
665 element_string_pool_size = 0;
666 element_string_pool = NULL;
667 element_value_size = 0;
668 element_value = NULL;
669 element_value_ob = NULL;
671 else
673 void *ptr; /* Running pointer. */
674 const char *key; /* Key for current bucket. */
675 size_t keylen; /* Length of key data. */
676 const element_t *data; /* Data, i.e., the character sequence. */
678 element_hash_tab_size = next_prime ((collate->elements.filled * 4) / 3);
679 if (element_hash_tab_size < 7)
680 /* We need a minimum to make the following code work. */
681 element_hash_tab_size = 7;
683 element_hash_tab = obstack_alloc (&non_simple, (2 * element_hash_tab_size
684 * sizeof (u_int32_t)));
685 memset (element_hash_tab, '\377', (2 * element_hash_tab_size
686 * sizeof (u_int32_t)));
688 ptr = NULL;
689 while (iterate_table (&collate->elements, &ptr, (const void **) &key,
690 &keylen, (void **) &data) == 0)
692 size_t hash_val = hash_string (key, keylen);
693 size_t idx = hash_val % element_hash_tab_size;
695 if (element_hash_tab[2 * idx] != (~((u_int32_t) 0)))
697 /* We need the second hashing function. */
698 size_t c = 1 + (hash_val % (element_hash_tab_size - 2));
701 if (idx >= element_hash_tab_size - c)
702 idx -= element_hash_tab_size - c;
703 else
704 idx += c;
705 while (element_hash_tab[2 * idx] != (~((u_int32_t) 0)));
708 element_hash_tab[2 * idx] = obstack_object_size (&non_simple);
709 element_hash_tab[2 * idx + 1] = (obstack_object_size (&string_pool)
710 / sizeof (wchar_t));
712 obstack_grow0 (&non_simple, key, keylen);
713 obstack_grow (&string_pool, data->name,
714 (wcslen (data->name) + 1) * sizeof (wchar_t));
717 if (obstack_object_size (&non_simple) % 4 != 0)
718 obstack_blank (&non_simple,
719 4 - (obstack_object_size (&non_simple) % 4));
720 element_string_pool_size = obstack_object_size (&non_simple);
721 element_string_pool = obstack_finish (&non_simple);
723 element_value_size = obstack_object_size (&string_pool);
724 element_value = obstack_finish (&string_pool);
726 /* Create the tables for the other byte order. */
727 element_hash_tab_ob = obstack_alloc (&non_simple,
728 (2 * element_hash_tab_size
729 * sizeof (u_int32_t)));
730 for (cnt = 0; cnt < 2 * element_hash_tab_size; ++cnt)
731 element_hash_tab_ob[cnt] = SWAPU32 (element_hash_tab[cnt]);
733 element_value_ob = obstack_alloc (&string_pool, element_value_size);
734 if (sizeof (wchar_t) != 4)
736 fputs ("sizeof (wchar_t) != 4 currently not handled", stderr);
737 abort ();
739 for (cnt = 0; cnt < element_value_size / 4; ++cnt)
740 element_value_ob[cnt] = SWAPU32 (element_value[cnt]);
743 /* Store collation elements as map to collation class. There are
744 three kinds of symbols:
745 - simple characters
746 - collation elements
747 - collation symbols
748 We need to make a table which lets the user to access the primary
749 weight based on the symbol string. */
750 symbols_hash_tab_size = next_prime ((4 * (charset->char_table.filled
751 + collate->elements.filled
752 + collate->symbols.filled)) / 3);
753 symbols_hash_tab = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
754 * sizeof (u_int32_t)));
755 memset (symbols_hash_tab, '\377', (2 * symbols_hash_tab_size
756 * sizeof (u_int32_t)));
758 /* Now fill the array. First the symbols from the character set,
759 then the collation elements and last the collation symbols. */
760 hash_tab = &charset->char_table;
761 while (1)
763 void *ptr; /* Running pointer. */
764 const char *key; /* Key for current bucket. */
765 size_t keylen; /* Length of key data. */
766 void *data; /* Data. */
768 ptr = NULL;
769 while (iterate_table (hash_tab, &ptr, (const void **) &key,
770 &keylen, (void **) &data) == 0)
772 size_t hash_val;
773 size_t idx;
774 u_int32_t word;
775 unsigned int *weights;
777 if (hash_tab == &charset->char_table
778 || hash_tab == &collate->elements)
780 element_t *lastp, *firstp;
781 wchar_t dummy_name[2];
782 const wchar_t *name;
783 size_t name_len;
785 if (hash_tab == &charset->char_table)
787 dummy_name[0] = (wchar_t) ((unsigned long int) data);
788 dummy_name[1] = L'\0';
789 name = dummy_name;
790 name_len = sizeof (wchar_t);
792 else
794 element_t *elemp = (element_t *) data;
795 name = elemp->name;
796 name_len = wcslen (name) * sizeof (wchar_t);
799 /* First check whether this character is used at all. */
800 if (find_entry (&collate->result, name, name_len,
801 (void *) &firstp) < 0)
802 /* The symbol is not directly mentioned in the collation.
803 I.e., we use the value for UNDEFINED. */
804 lastp = &collate->undefined;
805 else
807 /* The entry for the simple character is always found at
808 the end. */
809 lastp = firstp;
810 while (lastp->next != NULL && wcscmp (name, lastp->name))
811 lastp = lastp->next;
814 weights = lastp->ordering;
816 else
818 dummy_weights[0] = 1;
819 dummy_weights[collate->nrules]
820 = (unsigned int) ((unsigned long int) data);
822 weights = dummy_weights;
825 /* In LASTP->ordering we now have the collation class.
826 Determine the place in the hashing table next. */
827 hash_val = hash_string (key, keylen);
828 idx = hash_val % symbols_hash_tab_size;
830 if (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)))
832 /* We need the second hashing function. */
833 size_t c = 1 + (hash_val % (symbols_hash_tab_size - 2));
836 if (idx >= symbols_hash_tab_size - c)
837 idx -= symbols_hash_tab_size - c;
838 else
839 idx += c;
840 while (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)));
843 symbols_hash_tab[2 * idx] = obstack_object_size (&string_pool);
844 symbols_hash_tab[2 * idx + 1] = (obstack_object_size (&non_simple)
845 / sizeof (u_int32_t));
847 obstack_grow0 (&string_pool, key, keylen);
848 /* Adding the first weight looks complicated. We have to deal
849 with the kind it is stored and with the fact that original
850 form uses `unsigned int's while we need `u_int32_t' here. */
851 word = weights[0];
852 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
853 for (cnt = 0; cnt < weights[0]; ++cnt)
855 word = weights[collate->nrules + cnt];
856 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
860 if (hash_tab == &charset->char_table)
861 hash_tab = &collate->elements;
862 else if (hash_tab == &collate->elements)
863 hash_tab = &collate->symbols;
864 else
865 break;
868 /* Now we have the complete tables. */
869 if (obstack_object_size (&string_pool) % 4 != 0)
870 obstack_blank (&non_simple, 4 - (obstack_object_size (&string_pool) % 4));
871 symbols_string_pool_size = obstack_object_size (&string_pool);
872 symbols_string_pool = obstack_finish (&string_pool);
874 symbols_class_size = obstack_object_size (&non_simple);
875 symbols_class = obstack_finish (&non_simple);
877 /* Generate tables with other byte order. */
878 symbols_hash_tab_ob = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
879 * sizeof (u_int32_t)));
880 for (cnt = 0; cnt < 2 * symbols_hash_tab_size; ++cnt)
881 symbols_hash_tab_ob[cnt] = SWAPU32 (symbols_hash_tab[cnt]);
883 symbols_class_ob = obstack_alloc (&non_simple, symbols_class_size);
884 for (cnt = 0; cnt < symbols_class_size / 4; ++cnt)
885 symbols_class_ob[cnt] = SWAPU32 (symbols_class[cnt]);
888 /* Store table addresses and lengths. */
889 #if __BYTE_ORDER == __BIG_ENDIAN
890 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
891 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
892 = table_best * level_best * entry_size * sizeof (table[0]);
894 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table2;
895 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
896 = table_best * level_best * entry_size * sizeof (table[0]);
898 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra;
899 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
901 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra2;
902 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
903 #else
904 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table2;
905 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
906 = table_best * level_best * entry_size * sizeof (table[0]);
908 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table;
909 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
910 = table_best * level_best * entry_size * sizeof (table[0]);
912 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra2;
913 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
915 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra;
916 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
917 #endif
919 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
920 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
923 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_base
924 = &element_hash_tab_size;
925 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_len
926 = sizeof (u_int32_t);
928 #if __BYTE_ORDER == __BIG_ENDIAN
929 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
930 = element_hash_tab;
931 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
932 = 2 * element_hash_tab_size * sizeof (u_int32_t);
934 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
935 = element_hash_tab_ob;
936 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
937 = 2 * element_hash_tab_size * sizeof (u_int32_t);
938 #else
939 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
940 = element_hash_tab;
941 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
942 = 2 * element_hash_tab_size * sizeof (u_int32_t);
944 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
945 = element_hash_tab_ob;
946 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
947 = 2 * element_hash_tab_size * sizeof (u_int32_t);
948 #endif
950 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_base
951 = element_string_pool;
952 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_len
953 = element_string_pool_size;
955 #if __BYTE_ORDER == __BIG_ENDIAN
956 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
957 = element_value;
958 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
959 = element_value_size;
961 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
962 = element_value_ob;
963 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
964 = element_value_size;
965 #else
966 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
967 = element_value;
968 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
969 = element_value_size;
971 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
972 = element_value_ob;
973 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
974 = element_value_size;
975 #endif
977 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_base
978 = &symbols_hash_tab_size;
979 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_len
980 = sizeof (u_int32_t);
982 #if __BYTE_ORDER == __BIG_ENDIAN
983 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
984 = symbols_hash_tab;
985 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
986 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
988 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
989 = symbols_hash_tab_ob;
990 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
991 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
992 #else
993 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
994 = symbols_hash_tab;
995 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
996 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
998 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
999 = symbols_hash_tab_ob;
1000 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
1001 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
1002 #endif
1004 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_base
1005 = symbols_string_pool;
1006 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_len
1007 = symbols_string_pool_size;
1009 #if __BYTE_ORDER == __BIG_ENDIAN
1010 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1011 = symbols_class;
1012 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1013 = symbols_class_size;
1015 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1016 = symbols_class_ob;
1017 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1018 = symbols_class_size;
1019 #else
1020 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1021 = symbols_class;
1022 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1023 = symbols_class_size;
1025 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1026 = symbols_class_ob;
1027 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1028 = symbols_class_size;
1029 #endif
1031 /* Update idx array. */
1032 idx[0] = iov[0].iov_len + iov[1].iov_len;
1033 for (cnt = 1; cnt < nelems; ++cnt)
1034 idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
1036 write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
1038 obstack_free (&non_simple, NULL);
1039 obstack_free (&string_pool, NULL);
1043 void
1044 collate_element_to (struct linereader *lr, struct localedef_t *locale,
1045 struct token *code, struct charset_t *charset)
1047 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1048 unsigned int value;
1049 void *not_used;
1051 if (collate->combine_token != NULL)
1053 free ((void *) collate->combine_token);
1054 collate->combine_token = NULL;
1057 value = charset_find_value (&charset->char_table, code->val.str.start,
1058 code->val.str.len);
1059 if ((wchar_t) value != ILLEGAL_CHAR_VALUE)
1061 lr_error (lr, _("symbol for multicharacter collating element "
1062 "`%.*s' duplicates symbolic name in charset"),
1063 (int) code->val.str.len, code->val.str.start);
1064 return;
1067 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1068 &not_used) >= 0)
1070 lr_error (lr, _("symbol for multicharacter collating element "
1071 "`%.*s' duplicates other element definition"),
1072 (int) code->val.str.len, code->val.str.start);
1073 return;
1076 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1077 &not_used) >= 0)
1079 lr_error (lr, _("symbol for multicharacter collating element "
1080 "`%.*s' duplicates symbol definition"),
1081 (int) code->val.str.len, code->val.str.start);
1082 return;
1085 collate->combine_token = code->val.str.start;
1086 collate->combine_token_len = code->val.str.len;
1090 void
1091 collate_element_from (struct linereader *lr, struct localedef_t *locale,
1092 struct token *code, struct charset_t *charset)
1094 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1095 element_t *elemp, *runp;
1097 /* CODE is a string. */
1098 elemp = (element_t *) obstack_alloc (&collate->element_mem,
1099 sizeof (element_t));
1101 /* We have to translate the string. It may contain <...> character
1102 names. */
1103 elemp->name = (wchar_t *) translate_string (code->val.str.start, charset);
1104 elemp->this_weight = 0;
1105 elemp->ordering = NULL;
1106 elemp->ordering_len = 0;
1108 free (code->val.str.start);
1110 if (elemp->name == NULL)
1112 /* At least one character in the string is not defined. We simply
1113 do nothing. */
1114 if (verbose)
1115 lr_error (lr, _("\
1116 `from' string in collation element declaration contains unknown character"));
1117 return;
1120 if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
1122 lr_error (lr, _("illegal collation element"));
1123 return;
1126 /* The entries in the linked lists of RESULT are sorting in
1127 descending order. The order is important for the `strcoll' and
1128 `wcscoll' functions. */
1129 if (find_entry (&collate->result, elemp->name, sizeof (wchar_t),
1130 (void *) &runp) >= 0)
1132 /* We already have an entry with this key. Check whether it is
1133 identical. */
1134 element_t *prevp = NULL;
1135 int cmpres;
1139 cmpres = wcscmp (elemp->name, runp->name);
1140 if (cmpres <= 0)
1141 break;
1142 prevp = runp;
1144 while ((runp = runp->next) != NULL);
1146 if (cmpres == 0)
1147 lr_error (lr, _("duplicate collating element definition"));
1148 else
1150 elemp->next = runp;
1151 if (prevp == NULL)
1153 if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
1154 elemp) < 0)
1155 error (EXIT_FAILURE, 0, _("\
1156 error while inserting collation element into hash table"));
1158 else
1159 prevp->next = elemp;
1162 else
1164 elemp->next = NULL;
1165 if (insert_entry (&collate->result, elemp->name, sizeof (wchar_t), elemp)
1166 < 0)
1167 error (EXIT_FAILURE, errno, _("error while inserting to hash table"));
1170 if (insert_entry (&collate->elements, collate->combine_token,
1171 collate->combine_token_len, (void *) elemp) < 0)
1172 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1173 strerror (errno));
1177 void
1178 collate_symbol (struct linereader *lr, struct localedef_t *locale,
1179 struct token *code, struct charset_t *charset)
1181 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1182 wchar_t value;
1183 void *not_used;
1185 value = charset_find_value (&charset->char_table, code->val.str.start,
1186 code->val.str.len);
1187 if (value != ILLEGAL_CHAR_VALUE)
1189 lr_error (lr, _("symbol for multicharacter collating element "
1190 "`%.*s' duplicates symbolic name in charset"),
1191 (int) code->val.str.len, code->val.str.start);
1192 return;
1195 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1196 &not_used) >= 0)
1198 lr_error (lr, _("symbol for multicharacter collating element "
1199 "`%.*s' duplicates element definition"),
1200 (int) code->val.str.len, code->val.str.start);
1201 return;
1204 if (find_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1205 &not_used) >= 0)
1207 lr_error (lr, _("symbol for multicharacter collating element "
1208 "`%.*s' duplicates other symbol definition"),
1209 (int) code->val.str.len, code->val.str.start);
1210 return;
1213 if (insert_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1214 (void *) 0) < 0)
1215 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1216 strerror (errno));
1220 void
1221 collate_new_order (struct linereader *lr, struct localedef_t *locale,
1222 enum coll_sort_rule sort_rule)
1224 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1226 if (collate->nrules >= collate->nrules_max)
1228 collate->nrules_max *= 2;
1229 collate->rules
1230 = (enum coll_sort_rule *) xrealloc (collate->rules,
1231 collate->nrules_max
1232 * sizeof (enum coll_sort_rule));
1235 collate->rules[collate->nrules++] = sort_rule;
1239 void
1240 collate_build_arrays (struct linereader *lr, struct localedef_t *locale)
1242 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1244 collate->rules
1245 = (enum coll_sort_rule *) xrealloc (collate->rules,
1246 collate->nrules
1247 * sizeof (enum coll_sort_rule));
1249 /* Allocate arrays for temporary weights. */
1250 collate->weight_cnt = (int *) xmalloc (collate->nrules * sizeof (int));
1252 /* Choose arbitrary start value for table size. */
1253 collate->nweight_max = 5 * collate->nrules;
1254 collate->weight = (int *) xmalloc (collate->nweight_max * sizeof (int));
1259 collate_order_elem (struct linereader *lr, struct localedef_t *locale,
1260 struct token *code, struct charset_t *charset)
1262 const wchar_t zero = L'\0';
1263 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1264 int result = 0;
1265 wchar_t value;
1266 void *tmp;
1267 unsigned int i;
1269 switch (code->tok)
1271 case tok_bsymbol:
1272 /* We have a string to find in one of the three hashing tables. */
1273 value = charset_find_value (&charset->char_table, code->val.str.start,
1274 code->val.str.len);
1275 if (value != ILLEGAL_CHAR_VALUE)
1277 element_t *lastp, *firstp;
1279 collate->kind = character;
1281 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1282 (void *) &firstp) < 0)
1283 firstp = lastp = NULL;
1284 else
1286 /* The entry for the simple character is always found at
1287 the end. */
1288 lastp = firstp;
1289 while (lastp->next != NULL)
1290 lastp = lastp->next;
1292 if (lastp->name[0] == value && lastp->name[1] == L'\0')
1294 lr_error (lr, _("duplicate definition for character `%.*s'"),
1295 (int) code->val.str.len, code->val.str.start);
1296 lr_ignore_rest (lr, 0);
1297 result = -1;
1298 break;
1302 collate->current_element
1303 = (element_t *) obstack_alloc (&collate->element_mem,
1304 sizeof (element_t));
1306 obstack_grow (&collate->element_mem, &value, sizeof (value));
1307 obstack_grow (&collate->element_mem, &zero, sizeof (zero));
1309 collate->current_element->name =
1310 (const wchar_t *) obstack_finish (&collate->element_mem);
1312 collate->current_element->this_weight = ++collate->order_cnt;
1314 collate->current_element->next = NULL;
1316 if (firstp == NULL)
1318 if (insert_entry (&collate->result, &value, sizeof (wchar_t),
1319 (void *) collate->current_element) < 0)
1321 lr_error (lr, _("cannot insert collation element `%.*s'"),
1322 (int) code->val.str.len, code->val.str.start);
1323 exit (4);
1326 else
1327 lastp->next = collate->current_element;
1329 else if (find_entry (&collate->elements, code->val.str.start,
1330 code->val.str.len, &tmp) >= 0)
1332 collate->current_element = (element_t *) tmp;
1334 if (collate->current_element->this_weight != 0)
1336 lr_error (lr, _("\
1337 collation element `%.*s' appears more than once: ignore line"),
1338 (int) code->val.str.len, code->val.str.start);
1339 lr_ignore_rest (lr, 0);
1340 result = -1;
1341 break;
1344 collate->kind = element;
1345 collate->current_element->this_weight = ++collate->order_cnt;
1347 else if (find_entry (&collate->symbols, code->val.str.start,
1348 code->val.str.len, &tmp) >= 0)
1350 unsigned int order = ++collate->order_cnt;
1352 if ((unsigned long int) tmp != 0ul)
1354 lr_error (lr, _("\
1355 collation symbol `%.*s' appears more than once: ignore line"),
1356 (int) code->val.str.len, code->val.str.start);
1357 lr_ignore_rest (lr, 0);
1358 result = -1;
1359 break;
1362 collate->kind = symbol;
1364 if (set_entry (&collate->symbols, code->val.str.start,
1365 code->val.str.len, (void *) order) < 0)
1367 lr_error (lr, _("cannot process order specification"));
1368 exit (4);
1371 else
1373 if (verbose)
1374 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1375 (int) code->val.str.len, code->val.str.start);
1376 lr_ignore_rest (lr, 0);
1378 result = -1;
1380 break;
1382 case tok_undefined:
1383 collate->kind = undefined;
1384 collate->current_element = &collate->undefined;
1385 break;
1387 case tok_ellipsis:
1388 if (collate->was_ellipsis)
1390 lr_error (lr, _("\
1391 two lines in a row containing `...' are not allowed"));
1392 result = -1;
1394 else if (collate->kind != character)
1396 /* An ellipsis requires the previous line to be an
1397 character definition. */
1398 lr_error (lr, _("\
1399 line before ellipsis does not contain definition for character constant"));
1400 lr_ignore_rest (lr, 0);
1401 result = -1;
1403 else
1404 collate->kind = ellipsis;
1405 break;
1407 default:
1408 assert (! "illegal token in `collate_order_elem'");
1411 /* Now it's time to handle the ellipsis in the previous line. We do
1412 this only when the last line contained an definition for a
1413 character, the current line also defines an character, the
1414 character code for the later is bigger than the former. */
1415 if (collate->was_ellipsis)
1417 if (collate->kind != character)
1419 lr_error (lr, _("\
1420 line after ellipsis must contain character definition"));
1421 lr_ignore_rest (lr, 0);
1422 result = -1;
1424 else if (collate->last_char > value)
1426 lr_error (lr, _("end point of ellipsis range is bigger then start"));
1427 lr_ignore_rest (lr, 0);
1428 result = -1;
1430 else
1432 /* We can fill the arrays with the information we need. */
1433 wchar_t name[2];
1434 unsigned int *data;
1435 size_t *ptr;
1436 size_t cnt;
1438 name[0] = collate->last_char + 1;
1439 name[1] = L'\0';
1441 data = (unsigned int *) alloca ((collate->nrules + collate->nweight)
1442 * sizeof (unsigned int));
1443 ptr = (size_t *) alloca (collate->nrules * sizeof (size_t));
1445 if (data == NULL || ptr == NULL)
1446 error (4, 0, _("memory exhausted"));
1448 /* Prepare data. Because the characters covered by an
1449 ellipsis all have equal values we prepare the data once
1450 and only change the variable number (if there are any).
1451 PTR[...] will point to the entries which will have to be
1452 fixed during the output loop. */
1453 for (cnt = 0; cnt < collate->nrules; ++cnt)
1455 data[cnt] = collate->weight_cnt[cnt];
1456 ptr[cnt] = (cnt == 0
1457 ? collate->nweight
1458 : ptr[cnt - 1] + collate->weight_cnt[cnt - 1]);
1461 for (cnt = 0; cnt < collate->nweight; ++cnt)
1462 data[collate->nrules + cnt] = collate->weight[cnt];
1464 for (cnt = 0; cnt < collate->nrules; ++cnt)
1465 if ((wchar_t) data[ptr[cnt]] != ELLIPSIS_CHAR)
1466 ptr[cnt] = 0;
1468 while (name[0] <= value)
1470 element_t *pelem;
1472 pelem = (element_t *) obstack_alloc (&collate->element_mem,
1473 sizeof (element_t));
1474 if (pelem == NULL)
1475 error (4, 0, _("memory exhausted"));
1477 pelem->name
1478 = (const wchar_t *) obstack_copy (&collate->element_mem,
1479 name, 2 * sizeof (wchar_t));
1480 pelem->this_weight = ++collate->order_cnt;
1482 pelem->ordering_len = collate->nweight;
1483 pelem->ordering
1484 = (unsigned int *) obstack_copy (&collate->element_mem, data,
1485 (collate->nrules
1486 + pelem->ordering_len)
1487 * sizeof (unsigned int));
1489 /* `...' weights need to be adjusted. */
1490 for (cnt = 0; cnt < collate->nrules; ++cnt)
1491 if (ptr[cnt] != 0)
1492 pelem->ordering[ptr[cnt]] = pelem->this_weight;
1494 /* Insert new entry into result table. */
1495 if (find_entry (&collate->result, name, sizeof (wchar_t),
1496 (void *) &pelem->next) >= 0)
1498 if (set_entry (&collate->result, name, sizeof (wchar_t),
1499 (void *) pelem) < 0)
1500 error (4, 0, _("cannot insert into result table"));
1502 else
1504 pelem->next = NULL;
1505 if (insert_entry (&collate->result, name, sizeof (wchar_t),
1506 (void *) pelem) < 0)
1507 error (4, 0, _("cannot insert into result table"));
1510 /* Increment counter. */
1511 ++name[0];
1516 /* Reset counters for weights. */
1517 collate->weight_idx = 0;
1518 collate->nweight = 0;
1519 for (i = 0; i < collate->nrules; ++i)
1520 collate->weight_cnt[i] = 0;
1521 collate->current_patch = NULL;
1523 return result;
1528 collate_weight_bsymbol (struct linereader *lr, struct localedef_t *locale,
1529 struct token *code, struct charset_t *charset)
1531 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1532 unsigned int here_weight;
1533 wchar_t value;
1534 void *tmp;
1536 assert (code->tok == tok_bsymbol);
1538 value = charset_find_value (&charset->char_table, code->val.str.start,
1539 code->val.str.len);
1540 if (value != ILLEGAL_CHAR_VALUE)
1542 element_t *runp;
1544 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1545 (void *)&runp) < 0)
1546 runp = NULL;
1548 while (runp != NULL
1549 && (runp->name[0] != value || runp->name[1] != L'\0'))
1550 runp = runp->next;
1552 here_weight = runp == NULL ? 0 : runp->this_weight;
1554 else if (find_entry (&collate->elements, code->val.str.start,
1555 code->val.str.len, &tmp) >= 0)
1557 element_t *runp = (element_t *) tmp;
1559 here_weight = runp->this_weight;
1561 else if (find_entry (&collate->symbols, code->val.str.start,
1562 code->val.str.len, &tmp) >= 0)
1564 here_weight = (unsigned int) tmp;
1566 else
1568 if (verbose)
1569 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1570 (int) code->val.str.len, code->val.str.start);
1571 lr_ignore_rest (lr, 0);
1572 return -1;
1575 /* When we currently work on a collation symbol we do not expect any
1576 weight. */
1577 if (collate->kind == symbol)
1579 lr_error (lr, _("\
1580 specification of sorting weight for collation symbol does not make sense"));
1581 lr_ignore_rest (lr, 0);
1582 return -1;
1585 /* Add to the current collection of weights. */
1586 if (collate->nweight >= collate->nweight_max)
1588 collate->nweight_max *= 2;
1589 collate->weight = (unsigned int *) xrealloc (collate->weight,
1590 collate->nweight_max);
1593 /* If the weight is currently not known, we remember to patch the
1594 resulting tables. */
1595 if (here_weight == 0)
1597 patch_t *newp;
1599 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1600 sizeof (patch_t));
1601 newp->fname = lr->fname;
1602 newp->lineno = lr->lineno;
1603 newp->token = (const char *) obstack_copy0 (&collate->element_mem,
1604 code->val.str.start,
1605 code->val.str.len);
1606 newp->where.idx = collate->nweight++;
1607 newp->next = collate->current_patch;
1608 collate->current_patch = newp;
1610 else
1611 collate->weight[collate->nweight++] = here_weight;
1612 ++collate->weight_cnt[collate->weight_idx];
1614 return 0;
1619 collate_next_weight (struct linereader *lr, struct localedef_t *locale)
1621 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1623 if (collate->kind == symbol)
1625 lr_error (lr, _("\
1626 specification of sorting weight for collation symbol does not make sense"));
1627 lr_ignore_rest (lr, 0);
1628 return -1;
1631 ++collate->weight_idx;
1632 if (collate->weight_idx >= collate->nrules)
1634 lr_error (lr, _("too many weights"));
1635 lr_ignore_rest (lr, 0);
1636 return -1;
1639 return 0;
1644 collate_simple_weight (struct linereader *lr, struct localedef_t *locale,
1645 struct token *code, struct charset_t *charset)
1647 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1648 unsigned int value = 0;
1650 /* There current tokens can be `IGNORE', `...', or a string. */
1651 switch (code->tok)
1653 case tok_ignore:
1654 /* This token is allowed in all situations. */
1655 value = IGNORE_CHAR;
1656 break;
1658 case tok_ellipsis:
1659 /* The ellipsis is only allowed for the `...' or `UNDEFINED'
1660 entry. */
1661 if (collate->kind != ellipsis && collate->kind != undefined)
1663 lr_error (lr, _("\
1664 `...' must only be used in `...' and `UNDEFINED' entries"));
1665 lr_ignore_rest (lr, 0);
1666 return -1;
1668 value = ELLIPSIS_CHAR;
1669 break;
1671 case tok_string:
1672 /* This can become difficult. We have to get the weights which
1673 correspond to the single wide chars in the string. But some
1674 of the `chars' might not be real characters, but collation
1675 elements or symbols. And so the string decoder might have
1676 signaled errors. The string at this point is not translated.
1677 I.e., all <...> sequences are still there. */
1679 char *runp = code->val.str.start;
1680 void *tmp;
1682 while (*runp != '\0')
1684 char *startp = (char *) runp;
1685 char *putp = (char *) runp;
1686 wchar_t wch;
1688 /* Lookup weight for char and store it. */
1689 if (*runp == '<')
1691 while (*++runp != '\0' && *runp != '>')
1693 if (*runp == lr->escape_char)
1694 if (*++runp == '\0')
1696 lr_error (lr, _("unterminated weight name"));
1697 lr_ignore_rest (lr, 0);
1698 return -1;
1700 *putp++ = *runp;
1702 if (*runp == '>')
1703 ++runp;
1705 if (putp == startp)
1707 lr_error (lr, _("empty weight name: line ignored"));
1708 lr_ignore_rest (lr, 0);
1709 return -1;
1712 wch = charset_find_value (&charset->char_table, startp,
1713 putp - startp);
1714 if (wch != ILLEGAL_CHAR_VALUE)
1716 element_t *pelem;
1718 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
1719 (void *)&pelem) < 0)
1720 pelem = NULL;
1722 while (pelem != NULL
1723 && (pelem->name[0] != wch
1724 || pelem->name[1] != L'\0'))
1725 pelem = pelem->next;
1727 value = pelem == NULL ? 0 : pelem->this_weight;
1729 else if (find_entry (&collate->elements, startp, putp - startp,
1730 &tmp) >= 0)
1732 element_t *pelem = (element_t *) tmp;
1734 value = pelem->this_weight;
1736 else if (find_entry (&collate->symbols, startp, putp - startp,
1737 &tmp) >= 0)
1739 value = (unsigned int) tmp;
1741 else
1743 if (verbose)
1744 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1745 (int) (putp - startp), startp);
1746 lr_ignore_rest (lr, 0);
1747 return -1;
1750 else
1752 element_t *wp;
1753 wchar_t wch;
1755 if (*runp == lr->escape_char)
1757 static const char digits[] = "0123456789abcdef";
1758 const char *dp;
1759 int base;
1761 ++runp;
1762 if (tolower (*runp) == 'x')
1764 ++runp;
1765 base = 16;
1767 else if (tolower (*runp) == 'd')
1769 ++runp;
1770 base = 10;
1772 else
1773 base = 8;
1775 dp = strchr (digits, tolower (*runp));
1776 if (dp == NULL || (dp - digits) >= base)
1778 illegal_char:
1779 lr_error (lr, _("\
1780 illegal character constant in string"));
1781 lr_ignore_rest (lr, 0);
1782 return -1;
1784 wch = dp - digits;
1785 ++runp;
1787 dp = strchr (digits, tolower (*runp));
1788 if (dp == NULL || (dp - digits) >= base)
1789 goto illegal_char;
1790 wch *= base;
1791 wch += dp - digits;
1792 ++runp;
1794 if (base != 16)
1796 dp = strchr (digits, tolower (*runp));
1797 if (dp != NULL && (dp - digits < base))
1799 wch *= base;
1800 wch += dp - digits;
1801 ++runp;
1805 else
1806 wch = (wchar_t) *runp++;
1808 /* Lookup the weight for WCH. */
1809 if (find_entry (&collate->result, &wch, sizeof (wch),
1810 (void *)&wp) < 0)
1811 wp = NULL;
1813 while (wp != NULL
1814 && (wp->name[0] != wch || wp->name[1] != L'\0'))
1815 wp = wp->next;
1817 value = wp == NULL ? 0 : wp->this_weight;
1819 /* To get the correct name for the error message. */
1820 putp = runp;
1822 /**************************************************\
1823 |* I know here is something wrong. Characters in *|
1824 |* the string which are not in the <...> form *|
1825 |* cannot be declared forward for now!!! *|
1826 \**************************************************/
1829 /* Store in weight array. */
1830 if (collate->nweight >= collate->nweight_max)
1832 collate->nweight_max *= 2;
1833 collate->weight
1834 = (unsigned int *) xrealloc (collate->weight,
1835 collate->nweight_max);
1838 if (value == 0)
1840 patch_t *newp;
1842 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1843 sizeof (patch_t));
1844 newp->fname = lr->fname;
1845 newp->lineno = lr->lineno;
1846 newp->token
1847 = (const char *) obstack_copy0 (&collate->element_mem,
1848 startp, putp - startp);
1849 newp->where.idx = collate->nweight++;
1850 newp->next = collate->current_patch;
1851 collate->current_patch = newp;
1853 else
1854 collate->weight[collate->nweight++] = value;
1855 ++collate->weight_cnt[collate->weight_idx];
1858 return 0;
1860 default:
1861 assert (! "should not happen");
1865 if (collate->nweight >= collate->nweight_max)
1867 collate->nweight_max *= 2;
1868 collate->weight = (unsigned int *) xrealloc (collate->weight,
1869 collate->nweight_max);
1872 collate->weight[collate->nweight++] = value;
1873 ++collate->weight_cnt[collate->weight_idx];
1875 return 0;
1879 void
1880 collate_end_weight (struct linereader *lr, struct localedef_t *locale)
1882 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1883 element_t *pelem = collate->current_element;
1885 if (collate->kind == symbol)
1887 /* We don't have to do anything. */
1888 collate->was_ellipsis = 0;
1889 return;
1892 if (collate->kind == ellipsis)
1894 /* Before the next line is processed the ellipsis is handled. */
1895 collate->was_ellipsis = 1;
1896 return;
1899 assert (collate->kind == character || collate->kind == element
1900 || collate->kind == undefined);
1902 /* Fill in the missing weights. */
1903 while (++collate->weight_idx < collate->nrules)
1905 collate->weight[collate->nweight++] = pelem->this_weight;
1906 ++collate->weight_cnt[collate->weight_idx];
1909 /* Now we know how many ordering weights the current
1910 character/element has. Allocate room in the element structure
1911 and copy information. */
1912 pelem->ordering_len = collate->nweight;
1914 /* First we write an array with the number of values for each
1915 weight. */
1916 obstack_grow (&collate->element_mem, collate->weight_cnt,
1917 collate->nrules * sizeof (unsigned int));
1919 /* Now the weights itselves. */
1920 obstack_grow (&collate->element_mem, collate->weight,
1921 collate->nweight * sizeof (unsigned int));
1923 /* Get result. */
1924 pelem->ordering = obstack_finish (&collate->element_mem);
1926 /* Now we handle the "patches". */
1927 while (collate->current_patch != NULL)
1929 patch_t *this_patch;
1931 this_patch = collate->current_patch;
1933 this_patch->where.pos = &pelem->ordering[collate->nrules
1934 + this_patch->where.idx];
1936 collate->current_patch = this_patch->next;
1937 this_patch->next = collate->all_patches;
1938 collate->all_patches = this_patch;
1941 /* Set information for next round. */
1942 collate->was_ellipsis = 0;
1943 if (collate->kind != undefined)
1944 collate->last_char = pelem->name[0];