Update.
[glibc.git] / locale / programs / ld-collate.c
blob772ab1af33fca4e94489b77cc207baf4e2aa08b3
1 /* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <obstack.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "localeinfo.h"
34 #include "locales.h"
35 #include "simple-hash.h"
36 #include "stringtrans.h"
37 #include "strlen-hash.h"
39 /* Uncomment the following line in the production version. */
40 /* #define NDEBUG 1 */
41 #include <assert.h>
44 #define MAX(a, b) ((a) > (b) ? (a) : (b))
46 #define SWAPU32(w) \
47 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
50 /* What kind of symbols get defined? */
51 enum coll_symbol
53 undefined,
54 ellipsis,
55 character,
56 element,
57 symbol
61 typedef struct patch_t
63 const char *fname;
64 size_t lineno;
65 const char *token;
66 union
68 unsigned int *pos;
69 size_t idx;
70 } where;
71 struct patch_t *next;
72 } patch_t;
75 typedef struct element_t
77 const wchar_t *name;
78 unsigned int this_weight;
80 struct element_t *next;
82 unsigned int *ordering;
83 size_t ordering_len;
84 } element_t;
87 /* The real definition of the struct for the LC_COLLATE locale. */
88 struct locale_collate_t
90 /* Collate symbol table. Simple mapping to number. */
91 hash_table symbols;
93 /* The collation elements. */
94 hash_table elements;
95 struct obstack element_mem;
97 /* The result table. */
98 hash_table result;
100 /* Sorting rules given in order_start line. */
101 u_int32_t nrules;
102 u_int32_t nrules_max;
103 enum coll_sort_rule *rules;
105 /* Used while recognizing symbol composed of multiple tokens
106 (collating-element). */
107 const char *combine_token;
108 size_t combine_token_len;
110 /* How many sorting order specifications so far. */
111 unsigned int order_cnt;
113 /* Was lastline ellipsis? */
114 int was_ellipsis;
115 /* Value of last entry if was character. */
116 wchar_t last_char;
117 /* Current element. */
118 element_t *current_element;
119 /* What kind of symbol is current element. */
120 enum coll_symbol kind;
122 /* While collecting the weights we need some temporary space. */
123 unsigned int current_order;
124 int *weight_cnt;
125 unsigned int weight_idx;
126 unsigned int *weight;
127 size_t nweight;
128 size_t nweight_max;
130 /* Patch lists. */
131 patch_t *current_patch;
132 patch_t *all_patches;
134 /* Room for the UNDEFINED information. */
135 element_t undefined;
136 unsigned int undefined_len;
140 /* Be verbose? Defined in localedef.c. */
141 extern int verbose;
144 void *xmalloc (size_t __n);
145 void *xrealloc (void *__p, size_t __n);
148 #define obstack_chunk_alloc malloc
149 #define obstack_chunk_free free
152 void
153 collate_startup (struct linereader *lr, struct localedef_t *locale,
154 struct charset_t *charset)
156 struct locale_collate_t *collate;
158 /* We have a definition for LC_COLLATE. */
159 copy_posix.mask &= ~(1 << LC_COLLATE);
161 /* It is important that we always use UCS4 encoding for strings now. */
162 encoding_method = ENC_UCS4;
164 /* Allocate the needed room. */
165 locale->categories[LC_COLLATE].collate = collate =
166 (struct locale_collate_t *) xmalloc (sizeof (struct locale_collate_t));
168 /* Allocate hash table for collating elements. */
169 if (init_hash (&collate->elements, 512))
170 error (4, 0, _("memory exhausted"));
171 collate->combine_token = NULL;
172 obstack_init (&collate->element_mem);
174 /* Allocate hash table for collating elements. */
175 if (init_hash (&collate->symbols, 64))
176 error (4, 0, _("memory exhausted"));
178 /* Allocate hash table for result. */
179 if (init_hash (&collate->result, 512))
180 error (4, 0, _("memory exhausted"));
182 collate->nrules = 0;
183 collate->nrules_max = 10;
184 collate->rules
185 = (enum coll_sort_rule *) xmalloc (collate->nrules_max
186 * sizeof (enum coll_sort_rule));
188 collate->order_cnt = 1; /* The smallest weight is 2. */
190 collate->was_ellipsis = 0;
191 collate->last_char = L'\0'; /* 0 because leading ellipsis is allowed. */
193 collate->all_patches = NULL;
195 /* This tells us no UNDEFINED entry was found until now. */
196 memset (&collate->undefined, '\0', sizeof (collate->undefined));
198 lr->translate_strings = 0;
202 void
203 collate_finish (struct localedef_t *locale, struct charset_t *charset)
205 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
206 patch_t *patch;
207 size_t cnt;
209 /* Patch the constructed table so that forward references are
210 correctly filled. */
211 for (patch = collate->all_patches; patch != NULL; patch = patch->next)
213 wchar_t wch;
214 size_t toklen = strlen (patch->token);
215 void *ptmp;
216 unsigned int value = 0;
218 wch = charset_find_value (&charset->char_table, patch->token, toklen);
219 if (wch != ILLEGAL_CHAR_VALUE)
221 element_t *runp;
223 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
224 (void *) &runp) < 0)
225 runp = NULL;
226 for (; runp != NULL; runp = runp->next)
227 if (runp->name[0] == wch && runp->name[1] == L'\0')
228 break;
230 value = runp == NULL ? 0 : runp->this_weight;
232 else if (find_entry (&collate->elements, patch->token, toklen, &ptmp)
233 >= 0)
235 value = ((element_t *) ptmp)->this_weight;
237 else if (find_entry (&collate->symbols, patch->token, toklen, &ptmp)
238 >= 0)
240 value = (unsigned long int) ptmp;
242 else
243 value = 0;
245 if (value == 0)
247 if (!be_quiet)
248 error_at_line (0, 0, patch->fname, patch->lineno,
249 _("no weight defined for symbol `%s'"),
250 patch->token);
252 else
253 *patch->where.pos = value;
256 /* If no definition for UNDEFINED is given, all characters in the
257 given charset must be specified. */
258 if (collate->undefined.ordering == NULL)
260 /**************************************************************\
261 |* XXX We should test whether really an unspecified character *|
262 |* exists before giving the message. *|
263 \**************************************************************/
264 u_int32_t weight;
266 if (/* XXX Remove the 0 & */ 0 && !be_quiet)
267 error (0, 0, _("no definition of `UNDEFINED'"));
269 collate->undefined.ordering_len = collate->nrules;
270 weight = ++collate->order_cnt;
272 for (cnt = 0; cnt < collate->nrules; ++cnt)
274 u_int32_t one = 1;
275 obstack_grow (&collate->element_mem, &one, sizeof (one));
278 for (cnt = 0; cnt < collate->nrules; ++cnt)
279 obstack_grow (&collate->element_mem, &weight, sizeof (weight));
281 collate->undefined.ordering = obstack_finish (&collate->element_mem);
284 collate->undefined_len = 2; /* For the name: 1 x wchar_t + L'\0'. */
285 for (cnt = 0; cnt < collate->nrules; ++cnt)
286 collate->undefined_len += 1 + collate->undefined.ordering[cnt];
291 void
292 collate_output (struct localedef_t *locale, struct charset_t *charset,
293 const char *output_path)
295 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
296 u_int32_t table_size, table_best, level_best, sum_best;
297 void *last;
298 element_t *pelem;
299 wchar_t *name;
300 size_t len;
301 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
302 struct iovec iov[2 + nelems];
303 struct locale_file data;
304 u_int32_t idx[nelems];
305 struct obstack non_simple;
306 struct obstack string_pool;
307 size_t cnt, entry_size;
308 u_int32_t undefined_offset = UINT_MAX;
309 u_int32_t *table, *extra, *table2, *extra2;
310 size_t extra_len;
311 u_int32_t element_hash_tab_size;
312 u_int32_t *element_hash_tab;
313 u_int32_t *element_hash_tab_ob;
314 u_int32_t element_string_pool_size;
315 char *element_string_pool;
316 u_int32_t element_value_size;
317 wchar_t *element_value;
318 wchar_t *element_value_ob;
319 u_int32_t symbols_hash_tab_size;
320 u_int32_t *symbols_hash_tab;
321 u_int32_t *symbols_hash_tab_ob;
322 u_int32_t symbols_string_pool_size;
323 char *symbols_string_pool;
324 u_int32_t symbols_class_size;
325 u_int32_t *symbols_class;
326 u_int32_t *symbols_class_ob;
327 hash_table *hash_tab;
328 unsigned int dummy_weights[collate->nrules + 1];
330 sum_best = UINT_MAX;
331 table_best = 0xffff;
332 level_best = 0xffff;
334 /* Compute table size. */
335 if (!be_quiet)
336 fputs (_("\
337 Computing table size for collation information might take a while..."),
338 stderr);
339 for (table_size = 256; table_size < sum_best; ++table_size)
341 size_t hits[table_size];
342 unsigned int worst = 1;
343 size_t cnt;
345 last = NULL;
347 for (cnt = 0; cnt < 256; ++cnt)
348 hits[cnt] = 1;
349 memset (&hits[256], '\0', sizeof (hits) - 256 * sizeof (size_t));
351 while (iterate_table (&collate->result, &last, (const void **) &name,
352 &len, (void **) &pelem) >= 0)
353 if (pelem->ordering != NULL && pelem->name[0] > 0xff)
354 if (++hits[(unsigned int) pelem->name[0] % table_size] > worst)
356 worst = hits[(unsigned int) pelem->name[0] % table_size];
357 if (table_size * worst > sum_best)
358 break;
361 if (table_size * worst < sum_best)
363 sum_best = table_size * worst;
364 table_best = table_size;
365 level_best = worst;
368 assert (table_best != 0xffff || level_best != 0xffff);
369 if (!be_quiet)
370 fputs (_(" done\n"), stderr);
372 obstack_init (&non_simple);
373 obstack_init (&string_pool);
375 data.magic = LIMAGIC (LC_COLLATE);
376 data.n = nelems;
377 iov[0].iov_base = (void *) &data;
378 iov[0].iov_len = sizeof (data);
380 iov[1].iov_base = (void *) idx;
381 iov[1].iov_len = sizeof (idx);
383 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_base = &collate->nrules;
384 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_len = sizeof (u_int32_t);
386 table = (u_int32_t *) alloca (collate->nrules * sizeof (u_int32_t));
387 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_base = table;
388 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_len
389 = collate->nrules * sizeof (u_int32_t);
390 /* Another trick here. Describing the collation method needs only a
391 few bits (3, to be exact). But the binary file should be
392 accessible by machines with both endianesses and so we store both
393 forms in the same word. */
394 for (cnt = 0; cnt < collate->nrules; ++cnt)
395 table[cnt] = collate->rules[cnt] | SWAPU32 (collate->rules[cnt]);
397 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_base = &table_best;
398 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_len = sizeof (u_int32_t);
400 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_base = &level_best;
401 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_len
402 = sizeof (u_int32_t);
404 entry_size = 1 + MAX (collate->nrules, 2);
406 table = (u_int32_t *) alloca (table_best * level_best * entry_size
407 * sizeof (table[0]));
408 memset (table, '\0', table_best * level_best * entry_size
409 * sizeof (table[0]));
412 /* Macros for inserting in output table. */
413 #define ADD_VALUE(expr) \
414 do { \
415 u_int32_t to_write = (u_int32_t) expr; \
416 obstack_grow (&non_simple, &to_write, sizeof (to_write)); \
417 } while (0)
419 #define ADD_ELEMENT(pelem, len) \
420 do { \
421 size_t cnt, idx; \
423 ADD_VALUE (len); \
425 wlen = wcslen (pelem->name); \
426 obstack_grow (&non_simple, pelem->name, (wlen + 1) * sizeof (u_int32_t)); \
428 idx = collate->nrules; \
429 for (cnt = 0; cnt < collate->nrules; ++cnt) \
431 size_t disp; \
433 ADD_VALUE (pelem->ordering[cnt]); \
434 for (disp = 0; disp < pelem->ordering[cnt]; ++disp) \
435 ADD_VALUE (pelem->ordering[idx++]); \
437 } while (0)
439 #define ADD_FORWARD(pelem) \
440 do { \
441 /* We leave a reference in the main table and put all \
442 information in the table for the extended entries. */ \
443 element_t *runp; \
444 element_t *has_simple = NULL; \
445 size_t wlen; \
447 table[(level * table_best + slot) * entry_size + 1] \
448 = FORWARD_CHAR; \
449 table[(level * table_best + slot) * entry_size + 2] \
450 = obstack_object_size (&non_simple) / sizeof (u_int32_t); \
452 /* Here we have to construct the non-simple table entry. First \
453 compute the total length of this entry. */ \
454 for (runp = (pelem); runp != NULL; runp = runp->next) \
455 if (runp->ordering != NULL) \
457 u_int32_t value; \
458 size_t cnt; \
460 value = 1 + wcslen (runp->name) + 1; \
462 for (cnt = 0; cnt < collate->nrules; ++cnt) \
463 /* We have to take care for entries without ordering \
464 information. While reading them they get inserted in the \
465 table and later not removed when something goes wrong with \
466 reading its weights. */ \
467 value += 1 + runp->ordering[cnt]; \
469 if (runp->name[1] == L'\0') \
470 has_simple = runp; \
472 ADD_ELEMENT (runp, value); \
475 if (has_simple == NULL) \
477 size_t idx, cnt; \
479 ADD_VALUE (collate->undefined_len + 1); \
481 /* Add the name. */ \
482 ADD_VALUE ((pelem)->name[0]); \
483 ADD_VALUE (0); \
485 idx = collate->nrules; \
486 for (cnt = 0; cnt < collate->nrules; ++cnt) \
488 size_t disp; \
490 ADD_VALUE (collate->undefined.ordering[cnt]); \
491 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp) \
493 if ((wchar_t) collate->undefined.ordering[idx] \
494 == ELLIPSIS_CHAR) \
495 ADD_VALUE ((pelem)->name[0]); \
496 else \
497 ADD_VALUE (collate->undefined.ordering[idx++]); \
498 ++idx; \
502 } while (0)
506 /* Fill the table now. First we look for all the characters which
507 fit into one single byte. This speeds up the 8-bit string
508 functions. */
509 last = NULL;
510 while (iterate_table (&collate->result, &last, (const void **) &name,
511 &len, (void **) &pelem) >= 0)
512 if (pelem->name[0] <= 0xff)
514 /* We have a single byte name. Now we must distinguish
515 between entries in simple form (i.e., only one value per
516 weight and no collation element starting with the same
517 character) and those which are not. */
518 size_t slot = ((size_t) pelem->name[0]);
519 const size_t level = 0;
521 table[slot * entry_size] = pelem->name[0];
523 if (pelem->name[1] == L'\0' && pelem->next == NULL
524 && pelem->ordering_len == collate->nrules)
526 /* Yes, we have a simple one. Lucky us. */
527 size_t cnt;
529 for (cnt = 0; cnt < collate->nrules; ++cnt)
530 table[slot * entry_size + 1 + cnt]
531 = pelem->ordering[collate->nrules + cnt];
533 else
534 ADD_FORWARD (pelem);
537 /* Now check for missing single byte entries. If one exist we fill
538 with the UNDEFINED entry. */
539 for (cnt = 0; cnt < 256; ++cnt)
540 /* The first weight is never 0 for existing entries. */
541 if (table[cnt * entry_size + 1] == 0)
543 /* We have to fill in the information from the UNDEFINED
544 entry. */
545 table[cnt * entry_size] = (u_int32_t) cnt;
547 if (collate->undefined.ordering_len == collate->nrules)
549 size_t inner;
551 for (inner = 0; inner < collate->nrules; ++inner)
552 if ((wchar_t)collate->undefined.ordering[collate->nrules + inner]
553 == ELLIPSIS_CHAR)
554 table[cnt * entry_size + 1 + inner] = cnt;
555 else
556 table[cnt * entry_size + 1 + inner]
557 = collate->undefined.ordering[collate->nrules + inner];
559 else
561 if (undefined_offset != UINT_MAX)
563 table[cnt * entry_size + 1] = FORWARD_CHAR;
564 table[cnt * entry_size + 2] = undefined_offset;
566 else
568 const size_t slot = cnt;
569 const size_t level = 0;
571 ADD_FORWARD (&collate->undefined);
572 undefined_offset = table[cnt * entry_size + 2];
577 /* Now we are ready for inserting the whole rest. */
578 last = NULL;
579 while (iterate_table (&collate->result, &last, (const void **) &name,
580 &len, (void **) &pelem) >= 0)
581 if (pelem->name[0] > 0xff)
583 /* Find the position. */
584 size_t slot = ((size_t) pelem->name[0]) % table_best;
585 size_t level = 0;
587 while (table[(level * table_best + slot) * entry_size + 1] != 0)
588 ++level;
589 assert (level < level_best);
591 if (pelem->name[1] == L'\0' && pelem->next == NULL
592 && pelem->ordering_len == collate->nrules)
594 /* Again a simple entry. */
595 size_t inner;
597 for (inner = 0; inner < collate->nrules; ++inner)
598 table[(level * table_best + slot) * entry_size + 1 + inner]
599 = pelem->ordering[collate->nrules + inner];
601 else
602 ADD_FORWARD (pelem);
605 /* Add the UNDEFINED entry. */
607 /* Here we have to construct the non-simple table entry. */
608 size_t idx, cnt;
610 undefined_offset = obstack_object_size (&non_simple);
612 idx = collate->nrules;
613 for (cnt = 0; cnt < collate->nrules; ++cnt)
615 size_t disp;
617 ADD_VALUE (collate->undefined.ordering[cnt]);
618 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp)
619 ADD_VALUE (collate->undefined.ordering[idx++]);
623 /* Finish the extra block. */
624 extra_len = obstack_object_size (&non_simple);
625 extra = (u_int32_t *) obstack_finish (&non_simple);
626 assert ((extra_len % sizeof (u_int32_t)) == 0);
628 /* Now we have to build the two array for the other byte ordering. */
629 table2 = (u_int32_t *) alloca (table_best * level_best * entry_size
630 * sizeof (table[0]));
631 extra2 = (u_int32_t *) alloca (extra_len);
633 for (cnt = 0; cnt < table_best * level_best * entry_size; ++cnt)
634 table2[cnt] = SWAPU32 (table[cnt]);
636 for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
637 extra2[cnt] = SWAPU32 (extra2[cnt]);
639 /* We need a simple hashing table to get a collation-element->chars
640 mapping. We again use internal hashing using a secondary hashing
641 function.
643 Each string has an associate hashing value V, computed by a
644 fixed function. To locate the string we use open addressing with
645 double hashing. The first index will be V % M, where M is the
646 size of the hashing table. If no entry is found, iterating with
647 a second, independent hashing function takes place. This second
648 value will be 1 + V % (M - 2). The approximate number of probes
649 will be
651 for unsuccessful search: (1 - N / M) ^ -1
652 for successful search: - (N / M) ^ -1 * ln (1 - N / M)
654 where N is the number of keys.
656 If we now choose M to be the next prime bigger than 4 / 3 * N,
657 we get the values 4 and 1.85 resp. Because unsuccessful searches
658 are unlikely this is a good value. Formulas: [Knuth, The Art of
659 Computer Programming, Volume 3, Sorting and Searching, 1973,
660 Addison Wesley] */
661 if (collate->elements.filled == 0)
663 /* We don't need any element table since there are no collating
664 elements. */
665 element_hash_tab_size = 0;
666 element_hash_tab = NULL;
667 element_hash_tab_ob = NULL;
668 element_string_pool_size = 0;
669 element_string_pool = NULL;
670 element_value_size = 0;
671 element_value = NULL;
672 element_value_ob = NULL;
674 else
676 void *ptr; /* Running pointer. */
677 const char *key; /* Key for current bucket. */
678 size_t keylen; /* Length of key data. */
679 const element_t *data; /* Data, i.e., the character sequence. */
681 element_hash_tab_size = next_prime ((collate->elements.filled * 4) / 3);
682 if (element_hash_tab_size < 7)
683 /* We need a minimum to make the following code work. */
684 element_hash_tab_size = 7;
686 element_hash_tab = obstack_alloc (&non_simple, (2 * element_hash_tab_size
687 * sizeof (u_int32_t)));
688 memset (element_hash_tab, '\377', (2 * element_hash_tab_size
689 * sizeof (u_int32_t)));
691 ptr = NULL;
692 while (iterate_table (&collate->elements, &ptr, (const void **) &key,
693 &keylen, (void **) &data) == 0)
695 size_t hash_val = hash_string (key, keylen);
696 size_t idx = hash_val % element_hash_tab_size;
698 if (element_hash_tab[2 * idx] != (~((u_int32_t) 0)))
700 /* We need the second hashing function. */
701 size_t c = 1 + (hash_val % (element_hash_tab_size - 2));
704 if (idx >= element_hash_tab_size - c)
705 idx -= element_hash_tab_size - c;
706 else
707 idx += c;
708 while (element_hash_tab[2 * idx] != (~((u_int32_t) 0)));
711 element_hash_tab[2 * idx] = obstack_object_size (&non_simple);
712 element_hash_tab[2 * idx + 1] = (obstack_object_size (&string_pool)
713 / sizeof (wchar_t));
715 obstack_grow0 (&non_simple, key, keylen);
716 obstack_grow (&string_pool, data->name,
717 (wcslen (data->name) + 1) * sizeof (wchar_t));
720 if (obstack_object_size (&non_simple) % 4 != 0)
721 obstack_blank (&non_simple,
722 4 - (obstack_object_size (&non_simple) % 4));
723 element_string_pool_size = obstack_object_size (&non_simple);
724 element_string_pool = obstack_finish (&non_simple);
726 element_value_size = obstack_object_size (&string_pool);
727 element_value = obstack_finish (&string_pool);
729 /* Create the tables for the other byte order. */
730 element_hash_tab_ob = obstack_alloc (&non_simple,
731 (2 * element_hash_tab_size
732 * sizeof (u_int32_t)));
733 for (cnt = 0; cnt < 2 * element_hash_tab_size; ++cnt)
734 element_hash_tab_ob[cnt] = SWAPU32 (element_hash_tab[cnt]);
736 element_value_ob = obstack_alloc (&string_pool, element_value_size);
737 if (sizeof (wchar_t) != 4)
739 fputs ("sizeof (wchar_t) != 4 currently not handled", stderr);
740 abort ();
742 for (cnt = 0; cnt < element_value_size / 4; ++cnt)
743 element_value_ob[cnt] = SWAPU32 (element_value[cnt]);
746 /* Store collation elements as map to collation class. There are
747 three kinds of symbols:
748 - simple characters
749 - collation elements
750 - collation symbols
751 We need to make a table which lets the user to access the primary
752 weight based on the symbol string. */
753 symbols_hash_tab_size = next_prime ((4 * (charset->char_table.filled
754 + collate->elements.filled
755 + collate->symbols.filled)) / 3);
756 symbols_hash_tab = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
757 * sizeof (u_int32_t)));
758 memset (symbols_hash_tab, '\377', (2 * symbols_hash_tab_size
759 * sizeof (u_int32_t)));
761 /* Now fill the array. First the symbols from the character set,
762 then the collation elements and last the collation symbols. */
763 hash_tab = &charset->char_table;
764 while (1)
766 void *ptr; /* Running pointer. */
767 const char *key; /* Key for current bucket. */
768 size_t keylen; /* Length of key data. */
769 void *data; /* Data. */
771 ptr = NULL;
772 while (iterate_table (hash_tab, &ptr, (const void **) &key,
773 &keylen, (void **) &data) == 0)
775 size_t hash_val;
776 size_t idx;
777 u_int32_t word;
778 unsigned int *weights;
780 if (hash_tab == &charset->char_table
781 || hash_tab == &collate->elements)
783 element_t *lastp, *firstp;
784 wchar_t dummy_name[2];
785 const wchar_t *name;
786 size_t name_len;
788 if (hash_tab == &charset->char_table)
790 dummy_name[0] = (wchar_t) ((unsigned long int) data);
791 dummy_name[1] = L'\0';
792 name = dummy_name;
793 name_len = sizeof (wchar_t);
795 else
797 element_t *elemp = (element_t *) data;
798 name = elemp->name;
799 name_len = wcslen (name) * sizeof (wchar_t);
802 /* First check whether this character is used at all. */
803 if (find_entry (&collate->result, name, name_len,
804 (void *) &firstp) < 0)
805 /* The symbol is not directly mentioned in the collation.
806 I.e., we use the value for UNDEFINED. */
807 lastp = &collate->undefined;
808 else
810 /* The entry for the simple character is always found at
811 the end. */
812 lastp = firstp;
813 while (lastp->next != NULL && wcscmp (name, lastp->name))
814 lastp = lastp->next;
815 if (lastp->ordering == NULL)
816 lastp = &collate->undefined;
819 weights = lastp->ordering;
821 else
823 dummy_weights[0] = 1;
824 dummy_weights[collate->nrules]
825 = (unsigned int) ((unsigned long int) data);
827 weights = dummy_weights;
830 /* In LASTP->ordering we now have the collation class.
831 Determine the place in the hashing table next. */
832 hash_val = hash_string (key, keylen);
833 idx = hash_val % symbols_hash_tab_size;
835 if (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)))
837 /* We need the second hashing function. */
838 size_t c = 1 + (hash_val % (symbols_hash_tab_size - 2));
841 if (idx >= symbols_hash_tab_size - c)
842 idx -= symbols_hash_tab_size - c;
843 else
844 idx += c;
845 while (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)));
848 symbols_hash_tab[2 * idx] = obstack_object_size (&string_pool);
849 symbols_hash_tab[2 * idx + 1] = (obstack_object_size (&non_simple)
850 / sizeof (u_int32_t));
852 obstack_grow0 (&string_pool, key, keylen);
853 /* Adding the first weight looks complicated. We have to deal
854 with the kind it is stored and with the fact that original
855 form uses `unsigned int's while we need `u_int32_t' here. */
856 word = weights[0];
857 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
858 for (cnt = 0; cnt < weights[0]; ++cnt)
860 word = weights[collate->nrules + cnt];
861 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
865 if (hash_tab == &charset->char_table)
866 hash_tab = &collate->elements;
867 else if (hash_tab == &collate->elements)
868 hash_tab = &collate->symbols;
869 else
870 break;
873 /* Now we have the complete tables. */
874 if (obstack_object_size (&string_pool) % 4 != 0)
875 obstack_blank (&non_simple, 4 - (obstack_object_size (&string_pool) % 4));
876 symbols_string_pool_size = obstack_object_size (&string_pool);
877 symbols_string_pool = obstack_finish (&string_pool);
879 symbols_class_size = obstack_object_size (&non_simple);
880 symbols_class = obstack_finish (&non_simple);
882 /* Generate tables with other byte order. */
883 symbols_hash_tab_ob = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
884 * sizeof (u_int32_t)));
885 for (cnt = 0; cnt < 2 * symbols_hash_tab_size; ++cnt)
886 symbols_hash_tab_ob[cnt] = SWAPU32 (symbols_hash_tab[cnt]);
888 symbols_class_ob = obstack_alloc (&non_simple, symbols_class_size);
889 for (cnt = 0; cnt < symbols_class_size / 4; ++cnt)
890 symbols_class_ob[cnt] = SWAPU32 (symbols_class[cnt]);
893 /* Store table addresses and lengths. */
894 #if __BYTE_ORDER == __BIG_ENDIAN
895 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
896 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
897 = table_best * level_best * entry_size * sizeof (table[0]);
899 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table2;
900 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
901 = table_best * level_best * entry_size * sizeof (table[0]);
903 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra;
904 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
906 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra2;
907 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
908 #else
909 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table2;
910 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
911 = table_best * level_best * entry_size * sizeof (table[0]);
913 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table;
914 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
915 = table_best * level_best * entry_size * sizeof (table[0]);
917 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra2;
918 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
920 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra;
921 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
922 #endif
924 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
925 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
928 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_base
929 = &element_hash_tab_size;
930 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_len
931 = sizeof (u_int32_t);
933 #if __BYTE_ORDER == __BIG_ENDIAN
934 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
935 = element_hash_tab;
936 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
937 = 2 * element_hash_tab_size * sizeof (u_int32_t);
939 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
940 = element_hash_tab_ob;
941 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
942 = 2 * element_hash_tab_size * sizeof (u_int32_t);
943 #else
944 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
945 = element_hash_tab;
946 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
947 = 2 * element_hash_tab_size * sizeof (u_int32_t);
949 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
950 = element_hash_tab_ob;
951 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
952 = 2 * element_hash_tab_size * sizeof (u_int32_t);
953 #endif
955 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_base
956 = element_string_pool;
957 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_len
958 = element_string_pool_size;
960 #if __BYTE_ORDER == __BIG_ENDIAN
961 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
962 = element_value;
963 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
964 = element_value_size;
966 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
967 = element_value_ob;
968 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
969 = element_value_size;
970 #else
971 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
972 = element_value;
973 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
974 = element_value_size;
976 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
977 = element_value_ob;
978 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
979 = element_value_size;
980 #endif
982 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_base
983 = &symbols_hash_tab_size;
984 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_len
985 = sizeof (u_int32_t);
987 #if __BYTE_ORDER == __BIG_ENDIAN
988 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
989 = symbols_hash_tab;
990 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
991 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
993 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
994 = symbols_hash_tab_ob;
995 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
996 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
997 #else
998 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
999 = symbols_hash_tab;
1000 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
1001 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
1003 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
1004 = symbols_hash_tab_ob;
1005 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
1006 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
1007 #endif
1009 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_base
1010 = symbols_string_pool;
1011 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_len
1012 = symbols_string_pool_size;
1014 #if __BYTE_ORDER == __BIG_ENDIAN
1015 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1016 = symbols_class;
1017 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1018 = symbols_class_size;
1020 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1021 = symbols_class_ob;
1022 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1023 = symbols_class_size;
1024 #else
1025 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1026 = symbols_class;
1027 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1028 = symbols_class_size;
1030 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1031 = symbols_class_ob;
1032 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1033 = symbols_class_size;
1034 #endif
1036 /* Update idx array. */
1037 idx[0] = iov[0].iov_len + iov[1].iov_len;
1038 for (cnt = 1; cnt < nelems; ++cnt)
1039 idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
1041 write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
1043 obstack_free (&non_simple, NULL);
1044 obstack_free (&string_pool, NULL);
1048 void
1049 collate_element_to (struct linereader *lr, struct localedef_t *locale,
1050 struct token *code, struct charset_t *charset)
1052 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1053 unsigned int value;
1054 void *not_used;
1056 if (collate->combine_token != NULL)
1058 free ((void *) collate->combine_token);
1059 collate->combine_token = NULL;
1062 value = charset_find_value (&charset->char_table, code->val.str.start,
1063 code->val.str.len);
1064 if ((wchar_t) value != ILLEGAL_CHAR_VALUE)
1066 lr_error (lr, _("symbol for multicharacter collating element "
1067 "`%.*s' duplicates symbolic name in charset"),
1068 (int) code->val.str.len, code->val.str.start);
1069 return;
1072 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1073 &not_used) >= 0)
1075 lr_error (lr, _("symbol for multicharacter collating element "
1076 "`%.*s' duplicates element definition"),
1077 (int) code->val.str.len, code->val.str.start);
1078 return;
1081 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1082 &not_used) >= 0)
1084 lr_error (lr, _("symbol for multicharacter collating element "
1085 "`%.*s' duplicates symbol definition"),
1086 (int) code->val.str.len, code->val.str.start);
1087 return;
1090 collate->combine_token = code->val.str.start;
1091 collate->combine_token_len = code->val.str.len;
1095 void
1096 collate_element_from (struct linereader *lr, struct localedef_t *locale,
1097 struct token *code, struct charset_t *charset)
1099 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1100 element_t *elemp, *runp;
1102 /* CODE is a string. */
1103 elemp = (element_t *) obstack_alloc (&collate->element_mem,
1104 sizeof (element_t));
1106 /* We have to translate the string. It may contain <...> character
1107 names. */
1108 elemp->name = (wchar_t *) translate_string (code->val.str.start, charset);
1109 elemp->this_weight = 0;
1110 elemp->ordering = NULL;
1111 elemp->ordering_len = 0;
1113 free (code->val.str.start);
1115 if (elemp->name == NULL)
1117 /* At least one character in the string is not defined. We simply
1118 do nothing. */
1119 if (verbose)
1120 lr_error (lr, _("\
1121 `from' string in collation element declaration contains unknown character"));
1122 return;
1125 if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
1127 lr_error (lr, _("illegal collation element"));
1128 return;
1131 /* The entries in the linked lists of RESULT are sorting in
1132 descending order. The order is important for the `strcoll' and
1133 `wcscoll' functions. */
1134 if (find_entry (&collate->result, elemp->name, sizeof (wchar_t),
1135 (void *) &runp) >= 0)
1137 /* We already have an entry with this key. Check whether it is
1138 identical. */
1139 element_t *prevp = NULL;
1140 int cmpres;
1144 cmpres = wcscmp (elemp->name, runp->name);
1145 if (cmpres <= 0)
1146 break;
1147 prevp = runp;
1149 while ((runp = runp->next) != NULL);
1151 if (cmpres == 0)
1152 lr_error (lr, _("duplicate collating element definition"));
1153 else
1155 elemp->next = runp;
1156 if (prevp == NULL)
1158 if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
1159 elemp) < 0)
1160 error (EXIT_FAILURE, 0, _("\
1161 error while inserting collation element into hash table"));
1163 else
1164 prevp->next = elemp;
1167 else
1169 elemp->next = NULL;
1170 if (insert_entry (&collate->result, elemp->name, sizeof (wchar_t), elemp)
1171 < 0)
1172 error (EXIT_FAILURE, errno, _("error while inserting to hash table"));
1175 if (insert_entry (&collate->elements, collate->combine_token,
1176 collate->combine_token_len, (void *) elemp) < 0)
1177 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1178 strerror (errno));
1182 void
1183 collate_symbol (struct linereader *lr, struct localedef_t *locale,
1184 struct token *code, struct charset_t *charset)
1186 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1187 wchar_t value;
1188 void *not_used;
1190 value = charset_find_value (&charset->char_table, code->val.str.start,
1191 code->val.str.len);
1192 if (value != ILLEGAL_CHAR_VALUE)
1194 lr_error (lr, _("symbol for multicharacter collating element "
1195 "`%.*s' duplicates symbolic name in charset"),
1196 (int) code->val.str.len, code->val.str.start);
1197 return;
1200 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1201 &not_used) >= 0)
1203 lr_error (lr, _("symbol for multicharacter collating element "
1204 "`%.*s' duplicates element definition"),
1205 (int) code->val.str.len, code->val.str.start);
1206 return;
1209 if (find_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1210 &not_used) >= 0)
1212 lr_error (lr, _("symbol for multicharacter collating element "
1213 "`%.*s' duplicates other symbol definition"),
1214 (int) code->val.str.len, code->val.str.start);
1215 return;
1218 if (insert_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1219 (void *) 0) < 0)
1220 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1221 strerror (errno));
1225 void
1226 collate_new_order (struct linereader *lr, struct localedef_t *locale,
1227 enum coll_sort_rule sort_rule)
1229 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1231 if (collate->nrules >= collate->nrules_max)
1233 collate->nrules_max *= 2;
1234 collate->rules
1235 = (enum coll_sort_rule *) xrealloc (collate->rules,
1236 collate->nrules_max
1237 * sizeof (enum coll_sort_rule));
1240 collate->rules[collate->nrules++] = sort_rule;
1244 void
1245 collate_build_arrays (struct linereader *lr, struct localedef_t *locale)
1247 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1249 collate->rules
1250 = (enum coll_sort_rule *) xrealloc (collate->rules,
1251 collate->nrules
1252 * sizeof (enum coll_sort_rule));
1254 /* Allocate arrays for temporary weights. */
1255 collate->weight_cnt = (int *) xmalloc (collate->nrules * sizeof (int));
1257 /* Choose arbitrary start value for table size. */
1258 collate->nweight_max = 5 * collate->nrules;
1259 collate->weight = (int *) xmalloc (collate->nweight_max * sizeof (int));
1264 collate_order_elem (struct linereader *lr, struct localedef_t *locale,
1265 struct token *code, struct charset_t *charset)
1267 const wchar_t zero = L'\0';
1268 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1269 int result = 0;
1270 wchar_t value;
1271 void *tmp;
1272 unsigned int i;
1274 switch (code->tok)
1276 case tok_bsymbol:
1277 /* We have a string to find in one of the three hashing tables. */
1278 value = charset_find_value (&charset->char_table, code->val.str.start,
1279 code->val.str.len);
1280 if (value != ILLEGAL_CHAR_VALUE)
1282 element_t *lastp, *firstp;
1284 collate->kind = character;
1286 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1287 (void *) &firstp) < 0)
1288 firstp = lastp = NULL;
1289 else
1291 /* The entry for the simple character is always found at
1292 the end. */
1293 lastp = firstp;
1294 while (lastp->next != NULL)
1295 lastp = lastp->next;
1297 if (lastp->name[0] == value && lastp->name[1] == L'\0')
1299 lr_error (lr, _("duplicate definition for character `%.*s'"),
1300 (int) code->val.str.len, code->val.str.start);
1301 lr_ignore_rest (lr, 0);
1302 result = -1;
1303 break;
1307 collate->current_element
1308 = (element_t *) obstack_alloc (&collate->element_mem,
1309 sizeof (element_t));
1311 obstack_grow (&collate->element_mem, &value, sizeof (value));
1312 obstack_grow (&collate->element_mem, &zero, sizeof (zero));
1314 collate->current_element->name =
1315 (const wchar_t *) obstack_finish (&collate->element_mem);
1317 collate->current_element->this_weight = ++collate->order_cnt;
1319 collate->current_element->next = NULL;
1321 if (firstp == NULL)
1323 if (insert_entry (&collate->result, &value, sizeof (wchar_t),
1324 (void *) collate->current_element) < 0)
1326 lr_error (lr, _("cannot insert collation element `%.*s'"),
1327 (int) code->val.str.len, code->val.str.start);
1328 exit (4);
1331 else
1332 lastp->next = collate->current_element;
1334 else if (find_entry (&collate->elements, code->val.str.start,
1335 code->val.str.len, &tmp) >= 0)
1337 collate->current_element = (element_t *) tmp;
1339 if (collate->current_element->this_weight != 0)
1341 lr_error (lr, _("\
1342 collation element `%.*s' appears more than once: ignore line"),
1343 (int) code->val.str.len, code->val.str.start);
1344 lr_ignore_rest (lr, 0);
1345 result = -1;
1346 break;
1349 collate->kind = element;
1350 collate->current_element->this_weight = ++collate->order_cnt;
1352 else if (find_entry (&collate->symbols, code->val.str.start,
1353 code->val.str.len, &tmp) >= 0)
1355 unsigned int order = ++collate->order_cnt;
1357 if ((unsigned long int) tmp != 0ul)
1359 lr_error (lr, _("\
1360 collation symbol `%.*s' appears more than once: ignore line"),
1361 (int) code->val.str.len, code->val.str.start);
1362 lr_ignore_rest (lr, 0);
1363 result = -1;
1364 break;
1367 collate->kind = symbol;
1369 if (set_entry (&collate->symbols, code->val.str.start,
1370 code->val.str.len, (void *) order) < 0)
1372 lr_error (lr, _("cannot process order specification"));
1373 exit (4);
1376 else
1378 if (verbose)
1379 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1380 (int) code->val.str.len, code->val.str.start);
1381 lr_ignore_rest (lr, 0);
1383 result = -1;
1385 break;
1387 case tok_undefined:
1388 collate->kind = undefined;
1389 collate->current_element = &collate->undefined;
1390 break;
1392 case tok_ellipsis:
1393 if (collate->was_ellipsis)
1395 lr_error (lr, _("\
1396 two lines in a row containing `...' are not allowed"));
1397 result = -1;
1399 else if (collate->kind != character)
1401 /* An ellipsis requires the previous line to be an
1402 character definition. */
1403 lr_error (lr, _("\
1404 line before ellipsis does not contain definition for character constant"));
1405 lr_ignore_rest (lr, 0);
1406 result = -1;
1408 else
1409 collate->kind = ellipsis;
1410 break;
1412 default:
1413 assert (! "illegal token in `collate_order_elem'");
1416 /* Now it's time to handle the ellipsis in the previous line. We do
1417 this only when the last line contained an definition for a
1418 character, the current line also defines an character, the
1419 character code for the later is bigger than the former. */
1420 if (collate->was_ellipsis)
1422 if (collate->kind != character)
1424 lr_error (lr, _("\
1425 line after ellipsis must contain character definition"));
1426 lr_ignore_rest (lr, 0);
1427 result = -1;
1429 else if (collate->last_char > value)
1431 lr_error (lr, _("end point of ellipsis range is bigger then start"));
1432 lr_ignore_rest (lr, 0);
1433 result = -1;
1435 else
1437 /* We can fill the arrays with the information we need. */
1438 wchar_t name[2];
1439 unsigned int *data;
1440 size_t *ptr;
1441 size_t cnt;
1443 name[0] = collate->last_char + 1;
1444 name[1] = L'\0';
1446 data = (unsigned int *) alloca ((collate->nrules + collate->nweight)
1447 * sizeof (unsigned int));
1448 ptr = (size_t *) alloca (collate->nrules * sizeof (size_t));
1450 if (data == NULL || ptr == NULL)
1451 error (4, 0, _("memory exhausted"));
1453 /* Prepare data. Because the characters covered by an
1454 ellipsis all have equal values we prepare the data once
1455 and only change the variable number (if there are any).
1456 PTR[...] will point to the entries which will have to be
1457 fixed during the output loop. */
1458 for (cnt = 0; cnt < collate->nrules; ++cnt)
1460 data[cnt] = collate->weight_cnt[cnt];
1461 ptr[cnt] = (cnt == 0
1462 ? collate->nweight
1463 : ptr[cnt - 1] + collate->weight_cnt[cnt - 1]);
1466 for (cnt = 0; cnt < collate->nweight; ++cnt)
1467 data[collate->nrules + cnt] = collate->weight[cnt];
1469 for (cnt = 0; cnt < collate->nrules; ++cnt)
1470 if ((wchar_t) data[ptr[cnt]] != ELLIPSIS_CHAR)
1471 ptr[cnt] = 0;
1473 while (name[0] <= value)
1475 element_t *pelem;
1477 pelem = (element_t *) obstack_alloc (&collate->element_mem,
1478 sizeof (element_t));
1479 if (pelem == NULL)
1480 error (4, 0, _("memory exhausted"));
1482 pelem->name
1483 = (const wchar_t *) obstack_copy (&collate->element_mem,
1484 name, 2 * sizeof (wchar_t));
1485 pelem->this_weight = ++collate->order_cnt;
1487 pelem->ordering_len = collate->nweight;
1488 pelem->ordering
1489 = (unsigned int *) obstack_copy (&collate->element_mem, data,
1490 (collate->nrules
1491 + pelem->ordering_len)
1492 * sizeof (unsigned int));
1494 /* `...' weights need to be adjusted. */
1495 for (cnt = 0; cnt < collate->nrules; ++cnt)
1496 if (ptr[cnt] != 0)
1497 pelem->ordering[ptr[cnt]] = pelem->this_weight;
1499 /* Insert new entry into result table. */
1500 if (find_entry (&collate->result, name, sizeof (wchar_t),
1501 (void *) &pelem->next) >= 0)
1503 if (set_entry (&collate->result, name, sizeof (wchar_t),
1504 (void *) pelem) < 0)
1505 error (4, 0, _("cannot insert into result table"));
1507 else
1509 pelem->next = NULL;
1510 if (insert_entry (&collate->result, name, sizeof (wchar_t),
1511 (void *) pelem) < 0)
1512 error (4, 0, _("cannot insert into result table"));
1515 /* Increment counter. */
1516 ++name[0];
1521 /* Reset counters for weights. */
1522 collate->weight_idx = 0;
1523 collate->nweight = 0;
1524 for (i = 0; i < collate->nrules; ++i)
1525 collate->weight_cnt[i] = 0;
1526 collate->current_patch = NULL;
1528 return result;
1533 collate_weight_bsymbol (struct linereader *lr, struct localedef_t *locale,
1534 struct token *code, struct charset_t *charset)
1536 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1537 unsigned int here_weight;
1538 wchar_t value;
1539 void *tmp;
1541 assert (code->tok == tok_bsymbol);
1543 value = charset_find_value (&charset->char_table, code->val.str.start,
1544 code->val.str.len);
1545 if (value != ILLEGAL_CHAR_VALUE)
1547 element_t *runp;
1549 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1550 (void *)&runp) < 0)
1551 runp = NULL;
1553 while (runp != NULL
1554 && (runp->name[0] != value || runp->name[1] != L'\0'))
1555 runp = runp->next;
1557 here_weight = runp == NULL ? 0 : runp->this_weight;
1559 else if (find_entry (&collate->elements, code->val.str.start,
1560 code->val.str.len, &tmp) >= 0)
1562 element_t *runp = (element_t *) tmp;
1564 here_weight = runp->this_weight;
1566 else if (find_entry (&collate->symbols, code->val.str.start,
1567 code->val.str.len, &tmp) >= 0)
1569 here_weight = (unsigned int) tmp;
1571 else
1573 if (verbose)
1574 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1575 (int) code->val.str.len, code->val.str.start);
1576 lr_ignore_rest (lr, 0);
1577 return -1;
1580 /* When we currently work on a collation symbol we do not expect any
1581 weight. */
1582 if (collate->kind == symbol)
1584 lr_error (lr, _("\
1585 specification of sorting weight for collation symbol does not make sense"));
1586 lr_ignore_rest (lr, 0);
1587 return -1;
1590 /* Add to the current collection of weights. */
1591 if (collate->nweight >= collate->nweight_max)
1593 collate->nweight_max *= 2;
1594 collate->weight = (unsigned int *) xrealloc (collate->weight,
1595 collate->nweight_max);
1598 /* If the weight is currently not known, we remember to patch the
1599 resulting tables. */
1600 if (here_weight == 0)
1602 patch_t *newp;
1604 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1605 sizeof (patch_t));
1606 newp->fname = lr->fname;
1607 newp->lineno = lr->lineno;
1608 newp->token = (const char *) obstack_copy0 (&collate->element_mem,
1609 code->val.str.start,
1610 code->val.str.len);
1611 newp->where.idx = collate->nweight++;
1612 newp->next = collate->current_patch;
1613 collate->current_patch = newp;
1615 else
1616 collate->weight[collate->nweight++] = here_weight;
1617 ++collate->weight_cnt[collate->weight_idx];
1619 return 0;
1624 collate_next_weight (struct linereader *lr, struct localedef_t *locale)
1626 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1628 if (collate->kind == symbol)
1630 lr_error (lr, _("\
1631 specification of sorting weight for collation symbol does not make sense"));
1632 lr_ignore_rest (lr, 0);
1633 return -1;
1636 ++collate->weight_idx;
1637 if (collate->weight_idx >= collate->nrules)
1639 lr_error (lr, _("too many weights"));
1640 lr_ignore_rest (lr, 0);
1641 return -1;
1644 return 0;
1649 collate_simple_weight (struct linereader *lr, struct localedef_t *locale,
1650 struct token *code, struct charset_t *charset)
1652 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1653 unsigned int value = 0;
1655 /* There current tokens can be `IGNORE', `...', or a string. */
1656 switch (code->tok)
1658 case tok_ignore:
1659 /* This token is allowed in all situations. */
1660 value = IGNORE_CHAR;
1661 break;
1663 case tok_ellipsis:
1664 /* The ellipsis is only allowed for the `...' or `UNDEFINED'
1665 entry. */
1666 if (collate->kind != ellipsis && collate->kind != undefined)
1668 lr_error (lr, _("\
1669 `...' must only be used in `...' and `UNDEFINED' entries"));
1670 lr_ignore_rest (lr, 0);
1671 return -1;
1673 value = ELLIPSIS_CHAR;
1674 break;
1676 case tok_string:
1677 /* This can become difficult. We have to get the weights which
1678 correspond to the single wide chars in the string. But some
1679 of the `chars' might not be real characters, but collation
1680 elements or symbols. And so the string decoder might have
1681 signaled errors. The string at this point is not translated.
1682 I.e., all <...> sequences are still there. */
1684 char *runp = code->val.str.start;
1685 void *tmp;
1687 while (*runp != '\0')
1689 char *startp = (char *) runp;
1690 char *putp = (char *) runp;
1691 wchar_t wch;
1693 /* Lookup weight for char and store it. */
1694 if (*runp == '<')
1696 while (*++runp != '\0' && *runp != '>')
1698 if (*runp == lr->escape_char)
1699 if (*++runp == '\0')
1701 lr_error (lr, _("unterminated weight name"));
1702 lr_ignore_rest (lr, 0);
1703 return -1;
1705 *putp++ = *runp;
1707 if (*runp == '>')
1708 ++runp;
1710 if (putp == startp)
1712 lr_error (lr, _("empty weight name: line ignored"));
1713 lr_ignore_rest (lr, 0);
1714 return -1;
1717 wch = charset_find_value (&charset->char_table, startp,
1718 putp - startp);
1719 if (wch != ILLEGAL_CHAR_VALUE)
1721 element_t *pelem;
1723 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
1724 (void *)&pelem) < 0)
1725 pelem = NULL;
1727 while (pelem != NULL
1728 && (pelem->name[0] != wch
1729 || pelem->name[1] != L'\0'))
1730 pelem = pelem->next;
1732 value = pelem == NULL ? 0 : pelem->this_weight;
1734 else if (find_entry (&collate->elements, startp, putp - startp,
1735 &tmp) >= 0)
1737 element_t *pelem = (element_t *) tmp;
1739 value = pelem->this_weight;
1741 else if (find_entry (&collate->symbols, startp, putp - startp,
1742 &tmp) >= 0)
1744 value = (unsigned int) tmp;
1746 else
1748 if (verbose)
1749 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1750 (int) (putp - startp), startp);
1751 lr_ignore_rest (lr, 0);
1752 return -1;
1755 else
1757 element_t *wp;
1758 wchar_t wch;
1760 if (*runp == lr->escape_char)
1762 static const char digits[] = "0123456789abcdef";
1763 const char *dp;
1764 int base;
1766 ++runp;
1767 if (_tolower (*runp) == 'x')
1769 ++runp;
1770 base = 16;
1772 else if (_tolower (*runp) == 'd')
1774 ++runp;
1775 base = 10;
1777 else
1778 base = 8;
1780 dp = strchr (digits, _tolower (*runp));
1781 if (dp == NULL || (dp - digits) >= base)
1783 illegal_char:
1784 lr_error (lr, _("\
1785 illegal character constant in string"));
1786 lr_ignore_rest (lr, 0);
1787 return -1;
1789 wch = dp - digits;
1790 ++runp;
1792 dp = strchr (digits, _tolower (*runp));
1793 if (dp == NULL || (dp - digits) >= base)
1794 goto illegal_char;
1795 wch *= base;
1796 wch += dp - digits;
1797 ++runp;
1799 if (base != 16)
1801 dp = strchr (digits, _tolower (*runp));
1802 if (dp != NULL && (dp - digits < base))
1804 wch *= base;
1805 wch += dp - digits;
1806 ++runp;
1810 else
1811 wch = (wchar_t) *runp++;
1813 /* Lookup the weight for WCH. */
1814 if (find_entry (&collate->result, &wch, sizeof (wch),
1815 (void *)&wp) < 0)
1816 wp = NULL;
1818 while (wp != NULL
1819 && (wp->name[0] != wch || wp->name[1] != L'\0'))
1820 wp = wp->next;
1822 value = wp == NULL ? 0 : wp->this_weight;
1824 /* To get the correct name for the error message. */
1825 putp = runp;
1827 /**************************************************\
1828 |* I know here is something wrong. Characters in *|
1829 |* the string which are not in the <...> form *|
1830 |* cannot be declared forward for now!!! *|
1831 \**************************************************/
1834 /* Store in weight array. */
1835 if (collate->nweight >= collate->nweight_max)
1837 collate->nweight_max *= 2;
1838 collate->weight
1839 = (unsigned int *) xrealloc (collate->weight,
1840 collate->nweight_max);
1843 if (value == 0)
1845 patch_t *newp;
1847 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1848 sizeof (patch_t));
1849 newp->fname = lr->fname;
1850 newp->lineno = lr->lineno;
1851 newp->token
1852 = (const char *) obstack_copy0 (&collate->element_mem,
1853 startp, putp - startp);
1854 newp->where.idx = collate->nweight++;
1855 newp->next = collate->current_patch;
1856 collate->current_patch = newp;
1858 else
1859 collate->weight[collate->nweight++] = value;
1860 ++collate->weight_cnt[collate->weight_idx];
1863 return 0;
1865 default:
1866 assert (! "should not happen");
1870 if (collate->nweight >= collate->nweight_max)
1872 collate->nweight_max *= 2;
1873 collate->weight = (unsigned int *) xrealloc (collate->weight,
1874 collate->nweight_max);
1877 collate->weight[collate->nweight++] = value;
1878 ++collate->weight_cnt[collate->weight_idx];
1880 return 0;
1884 void
1885 collate_end_weight (struct linereader *lr, struct localedef_t *locale)
1887 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1888 element_t *pelem = collate->current_element;
1890 if (collate->kind == symbol)
1892 /* We don't have to do anything. */
1893 collate->was_ellipsis = 0;
1894 return;
1897 if (collate->kind == ellipsis)
1899 /* Before the next line is processed the ellipsis is handled. */
1900 collate->was_ellipsis = 1;
1901 return;
1904 assert (collate->kind == character || collate->kind == element
1905 || collate->kind == undefined);
1907 /* Fill in the missing weights. */
1908 while (++collate->weight_idx < collate->nrules)
1910 collate->weight[collate->nweight++] = pelem->this_weight;
1911 ++collate->weight_cnt[collate->weight_idx];
1914 /* Now we know how many ordering weights the current
1915 character/element has. Allocate room in the element structure
1916 and copy information. */
1917 pelem->ordering_len = collate->nweight;
1919 /* First we write an array with the number of values for each
1920 weight. */
1921 obstack_grow (&collate->element_mem, collate->weight_cnt,
1922 collate->nrules * sizeof (unsigned int));
1924 /* Now the weights itselves. */
1925 obstack_grow (&collate->element_mem, collate->weight,
1926 collate->nweight * sizeof (unsigned int));
1928 /* Get result. */
1929 pelem->ordering = obstack_finish (&collate->element_mem);
1931 /* Now we handle the "patches". */
1932 while (collate->current_patch != NULL)
1934 patch_t *this_patch;
1936 this_patch = collate->current_patch;
1938 this_patch->where.pos = &pelem->ordering[collate->nrules
1939 + this_patch->where.idx];
1941 collate->current_patch = this_patch->next;
1942 this_patch->next = collate->all_patches;
1943 collate->all_patches = this_patch;
1946 /* Set information for next round. */
1947 collate->was_ellipsis = 0;
1948 if (collate->kind != undefined)
1949 collate->last_char = pelem->name[0];