update from main archive 961001
[glibc.git] / locale / programs / ld-collate.c
blob1bfa533d9857ce7a2dbd0d0053a2fcf5d8ad4b73
1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <endian.h>
25 #include <errno.h>
26 #include <limits.h>
27 #include <locale.h>
28 #include <obstack.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <wchar.h>
33 #include "localeinfo.h"
34 #include "locales.h"
35 #include "simple-hash.h"
36 #include "stringtrans.h"
37 #include "strlen-hash.h"
39 /* Uncomment the following line in the production version. */
40 /* #define NDEBUG 1 */
41 #include <assert.h>
44 #define MAX(a, b) ((a) > (b) ? (a) : (b))
46 #define SWAPU32(w) \
47 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
50 /* What kind of symbols get defined? */
51 enum coll_symbol
53 undefined,
54 ellipsis,
55 character,
56 element,
57 symbol
61 typedef struct patch_t
63 const char *fname;
64 size_t lineno;
65 const char *token;
66 union
68 unsigned int *pos;
69 size_t idx;
70 } where;
71 struct patch_t *next;
72 } patch_t;
75 typedef struct element_t
77 const wchar_t *name;
78 unsigned int this_weight;
80 struct element_t *next;
82 unsigned int *ordering;
83 size_t ordering_len;
84 } element_t;
87 /* The real definition of the struct for the LC_COLLATE locale. */
88 struct locale_collate_t
90 /* Collate symbol table. Simple mapping to number. */
91 hash_table symbols;
93 /* The collation elements. */
94 hash_table elements;
95 struct obstack element_mem;
97 /* The result table. */
98 hash_table result;
100 /* Sorting rules given in order_start line. */
101 int nrules;
102 int nrules_max;
103 enum coll_sort_rule *rules;
105 /* Used while recognizing symbol composed of multiple tokens
106 (collating-element). */
107 const char *combine_token;
108 size_t combine_token_len;
110 /* How many sorting order specifications so far. */
111 unsigned int order_cnt;
113 /* Was lastline ellipsis? */
114 int was_ellipsis;
115 /* Value of last entry if was character. */
116 wchar_t last_char;
117 /* Current element. */
118 element_t *current_element;
119 /* What kind of symbol is current element. */
120 enum coll_symbol kind;
122 /* While collecting the weigths we need some temporary space. */
123 unsigned int current_order;
124 int *weight_cnt;
125 int weight_idx;
126 unsigned int *weight;
127 int nweight;
128 int nweight_max;
130 /* Patch lists. */
131 patch_t *current_patch;
132 patch_t *all_patches;
134 /* Room for the UNDEFINED information. */
135 element_t undefined;
136 unsigned int undefined_len;
140 /* Be verbose? Defined in localedef.c. */
141 extern int verbose;
144 void *xmalloc (size_t __n);
145 void *xrealloc (void *__p, size_t __n);
148 #define obstack_chunk_alloc xmalloc
149 #define obstack_chunk_free free
152 void
153 collate_startup (struct linereader *lr, struct localedef_t *locale,
154 struct charset_t *charset)
156 struct locale_collate_t *collate;
158 /* It is important that we always use UCS4 encoding for strings now. */
159 encoding_method = ENC_UCS4;
161 /* Allocate the needed room. */
162 locale->categories[LC_COLLATE].collate = collate =
163 (struct locale_collate_t *) xmalloc (sizeof (struct locale_collate_t));
165 /* Allocate hash table for collating elements. */
166 if (init_hash (&collate->elements, 512))
167 error (4, 0, _("memory exhausted"));
168 collate->combine_token = NULL;
169 obstack_init (&collate->element_mem);
171 /* Allocate hash table for collating elements. */
172 if (init_hash (&collate->symbols, 64))
173 error (4, 0, _("memory exhausted"));
175 /* Allocate hash table for result. */
176 if (init_hash (&collate->result, 512))
177 error (4, 0, _("memory exhausted"));
179 collate->nrules = 0;
180 collate->nrules_max = 10;
181 collate->rules
182 = (enum coll_sort_rule *) xmalloc (collate->nrules_max
183 * sizeof (enum coll_sort_rule));
185 collate->order_cnt = 1; /* The smallest weight is 2. */
187 collate->was_ellipsis = 0;
188 collate->last_char = L'\0'; /* 0 because leading ellipsis is allowed. */
190 collate->all_patches = NULL;
192 /* This tells us no UNDEFINED entry was found until now. */
193 collate->undefined.this_weight = 0;
195 lr->translate_strings = 0;
199 void
200 collate_finish (struct localedef_t *locale, struct charset_t *charset)
202 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
203 patch_t *patch;
204 size_t cnt;
206 /* Patch the constructed table so that forward references are
207 correctly filled. */
208 for (patch = collate->all_patches; patch != NULL; patch = patch->next)
210 wchar_t wch;
211 size_t toklen = strlen (patch->token);
212 void *ptmp;
213 unsigned int value = 0;
215 wch = charset_find_value (charset, patch->token, toklen);
216 if (wch != ILLEGAL_CHAR_VALUE)
218 element_t *runp;
220 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
221 (void *) &runp) < 0)
222 runp = NULL;
223 for (; runp != NULL; runp = runp->next)
224 if (runp->name[0] == wch && runp->name[1] == L'\0')
225 break;
227 value = runp == NULL ? 0 : runp->this_weight;
229 else if (find_entry (&collate->elements, patch->token, toklen, &ptmp)
230 >= 0)
232 value = ((element_t *) ptmp)->this_weight;
234 else if (find_entry (&collate->symbols, patch->token, toklen, &ptmp)
235 >= 0)
237 value = (unsigned long int) ptmp;
239 else
240 value = 0;
242 if (value == 0)
243 error_at_line (0, 0, patch->fname, patch->lineno,
244 _("no weight defined for symbol `%s'"), patch->token);
245 else
246 *patch->where.pos = value;
249 /* If no definition for UNDEFINED is given, all characters in the
250 given charset must be specified. */
251 if (collate->undefined.ordering == NULL)
253 /**************************************************************\
254 |* XXX We should test whether really an unspecified character *|
255 |* exists before giving the message. *|
256 \**************************************************************/
257 u_int32_t weight;
259 error (0, 0, _("no definition of `UNDEFINED'"));
261 collate->undefined.ordering_len = collate->nrules;
262 weight = ++collate->order_cnt;
264 for (cnt = 0; cnt < collate->nrules; ++cnt)
266 u_int32_t one = 1;
267 obstack_grow (&collate->element_mem, &one, sizeof (one));
270 for (cnt = 0; cnt < collate->nrules; ++cnt)
271 obstack_grow (&collate->element_mem, &weight, sizeof (weight));
273 collate->undefined.ordering = obstack_finish (&collate->element_mem);
276 collate->undefined_len = 2; /* For the name: 1 x wchar_t + L'\0'. */
277 for (cnt = 0; cnt < collate->nrules; ++cnt)
278 collate->undefined_len += 1 + collate->undefined.ordering[cnt];
283 void
284 collate_output (struct localedef_t *locale, struct charset_t *charset,
285 const char *output_path)
287 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
288 u_int32_t table_size, table_best, level_best, sum_best;
289 void *last;
290 element_t *pelem;
291 wchar_t *name;
292 size_t len;
293 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
294 struct iovec iov[2 + nelems];
295 struct locale_file data;
296 u_int32_t idx[nelems];
297 struct obstack non_simple;
298 struct obstack string_pool;
299 size_t cnt, entry_size;
300 u_int32_t undefined_offset = UINT_MAX;
301 u_int32_t *table, *extra, *table2, *extra2;
302 size_t extra_len;
303 u_int32_t element_hash_tab_size;
304 u_int32_t *element_hash_tab;
305 u_int32_t *element_hash_tab_ob;
306 u_int32_t element_string_pool_size;
307 char *element_string_pool;
308 u_int32_t element_value_size;
309 wchar_t *element_value;
310 wchar_t *element_value_ob;
311 u_int32_t symbols_hash_tab_size;
312 u_int32_t *symbols_hash_tab;
313 u_int32_t *symbols_hash_tab_ob;
314 u_int32_t symbols_string_pool_size;
315 char *symbols_string_pool;
316 u_int32_t symbols_class_size;
317 u_int32_t *symbols_class;
318 u_int32_t *symbols_class_ob;
319 hash_table *hash_tab;
320 unsigned int dummy_weights[collate->nrules + 1];
322 sum_best = UINT_MAX;
323 table_best = 0xffff;
324 level_best = 0xffff;
326 /* Compute table size. */
327 fputs (_("\
328 Computing table size for collation information might take a while..."),
329 stderr);
330 for (table_size = 256; table_size < sum_best; ++table_size)
332 size_t hits[table_size];
333 unsigned int worst = 1;
334 size_t cnt;
336 last = NULL;
338 for (cnt = 0; cnt < 256; ++cnt)
339 hits[cnt] = 1;
340 memset (&hits[256], '\0', sizeof (hits) - 256 * sizeof (size_t));
342 while (iterate_table (&collate->result, &last, (const void **) &name,
343 &len, (void **) &pelem) >= 0)
344 if (pelem->ordering != NULL && pelem->name[0] > 0xff)
345 if (++hits[(unsigned int) pelem->name[0] % table_size] > worst)
347 worst = hits[(unsigned int) pelem->name[0] % table_size];
348 if (table_size * worst > sum_best)
349 break;
352 if (table_size * worst < sum_best)
354 sum_best = table_size * worst;
355 table_best = table_size;
356 level_best = worst;
359 assert (table_best != 0xffff || level_best != 0xffff);
360 fputs (_(" done\n"), stderr);
362 obstack_init (&non_simple);
363 obstack_init (&string_pool);
365 data.magic = LIMAGIC (LC_COLLATE);
366 data.n = nelems;
367 iov[0].iov_base = (void *) &data;
368 iov[0].iov_len = sizeof (data);
370 iov[1].iov_base = (void *) idx;
371 iov[1].iov_len = sizeof (idx);
373 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_base = &collate->nrules;
374 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_NRULES)].iov_len = sizeof (u_int32_t);
376 table = (u_int32_t *) alloca (collate->nrules * sizeof (u_int32_t));
377 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_base = table;
378 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_RULES)].iov_len
379 = collate->nrules * sizeof (u_int32_t);
380 /* Another trick here. Describing the collation method needs only a
381 few bits (3, to be exact). But the binary file should be
382 accessible by maschines with both endianesses and so we store both
383 information in the same word. */
384 for (cnt = 0; cnt < collate->nrules; ++cnt)
385 table[cnt] = collate->rules[cnt] | SWAPU32 (collate->rules[cnt]);
387 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_base = &table_best;
388 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_SIZE)].iov_len = sizeof (u_int32_t);
390 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_base = &level_best;
391 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_HASH_LAYERS)].iov_len
392 = sizeof (u_int32_t);
394 entry_size = 1 + MAX (collate->nrules, 2);
396 table = (u_int32_t *) alloca (table_best * level_best * entry_size
397 * sizeof (table[0]));
398 memset (table, '\0', table_best * level_best * entry_size
399 * sizeof (table[0]));
402 /* Macros for inserting in output table. */
403 #define ADD_VALUE(expr) \
404 do { \
405 u_int32_t to_write = (u_int32_t) expr; \
406 obstack_grow (&non_simple, &to_write, sizeof (to_write)); \
407 } while (0)
409 #define ADD_ELEMENT(pelem, len) \
410 do { \
411 size_t cnt, idx; \
413 ADD_VALUE (len); \
415 wlen = wcslen (pelem->name); \
416 obstack_grow (&non_simple, pelem->name, (wlen + 1) * sizeof (u_int32_t)); \
418 idx = collate->nrules; \
419 for (cnt = 0; cnt < collate->nrules; ++cnt) \
421 size_t disp; \
423 ADD_VALUE (pelem->ordering[cnt]); \
424 for (disp = 0; disp < pelem->ordering[cnt]; ++disp) \
425 ADD_VALUE (pelem->ordering[idx++]); \
427 } while (0)
429 #define ADD_FORWARD(pelem) \
430 do { \
431 /* We leave a reference in the main table and put all \
432 information in the table for the extended entries. */ \
433 element_t *runp; \
434 element_t *has_simple = NULL; \
435 size_t wlen; \
437 table[(level * table_best + slot) * entry_size + 1] \
438 = FORWARD_CHAR; \
439 table[(level * table_best + slot) * entry_size + 2] \
440 = obstack_object_size (&non_simple) / sizeof (u_int32_t); \
442 /* Here we have to construct the non-simple table entry. First \
443 compute the total length of this entry. */ \
444 for (runp = (pelem); runp != NULL; runp = runp->next) \
445 if (runp->ordering != NULL) \
447 u_int32_t value; \
448 size_t cnt; \
450 value = 1 + wcslen (runp->name) + 1; \
452 for (cnt = 0; cnt < collate->nrules; ++cnt) \
453 /* We have to take care for entries without ordering \
454 information. While reading them they get inserted in the \
455 table and later not removed when something goes wrong with \
456 reading its weights. */ \
458 value += 1 + runp->ordering[cnt]; \
460 if (runp->name[1] == L'\0') \
461 has_simple = runp; \
464 ADD_ELEMENT (runp, value); \
467 if (has_simple == NULL) \
469 size_t idx, cnt; \
471 ADD_VALUE (collate->undefined_len + 1); \
473 /* Add the name. */ \
474 ADD_VALUE ((pelem)->name[0]); \
475 ADD_VALUE (0); \
477 idx = collate->nrules; \
478 for (cnt = 0; cnt < collate->nrules; ++cnt) \
480 size_t disp; \
482 ADD_VALUE (collate->undefined.ordering[cnt]); \
483 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp) \
485 if (collate->undefined.ordering[idx] == ELLIPSIS_CHAR) \
486 ADD_VALUE ((pelem)->name[0]); \
487 else \
488 ADD_VALUE (collate->undefined.ordering[idx++]); \
489 ++idx; \
493 } while (0)
497 /* Fill the table now. First we look for all the characters which
498 fit into one single byte. This speeds up the 8-bit string
499 functions. */
500 last = NULL;
501 while (iterate_table (&collate->result, &last, (const void **) &name,
502 &len, (void **) &pelem) >= 0)
503 if (pelem->name[0] <= 0xff)
505 /* We have a single byte name. Now we must distinguish
506 between entries in simple form (i.e., only one value per
507 weight and no collation element starting with the same
508 character) and those which are not. */
509 size_t slot = ((size_t) pelem->name[0]);
510 const size_t level = 0;
512 table[slot * entry_size] = pelem->name[0];
514 if (pelem->name[1] == L'\0' && pelem->next == NULL
515 && pelem->ordering_len == collate->nrules)
517 /* Yes, we have a simple one. Lucky us. */
518 size_t cnt;
520 for (cnt = 0; cnt < collate->nrules; ++cnt)
521 table[slot * entry_size + 1 + cnt]
522 = pelem->ordering[collate->nrules + cnt];
524 else
525 ADD_FORWARD (pelem);
528 /* Now check for missing single byte entries. If one exist we fill
529 with the UNDEFINED entry. */
530 for (cnt = 0; cnt < 256; ++cnt)
531 /* The first weight is never 0 for existing entries. */
532 if (table[cnt * entry_size + 1] == 0)
534 /* We have to fill in the information from the UNDEFINED
535 entry. */
536 table[cnt * entry_size] = (u_int32_t) cnt;
538 if (collate->undefined.ordering_len == collate->nrules)
540 size_t inner;
542 for (inner = 0; inner < collate->nrules; ++inner)
543 if (collate->undefined.ordering[collate->nrules + inner]
544 == ELLIPSIS_CHAR)
545 table[cnt * entry_size + 1 + inner] = cnt;
546 else
547 table[cnt * entry_size + 1 + inner]
548 = collate->undefined.ordering[collate->nrules + inner];
550 else
552 if (undefined_offset != UINT_MAX)
554 table[cnt * entry_size + 1] = FORWARD_CHAR;
555 table[cnt * entry_size + 2] = undefined_offset;
557 else
559 const size_t slot = cnt;
560 const size_t level = 0;
562 ADD_FORWARD (&collate->undefined);
563 undefined_offset = table[cnt * entry_size + 2];
568 /* Now we are ready for inserting the whole rest. */
569 last = NULL;
570 while (iterate_table (&collate->result, &last, (const void **) &name,
571 &len, (void **) &pelem) >= 0)
572 if (pelem->name[0] > 0xff)
574 /* Find the position. */
575 size_t slot = ((size_t) pelem->name[0]) % table_best;
576 size_t level = 0;
578 while (table[(level * table_best + slot) * entry_size + 1] != 0)
579 ++level;
580 assert (level < level_best);
582 if (pelem->name[1] == L'\0' && pelem->next == NULL
583 && pelem->ordering_len == collate->nrules)
585 /* Again a simple entry. */
586 size_t inner;
588 for (inner = 0; inner < collate->nrules; ++inner)
589 table[(level * table_best + slot) * entry_size + 1 + inner]
590 = pelem->ordering[collate->nrules + inner];
592 else
593 ADD_FORWARD (pelem);
596 /* Add the UNDEFINED entry. */
598 /* Here we have to construct the non-simple table entry. */
599 size_t idx, cnt;
601 undefined_offset = obstack_object_size (&non_simple);
603 idx = collate->nrules;
604 for (cnt = 0; cnt < collate->nrules; ++cnt)
606 size_t disp;
608 ADD_VALUE (collate->undefined.ordering[cnt]);
609 for (disp = 0; disp < collate->undefined.ordering[cnt]; ++disp)
610 ADD_VALUE (collate->undefined.ordering[idx++]);
614 /* Finish the extra block. */
615 extra_len = obstack_object_size (&non_simple);
616 extra = (u_int32_t *) obstack_finish (&non_simple);
617 assert ((extra_len % sizeof (u_int32_t)) == 0);
619 /* Now we have to build the two array for the other byte ordering. */
620 table2 = (u_int32_t *) alloca (table_best * level_best * entry_size
621 * sizeof (table[0]));
622 extra2 = (u_int32_t *) alloca (extra_len);
624 for (cnt = 0; cnt < table_best * level_best * entry_size; ++cnt)
625 table2[cnt] = SWAPU32 (table[cnt]);
627 for (cnt = 0; cnt < extra_len / sizeof (u_int32_t); ++cnt)
628 extra2[cnt] = SWAPU32 (extra2[cnt]);
630 /* We need a simple hashing table to get a collation-element->chars
631 mapping. We again use internal hasing using a secondary hashing
632 function.
634 Each string has an associate hashing value V, computed by a
635 fixed function. To locate the string we use open addressing with
636 double hashing. The first index will be V % M, where M is the
637 size of the hashing table. If no entry is found, iterating with
638 a second, independent hashing function takes place. This second
639 value will be 1 + V % (M - 2). The approximate number of probes
640 will be
642 for unsuccessful search: (1 - N / M) ^ -1
643 for successful search: - (N / M) ^ -1 * ln (1 - N / M)
645 where N is the number of keys.
647 If we now choose M to be the next prime bigger than 4 / 3 * N,
648 we get the values 4 and 1.85 resp. Because unsuccesful searches
649 are unlikely this is a good value. Formulas: [Knuth, The Art of
650 Computer Programming, Volume 3, Sorting and Searching, 1973,
651 Addison Wesley] */
652 if (collate->elements.filled == 0)
654 /* We don't need any element table since there are no collating
655 elements. */
656 element_hash_tab_size = 0;
657 element_hash_tab = NULL;
658 element_hash_tab_ob = NULL;
659 element_string_pool_size = 0;
660 element_string_pool = NULL;
661 element_value_size = 0;
662 element_value = NULL;
663 element_value_ob = NULL;
665 else
667 void *ptr; /* Running pointer. */
668 const char *key; /* Key for current bucket. */
669 size_t keylen; /* Length of key data. */
670 const element_t *data; /* Data, i.e., the character sequence. */
672 element_hash_tab_size = next_prime ((collate->elements.filled * 4) / 3);
673 if (element_hash_tab_size < 7)
674 /* We need a minimum to make the following code work. */
675 element_hash_tab_size = 7;
677 element_hash_tab = obstack_alloc (&non_simple, (2 * element_hash_tab_size
678 * sizeof (u_int32_t)));
679 memset (element_hash_tab, '\377', (2 * element_hash_tab_size
680 * sizeof (u_int32_t)));
682 ptr = NULL;
683 while (iterate_table (&collate->elements, &ptr, (const void **) &key,
684 &keylen, (void **) &data) == 0)
686 size_t hash_val = hash_string (key, keylen);
687 size_t idx = hash_val % element_hash_tab_size;
689 if (element_hash_tab[2 * idx] != (~((u_int32_t) 0)))
691 /* We need the second hashing function. */
692 size_t c = 1 + (hash_val % (element_hash_tab_size - 2));
695 if (idx >= element_hash_tab_size - c)
696 idx -= element_hash_tab_size - c;
697 else
698 idx += c;
699 while (element_hash_tab[2 * idx] != (~((u_int32_t) 0)));
702 element_hash_tab[2 * idx] = obstack_object_size (&non_simple);
703 element_hash_tab[2 * idx + 1] = (obstack_object_size (&string_pool)
704 / sizeof (wchar_t));
706 obstack_grow0 (&non_simple, key, keylen);
707 obstack_grow (&string_pool, data->name,
708 (wcslen (data->name) + 1) * sizeof (wchar_t));
711 if (obstack_object_size (&non_simple) % 4 != 0)
712 obstack_blank (&non_simple,
713 4 - (obstack_object_size (&non_simple) % 4));
714 element_string_pool_size = obstack_object_size (&non_simple);
715 element_string_pool = obstack_finish (&non_simple);
717 element_value_size = obstack_object_size (&string_pool);
718 element_value = obstack_finish (&string_pool);
720 /* Create the tables for the other byte order. */
721 element_hash_tab_ob = obstack_alloc (&non_simple,
722 (2 * element_hash_tab_size
723 * sizeof (u_int32_t)));
724 for (cnt = 0; cnt < 2 * element_hash_tab_size; ++cnt)
725 element_hash_tab_ob[cnt] = SWAPU32 (element_hash_tab[cnt]);
727 element_value_ob = obstack_alloc (&string_pool, element_value_size);
728 if (sizeof (wchar_t) != 4)
730 fputs ("sizeof (wchar_t) != 4 currently not handled", stderr);
731 abort ();
733 for (cnt = 0; cnt < element_value_size / 4; ++cnt)
734 element_value_ob[cnt] = SWAPU32 (element_value[cnt]);
737 /* Store collation elements as map to collation class. There are
738 three kinds of symbols:
739 - simple characters
740 - collation elements
741 - collation symbols
742 We need to make a table which lets the user to access the primary
743 weight based on the symbol string. */
744 symbols_hash_tab_size = next_prime ((4 * (charset->char_table.filled
745 + collate->elements.filled
746 + collate->symbols.filled)) / 3);
747 symbols_hash_tab = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
748 * sizeof (u_int32_t)));
749 memset (symbols_hash_tab, '\377', (2 * symbols_hash_tab_size
750 * sizeof (u_int32_t)));
752 /* Now fill the array. First the symbols from the character set,
753 then the collation elements and last the collation symbols. */
754 hash_tab = &charset->char_table;
755 while (1)
757 void *ptr; /* Running pointer. */
758 const char *key; /* Key for current bucket. */
759 size_t keylen; /* Length of key data. */
760 void *data; /* Data. */
762 ptr = NULL;
763 while (iterate_table (hash_tab, &ptr, (const void **) &key,
764 &keylen, (void **) &data) == 0)
766 size_t hash_val;
767 size_t idx;
768 u_int32_t word;
769 unsigned int *weights;
771 if (hash_tab == &charset->char_table
772 || hash_tab == &collate->elements)
774 element_t *lastp, *firstp;
775 wchar_t dummy_name[2];
776 const wchar_t *name;
777 size_t name_len;
779 if (hash_tab == &charset->char_table)
781 dummy_name[0] = (wchar_t) ((unsigned long int) data);
782 dummy_name[1] = L'\0';
783 name = dummy_name;
784 name_len = sizeof (wchar_t);
786 else
788 element_t *elemp = (element_t *) data;
789 name = elemp->name;
790 name_len = wcslen (name) * sizeof (wchar_t);
793 /* First check whether this character is used at all. */
794 if (find_entry (&collate->result, name, name_len,
795 (void *) &firstp) < 0)
796 /* The symbol is not directly mentioned in the collation.
797 I.e., we use the value for UNDEFINED. */
798 lastp = &collate->undefined;
799 else
801 /* The entry for the simple character is always found at
802 the end. */
803 lastp = firstp;
804 while (lastp->next != NULL && wcscmp (name, lastp->name))
805 lastp = lastp->next;
808 weights = lastp->ordering;
810 else
812 dummy_weights[0] = 1;
813 dummy_weights[collate->nrules]
814 = (unsigned int) ((unsigned long int) data);
816 weights = dummy_weights;
819 /* In LASTP->ordering we now have the collation class.
820 Determine the place in the hashing table next. */
821 hash_val = hash_string (key, keylen);
822 idx = hash_val % symbols_hash_tab_size;
824 if (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)))
826 /* We need the second hashing function. */
827 size_t c = 1 + (hash_val % (symbols_hash_tab_size - 2));
830 if (idx >= symbols_hash_tab_size - c)
831 idx -= symbols_hash_tab_size - c;
832 else
833 idx += c;
834 while (symbols_hash_tab[2 * idx] != (~((u_int32_t) 0)));
837 symbols_hash_tab[2 * idx] = obstack_object_size (&string_pool);
838 symbols_hash_tab[2 * idx + 1] = (obstack_object_size (&non_simple)
839 / sizeof (u_int32_t));
841 obstack_grow0 (&string_pool, key, keylen);
842 /* Adding the first weight looks complicated. We have to deal
843 with the kind it is stored and with the fact that original
844 form uses `unsigned int's while we need `u_int32_t' here. */
845 word = weights[0];
846 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
847 for (cnt = 0; cnt < weights[0]; ++cnt)
849 word = weights[collate->nrules + cnt];
850 obstack_grow (&non_simple, &word, sizeof (u_int32_t));
854 if (hash_tab == &charset->char_table)
855 hash_tab = &collate->elements;
856 else if (hash_tab == &collate->elements)
857 hash_tab = &collate->symbols;
858 else
859 break;
862 /* Now we have the complete tables. */
863 if (obstack_object_size (&string_pool) % 4 != 0)
864 obstack_blank (&non_simple, 4 - (obstack_object_size (&string_pool) % 4));
865 symbols_string_pool_size = obstack_object_size (&string_pool);
866 symbols_string_pool = obstack_finish (&string_pool);
868 symbols_class_size = obstack_object_size (&non_simple);
869 symbols_class = obstack_finish (&non_simple);
871 /* Generate tables with other byte order. */
872 symbols_hash_tab_ob = obstack_alloc (&non_simple, (2 * symbols_hash_tab_size
873 * sizeof (u_int32_t)));
874 for (cnt = 0; cnt < 2 * symbols_hash_tab_size; ++cnt)
875 symbols_hash_tab_ob[cnt] = SWAPU32 (symbols_hash_tab[cnt]);
877 symbols_class_ob = obstack_alloc (&non_simple, symbols_class_size);
878 for (cnt = 0; cnt < symbols_class_size / 4; ++cnt)
879 symbols_class_ob[cnt] = SWAPU32 (symbols_class[cnt]);
882 /* Store table adresses and lengths. */
883 #if __BYTE_ORDER == __BIG_ENDIAN
884 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table;
885 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
886 = table_best * level_best * entry_size * sizeof (table[0]);
888 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table2;
889 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
890 = table_best * level_best * entry_size * sizeof (table[0]);
892 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra;
893 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
895 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra2;
896 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
897 #else
898 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_base = table2;
899 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EB)].iov_len
900 = table_best * level_best * entry_size * sizeof (table[0]);
902 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_base = table;
903 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_TABLE_EL)].iov_len
904 = table_best * level_best * entry_size * sizeof (table[0]);
906 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_base = extra2;
907 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EB)].iov_len = extra_len;
909 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_base = extra;
910 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_EXTRA_EL)].iov_len = extra_len;
911 #endif
913 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_base = &undefined_offset;
914 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_UNDEFINED)].iov_len = sizeof (u_int32_t);
917 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_base
918 = &element_hash_tab_size;
919 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_SIZE)].iov_len
920 = sizeof (u_int32_t);
922 #if __BYTE_ORDER == __BIG_ENDIAN
923 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
924 = element_hash_tab;
925 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
926 = 2 * element_hash_tab_size * sizeof (u_int32_t);
928 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
929 = element_hash_tab_ob;
930 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
931 = 2 * element_hash_tab_size * sizeof (u_int32_t);
932 #else
933 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_base
934 = element_hash_tab;
935 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EL)].iov_len
936 = 2 * element_hash_tab_size * sizeof (u_int32_t);
938 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_base
939 = element_hash_tab_ob;
940 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_HASH_EB)].iov_len
941 = 2 * element_hash_tab_size * sizeof (u_int32_t);
942 #endif
944 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_base
945 = element_string_pool;
946 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_STR_POOL)].iov_len
947 = element_string_pool_size;
949 #if __BYTE_ORDER == __BIG_ENDIAN
950 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
951 = element_value;
952 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
953 = element_value_size;
955 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
956 = element_value_ob;
957 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
958 = element_value_size;
959 #else
960 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_base
961 = element_value;
962 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EL)].iov_len
963 = element_value_size;
965 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_base
966 = element_value_ob;
967 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_ELEM_VAL_EB)].iov_len
968 = element_value_size;
969 #endif
971 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_base
972 = &symbols_hash_tab_size;
973 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZE)].iov_len
974 = sizeof (u_int32_t);
976 #if __BYTE_ORDER == __BIG_ENDIAN
977 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
978 = symbols_hash_tab;
979 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
980 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
982 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
983 = symbols_hash_tab_ob;
984 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
985 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
986 #else
987 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_base
988 = symbols_hash_tab;
989 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EL)].iov_len
990 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
992 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_base
993 = symbols_hash_tab_ob;
994 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_EB)].iov_len
995 = 2 * symbols_hash_tab_size * sizeof (u_int32_t);
996 #endif
998 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_base
999 = symbols_string_pool;
1000 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_STR_POOL)].iov_len
1001 = symbols_string_pool_size;
1003 #if __BYTE_ORDER == __BIG_ENDIAN
1004 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1005 = symbols_class;
1006 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1007 = symbols_class_size;
1009 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1010 = symbols_class_ob;
1011 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1012 = symbols_class_size;
1013 #else
1014 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_base
1015 = symbols_class;
1016 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EL)].iov_len
1017 = symbols_class_size;
1019 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_base
1020 = symbols_class_ob;
1021 iov[2 + _NL_ITEM_INDEX (_NL_COLLATE_SYMB_CLASS_EB)].iov_len
1022 = symbols_class_size;
1023 #endif
1025 /* Update idx array. */
1026 idx[0] = iov[0].iov_len + iov[1].iov_len;
1027 for (cnt = 1; cnt < nelems; ++cnt)
1028 idx[cnt] = idx[cnt - 1] + iov[1 + cnt].iov_len;
1030 write_locale_data (output_path, "LC_COLLATE", 2 + nelems, iov);
1032 obstack_free (&non_simple, NULL);
1033 obstack_free (&string_pool, NULL);
1037 void
1038 collate_element_to (struct linereader *lr, struct localedef_t *locale,
1039 struct token *code, struct charset_t *charset)
1041 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1042 unsigned int value;
1043 void *not_used;
1045 if (collate->combine_token != NULL)
1047 free ((void *) collate->combine_token);
1048 collate->combine_token = NULL;
1051 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1052 if (value != ILLEGAL_CHAR_VALUE)
1054 lr_error (lr, _("symbol for multicharacter collating element "
1055 "`%.*s' duplicates symbolic name in charset"),
1056 code->val.str.len, code->val.str.start);
1057 return;
1060 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1061 &not_used) >= 0)
1063 lr_error (lr, _("symbol for multicharacter collating element "
1064 "`%.*s' duplicates other element definition"),
1065 code->val.str.len, code->val.str.start);
1066 return;
1069 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1070 &not_used) >= 0)
1072 lr_error (lr, _("symbol for multicharacter collating element "
1073 "`%.*s' duplicates symbol definition"),
1074 code->val.str.len, code->val.str.start);
1075 return;
1078 collate->combine_token = code->val.str.start;
1079 collate->combine_token_len = code->val.str.len;
1083 void
1084 collate_element_from (struct linereader *lr, struct localedef_t *locale,
1085 struct token *code, struct charset_t *charset)
1087 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1088 element_t *elemp, *runp;
1090 /* CODE is a string. */
1091 elemp = (element_t *) obstack_alloc (&collate->element_mem,
1092 sizeof (element_t));
1094 /* We have to translate the string. It may contain <...> character
1095 names. */
1096 elemp->name = (wchar_t *) translate_string (code->val.str.start, charset);
1097 elemp->this_weight = 0;
1098 elemp->ordering = NULL;
1099 elemp->ordering_len = 0;
1101 free (code->val.str.start);
1103 if (elemp->name == NULL)
1105 /* At least one character in the string is not defined. We simply
1106 do nothing. */
1107 if (verbose)
1108 lr_error (lr, _("\
1109 `from' string in collation element declaration contains unknown character"));
1110 return;
1113 if (elemp->name[0] == L'\0' || elemp->name[1] == L'\0')
1115 lr_error (lr, _("illegal collation element"));
1116 return;
1119 /* The entries in the linked lists of RESULT are sorting in
1120 descending order. The order is important for the `strcoll' and
1121 `wcscoll' functions. */
1122 if (find_entry (&collate->result, elemp->name, sizeof (wchar_t),
1123 (void *) &runp) >= 0)
1125 /* We already have an entry with this key. Check whether it is
1126 identical. */
1127 element_t *prevp = NULL;
1128 int cmpres;
1132 cmpres = wcscmp (elemp->name, runp->name);
1133 if (cmpres <= 0)
1134 break;
1135 prevp = runp;
1137 while ((runp = runp->next) != NULL);
1139 if (cmpres == 0)
1140 lr_error (lr, _("duplicate collating element definition"));
1141 else
1143 elemp->next = runp;
1144 if (prevp == NULL)
1146 if (set_entry (&collate->result, elemp->name, sizeof (wchar_t),
1147 elemp) < 0)
1148 error (EXIT_FAILURE, 0, _("\
1149 error while inserting collation element into hash table"));
1151 else
1152 prevp->next = elemp;
1155 else
1157 elemp->next = NULL;
1158 if (insert_entry (&collate->result, elemp->name, sizeof (wchar_t), elemp)
1159 < 0)
1160 error (EXIT_FAILURE, errno, _("error while inserting to hash table"));
1163 if (insert_entry (&collate->elements, collate->combine_token,
1164 collate->combine_token_len, (void *) elemp) < 0)
1165 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1166 strerror (errno));
1170 void
1171 collate_symbol (struct linereader *lr, struct localedef_t *locale,
1172 struct token *code, struct charset_t *charset)
1174 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1175 wchar_t value;
1176 void *not_used;
1178 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1179 if (value != ILLEGAL_CHAR_VALUE)
1181 lr_error (lr, _("symbol for multicharacter collating element "
1182 "`%.*s' duplicates symbolic name in charset"),
1183 code->val.str.len, code->val.str.start);
1184 return;
1187 if (find_entry (&collate->elements, code->val.str.start, code->val.str.len,
1188 &not_used) >= 0)
1190 lr_error (lr, _("symbol for multicharacter collating element "
1191 "`%.*s' duplicates element definition"),
1192 code->val.str.len, code->val.str.start);
1193 return;
1196 if (find_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1197 &not_used) >= 0)
1199 lr_error (lr, _("symbol for multicharacter collating element "
1200 "`%.*s' duplicates other symbol definition"),
1201 code->val.str.len, code->val.str.start);
1202 return;
1205 if (insert_entry (&collate->symbols, code->val.str.start, code->val.str.len,
1206 (void *) 0) < 0)
1207 lr_error (lr, _("cannot insert new collating symbol definition: %s"),
1208 strerror (errno));
1212 void
1213 collate_new_order (struct linereader *lr, struct localedef_t *locale,
1214 enum coll_sort_rule sort_rule)
1216 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1218 if (collate->nrules >= collate->nrules_max)
1220 collate->nrules_max *= 2;
1221 collate->rules
1222 = (enum coll_sort_rule *) xrealloc (collate->rules,
1223 collate->nrules_max
1224 * sizeof (enum coll_sort_rule));
1227 collate->rules[collate->nrules++] = sort_rule;
1231 void
1232 collate_build_arrays (struct linereader *lr, struct localedef_t *locale)
1234 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1236 collate->rules
1237 = (enum coll_sort_rule *) xrealloc (collate->rules,
1238 collate->nrules
1239 * sizeof (enum coll_sort_rule));
1241 /* Allocate arrays for temporary weights. */
1242 collate->weight_cnt = (int *) xmalloc (collate->nrules * sizeof (int));
1244 /* Choose arbitrary start value for table size. */
1245 collate->nweight_max = 5 * collate->nrules;
1246 collate->weight = (int *) xmalloc (collate->nweight_max * sizeof (int));
1251 collate_order_elem (struct linereader *lr, struct localedef_t *locale,
1252 struct token *code, struct charset_t *charset)
1254 const wchar_t zero = L'\0';
1255 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1256 int result = 0;
1257 wchar_t value;
1258 void *tmp;
1259 int i;
1261 switch (code->tok)
1263 case tok_bsymbol:
1264 /* We have a string to find in one of the three hashing tables. */
1265 value = charset_find_value (charset, code->val.str.start,
1266 code->val.str.len);
1267 if (value != ILLEGAL_CHAR_VALUE)
1269 element_t *lastp, *firstp;
1271 collate->kind = character;
1273 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1274 (void *) &firstp) < 0)
1275 firstp = lastp = NULL;
1276 else
1278 /* The entry for the simple character is always found at
1279 the end. */
1280 lastp = firstp;
1281 while (lastp->next != NULL)
1282 lastp = lastp->next;
1284 if (lastp->name[0] == value && lastp->name[1] == L'\0')
1286 lr_error (lr, _("duplicate definition for character `%.*s'"),
1287 code->val.str.len, code->val.str.start);
1288 lr_ignore_rest (lr, 0);
1289 result = -1;
1290 break;
1294 collate->current_element
1295 = (element_t *) obstack_alloc (&collate->element_mem,
1296 sizeof (element_t));
1298 obstack_grow (&collate->element_mem, &value, sizeof (value));
1299 obstack_grow (&collate->element_mem, &zero, sizeof (zero));
1301 collate->current_element->name =
1302 (const wchar_t *) obstack_finish (&collate->element_mem);
1304 collate->current_element->this_weight = ++collate->order_cnt;
1306 collate->current_element->next = NULL;
1308 if (firstp == NULL)
1310 if (insert_entry (&collate->result, &value, sizeof (wchar_t),
1311 (void *) collate->current_element) < 0)
1313 lr_error (lr, _("cannot insert collation element `%.*s'"),
1314 code->val.str.len, code->val.str.start);
1315 exit (4);
1318 else
1319 lastp->next = collate->current_element;
1321 else if (find_entry (&collate->elements, code->val.str.start,
1322 code->val.str.len, &tmp) >= 0)
1324 collate->current_element = (element_t *) tmp;
1326 if (collate->current_element->this_weight != 0)
1328 lr_error (lr, _("\
1329 collation element `%.*s' appears more than once: ignore line"),
1330 code->val.str.len, code->val.str.start);
1331 lr_ignore_rest (lr, 0);
1332 result = -1;
1333 break;
1336 collate->kind = element;
1337 collate->current_element->this_weight = ++collate->order_cnt;
1339 else if (find_entry (&collate->symbols, code->val.str.start,
1340 code->val.str.len, &tmp) >= 0)
1342 unsigned int order = ++collate->order_cnt;
1344 if ((unsigned long int) tmp != 0ul)
1346 lr_error (lr, _("\
1347 collation symbol `.*s' appears more than once: ignore line"),
1348 code->val.str.len, code->val.str.start);
1349 lr_ignore_rest (lr, 0);
1350 result = -1;
1351 break;
1354 collate->kind = symbol;
1356 if (set_entry (&collate->symbols, code->val.str.start,
1357 code->val.str.len, (void *) order) < 0)
1359 lr_error (lr, _("cannot process order specification"));
1360 exit (4);
1363 else
1365 if (verbose)
1366 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1367 code->val.str.len, code->val.str.start);
1368 lr_ignore_rest (lr, 0);
1370 result = -1;
1372 break;
1374 case tok_undefined:
1375 collate->kind = undefined;
1376 collate->current_element = &collate->undefined;
1377 break;
1379 case tok_ellipsis:
1380 if (collate->was_ellipsis)
1382 lr_error (lr, _("\
1383 two lines in a row containing `...' are not allowed"));
1384 result = -1;
1386 else if (collate->kind != character)
1388 /* An ellipsis requires the previous line to be an
1389 character definition. */
1390 lr_error (lr, _("\
1391 line before ellipsis does not contain definition for character constant"));
1392 lr_ignore_rest (lr, 0);
1393 result = -1;
1395 else
1396 collate->kind = ellipsis;
1397 break;
1399 default:
1400 assert (! "illegal token in `collate_order_elem'");
1403 /* Now it's time to handle the ellipsis in the previous line. We do
1404 this only when the last line contained an definition for a
1405 character, the current line also defines an character, the
1406 character code for the later is bigger than the former. */
1407 if (collate->was_ellipsis)
1409 if (collate->kind != character)
1411 lr_error (lr, _("\
1412 line after ellipsis must contain character definition"));
1413 lr_ignore_rest (lr, 0);
1414 result = -1;
1416 else if (collate->last_char > value)
1418 lr_error (lr, _("end point of ellipsis range is bigger then start"));
1419 lr_ignore_rest (lr, 0);
1420 result = -1;
1422 else
1424 /* We can fill the arrays with the information we need. */
1425 wchar_t name[2];
1426 unsigned int *data;
1427 size_t *ptr;
1428 size_t cnt;
1430 name[0] = collate->last_char + 1;
1431 name[1] = L'\0';
1433 data = (unsigned int *) alloca ((collate->nrules + collate->nweight)
1434 * sizeof (unsigned int));
1435 ptr = (size_t *) alloca (collate->nrules * sizeof (size_t));
1437 if (data == NULL || ptr == NULL)
1438 error (4, 0, _("memory exhausted"));
1440 /* Prepare data. Because the characters covered by an
1441 ellipsis all have equal values we prepare the data once
1442 and only change the variable number (if there are any).
1443 PTR[...] will point to the entries which will have to be
1444 fixed during the output loop. */
1445 for (cnt = 0; cnt < collate->nrules; ++cnt)
1447 data[cnt] = collate->weight_cnt[cnt];
1448 ptr[cnt] = (cnt == 0
1449 ? collate->nweight
1450 : ptr[cnt - 1] + collate->weight_cnt[cnt - 1]);
1453 for (cnt = 0; cnt < collate->nweight; ++cnt)
1454 data[collate->nrules + cnt] = collate->weight[cnt];
1456 for (cnt = 0; cnt < collate->nrules; ++cnt)
1457 if (data[ptr[cnt]] != ELLIPSIS_CHAR)
1458 ptr[cnt] = 0;
1460 while (name[0] <= value)
1462 element_t *pelem;
1464 pelem = (element_t *) obstack_alloc (&collate->element_mem,
1465 sizeof (element_t));
1466 if (pelem == NULL)
1467 error (4, 0, _("memory exhausted"));
1469 pelem->name
1470 = (const wchar_t *) obstack_copy (&collate->element_mem,
1471 name, 2 * sizeof (wchar_t));
1472 pelem->this_weight = ++collate->order_cnt;
1474 pelem->ordering_len = collate->nweight;
1475 pelem->ordering
1476 = (unsigned int *) obstack_copy (&collate->element_mem, data,
1477 (collate->nrules
1478 * pelem->ordering_len)
1479 * sizeof (unsigned int));
1481 /* `...' weights need to be adjusted. */
1482 for (cnt = 0; cnt < collate->nrules; ++cnt)
1483 if (ptr[cnt] != 0)
1484 pelem->ordering[ptr[cnt]] = pelem->this_weight;
1486 /* Insert new entry into result table. */
1487 if (find_entry (&collate->result, name, sizeof (wchar_t),
1488 (void *) &pelem->next) >= 0)
1490 if (set_entry (&collate->result, name, sizeof (wchar_t),
1491 (void *) pelem->next) < 0)
1492 error (4, 0, _("cannot insert into result table"));
1494 else
1495 if (insert_entry (&collate->result, name, sizeof (wchar_t),
1496 (void *) pelem->next) < 0)
1497 error (4, 0, _("cannot insert into result table"));
1499 /* Increment counter. */
1500 ++name[0];
1505 /* Reset counters for weights. */
1506 collate->weight_idx = 0;
1507 collate->nweight = 0;
1508 for (i = 0; i < collate->nrules; ++i)
1509 collate->weight_cnt[i] = 0;
1510 collate->current_patch = NULL;
1512 return result;
1517 collate_weight_bsymbol (struct linereader *lr, struct localedef_t *locale,
1518 struct token *code, struct charset_t *charset)
1520 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1521 unsigned int here_weight;
1522 wchar_t value;
1523 void *tmp;
1525 assert (code->tok == tok_bsymbol);
1527 value = charset_find_value (charset, code->val.str.start, code->val.str.len);
1528 if (value != ILLEGAL_CHAR_VALUE)
1530 element_t *runp;
1532 if (find_entry (&collate->result, &value, sizeof (wchar_t),
1533 (void *)&runp) < 0)
1534 runp = NULL;
1536 while (runp != NULL
1537 && (runp->name[0] != value || runp->name[1] != L'\0'))
1538 runp = runp->next;
1540 here_weight = runp == NULL ? 0 : runp->this_weight;
1542 else if (find_entry (&collate->elements, code->val.str.start,
1543 code->val.str.len, &tmp) >= 0)
1545 element_t *runp = (element_t *) tmp;
1547 here_weight = runp->this_weight;
1549 else if (find_entry (&collate->symbols, code->val.str.start,
1550 code->val.str.len, &tmp) >= 0)
1552 here_weight = (unsigned int) tmp;
1554 else
1556 if (verbose)
1557 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1558 code->val.str.len, code->val.str.start);
1559 lr_ignore_rest (lr, 0);
1560 return -1;
1563 /* When we currently work on a collation symbol we do not expect any
1564 weight. */
1565 if (collate->kind == symbol)
1567 lr_error (lr, _("\
1568 specification of sorting weight for collation symbol does not make sense"));
1569 lr_ignore_rest (lr, 0);
1570 return -1;
1573 /* Add to the current collection of weights. */
1574 if (collate->nweight >= collate->nweight_max)
1576 collate->nweight_max *= 2;
1577 collate->weight = (unsigned int *) xrealloc (collate->weight,
1578 collate->nweight_max);
1581 /* If the weight is currently not known, we remember to patch the
1582 resulting tables. */
1583 if (here_weight == 0)
1585 patch_t *newp;
1587 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1588 sizeof (patch_t));
1589 newp->fname = lr->fname;
1590 newp->lineno = lr->lineno;
1591 newp->token = (const char *) obstack_copy0 (&collate->element_mem,
1592 code->val.str.start,
1593 code->val.str.len);
1594 newp->where.idx = collate->nweight++;
1595 newp->next = collate->current_patch;
1596 collate->current_patch = newp;
1598 else
1599 collate->weight[collate->nweight++] = here_weight;
1600 ++collate->weight_cnt[collate->weight_idx];
1602 return 0;
1607 collate_next_weight (struct linereader *lr, struct localedef_t *locale)
1609 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1611 if (collate->kind == symbol)
1613 lr_error (lr, _("\
1614 specification of sorting weight for collation symbol does not make sense"));
1615 lr_ignore_rest (lr, 0);
1616 return -1;
1619 ++collate->weight_idx;
1620 if (collate->weight_idx >= collate->nrules)
1622 lr_error (lr, _("too many weights"));
1623 lr_ignore_rest (lr, 0);
1624 return -1;
1627 return 0;
1632 collate_simple_weight (struct linereader *lr, struct localedef_t *locale,
1633 struct token *code, struct charset_t *charset)
1635 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1636 unsigned int value = 0;
1638 /* There current tokens can be `IGNORE', `...', or a string. */
1639 switch (code->tok)
1641 case tok_ignore:
1642 /* This token is allowed in all situations. */
1643 value = IGNORE_CHAR;
1644 break;
1646 case tok_ellipsis:
1647 /* The ellipsis is only allowed for the `...' or `UNDEFINED'
1648 entry. */
1649 if (collate->kind != ellipsis && collate->kind != undefined)
1651 lr_error (lr, _("\
1652 `...' must only be used in `...' and `UNDEFINED' entries"));
1653 lr_ignore_rest (lr, 0);
1654 return -1;
1656 value = ELLIPSIS_CHAR;
1657 break;
1659 case tok_string:
1660 /* This can become difficult. We have to get the weights which
1661 correspind the the single wide chars in the string. But some
1662 of the `chars' might not be real characters, but collation
1663 elements or symbols. And so the string decoder might have
1664 signaled errors. The string at this point is not translated.
1665 I.e., all <...> sequences are still there. */
1667 char *runp = code->val.str.start;
1668 void *tmp;
1670 while (*runp != '\0')
1672 char *startp = (char *) runp;
1673 char *putp = (char *) runp;
1674 wchar_t wch;
1676 /* Lookup weight for char and store it. */
1677 if (*runp == '<')
1679 while (*++runp != '\0' && *runp != '>')
1681 if (*runp == lr->escape_char)
1682 if (*++runp == '\0')
1684 lr_error (lr, _("unterminated weight name"));
1685 lr_ignore_rest (lr, 0);
1686 return -1;
1688 *putp++ = *runp;
1690 if (*runp == '>')
1691 ++runp;
1693 if (putp == startp)
1695 lr_error (lr, _("empty weight name: line ignored"));
1696 lr_ignore_rest (lr, 0);
1697 return -1;
1700 wch = charset_find_value (charset, startp, putp - startp);
1701 if (wch != ILLEGAL_CHAR_VALUE)
1703 element_t *pelem;
1705 if (find_entry (&collate->result, &wch, sizeof (wchar_t),
1706 (void *)&pelem) < 0)
1707 pelem = NULL;
1709 while (pelem != NULL
1710 && (pelem->name[0] != wch
1711 || pelem->name[1] != L'\0'))
1712 pelem = pelem->next;
1714 value = pelem == NULL ? 0 : pelem->this_weight;
1716 else if (find_entry (&collate->elements, startp, putp - startp,
1717 &tmp) >= 0)
1719 element_t *pelem = (element_t *) tmp;
1721 value = pelem->this_weight;
1723 else if (find_entry (&collate->symbols, startp, putp - startp,
1724 &tmp) >= 0)
1726 value = (unsigned int) tmp;
1728 else
1730 if (verbose)
1731 lr_error (lr, _("unknown symbol `%.*s': line ignored"),
1732 putp - startp, startp);
1733 lr_ignore_rest (lr, 0);
1734 return -1;
1737 else
1739 element_t *wp;
1740 wchar_t wch;
1742 if (*runp == lr->escape_char)
1744 static char digits[] = "0123456789abcdef";
1745 char *dp;
1746 int base;
1748 ++runp;
1749 if (tolower (*runp) == 'x')
1751 ++runp;
1752 base = 16;
1754 else if (tolower (*runp) == 'd')
1756 ++runp;
1757 base = 10;
1759 else
1760 base = 8;
1762 dp = strchr (digits, tolower (*runp));
1763 if (dp == NULL || (dp - digits) >= base)
1765 illegal_char:
1766 lr_error (lr, _("\
1767 illegal character constant in string"));
1768 lr_ignore_rest (lr, 0);
1769 return -1;
1771 wch = dp - digits;
1772 ++runp;
1774 dp = strchr (digits, tolower (*runp));
1775 if (dp == NULL || (dp - digits) >= base)
1776 goto illegal_char;
1777 wch *= base;
1778 wch += dp - digits;
1779 ++runp;
1781 if (base != 16)
1783 dp = strchr (digits, tolower (*runp));
1784 if (dp != NULL && (dp - digits < base))
1786 wch *= base;
1787 wch += dp - digits;
1788 ++runp;
1792 else
1793 wch = (wchar_t) *runp++;
1795 /* Lookup the weight for WCH. */
1796 if (find_entry (&collate->result, &wch, sizeof (wch),
1797 (void *)&wp) < 0)
1798 wp = NULL;
1800 while (wp != NULL
1801 && (wp->name[0] != wch || wp->name[1] != L'\0'))
1802 wp = wp->next;
1804 value = wp == NULL ? 0 : wp->this_weight;
1806 /* To get the correct name for the error message. */
1807 putp = runp;
1809 /**************************************************\
1810 |* I know here is something wrong. Characters in *|
1811 |* the string which are not in the <...> form *|
1812 |* cannot be declared forward for now!!! *|
1813 \**************************************************/
1816 /* Store in weight array. */
1817 if (collate->nweight >= collate->nweight_max)
1819 collate->nweight_max *= 2;
1820 collate->weight
1821 = (unsigned int *) xrealloc (collate->weight,
1822 collate->nweight_max);
1825 if (value == 0)
1827 patch_t *newp;
1829 newp = (patch_t *) obstack_alloc (&collate->element_mem,
1830 sizeof (patch_t));
1831 newp->fname = lr->fname;
1832 newp->lineno = lr->lineno;
1833 newp->token
1834 = (const char *) obstack_copy0 (&collate->element_mem,
1835 startp, putp - startp);
1836 newp->where.idx = collate->nweight++;
1837 newp->next = collate->current_patch;
1838 collate->current_patch = newp;
1840 else
1841 collate->weight[collate->nweight++] = value;
1842 ++collate->weight_cnt[collate->weight_idx];
1845 return 0;
1847 default:
1848 assert (! "should not happen");
1852 if (collate->nweight >= collate->nweight_max)
1854 collate->nweight_max *= 2;
1855 collate->weight = (unsigned int *) xrealloc (collate->weight,
1856 collate->nweight_max);
1859 collate->weight[collate->nweight++] = value;
1860 ++collate->weight_cnt[collate->weight_idx];
1862 return 0;
1866 void
1867 collate_end_weight (struct linereader *lr, struct localedef_t *locale)
1869 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1870 element_t *pelem = collate->current_element;
1872 if (collate->kind == symbol)
1874 /* We don't have to do anything. */
1875 collate->was_ellipsis = 0;
1876 return;
1879 if (collate->kind == ellipsis)
1881 /* Before the next line is processed the ellipsis is handled. */
1882 collate->was_ellipsis = 1;
1883 return;
1886 assert (collate->kind == character || collate->kind == element
1887 || collate->kind == undefined);
1889 /* Fill in the missing weights. */
1890 while (++collate->weight_idx < collate->nrules)
1892 collate->weight[collate->nweight++] = pelem->this_weight;
1893 ++collate->weight_cnt[collate->weight_idx];
1896 /* Now we know how many ordering weights the current
1897 character/element has. Allocate room in the element structure
1898 and copy information. */
1899 pelem->ordering_len = collate->nweight;
1901 /* First we write an array with the number of values for each
1902 weight. */
1903 obstack_grow (&collate->element_mem, collate->weight_cnt,
1904 collate->nrules * sizeof (unsigned int));
1906 /* Now the weights itselves. */
1907 obstack_grow (&collate->element_mem, collate->weight,
1908 collate->nweight * sizeof (unsigned int));
1910 /* Get result. */
1911 pelem->ordering = obstack_finish (&collate->element_mem);
1913 /* Now we handle the "patches". */
1914 while (collate->current_patch != NULL)
1916 patch_t *this_patch;
1918 this_patch = collate->current_patch;
1920 this_patch->where.pos = &pelem->ordering[collate->nrules
1921 + this_patch->where.idx];
1923 collate->current_patch = this_patch->next;
1924 this_patch->next = collate->all_patches;
1925 collate->all_patches = this_patch;
1928 /* Set information for next round. */
1929 collate->was_ellipsis = 0;
1930 if (collate->kind != undefined)
1931 collate->last_char = pelem->name[0];