Update.
[glibc.git] / locale / programs / ld-collate.c
blobb9ea186d3129b5b6577d421afa62d9f3e44938b0
1 /* Copyright (C) 1995-1999, 2000, 2001, 2002 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <errno.h>
25 #include <error.h>
26 #include <stdlib.h>
27 #include <wchar.h>
28 #include <sys/param.h>
30 #include "localedef.h"
31 #include "charmap.h"
32 #include "localeinfo.h"
33 #include "linereader.h"
34 #include "locfile.h"
35 #include "elem-hash.h"
37 /* Uncomment the following line in the production version. */
38 /* #define NDEBUG 1 */
39 #include <assert.h>
41 #define obstack_chunk_alloc malloc
42 #define obstack_chunk_free free
/* Append the 32-bit value DATA to the object currently being grown in
   OBSTACK.  When `int' has the same size as `int32_t' the `int'
   specific obstack macro is used; otherwise the bytes of DATA are
   copied with the generic grow macro.  */
static inline void
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) != sizeof (int))
    {
      obstack_grow (obstack, &data, sizeof (int32_t));
      return;
    }

  obstack_int_grow (obstack, data);
}
/* Like obstack_int32_grow, but use the `fast' obstack macro when `int'
   has the same size as `int32_t'.  NOTE(review): per the obstack
   documentation the fast macros do not check for room, so the caller
   presumably has reserved space beforehand -- confirm at the call
   sites.  If the sizes differ the regular, checking grow macro is
   used.  */
static inline void
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) != sizeof (int))
    {
      obstack_grow (obstack, &data, sizeof (int32_t));
      return;
    }

  obstack_int_grow_fast (obstack, data);
}
62 /* Forward declaration. */
63 struct element_t;
/* Description of one named section.  Every section is a member of up
   to two singly linked lists, chained through `def_next' and `next'.  */
struct section_list
{
  /* Next entry in the `known_sections' list.  */
  struct section_list *def_next;
  /* Next entry in the `sections' list.  */
  struct section_list *next;
  /* The section's name.  */
  const char *name;
  /* First element belonging to this section.  */
  struct element_t *first;
  /* Last element belonging to this section.  */
  struct element_t *last;
  /* The sorting rules which apply to this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output
     file.  */
  int ruleidx;
};
84 struct element_t;
/* A counted array of element pointers; `struct element_t' keeps one
   of these per weight level.  */
struct element_list_t
{
  /* Number of entries in W.  */
  int cnt;

  /* The entries themselves.  */
  struct element_t **w;
};
/* Everything known about one collating element.  */
struct element_t
{
  /* Symbolic name; NULL for anonymous elements.  */
  const char *name;

  /* Multibyte representation and its length in bytes (NULL/0 if not
     known), followed by the zero-terminated wide character
     representation and its length.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* Bit mask with one bit per level; a bit is set if this element is
     used in the corresponding level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* The weights of this element, one list per rule.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Position of the character in the sequence.  This information is
     used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* File and line the definition comes from.  */
  const char *file;
  size_t line;

  /* The section this element belongs to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Links for the multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Links for the wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
144 /* Special element value. */
145 #define ELEMENT_ELLIPSIS2 ((struct element_t *) 1)
146 #define ELEMENT_ELLIPSIS3 ((struct element_t *) 2)
147 #define ELEMENT_ELLIPSIS4 ((struct element_t *) 3)
/* Everything known about one collating symbol.  */
struct symbol_t
{
  /* Name of the symbol.  */
  const char *name;

  /* The symbol's place in the order list; NULL until one has been
     assigned.  */
  struct element_t *order;

  /* File and line the definition comes from.  */
  const char *file;
  size_t line;
};
162 /* Sparse table of struct element_t *. */
163 #define TABLE wchead_table
164 #define ELEMENT struct element_t *
165 #define DEFAULT NULL
166 #define ITERATE
167 #define NO_FINALIZE
168 #include "3level.h"
170 /* Sparse table of int32_t. */
171 #define TABLE collidx_table
172 #define ELEMENT int32_t
173 #define DEFAULT 0
174 #include "3level.h"
176 /* Sparse table of uint32_t. */
177 #define TABLE collseq_table
178 #define ELEMENT uint32_t
179 #define DEFAULT ~((uint32_t) 0)
180 #include "3level.h"
183 /* The real definition of the struct for the LC_COLLATE locale. */
184 struct locale_collate_t
186 int col_weight_max;
187 int cur_weight_max;
189 /* List of known scripts. */
190 struct section_list *known_sections;
191 /* List of used sections. */
192 struct section_list *sections;
193 /* Current section using definition. */
194 struct section_list *current_section;
195 /* There always can be an unnamed section. */
196 struct section_list unnamed_section;
197 /* To make handling of errors easier we have another section. */
198 struct section_list error_section;
199 /* Sometimes we are defining the values for collating symbols before
200 the first actual section. */
201 struct section_list symbol_section;
203 /* Start of the order list. */
204 struct element_t *start;
206 /* The undefined element. */
207 struct element_t undefined;
209 /* This is the cursor for `reorder_after' insertions. */
210 struct element_t *cursor;
212 /* This value is used when handling ellipsis. */
213 struct element_t ellipsis_weight;
215 /* Known collating elements. */
216 hash_table elem_table;
218 /* Known collating symbols. */
219 hash_table sym_table;
221 /* Known collation sequences. */
222 hash_table seq_table;
224 struct obstack mempool;
226 /* The LC_COLLATE category is a bit special as it is sometimes possible
227 that the definitions from more than one input file contains information.
228 Therefore we keep all relevant input in a list. */
229 struct locale_collate_t *next;
231 /* Arrays with heads of the list for each of the leading bytes in
232 the multibyte sequences. */
233 struct element_t *mbheads[256];
235 /* Arrays with heads of the list for each of the leading bytes in
236 the multibyte sequences. */
237 struct wchead_table wcheads;
239 /* The arrays with the collation sequence order. */
240 unsigned char mbseqorder[256];
241 struct collseq_table wcseqorder;
245 /* We have a few global variables which are used for reading all
246 LC_COLLATE category descriptions in all files. */
247 static uint32_t nrules;
/* We need UTF-8 encoding of numbers.

   Store the UTF-8 style encoding of VAL at BUF and return the number of
   bytes written.  Values below 0x80 (this includes negative values)
   are stored as the single byte `(char) VAL'; everything else uses a
   lead byte followed by continuation bytes carrying six bits each, for
   a total of two to six bytes.  BUF must have room for the result; no
   terminating NUL byte is written.  */
static inline int
utf8_encode (char *buf, int val)
{
  int nbytes;
  int idx;

  if (val < 0x80)
    {
      buf[0] = (char) val;
      return 1;
    }

  /* An N byte sequence can hold 5 * N + 1 bits of payload.  Find the
     shortest length which is sufficient; six bytes is the maximum.  */
  for (nbytes = 2; nbytes < 6; ++nbytes)
    if ((val & (~(uint32_t) 0 << (5 * nbytes + 1))) == 0)
      break;

  /* The lead byte starts with NBYTES one bits followed by a zero bit.
     The shift is done on an unsigned value: right-shifting the negative
     `~0xff' would be implementation-defined.  */
  buf[0] = (char) (unsigned char) (0xff00u >> nbytes);

  /* Fill in the continuation bytes, last byte first.  */
  for (idx = nbytes - 1; idx > 0; --idx)
    {
      buf[idx] = (char) (0x80 | (val & 0x3f));
      val >>= 6;
    }

  /* Whatever is left of VAL goes into the free bits of the lead byte.  */
  buf[0] |= val;

  return nbytes;
}
285 static struct section_list *
286 make_seclist_elem (struct locale_collate_t *collate, const char *string,
287 struct section_list *next)
289 struct section_list *newp;
291 newp = (struct section_list *) obstack_alloc (&collate->mempool,
292 sizeof (*newp));
293 newp->next = next;
294 newp->name = string;
295 newp->first = NULL;
296 newp->last = NULL;
298 return newp;
302 static struct element_t *
303 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
304 const uint32_t *wcs, const char *name, size_t namelen,
305 int is_character)
307 struct element_t *newp;
309 newp = (struct element_t *) obstack_alloc (&collate->mempool,
310 sizeof (*newp));
311 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
312 name, namelen);
313 if (mbs != NULL)
315 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
316 newp->nmbs = mbslen;
318 else
320 newp->mbs = NULL;
321 newp->nmbs = 0;
323 if (wcs != NULL)
325 size_t nwcs = wcslen ((wchar_t *) wcs);
326 uint32_t zero = 0;
327 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
328 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
329 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
330 newp->nwcs = nwcs;
332 else
334 newp->wcs = NULL;
335 newp->nwcs = 0;
337 newp->mborder = NULL;
338 newp->wcorder = 0;
339 newp->used_in_level = 0;
340 newp->is_character = is_character;
342 /* Will be assigned later. XXX */
343 newp->mbseqorder = 0;
344 newp->wcseqorder = 0;
346 /* Will be allocated later. */
347 newp->weights = NULL;
349 newp->file = NULL;
350 newp->line = 0;
352 newp->section = collate->current_section;
354 newp->last = NULL;
355 newp->next = NULL;
357 newp->mbnext = NULL;
358 newp->mblast = NULL;
360 newp->wcnext = NULL;
361 newp->wclast = NULL;
363 return newp;
367 static struct symbol_t *
368 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
370 struct symbol_t *newp;
372 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
374 newp->name = obstack_copy0 (&collate->mempool, name, len);
375 newp->order = NULL;
377 newp->file = NULL;
378 newp->line = 0;
380 return newp;
384 /* Test whether this name is already defined somewhere. */
385 static int
386 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
387 const struct charmap_t *charmap,
388 struct repertoire_t *repertoire, const char *symbol,
389 size_t symbol_len)
391 void *ignore = NULL;
393 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
395 lr_error (ldfile, _("`%.*s' already defined in charmap"),
396 (int) symbol_len, symbol);
397 return 1;
400 if (repertoire != NULL
401 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
402 == 0))
404 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
405 (int) symbol_len, symbol);
406 return 1;
409 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
411 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
412 (int) symbol_len, symbol);
413 return 1;
416 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
418 lr_error (ldfile, _("`%.*s' already defined as collating element"),
419 (int) symbol_len, symbol);
420 return 1;
423 return 0;
427 /* Read the direction specification. */
428 static void
429 read_directions (struct linereader *ldfile, struct token *arg,
430 const struct charmap_t *charmap,
431 struct repertoire_t *repertoire, struct localedef_t *result)
433 int cnt = 0;
434 int max = nrules ?: 10;
435 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
436 int warned = 0;
437 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
439 while (1)
441 int valid = 0;
443 if (arg->tok == tok_forward)
445 if (rules[cnt] & sort_backward)
447 if (! warned)
449 lr_error (ldfile, _("\
450 %s: `forward' and `backward' are mutually excluding each other"),
451 "LC_COLLATE");
452 warned = 1;
455 else if (rules[cnt] & sort_forward)
457 if (! warned)
459 lr_error (ldfile, _("\
460 %s: `%s' mentioned more than once in definition of weight %d"),
461 "LC_COLLATE", "forward", cnt + 1);
464 else
465 rules[cnt] |= sort_forward;
467 valid = 1;
469 else if (arg->tok == tok_backward)
471 if (rules[cnt] & sort_forward)
473 if (! warned)
475 lr_error (ldfile, _("\
476 %s: `forward' and `backward' are mutually excluding each other"),
477 "LC_COLLATE");
478 warned = 1;
481 else if (rules[cnt] & sort_backward)
483 if (! warned)
485 lr_error (ldfile, _("\
486 %s: `%s' mentioned more than once in definition of weight %d"),
487 "LC_COLLATE", "backward", cnt + 1);
490 else
491 rules[cnt] |= sort_backward;
493 valid = 1;
495 else if (arg->tok == tok_position)
497 if (rules[cnt] & sort_position)
499 if (! warned)
501 lr_error (ldfile, _("\
502 %s: `%s' mentioned more than once in definition of weight %d"),
503 "LC_COLLATE", "position", cnt + 1);
506 else
507 rules[cnt] |= sort_position;
509 valid = 1;
512 if (valid)
513 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
515 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
516 || arg->tok == tok_semicolon)
518 if (! valid && ! warned)
520 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
521 warned = 1;
524 /* See whether we have to increment the counter. */
525 if (arg->tok != tok_comma && rules[cnt] != 0)
527 /* Add the default `forward' if we have seen only `position'. */
528 if (rules[cnt] == sort_position)
529 rules[cnt] = sort_position | sort_forward;
531 ++cnt;
534 if (arg->tok == tok_eof || arg->tok == tok_eol)
535 /* End of line or file, so we exit the loop. */
536 break;
538 if (nrules == 0)
540 /* See whether we have enough room in the array. */
541 if (cnt == max)
543 max += 10;
544 rules = (enum coll_sort_rule *) xrealloc (rules,
546 * sizeof (*rules));
547 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
550 else
552 if (cnt == nrules)
554 /* There must not be any more rule. */
555 if (! warned)
557 lr_error (ldfile, _("\
558 %s: too many rules; first entry only had %d"),
559 "LC_COLLATE", nrules);
560 warned = 1;
563 lr_ignore_rest (ldfile, 0);
564 break;
568 else
570 if (! warned)
572 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
573 warned = 1;
577 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
580 if (nrules == 0)
582 /* Now we know how many rules we have. */
583 nrules = cnt;
584 rules = (enum coll_sort_rule *) xrealloc (rules,
585 nrules * sizeof (*rules));
587 else
589 if (cnt < nrules)
591 /* Not enough rules in this specification. */
592 if (! warned)
593 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
596 rules[cnt] = sort_forward;
597 while (++cnt < nrules);
601 collate->current_section->rules = rules;
605 static struct element_t *
606 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
607 const char *str, size_t len)
609 struct element_t *result = NULL;
611 /* Search for the entries among the collation sequences already define. */
612 if (find_entry (&collate->seq_table, str, len, (void **) &result) != 0)
614 /* Nope, not define yet. So we see whether it is a
615 collation symbol. */
616 void *ptr;
618 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
620 /* It's a collation symbol. */
621 struct symbol_t *sym = (struct symbol_t *) ptr;
622 result = sym->order;
624 if (result == NULL)
625 result = sym->order = new_element (collate, NULL, 0, NULL,
626 NULL, 0, 0);
628 else if (find_entry (&collate->elem_table, str, len,
629 (void **) &result) != 0)
631 /* It's also no collation element. So it is a character
632 element defined later. */
633 result = new_element (collate, NULL, 0, NULL, str, len, 1);
634 /* Insert it into the sequence table. */
635 insert_entry (&collate->seq_table, str, len, result);
639 return result;
643 static void
644 unlink_element (struct locale_collate_t *collate)
646 if (collate->cursor == collate->start)
648 assert (collate->cursor->next == NULL);
649 assert (collate->cursor->last == NULL);
650 collate->cursor = NULL;
652 else
654 if (collate->cursor->next != NULL)
655 collate->cursor->next->last = collate->cursor->last;
656 if (collate->cursor->last != NULL)
657 collate->cursor->last->next = collate->cursor->next;
658 collate->cursor = collate->cursor->last;
663 static void
664 insert_weights (struct linereader *ldfile, struct element_t *elem,
665 const struct charmap_t *charmap,
666 struct repertoire_t *repertoire, struct localedef_t *result,
667 enum token_t ellipsis)
669 int weight_cnt;
670 struct token *arg;
671 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
673 /* Initialize all the fields. */
674 elem->file = ldfile->fname;
675 elem->line = ldfile->lineno;
677 elem->last = collate->cursor;
678 elem->next = collate->cursor ? collate->cursor->next : NULL;
679 if (collate->cursor != NULL && collate->cursor->next != NULL)
680 collate->cursor->next->last = elem;
681 if (collate->cursor != NULL)
682 collate->cursor->next = elem;
683 if (collate->start == NULL)
685 assert (collate->cursor == NULL);
686 collate->start = elem;
689 elem->section = collate->current_section;
691 if (collate->current_section->first == NULL)
692 collate->current_section->first = elem;
693 if (collate->current_section->last == collate->cursor)
694 collate->current_section->last = elem;
696 collate->cursor = elem;
698 elem->weights = (struct element_list_t *)
699 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
700 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
702 weight_cnt = 0;
704 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
707 if (arg->tok == tok_eof || arg->tok == tok_eol)
708 break;
710 if (arg->tok == tok_ignore)
712 /* The weight for this level has to be ignored. We use the
713 null pointer to indicate this. */
714 elem->weights[weight_cnt].w = (struct element_t **)
715 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
716 elem->weights[weight_cnt].w[0] = NULL;
717 elem->weights[weight_cnt].cnt = 1;
719 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
721 char ucs4str[10];
722 struct element_t *val;
723 char *symstr;
724 size_t symlen;
726 if (arg->tok == tok_bsymbol)
728 symstr = arg->val.str.startmb;
729 symlen = arg->val.str.lenmb;
731 else
733 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
734 symstr = ucs4str;
735 symlen = 9;
738 val = find_element (ldfile, collate, symstr, symlen);
739 if (val == NULL)
740 break;
742 elem->weights[weight_cnt].w = (struct element_t **)
743 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
744 elem->weights[weight_cnt].w[0] = val;
745 elem->weights[weight_cnt].cnt = 1;
747 else if (arg->tok == tok_string)
749 /* Split the string up in the individual characters and put
750 the element definitions in the list. */
751 const char *cp = arg->val.str.startmb;
752 int cnt = 0;
753 struct element_t *charelem;
754 struct element_t **weights = NULL;
755 int max = 0;
757 if (*cp == '\0')
759 lr_error (ldfile, _("%s: empty weight string not allowed"),
760 "LC_COLLATE");
761 lr_ignore_rest (ldfile, 0);
762 break;
767 if (*cp == '<')
769 /* Ahh, it's a bsymbol or an UCS4 value. If it's
770 the latter we have to unify the name. */
771 const char *startp = ++cp;
772 size_t len;
774 while (*cp != '>')
776 if (*cp == ldfile->escape_char)
777 ++cp;
778 if (*cp == '\0')
779 /* It's a syntax error. */
780 goto syntax;
782 ++cp;
785 if (cp - startp == 5 && startp[0] == 'U'
786 && isxdigit (startp[1]) && isxdigit (startp[2])
787 && isxdigit (startp[3]) && isxdigit (startp[4]))
789 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
790 char *newstr;
792 newstr = (char *) xmalloc (10);
793 snprintf (newstr, 10, "U%08X", ucs4);
794 startp = newstr;
796 len = 9;
798 else
799 len = cp - startp;
801 charelem = find_element (ldfile, collate, startp, len);
802 ++cp;
804 else
806 /* People really shouldn't use characters directly in
807 the string. Especially since it's not really clear
808 what this means. We interpret all characters in the
809 string as if that would be bsymbols. Otherwise we
810 would have to match back to bsymbols somehow and this
811 is normally not what people normally expect. */
812 charelem = find_element (ldfile, collate, cp++, 1);
815 if (charelem == NULL)
817 /* We ignore the rest of the line. */
818 lr_ignore_rest (ldfile, 0);
819 break;
822 /* Add the pointer. */
823 if (cnt >= max)
825 struct element_t **newp;
826 max += 10;
827 newp = (struct element_t **)
828 alloca (max * sizeof (struct element_t *));
829 memcpy (newp, weights, cnt * sizeof (struct element_t *));
830 weights = newp;
832 weights[cnt++] = charelem;
834 while (*cp != '\0');
836 /* Now store the information. */
837 elem->weights[weight_cnt].w = (struct element_t **)
838 obstack_alloc (&collate->mempool,
839 cnt * sizeof (struct element_t *));
840 memcpy (elem->weights[weight_cnt].w, weights,
841 cnt * sizeof (struct element_t *));
842 elem->weights[weight_cnt].cnt = cnt;
844 /* We don't need the string anymore. */
845 free (arg->val.str.startmb);
847 else if (ellipsis != tok_none
848 && (arg->tok == tok_ellipsis2
849 || arg->tok == tok_ellipsis3
850 || arg->tok == tok_ellipsis4))
852 /* It must be the same ellipsis as used in the initial column. */
853 if (arg->tok != ellipsis)
854 lr_error (ldfile, _("\
855 %s: weights must use the same ellipsis symbol as the name"),
856 "LC_COLLATE");
858 /* The weight for this level will depend on the element
859 iterating over the range. Put a placeholder. */
860 elem->weights[weight_cnt].w = (struct element_t **)
861 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
862 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
863 elem->weights[weight_cnt].cnt = 1;
865 else
867 syntax:
868 /* It's a syntax error. */
869 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
870 lr_ignore_rest (ldfile, 0);
871 break;
874 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
875 /* This better should be the end of the line or a semicolon. */
876 if (arg->tok == tok_semicolon)
877 /* OK, ignore this and read the next token. */
878 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
879 else if (arg->tok != tok_eof && arg->tok != tok_eol)
881 /* It's a syntax error. */
882 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
883 lr_ignore_rest (ldfile, 0);
884 break;
887 while (++weight_cnt < nrules);
889 if (weight_cnt < nrules)
891 /* This means the rest of the line uses the current element as
892 the weight. */
895 elem->weights[weight_cnt].w = (struct element_t **)
896 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
897 if (ellipsis == tok_none)
898 elem->weights[weight_cnt].w[0] = elem;
899 else
900 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
901 elem->weights[weight_cnt].cnt = 1;
903 while (++weight_cnt < nrules);
905 else
907 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
909 /* Too many rule values. */
910 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
911 lr_ignore_rest (ldfile, 0);
913 else
914 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
919 static int
920 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
921 const struct charmap_t *charmap, struct repertoire_t *repertoire,
922 struct localedef_t *result)
924 /* First find out what kind of symbol this is. */
925 struct charseq *seq;
926 uint32_t wc;
927 struct element_t *elem = NULL;
928 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
930 /* Try to find the character in the charmap. */
931 seq = charmap_find_value (charmap, symstr, symlen);
933 /* Determine the wide character. */
934 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
936 wc = repertoire_find_value (repertoire, symstr, symlen);
937 if (seq != NULL)
938 seq->ucs4 = wc;
940 else
941 wc = seq->ucs4;
943 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
945 /* It's no character, so look through the collation elements and
946 symbol list. */
947 if (find_entry (&collate->elem_table, symstr, symlen,
948 (void **) &elem) != 0)
950 void *result;
951 struct symbol_t *sym = NULL;
953 /* It's also collation element. Therefore it's either a
954 collating symbol or it's a character which is not
955 supported by the character set. In the later case we
956 simply create a dummy entry. */
957 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
959 /* It's a collation symbol. */
960 sym = (struct symbol_t *) result;
962 elem = sym->order;
965 if (elem == NULL)
967 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
969 if (sym != NULL)
970 sym->order = elem;
971 else
972 /* Enter a fake element in the sequence table. This
973 won't cause anything in the output since there is
974 no multibyte or wide character associated with
975 it. */
976 insert_entry (&collate->seq_table, symstr, symlen, elem);
980 else
982 /* Otherwise the symbols stands for a character. */
983 if (find_entry (&collate->seq_table, symstr, symlen,
984 (void **) &elem) != 0)
986 uint32_t wcs[2] = { wc, 0 };
988 /* We have to allocate an entry. */
989 elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
990 seq != NULL ? seq->nbytes : 0,
991 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
992 symstr, symlen, 1);
994 /* And add it to the table. */
995 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
996 /* This cannot happen. */
997 assert (! "Internal error");
999 else
1001 /* Maybe the character was used before the definition. In this case
1002 we have to insert the byte sequences now. */
1003 if (elem->mbs == NULL && seq != NULL)
1005 elem->mbs = obstack_copy0 (&collate->mempool,
1006 seq->bytes, seq->nbytes);
1007 elem->nmbs = seq->nbytes;
1010 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1012 uint32_t wcs[2] = { wc, 0 };
1014 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1015 elem->nwcs = 1;
1020 /* Test whether this element is not already in the list. */
1021 if (elem->next != NULL || elem == collate->cursor)
1023 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1024 (int) symlen, symstr, elem->file, elem->line);
1025 lr_ignore_rest (ldfile, 0);
1026 return 1;
1029 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1031 return 0;
1035 static void
1036 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1037 enum token_t ellipsis, const struct charmap_t *charmap,
1038 struct repertoire_t *repertoire,
1039 struct localedef_t *result)
1041 struct element_t *startp;
1042 struct element_t *endp;
1043 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1045 /* Unlink the entry added for the ellipsis. */
1046 unlink_element (collate);
1047 startp = collate->cursor;
1049 /* Process and add the end-entry. */
1050 if (symstr != NULL
1051 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1052 /* Something went wrong with inserting the to-value. This means
1053 we cannot process the ellipsis. */
1054 return;
1056 /* Reset the cursor. */
1057 collate->cursor = startp;
1059 /* Now we have to handle many different situations:
1060 - we have to distinguish between the three different ellipsis forms
1061 - the is the ellipsis at the beginning, in the middle, or at the end.
1063 endp = collate->cursor->next;
1064 assert (symstr == NULL || endp != NULL);
1066 /* XXX The following is probably very wrong since also collating symbols
1067 can appear in ranges. But do we want/can refine the test for that? */
1068 #if 0
1069 /* Both, the start and the end symbol, must stand for characters. */
1070 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1071 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1073 lr_error (ldfile, _("\
1074 %s: the start and the end symbol of a range must stand for characters"),
1075 "LC_COLLATE");
1076 return;
1078 #endif
1080 if (ellipsis == tok_ellipsis3)
1082 /* One requirement we make here: the length of the byte
1083 sequences for the first and end character must be the same.
1084 This is mainly to prevent unwanted effects and this is often
1085 not what is wanted. */
1086 size_t len = (startp->mbs != NULL ? startp->nmbs
1087 : (endp->mbs != NULL ? endp->nmbs : 0));
1088 char mbcnt[len + 1];
1089 char mbend[len + 1];
1091 /* Well, this should be caught somewhere else already. Just to
1092 make sure. */
1093 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1094 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1096 if (startp != NULL && endp != NULL
1097 && startp->mbs != NULL && endp->mbs != NULL
1098 && startp->nmbs != endp->nmbs)
1100 lr_error (ldfile, _("\
1101 %s: byte sequences of first and last character must have the same length"),
1102 "LC_COLLATE");
1103 return;
1106 /* Determine whether we have to generate multibyte sequences. */
1107 if ((startp == NULL || startp->mbs != NULL)
1108 && (endp == NULL || endp->mbs != NULL))
1110 int cnt;
1111 int ret;
1113 /* Prepare the beginning byte sequence. This is either from the
1114 beginning byte sequence or it is all nulls if it was an
1115 initial ellipsis. */
1116 if (startp == NULL || startp->mbs == NULL)
1117 memset (mbcnt, '\0', len);
1118 else
1120 memcpy (mbcnt, startp->mbs, len);
1122 /* And increment it so that the value is the first one we will
1123 try to insert. */
1124 for (cnt = len - 1; cnt >= 0; --cnt)
1125 if (++mbcnt[cnt] != '\0')
1126 break;
1128 mbcnt[len] = '\0';
1130 /* And the end sequence. */
1131 if (endp == NULL || endp->mbs == NULL)
1132 memset (mbend, '\0', len);
1133 else
1134 memcpy (mbend, endp->mbs, len);
1135 mbend[len] = '\0';
1137 /* Test whether we have a correct range. */
1138 ret = memcmp (mbcnt, mbend, len);
1139 if (ret >= 0)
1141 if (ret > 0)
1142 lr_error (ldfile, _("%s: byte sequence of first character of \
1143 sequence is not lower than that of the last character"), "LC_COLLATE");
1144 return;
1147 /* Generate the byte sequences data. */
1148 while (1)
1150 struct charseq *seq;
1152 /* Quite a bit of work ahead. We have to find the character
1153 definition for the byte sequence and then determine the
1154 wide character belonging to it. */
1155 seq = charmap_find_symbol (charmap, mbcnt, len);
1156 if (seq != NULL)
1158 struct element_t *elem;
1159 size_t namelen;
1161 /* I don't this this can ever happen. */
1162 assert (seq->name != NULL);
1163 namelen = strlen (seq->name);
1165 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1166 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1167 namelen);
1169 /* Now we are ready to insert the new value in the
1170 sequence. Find out whether the element is
1171 already known. */
1172 if (find_entry (&collate->seq_table, seq->name, namelen,
1173 (void **) &elem) != 0)
1175 uint32_t wcs[2] = { seq->ucs4, 0 };
1177 /* We have to allocate an entry. */
1178 elem = new_element (collate, mbcnt, len,
1179 seq->ucs4 == ILLEGAL_CHAR_VALUE
1180 ? NULL : wcs, seq->name,
1181 namelen, 1);
1183 /* And add it to the table. */
1184 if (insert_entry (&collate->seq_table, seq->name,
1185 namelen, elem) != 0)
1186 /* This cannot happen. */
1187 assert (! "Internal error");
1190 /* Test whether this element is not already in the list. */
1191 if (elem->next != NULL || (collate->cursor != NULL
1192 && elem->next == collate->cursor))
1194 lr_error (ldfile, _("\
1195 order for `%.*s' already defined at %s:%Zu"),
1196 (int) namelen, seq->name,
1197 elem->file, elem->line);
1198 goto increment;
1201 /* Enqueue the new element. */
1202 elem->last = collate->cursor;
1203 if (collate->cursor == NULL)
1204 elem->next = NULL;
1205 else
1207 elem->next = collate->cursor->next;
1208 elem->last->next = elem;
1209 if (elem->next != NULL)
1210 elem->next->last = elem;
1212 if (collate->start == NULL)
1214 assert (collate->cursor == NULL);
1215 collate->start = elem;
1217 collate->cursor = elem;
1219 /* Add the weight value. We take them from the
1220 `ellipsis_weights' member of `collate'. */
1221 elem->weights = (struct element_list_t *)
1222 obstack_alloc (&collate->mempool,
1223 nrules * sizeof (struct element_list_t));
1224 for (cnt = 0; cnt < nrules; ++cnt)
1225 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1226 && (collate->ellipsis_weight.weights[cnt].w[0]
1227 == ELEMENT_ELLIPSIS2))
1229 elem->weights[cnt].w = (struct element_t **)
1230 obstack_alloc (&collate->mempool,
1231 sizeof (struct element_t *));
1232 elem->weights[cnt].w[0] = elem;
1233 elem->weights[cnt].cnt = 1;
1235 else
1237 /* Simply use the weight from `ellipsis_weight'. */
1238 elem->weights[cnt].w =
1239 collate->ellipsis_weight.weights[cnt].w;
1240 elem->weights[cnt].cnt =
1241 collate->ellipsis_weight.weights[cnt].cnt;
1245 /* Increment for the next round. */
1246 increment:
1247 for (cnt = len - 1; cnt >= 0; --cnt)
1248 if (++mbcnt[cnt] != '\0')
1249 break;
1251 /* Find out whether this was all. */
1252 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1253 /* Yep, that's all. */
1254 break;
1258 else
1260 /* For symbolic range we naturally must have a beginning and an
1261 end specified by the user. */
1262 if (startp == NULL)
1263 lr_error (ldfile, _("\
1264 %s: symbolic range ellipsis must not directly follow `order_start'"),
1265 "LC_COLLATE");
1266 else if (endp == NULL)
1267 lr_error (ldfile, _("\
1268 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1269 "LC_COLLATE");
1270 else
1272 /* Determine the range. To do so we have to determine the
1273 common prefix of the both names and then the numeric
1274 values of both ends. */
1275 size_t lenfrom = strlen (startp->name);
1276 size_t lento = strlen (endp->name);
1277 char buf[lento + 1];
1278 int preflen = 0;
1279 long int from;
1280 long int to;
1281 char *cp;
1282 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1284 if (lenfrom != lento)
1286 invalid_range:
1287 lr_error (ldfile, _("\
1288 `%s' and `%.*s' are no valid names for symbolic range"),
1289 startp->name, (int) lento, endp->name);
1290 return;
1293 while (startp->name[preflen] == endp->name[preflen])
1294 if (startp->name[preflen] == '\0')
1295 /* Nothing to be done. The start and end point are identical
1296 and while inserting the end point we have already given
1297 the user an error message. */
1298 return;
1299 else
1300 ++preflen;
1302 errno = 0;
1303 from = strtol (startp->name + preflen, &cp, base);
1304 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1305 goto invalid_range;
1307 errno = 0;
1308 to = strtol (endp->name + preflen, &cp, base);
1309 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1310 goto invalid_range;
1312 /* Copy the prefix. */
1313 memcpy (buf, startp->name, preflen);
1315 /* Loop over all values. */
1316 for (++from; from < to; ++from)
1318 struct element_t *elem = NULL;
1319 struct charseq *seq;
1320 uint32_t wc;
1321 int cnt;
1323 /* Generate the name. */
1324 sprintf (buf + preflen, base == 10 ? "%ld" : "%lX", from);
1326 /* Look whether this name is already defined. */
1327 if (find_entry (&collate->seq_table, buf, symlen,
1328 (void **) &elem) == 0)
1330 if (elem->next != NULL || (collate->cursor != NULL
1331 && elem->next == collate->cursor))
1333 lr_error (ldfile, _("\
1334 %s: order for `%.*s' already defined at %s:%Zu"),
1335 "LC_COLLATE", (int) lenfrom, buf,
1336 elem->file, elem->line);
1337 continue;
1340 if (elem->name == NULL)
1342 lr_error (ldfile, _("%s: `%s' must be a character"),
1343 "LC_COLLATE", buf);
1344 continue;
1348 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1350 /* Search for a character of this name. */
1351 seq = charmap_find_value (charmap, buf, lenfrom);
1352 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1354 wc = repertoire_find_value (repertoire, buf, lenfrom);
1356 if (seq != NULL)
1357 seq->ucs4 = wc;
1359 else
1360 wc = seq->ucs4;
1362 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1363 /* We don't know anything about a character with this
1364 name. XXX Should we warn? */
1365 continue;
1367 if (elem == NULL)
1369 uint32_t wcs[2] = { wc, 0 };
1371 /* We have to allocate an entry. */
1372 elem = new_element (collate,
1373 seq != NULL ? seq->bytes : NULL,
1374 seq != NULL ? seq->nbytes : 0,
1375 wc == ILLEGAL_CHAR_VALUE
1376 ? NULL : wcs, buf, lenfrom, 1);
1378 else
1380 /* Update the element. */
1381 if (seq != NULL)
1383 elem->mbs = obstack_copy0 (&collate->mempool,
1384 seq->bytes, seq->nbytes);
1385 elem->nmbs = seq->nbytes;
1388 if (wc != ILLEGAL_CHAR_VALUE)
1390 uint32_t zero = 0;
1392 obstack_grow (&collate->mempool,
1393 &wc, sizeof (uint32_t));
1394 obstack_grow (&collate->mempool,
1395 &zero, sizeof (uint32_t));
1396 elem->wcs = obstack_finish (&collate->mempool);
1397 elem->nwcs = 1;
1401 elem->file = ldfile->fname;
1402 elem->line = ldfile->lineno;
1403 elem->section = collate->current_section;
1406 /* Enqueue the new element. */
1407 elem->last = collate->cursor;
1408 elem->next = collate->cursor->next;
1409 elem->last->next = elem;
1410 if (elem->next != NULL)
1411 elem->next->last = elem;
1412 collate->cursor = elem;
1414 /* Now add the weights. They come from the `ellipsis_weights'
1415 member of `collate'. */
1416 elem->weights = (struct element_list_t *)
1417 obstack_alloc (&collate->mempool,
1418 nrules * sizeof (struct element_list_t));
1419 for (cnt = 0; cnt < nrules; ++cnt)
1420 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1421 && (collate->ellipsis_weight.weights[cnt].w[0]
1422 == ELEMENT_ELLIPSIS2))
1424 elem->weights[cnt].w = (struct element_t **)
1425 obstack_alloc (&collate->mempool,
1426 sizeof (struct element_t *));
1427 elem->weights[cnt].w[0] = elem;
1428 elem->weights[cnt].cnt = 1;
1430 else
1432 /* Simply use the weight from `ellipsis_weight'. */
1433 elem->weights[cnt].w =
1434 collate->ellipsis_weight.weights[cnt].w;
1435 elem->weights[cnt].cnt =
1436 collate->ellipsis_weight.weights[cnt].cnt;
1444 static void
1445 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1446 struct localedef_t *copy_locale, int ignore_content)
1448 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1450 struct locale_collate_t *collate;
1452 if (copy_locale == NULL)
1454 collate = locale->categories[LC_COLLATE].collate =
1455 (struct locale_collate_t *)
1456 xcalloc (1, sizeof (struct locale_collate_t));
1458 /* Init the various data structures. */
1459 init_hash (&collate->elem_table, 100);
1460 init_hash (&collate->sym_table, 100);
1461 init_hash (&collate->seq_table, 500);
1462 obstack_init (&collate->mempool);
1464 collate->col_weight_max = -1;
1466 else
1467 /* Reuse the copy_locale's data structures. */
1468 collate = locale->categories[LC_COLLATE].collate =
1469 copy_locale->categories[LC_COLLATE].collate;
1472 ldfile->translate_strings = 0;
1473 ldfile->return_widestr = 0;
1477 void
1478 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1480 /* Now is the time when we can assign the individual collation
1481 values for all the symbols. We have possibly different values
1482 for the wide- and the multibyte-character symbols. This is done
1483 since it might make a difference in the encoding if there is in
1484 some cases no multibyte-character but there are wide-characters.
1485 (The other way around it is not important since theencoded
1486 collation value in the wide-character case is 32 bits wide and
1487 therefore requires no encoding).
1489 The lowest collation value assigned is 2. Zero is reserved for
1490 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1491 functions and 1 is used to separate the individual passes for the
1492 different rules.
1494 We also have to construct is list with all the bytes/words which
1495 can come first in a sequence, followed by all the elements which
1496 also start with this byte/word. The order is reverse which has
1497 among others the important effect that longer strings are located
1498 first in the list. This is required for the output data since
1499 the algorithm used in `strcoll' etc depends on this.
1501 The multibyte case is easy. We simply sort into an array with
1502 256 elements. */
1503 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1504 int mbact[nrules];
1505 int wcact;
1506 int mbseqact;
1507 int wcseqact;
1508 struct element_t *runp;
1509 int i;
1510 int need_undefined = 0;
1511 struct section_list *sect;
1512 int ruleidx;
1513 int nr_wide_elems = 0;
1515 if (collate == NULL)
1517 /* No data, no check. */
1518 if (! be_quiet)
1519 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1520 "LC_COLLATE"));
1521 return;
1524 /* If this assertion is hit change the type in `element_t'. */
1525 assert (nrules <= sizeof (runp->used_in_level) * 8);
1527 /* Make sure that the `position' rule is used either in all sections
1528 or in none. */
1529 for (i = 0; i < nrules; ++i)
1530 for (sect = collate->sections; sect != NULL; sect = sect->next)
1531 if (sect->rules != NULL
1532 && ((sect->rules[i] & sort_position)
1533 != (collate->sections->rules[i] & sort_position)))
1535 WITH_CUR_LOCALE (error (0, 0, _("\
1536 %s: `position' must be used for a specific level in all sections or none"),
1537 "LC_COLLATE"));
1538 break;
1541 /* Find out which elements are used at which level. At the same
1542 time we find out whether we have any undefined symbols. */
1543 runp = collate->start;
1544 while (runp != NULL)
1546 if (runp->mbs != NULL)
1548 for (i = 0; i < nrules; ++i)
1550 int j;
1552 for (j = 0; j < runp->weights[i].cnt; ++j)
1553 /* A NULL pointer as the weight means IGNORE. */
1554 if (runp->weights[i].w[j] != NULL)
1556 if (runp->weights[i].w[j]->weights == NULL)
1558 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1559 runp->line,
1560 _("symbol `%s' not defined"),
1561 runp->weights[i].w[j]->name));
1563 need_undefined = 1;
1564 runp->weights[i].w[j] = &collate->undefined;
1566 else
1567 /* Set the bit for the level. */
1568 runp->weights[i].w[j]->used_in_level |= 1 << i;
1573 /* Up to the next entry. */
1574 runp = runp->next;
1577 /* Walk through the list of defined sequences and assign weights. Also
1578 create the data structure which will allow generating the single byte
1579 character based tables.
1581 Since at each time only the weights for each of the rules are
1582 only compared to other weights for this rule it is possible to
1583 assign more compact weight values than simply counting all
1584 weights in sequence. We can assign weights from 3, one for each
1585 rule individually and only for those elements, which are actually
1586 used for this rule.
1588 Why is this important? It is not for the wide char table. But
1589 it is for the singlebyte output since here larger numbers have to
1590 be encoded to make it possible to emit the value as a byte
1591 string. */
1592 for (i = 0; i < nrules; ++i)
1593 mbact[i] = 2;
1594 wcact = 2;
1595 mbseqact = 0;
1596 wcseqact = 0;
1597 runp = collate->start;
1598 while (runp != NULL)
1600 /* Determine the order. */
1601 if (runp->used_in_level != 0)
1603 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1604 nrules * sizeof (int));
1606 for (i = 0; i < nrules; ++i)
1607 if ((runp->used_in_level & (1 << i)) != 0)
1608 runp->mborder[i] = mbact[i]++;
1609 else
1610 runp->mborder[i] = 0;
1613 if (runp->mbs != NULL)
1615 struct element_t **eptr;
1616 struct element_t *lastp = NULL;
1618 /* Find the point where to insert in the list. */
1619 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1620 while (*eptr != NULL)
1622 if ((*eptr)->nmbs < runp->nmbs)
1623 break;
1625 if ((*eptr)->nmbs == runp->nmbs)
1627 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1629 if (c == 0)
1631 /* This should not happen. It means that we have
1632 to symbols with the same byte sequence. It is
1633 of course an error. */
1634 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1635 (*eptr)->line,
1636 _("\
1637 symbol `%s' has the same encoding as"), (*eptr)->name);
1638 error_at_line (0, 0, runp->file,
1639 runp->line,
1640 _("symbol `%s'"),
1641 runp->name));
1642 goto dont_insert;
1644 else if (c < 0)
1645 /* Insert it here. */
1646 break;
1649 /* To the next entry. */
1650 lastp = *eptr;
1651 eptr = &(*eptr)->mbnext;
1654 /* Set the pointers. */
1655 runp->mbnext = *eptr;
1656 runp->mblast = lastp;
1657 if (*eptr != NULL)
1658 (*eptr)->mblast = runp;
1659 *eptr = runp;
1660 dont_insert:
1664 if (runp->used_in_level)
1666 runp->wcorder = wcact++;
1668 /* We take the opportunity to count the elements which have
1669 wide characters. */
1670 ++nr_wide_elems;
1673 if (runp->is_character)
1675 if (runp->nmbs == 1)
1676 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1678 runp->wcseqorder = wcseqact++;
1680 else if (runp->mbs != NULL && runp->weights != NULL)
1681 /* This is for collation elements. */
1682 runp->wcseqorder = wcseqact++;
1684 /* Up to the next entry. */
1685 runp = runp->next;
1688 /* Find out whether any of the `mbheads' entries is unset. In this
1689 case we use the UNDEFINED entry. */
1690 for (i = 1; i < 256; ++i)
1691 if (collate->mbheads[i] == NULL)
1693 need_undefined = 1;
1694 collate->mbheads[i] = &collate->undefined;
1697 /* Now to the wide character case. */
1698 collate->wcheads.p = 6;
1699 collate->wcheads.q = 10;
1700 wchead_table_init (&collate->wcheads);
1702 collate->wcseqorder.p = 6;
1703 collate->wcseqorder.q = 10;
1704 collseq_table_init (&collate->wcseqorder);
1706 /* Start adding. */
1707 runp = collate->start;
1708 while (runp != NULL)
1710 if (runp->wcs != NULL)
1712 struct element_t *e;
1713 struct element_t **eptr;
1714 struct element_t *lastp;
1716 /* Insert the collation sequence value. */
1717 if (runp->is_character)
1718 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1719 runp->wcseqorder);
1721 /* Find the point where to insert in the list. */
1722 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1723 eptr = &e;
1724 lastp = NULL;
1725 while (*eptr != NULL)
1727 if ((*eptr)->nwcs < runp->nwcs)
1728 break;
1730 if ((*eptr)->nwcs == runp->nwcs)
1732 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1733 (wchar_t *) runp->wcs, runp->nwcs);
1735 if (c == 0)
1737 /* This should not happen. It means that we have
1738 two symbols with the same byte sequence. It is
1739 of course an error. */
1740 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1741 (*eptr)->line,
1742 _("\
1743 symbol `%s' has the same encoding as"), (*eptr)->name);
1744 error_at_line (0, 0, runp->file,
1745 runp->line,
1746 _("symbol `%s'"),
1747 runp->name));
1748 goto dont_insertwc;
1750 else if (c < 0)
1751 /* Insert it here. */
1752 break;
1755 /* To the next entry. */
1756 lastp = *eptr;
1757 eptr = &(*eptr)->wcnext;
1760 /* Set the pointers. */
1761 runp->wcnext = *eptr;
1762 runp->wclast = lastp;
1763 if (*eptr != NULL)
1764 (*eptr)->wclast = runp;
1765 *eptr = runp;
1766 if (eptr == &e)
1767 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1768 dont_insertwc:
1772 /* Up to the next entry. */
1773 runp = runp->next;
1776 collseq_table_finalize (&collate->wcseqorder);
1778 /* Now determine whether the UNDEFINED entry is needed and if yes,
1779 whether it was defined. */
1780 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1781 if (collate->undefined.file == NULL)
1783 if (need_undefined)
1785 /* This seems not to be enforced by recent standards. Don't
1786 emit an error, simply append UNDEFINED at the end. */
1787 if (0)
1788 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1790 /* Add UNDEFINED at the end. */
1791 collate->undefined.mborder =
1792 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1794 for (i = 0; i < nrules; ++i)
1795 collate->undefined.mborder[i] = mbact[i]++;
1798 /* In any case we will need the definition for the wide character
1799 case. But we will not complain that it is missing since the
1800 specification strangely enough does not seem to account for
1801 this. */
1802 collate->undefined.wcorder = wcact++;
1805 /* Finally, try to unify the rules for the sections. Whenever the rules
1806 for a section are the same as those for another section give the
1807 ruleset the same index. Since there are never many section we can
1808 use an O(n^2) algorithm here. */
1809 sect = collate->sections;
1810 while (sect != NULL && sect->rules == NULL)
1811 sect = sect->next;
1812 assert (sect != NULL);
1813 ruleidx = 0;
1816 struct section_list *osect = collate->sections;
1818 while (osect != sect)
1819 if (osect->rules != NULL
1820 && memcmp (osect->rules, sect->rules, nrules) == 0)
1821 break;
1822 else
1823 osect = osect->next;
1825 if (osect == sect)
1826 sect->ruleidx = ruleidx++;
1827 else
1828 sect->ruleidx = osect->ruleidx;
1830 /* Next section. */
1832 sect = sect->next;
1833 while (sect != NULL && sect->rules == NULL);
1835 while (sect != NULL);
1836 /* We are currently not prepared for more than 128 rulesets. But this
1837 should never really be a problem. */
1838 assert (ruleidx <= 128);
1842 static int32_t
1843 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1844 struct element_t *elem)
1846 size_t cnt;
1847 int32_t retval;
1849 /* Optimize the use of UNDEFINED. */
1850 if (elem == &collate->undefined)
1851 /* The weights are already inserted. */
1852 return 0;
1854 /* This byte can start exactly one collation element and this is
1855 a single byte. We can directly give the index to the weights. */
1856 retval = obstack_object_size (pool);
1858 /* Construct the weight. */
1859 for (cnt = 0; cnt < nrules; ++cnt)
1861 char buf[elem->weights[cnt].cnt * 7];
1862 int len = 0;
1863 int i;
1865 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1866 /* Encode the weight value. We do nothing for IGNORE entries. */
1867 if (elem->weights[cnt].w[i] != NULL)
1868 len += utf8_encode (&buf[len],
1869 elem->weights[cnt].w[i]->mborder[cnt]);
1871 /* And add the buffer content. */
1872 obstack_1grow (pool, len);
1873 obstack_grow (pool, buf, len);
1876 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1880 static int32_t
1881 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1882 struct element_t *elem)
1884 size_t cnt;
1885 int32_t retval;
1887 /* Optimize the use of UNDEFINED. */
1888 if (elem == &collate->undefined)
1889 /* The weights are already inserted. */
1890 return 0;
1892 /* This byte can start exactly one collation element and this is
1893 a single byte. We can directly give the index to the weights. */
1894 retval = obstack_object_size (pool) / sizeof (int32_t);
1896 /* Construct the weight. */
1897 for (cnt = 0; cnt < nrules; ++cnt)
1899 int32_t buf[elem->weights[cnt].cnt];
1900 int i;
1901 int32_t j;
1903 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1904 if (elem->weights[cnt].w[i] != NULL)
1905 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1907 /* And add the buffer content. */
1908 obstack_int32_grow (pool, j);
1910 obstack_grow (pool, buf, j * sizeof (int32_t));
1913 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1917 void
1918 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
1919 const char *output_path)
1921 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1922 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1923 struct iovec iov[2 + nelems];
1924 struct locale_file data;
1925 uint32_t idx[nelems];
1926 size_t cnt;
1927 size_t ch;
1928 int32_t tablemb[256];
1929 struct obstack weightpool;
1930 struct obstack extrapool;
1931 struct obstack indirectpool;
1932 struct section_list *sect;
1933 struct collidx_table tablewc;
1934 uint32_t elem_size;
1935 uint32_t *elem_table;
1936 int i;
1937 struct element_t *runp;
1939 data.magic = LIMAGIC (LC_COLLATE);
1940 data.n = nelems;
1941 iov[0].iov_base = (void *) &data;
1942 iov[0].iov_len = sizeof (data);
1944 iov[1].iov_base = (void *) idx;
1945 iov[1].iov_len = sizeof (idx);
1947 idx[0] = iov[0].iov_len + iov[1].iov_len;
1948 cnt = 0;
1950 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1951 iov[2 + cnt].iov_base = &nrules;
1952 iov[2 + cnt].iov_len = sizeof (uint32_t);
1953 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1954 ++cnt;
1956 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
1957 if (collate == NULL)
1959 int32_t dummy = 0;
1961 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1963 /* The words have to be handled specially. */
1964 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
1966 iov[2 + cnt].iov_base = &dummy;
1967 iov[2 + cnt].iov_len = sizeof (int32_t);
1969 else
1971 iov[2 + cnt].iov_base = NULL;
1972 iov[2 + cnt].iov_len = 0;
1975 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1976 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1977 ++cnt;
1980 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
1982 write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);
1984 return;
1987 obstack_init (&weightpool);
1988 obstack_init (&extrapool);
1989 obstack_init (&indirectpool);
1991 /* Since we are using the sign of an integer to mark indirection the
1992 offsets in the arrays we are indirectly referring to must not be
1993 zero since -0 == 0. Therefore we add a bit of dummy content. */
1994 obstack_int32_grow (&extrapool, 0);
1995 obstack_int32_grow (&indirectpool, 0);
1997 /* Prepare the ruleset table. */
1998 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
1999 if (sect->rules != NULL && sect->ruleidx == i)
2001 int j;
2003 obstack_make_room (&weightpool, nrules);
2005 for (j = 0; j < nrules; ++j)
2006 obstack_1grow_fast (&weightpool, sect->rules[j]);
2007 ++i;
2009 /* And align the output. */
2010 i = (nrules * i) % __alignof__ (int32_t);
2011 if (i > 0)
2013 obstack_1grow (&weightpool, '\0');
2014 while (++i < __alignof__ (int32_t));
2016 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2017 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2018 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2019 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2020 ++cnt;
2022 /* Generate the 8-bit table. Walk through the lists of sequences
2023 starting with the same byte and add them one after the other to
2024 the table. In case we have more than one sequence starting with
2025 the same byte we have to use extra indirection.
2027 First add a record for the NUL byte. This entry will never be used
2028 so it does not matter. */
2029 tablemb[0] = 0;
2031 /* Now insert the `UNDEFINED' value if it is used. Since this value
2032 will probably be used more than once it is good to store the
2033 weights only once. */
2034 if (collate->undefined.used_in_level != 0)
2035 output_weight (&weightpool, collate, &collate->undefined);
2037 for (ch = 1; ch < 256; ++ch)
2038 if (collate->mbheads[ch]->mbnext == NULL
2039 && collate->mbheads[ch]->nmbs <= 1)
2041 tablemb[ch] = output_weight (&weightpool, collate,
2042 collate->mbheads[ch]);
2044 else
2046 /* The entries in the list are sorted by length and then
2047 alphabetically. This is the order in which we will add the
2048 elements to the collation table. This allows simply walking
2049 the table in sequence and stopping at the first matching
2050 entry. Since the longer sequences are coming first in the
2051 list they have the possibility to match first, just as it
2052 has to be. In the worst case we are walking to the end of
2053 the list where we put, if no singlebyte sequence is defined
2054 in the locale definition, the weights for UNDEFINED.
2056 To reduce the length of the search list we compress them a bit.
2057 This happens by collecting sequences of consecutive byte
2058 sequences in one entry (having and begin and end byte sequence)
2059 and add only one index into the weight table. We can find the
2060 consecutive entries since they are also consecutive in the list. */
2061 struct element_t *runp = collate->mbheads[ch];
2062 struct element_t *lastp;
2064 assert ((obstack_object_size (&extrapool)
2065 & (__alignof__ (int32_t) - 1)) == 0);
2067 tablemb[ch] = -obstack_object_size (&extrapool);
2071 /* Store the current index in the weight table. We know that
2072 the current position in the `extrapool' is aligned on a
2073 32-bit address. */
2074 int32_t weightidx;
2075 int added;
2077 /* Find out whether this is a single entry or we have more than
2078 one consecutive entry. */
2079 if (runp->mbnext != NULL
2080 && runp->nmbs == runp->mbnext->nmbs
2081 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2082 && (runp->mbs[runp->nmbs - 1]
2083 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2085 int i;
2086 struct element_t *series_startp = runp;
2087 struct element_t *curp;
2089 /* Compute how much space we will need. */
2090 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2091 + __alignof__ (int32_t) - 1)
2092 & ~(__alignof__ (int32_t) - 1));
2093 assert ((obstack_object_size (&extrapool)
2094 & (__alignof__ (int32_t) - 1)) == 0);
2095 obstack_make_room (&extrapool, added);
2097 /* More than one consecutive entry. We mark this by having
2098 a negative index into the indirect table. */
2099 obstack_int32_grow_fast (&extrapool,
2100 -(obstack_object_size (&indirectpool)
2101 / sizeof (int32_t)));
2103 /* Now search first the end of the series. */
2105 runp = runp->mbnext;
2106 while (runp->mbnext != NULL
2107 && runp->nmbs == runp->mbnext->nmbs
2108 && memcmp (runp->mbs, runp->mbnext->mbs,
2109 runp->nmbs - 1) == 0
2110 && (runp->mbs[runp->nmbs - 1]
2111 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2113 /* Now walk backward from here to the beginning. */
2114 curp = runp;
2116 assert (runp->nmbs <= 256);
2117 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2118 for (i = 1; i < curp->nmbs; ++i)
2119 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2121 /* Now find the end of the consecutive sequence and
2122 add all the indeces in the indirect pool. */
2125 weightidx = output_weight (&weightpool, collate, curp);
2126 obstack_int32_grow (&indirectpool, weightidx);
2128 curp = curp->mblast;
2130 while (curp != series_startp);
2132 /* Add the final weight. */
2133 weightidx = output_weight (&weightpool, collate, curp);
2134 obstack_int32_grow (&indirectpool, weightidx);
2136 /* And add the end byte sequence. Without length this
2137 time. */
2138 for (i = 1; i < curp->nmbs; ++i)
2139 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2141 else
2143 /* A single entry. Simply add the index and the length and
2144 string (except for the first character which is already
2145 tested for). */
2146 int i;
2148 /* Output the weight info. */
2149 weightidx = output_weight (&weightpool, collate, runp);
2151 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2152 + __alignof__ (int32_t) - 1)
2153 & ~(__alignof__ (int32_t) - 1));
2154 assert ((obstack_object_size (&extrapool)
2155 & (__alignof__ (int32_t) - 1)) == 0);
2156 obstack_make_room (&extrapool, added);
2158 obstack_int32_grow_fast (&extrapool, weightidx);
2159 assert (runp->nmbs <= 256);
2160 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2162 for (i = 1; i < runp->nmbs; ++i)
2163 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2166 /* Add alignment bytes if necessary. */
2167 while ((obstack_object_size (&extrapool)
2168 & (__alignof__ (int32_t) - 1)) != 0)
2169 obstack_1grow_fast (&extrapool, '\0');
2171 /* Next entry. */
2172 lastp = runp;
2173 runp = runp->mbnext;
2175 while (runp != NULL);
2177 assert ((obstack_object_size (&extrapool)
2178 & (__alignof__ (int32_t) - 1)) == 0);
2180 /* If the final entry in the list is not a single character we
2181 add an UNDEFINED entry here. */
2182 if (lastp->nmbs != 1)
2184 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2185 & ~(__alignof__ (int32_t) - 1));
2186 obstack_make_room (&extrapool, added);
2188 obstack_int32_grow_fast (&extrapool, 0);
2189 /* XXX What rule? We just pick the first. */
2190 obstack_1grow_fast (&extrapool, 0);
2191 /* Length is zero. */
2192 obstack_1grow_fast (&extrapool, 0);
2194 /* Add alignment bytes if necessary. */
2195 while ((obstack_object_size (&extrapool)
2196 & (__alignof__ (int32_t) - 1)) != 0)
2197 obstack_1grow_fast (&extrapool, '\0');
2201 /* Add padding to the tables if necessary. */
2202 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2203 != 0)
2204 obstack_1grow (&weightpool, 0);
2206 /* Now add the four tables. */
2207 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2208 iov[2 + cnt].iov_base = tablemb;
2209 iov[2 + cnt].iov_len = sizeof (tablemb);
2210 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2211 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2212 ++cnt;
2214 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2215 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2216 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2217 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2218 ++cnt;
2220 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2221 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2222 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2223 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2224 ++cnt;
2226 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2227 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2228 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2229 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2230 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2231 ++cnt;
2234 /* Now the same for the wide character table. We need to store some
2235 more information here. */
2236 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2237 iov[2 + cnt].iov_base = NULL;
2238 iov[2 + cnt].iov_len = 0;
2239 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2240 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2241 ++cnt;
2243 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2244 iov[2 + cnt].iov_base = NULL;
2245 iov[2 + cnt].iov_len = 0;
2246 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2247 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2248 ++cnt;
2250 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2251 iov[2 + cnt].iov_base = NULL;
2252 iov[2 + cnt].iov_len = 0;
2253 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2254 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2255 ++cnt;
2257 /* Since we are using the sign of an integer to mark indirection the
2258 offsets in the arrays we are indirectly referring to must not be
2259 zero since -0 == 0. Therefore we add a bit of dummy content. */
2260 obstack_int32_grow (&extrapool, 0);
2261 obstack_int32_grow (&indirectpool, 0);
2263 /* Now insert the `UNDEFINED' value if it is used. Since this value
2264 will probably be used more than once it is good to store the
2265 weights only once. */
2266 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2267 abort ();
2269 /* Generate the table. Walk through the lists of sequences starting
2270 with the same wide character and add them one after the other to
2271 the table. In case we have more than one sequence starting with
2272 the same byte we have to use extra indirection. */
2274 auto void add_to_tablewc (uint32_t ch, struct element_t *runp);
2276 void add_to_tablewc (uint32_t ch, struct element_t *runp)
2278 if (runp->wcnext == NULL && runp->nwcs == 1)
2280 int32_t weigthidx = output_weightwc (&weightpool, collate, runp);
2281 collidx_table_add (&tablewc, ch, weigthidx);
2283 else
2285 /* As for the singlebyte table, we recognize sequences and
2286 compress them. */
2287 struct element_t *lastp;
2289 collidx_table_add (&tablewc, ch,
2290 -(obstack_object_size (&extrapool) / sizeof (uint32_t)));
2294 /* Store the current index in the weight table. We know that
2295 the current position in the `extrapool' is aligned on a
2296 32-bit address. */
2297 int32_t weightidx;
2298 int added;
2300 /* Find out whether this is a single entry or we have more than
2301 one consecutive entry. */
2302 if (runp->wcnext != NULL
2303 && runp->nwcs == runp->wcnext->nwcs
2304 && wmemcmp ((wchar_t *) runp->wcs,
2305 (wchar_t *)runp->wcnext->wcs,
2306 runp->nwcs - 1) == 0
2307 && (runp->wcs[runp->nwcs - 1]
2308 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2310 int i;
2311 struct element_t *series_startp = runp;
2312 struct element_t *curp;
2314 /* Now add first the initial byte sequence. */
2315 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2316 if (sizeof (int32_t) == sizeof (int))
2317 obstack_make_room (&extrapool, added);
2319 /* More than one consecutive entry. We mark this by having
2320 a negative index into the indirect table. */
2321 obstack_int32_grow_fast (&extrapool,
2322 -(obstack_object_size (&indirectpool)
2323 / sizeof (int32_t)));
2324 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2327 runp = runp->wcnext;
2328 while (runp->wcnext != NULL
2329 && runp->nwcs == runp->wcnext->nwcs
2330 && wmemcmp ((wchar_t *) runp->wcs,
2331 (wchar_t *)runp->wcnext->wcs,
2332 runp->nwcs - 1) == 0
2333 && (runp->wcs[runp->nwcs - 1]
2334 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2336 /* Now walk backward from here to the beginning. */
2337 curp = runp;
2339 for (i = 1; i < runp->nwcs; ++i)
2340 obstack_int32_grow_fast (&extrapool, curp->wcs[i]);
2342 /* Now find the end of the consecutive sequence and
2343 add all the indeces in the indirect pool. */
2346 weightidx = output_weightwc (&weightpool, collate,
2347 curp);
2348 obstack_int32_grow (&indirectpool, weightidx);
2350 curp = curp->wclast;
2352 while (curp != series_startp);
2354 /* Add the final weight. */
2355 weightidx = output_weightwc (&weightpool, collate, curp);
2356 obstack_int32_grow (&indirectpool, weightidx);
2358 /* And add the end byte sequence. Without length this
2359 time. */
2360 for (i = 1; i < curp->nwcs; ++i)
2361 obstack_int32_grow (&extrapool, curp->wcs[i]);
2363 else
2365 /* A single entry. Simply add the index and the length and
2366 string (except for the first character which is already
2367 tested for). */
2368 int i;
2370 /* Output the weight info. */
2371 weightidx = output_weightwc (&weightpool, collate, runp);
2373 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2374 if (sizeof (int) == sizeof (int32_t))
2375 obstack_make_room (&extrapool, added);
2377 obstack_int32_grow_fast (&extrapool, weightidx);
2378 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2379 for (i = 1; i < runp->nwcs; ++i)
2380 obstack_int32_grow_fast (&extrapool, runp->wcs[i]);
2383 /* Next entry. */
2384 lastp = runp;
2385 runp = runp->wcnext;
2387 while (runp != NULL);
2391 tablewc.p = 6;
2392 tablewc.q = 10;
2393 collidx_table_init (&tablewc);
2395 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2397 collidx_table_finalize (&tablewc);
2400 /* Now add the four tables. */
2401 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2402 iov[2 + cnt].iov_base = tablewc.result;
2403 iov[2 + cnt].iov_len = tablewc.result_size;
2404 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2405 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2406 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2407 ++cnt;
2409 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2410 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2411 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2412 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2413 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2414 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2415 ++cnt;
2417 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2418 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2419 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2420 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2421 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2422 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2423 ++cnt;
2425 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2426 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2427 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2428 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2429 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2430 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2431 ++cnt;
2434 /* Finally write the table with collation element names out. It is
2435 a hash table with a simple function which gets the name of the
2436 character as the input. One character might have many names. The
2437 value associated with the name is an index into the weight table
2438 where we are then interested in the first-level weight value.
2440 To determine how large the table should be we count the
2441 elements we have to put in. Since we are using internal chaining
2442 using a secondary hash function we have to make the table a bit
2443 larger to avoid extremely long search times. We can achieve
2444 good results with a 40% larger table than there are entries. */
2445 elem_size = 0;
2446 runp = collate->start;
2447 while (runp != NULL)
2449 if (runp->mbs != NULL && runp->weights != NULL)
2450 /* Yep, the element really counts. */
2451 ++elem_size;
2453 runp = runp->next;
2455 /* Add 40% and find the next prime number. */
2456 elem_size = MIN (next_prime (elem_size * 1.4), 257);
2458 /* Allocate the table. Each entry consists of two words: the hash
2459 value and an index in a secondary table which provides the index
2460 into the weight table and the string itself (so that a match can
2461 be determined). */
2462 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2463 elem_size * 2 * sizeof (uint32_t));
2464 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2466 /* Now add the elements. */
2467 runp = collate->start;
2468 while (runp != NULL)
2470 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2472 /* Compute the hash value of the name. */
2473 uint32_t namelen = strlen (runp->name);
2474 uint32_t hash = elem_hash (runp->name, namelen);
2475 size_t idx = hash % elem_size;
2477 if (elem_table[idx * 2] != 0)
2479 /* The spot is already taken. Try iterating using the value
2480 from the secondary hashing function. */
2481 size_t iter = hash % (elem_size - 2);
2485 idx += iter;
2486 if (idx >= elem_size)
2487 idx -= elem_size;
2489 while (elem_table[idx * 2] != 0);
2491 /* This is the spot where we will insert the value. */
2492 elem_table[idx * 2] = hash;
2493 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2495 /* Then the string itself including length. */
2496 obstack_1grow (&extrapool, namelen);
2497 obstack_grow (&extrapool, runp->name, namelen);
2499 /* And the multibyte representation. */
2500 obstack_1grow (&extrapool, runp->nmbs);
2501 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2503 /* And align again to 32 bits. */
2504 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2505 obstack_grow (&extrapool, "\0\0",
2506 (sizeof (int32_t)
2507 - ((1 + namelen + 1 + runp->nmbs)
2508 % sizeof (int32_t))));
2510 /* Now some 32-bit values: multibyte collation sequence,
2511 wide char string (including length), and wide char
2512 collation sequence. */
2513 obstack_int32_grow (&extrapool, runp->mbseqorder);
2515 obstack_int32_grow (&extrapool, runp->nwcs);
2516 obstack_grow (&extrapool, runp->wcs,
2517 runp->nwcs * sizeof (uint32_t));
2519 obstack_int32_grow (&extrapool, runp->wcseqorder);
2522 runp = runp->next;
2525 /* Prepare to write out this data. */
2526 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2527 iov[2 + cnt].iov_base = &elem_size;
2528 iov[2 + cnt].iov_len = sizeof (int32_t);
2529 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2530 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2531 ++cnt;
2533 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2534 iov[2 + cnt].iov_base = elem_table;
2535 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2536 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2537 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2538 ++cnt;
2540 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2541 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2542 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2543 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2544 ++cnt;
2546 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2547 iov[2 + cnt].iov_base = collate->mbseqorder;
2548 iov[2 + cnt].iov_len = 256;
2549 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2550 ++cnt;
2552 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2553 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2554 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2555 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2556 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2557 ++cnt;
2559 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2560 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2561 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2562 ++cnt;
2564 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2566 write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);
2568 obstack_free (&weightpool, NULL);
2569 obstack_free (&extrapool, NULL);
2570 obstack_free (&indirectpool, NULL);
2574 void
2575 collate_read (struct linereader *ldfile, struct localedef_t *result,
2576 const struct charmap_t *charmap, const char *repertoire_name,
2577 int ignore_content)
2579 struct repertoire_t *repertoire = NULL;
2580 struct locale_collate_t *collate;
2581 struct token *now;
2582 struct token *arg = NULL;
2583 enum token_t nowtok;
2584 enum token_t was_ellipsis = tok_none;
2585 struct localedef_t *copy_locale = NULL;
2586 /* Parsing state:
2587 0 - start
2588 1 - between `order-start' and `order-end'
2589 2 - after `order-end'
2590 3 - after `reorder-after', waiting for `reorder-end'
2591 4 - after `reorder-end'
2592 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2593 6 - after `reorder-sections-end'
2595 int state = 0;
2597 /* Get the repertoire we have to use. */
2598 if (repertoire_name != NULL)
2599 repertoire = repertoire_read (repertoire_name);
2601 /* The rest of the line containing `LC_COLLATE' must be free. */
2602 lr_ignore_rest (ldfile, 1);
2606 now = lr_token (ldfile, charmap, result, NULL, verbose);
2607 nowtok = now->tok;
2609 while (nowtok == tok_eol);
2611 if (nowtok == tok_copy)
2613 state = 2;
2614 now = lr_token (ldfile, charmap, result, NULL, verbose);
2615 if (now->tok != tok_string)
2617 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2619 skip_category:
2621 now = lr_token (ldfile, charmap, result, NULL, verbose);
2622 while (now->tok != tok_eof && now->tok != tok_end);
2624 if (now->tok != tok_eof
2625 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2626 now->tok == tok_eof))
2627 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2628 else if (now->tok != tok_lc_collate)
2630 lr_error (ldfile, _("\
2631 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2632 lr_ignore_rest (ldfile, 0);
2634 else
2635 lr_ignore_rest (ldfile, 1);
2637 return;
2640 if (! ignore_content)
2642 /* Get the locale definition. */
2643 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2644 repertoire_name, charmap, NULL);
2645 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2647 /* Not yet loaded. So do it now. */
2648 if (locfile_read (copy_locale, charmap) != 0)
2649 goto skip_category;
2653 lr_ignore_rest (ldfile, 1);
2655 now = lr_token (ldfile, charmap, result, NULL, verbose);
2656 nowtok = now->tok;
2659 /* Prepare the data structures. */
2660 collate_startup (ldfile, result, copy_locale, ignore_content);
2661 collate = result->categories[LC_COLLATE].collate;
2663 while (1)
2665 char ucs4buf[10];
2666 char *symstr;
2667 size_t symlen;
2669 /* Of course we don't proceed beyond the end of file. */
2670 if (nowtok == tok_eof)
2671 break;
2673 /* Ignore empty lines. */
2674 if (nowtok == tok_eol)
2676 now = lr_token (ldfile, charmap, result, NULL, verbose);
2677 nowtok = now->tok;
2678 continue;
2681 switch (nowtok)
2683 case tok_copy:
2684 /* Allow copying other locales. */
2685 now = lr_token (ldfile, charmap, result, NULL, verbose);
2686 if (now->tok != tok_string)
2687 goto err_label;
2689 if (! ignore_content)
2690 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2691 charmap, result);
2693 lr_ignore_rest (ldfile, 1);
2694 break;
2696 case tok_coll_weight_max:
2697 /* Ignore the rest of the line if we don't need the input of
2698 this line. */
2699 if (ignore_content)
2701 lr_ignore_rest (ldfile, 0);
2702 break;
2705 if (state != 0)
2706 goto err_label;
2708 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2709 if (arg->tok != tok_number)
2710 goto err_label;
2711 if (collate->col_weight_max != -1)
2712 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2713 "LC_COLLATE", "col_weight_max");
2714 else
2715 collate->col_weight_max = arg->val.num;
2716 lr_ignore_rest (ldfile, 1);
2717 break;
2719 case tok_section_symbol:
2720 /* Ignore the rest of the line if we don't need the input of
2721 this line. */
2722 if (ignore_content)
2724 lr_ignore_rest (ldfile, 0);
2725 break;
2728 if (state != 0)
2729 goto err_label;
2731 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2732 if (arg->tok != tok_bsymbol)
2733 goto err_label;
2734 else if (!ignore_content)
2736 /* Check whether this section is already known. */
2737 struct section_list *known = collate->sections;
2738 while (known != NULL)
2740 if (strcmp (known->name, arg->val.str.startmb) == 0)
2741 break;
2742 known = known->next;
2745 if (known != NULL)
2747 lr_error (ldfile,
2748 _("%s: duplicate declaration of section `%s'"),
2749 "LC_COLLATE", arg->val.str.startmb);
2750 free (arg->val.str.startmb);
2752 else
2753 collate->sections = make_seclist_elem (collate,
2754 arg->val.str.startmb,
2755 collate->sections);
2757 lr_ignore_rest (ldfile, known == NULL);
2759 else
2761 free (arg->val.str.startmb);
2762 lr_ignore_rest (ldfile, 0);
2764 break;
2766 case tok_collating_element:
2767 /* Ignore the rest of the line if we don't need the input of
2768 this line. */
2769 if (ignore_content)
2771 lr_ignore_rest (ldfile, 0);
2772 break;
2775 if (state != 0 && state != 2)
2776 goto err_label;
2778 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2779 if (arg->tok != tok_bsymbol)
2780 goto err_label;
2781 else
2783 const char *symbol = arg->val.str.startmb;
2784 size_t symbol_len = arg->val.str.lenmb;
2786 /* Next the `from' keyword. */
2787 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2788 if (arg->tok != tok_from)
2790 free ((char *) symbol);
2791 goto err_label;
2794 ldfile->return_widestr = 1;
2795 ldfile->translate_strings = 1;
2797 /* Finally the string with the replacement. */
2798 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2800 ldfile->return_widestr = 0;
2801 ldfile->translate_strings = 0;
2803 if (arg->tok != tok_string)
2804 goto err_label;
2806 if (!ignore_content && symbol != NULL)
2808 /* The name is already defined. */
2809 if (check_duplicate (ldfile, collate, charmap,
2810 repertoire, symbol, symbol_len))
2811 goto col_elem_free;
2813 if (arg->val.str.startmb != NULL)
2814 insert_entry (&collate->elem_table, symbol, symbol_len,
2815 new_element (collate,
2816 arg->val.str.startmb,
2817 arg->val.str.lenmb - 1,
2818 arg->val.str.startwc,
2819 symbol, symbol_len, 0));
2821 else
2823 col_elem_free:
2824 if (symbol != NULL)
2825 free ((char *) symbol);
2826 if (arg->val.str.startmb != NULL)
2827 free (arg->val.str.startmb);
2828 if (arg->val.str.startwc != NULL)
2829 free (arg->val.str.startwc);
2831 lr_ignore_rest (ldfile, 1);
2833 break;
2835 case tok_collating_symbol:
2836 /* Ignore the rest of the line if we don't need the input of
2837 this line. */
2838 if (ignore_content)
2840 lr_ignore_rest (ldfile, 0);
2841 break;
2844 if (state != 0 && state != 2)
2845 goto err_label;
2847 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2848 if (arg->tok != tok_bsymbol)
2849 goto err_label;
2850 else
2852 char *symbol = arg->val.str.startmb;
2853 size_t symbol_len = arg->val.str.lenmb;
2854 char *endsymbol = NULL;
2855 size_t endsymbol_len = 0;
2856 enum token_t ellipsis = tok_none;
2858 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2859 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2861 ellipsis = arg->tok;
2863 arg = lr_token (ldfile, charmap, result, repertoire,
2864 verbose);
2865 if (arg->tok != tok_bsymbol)
2867 free (symbol);
2868 goto err_label;
2871 endsymbol = arg->val.str.startmb;
2872 endsymbol_len = arg->val.str.lenmb;
2874 lr_ignore_rest (ldfile, 1);
2876 else if (arg->tok != tok_eol)
2878 free (symbol);
2879 goto err_label;
2882 if (!ignore_content)
2884 if (symbol == NULL
2885 || (ellipsis != tok_none && endsymbol == NULL))
2887 lr_error (ldfile, _("\
2888 %s: unknown character in collating symbol name"),
2889 "LC_COLLATE");
2890 goto col_sym_free;
2892 else if (ellipsis == tok_none)
2894 /* A single symbol, no ellipsis. */
2895 if (check_duplicate (ldfile, collate, charmap,
2896 repertoire, symbol, symbol_len))
2897 /* The name is already defined. */
2898 goto col_sym_free;
2900 insert_entry (&collate->sym_table, symbol, symbol_len,
2901 new_symbol (collate, symbol, symbol_len));
2903 else if (symbol_len != endsymbol_len)
2905 col_sym_inv_range:
2906 lr_error (ldfile,
2907 _("invalid names for character range"));
2908 goto col_sym_free;
2910 else
2912 /* Oh my, we have to handle an ellipsis. First, as
2913 usual, determine the common prefix and then
2914 convert the rest into a range. */
2915 size_t prefixlen;
2916 unsigned long int from;
2917 unsigned long int to;
2918 char *endp;
2920 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2921 if (symbol[prefixlen] != endsymbol[prefixlen])
2922 break;
2924 /* Convert the rest into numbers. */
2925 symbol[symbol_len] = '\0';
2926 from = strtoul (&symbol[prefixlen], &endp,
2927 ellipsis == tok_ellipsis2 ? 16 : 10);
2928 if (*endp != '\0')
2929 goto col_sym_inv_range;
2931 endsymbol[symbol_len] = '\0';
2932 to = strtoul (&endsymbol[prefixlen], &endp,
2933 ellipsis == tok_ellipsis2 ? 16 : 10);
2934 if (*endp != '\0')
2935 goto col_sym_inv_range;
2937 if (from > to)
2938 goto col_sym_inv_range;
2940 /* Now loop over all entries. */
2941 while (from <= to)
2943 char *symbuf;
2945 symbuf = (char *) obstack_alloc (&collate->mempool,
2946 symbol_len + 1);
2948 /* Create the name. */
2949 sprintf (symbuf,
2950 ellipsis == tok_ellipsis2
2951 ? "%.*s%.*lX" : "%.*s%.*lu",
2952 (int) prefixlen, symbol,
2953 (int) (symbol_len - prefixlen), from);
2955 if (check_duplicate (ldfile, collate, charmap,
2956 repertoire, symbuf, symbol_len))
2957 /* The name is already defined. */
2958 goto col_sym_free;
2960 insert_entry (&collate->sym_table, symbuf,
2961 symbol_len,
2962 new_symbol (collate, symbuf,
2963 symbol_len));
2965 /* Increment the counter. */
2966 ++from;
2969 goto col_sym_free;
2972 else
2974 col_sym_free:
2975 if (symbol != NULL)
2976 free (symbol);
2977 if (endsymbol != NULL)
2978 free (endsymbol);
2981 break;
2983 case tok_symbol_equivalence:
2984 /* Ignore the rest of the line if we don't need the input of
2985 this line. */
2986 if (ignore_content)
2988 lr_ignore_rest (ldfile, 0);
2989 break;
2992 if (state != 0)
2993 goto err_label;
2995 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2996 if (arg->tok != tok_bsymbol)
2997 goto err_label;
2998 else
3000 const char *newname = arg->val.str.startmb;
3001 size_t newname_len = arg->val.str.lenmb;
3002 const char *symname;
3003 size_t symname_len;
3004 struct symbol_t *symval;
3006 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3007 if (arg->tok != tok_bsymbol)
3009 if (newname != NULL)
3010 free ((char *) newname);
3011 goto err_label;
3014 symname = arg->val.str.startmb;
3015 symname_len = arg->val.str.lenmb;
3017 if (newname == NULL)
3019 lr_error (ldfile, _("\
3020 %s: unknown character in equivalent definition name"),
3021 "LC_COLLATE");
3023 sym_equiv_free:
3024 if (newname != NULL)
3025 free ((char *) newname);
3026 if (symname != NULL)
3027 free ((char *) symname);
3028 break;
3030 if (symname == NULL)
3032 lr_error (ldfile, _("\
3033 %s: unknown character in equivalent definition value"),
3034 "LC_COLLATE");
3035 goto sym_equiv_free;
3038 /* See whether the symbol name is already defined. */
3039 if (find_entry (&collate->sym_table, symname, symname_len,
3040 (void **) &symval) != 0)
3042 lr_error (ldfile, _("\
3043 %s: unknown symbol `%s' in equivalent definition"),
3044 "LC_COLLATE", symname);
3045 goto col_sym_free;
3048 if (insert_entry (&collate->sym_table,
3049 newname, newname_len, symval) < 0)
3051 lr_error (ldfile, _("\
3052 error while adding equivalent collating symbol"));
3053 goto sym_equiv_free;
3056 free ((char *) symname);
3058 lr_ignore_rest (ldfile, 1);
3059 break;
3061 case tok_script:
3062 /* We get told about the scripts we know. */
3063 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3064 if (arg->tok != tok_bsymbol)
3065 goto err_label;
3066 else
3068 struct section_list *runp = collate->known_sections;
3069 char *name;
3071 while (runp != NULL)
3072 if (strncmp (runp->name, arg->val.str.startmb,
3073 arg->val.str.lenmb) == 0
3074 && runp->name[arg->val.str.lenmb] == '\0')
3075 break;
3076 else
3077 runp = runp->def_next;
3079 if (runp != NULL)
3081 lr_error (ldfile, _("duplicate definition of script `%s'"),
3082 runp->name);
3083 lr_ignore_rest (ldfile, 0);
3084 break;
3087 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3088 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3089 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3090 name[arg->val.str.lenmb] = '\0';
3091 runp->name = name;
3093 runp->def_next = collate->known_sections;
3094 collate->known_sections = runp;
3096 lr_ignore_rest (ldfile, 1);
3097 break;
3099 case tok_order_start:
3100 /* Ignore the rest of the line if we don't need the input of
3101 this line. */
3102 if (ignore_content)
3104 lr_ignore_rest (ldfile, 0);
3105 break;
3108 if (state != 0 && state != 1)
3109 goto err_label;
3110 state = 1;
3112 /* The 14652 draft does not specify whether all `order_start' lines
3113 must contain the same number of sort-rules, but 14651 does. So
3114 we require this here as well. */
3115 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3116 if (arg->tok == tok_bsymbol)
3118 /* This better should be a section name. */
3119 struct section_list *sp = collate->known_sections;
3120 while (sp != NULL
3121 && (sp->name == NULL
3122 || strncmp (sp->name, arg->val.str.startmb,
3123 arg->val.str.lenmb) != 0
3124 || sp->name[arg->val.str.lenmb] != '\0'))
3125 sp = sp->def_next;
3127 if (sp == NULL)
3129 lr_error (ldfile, _("\
3130 %s: unknown section name `%s'"),
3131 "LC_COLLATE", arg->val.str.startmb);
3132 /* We use the error section. */
3133 collate->current_section = &collate->error_section;
3135 if (collate->error_section.first == NULL)
3137 /* Insert &collate->error_section at the end of
3138 the collate->sections list. */
3139 if (collate->sections == NULL)
3140 collate->sections = &collate->error_section;
3141 else
3143 sp = collate->sections;
3144 while (sp->next != NULL)
3145 sp = sp->next;
3147 sp->next = &collate->error_section;
3149 collate->error_section.next = NULL;
3152 else
3154 /* One should not be allowed to open the same
3155 section twice. */
3156 if (sp->first != NULL)
3157 lr_error (ldfile, _("\
3158 %s: multiple order definitions for section `%s'"),
3159 "LC_COLLATE", sp->name);
3160 else
3162 /* Insert sp in the collate->sections list,
3163 right after collate->current_section. */
3164 if (collate->current_section == NULL)
3165 collate->current_section = sp;
3166 else
3168 sp->next = collate->current_section->next;
3169 collate->current_section->next = sp;
3173 /* Next should come the end of the line or a semicolon. */
3174 arg = lr_token (ldfile, charmap, result, repertoire,
3175 verbose);
3176 if (arg->tok == tok_eol)
3178 uint32_t cnt;
3180 /* This means we have exactly one rule: `forward'. */
3181 if (nrules > 1)
3182 lr_error (ldfile, _("\
3183 %s: invalid number of sorting rules"),
3184 "LC_COLLATE");
3185 else
3186 nrules = 1;
3187 sp->rules = obstack_alloc (&collate->mempool,
3188 (sizeof (enum coll_sort_rule)
3189 * nrules));
3190 for (cnt = 0; cnt < nrules; ++cnt)
3191 sp->rules[cnt] = sort_forward;
3193 /* Next line. */
3194 break;
3197 /* Get the next token. */
3198 arg = lr_token (ldfile, charmap, result, repertoire,
3199 verbose);
3202 else
3204 /* There is no section symbol. Therefore we use the unnamed
3205 section. */
3206 collate->current_section = &collate->unnamed_section;
3208 if (collate->unnamed_section.first != NULL)
3209 lr_error (ldfile, _("\
3210 %s: multiple order definitions for unnamed section"),
3211 "LC_COLLATE");
3212 else
3214 /* Insert &collate->unnamed_section at the beginning of
3215 the collate->sections list. */
3216 collate->unnamed_section.next = collate->sections;
3217 collate->sections = &collate->unnamed_section;
3221 /* Now read the direction names. */
3222 read_directions (ldfile, arg, charmap, repertoire, result);
3224 /* From now we need the strings untranslated. */
3225 ldfile->translate_strings = 0;
3226 break;
3228 case tok_order_end:
3229 /* Ignore the rest of the line if we don't need the input of
3230 this line. */
3231 if (ignore_content)
3233 lr_ignore_rest (ldfile, 0);
3234 break;
3237 if (state != 1)
3238 goto err_label;
3240 /* Handle ellipsis at end of list. */
3241 if (was_ellipsis != tok_none)
3243 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3244 repertoire, result);
3245 was_ellipsis = tok_none;
3248 state = 2;
3249 lr_ignore_rest (ldfile, 1);
3250 break;
3252 case tok_reorder_after:
3253 /* Ignore the rest of the line if we don't need the input of
3254 this line. */
3255 if (ignore_content)
3257 lr_ignore_rest (ldfile, 0);
3258 break;
3261 if (state == 1)
3263 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3264 "LC_COLLATE");
3265 state = 2;
3267 /* Handle ellipsis at end of list. */
3268 if (was_ellipsis != tok_none)
3270 handle_ellipsis (ldfile, arg->val.str.startmb,
3271 arg->val.str.lenmb, was_ellipsis, charmap,
3272 repertoire, result);
3273 was_ellipsis = tok_none;
3276 else if (state != 2 && state != 3)
3277 goto err_label;
3278 state = 3;
3280 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3281 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3283 /* Find this symbol in the sequence table. */
3284 char ucsbuf[10];
3285 char *startmb;
3286 size_t lenmb;
3287 struct element_t *insp;
3288 int no_error = 1;
3290 if (arg->tok == tok_bsymbol)
3292 startmb = arg->val.str.startmb;
3293 lenmb = arg->val.str.lenmb;
3295 else
3297 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3298 startmb = ucsbuf;
3299 lenmb = 9;
3302 if (find_entry (&collate->seq_table, startmb, lenmb,
3303 (void **) &insp) == 0)
3304 /* Yes, the symbol exists. Simply point the cursor
3305 to it. */
3306 collate->cursor = insp;
3307 else
3309 struct symbol_t *symbp;
3311 if (find_entry (&collate->sym_table, startmb, lenmb,
3312 (void **) &symbp) == 0)
3314 if (symbp->order->last != NULL
3315 || symbp->order->next != NULL)
3316 collate->cursor = symbp->order;
3317 else
3319 /* This is a collating symbol but its position
3320 is not yet defined. */
3321 lr_error (ldfile, _("\
3322 %s: order for collating symbol %.*s not yet defined"),
3323 "LC_COLLATE", (int) lenmb, startmb);
3324 collate->cursor = NULL;
3325 no_error = 0;
3328 else if (find_entry (&collate->elem_table, startmb, lenmb,
3329 (void **) &insp) == 0)
3331 if (insp->last != NULL || insp->next != NULL)
3332 collate->cursor = insp;
3333 else
3335 /* This is a collating element but its position
3336 is not yet defined. */
3337 lr_error (ldfile, _("\
3338 %s: order for collating element %.*s not yet defined"),
3339 "LC_COLLATE", (int) lenmb, startmb);
3340 collate->cursor = NULL;
3341 no_error = 0;
3344 else
3346 /* This is bad. The symbol after which we have to
3347 insert does not exist. */
3348 lr_error (ldfile, _("\
3349 %s: cannot reorder after %.*s: symbol not known"),
3350 "LC_COLLATE", (int) lenmb, startmb);
3351 collate->cursor = NULL;
3352 no_error = 0;
3356 lr_ignore_rest (ldfile, no_error);
3358 else
3359 /* This must not happen. */
3360 goto err_label;
3361 break;
3363 case tok_reorder_end:
3364 /* Ignore the rest of the line if we don't need the input of
3365 this line. */
3366 if (ignore_content)
3367 break;
3369 if (state != 3)
3370 goto err_label;
3371 state = 4;
3372 lr_ignore_rest (ldfile, 1);
3373 break;
3375 case tok_reorder_sections_after:
3376 /* Ignore the rest of the line if we don't need the input of
3377 this line. */
3378 if (ignore_content)
3380 lr_ignore_rest (ldfile, 0);
3381 break;
3384 if (state == 1)
3386 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3387 "LC_COLLATE");
3388 state = 2;
3390 /* Handle ellipsis at end of list. */
3391 if (was_ellipsis != tok_none)
3393 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3394 repertoire, result);
3395 was_ellipsis = tok_none;
3398 else if (state == 3)
3400 WITH_CUR_LOCALE (error (0, 0, _("\
3401 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3402 state = 4;
3404 else if (state != 2 && state != 4)
3405 goto err_label;
3406 state = 5;
3408 /* Get the name of the sections we are adding after. */
3409 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3410 if (arg->tok == tok_bsymbol)
3412 /* Now find a section with this name. */
3413 struct section_list *runp = collate->sections;
3415 while (runp != NULL)
3417 if (runp->name != NULL
3418 && strlen (runp->name) == arg->val.str.lenmb
3419 && memcmp (runp->name, arg->val.str.startmb,
3420 arg->val.str.lenmb) == 0)
3421 break;
3423 runp = runp->next;
3426 if (runp != NULL)
3427 collate->current_section = runp;
3428 else
3430 /* This is bad. The section after which we have to
3431 reorder does not exist. Therefore we cannot
3432 process the whole rest of this reorder
3433 specification. */
3434 lr_error (ldfile, _("%s: section `%.*s' not known"),
3435 "LC_COLLATE", (int) arg->val.str.lenmb,
3436 arg->val.str.startmb);
3440 lr_ignore_rest (ldfile, 0);
3442 now = lr_token (ldfile, charmap, result, NULL, verbose);
3444 while (now->tok == tok_reorder_sections_after
3445 || now->tok == tok_reorder_sections_end
3446 || now->tok == tok_end);
3448 /* Process the token we just saw. */
3449 nowtok = now->tok;
3450 continue;
3453 else
3454 /* This must not happen. */
3455 goto err_label;
3456 break;
3458 case tok_reorder_sections_end:
3459 /* Ignore the rest of the line if we don't need the input of
3460 this line. */
3461 if (ignore_content)
3462 break;
3464 if (state != 5)
3465 goto err_label;
3466 state = 6;
3467 lr_ignore_rest (ldfile, 1);
3468 break;
3470 case tok_bsymbol:
3471 case tok_ucs4:
3472 /* Ignore the rest of the line if we don't need the input of
3473 this line. */
3474 if (ignore_content)
3476 lr_ignore_rest (ldfile, 0);
3477 break;
3480 if (state != 0 && state != 1 && state != 3 && state != 5)
3481 goto err_label;
3483 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3484 goto err_label;
3486 if (nowtok == tok_ucs4)
3488 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3489 symstr = ucs4buf;
3490 symlen = 9;
3492 else
3494 symstr = arg->val.str.startmb;
3495 symlen = arg->val.str.lenmb;
3498 if (state == 0)
3500 /* We are outside an `order_start' region. This means
3501 we must only accept definitions of values for
3502 collation symbols since these are purely abstract
3503 values and don't need directions associated. */
3504 struct element_t *seqp;
3506 if (find_entry (&collate->seq_table, symstr, symlen,
3507 (void **) &seqp) == 0)
3509 /* It's already defined. First check whether this
3510 is really a collating symbol. */
3511 if (seqp->is_character)
3512 goto err_label;
3514 goto move_entry;
3516 else
3518 void *result;
3520 if (find_entry (&collate->sym_table, symstr, symlen,
3521 &result) != 0)
3522 /* No collating symbol, it's an error. */
3523 goto err_label;
3525 /* Maybe this is the first time we define a symbol
3526 value and it is before the first actual section. */
3527 if (collate->sections == NULL)
3528 collate->sections = collate->current_section =
3529 &collate->symbol_section;
3532 if (was_ellipsis != tok_none)
3535 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3536 charmap, repertoire, result);
3538 /* Remember that we processed the ellipsis. */
3539 was_ellipsis = tok_none;
3541 /* And don't add the value a second time. */
3542 break;
3545 else if (state == 3)
3547 /* It is possible that we already have this collation sequence.
3548 In this case we move the entry. */
3549 struct element_t *seqp;
3550 void *sym;
3552 /* If the symbol after which we have to insert was not found
3553 ignore all entries. */
3554 if (collate->cursor == NULL)
3556 lr_ignore_rest (ldfile, 0);
3557 break;
3560 if (find_entry (&collate->seq_table, symstr, symlen,
3561 (void **) &seqp) == 0)
3562 goto move_entry;
3564 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3565 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3566 goto move_entry;
3568 if (find_entry (&collate->elem_table, symstr, symlen,
3569 (void **) &seqp) == 0
3570 && (seqp->last != NULL || seqp->next != NULL
3571 || (collate->start != NULL && seqp == collate->start)))
3573 move_entry:
3574 /* Remove the entry from the old position. */
3575 if (seqp->last == NULL)
3576 collate->start = seqp->next;
3577 else
3578 seqp->last->next = seqp->next;
3579 if (seqp->next != NULL)
3580 seqp->next->last = seqp->last;
3582 /* We also have to check whether this entry is the
3583 first or last of a section. */
3584 if (seqp->section->first == seqp)
3586 if (seqp->section->first == seqp->section->last)
3587 /* This section has no content anymore. */
3588 seqp->section->first = seqp->section->last = NULL;
3589 else
3590 seqp->section->first = seqp->next;
3592 else if (seqp->section->last == seqp)
3593 seqp->section->last = seqp->last;
3595 /* Now insert it in the new place. */
3596 insert_weights (ldfile, seqp, charmap, repertoire, result,
3597 tok_none);
3598 break;
3601 /* Otherwise we just add a new entry. */
3603 else if (state == 5)
3605 /* We are reordering sections. Find the named section. */
3606 struct section_list *runp = collate->sections;
3607 struct section_list *prevp = NULL;
3609 while (runp != NULL)
3611 if (runp->name != NULL
3612 && strlen (runp->name) == symlen
3613 && memcmp (runp->name, symstr, symlen) == 0)
3614 break;
3616 prevp = runp;
3617 runp = runp->next;
3620 if (runp == NULL)
3622 lr_error (ldfile, _("%s: section `%.*s' not known"),
3623 "LC_COLLATE", (int) symlen, symstr);
3624 lr_ignore_rest (ldfile, 0);
3626 else
3628 if (runp != collate->current_section)
3630 /* Remove the named section from the old place and
3631 insert it in the new one. */
3632 prevp->next = runp->next;
3634 runp->next = collate->current_section->next;
3635 collate->current_section->next = runp;
3636 collate->current_section = runp;
3639 /* Process the rest of the line which might change
3640 the collation rules. */
3641 arg = lr_token (ldfile, charmap, result, repertoire,
3642 verbose);
3643 if (arg->tok != tok_eof && arg->tok != tok_eol)
3644 read_directions (ldfile, arg, charmap, repertoire,
3645 result);
3647 break;
3649 else if (was_ellipsis != tok_none)
3651 /* Using the information in the `ellipsis_weight'
3652 element and this and the last value we have to handle
3653 the ellipsis now. */
3654 assert (state == 1);
3656 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3657 repertoire, result);
3659 /* Remember that we processed the ellipsis. */
3660 was_ellipsis = tok_none;
3662 /* And don't add the value a second time. */
3663 break;
3666 /* Now insert in the new place. */
3667 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3668 break;
3670 case tok_undefined:
3671 /* Ignore the rest of the line if we don't need the input of
3672 this line. */
3673 if (ignore_content)
3675 lr_ignore_rest (ldfile, 0);
3676 break;
3679 if (state != 1)
3680 goto err_label;
3682 if (was_ellipsis != tok_none)
3684 lr_error (ldfile,
3685 _("%s: cannot have `%s' as end of ellipsis range"),
3686 "LC_COLLATE", "UNDEFINED");
3688 unlink_element (collate);
3689 was_ellipsis = tok_none;
3692 /* See whether UNDEFINED already appeared somewhere. */
3693 if (collate->undefined.next != NULL
3694 || &collate->undefined == collate->cursor)
3696 lr_error (ldfile,
3697 _("%s: order for `%.*s' already defined at %s:%Zu"),
3698 "LC_COLLATE", 9, "UNDEFINED",
3699 collate->undefined.file,
3700 collate->undefined.line);
3701 lr_ignore_rest (ldfile, 0);
3703 else
3704 /* Parse the weights. */
3705 insert_weights (ldfile, &collate->undefined, charmap,
3706 repertoire, result, tok_none);
3707 break;
3709 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3710 case tok_ellipsis3: /* absolute ellipsis */
3711 case tok_ellipsis4: /* symbolic decimal ellipsis */
3712 /* This is the symbolic (decimal or hexadecimal) or absolute
3713 ellipsis. */
3714 if (was_ellipsis != tok_none)
3715 goto err_label;
3717 if (state != 0 && state != 1 && state != 3)
3718 goto err_label;
3720 was_ellipsis = nowtok;
3722 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3723 repertoire, result, nowtok);
3724 break;
3726 case tok_end:
3727 /* Next we assume `LC_COLLATE'. */
3728 if (!ignore_content)
3730 if (state == 0)
3731 /* We must either see a copy statement or have
3732 ordering values. */
3733 lr_error (ldfile,
3734 _("%s: empty category description not allowed"),
3735 "LC_COLLATE");
3736 else if (state == 1)
3738 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3739 "LC_COLLATE");
3741 /* Handle ellipsis at end of list. */
3742 if (was_ellipsis != tok_none)
3744 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3745 repertoire, result);
3746 was_ellipsis = tok_none;
3749 else if (state == 3)
3750 WITH_CUR_LOCALE (error (0, 0, _("\
3751 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3752 else if (state == 5)
3753 WITH_CUR_LOCALE (error (0, 0, _("\
3754 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3756 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3757 if (arg->tok == tok_eof)
3758 break;
3759 if (arg->tok == tok_eol)
3760 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3761 else if (arg->tok != tok_lc_collate)
3762 lr_error (ldfile, _("\
3763 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3764 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3765 return;
3767 default:
3768 err_label:
3769 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3772 /* Prepare for the next round. */
3773 now = lr_token (ldfile, charmap, result, NULL, verbose);
3774 nowtok = now->tok;
3777 /* When we come here we reached the end of the file. */
3778 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");