Clean up locale file alignment handling.
[glibc.git] / locale / programs / ld-collate.c
blobd88a6de56e6403d2554a184acfbedf4ad56197a8
1 /* Copyright (C) 1995-2013 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <errno.h>
23 #include <error.h>
24 #include <stdlib.h>
25 #include <wchar.h>
26 #include <stdint.h>
27 #include <sys/param.h>
29 #include "localedef.h"
30 #include "charmap.h"
31 #include "localeinfo.h"
32 #include "linereader.h"
33 #include "locfile.h"
34 #include "elem-hash.h"
36 /* Uncomment the following line in the production version. */
37 /* #define NDEBUG 1 */
38 #include <assert.h>
40 #define obstack_chunk_alloc malloc
41 #define obstack_chunk_free free
/* Append the 32-bit value DATA to the object being grown in OBSTACK.
   DATA is passed through maybe_swap_uint32 before it is stored, and the
   current size of the growing object must satisfy LOCFILE_ALIGNED_P.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));

  data = maybe_swap_uint32 (data);

  /* The int-specific obstack primitive is only usable when `int' has
     exactly the size of int32_t; otherwise copy the bytes generically.  */
  if (sizeof (int32_t) != sizeof (int))
    obstack_grow (obstack, &data, sizeof (int32_t));
  else
    obstack_int_grow (obstack, data);
}
/* Variant of obstack_int32_grow which uses obstack_int_grow_fast when
   `int' has the size of int32_t.  When the sizes differ it falls back to
   the ordinary obstack_grow.  The same alignment assertion and
   maybe_swap_uint32 conversion are applied.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));

  data = maybe_swap_uint32 (data);

  if (sizeof (int32_t) != sizeof (int))
    obstack_grow (obstack, &data, sizeof (int32_t));
  else
    obstack_int_grow_fast (obstack, data);
}
67 /* Forward declaration. */
68 struct element_t;
/* Data type for a list of sections.  Every entry is identified by its
   name and can be a member of two independent singly linked lists.  */
struct section_list
{
  /* Next entry in the known_sections list.  */
  struct section_list *def_next;
  /* Next entry in the sections list.  */
  struct section_list *next;
  /* The section's name.  */
  const char *name;
  /* First and last element belonging to this section.  */
  struct element_t *first;
  struct element_t *last;
  /* The sorting rules in effect for this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output
     file.  */
  int ruleidx;
};
89 struct element_t;
/* A counted array of element pointers; used for the weights of an
   element at one level.  */
struct element_list_t
{
  /* Number of entries in W.  */
  int cnt;

  /* The entries themselves.  */
  struct element_t **w;
};
/* Data type for a collating element.  */
struct element_t
{
  /* Symbolic name, or NULL if the element has none.  */
  const char *name;

  /* Multibyte representation and its length in bytes; MBS is NULL if
     none is known.  */
  const char *mbs;
  size_t nmbs;
  /* Wide character representation, terminated by a zero entry, and the
     number of entries before the terminator; WCS is NULL if none is
     known.  */
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* Bit mask with one bit per level; a bit is set if this element is
     used in that level.  Interesting for the singlebyte weight
     computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but this is unlikely to be needed.  */
  unsigned int used_in_level;

  /* Array of weights, one entry per level.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Position of the character in the sequence.  This information is
     used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* File and line of the definition.  */
  const char *file;
  size_t line;

  /* The section this element belongs to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Links for the multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Links for the wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
149 /* Special element value. */
150 #define ELEMENT_ELLIPSIS2 ((struct element_t *) 1)
151 #define ELEMENT_ELLIPSIS3 ((struct element_t *) 2)
152 #define ELEMENT_ELLIPSIS4 ((struct element_t *) 3)
/* Data type for a collating symbol.  */
struct symbol_t
{
  /* Name of the symbol.  */
  const char *name;

  /* The element marking the symbol's place in the order list, or NULL
     while no such element exists.  */
  struct element_t *order;

  /* File and line of the definition.  */
  const char *file;
  size_t line;
};
167 /* Sparse table of struct element_t *. */
168 #define TABLE wchead_table
169 #define ELEMENT struct element_t *
170 #define DEFAULT NULL
171 #define ITERATE
172 #define NO_ADD_LOCALE
173 #include "3level.h"
175 /* Sparse table of int32_t. */
176 #define TABLE collidx_table
177 #define ELEMENT int32_t
178 #define DEFAULT 0
179 #include "3level.h"
181 /* Sparse table of uint32_t. */
182 #define TABLE collseq_table
183 #define ELEMENT uint32_t
184 #define DEFAULT ~((uint32_t) 0)
185 #include "3level.h"
/* Singly linked list of names for the preprocessor.  The string is
   stored inline, directly behind the link pointer.  */
struct name_list
{
  struct name_list *next;
  char str[0];
};
196 /* The real definition of the struct for the LC_COLLATE locale. */
197 struct locale_collate_t
199 int col_weight_max;
200 int cur_weight_max;
202 /* List of known scripts. */
203 struct section_list *known_sections;
204 /* List of used sections. */
205 struct section_list *sections;
206 /* Current section using definition. */
207 struct section_list *current_section;
208 /* There always can be an unnamed section. */
209 struct section_list unnamed_section;
210 /* Flag whether the unnamed section has been defined. */
211 bool unnamed_section_defined;
212 /* To make handling of errors easier we have another section. */
213 struct section_list error_section;
214 /* Sometimes we are defining the values for collating symbols before
215 the first actual section. */
216 struct section_list symbol_section;
218 /* Start of the order list. */
219 struct element_t *start;
221 /* The undefined element. */
222 struct element_t undefined;
224 /* This is the cursor for `reorder_after' insertions. */
225 struct element_t *cursor;
227 /* This value is used when handling ellipsis. */
228 struct element_t ellipsis_weight;
230 /* Known collating elements. */
231 hash_table elem_table;
233 /* Known collating symbols. */
234 hash_table sym_table;
236 /* Known collation sequences. */
237 hash_table seq_table;
239 struct obstack mempool;
241 /* The LC_COLLATE category is a bit special as it is sometimes possible
242 that the definitions from more than one input file contains information.
243 Therefore we keep all relevant input in a list. */
244 struct locale_collate_t *next;
246 /* Arrays with heads of the list for each of the leading bytes in
247 the multibyte sequences. */
248 struct element_t *mbheads[256];
250 /* Arrays with heads of the list for each of the leading bytes in
251 the multibyte sequences. */
252 struct wchead_table wcheads;
254 /* The arrays with the collation sequence order. */
255 unsigned char mbseqorder[256];
256 struct collseq_table wcseqorder;
258 /* State of the preprocessor. */
259 enum
261 else_none = 0,
262 else_ignore,
263 else_seen
265 else_action;
269 /* We have a few global variables which are used for reading all
270 LC_COLLATE category descriptions in all files. */
271 static uint32_t nrules;
273 /* List of defined preprocessor symbols. */
274 static struct name_list *defined;
/* We need UTF-8 encoding of numbers.

   Store the UTF-8 encoding of VAL in BUF and return the number of bytes
   written (1 to 6).  No terminating NUL byte is added; BUF must provide
   room for the whole sequence.  Every value below 0x80, including a
   negative one, is stored unchanged as a single byte.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int nbytes;
  int idx;

  if (val < 0x80)
    {
      *buf = (char) val;
      return 1;
    }

  /* Determine the sequence length: a sequence of N >= 2 bytes can carry
     5 * N + 1 payload bits.  Six bytes is the maximum.  */
  nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t) 0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* The lead byte starts with NBYTES one bits followed by a zero bit.
     Shifting the non-negative constant 0xff00 yields exactly that
     pattern in the low byte without depending on how the compiler
     shifts negative values.  */
  *buf = (unsigned char) (0xff00 >> nbytes);

  /* Fill in the continuation bytes from the end, six bits at a time.  */
  for (idx = nbytes - 1; idx > 0; --idx)
    {
      buf[idx] = 0x80 | (val & 0x3f);
      val >>= 6;
    }

  /* Whatever is left of the value belongs into the lead byte.  */
  *buf |= val;

  return nbytes;
}
313 static struct section_list *
314 make_seclist_elem (struct locale_collate_t *collate, const char *string,
315 struct section_list *next)
317 struct section_list *newp;
319 newp = (struct section_list *) obstack_alloc (&collate->mempool,
320 sizeof (*newp));
321 newp->next = next;
322 newp->name = string;
323 newp->first = NULL;
324 newp->last = NULL;
326 return newp;
330 static struct element_t *
331 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
332 const uint32_t *wcs, const char *name, size_t namelen,
333 int is_character)
335 struct element_t *newp;
337 newp = (struct element_t *) obstack_alloc (&collate->mempool,
338 sizeof (*newp));
339 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
340 name, namelen);
341 if (mbs != NULL)
343 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
344 newp->nmbs = mbslen;
346 else
348 newp->mbs = NULL;
349 newp->nmbs = 0;
351 if (wcs != NULL)
353 size_t nwcs = wcslen ((wchar_t *) wcs);
354 uint32_t zero = 0;
355 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
356 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
357 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
358 newp->nwcs = nwcs;
360 else
362 newp->wcs = NULL;
363 newp->nwcs = 0;
365 newp->mborder = NULL;
366 newp->wcorder = 0;
367 newp->used_in_level = 0;
368 newp->is_character = is_character;
370 /* Will be assigned later. XXX */
371 newp->mbseqorder = 0;
372 newp->wcseqorder = 0;
374 /* Will be allocated later. */
375 newp->weights = NULL;
377 newp->file = NULL;
378 newp->line = 0;
380 newp->section = collate->current_section;
382 newp->last = NULL;
383 newp->next = NULL;
385 newp->mbnext = NULL;
386 newp->mblast = NULL;
388 newp->wcnext = NULL;
389 newp->wclast = NULL;
391 return newp;
395 static struct symbol_t *
396 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
398 struct symbol_t *newp;
400 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
402 newp->name = obstack_copy0 (&collate->mempool, name, len);
403 newp->order = NULL;
405 newp->file = NULL;
406 newp->line = 0;
408 return newp;
412 /* Test whether this name is already defined somewhere. */
413 static int
414 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
415 const struct charmap_t *charmap,
416 struct repertoire_t *repertoire, const char *symbol,
417 size_t symbol_len)
419 void *ignore = NULL;
421 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
423 lr_error (ldfile, _("`%.*s' already defined in charmap"),
424 (int) symbol_len, symbol);
425 return 1;
428 if (repertoire != NULL
429 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
430 == 0))
432 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
433 (int) symbol_len, symbol);
434 return 1;
437 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
439 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
440 (int) symbol_len, symbol);
441 return 1;
444 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
446 lr_error (ldfile, _("`%.*s' already defined as collating element"),
447 (int) symbol_len, symbol);
448 return 1;
451 return 0;
455 /* Read the direction specification. */
456 static void
457 read_directions (struct linereader *ldfile, struct token *arg,
458 const struct charmap_t *charmap,
459 struct repertoire_t *repertoire, struct localedef_t *result)
461 int cnt = 0;
462 int max = nrules ?: 10;
463 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
464 int warned = 0;
465 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
467 while (1)
469 int valid = 0;
471 if (arg->tok == tok_forward)
473 if (rules[cnt] & sort_backward)
475 if (! warned)
477 lr_error (ldfile, _("\
478 %s: `forward' and `backward' are mutually excluding each other"),
479 "LC_COLLATE");
480 warned = 1;
483 else if (rules[cnt] & sort_forward)
485 if (! warned)
487 lr_error (ldfile, _("\
488 %s: `%s' mentioned more than once in definition of weight %d"),
489 "LC_COLLATE", "forward", cnt + 1);
492 else
493 rules[cnt] |= sort_forward;
495 valid = 1;
497 else if (arg->tok == tok_backward)
499 if (rules[cnt] & sort_forward)
501 if (! warned)
503 lr_error (ldfile, _("\
504 %s: `forward' and `backward' are mutually excluding each other"),
505 "LC_COLLATE");
506 warned = 1;
509 else if (rules[cnt] & sort_backward)
511 if (! warned)
513 lr_error (ldfile, _("\
514 %s: `%s' mentioned more than once in definition of weight %d"),
515 "LC_COLLATE", "backward", cnt + 1);
518 else
519 rules[cnt] |= sort_backward;
521 valid = 1;
523 else if (arg->tok == tok_position)
525 if (rules[cnt] & sort_position)
527 if (! warned)
529 lr_error (ldfile, _("\
530 %s: `%s' mentioned more than once in definition of weight %d"),
531 "LC_COLLATE", "position", cnt + 1);
534 else
535 rules[cnt] |= sort_position;
537 valid = 1;
540 if (valid)
541 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
543 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
544 || arg->tok == tok_semicolon)
546 if (! valid && ! warned)
548 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
549 warned = 1;
552 /* See whether we have to increment the counter. */
553 if (arg->tok != tok_comma && rules[cnt] != 0)
555 /* Add the default `forward' if we have seen only `position'. */
556 if (rules[cnt] == sort_position)
557 rules[cnt] = sort_position | sort_forward;
559 ++cnt;
562 if (arg->tok == tok_eof || arg->tok == tok_eol)
563 /* End of line or file, so we exit the loop. */
564 break;
566 if (nrules == 0)
568 /* See whether we have enough room in the array. */
569 if (cnt == max)
571 max += 10;
572 rules = (enum coll_sort_rule *) xrealloc (rules,
574 * sizeof (*rules));
575 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
578 else
580 if (cnt == nrules)
582 /* There must not be any more rule. */
583 if (! warned)
585 lr_error (ldfile, _("\
586 %s: too many rules; first entry only had %d"),
587 "LC_COLLATE", nrules);
588 warned = 1;
591 lr_ignore_rest (ldfile, 0);
592 break;
596 else
598 if (! warned)
600 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
601 warned = 1;
605 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
608 if (nrules == 0)
610 /* Now we know how many rules we have. */
611 nrules = cnt;
612 rules = (enum coll_sort_rule *) xrealloc (rules,
613 nrules * sizeof (*rules));
615 else
617 if (cnt < nrules)
619 /* Not enough rules in this specification. */
620 if (! warned)
621 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
624 rules[cnt] = sort_forward;
625 while (++cnt < nrules);
629 collate->current_section->rules = rules;
633 static struct element_t *
634 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
635 const char *str, size_t len)
637 void *result = NULL;
639 /* Search for the entries among the collation sequences already define. */
640 if (find_entry (&collate->seq_table, str, len, &result) != 0)
642 /* Nope, not define yet. So we see whether it is a
643 collation symbol. */
644 void *ptr;
646 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
648 /* It's a collation symbol. */
649 struct symbol_t *sym = (struct symbol_t *) ptr;
650 result = sym->order;
652 if (result == NULL)
653 result = sym->order = new_element (collate, NULL, 0, NULL,
654 NULL, 0, 0);
656 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
658 /* It's also no collation element. So it is a character
659 element defined later. */
660 result = new_element (collate, NULL, 0, NULL, str, len, 1);
661 /* Insert it into the sequence table. */
662 insert_entry (&collate->seq_table, str, len, result);
666 return (struct element_t *) result;
670 static void
671 unlink_element (struct locale_collate_t *collate)
673 if (collate->cursor == collate->start)
675 assert (collate->cursor->next == NULL);
676 assert (collate->cursor->last == NULL);
677 collate->cursor = NULL;
679 else
681 if (collate->cursor->next != NULL)
682 collate->cursor->next->last = collate->cursor->last;
683 if (collate->cursor->last != NULL)
684 collate->cursor->last->next = collate->cursor->next;
685 collate->cursor = collate->cursor->last;
690 static void
691 insert_weights (struct linereader *ldfile, struct element_t *elem,
692 const struct charmap_t *charmap,
693 struct repertoire_t *repertoire, struct localedef_t *result,
694 enum token_t ellipsis)
696 int weight_cnt;
697 struct token *arg;
698 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
700 /* Initialize all the fields. */
701 elem->file = ldfile->fname;
702 elem->line = ldfile->lineno;
704 elem->last = collate->cursor;
705 elem->next = collate->cursor ? collate->cursor->next : NULL;
706 if (collate->cursor != NULL && collate->cursor->next != NULL)
707 collate->cursor->next->last = elem;
708 if (collate->cursor != NULL)
709 collate->cursor->next = elem;
710 if (collate->start == NULL)
712 assert (collate->cursor == NULL);
713 collate->start = elem;
716 elem->section = collate->current_section;
718 if (collate->current_section->first == NULL)
719 collate->current_section->first = elem;
720 if (collate->current_section->last == collate->cursor)
721 collate->current_section->last = elem;
723 collate->cursor = elem;
725 elem->weights = (struct element_list_t *)
726 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
727 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
729 weight_cnt = 0;
731 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
734 if (arg->tok == tok_eof || arg->tok == tok_eol)
735 break;
737 if (arg->tok == tok_ignore)
739 /* The weight for this level has to be ignored. We use the
740 null pointer to indicate this. */
741 elem->weights[weight_cnt].w = (struct element_t **)
742 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
743 elem->weights[weight_cnt].w[0] = NULL;
744 elem->weights[weight_cnt].cnt = 1;
746 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
748 char ucs4str[10];
749 struct element_t *val;
750 char *symstr;
751 size_t symlen;
753 if (arg->tok == tok_bsymbol)
755 symstr = arg->val.str.startmb;
756 symlen = arg->val.str.lenmb;
758 else
760 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
761 symstr = ucs4str;
762 symlen = 9;
765 val = find_element (ldfile, collate, symstr, symlen);
766 if (val == NULL)
767 break;
769 elem->weights[weight_cnt].w = (struct element_t **)
770 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
771 elem->weights[weight_cnt].w[0] = val;
772 elem->weights[weight_cnt].cnt = 1;
774 else if (arg->tok == tok_string)
776 /* Split the string up in the individual characters and put
777 the element definitions in the list. */
778 const char *cp = arg->val.str.startmb;
779 int cnt = 0;
780 struct element_t *charelem;
781 struct element_t **weights = NULL;
782 int max = 0;
784 if (*cp == '\0')
786 lr_error (ldfile, _("%s: empty weight string not allowed"),
787 "LC_COLLATE");
788 lr_ignore_rest (ldfile, 0);
789 break;
794 if (*cp == '<')
796 /* Ahh, it's a bsymbol or an UCS4 value. If it's
797 the latter we have to unify the name. */
798 const char *startp = ++cp;
799 size_t len;
801 while (*cp != '>')
803 if (*cp == ldfile->escape_char)
804 ++cp;
805 if (*cp == '\0')
806 /* It's a syntax error. */
807 goto syntax;
809 ++cp;
812 if (cp - startp == 5 && startp[0] == 'U'
813 && isxdigit (startp[1]) && isxdigit (startp[2])
814 && isxdigit (startp[3]) && isxdigit (startp[4]))
816 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
817 char *newstr;
819 newstr = (char *) xmalloc (10);
820 snprintf (newstr, 10, "U%08X", ucs4);
821 startp = newstr;
823 len = 9;
825 else
826 len = cp - startp;
828 charelem = find_element (ldfile, collate, startp, len);
829 ++cp;
831 else
833 /* People really shouldn't use characters directly in
834 the string. Especially since it's not really clear
835 what this means. We interpret all characters in the
836 string as if that would be bsymbols. Otherwise we
837 would have to match back to bsymbols somehow and this
838 is normally not what people normally expect. */
839 charelem = find_element (ldfile, collate, cp++, 1);
842 if (charelem == NULL)
844 /* We ignore the rest of the line. */
845 lr_ignore_rest (ldfile, 0);
846 break;
849 /* Add the pointer. */
850 if (cnt >= max)
852 struct element_t **newp;
853 max += 10;
854 newp = (struct element_t **)
855 alloca (max * sizeof (struct element_t *));
856 memcpy (newp, weights, cnt * sizeof (struct element_t *));
857 weights = newp;
859 weights[cnt++] = charelem;
861 while (*cp != '\0');
863 /* Now store the information. */
864 elem->weights[weight_cnt].w = (struct element_t **)
865 obstack_alloc (&collate->mempool,
866 cnt * sizeof (struct element_t *));
867 memcpy (elem->weights[weight_cnt].w, weights,
868 cnt * sizeof (struct element_t *));
869 elem->weights[weight_cnt].cnt = cnt;
871 /* We don't need the string anymore. */
872 free (arg->val.str.startmb);
874 else if (ellipsis != tok_none
875 && (arg->tok == tok_ellipsis2
876 || arg->tok == tok_ellipsis3
877 || arg->tok == tok_ellipsis4))
879 /* It must be the same ellipsis as used in the initial column. */
880 if (arg->tok != ellipsis)
881 lr_error (ldfile, _("\
882 %s: weights must use the same ellipsis symbol as the name"),
883 "LC_COLLATE");
885 /* The weight for this level will depend on the element
886 iterating over the range. Put a placeholder. */
887 elem->weights[weight_cnt].w = (struct element_t **)
888 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
889 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
890 elem->weights[weight_cnt].cnt = 1;
892 else
894 syntax:
895 /* It's a syntax error. */
896 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
897 lr_ignore_rest (ldfile, 0);
898 break;
901 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
902 /* This better should be the end of the line or a semicolon. */
903 if (arg->tok == tok_semicolon)
904 /* OK, ignore this and read the next token. */
905 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
906 else if (arg->tok != tok_eof && arg->tok != tok_eol)
908 /* It's a syntax error. */
909 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
910 lr_ignore_rest (ldfile, 0);
911 break;
914 while (++weight_cnt < nrules);
916 if (weight_cnt < nrules)
918 /* This means the rest of the line uses the current element as
919 the weight. */
922 elem->weights[weight_cnt].w = (struct element_t **)
923 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
924 if (ellipsis == tok_none)
925 elem->weights[weight_cnt].w[0] = elem;
926 else
927 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
928 elem->weights[weight_cnt].cnt = 1;
930 while (++weight_cnt < nrules);
932 else
934 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
936 /* Too many rule values. */
937 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
938 lr_ignore_rest (ldfile, 0);
940 else
941 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
946 static int
947 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
948 const struct charmap_t *charmap, struct repertoire_t *repertoire,
949 struct localedef_t *result)
951 /* First find out what kind of symbol this is. */
952 struct charseq *seq;
953 uint32_t wc;
954 struct element_t *elem = NULL;
955 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
957 /* Try to find the character in the charmap. */
958 seq = charmap_find_value (charmap, symstr, symlen);
960 /* Determine the wide character. */
961 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
963 wc = repertoire_find_value (repertoire, symstr, symlen);
964 if (seq != NULL)
965 seq->ucs4 = wc;
967 else
968 wc = seq->ucs4;
970 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
972 /* It's no character, so look through the collation elements and
973 symbol list. */
974 void *ptr = elem;
975 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
977 void *result;
978 struct symbol_t *sym = NULL;
980 /* It's also collation element. Therefore it's either a
981 collating symbol or it's a character which is not
982 supported by the character set. In the later case we
983 simply create a dummy entry. */
984 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
986 /* It's a collation symbol. */
987 sym = (struct symbol_t *) result;
989 elem = sym->order;
992 if (elem == NULL)
994 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
996 if (sym != NULL)
997 sym->order = elem;
998 else
999 /* Enter a fake element in the sequence table. This
1000 won't cause anything in the output since there is
1001 no multibyte or wide character associated with
1002 it. */
1003 insert_entry (&collate->seq_table, symstr, symlen, elem);
1006 else
1007 /* Copy the result back. */
1008 elem = ptr;
1010 else
1012 /* Otherwise the symbols stands for a character. */
1013 void *ptr = elem;
1014 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
1016 uint32_t wcs[2] = { wc, 0 };
1018 /* We have to allocate an entry. */
1019 elem = new_element (collate,
1020 seq != NULL ? (char *) seq->bytes : NULL,
1021 seq != NULL ? seq->nbytes : 0,
1022 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
1023 symstr, symlen, 1);
1025 /* And add it to the table. */
1026 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1027 /* This cannot happen. */
1028 assert (! "Internal error");
1030 else
1032 /* Copy the result back. */
1033 elem = ptr;
1035 /* Maybe the character was used before the definition. In this case
1036 we have to insert the byte sequences now. */
1037 if (elem->mbs == NULL && seq != NULL)
1039 elem->mbs = obstack_copy0 (&collate->mempool,
1040 seq->bytes, seq->nbytes);
1041 elem->nmbs = seq->nbytes;
1044 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1046 uint32_t wcs[2] = { wc, 0 };
1048 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1049 elem->nwcs = 1;
1054 /* Test whether this element is not already in the list. */
1055 if (elem->next != NULL || elem == collate->cursor)
1057 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1058 (int) symlen, symstr, elem->file, elem->line);
1059 lr_ignore_rest (ldfile, 0);
1060 return 1;
1063 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1065 return 0;
1069 static void
1070 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1071 enum token_t ellipsis, const struct charmap_t *charmap,
1072 struct repertoire_t *repertoire,
1073 struct localedef_t *result)
1075 struct element_t *startp;
1076 struct element_t *endp;
1077 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1079 /* Unlink the entry added for the ellipsis. */
1080 unlink_element (collate);
1081 startp = collate->cursor;
1083 /* Process and add the end-entry. */
1084 if (symstr != NULL
1085 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1086 /* Something went wrong with inserting the to-value. This means
1087 we cannot process the ellipsis. */
1088 return;
1090 /* Reset the cursor. */
1091 collate->cursor = startp;
1093 /* Now we have to handle many different situations:
1094 - we have to distinguish between the three different ellipsis forms
1095 - the is the ellipsis at the beginning, in the middle, or at the end.
1097 endp = collate->cursor->next;
1098 assert (symstr == NULL || endp != NULL);
1100 /* XXX The following is probably very wrong since also collating symbols
1101 can appear in ranges. But do we want/can refine the test for that? */
1102 #if 0
1103 /* Both, the start and the end symbol, must stand for characters. */
1104 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1105 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1107 lr_error (ldfile, _("\
1108 %s: the start and the end symbol of a range must stand for characters"),
1109 "LC_COLLATE");
1110 return;
1112 #endif
1114 if (ellipsis == tok_ellipsis3)
1116 /* One requirement we make here: the length of the byte
1117 sequences for the first and end character must be the same.
1118 This is mainly to prevent unwanted effects and this is often
1119 not what is wanted. */
1120 size_t len = (startp->mbs != NULL ? startp->nmbs
1121 : (endp->mbs != NULL ? endp->nmbs : 0));
1122 char mbcnt[len + 1];
1123 char mbend[len + 1];
1125 /* Well, this should be caught somewhere else already. Just to
1126 make sure. */
1127 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1128 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1130 if (startp != NULL && endp != NULL
1131 && startp->mbs != NULL && endp->mbs != NULL
1132 && startp->nmbs != endp->nmbs)
1134 lr_error (ldfile, _("\
1135 %s: byte sequences of first and last character must have the same length"),
1136 "LC_COLLATE");
1137 return;
1140 /* Determine whether we have to generate multibyte sequences. */
1141 if ((startp == NULL || startp->mbs != NULL)
1142 && (endp == NULL || endp->mbs != NULL))
1144 int cnt;
1145 int ret;
1147 /* Prepare the beginning byte sequence. This is either from the
1148 beginning byte sequence or it is all nulls if it was an
1149 initial ellipsis. */
1150 if (startp == NULL || startp->mbs == NULL)
1151 memset (mbcnt, '\0', len);
1152 else
1154 memcpy (mbcnt, startp->mbs, len);
1156 /* And increment it so that the value is the first one we will
1157 try to insert. */
1158 for (cnt = len - 1; cnt >= 0; --cnt)
1159 if (++mbcnt[cnt] != '\0')
1160 break;
1162 mbcnt[len] = '\0';
1164 /* And the end sequence. */
1165 if (endp == NULL || endp->mbs == NULL)
1166 memset (mbend, '\0', len);
1167 else
1168 memcpy (mbend, endp->mbs, len);
1169 mbend[len] = '\0';
1171 /* Test whether we have a correct range. */
1172 ret = memcmp (mbcnt, mbend, len);
1173 if (ret >= 0)
1175 if (ret > 0)
1176 lr_error (ldfile, _("%s: byte sequence of first character of \
1177 range is not lower than that of the last character"), "LC_COLLATE");
1178 return;
1181 /* Generate the byte sequences data. */
1182 while (1)
1184 struct charseq *seq;
1186 /* Quite a bit of work ahead. We have to find the character
1187 definition for the byte sequence and then determine the
1188 wide character belonging to it. */
1189 seq = charmap_find_symbol (charmap, mbcnt, len);
1190 if (seq != NULL)
1192 struct element_t *elem;
1193 size_t namelen;
1195 /* I don't think this can ever happen. */
1196 assert (seq->name != NULL);
1197 namelen = strlen (seq->name);
1199 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1200 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1201 namelen);
1203 /* Now we are ready to insert the new value in the
1204 sequence. Find out whether the element is
1205 already known. */
1206 void *ptr;
1207 if (find_entry (&collate->seq_table, seq->name, namelen,
1208 &ptr) != 0)
1210 uint32_t wcs[2] = { seq->ucs4, 0 };
1212 /* We have to allocate an entry. */
1213 elem = new_element (collate, mbcnt, len,
1214 seq->ucs4 == ILLEGAL_CHAR_VALUE
1215 ? NULL : wcs, seq->name,
1216 namelen, 1);
1218 /* And add it to the table. */
1219 if (insert_entry (&collate->seq_table, seq->name,
1220 namelen, elem) != 0)
1221 /* This cannot happen. */
1222 assert (! "Internal error");
1224 else
1225 /* Copy the result. */
1226 elem = ptr;
1228 /* Test whether this element is not already in the list. */
1229 if (elem->next != NULL || (collate->cursor != NULL
1230 && elem->next == collate->cursor))
1232 lr_error (ldfile, _("\
1233 order for `%.*s' already defined at %s:%Zu"),
1234 (int) namelen, seq->name,
1235 elem->file, elem->line);
1236 goto increment;
1239 /* Enqueue the new element. */
1240 elem->last = collate->cursor;
1241 if (collate->cursor == NULL)
1242 elem->next = NULL;
1243 else
1245 elem->next = collate->cursor->next;
1246 elem->last->next = elem;
1247 if (elem->next != NULL)
1248 elem->next->last = elem;
1250 if (collate->start == NULL)
1252 assert (collate->cursor == NULL);
1253 collate->start = elem;
1255 collate->cursor = elem;
1257 /* Add the weight value. We take them from the
1258 `ellipsis_weights' member of `collate'. */
1259 elem->weights = (struct element_list_t *)
1260 obstack_alloc (&collate->mempool,
1261 nrules * sizeof (struct element_list_t));
1262 for (cnt = 0; cnt < nrules; ++cnt)
1263 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1264 && (collate->ellipsis_weight.weights[cnt].w[0]
1265 == ELEMENT_ELLIPSIS2))
1267 elem->weights[cnt].w = (struct element_t **)
1268 obstack_alloc (&collate->mempool,
1269 sizeof (struct element_t *));
1270 elem->weights[cnt].w[0] = elem;
1271 elem->weights[cnt].cnt = 1;
1273 else
1275 /* Simply use the weight from `ellipsis_weight'. */
1276 elem->weights[cnt].w =
1277 collate->ellipsis_weight.weights[cnt].w;
1278 elem->weights[cnt].cnt =
1279 collate->ellipsis_weight.weights[cnt].cnt;
1283 /* Increment for the next round. */
1284 increment:
1285 for (cnt = len - 1; cnt >= 0; --cnt)
1286 if (++mbcnt[cnt] != '\0')
1287 break;
1289 /* Find out whether this was all. */
1290 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1291 /* Yep, that's all. */
1292 break;
1296 else
1298 /* For symbolic range we naturally must have a beginning and an
1299 end specified by the user. */
1300 if (startp == NULL)
1301 lr_error (ldfile, _("\
1302 %s: symbolic range ellipsis must not directly follow `order_start'"),
1303 "LC_COLLATE");
1304 else if (endp == NULL)
1305 lr_error (ldfile, _("\
1306 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1307 "LC_COLLATE");
1308 else
1310 /* Determine the range. To do so we have to determine the
1311 common prefix of the both names and then the numeric
1312 values of both ends. */
1313 size_t lenfrom = strlen (startp->name);
1314 size_t lento = strlen (endp->name);
1315 char buf[lento + 1];
1316 int preflen = 0;
1317 long int from;
1318 long int to;
1319 char *cp;
1320 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1322 if (lenfrom != lento)
1324 invalid_range:
1325 lr_error (ldfile, _("\
1326 `%s' and `%.*s' are not valid names for symbolic range"),
1327 startp->name, (int) lento, endp->name);
1328 return;
1331 while (startp->name[preflen] == endp->name[preflen])
1332 if (startp->name[preflen] == '\0')
1333 /* Nothing to be done. The start and end point are identical
1334 and while inserting the end point we have already given
1335 the user an error message. */
1336 return;
1337 else
1338 ++preflen;
1340 errno = 0;
1341 from = strtol (startp->name + preflen, &cp, base);
1342 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1343 goto invalid_range;
1345 errno = 0;
1346 to = strtol (endp->name + preflen, &cp, base);
1347 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1348 goto invalid_range;
1350 /* Copy the prefix. */
1351 memcpy (buf, startp->name, preflen);
1353 /* Loop over all values. */
1354 for (++from; from < to; ++from)
1356 struct element_t *elem = NULL;
1357 struct charseq *seq;
1358 uint32_t wc;
1359 int cnt;
1361 /* Generate the name. */
1362 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1363 (int) (lenfrom - preflen), from);
1365 /* Look whether this name is already defined. */
1366 void *ptr;
1367 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1369 /* Copy back the result. */
1370 elem = ptr;
1372 if (elem->next != NULL || (collate->cursor != NULL
1373 && elem->next == collate->cursor))
1375 lr_error (ldfile, _("\
1376 %s: order for `%.*s' already defined at %s:%Zu"),
1377 "LC_COLLATE", (int) lenfrom, buf,
1378 elem->file, elem->line);
1379 continue;
1382 if (elem->name == NULL)
1384 lr_error (ldfile, _("%s: `%s' must be a character"),
1385 "LC_COLLATE", buf);
1386 continue;
1390 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1392 /* Search for a character of this name. */
1393 seq = charmap_find_value (charmap, buf, lenfrom);
1394 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1396 wc = repertoire_find_value (repertoire, buf, lenfrom);
1398 if (seq != NULL)
1399 seq->ucs4 = wc;
1401 else
1402 wc = seq->ucs4;
1404 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1405 /* We don't know anything about a character with this
1406 name. XXX Should we warn? */
1407 continue;
1409 if (elem == NULL)
1411 uint32_t wcs[2] = { wc, 0 };
1413 /* We have to allocate an entry. */
1414 elem = new_element (collate,
1415 seq != NULL
1416 ? (char *) seq->bytes : NULL,
1417 seq != NULL ? seq->nbytes : 0,
1418 wc == ILLEGAL_CHAR_VALUE
1419 ? NULL : wcs, buf, lenfrom, 1);
1421 else
1423 /* Update the element. */
1424 if (seq != NULL)
1426 elem->mbs = obstack_copy0 (&collate->mempool,
1427 seq->bytes, seq->nbytes);
1428 elem->nmbs = seq->nbytes;
1431 if (wc != ILLEGAL_CHAR_VALUE)
1433 uint32_t zero = 0;
1435 obstack_grow (&collate->mempool,
1436 &wc, sizeof (uint32_t));
1437 obstack_grow (&collate->mempool,
1438 &zero, sizeof (uint32_t));
1439 elem->wcs = obstack_finish (&collate->mempool);
1440 elem->nwcs = 1;
1444 elem->file = ldfile->fname;
1445 elem->line = ldfile->lineno;
1446 elem->section = collate->current_section;
1449 /* Enqueue the new element. */
1450 elem->last = collate->cursor;
1451 elem->next = collate->cursor->next;
1452 elem->last->next = elem;
1453 if (elem->next != NULL)
1454 elem->next->last = elem;
1455 collate->cursor = elem;
1457 /* Now add the weights. They come from the `ellipsis_weights'
1458 member of `collate'. */
1459 elem->weights = (struct element_list_t *)
1460 obstack_alloc (&collate->mempool,
1461 nrules * sizeof (struct element_list_t));
1462 for (cnt = 0; cnt < nrules; ++cnt)
1463 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1464 && (collate->ellipsis_weight.weights[cnt].w[0]
1465 == ELEMENT_ELLIPSIS2))
1467 elem->weights[cnt].w = (struct element_t **)
1468 obstack_alloc (&collate->mempool,
1469 sizeof (struct element_t *));
1470 elem->weights[cnt].w[0] = elem;
1471 elem->weights[cnt].cnt = 1;
1473 else
1475 /* Simply use the weight from `ellipsis_weight'. */
1476 elem->weights[cnt].w =
1477 collate->ellipsis_weight.weights[cnt].w;
1478 elem->weights[cnt].cnt =
1479 collate->ellipsis_weight.weights[cnt].cnt;
1487 static void
1488 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1489 struct localedef_t *copy_locale, int ignore_content)
1491 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1493 struct locale_collate_t *collate;
1495 if (copy_locale == NULL)
1497 collate = locale->categories[LC_COLLATE].collate =
1498 (struct locale_collate_t *)
1499 xcalloc (1, sizeof (struct locale_collate_t));
1501 /* Init the various data structures. */
1502 init_hash (&collate->elem_table, 100);
1503 init_hash (&collate->sym_table, 100);
1504 init_hash (&collate->seq_table, 500);
1505 obstack_init (&collate->mempool);
1507 collate->col_weight_max = -1;
1509 else
1510 /* Reuse the copy_locale's data structures. */
1511 collate = locale->categories[LC_COLLATE].collate =
1512 copy_locale->categories[LC_COLLATE].collate;
1515 ldfile->translate_strings = 0;
1516 ldfile->return_widestr = 0;
1520 void
1521 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1523 /* Now is the time when we can assign the individual collation
1524 values for all the symbols. We have possibly different values
1525 for the wide- and the multibyte-character symbols. This is done
1526 since it might make a difference in the encoding if there is in
1527 some cases no multibyte-character but there are wide-characters.
1528 (The other way around it is not important since theencoded
1529 collation value in the wide-character case is 32 bits wide and
1530 therefore requires no encoding).
1532 The lowest collation value assigned is 2. Zero is reserved for
1533 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1534 functions and 1 is used to separate the individual passes for the
1535 different rules.
1537 We also have to construct is list with all the bytes/words which
1538 can come first in a sequence, followed by all the elements which
1539 also start with this byte/word. The order is reverse which has
1540 among others the important effect that longer strings are located
1541 first in the list. This is required for the output data since
1542 the algorithm used in `strcoll' etc depends on this.
1544 The multibyte case is easy. We simply sort into an array with
1545 256 elements. */
1546 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1547 int mbact[nrules];
1548 int wcact;
1549 int mbseqact;
1550 int wcseqact;
1551 struct element_t *runp;
1552 int i;
1553 int need_undefined = 0;
1554 struct section_list *sect;
1555 int ruleidx;
1556 int nr_wide_elems = 0;
1558 if (collate == NULL)
1560 /* No data, no check. */
1561 if (! be_quiet)
1562 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1563 "LC_COLLATE"));
1564 return;
1567 /* If this assertion is hit change the type in `element_t'. */
1568 assert (nrules <= sizeof (runp->used_in_level) * 8);
1570 /* Make sure that the `position' rule is used either in all sections
1571 or in none. */
1572 for (i = 0; i < nrules; ++i)
1573 for (sect = collate->sections; sect != NULL; sect = sect->next)
1574 if (sect != collate->current_section
1575 && sect->rules != NULL
1576 && ((sect->rules[i] & sort_position)
1577 != (collate->current_section->rules[i] & sort_position)))
1579 WITH_CUR_LOCALE (error (0, 0, _("\
1580 %s: `position' must be used for a specific level in all sections or none"),
1581 "LC_COLLATE"));
1582 break;
1585 /* Find out which elements are used at which level. At the same
1586 time we find out whether we have any undefined symbols. */
1587 runp = collate->start;
1588 while (runp != NULL)
1590 if (runp->mbs != NULL)
1592 for (i = 0; i < nrules; ++i)
1594 int j;
1596 for (j = 0; j < runp->weights[i].cnt; ++j)
1597 /* A NULL pointer as the weight means IGNORE. */
1598 if (runp->weights[i].w[j] != NULL)
1600 if (runp->weights[i].w[j]->weights == NULL)
1602 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1603 runp->line,
1604 _("symbol `%s' not defined"),
1605 runp->weights[i].w[j]->name));
1607 need_undefined = 1;
1608 runp->weights[i].w[j] = &collate->undefined;
1610 else
1611 /* Set the bit for the level. */
1612 runp->weights[i].w[j]->used_in_level |= 1 << i;
1617 /* Up to the next entry. */
1618 runp = runp->next;
1621 /* Walk through the list of defined sequences and assign weights. Also
1622 create the data structure which will allow generating the single byte
1623 character based tables.
1625 Since at each time only the weights for each of the rules are
1626 only compared to other weights for this rule it is possible to
1627 assign more compact weight values than simply counting all
1628 weights in sequence. We can assign weights from 3, one for each
1629 rule individually and only for those elements, which are actually
1630 used for this rule.
1632 Why is this important? It is not for the wide char table. But
1633 it is for the singlebyte output since here larger numbers have to
1634 be encoded to make it possible to emit the value as a byte
1635 string. */
1636 for (i = 0; i < nrules; ++i)
1637 mbact[i] = 2;
1638 wcact = 2;
1639 mbseqact = 0;
1640 wcseqact = 0;
1641 runp = collate->start;
1642 while (runp != NULL)
1644 /* Determine the order. */
1645 if (runp->used_in_level != 0)
1647 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1648 nrules * sizeof (int));
1650 for (i = 0; i < nrules; ++i)
1651 if ((runp->used_in_level & (1 << i)) != 0)
1652 runp->mborder[i] = mbact[i]++;
1653 else
1654 runp->mborder[i] = 0;
1657 if (runp->mbs != NULL)
1659 struct element_t **eptr;
1660 struct element_t *lastp = NULL;
1662 /* Find the point where to insert in the list. */
1663 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1664 while (*eptr != NULL)
1666 if ((*eptr)->nmbs < runp->nmbs)
1667 break;
1669 if ((*eptr)->nmbs == runp->nmbs)
1671 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1673 if (c == 0)
1675 /* This should not happen. It means that we have
1676 to symbols with the same byte sequence. It is
1677 of course an error. */
1678 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1679 (*eptr)->line,
1680 _("\
1681 symbol `%s' has the same encoding as"), (*eptr)->name);
1682 error_at_line (0, 0, runp->file,
1683 runp->line,
1684 _("symbol `%s'"),
1685 runp->name));
1686 goto dont_insert;
1688 else if (c < 0)
1689 /* Insert it here. */
1690 break;
1693 /* To the next entry. */
1694 lastp = *eptr;
1695 eptr = &(*eptr)->mbnext;
1698 /* Set the pointers. */
1699 runp->mbnext = *eptr;
1700 runp->mblast = lastp;
1701 if (*eptr != NULL)
1702 (*eptr)->mblast = runp;
1703 *eptr = runp;
1704 dont_insert:
1708 if (runp->used_in_level)
1710 runp->wcorder = wcact++;
1712 /* We take the opportunity to count the elements which have
1713 wide characters. */
1714 ++nr_wide_elems;
1717 if (runp->is_character)
1719 if (runp->nmbs == 1)
1720 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1722 runp->wcseqorder = wcseqact++;
1724 else if (runp->mbs != NULL && runp->weights != NULL)
1725 /* This is for collation elements. */
1726 runp->wcseqorder = wcseqact++;
1728 /* Up to the next entry. */
1729 runp = runp->next;
1732 /* Find out whether any of the `mbheads' entries is unset. In this
1733 case we use the UNDEFINED entry. */
1734 for (i = 1; i < 256; ++i)
1735 if (collate->mbheads[i] == NULL)
1737 need_undefined = 1;
1738 collate->mbheads[i] = &collate->undefined;
1741 /* Now to the wide character case. */
1742 collate->wcheads.p = 6;
1743 collate->wcheads.q = 10;
1744 wchead_table_init (&collate->wcheads);
1746 collate->wcseqorder.p = 6;
1747 collate->wcseqorder.q = 10;
1748 collseq_table_init (&collate->wcseqorder);
1750 /* Start adding. */
1751 runp = collate->start;
1752 while (runp != NULL)
1754 if (runp->wcs != NULL)
1756 struct element_t *e;
1757 struct element_t **eptr;
1758 struct element_t *lastp;
1760 /* Insert the collation sequence value. */
1761 if (runp->is_character)
1762 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1763 runp->wcseqorder);
1765 /* Find the point where to insert in the list. */
1766 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1767 eptr = &e;
1768 lastp = NULL;
1769 while (*eptr != NULL)
1771 if ((*eptr)->nwcs < runp->nwcs)
1772 break;
1774 if ((*eptr)->nwcs == runp->nwcs)
1776 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1777 (wchar_t *) runp->wcs, runp->nwcs);
1779 if (c == 0)
1781 /* This should not happen. It means that we have
1782 two symbols with the same byte sequence. It is
1783 of course an error. */
1784 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1785 (*eptr)->line,
1786 _("\
1787 symbol `%s' has the same encoding as"), (*eptr)->name);
1788 error_at_line (0, 0, runp->file,
1789 runp->line,
1790 _("symbol `%s'"),
1791 runp->name));
1792 goto dont_insertwc;
1794 else if (c < 0)
1795 /* Insert it here. */
1796 break;
1799 /* To the next entry. */
1800 lastp = *eptr;
1801 eptr = &(*eptr)->wcnext;
1804 /* Set the pointers. */
1805 runp->wcnext = *eptr;
1806 runp->wclast = lastp;
1807 if (*eptr != NULL)
1808 (*eptr)->wclast = runp;
1809 *eptr = runp;
1810 if (eptr == &e)
1811 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1812 dont_insertwc:
1816 /* Up to the next entry. */
1817 runp = runp->next;
1820 /* Now determine whether the UNDEFINED entry is needed and if yes,
1821 whether it was defined. */
1822 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1823 if (collate->undefined.file == NULL)
1825 if (need_undefined)
1827 /* This seems not to be enforced by recent standards. Don't
1828 emit an error, simply append UNDEFINED at the end. */
1829 if (0)
1830 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1832 /* Add UNDEFINED at the end. */
1833 collate->undefined.mborder =
1834 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1836 for (i = 0; i < nrules; ++i)
1837 collate->undefined.mborder[i] = mbact[i]++;
1840 /* In any case we will need the definition for the wide character
1841 case. But we will not complain that it is missing since the
1842 specification strangely enough does not seem to account for
1843 this. */
1844 collate->undefined.wcorder = wcact++;
1847 /* Finally, try to unify the rules for the sections. Whenever the rules
1848 for a section are the same as those for another section give the
1849 ruleset the same index. Since there are never many section we can
1850 use an O(n^2) algorithm here. */
1851 sect = collate->sections;
1852 while (sect != NULL && sect->rules == NULL)
1853 sect = sect->next;
1855 /* Bail out if we have no sections because of earlier errors. */
1856 if (sect == NULL)
1858 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1859 _("too many errors; giving up")));
1860 return;
1863 ruleidx = 0;
1866 struct section_list *osect = collate->sections;
1868 while (osect != sect)
1869 if (osect->rules != NULL
1870 && memcmp (osect->rules, sect->rules,
1871 nrules * sizeof (osect->rules[0])) == 0)
1872 break;
1873 else
1874 osect = osect->next;
1876 if (osect == sect)
1877 sect->ruleidx = ruleidx++;
1878 else
1879 sect->ruleidx = osect->ruleidx;
1881 /* Next section. */
1883 sect = sect->next;
1884 while (sect != NULL && sect->rules == NULL);
1886 while (sect != NULL);
1887 /* We are currently not prepared for more than 128 rulesets. But this
1888 should never really be a problem. */
1889 assert (ruleidx <= 128);
1893 static int32_t
1894 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1895 struct element_t *elem)
1897 size_t cnt;
1898 int32_t retval;
1900 /* Optimize the use of UNDEFINED. */
1901 if (elem == &collate->undefined)
1902 /* The weights are already inserted. */
1903 return 0;
1905 /* This byte can start exactly one collation element and this is
1906 a single byte. We can directly give the index to the weights. */
1907 retval = obstack_object_size (pool);
1909 /* Construct the weight. */
1910 for (cnt = 0; cnt < nrules; ++cnt)
1912 char buf[elem->weights[cnt].cnt * 7];
1913 int len = 0;
1914 int i;
1916 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1917 /* Encode the weight value. We do nothing for IGNORE entries. */
1918 if (elem->weights[cnt].w[i] != NULL)
1919 len += utf8_encode (&buf[len],
1920 elem->weights[cnt].w[i]->mborder[cnt]);
1922 /* And add the buffer content. */
1923 obstack_1grow (pool, len);
1924 obstack_grow (pool, buf, len);
1927 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1931 static int32_t
1932 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1933 struct element_t *elem)
1935 size_t cnt;
1936 int32_t retval;
1938 /* Optimize the use of UNDEFINED. */
1939 if (elem == &collate->undefined)
1940 /* The weights are already inserted. */
1941 return 0;
1943 /* This byte can start exactly one collation element and this is
1944 a single byte. We can directly give the index to the weights. */
1945 retval = obstack_object_size (pool) / sizeof (int32_t);
1947 /* Construct the weight. */
1948 for (cnt = 0; cnt < nrules; ++cnt)
1950 int32_t buf[elem->weights[cnt].cnt];
1951 int i;
1952 int32_t j;
1954 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1955 if (elem->weights[cnt].w[i] != NULL)
1956 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1958 /* And add the buffer content. */
1959 obstack_int32_grow (pool, j);
1961 obstack_grow (pool, buf, j * sizeof (int32_t));
1962 maybe_swap_uint32_obstack (pool, j);
1965 return retval | ((elem->section->ruleidx & 0x7f) << 24);
/* State shared with `add_to_tablewc', which is called with only the
   character and the element as arguments and therefore gets everything
   else from here.  If localedef is ever threaded, this would need to be
   a __thread variable.  */
static struct
{
  /* Pool receiving the weights.  */
  struct obstack *weightpool;
  /* Pool receiving the entries for characters which start more than
     one sequence.  */
  struct obstack *extrapool;
  /* Pool receiving the weight indices of runs of consecutive
     sequences.  */
  struct obstack *indpool;
  /* The collation data currently being written.  */
  struct locale_collate_t *collate;
  /* Table mapping a wide character to its index.  */
  struct collidx_table *tablewc;
} atwc;

static void add_to_tablewc (uint32_t ch, struct element_t *runp);
1980 static void
1981 add_to_tablewc (uint32_t ch, struct element_t *runp)
1983 if (runp->wcnext == NULL && runp->nwcs == 1)
1985 int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1986 runp);
1987 collidx_table_add (atwc.tablewc, ch, weigthidx);
1989 else
1991 /* As for the singlebyte table, we recognize sequences and
1992 compress them. */
1994 collidx_table_add (atwc.tablewc, ch,
1995 -(obstack_object_size (atwc.extrapool)
1996 / sizeof (uint32_t)));
2000 /* Store the current index in the weight table. We know that
2001 the current position in the `extrapool' is aligned on a
2002 32-bit address. */
2003 int32_t weightidx;
2004 int added;
2006 /* Find out wether this is a single entry or we have more than
2007 one consecutive entry. */
2008 if (runp->wcnext != NULL
2009 && runp->nwcs == runp->wcnext->nwcs
2010 && wmemcmp ((wchar_t *) runp->wcs,
2011 (wchar_t *)runp->wcnext->wcs,
2012 runp->nwcs - 1) == 0
2013 && (runp->wcs[runp->nwcs - 1]
2014 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2016 int i;
2017 struct element_t *series_startp = runp;
2018 struct element_t *curp;
2020 /* Now add first the initial byte sequence. */
2021 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2022 if (sizeof (int32_t) == sizeof (int))
2023 obstack_make_room (atwc.extrapool, added);
2025 /* More than one consecutive entry. We mark this by having
2026 a negative index into the indirect table. */
2027 obstack_int32_grow_fast (atwc.extrapool,
2028 -(obstack_object_size (atwc.indpool)
2029 / sizeof (int32_t)));
2030 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2033 runp = runp->wcnext;
2034 while (runp->wcnext != NULL
2035 && runp->nwcs == runp->wcnext->nwcs
2036 && wmemcmp ((wchar_t *) runp->wcs,
2037 (wchar_t *)runp->wcnext->wcs,
2038 runp->nwcs - 1) == 0
2039 && (runp->wcs[runp->nwcs - 1]
2040 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2042 /* Now walk backward from here to the beginning. */
2043 curp = runp;
2045 for (i = 1; i < runp->nwcs; ++i)
2046 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2048 /* Now find the end of the consecutive sequence and
2049 add all the indeces in the indirect pool. */
2052 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2053 curp);
2054 obstack_int32_grow (atwc.indpool, weightidx);
2056 curp = curp->wclast;
2058 while (curp != series_startp);
2060 /* Add the final weight. */
2061 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2062 curp);
2063 obstack_int32_grow (atwc.indpool, weightidx);
2065 /* And add the end byte sequence. Without length this
2066 time. */
2067 for (i = 1; i < curp->nwcs; ++i)
2068 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2070 else
2072 /* A single entry. Simply add the index and the length and
2073 string (except for the first character which is already
2074 tested for). */
2075 int i;
2077 /* Output the weight info. */
2078 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2079 runp);
2081 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2082 if (sizeof (int) == sizeof (int32_t))
2083 obstack_make_room (atwc.extrapool, added);
2085 obstack_int32_grow_fast (atwc.extrapool, weightidx);
2086 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2087 for (i = 1; i < runp->nwcs; ++i)
2088 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2091 /* Next entry. */
2092 runp = runp->wcnext;
2094 while (runp != NULL);
2098 void
2099 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2100 const char *output_path)
2102 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2103 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2104 struct locale_file file;
2105 size_t ch;
2106 int32_t tablemb[256];
2107 struct obstack weightpool;
2108 struct obstack extrapool;
2109 struct obstack indirectpool;
2110 struct section_list *sect;
2111 struct collidx_table tablewc;
2112 uint32_t elem_size;
2113 uint32_t *elem_table;
2114 int i;
2115 struct element_t *runp;
2117 init_locale_data (&file, nelems);
2118 add_locale_uint32 (&file, nrules);
2120 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
2121 if (collate == NULL)
2123 size_t idx;
2124 for (idx = 1; idx < nelems; idx++)
2126 /* The words have to be handled specially. */
2127 if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2128 add_locale_uint32 (&file, 0);
2129 else
2130 add_locale_empty (&file);
2132 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2133 return;
2136 obstack_init (&weightpool);
2137 obstack_init (&extrapool);
2138 obstack_init (&indirectpool);
2140 /* Since we are using the sign of an integer to mark indirection the
2141 offsets in the arrays we are indirectly referring to must not be
2142 zero since -0 == 0. Therefore we add a bit of dummy content. */
2143 obstack_int32_grow (&extrapool, 0);
2144 obstack_int32_grow (&indirectpool, 0);
2146 /* Prepare the ruleset table. */
2147 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2148 if (sect->rules != NULL && sect->ruleidx == i)
2150 int j;
2152 obstack_make_room (&weightpool, nrules);
2154 for (j = 0; j < nrules; ++j)
2155 obstack_1grow_fast (&weightpool, sect->rules[j]);
2156 ++i;
2158 /* And align the output. */
2159 i = (nrules * i) % LOCFILE_ALIGN;
2160 if (i > 0)
2162 obstack_1grow (&weightpool, '\0');
2163 while (++i < LOCFILE_ALIGN);
2165 add_locale_raw_obstack (&file, &weightpool);
2167 /* Generate the 8-bit table. Walk through the lists of sequences
2168 starting with the same byte and add them one after the other to
2169 the table. In case we have more than one sequence starting with
2170 the same byte we have to use extra indirection.
2172 First add a record for the NUL byte. This entry will never be used
2173 so it does not matter. */
2174 tablemb[0] = 0;
2176 /* Now insert the `UNDEFINED' value if it is used. Since this value
2177 will probably be used more than once it is good to store the
2178 weights only once. */
2179 if (collate->undefined.used_in_level != 0)
2180 output_weight (&weightpool, collate, &collate->undefined);
2182 for (ch = 1; ch < 256; ++ch)
2183 if (collate->mbheads[ch]->mbnext == NULL
2184 && collate->mbheads[ch]->nmbs <= 1)
2186 tablemb[ch] = output_weight (&weightpool, collate,
2187 collate->mbheads[ch]);
2189 else
2191 /* The entries in the list are sorted by length and then
2192 alphabetically. This is the order in which we will add the
2193 elements to the collation table. This allows simply walking
2194 the table in sequence and stopping at the first matching
2195 entry. Since the longer sequences are coming first in the
2196 list they have the possibility to match first, just as it
2197 has to be. In the worst case we are walking to the end of
2198 the list where we put, if no singlebyte sequence is defined
2199 in the locale definition, the weights for UNDEFINED.
2201 To reduce the length of the search list we compress them a bit.
2202 This happens by collecting sequences of consecutive byte
2203 sequences in one entry (having a begin and an end byte sequence)
2204 and add only one index into the weight table. We can find the
2205 consecutive entries since they are also consecutive in the list. */
2206 struct element_t *runp = collate->mbheads[ch];
2207 struct element_t *lastp;
2209 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2211 tablemb[ch] = -obstack_object_size (&extrapool);
2215 /* Store the current index in the weight table. We know that
2216 the current position in the `extrapool' is aligned on a
2217 32-bit address. */
2218 int32_t weightidx;
2219 int added;
2221 /* Find out whether this is a single entry or we have more than
2222 one consecutive entry. */
2223 if (runp->mbnext != NULL
2224 && runp->nmbs == runp->mbnext->nmbs
2225 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2226 && (runp->mbs[runp->nmbs - 1]
2227 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2229 int i;
2230 struct element_t *series_startp = runp;
2231 struct element_t *curp;
2233 /* Compute how much space we will need. */
2234 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2235 + 2 * (runp->nmbs - 1));
2236 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2237 obstack_make_room (&extrapool, added);
2239 /* More than one consecutive entry. We mark this by having
2240 a negative index into the indirect table. */
2241 obstack_int32_grow_fast (&extrapool,
2242 -(obstack_object_size (&indirectpool)
2243 / sizeof (int32_t)));
2245 /* Now search first the end of the series. */
2247 runp = runp->mbnext;
2248 while (runp->mbnext != NULL
2249 && runp->nmbs == runp->mbnext->nmbs
2250 && memcmp (runp->mbs, runp->mbnext->mbs,
2251 runp->nmbs - 1) == 0
2252 && (runp->mbs[runp->nmbs - 1]
2253 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2255 /* Now walk backward from here to the beginning. */
2256 curp = runp;
2258 assert (runp->nmbs <= 256);
2259 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2260 for (i = 1; i < curp->nmbs; ++i)
2261 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2263 /* Now find the end of the consecutive sequence and
2264 add all the indices in the indirect pool. */
2267 weightidx = output_weight (&weightpool, collate, curp);
2268 obstack_int32_grow (&indirectpool, weightidx);
2270 curp = curp->mblast;
2272 while (curp != series_startp);
2274 /* Add the final weight. */
2275 weightidx = output_weight (&weightpool, collate, curp);
2276 obstack_int32_grow (&indirectpool, weightidx);
2278 /* And add the end byte sequence. Without length this
2279 time. */
2280 for (i = 1; i < curp->nmbs; ++i)
2281 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2283 else
2285 /* A single entry. Simply add the index and the length and
2286 string (except for the first character which is already
2287 tested for). */
2288 int i;
2290 /* Output the weight info. */
2291 weightidx = output_weight (&weightpool, collate, runp);
2293 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2294 + runp->nmbs - 1);
2295 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2296 obstack_make_room (&extrapool, added);
2298 obstack_int32_grow_fast (&extrapool, weightidx);
2299 assert (runp->nmbs <= 256);
2300 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2302 for (i = 1; i < runp->nmbs; ++i)
2303 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2306 /* Add alignment bytes if necessary. */
2307 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2308 obstack_1grow_fast (&extrapool, '\0');
2310 /* Next entry. */
2311 lastp = runp;
2312 runp = runp->mbnext;
2314 while (runp != NULL);
2316 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2318 /* If the final entry in the list is not a single character we
2319 add an UNDEFINED entry here. */
2320 if (lastp->nmbs != 1)
2322 int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
2323 obstack_make_room (&extrapool, added);
2325 obstack_int32_grow_fast (&extrapool, 0);
2326 /* XXX What rule? We just pick the first. */
2327 obstack_1grow_fast (&extrapool, 0);
2328 /* Length is zero. */
2329 obstack_1grow_fast (&extrapool, 0);
2331 /* Add alignment bytes if necessary. */
2332 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2333 obstack_1grow_fast (&extrapool, '\0');
2337 /* Add padding to the tables if necessary. */
2338 while (!LOCFILE_ALIGNED_P (obstack_object_size (&weightpool)))
2339 obstack_1grow (&weightpool, 0);
2341 /* Now add the four tables. */
2342 add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256);
2343 add_locale_raw_obstack (&file, &weightpool);
2344 add_locale_raw_obstack (&file, &extrapool);
2345 add_locale_raw_obstack (&file, &indirectpool);
2347 /* Now the same for the wide character table. We need to store some
2348 more information here. */
2349 add_locale_empty (&file);
2350 add_locale_empty (&file);
2351 add_locale_empty (&file);
2353 /* Since we are using the sign of an integer to mark indirection the
2354 offsets in the arrays we are indirectly referring to must not be
2355 zero since -0 == 0. Therefore we add a bit of dummy content. */
2356 obstack_int32_grow (&extrapool, 0);
2357 obstack_int32_grow (&indirectpool, 0);
2359 /* Now insert the `UNDEFINED' value if it is used. Since this value
2360 will probably be used more than once it is good to store the
2361 weights only once. */
2362 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2363 abort ();
2365 /* Generate the table. Walk through the lists of sequences starting
2366 with the same wide character and add them one after the other to
2367 the table. In case we have more than one sequence starting with
2368 the same byte we have to use extra indirection. */
2369 tablewc.p = 6;
2370 tablewc.q = 10;
2371 collidx_table_init (&tablewc);
2373 atwc.weightpool = &weightpool;
2374 atwc.extrapool = &extrapool;
2375 atwc.indpool = &indirectpool;
2376 atwc.collate = collate;
2377 atwc.tablewc = &tablewc;
2379 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2381 memset (&atwc, 0, sizeof (atwc));
2383 /* Now add the four tables. */
2384 add_locale_collidx_table (&file, &tablewc);
2385 add_locale_raw_obstack (&file, &weightpool);
2386 add_locale_raw_obstack (&file, &extrapool);
2387 add_locale_raw_obstack (&file, &indirectpool);
2389 /* Finally write the table with collation element names out. It is
2390 a hash table with a simple function which gets the name of the
2391 character as the input. One character might have many names. The
2392 value associated with the name is an index into the weight table
2393 where we are then interested in the first-level weight value.
2395 To determine how large the table should be we count the
2396 elements we have to put in. Since we are using internal chaining
2397 using a secondary hash function we have to make the table a bit
2398 larger to avoid extremely long search times. We can achieve
2399 good results with a 40% larger table than there are entries. */
2400 elem_size = 0;
2401 runp = collate->start;
2402 while (runp != NULL)
2404 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2405 /* Yep, the element really counts. */
2406 ++elem_size;
2408 runp = runp->next;
2410 /* Add 40% and find the next prime number. */
2411 elem_size = next_prime (elem_size * 1.4);
2413 /* Allocate the table. Each entry consists of two words: the hash
2414 value and an index in a secondary table which provides the index
2415 into the weight table and the string itself (so that a match can
2416 be determined). */
2417 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2418 elem_size * 2 * sizeof (uint32_t));
2419 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2421 /* Now add the elements. */
2422 runp = collate->start;
2423 while (runp != NULL)
2425 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2427 /* Compute the hash value of the name. */
2428 uint32_t namelen = strlen (runp->name);
2429 uint32_t hash = elem_hash (runp->name, namelen);
2430 size_t idx = hash % elem_size;
2431 #ifndef NDEBUG
2432 size_t start_idx = idx;
2433 #endif
2435 if (elem_table[idx * 2] != 0)
2437 /* The spot is already taken. Try iterating using the value
2438 from the secondary hashing function. */
2439 size_t iter = hash % (elem_size - 2) + 1;
2443 idx += iter;
2444 if (idx >= elem_size)
2445 idx -= elem_size;
2446 assert (idx != start_idx);
2448 while (elem_table[idx * 2] != 0);
2450 /* This is the spot where we will insert the value. */
2451 elem_table[idx * 2] = hash;
2452 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2454 /* The string itself including length. */
2455 obstack_1grow (&extrapool, namelen);
2456 obstack_grow (&extrapool, runp->name, namelen);
2458 /* And the multibyte representation. */
2459 obstack_1grow (&extrapool, runp->nmbs);
2460 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2462 /* And align again to 32 bits. */
2463 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2464 obstack_grow (&extrapool, "\0\0",
2465 (sizeof (int32_t)
2466 - ((1 + namelen + 1 + runp->nmbs)
2467 % sizeof (int32_t))));
2469 /* Now some 32-bit values: multibyte collation sequence,
2470 wide char string (including length), and wide char
2471 collation sequence. */
2472 obstack_int32_grow (&extrapool, runp->mbseqorder);
2474 obstack_int32_grow (&extrapool, runp->nwcs);
2475 obstack_grow (&extrapool, runp->wcs,
2476 runp->nwcs * sizeof (uint32_t));
2477 maybe_swap_uint32_obstack (&extrapool, runp->nwcs);
2479 obstack_int32_grow (&extrapool, runp->wcseqorder);
2482 runp = runp->next;
2485 /* Prepare to write out this data. */
2486 add_locale_uint32 (&file, elem_size);
2487 add_locale_uint32_array (&file, elem_table, 2 * elem_size);
2488 add_locale_raw_obstack (&file, &extrapool);
2489 add_locale_raw_data (&file, collate->mbseqorder, 256);
2490 add_locale_collseq_table (&file, &collate->wcseqorder);
2491 add_locale_string (&file, charmap->code_set_name);
2492 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2494 obstack_free (&weightpool, NULL);
2495 obstack_free (&extrapool, NULL);
2496 obstack_free (&indirectpool, NULL);
2500 static enum token_t
2501 skip_to (struct linereader *ldfile, struct locale_collate_t *collate,
2502 const struct charmap_t *charmap, int to_endif)
2504 while (1)
2506 struct token *now = lr_token (ldfile, charmap, NULL, NULL, 0);
2507 enum token_t nowtok = now->tok;
2509 if (nowtok == tok_eof || nowtok == tok_end)
2510 return nowtok;
2512 if (nowtok == tok_ifdef || nowtok == tok_ifndef)
2514 lr_error (ldfile, _("%s: nested conditionals not supported"),
2515 "LC_COLLATE");
2516 nowtok = skip_to (ldfile, collate, charmap, tok_endif);
2517 if (nowtok == tok_eof || nowtok == tok_end)
2518 return nowtok;
2520 else if (nowtok == tok_endif || (!to_endif && nowtok == tok_else))
2522 lr_ignore_rest (ldfile, 1);
2523 return nowtok;
2525 else if (!to_endif && (nowtok == tok_elifdef || nowtok == tok_elifndef))
2527 /* Do not read the rest of the line. */
2528 return nowtok;
2530 else if (nowtok == tok_else)
2532 lr_error (ldfile, _("%s: more than one 'else'"), "LC_COLLATE");
2535 lr_ignore_rest (ldfile, 0);
2540 void
2541 collate_read (struct linereader *ldfile, struct localedef_t *result,
2542 const struct charmap_t *charmap, const char *repertoire_name,
2543 int ignore_content)
2545 struct repertoire_t *repertoire = NULL;
2546 struct locale_collate_t *collate;
2547 struct token *now;
2548 struct token *arg = NULL;
2549 enum token_t nowtok;
2550 enum token_t was_ellipsis = tok_none;
2551 struct localedef_t *copy_locale = NULL;
2552 /* Parsing state:
2553 0 - start
2554 1 - between `order-start' and `order-end'
2555 2 - after `order-end'
2556 3 - after `reorder-after', waiting for `reorder-end'
2557 4 - after `reorder-end'
2558 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2559 6 - after `reorder-sections-end'
2561 int state = 0;
2563 /* Get the repertoire we have to use. */
2564 if (repertoire_name != NULL)
2565 repertoire = repertoire_read (repertoire_name);
2567 /* The rest of the line containing `LC_COLLATE' must be free. */
2568 lr_ignore_rest (ldfile, 1);
2570 while (1)
2574 now = lr_token (ldfile, charmap, result, NULL, verbose);
2575 nowtok = now->tok;
2577 while (nowtok == tok_eol);
2579 if (nowtok != tok_define)
2580 break;
2582 if (ignore_content)
2583 lr_ignore_rest (ldfile, 0);
2584 else
2586 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2587 if (arg->tok != tok_ident)
2588 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2589 else
2591 /* Simply add the new symbol. */
2592 struct name_list *newsym = xmalloc (sizeof (*newsym)
2593 + arg->val.str.lenmb + 1);
2594 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
2595 newsym->str[arg->val.str.lenmb] = '\0';
2596 newsym->next = defined;
2597 defined = newsym;
2599 lr_ignore_rest (ldfile, 1);
2604 if (nowtok == tok_copy)
2606 now = lr_token (ldfile, charmap, result, NULL, verbose);
2607 if (now->tok != tok_string)
2609 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2611 skip_category:
2613 now = lr_token (ldfile, charmap, result, NULL, verbose);
2614 while (now->tok != tok_eof && now->tok != tok_end);
2616 if (now->tok != tok_eof
2617 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2618 now->tok == tok_eof))
2619 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2620 else if (now->tok != tok_lc_collate)
2622 lr_error (ldfile, _("\
2623 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2624 lr_ignore_rest (ldfile, 0);
2626 else
2627 lr_ignore_rest (ldfile, 1);
2629 return;
2632 if (! ignore_content)
2634 /* Get the locale definition. */
2635 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2636 repertoire_name, charmap, NULL);
2637 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2639 /* Not yet loaded. So do it now. */
2640 if (locfile_read (copy_locale, charmap) != 0)
2641 goto skip_category;
2644 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2645 return;
2648 lr_ignore_rest (ldfile, 1);
2650 now = lr_token (ldfile, charmap, result, NULL, verbose);
2651 nowtok = now->tok;
2654 /* Prepare the data structures. */
2655 collate_startup (ldfile, result, copy_locale, ignore_content);
2656 collate = result->categories[LC_COLLATE].collate;
2658 while (1)
2660 char ucs4buf[10];
2661 char *symstr;
2662 size_t symlen;
2664 /* Of course we don't proceed beyond the end of file. */
2665 if (nowtok == tok_eof)
2666 break;
2668 /* Ignore empty lines. */
2669 if (nowtok == tok_eol)
2671 now = lr_token (ldfile, charmap, result, NULL, verbose);
2672 nowtok = now->tok;
2673 continue;
2676 switch (nowtok)
2678 case tok_copy:
2679 /* Allow copying other locales. */
2680 now = lr_token (ldfile, charmap, result, NULL, verbose);
2681 if (now->tok != tok_string)
2682 goto err_label;
2684 if (! ignore_content)
2685 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2686 charmap, result);
2688 lr_ignore_rest (ldfile, 1);
2689 break;
2691 case tok_coll_weight_max:
2692 /* Ignore the rest of the line if we don't need the input of
2693 this line. */
2694 if (ignore_content)
2696 lr_ignore_rest (ldfile, 0);
2697 break;
2700 if (state != 0)
2701 goto err_label;
2703 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2704 if (arg->tok != tok_number)
2705 goto err_label;
2706 if (collate->col_weight_max != -1)
2707 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2708 "LC_COLLATE", "col_weight_max");
2709 else
2710 collate->col_weight_max = arg->val.num;
2711 lr_ignore_rest (ldfile, 1);
2712 break;
2714 case tok_section_symbol:
2715 /* Ignore the rest of the line if we don't need the input of
2716 this line. */
2717 if (ignore_content)
2719 lr_ignore_rest (ldfile, 0);
2720 break;
2723 if (state != 0)
2724 goto err_label;
2726 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2727 if (arg->tok != tok_bsymbol)
2728 goto err_label;
2729 else if (!ignore_content)
2731 /* Check whether this section is already known. */
2732 struct section_list *known = collate->sections;
2733 while (known != NULL)
2735 if (strcmp (known->name, arg->val.str.startmb) == 0)
2736 break;
2737 known = known->next;
2740 if (known != NULL)
2742 lr_error (ldfile,
2743 _("%s: duplicate declaration of section `%s'"),
2744 "LC_COLLATE", arg->val.str.startmb);
2745 free (arg->val.str.startmb);
2747 else
2748 collate->sections = make_seclist_elem (collate,
2749 arg->val.str.startmb,
2750 collate->sections);
2752 lr_ignore_rest (ldfile, known == NULL);
2754 else
2756 free (arg->val.str.startmb);
2757 lr_ignore_rest (ldfile, 0);
2759 break;
2761 case tok_collating_element:
2762 /* Ignore the rest of the line if we don't need the input of
2763 this line. */
2764 if (ignore_content)
2766 lr_ignore_rest (ldfile, 0);
2767 break;
2770 if (state != 0 && state != 2)
2771 goto err_label;
2773 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2774 if (arg->tok != tok_bsymbol)
2775 goto err_label;
2776 else
2778 const char *symbol = arg->val.str.startmb;
2779 size_t symbol_len = arg->val.str.lenmb;
2781 /* Next the `from' keyword. */
2782 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2783 if (arg->tok != tok_from)
2785 free ((char *) symbol);
2786 goto err_label;
2789 ldfile->return_widestr = 1;
2790 ldfile->translate_strings = 1;
2792 /* Finally the string with the replacement. */
2793 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2795 ldfile->return_widestr = 0;
2796 ldfile->translate_strings = 0;
2798 if (arg->tok != tok_string)
2799 goto err_label;
2801 if (!ignore_content && symbol != NULL)
2803 /* The name is already defined. */
2804 if (check_duplicate (ldfile, collate, charmap,
2805 repertoire, symbol, symbol_len))
2806 goto col_elem_free;
2808 if (arg->val.str.startmb != NULL)
2809 insert_entry (&collate->elem_table, symbol, symbol_len,
2810 new_element (collate,
2811 arg->val.str.startmb,
2812 arg->val.str.lenmb - 1,
2813 arg->val.str.startwc,
2814 symbol, symbol_len, 0));
2816 else
2818 col_elem_free:
2819 free ((char *) symbol);
2820 free (arg->val.str.startmb);
2821 free (arg->val.str.startwc);
2823 lr_ignore_rest (ldfile, 1);
2825 break;
2827 case tok_collating_symbol:
2828 /* Ignore the rest of the line if we don't need the input of
2829 this line. */
2830 if (ignore_content)
2832 lr_ignore_rest (ldfile, 0);
2833 break;
2836 if (state != 0 && state != 2)
2837 goto err_label;
2839 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2840 if (arg->tok != tok_bsymbol)
2841 goto err_label;
2842 else
2844 char *symbol = arg->val.str.startmb;
2845 size_t symbol_len = arg->val.str.lenmb;
2846 char *endsymbol = NULL;
2847 size_t endsymbol_len = 0;
2848 enum token_t ellipsis = tok_none;
2850 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2851 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2853 ellipsis = arg->tok;
2855 arg = lr_token (ldfile, charmap, result, repertoire,
2856 verbose);
2857 if (arg->tok != tok_bsymbol)
2859 free (symbol);
2860 goto err_label;
2863 endsymbol = arg->val.str.startmb;
2864 endsymbol_len = arg->val.str.lenmb;
2866 lr_ignore_rest (ldfile, 1);
2868 else if (arg->tok != tok_eol)
2870 free (symbol);
2871 goto err_label;
2874 if (!ignore_content)
2876 if (symbol == NULL
2877 || (ellipsis != tok_none && endsymbol == NULL))
2879 lr_error (ldfile, _("\
2880 %s: unknown character in collating symbol name"),
2881 "LC_COLLATE");
2882 goto col_sym_free;
2884 else if (ellipsis == tok_none)
2886 /* A single symbol, no ellipsis. */
2887 if (check_duplicate (ldfile, collate, charmap,
2888 repertoire, symbol, symbol_len))
2889 /* The name is already defined. */
2890 goto col_sym_free;
2892 insert_entry (&collate->sym_table, symbol, symbol_len,
2893 new_symbol (collate, symbol, symbol_len));
2895 else if (symbol_len != endsymbol_len)
2897 col_sym_inv_range:
2898 lr_error (ldfile,
2899 _("invalid names for character range"));
2900 goto col_sym_free;
2902 else
2904 /* Oh my, we have to handle an ellipsis. First, as
2905 usual, determine the common prefix and then
2906 convert the rest into a range. */
2907 size_t prefixlen;
2908 unsigned long int from;
2909 unsigned long int to;
2910 char *endp;
2912 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2913 if (symbol[prefixlen] != endsymbol[prefixlen])
2914 break;
2916 /* Convert the rest into numbers. */
2917 symbol[symbol_len] = '\0';
2918 from = strtoul (&symbol[prefixlen], &endp,
2919 ellipsis == tok_ellipsis2 ? 16 : 10);
2920 if (*endp != '\0')
2921 goto col_sym_inv_range;
2923 endsymbol[symbol_len] = '\0';
2924 to = strtoul (&endsymbol[prefixlen], &endp,
2925 ellipsis == tok_ellipsis2 ? 16 : 10);
2926 if (*endp != '\0')
2927 goto col_sym_inv_range;
2929 if (from > to)
2930 goto col_sym_inv_range;
2932 /* Now loop over all entries. */
2933 while (from <= to)
2935 char *symbuf;
2937 symbuf = (char *) obstack_alloc (&collate->mempool,
2938 symbol_len + 1);
2940 /* Create the name. */
2941 sprintf (symbuf,
2942 ellipsis == tok_ellipsis2
2943 ? "%.*s%.*lX" : "%.*s%.*lu",
2944 (int) prefixlen, symbol,
2945 (int) (symbol_len - prefixlen), from);
2947 if (check_duplicate (ldfile, collate, charmap,
2948 repertoire, symbuf, symbol_len))
2949 /* The name is already defined. */
2950 goto col_sym_free;
2952 insert_entry (&collate->sym_table, symbuf,
2953 symbol_len,
2954 new_symbol (collate, symbuf,
2955 symbol_len));
2957 /* Increment the counter. */
2958 ++from;
2961 goto col_sym_free;
2964 else
2966 col_sym_free:
2967 free (symbol);
2968 free (endsymbol);
2971 break;
2973 case tok_symbol_equivalence:
2974 /* Ignore the rest of the line if we don't need the input of
2975 this line. */
2976 if (ignore_content)
2978 lr_ignore_rest (ldfile, 0);
2979 break;
2982 if (state != 0)
2983 goto err_label;
2985 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2986 if (arg->tok != tok_bsymbol)
2987 goto err_label;
2988 else
2990 const char *newname = arg->val.str.startmb;
2991 size_t newname_len = arg->val.str.lenmb;
2992 const char *symname;
2993 size_t symname_len;
2994 void *symval; /* Actually struct symbol_t* */
2996 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2997 if (arg->tok != tok_bsymbol)
2999 free ((char *) newname);
3000 goto err_label;
3003 symname = arg->val.str.startmb;
3004 symname_len = arg->val.str.lenmb;
3006 if (newname == NULL)
3008 lr_error (ldfile, _("\
3009 %s: unknown character in equivalent definition name"),
3010 "LC_COLLATE");
3012 sym_equiv_free:
3013 free ((char *) newname);
3014 free ((char *) symname);
3015 break;
3017 if (symname == NULL)
3019 lr_error (ldfile, _("\
3020 %s: unknown character in equivalent definition value"),
3021 "LC_COLLATE");
3022 goto sym_equiv_free;
3025 /* See whether the symbol name is already defined. */
3026 if (find_entry (&collate->sym_table, symname, symname_len,
3027 &symval) != 0)
3029 lr_error (ldfile, _("\
3030 %s: unknown symbol `%s' in equivalent definition"),
3031 "LC_COLLATE", symname);
3032 goto sym_equiv_free;
3035 if (insert_entry (&collate->sym_table,
3036 newname, newname_len, symval) < 0)
3038 lr_error (ldfile, _("\
3039 error while adding equivalent collating symbol"));
3040 goto sym_equiv_free;
3043 free ((char *) symname);
3045 lr_ignore_rest (ldfile, 1);
3046 break;
3048 case tok_script:
3049 /* Ignore the rest of the line if we don't need the input of
3050 this line. */
3051 if (ignore_content)
3053 lr_ignore_rest (ldfile, 0);
3054 break;
3057 /* We get told about the scripts we know. */
3058 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3059 if (arg->tok != tok_bsymbol)
3060 goto err_label;
3061 else
3063 struct section_list *runp = collate->known_sections;
3064 char *name;
3066 while (runp != NULL)
3067 if (strncmp (runp->name, arg->val.str.startmb,
3068 arg->val.str.lenmb) == 0
3069 && runp->name[arg->val.str.lenmb] == '\0')
3070 break;
3071 else
3072 runp = runp->def_next;
3074 if (runp != NULL)
3076 lr_error (ldfile, _("duplicate definition of script `%s'"),
3077 runp->name);
3078 lr_ignore_rest (ldfile, 0);
3079 break;
3082 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3083 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3084 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3085 name[arg->val.str.lenmb] = '\0';
3086 runp->name = name;
3088 runp->def_next = collate->known_sections;
3089 collate->known_sections = runp;
3091 lr_ignore_rest (ldfile, 1);
3092 break;
3094 case tok_order_start:
3095 /* Ignore the rest of the line if we don't need the input of
3096 this line. */
3097 if (ignore_content)
3099 lr_ignore_rest (ldfile, 0);
3100 break;
3103 if (state != 0 && state != 1 && state != 2)
3104 goto err_label;
3105 state = 1;
3107 /* The 14652 draft does not specify whether all `order_start' lines
3108 must contain the same number of sort-rules, but 14651 does. So
3109 we require this here as well. */
3110 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3111 if (arg->tok == tok_bsymbol)
3113 /* This better should be a section name. */
3114 struct section_list *sp = collate->known_sections;
3115 while (sp != NULL
3116 && (sp->name == NULL
3117 || strncmp (sp->name, arg->val.str.startmb,
3118 arg->val.str.lenmb) != 0
3119 || sp->name[arg->val.str.lenmb] != '\0'))
3120 sp = sp->def_next;
3122 if (sp == NULL)
3124 lr_error (ldfile, _("\
3125 %s: unknown section name `%.*s'"),
3126 "LC_COLLATE", (int) arg->val.str.lenmb,
3127 arg->val.str.startmb);
3128 /* We use the error section. */
3129 collate->current_section = &collate->error_section;
3131 if (collate->error_section.first == NULL)
3133 /* Insert &collate->error_section at the end of
3134 the collate->sections list. */
3135 if (collate->sections == NULL)
3136 collate->sections = &collate->error_section;
3137 else
3139 sp = collate->sections;
3140 while (sp->next != NULL)
3141 sp = sp->next;
3143 sp->next = &collate->error_section;
3145 collate->error_section.next = NULL;
3148 else
3150 /* One should not be allowed to open the same
3151 section twice. */
3152 if (sp->first != NULL)
3153 lr_error (ldfile, _("\
3154 %s: multiple order definitions for section `%s'"),
3155 "LC_COLLATE", sp->name);
3156 else
3158 /* Insert sp in the collate->sections list,
3159 right after collate->current_section. */
3160 if (collate->current_section != NULL)
3162 sp->next = collate->current_section->next;
3163 collate->current_section->next = sp;
3165 else if (collate->sections == NULL)
3166 /* This is the first section to be defined. */
3167 collate->sections = sp;
3169 collate->current_section = sp;
3172 /* Next should come the end of the line or a semicolon. */
3173 arg = lr_token (ldfile, charmap, result, repertoire,
3174 verbose);
3175 if (arg->tok == tok_eol)
3177 uint32_t cnt;
3179 /* This means we have exactly one rule: `forward'. */
3180 if (nrules > 1)
3181 lr_error (ldfile, _("\
3182 %s: invalid number of sorting rules"),
3183 "LC_COLLATE");
3184 else
3185 nrules = 1;
3186 sp->rules = obstack_alloc (&collate->mempool,
3187 (sizeof (enum coll_sort_rule)
3188 * nrules));
3189 for (cnt = 0; cnt < nrules; ++cnt)
3190 sp->rules[cnt] = sort_forward;
3192 /* Next line. */
3193 break;
3196 /* Get the next token. */
3197 arg = lr_token (ldfile, charmap, result, repertoire,
3198 verbose);
3201 else
3203 /* There is no section symbol. Therefore we use the unnamed
3204 section. */
3205 collate->current_section = &collate->unnamed_section;
3207 if (collate->unnamed_section_defined)
3208 lr_error (ldfile, _("\
3209 %s: multiple order definitions for unnamed section"),
3210 "LC_COLLATE");
3211 else
3213 /* Insert &collate->unnamed_section at the beginning of
3214 the collate->sections list. */
3215 collate->unnamed_section.next = collate->sections;
3216 collate->sections = &collate->unnamed_section;
3217 collate->unnamed_section_defined = true;
3221 /* Now read the direction names. */
3222 read_directions (ldfile, arg, charmap, repertoire, result);
3224 /* From now we need the strings untranslated. */
3225 ldfile->translate_strings = 0;
3226 break;
3228 case tok_order_end:
3229 /* Ignore the rest of the line if we don't need the input of
3230 this line. */
3231 if (ignore_content)
3233 lr_ignore_rest (ldfile, 0);
3234 break;
3237 if (state != 1)
3238 goto err_label;
3240 /* Handle ellipsis at end of list. */
3241 if (was_ellipsis != tok_none)
3243 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3244 repertoire, result);
3245 was_ellipsis = tok_none;
3248 state = 2;
3249 lr_ignore_rest (ldfile, 1);
3250 break;
3252 case tok_reorder_after:
3253 /* Ignore the rest of the line if we don't need the input of
3254 this line. */
3255 if (ignore_content)
3257 lr_ignore_rest (ldfile, 0);
3258 break;
3261 if (state == 1)
3263 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3264 "LC_COLLATE");
3265 state = 2;
3267 /* Handle ellipsis at end of list. */
3268 if (was_ellipsis != tok_none)
3270 handle_ellipsis (ldfile, arg->val.str.startmb,
3271 arg->val.str.lenmb, was_ellipsis, charmap,
3272 repertoire, result);
3273 was_ellipsis = tok_none;
3276 else if (state == 0 && copy_locale == NULL)
3277 goto err_label;
3278 else if (state != 0 && state != 2 && state != 3)
3279 goto err_label;
3280 state = 3;
3282 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3283 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3285 /* Find this symbol in the sequence table. */
3286 char ucsbuf[10];
3287 char *startmb;
3288 size_t lenmb;
3289 struct element_t *insp;
3290 int no_error = 1;
3291 void *ptr;
3293 if (arg->tok == tok_bsymbol)
3295 startmb = arg->val.str.startmb;
3296 lenmb = arg->val.str.lenmb;
3298 else
3300 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3301 startmb = ucsbuf;
3302 lenmb = 9;
3305 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3306 /* Yes, the symbol exists. Simply point the cursor
3307 to it. */
3308 collate->cursor = (struct element_t *) ptr;
3309 else
3311 struct symbol_t *symbp;
3312 void *ptr;
3314 if (find_entry (&collate->sym_table, startmb, lenmb,
3315 &ptr) == 0)
3317 symbp = ptr;
3319 if (symbp->order->last != NULL
3320 || symbp->order->next != NULL)
3321 collate->cursor = symbp->order;
3322 else
3324 /* This is a collating symbol but its position
3325 is not yet defined. */
3326 lr_error (ldfile, _("\
3327 %s: order for collating symbol %.*s not yet defined"),
3328 "LC_COLLATE", (int) lenmb, startmb);
3329 collate->cursor = NULL;
3330 no_error = 0;
3333 else if (find_entry (&collate->elem_table, startmb, lenmb,
3334 &ptr) == 0)
3336 insp = (struct element_t *) ptr;
3338 if (insp->last != NULL || insp->next != NULL)
3339 collate->cursor = insp;
3340 else
3342 /* This is a collating element but its position
3343 is not yet defined. */
3344 lr_error (ldfile, _("\
3345 %s: order for collating element %.*s not yet defined"),
3346 "LC_COLLATE", (int) lenmb, startmb);
3347 collate->cursor = NULL;
3348 no_error = 0;
3351 else
3353 /* This is bad. The symbol after which we have to
3354 insert does not exist. */
3355 lr_error (ldfile, _("\
3356 %s: cannot reorder after %.*s: symbol not known"),
3357 "LC_COLLATE", (int) lenmb, startmb);
3358 collate->cursor = NULL;
3359 no_error = 0;
3363 lr_ignore_rest (ldfile, no_error);
3365 else
3366 /* This must not happen. */
3367 goto err_label;
3368 break;
3370 case tok_reorder_end:
3371 /* Ignore the rest of the line if we don't need the input of
3372 this line. */
3373 if (ignore_content)
3374 break;
3376 if (state != 3)
3377 goto err_label;
3378 state = 4;
3379 lr_ignore_rest (ldfile, 1);
3380 break;
3382 case tok_reorder_sections_after:
3383 /* Ignore the rest of the line if we don't need the input of
3384 this line. */
3385 if (ignore_content)
3387 lr_ignore_rest (ldfile, 0);
3388 break;
3391 if (state == 1)
3393 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3394 "LC_COLLATE");
3395 state = 2;
3397 /* Handle ellipsis at end of list. */
3398 if (was_ellipsis != tok_none)
3400 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3401 repertoire, result);
3402 was_ellipsis = tok_none;
3405 else if (state == 3)
3407 WITH_CUR_LOCALE (error (0, 0, _("\
3408 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3409 state = 4;
3411 else if (state != 2 && state != 4)
3412 goto err_label;
3413 state = 5;
3415 /* Get the name of the sections we are adding after. */
3416 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3417 if (arg->tok == tok_bsymbol)
3419 /* Now find a section with this name. */
3420 struct section_list *runp = collate->sections;
3422 while (runp != NULL)
3424 if (runp->name != NULL
3425 && strlen (runp->name) == arg->val.str.lenmb
3426 && memcmp (runp->name, arg->val.str.startmb,
3427 arg->val.str.lenmb) == 0)
3428 break;
3430 runp = runp->next;
3433 if (runp != NULL)
3434 collate->current_section = runp;
3435 else
3437 /* This is bad. The section after which we have to
3438 reorder does not exist. Therefore we cannot
3439 process the whole rest of this reorder
3440 specification. */
3441 lr_error (ldfile, _("%s: section `%.*s' not known"),
3442 "LC_COLLATE", (int) arg->val.str.lenmb,
3443 arg->val.str.startmb);
3447 lr_ignore_rest (ldfile, 0);
3449 now = lr_token (ldfile, charmap, result, NULL, verbose);
3451 while (now->tok == tok_reorder_sections_after
3452 || now->tok == tok_reorder_sections_end
3453 || now->tok == tok_end);
3455 /* Process the token we just saw. */
3456 nowtok = now->tok;
3457 continue;
3460 else
3461 /* This must not happen. */
3462 goto err_label;
3463 break;
3465 case tok_reorder_sections_end:
3466 /* Ignore the rest of the line if we don't need the input of
3467 this line. */
3468 if (ignore_content)
3469 break;
3471 if (state != 5)
3472 goto err_label;
3473 state = 6;
3474 lr_ignore_rest (ldfile, 1);
3475 break;
3477 case tok_bsymbol:
3478 case tok_ucs4:
3479 /* Ignore the rest of the line if we don't need the input of
3480 this line. */
3481 if (ignore_content)
3483 lr_ignore_rest (ldfile, 0);
3484 break;
3487 if (state != 0 && state != 1 && state != 3 && state != 5)
3488 goto err_label;
3490 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3491 goto err_label;
3493 if (nowtok == tok_ucs4)
3495 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3496 symstr = ucs4buf;
3497 symlen = 9;
3499 else if (arg != NULL)
3501 symstr = arg->val.str.startmb;
3502 symlen = arg->val.str.lenmb;
3504 else
3506 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3507 (int) ldfile->token.val.str.lenmb,
3508 ldfile->token.val.str.startmb);
3509 break;
3512 struct element_t *seqp;
3513 if (state == 0)
3515 /* We are outside an `order_start' region. This means
3516 we must only accept definitions of values for
3517 collation symbols since these are purely abstract
3518 values and don't need directions associated. */
3519 void *ptr;
3521 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3523 seqp = ptr;
3525 /* It's already defined. First check whether this
3526 is really a collating symbol. */
3527 if (seqp->is_character)
3528 goto err_label;
3530 goto move_entry;
3532 else
3534 void *result;
3536 if (find_entry (&collate->sym_table, symstr, symlen,
3537 &result) != 0)
3538 /* No collating symbol, it's an error. */
3539 goto err_label;
3541 /* Maybe this is the first time we define a symbol
3542 value and it is before the first actual section. */
3543 if (collate->sections == NULL)
3544 collate->sections = collate->current_section =
3545 &collate->symbol_section;
3548 if (was_ellipsis != tok_none)
3550 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3551 charmap, repertoire, result);
3553 /* Remember that we processed the ellipsis. */
3554 was_ellipsis = tok_none;
3556 /* And don't add the value a second time. */
3557 break;
3560 else if (state == 3)
3562 /* It is possible that we already have this collation sequence.
3563 In this case we move the entry. */
3564 void *sym;
3565 void *ptr;
3567 /* If the symbol after which we have to insert was not found
3568 ignore all entries. */
3569 if (collate->cursor == NULL)
3571 lr_ignore_rest (ldfile, 0);
3572 break;
3575 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3577 seqp = (struct element_t *) ptr;
3578 goto move_entry;
3581 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3582 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3583 goto move_entry;
3585 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3586 && (seqp = (struct element_t *) ptr,
3587 seqp->last != NULL || seqp->next != NULL
3588 || (collate->start != NULL && seqp == collate->start)))
3590 move_entry:
3591 /* Remove the entry from the old position. */
3592 if (seqp->last == NULL)
3593 collate->start = seqp->next;
3594 else
3595 seqp->last->next = seqp->next;
3596 if (seqp->next != NULL)
3597 seqp->next->last = seqp->last;
3599 /* We also have to check whether this entry is the
3600 first or last of a section. */
3601 if (seqp->section->first == seqp)
3603 if (seqp->section->first == seqp->section->last)
3604 /* This section has no content anymore. */
3605 seqp->section->first = seqp->section->last = NULL;
3606 else
3607 seqp->section->first = seqp->next;
3609 else if (seqp->section->last == seqp)
3610 seqp->section->last = seqp->last;
3612 /* Now insert it in the new place. */
3613 insert_weights (ldfile, seqp, charmap, repertoire, result,
3614 tok_none);
3615 break;
3618 /* Otherwise we just add a new entry. */
3620 else if (state == 5)
3622 /* We are reordering sections. Find the named section. */
3623 struct section_list *runp = collate->sections;
3624 struct section_list *prevp = NULL;
3626 while (runp != NULL)
3628 if (runp->name != NULL
3629 && strlen (runp->name) == symlen
3630 && memcmp (runp->name, symstr, symlen) == 0)
3631 break;
3633 prevp = runp;
3634 runp = runp->next;
3637 if (runp == NULL)
3639 lr_error (ldfile, _("%s: section `%.*s' not known"),
3640 "LC_COLLATE", (int) symlen, symstr);
3641 lr_ignore_rest (ldfile, 0);
3643 else
3645 if (runp != collate->current_section)
3647 /* Remove the named section from the old place and
3648 insert it in the new one. */
3649 prevp->next = runp->next;
3651 runp->next = collate->current_section->next;
3652 collate->current_section->next = runp;
3653 collate->current_section = runp;
3656 /* Process the rest of the line which might change
3657 the collation rules. */
3658 arg = lr_token (ldfile, charmap, result, repertoire,
3659 verbose);
3660 if (arg->tok != tok_eof && arg->tok != tok_eol)
3661 read_directions (ldfile, arg, charmap, repertoire,
3662 result);
3664 break;
3666 else if (was_ellipsis != tok_none)
3668 /* Using the information in the `ellipsis_weight'
3669 element and this and the last value we have to handle
3670 the ellipsis now. */
3671 assert (state == 1);
3673 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3674 repertoire, result);
3676 /* Remember that we processed the ellipsis. */
3677 was_ellipsis = tok_none;
3679 /* And don't add the value a second time. */
3680 break;
3683 /* Now insert in the new place. */
3684 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3685 break;
3687 case tok_undefined:
3688 /* Ignore the rest of the line if we don't need the input of
3689 this line. */
3690 if (ignore_content)
3692 lr_ignore_rest (ldfile, 0);
3693 break;
3696 if (state != 1)
3697 goto err_label;
3699 if (was_ellipsis != tok_none)
3701 lr_error (ldfile,
3702 _("%s: cannot have `%s' as end of ellipsis range"),
3703 "LC_COLLATE", "UNDEFINED");
3705 unlink_element (collate);
3706 was_ellipsis = tok_none;
3709 /* See whether UNDEFINED already appeared somewhere. */
3710 if (collate->undefined.next != NULL
3711 || &collate->undefined == collate->cursor)
3713 lr_error (ldfile,
3714 _("%s: order for `%.*s' already defined at %s:%Zu"),
3715 "LC_COLLATE", 9, "UNDEFINED",
3716 collate->undefined.file,
3717 collate->undefined.line);
3718 lr_ignore_rest (ldfile, 0);
3720 else
3721 /* Parse the weights. */
3722 insert_weights (ldfile, &collate->undefined, charmap,
3723 repertoire, result, tok_none);
3724 break;
3726 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3727 case tok_ellipsis3: /* absolute ellipsis */
3728 case tok_ellipsis4: /* symbolic decimal ellipsis */
3729 /* This is the symbolic (decimal or hexadecimal) or absolute
3730 ellipsis. */
3731 if (was_ellipsis != tok_none)
3732 goto err_label;
3734 if (state != 0 && state != 1 && state != 3)
3735 goto err_label;
3737 was_ellipsis = nowtok;
3739 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3740 repertoire, result, nowtok);
3741 break;
3743 case tok_end:
3744 seen_end:
3745 /* Next we assume `LC_COLLATE'. */
3746 if (!ignore_content)
3748 if (state == 0 && copy_locale == NULL)
3749 /* We must either see a copy statement or have
3750 ordering values. */
3751 lr_error (ldfile,
3752 _("%s: empty category description not allowed"),
3753 "LC_COLLATE");
3754 else if (state == 1)
3756 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3757 "LC_COLLATE");
3759 /* Handle ellipsis at end of list. */
3760 if (was_ellipsis != tok_none)
3762 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3763 repertoire, result);
3764 was_ellipsis = tok_none;
3767 else if (state == 3)
3768 WITH_CUR_LOCALE (error (0, 0, _("\
3769 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3770 else if (state == 5)
3771 WITH_CUR_LOCALE (error (0, 0, _("\
3772 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3774 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3775 if (arg->tok == tok_eof)
3776 break;
3777 if (arg->tok == tok_eol)
3778 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3779 else if (arg->tok != tok_lc_collate)
3780 lr_error (ldfile, _("\
3781 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3782 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3783 return;
3785 case tok_define:
3786 if (ignore_content)
3788 lr_ignore_rest (ldfile, 0);
3789 break;
3792 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3793 if (arg->tok != tok_ident)
3794 goto err_label;
3796 /* Simply add the new symbol. */
3797 struct name_list *newsym = xmalloc (sizeof (*newsym)
3798 + arg->val.str.lenmb + 1);
3799 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
3800 newsym->str[arg->val.str.lenmb] = '\0';
3801 newsym->next = defined;
3802 defined = newsym;
3804 lr_ignore_rest (ldfile, 1);
3805 break;
3807 case tok_undef:
3808 if (ignore_content)
3810 lr_ignore_rest (ldfile, 0);
3811 break;
3814 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3815 if (arg->tok != tok_ident)
3816 goto err_label;
3818 /* Remove _all_ occurrences of the symbol from the list. */
3819 struct name_list *prevdef = NULL;
3820 struct name_list *curdef = defined;
3821 while (curdef != NULL)
3822 if (strncmp (arg->val.str.startmb, curdef->str,
3823 arg->val.str.lenmb) == 0
3824 && curdef->str[arg->val.str.lenmb] == '\0')
3826 if (prevdef == NULL)
3827 defined = curdef->next;
3828 else
3829 prevdef->next = curdef->next;
3831 struct name_list *olddef = curdef;
3832 curdef = curdef->next;
3834 free (olddef);
3836 else
3838 prevdef = curdef;
3839 curdef = curdef->next;
3842 lr_ignore_rest (ldfile, 1);
3843 break;
3845 case tok_ifdef:
3846 case tok_ifndef:
3847 if (ignore_content)
3849 lr_ignore_rest (ldfile, 0);
3850 break;
3853 found_ifdef:
3854 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3855 if (arg->tok != tok_ident)
3856 goto err_label;
3857 lr_ignore_rest (ldfile, 1);
3859 if (collate->else_action == else_none)
3861 curdef = defined;
3862 while (curdef != NULL)
3863 if (strncmp (arg->val.str.startmb, curdef->str,
3864 arg->val.str.lenmb) == 0
3865 && curdef->str[arg->val.str.lenmb] == '\0')
3866 break;
3867 else
3868 curdef = curdef->next;
3870 if ((nowtok == tok_ifdef && curdef != NULL)
3871 || (nowtok == tok_ifndef && curdef == NULL))
3873 /* We have to use the if-branch. */
3874 collate->else_action = else_ignore;
3876 else
3878 /* We have to use the else-branch, if there is one. */
3879 nowtok = skip_to (ldfile, collate, charmap, 0);
3880 if (nowtok == tok_else)
3881 collate->else_action = else_seen;
3882 else if (nowtok == tok_elifdef)
3884 nowtok = tok_ifdef;
3885 goto found_ifdef;
3887 else if (nowtok == tok_elifndef)
3889 nowtok = tok_ifndef;
3890 goto found_ifdef;
3892 else if (nowtok == tok_eof)
3893 goto seen_eof;
3894 else if (nowtok == tok_end)
3895 goto seen_end;
3898 else
3900 /* XXX Should it really become necessary to support nested
3901 preprocessor handling we will push the state here. */
3902 lr_error (ldfile, _("%s: nested conditionals not supported"),
3903 "LC_COLLATE");
3904 nowtok = skip_to (ldfile, collate, charmap, 1);
3905 if (nowtok == tok_eof)
3906 goto seen_eof;
3907 else if (nowtok == tok_end)
3908 goto seen_end;
3910 break;
3912 case tok_elifdef:
3913 case tok_elifndef:
3914 case tok_else:
3915 if (ignore_content)
3917 lr_ignore_rest (ldfile, 0);
3918 break;
3921 lr_ignore_rest (ldfile, 1);
3923 if (collate->else_action == else_ignore)
3925 /* Ignore everything until the endif. */
3926 nowtok = skip_to (ldfile, collate, charmap, 1);
3927 if (nowtok == tok_eof)
3928 goto seen_eof;
3929 else if (nowtok == tok_end)
3930 goto seen_end;
3932 else
3934 assert (collate->else_action == else_none);
3935 lr_error (ldfile, _("\
3936 %s: '%s' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE",
3937 nowtok == tok_else ? "else"
3938 : nowtok == tok_elifdef ? "elifdef" : "elifndef");
3940 break;
3942 case tok_endif:
3943 if (ignore_content)
3945 lr_ignore_rest (ldfile, 0);
3946 break;
3949 lr_ignore_rest (ldfile, 1);
3951 if (collate->else_action != else_ignore
3952 && collate->else_action != else_seen)
3953 lr_error (ldfile, _("\
3954 %s: 'endif' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE");
3956 /* XXX If we support nested preprocessor directives we pop
3957 the state here. */
3958 collate->else_action = else_none;
3959 break;
3961 default:
3962 err_label:
3963 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3966 /* Prepare for the next round. */
3967 now = lr_token (ldfile, charmap, result, NULL, verbose);
3968 nowtok = now->tok;
3971 seen_eof:
3972 /* When we come here we reached the end of the file. */
3973 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");