Update copyright notices with scripts/update-copyrights
[glibc.git] / locale / programs / ld-collate.c
blob037fd2fcc5a9f1cf6d43b7b599eab704b94de69f
1 /* Copyright (C) 1995-2014 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <errno.h>
23 #include <error.h>
24 #include <stdlib.h>
25 #include <wchar.h>
26 #include <stdint.h>
27 #include <sys/param.h>
29 #include "localedef.h"
30 #include "charmap.h"
31 #include "localeinfo.h"
32 #include "linereader.h"
33 #include "locfile.h"
34 #include "elem-hash.h"
36 /* Uncomment the following line in the production version. */
37 /* #define NDEBUG 1 */
38 #include <assert.h>
40 #define obstack_chunk_alloc malloc
41 #define obstack_chunk_free free
/* Append the 32-bit value DATA to the object currently being grown in
   OBSTACK.  The value is first passed through maybe_swap_uint32.  The
   current object size must satisfy LOCFILE_ALIGNED_P (asserted).  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));
  data = maybe_swap_uint32 (data);
  /* Use the dedicated int primitive when int32_t and int have the same
     size, otherwise fall back to the generic byte-wise grow.  */
  if (sizeof (int32_t) == sizeof (int))
    obstack_int_grow (obstack, data);
  else
    obstack_grow (obstack, &data, sizeof (int32_t));
}
/* Like obstack_int32_grow, but uses obstack_int_grow_fast when int32_t
   and int have the same size.  The fallback path still uses the regular
   obstack_grow.
   NOTE(review): the fast primitive presumably requires the caller to have
   reserved room in OBSTACK beforehand; confirm against the obstack
   documentation.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));
  data = maybe_swap_uint32 (data);
  if (sizeof (int32_t) == sizeof (int))
    obstack_int_grow_fast (obstack, data);
  else
    obstack_grow (obstack, &data, sizeof (int32_t));
}
/* Forward declaration.  */
struct element_t;

/* Data type for list of strings.  Each entry describes one section and
   is linked into up to two lists (known sections and used sections).  */
struct section_list
{
  /* Successor in the known_sections list.  */
  struct section_list *def_next;
  /* Successor in the sections list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* These are the rules for this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output file.  */
  int ruleidx;
};
struct element_t;

/* A counted array of element pointers.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* The CNT element pointers.  */
  struct element_t **w;
};
/* Data type for collating element.  */
struct element_t
{
  const char *name;

  /* Multibyte and wide character representations with their lengths.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* The following is a bit mask which bits are set if this element is
     used in the appropriate level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* One weight list per level.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Order of the character in the sequence.  This information will
     be used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Next element in wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
/* Special element values.  These small integers cast to pointers are
   used as placeholders in weight lists and must never be dereferenced.  */
#define ELEMENT_ELLIPSIS2	((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3	((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4	((struct element_t *) 3)
/* Data type for collating symbol.  */
struct symbol_t
{
  const char *name;

  /* Point to place in the order list.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
167 /* Sparse table of struct element_t *. */
168 #define TABLE wchead_table
169 #define ELEMENT struct element_t *
170 #define DEFAULT NULL
171 #define ITERATE
172 #define NO_ADD_LOCALE
173 #include "3level.h"
175 /* Sparse table of int32_t. */
176 #define TABLE collidx_table
177 #define ELEMENT int32_t
178 #define DEFAULT 0
179 #include "3level.h"
181 /* Sparse table of uint32_t. */
182 #define TABLE collseq_table
183 #define ELEMENT uint32_t
184 #define DEFAULT ~((uint32_t) 0)
185 #include "3level.h"
/* Simple name list for the preprocessor.  The string is stored directly
   behind the link pointer, so a node must be allocated with room for the
   name and its terminating NUL.  */
struct name_list
{
  struct name_list *next;
  /* C99 flexible array member; same layout as the former zero-length
     array.  */
  char str[];
};
196 /* The real definition of the struct for the LC_COLLATE locale. */
197 struct locale_collate_t
199 int col_weight_max;
200 int cur_weight_max;
202 /* List of known scripts. */
203 struct section_list *known_sections;
204 /* List of used sections. */
205 struct section_list *sections;
206 /* Current section using definition. */
207 struct section_list *current_section;
208 /* There always can be an unnamed section. */
209 struct section_list unnamed_section;
210 /* Flag whether the unnamed section has been defined. */
211 bool unnamed_section_defined;
212 /* To make handling of errors easier we have another section. */
213 struct section_list error_section;
214 /* Sometimes we are defining the values for collating symbols before
215 the first actual section. */
216 struct section_list symbol_section;
218 /* Start of the order list. */
219 struct element_t *start;
221 /* The undefined element. */
222 struct element_t undefined;
224 /* This is the cursor for `reorder_after' insertions. */
225 struct element_t *cursor;
227 /* This value is used when handling ellipsis. */
228 struct element_t ellipsis_weight;
230 /* Known collating elements. */
231 hash_table elem_table;
233 /* Known collating symbols. */
234 hash_table sym_table;
236 /* Known collation sequences. */
237 hash_table seq_table;
239 struct obstack mempool;
241 /* The LC_COLLATE category is a bit special as it is sometimes possible
242 that the definitions from more than one input file contains information.
243 Therefore we keep all relevant input in a list. */
244 struct locale_collate_t *next;
246 /* Arrays with heads of the list for each of the leading bytes in
247 the multibyte sequences. */
248 struct element_t *mbheads[256];
250 /* Arrays with heads of the list for each of the leading bytes in
251 the multibyte sequences. */
252 struct wchead_table wcheads;
254 /* The arrays with the collation sequence order. */
255 unsigned char mbseqorder[256];
256 struct collseq_table wcseqorder;
258 /* State of the preprocessor. */
259 enum
261 else_none = 0,
262 else_ignore,
263 else_seen
265 else_action;
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels).  Zero until the first direction
   specification has been read.  */
static uint32_t nrules;

/* List of defined preprocessor symbols.  */
static struct name_list *defined;
/* We need UTF-8 encoding of numbers.

   Store the UTF-8 style encoding of VAL at BUF and return the number of
   bytes written (1 to 6).  No terminating NUL is written; BUF must have
   room for up to six bytes.  Values below 0x80 (including negative ones)
   are stored as a single byte.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf++ = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      /* A sequence of STEP bytes carries 5 * STEP + 1 payload bits; pick
	 the shortest sequence that can hold VAL.  */
      for (step = 2; step < 6; ++step)
	if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
	  break;
      retval = step;

      /* Lead byte marker: STEP one-bits followed by a zero bit.  The
	 shift is done on an unsigned value so that the result does not
	 depend on how the compiler shifts negative numbers.  */
      *buf = (unsigned char) (0xff00u >> step);
      --step;
      /* Fill in the continuation bytes from the last one backwards.  */
      do
	{
	  buf[step] = 0x80 | (val & 0x3f);
	  val >>= 6;
	}
      while (--step > 0);
      /* The remaining high bits go into the lead byte.  */
      *buf |= val;
    }

  return retval;
}
313 static struct section_list *
314 make_seclist_elem (struct locale_collate_t *collate, const char *string,
315 struct section_list *next)
317 struct section_list *newp;
319 newp = (struct section_list *) obstack_alloc (&collate->mempool,
320 sizeof (*newp));
321 newp->next = next;
322 newp->name = string;
323 newp->first = NULL;
324 newp->last = NULL;
326 return newp;
330 static struct element_t *
331 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
332 const uint32_t *wcs, const char *name, size_t namelen,
333 int is_character)
335 struct element_t *newp;
337 newp = (struct element_t *) obstack_alloc (&collate->mempool,
338 sizeof (*newp));
339 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
340 name, namelen);
341 if (mbs != NULL)
343 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
344 newp->nmbs = mbslen;
346 else
348 newp->mbs = NULL;
349 newp->nmbs = 0;
351 if (wcs != NULL)
353 size_t nwcs = wcslen ((wchar_t *) wcs);
354 uint32_t zero = 0;
355 /* Handle <U0000> as a single character. */
356 if (nwcs == 0)
357 nwcs = 1;
358 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
359 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
360 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
361 newp->nwcs = nwcs;
363 else
365 newp->wcs = NULL;
366 newp->nwcs = 0;
368 newp->mborder = NULL;
369 newp->wcorder = 0;
370 newp->used_in_level = 0;
371 newp->is_character = is_character;
373 /* Will be assigned later. XXX */
374 newp->mbseqorder = 0;
375 newp->wcseqorder = 0;
377 /* Will be allocated later. */
378 newp->weights = NULL;
380 newp->file = NULL;
381 newp->line = 0;
383 newp->section = collate->current_section;
385 newp->last = NULL;
386 newp->next = NULL;
388 newp->mbnext = NULL;
389 newp->mblast = NULL;
391 newp->wcnext = NULL;
392 newp->wclast = NULL;
394 return newp;
398 static struct symbol_t *
399 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
401 struct symbol_t *newp;
403 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
405 newp->name = obstack_copy0 (&collate->mempool, name, len);
406 newp->order = NULL;
408 newp->file = NULL;
409 newp->line = 0;
411 return newp;
415 /* Test whether this name is already defined somewhere. */
416 static int
417 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
418 const struct charmap_t *charmap,
419 struct repertoire_t *repertoire, const char *symbol,
420 size_t symbol_len)
422 void *ignore = NULL;
424 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
426 lr_error (ldfile, _("`%.*s' already defined in charmap"),
427 (int) symbol_len, symbol);
428 return 1;
431 if (repertoire != NULL
432 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
433 == 0))
435 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
436 (int) symbol_len, symbol);
437 return 1;
440 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
442 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
443 (int) symbol_len, symbol);
444 return 1;
447 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
449 lr_error (ldfile, _("`%.*s' already defined as collating element"),
450 (int) symbol_len, symbol);
451 return 1;
454 return 0;
458 /* Read the direction specification. */
459 static void
460 read_directions (struct linereader *ldfile, struct token *arg,
461 const struct charmap_t *charmap,
462 struct repertoire_t *repertoire, struct localedef_t *result)
464 int cnt = 0;
465 int max = nrules ?: 10;
466 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
467 int warned = 0;
468 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
470 while (1)
472 int valid = 0;
474 if (arg->tok == tok_forward)
476 if (rules[cnt] & sort_backward)
478 if (! warned)
480 lr_error (ldfile, _("\
481 %s: `forward' and `backward' are mutually excluding each other"),
482 "LC_COLLATE");
483 warned = 1;
486 else if (rules[cnt] & sort_forward)
488 if (! warned)
490 lr_error (ldfile, _("\
491 %s: `%s' mentioned more than once in definition of weight %d"),
492 "LC_COLLATE", "forward", cnt + 1);
495 else
496 rules[cnt] |= sort_forward;
498 valid = 1;
500 else if (arg->tok == tok_backward)
502 if (rules[cnt] & sort_forward)
504 if (! warned)
506 lr_error (ldfile, _("\
507 %s: `forward' and `backward' are mutually excluding each other"),
508 "LC_COLLATE");
509 warned = 1;
512 else if (rules[cnt] & sort_backward)
514 if (! warned)
516 lr_error (ldfile, _("\
517 %s: `%s' mentioned more than once in definition of weight %d"),
518 "LC_COLLATE", "backward", cnt + 1);
521 else
522 rules[cnt] |= sort_backward;
524 valid = 1;
526 else if (arg->tok == tok_position)
528 if (rules[cnt] & sort_position)
530 if (! warned)
532 lr_error (ldfile, _("\
533 %s: `%s' mentioned more than once in definition of weight %d"),
534 "LC_COLLATE", "position", cnt + 1);
537 else
538 rules[cnt] |= sort_position;
540 valid = 1;
543 if (valid)
544 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
546 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
547 || arg->tok == tok_semicolon)
549 if (! valid && ! warned)
551 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
552 warned = 1;
555 /* See whether we have to increment the counter. */
556 if (arg->tok != tok_comma && rules[cnt] != 0)
558 /* Add the default `forward' if we have seen only `position'. */
559 if (rules[cnt] == sort_position)
560 rules[cnt] = sort_position | sort_forward;
562 ++cnt;
565 if (arg->tok == tok_eof || arg->tok == tok_eol)
566 /* End of line or file, so we exit the loop. */
567 break;
569 if (nrules == 0)
571 /* See whether we have enough room in the array. */
572 if (cnt == max)
574 max += 10;
575 rules = (enum coll_sort_rule *) xrealloc (rules,
577 * sizeof (*rules));
578 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
581 else
583 if (cnt == nrules)
585 /* There must not be any more rule. */
586 if (! warned)
588 lr_error (ldfile, _("\
589 %s: too many rules; first entry only had %d"),
590 "LC_COLLATE", nrules);
591 warned = 1;
594 lr_ignore_rest (ldfile, 0);
595 break;
599 else
601 if (! warned)
603 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
604 warned = 1;
608 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
611 if (nrules == 0)
613 /* Now we know how many rules we have. */
614 nrules = cnt;
615 rules = (enum coll_sort_rule *) xrealloc (rules,
616 nrules * sizeof (*rules));
618 else
620 if (cnt < nrules)
622 /* Not enough rules in this specification. */
623 if (! warned)
624 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
627 rules[cnt] = sort_forward;
628 while (++cnt < nrules);
632 collate->current_section->rules = rules;
636 static struct element_t *
637 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
638 const char *str, size_t len)
640 void *result = NULL;
642 /* Search for the entries among the collation sequences already define. */
643 if (find_entry (&collate->seq_table, str, len, &result) != 0)
645 /* Nope, not define yet. So we see whether it is a
646 collation symbol. */
647 void *ptr;
649 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
651 /* It's a collation symbol. */
652 struct symbol_t *sym = (struct symbol_t *) ptr;
653 result = sym->order;
655 if (result == NULL)
656 result = sym->order = new_element (collate, NULL, 0, NULL,
657 NULL, 0, 0);
659 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
661 /* It's also no collation element. So it is a character
662 element defined later. */
663 result = new_element (collate, NULL, 0, NULL, str, len, 1);
664 /* Insert it into the sequence table. */
665 insert_entry (&collate->seq_table, str, len, result);
669 return (struct element_t *) result;
673 static void
674 unlink_element (struct locale_collate_t *collate)
676 if (collate->cursor == collate->start)
678 assert (collate->cursor->next == NULL);
679 assert (collate->cursor->last == NULL);
680 collate->cursor = NULL;
682 else
684 if (collate->cursor->next != NULL)
685 collate->cursor->next->last = collate->cursor->last;
686 if (collate->cursor->last != NULL)
687 collate->cursor->last->next = collate->cursor->next;
688 collate->cursor = collate->cursor->last;
693 static void
694 insert_weights (struct linereader *ldfile, struct element_t *elem,
695 const struct charmap_t *charmap,
696 struct repertoire_t *repertoire, struct localedef_t *result,
697 enum token_t ellipsis)
699 int weight_cnt;
700 struct token *arg;
701 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
703 /* Initialize all the fields. */
704 elem->file = ldfile->fname;
705 elem->line = ldfile->lineno;
707 elem->last = collate->cursor;
708 elem->next = collate->cursor ? collate->cursor->next : NULL;
709 if (collate->cursor != NULL && collate->cursor->next != NULL)
710 collate->cursor->next->last = elem;
711 if (collate->cursor != NULL)
712 collate->cursor->next = elem;
713 if (collate->start == NULL)
715 assert (collate->cursor == NULL);
716 collate->start = elem;
719 elem->section = collate->current_section;
721 if (collate->current_section->first == NULL)
722 collate->current_section->first = elem;
723 if (collate->current_section->last == collate->cursor)
724 collate->current_section->last = elem;
726 collate->cursor = elem;
728 elem->weights = (struct element_list_t *)
729 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
730 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
732 weight_cnt = 0;
734 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
737 if (arg->tok == tok_eof || arg->tok == tok_eol)
738 break;
740 if (arg->tok == tok_ignore)
742 /* The weight for this level has to be ignored. We use the
743 null pointer to indicate this. */
744 elem->weights[weight_cnt].w = (struct element_t **)
745 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
746 elem->weights[weight_cnt].w[0] = NULL;
747 elem->weights[weight_cnt].cnt = 1;
749 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
751 char ucs4str[10];
752 struct element_t *val;
753 char *symstr;
754 size_t symlen;
756 if (arg->tok == tok_bsymbol)
758 symstr = arg->val.str.startmb;
759 symlen = arg->val.str.lenmb;
761 else
763 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
764 symstr = ucs4str;
765 symlen = 9;
768 val = find_element (ldfile, collate, symstr, symlen);
769 if (val == NULL)
770 break;
772 elem->weights[weight_cnt].w = (struct element_t **)
773 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
774 elem->weights[weight_cnt].w[0] = val;
775 elem->weights[weight_cnt].cnt = 1;
777 else if (arg->tok == tok_string)
779 /* Split the string up in the individual characters and put
780 the element definitions in the list. */
781 const char *cp = arg->val.str.startmb;
782 int cnt = 0;
783 struct element_t *charelem;
784 struct element_t **weights = NULL;
785 int max = 0;
787 if (*cp == '\0')
789 lr_error (ldfile, _("%s: empty weight string not allowed"),
790 "LC_COLLATE");
791 lr_ignore_rest (ldfile, 0);
792 break;
797 if (*cp == '<')
799 /* Ahh, it's a bsymbol or an UCS4 value. If it's
800 the latter we have to unify the name. */
801 const char *startp = ++cp;
802 size_t len;
804 while (*cp != '>')
806 if (*cp == ldfile->escape_char)
807 ++cp;
808 if (*cp == '\0')
809 /* It's a syntax error. */
810 goto syntax;
812 ++cp;
815 if (cp - startp == 5 && startp[0] == 'U'
816 && isxdigit (startp[1]) && isxdigit (startp[2])
817 && isxdigit (startp[3]) && isxdigit (startp[4]))
819 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
820 char *newstr;
822 newstr = (char *) xmalloc (10);
823 snprintf (newstr, 10, "U%08X", ucs4);
824 startp = newstr;
826 len = 9;
828 else
829 len = cp - startp;
831 charelem = find_element (ldfile, collate, startp, len);
832 ++cp;
834 else
836 /* People really shouldn't use characters directly in
837 the string. Especially since it's not really clear
838 what this means. We interpret all characters in the
839 string as if that would be bsymbols. Otherwise we
840 would have to match back to bsymbols somehow and this
841 is normally not what people normally expect. */
842 charelem = find_element (ldfile, collate, cp++, 1);
845 if (charelem == NULL)
847 /* We ignore the rest of the line. */
848 lr_ignore_rest (ldfile, 0);
849 break;
852 /* Add the pointer. */
853 if (cnt >= max)
855 struct element_t **newp;
856 max += 10;
857 newp = (struct element_t **)
858 alloca (max * sizeof (struct element_t *));
859 memcpy (newp, weights, cnt * sizeof (struct element_t *));
860 weights = newp;
862 weights[cnt++] = charelem;
864 while (*cp != '\0');
866 /* Now store the information. */
867 elem->weights[weight_cnt].w = (struct element_t **)
868 obstack_alloc (&collate->mempool,
869 cnt * sizeof (struct element_t *));
870 memcpy (elem->weights[weight_cnt].w, weights,
871 cnt * sizeof (struct element_t *));
872 elem->weights[weight_cnt].cnt = cnt;
874 /* We don't need the string anymore. */
875 free (arg->val.str.startmb);
877 else if (ellipsis != tok_none
878 && (arg->tok == tok_ellipsis2
879 || arg->tok == tok_ellipsis3
880 || arg->tok == tok_ellipsis4))
882 /* It must be the same ellipsis as used in the initial column. */
883 if (arg->tok != ellipsis)
884 lr_error (ldfile, _("\
885 %s: weights must use the same ellipsis symbol as the name"),
886 "LC_COLLATE");
888 /* The weight for this level will depend on the element
889 iterating over the range. Put a placeholder. */
890 elem->weights[weight_cnt].w = (struct element_t **)
891 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
892 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
893 elem->weights[weight_cnt].cnt = 1;
895 else
897 syntax:
898 /* It's a syntax error. */
899 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
900 lr_ignore_rest (ldfile, 0);
901 break;
904 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
905 /* This better should be the end of the line or a semicolon. */
906 if (arg->tok == tok_semicolon)
907 /* OK, ignore this and read the next token. */
908 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
909 else if (arg->tok != tok_eof && arg->tok != tok_eol)
911 /* It's a syntax error. */
912 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
913 lr_ignore_rest (ldfile, 0);
914 break;
917 while (++weight_cnt < nrules);
919 if (weight_cnt < nrules)
921 /* This means the rest of the line uses the current element as
922 the weight. */
925 elem->weights[weight_cnt].w = (struct element_t **)
926 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
927 if (ellipsis == tok_none)
928 elem->weights[weight_cnt].w[0] = elem;
929 else
930 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
931 elem->weights[weight_cnt].cnt = 1;
933 while (++weight_cnt < nrules);
935 else
937 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
939 /* Too many rule values. */
940 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
941 lr_ignore_rest (ldfile, 0);
943 else
944 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
949 static int
950 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
951 const struct charmap_t *charmap, struct repertoire_t *repertoire,
952 struct localedef_t *result)
954 /* First find out what kind of symbol this is. */
955 struct charseq *seq;
956 uint32_t wc;
957 struct element_t *elem = NULL;
958 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
960 /* Try to find the character in the charmap. */
961 seq = charmap_find_value (charmap, symstr, symlen);
963 /* Determine the wide character. */
964 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
966 wc = repertoire_find_value (repertoire, symstr, symlen);
967 if (seq != NULL)
968 seq->ucs4 = wc;
970 else
971 wc = seq->ucs4;
973 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
975 /* It's no character, so look through the collation elements and
976 symbol list. */
977 void *ptr = elem;
978 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
980 void *result;
981 struct symbol_t *sym = NULL;
983 /* It's also collation element. Therefore it's either a
984 collating symbol or it's a character which is not
985 supported by the character set. In the later case we
986 simply create a dummy entry. */
987 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
989 /* It's a collation symbol. */
990 sym = (struct symbol_t *) result;
992 elem = sym->order;
995 if (elem == NULL)
997 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
999 if (sym != NULL)
1000 sym->order = elem;
1001 else
1002 /* Enter a fake element in the sequence table. This
1003 won't cause anything in the output since there is
1004 no multibyte or wide character associated with
1005 it. */
1006 insert_entry (&collate->seq_table, symstr, symlen, elem);
1009 else
1010 /* Copy the result back. */
1011 elem = ptr;
1013 else
1015 /* Otherwise the symbols stands for a character. */
1016 void *ptr = elem;
1017 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
1019 uint32_t wcs[2] = { wc, 0 };
1021 /* We have to allocate an entry. */
1022 elem = new_element (collate,
1023 seq != NULL ? (char *) seq->bytes : NULL,
1024 seq != NULL ? seq->nbytes : 0,
1025 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
1026 symstr, symlen, 1);
1028 /* And add it to the table. */
1029 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1030 /* This cannot happen. */
1031 assert (! "Internal error");
1033 else
1035 /* Copy the result back. */
1036 elem = ptr;
1038 /* Maybe the character was used before the definition. In this case
1039 we have to insert the byte sequences now. */
1040 if (elem->mbs == NULL && seq != NULL)
1042 elem->mbs = obstack_copy0 (&collate->mempool,
1043 seq->bytes, seq->nbytes);
1044 elem->nmbs = seq->nbytes;
1047 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1049 uint32_t wcs[2] = { wc, 0 };
1051 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1052 elem->nwcs = 1;
1057 /* Test whether this element is not already in the list. */
1058 if (elem->next != NULL || elem == collate->cursor)
1060 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1061 (int) symlen, symstr, elem->file, elem->line);
1062 lr_ignore_rest (ldfile, 0);
1063 return 1;
1066 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1068 return 0;
1072 static void
1073 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1074 enum token_t ellipsis, const struct charmap_t *charmap,
1075 struct repertoire_t *repertoire,
1076 struct localedef_t *result)
1078 struct element_t *startp;
1079 struct element_t *endp;
1080 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1082 /* Unlink the entry added for the ellipsis. */
1083 unlink_element (collate);
1084 startp = collate->cursor;
1086 /* Process and add the end-entry. */
1087 if (symstr != NULL
1088 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1089 /* Something went wrong with inserting the to-value. This means
1090 we cannot process the ellipsis. */
1091 return;
1093 /* Reset the cursor. */
1094 collate->cursor = startp;
1096 /* Now we have to handle many different situations:
1097 - we have to distinguish between the three different ellipsis forms
1098 - the is the ellipsis at the beginning, in the middle, or at the end.
1100 endp = collate->cursor->next;
1101 assert (symstr == NULL || endp != NULL);
1103 /* XXX The following is probably very wrong since also collating symbols
1104 can appear in ranges. But do we want/can refine the test for that? */
1105 #if 0
1106 /* Both, the start and the end symbol, must stand for characters. */
1107 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1108 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1110 lr_error (ldfile, _("\
1111 %s: the start and the end symbol of a range must stand for characters"),
1112 "LC_COLLATE");
1113 return;
1115 #endif
1117 if (ellipsis == tok_ellipsis3)
1119 /* One requirement we make here: the length of the byte
1120 sequences for the first and end character must be the same.
1121 This is mainly to prevent unwanted effects and this is often
1122 not what is wanted. */
1123 size_t len = (startp->mbs != NULL ? startp->nmbs
1124 : (endp->mbs != NULL ? endp->nmbs : 0));
1125 char mbcnt[len + 1];
1126 char mbend[len + 1];
1128 /* Well, this should be caught somewhere else already. Just to
1129 make sure. */
1130 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1131 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1133 if (startp != NULL && endp != NULL
1134 && startp->mbs != NULL && endp->mbs != NULL
1135 && startp->nmbs != endp->nmbs)
1137 lr_error (ldfile, _("\
1138 %s: byte sequences of first and last character must have the same length"),
1139 "LC_COLLATE");
1140 return;
1143 /* Determine whether we have to generate multibyte sequences. */
1144 if ((startp == NULL || startp->mbs != NULL)
1145 && (endp == NULL || endp->mbs != NULL))
1147 int cnt;
1148 int ret;
1150 /* Prepare the beginning byte sequence. This is either from the
1151 beginning byte sequence or it is all nulls if it was an
1152 initial ellipsis. */
1153 if (startp == NULL || startp->mbs == NULL)
1154 memset (mbcnt, '\0', len);
1155 else
1157 memcpy (mbcnt, startp->mbs, len);
1159 /* And increment it so that the value is the first one we will
1160 try to insert. */
1161 for (cnt = len - 1; cnt >= 0; --cnt)
1162 if (++mbcnt[cnt] != '\0')
1163 break;
1165 mbcnt[len] = '\0';
1167 /* And the end sequence. */
1168 if (endp == NULL || endp->mbs == NULL)
1169 memset (mbend, '\0', len);
1170 else
1171 memcpy (mbend, endp->mbs, len);
1172 mbend[len] = '\0';
1174 /* Test whether we have a correct range. */
1175 ret = memcmp (mbcnt, mbend, len);
1176 if (ret >= 0)
1178 if (ret > 0)
1179 lr_error (ldfile, _("%s: byte sequence of first character of \
1180 range is not lower than that of the last character"), "LC_COLLATE");
1181 return;
1184 /* Generate the byte sequences data. */
1185 while (1)
1187 struct charseq *seq;
1189 /* Quite a bit of work ahead. We have to find the character
1190 definition for the byte sequence and then determine the
1191 wide character belonging to it. */
1192 seq = charmap_find_symbol (charmap, mbcnt, len);
1193 if (seq != NULL)
1195 struct element_t *elem;
1196 size_t namelen;
1198 /* I don't think this can ever happen. */
1199 assert (seq->name != NULL);
1200 namelen = strlen (seq->name);
1202 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1203 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1204 namelen);
1206 /* Now we are ready to insert the new value in the
1207 sequence. Find out whether the element is
1208 already known. */
1209 void *ptr;
1210 if (find_entry (&collate->seq_table, seq->name, namelen,
1211 &ptr) != 0)
1213 uint32_t wcs[2] = { seq->ucs4, 0 };
1215 /* We have to allocate an entry. */
1216 elem = new_element (collate, mbcnt, len,
1217 seq->ucs4 == ILLEGAL_CHAR_VALUE
1218 ? NULL : wcs, seq->name,
1219 namelen, 1);
1221 /* And add it to the table. */
1222 if (insert_entry (&collate->seq_table, seq->name,
1223 namelen, elem) != 0)
1224 /* This cannot happen. */
1225 assert (! "Internal error");
1227 else
1228 /* Copy the result. */
1229 elem = ptr;
1231 /* Test whether this element is not already in the list. */
1232 if (elem->next != NULL || (collate->cursor != NULL
1233 && elem->next == collate->cursor))
1235 lr_error (ldfile, _("\
1236 order for `%.*s' already defined at %s:%Zu"),
1237 (int) namelen, seq->name,
1238 elem->file, elem->line);
1239 goto increment;
1242 /* Enqueue the new element. */
1243 elem->last = collate->cursor;
1244 if (collate->cursor == NULL)
1245 elem->next = NULL;
1246 else
1248 elem->next = collate->cursor->next;
1249 elem->last->next = elem;
1250 if (elem->next != NULL)
1251 elem->next->last = elem;
1253 if (collate->start == NULL)
1255 assert (collate->cursor == NULL);
1256 collate->start = elem;
1258 collate->cursor = elem;
1260 /* Add the weight value. We take them from the
1261 `ellipsis_weights' member of `collate'. */
1262 elem->weights = (struct element_list_t *)
1263 obstack_alloc (&collate->mempool,
1264 nrules * sizeof (struct element_list_t));
1265 for (cnt = 0; cnt < nrules; ++cnt)
1266 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1267 && (collate->ellipsis_weight.weights[cnt].w[0]
1268 == ELEMENT_ELLIPSIS2))
1270 elem->weights[cnt].w = (struct element_t **)
1271 obstack_alloc (&collate->mempool,
1272 sizeof (struct element_t *));
1273 elem->weights[cnt].w[0] = elem;
1274 elem->weights[cnt].cnt = 1;
1276 else
1278 /* Simply use the weight from `ellipsis_weight'. */
1279 elem->weights[cnt].w =
1280 collate->ellipsis_weight.weights[cnt].w;
1281 elem->weights[cnt].cnt =
1282 collate->ellipsis_weight.weights[cnt].cnt;
1286 /* Increment for the next round. */
1287 increment:
1288 for (cnt = len - 1; cnt >= 0; --cnt)
1289 if (++mbcnt[cnt] != '\0')
1290 break;
1292 /* Find out whether this was all. */
1293 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1294 /* Yep, that's all. */
1295 break;
1299 else
1301 /* For symbolic range we naturally must have a beginning and an
1302 end specified by the user. */
1303 if (startp == NULL)
1304 lr_error (ldfile, _("\
1305 %s: symbolic range ellipsis must not directly follow `order_start'"),
1306 "LC_COLLATE");
1307 else if (endp == NULL)
1308 lr_error (ldfile, _("\
1309 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1310 "LC_COLLATE");
1311 else
1313 /* Determine the range. To do so we have to determine the
1314 common prefix of the both names and then the numeric
1315 values of both ends. */
1316 size_t lenfrom = strlen (startp->name);
1317 size_t lento = strlen (endp->name);
1318 char buf[lento + 1];
1319 int preflen = 0;
1320 long int from;
1321 long int to;
1322 char *cp;
1323 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1325 if (lenfrom != lento)
1327 invalid_range:
1328 lr_error (ldfile, _("\
1329 `%s' and `%.*s' are not valid names for symbolic range"),
1330 startp->name, (int) lento, endp->name);
1331 return;
1334 while (startp->name[preflen] == endp->name[preflen])
1335 if (startp->name[preflen] == '\0')
1336 /* Nothing to be done. The start and end point are identical
1337 and while inserting the end point we have already given
1338 the user an error message. */
1339 return;
1340 else
1341 ++preflen;
1343 errno = 0;
1344 from = strtol (startp->name + preflen, &cp, base);
1345 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1346 goto invalid_range;
1348 errno = 0;
1349 to = strtol (endp->name + preflen, &cp, base);
1350 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1351 goto invalid_range;
1353 /* Copy the prefix. */
1354 memcpy (buf, startp->name, preflen);
1356 /* Loop over all values. */
1357 for (++from; from < to; ++from)
1359 struct element_t *elem = NULL;
1360 struct charseq *seq;
1361 uint32_t wc;
1362 int cnt;
1364 /* Generate the name. */
1365 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1366 (int) (lenfrom - preflen), from);
1368 /* Look whether this name is already defined. */
1369 void *ptr;
1370 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1372 /* Copy back the result. */
1373 elem = ptr;
1375 if (elem->next != NULL || (collate->cursor != NULL
1376 && elem->next == collate->cursor))
1378 lr_error (ldfile, _("\
1379 %s: order for `%.*s' already defined at %s:%Zu"),
1380 "LC_COLLATE", (int) lenfrom, buf,
1381 elem->file, elem->line);
1382 continue;
1385 if (elem->name == NULL)
1387 lr_error (ldfile, _("%s: `%s' must be a character"),
1388 "LC_COLLATE", buf);
1389 continue;
1393 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1395 /* Search for a character of this name. */
1396 seq = charmap_find_value (charmap, buf, lenfrom);
1397 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1399 wc = repertoire_find_value (repertoire, buf, lenfrom);
1401 if (seq != NULL)
1402 seq->ucs4 = wc;
1404 else
1405 wc = seq->ucs4;
1407 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1408 /* We don't know anything about a character with this
1409 name. XXX Should we warn? */
1410 continue;
1412 if (elem == NULL)
1414 uint32_t wcs[2] = { wc, 0 };
1416 /* We have to allocate an entry. */
1417 elem = new_element (collate,
1418 seq != NULL
1419 ? (char *) seq->bytes : NULL,
1420 seq != NULL ? seq->nbytes : 0,
1421 wc == ILLEGAL_CHAR_VALUE
1422 ? NULL : wcs, buf, lenfrom, 1);
1424 else
1426 /* Update the element. */
1427 if (seq != NULL)
1429 elem->mbs = obstack_copy0 (&collate->mempool,
1430 seq->bytes, seq->nbytes);
1431 elem->nmbs = seq->nbytes;
1434 if (wc != ILLEGAL_CHAR_VALUE)
1436 uint32_t zero = 0;
1438 obstack_grow (&collate->mempool,
1439 &wc, sizeof (uint32_t));
1440 obstack_grow (&collate->mempool,
1441 &zero, sizeof (uint32_t));
1442 elem->wcs = obstack_finish (&collate->mempool);
1443 elem->nwcs = 1;
1447 elem->file = ldfile->fname;
1448 elem->line = ldfile->lineno;
1449 elem->section = collate->current_section;
1452 /* Enqueue the new element. */
1453 elem->last = collate->cursor;
1454 elem->next = collate->cursor->next;
1455 elem->last->next = elem;
1456 if (elem->next != NULL)
1457 elem->next->last = elem;
1458 collate->cursor = elem;
1460 /* Now add the weights. They come from the `ellipsis_weights'
1461 member of `collate'. */
1462 elem->weights = (struct element_list_t *)
1463 obstack_alloc (&collate->mempool,
1464 nrules * sizeof (struct element_list_t));
1465 for (cnt = 0; cnt < nrules; ++cnt)
1466 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1467 && (collate->ellipsis_weight.weights[cnt].w[0]
1468 == ELEMENT_ELLIPSIS2))
1470 elem->weights[cnt].w = (struct element_t **)
1471 obstack_alloc (&collate->mempool,
1472 sizeof (struct element_t *));
1473 elem->weights[cnt].w[0] = elem;
1474 elem->weights[cnt].cnt = 1;
1476 else
1478 /* Simply use the weight from `ellipsis_weight'. */
1479 elem->weights[cnt].w =
1480 collate->ellipsis_weight.weights[cnt].w;
1481 elem->weights[cnt].cnt =
1482 collate->ellipsis_weight.weights[cnt].cnt;
1490 static void
1491 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1492 struct localedef_t *copy_locale, int ignore_content)
1494 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1496 struct locale_collate_t *collate;
1498 if (copy_locale == NULL)
1500 collate = locale->categories[LC_COLLATE].collate =
1501 (struct locale_collate_t *)
1502 xcalloc (1, sizeof (struct locale_collate_t));
1504 /* Init the various data structures. */
1505 init_hash (&collate->elem_table, 100);
1506 init_hash (&collate->sym_table, 100);
1507 init_hash (&collate->seq_table, 500);
1508 obstack_init (&collate->mempool);
1510 collate->col_weight_max = -1;
1512 else
1513 /* Reuse the copy_locale's data structures. */
1514 collate = locale->categories[LC_COLLATE].collate =
1515 copy_locale->categories[LC_COLLATE].collate;
1518 ldfile->translate_strings = 0;
1519 ldfile->return_widestr = 0;
/* Post-process the LC_COLLATE data of LOCALE after the definition has
   been read completely:
     - record for every element the levels in which it is used as a
       weight and replace references to undefined symbols by UNDEFINED;
     - hand out the per-rule multibyte orders (`mborder'), the wide
       character order (`wcorder') and the sequence orders;
     - build the sorted `mbheads' lists and the `wcheads' table;
     - give UNDEFINED its orders if it was never defined explicitly;
     - assign equal `ruleidx' values to sections with identical rules.
   If LOCALE has no collation data only a diagnostic is printed (unless
   `be_quiet' is set).  CHARMAP is not referenced in this function.  */
1523 void
1524 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1526 /* Now is the time when we can assign the individual collation
1527 values for all the symbols. We have possibly different values
1528 for the wide- and the multibyte-character symbols. This is done
1529 since it might make a difference in the encoding if there is in
1530 some cases no multibyte-character but there are wide-characters.
1531 (The other way around it is not important since the encoded
1532 collation value in the wide-character case is 32 bits wide and
1533 therefore requires no encoding).
1535 The lowest collation value assigned is 2. Zero is reserved for
1536 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1537 functions and 1 is used to separate the individual passes for the
1538 different rules.
1540 We also have to construct a list with all the bytes/words which
1541 can come first in a sequence, followed by all the elements which
1542 also start with this byte/word. The order is reversed which has
1543 among others the important effect that longer strings are located
1544 first in the list. This is required for the output data since
1545 the algorithm used in `strcoll' etc depends on this.
1547 The multibyte case is easy. We simply sort into an array with
1548 256 elements. */
1549 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1550 int mbact[nrules];
1551 int wcact;
1552 int mbseqact;
1553 int wcseqact;
1554 struct element_t *runp;
1555 int i;
1556 int need_undefined = 0;
1557 struct section_list *sect;
1558 int ruleidx;
1559 int nr_wide_elems = 0;
1561 if (collate == NULL)
1563 /* No data, no check. */
1564 if (! be_quiet)
1565 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1566 "LC_COLLATE"));
1567 return;
1570 /* If this assertion is hit change the type in `element_t'. The
   `used_in_level' member is used as a bit set with one bit per rule. */
1571 assert (nrules <= sizeof (runp->used_in_level) * 8);
1573 /* Make sure that the `position' rule is used either in all sections
1574 or in none. Every section is compared against the current section;
   the `break' only leaves the loop over the sections. */
1575 for (i = 0; i < nrules; ++i)
1576 for (sect = collate->sections; sect != NULL; sect = sect->next)
1577 if (sect != collate->current_section
1578 && sect->rules != NULL
1579 && ((sect->rules[i] & sort_position)
1580 != (collate->current_section->rules[i] & sort_position)))
1582 WITH_CUR_LOCALE (error (0, 0, _("\
1583 %s: `position' must be used for a specific level in all sections or none"),
1584 "LC_COLLATE"));
1585 break;
1588 /* Find out which elements are used at which level. At the same
1589 time we find out whether we have any undefined symbols. Only
   elements with a multibyte sequence are examined here. */
1590 runp = collate->start;
1591 while (runp != NULL)
1593 if (runp->mbs != NULL)
1595 for (i = 0; i < nrules; ++i)
1597 int j;
1599 for (j = 0; j < runp->weights[i].cnt; ++j)
1600 /* A NULL pointer as the weight means IGNORE. */
1601 if (runp->weights[i].w[j] != NULL)
1603 if (runp->weights[i].w[j]->weights == NULL)
1605 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1606 runp->line,
1607 _("symbol `%s' not defined"),
1608 runp->weights[i].w[j]->name));
1610 need_undefined = 1;
1611 runp->weights[i].w[j] = &collate->undefined;
1613 else
1614 /* Set the bit for the level. */
1615 runp->weights[i].w[j]->used_in_level |= 1 << i;
1620 /* Up to the next entry. */
1621 runp = runp->next;
1624 /* Walk through the list of defined sequences and assign weights. Also
1625 create the data structure which will allow generating the single byte
1626 character based tables.
1628 Since the weights for one rule are only ever compared to other
1629 weights for the same rule it is possible to
1630 assign more compact weight values than simply counting all
1631 weights in sequence. We can assign weights starting from 2, one
1632 counter for each rule individually and only for those elements
1633 which are actually used for this rule.
1635 Why is this important? It is not for the wide char table. But
1636 it is for the singlebyte output since here larger numbers have to
1637 be encoded to make it possible to emit the value as a byte
1638 string. */
1639 for (i = 0; i < nrules; ++i)
1640 mbact[i] = 2;
1641 wcact = 2;
1642 mbseqact = 0;
1643 wcseqact = 0;
1644 runp = collate->start;
1645 while (runp != NULL)
1647 /* Determine the order. */
1648 if (runp->used_in_level != 0)
1650 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1651 nrules * sizeof (int));
1653 for (i = 0; i < nrules; ++i)
1654 if ((runp->used_in_level & (1 << i)) != 0)
1655 runp->mborder[i] = mbact[i]++;
1656 else
1657 runp->mborder[i] = 0;
1660 if (runp->mbs != NULL)
1662 struct element_t **eptr;
1663 struct element_t *lastp = NULL;
1665 /* Find the point where to insert in the list. The list is selected
   by the first byte of the sequence. */
1666 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1667 while (*eptr != NULL)
1669 if ((*eptr)->nmbs < runp->nmbs)
1670 break;
1672 if ((*eptr)->nmbs == runp->nmbs)
1674 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1676 if (c == 0)
1678 /* This should not happen. It means that we have
1679 two symbols with the same byte sequence. It is
1680 of course an error. */
1681 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1682 (*eptr)->line,
1683 _("\
1684 symbol `%s' has the same encoding as"), (*eptr)->name);
1685 error_at_line (0, 0, runp->file,
1686 runp->line,
1687 _("symbol `%s'"),
1688 runp->name));
1689 goto dont_insert;
1691 else if (c < 0)
1692 /* Insert it here. */
1693 break;
1696 /* To the next entry. */
1697 lastp = *eptr;
1698 eptr = &(*eptr)->mbnext;
1701 /* Set the pointers. */
1702 runp->mbnext = *eptr;
1703 runp->mblast = lastp;
1704 if (*eptr != NULL)
1705 (*eptr)->mblast = runp;
1706 *eptr = runp;
1707 dont_insert:
1711 if (runp->used_in_level)
1713 runp->wcorder = wcact++;
1715 /* We take the opportunity to count the elements which have
1716 wide characters. */
1717 ++nr_wide_elems;
1720 if (runp->is_character)
1722 if (runp->nmbs == 1)
1723 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1725 runp->wcseqorder = wcseqact++;
1727 else if (runp->mbs != NULL && runp->weights != NULL)
1728 /* This is for collation elements. */
1729 runp->wcseqorder = wcseqact++;
1731 /* Up to the next entry. */
1732 runp = runp->next;
1735 /* Find out whether any of the `mbheads' entries is unset. In this
1736 case we use the UNDEFINED entry. Index 0 is not examined. */
1737 for (i = 1; i < 256; ++i)
1738 if (collate->mbheads[i] == NULL)
1740 need_undefined = 1;
1741 collate->mbheads[i] = &collate->undefined;
1744 /* Now to the wide character case. */
1745 collate->wcheads.p = 6;
1746 collate->wcheads.q = 10;
1747 wchead_table_init (&collate->wcheads);
1749 collate->wcseqorder.p = 6;
1750 collate->wcseqorder.q = 10;
1751 collseq_table_init (&collate->wcseqorder);
1753 /* Start adding. */
1754 runp = collate->start;
1755 while (runp != NULL)
1757 if (runp->wcs != NULL)
1759 struct element_t *e;
1760 struct element_t **eptr;
1761 struct element_t *lastp;
1763 /* Insert the collation sequence value. */
1764 if (runp->is_character)
1765 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1766 runp->wcseqorder);
1768 /* Find the point where to insert in the list. E is a local copy
   of the head stored in the table for the first wide character; if
   the new element becomes the head the table entry is rewritten
   below. */
1769 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1770 eptr = &e;
1771 lastp = NULL;
1772 while (*eptr != NULL)
1774 if ((*eptr)->nwcs < runp->nwcs)
1775 break;
1777 if ((*eptr)->nwcs == runp->nwcs)
1779 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1780 (wchar_t *) runp->wcs, runp->nwcs);
1782 if (c == 0)
1784 /* This should not happen. It means that we have
1785 two symbols with the same wide character sequence. It is
1786 of course an error. */
1787 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1788 (*eptr)->line,
1789 _("\
1790 symbol `%s' has the same encoding as"), (*eptr)->name);
1791 error_at_line (0, 0, runp->file,
1792 runp->line,
1793 _("symbol `%s'"),
1794 runp->name));
1795 goto dont_insertwc;
1797 else if (c < 0)
1798 /* Insert it here. */
1799 break;
1802 /* To the next entry. */
1803 lastp = *eptr;
1804 eptr = &(*eptr)->wcnext;
1807 /* Set the pointers. */
1808 runp->wcnext = *eptr;
1809 runp->wclast = lastp;
1810 if (*eptr != NULL)
1811 (*eptr)->wclast = runp;
1812 *eptr = runp;
1813 if (eptr == &e)
1814 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1815 dont_insertwc:
1819 /* Up to the next entry. */
1820 runp = runp->next;
1823 /* Now determine whether the UNDEFINED entry is needed and if yes,
1824 whether it was defined. A NULL `file' member is taken to mean that
   UNDEFINED never appeared in the definition. */
1825 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1826 if (collate->undefined.file == NULL)
1828 if (need_undefined)
1830 /* This seems not to be enforced by recent standards. Don't
1831 emit an error, simply append UNDEFINED at the end. */
1832 if (0)
1833 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1835 /* Add UNDEFINED at the end. */
1836 collate->undefined.mborder =
1837 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1839 for (i = 0; i < nrules; ++i)
1840 collate->undefined.mborder[i] = mbact[i]++;
1843 /* In any case we will need the definition for the wide character
1844 case. But we will not complain that it is missing since the
1845 specification strangely enough does not seem to account for
1846 this. */
1847 collate->undefined.wcorder = wcact++;
1850 /* Finally, try to unify the rules for the sections. Whenever the rules
1851 for a section are the same as those for another section give the
1852 ruleset the same index. Since there are never many sections we can
1853 use an O(n^2) algorithm here. */
1854 sect = collate->sections;
1855 while (sect != NULL && sect->rules == NULL)
1856 sect = sect->next;
1858 /* Bail out if we have no sections because of earlier errors. */
1859 if (sect == NULL)
1861 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1862 _("too many errors; giving up")));
1863 return;
1866 ruleidx = 0;
1869 struct section_list *osect = collate->sections;
/* Look for an earlier section with exactly the same rules. */
1871 while (osect != sect)
1872 if (osect->rules != NULL
1873 && memcmp (osect->rules, sect->rules,
1874 nrules * sizeof (osect->rules[0])) == 0)
1875 break;
1876 else
1877 osect = osect->next;
/* No earlier match: this section starts a new ruleset. */
1879 if (osect == sect)
1880 sect->ruleidx = ruleidx++;
1881 else
1882 sect->ruleidx = osect->ruleidx;
1884 /* Next section. Sections without rules are skipped. */
1886 sect = sect->next;
1887 while (sect != NULL && sect->rules == NULL);
1889 while (sect != NULL);
1890 /* We are currently not prepared for more than 128 rulesets. But this
1891 should never really be a problem. */
1892 assert (ruleidx <= 128);
1896 static int32_t
1897 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1898 struct element_t *elem)
1900 size_t cnt;
1901 int32_t retval;
1903 /* Optimize the use of UNDEFINED. */
1904 if (elem == &collate->undefined)
1905 /* The weights are already inserted. */
1906 return 0;
1908 /* This byte can start exactly one collation element and this is
1909 a single byte. We can directly give the index to the weights. */
1910 retval = obstack_object_size (pool);
1912 /* Construct the weight. */
1913 for (cnt = 0; cnt < nrules; ++cnt)
1915 char buf[elem->weights[cnt].cnt * 7];
1916 int len = 0;
1917 int i;
1919 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1920 /* Encode the weight value. We do nothing for IGNORE entries. */
1921 if (elem->weights[cnt].w[i] != NULL)
1922 len += utf8_encode (&buf[len],
1923 elem->weights[cnt].w[i]->mborder[cnt]);
1925 /* And add the buffer content. */
1926 obstack_1grow (pool, len);
1927 obstack_grow (pool, buf, len);
1930 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1934 static int32_t
1935 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1936 struct element_t *elem)
1938 size_t cnt;
1939 int32_t retval;
1941 /* Optimize the use of UNDEFINED. */
1942 if (elem == &collate->undefined)
1943 /* The weights are already inserted. */
1944 return 0;
1946 /* This byte can start exactly one collation element and this is
1947 a single byte. We can directly give the index to the weights. */
1948 retval = obstack_object_size (pool) / sizeof (int32_t);
1950 /* Construct the weight. */
1951 for (cnt = 0; cnt < nrules; ++cnt)
1953 int32_t buf[elem->weights[cnt].cnt];
1954 int i;
1955 int32_t j;
1957 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1958 if (elem->weights[cnt].w[i] != NULL)
1959 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1961 /* And add the buffer content. */
1962 obstack_int32_grow (pool, j);
1964 obstack_grow (pool, buf, j * sizeof (int32_t));
1965 maybe_swap_uint32_obstack (pool, j);
1968 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1971 /* If localedef is ever threaded, this would need to be a __thread var.
   Context for add_to_tablewc, which only receives a character and an
   element as arguments. collate_output fills in all members before it
   iterates over the wide character heads and zeroes the structure
   afterwards. */
1972 static struct
/* Pool handed to output_weightwc for the weight records. */
1974 struct obstack *weightpool;
/* Pool receiving the index, length and sequence words of the entries. */
1975 struct obstack *extrapool;
/* Pool receiving the weight indices of a series of consecutive entries. */
1976 struct obstack *indpool;
/* Collation data handed to output_weightwc. */
1977 struct locale_collate_t *collate;
/* Table which maps the first wide character to an index. */
1978 struct collidx_table *tablewc;
1979 } atwc;
1981 static void add_to_tablewc (uint32_t ch, struct element_t *runp);
/* Add the entry for wide character CH to the table `atwc.tablewc'.
   RUNP is the head of the list (linked through `wcnext') of elements
   whose sequence starts with CH; collate_output passes this function
   to wchead_table_iterate.

   If the list consists of one element of length one the table gets the
   value returned by output_weightwc directly. Otherwise the table gets
   the negated word offset of the data appended to `atwc.extrapool' and
   every list entry, or series of consecutive entries, is described
   there. All pools are taken from the file-scope variable `atwc'. */
1983 static void
1984 add_to_tablewc (uint32_t ch, struct element_t *runp)
1986 if (runp->wcnext == NULL && runp->nwcs == 1)
1988 int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1989 runp);
1990 collidx_table_add (atwc.tablewc, ch, weigthidx);
1992 else
1994 /* As for the singlebyte table, we recognize sequences and
1995 compress them. */
1997 collidx_table_add (atwc.tablewc, ch,
1998 -(obstack_object_size (atwc.extrapool)
1999 / sizeof (uint32_t)));
2003 /* Store the current index in the weight table. We know that
2004 the current position in the `extrapool' is aligned on a
2005 32-bit address. */
2006 int32_t weightidx;
2007 int added;
2009 /* Find out whether this is a single entry or we have more than
2010 one consecutive entry. Two neighbours are consecutive if they have
   the same length, agree in all but the last wide character and the
   last wide character of this entry is one larger than that of the
   next entry. */
2011 if (runp->wcnext != NULL
2012 && runp->nwcs == runp->wcnext->nwcs
2013 && wmemcmp ((wchar_t *) runp->wcs,
2014 (wchar_t *)runp->wcnext->wcs,
2015 runp->nwcs - 1) == 0
2016 && (runp->wcs[runp->nwcs - 1]
2017 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2019 int i;
2020 struct element_t *series_startp = runp;
2021 struct element_t *curp;
2023 /* Now add first the initial sequence. ADDED covers the index word,
   the length word and two tails of NWCS - 1 words each.
   NOTE(review): room is only reserved when int32_t has the size of
   int, presumably to match obstack_int32_grow_fast -- confirm against
   its definition. */
2024 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2025 if (sizeof (int32_t) == sizeof (int))
2026 obstack_make_room (atwc.extrapool, added);
2028 /* More than one consecutive entry. We mark this by having
2029 a negative index into the indirect table. */
2030 obstack_int32_grow_fast (atwc.extrapool,
2031 -(obstack_object_size (atwc.indpool)
2032 / sizeof (int32_t)));
2033 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
/* Advance RUNP to the last entry of the series. */
2036 runp = runp->wcnext;
2037 while (runp->wcnext != NULL
2038 && runp->nwcs == runp->wcnext->nwcs
2039 && wmemcmp ((wchar_t *) runp->wcs,
2040 (wchar_t *)runp->wcnext->wcs,
2041 runp->nwcs - 1) == 0
2042 && (runp->wcs[runp->nwcs - 1]
2043 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2045 /* Now walk backward from here to the beginning. */
2046 curp = runp;
2048 for (i = 1; i < runp->nwcs; ++i)
2049 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2051 /* Now walk back towards the start of the consecutive sequence and
2052 add all the indices in the indirect pool. */
2055 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2056 curp);
2057 obstack_int32_grow (atwc.indpool, weightidx);
2059 curp = curp->wclast;
2061 while (curp != series_startp);
2063 /* Add the final weight. CURP is the first entry of the series here. */
2064 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2065 curp);
2066 obstack_int32_grow (atwc.indpool, weightidx);
2068 /* And add the end sequence. Without length this
2069 time. */
2070 for (i = 1; i < curp->nwcs; ++i)
2071 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2073 else
2075 /* A single entry. Simply add the index and the length and
2076 string (except for the first character which is already
2077 tested for). */
2078 int i;
2080 /* Output the weight info. */
2081 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2082 runp);
2084 assert (runp->nwcs > 0);
2085 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2086 if (sizeof (int) == sizeof (int32_t))
2087 obstack_make_room (atwc.extrapool, added);
2089 obstack_int32_grow_fast (atwc.extrapool, weightidx);
2090 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2091 for (i = 1; i < runp->nwcs; ++i)
2092 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2095 /* Next entry. */
2096 runp = runp->wcnext;
2098 while (runp != NULL);
2102 void
2103 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2104 const char *output_path)
2106 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2107 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2108 struct locale_file file;
2109 size_t ch;
2110 int32_t tablemb[256];
2111 struct obstack weightpool;
2112 struct obstack extrapool;
2113 struct obstack indirectpool;
2114 struct section_list *sect;
2115 struct collidx_table tablewc;
2116 uint32_t elem_size;
2117 uint32_t *elem_table;
2118 int i;
2119 struct element_t *runp;
2121 init_locale_data (&file, nelems);
2122 add_locale_uint32 (&file, nrules);
2124 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
2125 if (collate == NULL)
2127 size_t idx;
2128 for (idx = 1; idx < nelems; idx++)
2130 /* The words have to be handled specially. */
2131 if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2132 add_locale_uint32 (&file, 0);
2133 else
2134 add_locale_empty (&file);
2136 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2137 return;
2140 obstack_init (&weightpool);
2141 obstack_init (&extrapool);
2142 obstack_init (&indirectpool);
2144 /* Since we are using the sign of an integer to mark indirection the
2145 offsets in the arrays we are indirectly referring to must not be
2146 zero since -0 == 0. Therefore we add a bit of dummy content. */
2147 obstack_int32_grow (&extrapool, 0);
2148 obstack_int32_grow (&indirectpool, 0);
2150 /* Prepare the ruleset table. */
2151 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2152 if (sect->rules != NULL && sect->ruleidx == i)
2154 int j;
2156 obstack_make_room (&weightpool, nrules);
2158 for (j = 0; j < nrules; ++j)
2159 obstack_1grow_fast (&weightpool, sect->rules[j]);
2160 ++i;
2162 /* And align the output. */
2163 i = (nrules * i) % LOCFILE_ALIGN;
2164 if (i > 0)
2166 obstack_1grow (&weightpool, '\0');
2167 while (++i < LOCFILE_ALIGN);
2169 add_locale_raw_obstack (&file, &weightpool);
2171 /* Generate the 8-bit table. Walk through the lists of sequences
2172 starting with the same byte and add them one after the other to
2173 the table. In case we have more than one sequence starting with
2174 the same byte we have to use extra indirection.
2176 First add a record for the NUL byte. This entry will never be used
2177 so it does not matter. */
2178 tablemb[0] = 0;
2180 /* Now insert the `UNDEFINED' value if it is used. Since this value
2181 will probably be used more than once it is good to store the
2182 weights only once. */
2183 if (collate->undefined.used_in_level != 0)
2184 output_weight (&weightpool, collate, &collate->undefined);
2186 for (ch = 1; ch < 256; ++ch)
2187 if (collate->mbheads[ch]->mbnext == NULL
2188 && collate->mbheads[ch]->nmbs <= 1)
2190 tablemb[ch] = output_weight (&weightpool, collate,
2191 collate->mbheads[ch]);
2193 else
2195 /* The entries in the list are sorted by length and then
2196 alphabetically. This is the order in which we will add the
2197 elements to the collation table. This allows simply walking
2198 the table in sequence and stopping at the first matching
2199 entry. Since the longer sequences are coming first in the
2200 list they have the possibility to match first, just as it
2201 has to be. In the worst case we are walking to the end of
2202 the list where we put, if no singlebyte sequence is defined
2203 in the locale definition, the weights for UNDEFINED.
2205 To reduce the length of the search list we compress them a bit.
2206 This happens by collecting sequences of consecutive byte
2207 sequences in one entry (having and begin and end byte sequence)
2208 and add only one index into the weight table. We can find the
2209 consecutive entries since they are also consecutive in the list. */
2210 struct element_t *runp = collate->mbheads[ch];
2211 struct element_t *lastp;
2213 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2215 tablemb[ch] = -obstack_object_size (&extrapool);
2219 /* Store the current index in the weight table. We know that
2220 the current position in the `extrapool' is aligned on a
2221 32-bit address. */
2222 int32_t weightidx;
2223 int added;
2225 /* Find out whether this is a single entry or we have more than
2226 one consecutive entry. */
2227 if (runp->mbnext != NULL
2228 && runp->nmbs == runp->mbnext->nmbs
2229 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2230 && (runp->mbs[runp->nmbs - 1]
2231 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2233 int i;
2234 struct element_t *series_startp = runp;
2235 struct element_t *curp;
2237 /* Compute how much space we will need. */
2238 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2239 + 2 * (runp->nmbs - 1));
2240 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2241 obstack_make_room (&extrapool, added);
2243 /* More than one consecutive entry. We mark this by having
2244 a negative index into the indirect table. */
2245 obstack_int32_grow_fast (&extrapool,
2246 -(obstack_object_size (&indirectpool)
2247 / sizeof (int32_t)));
2249 /* Now search first the end of the series. */
2251 runp = runp->mbnext;
2252 while (runp->mbnext != NULL
2253 && runp->nmbs == runp->mbnext->nmbs
2254 && memcmp (runp->mbs, runp->mbnext->mbs,
2255 runp->nmbs - 1) == 0
2256 && (runp->mbs[runp->nmbs - 1]
2257 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2259 /* Now walk backward from here to the beginning. */
2260 curp = runp;
2262 assert (runp->nmbs <= 256);
2263 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2264 for (i = 1; i < curp->nmbs; ++i)
2265 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2267 /* Now find the end of the consecutive sequence and
2268 add all the indices in the indirect pool. */
2271 weightidx = output_weight (&weightpool, collate, curp);
2272 obstack_int32_grow (&indirectpool, weightidx);
2274 curp = curp->mblast;
2276 while (curp != series_startp);
2278 /* Add the final weight. */
2279 weightidx = output_weight (&weightpool, collate, curp);
2280 obstack_int32_grow (&indirectpool, weightidx);
2282 /* And add the end byte sequence. Without length this
2283 time. */
2284 for (i = 1; i < curp->nmbs; ++i)
2285 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2287 else
2289 /* A single entry. Simply add the index and the length and
2290 string (except for the first character which is already
2291 tested for). */
2292 int i;
2294 /* Output the weight info. */
2295 weightidx = output_weight (&weightpool, collate, runp);
2297 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2298 + runp->nmbs - 1);
2299 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2300 obstack_make_room (&extrapool, added);
2302 obstack_int32_grow_fast (&extrapool, weightidx);
2303 assert (runp->nmbs <= 256);
2304 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2306 for (i = 1; i < runp->nmbs; ++i)
2307 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2310 /* Add alignment bytes if necessary. */
2311 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2312 obstack_1grow_fast (&extrapool, '\0');
2314 /* Next entry. */
2315 lastp = runp;
2316 runp = runp->mbnext;
2318 while (runp != NULL);
2320 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2322 /* If the final entry in the list is not a single character we
2323 add an UNDEFINED entry here. */
2324 if (lastp->nmbs != 1)
2326 int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
2327 obstack_make_room (&extrapool, added);
2329 obstack_int32_grow_fast (&extrapool, 0);
2330 /* XXX What rule? We just pick the first. */
2331 obstack_1grow_fast (&extrapool, 0);
2332 /* Length is zero. */
2333 obstack_1grow_fast (&extrapool, 0);
2335 /* Add alignment bytes if necessary. */
2336 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2337 obstack_1grow_fast (&extrapool, '\0');
2341 /* Add padding to the tables if necessary. */
2342 while (!LOCFILE_ALIGNED_P (obstack_object_size (&weightpool)))
2343 obstack_1grow (&weightpool, 0);
2345 /* Now add the four tables. */
2346 add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256);
2347 add_locale_raw_obstack (&file, &weightpool);
2348 add_locale_raw_obstack (&file, &extrapool);
2349 add_locale_raw_obstack (&file, &indirectpool);
2351 /* Now the same for the wide character table. We need to store some
2352 more information here. */
2353 add_locale_empty (&file);
2354 add_locale_empty (&file);
2355 add_locale_empty (&file);
2357 /* Since we are using the sign of an integer to mark indirection the
2358 offsets in the arrays we are indirectly referring to must not be
2359 zero since -0 == 0. Therefore we add a bit of dummy content. */
2360 obstack_int32_grow (&extrapool, 0);
2361 obstack_int32_grow (&indirectpool, 0);
2363 /* Now insert the `UNDEFINED' value if it is used. Since this value
2364 will probably be used more than once it is good to store the
2365 weights only once. */
2366 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2367 abort ();
2369 /* Generate the table. Walk through the lists of sequences starting
2370 with the same wide character and add them one after the other to
2371 the table. In case we have more than one sequence starting with
2372 the same byte we have to use extra indirection. */
2373 tablewc.p = 6;
2374 tablewc.q = 10;
2375 collidx_table_init (&tablewc);
2377 atwc.weightpool = &weightpool;
2378 atwc.extrapool = &extrapool;
2379 atwc.indpool = &indirectpool;
2380 atwc.collate = collate;
2381 atwc.tablewc = &tablewc;
2383 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2385 memset (&atwc, 0, sizeof (atwc));
2387 /* Now add the four tables. */
2388 add_locale_collidx_table (&file, &tablewc);
2389 add_locale_raw_obstack (&file, &weightpool);
2390 add_locale_raw_obstack (&file, &extrapool);
2391 add_locale_raw_obstack (&file, &indirectpool);
2393 /* Finally write the table with collation element names out. It is
2394 a hash table with a simple function which gets the name of the
2395 character as the input. One character might have many names. The
2396 value associated with the name is an index into the weight table
2397 where we are then interested in the first-level weight value.
2399 To determine how large the table should be we are counting the
2400 elements we have to put in. Since we are using internal chaining
2401 using a secondary hash function we have to make the table a bit
2402 larger to avoid extremely long search times. We can achieve
2403 good results with a 40% larger table than there are entries. */
2404 elem_size = 0;
2405 runp = collate->start;
2406 while (runp != NULL)
2408 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2409 /* Yep, the element really counts. */
2410 ++elem_size;
2412 runp = runp->next;
2414 /* Add 40% and find the next prime number. */
2415 elem_size = next_prime (elem_size * 1.4);
2417 /* Allocate the table. Each entry consists of two words: the hash
2418 value and an index in a secondary table which provides the index
2419 into the weight table and the string itself (so that a match can
2420 be determined). */
2421 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2422 elem_size * 2 * sizeof (uint32_t));
2423 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2425 /* Now add the elements. */
2426 runp = collate->start;
2427 while (runp != NULL)
2429 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2431 /* Compute the hash value of the name. */
2432 uint32_t namelen = strlen (runp->name);
2433 uint32_t hash = elem_hash (runp->name, namelen);
2434 size_t idx = hash % elem_size;
2435 #ifndef NDEBUG
2436 size_t start_idx = idx;
2437 #endif
2439 if (elem_table[idx * 2] != 0)
2441 /* The spot is already taken. Try iterating using the value
2442 from the secondary hashing function. */
2443 size_t iter = hash % (elem_size - 2) + 1;
2447 idx += iter;
2448 if (idx >= elem_size)
2449 idx -= elem_size;
2450 assert (idx != start_idx);
2452 while (elem_table[idx * 2] != 0);
2454 /* This is the spot where we will insert the value. */
2455 elem_table[idx * 2] = hash;
2456 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2458 /* The string itself including length. */
2459 obstack_1grow (&extrapool, namelen);
2460 obstack_grow (&extrapool, runp->name, namelen);
2462 /* And the multibyte representation. */
2463 obstack_1grow (&extrapool, runp->nmbs);
2464 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2466 /* And align again to 32 bits. */
2467 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2468 obstack_grow (&extrapool, "\0\0",
2469 (sizeof (int32_t)
2470 - ((1 + namelen + 1 + runp->nmbs)
2471 % sizeof (int32_t))));
2473 /* Now some 32-bit values: multibyte collation sequence,
2474 wide char string (including length), and wide char
2475 collation sequence. */
2476 obstack_int32_grow (&extrapool, runp->mbseqorder);
2478 obstack_int32_grow (&extrapool, runp->nwcs);
2479 obstack_grow (&extrapool, runp->wcs,
2480 runp->nwcs * sizeof (uint32_t));
2481 maybe_swap_uint32_obstack (&extrapool, runp->nwcs);
2483 obstack_int32_grow (&extrapool, runp->wcseqorder);
2486 runp = runp->next;
2489 /* Prepare to write out this data. */
2490 add_locale_uint32 (&file, elem_size);
2491 add_locale_uint32_array (&file, elem_table, 2 * elem_size);
2492 add_locale_raw_obstack (&file, &extrapool);
2493 add_locale_raw_data (&file, collate->mbseqorder, 256);
2494 add_locale_collseq_table (&file, &collate->wcseqorder);
2495 add_locale_string (&file, charmap->code_set_name);
2496 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2498 obstack_free (&weightpool, NULL);
2499 obstack_free (&extrapool, NULL);
2500 obstack_free (&indirectpool, NULL);
2504 static enum token_t
2505 skip_to (struct linereader *ldfile, struct locale_collate_t *collate,
2506 const struct charmap_t *charmap, int to_endif)
2508 while (1)
2510 struct token *now = lr_token (ldfile, charmap, NULL, NULL, 0);
2511 enum token_t nowtok = now->tok;
2513 if (nowtok == tok_eof || nowtok == tok_end)
2514 return nowtok;
2516 if (nowtok == tok_ifdef || nowtok == tok_ifndef)
2518 lr_error (ldfile, _("%s: nested conditionals not supported"),
2519 "LC_COLLATE");
2520 nowtok = skip_to (ldfile, collate, charmap, tok_endif);
2521 if (nowtok == tok_eof || nowtok == tok_end)
2522 return nowtok;
2524 else if (nowtok == tok_endif || (!to_endif && nowtok == tok_else))
2526 lr_ignore_rest (ldfile, 1);
2527 return nowtok;
2529 else if (!to_endif && (nowtok == tok_elifdef || nowtok == tok_elifndef))
2531 /* Do not read the rest of the line. */
2532 return nowtok;
2534 else if (nowtok == tok_else)
2536 lr_error (ldfile, _("%s: more than one 'else'"), "LC_COLLATE");
2539 lr_ignore_rest (ldfile, 0);
2544 void
2545 collate_read (struct linereader *ldfile, struct localedef_t *result,
2546 const struct charmap_t *charmap, const char *repertoire_name,
2547 int ignore_content)
2549 struct repertoire_t *repertoire = NULL;
2550 struct locale_collate_t *collate;
2551 struct token *now;
2552 struct token *arg = NULL;
2553 enum token_t nowtok;
2554 enum token_t was_ellipsis = tok_none;
2555 struct localedef_t *copy_locale = NULL;
2556 /* Parsing state:
2557 0 - start
2558 1 - between `order-start' and `order-end'
2559 2 - after `order-end'
2560 3 - after `reorder-after', waiting for `reorder-end'
2561 4 - after `reorder-end'
2562 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2563 6 - after `reorder-sections-end'
2565 int state = 0;
2567 /* Get the repertoire we have to use. */
2568 if (repertoire_name != NULL)
2569 repertoire = repertoire_read (repertoire_name);
2571 /* The rest of the line containing `LC_COLLATE' must be free. */
2572 lr_ignore_rest (ldfile, 1);
2574 while (1)
2578 now = lr_token (ldfile, charmap, result, NULL, verbose);
2579 nowtok = now->tok;
2581 while (nowtok == tok_eol);
2583 if (nowtok != tok_define)
2584 break;
2586 if (ignore_content)
2587 lr_ignore_rest (ldfile, 0);
2588 else
2590 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2591 if (arg->tok != tok_ident)
2592 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2593 else
2595 /* Simply add the new symbol. */
2596 struct name_list *newsym = xmalloc (sizeof (*newsym)
2597 + arg->val.str.lenmb + 1);
2598 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
2599 newsym->str[arg->val.str.lenmb] = '\0';
2600 newsym->next = defined;
2601 defined = newsym;
2603 lr_ignore_rest (ldfile, 1);
2608 if (nowtok == tok_copy)
2610 now = lr_token (ldfile, charmap, result, NULL, verbose);
2611 if (now->tok != tok_string)
2613 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2615 skip_category:
2617 now = lr_token (ldfile, charmap, result, NULL, verbose);
2618 while (now->tok != tok_eof && now->tok != tok_end);
2620 if (now->tok != tok_eof
2621 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2622 now->tok == tok_eof))
2623 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2624 else if (now->tok != tok_lc_collate)
2626 lr_error (ldfile, _("\
2627 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2628 lr_ignore_rest (ldfile, 0);
2630 else
2631 lr_ignore_rest (ldfile, 1);
2633 return;
2636 if (! ignore_content)
2638 /* Get the locale definition. */
2639 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2640 repertoire_name, charmap, NULL);
2641 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2643 /* Not yet loaded. So do it now. */
2644 if (locfile_read (copy_locale, charmap) != 0)
2645 goto skip_category;
2648 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2649 return;
2652 lr_ignore_rest (ldfile, 1);
2654 now = lr_token (ldfile, charmap, result, NULL, verbose);
2655 nowtok = now->tok;
2658 /* Prepare the data structures. */
2659 collate_startup (ldfile, result, copy_locale, ignore_content);
2660 collate = result->categories[LC_COLLATE].collate;
2662 while (1)
2664 char ucs4buf[10];
2665 char *symstr;
2666 size_t symlen;
2668 /* Of course we don't proceed beyond the end of file. */
2669 if (nowtok == tok_eof)
2670 break;
2672 /* Ignore empty lines. */
2673 if (nowtok == tok_eol)
2675 now = lr_token (ldfile, charmap, result, NULL, verbose);
2676 nowtok = now->tok;
2677 continue;
2680 switch (nowtok)
2682 case tok_copy:
2683 /* Allow copying other locales. */
2684 now = lr_token (ldfile, charmap, result, NULL, verbose);
2685 if (now->tok != tok_string)
2686 goto err_label;
2688 if (! ignore_content)
2689 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2690 charmap, result);
2692 lr_ignore_rest (ldfile, 1);
2693 break;
2695 case tok_coll_weight_max:
2696 /* Ignore the rest of the line if we don't need the input of
2697 this line. */
2698 if (ignore_content)
2700 lr_ignore_rest (ldfile, 0);
2701 break;
2704 if (state != 0)
2705 goto err_label;
2707 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2708 if (arg->tok != tok_number)
2709 goto err_label;
2710 if (collate->col_weight_max != -1)
2711 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2712 "LC_COLLATE", "col_weight_max");
2713 else
2714 collate->col_weight_max = arg->val.num;
2715 lr_ignore_rest (ldfile, 1);
2716 break;
2718 case tok_section_symbol:
2719 /* Ignore the rest of the line if we don't need the input of
2720 this line. */
2721 if (ignore_content)
2723 lr_ignore_rest (ldfile, 0);
2724 break;
2727 if (state != 0)
2728 goto err_label;
2730 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2731 if (arg->tok != tok_bsymbol)
2732 goto err_label;
2733 else if (!ignore_content)
2735 /* Check whether this section is already known. */
2736 struct section_list *known = collate->sections;
2737 while (known != NULL)
2739 if (strcmp (known->name, arg->val.str.startmb) == 0)
2740 break;
2741 known = known->next;
2744 if (known != NULL)
2746 lr_error (ldfile,
2747 _("%s: duplicate declaration of section `%s'"),
2748 "LC_COLLATE", arg->val.str.startmb);
2749 free (arg->val.str.startmb);
2751 else
2752 collate->sections = make_seclist_elem (collate,
2753 arg->val.str.startmb,
2754 collate->sections);
2756 lr_ignore_rest (ldfile, known == NULL);
2758 else
2760 free (arg->val.str.startmb);
2761 lr_ignore_rest (ldfile, 0);
2763 break;
2765 case tok_collating_element:
2766 /* Ignore the rest of the line if we don't need the input of
2767 this line. */
2768 if (ignore_content)
2770 lr_ignore_rest (ldfile, 0);
2771 break;
2774 if (state != 0 && state != 2)
2775 goto err_label;
2777 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2778 if (arg->tok != tok_bsymbol)
2779 goto err_label;
2780 else
2782 const char *symbol = arg->val.str.startmb;
2783 size_t symbol_len = arg->val.str.lenmb;
2785 /* Next the `from' keyword. */
2786 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2787 if (arg->tok != tok_from)
2789 free ((char *) symbol);
2790 goto err_label;
2793 ldfile->return_widestr = 1;
2794 ldfile->translate_strings = 1;
2796 /* Finally the string with the replacement. */
2797 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2799 ldfile->return_widestr = 0;
2800 ldfile->translate_strings = 0;
2802 if (arg->tok != tok_string)
2803 goto err_label;
2805 if (!ignore_content && symbol != NULL)
2807 /* The name is already defined. */
2808 if (check_duplicate (ldfile, collate, charmap,
2809 repertoire, symbol, symbol_len))
2810 goto col_elem_free;
2812 if (arg->val.str.startmb != NULL)
2813 insert_entry (&collate->elem_table, symbol, symbol_len,
2814 new_element (collate,
2815 arg->val.str.startmb,
2816 arg->val.str.lenmb - 1,
2817 arg->val.str.startwc,
2818 symbol, symbol_len, 0));
2820 else
2822 col_elem_free:
2823 free ((char *) symbol);
2824 free (arg->val.str.startmb);
2825 free (arg->val.str.startwc);
2827 lr_ignore_rest (ldfile, 1);
2829 break;
2831 case tok_collating_symbol:
2832 /* Ignore the rest of the line if we don't need the input of
2833 this line. */
2834 if (ignore_content)
2836 lr_ignore_rest (ldfile, 0);
2837 break;
2840 if (state != 0 && state != 2)
2841 goto err_label;
2843 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2844 if (arg->tok != tok_bsymbol)
2845 goto err_label;
2846 else
2848 char *symbol = arg->val.str.startmb;
2849 size_t symbol_len = arg->val.str.lenmb;
2850 char *endsymbol = NULL;
2851 size_t endsymbol_len = 0;
2852 enum token_t ellipsis = tok_none;
2854 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2855 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2857 ellipsis = arg->tok;
2859 arg = lr_token (ldfile, charmap, result, repertoire,
2860 verbose);
2861 if (arg->tok != tok_bsymbol)
2863 free (symbol);
2864 goto err_label;
2867 endsymbol = arg->val.str.startmb;
2868 endsymbol_len = arg->val.str.lenmb;
2870 lr_ignore_rest (ldfile, 1);
2872 else if (arg->tok != tok_eol)
2874 free (symbol);
2875 goto err_label;
2878 if (!ignore_content)
2880 if (symbol == NULL
2881 || (ellipsis != tok_none && endsymbol == NULL))
2883 lr_error (ldfile, _("\
2884 %s: unknown character in collating symbol name"),
2885 "LC_COLLATE");
2886 goto col_sym_free;
2888 else if (ellipsis == tok_none)
2890 /* A single symbol, no ellipsis. */
2891 if (check_duplicate (ldfile, collate, charmap,
2892 repertoire, symbol, symbol_len))
2893 /* The name is already defined. */
2894 goto col_sym_free;
2896 insert_entry (&collate->sym_table, symbol, symbol_len,
2897 new_symbol (collate, symbol, symbol_len));
2899 else if (symbol_len != endsymbol_len)
2901 col_sym_inv_range:
2902 lr_error (ldfile,
2903 _("invalid names for character range"));
2904 goto col_sym_free;
2906 else
2908 /* Oh my, we have to handle an ellipsis. First, as
2909 usual, determine the common prefix and then
2910 convert the rest into a range. */
2911 size_t prefixlen;
2912 unsigned long int from;
2913 unsigned long int to;
2914 char *endp;
2916 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2917 if (symbol[prefixlen] != endsymbol[prefixlen])
2918 break;
2920 /* Convert the rest into numbers. */
2921 symbol[symbol_len] = '\0';
2922 from = strtoul (&symbol[prefixlen], &endp,
2923 ellipsis == tok_ellipsis2 ? 16 : 10);
2924 if (*endp != '\0')
2925 goto col_sym_inv_range;
2927 endsymbol[symbol_len] = '\0';
2928 to = strtoul (&endsymbol[prefixlen], &endp,
2929 ellipsis == tok_ellipsis2 ? 16 : 10);
2930 if (*endp != '\0')
2931 goto col_sym_inv_range;
2933 if (from > to)
2934 goto col_sym_inv_range;
2936 /* Now loop over all entries. */
2937 while (from <= to)
2939 char *symbuf;
2941 symbuf = (char *) obstack_alloc (&collate->mempool,
2942 symbol_len + 1);
2944 /* Create the name. */
2945 sprintf (symbuf,
2946 ellipsis == tok_ellipsis2
2947 ? "%.*s%.*lX" : "%.*s%.*lu",
2948 (int) prefixlen, symbol,
2949 (int) (symbol_len - prefixlen), from);
2951 if (check_duplicate (ldfile, collate, charmap,
2952 repertoire, symbuf, symbol_len))
2953 /* The name is already defined. */
2954 goto col_sym_free;
2956 insert_entry (&collate->sym_table, symbuf,
2957 symbol_len,
2958 new_symbol (collate, symbuf,
2959 symbol_len));
2961 /* Increment the counter. */
2962 ++from;
2965 goto col_sym_free;
2968 else
2970 col_sym_free:
2971 free (symbol);
2972 free (endsymbol);
2975 break;
2977 case tok_symbol_equivalence:
2978 /* Ignore the rest of the line if we don't need the input of
2979 this line. */
2980 if (ignore_content)
2982 lr_ignore_rest (ldfile, 0);
2983 break;
2986 if (state != 0)
2987 goto err_label;
2989 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2990 if (arg->tok != tok_bsymbol)
2991 goto err_label;
2992 else
2994 const char *newname = arg->val.str.startmb;
2995 size_t newname_len = arg->val.str.lenmb;
2996 const char *symname;
2997 size_t symname_len;
2998 void *symval; /* Actually struct symbol_t* */
3000 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3001 if (arg->tok != tok_bsymbol)
3003 free ((char *) newname);
3004 goto err_label;
3007 symname = arg->val.str.startmb;
3008 symname_len = arg->val.str.lenmb;
3010 if (newname == NULL)
3012 lr_error (ldfile, _("\
3013 %s: unknown character in equivalent definition name"),
3014 "LC_COLLATE");
3016 sym_equiv_free:
3017 free ((char *) newname);
3018 free ((char *) symname);
3019 break;
3021 if (symname == NULL)
3023 lr_error (ldfile, _("\
3024 %s: unknown character in equivalent definition value"),
3025 "LC_COLLATE");
3026 goto sym_equiv_free;
3029 /* See whether the symbol name is already defined. */
3030 if (find_entry (&collate->sym_table, symname, symname_len,
3031 &symval) != 0)
3033 lr_error (ldfile, _("\
3034 %s: unknown symbol `%s' in equivalent definition"),
3035 "LC_COLLATE", symname);
3036 goto sym_equiv_free;
3039 if (insert_entry (&collate->sym_table,
3040 newname, newname_len, symval) < 0)
3042 lr_error (ldfile, _("\
3043 error while adding equivalent collating symbol"));
3044 goto sym_equiv_free;
3047 free ((char *) symname);
3049 lr_ignore_rest (ldfile, 1);
3050 break;
3052 case tok_script:
3053 /* Ignore the rest of the line if we don't need the input of
3054 this line. */
3055 if (ignore_content)
3057 lr_ignore_rest (ldfile, 0);
3058 break;
3061 /* We get told about the scripts we know. */
3062 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3063 if (arg->tok != tok_bsymbol)
3064 goto err_label;
3065 else
3067 struct section_list *runp = collate->known_sections;
3068 char *name;
3070 while (runp != NULL)
3071 if (strncmp (runp->name, arg->val.str.startmb,
3072 arg->val.str.lenmb) == 0
3073 && runp->name[arg->val.str.lenmb] == '\0')
3074 break;
3075 else
3076 runp = runp->def_next;
3078 if (runp != NULL)
3080 lr_error (ldfile, _("duplicate definition of script `%s'"),
3081 runp->name);
3082 lr_ignore_rest (ldfile, 0);
3083 break;
3086 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3087 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3088 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3089 name[arg->val.str.lenmb] = '\0';
3090 runp->name = name;
3092 runp->def_next = collate->known_sections;
3093 collate->known_sections = runp;
3095 lr_ignore_rest (ldfile, 1);
3096 break;
3098 case tok_order_start:
3099 /* Ignore the rest of the line if we don't need the input of
3100 this line. */
3101 if (ignore_content)
3103 lr_ignore_rest (ldfile, 0);
3104 break;
3107 if (state != 0 && state != 1 && state != 2)
3108 goto err_label;
3109 state = 1;
3111 /* The 14652 draft does not specify whether all `order_start' lines
3112 must contain the same number of sort-rules, but 14651 does. So
3113 we require this here as well. */
3114 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3115 if (arg->tok == tok_bsymbol)
3117 /* This better should be a section name. */
3118 struct section_list *sp = collate->known_sections;
3119 while (sp != NULL
3120 && (sp->name == NULL
3121 || strncmp (sp->name, arg->val.str.startmb,
3122 arg->val.str.lenmb) != 0
3123 || sp->name[arg->val.str.lenmb] != '\0'))
3124 sp = sp->def_next;
3126 if (sp == NULL)
3128 lr_error (ldfile, _("\
3129 %s: unknown section name `%.*s'"),
3130 "LC_COLLATE", (int) arg->val.str.lenmb,
3131 arg->val.str.startmb);
3132 /* We use the error section. */
3133 collate->current_section = &collate->error_section;
3135 if (collate->error_section.first == NULL)
3137 /* Insert &collate->error_section at the end of
3138 the collate->sections list. */
3139 if (collate->sections == NULL)
3140 collate->sections = &collate->error_section;
3141 else
3143 sp = collate->sections;
3144 while (sp->next != NULL)
3145 sp = sp->next;
3147 sp->next = &collate->error_section;
3149 collate->error_section.next = NULL;
3152 else
3154 /* One should not be allowed to open the same
3155 section twice. */
3156 if (sp->first != NULL)
3157 lr_error (ldfile, _("\
3158 %s: multiple order definitions for section `%s'"),
3159 "LC_COLLATE", sp->name);
3160 else
3162 /* Insert sp in the collate->sections list,
3163 right after collate->current_section. */
3164 if (collate->current_section != NULL)
3166 sp->next = collate->current_section->next;
3167 collate->current_section->next = sp;
3169 else if (collate->sections == NULL)
3170 /* This is the first section to be defined. */
3171 collate->sections = sp;
3173 collate->current_section = sp;
3176 /* Next should come the end of the line or a semicolon. */
3177 arg = lr_token (ldfile, charmap, result, repertoire,
3178 verbose);
3179 if (arg->tok == tok_eol)
3181 uint32_t cnt;
3183 /* This means we have exactly one rule: `forward'. */
3184 if (nrules > 1)
3185 lr_error (ldfile, _("\
3186 %s: invalid number of sorting rules"),
3187 "LC_COLLATE");
3188 else
3189 nrules = 1;
3190 sp->rules = obstack_alloc (&collate->mempool,
3191 (sizeof (enum coll_sort_rule)
3192 * nrules));
3193 for (cnt = 0; cnt < nrules; ++cnt)
3194 sp->rules[cnt] = sort_forward;
3196 /* Next line. */
3197 break;
3200 /* Get the next token. */
3201 arg = lr_token (ldfile, charmap, result, repertoire,
3202 verbose);
3205 else
3207 /* There is no section symbol. Therefore we use the unnamed
3208 section. */
3209 collate->current_section = &collate->unnamed_section;
3211 if (collate->unnamed_section_defined)
3212 lr_error (ldfile, _("\
3213 %s: multiple order definitions for unnamed section"),
3214 "LC_COLLATE");
3215 else
3217 /* Insert &collate->unnamed_section at the beginning of
3218 the collate->sections list. */
3219 collate->unnamed_section.next = collate->sections;
3220 collate->sections = &collate->unnamed_section;
3221 collate->unnamed_section_defined = true;
3225 /* Now read the direction names. */
3226 read_directions (ldfile, arg, charmap, repertoire, result);
3228 /* From now we need the strings untranslated. */
3229 ldfile->translate_strings = 0;
3230 break;
3232 case tok_order_end:
3233 /* Ignore the rest of the line if we don't need the input of
3234 this line. */
3235 if (ignore_content)
3237 lr_ignore_rest (ldfile, 0);
3238 break;
3241 if (state != 1)
3242 goto err_label;
3244 /* Handle ellipsis at end of list. */
3245 if (was_ellipsis != tok_none)
3247 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3248 repertoire, result);
3249 was_ellipsis = tok_none;
3252 state = 2;
3253 lr_ignore_rest (ldfile, 1);
3254 break;
3256 case tok_reorder_after:
3257 /* Ignore the rest of the line if we don't need the input of
3258 this line. */
3259 if (ignore_content)
3261 lr_ignore_rest (ldfile, 0);
3262 break;
3265 if (state == 1)
3267 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3268 "LC_COLLATE");
3269 state = 2;
3271 /* Handle ellipsis at end of list. */
3272 if (was_ellipsis != tok_none)
3274 handle_ellipsis (ldfile, arg->val.str.startmb,
3275 arg->val.str.lenmb, was_ellipsis, charmap,
3276 repertoire, result);
3277 was_ellipsis = tok_none;
3280 else if (state == 0 && copy_locale == NULL)
3281 goto err_label;
3282 else if (state != 0 && state != 2 && state != 3)
3283 goto err_label;
3284 state = 3;
3286 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3287 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3289 /* Find this symbol in the sequence table. */
3290 char ucsbuf[10];
3291 char *startmb;
3292 size_t lenmb;
3293 struct element_t *insp;
3294 int no_error = 1;
3295 void *ptr;
3297 if (arg->tok == tok_bsymbol)
3299 startmb = arg->val.str.startmb;
3300 lenmb = arg->val.str.lenmb;
3302 else
3304 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3305 startmb = ucsbuf;
3306 lenmb = 9;
3309 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3310 /* Yes, the symbol exists. Simply point the cursor
3311 to it. */
3312 collate->cursor = (struct element_t *) ptr;
3313 else
3315 struct symbol_t *symbp;
3316 void *ptr;
3318 if (find_entry (&collate->sym_table, startmb, lenmb,
3319 &ptr) == 0)
3321 symbp = ptr;
3323 if (symbp->order->last != NULL
3324 || symbp->order->next != NULL)
3325 collate->cursor = symbp->order;
3326 else
3328 /* This is a collating symbol but its position
3329 is not yet defined. */
3330 lr_error (ldfile, _("\
3331 %s: order for collating symbol %.*s not yet defined"),
3332 "LC_COLLATE", (int) lenmb, startmb);
3333 collate->cursor = NULL;
3334 no_error = 0;
3337 else if (find_entry (&collate->elem_table, startmb, lenmb,
3338 &ptr) == 0)
3340 insp = (struct element_t *) ptr;
3342 if (insp->last != NULL || insp->next != NULL)
3343 collate->cursor = insp;
3344 else
3346 /* This is a collating element but its position
3347 is not yet defined. */
3348 lr_error (ldfile, _("\
3349 %s: order for collating element %.*s not yet defined"),
3350 "LC_COLLATE", (int) lenmb, startmb);
3351 collate->cursor = NULL;
3352 no_error = 0;
3355 else
3357 /* This is bad. The symbol after which we have to
3358 insert does not exist. */
3359 lr_error (ldfile, _("\
3360 %s: cannot reorder after %.*s: symbol not known"),
3361 "LC_COLLATE", (int) lenmb, startmb);
3362 collate->cursor = NULL;
3363 no_error = 0;
3367 lr_ignore_rest (ldfile, no_error);
3369 else
3370 /* This must not happen. */
3371 goto err_label;
3372 break;
3374 case tok_reorder_end:
3375 /* Ignore the rest of the line if we don't need the input of
3376 this line. */
3377 if (ignore_content)
3378 break;
3380 if (state != 3)
3381 goto err_label;
3382 state = 4;
3383 lr_ignore_rest (ldfile, 1);
3384 break;
3386 case tok_reorder_sections_after:
3387 /* Ignore the rest of the line if we don't need the input of
3388 this line. */
3389 if (ignore_content)
3391 lr_ignore_rest (ldfile, 0);
3392 break;
3395 if (state == 1)
3397 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3398 "LC_COLLATE");
3399 state = 2;
3401 /* Handle ellipsis at end of list. */
3402 if (was_ellipsis != tok_none)
3404 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3405 repertoire, result);
3406 was_ellipsis = tok_none;
3409 else if (state == 3)
3411 WITH_CUR_LOCALE (error (0, 0, _("\
3412 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3413 state = 4;
3415 else if (state != 2 && state != 4)
3416 goto err_label;
3417 state = 5;
3419 /* Get the name of the sections we are adding after. */
3420 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3421 if (arg->tok == tok_bsymbol)
3423 /* Now find a section with this name. */
3424 struct section_list *runp = collate->sections;
3426 while (runp != NULL)
3428 if (runp->name != NULL
3429 && strlen (runp->name) == arg->val.str.lenmb
3430 && memcmp (runp->name, arg->val.str.startmb,
3431 arg->val.str.lenmb) == 0)
3432 break;
3434 runp = runp->next;
3437 if (runp != NULL)
3438 collate->current_section = runp;
3439 else
3441 /* This is bad. The section after which we have to
3442 reorder does not exist. Therefore we cannot
3443 process the whole rest of this reorder
3444 specification. */
3445 lr_error (ldfile, _("%s: section `%.*s' not known"),
3446 "LC_COLLATE", (int) arg->val.str.lenmb,
3447 arg->val.str.startmb);
3451 lr_ignore_rest (ldfile, 0);
3453 now = lr_token (ldfile, charmap, result, NULL, verbose);
3455 while (now->tok == tok_reorder_sections_after
3456 || now->tok == tok_reorder_sections_end
3457 || now->tok == tok_end);
3459 /* Process the token we just saw. */
3460 nowtok = now->tok;
3461 continue;
3464 else
3465 /* This must not happen. */
3466 goto err_label;
3467 break;
3469 case tok_reorder_sections_end:
3470 /* Ignore the rest of the line if we don't need the input of
3471 this line. */
3472 if (ignore_content)
3473 break;
3475 if (state != 5)
3476 goto err_label;
3477 state = 6;
3478 lr_ignore_rest (ldfile, 1);
3479 break;
3481 case tok_bsymbol:
3482 case tok_ucs4:
3483 /* Ignore the rest of the line if we don't need the input of
3484 this line. */
3485 if (ignore_content)
3487 lr_ignore_rest (ldfile, 0);
3488 break;
3491 if (state != 0 && state != 1 && state != 3 && state != 5)
3492 goto err_label;
3494 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3495 goto err_label;
3497 if (nowtok == tok_ucs4)
3499 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3500 symstr = ucs4buf;
3501 symlen = 9;
3503 else if (arg != NULL)
3505 symstr = arg->val.str.startmb;
3506 symlen = arg->val.str.lenmb;
3508 else
3510 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3511 (int) ldfile->token.val.str.lenmb,
3512 ldfile->token.val.str.startmb);
3513 break;
3516 struct element_t *seqp;
3517 if (state == 0)
3519 /* We are outside an `order_start' region. This means
3520 we must only accept definitions of values for
3521 collation symbols since these are purely abstract
3522 values and don't need directions associated. */
3523 void *ptr;
3525 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3527 seqp = ptr;
3529 /* It's already defined. First check whether this
3530 is really a collating symbol. */
3531 if (seqp->is_character)
3532 goto err_label;
3534 goto move_entry;
3536 else
3538 void *result;
3540 if (find_entry (&collate->sym_table, symstr, symlen,
3541 &result) != 0)
3542 /* No collating symbol, it's an error. */
3543 goto err_label;
3545 /* Maybe this is the first time we define a symbol
3546 value and it is before the first actual section. */
3547 if (collate->sections == NULL)
3548 collate->sections = collate->current_section =
3549 &collate->symbol_section;
3552 if (was_ellipsis != tok_none)
3554 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3555 charmap, repertoire, result);
3557 /* Remember that we processed the ellipsis. */
3558 was_ellipsis = tok_none;
3560 /* And don't add the value a second time. */
3561 break;
3564 else if (state == 3)
3566 /* It is possible that we already have this collation sequence.
3567 In this case we move the entry. */
3568 void *sym;
3569 void *ptr;
3571 /* If the symbol after which we have to insert was not found
3572 ignore all entries. */
3573 if (collate->cursor == NULL)
3575 lr_ignore_rest (ldfile, 0);
3576 break;
3579 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3581 seqp = (struct element_t *) ptr;
3582 goto move_entry;
3585 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3586 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3587 goto move_entry;
3589 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3590 && (seqp = (struct element_t *) ptr,
3591 seqp->last != NULL || seqp->next != NULL
3592 || (collate->start != NULL && seqp == collate->start)))
3594 move_entry:
3595 /* Remove the entry from the old position. */
3596 if (seqp->last == NULL)
3597 collate->start = seqp->next;
3598 else
3599 seqp->last->next = seqp->next;
3600 if (seqp->next != NULL)
3601 seqp->next->last = seqp->last;
3603 /* We also have to check whether this entry is the
3604 first or last of a section. */
3605 if (seqp->section->first == seqp)
3607 if (seqp->section->first == seqp->section->last)
3608 /* This section has no content anymore. */
3609 seqp->section->first = seqp->section->last = NULL;
3610 else
3611 seqp->section->first = seqp->next;
3613 else if (seqp->section->last == seqp)
3614 seqp->section->last = seqp->last;
3616 /* Now insert it in the new place. */
3617 insert_weights (ldfile, seqp, charmap, repertoire, result,
3618 tok_none);
3619 break;
3622 /* Otherwise we just add a new entry. */
3624 else if (state == 5)
3626 /* We are reordering sections. Find the named section. */
3627 struct section_list *runp = collate->sections;
3628 struct section_list *prevp = NULL;
3630 while (runp != NULL)
3632 if (runp->name != NULL
3633 && strlen (runp->name) == symlen
3634 && memcmp (runp->name, symstr, symlen) == 0)
3635 break;
3637 prevp = runp;
3638 runp = runp->next;
3641 if (runp == NULL)
3643 lr_error (ldfile, _("%s: section `%.*s' not known"),
3644 "LC_COLLATE", (int) symlen, symstr);
3645 lr_ignore_rest (ldfile, 0);
3647 else
3649 if (runp != collate->current_section)
3651 /* Remove the named section from the old place and
3652 insert it in the new one. */
3653 prevp->next = runp->next;
3655 runp->next = collate->current_section->next;
3656 collate->current_section->next = runp;
3657 collate->current_section = runp;
3660 /* Process the rest of the line which might change
3661 the collation rules. */
3662 arg = lr_token (ldfile, charmap, result, repertoire,
3663 verbose);
3664 if (arg->tok != tok_eof && arg->tok != tok_eol)
3665 read_directions (ldfile, arg, charmap, repertoire,
3666 result);
3668 break;
3670 else if (was_ellipsis != tok_none)
3672 /* Using the information in the `ellipsis_weight'
3673 element and this and the last value we have to handle
3674 the ellipsis now. */
3675 assert (state == 1);
3677 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3678 repertoire, result);
3680 /* Remember that we processed the ellipsis. */
3681 was_ellipsis = tok_none;
3683 /* And don't add the value a second time. */
3684 break;
3687 /* Now insert in the new place. */
3688 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3689 break;
3691 case tok_undefined:
3692 /* Ignore the rest of the line if we don't need the input of
3693 this line. */
3694 if (ignore_content)
3696 lr_ignore_rest (ldfile, 0);
3697 break;
3700 if (state != 1)
3701 goto err_label;
3703 if (was_ellipsis != tok_none)
3705 lr_error (ldfile,
3706 _("%s: cannot have `%s' as end of ellipsis range"),
3707 "LC_COLLATE", "UNDEFINED");
3709 unlink_element (collate);
3710 was_ellipsis = tok_none;
3713 /* See whether UNDEFINED already appeared somewhere. */
3714 if (collate->undefined.next != NULL
3715 || &collate->undefined == collate->cursor)
3717 lr_error (ldfile,
3718 _("%s: order for `%.*s' already defined at %s:%Zu"),
3719 "LC_COLLATE", 9, "UNDEFINED",
3720 collate->undefined.file,
3721 collate->undefined.line);
3722 lr_ignore_rest (ldfile, 0);
3724 else
3725 /* Parse the weights. */
3726 insert_weights (ldfile, &collate->undefined, charmap,
3727 repertoire, result, tok_none);
3728 break;
3730 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3731 case tok_ellipsis3: /* absolute ellipsis */
3732 case tok_ellipsis4: /* symbolic decimal ellipsis */
3733 /* This is the symbolic (decimal or hexadecimal) or absolute
3734 ellipsis. */
3735 if (was_ellipsis != tok_none)
3736 goto err_label;
3738 if (state != 0 && state != 1 && state != 3)
3739 goto err_label;
3741 was_ellipsis = nowtok;
3743 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3744 repertoire, result, nowtok);
3745 break;
3747 case tok_end:
3748 seen_end:
3749 /* Next we assume `LC_COLLATE'. */
3750 if (!ignore_content)
3752 if (state == 0 && copy_locale == NULL)
3753 /* We must either see a copy statement or have
3754 ordering values. */
3755 lr_error (ldfile,
3756 _("%s: empty category description not allowed"),
3757 "LC_COLLATE");
3758 else if (state == 1)
3760 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3761 "LC_COLLATE");
3763 /* Handle ellipsis at end of list. */
3764 if (was_ellipsis != tok_none)
3766 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3767 repertoire, result);
3768 was_ellipsis = tok_none;
3771 else if (state == 3)
3772 WITH_CUR_LOCALE (error (0, 0, _("\
3773 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3774 else if (state == 5)
3775 WITH_CUR_LOCALE (error (0, 0, _("\
3776 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3778 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3779 if (arg->tok == tok_eof)
3780 break;
3781 if (arg->tok == tok_eol)
3782 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3783 else if (arg->tok != tok_lc_collate)
3784 lr_error (ldfile, _("\
3785 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3786 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3787 return;
3789 case tok_define:
3790 if (ignore_content)
3792 lr_ignore_rest (ldfile, 0);
3793 break;
3796 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3797 if (arg->tok != tok_ident)
3798 goto err_label;
3800 /* Simply add the new symbol. */
3801 struct name_list *newsym = xmalloc (sizeof (*newsym)
3802 + arg->val.str.lenmb + 1);
3803 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
3804 newsym->str[arg->val.str.lenmb] = '\0';
3805 newsym->next = defined;
3806 defined = newsym;
3808 lr_ignore_rest (ldfile, 1);
3809 break;
3811 case tok_undef:
3812 if (ignore_content)
3814 lr_ignore_rest (ldfile, 0);
3815 break;
3818 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3819 if (arg->tok != tok_ident)
3820 goto err_label;
3822 /* Remove _all_ occurrences of the symbol from the list. */
3823 struct name_list *prevdef = NULL;
3824 struct name_list *curdef = defined;
3825 while (curdef != NULL)
3826 if (strncmp (arg->val.str.startmb, curdef->str,
3827 arg->val.str.lenmb) == 0
3828 && curdef->str[arg->val.str.lenmb] == '\0')
3830 if (prevdef == NULL)
3831 defined = curdef->next;
3832 else
3833 prevdef->next = curdef->next;
3835 struct name_list *olddef = curdef;
3836 curdef = curdef->next;
3838 free (olddef);
3840 else
3842 prevdef = curdef;
3843 curdef = curdef->next;
3846 lr_ignore_rest (ldfile, 1);
3847 break;
3849 case tok_ifdef:
3850 case tok_ifndef:
3851 if (ignore_content)
3853 lr_ignore_rest (ldfile, 0);
3854 break;
3857 found_ifdef:
3858 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3859 if (arg->tok != tok_ident)
3860 goto err_label;
3861 lr_ignore_rest (ldfile, 1);
3863 if (collate->else_action == else_none)
3865 curdef = defined;
3866 while (curdef != NULL)
3867 if (strncmp (arg->val.str.startmb, curdef->str,
3868 arg->val.str.lenmb) == 0
3869 && curdef->str[arg->val.str.lenmb] == '\0')
3870 break;
3871 else
3872 curdef = curdef->next;
3874 if ((nowtok == tok_ifdef && curdef != NULL)
3875 || (nowtok == tok_ifndef && curdef == NULL))
3877 /* We have to use the if-branch. */
3878 collate->else_action = else_ignore;
3880 else
3882 /* We have to use the else-branch, if there is one. */
3883 nowtok = skip_to (ldfile, collate, charmap, 0);
3884 if (nowtok == tok_else)
3885 collate->else_action = else_seen;
3886 else if (nowtok == tok_elifdef)
3888 nowtok = tok_ifdef;
3889 goto found_ifdef;
3891 else if (nowtok == tok_elifndef)
3893 nowtok = tok_ifndef;
3894 goto found_ifdef;
3896 else if (nowtok == tok_eof)
3897 goto seen_eof;
3898 else if (nowtok == tok_end)
3899 goto seen_end;
3902 else
3904 /* XXX Should it really become necessary to support nested
3905 preprocessor handling we will push the state here. */
3906 lr_error (ldfile, _("%s: nested conditionals not supported"),
3907 "LC_COLLATE");
3908 nowtok = skip_to (ldfile, collate, charmap, 1);
3909 if (nowtok == tok_eof)
3910 goto seen_eof;
3911 else if (nowtok == tok_end)
3912 goto seen_end;
3914 break;
3916 case tok_elifdef:
3917 case tok_elifndef:
3918 case tok_else:
3919 if (ignore_content)
3921 lr_ignore_rest (ldfile, 0);
3922 break;
3925 lr_ignore_rest (ldfile, 1);
3927 if (collate->else_action == else_ignore)
3929 /* Ignore everything until the endif. */
3930 nowtok = skip_to (ldfile, collate, charmap, 1);
3931 if (nowtok == tok_eof)
3932 goto seen_eof;
3933 else if (nowtok == tok_end)
3934 goto seen_end;
3936 else
3938 assert (collate->else_action == else_none);
3939 lr_error (ldfile, _("\
3940 %s: '%s' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE",
3941 nowtok == tok_else ? "else"
3942 : nowtok == tok_elifdef ? "elifdef" : "elifndef");
3944 break;
3946 case tok_endif:
3947 if (ignore_content)
3949 lr_ignore_rest (ldfile, 0);
3950 break;
3953 lr_ignore_rest (ldfile, 1);
3955 if (collate->else_action != else_ignore
3956 && collate->else_action != else_seen)
3957 lr_error (ldfile, _("\
3958 %s: 'endif' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE");
3960 /* XXX If we support nested preprocessor directives we pop
3961 the state here. */
3962 collate->else_action = else_none;
3963 break;
3965 default:
3966 err_label:
3967 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3970 /* Prepare for the next round. */
3971 now = lr_token (ldfile, charmap, result, NULL, verbose);
3972 nowtok = now->tok;
3975 seen_eof:
3976 /* When we come here we reached the end of the file. */
3977 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");