Improve strcoll with strdiff.
[glibc.git] / locale / programs / ld-collate.c
blob a39a94f2cc3508b42308c3b11eb116ef582e4a5b
1 /* Copyright (C) 1995-2015 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <errno.h>
23 #include <error.h>
24 #include <stdlib.h>
25 #include <wchar.h>
26 #include <stdint.h>
27 #include <sys/param.h>
29 #include "localedef.h"
30 #include "charmap.h"
31 #include "localeinfo.h"
32 #include "linereader.h"
33 #include "locfile.h"
34 #include "elem-hash.h"
35 #include "../localeinfo.h"
37 /* Uncomment the following line in the production version. */
38 /* #define NDEBUG 1 */
39 #include <assert.h>
41 #define obstack_chunk_alloc malloc
42 #define obstack_chunk_free free
/* Append DATA as one 32-bit value to the object currently being grown
   in OBSTACK.  The value is passed through maybe_swap_uint32 before it
   is stored.  The current object size must satisfy LOCFILE_ALIGNED_P;
   this is checked with an assertion.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));
  data = maybe_swap_uint32 (data);
  /* obstack_int_grow stores an `int'; use it only when that has the
     same size as int32_t, otherwise copy the raw bytes.  */
  if (sizeof (int32_t) == sizeof (int))
    obstack_int_grow (obstack, data);
  else
    obstack_grow (obstack, &data, sizeof (int32_t));
}
/* Like obstack_int32_grow, but uses obstack_int_grow_fast when `int'
   and int32_t have the same size.  The same alignment assertion and
   maybe_swap_uint32 conversion are applied to DATA.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  assert (LOCFILE_ALIGNED_P (obstack_object_size (obstack)));
  data = maybe_swap_uint32 (data);
  if (sizeof (int32_t) == sizeof (int))
    obstack_int_grow_fast (obstack, data);
  else
    /* No fast variant is used for the byte-copy fallback.  */
    obstack_grow (obstack, &data, sizeof (int32_t));
}
/* Forward declaration.  */
struct element_t;

/* Data type for list of strings.  Each entry describes one section of
   the collation definition and is linked into two lists at once.  */
struct section_list
{
  /* Successor in the known_sections list.  */
  struct section_list *def_next;
  /* Successor in the sections list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* These are the rules for this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output file.  */
  int ruleidx;
};
struct element_t;

/* A counted array of element pointers; used for the weights of one
   level of a collating element.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* The CNT element pointers.  */
  struct element_t **w;
};
/* Data type for collating element.  */
struct element_t
{
  const char *name;

  /* Multibyte and wide character representation with their lengths.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* The following is a bit mask which bits are set if this element is
     used in the appropriate level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* One weight list per sorting rule (level).  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Order of the character in the sequence.  This information will
     be used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Next element in wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
/* Special element value.  These small integers cast to pointers serve
   as placeholders in weight lists; they must never be dereferenced.  */
#define ELEMENT_ELLIPSIS2 ((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3 ((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4 ((struct element_t *) 3)
/* Data type for collating symbol.  */
struct symbol_t
{
  const char *name;

  /* Point to place in the order list.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
168 /* Sparse table of struct element_t *. */
169 #define TABLE wchead_table
170 #define ELEMENT struct element_t *
171 #define DEFAULT NULL
172 #define ITERATE
173 #define NO_ADD_LOCALE
174 #include "3level.h"
176 /* Sparse table of int32_t. */
177 #define TABLE collidx_table
178 #define ELEMENT int32_t
179 #define DEFAULT 0
180 #include "3level.h"
182 /* Sparse table of uint32_t. */
183 #define TABLE collseq_table
184 #define ELEMENT uint32_t
185 #define DEFAULT ~((uint32_t) 0)
186 #include "3level.h"
/* Simple name list for the preprocessor.  The name itself is stored
   directly behind the link pointer, so entries must be allocated with
   room for the string.  */
struct name_list
{
  struct name_list *next;
  /* C99 flexible array member holding the name.  */
  char str[];
};
197 /* The real definition of the struct for the LC_COLLATE locale. */
198 struct locale_collate_t
200 int col_weight_max;
201 int cur_weight_max;
203 /* List of known scripts. */
204 struct section_list *known_sections;
205 /* List of used sections. */
206 struct section_list *sections;
207 /* Current section using definition. */
208 struct section_list *current_section;
209 /* There always can be an unnamed section. */
210 struct section_list unnamed_section;
211 /* Flag whether the unnamed section has been defined. */
212 bool unnamed_section_defined;
213 /* To make handling of errors easier we have another section. */
214 struct section_list error_section;
215 /* Sometimes we are defining the values for collating symbols before
216 the first actual section. */
217 struct section_list symbol_section;
219 /* Start of the order list. */
220 struct element_t *start;
222 /* The undefined element. */
223 struct element_t undefined;
225 /* This is the cursor for `reorder_after' insertions. */
226 struct element_t *cursor;
228 /* This value is used when handling ellipsis. */
229 struct element_t ellipsis_weight;
231 /* Known collating elements. */
232 hash_table elem_table;
234 /* Known collating symbols. */
235 hash_table sym_table;
237 /* Known collation sequences. */
238 hash_table seq_table;
240 struct obstack mempool;
242 /* The LC_COLLATE category is a bit special as it is sometimes possible
243 that the definitions from more than one input file contains information.
244 Therefore we keep all relevant input in a list. */
245 struct locale_collate_t *next;
247 /* Arrays with heads of the list for each of the leading bytes in
248 the multibyte sequences. */
249 struct element_t *mbheads[256];
251 /* Arrays with heads of the list for each of the leading bytes in
252 the multibyte sequences. */
253 struct wchead_table wcheads;
255 /* The arrays with the collation sequence order. */
256 unsigned char mbseqorder[256];
257 struct collseq_table wcseqorder;
259 /* State of the preprocessor. */
260 enum
262 else_none = 0,
263 else_ignore,
264 else_seen
266 else_action;
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels); zero until the first direction
   specification has been read.  */
static uint32_t nrules;

/* List of defined preprocessor symbols.  */
static struct name_list *defined;
/* We need UTF-8 encoding of numbers.

   Store the UTF-8 style encoding of VAL in BUF and return the number
   of bytes written: 1 if VAL is below 0x80, otherwise between 2 and 6.
   No terminating NUL byte is written, so BUF must provide room for the
   complete sequence.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf++ = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      /* Find the shortest sequence length: a sequence of STEP bytes
         carries 5 * STEP + 1 payload bits.  */
      for (step = 2; step < 6; ++step)
        if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
          break;
      retval = step;

      /* Lead byte marker: the STEP most significant bits set.  This is
         computed from a positive constant so that no negative value is
         shifted right (which would be implementation-defined).  */
      *buf = (unsigned char) ((0xff00 >> step) & 0xff);
      --step;
      /* Fill the continuation bytes from the last one backwards, six
         payload bits each.  */
      do
        {
          buf[step] = 0x80 | (val & 0x3f);
          val >>= 6;
        }
      while (--step > 0);
      /* The remaining high bits go into the lead byte.  */
      *buf |= val;
    }

  return retval;
}
314 static struct section_list *
315 make_seclist_elem (struct locale_collate_t *collate, const char *string,
316 struct section_list *next)
318 struct section_list *newp;
320 newp = (struct section_list *) obstack_alloc (&collate->mempool,
321 sizeof (*newp));
322 newp->next = next;
323 newp->name = string;
324 newp->first = NULL;
325 newp->last = NULL;
327 return newp;
331 static struct element_t *
332 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
333 const uint32_t *wcs, const char *name, size_t namelen,
334 int is_character)
336 struct element_t *newp;
338 newp = (struct element_t *) obstack_alloc (&collate->mempool,
339 sizeof (*newp));
340 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
341 name, namelen);
342 if (mbs != NULL)
344 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
345 newp->nmbs = mbslen;
347 else
349 newp->mbs = NULL;
350 newp->nmbs = 0;
352 if (wcs != NULL)
354 size_t nwcs = wcslen ((wchar_t *) wcs);
355 uint32_t zero = 0;
356 /* Handle <U0000> as a single character. */
357 if (nwcs == 0)
358 nwcs = 1;
359 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
360 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
361 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
362 newp->nwcs = nwcs;
364 else
366 newp->wcs = NULL;
367 newp->nwcs = 0;
369 newp->mborder = NULL;
370 newp->wcorder = 0;
371 newp->used_in_level = 0;
372 newp->is_character = is_character;
374 /* Will be assigned later. XXX */
375 newp->mbseqorder = 0;
376 newp->wcseqorder = 0;
378 /* Will be allocated later. */
379 newp->weights = NULL;
381 newp->file = NULL;
382 newp->line = 0;
384 newp->section = collate->current_section;
386 newp->last = NULL;
387 newp->next = NULL;
389 newp->mbnext = NULL;
390 newp->mblast = NULL;
392 newp->wcnext = NULL;
393 newp->wclast = NULL;
395 return newp;
399 static struct symbol_t *
400 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
402 struct symbol_t *newp;
404 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
406 newp->name = obstack_copy0 (&collate->mempool, name, len);
407 newp->order = NULL;
409 newp->file = NULL;
410 newp->line = 0;
412 return newp;
416 /* Test whether this name is already defined somewhere. */
417 static int
418 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
419 const struct charmap_t *charmap,
420 struct repertoire_t *repertoire, const char *symbol,
421 size_t symbol_len)
423 void *ignore = NULL;
425 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
427 lr_error (ldfile, _("`%.*s' already defined in charmap"),
428 (int) symbol_len, symbol);
429 return 1;
432 if (repertoire != NULL
433 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
434 == 0))
436 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
437 (int) symbol_len, symbol);
438 return 1;
441 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
443 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
444 (int) symbol_len, symbol);
445 return 1;
448 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
450 lr_error (ldfile, _("`%.*s' already defined as collating element"),
451 (int) symbol_len, symbol);
452 return 1;
455 return 0;
459 /* Read the direction specification. */
460 static void
461 read_directions (struct linereader *ldfile, struct token *arg,
462 const struct charmap_t *charmap,
463 struct repertoire_t *repertoire, struct localedef_t *result)
465 int cnt = 0;
466 int max = nrules ?: 10;
467 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
468 int warned = 0;
469 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
471 while (1)
473 int valid = 0;
475 if (arg->tok == tok_forward)
477 if (rules[cnt] & sort_backward)
479 if (! warned)
481 lr_error (ldfile, _("\
482 %s: `forward' and `backward' are mutually excluding each other"),
483 "LC_COLLATE");
484 warned = 1;
487 else if (rules[cnt] & sort_forward)
489 if (! warned)
491 lr_error (ldfile, _("\
492 %s: `%s' mentioned more than once in definition of weight %d"),
493 "LC_COLLATE", "forward", cnt + 1);
496 else
497 rules[cnt] |= sort_forward;
499 valid = 1;
501 else if (arg->tok == tok_backward)
503 if (rules[cnt] & sort_forward)
505 if (! warned)
507 lr_error (ldfile, _("\
508 %s: `forward' and `backward' are mutually excluding each other"),
509 "LC_COLLATE");
510 warned = 1;
513 else if (rules[cnt] & sort_backward)
515 if (! warned)
517 lr_error (ldfile, _("\
518 %s: `%s' mentioned more than once in definition of weight %d"),
519 "LC_COLLATE", "backward", cnt + 1);
522 else
523 rules[cnt] |= sort_backward;
525 valid = 1;
527 else if (arg->tok == tok_position)
529 if (rules[cnt] & sort_position)
531 if (! warned)
533 lr_error (ldfile, _("\
534 %s: `%s' mentioned more than once in definition of weight %d"),
535 "LC_COLLATE", "position", cnt + 1);
538 else
539 rules[cnt] |= sort_position;
541 valid = 1;
544 if (valid)
545 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
547 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
548 || arg->tok == tok_semicolon)
550 if (! valid && ! warned)
552 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
553 warned = 1;
556 /* See whether we have to increment the counter. */
557 if (arg->tok != tok_comma && rules[cnt] != 0)
559 /* Add the default `forward' if we have seen only `position'. */
560 if (rules[cnt] == sort_position)
561 rules[cnt] = sort_position | sort_forward;
563 ++cnt;
566 if (arg->tok == tok_eof || arg->tok == tok_eol)
567 /* End of line or file, so we exit the loop. */
568 break;
570 if (nrules == 0)
572 /* See whether we have enough room in the array. */
573 if (cnt == max)
575 max += 10;
576 rules = (enum coll_sort_rule *) xrealloc (rules,
578 * sizeof (*rules));
579 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
582 else
584 if (cnt == nrules)
586 /* There must not be any more rule. */
587 if (! warned)
589 lr_error (ldfile, _("\
590 %s: too many rules; first entry only had %d"),
591 "LC_COLLATE", nrules);
592 warned = 1;
595 lr_ignore_rest (ldfile, 0);
596 break;
600 else
602 if (! warned)
604 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
605 warned = 1;
609 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
612 if (nrules == 0)
614 /* Now we know how many rules we have. */
615 nrules = cnt;
616 rules = (enum coll_sort_rule *) xrealloc (rules,
617 nrules * sizeof (*rules));
619 else
621 if (cnt < nrules)
623 /* Not enough rules in this specification. */
624 if (! warned)
625 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
628 rules[cnt] = sort_forward;
629 while (++cnt < nrules);
633 collate->current_section->rules = rules;
637 static struct element_t *
638 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
639 const char *str, size_t len)
641 void *result = NULL;
643 /* Search for the entries among the collation sequences already define. */
644 if (find_entry (&collate->seq_table, str, len, &result) != 0)
646 /* Nope, not define yet. So we see whether it is a
647 collation symbol. */
648 void *ptr;
650 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
652 /* It's a collation symbol. */
653 struct symbol_t *sym = (struct symbol_t *) ptr;
654 result = sym->order;
656 if (result == NULL)
657 result = sym->order = new_element (collate, NULL, 0, NULL,
658 NULL, 0, 0);
660 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
662 /* It's also no collation element. So it is a character
663 element defined later. */
664 result = new_element (collate, NULL, 0, NULL, str, len, 1);
665 /* Insert it into the sequence table. */
666 insert_entry (&collate->seq_table, str, len, result);
670 return (struct element_t *) result;
674 static void
675 unlink_element (struct locale_collate_t *collate)
677 if (collate->cursor == collate->start)
679 assert (collate->cursor->next == NULL);
680 assert (collate->cursor->last == NULL);
681 collate->cursor = NULL;
683 else
685 if (collate->cursor->next != NULL)
686 collate->cursor->next->last = collate->cursor->last;
687 if (collate->cursor->last != NULL)
688 collate->cursor->last->next = collate->cursor->next;
689 collate->cursor = collate->cursor->last;
694 static void
695 insert_weights (struct linereader *ldfile, struct element_t *elem,
696 const struct charmap_t *charmap,
697 struct repertoire_t *repertoire, struct localedef_t *result,
698 enum token_t ellipsis)
700 int weight_cnt;
701 struct token *arg;
702 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
704 /* Initialize all the fields. */
705 elem->file = ldfile->fname;
706 elem->line = ldfile->lineno;
708 elem->last = collate->cursor;
709 elem->next = collate->cursor ? collate->cursor->next : NULL;
710 if (collate->cursor != NULL && collate->cursor->next != NULL)
711 collate->cursor->next->last = elem;
712 if (collate->cursor != NULL)
713 collate->cursor->next = elem;
714 if (collate->start == NULL)
716 assert (collate->cursor == NULL);
717 collate->start = elem;
720 elem->section = collate->current_section;
722 if (collate->current_section->first == NULL)
723 collate->current_section->first = elem;
724 if (collate->current_section->last == collate->cursor)
725 collate->current_section->last = elem;
727 collate->cursor = elem;
729 elem->weights = (struct element_list_t *)
730 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
731 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
733 weight_cnt = 0;
735 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
738 if (arg->tok == tok_eof || arg->tok == tok_eol)
739 break;
741 if (arg->tok == tok_ignore)
743 /* The weight for this level has to be ignored. We use the
744 null pointer to indicate this. */
745 elem->weights[weight_cnt].w = (struct element_t **)
746 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
747 elem->weights[weight_cnt].w[0] = NULL;
748 elem->weights[weight_cnt].cnt = 1;
750 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
752 char ucs4str[10];
753 struct element_t *val;
754 char *symstr;
755 size_t symlen;
757 if (arg->tok == tok_bsymbol)
759 symstr = arg->val.str.startmb;
760 symlen = arg->val.str.lenmb;
762 else
764 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
765 symstr = ucs4str;
766 symlen = 9;
769 val = find_element (ldfile, collate, symstr, symlen);
770 if (val == NULL)
771 break;
773 elem->weights[weight_cnt].w = (struct element_t **)
774 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
775 elem->weights[weight_cnt].w[0] = val;
776 elem->weights[weight_cnt].cnt = 1;
778 else if (arg->tok == tok_string)
780 /* Split the string up in the individual characters and put
781 the element definitions in the list. */
782 const char *cp = arg->val.str.startmb;
783 int cnt = 0;
784 struct element_t *charelem;
785 struct element_t **weights = NULL;
786 int max = 0;
788 if (*cp == '\0')
790 lr_error (ldfile, _("%s: empty weight string not allowed"),
791 "LC_COLLATE");
792 lr_ignore_rest (ldfile, 0);
793 break;
798 if (*cp == '<')
800 /* Ahh, it's a bsymbol or an UCS4 value. If it's
801 the latter we have to unify the name. */
802 const char *startp = ++cp;
803 size_t len;
805 while (*cp != '>')
807 if (*cp == ldfile->escape_char)
808 ++cp;
809 if (*cp == '\0')
810 /* It's a syntax error. */
811 goto syntax;
813 ++cp;
816 if (cp - startp == 5 && startp[0] == 'U'
817 && isxdigit (startp[1]) && isxdigit (startp[2])
818 && isxdigit (startp[3]) && isxdigit (startp[4]))
820 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
821 char *newstr;
823 newstr = (char *) xmalloc (10);
824 snprintf (newstr, 10, "U%08X", ucs4);
825 startp = newstr;
827 len = 9;
829 else
830 len = cp - startp;
832 charelem = find_element (ldfile, collate, startp, len);
833 ++cp;
835 else
837 /* People really shouldn't use characters directly in
838 the string. Especially since it's not really clear
839 what this means. We interpret all characters in the
840 string as if that would be bsymbols. Otherwise we
841 would have to match back to bsymbols somehow and this
842 is normally not what people normally expect. */
843 charelem = find_element (ldfile, collate, cp++, 1);
846 if (charelem == NULL)
848 /* We ignore the rest of the line. */
849 lr_ignore_rest (ldfile, 0);
850 break;
853 /* Add the pointer. */
854 if (cnt >= max)
856 struct element_t **newp;
857 max += 10;
858 newp = (struct element_t **)
859 alloca (max * sizeof (struct element_t *));
860 memcpy (newp, weights, cnt * sizeof (struct element_t *));
861 weights = newp;
863 weights[cnt++] = charelem;
865 while (*cp != '\0');
867 /* Now store the information. */
868 elem->weights[weight_cnt].w = (struct element_t **)
869 obstack_alloc (&collate->mempool,
870 cnt * sizeof (struct element_t *));
871 memcpy (elem->weights[weight_cnt].w, weights,
872 cnt * sizeof (struct element_t *));
873 elem->weights[weight_cnt].cnt = cnt;
875 /* We don't need the string anymore. */
876 free (arg->val.str.startmb);
878 else if (ellipsis != tok_none
879 && (arg->tok == tok_ellipsis2
880 || arg->tok == tok_ellipsis3
881 || arg->tok == tok_ellipsis4))
883 /* It must be the same ellipsis as used in the initial column. */
884 if (arg->tok != ellipsis)
885 lr_error (ldfile, _("\
886 %s: weights must use the same ellipsis symbol as the name"),
887 "LC_COLLATE");
889 /* The weight for this level will depend on the element
890 iterating over the range. Put a placeholder. */
891 elem->weights[weight_cnt].w = (struct element_t **)
892 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
893 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
894 elem->weights[weight_cnt].cnt = 1;
896 else
898 syntax:
899 /* It's a syntax error. */
900 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
901 lr_ignore_rest (ldfile, 0);
902 break;
905 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
906 /* This better should be the end of the line or a semicolon. */
907 if (arg->tok == tok_semicolon)
908 /* OK, ignore this and read the next token. */
909 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
910 else if (arg->tok != tok_eof && arg->tok != tok_eol)
912 /* It's a syntax error. */
913 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
914 lr_ignore_rest (ldfile, 0);
915 break;
918 while (++weight_cnt < nrules);
920 if (weight_cnt < nrules)
922 /* This means the rest of the line uses the current element as
923 the weight. */
926 elem->weights[weight_cnt].w = (struct element_t **)
927 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
928 if (ellipsis == tok_none)
929 elem->weights[weight_cnt].w[0] = elem;
930 else
931 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
932 elem->weights[weight_cnt].cnt = 1;
934 while (++weight_cnt < nrules);
936 else
938 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
940 /* Too many rule values. */
941 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
942 lr_ignore_rest (ldfile, 0);
944 else
945 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
950 static int
951 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
952 const struct charmap_t *charmap, struct repertoire_t *repertoire,
953 struct localedef_t *result)
955 /* First find out what kind of symbol this is. */
956 struct charseq *seq;
957 uint32_t wc;
958 struct element_t *elem = NULL;
959 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
961 /* Try to find the character in the charmap. */
962 seq = charmap_find_value (charmap, symstr, symlen);
964 /* Determine the wide character. */
965 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
967 wc = repertoire_find_value (repertoire, symstr, symlen);
968 if (seq != NULL)
969 seq->ucs4 = wc;
971 else
972 wc = seq->ucs4;
974 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
976 /* It's no character, so look through the collation elements and
977 symbol list. */
978 void *ptr = elem;
979 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
981 void *result;
982 struct symbol_t *sym = NULL;
984 /* It's also collation element. Therefore it's either a
985 collating symbol or it's a character which is not
986 supported by the character set. In the later case we
987 simply create a dummy entry. */
988 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
990 /* It's a collation symbol. */
991 sym = (struct symbol_t *) result;
993 elem = sym->order;
996 if (elem == NULL)
998 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
1000 if (sym != NULL)
1001 sym->order = elem;
1002 else
1003 /* Enter a fake element in the sequence table. This
1004 won't cause anything in the output since there is
1005 no multibyte or wide character associated with
1006 it. */
1007 insert_entry (&collate->seq_table, symstr, symlen, elem);
1010 else
1011 /* Copy the result back. */
1012 elem = ptr;
1014 else
1016 /* Otherwise the symbols stands for a character. */
1017 void *ptr = elem;
1018 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
1020 uint32_t wcs[2] = { wc, 0 };
1022 /* We have to allocate an entry. */
1023 elem = new_element (collate,
1024 seq != NULL ? (char *) seq->bytes : NULL,
1025 seq != NULL ? seq->nbytes : 0,
1026 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
1027 symstr, symlen, 1);
1029 /* And add it to the table. */
1030 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1031 /* This cannot happen. */
1032 assert (! "Internal error");
1034 else
1036 /* Copy the result back. */
1037 elem = ptr;
1039 /* Maybe the character was used before the definition. In this case
1040 we have to insert the byte sequences now. */
1041 if (elem->mbs == NULL && seq != NULL)
1043 elem->mbs = obstack_copy0 (&collate->mempool,
1044 seq->bytes, seq->nbytes);
1045 elem->nmbs = seq->nbytes;
1048 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1050 uint32_t wcs[2] = { wc, 0 };
1052 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1053 elem->nwcs = 1;
1058 /* Test whether this element is not already in the list. */
1059 if (elem->next != NULL || elem == collate->cursor)
1061 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1062 (int) symlen, symstr, elem->file, elem->line);
1063 lr_ignore_rest (ldfile, 0);
1064 return 1;
1067 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1069 return 0;
1073 static void
1074 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1075 enum token_t ellipsis, const struct charmap_t *charmap,
1076 struct repertoire_t *repertoire,
1077 struct localedef_t *result)
1079 struct element_t *startp;
1080 struct element_t *endp;
1081 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1083 /* Unlink the entry added for the ellipsis. */
1084 unlink_element (collate);
1085 startp = collate->cursor;
1087 /* Process and add the end-entry. */
1088 if (symstr != NULL
1089 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1090 /* Something went wrong with inserting the to-value. This means
1091 we cannot process the ellipsis. */
1092 return;
1094 /* Reset the cursor. */
1095 collate->cursor = startp;
1097 /* Now we have to handle many different situations:
1098 - we have to distinguish between the three different ellipsis forms
1099 - the is the ellipsis at the beginning, in the middle, or at the end.
1101 endp = collate->cursor->next;
1102 assert (symstr == NULL || endp != NULL);
1104 /* XXX The following is probably very wrong since also collating symbols
1105 can appear in ranges. But do we want/can refine the test for that? */
1106 #if 0
1107 /* Both, the start and the end symbol, must stand for characters. */
1108 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1109 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1111 lr_error (ldfile, _("\
1112 %s: the start and the end symbol of a range must stand for characters"),
1113 "LC_COLLATE");
1114 return;
1116 #endif
1118 if (ellipsis == tok_ellipsis3)
1120 /* One requirement we make here: the length of the byte
1121 sequences for the first and end character must be the same.
1122 This is mainly to prevent unwanted effects and this is often
1123 not what is wanted. */
1124 size_t len = (startp->mbs != NULL ? startp->nmbs
1125 : (endp->mbs != NULL ? endp->nmbs : 0));
1126 char mbcnt[len + 1];
1127 char mbend[len + 1];
1129 /* Well, this should be caught somewhere else already. Just to
1130 make sure. */
1131 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1132 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1134 if (startp != NULL && endp != NULL
1135 && startp->mbs != NULL && endp->mbs != NULL
1136 && startp->nmbs != endp->nmbs)
1138 lr_error (ldfile, _("\
1139 %s: byte sequences of first and last character must have the same length"),
1140 "LC_COLLATE");
1141 return;
1144 /* Determine whether we have to generate multibyte sequences. */
1145 if ((startp == NULL || startp->mbs != NULL)
1146 && (endp == NULL || endp->mbs != NULL))
1148 int cnt;
1149 int ret;
1151 /* Prepare the beginning byte sequence. This is either from the
1152 beginning byte sequence or it is all nulls if it was an
1153 initial ellipsis. */
1154 if (startp == NULL || startp->mbs == NULL)
1155 memset (mbcnt, '\0', len);
1156 else
1158 memcpy (mbcnt, startp->mbs, len);
1160 /* And increment it so that the value is the first one we will
1161 try to insert. */
1162 for (cnt = len - 1; cnt >= 0; --cnt)
1163 if (++mbcnt[cnt] != '\0')
1164 break;
1166 mbcnt[len] = '\0';
1168 /* And the end sequence. */
1169 if (endp == NULL || endp->mbs == NULL)
1170 memset (mbend, '\0', len);
1171 else
1172 memcpy (mbend, endp->mbs, len);
1173 mbend[len] = '\0';
1175 /* Test whether we have a correct range. */
1176 ret = memcmp (mbcnt, mbend, len);
1177 if (ret >= 0)
1179 if (ret > 0)
1180 lr_error (ldfile, _("%s: byte sequence of first character of \
1181 range is not lower than that of the last character"), "LC_COLLATE");
1182 return;
1185 /* Generate the byte sequences data. */
1186 while (1)
1188 struct charseq *seq;
1190 /* Quite a bit of work ahead. We have to find the character
1191 definition for the byte sequence and then determine the
1192 wide character belonging to it. */
1193 seq = charmap_find_symbol (charmap, mbcnt, len);
1194 if (seq != NULL)
1196 struct element_t *elem;
1197 size_t namelen;
1199 /* I don't think this can ever happen. */
1200 assert (seq->name != NULL);
1201 namelen = strlen (seq->name);
1203 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1204 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1205 namelen);
1207 /* Now we are ready to insert the new value in the
1208 sequence. Find out whether the element is
1209 already known. */
1210 void *ptr;
1211 if (find_entry (&collate->seq_table, seq->name, namelen,
1212 &ptr) != 0)
1214 uint32_t wcs[2] = { seq->ucs4, 0 };
1216 /* We have to allocate an entry. */
1217 elem = new_element (collate, mbcnt, len,
1218 seq->ucs4 == ILLEGAL_CHAR_VALUE
1219 ? NULL : wcs, seq->name,
1220 namelen, 1);
1222 /* And add it to the table. */
1223 if (insert_entry (&collate->seq_table, seq->name,
1224 namelen, elem) != 0)
1225 /* This cannot happen. */
1226 assert (! "Internal error");
1228 else
1229 /* Copy the result. */
1230 elem = ptr;
1232 /* Test whether this element is not already in the list. */
1233 if (elem->next != NULL || (collate->cursor != NULL
1234 && elem->next == collate->cursor))
1236 lr_error (ldfile, _("\
1237 order for `%.*s' already defined at %s:%Zu"),
1238 (int) namelen, seq->name,
1239 elem->file, elem->line);
1240 goto increment;
1243 /* Enqueue the new element. */
1244 elem->last = collate->cursor;
1245 if (collate->cursor == NULL)
1246 elem->next = NULL;
1247 else
1249 elem->next = collate->cursor->next;
1250 elem->last->next = elem;
1251 if (elem->next != NULL)
1252 elem->next->last = elem;
1254 if (collate->start == NULL)
1256 assert (collate->cursor == NULL);
1257 collate->start = elem;
1259 collate->cursor = elem;
1261 /* Add the weight value. We take them from the
1262 `ellipsis_weights' member of `collate'. */
1263 elem->weights = (struct element_list_t *)
1264 obstack_alloc (&collate->mempool,
1265 nrules * sizeof (struct element_list_t));
1266 for (cnt = 0; cnt < nrules; ++cnt)
1267 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1268 && (collate->ellipsis_weight.weights[cnt].w[0]
1269 == ELEMENT_ELLIPSIS2))
1271 elem->weights[cnt].w = (struct element_t **)
1272 obstack_alloc (&collate->mempool,
1273 sizeof (struct element_t *));
1274 elem->weights[cnt].w[0] = elem;
1275 elem->weights[cnt].cnt = 1;
1277 else
1279 /* Simply use the weight from `ellipsis_weight'. */
1280 elem->weights[cnt].w =
1281 collate->ellipsis_weight.weights[cnt].w;
1282 elem->weights[cnt].cnt =
1283 collate->ellipsis_weight.weights[cnt].cnt;
1287 /* Increment for the next round. */
1288 increment:
1289 for (cnt = len - 1; cnt >= 0; --cnt)
1290 if (++mbcnt[cnt] != '\0')
1291 break;
1293 /* Find out whether this was all. */
1294 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1295 /* Yep, that's all. */
1296 break;
1300 else
1302 /* For symbolic range we naturally must have a beginning and an
1303 end specified by the user. */
1304 if (startp == NULL)
1305 lr_error (ldfile, _("\
1306 %s: symbolic range ellipsis must not directly follow `order_start'"),
1307 "LC_COLLATE");
1308 else if (endp == NULL)
1309 lr_error (ldfile, _("\
1310 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1311 "LC_COLLATE");
1312 else
1314 /* Determine the range. To do so we have to determine the
1315 common prefix of the both names and then the numeric
1316 values of both ends. */
1317 size_t lenfrom = strlen (startp->name);
1318 size_t lento = strlen (endp->name);
1319 char buf[lento + 1];
1320 int preflen = 0;
1321 long int from;
1322 long int to;
1323 char *cp;
1324 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1326 if (lenfrom != lento)
1328 invalid_range:
1329 lr_error (ldfile, _("\
1330 `%s' and `%.*s' are not valid names for symbolic range"),
1331 startp->name, (int) lento, endp->name);
1332 return;
1335 while (startp->name[preflen] == endp->name[preflen])
1336 if (startp->name[preflen] == '\0')
1337 /* Nothing to be done. The start and end point are identical
1338 and while inserting the end point we have already given
1339 the user an error message. */
1340 return;
1341 else
1342 ++preflen;
1344 errno = 0;
1345 from = strtol (startp->name + preflen, &cp, base);
1346 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1347 goto invalid_range;
1349 errno = 0;
1350 to = strtol (endp->name + preflen, &cp, base);
1351 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1352 goto invalid_range;
1354 /* Copy the prefix. */
1355 memcpy (buf, startp->name, preflen);
1357 /* Loop over all values. */
1358 for (++from; from < to; ++from)
1360 struct element_t *elem = NULL;
1361 struct charseq *seq;
1362 uint32_t wc;
1363 int cnt;
1365 /* Generate the name. */
1366 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1367 (int) (lenfrom - preflen), from);
1369 /* Look whether this name is already defined. */
1370 void *ptr;
1371 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1373 /* Copy back the result. */
1374 elem = ptr;
1376 if (elem->next != NULL || (collate->cursor != NULL
1377 && elem->next == collate->cursor))
1379 lr_error (ldfile, _("\
1380 %s: order for `%.*s' already defined at %s:%Zu"),
1381 "LC_COLLATE", (int) lenfrom, buf,
1382 elem->file, elem->line);
1383 continue;
1386 if (elem->name == NULL)
1388 lr_error (ldfile, _("%s: `%s' must be a character"),
1389 "LC_COLLATE", buf);
1390 continue;
1394 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1396 /* Search for a character of this name. */
1397 seq = charmap_find_value (charmap, buf, lenfrom);
1398 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1400 wc = repertoire_find_value (repertoire, buf, lenfrom);
1402 if (seq != NULL)
1403 seq->ucs4 = wc;
1405 else
1406 wc = seq->ucs4;
1408 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1409 /* We don't know anything about a character with this
1410 name. XXX Should we warn? */
1411 continue;
1413 if (elem == NULL)
1415 uint32_t wcs[2] = { wc, 0 };
1417 /* We have to allocate an entry. */
1418 elem = new_element (collate,
1419 seq != NULL
1420 ? (char *) seq->bytes : NULL,
1421 seq != NULL ? seq->nbytes : 0,
1422 wc == ILLEGAL_CHAR_VALUE
1423 ? NULL : wcs, buf, lenfrom, 1);
1425 else
1427 /* Update the element. */
1428 if (seq != NULL)
1430 elem->mbs = obstack_copy0 (&collate->mempool,
1431 seq->bytes, seq->nbytes);
1432 elem->nmbs = seq->nbytes;
1435 if (wc != ILLEGAL_CHAR_VALUE)
1437 uint32_t zero = 0;
1439 obstack_grow (&collate->mempool,
1440 &wc, sizeof (uint32_t));
1441 obstack_grow (&collate->mempool,
1442 &zero, sizeof (uint32_t));
1443 elem->wcs = obstack_finish (&collate->mempool);
1444 elem->nwcs = 1;
1448 elem->file = ldfile->fname;
1449 elem->line = ldfile->lineno;
1450 elem->section = collate->current_section;
1453 /* Enqueue the new element. */
1454 elem->last = collate->cursor;
1455 elem->next = collate->cursor->next;
1456 elem->last->next = elem;
1457 if (elem->next != NULL)
1458 elem->next->last = elem;
1459 collate->cursor = elem;
1461 /* Now add the weights. They come from the `ellipsis_weights'
1462 member of `collate'. */
1463 elem->weights = (struct element_list_t *)
1464 obstack_alloc (&collate->mempool,
1465 nrules * sizeof (struct element_list_t));
1466 for (cnt = 0; cnt < nrules; ++cnt)
1467 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1468 && (collate->ellipsis_weight.weights[cnt].w[0]
1469 == ELEMENT_ELLIPSIS2))
1471 elem->weights[cnt].w = (struct element_t **)
1472 obstack_alloc (&collate->mempool,
1473 sizeof (struct element_t *));
1474 elem->weights[cnt].w[0] = elem;
1475 elem->weights[cnt].cnt = 1;
1477 else
1479 /* Simply use the weight from `ellipsis_weight'. */
1480 elem->weights[cnt].w =
1481 collate->ellipsis_weight.weights[cnt].w;
1482 elem->weights[cnt].cnt =
1483 collate->ellipsis_weight.weights[cnt].cnt;
1491 static void
1492 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1493 struct localedef_t *copy_locale, int ignore_content)
1495 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1497 struct locale_collate_t *collate;
1499 if (copy_locale == NULL)
1501 collate = locale->categories[LC_COLLATE].collate =
1502 (struct locale_collate_t *)
1503 xcalloc (1, sizeof (struct locale_collate_t));
1505 /* Init the various data structures. */
1506 init_hash (&collate->elem_table, 100);
1507 init_hash (&collate->sym_table, 100);
1508 init_hash (&collate->seq_table, 500);
1509 obstack_init (&collate->mempool);
1511 collate->col_weight_max = -1;
1513 else
1514 /* Reuse the copy_locale's data structures. */
1515 collate = locale->categories[LC_COLLATE].collate =
1516 copy_locale->categories[LC_COLLATE].collate;
1519 ldfile->translate_strings = 0;
1520 ldfile->return_widestr = 0;
/* Post-process the LC_COLLATE data collected for LOCALE.

   In order, the visible body:
     - returns early (with an optional diagnostic) if no collation data
       exists;
     - checks that `position' is used consistently across sections;
     - records in `used_in_level' which elements serve as weights at
       which level and replaces undefined weight symbols by UNDEFINED;
     - assigns the per-rule multibyte orders (`mborder'), the wide
       character order (`wcorder') and the sequence orders, and threads
       every element into the `mbheads' list for its first byte;
     - fills unset `mbheads' slots with UNDEFINED;
     - builds the `wcheads' and `wcseqorder' tables;
     - gives UNDEFINED its own order values if it was never defined;
     - assigns `ruleidx' values so that sections with identical rules
       share one index.

   CHARMAP is not referenced in the visible body.  */
1524 void
1525 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1527 /* Now is the time when we can assign the individual collation
1528 values for all the symbols. We have possibly different values
1529 for the wide- and the multibyte-character symbols. This is done
1530 since it might make a difference in the encoding if there is in
1531 some cases no multibyte-character but there are wide-characters.
1532 (The other way around it is not important since the encoded
1533 collation value in the wide-character case is 32 bits wide and
1534 therefore requires no encoding).
1536 The lowest collation value assigned is 2. Zero is reserved for
1537 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1538 functions and 1 is used to separate the individual passes for the
1539 different rules.
1541 We also have to construct a list with all the bytes/words which
1542 can come first in a sequence, followed by all the elements which
1543 also start with this byte/word. The order is reverse which has
1544 among others the important effect that longer strings are located
1545 first in the list. This is required for the output data since
1546 the algorithm used in `strcoll' etc depends on this.
1548 The multibyte case is easy. We simply sort into an array with
1549 256 elements. */
1550 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1551 int mbact[nrules];
1552 int wcact;
1553 int mbseqact;
1554 int wcseqact;
1555 struct element_t *runp;
1556 int i;
1557 int need_undefined = 0;
1558 struct section_list *sect;
1559 int ruleidx;
1560 int nr_wide_elems = 0;
1562 if (collate == NULL)
1564 /* No data, no check. */
1565 if (! be_quiet)
1566 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1567 "LC_COLLATE"));
1568 return;
1571 /* If this assertion is hit change the type in `element_t'. */
1572 assert (nrules <= sizeof (runp->used_in_level) * 8);
1574 /* Make sure that the `position' rule is used either in all sections
1575 or in none. */
1576 for (i = 0; i < nrules; ++i)
1577 for (sect = collate->sections; sect != NULL; sect = sect->next)
1578 if (sect != collate->current_section
1579 && sect->rules != NULL
1580 && ((sect->rules[i] & sort_position)
1581 != (collate->current_section->rules[i] & sort_position)))
1583 WITH_CUR_LOCALE (error (0, 0, _("\
1584 %s: `position' must be used for a specific level in all sections or none"),
1585 "LC_COLLATE"));
1586 break;
1589 /* Find out which elements are used at which level. At the same
1590 time we find out whether we have any undefined symbols. */
1591 runp = collate->start;
1592 while (runp != NULL)
1594 if (runp->mbs != NULL)
1596 for (i = 0; i < nrules; ++i)
1598 int j;
1600 for (j = 0; j < runp->weights[i].cnt; ++j)
1601 /* A NULL pointer as the weight means IGNORE. */
1602 if (runp->weights[i].w[j] != NULL)
1604 if (runp->weights[i].w[j]->weights == NULL)
1606 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1607 runp->line,
1608 _("symbol `%s' not defined"),
1609 runp->weights[i].w[j]->name));
1611 need_undefined = 1;
1612 runp->weights[i].w[j] = &collate->undefined;
1614 else
1615 /* Set the bit for the level. */
1616 runp->weights[i].w[j]->used_in_level |= 1 << i;
1621 /* Up to the next entry. */
1622 runp = runp->next;
1625 /* Walk through the list of defined sequences and assign weights. Also
1626 create the data structure which will allow generating the single byte
1627 character based tables.
1629 Since the weights for each rule are only ever compared to other
1630 weights for that same rule it is possible to
1631 assign more compact weight values than simply counting all
1632 weights in sequence. We can assign weights from 2, one for each
1633 rule individually and only for those elements, which are actually
1634 used for this rule.
1636 Why is this important? It is not for the wide char table. But
1637 it is for the singlebyte output since here larger numbers have to
1638 be encoded to make it possible to emit the value as a byte
1639 string. */
1640 for (i = 0; i < nrules; ++i)
1641 mbact[i] = 2;
1642 wcact = 2;
1643 mbseqact = 0;
1644 wcseqact = 0;
1645 runp = collate->start;
1646 while (runp != NULL)
1648 /* Determine the order. */
1649 if (runp->used_in_level != 0)
1651 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1652 nrules * sizeof (int));
1654 for (i = 0; i < nrules; ++i)
1655 if ((runp->used_in_level & (1 << i)) != 0)
1656 runp->mborder[i] = mbact[i]++;
1657 else
1658 runp->mborder[i] = 0;
1661 if (runp->mbs != NULL)
1663 struct element_t **eptr;
1664 struct element_t *lastp = NULL;
1666 /* Find the point where to insert in the list. */
1667 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1668 while (*eptr != NULL)
1670 if ((*eptr)->nmbs < runp->nmbs)
1671 break;
1673 if ((*eptr)->nmbs == runp->nmbs)
1675 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1677 if (c == 0)
1679 /* This should not happen. It means that we have
1680 two symbols with the same byte sequence. It is
1681 of course an error. */
1682 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1683 (*eptr)->line,
1684 _("\
1685 symbol `%s' has the same encoding as"), (*eptr)->name);
1686 error_at_line (0, 0, runp->file,
1687 runp->line,
1688 _("symbol `%s'"),
1689 runp->name));
1690 goto dont_insert;
1692 else if (c < 0)
1693 /* Insert it here. */
1694 break;
1697 /* To the next entry. */
1698 lastp = *eptr;
1699 eptr = &(*eptr)->mbnext;
1702 /* Set the pointers. */
1703 runp->mbnext = *eptr;
1704 runp->mblast = lastp;
1705 if (*eptr != NULL)
1706 (*eptr)->mblast = runp;
1707 *eptr = runp;
1708 dont_insert:
1712 if (runp->used_in_level)
1714 runp->wcorder = wcact++;
1716 /* We take the opportunity to count the elements which have
1717 wide characters. */
1718 ++nr_wide_elems;
1721 if (runp->is_character)
1723 if (runp->nmbs == 1)
1724 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1726 runp->wcseqorder = wcseqact++;
1728 else if (runp->mbs != NULL && runp->weights != NULL)
1729 /* This is for collation elements. */
1730 runp->wcseqorder = wcseqact++;
1732 /* Up to the next entry. */
1733 runp = runp->next;
1736 /* Find out whether any of the `mbheads' entries is unset. In this
1737 case we use the UNDEFINED entry. */
1738 for (i = 1; i < 256; ++i)
1739 if (collate->mbheads[i] == NULL)
1741 need_undefined = 1;
1742 collate->mbheads[i] = &collate->undefined;
1745 /* Now to the wide character case. */
1746 collate->wcheads.p = 6;
1747 collate->wcheads.q = 10;
1748 wchead_table_init (&collate->wcheads);
1750 collate->wcseqorder.p = 6;
1751 collate->wcseqorder.q = 10;
1752 collseq_table_init (&collate->wcseqorder);
1754 /* Start adding. */
1755 runp = collate->start;
1756 while (runp != NULL)
1758 if (runp->wcs != NULL)
1760 struct element_t *e;
1761 struct element_t **eptr;
1762 struct element_t *lastp;
1764 /* Insert the collation sequence value. */
1765 if (runp->is_character)
1766 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1767 runp->wcseqorder);
1769 /* Find the point where to insert in the list. */
1770 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1771 eptr = &e;
1772 lastp = NULL;
1773 while (*eptr != NULL)
1775 if ((*eptr)->nwcs < runp->nwcs)
1776 break;
1778 if ((*eptr)->nwcs == runp->nwcs)
1780 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1781 (wchar_t *) runp->wcs, runp->nwcs);
1783 if (c == 0)
1785 /* This should not happen. It means that we have
1786 two symbols with the same byte sequence. It is
1787 of course an error. */
1788 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1789 (*eptr)->line,
1790 _("\
1791 symbol `%s' has the same encoding as"), (*eptr)->name);
1792 error_at_line (0, 0, runp->file,
1793 runp->line,
1794 _("symbol `%s'"),
1795 runp->name));
1796 goto dont_insertwc;
1798 else if (c < 0)
1799 /* Insert it here. */
1800 break;
1803 /* To the next entry. */
1804 lastp = *eptr;
1805 eptr = &(*eptr)->wcnext;
1808 /* Set the pointers. */
1809 runp->wcnext = *eptr;
1810 runp->wclast = lastp;
1811 if (*eptr != NULL)
1812 (*eptr)->wclast = runp;
1813 *eptr = runp;
/* EPTR still pointing at the local copy E means RUNP became the new
   head of the list, so the head stored in the table must be replaced.  */
1814 if (eptr == &e)
1815 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1816 dont_insertwc:
1820 /* Up to the next entry. */
1821 runp = runp->next;
1824 /* Now determine whether the UNDEFINED entry is needed and if yes,
1825 whether it was defined. */
1826 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1827 if (collate->undefined.file == NULL)
1829 if (need_undefined)
1831 /* This seems not to be enforced by recent standards. Don't
1832 emit an error, simply append UNDEFINED at the end. */
1833 if (0)
1834 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1836 /* Add UNDEFINED at the end. */
1837 collate->undefined.mborder =
1838 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1840 for (i = 0; i < nrules; ++i)
1841 collate->undefined.mborder[i] = mbact[i]++;
1844 /* In any case we will need the definition for the wide character
1845 case. But we will not complain that it is missing since the
1846 specification strangely enough does not seem to account for
1847 this. */
1848 collate->undefined.wcorder = wcact++;
1851 /* Finally, try to unify the rules for the sections. Whenever the rules
1852 for a section are the same as those for another section give the
1853 ruleset the same index. Since there are never many sections we can
1854 use an O(n^2) algorithm here. */
1855 sect = collate->sections;
1856 while (sect != NULL && sect->rules == NULL)
1857 sect = sect->next;
1859 /* Bail out if we have no sections because of earlier errors. */
1860 if (sect == NULL)
1862 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1863 _("too many errors; giving up")));
1864 return;
1867 ruleidx = 0;
1870 struct section_list *osect = collate->sections;
1872 while (osect != sect)
1873 if (osect->rules != NULL
1874 && memcmp (osect->rules, sect->rules,
1875 nrules * sizeof (osect->rules[0])) == 0)
1876 break;
1877 else
1878 osect = osect->next;
1880 if (osect == sect)
1881 sect->ruleidx = ruleidx++;
1882 else
1883 sect->ruleidx = osect->ruleidx;
1885 /* Next section. */
1887 sect = sect->next;
1888 while (sect != NULL && sect->rules == NULL);
1890 while (sect != NULL);
1891 /* We are currently not prepared for more than 128 rulesets. But this
1892 should never really be a problem. */
1893 assert (ruleidx <= 128);
1897 static int32_t
1898 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1899 struct element_t *elem)
1901 size_t cnt;
1902 int32_t retval;
1904 /* Optimize the use of UNDEFINED. */
1905 if (elem == &collate->undefined)
1906 /* The weights are already inserted. */
1907 return 0;
1909 /* This byte can start exactly one collation element and this is
1910 a single byte. We can directly give the index to the weights. */
1911 retval = obstack_object_size (pool);
1913 /* Construct the weight. */
1914 for (cnt = 0; cnt < nrules; ++cnt)
1916 char buf[elem->weights[cnt].cnt * 7];
1917 int len = 0;
1918 int i;
1920 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1921 /* Encode the weight value. We do nothing for IGNORE entries. */
1922 if (elem->weights[cnt].w[i] != NULL)
1923 len += utf8_encode (&buf[len],
1924 elem->weights[cnt].w[i]->mborder[cnt]);
1926 /* And add the buffer content. */
1927 obstack_1grow (pool, len);
1928 obstack_grow (pool, buf, len);
1931 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1935 static int32_t
1936 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1937 struct element_t *elem)
1939 size_t cnt;
1940 int32_t retval;
1942 /* Optimize the use of UNDEFINED. */
1943 if (elem == &collate->undefined)
1944 /* The weights are already inserted. */
1945 return 0;
1947 /* This byte can start exactly one collation element and this is
1948 a single byte. We can directly give the index to the weights. */
1949 retval = obstack_object_size (pool) / sizeof (int32_t);
1951 /* Construct the weight. */
1952 for (cnt = 0; cnt < nrules; ++cnt)
1954 int32_t buf[elem->weights[cnt].cnt];
1955 int i;
1956 int32_t j;
1958 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1959 if (elem->weights[cnt].w[i] != NULL)
1960 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1962 /* And add the buffer content. */
1963 obstack_int32_grow (pool, j);
1965 obstack_grow (pool, buf, j * sizeof (int32_t));
1966 maybe_swap_uint32_obstack (pool, j);
1969 return retval | ((elem->section->ruleidx & 0x7f) << 24);
/* State shared between `collate_output' and `add_to_tablewc'.  The
   latter is handed to `wchead_table_iterate' and is called with only a
   character and an element, so everything else it needs is passed
   through this file-scope object.  `collate_output' fills it in before
   the iteration and zeroes it afterwards.  */
1972 /* If localedef is ever threaded, this would need to be __thread var. */
1973 static struct
1975 struct obstack *weightpool; /* Receives the weight records.  */
1976 struct obstack *extrapool; /* Receives the sequence entries.  */
1977 struct obstack *indpool; /* Receives indirect weight indices.  */
1978 struct locale_collate_t *collate; /* Collation data being written.  */
1979 struct collidx_table *tablewc; /* Wide character index table.  */
1980 } atwc;
1982 static void add_to_tablewc (uint32_t ch, struct element_t *runp);
/* Enter the list of elements starting with wide character CH, whose
   head is RUNP, into the table `atwc.tablewc'.

   If RUNP is the only element and consists of a single wide character
   its weights are written and the resulting index is stored directly.
   Otherwise the table gets the negated current word offset into
   `atwc.extrapool', and one entry per element (or per run of
   consecutive elements) is appended there.  A run is marked by a
   negative index into `atwc.indpool', which receives the individual
   weight indices.

   All pools and tables are taken from the file-scope `atwc'.  */
1984 static void
1985 add_to_tablewc (uint32_t ch, struct element_t *runp)
1987 if (runp->wcnext == NULL && runp->nwcs == 1)
1989 int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1990 runp);
1991 collidx_table_add (atwc.tablewc, ch, weigthidx);
1993 else
1995 /* As for the singlebyte table, we recognize sequences and
1996 compress them. */
1998 collidx_table_add (atwc.tablewc, ch,
1999 -(obstack_object_size (atwc.extrapool)
2000 / sizeof (uint32_t)));
2004 /* Store the current index in the weight table. We know that
2005 the current position in the `extrapool' is aligned on a
2006 32-bit address. */
2007 int32_t weightidx;
2008 int added;
2010 /* Find out whether this is a single entry or we have more than
2011 one consecutive entry. */
2012 if (runp->wcnext != NULL
2013 && runp->nwcs == runp->wcnext->nwcs
2014 && wmemcmp ((wchar_t *) runp->wcs,
2015 (wchar_t *)runp->wcnext->wcs,
2016 runp->nwcs - 1) == 0
2017 && (runp->wcs[runp->nwcs - 1]
2018 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2020 int i;
2021 struct element_t *series_startp = runp;
2022 struct element_t *curp;
2024 /* Now add first the initial byte sequence. */
2025 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2026 if (sizeof (int32_t) == sizeof (int))
2027 obstack_make_room (atwc.extrapool, added);
2029 /* More than one consecutive entry. We mark this by having
2030 a negative index into the indirect table. */
2031 obstack_int32_grow_fast (atwc.extrapool,
2032 -(obstack_object_size (atwc.indpool)
2033 / sizeof (int32_t)));
2034 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2037 runp = runp->wcnext;
2038 while (runp->wcnext != NULL
2039 && runp->nwcs == runp->wcnext->nwcs
2040 && wmemcmp ((wchar_t *) runp->wcs,
2041 (wchar_t *)runp->wcnext->wcs,
2042 runp->nwcs - 1) == 0
2043 && (runp->wcs[runp->nwcs - 1]
2044 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2046 /* Now walk backward from here to the beginning. */
2047 curp = runp;
2049 for (i = 1; i < runp->nwcs; ++i)
2050 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2052 /* Now find the end of the consecutive sequence and
2053 add all the indices in the indirect pool. */
2056 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2057 curp);
2058 obstack_int32_grow (atwc.indpool, weightidx);
2060 curp = curp->wclast;
2062 while (curp != series_startp);
2064 /* Add the final weight. */
2065 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2066 curp);
2067 obstack_int32_grow (atwc.indpool, weightidx);
2069 /* And add the end byte sequence. Without length this
2070 time. */
2071 for (i = 1; i < curp->nwcs; ++i)
2072 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2074 else
2076 /* A single entry. Simply add the index and the length and
2077 string (except for the first character which is already
2078 tested for). */
2079 int i;
2081 /* Output the weight info. */
2082 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2083 runp);
2085 assert (runp->nwcs > 0);
2086 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2087 if (sizeof (int) == sizeof (int32_t))
2088 obstack_make_room (atwc.extrapool, added);
2090 obstack_int32_grow_fast (atwc.extrapool, weightidx);
2091 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2092 for (i = 1; i < runp->nwcs; ++i)
2093 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2096 /* Next entry. */
2097 runp = runp->wcnext;
2099 while (runp != NULL);
2103 void
2104 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2105 const char *output_path)
2107 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2108 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2109 struct locale_file file;
2110 size_t ch;
2111 int32_t tablemb[256];
2112 struct obstack weightpool;
2113 struct obstack extrapool;
2114 struct obstack indirectpool;
2115 struct section_list *sect;
2116 struct collidx_table tablewc;
2117 uint32_t elem_size;
2118 uint32_t *elem_table;
2119 int i;
2120 struct element_t *runp;
2122 init_locale_data (&file, nelems);
2123 add_locale_uint32 (&file, nrules);
2125 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
2126 if (collate == NULL)
2128 size_t idx;
2129 for (idx = 1; idx < nelems; idx++)
2131 /* The words have to be handled specially. */
2132 if (idx == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2133 add_locale_uint32 (&file, 0);
2134 else if (idx == _NL_ITEM_INDEX (_NL_COLLATE_ENCODING_TYPE))
2135 add_locale_uint32 (&file, __cet_other);
2136 else
2137 add_locale_empty (&file);
2139 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2140 return;
2143 obstack_init (&weightpool);
2144 obstack_init (&extrapool);
2145 obstack_init (&indirectpool);
2147 /* Since we are using the sign of an integer to mark indirection the
2148 offsets in the arrays we are indirectly referring to must not be
2149 zero since -0 == 0. Therefore we add a bit of dummy content. */
2150 obstack_int32_grow (&extrapool, 0);
2151 obstack_int32_grow (&indirectpool, 0);
2153 /* Prepare the ruleset table. */
2154 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2155 if (sect->rules != NULL && sect->ruleidx == i)
2157 int j;
2159 obstack_make_room (&weightpool, nrules);
2161 for (j = 0; j < nrules; ++j)
2162 obstack_1grow_fast (&weightpool, sect->rules[j]);
2163 ++i;
2165 /* And align the output. */
2166 i = (nrules * i) % LOCFILE_ALIGN;
2167 if (i > 0)
2169 obstack_1grow (&weightpool, '\0');
2170 while (++i < LOCFILE_ALIGN);
2172 add_locale_raw_obstack (&file, &weightpool);
2174 /* Generate the 8-bit table. Walk through the lists of sequences
2175 starting with the same byte and add them one after the other to
2176 the table. In case we have more than one sequence starting with
2177 the same byte we have to use extra indirection.
2179 First add a record for the NUL byte. This entry will never be used
2180 so it does not matter. */
2181 tablemb[0] = 0;
2183 /* Now insert the `UNDEFINED' value if it is used. Since this value
2184 will probably be used more than once it is good to store the
2185 weights only once. */
2186 if (collate->undefined.used_in_level != 0)
2187 output_weight (&weightpool, collate, &collate->undefined);
2189 for (ch = 1; ch < 256; ++ch)
2190 if (collate->mbheads[ch]->mbnext == NULL
2191 && collate->mbheads[ch]->nmbs <= 1)
2193 tablemb[ch] = output_weight (&weightpool, collate,
2194 collate->mbheads[ch]);
2196 else
2198 /* The entries in the list are sorted by length and then
2199 alphabetically. This is the order in which we will add the
2200 elements to the collation table. This allows simply walking
2201 the table in sequence and stopping at the first matching
2202 entry. Since the longer sequences are coming first in the
2203 list they have the possibility to match first, just as it
2204 has to be. In the worst case we are walking to the end of
2205 the list where we put, if no singlebyte sequence is defined
2206 in the locale definition, the weights for UNDEFINED.
2208 To reduce the length of the search list we compress them a bit.
2209 This happens by collecting sequences of consecutive byte
2210 sequences in one entry (having and begin and end byte sequence)
2211 and add only one index into the weight table. We can find the
2212 consecutive entries since they are also consecutive in the list. */
2213 struct element_t *runp = collate->mbheads[ch];
2214 struct element_t *lastp;
2216 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2218 tablemb[ch] = -obstack_object_size (&extrapool);
2222 /* Store the current index in the weight table. We know that
2223 the current position in the `extrapool' is aligned on a
2224 32-bit address. */
2225 int32_t weightidx;
2226 int added;
2228 /* Find out whether this is a single entry or we have more than
2229 one consecutive entry. */
2230 if (runp->mbnext != NULL
2231 && runp->nmbs == runp->mbnext->nmbs
2232 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2233 && (runp->mbs[runp->nmbs - 1]
2234 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2236 int i;
2237 struct element_t *series_startp = runp;
2238 struct element_t *curp;
2240 /* Compute how much space we will need. */
2241 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2242 + 2 * (runp->nmbs - 1));
2243 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2244 obstack_make_room (&extrapool, added);
2246 /* More than one consecutive entry. We mark this by having
2247 a negative index into the indirect table. */
2248 obstack_int32_grow_fast (&extrapool,
2249 -(obstack_object_size (&indirectpool)
2250 / sizeof (int32_t)));
2252 /* Now search first the end of the series. */
2254 runp = runp->mbnext;
2255 while (runp->mbnext != NULL
2256 && runp->nmbs == runp->mbnext->nmbs
2257 && memcmp (runp->mbs, runp->mbnext->mbs,
2258 runp->nmbs - 1) == 0
2259 && (runp->mbs[runp->nmbs - 1]
2260 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2262 /* Now walk backward from here to the beginning. */
2263 curp = runp;
2265 assert (runp->nmbs <= 256);
2266 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2267 for (i = 1; i < curp->nmbs; ++i)
2268 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2270 /* Now find the end of the consecutive sequence and
2271 add all the indices in the indirect pool. */
2274 weightidx = output_weight (&weightpool, collate, curp);
2275 obstack_int32_grow (&indirectpool, weightidx);
2277 curp = curp->mblast;
2279 while (curp != series_startp);
2281 /* Add the final weight. */
2282 weightidx = output_weight (&weightpool, collate, curp);
2283 obstack_int32_grow (&indirectpool, weightidx);
2285 /* And add the end byte sequence. Without length this
2286 time. */
2287 for (i = 1; i < curp->nmbs; ++i)
2288 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2290 else
2292 /* A single entry. Simply add the index and the length and
2293 string (except for the first character which is already
2294 tested for). */
2295 int i;
2297 /* Output the weight info. */
2298 weightidx = output_weight (&weightpool, collate, runp);
2300 added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1
2301 + runp->nmbs - 1);
2302 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2303 obstack_make_room (&extrapool, added);
2305 obstack_int32_grow_fast (&extrapool, weightidx);
2306 assert (runp->nmbs <= 256);
2307 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2309 for (i = 1; i < runp->nmbs; ++i)
2310 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2313 /* Add alignment bytes if necessary. */
2314 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2315 obstack_1grow_fast (&extrapool, '\0');
2317 /* Next entry. */
2318 lastp = runp;
2319 runp = runp->mbnext;
2321 while (runp != NULL);
2323 assert (LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)));
2325 /* If the final entry in the list is not a single character we
2326 add an UNDEFINED entry here. */
2327 if (lastp->nmbs != 1)
2329 int added = LOCFILE_ALIGN_UP (sizeof (int32_t) + 1 + 1);
2330 obstack_make_room (&extrapool, added);
2332 obstack_int32_grow_fast (&extrapool, 0);
2333 /* XXX What rule? We just pick the first. */
2334 obstack_1grow_fast (&extrapool, 0);
2335 /* Length is zero. */
2336 obstack_1grow_fast (&extrapool, 0);
2338 /* Add alignment bytes if necessary. */
2339 while (!LOCFILE_ALIGNED_P (obstack_object_size (&extrapool)))
2340 obstack_1grow_fast (&extrapool, '\0');
2344 /* Add padding to the tables if necessary. */
2345 while (!LOCFILE_ALIGNED_P (obstack_object_size (&weightpool)))
2346 obstack_1grow (&weightpool, 0);
2348 /* Now add the four tables. */
2349 add_locale_uint32_array (&file, (const uint32_t *) tablemb, 256);
2350 add_locale_raw_obstack (&file, &weightpool);
2351 add_locale_raw_obstack (&file, &extrapool);
2352 add_locale_raw_obstack (&file, &indirectpool);
2354 /* Now the same for the wide character table. We need to store some
2355 more information here. */
2356 add_locale_empty (&file);
2357 add_locale_empty (&file);
2358 add_locale_empty (&file);
2360 /* Since we are using the sign of an integer to mark indirection the
2361 offsets in the arrays we are indirectly referring to must not be
2362 zero since -0 == 0. Therefore we add a bit of dummy content. */
2363 obstack_int32_grow (&extrapool, 0);
2364 obstack_int32_grow (&indirectpool, 0);
2366 /* Now insert the `UNDEFINED' value if it is used. Since this value
2367 will probably be used more than once it is good to store the
2368 weights only once. */
2369 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2370 abort ();
2372 /* Generate the table. Walk through the lists of sequences starting
2373 with the same wide character and add them one after the other to
2374 the table. In case we have more than one sequence starting with
2375 the same byte we have to use extra indirection. */
2376 tablewc.p = 6;
2377 tablewc.q = 10;
2378 collidx_table_init (&tablewc);
2380 atwc.weightpool = &weightpool;
2381 atwc.extrapool = &extrapool;
2382 atwc.indpool = &indirectpool;
2383 atwc.collate = collate;
2384 atwc.tablewc = &tablewc;
2386 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2388 memset (&atwc, 0, sizeof (atwc));
2390 /* Now add the four tables. */
2391 add_locale_collidx_table (&file, &tablewc);
2392 add_locale_raw_obstack (&file, &weightpool);
2393 add_locale_raw_obstack (&file, &extrapool);
2394 add_locale_raw_obstack (&file, &indirectpool);
2396 /* Finally write the table with collation element names out. It is
2397 a hash table with a simple function which gets the name of the
2398 character as the input. One character might have many names. The
2399 value associated with the name is an index into the weight table
2400 where we are then interested in the first-level weight value.
2402 To determine how large the table should be we are counting the
2403 elements have to put in. Since we are using internal chaining
2404 using a secondary hash function we have to make the table a bit
2405 larger to avoid extremely long search times. We can achieve
2406 good results with a 40% larger table than there are entries. */
2407 elem_size = 0;
2408 runp = collate->start;
2409 while (runp != NULL)
2411 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2412 /* Yep, the element really counts. */
2413 ++elem_size;
2415 runp = runp->next;
2417 /* Add 40% and find the next prime number. */
2418 elem_size = next_prime (elem_size * 1.4);
2420 /* Allocate the table. Each entry consists of two words: the hash
2421 value and an index in a secondary table which provides the index
2422 into the weight table and the string itself (so that a match can
2423 be determined). */
2424 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2425 elem_size * 2 * sizeof (uint32_t));
2426 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2428 /* Now add the elements. */
2429 runp = collate->start;
2430 while (runp != NULL)
2432 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2434 /* Compute the hash value of the name. */
2435 uint32_t namelen = strlen (runp->name);
2436 uint32_t hash = elem_hash (runp->name, namelen);
2437 size_t idx = hash % elem_size;
2438 #ifndef NDEBUG
2439 size_t start_idx = idx;
2440 #endif
2442 if (elem_table[idx * 2] != 0)
2444 /* The spot is already taken. Try iterating using the value
2445 from the secondary hashing function. */
2446 size_t iter = hash % (elem_size - 2) + 1;
2450 idx += iter;
2451 if (idx >= elem_size)
2452 idx -= elem_size;
2453 assert (idx != start_idx);
2455 while (elem_table[idx * 2] != 0);
2457 /* This is the spot where we will insert the value. */
2458 elem_table[idx * 2] = hash;
2459 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2461 /* The string itself including length. */
2462 obstack_1grow (&extrapool, namelen);
2463 obstack_grow (&extrapool, runp->name, namelen);
2465 /* And the multibyte representation. */
2466 obstack_1grow (&extrapool, runp->nmbs);
2467 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2469 /* And align again to 32 bits. */
2470 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2471 obstack_grow (&extrapool, "\0\0",
2472 (sizeof (int32_t)
2473 - ((1 + namelen + 1 + runp->nmbs)
2474 % sizeof (int32_t))));
2476 /* Now some 32-bit values: multibyte collation sequence,
2477 wide char string (including length), and wide char
2478 collation sequence. */
2479 obstack_int32_grow (&extrapool, runp->mbseqorder);
2481 obstack_int32_grow (&extrapool, runp->nwcs);
2482 obstack_grow (&extrapool, runp->wcs,
2483 runp->nwcs * sizeof (uint32_t));
2484 maybe_swap_uint32_obstack (&extrapool, runp->nwcs);
2486 obstack_int32_grow (&extrapool, runp->wcseqorder);
2489 runp = runp->next;
2492 /* Prepare to write out this data. */
2493 add_locale_uint32 (&file, elem_size);
2494 add_locale_uint32_array (&file, elem_table, 2 * elem_size);
2495 add_locale_raw_obstack (&file, &extrapool);
2496 add_locale_raw_data (&file, collate->mbseqorder, 256);
2497 add_locale_collseq_table (&file, &collate->wcseqorder);
2498 add_locale_string (&file, charmap->code_set_name);
2499 if (strcmp (charmap->code_set_name, "UTF-8") == 0)
2500 add_locale_uint32 (&file, __cet_utf8);
2501 else if (charmap->mb_cur_max == 1)
2502 add_locale_uint32 (&file, __cet_8bit);
2503 else
2504 add_locale_uint32 (&file, __cet_other);
2505 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", &file);
2507 obstack_free (&weightpool, NULL);
2508 obstack_free (&extrapool, NULL);
2509 obstack_free (&indirectpool, NULL);
2513 static enum token_t
2514 skip_to (struct linereader *ldfile, struct locale_collate_t *collate,
2515 const struct charmap_t *charmap, int to_endif)
2517 while (1)
2519 struct token *now = lr_token (ldfile, charmap, NULL, NULL, 0);
2520 enum token_t nowtok = now->tok;
2522 if (nowtok == tok_eof || nowtok == tok_end)
2523 return nowtok;
2525 if (nowtok == tok_ifdef || nowtok == tok_ifndef)
2527 lr_error (ldfile, _("%s: nested conditionals not supported"),
2528 "LC_COLLATE");
2529 nowtok = skip_to (ldfile, collate, charmap, tok_endif);
2530 if (nowtok == tok_eof || nowtok == tok_end)
2531 return nowtok;
2533 else if (nowtok == tok_endif || (!to_endif && nowtok == tok_else))
2535 lr_ignore_rest (ldfile, 1);
2536 return nowtok;
2538 else if (!to_endif && (nowtok == tok_elifdef || nowtok == tok_elifndef))
2540 /* Do not read the rest of the line. */
2541 return nowtok;
2543 else if (nowtok == tok_else)
2545 lr_error (ldfile, _("%s: more than one 'else'"), "LC_COLLATE");
2548 lr_ignore_rest (ldfile, 0);
2553 void
2554 collate_read (struct linereader *ldfile, struct localedef_t *result,
2555 const struct charmap_t *charmap, const char *repertoire_name,
2556 int ignore_content)
2558 struct repertoire_t *repertoire = NULL;
2559 struct locale_collate_t *collate;
2560 struct token *now;
2561 struct token *arg = NULL;
2562 enum token_t nowtok;
2563 enum token_t was_ellipsis = tok_none;
2564 struct localedef_t *copy_locale = NULL;
2565 /* Parsing state:
2566 0 - start
2567 1 - between `order-start' and `order-end'
2568 2 - after `order-end'
2569 3 - after `reorder-after', waiting for `reorder-end'
2570 4 - after `reorder-end'
2571 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2572 6 - after `reorder-sections-end'
2574 int state = 0;
2576 /* Get the repertoire we have to use. */
2577 if (repertoire_name != NULL)
2578 repertoire = repertoire_read (repertoire_name);
2580 /* The rest of the line containing `LC_COLLATE' must be free. */
2581 lr_ignore_rest (ldfile, 1);
2583 while (1)
2587 now = lr_token (ldfile, charmap, result, NULL, verbose);
2588 nowtok = now->tok;
2590 while (nowtok == tok_eol);
2592 if (nowtok != tok_define)
2593 break;
2595 if (ignore_content)
2596 lr_ignore_rest (ldfile, 0);
2597 else
2599 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2600 if (arg->tok != tok_ident)
2601 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2602 else
2604 /* Simply add the new symbol. */
2605 struct name_list *newsym = xmalloc (sizeof (*newsym)
2606 + arg->val.str.lenmb + 1);
2607 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
2608 newsym->str[arg->val.str.lenmb] = '\0';
2609 newsym->next = defined;
2610 defined = newsym;
2612 lr_ignore_rest (ldfile, 1);
2617 if (nowtok == tok_copy)
2619 now = lr_token (ldfile, charmap, result, NULL, verbose);
2620 if (now->tok != tok_string)
2622 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2624 skip_category:
2626 now = lr_token (ldfile, charmap, result, NULL, verbose);
2627 while (now->tok != tok_eof && now->tok != tok_end);
2629 if (now->tok != tok_eof
2630 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2631 now->tok == tok_eof))
2632 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2633 else if (now->tok != tok_lc_collate)
2635 lr_error (ldfile, _("\
2636 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2637 lr_ignore_rest (ldfile, 0);
2639 else
2640 lr_ignore_rest (ldfile, 1);
2642 return;
2645 if (! ignore_content)
2647 /* Get the locale definition. */
2648 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2649 repertoire_name, charmap, NULL);
2650 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2652 /* Not yet loaded. So do it now. */
2653 if (locfile_read (copy_locale, charmap) != 0)
2654 goto skip_category;
2657 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2658 return;
2661 lr_ignore_rest (ldfile, 1);
2663 now = lr_token (ldfile, charmap, result, NULL, verbose);
2664 nowtok = now->tok;
2667 /* Prepare the data structures. */
2668 collate_startup (ldfile, result, copy_locale, ignore_content);
2669 collate = result->categories[LC_COLLATE].collate;
2671 while (1)
2673 char ucs4buf[10];
2674 char *symstr;
2675 size_t symlen;
2677 /* Of course we don't proceed beyond the end of file. */
2678 if (nowtok == tok_eof)
2679 break;
2681 /* Ingore empty lines. */
2682 if (nowtok == tok_eol)
2684 now = lr_token (ldfile, charmap, result, NULL, verbose);
2685 nowtok = now->tok;
2686 continue;
2689 switch (nowtok)
2691 case tok_copy:
2692 /* Allow copying other locales. */
2693 now = lr_token (ldfile, charmap, result, NULL, verbose);
2694 if (now->tok != tok_string)
2695 goto err_label;
2697 if (! ignore_content)
2698 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2699 charmap, result);
2701 lr_ignore_rest (ldfile, 1);
2702 break;
2704 case tok_coll_weight_max:
2705 /* Ignore the rest of the line if we don't need the input of
2706 this line. */
2707 if (ignore_content)
2709 lr_ignore_rest (ldfile, 0);
2710 break;
2713 if (state != 0)
2714 goto err_label;
2716 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2717 if (arg->tok != tok_number)
2718 goto err_label;
2719 if (collate->col_weight_max != -1)
2720 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2721 "LC_COLLATE", "col_weight_max");
2722 else
2723 collate->col_weight_max = arg->val.num;
2724 lr_ignore_rest (ldfile, 1);
2725 break;
2727 case tok_section_symbol:
2728 /* Ignore the rest of the line if we don't need the input of
2729 this line. */
2730 if (ignore_content)
2732 lr_ignore_rest (ldfile, 0);
2733 break;
2736 if (state != 0)
2737 goto err_label;
2739 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2740 if (arg->tok != tok_bsymbol)
2741 goto err_label;
2742 else if (!ignore_content)
2744 /* Check whether this section is already known. */
2745 struct section_list *known = collate->sections;
2746 while (known != NULL)
2748 if (strcmp (known->name, arg->val.str.startmb) == 0)
2749 break;
2750 known = known->next;
2753 if (known != NULL)
2755 lr_error (ldfile,
2756 _("%s: duplicate declaration of section `%s'"),
2757 "LC_COLLATE", arg->val.str.startmb);
2758 free (arg->val.str.startmb);
2760 else
2761 collate->sections = make_seclist_elem (collate,
2762 arg->val.str.startmb,
2763 collate->sections);
2765 lr_ignore_rest (ldfile, known == NULL);
2767 else
2769 free (arg->val.str.startmb);
2770 lr_ignore_rest (ldfile, 0);
2772 break;
2774 case tok_collating_element:
2775 /* Ignore the rest of the line if we don't need the input of
2776 this line. */
2777 if (ignore_content)
2779 lr_ignore_rest (ldfile, 0);
2780 break;
2783 if (state != 0 && state != 2)
2784 goto err_label;
2786 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2787 if (arg->tok != tok_bsymbol)
2788 goto err_label;
2789 else
2791 const char *symbol = arg->val.str.startmb;
2792 size_t symbol_len = arg->val.str.lenmb;
2794 /* Next the `from' keyword. */
2795 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2796 if (arg->tok != tok_from)
2798 free ((char *) symbol);
2799 goto err_label;
2802 ldfile->return_widestr = 1;
2803 ldfile->translate_strings = 1;
2805 /* Finally the string with the replacement. */
2806 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2808 ldfile->return_widestr = 0;
2809 ldfile->translate_strings = 0;
2811 if (arg->tok != tok_string)
2812 goto err_label;
2814 if (!ignore_content && symbol != NULL)
2816 /* The name is already defined. */
2817 if (check_duplicate (ldfile, collate, charmap,
2818 repertoire, symbol, symbol_len))
2819 goto col_elem_free;
2821 if (arg->val.str.startmb != NULL)
2822 insert_entry (&collate->elem_table, symbol, symbol_len,
2823 new_element (collate,
2824 arg->val.str.startmb,
2825 arg->val.str.lenmb - 1,
2826 arg->val.str.startwc,
2827 symbol, symbol_len, 0));
2829 else
2831 col_elem_free:
2832 free ((char *) symbol);
2833 free (arg->val.str.startmb);
2834 free (arg->val.str.startwc);
2836 lr_ignore_rest (ldfile, 1);
2838 break;
2840 case tok_collating_symbol:
2841 /* Ignore the rest of the line if we don't need the input of
2842 this line. */
2843 if (ignore_content)
2845 lr_ignore_rest (ldfile, 0);
2846 break;
2849 if (state != 0 && state != 2)
2850 goto err_label;
2852 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2853 if (arg->tok != tok_bsymbol)
2854 goto err_label;
2855 else
2857 char *symbol = arg->val.str.startmb;
2858 size_t symbol_len = arg->val.str.lenmb;
2859 char *endsymbol = NULL;
2860 size_t endsymbol_len = 0;
2861 enum token_t ellipsis = tok_none;
2863 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2864 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2866 ellipsis = arg->tok;
2868 arg = lr_token (ldfile, charmap, result, repertoire,
2869 verbose);
2870 if (arg->tok != tok_bsymbol)
2872 free (symbol);
2873 goto err_label;
2876 endsymbol = arg->val.str.startmb;
2877 endsymbol_len = arg->val.str.lenmb;
2879 lr_ignore_rest (ldfile, 1);
2881 else if (arg->tok != tok_eol)
2883 free (symbol);
2884 goto err_label;
2887 if (!ignore_content)
2889 if (symbol == NULL
2890 || (ellipsis != tok_none && endsymbol == NULL))
2892 lr_error (ldfile, _("\
2893 %s: unknown character in collating symbol name"),
2894 "LC_COLLATE");
2895 goto col_sym_free;
2897 else if (ellipsis == tok_none)
2899 /* A single symbol, no ellipsis. */
2900 if (check_duplicate (ldfile, collate, charmap,
2901 repertoire, symbol, symbol_len))
2902 /* The name is already defined. */
2903 goto col_sym_free;
2905 insert_entry (&collate->sym_table, symbol, symbol_len,
2906 new_symbol (collate, symbol, symbol_len));
2908 else if (symbol_len != endsymbol_len)
2910 col_sym_inv_range:
2911 lr_error (ldfile,
2912 _("invalid names for character range"));
2913 goto col_sym_free;
2915 else
2917 /* Oh my, we have to handle an ellipsis. First, as
2918 usual, determine the common prefix and then
2919 convert the rest into a range. */
2920 size_t prefixlen;
2921 unsigned long int from;
2922 unsigned long int to;
2923 char *endp;
2925 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2926 if (symbol[prefixlen] != endsymbol[prefixlen])
2927 break;
2929 /* Convert the rest into numbers. */
2930 symbol[symbol_len] = '\0';
2931 from = strtoul (&symbol[prefixlen], &endp,
2932 ellipsis == tok_ellipsis2 ? 16 : 10);
2933 if (*endp != '\0')
2934 goto col_sym_inv_range;
2936 endsymbol[symbol_len] = '\0';
2937 to = strtoul (&endsymbol[prefixlen], &endp,
2938 ellipsis == tok_ellipsis2 ? 16 : 10);
2939 if (*endp != '\0')
2940 goto col_sym_inv_range;
2942 if (from > to)
2943 goto col_sym_inv_range;
2945 /* Now loop over all entries. */
2946 while (from <= to)
2948 char *symbuf;
2950 symbuf = (char *) obstack_alloc (&collate->mempool,
2951 symbol_len + 1);
2953 /* Create the name. */
2954 sprintf (symbuf,
2955 ellipsis == tok_ellipsis2
2956 ? "%.*s%.*lX" : "%.*s%.*lu",
2957 (int) prefixlen, symbol,
2958 (int) (symbol_len - prefixlen), from);
2960 if (check_duplicate (ldfile, collate, charmap,
2961 repertoire, symbuf, symbol_len))
2962 /* The name is already defined. */
2963 goto col_sym_free;
2965 insert_entry (&collate->sym_table, symbuf,
2966 symbol_len,
2967 new_symbol (collate, symbuf,
2968 symbol_len));
2970 /* Increment the counter. */
2971 ++from;
2974 goto col_sym_free;
2977 else
2979 col_sym_free:
2980 free (symbol);
2981 free (endsymbol);
2984 break;
2986 case tok_symbol_equivalence:
2987 /* Ignore the rest of the line if we don't need the input of
2988 this line. */
2989 if (ignore_content)
2991 lr_ignore_rest (ldfile, 0);
2992 break;
2995 if (state != 0)
2996 goto err_label;
2998 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2999 if (arg->tok != tok_bsymbol)
3000 goto err_label;
3001 else
3003 const char *newname = arg->val.str.startmb;
3004 size_t newname_len = arg->val.str.lenmb;
3005 const char *symname;
3006 size_t symname_len;
3007 void *symval; /* Actually struct symbol_t* */
3009 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3010 if (arg->tok != tok_bsymbol)
3012 free ((char *) newname);
3013 goto err_label;
3016 symname = arg->val.str.startmb;
3017 symname_len = arg->val.str.lenmb;
3019 if (newname == NULL)
3021 lr_error (ldfile, _("\
3022 %s: unknown character in equivalent definition name"),
3023 "LC_COLLATE");
3025 sym_equiv_free:
3026 free ((char *) newname);
3027 free ((char *) symname);
3028 break;
3030 if (symname == NULL)
3032 lr_error (ldfile, _("\
3033 %s: unknown character in equivalent definition value"),
3034 "LC_COLLATE");
3035 goto sym_equiv_free;
3038 /* See whether the symbol name is already defined. */
3039 if (find_entry (&collate->sym_table, symname, symname_len,
3040 &symval) != 0)
3042 lr_error (ldfile, _("\
3043 %s: unknown symbol `%s' in equivalent definition"),
3044 "LC_COLLATE", symname);
3045 goto sym_equiv_free;
3048 if (insert_entry (&collate->sym_table,
3049 newname, newname_len, symval) < 0)
3051 lr_error (ldfile, _("\
3052 error while adding equivalent collating symbol"));
3053 goto sym_equiv_free;
3056 free ((char *) symname);
3058 lr_ignore_rest (ldfile, 1);
3059 break;
3061 case tok_script:
3062 /* Ignore the rest of the line if we don't need the input of
3063 this line. */
3064 if (ignore_content)
3066 lr_ignore_rest (ldfile, 0);
3067 break;
3070 /* We get told about the scripts we know. */
3071 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3072 if (arg->tok != tok_bsymbol)
3073 goto err_label;
3074 else
3076 struct section_list *runp = collate->known_sections;
3077 char *name;
3079 while (runp != NULL)
3080 if (strncmp (runp->name, arg->val.str.startmb,
3081 arg->val.str.lenmb) == 0
3082 && runp->name[arg->val.str.lenmb] == '\0')
3083 break;
3084 else
3085 runp = runp->def_next;
3087 if (runp != NULL)
3089 lr_error (ldfile, _("duplicate definition of script `%s'"),
3090 runp->name);
3091 lr_ignore_rest (ldfile, 0);
3092 break;
3095 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3096 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3097 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3098 name[arg->val.str.lenmb] = '\0';
3099 runp->name = name;
3101 runp->def_next = collate->known_sections;
3102 collate->known_sections = runp;
3104 lr_ignore_rest (ldfile, 1);
3105 break;
3107 case tok_order_start:
3108 /* Ignore the rest of the line if we don't need the input of
3109 this line. */
3110 if (ignore_content)
3112 lr_ignore_rest (ldfile, 0);
3113 break;
3116 if (state != 0 && state != 1 && state != 2)
3117 goto err_label;
3118 state = 1;
3120 /* The 14652 draft does not specify whether all `order_start' lines
3121 must contain the same number of sort-rules, but 14651 does. So
3122 we require this here as well. */
3123 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3124 if (arg->tok == tok_bsymbol)
3126 /* This better should be a section name. */
3127 struct section_list *sp = collate->known_sections;
3128 while (sp != NULL
3129 && (sp->name == NULL
3130 || strncmp (sp->name, arg->val.str.startmb,
3131 arg->val.str.lenmb) != 0
3132 || sp->name[arg->val.str.lenmb] != '\0'))
3133 sp = sp->def_next;
3135 if (sp == NULL)
3137 lr_error (ldfile, _("\
3138 %s: unknown section name `%.*s'"),
3139 "LC_COLLATE", (int) arg->val.str.lenmb,
3140 arg->val.str.startmb);
3141 /* We use the error section. */
3142 collate->current_section = &collate->error_section;
3144 if (collate->error_section.first == NULL)
3146 /* Insert &collate->error_section at the end of
3147 the collate->sections list. */
3148 if (collate->sections == NULL)
3149 collate->sections = &collate->error_section;
3150 else
3152 sp = collate->sections;
3153 while (sp->next != NULL)
3154 sp = sp->next;
3156 sp->next = &collate->error_section;
3158 collate->error_section.next = NULL;
3161 else
3163 /* One should not be allowed to open the same
3164 section twice. */
3165 if (sp->first != NULL)
3166 lr_error (ldfile, _("\
3167 %s: multiple order definitions for section `%s'"),
3168 "LC_COLLATE", sp->name);
3169 else
3171 /* Insert sp in the collate->sections list,
3172 right after collate->current_section. */
3173 if (collate->current_section != NULL)
3175 sp->next = collate->current_section->next;
3176 collate->current_section->next = sp;
3178 else if (collate->sections == NULL)
3179 /* This is the first section to be defined. */
3180 collate->sections = sp;
3182 collate->current_section = sp;
3185 /* Next should come the end of the line or a semicolon. */
3186 arg = lr_token (ldfile, charmap, result, repertoire,
3187 verbose);
3188 if (arg->tok == tok_eol)
3190 uint32_t cnt;
3192 /* This means we have exactly one rule: `forward'. */
3193 if (nrules > 1)
3194 lr_error (ldfile, _("\
3195 %s: invalid number of sorting rules"),
3196 "LC_COLLATE");
3197 else
3198 nrules = 1;
3199 sp->rules = obstack_alloc (&collate->mempool,
3200 (sizeof (enum coll_sort_rule)
3201 * nrules));
3202 for (cnt = 0; cnt < nrules; ++cnt)
3203 sp->rules[cnt] = sort_forward;
3205 /* Next line. */
3206 break;
3209 /* Get the next token. */
3210 arg = lr_token (ldfile, charmap, result, repertoire,
3211 verbose);
3214 else
3216 /* There is no section symbol. Therefore we use the unnamed
3217 section. */
3218 collate->current_section = &collate->unnamed_section;
3220 if (collate->unnamed_section_defined)
3221 lr_error (ldfile, _("\
3222 %s: multiple order definitions for unnamed section"),
3223 "LC_COLLATE");
3224 else
3226 /* Insert &collate->unnamed_section at the beginning of
3227 the collate->sections list. */
3228 collate->unnamed_section.next = collate->sections;
3229 collate->sections = &collate->unnamed_section;
3230 collate->unnamed_section_defined = true;
3234 /* Now read the direction names. */
3235 read_directions (ldfile, arg, charmap, repertoire, result);
3237 /* From now we need the strings untranslated. */
3238 ldfile->translate_strings = 0;
3239 break;
3241 case tok_order_end:
3242 /* Ignore the rest of the line if we don't need the input of
3243 this line. */
3244 if (ignore_content)
3246 lr_ignore_rest (ldfile, 0);
3247 break;
3250 if (state != 1)
3251 goto err_label;
3253 /* Handle ellipsis at end of list. */
3254 if (was_ellipsis != tok_none)
3256 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3257 repertoire, result);
3258 was_ellipsis = tok_none;
3261 state = 2;
3262 lr_ignore_rest (ldfile, 1);
3263 break;
3265 case tok_reorder_after:
3266 /* Ignore the rest of the line if we don't need the input of
3267 this line. */
3268 if (ignore_content)
3270 lr_ignore_rest (ldfile, 0);
3271 break;
3274 if (state == 1)
3276 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3277 "LC_COLLATE");
3278 state = 2;
3280 /* Handle ellipsis at end of list. */
3281 if (was_ellipsis != tok_none)
3283 handle_ellipsis (ldfile, arg->val.str.startmb,
3284 arg->val.str.lenmb, was_ellipsis, charmap,
3285 repertoire, result);
3286 was_ellipsis = tok_none;
3289 else if (state == 0 && copy_locale == NULL)
3290 goto err_label;
3291 else if (state != 0 && state != 2 && state != 3)
3292 goto err_label;
3293 state = 3;
3295 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3296 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3298 /* Find this symbol in the sequence table. */
3299 char ucsbuf[10];
3300 char *startmb;
3301 size_t lenmb;
3302 struct element_t *insp;
3303 int no_error = 1;
3304 void *ptr;
3306 if (arg->tok == tok_bsymbol)
3308 startmb = arg->val.str.startmb;
3309 lenmb = arg->val.str.lenmb;
3311 else
3313 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3314 startmb = ucsbuf;
3315 lenmb = 9;
3318 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3319 /* Yes, the symbol exists. Simply point the cursor
3320 to it. */
3321 collate->cursor = (struct element_t *) ptr;
3322 else
3324 struct symbol_t *symbp;
3325 void *ptr;
3327 if (find_entry (&collate->sym_table, startmb, lenmb,
3328 &ptr) == 0)
3330 symbp = ptr;
3332 if (symbp->order->last != NULL
3333 || symbp->order->next != NULL)
3334 collate->cursor = symbp->order;
3335 else
3337 /* This is a collating symbol but its position
3338 is not yet defined. */
3339 lr_error (ldfile, _("\
3340 %s: order for collating symbol %.*s not yet defined"),
3341 "LC_COLLATE", (int) lenmb, startmb);
3342 collate->cursor = NULL;
3343 no_error = 0;
3346 else if (find_entry (&collate->elem_table, startmb, lenmb,
3347 &ptr) == 0)
3349 insp = (struct element_t *) ptr;
3351 if (insp->last != NULL || insp->next != NULL)
3352 collate->cursor = insp;
3353 else
3355 /* This is a collating element but its position
3356 is not yet defined. */
3357 lr_error (ldfile, _("\
3358 %s: order for collating element %.*s not yet defined"),
3359 "LC_COLLATE", (int) lenmb, startmb);
3360 collate->cursor = NULL;
3361 no_error = 0;
3364 else
3366 /* This is bad. The symbol after which we have to
3367 insert does not exist. */
3368 lr_error (ldfile, _("\
3369 %s: cannot reorder after %.*s: symbol not known"),
3370 "LC_COLLATE", (int) lenmb, startmb);
3371 collate->cursor = NULL;
3372 no_error = 0;
3376 lr_ignore_rest (ldfile, no_error);
3378 else
3379 /* This must not happen. */
3380 goto err_label;
3381 break;
3383 case tok_reorder_end:
3384 /* Ignore the rest of the line if we don't need the input of
3385 this line. */
3386 if (ignore_content)
3387 break;
3389 if (state != 3)
3390 goto err_label;
3391 state = 4;
3392 lr_ignore_rest (ldfile, 1);
3393 break;
3395 case tok_reorder_sections_after:
3396 /* Ignore the rest of the line if we don't need the input of
3397 this line. */
3398 if (ignore_content)
3400 lr_ignore_rest (ldfile, 0);
3401 break;
3404 if (state == 1)
3406 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3407 "LC_COLLATE");
3408 state = 2;
3410 /* Handle ellipsis at end of list. */
3411 if (was_ellipsis != tok_none)
3413 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3414 repertoire, result);
3415 was_ellipsis = tok_none;
3418 else if (state == 3)
3420 WITH_CUR_LOCALE (error (0, 0, _("\
3421 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3422 state = 4;
3424 else if (state != 2 && state != 4)
3425 goto err_label;
3426 state = 5;
3428 /* Get the name of the sections we are adding after. */
3429 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3430 if (arg->tok == tok_bsymbol)
3432 /* Now find a section with this name. */
3433 struct section_list *runp = collate->sections;
3435 while (runp != NULL)
3437 if (runp->name != NULL
3438 && strlen (runp->name) == arg->val.str.lenmb
3439 && memcmp (runp->name, arg->val.str.startmb,
3440 arg->val.str.lenmb) == 0)
3441 break;
3443 runp = runp->next;
3446 if (runp != NULL)
3447 collate->current_section = runp;
3448 else
3450 /* This is bad. The section after which we have to
3451 reorder does not exist. Therefore we cannot
3452 process the whole rest of this reorder
3453 specification. */
3454 lr_error (ldfile, _("%s: section `%.*s' not known"),
3455 "LC_COLLATE", (int) arg->val.str.lenmb,
3456 arg->val.str.startmb);
3460 lr_ignore_rest (ldfile, 0);
3462 now = lr_token (ldfile, charmap, result, NULL, verbose);
3464 while (now->tok == tok_reorder_sections_after
3465 || now->tok == tok_reorder_sections_end
3466 || now->tok == tok_end);
3468 /* Process the token we just saw. */
3469 nowtok = now->tok;
3470 continue;
3473 else
3474 /* This must not happen. */
3475 goto err_label;
3476 break;
3478 case tok_reorder_sections_end:
3479 /* Ignore the rest of the line if we don't need the input of
3480 this line. */
3481 if (ignore_content)
3482 break;
3484 if (state != 5)
3485 goto err_label;
3486 state = 6;
3487 lr_ignore_rest (ldfile, 1);
3488 break;
3490 case tok_bsymbol:
3491 case tok_ucs4:
3492 /* Ignore the rest of the line if we don't need the input of
3493 this line. */
3494 if (ignore_content)
3496 lr_ignore_rest (ldfile, 0);
3497 break;
3500 if (state != 0 && state != 1 && state != 3 && state != 5)
3501 goto err_label;
3503 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3504 goto err_label;
3506 if (nowtok == tok_ucs4)
3508 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3509 symstr = ucs4buf;
3510 symlen = 9;
3512 else if (arg != NULL)
3514 symstr = arg->val.str.startmb;
3515 symlen = arg->val.str.lenmb;
3517 else
3519 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3520 (int) ldfile->token.val.str.lenmb,
3521 ldfile->token.val.str.startmb);
3522 break;
3525 struct element_t *seqp;
3526 if (state == 0)
3528 /* We are outside an `order_start' region. This means
3529 we must only accept definitions of values for
3530 collation symbols since these are purely abstract
3531 values and don't need directions associated. */
3532 void *ptr;
3534 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3536 seqp = ptr;
3538 /* It's already defined. First check whether this
3539 is really a collating symbol. */
3540 if (seqp->is_character)
3541 goto err_label;
3543 goto move_entry;
3545 else
3547 void *result;
3549 if (find_entry (&collate->sym_table, symstr, symlen,
3550 &result) != 0)
3551 /* No collating symbol, it's an error. */
3552 goto err_label;
3554 /* Maybe this is the first time we define a symbol
3555 value and it is before the first actual section. */
3556 if (collate->sections == NULL)
3557 collate->sections = collate->current_section =
3558 &collate->symbol_section;
3561 if (was_ellipsis != tok_none)
3563 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3564 charmap, repertoire, result);
3566 /* Remember that we processed the ellipsis. */
3567 was_ellipsis = tok_none;
3569 /* And don't add the value a second time. */
3570 break;
3573 else if (state == 3)
3575 /* It is possible that we already have this collation sequence.
3576 In this case we move the entry. */
3577 void *sym;
3578 void *ptr;
3580 /* If the symbol after which we have to insert was not found
3581 ignore all entries. */
3582 if (collate->cursor == NULL)
3584 lr_ignore_rest (ldfile, 0);
3585 break;
3588 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3590 seqp = (struct element_t *) ptr;
3591 goto move_entry;
3594 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3595 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3596 goto move_entry;
3598 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3599 && (seqp = (struct element_t *) ptr,
3600 seqp->last != NULL || seqp->next != NULL
3601 || (collate->start != NULL && seqp == collate->start)))
3603 move_entry:
3604 /* Remove the entry from the old position. */
3605 if (seqp->last == NULL)
3606 collate->start = seqp->next;
3607 else
3608 seqp->last->next = seqp->next;
3609 if (seqp->next != NULL)
3610 seqp->next->last = seqp->last;
3612 /* We also have to check whether this entry is the
3613 first or last of a section. */
3614 if (seqp->section->first == seqp)
3616 if (seqp->section->first == seqp->section->last)
3617 /* This section has no content anymore. */
3618 seqp->section->first = seqp->section->last = NULL;
3619 else
3620 seqp->section->first = seqp->next;
3622 else if (seqp->section->last == seqp)
3623 seqp->section->last = seqp->last;
3625 /* Now insert it in the new place. */
3626 insert_weights (ldfile, seqp, charmap, repertoire, result,
3627 tok_none);
3628 break;
3631 /* Otherwise we just add a new entry. */
3633 else if (state == 5)
3635 /* We are reordering sections. Find the named section. */
3636 struct section_list *runp = collate->sections;
3637 struct section_list *prevp = NULL;
3639 while (runp != NULL)
3641 if (runp->name != NULL
3642 && strlen (runp->name) == symlen
3643 && memcmp (runp->name, symstr, symlen) == 0)
3644 break;
3646 prevp = runp;
3647 runp = runp->next;
3650 if (runp == NULL)
3652 lr_error (ldfile, _("%s: section `%.*s' not known"),
3653 "LC_COLLATE", (int) symlen, symstr);
3654 lr_ignore_rest (ldfile, 0);
3656 else
3658 if (runp != collate->current_section)
3660 /* Remove the named section from the old place and
3661 insert it in the new one. */
3662 prevp->next = runp->next;
3664 runp->next = collate->current_section->next;
3665 collate->current_section->next = runp;
3666 collate->current_section = runp;
3669 /* Process the rest of the line which might change
3670 the collation rules. */
3671 arg = lr_token (ldfile, charmap, result, repertoire,
3672 verbose);
3673 if (arg->tok != tok_eof && arg->tok != tok_eol)
3674 read_directions (ldfile, arg, charmap, repertoire,
3675 result);
3677 break;
3679 else if (was_ellipsis != tok_none)
3681 /* Using the information in the `ellipsis_weight'
3682 element and this and the last value we have to handle
3683 the ellipsis now. */
3684 assert (state == 1);
3686 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3687 repertoire, result);
3689 /* Remember that we processed the ellipsis. */
3690 was_ellipsis = tok_none;
3692 /* And don't add the value a second time. */
3693 break;
3696 /* Now insert in the new place. */
3697 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3698 break;
3700 case tok_undefined:
3701 /* Ignore the rest of the line if we don't need the input of
3702 this line. */
3703 if (ignore_content)
3705 lr_ignore_rest (ldfile, 0);
3706 break;
3709 if (state != 1)
3710 goto err_label;
3712 if (was_ellipsis != tok_none)
3714 lr_error (ldfile,
3715 _("%s: cannot have `%s' as end of ellipsis range"),
3716 "LC_COLLATE", "UNDEFINED");
3718 unlink_element (collate);
3719 was_ellipsis = tok_none;
3722 /* See whether UNDEFINED already appeared somewhere. */
3723 if (collate->undefined.next != NULL
3724 || &collate->undefined == collate->cursor)
3726 lr_error (ldfile,
3727 _("%s: order for `%.*s' already defined at %s:%Zu"),
3728 "LC_COLLATE", 9, "UNDEFINED",
3729 collate->undefined.file,
3730 collate->undefined.line);
3731 lr_ignore_rest (ldfile, 0);
3733 else
3734 /* Parse the weights. */
3735 insert_weights (ldfile, &collate->undefined, charmap,
3736 repertoire, result, tok_none);
3737 break;
3739 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3740 case tok_ellipsis3: /* absolute ellipsis */
3741 case tok_ellipsis4: /* symbolic decimal ellipsis */
3742 /* This is the symbolic (decimal or hexadecimal) or absolute
3743 ellipsis. */
3744 if (was_ellipsis != tok_none)
3745 goto err_label;
3747 if (state != 0 && state != 1 && state != 3)
3748 goto err_label;
3750 was_ellipsis = nowtok;
3752 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3753 repertoire, result, nowtok);
3754 break;
3756 case tok_end:
3757 seen_end:
3758 /* Next we assume `LC_COLLATE'. */
3759 if (!ignore_content)
3761 if (state == 0 && copy_locale == NULL)
3762 /* We must either see a copy statement or have
3763 ordering values. */
3764 lr_error (ldfile,
3765 _("%s: empty category description not allowed"),
3766 "LC_COLLATE");
3767 else if (state == 1)
3769 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3770 "LC_COLLATE");
3772 /* Handle ellipsis at end of list. */
3773 if (was_ellipsis != tok_none)
3775 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3776 repertoire, result);
3777 was_ellipsis = tok_none;
3780 else if (state == 3)
3781 WITH_CUR_LOCALE (error (0, 0, _("\
3782 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3783 else if (state == 5)
3784 WITH_CUR_LOCALE (error (0, 0, _("\
3785 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3787 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3788 if (arg->tok == tok_eof)
3789 break;
3790 if (arg->tok == tok_eol)
3791 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3792 else if (arg->tok != tok_lc_collate)
3793 lr_error (ldfile, _("\
3794 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3795 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3796 return;
3798 case tok_define:
3799 if (ignore_content)
3801 lr_ignore_rest (ldfile, 0);
3802 break;
3805 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3806 if (arg->tok != tok_ident)
3807 goto err_label;
3809 /* Simply add the new symbol. */
3810 struct name_list *newsym = xmalloc (sizeof (*newsym)
3811 + arg->val.str.lenmb + 1);
3812 memcpy (newsym->str, arg->val.str.startmb, arg->val.str.lenmb);
3813 newsym->str[arg->val.str.lenmb] = '\0';
3814 newsym->next = defined;
3815 defined = newsym;
3817 lr_ignore_rest (ldfile, 1);
3818 break;
3820 case tok_undef:
3821 if (ignore_content)
3823 lr_ignore_rest (ldfile, 0);
3824 break;
3827 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3828 if (arg->tok != tok_ident)
3829 goto err_label;
3831 /* Remove _all_ occurrences of the symbol from the list. */
3832 struct name_list *prevdef = NULL;
3833 struct name_list *curdef = defined;
3834 while (curdef != NULL)
3835 if (strncmp (arg->val.str.startmb, curdef->str,
3836 arg->val.str.lenmb) == 0
3837 && curdef->str[arg->val.str.lenmb] == '\0')
3839 if (prevdef == NULL)
3840 defined = curdef->next;
3841 else
3842 prevdef->next = curdef->next;
3844 struct name_list *olddef = curdef;
3845 curdef = curdef->next;
3847 free (olddef);
3849 else
3851 prevdef = curdef;
3852 curdef = curdef->next;
3855 lr_ignore_rest (ldfile, 1);
3856 break;
3858 case tok_ifdef:
3859 case tok_ifndef:
3860 if (ignore_content)
3862 lr_ignore_rest (ldfile, 0);
3863 break;
3866 found_ifdef:
3867 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3868 if (arg->tok != tok_ident)
3869 goto err_label;
3870 lr_ignore_rest (ldfile, 1);
3872 if (collate->else_action == else_none)
3874 curdef = defined;
3875 while (curdef != NULL)
3876 if (strncmp (arg->val.str.startmb, curdef->str,
3877 arg->val.str.lenmb) == 0
3878 && curdef->str[arg->val.str.lenmb] == '\0')
3879 break;
3880 else
3881 curdef = curdef->next;
3883 if ((nowtok == tok_ifdef && curdef != NULL)
3884 || (nowtok == tok_ifndef && curdef == NULL))
3886 /* We have to use the if-branch. */
3887 collate->else_action = else_ignore;
3889 else
3891 /* We have to use the else-branch, if there is one. */
3892 nowtok = skip_to (ldfile, collate, charmap, 0);
3893 if (nowtok == tok_else)
3894 collate->else_action = else_seen;
3895 else if (nowtok == tok_elifdef)
3897 nowtok = tok_ifdef;
3898 goto found_ifdef;
3900 else if (nowtok == tok_elifndef)
3902 nowtok = tok_ifndef;
3903 goto found_ifdef;
3905 else if (nowtok == tok_eof)
3906 goto seen_eof;
3907 else if (nowtok == tok_end)
3908 goto seen_end;
3911 else
3913 /* XXX Should it really become necessary to support nested
3914 preprocessor handling we will push the state here. */
3915 lr_error (ldfile, _("%s: nested conditionals not supported"),
3916 "LC_COLLATE");
3917 nowtok = skip_to (ldfile, collate, charmap, 1);
3918 if (nowtok == tok_eof)
3919 goto seen_eof;
3920 else if (nowtok == tok_end)
3921 goto seen_end;
3923 break;
3925 case tok_elifdef:
3926 case tok_elifndef:
3927 case tok_else:
3928 if (ignore_content)
3930 lr_ignore_rest (ldfile, 0);
3931 break;
3934 lr_ignore_rest (ldfile, 1);
3936 if (collate->else_action == else_ignore)
3938 /* Ignore everything until the endif. */
3939 nowtok = skip_to (ldfile, collate, charmap, 1);
3940 if (nowtok == tok_eof)
3941 goto seen_eof;
3942 else if (nowtok == tok_end)
3943 goto seen_end;
3945 else
3947 assert (collate->else_action == else_none);
3948 lr_error (ldfile, _("\
3949 %s: '%s' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE",
3950 nowtok == tok_else ? "else"
3951 : nowtok == tok_elifdef ? "elifdef" : "elifndef");
3953 break;
3955 case tok_endif:
3956 if (ignore_content)
3958 lr_ignore_rest (ldfile, 0);
3959 break;
3962 lr_ignore_rest (ldfile, 1);
3964 if (collate->else_action != else_ignore
3965 && collate->else_action != else_seen)
3966 lr_error (ldfile, _("\
3967 %s: 'endif' without matching 'ifdef' or 'ifndef'"), "LC_COLLATE");
3969 /* XXX If we support nested preprocessor directives we pop
3970 the state here. */
3971 collate->else_action = else_none;
3972 break;
3974 default:
3975 err_label:
3976 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3979 /* Prepare for the next round. */
3980 now = lr_token (ldfile, charmap, result, NULL, verbose);
3981 nowtok = now->tok;
3984 seen_eof:
3985 /* When we come here we reached the end of the file. */
3986 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");