/* Update.
   [glibc.git] / locale / programs / ld-collate.c
   blob 191194799de17f0cbbc40c4b082c10c5e3fa5143  */
/* Copyright (C) 1995-1999, 2000, 2001, 2002 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <ctype.h>
#include <errno.h>
#include <error.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <sys/param.h>

#include "localedef.h"
#include "charmap.h"
#include "localeinfo.h"
#include "linereader.h"
#include "locfile.h"
#include "elem-hash.h"

/* Uncomment the following line in the production version.  */
/* #define NDEBUG 1 */
#include <assert.h>
/* Allocation hooks picked up by the obstack macros: chunks are obtained
   with malloc and released with free.  */
#define obstack_chunk_alloc malloc
#define obstack_chunk_free free
/* Append the 32-bit value DATA to the object currently being grown in
   OBSTACK.  When `int' is exactly 32 bits wide the dedicated
   obstack_int_grow macro is used; otherwise the bytes of DATA are
   appended with the generic obstack_grow.  */
static inline void
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) == sizeof (int))
    obstack_int_grow (obstack, data);
  else
    obstack_grow (obstack, &data, sizeof (int32_t));
}
/* Variant of obstack_int32_grow that uses obstack_int_grow_fast when
   `int' is exactly 32 bits wide.  The fast obstack macros do not check
   for available room, so the caller must have reserved enough space in
   the current chunk beforehand.  If `int' has another width the regular
   (room-checking) obstack_grow is used instead.  */
static inline void
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) == sizeof (int))
    obstack_int_grow_fast (obstack, data);
  else
    obstack_grow (obstack, &data, sizeof (int32_t));
}
/* Forward declaration.  */
struct element_t;

/* Data type for list of strings.  Each entry describes one section of
   the collation definition.  */
struct section_list
{
  /* Successor in the known_sections list.  */
  struct section_list *def_next;
  /* Successor in the sections list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* These are the rules for this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output file.  */
  int ruleidx;
};
struct element_t;

/* List of the elements making up the weight of one level.  */
struct element_list_t
{
  /* Number of elements.  */
  int cnt;

  /* Array of CNT element pointers.  */
  struct element_t **w;
};
/* Data type for collating element.  */
struct element_t
{
  /* Symbolic name, or NULL for anonymous elements.  */
  const char *name;

  /* Multibyte and wide character representation together with their
     lengths; the pointers are NULL if no representation is known.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* The following is a bit mask which bits are set if this element is
     used in the appropriate level.  Interesting for the singlebyte
     weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* One weight list per sorting rule.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Order of the character in the sequence.  This information will
     be used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Next element in wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
/* Special element values.  These are sentinel pointers which must only
   be compared against, never dereferenced.  */
#define ELEMENT_ELLIPSIS2	((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3	((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4	((struct element_t *) 3)
/* Data type for collating symbol.  */
struct symbol_t
{
  /* Name of the symbol.  */
  const char *name;

  /* Point to place in the order list.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
162 /* Sparse table of struct element_t *. */
163 #define TABLE wchead_table
164 #define ELEMENT struct element_t *
165 #define DEFAULT NULL
166 #define ITERATE
167 #define NO_FINALIZE
168 #include "3level.h"
170 /* Sparse table of int32_t. */
171 #define TABLE collidx_table
172 #define ELEMENT int32_t
173 #define DEFAULT 0
174 #include "3level.h"
176 /* Sparse table of uint32_t. */
177 #define TABLE collseq_table
178 #define ELEMENT uint32_t
179 #define DEFAULT ~((uint32_t) 0)
180 #include "3level.h"
183 /* The real definition of the struct for the LC_COLLATE locale. */
184 struct locale_collate_t
186 int col_weight_max;
187 int cur_weight_max;
189 /* List of known scripts. */
190 struct section_list *known_sections;
191 /* List of used sections. */
192 struct section_list *sections;
193 /* Current section using definition. */
194 struct section_list *current_section;
195 /* There always can be an unnamed section. */
196 struct section_list unnamed_section;
197 /* To make handling of errors easier we have another section. */
198 struct section_list error_section;
199 /* Sometimes we are defining the values for collating symbols before
200 the first actual section. */
201 struct section_list symbol_section;
203 /* Start of the order list. */
204 struct element_t *start;
206 /* The undefined element. */
207 struct element_t undefined;
209 /* This is the cursor for `reorder_after' insertions. */
210 struct element_t *cursor;
212 /* This value is used when handling ellipsis. */
213 struct element_t ellipsis_weight;
215 /* Known collating elements. */
216 hash_table elem_table;
218 /* Known collating symbols. */
219 hash_table sym_table;
221 /* Known collation sequences. */
222 hash_table seq_table;
224 struct obstack mempool;
226 /* The LC_COLLATE category is a bit special as it is sometimes possible
227 that the definitions from more than one input file contains information.
228 Therefore we keep all relevant input in a list. */
229 struct locale_collate_t *next;
231 /* Arrays with heads of the list for each of the leading bytes in
232 the multibyte sequences. */
233 struct element_t *mbheads[256];
235 /* Arrays with heads of the list for each of the leading bytes in
236 the multibyte sequences. */
237 struct wchead_table wcheads;
239 /* The arrays with the collation sequence order. */
240 unsigned char mbseqorder[256];
241 struct collseq_table wcseqorder;
/* We have a few global variables which are used for reading all
   LC_COLLATE category descriptions in all files.  */

/* Number of sorting rules (levels).  Zero until the first direction
   specification has been read by read_directions.  */
static uint32_t nrules;
/* We need UTF-8 encoding of numbers.

   Store the UTF-8 style encoding of VAL at BUF and return the number
   of bytes written (1 to 6).  No terminating NUL byte is written, so
   BUF must provide room for up to six bytes.  Values below 0x80,
   which includes negative values, are stored as a single byte.  */
static inline int
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      /* Find the shortest encoding: a sequence of STEP bytes can hold
         5 * STEP + 1 payload bits.  */
      for (step = 2; step < 6; ++step)
        if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
          break;
      retval = step;

      /* Lead byte with the STEP most significant bits set.  The shift
         is done on an unsigned value; shifting the negative value
         ~0xff right would be implementation-defined.  */
      *buf = (char) (unsigned char) (0xff00u >> step);
      --step;
      /* Fill the continuation bytes from the end, six bits each.  */
      do
        {
          buf[step] = (char) (0x80 | (val & 0x3f));
          val >>= 6;
        }
      while (--step > 0);
      /* The remaining high bits go into the lead byte.  */
      *buf |= val;
    }

  return retval;
}
285 static struct section_list *
286 make_seclist_elem (struct locale_collate_t *collate, const char *string,
287 struct section_list *next)
289 struct section_list *newp;
291 newp = (struct section_list *) obstack_alloc (&collate->mempool,
292 sizeof (*newp));
293 newp->next = next;
294 newp->name = string;
295 newp->first = NULL;
296 newp->last = NULL;
298 return newp;
302 static struct element_t *
303 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
304 const uint32_t *wcs, const char *name, size_t namelen,
305 int is_character)
307 struct element_t *newp;
309 newp = (struct element_t *) obstack_alloc (&collate->mempool,
310 sizeof (*newp));
311 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
312 name, namelen);
313 if (mbs != NULL)
315 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
316 newp->nmbs = mbslen;
318 else
320 newp->mbs = NULL;
321 newp->nmbs = 0;
323 if (wcs != NULL)
325 size_t nwcs = wcslen ((wchar_t *) wcs);
326 uint32_t zero = 0;
327 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
328 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
329 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
330 newp->nwcs = nwcs;
332 else
334 newp->wcs = NULL;
335 newp->nwcs = 0;
337 newp->mborder = NULL;
338 newp->wcorder = 0;
339 newp->used_in_level = 0;
340 newp->is_character = is_character;
342 /* Will be assigned later. XXX */
343 newp->mbseqorder = 0;
344 newp->wcseqorder = 0;
346 /* Will be allocated later. */
347 newp->weights = NULL;
349 newp->file = NULL;
350 newp->line = 0;
352 newp->section = collate->current_section;
354 newp->last = NULL;
355 newp->next = NULL;
357 newp->mbnext = NULL;
358 newp->mblast = NULL;
360 newp->wcnext = NULL;
361 newp->wclast = NULL;
363 return newp;
367 static struct symbol_t *
368 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
370 struct symbol_t *newp;
372 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
374 newp->name = obstack_copy0 (&collate->mempool, name, len);
375 newp->order = NULL;
377 newp->file = NULL;
378 newp->line = 0;
380 return newp;
384 /* Test whether this name is already defined somewhere. */
385 static int
386 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
387 const struct charmap_t *charmap,
388 struct repertoire_t *repertoire, const char *symbol,
389 size_t symbol_len)
391 void *ignore = NULL;
393 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
395 lr_error (ldfile, _("`%.*s' already defined in charmap"),
396 (int) symbol_len, symbol);
397 return 1;
400 if (repertoire != NULL
401 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
402 == 0))
404 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
405 (int) symbol_len, symbol);
406 return 1;
409 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
411 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
412 (int) symbol_len, symbol);
413 return 1;
416 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
418 lr_error (ldfile, _("`%.*s' already defined as collating element"),
419 (int) symbol_len, symbol);
420 return 1;
423 return 0;
427 /* Read the direction specification. */
428 static void
429 read_directions (struct linereader *ldfile, struct token *arg,
430 const struct charmap_t *charmap,
431 struct repertoire_t *repertoire, struct localedef_t *result)
433 int cnt = 0;
434 int max = nrules ?: 10;
435 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
436 int warned = 0;
437 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
439 while (1)
441 int valid = 0;
443 if (arg->tok == tok_forward)
445 if (rules[cnt] & sort_backward)
447 if (! warned)
449 lr_error (ldfile, _("\
450 %s: `forward' and `backward' are mutually excluding each other"),
451 "LC_COLLATE");
452 warned = 1;
455 else if (rules[cnt] & sort_forward)
457 if (! warned)
459 lr_error (ldfile, _("\
460 %s: `%s' mentioned more than once in definition of weight %d"),
461 "LC_COLLATE", "forward", cnt + 1);
464 else
465 rules[cnt] |= sort_forward;
467 valid = 1;
469 else if (arg->tok == tok_backward)
471 if (rules[cnt] & sort_forward)
473 if (! warned)
475 lr_error (ldfile, _("\
476 %s: `forward' and `backward' are mutually excluding each other"),
477 "LC_COLLATE");
478 warned = 1;
481 else if (rules[cnt] & sort_backward)
483 if (! warned)
485 lr_error (ldfile, _("\
486 %s: `%s' mentioned more than once in definition of weight %d"),
487 "LC_COLLATE", "backward", cnt + 1);
490 else
491 rules[cnt] |= sort_backward;
493 valid = 1;
495 else if (arg->tok == tok_position)
497 if (rules[cnt] & sort_position)
499 if (! warned)
501 lr_error (ldfile, _("\
502 %s: `%s' mentioned more than once in definition of weight %d"),
503 "LC_COLLATE", "position", cnt + 1);
506 else
507 rules[cnt] |= sort_position;
509 valid = 1;
512 if (valid)
513 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
515 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
516 || arg->tok == tok_semicolon)
518 if (! valid && ! warned)
520 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
521 warned = 1;
524 /* See whether we have to increment the counter. */
525 if (arg->tok != tok_comma && rules[cnt] != 0)
527 /* Add the default `forward' if we have seen only `position'. */
528 if (rules[cnt] == sort_position)
529 rules[cnt] = sort_position | sort_forward;
531 ++cnt;
534 if (arg->tok == tok_eof || arg->tok == tok_eol)
535 /* End of line or file, so we exit the loop. */
536 break;
538 if (nrules == 0)
540 /* See whether we have enough room in the array. */
541 if (cnt == max)
543 max += 10;
544 rules = (enum coll_sort_rule *) xrealloc (rules,
546 * sizeof (*rules));
547 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
550 else
552 if (cnt == nrules)
554 /* There must not be any more rule. */
555 if (! warned)
557 lr_error (ldfile, _("\
558 %s: too many rules; first entry only had %d"),
559 "LC_COLLATE", nrules);
560 warned = 1;
563 lr_ignore_rest (ldfile, 0);
564 break;
568 else
570 if (! warned)
572 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
573 warned = 1;
577 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
580 if (nrules == 0)
582 /* Now we know how many rules we have. */
583 nrules = cnt;
584 rules = (enum coll_sort_rule *) xrealloc (rules,
585 nrules * sizeof (*rules));
587 else
589 if (cnt < nrules)
591 /* Not enough rules in this specification. */
592 if (! warned)
593 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
596 rules[cnt] = sort_forward;
597 while (++cnt < nrules);
601 collate->current_section->rules = rules;
605 static struct element_t *
606 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
607 const char *str, size_t len)
609 struct element_t *result = NULL;
611 /* Search for the entries among the collation sequences already define. */
612 if (find_entry (&collate->seq_table, str, len, (void **) &result) != 0)
614 /* Nope, not define yet. So we see whether it is a
615 collation symbol. */
616 void *ptr;
618 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
620 /* It's a collation symbol. */
621 struct symbol_t *sym = (struct symbol_t *) ptr;
622 result = sym->order;
624 if (result == NULL)
625 result = sym->order = new_element (collate, NULL, 0, NULL,
626 NULL, 0, 0);
628 else if (find_entry (&collate->elem_table, str, len,
629 (void **) &result) != 0)
631 /* It's also no collation element. So it is a character
632 element defined later. */
633 result = new_element (collate, NULL, 0, NULL, str, len, 1);
634 /* Insert it into the sequence table. */
635 insert_entry (&collate->seq_table, str, len, result);
639 return result;
643 static void
644 unlink_element (struct locale_collate_t *collate)
646 if (collate->cursor == collate->start)
648 assert (collate->cursor->next == NULL);
649 assert (collate->cursor->last == NULL);
650 collate->cursor = NULL;
652 else
654 if (collate->cursor->next != NULL)
655 collate->cursor->next->last = collate->cursor->last;
656 if (collate->cursor->last != NULL)
657 collate->cursor->last->next = collate->cursor->next;
658 collate->cursor = collate->cursor->last;
663 static void
664 insert_weights (struct linereader *ldfile, struct element_t *elem,
665 const struct charmap_t *charmap,
666 struct repertoire_t *repertoire, struct localedef_t *result,
667 enum token_t ellipsis)
669 int weight_cnt;
670 struct token *arg;
671 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
673 /* Initialize all the fields. */
674 elem->file = ldfile->fname;
675 elem->line = ldfile->lineno;
677 elem->last = collate->cursor;
678 elem->next = collate->cursor ? collate->cursor->next : NULL;
679 if (collate->cursor != NULL && collate->cursor->next != NULL)
680 collate->cursor->next->last = elem;
681 if (collate->cursor != NULL)
682 collate->cursor->next = elem;
683 if (collate->start == NULL)
685 assert (collate->cursor == NULL);
686 collate->start = elem;
689 elem->section = collate->current_section;
691 if (collate->current_section->first == NULL)
692 collate->current_section->first = elem;
693 if (collate->current_section->last == collate->cursor)
694 collate->current_section->last = elem;
696 collate->cursor = elem;
698 elem->weights = (struct element_list_t *)
699 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
700 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
702 weight_cnt = 0;
704 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
707 if (arg->tok == tok_eof || arg->tok == tok_eol)
708 break;
710 if (arg->tok == tok_ignore)
712 /* The weight for this level has to be ignored. We use the
713 null pointer to indicate this. */
714 elem->weights[weight_cnt].w = (struct element_t **)
715 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
716 elem->weights[weight_cnt].w[0] = NULL;
717 elem->weights[weight_cnt].cnt = 1;
719 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
721 char ucs4str[10];
722 struct element_t *val;
723 char *symstr;
724 size_t symlen;
726 if (arg->tok == tok_bsymbol)
728 symstr = arg->val.str.startmb;
729 symlen = arg->val.str.lenmb;
731 else
733 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
734 symstr = ucs4str;
735 symlen = 9;
738 val = find_element (ldfile, collate, symstr, symlen);
739 if (val == NULL)
740 break;
742 elem->weights[weight_cnt].w = (struct element_t **)
743 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
744 elem->weights[weight_cnt].w[0] = val;
745 elem->weights[weight_cnt].cnt = 1;
747 else if (arg->tok == tok_string)
749 /* Split the string up in the individual characters and put
750 the element definitions in the list. */
751 const char *cp = arg->val.str.startmb;
752 int cnt = 0;
753 struct element_t *charelem;
754 struct element_t **weights = NULL;
755 int max = 0;
757 if (*cp == '\0')
759 lr_error (ldfile, _("%s: empty weight string not allowed"),
760 "LC_COLLATE");
761 lr_ignore_rest (ldfile, 0);
762 break;
767 if (*cp == '<')
769 /* Ahh, it's a bsymbol or an UCS4 value. If it's
770 the latter we have to unify the name. */
771 const char *startp = ++cp;
772 size_t len;
774 while (*cp != '>')
776 if (*cp == ldfile->escape_char)
777 ++cp;
778 if (*cp == '\0')
779 /* It's a syntax error. */
780 goto syntax;
782 ++cp;
785 if (cp - startp == 5 && startp[0] == 'U'
786 && isxdigit (startp[1]) && isxdigit (startp[2])
787 && isxdigit (startp[3]) && isxdigit (startp[4]))
789 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
790 char *newstr;
792 newstr = (char *) xmalloc (10);
793 snprintf (newstr, 10, "U%08X", ucs4);
794 startp = newstr;
796 len = 9;
798 else
799 len = cp - startp;
801 charelem = find_element (ldfile, collate, startp, len);
802 ++cp;
804 else
806 /* People really shouldn't use characters directly in
807 the string. Especially since it's not really clear
808 what this means. We interpret all characters in the
809 string as if that would be bsymbols. Otherwise we
810 would have to match back to bsymbols somehow and this
811 is normally not what people normally expect. */
812 charelem = find_element (ldfile, collate, cp++, 1);
815 if (charelem == NULL)
817 /* We ignore the rest of the line. */
818 lr_ignore_rest (ldfile, 0);
819 break;
822 /* Add the pointer. */
823 if (cnt >= max)
825 struct element_t **newp;
826 max += 10;
827 newp = (struct element_t **)
828 alloca (max * sizeof (struct element_t *));
829 memcpy (newp, weights, cnt * sizeof (struct element_t *));
830 weights = newp;
832 weights[cnt++] = charelem;
834 while (*cp != '\0');
836 /* Now store the information. */
837 elem->weights[weight_cnt].w = (struct element_t **)
838 obstack_alloc (&collate->mempool,
839 cnt * sizeof (struct element_t *));
840 memcpy (elem->weights[weight_cnt].w, weights,
841 cnt * sizeof (struct element_t *));
842 elem->weights[weight_cnt].cnt = cnt;
844 /* We don't need the string anymore. */
845 free (arg->val.str.startmb);
847 else if (ellipsis != tok_none
848 && (arg->tok == tok_ellipsis2
849 || arg->tok == tok_ellipsis3
850 || arg->tok == tok_ellipsis4))
852 /* It must be the same ellipsis as used in the initial column. */
853 if (arg->tok != ellipsis)
854 lr_error (ldfile, _("\
855 %s: weights must use the same ellipsis symbol as the name"),
856 "LC_COLLATE");
858 /* The weight for this level will depend on the element
859 iterating over the range. Put a placeholder. */
860 elem->weights[weight_cnt].w = (struct element_t **)
861 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
862 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
863 elem->weights[weight_cnt].cnt = 1;
865 else
867 syntax:
868 /* It's a syntax error. */
869 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
870 lr_ignore_rest (ldfile, 0);
871 break;
874 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
875 /* This better should be the end of the line or a semicolon. */
876 if (arg->tok == tok_semicolon)
877 /* OK, ignore this and read the next token. */
878 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
879 else if (arg->tok != tok_eof && arg->tok != tok_eol)
881 /* It's a syntax error. */
882 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
883 lr_ignore_rest (ldfile, 0);
884 break;
887 while (++weight_cnt < nrules);
889 if (weight_cnt < nrules)
891 /* This means the rest of the line uses the current element as
892 the weight. */
895 elem->weights[weight_cnt].w = (struct element_t **)
896 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
897 if (ellipsis == tok_none)
898 elem->weights[weight_cnt].w[0] = elem;
899 else
900 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
901 elem->weights[weight_cnt].cnt = 1;
903 while (++weight_cnt < nrules);
905 else
907 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
909 /* Too many rule values. */
910 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
911 lr_ignore_rest (ldfile, 0);
913 else
914 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
919 static int
920 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
921 const struct charmap_t *charmap, struct repertoire_t *repertoire,
922 struct localedef_t *result)
924 /* First find out what kind of symbol this is. */
925 struct charseq *seq;
926 uint32_t wc;
927 struct element_t *elem = NULL;
928 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
930 /* Try to find the character in the charmap. */
931 seq = charmap_find_value (charmap, symstr, symlen);
933 /* Determine the wide character. */
934 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
936 wc = repertoire_find_value (repertoire, symstr, symlen);
937 if (seq != NULL)
938 seq->ucs4 = wc;
940 else
941 wc = seq->ucs4;
943 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
945 /* It's no character, so look through the collation elements and
946 symbol list. */
947 if (find_entry (&collate->elem_table, symstr, symlen,
948 (void **) &elem) != 0)
950 void *result;
951 struct symbol_t *sym = NULL;
953 /* It's also collation element. Therefore it's either a
954 collating symbol or it's a character which is not
955 supported by the character set. In the later case we
956 simply create a dummy entry. */
957 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
959 /* It's a collation symbol. */
960 sym = (struct symbol_t *) result;
962 elem = sym->order;
965 if (elem == NULL)
967 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
969 if (sym != NULL)
970 sym->order = elem;
971 else
972 /* Enter a fake element in the sequence table. This
973 won't cause anything in the output since there is
974 no multibyte or wide character associated with
975 it. */
976 insert_entry (&collate->seq_table, symstr, symlen, elem);
980 else
982 /* Otherwise the symbols stands for a character. */
983 if (find_entry (&collate->seq_table, symstr, symlen,
984 (void **) &elem) != 0)
986 uint32_t wcs[2] = { wc, 0 };
988 /* We have to allocate an entry. */
989 elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
990 seq != NULL ? seq->nbytes : 0,
991 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
992 symstr, symlen, 1);
994 /* And add it to the table. */
995 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
996 /* This cannot happen. */
997 assert (! "Internal error");
999 else
1001 /* Maybe the character was used before the definition. In this case
1002 we have to insert the byte sequences now. */
1003 if (elem->mbs == NULL && seq != NULL)
1005 elem->mbs = obstack_copy0 (&collate->mempool,
1006 seq->bytes, seq->nbytes);
1007 elem->nmbs = seq->nbytes;
1010 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1012 uint32_t wcs[2] = { wc, 0 };
1014 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1015 elem->nwcs = 1;
1020 /* Test whether this element is not already in the list. */
1021 if (elem->next != NULL || elem == collate->cursor)
1023 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1024 (int) symlen, symstr, elem->file, elem->line);
1025 lr_ignore_rest (ldfile, 0);
1026 return 1;
1029 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1031 return 0;
1035 static void
1036 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1037 enum token_t ellipsis, const struct charmap_t *charmap,
1038 struct repertoire_t *repertoire,
1039 struct localedef_t *result)
1041 struct element_t *startp;
1042 struct element_t *endp;
1043 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1045 /* Unlink the entry added for the ellipsis. */
1046 unlink_element (collate);
1047 startp = collate->cursor;
1049 /* Process and add the end-entry. */
1050 if (symstr != NULL
1051 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1052 /* Something went wrong with inserting the to-value. This means
1053 we cannot process the ellipsis. */
1054 return;
1056 /* Reset the cursor. */
1057 collate->cursor = startp;
1059 /* Now we have to handle many different situations:
1060 - we have to distinguish between the three different ellipsis forms
1061 - the is the ellipsis at the beginning, in the middle, or at the end.
1063 endp = collate->cursor->next;
1064 assert (symstr == NULL || endp != NULL);
1066 /* XXX The following is probably very wrong since also collating symbols
1067 can appear in ranges. But do we want/can refine the test for that? */
1068 #if 0
1069 /* Both, the start and the end symbol, must stand for characters. */
1070 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1071 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1073 lr_error (ldfile, _("\
1074 %s: the start and the end symbol of a range must stand for characters"),
1075 "LC_COLLATE");
1076 return;
1078 #endif
1080 if (ellipsis == tok_ellipsis3)
1082 /* One requirement we make here: the length of the byte
1083 sequences for the first and end character must be the same.
1084 This is mainly to prevent unwanted effects and this is often
1085 not what is wanted. */
1086 size_t len = (startp->mbs != NULL ? startp->nmbs
1087 : (endp->mbs != NULL ? endp->nmbs : 0));
1088 char mbcnt[len + 1];
1089 char mbend[len + 1];
1091 /* Well, this should be caught somewhere else already. Just to
1092 make sure. */
1093 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1094 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1096 if (startp != NULL && endp != NULL
1097 && startp->mbs != NULL && endp->mbs != NULL
1098 && startp->nmbs != endp->nmbs)
1100 lr_error (ldfile, _("\
1101 %s: byte sequences of first and last character must have the same length"),
1102 "LC_COLLATE");
1103 return;
1106 /* Determine whether we have to generate multibyte sequences. */
1107 if ((startp == NULL || startp->mbs != NULL)
1108 && (endp == NULL || endp->mbs != NULL))
1110 int cnt;
1111 int ret;
1113 /* Prepare the beginning byte sequence. This is either from the
1114 beginning byte sequence or it is all nulls if it was an
1115 initial ellipsis. */
1116 if (startp == NULL || startp->mbs == NULL)
1117 memset (mbcnt, '\0', len);
1118 else
1120 memcpy (mbcnt, startp->mbs, len);
1122 /* And increment it so that the value is the first one we will
1123 try to insert. */
1124 for (cnt = len - 1; cnt >= 0; --cnt)
1125 if (++mbcnt[cnt] != '\0')
1126 break;
1128 mbcnt[len] = '\0';
1130 /* And the end sequence. */
1131 if (endp == NULL || endp->mbs == NULL)
1132 memset (mbend, '\0', len);
1133 else
1134 memcpy (mbend, endp->mbs, len);
1135 mbend[len] = '\0';
1137 /* Test whether we have a correct range. */
1138 ret = memcmp (mbcnt, mbend, len);
1139 if (ret >= 0)
1141 if (ret > 0)
1142 lr_error (ldfile, _("%s: byte sequence of first character of \
1143 sequence is not lower than that of the last character"), "LC_COLLATE");
1144 return;
1147 /* Generate the byte sequences data. */
1148 while (1)
1150 struct charseq *seq;
1152 /* Quite a bit of work ahead. We have to find the character
1153 definition for the byte sequence and then determine the
1154 wide character belonging to it. */
1155 seq = charmap_find_symbol (charmap, mbcnt, len);
1156 if (seq != NULL)
1158 struct element_t *elem;
1159 size_t namelen;
1161 /* I don't this this can ever happen. */
1162 assert (seq->name != NULL);
1163 namelen = strlen (seq->name);
1165 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1166 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1167 namelen);
1169 /* Now we are ready to insert the new value in the
1170 sequence. Find out whether the element is
1171 already known. */
1172 if (find_entry (&collate->seq_table, seq->name, namelen,
1173 (void **) &elem) != 0)
1175 uint32_t wcs[2] = { seq->ucs4, 0 };
1177 /* We have to allocate an entry. */
1178 elem = new_element (collate, mbcnt, len,
1179 seq->ucs4 == ILLEGAL_CHAR_VALUE
1180 ? NULL : wcs, seq->name,
1181 namelen, 1);
1183 /* And add it to the table. */
1184 if (insert_entry (&collate->seq_table, seq->name,
1185 namelen, elem) != 0)
1186 /* This cannot happen. */
1187 assert (! "Internal error");
1190 /* Test whether this element is not already in the list. */
1191 if (elem->next != NULL || (collate->cursor != NULL
1192 && elem->next == collate->cursor))
1194 lr_error (ldfile, _("\
1195 order for `%.*s' already defined at %s:%Zu"),
1196 (int) namelen, seq->name,
1197 elem->file, elem->line);
1198 goto increment;
1201 /* Enqueue the new element. */
1202 elem->last = collate->cursor;
1203 if (collate->cursor == NULL)
1204 elem->next = NULL;
1205 else
1207 elem->next = collate->cursor->next;
1208 elem->last->next = elem;
1209 if (elem->next != NULL)
1210 elem->next->last = elem;
1212 if (collate->start == NULL)
1214 assert (collate->cursor == NULL);
1215 collate->start = elem;
1217 collate->cursor = elem;
1219 /* Add the weight value. We take them from the
1220 `ellipsis_weights' member of `collate'. */
1221 elem->weights = (struct element_list_t *)
1222 obstack_alloc (&collate->mempool,
1223 nrules * sizeof (struct element_list_t));
1224 for (cnt = 0; cnt < nrules; ++cnt)
1225 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1226 && (collate->ellipsis_weight.weights[cnt].w[0]
1227 == ELEMENT_ELLIPSIS2))
1229 elem->weights[cnt].w = (struct element_t **)
1230 obstack_alloc (&collate->mempool,
1231 sizeof (struct element_t *));
1232 elem->weights[cnt].w[0] = elem;
1233 elem->weights[cnt].cnt = 1;
1235 else
1237 /* Simply use the weight from `ellipsis_weight'. */
1238 elem->weights[cnt].w =
1239 collate->ellipsis_weight.weights[cnt].w;
1240 elem->weights[cnt].cnt =
1241 collate->ellipsis_weight.weights[cnt].cnt;
1245 /* Increment for the next round. */
1246 increment:
1247 for (cnt = len - 1; cnt >= 0; --cnt)
1248 if (++mbcnt[cnt] != '\0')
1249 break;
1251 /* Find out whether this was all. */
1252 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1253 /* Yep, that's all. */
1254 break;
1258 else
1260 /* For symbolic range we naturally must have a beginning and an
1261 end specified by the user. */
1262 if (startp == NULL)
1263 lr_error (ldfile, _("\
1264 %s: symbolic range ellipsis must not directly follow `order_start'"),
1265 "LC_COLLATE");
1266 else if (endp == NULL)
1267 lr_error (ldfile, _("\
1268 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1269 "LC_COLLATE");
1270 else
1272 /* Determine the range. To do so we have to determine the
1273 common prefix of the both names and then the numeric
1274 values of both ends. */
1275 size_t lenfrom = strlen (startp->name);
1276 size_t lento = strlen (endp->name);
1277 char buf[lento + 1];
1278 int preflen = 0;
1279 long int from;
1280 long int to;
1281 char *cp;
1282 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1284 if (lenfrom != lento)
1286 invalid_range:
1287 lr_error (ldfile, _("\
1288 `%s' and `%.*s' are no valid names for symbolic range"),
1289 startp->name, (int) lento, endp->name);
1290 return;
1293 while (startp->name[preflen] == endp->name[preflen])
1294 if (startp->name[preflen] == '\0')
1295 /* Nothing to be done. The start and end point are identical
1296 and while inserting the end point we have already given
1297 the user an error message. */
1298 return;
1299 else
1300 ++preflen;
1302 errno = 0;
1303 from = strtol (startp->name + preflen, &cp, base);
1304 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1305 goto invalid_range;
1307 errno = 0;
1308 to = strtol (endp->name + preflen, &cp, base);
1309 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1310 goto invalid_range;
1312 /* Copy the prefix. */
1313 memcpy (buf, startp->name, preflen);
1315 /* Loop over all values. */
1316 for (++from; from < to; ++from)
1318 struct element_t *elem = NULL;
1319 struct charseq *seq;
1320 uint32_t wc;
1321 int cnt;
1323 /* Generate the name. */
1324 sprintf (buf + preflen, base == 10 ? "%ld" : "%lX", from);
1326 /* Look whether this name is already defined. */
1327 if (find_entry (&collate->seq_table, buf, symlen,
1328 (void **) &elem) == 0)
1330 if (elem->next != NULL || (collate->cursor != NULL
1331 && elem->next == collate->cursor))
1333 lr_error (ldfile, _("\
1334 %s: order for `%.*s' already defined at %s:%Zu"),
1335 "LC_COLLATE", (int) lenfrom, buf,
1336 elem->file, elem->line);
1337 continue;
1340 if (elem->name == NULL)
1342 lr_error (ldfile, _("%s: `%s' must be a character"),
1343 "LC_COLLATE", buf);
1344 continue;
1348 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1350 /* Search for a character of this name. */
1351 seq = charmap_find_value (charmap, buf, lenfrom);
1352 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1354 wc = repertoire_find_value (repertoire, buf, lenfrom);
1356 if (seq != NULL)
1357 seq->ucs4 = wc;
1359 else
1360 wc = seq->ucs4;
1362 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1363 /* We don't know anything about a character with this
1364 name. XXX Should we warn? */
1365 continue;
1367 if (elem == NULL)
1369 uint32_t wcs[2] = { wc, 0 };
1371 /* We have to allocate an entry. */
1372 elem = new_element (collate,
1373 seq != NULL ? seq->bytes : NULL,
1374 seq != NULL ? seq->nbytes : 0,
1375 wc == ILLEGAL_CHAR_VALUE
1376 ? NULL : wcs, buf, lenfrom, 1);
1378 else
1380 /* Update the element. */
1381 if (seq != NULL)
1383 elem->mbs = obstack_copy0 (&collate->mempool,
1384 seq->bytes, seq->nbytes);
1385 elem->nmbs = seq->nbytes;
1388 if (wc != ILLEGAL_CHAR_VALUE)
1390 uint32_t zero = 0;
1392 obstack_grow (&collate->mempool,
1393 &wc, sizeof (uint32_t));
1394 obstack_grow (&collate->mempool,
1395 &zero, sizeof (uint32_t));
1396 elem->wcs = obstack_finish (&collate->mempool);
1397 elem->nwcs = 1;
1401 elem->file = ldfile->fname;
1402 elem->line = ldfile->lineno;
1403 elem->section = collate->current_section;
1406 /* Enqueue the new element. */
1407 elem->last = collate->cursor;
1408 elem->next = collate->cursor->next;
1409 elem->last->next = elem;
1410 if (elem->next != NULL)
1411 elem->next->last = elem;
1412 collate->cursor = elem;
1414 /* Now add the weights. They come from the `ellipsis_weights'
1415 member of `collate'. */
1416 elem->weights = (struct element_list_t *)
1417 obstack_alloc (&collate->mempool,
1418 nrules * sizeof (struct element_list_t));
1419 for (cnt = 0; cnt < nrules; ++cnt)
1420 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1421 && (collate->ellipsis_weight.weights[cnt].w[0]
1422 == ELEMENT_ELLIPSIS2))
1424 elem->weights[cnt].w = (struct element_t **)
1425 obstack_alloc (&collate->mempool,
1426 sizeof (struct element_t *));
1427 elem->weights[cnt].w[0] = elem;
1428 elem->weights[cnt].cnt = 1;
1430 else
1432 /* Simply use the weight from `ellipsis_weight'. */
1433 elem->weights[cnt].w =
1434 collate->ellipsis_weight.weights[cnt].w;
1435 elem->weights[cnt].cnt =
1436 collate->ellipsis_weight.weights[cnt].cnt;
1444 static void
1445 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1446 struct localedef_t *copy_locale, int ignore_content)
1448 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1450 struct locale_collate_t *collate;
1452 if (copy_locale == NULL)
1454 collate = locale->categories[LC_COLLATE].collate =
1455 (struct locale_collate_t *)
1456 xcalloc (1, sizeof (struct locale_collate_t));
1458 /* Init the various data structures. */
1459 init_hash (&collate->elem_table, 100);
1460 init_hash (&collate->sym_table, 100);
1461 init_hash (&collate->seq_table, 500);
1462 obstack_init (&collate->mempool);
1464 collate->col_weight_max = -1;
1466 else
1467 /* Reuse the copy_locale's data structures. */
1468 collate = locale->categories[LC_COLLATE].collate =
1469 copy_locale->categories[LC_COLLATE].collate;
1472 ldfile->translate_strings = 0;
1473 ldfile->return_widestr = 0;
1477 void
1478 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1480 /* Now is the time when we can assign the individual collation
1481 values for all the symbols. We have possibly different values
1482 for the wide- and the multibyte-character symbols. This is done
1483 since it might make a difference in the encoding if there is in
1484 some cases no multibyte-character but there are wide-characters.
1485 (The other way around it is not important since theencoded
1486 collation value in the wide-character case is 32 bits wide and
1487 therefore requires no encoding).
1489 The lowest collation value assigned is 2. Zero is reserved for
1490 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1491 functions and 1 is used to separate the individual passes for the
1492 different rules.
1494 We also have to construct is list with all the bytes/words which
1495 can come first in a sequence, followed by all the elements which
1496 also start with this byte/word. The order is reverse which has
1497 among others the important effect that longer strings are located
1498 first in the list. This is required for the output data since
1499 the algorithm used in `strcoll' etc depends on this.
1501 The multibyte case is easy. We simply sort into an array with
1502 256 elements. */
1503 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1504 int mbact[nrules];
1505 int wcact;
1506 int mbseqact;
1507 int wcseqact;
1508 struct element_t *runp;
1509 int i;
1510 int need_undefined = 0;
1511 struct section_list *sect;
1512 int ruleidx;
1513 int nr_wide_elems = 0;
1515 if (collate == NULL)
1517 /* No data, no check. */
1518 if (! be_quiet)
1519 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1520 "LC_COLLATE"));
1521 return;
1524 /* If this assertion is hit change the type in `element_t'. */
1525 assert (nrules <= sizeof (runp->used_in_level) * 8);
1527 /* Make sure that the `position' rule is used either in all sections
1528 or in none. */
1529 for (i = 0; i < nrules; ++i)
1530 for (sect = collate->sections; sect != NULL; sect = sect->next)
1531 if (sect->rules != NULL
1532 && ((sect->rules[i] & sort_position)
1533 != (collate->sections->rules[i] & sort_position)))
1535 WITH_CUR_LOCALE (error (0, 0, _("\
1536 %s: `position' must be used for a specific level in all sections or none"),
1537 "LC_COLLATE"));
1538 break;
1541 /* Find out which elements are used at which level. At the same
1542 time we find out whether we have any undefined symbols. */
1543 runp = collate->start;
1544 while (runp != NULL)
1546 if (runp->mbs != NULL)
1548 for (i = 0; i < nrules; ++i)
1550 int j;
1552 for (j = 0; j < runp->weights[i].cnt; ++j)
1553 /* A NULL pointer as the weight means IGNORE. */
1554 if (runp->weights[i].w[j] != NULL)
1556 if (runp->weights[i].w[j]->weights == NULL)
1558 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1559 runp->line,
1560 _("symbol `%s' not defined"),
1561 runp->weights[i].w[j]->name));
1563 need_undefined = 1;
1564 runp->weights[i].w[j] = &collate->undefined;
1566 else
1567 /* Set the bit for the level. */
1568 runp->weights[i].w[j]->used_in_level |= 1 << i;
1573 /* Up to the next entry. */
1574 runp = runp->next;
1577 /* Walk through the list of defined sequences and assign weights. Also
1578 create the data structure which will allow generating the single byte
1579 character based tables.
1581 Since at each time only the weights for each of the rules are
1582 only compared to other weights for this rule it is possible to
1583 assign more compact weight values than simply counting all
1584 weights in sequence. We can assign weights from 3, one for each
1585 rule individually and only for those elements, which are actually
1586 used for this rule.
1588 Why is this important? It is not for the wide char table. But
1589 it is for the singlebyte output since here larger numbers have to
1590 be encoded to make it possible to emit the value as a byte
1591 string. */
1592 for (i = 0; i < nrules; ++i)
1593 mbact[i] = 2;
1594 wcact = 2;
1595 mbseqact = 0;
1596 wcseqact = 0;
1597 runp = collate->start;
1598 while (runp != NULL)
1600 /* Determine the order. */
1601 if (runp->used_in_level != 0)
1603 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1604 nrules * sizeof (int));
1606 for (i = 0; i < nrules; ++i)
1607 if ((runp->used_in_level & (1 << i)) != 0)
1608 runp->mborder[i] = mbact[i]++;
1609 else
1610 runp->mborder[i] = 0;
1613 if (runp->mbs != NULL)
1615 struct element_t **eptr;
1616 struct element_t *lastp = NULL;
1618 /* Find the point where to insert in the list. */
1619 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1620 while (*eptr != NULL)
1622 if ((*eptr)->nmbs < runp->nmbs)
1623 break;
1625 if ((*eptr)->nmbs == runp->nmbs)
1627 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1629 if (c == 0)
1631 /* This should not happen. It means that we have
1632 to symbols with the same byte sequence. It is
1633 of course an error. */
1634 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1635 (*eptr)->line,
1636 _("\
1637 symbol `%s' has the same encoding as"), (*eptr)->name);
1638 error_at_line (0, 0, runp->file,
1639 runp->line,
1640 _("symbol `%s'"),
1641 runp->name));
1642 goto dont_insert;
1644 else if (c < 0)
1645 /* Insert it here. */
1646 break;
1649 /* To the next entry. */
1650 lastp = *eptr;
1651 eptr = &(*eptr)->mbnext;
1654 /* Set the pointers. */
1655 runp->mbnext = *eptr;
1656 runp->mblast = lastp;
1657 if (*eptr != NULL)
1658 (*eptr)->mblast = runp;
1659 *eptr = runp;
1660 dont_insert:
1664 if (runp->used_in_level)
1666 runp->wcorder = wcact++;
1668 /* We take the opportunity to count the elements which have
1669 wide characters. */
1670 ++nr_wide_elems;
1673 if (runp->is_character)
1675 if (runp->nmbs == 1)
1676 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1678 runp->wcseqorder = wcseqact++;
1680 else if (runp->mbs != NULL && runp->weights != NULL)
1681 /* This is for collation elements. */
1682 runp->wcseqorder = wcseqact++;
1684 /* Up to the next entry. */
1685 runp = runp->next;
1688 /* Find out whether any of the `mbheads' entries is unset. In this
1689 case we use the UNDEFINED entry. */
1690 for (i = 1; i < 256; ++i)
1691 if (collate->mbheads[i] == NULL)
1693 need_undefined = 1;
1694 collate->mbheads[i] = &collate->undefined;
1697 /* Now to the wide character case. */
1698 collate->wcheads.p = 6;
1699 collate->wcheads.q = 10;
1700 wchead_table_init (&collate->wcheads);
1702 collate->wcseqorder.p = 6;
1703 collate->wcseqorder.q = 10;
1704 collseq_table_init (&collate->wcseqorder);
1706 /* Start adding. */
1707 runp = collate->start;
1708 while (runp != NULL)
1710 if (runp->wcs != NULL)
1712 struct element_t *e;
1713 struct element_t **eptr;
1714 struct element_t *lastp;
1716 /* Insert the collation sequence value. */
1717 if (runp->is_character)
1718 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1719 runp->wcseqorder);
1721 /* Find the point where to insert in the list. */
1722 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1723 eptr = &e;
1724 lastp = NULL;
1725 while (*eptr != NULL)
1727 if ((*eptr)->nwcs < runp->nwcs)
1728 break;
1730 if ((*eptr)->nwcs == runp->nwcs)
1732 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1733 (wchar_t *) runp->wcs, runp->nwcs);
1735 if (c == 0)
1737 /* This should not happen. It means that we have
1738 two symbols with the same byte sequence. It is
1739 of course an error. */
1740 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1741 (*eptr)->line,
1742 _("\
1743 symbol `%s' has the same encoding as"), (*eptr)->name);
1744 error_at_line (0, 0, runp->file,
1745 runp->line,
1746 _("symbol `%s'"),
1747 runp->name));
1748 goto dont_insertwc;
1750 else if (c < 0)
1751 /* Insert it here. */
1752 break;
1755 /* To the next entry. */
1756 lastp = *eptr;
1757 eptr = &(*eptr)->wcnext;
1760 /* Set the pointers. */
1761 runp->wcnext = *eptr;
1762 runp->wclast = lastp;
1763 if (*eptr != NULL)
1764 (*eptr)->wclast = runp;
1765 *eptr = runp;
1766 if (eptr == &e)
1767 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1768 dont_insertwc:
1772 /* Up to the next entry. */
1773 runp = runp->next;
1776 collseq_table_finalize (&collate->wcseqorder);
1778 /* Now determine whether the UNDEFINED entry is needed and if yes,
1779 whether it was defined. */
1780 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1781 if (collate->undefined.file == NULL)
1783 if (need_undefined)
1785 /* This seems not to be enforced by recent standards. Don't
1786 emit an error, simply append UNDEFINED at the end. */
1787 if (0)
1788 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1790 /* Add UNDEFINED at the end. */
1791 collate->undefined.mborder =
1792 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1794 for (i = 0; i < nrules; ++i)
1795 collate->undefined.mborder[i] = mbact[i]++;
1798 /* In any case we will need the definition for the wide character
1799 case. But we will not complain that it is missing since the
1800 specification strangely enough does not seem to account for
1801 this. */
1802 collate->undefined.wcorder = wcact++;
1805 /* Finally, try to unify the rules for the sections. Whenever the rules
1806 for a section are the same as those for another section give the
1807 ruleset the same index. Since there are never many section we can
1808 use an O(n^2) algorithm here. */
1809 sect = collate->sections;
1810 while (sect != NULL && sect->rules == NULL)
1811 sect = sect->next;
1813 /* Bail out if we have no sections because of earlier errors. */
1814 if (sect == NULL)
1816 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1817 _("too many errors; giving up")));
1818 return;
1821 ruleidx = 0;
1824 struct section_list *osect = collate->sections;
1826 while (osect != sect)
1827 if (osect->rules != NULL
1828 && memcmp (osect->rules, sect->rules, nrules) == 0)
1829 break;
1830 else
1831 osect = osect->next;
1833 if (osect == sect)
1834 sect->ruleidx = ruleidx++;
1835 else
1836 sect->ruleidx = osect->ruleidx;
1838 /* Next section. */
1840 sect = sect->next;
1841 while (sect != NULL && sect->rules == NULL);
1843 while (sect != NULL);
1844 /* We are currently not prepared for more than 128 rulesets. But this
1845 should never really be a problem. */
1846 assert (ruleidx <= 128);
1850 static int32_t
1851 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1852 struct element_t *elem)
1854 size_t cnt;
1855 int32_t retval;
1857 /* Optimize the use of UNDEFINED. */
1858 if (elem == &collate->undefined)
1859 /* The weights are already inserted. */
1860 return 0;
1862 /* This byte can start exactly one collation element and this is
1863 a single byte. We can directly give the index to the weights. */
1864 retval = obstack_object_size (pool);
1866 /* Construct the weight. */
1867 for (cnt = 0; cnt < nrules; ++cnt)
1869 char buf[elem->weights[cnt].cnt * 7];
1870 int len = 0;
1871 int i;
1873 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1874 /* Encode the weight value. We do nothing for IGNORE entries. */
1875 if (elem->weights[cnt].w[i] != NULL)
1876 len += utf8_encode (&buf[len],
1877 elem->weights[cnt].w[i]->mborder[cnt]);
1879 /* And add the buffer content. */
1880 obstack_1grow (pool, len);
1881 obstack_grow (pool, buf, len);
1884 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1888 static int32_t
1889 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1890 struct element_t *elem)
1892 size_t cnt;
1893 int32_t retval;
1895 /* Optimize the use of UNDEFINED. */
1896 if (elem == &collate->undefined)
1897 /* The weights are already inserted. */
1898 return 0;
1900 /* This byte can start exactly one collation element and this is
1901 a single byte. We can directly give the index to the weights. */
1902 retval = obstack_object_size (pool) / sizeof (int32_t);
1904 /* Construct the weight. */
1905 for (cnt = 0; cnt < nrules; ++cnt)
1907 int32_t buf[elem->weights[cnt].cnt];
1908 int i;
1909 int32_t j;
1911 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1912 if (elem->weights[cnt].w[i] != NULL)
1913 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1915 /* And add the buffer content. */
1916 obstack_int32_grow (pool, j);
1918 obstack_grow (pool, buf, j * sizeof (int32_t));
1921 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1925 void
1926 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
1927 const char *output_path)
1929 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1930 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1931 struct iovec iov[2 + nelems];
1932 struct locale_file data;
1933 uint32_t idx[nelems];
1934 size_t cnt;
1935 size_t ch;
1936 int32_t tablemb[256];
1937 struct obstack weightpool;
1938 struct obstack extrapool;
1939 struct obstack indirectpool;
1940 struct section_list *sect;
1941 struct collidx_table tablewc;
1942 uint32_t elem_size;
1943 uint32_t *elem_table;
1944 int i;
1945 struct element_t *runp;
1947 data.magic = LIMAGIC (LC_COLLATE);
1948 data.n = nelems;
1949 iov[0].iov_base = (void *) &data;
1950 iov[0].iov_len = sizeof (data);
1952 iov[1].iov_base = (void *) idx;
1953 iov[1].iov_len = sizeof (idx);
1955 idx[0] = iov[0].iov_len + iov[1].iov_len;
1956 cnt = 0;
1958 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1959 iov[2 + cnt].iov_base = &nrules;
1960 iov[2 + cnt].iov_len = sizeof (uint32_t);
1961 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1962 ++cnt;
1964 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
1965 if (collate == NULL)
1967 int32_t dummy = 0;
1969 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1971 /* The words have to be handled specially. */
1972 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
1974 iov[2 + cnt].iov_base = &dummy;
1975 iov[2 + cnt].iov_len = sizeof (int32_t);
1977 else
1979 iov[2 + cnt].iov_base = NULL;
1980 iov[2 + cnt].iov_len = 0;
1983 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1984 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1985 ++cnt;
1988 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
1990 write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);
1992 return;
1995 obstack_init (&weightpool);
1996 obstack_init (&extrapool);
1997 obstack_init (&indirectpool);
1999 /* Since we are using the sign of an integer to mark indirection the
2000 offsets in the arrays we are indirectly referring to must not be
2001 zero since -0 == 0. Therefore we add a bit of dummy content. */
2002 obstack_int32_grow (&extrapool, 0);
2003 obstack_int32_grow (&indirectpool, 0);
2005 /* Prepare the ruleset table. */
2006 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2007 if (sect->rules != NULL && sect->ruleidx == i)
2009 int j;
2011 obstack_make_room (&weightpool, nrules);
2013 for (j = 0; j < nrules; ++j)
2014 obstack_1grow_fast (&weightpool, sect->rules[j]);
2015 ++i;
2017 /* And align the output. */
2018 i = (nrules * i) % __alignof__ (int32_t);
2019 if (i > 0)
2021 obstack_1grow (&weightpool, '\0');
2022 while (++i < __alignof__ (int32_t));
2024 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2025 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2026 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2027 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2028 ++cnt;
2030 /* Generate the 8-bit table. Walk through the lists of sequences
2031 starting with the same byte and add them one after the other to
2032 the table. In case we have more than one sequence starting with
2033 the same byte we have to use extra indirection.
2035 First add a record for the NUL byte. This entry will never be used
2036 so it does not matter. */
2037 tablemb[0] = 0;
2039 /* Now insert the `UNDEFINED' value if it is used. Since this value
2040 will probably be used more than once it is good to store the
2041 weights only once. */
2042 if (collate->undefined.used_in_level != 0)
2043 output_weight (&weightpool, collate, &collate->undefined);
2045 for (ch = 1; ch < 256; ++ch)
2046 if (collate->mbheads[ch]->mbnext == NULL
2047 && collate->mbheads[ch]->nmbs <= 1)
2049 tablemb[ch] = output_weight (&weightpool, collate,
2050 collate->mbheads[ch]);
2052 else
2054 /* The entries in the list are sorted by length and then
2055 alphabetically. This is the order in which we will add the
2056 elements to the collation table. This allows simply walking
2057 the table in sequence and stopping at the first matching
2058 entry. Since the longer sequences are coming first in the
2059 list they have the possibility to match first, just as it
2060 has to be. In the worst case we are walking to the end of
2061 the list where we put, if no singlebyte sequence is defined
2062 in the locale definition, the weights for UNDEFINED.
2064 To reduce the length of the search list we compress them a bit.
2065 This happens by collecting sequences of consecutive byte
2066 sequences in one entry (having a begin and an end byte sequence)
2067 and add only one index into the weight table. We can find the
2068 consecutive entries since they are also consecutive in the list. */
2069 struct element_t *runp = collate->mbheads[ch];
2070 struct element_t *lastp;
2072 assert ((obstack_object_size (&extrapool)
2073 & (__alignof__ (int32_t) - 1)) == 0);
2075 tablemb[ch] = -obstack_object_size (&extrapool);
2079 /* Store the current index in the weight table. We know that
2080 the current position in the `extrapool' is aligned on a
2081 32-bit address. */
2082 int32_t weightidx;
2083 int added;
2085 /* Find out whether this is a single entry or we have more than
2086 one consecutive entry. */
2087 if (runp->mbnext != NULL
2088 && runp->nmbs == runp->mbnext->nmbs
2089 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2090 && (runp->mbs[runp->nmbs - 1]
2091 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2093 int i;
2094 struct element_t *series_startp = runp;
2095 struct element_t *curp;
2097 /* Compute how much space we will need. */
2098 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2099 + __alignof__ (int32_t) - 1)
2100 & ~(__alignof__ (int32_t) - 1));
2101 assert ((obstack_object_size (&extrapool)
2102 & (__alignof__ (int32_t) - 1)) == 0);
2103 obstack_make_room (&extrapool, added);
2105 /* More than one consecutive entry. We mark this by having
2106 a negative index into the indirect table. */
2107 obstack_int32_grow_fast (&extrapool,
2108 -(obstack_object_size (&indirectpool)
2109 / sizeof (int32_t)));
2111 /* Now search first the end of the series. */
2113 runp = runp->mbnext;
2114 while (runp->mbnext != NULL
2115 && runp->nmbs == runp->mbnext->nmbs
2116 && memcmp (runp->mbs, runp->mbnext->mbs,
2117 runp->nmbs - 1) == 0
2118 && (runp->mbs[runp->nmbs - 1]
2119 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2121 /* Now walk backward from here to the beginning. */
2122 curp = runp;
2124 assert (runp->nmbs <= 256);
2125 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2126 for (i = 1; i < curp->nmbs; ++i)
2127 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2129 /* Now find the end of the consecutive sequence and
2130 add all the indices in the indirect pool. */
2133 weightidx = output_weight (&weightpool, collate, curp);
2134 obstack_int32_grow (&indirectpool, weightidx);
2136 curp = curp->mblast;
2138 while (curp != series_startp);
2140 /* Add the final weight. */
2141 weightidx = output_weight (&weightpool, collate, curp);
2142 obstack_int32_grow (&indirectpool, weightidx);
2144 /* And add the end byte sequence. Without length this
2145 time. */
2146 for (i = 1; i < curp->nmbs; ++i)
2147 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2149 else
2151 /* A single entry. Simply add the index and the length and
2152 string (except for the first character which is already
2153 tested for). */
2154 int i;
2156 /* Output the weight info. */
2157 weightidx = output_weight (&weightpool, collate, runp);
2159 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2160 + __alignof__ (int32_t) - 1)
2161 & ~(__alignof__ (int32_t) - 1));
2162 assert ((obstack_object_size (&extrapool)
2163 & (__alignof__ (int32_t) - 1)) == 0);
2164 obstack_make_room (&extrapool, added);
2166 obstack_int32_grow_fast (&extrapool, weightidx);
2167 assert (runp->nmbs <= 256);
2168 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2170 for (i = 1; i < runp->nmbs; ++i)
2171 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2174 /* Add alignment bytes if necessary. */
2175 while ((obstack_object_size (&extrapool)
2176 & (__alignof__ (int32_t) - 1)) != 0)
2177 obstack_1grow_fast (&extrapool, '\0');
2179 /* Next entry. */
2180 lastp = runp;
2181 runp = runp->mbnext;
2183 while (runp != NULL);
2185 assert ((obstack_object_size (&extrapool)
2186 & (__alignof__ (int32_t) - 1)) == 0);
2188 /* If the final entry in the list is not a single character we
2189 add an UNDEFINED entry here. */
2190 if (lastp->nmbs != 1)
2192 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2193 & ~(__alignof__ (int32_t) - 1));
2194 obstack_make_room (&extrapool, added);
2196 obstack_int32_grow_fast (&extrapool, 0);
2197 /* XXX What rule? We just pick the first. */
2198 obstack_1grow_fast (&extrapool, 0);
2199 /* Length is zero. */
2200 obstack_1grow_fast (&extrapool, 0);
2202 /* Add alignment bytes if necessary. */
2203 while ((obstack_object_size (&extrapool)
2204 & (__alignof__ (int32_t) - 1)) != 0)
2205 obstack_1grow_fast (&extrapool, '\0');
2209 /* Add padding to the tables if necessary. */
2210 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2211 != 0)
2212 obstack_1grow (&weightpool, 0);
2214 /* Now add the four tables. */
2215 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2216 iov[2 + cnt].iov_base = tablemb;
2217 iov[2 + cnt].iov_len = sizeof (tablemb);
2218 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2219 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2220 ++cnt;
2222 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2223 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2224 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2225 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2226 ++cnt;
2228 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2229 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2230 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2231 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2232 ++cnt;
2234 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2235 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2236 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2237 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2238 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2239 ++cnt;
2242 /* Now the same for the wide character table. We need to store some
2243 more information here. */
2244 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2245 iov[2 + cnt].iov_base = NULL;
2246 iov[2 + cnt].iov_len = 0;
2247 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2248 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2249 ++cnt;
2251 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2252 iov[2 + cnt].iov_base = NULL;
2253 iov[2 + cnt].iov_len = 0;
2254 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2255 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2256 ++cnt;
2258 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2259 iov[2 + cnt].iov_base = NULL;
2260 iov[2 + cnt].iov_len = 0;
2261 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2262 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2263 ++cnt;
2265 /* Since we are using the sign of an integer to mark indirection the
2266 offsets in the arrays we are indirectly referring to must not be
2267 zero since -0 == 0. Therefore we add a bit of dummy content. */
2268 obstack_int32_grow (&extrapool, 0);
2269 obstack_int32_grow (&indirectpool, 0);
2271 /* Now insert the `UNDEFINED' value if it is used. Since this value
2272 will probably be used more than once it is good to store the
2273 weights only once. */
2274 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2275 abort ();
2277 /* Generate the table. Walk through the lists of sequences starting
2278 with the same wide character and add them one after the other to
2279 the table. In case we have more than one sequence starting with
2280 the same byte we have to use extra indirection. */
2282 auto void add_to_tablewc (uint32_t ch, struct element_t *runp);
2284 void add_to_tablewc (uint32_t ch, struct element_t *runp)
2286 if (runp->wcnext == NULL && runp->nwcs == 1)
2288 int32_t weigthidx = output_weightwc (&weightpool, collate, runp);
2289 collidx_table_add (&tablewc, ch, weigthidx);
2291 else
2293 /* As for the singlebyte table, we recognize sequences and
2294 compress them. */
2295 struct element_t *lastp;
2297 collidx_table_add (&tablewc, ch,
2298 -(obstack_object_size (&extrapool) / sizeof (uint32_t)));
2302 /* Store the current index in the weight table. We know that
2303 the current position in the `extrapool' is aligned on a
2304 32-bit address. */
2305 int32_t weightidx;
2306 int added;
2308 /* Find out whether this is a single entry or we have more than
2309 one consecutive entry. */
2310 if (runp->wcnext != NULL
2311 && runp->nwcs == runp->wcnext->nwcs
2312 && wmemcmp ((wchar_t *) runp->wcs,
2313 (wchar_t *)runp->wcnext->wcs,
2314 runp->nwcs - 1) == 0
2315 && (runp->wcs[runp->nwcs - 1]
2316 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2318 int i;
2319 struct element_t *series_startp = runp;
2320 struct element_t *curp;
2322 /* Now add first the initial byte sequence. */
2323 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2324 if (sizeof (int32_t) == sizeof (int))
2325 obstack_make_room (&extrapool, added);
2327 /* More than one consecutive entry. We mark this by having
2328 a negative index into the indirect table. */
2329 obstack_int32_grow_fast (&extrapool,
2330 -(obstack_object_size (&indirectpool)
2331 / sizeof (int32_t)));
2332 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2335 runp = runp->wcnext;
2336 while (runp->wcnext != NULL
2337 && runp->nwcs == runp->wcnext->nwcs
2338 && wmemcmp ((wchar_t *) runp->wcs,
2339 (wchar_t *)runp->wcnext->wcs,
2340 runp->nwcs - 1) == 0
2341 && (runp->wcs[runp->nwcs - 1]
2342 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2344 /* Now walk backward from here to the beginning. */
2345 curp = runp;
2347 for (i = 1; i < runp->nwcs; ++i)
2348 obstack_int32_grow_fast (&extrapool, curp->wcs[i]);
2350 /* Now find the end of the consecutive sequence and
2351 add all the indeces in the indirect pool. */
2354 weightidx = output_weightwc (&weightpool, collate,
2355 curp);
2356 obstack_int32_grow (&indirectpool, weightidx);
2358 curp = curp->wclast;
2360 while (curp != series_startp);
2362 /* Add the final weight. */
2363 weightidx = output_weightwc (&weightpool, collate, curp);
2364 obstack_int32_grow (&indirectpool, weightidx);
2366 /* And add the end byte sequence. Without length this
2367 time. */
2368 for (i = 1; i < curp->nwcs; ++i)
2369 obstack_int32_grow (&extrapool, curp->wcs[i]);
2371 else
2373 /* A single entry. Simply add the index and the length and
2374 string (except for the first character which is already
2375 tested for). */
2376 int i;
2378 /* Output the weight info. */
2379 weightidx = output_weightwc (&weightpool, collate, runp);
2381 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2382 if (sizeof (int) == sizeof (int32_t))
2383 obstack_make_room (&extrapool, added);
2385 obstack_int32_grow_fast (&extrapool, weightidx);
2386 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2387 for (i = 1; i < runp->nwcs; ++i)
2388 obstack_int32_grow_fast (&extrapool, runp->wcs[i]);
2391 /* Next entry. */
2392 lastp = runp;
2393 runp = runp->wcnext;
2395 while (runp != NULL);
2399 tablewc.p = 6;
2400 tablewc.q = 10;
2401 collidx_table_init (&tablewc);
2403 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2405 collidx_table_finalize (&tablewc);
2408 /* Now add the four tables. */
2409 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2410 iov[2 + cnt].iov_base = tablewc.result;
2411 iov[2 + cnt].iov_len = tablewc.result_size;
2412 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2413 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2414 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2415 ++cnt;
2417 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2418 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2419 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2420 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2421 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2422 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2423 ++cnt;
2425 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2426 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2427 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2428 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2429 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2430 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2431 ++cnt;
2433 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2434 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2435 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2436 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2437 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2438 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2439 ++cnt;
2442 /* Finally write the table with collation element names out. It is
2443 a hash table with a simple function which gets the name of the
2444 character as the input. One character might have many names. The
2445 value associated with the name is an index into the weight table
2446 where we are then interested in the first-level weight value.
2448 To determine how large the table should be we are counting the
2449 elements we have to put in. Since we are using internal chaining
2450 using a secondary hash function we have to make the table a bit
2451 larger to avoid extremely long search times. We can achieve
2452 good results with a 40% larger table than there are entries. */
2453 elem_size = 0;
2454 runp = collate->start;
2455 while (runp != NULL)
2457 if (runp->mbs != NULL && runp->weights != NULL)
2458 /* Yep, the element really counts. */
2459 ++elem_size;
2461 runp = runp->next;
2463 /* Add 40% and find the next prime number. */
2464 elem_size = MIN (next_prime (elem_size * 1.4), 257);
2466 /* Allocate the table. Each entry consists of two words: the hash
2467 value and an index in a secondary table which provides the index
2468 into the weight table and the string itself (so that a match can
2469 be determined). */
2470 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2471 elem_size * 2 * sizeof (uint32_t));
2472 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2474 /* Now add the elements. */
2475 runp = collate->start;
2476 while (runp != NULL)
2478 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2480 /* Compute the hash value of the name. */
2481 uint32_t namelen = strlen (runp->name);
2482 uint32_t hash = elem_hash (runp->name, namelen);
2483 size_t idx = hash % elem_size;
2485 if (elem_table[idx * 2] != 0)
2487 /* The spot is already taken. Try iterating using the value
2488 from the secondary hashing function. */
2489 size_t iter = hash % (elem_size - 2);
2493 idx += iter;
2494 if (idx >= elem_size)
2495 idx -= elem_size;
2497 while (elem_table[idx * 2] != 0);
2499 /* This is the spot where we will insert the value. */
2500 elem_table[idx * 2] = hash;
2501 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2503 /* Then the string itself including length. */
2504 obstack_1grow (&extrapool, namelen);
2505 obstack_grow (&extrapool, runp->name, namelen);
2507 /* And the multibyte representation. */
2508 obstack_1grow (&extrapool, runp->nmbs);
2509 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2511 /* And align again to 32 bits. */
2512 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2513 obstack_grow (&extrapool, "\0\0",
2514 (sizeof (int32_t)
2515 - ((1 + namelen + 1 + runp->nmbs)
2516 % sizeof (int32_t))));
2518 /* Now some 32-bit values: multibyte collation sequence,
2519 wide char string (including length), and wide char
2520 collation sequence. */
2521 obstack_int32_grow (&extrapool, runp->mbseqorder);
2523 obstack_int32_grow (&extrapool, runp->nwcs);
2524 obstack_grow (&extrapool, runp->wcs,
2525 runp->nwcs * sizeof (uint32_t));
2527 obstack_int32_grow (&extrapool, runp->wcseqorder);
2530 runp = runp->next;
2533 /* Prepare to write out this data. */
2534 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2535 iov[2 + cnt].iov_base = &elem_size;
2536 iov[2 + cnt].iov_len = sizeof (int32_t);
2537 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2538 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2539 ++cnt;
2541 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2542 iov[2 + cnt].iov_base = elem_table;
2543 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2544 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2545 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2546 ++cnt;
2548 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2549 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2550 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2551 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2552 ++cnt;
2554 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2555 iov[2 + cnt].iov_base = collate->mbseqorder;
2556 iov[2 + cnt].iov_len = 256;
2557 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2558 ++cnt;
2560 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2561 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2562 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2563 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2564 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2565 ++cnt;
2567 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2568 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2569 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2570 ++cnt;
2572 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2574 write_locale_data (output_path, "LC_COLLATE", 2 + cnt, iov);
2576 obstack_free (&weightpool, NULL);
2577 obstack_free (&extrapool, NULL);
2578 obstack_free (&indirectpool, NULL);
2582 void
2583 collate_read (struct linereader *ldfile, struct localedef_t *result,
2584 const struct charmap_t *charmap, const char *repertoire_name,
2585 int ignore_content)
2587 struct repertoire_t *repertoire = NULL;
2588 struct locale_collate_t *collate;
2589 struct token *now;
2590 struct token *arg = NULL;
2591 enum token_t nowtok;
2592 enum token_t was_ellipsis = tok_none;
2593 struct localedef_t *copy_locale = NULL;
2594 /* Parsing state:
2595 0 - start
2596 1 - between `order-start' and `order-end'
2597 2 - after `order-end'
2598 3 - after `reorder-after', waiting for `reorder-end'
2599 4 - after `reorder-end'
2600 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2601 6 - after `reorder-sections-end'
2603 int state = 0;
2605 /* Get the repertoire we have to use. */
2606 if (repertoire_name != NULL)
2607 repertoire = repertoire_read (repertoire_name);
2609 /* The rest of the line containing `LC_COLLATE' must be free. */
2610 lr_ignore_rest (ldfile, 1);
2614 now = lr_token (ldfile, charmap, result, NULL, verbose);
2615 nowtok = now->tok;
2617 while (nowtok == tok_eol);
2619 if (nowtok == tok_copy)
2621 state = 2;
2622 now = lr_token (ldfile, charmap, result, NULL, verbose);
2623 if (now->tok != tok_string)
2625 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2627 skip_category:
2629 now = lr_token (ldfile, charmap, result, NULL, verbose);
2630 while (now->tok != tok_eof && now->tok != tok_end);
2632 if (now->tok != tok_eof
2633 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2634 now->tok == tok_eof))
2635 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2636 else if (now->tok != tok_lc_collate)
2638 lr_error (ldfile, _("\
2639 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2640 lr_ignore_rest (ldfile, 0);
2642 else
2643 lr_ignore_rest (ldfile, 1);
2645 return;
2648 if (! ignore_content)
2650 /* Get the locale definition. */
2651 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2652 repertoire_name, charmap, NULL);
2653 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2655 /* Not yet loaded. So do it now. */
2656 if (locfile_read (copy_locale, charmap) != 0)
2657 goto skip_category;
2661 lr_ignore_rest (ldfile, 1);
2663 now = lr_token (ldfile, charmap, result, NULL, verbose);
2664 nowtok = now->tok;
2667 /* Prepare the data structures. */
2668 collate_startup (ldfile, result, copy_locale, ignore_content);
2669 collate = result->categories[LC_COLLATE].collate;
2671 while (1)
2673 char ucs4buf[10];
2674 char *symstr;
2675 size_t symlen;
2677 /* Of course we don't proceed beyond the end of file. */
2678 if (nowtok == tok_eof)
2679 break;
2681 /* Ignore empty lines. */
2682 if (nowtok == tok_eol)
2684 now = lr_token (ldfile, charmap, result, NULL, verbose);
2685 nowtok = now->tok;
2686 continue;
2689 switch (nowtok)
2691 case tok_copy:
2692 /* Allow copying other locales. */
2693 now = lr_token (ldfile, charmap, result, NULL, verbose);
2694 if (now->tok != tok_string)
2695 goto err_label;
2697 if (! ignore_content)
2698 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2699 charmap, result);
2701 lr_ignore_rest (ldfile, 1);
2702 break;
2704 case tok_coll_weight_max:
2705 /* Ignore the rest of the line if we don't need the input of
2706 this line. */
2707 if (ignore_content)
2709 lr_ignore_rest (ldfile, 0);
2710 break;
2713 if (state != 0)
2714 goto err_label;
2716 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2717 if (arg->tok != tok_number)
2718 goto err_label;
2719 if (collate->col_weight_max != -1)
2720 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2721 "LC_COLLATE", "col_weight_max");
2722 else
2723 collate->col_weight_max = arg->val.num;
2724 lr_ignore_rest (ldfile, 1);
2725 break;
2727 case tok_section_symbol:
2728 /* Ignore the rest of the line if we don't need the input of
2729 this line. */
2730 if (ignore_content)
2732 lr_ignore_rest (ldfile, 0);
2733 break;
2736 if (state != 0)
2737 goto err_label;
2739 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2740 if (arg->tok != tok_bsymbol)
2741 goto err_label;
2742 else if (!ignore_content)
2744 /* Check whether this section is already known. */
2745 struct section_list *known = collate->sections;
2746 while (known != NULL)
2748 if (strcmp (known->name, arg->val.str.startmb) == 0)
2749 break;
2750 known = known->next;
2753 if (known != NULL)
2755 lr_error (ldfile,
2756 _("%s: duplicate declaration of section `%s'"),
2757 "LC_COLLATE", arg->val.str.startmb);
2758 free (arg->val.str.startmb);
2760 else
2761 collate->sections = make_seclist_elem (collate,
2762 arg->val.str.startmb,
2763 collate->sections);
2765 lr_ignore_rest (ldfile, known == NULL);
2767 else
2769 free (arg->val.str.startmb);
2770 lr_ignore_rest (ldfile, 0);
2772 break;
2774 case tok_collating_element:
2775 /* Ignore the rest of the line if we don't need the input of
2776 this line. */
2777 if (ignore_content)
2779 lr_ignore_rest (ldfile, 0);
2780 break;
2783 if (state != 0 && state != 2)
2784 goto err_label;
2786 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2787 if (arg->tok != tok_bsymbol)
2788 goto err_label;
2789 else
2791 const char *symbol = arg->val.str.startmb;
2792 size_t symbol_len = arg->val.str.lenmb;
2794 /* Next the `from' keyword. */
2795 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2796 if (arg->tok != tok_from)
2798 free ((char *) symbol);
2799 goto err_label;
2802 ldfile->return_widestr = 1;
2803 ldfile->translate_strings = 1;
2805 /* Finally the string with the replacement. */
2806 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2808 ldfile->return_widestr = 0;
2809 ldfile->translate_strings = 0;
2811 if (arg->tok != tok_string)
2812 goto err_label;
2814 if (!ignore_content && symbol != NULL)
2816 /* The name is already defined. */
2817 if (check_duplicate (ldfile, collate, charmap,
2818 repertoire, symbol, symbol_len))
2819 goto col_elem_free;
2821 if (arg->val.str.startmb != NULL)
2822 insert_entry (&collate->elem_table, symbol, symbol_len,
2823 new_element (collate,
2824 arg->val.str.startmb,
2825 arg->val.str.lenmb - 1,
2826 arg->val.str.startwc,
2827 symbol, symbol_len, 0));
2829 else
2831 col_elem_free:
2832 if (symbol != NULL)
2833 free ((char *) symbol);
2834 if (arg->val.str.startmb != NULL)
2835 free (arg->val.str.startmb);
2836 if (arg->val.str.startwc != NULL)
2837 free (arg->val.str.startwc);
2839 lr_ignore_rest (ldfile, 1);
2841 break;
2843 case tok_collating_symbol:
2844 /* Ignore the rest of the line if we don't need the input of
2845 this line. */
2846 if (ignore_content)
2848 lr_ignore_rest (ldfile, 0);
2849 break;
2852 if (state != 0 && state != 2)
2853 goto err_label;
2855 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2856 if (arg->tok != tok_bsymbol)
2857 goto err_label;
2858 else
2860 char *symbol = arg->val.str.startmb;
2861 size_t symbol_len = arg->val.str.lenmb;
2862 char *endsymbol = NULL;
2863 size_t endsymbol_len = 0;
2864 enum token_t ellipsis = tok_none;
2866 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2867 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2869 ellipsis = arg->tok;
2871 arg = lr_token (ldfile, charmap, result, repertoire,
2872 verbose);
2873 if (arg->tok != tok_bsymbol)
2875 free (symbol);
2876 goto err_label;
2879 endsymbol = arg->val.str.startmb;
2880 endsymbol_len = arg->val.str.lenmb;
2882 lr_ignore_rest (ldfile, 1);
2884 else if (arg->tok != tok_eol)
2886 free (symbol);
2887 goto err_label;
2890 if (!ignore_content)
2892 if (symbol == NULL
2893 || (ellipsis != tok_none && endsymbol == NULL))
2895 lr_error (ldfile, _("\
2896 %s: unknown character in collating symbol name"),
2897 "LC_COLLATE");
2898 goto col_sym_free;
2900 else if (ellipsis == tok_none)
2902 /* A single symbol, no ellipsis. */
2903 if (check_duplicate (ldfile, collate, charmap,
2904 repertoire, symbol, symbol_len))
2905 /* The name is already defined. */
2906 goto col_sym_free;
2908 insert_entry (&collate->sym_table, symbol, symbol_len,
2909 new_symbol (collate, symbol, symbol_len));
2911 else if (symbol_len != endsymbol_len)
2913 col_sym_inv_range:
2914 lr_error (ldfile,
2915 _("invalid names for character range"));
2916 goto col_sym_free;
2918 else
2920 /* Oh my, we have to handle an ellipsis. First, as
2921 usual, determine the common prefix and then
2922 convert the rest into a range. */
2923 size_t prefixlen;
2924 unsigned long int from;
2925 unsigned long int to;
2926 char *endp;
2928 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2929 if (symbol[prefixlen] != endsymbol[prefixlen])
2930 break;
2932 /* Convert the rest into numbers. */
2933 symbol[symbol_len] = '\0';
2934 from = strtoul (&symbol[prefixlen], &endp,
2935 ellipsis == tok_ellipsis2 ? 16 : 10);
2936 if (*endp != '\0')
2937 goto col_sym_inv_range;
2939 endsymbol[symbol_len] = '\0';
2940 to = strtoul (&endsymbol[prefixlen], &endp,
2941 ellipsis == tok_ellipsis2 ? 16 : 10);
2942 if (*endp != '\0')
2943 goto col_sym_inv_range;
2945 if (from > to)
2946 goto col_sym_inv_range;
2948 /* Now loop over all entries. */
2949 while (from <= to)
2951 char *symbuf;
2953 symbuf = (char *) obstack_alloc (&collate->mempool,
2954 symbol_len + 1);
2956 /* Create the name. */
2957 sprintf (symbuf,
2958 ellipsis == tok_ellipsis2
2959 ? "%.*s%.*lX" : "%.*s%.*lu",
2960 (int) prefixlen, symbol,
2961 (int) (symbol_len - prefixlen), from);
2963 if (check_duplicate (ldfile, collate, charmap,
2964 repertoire, symbuf, symbol_len))
2965 /* The name is already defined. */
2966 goto col_sym_free;
2968 insert_entry (&collate->sym_table, symbuf,
2969 symbol_len,
2970 new_symbol (collate, symbuf,
2971 symbol_len));
2973 /* Increment the counter. */
2974 ++from;
2977 goto col_sym_free;
2980 else
2982 col_sym_free:
2983 if (symbol != NULL)
2984 free (symbol);
2985 if (endsymbol != NULL)
2986 free (endsymbol);
2989 break;
2991 case tok_symbol_equivalence:
2992 /* Ignore the rest of the line if we don't need the input of
2993 this line. */
2994 if (ignore_content)
2996 lr_ignore_rest (ldfile, 0);
2997 break;
3000 if (state != 0)
3001 goto err_label;
3003 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3004 if (arg->tok != tok_bsymbol)
3005 goto err_label;
3006 else
3008 const char *newname = arg->val.str.startmb;
3009 size_t newname_len = arg->val.str.lenmb;
3010 const char *symname;
3011 size_t symname_len;
3012 struct symbol_t *symval;
3014 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3015 if (arg->tok != tok_bsymbol)
3017 if (newname != NULL)
3018 free ((char *) newname);
3019 goto err_label;
3022 symname = arg->val.str.startmb;
3023 symname_len = arg->val.str.lenmb;
3025 if (newname == NULL)
3027 lr_error (ldfile, _("\
3028 %s: unknown character in equivalent definition name"),
3029 "LC_COLLATE");
3031 sym_equiv_free:
3032 if (newname != NULL)
3033 free ((char *) newname);
3034 if (symname != NULL)
3035 free ((char *) symname);
3036 break;
3038 if (symname == NULL)
3040 lr_error (ldfile, _("\
3041 %s: unknown character in equivalent definition value"),
3042 "LC_COLLATE");
3043 goto sym_equiv_free;
3046 /* See whether the symbol name is already defined. */
3047 if (find_entry (&collate->sym_table, symname, symname_len,
3048 (void **) &symval) != 0)
3050 lr_error (ldfile, _("\
3051 %s: unknown symbol `%s' in equivalent definition"),
3052 "LC_COLLATE", symname);
3053 goto col_sym_free;
3056 if (insert_entry (&collate->sym_table,
3057 newname, newname_len, symval) < 0)
3059 lr_error (ldfile, _("\
3060 error while adding equivalent collating symbol"));
3061 goto sym_equiv_free;
3064 free ((char *) symname);
3066 lr_ignore_rest (ldfile, 1);
3067 break;
3069 case tok_script:
3070 /* We get told about the scripts we know. */
3071 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3072 if (arg->tok != tok_bsymbol)
3073 goto err_label;
3074 else
3076 struct section_list *runp = collate->known_sections;
3077 char *name;
3079 while (runp != NULL)
3080 if (strncmp (runp->name, arg->val.str.startmb,
3081 arg->val.str.lenmb) == 0
3082 && runp->name[arg->val.str.lenmb] == '\0')
3083 break;
3084 else
3085 runp = runp->def_next;
3087 if (runp != NULL)
3089 lr_error (ldfile, _("duplicate definition of script `%s'"),
3090 runp->name);
3091 lr_ignore_rest (ldfile, 0);
3092 break;
3095 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3096 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3097 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3098 name[arg->val.str.lenmb] = '\0';
3099 runp->name = name;
3101 runp->def_next = collate->known_sections;
3102 collate->known_sections = runp;
3104 lr_ignore_rest (ldfile, 1);
3105 break;
3107 case tok_order_start:
3108 /* Ignore the rest of the line if we don't need the input of
3109 this line. */
3110 if (ignore_content)
3112 lr_ignore_rest (ldfile, 0);
3113 break;
3116 if (state != 0 && state != 1)
3117 goto err_label;
3118 state = 1;
3120 /* The 14652 draft does not specify whether all `order_start' lines
3121 must contain the same number of sort-rules, but 14651 does. So
3122 we require this here as well. */
3123 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3124 if (arg->tok == tok_bsymbol)
3126 /* This better should be a section name. */
3127 struct section_list *sp = collate->known_sections;
3128 while (sp != NULL
3129 && (sp->name == NULL
3130 || strncmp (sp->name, arg->val.str.startmb,
3131 arg->val.str.lenmb) != 0
3132 || sp->name[arg->val.str.lenmb] != '\0'))
3133 sp = sp->def_next;
3135 if (sp == NULL)
3137 lr_error (ldfile, _("\
3138 %s: unknown section name `%s'"),
3139 "LC_COLLATE", arg->val.str.startmb);
3140 /* We use the error section. */
3141 collate->current_section = &collate->error_section;
3143 if (collate->error_section.first == NULL)
3145 /* Insert &collate->error_section at the end of
3146 the collate->sections list. */
3147 if (collate->sections == NULL)
3148 collate->sections = &collate->error_section;
3149 else
3151 sp = collate->sections;
3152 while (sp->next != NULL)
3153 sp = sp->next;
3155 sp->next = &collate->error_section;
3157 collate->error_section.next = NULL;
3160 else
3162 /* One should not be allowed to open the same
3163 section twice. */
3164 if (sp->first != NULL)
3165 lr_error (ldfile, _("\
3166 %s: multiple order definitions for section `%s'"),
3167 "LC_COLLATE", sp->name);
3168 else
3170 /* Insert sp in the collate->sections list,
3171 right after collate->current_section. */
3172 if (collate->current_section == NULL)
3173 collate->current_section = sp;
3174 else
3176 sp->next = collate->current_section->next;
3177 collate->current_section->next = sp;
3181 /* Next should come the end of the line or a semicolon. */
3182 arg = lr_token (ldfile, charmap, result, repertoire,
3183 verbose);
3184 if (arg->tok == tok_eol)
3186 uint32_t cnt;
3188 /* This means we have exactly one rule: `forward'. */
3189 if (nrules > 1)
3190 lr_error (ldfile, _("\
3191 %s: invalid number of sorting rules"),
3192 "LC_COLLATE");
3193 else
3194 nrules = 1;
3195 sp->rules = obstack_alloc (&collate->mempool,
3196 (sizeof (enum coll_sort_rule)
3197 * nrules));
3198 for (cnt = 0; cnt < nrules; ++cnt)
3199 sp->rules[cnt] = sort_forward;
3201 /* Next line. */
3202 break;
3205 /* Get the next token. */
3206 arg = lr_token (ldfile, charmap, result, repertoire,
3207 verbose);
3210 else
3212 /* There is no section symbol. Therefore we use the unnamed
3213 section. */
3214 collate->current_section = &collate->unnamed_section;
3216 if (collate->unnamed_section.first != NULL)
3217 lr_error (ldfile, _("\
3218 %s: multiple order definitions for unnamed section"),
3219 "LC_COLLATE");
3220 else
3222 /* Insert &collate->unnamed_section at the beginning of
3223 the collate->sections list. */
3224 collate->unnamed_section.next = collate->sections;
3225 collate->sections = &collate->unnamed_section;
3229 /* Now read the direction names. */
3230 read_directions (ldfile, arg, charmap, repertoire, result);
3232 /* From now we need the strings untranslated. */
3233 ldfile->translate_strings = 0;
3234 break;
3236 case tok_order_end:
3237 /* Ignore the rest of the line if we don't need the input of
3238 this line. */
3239 if (ignore_content)
3241 lr_ignore_rest (ldfile, 0);
3242 break;
3245 if (state != 1)
3246 goto err_label;
3248 /* Handle ellipsis at end of list. */
3249 if (was_ellipsis != tok_none)
3251 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3252 repertoire, result);
3253 was_ellipsis = tok_none;
3256 state = 2;
3257 lr_ignore_rest (ldfile, 1);
3258 break;
3260 case tok_reorder_after:
3261 /* Ignore the rest of the line if we don't need the input of
3262 this line. */
3263 if (ignore_content)
3265 lr_ignore_rest (ldfile, 0);
3266 break;
3269 if (state == 1)
3271 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3272 "LC_COLLATE");
3273 state = 2;
3275 /* Handle ellipsis at end of list. */
3276 if (was_ellipsis != tok_none)
3278 handle_ellipsis (ldfile, arg->val.str.startmb,
3279 arg->val.str.lenmb, was_ellipsis, charmap,
3280 repertoire, result);
3281 was_ellipsis = tok_none;
3284 else if (state != 2 && state != 3)
3285 goto err_label;
3286 state = 3;
3288 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3289 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3291 /* Find this symbol in the sequence table. */
3292 char ucsbuf[10];
3293 char *startmb;
3294 size_t lenmb;
3295 struct element_t *insp;
3296 int no_error = 1;
3298 if (arg->tok == tok_bsymbol)
3300 startmb = arg->val.str.startmb;
3301 lenmb = arg->val.str.lenmb;
3303 else
3305 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3306 startmb = ucsbuf;
3307 lenmb = 9;
3310 if (find_entry (&collate->seq_table, startmb, lenmb,
3311 (void **) &insp) == 0)
3312 /* Yes, the symbol exists. Simply point the cursor
3313 to it. */
3314 collate->cursor = insp;
3315 else
3317 struct symbol_t *symbp;
3319 if (find_entry (&collate->sym_table, startmb, lenmb,
3320 (void **) &symbp) == 0)
3322 if (symbp->order->last != NULL
3323 || symbp->order->next != NULL)
3324 collate->cursor = symbp->order;
3325 else
3327 /* This is a collating symbol but its position
3328 is not yet defined. */
3329 lr_error (ldfile, _("\
3330 %s: order for collating symbol %.*s not yet defined"),
3331 "LC_COLLATE", (int) lenmb, startmb);
3332 collate->cursor = NULL;
3333 no_error = 0;
3336 else if (find_entry (&collate->elem_table, startmb, lenmb,
3337 (void **) &insp) == 0)
3339 if (insp->last != NULL || insp->next != NULL)
3340 collate->cursor = insp;
3341 else
3343 /* This is a collating element but its position
3344 is not yet defined. */
3345 lr_error (ldfile, _("\
3346 %s: order for collating element %.*s not yet defined"),
3347 "LC_COLLATE", (int) lenmb, startmb);
3348 collate->cursor = NULL;
3349 no_error = 0;
3352 else
3354 /* This is bad. The symbol after which we have to
3355 insert does not exist. */
3356 lr_error (ldfile, _("\
3357 %s: cannot reorder after %.*s: symbol not known"),
3358 "LC_COLLATE", (int) lenmb, startmb);
3359 collate->cursor = NULL;
3360 no_error = 0;
3364 lr_ignore_rest (ldfile, no_error);
3366 else
3367 /* This must not happen. */
3368 goto err_label;
3369 break;
3371 case tok_reorder_end:
3372 /* Ignore the rest of the line if we don't need the input of
3373 this line. */
3374 if (ignore_content)
3375 break;
3377 if (state != 3)
3378 goto err_label;
3379 state = 4;
3380 lr_ignore_rest (ldfile, 1);
3381 break;
3383 case tok_reorder_sections_after:
3384 /* Ignore the rest of the line if we don't need the input of
3385 this line. */
3386 if (ignore_content)
3388 lr_ignore_rest (ldfile, 0);
3389 break;
3392 if (state == 1)
3394 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3395 "LC_COLLATE");
3396 state = 2;
3398 /* Handle ellipsis at end of list. */
3399 if (was_ellipsis != tok_none)
3401 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3402 repertoire, result);
3403 was_ellipsis = tok_none;
3406 else if (state == 3)
3408 WITH_CUR_LOCALE (error (0, 0, _("\
3409 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3410 state = 4;
3412 else if (state != 2 && state != 4)
3413 goto err_label;
3414 state = 5;
3416 /* Get the name of the sections we are adding after. */
3417 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3418 if (arg->tok == tok_bsymbol)
3420 /* Now find a section with this name. */
3421 struct section_list *runp = collate->sections;
3423 while (runp != NULL)
3425 if (runp->name != NULL
3426 && strlen (runp->name) == arg->val.str.lenmb
3427 && memcmp (runp->name, arg->val.str.startmb,
3428 arg->val.str.lenmb) == 0)
3429 break;
3431 runp = runp->next;
3434 if (runp != NULL)
3435 collate->current_section = runp;
3436 else
3438 /* This is bad. The section after which we have to
3439 reorder does not exist. Therefore we cannot
3440 process the whole rest of this reorder
3441 specification. */
3442 lr_error (ldfile, _("%s: section `%.*s' not known"),
3443 "LC_COLLATE", (int) arg->val.str.lenmb,
3444 arg->val.str.startmb);
3448 lr_ignore_rest (ldfile, 0);
3450 now = lr_token (ldfile, charmap, result, NULL, verbose);
3452 while (now->tok == tok_reorder_sections_after
3453 || now->tok == tok_reorder_sections_end
3454 || now->tok == tok_end);
3456 /* Process the token we just saw. */
3457 nowtok = now->tok;
3458 continue;
3461 else
3462 /* This must not happen. */
3463 goto err_label;
3464 break;
3466 case tok_reorder_sections_end:
3467 /* Ignore the rest of the line if we don't need the input of
3468 this line. */
3469 if (ignore_content)
3470 break;
3472 if (state != 5)
3473 goto err_label;
3474 state = 6;
3475 lr_ignore_rest (ldfile, 1);
3476 break;
3478 case tok_bsymbol:
3479 case tok_ucs4:
3480 /* Ignore the rest of the line if we don't need the input of
3481 this line. */
3482 if (ignore_content)
3484 lr_ignore_rest (ldfile, 0);
3485 break;
3488 if (state != 0 && state != 1 && state != 3 && state != 5)
3489 goto err_label;
3491 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3492 goto err_label;
3494 if (nowtok == tok_ucs4)
3496 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3497 symstr = ucs4buf;
3498 symlen = 9;
3500 else if (arg != NULL)
3502 symstr = arg->val.str.startmb;
3503 symlen = arg->val.str.lenmb;
3505 else
3507 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3508 ldfile->token.val.str.lenmb,
3509 ldfile->token.val.str.startmb);
3510 break;
3513 if (state == 0)
3515 /* We are outside an `order_start' region. This means
3516 we must only accept definitions of values for
3517 collation symbols since these are purely abstract
3518 values and don't need directions associated. */
3519 struct element_t *seqp;
3521 if (find_entry (&collate->seq_table, symstr, symlen,
3522 (void **) &seqp) == 0)
3524 /* It's already defined. First check whether this
3525 is really a collating symbol. */
3526 if (seqp->is_character)
3527 goto err_label;
3529 goto move_entry;
3531 else
3533 void *result;
3535 if (find_entry (&collate->sym_table, symstr, symlen,
3536 &result) != 0)
3537 /* No collating symbol, it's an error. */
3538 goto err_label;
3540 /* Maybe this is the first time we define a symbol
3541 value and it is before the first actual section. */
3542 if (collate->sections == NULL)
3543 collate->sections = collate->current_section =
3544 &collate->symbol_section;
3547 if (was_ellipsis != tok_none)
3550 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3551 charmap, repertoire, result);
3553 /* Remember that we processed the ellipsis. */
3554 was_ellipsis = tok_none;
3556 /* And don't add the value a second time. */
3557 break;
3560 else if (state == 3)
3562 /* It is possible that we already have this collation sequence.
3563 In this case we move the entry. */
3564 struct element_t *seqp;
3565 void *sym;
3567 /* If the symbol after which we have to insert was not found
3568 ignore all entries. */
3569 if (collate->cursor == NULL)
3571 lr_ignore_rest (ldfile, 0);
3572 break;
3575 if (find_entry (&collate->seq_table, symstr, symlen,
3576 (void **) &seqp) == 0)
3577 goto move_entry;
3579 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3580 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3581 goto move_entry;
3583 if (find_entry (&collate->elem_table, symstr, symlen,
3584 (void **) &seqp) == 0
3585 && (seqp->last != NULL || seqp->next != NULL
3586 || (collate->start != NULL && seqp == collate->start)))
3588 move_entry:
3589 /* Remove the entry from the old position. */
3590 if (seqp->last == NULL)
3591 collate->start = seqp->next;
3592 else
3593 seqp->last->next = seqp->next;
3594 if (seqp->next != NULL)
3595 seqp->next->last = seqp->last;
3597 /* We also have to check whether this entry is the
3598 first or last of a section. */
3599 if (seqp->section->first == seqp)
3601 if (seqp->section->first == seqp->section->last)
3602 /* This section has no content anymore. */
3603 seqp->section->first = seqp->section->last = NULL;
3604 else
3605 seqp->section->first = seqp->next;
3607 else if (seqp->section->last == seqp)
3608 seqp->section->last = seqp->last;
3610 /* Now insert it in the new place. */
3611 insert_weights (ldfile, seqp, charmap, repertoire, result,
3612 tok_none);
3613 break;
3616 /* Otherwise we just add a new entry. */
3618 else if (state == 5)
3620 /* We are reordering sections. Find the named section. */
3621 struct section_list *runp = collate->sections;
3622 struct section_list *prevp = NULL;
3624 while (runp != NULL)
3626 if (runp->name != NULL
3627 && strlen (runp->name) == symlen
3628 && memcmp (runp->name, symstr, symlen) == 0)
3629 break;
3631 prevp = runp;
3632 runp = runp->next;
3635 if (runp == NULL)
3637 lr_error (ldfile, _("%s: section `%.*s' not known"),
3638 "LC_COLLATE", (int) symlen, symstr);
3639 lr_ignore_rest (ldfile, 0);
3641 else
3643 if (runp != collate->current_section)
3645 /* Remove the named section from the old place and
3646 insert it in the new one. */
3647 prevp->next = runp->next;
3649 runp->next = collate->current_section->next;
3650 collate->current_section->next = runp;
3651 collate->current_section = runp;
3654 /* Process the rest of the line which might change
3655 the collation rules. */
3656 arg = lr_token (ldfile, charmap, result, repertoire,
3657 verbose);
3658 if (arg->tok != tok_eof && arg->tok != tok_eol)
3659 read_directions (ldfile, arg, charmap, repertoire,
3660 result);
3662 break;
3664 else if (was_ellipsis != tok_none)
3666 /* Using the information in the `ellipsis_weight'
3667 element and this and the last value we have to handle
3668 the ellipsis now. */
3669 assert (state == 1);
3671 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3672 repertoire, result);
3674 /* Remember that we processed the ellipsis. */
3675 was_ellipsis = tok_none;
3677 /* And don't add the value a second time. */
3678 break;
3681 /* Now insert in the new place. */
3682 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3683 break;
3685 case tok_undefined:
3686 /* Ignore the rest of the line if we don't need the input of
3687 this line. */
3688 if (ignore_content)
3690 lr_ignore_rest (ldfile, 0);
3691 break;
3694 if (state != 1)
3695 goto err_label;
3697 if (was_ellipsis != tok_none)
3699 lr_error (ldfile,
3700 _("%s: cannot have `%s' as end of ellipsis range"),
3701 "LC_COLLATE", "UNDEFINED");
3703 unlink_element (collate);
3704 was_ellipsis = tok_none;
3707 /* See whether UNDEFINED already appeared somewhere. */
3708 if (collate->undefined.next != NULL
3709 || &collate->undefined == collate->cursor)
3711 lr_error (ldfile,
3712 _("%s: order for `%.*s' already defined at %s:%Zu"),
3713 "LC_COLLATE", 9, "UNDEFINED",
3714 collate->undefined.file,
3715 collate->undefined.line);
3716 lr_ignore_rest (ldfile, 0);
3718 else
3719 /* Parse the weights. */
3720 insert_weights (ldfile, &collate->undefined, charmap,
3721 repertoire, result, tok_none);
3722 break;
3724 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3725 case tok_ellipsis3: /* absolute ellipsis */
3726 case tok_ellipsis4: /* symbolic decimal ellipsis */
3727 /* This is the symbolic (decimal or hexadecimal) or absolute
3728 ellipsis. */
3729 if (was_ellipsis != tok_none)
3730 goto err_label;
3732 if (state != 0 && state != 1 && state != 3)
3733 goto err_label;
3735 was_ellipsis = nowtok;
3737 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3738 repertoire, result, nowtok);
3739 break;
3741 case tok_end:
3742 /* Next we assume `LC_COLLATE'. */
3743 if (!ignore_content)
3745 if (state == 0)
3746 /* We must either see a copy statement or have
3747 ordering values. */
3748 lr_error (ldfile,
3749 _("%s: empty category description not allowed"),
3750 "LC_COLLATE");
3751 else if (state == 1)
3753 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3754 "LC_COLLATE");
3756 /* Handle ellipsis at end of list. */
3757 if (was_ellipsis != tok_none)
3759 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3760 repertoire, result);
3761 was_ellipsis = tok_none;
3764 else if (state == 3)
3765 WITH_CUR_LOCALE (error (0, 0, _("\
3766 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3767 else if (state == 5)
3768 WITH_CUR_LOCALE (error (0, 0, _("\
3769 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3771 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3772 if (arg->tok == tok_eof)
3773 break;
3774 if (arg->tok == tok_eol)
3775 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3776 else if (arg->tok != tok_lc_collate)
3777 lr_error (ldfile, _("\
3778 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3779 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3780 return;
3782 default:
3783 err_label:
3784 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3787 /* Prepare for the next round. */
3788 now = lr_token (ldfile, charmap, result, NULL, verbose);
3789 nowtok = now->tok;
3792 /* When we come here we reached the end of the file. */
3793 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");