Updated to fedora-glibc-20070827T2032
[glibc.git] / locale / programs / ld-collate.c
blobfeab034740e4927eacdd4a5ee6894adea4216b4c
1 /* Copyright (C) 1995-2003, 2005, 2006, 2007 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
23 #include <errno.h>
24 #include <error.h>
25 #include <stdlib.h>
26 #include <wchar.h>
27 #include <sys/param.h>
29 #include "localedef.h"
30 #include "charmap.h"
31 #include "localeinfo.h"
32 #include "linereader.h"
33 #include "locfile.h"
34 #include "elem-hash.h"
36 /* Uncomment the following line in the production version. */
37 /* #define NDEBUG 1 */
38 #include <assert.h>
40 #define obstack_chunk_alloc malloc
41 #define obstack_chunk_free free
/* Append the 32-bit value DATA to the object currently being grown in
   OBSTACK.  When `int' is exactly 32 bits wide the dedicated integer
   primitive is used, otherwise the raw bytes of DATA are appended.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) != sizeof (int))
    {
      obstack_grow (obstack, &data, sizeof (int32_t));
      return;
    }

  obstack_int_grow (obstack, data);
}
/* Like obstack_int32_grow, but uses the unchecked `fast' integer
   primitive when `int' is 32 bits wide.  On other targets the bytes of
   DATA are appended with the ordinary obstack_grow.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) != sizeof (int))
    {
      obstack_grow (obstack, &data, sizeof (int32_t));
      return;
    }

  obstack_int_grow_fast (obstack, data);
}
/* Forward declaration.  */
struct element_t;

/* Node describing one section.  A node can be a member of two singly
   linked lists at once: the list of known sections (through DEF_NEXT)
   and the list of sections (through NEXT).  */
struct section_list
{
  /* Successor in the known_sections list.  */
  struct section_list *def_next;

  /* Successor in the sections list.  */
  struct section_list *next;

  /* Name of the section.  */
  const char *name;

  /* First and last element belonging to this section.  */
  struct element_t *first;
  struct element_t *last;

  /* The sorting rules which apply to this section.  */
  enum coll_sort_rule *rules;

  /* Index of the rule set in the appropriate section of the output
     file.  */
  int ruleidx;
};
struct element_t;

/* A counted array of element pointers.  */
struct element_list_t
{
  /* Number of elements in W.  */
  int cnt;

  /* The element pointers themselves.  */
  struct element_t **w;
};
/* Data type for collating element.  */
struct element_t
{
  /* Name of the element.  */
  const char *name;

  /* Multibyte sequence and its length in bytes.  */
  const char *mbs;
  size_t nmbs;

  /* Wide character sequence and the number of wide characters in it.  */
  const uint32_t *wcs;
  size_t nwcs;

  int *mborder;
  int wcorder;

  /* Bit mask; a bit is set if this element is used in the corresponding
     level.  Interesting for the singlebyte weight computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* One weight list per level.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Order of the character in the sequence.  This information will
     be used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Neighbours in the multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Neighbours in the wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};

/* Special element values.  They are not valid addresses and only serve
   as markers inside weight lists.  */
#define ELEMENT_ELLIPSIS2	((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3	((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4	((struct element_t *) 3)
/* Data type for collating symbol.  */
struct symbol_t
{
  /* Name of the symbol.  */
  const char *name;

  /* Point to place in the order list.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
163 /* Sparse table of struct element_t *. */
164 #define TABLE wchead_table
165 #define ELEMENT struct element_t *
166 #define DEFAULT NULL
167 #define ITERATE
168 #define NO_FINALIZE
169 #include "3level.h"
171 /* Sparse table of int32_t. */
172 #define TABLE collidx_table
173 #define ELEMENT int32_t
174 #define DEFAULT 0
175 #include "3level.h"
177 /* Sparse table of uint32_t. */
178 #define TABLE collseq_table
179 #define ELEMENT uint32_t
180 #define DEFAULT ~((uint32_t) 0)
181 #include "3level.h"
184 /* The real definition of the struct for the LC_COLLATE locale. */
185 struct locale_collate_t
187 int col_weight_max;
188 int cur_weight_max;
190 /* List of known scripts. */
191 struct section_list *known_sections;
192 /* List of used sections. */
193 struct section_list *sections;
194 /* Current section using definition. */
195 struct section_list *current_section;
196 /* There always can be an unnamed section. */
197 struct section_list unnamed_section;
198 /* To make handling of errors easier we have another section. */
199 struct section_list error_section;
200 /* Sometimes we are defining the values for collating symbols before
201 the first actual section. */
202 struct section_list symbol_section;
204 /* Start of the order list. */
205 struct element_t *start;
207 /* The undefined element. */
208 struct element_t undefined;
210 /* This is the cursor for `reorder_after' insertions. */
211 struct element_t *cursor;
213 /* This value is used when handling ellipsis. */
214 struct element_t ellipsis_weight;
216 /* Known collating elements. */
217 hash_table elem_table;
219 /* Known collating symbols. */
220 hash_table sym_table;
222 /* Known collation sequences. */
223 hash_table seq_table;
225 struct obstack mempool;
227 /* The LC_COLLATE category is a bit special as it is sometimes possible
228 that the definitions from more than one input file contains information.
229 Therefore we keep all relevant input in a list. */
230 struct locale_collate_t *next;
232 /* Arrays with heads of the list for each of the leading bytes in
233 the multibyte sequences. */
234 struct element_t *mbheads[256];
236 /* Arrays with heads of the list for each of the leading bytes in
237 the multibyte sequences. */
238 struct wchead_table wcheads;
240 /* The arrays with the collation sequence order. */
241 unsigned char mbseqorder[256];
242 struct collseq_table wcseqorder;
/* Number of sorting rules (levels).  This is one of the few global
   variables; it is shared while reading all LC_COLLATE category
   descriptions in all files.  Zero means the number is not known yet.  */
static uint32_t nrules;
/* We need UTF-8 encoding of numbers.

   Store the UTF-8 encoding of VAL in BUF and return the number of bytes
   written (between one and six).  BUF is not NUL-terminated and must
   have room for the whole sequence.

   Every value below 0x80 -- which, VAL being signed, includes negative
   ones -- is stored as a single byte.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      /* A sequence of STEP bytes carries 5 * STEP + 1 payload bits.
         Pick the shortest sequence VAL fits in.  */
      for (step = 2; step < 6; ++step)
        if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
          break;
      retval = step;

      /* The lead byte starts with STEP one-bits followed by a zero bit.
         Shift an unsigned constant: right-shifting the negative value
         `~0xff' is implementation-defined.  */
      *buf = (char) (unsigned char) (0xff00u >> step);
      --step;

      /* Fill in the continuation bytes from the last one backwards, six
         payload bits each.  */
      do
        {
          buf[step] = (char) (0x80 | (val & 0x3f));
          val >>= 6;
        }
      while (--step > 0);

      /* Whatever is left of VAL goes into the lead byte.  */
      *buf |= val;
    }

  return retval;
}
287 static struct section_list *
288 make_seclist_elem (struct locale_collate_t *collate, const char *string,
289 struct section_list *next)
291 struct section_list *newp;
293 newp = (struct section_list *) obstack_alloc (&collate->mempool,
294 sizeof (*newp));
295 newp->next = next;
296 newp->name = string;
297 newp->first = NULL;
298 newp->last = NULL;
300 return newp;
304 static struct element_t *
305 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
306 const uint32_t *wcs, const char *name, size_t namelen,
307 int is_character)
309 struct element_t *newp;
311 newp = (struct element_t *) obstack_alloc (&collate->mempool,
312 sizeof (*newp));
313 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
314 name, namelen);
315 if (mbs != NULL)
317 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
318 newp->nmbs = mbslen;
320 else
322 newp->mbs = NULL;
323 newp->nmbs = 0;
325 if (wcs != NULL)
327 size_t nwcs = wcslen ((wchar_t *) wcs);
328 uint32_t zero = 0;
329 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
330 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
331 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
332 newp->nwcs = nwcs;
334 else
336 newp->wcs = NULL;
337 newp->nwcs = 0;
339 newp->mborder = NULL;
340 newp->wcorder = 0;
341 newp->used_in_level = 0;
342 newp->is_character = is_character;
344 /* Will be assigned later. XXX */
345 newp->mbseqorder = 0;
346 newp->wcseqorder = 0;
348 /* Will be allocated later. */
349 newp->weights = NULL;
351 newp->file = NULL;
352 newp->line = 0;
354 newp->section = collate->current_section;
356 newp->last = NULL;
357 newp->next = NULL;
359 newp->mbnext = NULL;
360 newp->mblast = NULL;
362 newp->wcnext = NULL;
363 newp->wclast = NULL;
365 return newp;
369 static struct symbol_t *
370 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
372 struct symbol_t *newp;
374 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
376 newp->name = obstack_copy0 (&collate->mempool, name, len);
377 newp->order = NULL;
379 newp->file = NULL;
380 newp->line = 0;
382 return newp;
386 /* Test whether this name is already defined somewhere. */
387 static int
388 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
389 const struct charmap_t *charmap,
390 struct repertoire_t *repertoire, const char *symbol,
391 size_t symbol_len)
393 void *ignore = NULL;
395 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
397 lr_error (ldfile, _("`%.*s' already defined in charmap"),
398 (int) symbol_len, symbol);
399 return 1;
402 if (repertoire != NULL
403 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
404 == 0))
406 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
407 (int) symbol_len, symbol);
408 return 1;
411 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
413 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
414 (int) symbol_len, symbol);
415 return 1;
418 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
420 lr_error (ldfile, _("`%.*s' already defined as collating element"),
421 (int) symbol_len, symbol);
422 return 1;
425 return 0;
429 /* Read the direction specification. */
430 static void
431 read_directions (struct linereader *ldfile, struct token *arg,
432 const struct charmap_t *charmap,
433 struct repertoire_t *repertoire, struct localedef_t *result)
435 int cnt = 0;
436 int max = nrules ?: 10;
437 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
438 int warned = 0;
439 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
441 while (1)
443 int valid = 0;
445 if (arg->tok == tok_forward)
447 if (rules[cnt] & sort_backward)
449 if (! warned)
451 lr_error (ldfile, _("\
452 %s: `forward' and `backward' are mutually excluding each other"),
453 "LC_COLLATE");
454 warned = 1;
457 else if (rules[cnt] & sort_forward)
459 if (! warned)
461 lr_error (ldfile, _("\
462 %s: `%s' mentioned more than once in definition of weight %d"),
463 "LC_COLLATE", "forward", cnt + 1);
466 else
467 rules[cnt] |= sort_forward;
469 valid = 1;
471 else if (arg->tok == tok_backward)
473 if (rules[cnt] & sort_forward)
475 if (! warned)
477 lr_error (ldfile, _("\
478 %s: `forward' and `backward' are mutually excluding each other"),
479 "LC_COLLATE");
480 warned = 1;
483 else if (rules[cnt] & sort_backward)
485 if (! warned)
487 lr_error (ldfile, _("\
488 %s: `%s' mentioned more than once in definition of weight %d"),
489 "LC_COLLATE", "backward", cnt + 1);
492 else
493 rules[cnt] |= sort_backward;
495 valid = 1;
497 else if (arg->tok == tok_position)
499 if (rules[cnt] & sort_position)
501 if (! warned)
503 lr_error (ldfile, _("\
504 %s: `%s' mentioned more than once in definition of weight %d"),
505 "LC_COLLATE", "position", cnt + 1);
508 else
509 rules[cnt] |= sort_position;
511 valid = 1;
514 if (valid)
515 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
517 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
518 || arg->tok == tok_semicolon)
520 if (! valid && ! warned)
522 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
523 warned = 1;
526 /* See whether we have to increment the counter. */
527 if (arg->tok != tok_comma && rules[cnt] != 0)
529 /* Add the default `forward' if we have seen only `position'. */
530 if (rules[cnt] == sort_position)
531 rules[cnt] = sort_position | sort_forward;
533 ++cnt;
536 if (arg->tok == tok_eof || arg->tok == tok_eol)
537 /* End of line or file, so we exit the loop. */
538 break;
540 if (nrules == 0)
542 /* See whether we have enough room in the array. */
543 if (cnt == max)
545 max += 10;
546 rules = (enum coll_sort_rule *) xrealloc (rules,
548 * sizeof (*rules));
549 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
552 else
554 if (cnt == nrules)
556 /* There must not be any more rule. */
557 if (! warned)
559 lr_error (ldfile, _("\
560 %s: too many rules; first entry only had %d"),
561 "LC_COLLATE", nrules);
562 warned = 1;
565 lr_ignore_rest (ldfile, 0);
566 break;
570 else
572 if (! warned)
574 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
575 warned = 1;
579 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
582 if (nrules == 0)
584 /* Now we know how many rules we have. */
585 nrules = cnt;
586 rules = (enum coll_sort_rule *) xrealloc (rules,
587 nrules * sizeof (*rules));
589 else
591 if (cnt < nrules)
593 /* Not enough rules in this specification. */
594 if (! warned)
595 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
598 rules[cnt] = sort_forward;
599 while (++cnt < nrules);
603 collate->current_section->rules = rules;
607 static struct element_t *
608 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
609 const char *str, size_t len)
611 void *result = NULL;
613 /* Search for the entries among the collation sequences already define. */
614 if (find_entry (&collate->seq_table, str, len, &result) != 0)
616 /* Nope, not define yet. So we see whether it is a
617 collation symbol. */
618 void *ptr;
620 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
622 /* It's a collation symbol. */
623 struct symbol_t *sym = (struct symbol_t *) ptr;
624 result = sym->order;
626 if (result == NULL)
627 result = sym->order = new_element (collate, NULL, 0, NULL,
628 NULL, 0, 0);
630 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
632 /* It's also no collation element. So it is a character
633 element defined later. */
634 result = new_element (collate, NULL, 0, NULL, str, len, 1);
635 /* Insert it into the sequence table. */
636 insert_entry (&collate->seq_table, str, len, result);
640 return (struct element_t *) result;
644 static void
645 unlink_element (struct locale_collate_t *collate)
647 if (collate->cursor == collate->start)
649 assert (collate->cursor->next == NULL);
650 assert (collate->cursor->last == NULL);
651 collate->cursor = NULL;
653 else
655 if (collate->cursor->next != NULL)
656 collate->cursor->next->last = collate->cursor->last;
657 if (collate->cursor->last != NULL)
658 collate->cursor->last->next = collate->cursor->next;
659 collate->cursor = collate->cursor->last;
664 static void
665 insert_weights (struct linereader *ldfile, struct element_t *elem,
666 const struct charmap_t *charmap,
667 struct repertoire_t *repertoire, struct localedef_t *result,
668 enum token_t ellipsis)
670 int weight_cnt;
671 struct token *arg;
672 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
674 /* Initialize all the fields. */
675 elem->file = ldfile->fname;
676 elem->line = ldfile->lineno;
678 elem->last = collate->cursor;
679 elem->next = collate->cursor ? collate->cursor->next : NULL;
680 if (collate->cursor != NULL && collate->cursor->next != NULL)
681 collate->cursor->next->last = elem;
682 if (collate->cursor != NULL)
683 collate->cursor->next = elem;
684 if (collate->start == NULL)
686 assert (collate->cursor == NULL);
687 collate->start = elem;
690 elem->section = collate->current_section;
692 if (collate->current_section->first == NULL)
693 collate->current_section->first = elem;
694 if (collate->current_section->last == collate->cursor)
695 collate->current_section->last = elem;
697 collate->cursor = elem;
699 elem->weights = (struct element_list_t *)
700 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
701 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
703 weight_cnt = 0;
705 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
708 if (arg->tok == tok_eof || arg->tok == tok_eol)
709 break;
711 if (arg->tok == tok_ignore)
713 /* The weight for this level has to be ignored. We use the
714 null pointer to indicate this. */
715 elem->weights[weight_cnt].w = (struct element_t **)
716 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
717 elem->weights[weight_cnt].w[0] = NULL;
718 elem->weights[weight_cnt].cnt = 1;
720 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
722 char ucs4str[10];
723 struct element_t *val;
724 char *symstr;
725 size_t symlen;
727 if (arg->tok == tok_bsymbol)
729 symstr = arg->val.str.startmb;
730 symlen = arg->val.str.lenmb;
732 else
734 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
735 symstr = ucs4str;
736 symlen = 9;
739 val = find_element (ldfile, collate, symstr, symlen);
740 if (val == NULL)
741 break;
743 elem->weights[weight_cnt].w = (struct element_t **)
744 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
745 elem->weights[weight_cnt].w[0] = val;
746 elem->weights[weight_cnt].cnt = 1;
748 else if (arg->tok == tok_string)
750 /* Split the string up in the individual characters and put
751 the element definitions in the list. */
752 const char *cp = arg->val.str.startmb;
753 int cnt = 0;
754 struct element_t *charelem;
755 struct element_t **weights = NULL;
756 int max = 0;
758 if (*cp == '\0')
760 lr_error (ldfile, _("%s: empty weight string not allowed"),
761 "LC_COLLATE");
762 lr_ignore_rest (ldfile, 0);
763 break;
768 if (*cp == '<')
770 /* Ahh, it's a bsymbol or an UCS4 value. If it's
771 the latter we have to unify the name. */
772 const char *startp = ++cp;
773 size_t len;
775 while (*cp != '>')
777 if (*cp == ldfile->escape_char)
778 ++cp;
779 if (*cp == '\0')
780 /* It's a syntax error. */
781 goto syntax;
783 ++cp;
786 if (cp - startp == 5 && startp[0] == 'U'
787 && isxdigit (startp[1]) && isxdigit (startp[2])
788 && isxdigit (startp[3]) && isxdigit (startp[4]))
790 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
791 char *newstr;
793 newstr = (char *) xmalloc (10);
794 snprintf (newstr, 10, "U%08X", ucs4);
795 startp = newstr;
797 len = 9;
799 else
800 len = cp - startp;
802 charelem = find_element (ldfile, collate, startp, len);
803 ++cp;
805 else
807 /* People really shouldn't use characters directly in
808 the string. Especially since it's not really clear
809 what this means. We interpret all characters in the
810 string as if that would be bsymbols. Otherwise we
811 would have to match back to bsymbols somehow and this
812 is normally not what people normally expect. */
813 charelem = find_element (ldfile, collate, cp++, 1);
816 if (charelem == NULL)
818 /* We ignore the rest of the line. */
819 lr_ignore_rest (ldfile, 0);
820 break;
823 /* Add the pointer. */
824 if (cnt >= max)
826 struct element_t **newp;
827 max += 10;
828 newp = (struct element_t **)
829 alloca (max * sizeof (struct element_t *));
830 memcpy (newp, weights, cnt * sizeof (struct element_t *));
831 weights = newp;
833 weights[cnt++] = charelem;
835 while (*cp != '\0');
837 /* Now store the information. */
838 elem->weights[weight_cnt].w = (struct element_t **)
839 obstack_alloc (&collate->mempool,
840 cnt * sizeof (struct element_t *));
841 memcpy (elem->weights[weight_cnt].w, weights,
842 cnt * sizeof (struct element_t *));
843 elem->weights[weight_cnt].cnt = cnt;
845 /* We don't need the string anymore. */
846 free (arg->val.str.startmb);
848 else if (ellipsis != tok_none
849 && (arg->tok == tok_ellipsis2
850 || arg->tok == tok_ellipsis3
851 || arg->tok == tok_ellipsis4))
853 /* It must be the same ellipsis as used in the initial column. */
854 if (arg->tok != ellipsis)
855 lr_error (ldfile, _("\
856 %s: weights must use the same ellipsis symbol as the name"),
857 "LC_COLLATE");
859 /* The weight for this level will depend on the element
860 iterating over the range. Put a placeholder. */
861 elem->weights[weight_cnt].w = (struct element_t **)
862 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
863 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
864 elem->weights[weight_cnt].cnt = 1;
866 else
868 syntax:
869 /* It's a syntax error. */
870 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
871 lr_ignore_rest (ldfile, 0);
872 break;
875 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
876 /* This better should be the end of the line or a semicolon. */
877 if (arg->tok == tok_semicolon)
878 /* OK, ignore this and read the next token. */
879 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
880 else if (arg->tok != tok_eof && arg->tok != tok_eol)
882 /* It's a syntax error. */
883 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
884 lr_ignore_rest (ldfile, 0);
885 break;
888 while (++weight_cnt < nrules);
890 if (weight_cnt < nrules)
892 /* This means the rest of the line uses the current element as
893 the weight. */
896 elem->weights[weight_cnt].w = (struct element_t **)
897 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
898 if (ellipsis == tok_none)
899 elem->weights[weight_cnt].w[0] = elem;
900 else
901 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
902 elem->weights[weight_cnt].cnt = 1;
904 while (++weight_cnt < nrules);
906 else
908 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
910 /* Too many rule values. */
911 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
912 lr_ignore_rest (ldfile, 0);
914 else
915 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
920 static int
921 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
922 const struct charmap_t *charmap, struct repertoire_t *repertoire,
923 struct localedef_t *result)
925 /* First find out what kind of symbol this is. */
926 struct charseq *seq;
927 uint32_t wc;
928 struct element_t *elem = NULL;
929 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
931 /* Try to find the character in the charmap. */
932 seq = charmap_find_value (charmap, symstr, symlen);
934 /* Determine the wide character. */
935 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
937 wc = repertoire_find_value (repertoire, symstr, symlen);
938 if (seq != NULL)
939 seq->ucs4 = wc;
941 else
942 wc = seq->ucs4;
944 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
946 /* It's no character, so look through the collation elements and
947 symbol list. */
948 void *ptr = elem;
949 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
951 void *result;
952 struct symbol_t *sym = NULL;
954 /* It's also collation element. Therefore it's either a
955 collating symbol or it's a character which is not
956 supported by the character set. In the later case we
957 simply create a dummy entry. */
958 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
960 /* It's a collation symbol. */
961 sym = (struct symbol_t *) result;
963 elem = sym->order;
966 if (elem == NULL)
968 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
970 if (sym != NULL)
971 sym->order = elem;
972 else
973 /* Enter a fake element in the sequence table. This
974 won't cause anything in the output since there is
975 no multibyte or wide character associated with
976 it. */
977 insert_entry (&collate->seq_table, symstr, symlen, elem);
980 else
981 /* Copy the result back. */
982 elem = ptr;
984 else
986 /* Otherwise the symbols stands for a character. */
987 void *ptr = elem;
988 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
990 uint32_t wcs[2] = { wc, 0 };
992 /* We have to allocate an entry. */
993 elem = new_element (collate,
994 seq != NULL ? (char *) seq->bytes : NULL,
995 seq != NULL ? seq->nbytes : 0,
996 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
997 symstr, symlen, 1);
999 /* And add it to the table. */
1000 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1001 /* This cannot happen. */
1002 assert (! "Internal error");
1004 else
1006 /* Copy the result back. */
1007 elem = ptr;
1009 /* Maybe the character was used before the definition. In this case
1010 we have to insert the byte sequences now. */
1011 if (elem->mbs == NULL && seq != NULL)
1013 elem->mbs = obstack_copy0 (&collate->mempool,
1014 seq->bytes, seq->nbytes);
1015 elem->nmbs = seq->nbytes;
1018 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1020 uint32_t wcs[2] = { wc, 0 };
1022 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1023 elem->nwcs = 1;
1028 /* Test whether this element is not already in the list. */
1029 if (elem->next != NULL || elem == collate->cursor)
1031 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1032 (int) symlen, symstr, elem->file, elem->line);
1033 lr_ignore_rest (ldfile, 0);
1034 return 1;
1037 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1039 return 0;
1043 static void
1044 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1045 enum token_t ellipsis, const struct charmap_t *charmap,
1046 struct repertoire_t *repertoire,
1047 struct localedef_t *result)
1049 struct element_t *startp;
1050 struct element_t *endp;
1051 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1053 /* Unlink the entry added for the ellipsis. */
1054 unlink_element (collate);
1055 startp = collate->cursor;
1057 /* Process and add the end-entry. */
1058 if (symstr != NULL
1059 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1060 /* Something went wrong with inserting the to-value. This means
1061 we cannot process the ellipsis. */
1062 return;
1064 /* Reset the cursor. */
1065 collate->cursor = startp;
1067 /* Now we have to handle many different situations:
1068 - we have to distinguish between the three different ellipsis forms
1069 - the is the ellipsis at the beginning, in the middle, or at the end.
1071 endp = collate->cursor->next;
1072 assert (symstr == NULL || endp != NULL);
1074 /* XXX The following is probably very wrong since also collating symbols
1075 can appear in ranges. But do we want/can refine the test for that? */
1076 #if 0
1077 /* Both, the start and the end symbol, must stand for characters. */
1078 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1079 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1081 lr_error (ldfile, _("\
1082 %s: the start and the end symbol of a range must stand for characters"),
1083 "LC_COLLATE");
1084 return;
1086 #endif
1088 if (ellipsis == tok_ellipsis3)
1090 /* One requirement we make here: the length of the byte
1091 sequences for the first and end character must be the same.
1092 This is mainly to prevent unwanted effects and this is often
1093 not what is wanted. */
1094 size_t len = (startp->mbs != NULL ? startp->nmbs
1095 : (endp->mbs != NULL ? endp->nmbs : 0));
1096 char mbcnt[len + 1];
1097 char mbend[len + 1];
1099 /* Well, this should be caught somewhere else already. Just to
1100 make sure. */
1101 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1102 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1104 if (startp != NULL && endp != NULL
1105 && startp->mbs != NULL && endp->mbs != NULL
1106 && startp->nmbs != endp->nmbs)
1108 lr_error (ldfile, _("\
1109 %s: byte sequences of first and last character must have the same length"),
1110 "LC_COLLATE");
1111 return;
1114 /* Determine whether we have to generate multibyte sequences. */
1115 if ((startp == NULL || startp->mbs != NULL)
1116 && (endp == NULL || endp->mbs != NULL))
1118 int cnt;
1119 int ret;
1121 /* Prepare the beginning byte sequence. This is either from the
1122 beginning byte sequence or it is all nulls if it was an
1123 initial ellipsis. */
1124 if (startp == NULL || startp->mbs == NULL)
1125 memset (mbcnt, '\0', len);
1126 else
1128 memcpy (mbcnt, startp->mbs, len);
1130 /* And increment it so that the value is the first one we will
1131 try to insert. */
1132 for (cnt = len - 1; cnt >= 0; --cnt)
1133 if (++mbcnt[cnt] != '\0')
1134 break;
1136 mbcnt[len] = '\0';
1138 /* And the end sequence. */
1139 if (endp == NULL || endp->mbs == NULL)
1140 memset (mbend, '\0', len);
1141 else
1142 memcpy (mbend, endp->mbs, len);
1143 mbend[len] = '\0';
1145 /* Test whether we have a correct range. */
1146 ret = memcmp (mbcnt, mbend, len);
1147 if (ret >= 0)
1149 if (ret > 0)
1150 lr_error (ldfile, _("%s: byte sequence of first character of \
1151 range is not lower than that of the last character"), "LC_COLLATE");
1152 return;
1155 /* Generate the byte sequences data. */
1156 while (1)
1158 struct charseq *seq;
1160 /* Quite a bit of work ahead. We have to find the character
1161 definition for the byte sequence and then determine the
1162 wide character belonging to it. */
1163 seq = charmap_find_symbol (charmap, mbcnt, len);
1164 if (seq != NULL)
1166 struct element_t *elem;
1167 size_t namelen;
1169 /* I don't think this can ever happen. */
1170 assert (seq->name != NULL);
1171 namelen = strlen (seq->name);
1173 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1174 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1175 namelen);
1177 /* Now we are ready to insert the new value in the
1178 sequence. Find out whether the element is
1179 already known. */
1180 void *ptr;
1181 if (find_entry (&collate->seq_table, seq->name, namelen,
1182 &ptr) != 0)
1184 uint32_t wcs[2] = { seq->ucs4, 0 };
1186 /* We have to allocate an entry. */
1187 elem = new_element (collate, mbcnt, len,
1188 seq->ucs4 == ILLEGAL_CHAR_VALUE
1189 ? NULL : wcs, seq->name,
1190 namelen, 1);
1192 /* And add it to the table. */
1193 if (insert_entry (&collate->seq_table, seq->name,
1194 namelen, elem) != 0)
1195 /* This cannot happen. */
1196 assert (! "Internal error");
1198 else
1199 /* Copy the result. */
1200 elem = ptr;
1202 /* Test whether this element is not already in the list. */
1203 if (elem->next != NULL || (collate->cursor != NULL
1204 && elem->next == collate->cursor))
1206 lr_error (ldfile, _("\
1207 order for `%.*s' already defined at %s:%Zu"),
1208 (int) namelen, seq->name,
1209 elem->file, elem->line);
1210 goto increment;
1213 /* Enqueue the new element. */
1214 elem->last = collate->cursor;
1215 if (collate->cursor == NULL)
1216 elem->next = NULL;
1217 else
1219 elem->next = collate->cursor->next;
1220 elem->last->next = elem;
1221 if (elem->next != NULL)
1222 elem->next->last = elem;
1224 if (collate->start == NULL)
1226 assert (collate->cursor == NULL);
1227 collate->start = elem;
1229 collate->cursor = elem;
1231 /* Add the weight value. We take them from the
1232 `ellipsis_weights' member of `collate'. */
1233 elem->weights = (struct element_list_t *)
1234 obstack_alloc (&collate->mempool,
1235 nrules * sizeof (struct element_list_t));
1236 for (cnt = 0; cnt < nrules; ++cnt)
1237 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1238 && (collate->ellipsis_weight.weights[cnt].w[0]
1239 == ELEMENT_ELLIPSIS2))
1241 elem->weights[cnt].w = (struct element_t **)
1242 obstack_alloc (&collate->mempool,
1243 sizeof (struct element_t *));
1244 elem->weights[cnt].w[0] = elem;
1245 elem->weights[cnt].cnt = 1;
1247 else
1249 /* Simply use the weight from `ellipsis_weight'. */
1250 elem->weights[cnt].w =
1251 collate->ellipsis_weight.weights[cnt].w;
1252 elem->weights[cnt].cnt =
1253 collate->ellipsis_weight.weights[cnt].cnt;
1257 /* Increment for the next round. */
1258 increment:
1259 for (cnt = len - 1; cnt >= 0; --cnt)
1260 if (++mbcnt[cnt] != '\0')
1261 break;
1263 /* Find out whether this was all. */
1264 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1265 /* Yep, that's all. */
1266 break;
1270 else
1272 /* For symbolic range we naturally must have a beginning and an
1273 end specified by the user. */
1274 if (startp == NULL)
1275 lr_error (ldfile, _("\
1276 %s: symbolic range ellipsis must not directly follow `order_start'"),
1277 "LC_COLLATE");
1278 else if (endp == NULL)
1279 lr_error (ldfile, _("\
1280 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1281 "LC_COLLATE");
1282 else
1284 /* Determine the range. To do so we have to determine the
1285 common prefix of the both names and then the numeric
1286 values of both ends. */
1287 size_t lenfrom = strlen (startp->name);
1288 size_t lento = strlen (endp->name);
1289 char buf[lento + 1];
1290 int preflen = 0;
1291 long int from;
1292 long int to;
1293 char *cp;
1294 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1296 if (lenfrom != lento)
1298 invalid_range:
1299 lr_error (ldfile, _("\
1300 `%s' and `%.*s' are not valid names for symbolic range"),
1301 startp->name, (int) lento, endp->name);
1302 return;
1305 while (startp->name[preflen] == endp->name[preflen])
1306 if (startp->name[preflen] == '\0')
1307 /* Nothing to be done. The start and end point are identical
1308 and while inserting the end point we have already given
1309 the user an error message. */
1310 return;
1311 else
1312 ++preflen;
1314 errno = 0;
1315 from = strtol (startp->name + preflen, &cp, base);
1316 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1317 goto invalid_range;
1319 errno = 0;
1320 to = strtol (endp->name + preflen, &cp, base);
1321 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1322 goto invalid_range;
1324 /* Copy the prefix. */
1325 memcpy (buf, startp->name, preflen);
1327 /* Loop over all values. */
1328 for (++from; from < to; ++from)
1330 struct element_t *elem = NULL;
1331 struct charseq *seq;
1332 uint32_t wc;
1333 int cnt;
1335 /* Generate the name. */
1336 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1337 (int) (lenfrom - preflen), from);
1339 /* Look whether this name is already defined. */
1340 void *ptr;
1341 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1343 /* Copy back the result. */
1344 elem = ptr;
1346 if (elem->next != NULL || (collate->cursor != NULL
1347 && elem->next == collate->cursor))
1349 lr_error (ldfile, _("\
1350 %s: order for `%.*s' already defined at %s:%Zu"),
1351 "LC_COLLATE", (int) lenfrom, buf,
1352 elem->file, elem->line);
1353 continue;
1356 if (elem->name == NULL)
1358 lr_error (ldfile, _("%s: `%s' must be a character"),
1359 "LC_COLLATE", buf);
1360 continue;
1364 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1366 /* Search for a character of this name. */
1367 seq = charmap_find_value (charmap, buf, lenfrom);
1368 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1370 wc = repertoire_find_value (repertoire, buf, lenfrom);
1372 if (seq != NULL)
1373 seq->ucs4 = wc;
1375 else
1376 wc = seq->ucs4;
1378 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1379 /* We don't know anything about a character with this
1380 name. XXX Should we warn? */
1381 continue;
1383 if (elem == NULL)
1385 uint32_t wcs[2] = { wc, 0 };
1387 /* We have to allocate an entry. */
1388 elem = new_element (collate,
1389 seq != NULL
1390 ? (char *) seq->bytes : NULL,
1391 seq != NULL ? seq->nbytes : 0,
1392 wc == ILLEGAL_CHAR_VALUE
1393 ? NULL : wcs, buf, lenfrom, 1);
1395 else
1397 /* Update the element. */
1398 if (seq != NULL)
1400 elem->mbs = obstack_copy0 (&collate->mempool,
1401 seq->bytes, seq->nbytes);
1402 elem->nmbs = seq->nbytes;
1405 if (wc != ILLEGAL_CHAR_VALUE)
1407 uint32_t zero = 0;
1409 obstack_grow (&collate->mempool,
1410 &wc, sizeof (uint32_t));
1411 obstack_grow (&collate->mempool,
1412 &zero, sizeof (uint32_t));
1413 elem->wcs = obstack_finish (&collate->mempool);
1414 elem->nwcs = 1;
1418 elem->file = ldfile->fname;
1419 elem->line = ldfile->lineno;
1420 elem->section = collate->current_section;
1423 /* Enqueue the new element. */
1424 elem->last = collate->cursor;
1425 elem->next = collate->cursor->next;
1426 elem->last->next = elem;
1427 if (elem->next != NULL)
1428 elem->next->last = elem;
1429 collate->cursor = elem;
1431 /* Now add the weights. They come from the `ellipsis_weights'
1432 member of `collate'. */
1433 elem->weights = (struct element_list_t *)
1434 obstack_alloc (&collate->mempool,
1435 nrules * sizeof (struct element_list_t));
1436 for (cnt = 0; cnt < nrules; ++cnt)
1437 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1438 && (collate->ellipsis_weight.weights[cnt].w[0]
1439 == ELEMENT_ELLIPSIS2))
1441 elem->weights[cnt].w = (struct element_t **)
1442 obstack_alloc (&collate->mempool,
1443 sizeof (struct element_t *));
1444 elem->weights[cnt].w[0] = elem;
1445 elem->weights[cnt].cnt = 1;
1447 else
1449 /* Simly use the weight from `ellipsis_weight'. */
1450 elem->weights[cnt].w =
1451 collate->ellipsis_weight.weights[cnt].w;
1452 elem->weights[cnt].cnt =
1453 collate->ellipsis_weight.weights[cnt].cnt;
1461 static void
1462 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1463 struct localedef_t *copy_locale, int ignore_content)
1465 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1467 struct locale_collate_t *collate;
1469 if (copy_locale == NULL)
1471 collate = locale->categories[LC_COLLATE].collate =
1472 (struct locale_collate_t *)
1473 xcalloc (1, sizeof (struct locale_collate_t));
1475 /* Init the various data structures. */
1476 init_hash (&collate->elem_table, 100);
1477 init_hash (&collate->sym_table, 100);
1478 init_hash (&collate->seq_table, 500);
1479 obstack_init (&collate->mempool);
1481 collate->col_weight_max = -1;
1483 else
1484 /* Reuse the copy_locale's data structures. */
1485 collate = locale->categories[LC_COLLATE].collate =
1486 copy_locale->categories[LC_COLLATE].collate;
1489 ldfile->translate_strings = 0;
1490 ldfile->return_widestr = 0;
1494 void
1495 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1497 /* Now is the time when we can assign the individual collation
1498 values for all the symbols. We have possibly different values
1499 for the wide- and the multibyte-character symbols. This is done
1500 since it might make a difference in the encoding if there is in
1501 some cases no multibyte-character but there are wide-characters.
1502 (The other way around it is not important since theencoded
1503 collation value in the wide-character case is 32 bits wide and
1504 therefore requires no encoding).
1506 The lowest collation value assigned is 2. Zero is reserved for
1507 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1508 functions and 1 is used to separate the individual passes for the
1509 different rules.
1511 We also have to construct is list with all the bytes/words which
1512 can come first in a sequence, followed by all the elements which
1513 also start with this byte/word. The order is reverse which has
1514 among others the important effect that longer strings are located
1515 first in the list. This is required for the output data since
1516 the algorithm used in `strcoll' etc depends on this.
1518 The multibyte case is easy. We simply sort into an array with
1519 256 elements. */
1520 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1521 int mbact[nrules];
1522 int wcact;
1523 int mbseqact;
1524 int wcseqact;
1525 struct element_t *runp;
1526 int i;
1527 int need_undefined = 0;
1528 struct section_list *sect;
1529 int ruleidx;
1530 int nr_wide_elems = 0;
1532 if (collate == NULL)
1534 /* No data, no check. */
1535 if (! be_quiet)
1536 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1537 "LC_COLLATE"));
1538 return;
1541 /* If this assertion is hit change the type in `element_t'. */
1542 assert (nrules <= sizeof (runp->used_in_level) * 8);
1544 /* Make sure that the `position' rule is used either in all sections
1545 or in none. */
1546 for (i = 0; i < nrules; ++i)
1547 for (sect = collate->sections; sect != NULL; sect = sect->next)
1548 if (sect->rules != NULL
1549 && ((sect->rules[i] & sort_position)
1550 != (collate->sections->rules[i] & sort_position)))
1552 WITH_CUR_LOCALE (error (0, 0, _("\
1553 %s: `position' must be used for a specific level in all sections or none"),
1554 "LC_COLLATE"));
1555 break;
1558 /* Find out which elements are used at which level. At the same
1559 time we find out whether we have any undefined symbols. */
1560 runp = collate->start;
1561 while (runp != NULL)
1563 if (runp->mbs != NULL)
1565 for (i = 0; i < nrules; ++i)
1567 int j;
1569 for (j = 0; j < runp->weights[i].cnt; ++j)
1570 /* A NULL pointer as the weight means IGNORE. */
1571 if (runp->weights[i].w[j] != NULL)
1573 if (runp->weights[i].w[j]->weights == NULL)
1575 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1576 runp->line,
1577 _("symbol `%s' not defined"),
1578 runp->weights[i].w[j]->name));
1580 need_undefined = 1;
1581 runp->weights[i].w[j] = &collate->undefined;
1583 else
1584 /* Set the bit for the level. */
1585 runp->weights[i].w[j]->used_in_level |= 1 << i;
1590 /* Up to the next entry. */
1591 runp = runp->next;
1594 /* Walk through the list of defined sequences and assign weights. Also
1595 create the data structure which will allow generating the single byte
1596 character based tables.
1598 Since at each time only the weights for each of the rules are
1599 only compared to other weights for this rule it is possible to
1600 assign more compact weight values than simply counting all
1601 weights in sequence. We can assign weights from 3, one for each
1602 rule individually and only for those elements, which are actually
1603 used for this rule.
1605 Why is this important? It is not for the wide char table. But
1606 it is for the singlebyte output since here larger numbers have to
1607 be encoded to make it possible to emit the value as a byte
1608 string. */
1609 for (i = 0; i < nrules; ++i)
1610 mbact[i] = 2;
1611 wcact = 2;
1612 mbseqact = 0;
1613 wcseqact = 0;
1614 runp = collate->start;
1615 while (runp != NULL)
1617 /* Determine the order. */
1618 if (runp->used_in_level != 0)
1620 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1621 nrules * sizeof (int));
1623 for (i = 0; i < nrules; ++i)
1624 if ((runp->used_in_level & (1 << i)) != 0)
1625 runp->mborder[i] = mbact[i]++;
1626 else
1627 runp->mborder[i] = 0;
1630 if (runp->mbs != NULL)
1632 struct element_t **eptr;
1633 struct element_t *lastp = NULL;
1635 /* Find the point where to insert in the list. */
1636 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1637 while (*eptr != NULL)
1639 if ((*eptr)->nmbs < runp->nmbs)
1640 break;
1642 if ((*eptr)->nmbs == runp->nmbs)
1644 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1646 if (c == 0)
1648 /* This should not happen. It means that we have
1649 to symbols with the same byte sequence. It is
1650 of course an error. */
1651 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1652 (*eptr)->line,
1653 _("\
1654 symbol `%s' has the same encoding as"), (*eptr)->name);
1655 error_at_line (0, 0, runp->file,
1656 runp->line,
1657 _("symbol `%s'"),
1658 runp->name));
1659 goto dont_insert;
1661 else if (c < 0)
1662 /* Insert it here. */
1663 break;
1666 /* To the next entry. */
1667 lastp = *eptr;
1668 eptr = &(*eptr)->mbnext;
1671 /* Set the pointers. */
1672 runp->mbnext = *eptr;
1673 runp->mblast = lastp;
1674 if (*eptr != NULL)
1675 (*eptr)->mblast = runp;
1676 *eptr = runp;
1677 dont_insert:
1681 if (runp->used_in_level)
1683 runp->wcorder = wcact++;
1685 /* We take the opportunity to count the elements which have
1686 wide characters. */
1687 ++nr_wide_elems;
1690 if (runp->is_character)
1692 if (runp->nmbs == 1)
1693 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1695 runp->wcseqorder = wcseqact++;
1697 else if (runp->mbs != NULL && runp->weights != NULL)
1698 /* This is for collation elements. */
1699 runp->wcseqorder = wcseqact++;
1701 /* Up to the next entry. */
1702 runp = runp->next;
1705 /* Find out whether any of the `mbheads' entries is unset. In this
1706 case we use the UNDEFINED entry. */
1707 for (i = 1; i < 256; ++i)
1708 if (collate->mbheads[i] == NULL)
1710 need_undefined = 1;
1711 collate->mbheads[i] = &collate->undefined;
1714 /* Now to the wide character case. */
1715 collate->wcheads.p = 6;
1716 collate->wcheads.q = 10;
1717 wchead_table_init (&collate->wcheads);
1719 collate->wcseqorder.p = 6;
1720 collate->wcseqorder.q = 10;
1721 collseq_table_init (&collate->wcseqorder);
1723 /* Start adding. */
1724 runp = collate->start;
1725 while (runp != NULL)
1727 if (runp->wcs != NULL)
1729 struct element_t *e;
1730 struct element_t **eptr;
1731 struct element_t *lastp;
1733 /* Insert the collation sequence value. */
1734 if (runp->is_character)
1735 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1736 runp->wcseqorder);
1738 /* Find the point where to insert in the list. */
1739 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1740 eptr = &e;
1741 lastp = NULL;
1742 while (*eptr != NULL)
1744 if ((*eptr)->nwcs < runp->nwcs)
1745 break;
1747 if ((*eptr)->nwcs == runp->nwcs)
1749 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1750 (wchar_t *) runp->wcs, runp->nwcs);
1752 if (c == 0)
1754 /* This should not happen. It means that we have
1755 two symbols with the same byte sequence. It is
1756 of course an error. */
1757 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1758 (*eptr)->line,
1759 _("\
1760 symbol `%s' has the same encoding as"), (*eptr)->name);
1761 error_at_line (0, 0, runp->file,
1762 runp->line,
1763 _("symbol `%s'"),
1764 runp->name));
1765 goto dont_insertwc;
1767 else if (c < 0)
1768 /* Insert it here. */
1769 break;
1772 /* To the next entry. */
1773 lastp = *eptr;
1774 eptr = &(*eptr)->wcnext;
1777 /* Set the pointers. */
1778 runp->wcnext = *eptr;
1779 runp->wclast = lastp;
1780 if (*eptr != NULL)
1781 (*eptr)->wclast = runp;
1782 *eptr = runp;
1783 if (eptr == &e)
1784 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1785 dont_insertwc:
1789 /* Up to the next entry. */
1790 runp = runp->next;
1793 collseq_table_finalize (&collate->wcseqorder);
1795 /* Now determine whether the UNDEFINED entry is needed and if yes,
1796 whether it was defined. */
1797 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1798 if (collate->undefined.file == NULL)
1800 if (need_undefined)
1802 /* This seems not to be enforced by recent standards. Don't
1803 emit an error, simply append UNDEFINED at the end. */
1804 if (0)
1805 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1807 /* Add UNDEFINED at the end. */
1808 collate->undefined.mborder =
1809 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1811 for (i = 0; i < nrules; ++i)
1812 collate->undefined.mborder[i] = mbact[i]++;
1815 /* In any case we will need the definition for the wide character
1816 case. But we will not complain that it is missing since the
1817 specification strangely enough does not seem to account for
1818 this. */
1819 collate->undefined.wcorder = wcact++;
1822 /* Finally, try to unify the rules for the sections. Whenever the rules
1823 for a section are the same as those for another section give the
1824 ruleset the same index. Since there are never many section we can
1825 use an O(n^2) algorithm here. */
1826 sect = collate->sections;
1827 while (sect != NULL && sect->rules == NULL)
1828 sect = sect->next;
1830 /* Bail out if we have no sections because of earlier errors. */
1831 if (sect == NULL)
1833 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1834 _("too many errors; giving up")));
1835 return;
1838 ruleidx = 0;
1841 struct section_list *osect = collate->sections;
1843 while (osect != sect)
1844 if (osect->rules != NULL
1845 && memcmp (osect->rules, sect->rules, nrules) == 0)
1846 break;
1847 else
1848 osect = osect->next;
1850 if (osect == sect)
1851 sect->ruleidx = ruleidx++;
1852 else
1853 sect->ruleidx = osect->ruleidx;
1855 /* Next section. */
1857 sect = sect->next;
1858 while (sect != NULL && sect->rules == NULL);
1860 while (sect != NULL);
1861 /* We are currently not prepared for more than 128 rulesets. But this
1862 should never really be a problem. */
1863 assert (ruleidx <= 128);
1867 static int32_t
1868 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1869 struct element_t *elem)
1871 size_t cnt;
1872 int32_t retval;
1874 /* Optimize the use of UNDEFINED. */
1875 if (elem == &collate->undefined)
1876 /* The weights are already inserted. */
1877 return 0;
1879 /* This byte can start exactly one collation element and this is
1880 a single byte. We can directly give the index to the weights. */
1881 retval = obstack_object_size (pool);
1883 /* Construct the weight. */
1884 for (cnt = 0; cnt < nrules; ++cnt)
1886 char buf[elem->weights[cnt].cnt * 7];
1887 int len = 0;
1888 int i;
1890 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1891 /* Encode the weight value. We do nothing for IGNORE entries. */
1892 if (elem->weights[cnt].w[i] != NULL)
1893 len += utf8_encode (&buf[len],
1894 elem->weights[cnt].w[i]->mborder[cnt]);
1896 /* And add the buffer content. */
1897 obstack_1grow (pool, len);
1898 obstack_grow (pool, buf, len);
1901 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1905 static int32_t
1906 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1907 struct element_t *elem)
1909 size_t cnt;
1910 int32_t retval;
1912 /* Optimize the use of UNDEFINED. */
1913 if (elem == &collate->undefined)
1914 /* The weights are already inserted. */
1915 return 0;
1917 /* This byte can start exactly one collation element and this is
1918 a single byte. We can directly give the index to the weights. */
1919 retval = obstack_object_size (pool) / sizeof (int32_t);
1921 /* Construct the weight. */
1922 for (cnt = 0; cnt < nrules; ++cnt)
1924 int32_t buf[elem->weights[cnt].cnt];
1925 int i;
1926 int32_t j;
1928 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1929 if (elem->weights[cnt].w[i] != NULL)
1930 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1932 /* And add the buffer content. */
1933 obstack_int32_grow (pool, j);
1935 obstack_grow (pool, buf, j * sizeof (int32_t));
1938 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1941 /* Working state for `add_to_tablewc'.  That function receives only a
        character and its element chain as arguments; the pools and the
        table it writes to are taken from this object.
        If localedef is ever threaded, this would need to be a __thread var.  */
1942 static struct
1944 struct obstack *weightpool; /* Pool passed to `output_weightwc' for the weight records.  */
1945 struct obstack *extrapool; /* Receives the 32-bit sequence records (weight or indirect index, length, remaining wide characters).  */
1946 struct obstack *indpool; /* Receives one weight index per member of a run of consecutive sequences.  */
1947 struct locale_collate_t *collate; /* Collation data passed to `output_weightwc'.  */
1948 struct collidx_table *tablewc; /* Table filled through `collidx_table_add'.  */
1949 } atwc;
1951 static void add_to_tablewc (uint32_t ch, struct element_t *runp);
1953 static void
1954 add_to_tablewc (uint32_t ch, struct element_t *runp)
1956 if (runp->wcnext == NULL && runp->nwcs == 1)
1958 int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1959 runp);
1960 collidx_table_add (atwc.tablewc, ch, weigthidx);
1962 else
1964 /* As for the singlebyte table, we recognize sequences and
1965 compress them. */
1966 struct element_t *lastp;
1968 collidx_table_add (atwc.tablewc, ch,
1969 -(obstack_object_size (atwc.extrapool)
1970 / sizeof (uint32_t)));
1974 /* Store the current index in the weight table. We know that
1975 the current position in the `extrapool' is aligned on a
1976 32-bit address. */
1977 int32_t weightidx;
1978 int added;
1980 /* Find out wether this is a single entry or we have more than
1981 one consecutive entry. */
1982 if (runp->wcnext != NULL
1983 && runp->nwcs == runp->wcnext->nwcs
1984 && wmemcmp ((wchar_t *) runp->wcs,
1985 (wchar_t *)runp->wcnext->wcs,
1986 runp->nwcs - 1) == 0
1987 && (runp->wcs[runp->nwcs - 1]
1988 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
1990 int i;
1991 struct element_t *series_startp = runp;
1992 struct element_t *curp;
1994 /* Now add first the initial byte sequence. */
1995 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
1996 if (sizeof (int32_t) == sizeof (int))
1997 obstack_make_room (atwc.extrapool, added);
1999 /* More than one consecutive entry. We mark this by having
2000 a negative index into the indirect table. */
2001 obstack_int32_grow_fast (atwc.extrapool,
2002 -(obstack_object_size (atwc.indpool)
2003 / sizeof (int32_t)));
2004 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2007 runp = runp->wcnext;
2008 while (runp->wcnext != NULL
2009 && runp->nwcs == runp->wcnext->nwcs
2010 && wmemcmp ((wchar_t *) runp->wcs,
2011 (wchar_t *)runp->wcnext->wcs,
2012 runp->nwcs - 1) == 0
2013 && (runp->wcs[runp->nwcs - 1]
2014 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2016 /* Now walk backward from here to the beginning. */
2017 curp = runp;
2019 for (i = 1; i < runp->nwcs; ++i)
2020 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2022 /* Now find the end of the consecutive sequence and
2023 add all the indeces in the indirect pool. */
2026 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2027 curp);
2028 obstack_int32_grow (atwc.indpool, weightidx);
2030 curp = curp->wclast;
2032 while (curp != series_startp);
2034 /* Add the final weight. */
2035 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2036 curp);
2037 obstack_int32_grow (atwc.indpool, weightidx);
2039 /* And add the end byte sequence. Without length this
2040 time. */
2041 for (i = 1; i < curp->nwcs; ++i)
2042 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2044 else
2046 /* A single entry. Simply add the index and the length and
2047 string (except for the first character which is already
2048 tested for). */
2049 int i;
2051 /* Output the weight info. */
2052 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2053 runp);
2055 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2056 if (sizeof (int) == sizeof (int32_t))
2057 obstack_make_room (atwc.extrapool, added);
2059 obstack_int32_grow_fast (atwc.extrapool, weightidx);
2060 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2061 for (i = 1; i < runp->nwcs; ++i)
2062 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2065 /* Next entry. */
2066 lastp = runp;
2067 runp = runp->wcnext;
2069 while (runp != NULL);
2073 void
2074 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2075 const char *output_path)
2077 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2078 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2079 struct iovec iov[2 + nelems];
2080 struct locale_file data;
2081 uint32_t idx[nelems];
2082 size_t cnt;
2083 size_t ch;
2084 int32_t tablemb[256];
2085 struct obstack weightpool;
2086 struct obstack extrapool;
2087 struct obstack indirectpool;
2088 struct section_list *sect;
2089 struct collidx_table tablewc;
2090 uint32_t elem_size;
2091 uint32_t *elem_table;
2092 int i;
2093 struct element_t *runp;
2095 data.magic = LIMAGIC (LC_COLLATE);
2096 data.n = nelems;
2097 iov[0].iov_base = (void *) &data;
2098 iov[0].iov_len = sizeof (data);
2100 iov[1].iov_base = (void *) idx;
2101 iov[1].iov_len = sizeof (idx);
2103 idx[0] = iov[0].iov_len + iov[1].iov_len;
2104 cnt = 0;
2106 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
2107 iov[2 + cnt].iov_base = &nrules;
2108 iov[2 + cnt].iov_len = sizeof (uint32_t);
2109 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2110 ++cnt;
2112 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
2113 if (collate == NULL)
2115 int32_t dummy = 0;
2117 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
2119 /* The words have to be handled specially. */
2120 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2122 iov[2 + cnt].iov_base = &dummy;
2123 iov[2 + cnt].iov_len = sizeof (int32_t);
2125 else
2127 iov[2 + cnt].iov_base = NULL;
2128 iov[2 + cnt].iov_len = 0;
2131 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
2132 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2133 ++cnt;
2136 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2138 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2140 return;
2143 obstack_init (&weightpool);
2144 obstack_init (&extrapool);
2145 obstack_init (&indirectpool);
2147 /* Since we are using the sign of an integer to mark indirection the
2148 offsets in the arrays we are indirectly referring to must not be
2149 zero since -0 == 0. Therefore we add a bit of dummy content. */
2150 obstack_int32_grow (&extrapool, 0);
2151 obstack_int32_grow (&indirectpool, 0);
2153 /* Prepare the ruleset table. */
2154 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2155 if (sect->rules != NULL && sect->ruleidx == i)
2157 int j;
2159 obstack_make_room (&weightpool, nrules);
2161 for (j = 0; j < nrules; ++j)
2162 obstack_1grow_fast (&weightpool, sect->rules[j]);
2163 ++i;
2165 /* And align the output. */
2166 i = (nrules * i) % __alignof__ (int32_t);
2167 if (i > 0)
2169 obstack_1grow (&weightpool, '\0');
2170 while (++i < __alignof__ (int32_t));
2172 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2173 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2174 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2175 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2176 ++cnt;
2178 /* Generate the 8-bit table. Walk through the lists of sequences
2179 starting with the same byte and add them one after the other to
2180 the table. In case we have more than one sequence starting with
2181 the same byte we have to use extra indirection.
2183 First add a record for the NUL byte. This entry will never be used
2184 so it does not matter. */
2185 tablemb[0] = 0;
2187 /* Now insert the `UNDEFINED' value if it is used. Since this value
2188 will probably be used more than once it is good to store the
2189 weights only once. */
2190 if (collate->undefined.used_in_level != 0)
2191 output_weight (&weightpool, collate, &collate->undefined);
2193 for (ch = 1; ch < 256; ++ch)
2194 if (collate->mbheads[ch]->mbnext == NULL
2195 && collate->mbheads[ch]->nmbs <= 1)
2197 tablemb[ch] = output_weight (&weightpool, collate,
2198 collate->mbheads[ch]);
2200 else
2202 /* The entries in the list are sorted by length and then
2203 alphabetically. This is the order in which we will add the
2204 elements to the collation table. This allows simply walking
2205 the table in sequence and stopping at the first matching
2206 entry. Since the longer sequences are coming first in the
2207 list they have the possibility to match first, just as it
2208 has to be. In the worst case we are walking to the end of
2209 the list where we put, if no singlebyte sequence is defined
2210 in the locale definition, the weights for UNDEFINED.
2212 To reduce the length of the search list we compress them a bit.
2213 This happens by collecting sequences of consecutive byte
2214 sequences in one entry (having and begin and end byte sequence)
2215 and add only one index into the weight table. We can find the
2216 consecutive entries since they are also consecutive in the list. */
2217 struct element_t *runp = collate->mbheads[ch];
2218 struct element_t *lastp;
2220 assert ((obstack_object_size (&extrapool)
2221 & (__alignof__ (int32_t) - 1)) == 0);
2223 tablemb[ch] = -obstack_object_size (&extrapool);
2227 /* Store the current index in the weight table. We know that
2228 the current position in the `extrapool' is aligned on a
2229 32-bit address. */
2230 int32_t weightidx;
2231 int added;
2233 /* Find out wether this is a single entry or we have more than
2234 one consecutive entry. */
2235 if (runp->mbnext != NULL
2236 && runp->nmbs == runp->mbnext->nmbs
2237 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2238 && (runp->mbs[runp->nmbs - 1]
2239 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2241 int i;
2242 struct element_t *series_startp = runp;
2243 struct element_t *curp;
2245 /* Compute how much space we will need. */
2246 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2247 + __alignof__ (int32_t) - 1)
2248 & ~(__alignof__ (int32_t) - 1));
2249 assert ((obstack_object_size (&extrapool)
2250 & (__alignof__ (int32_t) - 1)) == 0);
2251 obstack_make_room (&extrapool, added);
2253 /* More than one consecutive entry. We mark this by having
2254 a negative index into the indirect table. */
2255 obstack_int32_grow_fast (&extrapool,
2256 -(obstack_object_size (&indirectpool)
2257 / sizeof (int32_t)));
2259 /* Now search first the end of the series. */
2261 runp = runp->mbnext;
2262 while (runp->mbnext != NULL
2263 && runp->nmbs == runp->mbnext->nmbs
2264 && memcmp (runp->mbs, runp->mbnext->mbs,
2265 runp->nmbs - 1) == 0
2266 && (runp->mbs[runp->nmbs - 1]
2267 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2269 /* Now walk backward from here to the beginning. */
2270 curp = runp;
2272 assert (runp->nmbs <= 256);
2273 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2274 for (i = 1; i < curp->nmbs; ++i)
2275 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2277 /* Now find the end of the consecutive sequence and
2278 add all the indeces in the indirect pool. */
2281 weightidx = output_weight (&weightpool, collate, curp);
2282 obstack_int32_grow (&indirectpool, weightidx);
2284 curp = curp->mblast;
2286 while (curp != series_startp);
2288 /* Add the final weight. */
2289 weightidx = output_weight (&weightpool, collate, curp);
2290 obstack_int32_grow (&indirectpool, weightidx);
2292 /* And add the end byte sequence. Without length this
2293 time. */
2294 for (i = 1; i < curp->nmbs; ++i)
2295 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2297 else
2299 /* A single entry. Simply add the index and the length and
2300 string (except for the first character which is already
2301 tested for). */
2302 int i;
2304 /* Output the weight info. */
2305 weightidx = output_weight (&weightpool, collate, runp);
2307 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2308 + __alignof__ (int32_t) - 1)
2309 & ~(__alignof__ (int32_t) - 1));
2310 assert ((obstack_object_size (&extrapool)
2311 & (__alignof__ (int32_t) - 1)) == 0);
2312 obstack_make_room (&extrapool, added);
2314 obstack_int32_grow_fast (&extrapool, weightidx);
2315 assert (runp->nmbs <= 256);
2316 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2318 for (i = 1; i < runp->nmbs; ++i)
2319 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2322 /* Add alignment bytes if necessary. */
2323 while ((obstack_object_size (&extrapool)
2324 & (__alignof__ (int32_t) - 1)) != 0)
2325 obstack_1grow_fast (&extrapool, '\0');
2327 /* Next entry. */
2328 lastp = runp;
2329 runp = runp->mbnext;
2331 while (runp != NULL);
2333 assert ((obstack_object_size (&extrapool)
2334 & (__alignof__ (int32_t) - 1)) == 0);
2336 /* If the final entry in the list is not a single character we
2337 add an UNDEFINED entry here. */
2338 if (lastp->nmbs != 1)
2340 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2341 & ~(__alignof__ (int32_t) - 1));
2342 obstack_make_room (&extrapool, added);
2344 obstack_int32_grow_fast (&extrapool, 0);
2345 /* XXX What rule? We just pick the first. */
2346 obstack_1grow_fast (&extrapool, 0);
2347 /* Length is zero. */
2348 obstack_1grow_fast (&extrapool, 0);
2350 /* Add alignment bytes if necessary. */
2351 while ((obstack_object_size (&extrapool)
2352 & (__alignof__ (int32_t) - 1)) != 0)
2353 obstack_1grow_fast (&extrapool, '\0');
2357 /* Add padding to the tables if necessary. */
2358 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2359 != 0)
2360 obstack_1grow (&weightpool, 0);
2362 /* Now add the four tables. */
2363 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2364 iov[2 + cnt].iov_base = tablemb;
2365 iov[2 + cnt].iov_len = sizeof (tablemb);
2366 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2367 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2368 ++cnt;
2370 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2371 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2372 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2373 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2374 ++cnt;
2376 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2377 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2378 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2379 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2380 ++cnt;
2382 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2383 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2384 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2385 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2386 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2387 ++cnt;
2390 /* Now the same for the wide character table. We need to store some
2391 more information here. */
2392 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2393 iov[2 + cnt].iov_base = NULL;
2394 iov[2 + cnt].iov_len = 0;
2395 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2396 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2397 ++cnt;
2399 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2400 iov[2 + cnt].iov_base = NULL;
2401 iov[2 + cnt].iov_len = 0;
2402 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2403 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2404 ++cnt;
2406 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2407 iov[2 + cnt].iov_base = NULL;
2408 iov[2 + cnt].iov_len = 0;
2409 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2410 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2411 ++cnt;
2413 /* Since we are using the sign of an integer to mark indirection the
2414 offsets in the arrays we are indirectly referring to must not be
2415 zero since -0 == 0. Therefore we add a bit of dummy content. */
2416 obstack_int32_grow (&extrapool, 0);
2417 obstack_int32_grow (&indirectpool, 0);
2419 /* Now insert the `UNDEFINED' value if it is used. Since this value
2420 will probably be used more than once it is good to store the
2421 weights only once. */
2422 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2423 abort ();
2425 /* Generate the table. Walk through the lists of sequences starting
2426 with the same wide character and add them one after the other to
2427 the table. In case we have more than one sequence starting with
2428 the same byte we have to use extra indirection. */
2429 tablewc.p = 6;
2430 tablewc.q = 10;
2431 collidx_table_init (&tablewc);
2433 atwc.weightpool = &weightpool;
2434 atwc.extrapool = &extrapool;
2435 atwc.indpool = &indirectpool;
2436 atwc.collate = collate;
2437 atwc.tablewc = &tablewc;
2439 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2441 memset (&atwc, 0, sizeof (atwc));
2443 collidx_table_finalize (&tablewc);
2445 /* Now add the four tables. */
2446 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2447 iov[2 + cnt].iov_base = tablewc.result;
2448 iov[2 + cnt].iov_len = tablewc.result_size;
2449 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2450 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2451 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2452 ++cnt;
2454 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2455 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2456 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2457 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2458 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2459 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2460 ++cnt;
2462 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2463 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2464 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2465 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2466 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2467 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2468 ++cnt;
2470 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2471 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2472 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2473 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2474 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2475 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2476 ++cnt;
2479 /* Finally write the table with collation element names out. It is
2480 a hash table with a simple function which gets the name of the
2481 character as the input. One character might have many names. The
2482 value associated with the name is an index into the weight table
2483 where we are then interested in the first-level weight value.
2485 To determine how large the table should be we are counting the
2486 elements we have to put in. Since we are using internal chaining
2487 using a secondary hash function we have to make the table a bit
2488 larger to avoid extremely long search times. We can achieve
2489 good results with a 40% larger table than there are entries. */
2490 elem_size = 0;
2491 runp = collate->start;
2492 while (runp != NULL)
2494 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2495 /* Yep, the element really counts. */
2496 ++elem_size;
2498 runp = runp->next;
2500 /* Add 40% and find the next prime number. */
2501 elem_size = next_prime (elem_size * 1.4);
2503 /* Allocate the table. Each entry consists of two words: the hash
2504 value and an index in a secondary table which provides the index
2505 into the weight table and the string itself (so that a match can
2506 be determined). */
2507 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2508 elem_size * 2 * sizeof (uint32_t));
2509 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2511 /* Now add the elements. */
2512 runp = collate->start;
2513 while (runp != NULL)
2515 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2517 /* Compute the hash value of the name. */
2518 uint32_t namelen = strlen (runp->name);
2519 uint32_t hash = elem_hash (runp->name, namelen);
2520 size_t idx = hash % elem_size;
2521 #ifndef NDEBUG
2522 size_t start_idx = idx;
2523 #endif
2525 if (elem_table[idx * 2] != 0)
2527 /* The spot is already taken. Try iterating using the value
2528 from the secondary hashing function. */
2529 size_t iter = hash % (elem_size - 2) + 1;
2533 idx += iter;
2534 if (idx >= elem_size)
2535 idx -= elem_size;
2536 assert (idx != start_idx);
2538 while (elem_table[idx * 2] != 0);
2540 /* This is the spot where we will insert the value. */
2541 elem_table[idx * 2] = hash;
2542 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2544 /* Then the string itself including length. */
2545 obstack_1grow (&extrapool, namelen);
2546 obstack_grow (&extrapool, runp->name, namelen);
2548 /* And the multibyte representation. */
2549 obstack_1grow (&extrapool, runp->nmbs);
2550 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2552 /* And align again to 32 bits. */
2553 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2554 obstack_grow (&extrapool, "\0\0",
2555 (sizeof (int32_t)
2556 - ((1 + namelen + 1 + runp->nmbs)
2557 % sizeof (int32_t))));
2559 /* Now some 32-bit values: multibyte collation sequence,
2560 wide char string (including length), and wide char
2561 collation sequence. */
2562 obstack_int32_grow (&extrapool, runp->mbseqorder);
2564 obstack_int32_grow (&extrapool, runp->nwcs);
2565 obstack_grow (&extrapool, runp->wcs,
2566 runp->nwcs * sizeof (uint32_t));
2568 obstack_int32_grow (&extrapool, runp->wcseqorder);
2571 runp = runp->next;
2574 /* Prepare to write out this data. */
2575 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2576 iov[2 + cnt].iov_base = &elem_size;
2577 iov[2 + cnt].iov_len = sizeof (int32_t);
2578 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2579 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2580 ++cnt;
2582 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2583 iov[2 + cnt].iov_base = elem_table;
2584 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2585 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2586 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2587 ++cnt;
2589 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2590 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2591 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2592 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2593 ++cnt;
2595 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2596 iov[2 + cnt].iov_base = collate->mbseqorder;
2597 iov[2 + cnt].iov_len = 256;
2598 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2599 ++cnt;
2601 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2602 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2603 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2604 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2605 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2606 ++cnt;
2608 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2609 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2610 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2611 ++cnt;
2613 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2615 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2617 obstack_free (&weightpool, NULL);
2618 obstack_free (&extrapool, NULL);
2619 obstack_free (&indirectpool, NULL);
2623 void
2624 collate_read (struct linereader *ldfile, struct localedef_t *result,
2625 const struct charmap_t *charmap, const char *repertoire_name,
2626 int ignore_content)
2628 struct repertoire_t *repertoire = NULL;
2629 struct locale_collate_t *collate;
2630 struct token *now;
2631 struct token *arg = NULL;
2632 enum token_t nowtok;
2633 enum token_t was_ellipsis = tok_none;
2634 struct localedef_t *copy_locale = NULL;
2635 /* Parsing state:
2636 0 - start
2637 1 - between `order-start' and `order-end'
2638 2 - after `order-end'
2639 3 - after `reorder-after', waiting for `reorder-end'
2640 4 - after `reorder-end'
2641 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2642 6 - after `reorder-sections-end'
2644 int state = 0;
2646 /* Get the repertoire we have to use. */
2647 if (repertoire_name != NULL)
2648 repertoire = repertoire_read (repertoire_name);
2650 /* The rest of the line containing `LC_COLLATE' must be free. */
2651 lr_ignore_rest (ldfile, 1);
2655 now = lr_token (ldfile, charmap, result, NULL, verbose);
2656 nowtok = now->tok;
2658 while (nowtok == tok_eol);
2660 if (nowtok == tok_copy)
2662 state = 2;
2663 now = lr_token (ldfile, charmap, result, NULL, verbose);
2664 if (now->tok != tok_string)
2666 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2668 skip_category:
2670 now = lr_token (ldfile, charmap, result, NULL, verbose);
2671 while (now->tok != tok_eof && now->tok != tok_end);
2673 if (now->tok != tok_eof
2674 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2675 now->tok == tok_eof))
2676 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2677 else if (now->tok != tok_lc_collate)
2679 lr_error (ldfile, _("\
2680 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2681 lr_ignore_rest (ldfile, 0);
2683 else
2684 lr_ignore_rest (ldfile, 1);
2686 return;
2689 if (! ignore_content)
2691 /* Get the locale definition. */
2692 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2693 repertoire_name, charmap, NULL);
2694 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2696 /* Not yet loaded. So do it now. */
2697 if (locfile_read (copy_locale, charmap) != 0)
2698 goto skip_category;
2701 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2702 return;
2705 lr_ignore_rest (ldfile, 1);
2707 now = lr_token (ldfile, charmap, result, NULL, verbose);
2708 nowtok = now->tok;
2711 /* Prepare the data structures. */
2712 collate_startup (ldfile, result, copy_locale, ignore_content);
2713 collate = result->categories[LC_COLLATE].collate;
2715 while (1)
2717 char ucs4buf[10];
2718 char *symstr;
2719 size_t symlen;
2721 /* Of course we don't proceed beyond the end of file. */
2722 if (nowtok == tok_eof)
2723 break;
2725 /* Ignore empty lines. */
2726 if (nowtok == tok_eol)
2728 now = lr_token (ldfile, charmap, result, NULL, verbose);
2729 nowtok = now->tok;
2730 continue;
2733 switch (nowtok)
2735 case tok_copy:
2736 /* Allow copying other locales. */
2737 now = lr_token (ldfile, charmap, result, NULL, verbose);
2738 if (now->tok != tok_string)
2739 goto err_label;
2741 if (! ignore_content)
2742 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2743 charmap, result);
2745 lr_ignore_rest (ldfile, 1);
2746 break;
2748 case tok_coll_weight_max:
2749 /* Ignore the rest of the line if we don't need the input of
2750 this line. */
2751 if (ignore_content)
2753 lr_ignore_rest (ldfile, 0);
2754 break;
2757 if (state != 0)
2758 goto err_label;
2760 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2761 if (arg->tok != tok_number)
2762 goto err_label;
2763 if (collate->col_weight_max != -1)
2764 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2765 "LC_COLLATE", "col_weight_max");
2766 else
2767 collate->col_weight_max = arg->val.num;
2768 lr_ignore_rest (ldfile, 1);
2769 break;
2771 case tok_section_symbol:
2772 /* Ignore the rest of the line if we don't need the input of
2773 this line. */
2774 if (ignore_content)
2776 lr_ignore_rest (ldfile, 0);
2777 break;
2780 if (state != 0)
2781 goto err_label;
2783 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2784 if (arg->tok != tok_bsymbol)
2785 goto err_label;
2786 else if (!ignore_content)
2788 /* Check whether this section is already known. */
2789 struct section_list *known = collate->sections;
2790 while (known != NULL)
2792 if (strcmp (known->name, arg->val.str.startmb) == 0)
2793 break;
2794 known = known->next;
2797 if (known != NULL)
2799 lr_error (ldfile,
2800 _("%s: duplicate declaration of section `%s'"),
2801 "LC_COLLATE", arg->val.str.startmb);
2802 free (arg->val.str.startmb);
2804 else
2805 collate->sections = make_seclist_elem (collate,
2806 arg->val.str.startmb,
2807 collate->sections);
2809 lr_ignore_rest (ldfile, known == NULL);
2811 else
2813 free (arg->val.str.startmb);
2814 lr_ignore_rest (ldfile, 0);
2816 break;
2818 case tok_collating_element:
2819 /* Ignore the rest of the line if we don't need the input of
2820 this line. */
2821 if (ignore_content)
2823 lr_ignore_rest (ldfile, 0);
2824 break;
2827 if (state != 0 && state != 2)
2828 goto err_label;
2830 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2831 if (arg->tok != tok_bsymbol)
2832 goto err_label;
2833 else
2835 const char *symbol = arg->val.str.startmb;
2836 size_t symbol_len = arg->val.str.lenmb;
2838 /* Next the `from' keyword. */
2839 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2840 if (arg->tok != tok_from)
2842 free ((char *) symbol);
2843 goto err_label;
2846 ldfile->return_widestr = 1;
2847 ldfile->translate_strings = 1;
2849 /* Finally the string with the replacement. */
2850 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2852 ldfile->return_widestr = 0;
2853 ldfile->translate_strings = 0;
2855 if (arg->tok != tok_string)
2856 goto err_label;
2858 if (!ignore_content && symbol != NULL)
2860 /* The name is already defined. */
2861 if (check_duplicate (ldfile, collate, charmap,
2862 repertoire, symbol, symbol_len))
2863 goto col_elem_free;
2865 if (arg->val.str.startmb != NULL)
2866 insert_entry (&collate->elem_table, symbol, symbol_len,
2867 new_element (collate,
2868 arg->val.str.startmb,
2869 arg->val.str.lenmb - 1,
2870 arg->val.str.startwc,
2871 symbol, symbol_len, 0));
2873 else
2875 col_elem_free:
2876 if (symbol != NULL)
2877 free ((char *) symbol);
2878 if (arg->val.str.startmb != NULL)
2879 free (arg->val.str.startmb);
2880 if (arg->val.str.startwc != NULL)
2881 free (arg->val.str.startwc);
2883 lr_ignore_rest (ldfile, 1);
2885 break;
2887 case tok_collating_symbol:
2888 /* Ignore the rest of the line if we don't need the input of
2889 this line. */
2890 if (ignore_content)
2892 lr_ignore_rest (ldfile, 0);
2893 break;
2896 if (state != 0 && state != 2)
2897 goto err_label;
2899 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2900 if (arg->tok != tok_bsymbol)
2901 goto err_label;
2902 else
2904 char *symbol = arg->val.str.startmb;
2905 size_t symbol_len = arg->val.str.lenmb;
2906 char *endsymbol = NULL;
2907 size_t endsymbol_len = 0;
2908 enum token_t ellipsis = tok_none;
2910 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2911 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2913 ellipsis = arg->tok;
2915 arg = lr_token (ldfile, charmap, result, repertoire,
2916 verbose);
2917 if (arg->tok != tok_bsymbol)
2919 free (symbol);
2920 goto err_label;
2923 endsymbol = arg->val.str.startmb;
2924 endsymbol_len = arg->val.str.lenmb;
2926 lr_ignore_rest (ldfile, 1);
2928 else if (arg->tok != tok_eol)
2930 free (symbol);
2931 goto err_label;
2934 if (!ignore_content)
2936 if (symbol == NULL
2937 || (ellipsis != tok_none && endsymbol == NULL))
2939 lr_error (ldfile, _("\
2940 %s: unknown character in collating symbol name"),
2941 "LC_COLLATE");
2942 goto col_sym_free;
2944 else if (ellipsis == tok_none)
2946 /* A single symbol, no ellipsis. */
2947 if (check_duplicate (ldfile, collate, charmap,
2948 repertoire, symbol, symbol_len))
2949 /* The name is already defined. */
2950 goto col_sym_free;
2952 insert_entry (&collate->sym_table, symbol, symbol_len,
2953 new_symbol (collate, symbol, symbol_len));
2955 else if (symbol_len != endsymbol_len)
2957 col_sym_inv_range:
2958 lr_error (ldfile,
2959 _("invalid names for character range"));
2960 goto col_sym_free;
2962 else
2964 /* Oh my, we have to handle an ellipsis. First, as
2965 usual, determine the common prefix and then
2966 convert the rest into a range. */
2967 size_t prefixlen;
2968 unsigned long int from;
2969 unsigned long int to;
2970 char *endp;
2972 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2973 if (symbol[prefixlen] != endsymbol[prefixlen])
2974 break;
2976 /* Convert the rest into numbers. */
2977 symbol[symbol_len] = '\0';
2978 from = strtoul (&symbol[prefixlen], &endp,
2979 ellipsis == tok_ellipsis2 ? 16 : 10);
2980 if (*endp != '\0')
2981 goto col_sym_inv_range;
2983 endsymbol[symbol_len] = '\0';
2984 to = strtoul (&endsymbol[prefixlen], &endp,
2985 ellipsis == tok_ellipsis2 ? 16 : 10);
2986 if (*endp != '\0')
2987 goto col_sym_inv_range;
2989 if (from > to)
2990 goto col_sym_inv_range;
2992 /* Now loop over all entries. */
2993 while (from <= to)
2995 char *symbuf;
2997 symbuf = (char *) obstack_alloc (&collate->mempool,
2998 symbol_len + 1);
3000 /* Create the name. */
3001 sprintf (symbuf,
3002 ellipsis == tok_ellipsis2
3003 ? "%.*s%.*lX" : "%.*s%.*lu",
3004 (int) prefixlen, symbol,
3005 (int) (symbol_len - prefixlen), from);
3007 if (check_duplicate (ldfile, collate, charmap,
3008 repertoire, symbuf, symbol_len))
3009 /* The name is already defined. */
3010 goto col_sym_free;
3012 insert_entry (&collate->sym_table, symbuf,
3013 symbol_len,
3014 new_symbol (collate, symbuf,
3015 symbol_len));
3017 /* Increment the counter. */
3018 ++from;
3021 goto col_sym_free;
3024 else
3026 col_sym_free:
3027 if (symbol != NULL)
3028 free (symbol);
3029 if (endsymbol != NULL)
3030 free (endsymbol);
3033 break;
3035 case tok_symbol_equivalence:
3036 /* Ignore the rest of the line if we don't need the input of
3037 this line. */
3038 if (ignore_content)
3040 lr_ignore_rest (ldfile, 0);
3041 break;
3044 if (state != 0)
3045 goto err_label;
3047 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3048 if (arg->tok != tok_bsymbol)
3049 goto err_label;
3050 else
3052 const char *newname = arg->val.str.startmb;
3053 size_t newname_len = arg->val.str.lenmb;
3054 const char *symname;
3055 size_t symname_len;
3056 void *symval; /* Actually struct symbol_t* */
3058 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3059 if (arg->tok != tok_bsymbol)
3061 if (newname != NULL)
3062 free ((char *) newname);
3063 goto err_label;
3066 symname = arg->val.str.startmb;
3067 symname_len = arg->val.str.lenmb;
3069 if (newname == NULL)
3071 lr_error (ldfile, _("\
3072 %s: unknown character in equivalent definition name"),
3073 "LC_COLLATE");
3075 sym_equiv_free:
3076 if (newname != NULL)
3077 free ((char *) newname);
3078 if (symname != NULL)
3079 free ((char *) symname);
3080 break;
3082 if (symname == NULL)
3084 lr_error (ldfile, _("\
3085 %s: unknown character in equivalent definition value"),
3086 "LC_COLLATE");
3087 goto sym_equiv_free;
3090 /* See whether the symbol name is already defined. */
3091 if (find_entry (&collate->sym_table, symname, symname_len,
3092 &symval) != 0)
3094 lr_error (ldfile, _("\
3095 %s: unknown symbol `%s' in equivalent definition"),
3096 "LC_COLLATE", symname);
3097 goto sym_equiv_free;
3100 if (insert_entry (&collate->sym_table,
3101 newname, newname_len, symval) < 0)
3103 lr_error (ldfile, _("\
3104 error while adding equivalent collating symbol"));
3105 goto sym_equiv_free;
3108 free ((char *) symname);
3110 lr_ignore_rest (ldfile, 1);
3111 break;
3113 case tok_script:
3114 /* We get told about the scripts we know. */
3115 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3116 if (arg->tok != tok_bsymbol)
3117 goto err_label;
3118 else
3120 struct section_list *runp = collate->known_sections;
3121 char *name;
3123 while (runp != NULL)
3124 if (strncmp (runp->name, arg->val.str.startmb,
3125 arg->val.str.lenmb) == 0
3126 && runp->name[arg->val.str.lenmb] == '\0')
3127 break;
3128 else
3129 runp = runp->def_next;
3131 if (runp != NULL)
3133 lr_error (ldfile, _("duplicate definition of script `%s'"),
3134 runp->name);
3135 lr_ignore_rest (ldfile, 0);
3136 break;
3139 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3140 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3141 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3142 name[arg->val.str.lenmb] = '\0';
3143 runp->name = name;
3145 runp->def_next = collate->known_sections;
3146 collate->known_sections = runp;
3148 lr_ignore_rest (ldfile, 1);
3149 break;
3151 case tok_order_start:
3152 /* Ignore the rest of the line if we don't need the input of
3153 this line. */
3154 if (ignore_content)
3156 lr_ignore_rest (ldfile, 0);
3157 break;
3160 if (state != 0 && state != 1 && state != 2)
3161 goto err_label;
3162 state = 1;
3164 /* The 14652 draft does not specify whether all `order_start' lines
3165 must contain the same number of sort-rules, but 14651 does. So
3166 we require this here as well. */
3167 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3168 if (arg->tok == tok_bsymbol)
3170 /* This better should be a section name. */
3171 struct section_list *sp = collate->known_sections;
3172 while (sp != NULL
3173 && (sp->name == NULL
3174 || strncmp (sp->name, arg->val.str.startmb,
3175 arg->val.str.lenmb) != 0
3176 || sp->name[arg->val.str.lenmb] != '\0'))
3177 sp = sp->def_next;
3179 if (sp == NULL)
3181 lr_error (ldfile, _("\
3182 %s: unknown section name `%.*s'"),
3183 "LC_COLLATE", (int) arg->val.str.lenmb,
3184 arg->val.str.startmb);
3185 /* We use the error section. */
3186 collate->current_section = &collate->error_section;
3188 if (collate->error_section.first == NULL)
3190 /* Insert &collate->error_section at the end of
3191 the collate->sections list. */
3192 if (collate->sections == NULL)
3193 collate->sections = &collate->error_section;
3194 else
3196 sp = collate->sections;
3197 while (sp->next != NULL)
3198 sp = sp->next;
3200 sp->next = &collate->error_section;
3202 collate->error_section.next = NULL;
3205 else
3207 /* One should not be allowed to open the same
3208 section twice. */
3209 if (sp->first != NULL)
3210 lr_error (ldfile, _("\
3211 %s: multiple order definitions for section `%s'"),
3212 "LC_COLLATE", sp->name);
3213 else
3215 /* Insert sp in the collate->sections list,
3216 right after collate->current_section. */
3217 if (collate->current_section == NULL)
3218 collate->current_section = sp;
3219 else
3221 sp->next = collate->current_section->next;
3222 collate->current_section->next = sp;
3226 /* Next should come the end of the line or a semicolon. */
3227 arg = lr_token (ldfile, charmap, result, repertoire,
3228 verbose);
3229 if (arg->tok == tok_eol)
3231 uint32_t cnt;
3233 /* This means we have exactly one rule: `forward'. */
3234 if (nrules > 1)
3235 lr_error (ldfile, _("\
3236 %s: invalid number of sorting rules"),
3237 "LC_COLLATE");
3238 else
3239 nrules = 1;
3240 sp->rules = obstack_alloc (&collate->mempool,
3241 (sizeof (enum coll_sort_rule)
3242 * nrules));
3243 for (cnt = 0; cnt < nrules; ++cnt)
3244 sp->rules[cnt] = sort_forward;
3246 /* Next line. */
3247 break;
3250 /* Get the next token. */
3251 arg = lr_token (ldfile, charmap, result, repertoire,
3252 verbose);
3255 else
3257 /* There is no section symbol. Therefore we use the unnamed
3258 section. */
3259 collate->current_section = &collate->unnamed_section;
3261 if (collate->unnamed_section.first != NULL)
3262 lr_error (ldfile, _("\
3263 %s: multiple order definitions for unnamed section"),
3264 "LC_COLLATE");
3265 else
3267 /* Insert &collate->unnamed_section at the beginning of
3268 the collate->sections list. */
3269 collate->unnamed_section.next = collate->sections;
3270 collate->sections = &collate->unnamed_section;
3274 /* Now read the direction names. */
3275 read_directions (ldfile, arg, charmap, repertoire, result);
3277 /* From now we need the strings untranslated. */
3278 ldfile->translate_strings = 0;
3279 break;
3281 case tok_order_end:
3282 /* Ignore the rest of the line if we don't need the input of
3283 this line. */
3284 if (ignore_content)
3286 lr_ignore_rest (ldfile, 0);
3287 break;
3290 if (state != 1)
3291 goto err_label;
3293 /* Handle ellipsis at end of list. */
3294 if (was_ellipsis != tok_none)
3296 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3297 repertoire, result);
3298 was_ellipsis = tok_none;
3301 state = 2;
3302 lr_ignore_rest (ldfile, 1);
3303 break;
3305 case tok_reorder_after:
3306 /* Ignore the rest of the line if we don't need the input of
3307 this line. */
3308 if (ignore_content)
3310 lr_ignore_rest (ldfile, 0);
3311 break;
3314 if (state == 1)
3316 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3317 "LC_COLLATE");
3318 state = 2;
3320 /* Handle ellipsis at end of list. */
3321 if (was_ellipsis != tok_none)
3323 handle_ellipsis (ldfile, arg->val.str.startmb,
3324 arg->val.str.lenmb, was_ellipsis, charmap,
3325 repertoire, result);
3326 was_ellipsis = tok_none;
3329 else if (state != 2 && state != 3)
3330 goto err_label;
3331 state = 3;
3333 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3334 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3336 /* Find this symbol in the sequence table. */
3337 char ucsbuf[10];
3338 char *startmb;
3339 size_t lenmb;
3340 struct element_t *insp;
3341 int no_error = 1;
3342 void *ptr;
3344 if (arg->tok == tok_bsymbol)
3346 startmb = arg->val.str.startmb;
3347 lenmb = arg->val.str.lenmb;
3349 else
3351 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3352 startmb = ucsbuf;
3353 lenmb = 9;
3356 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3357 /* Yes, the symbol exists. Simply point the cursor
3358 to it. */
3359 collate->cursor = (struct element_t *) ptr;
3360 else
3362 struct symbol_t *symbp;
3363 void *ptr;
3365 if (find_entry (&collate->sym_table, startmb, lenmb,
3366 &ptr) == 0)
3368 symbp = ptr;
3370 if (symbp->order->last != NULL
3371 || symbp->order->next != NULL)
3372 collate->cursor = symbp->order;
3373 else
3375 /* This is a collating symbol but its position
3376 is not yet defined. */
3377 lr_error (ldfile, _("\
3378 %s: order for collating symbol %.*s not yet defined"),
3379 "LC_COLLATE", (int) lenmb, startmb);
3380 collate->cursor = NULL;
3381 no_error = 0;
3384 else if (find_entry (&collate->elem_table, startmb, lenmb,
3385 &ptr) == 0)
3387 insp = (struct element_t *) ptr;
3389 if (insp->last != NULL || insp->next != NULL)
3390 collate->cursor = insp;
3391 else
3393 /* This is a collating element but its position
3394 is not yet defined. */
3395 lr_error (ldfile, _("\
3396 %s: order for collating element %.*s not yet defined"),
3397 "LC_COLLATE", (int) lenmb, startmb);
3398 collate->cursor = NULL;
3399 no_error = 0;
3402 else
3404 /* This is bad. The symbol after which we have to
3405 insert does not exist. */
3406 lr_error (ldfile, _("\
3407 %s: cannot reorder after %.*s: symbol not known"),
3408 "LC_COLLATE", (int) lenmb, startmb);
3409 collate->cursor = NULL;
3410 no_error = 0;
3414 lr_ignore_rest (ldfile, no_error);
3416 else
3417 /* This must not happen. */
3418 goto err_label;
3419 break;
3421 case tok_reorder_end:
3422 /* Ignore the rest of the line if we don't need the input of
3423 this line. */
3424 if (ignore_content)
3425 break;
3427 if (state != 3)
3428 goto err_label;
3429 state = 4;
3430 lr_ignore_rest (ldfile, 1);
3431 break;
3433 case tok_reorder_sections_after:
3434 /* Ignore the rest of the line if we don't need the input of
3435 this line. */
3436 if (ignore_content)
3438 lr_ignore_rest (ldfile, 0);
3439 break;
3442 if (state == 1)
3444 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3445 "LC_COLLATE");
3446 state = 2;
3448 /* Handle ellipsis at end of list. */
3449 if (was_ellipsis != tok_none)
3451 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3452 repertoire, result);
3453 was_ellipsis = tok_none;
3456 else if (state == 3)
3458 WITH_CUR_LOCALE (error (0, 0, _("\
3459 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3460 state = 4;
3462 else if (state != 2 && state != 4)
3463 goto err_label;
3464 state = 5;
3466 /* Get the name of the sections we are adding after. */
3467 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3468 if (arg->tok == tok_bsymbol)
3470 /* Now find a section with this name. */
3471 struct section_list *runp = collate->sections;
3473 while (runp != NULL)
3475 if (runp->name != NULL
3476 && strlen (runp->name) == arg->val.str.lenmb
3477 && memcmp (runp->name, arg->val.str.startmb,
3478 arg->val.str.lenmb) == 0)
3479 break;
3481 runp = runp->next;
3484 if (runp != NULL)
3485 collate->current_section = runp;
3486 else
3488 /* This is bad. The section after which we have to
3489 reorder does not exist. Therefore we cannot
3490 process the whole rest of this reorder
3491 specification. */
3492 lr_error (ldfile, _("%s: section `%.*s' not known"),
3493 "LC_COLLATE", (int) arg->val.str.lenmb,
3494 arg->val.str.startmb);
3498 lr_ignore_rest (ldfile, 0);
3500 now = lr_token (ldfile, charmap, result, NULL, verbose);
3502 while (now->tok == tok_reorder_sections_after
3503 || now->tok == tok_reorder_sections_end
3504 || now->tok == tok_end);
3506 /* Process the token we just saw. */
3507 nowtok = now->tok;
3508 continue;
3511 else
3512 /* This must not happen. */
3513 goto err_label;
3514 break;
3516 case tok_reorder_sections_end:
3517 /* Ignore the rest of the line if we don't need the input of
3518 this line. */
3519 if (ignore_content)
3520 break;
3522 if (state != 5)
3523 goto err_label;
3524 state = 6;
3525 lr_ignore_rest (ldfile, 1);
3526 break;
3528 case tok_bsymbol:
3529 case tok_ucs4:
3530 /* Ignore the rest of the line if we don't need the input of
3531 this line. */
3532 if (ignore_content)
3534 lr_ignore_rest (ldfile, 0);
3535 break;
3538 if (state != 0 && state != 1 && state != 3 && state != 5)
3539 goto err_label;
3541 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3542 goto err_label;
3544 if (nowtok == tok_ucs4)
3546 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3547 symstr = ucs4buf;
3548 symlen = 9;
3550 else if (arg != NULL)
3552 symstr = arg->val.str.startmb;
3553 symlen = arg->val.str.lenmb;
3555 else
3557 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3558 (int) ldfile->token.val.str.lenmb,
3559 ldfile->token.val.str.startmb);
3560 break;
3563 struct element_t *seqp;
3564 if (state == 0)
3566 /* We are outside an `order_start' region. This means
3567 we must only accept definitions of values for
3568 collation symbols since these are purely abstract
3569 values and don't need directions associated. */
3570 void *ptr;
3572 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3574 seqp = ptr;
3576 /* It's already defined. First check whether this
3577 is really a collating symbol. */
3578 if (seqp->is_character)
3579 goto err_label;
3581 goto move_entry;
3583 else
3585 void *result;
3587 if (find_entry (&collate->sym_table, symstr, symlen,
3588 &result) != 0)
3589 /* No collating symbol, it's an error. */
3590 goto err_label;
3592 /* Maybe this is the first time we define a symbol
3593 value and it is before the first actual section. */
3594 if (collate->sections == NULL)
3595 collate->sections = collate->current_section =
3596 &collate->symbol_section;
3599 if (was_ellipsis != tok_none)
3601 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3602 charmap, repertoire, result);
3604 /* Remember that we processed the ellipsis. */
3605 was_ellipsis = tok_none;
3607 /* And don't add the value a second time. */
3608 break;
3611 else if (state == 3)
3613 /* It is possible that we already have this collation sequence.
3614 In this case we move the entry. */
3615 void *sym;
3616 void *ptr;
3618 /* If the symbol after which we have to insert was not found
3619 ignore all entries. */
3620 if (collate->cursor == NULL)
3622 lr_ignore_rest (ldfile, 0);
3623 break;
3626 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3628 seqp = (struct element_t *) ptr;
3629 goto move_entry;
3632 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3633 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3634 goto move_entry;
3636 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3637 && (seqp = (struct element_t *) ptr,
3638 seqp->last != NULL || seqp->next != NULL
3639 || (collate->start != NULL && seqp == collate->start)))
3641 move_entry:
3642 /* Remove the entry from the old position. */
3643 if (seqp->last == NULL)
3644 collate->start = seqp->next;
3645 else
3646 seqp->last->next = seqp->next;
3647 if (seqp->next != NULL)
3648 seqp->next->last = seqp->last;
3650 /* We also have to check whether this entry is the
3651 first or last of a section. */
3652 if (seqp->section->first == seqp)
3654 if (seqp->section->first == seqp->section->last)
3655 /* This section has no content anymore. */
3656 seqp->section->first = seqp->section->last = NULL;
3657 else
3658 seqp->section->first = seqp->next;
3660 else if (seqp->section->last == seqp)
3661 seqp->section->last = seqp->last;
3663 /* Now insert it in the new place. */
3664 insert_weights (ldfile, seqp, charmap, repertoire, result,
3665 tok_none);
3666 break;
3669 /* Otherwise we just add a new entry. */
3671 else if (state == 5)
3673 /* We are reordering sections. Find the named section. */
3674 struct section_list *runp = collate->sections;
3675 struct section_list *prevp = NULL;
3677 while (runp != NULL)
3679 if (runp->name != NULL
3680 && strlen (runp->name) == symlen
3681 && memcmp (runp->name, symstr, symlen) == 0)
3682 break;
3684 prevp = runp;
3685 runp = runp->next;
3688 if (runp == NULL)
3690 lr_error (ldfile, _("%s: section `%.*s' not known"),
3691 "LC_COLLATE", (int) symlen, symstr);
3692 lr_ignore_rest (ldfile, 0);
3694 else
3696 if (runp != collate->current_section)
3698 /* Remove the named section from the old place and
3699 insert it in the new one. */
3700 prevp->next = runp->next;
3702 runp->next = collate->current_section->next;
3703 collate->current_section->next = runp;
3704 collate->current_section = runp;
3707 /* Process the rest of the line which might change
3708 the collation rules. */
3709 arg = lr_token (ldfile, charmap, result, repertoire,
3710 verbose);
3711 if (arg->tok != tok_eof && arg->tok != tok_eol)
3712 read_directions (ldfile, arg, charmap, repertoire,
3713 result);
3715 break;
3717 else if (was_ellipsis != tok_none)
3719 /* Using the information in the `ellipsis_weight'
3720 element and this and the last value we have to handle
3721 the ellipsis now. */
3722 assert (state == 1);
3724 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3725 repertoire, result);
3727 /* Remember that we processed the ellipsis. */
3728 was_ellipsis = tok_none;
3730 /* And don't add the value a second time. */
3731 break;
3734 /* Now insert in the new place. */
3735 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3736 break;
3738 case tok_undefined:
3739 /* Ignore the rest of the line if we don't need the input of
3740 this line. */
3741 if (ignore_content)
3743 lr_ignore_rest (ldfile, 0);
3744 break;
3747 if (state != 1)
3748 goto err_label;
3750 if (was_ellipsis != tok_none)
3752 lr_error (ldfile,
3753 _("%s: cannot have `%s' as end of ellipsis range"),
3754 "LC_COLLATE", "UNDEFINED");
3756 unlink_element (collate);
3757 was_ellipsis = tok_none;
3760 /* See whether UNDEFINED already appeared somewhere. */
3761 if (collate->undefined.next != NULL
3762 || &collate->undefined == collate->cursor)
3764 lr_error (ldfile,
3765 _("%s: order for `%.*s' already defined at %s:%Zu"),
3766 "LC_COLLATE", 9, "UNDEFINED",
3767 collate->undefined.file,
3768 collate->undefined.line);
3769 lr_ignore_rest (ldfile, 0);
3771 else
3772 /* Parse the weights. */
3773 insert_weights (ldfile, &collate->undefined, charmap,
3774 repertoire, result, tok_none);
3775 break;
3777 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3778 case tok_ellipsis3: /* absolute ellipsis */
3779 case tok_ellipsis4: /* symbolic decimal ellipsis */
3780 /* This is the symbolic (decimal or hexadecimal) or absolute
3781 ellipsis. */
3782 if (was_ellipsis != tok_none)
3783 goto err_label;
3785 if (state != 0 && state != 1 && state != 3)
3786 goto err_label;
3788 was_ellipsis = nowtok;
3790 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3791 repertoire, result, nowtok);
3792 break;
3794 case tok_end:
3795 /* Next we assume `LC_COLLATE'. */
3796 if (!ignore_content)
3798 if (state == 0)
3799 /* We must either see a copy statement or have
3800 ordering values. */
3801 lr_error (ldfile,
3802 _("%s: empty category description not allowed"),
3803 "LC_COLLATE");
3804 else if (state == 1)
3806 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3807 "LC_COLLATE");
3809 /* Handle ellipsis at end of list. */
3810 if (was_ellipsis != tok_none)
3812 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3813 repertoire, result);
3814 was_ellipsis = tok_none;
3817 else if (state == 3)
3818 WITH_CUR_LOCALE (error (0, 0, _("\
3819 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3820 else if (state == 5)
3821 WITH_CUR_LOCALE (error (0, 0, _("\
3822 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3824 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3825 if (arg->tok == tok_eof)
3826 break;
3827 if (arg->tok == tok_eol)
3828 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3829 else if (arg->tok != tok_lc_collate)
3830 lr_error (ldfile, _("\
3831 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3832 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3833 return;
3835 default:
3836 err_label:
3837 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3840 /* Prepare for the next round. */
3841 now = lr_token (ldfile, charmap, result, NULL, verbose);
3842 nowtok = now->tok;
3845 /* When we come here we reached the end of the file. */
3846 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");