2.5-18.1
[glibc.git] / locale / programs / ld-collate.c
blobcf1bff130f250acaaf12ebe372c724ef0bff6310
1 /* Copyright (C) 1995-2002, 2003, 2005, 2006 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License version 2 as
7 published by the Free Software Foundation.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software Foundation,
16 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <errno.h>
23 #include <error.h>
24 #include <stdlib.h>
25 #include <wchar.h>
26 #include <sys/param.h>
28 #include "localedef.h"
29 #include "charmap.h"
30 #include "localeinfo.h"
31 #include "linereader.h"
32 #include "locfile.h"
33 #include "elem-hash.h"
35 /* Uncomment the following line in the production version. */
36 /* #define NDEBUG 1 */
37 #include <assert.h>
39 #define obstack_chunk_alloc malloc
40 #define obstack_chunk_free free
42 static inline void
43 __attribute ((always_inline))
44 obstack_int32_grow (struct obstack *obstack, int32_t data)
46 if (sizeof (int32_t) == sizeof (int))
47 obstack_int_grow (obstack, data);
48 else
49 obstack_grow (obstack, &data, sizeof (int32_t));
52 static inline void
53 __attribute ((always_inline))
54 obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
56 if (sizeof (int32_t) == sizeof (int))
57 obstack_int_grow_fast (obstack, data);
58 else
59 obstack_grow (obstack, &data, sizeof (int32_t));
62 /* Forward declaration. */
63 struct element_t;
/* Description of one section of the collation order.  A node can be a
   member of two singly linked lists at the same time: the list of known
   sections (chained through `def_next') and the list of sections
   (chained through `next').  */
struct section_list
{
  /* Successor in the known_sections list.  */
  struct section_list *def_next;
  /* Successor in the sections list.  */
  struct section_list *next;
  /* Name of the section.  */
  const char *name;
  /* First element of this section.  */
  struct element_t *first;
  /* Last element of this section.  */
  struct element_t *last;
  /* Sorting rules for this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output
     file.  */
  int ruleidx;
};
84 struct element_t;
/* A counted array of element pointers; used for the weights of one
   level of a collation entry.  */
struct element_list_t
{
  /* Number of elements in W.  */
  int cnt;

  /* The CNT element pointers.  */
  struct element_t **w;
};
/* Data type for a collating element.  */
struct element_t
{
  /* Symbolic name; NULL for anonymous elements.  */
  const char *name;

  /* Multibyte and wide character representation with their lengths.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* Bit mask with one bit per level; a bit is set if this element is
     used in that level.  Interesting for the singlebyte weight
     computation.

     XXX The type here restricts the number of levels to 32.  It could
     be changed if necessary but I doubt this is necessary.  */
  unsigned int used_in_level;

  /* One weight list per level.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Order of the character in the sequence.  This information will
     be used in range expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;

  /* Which section does this belong to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Next element in multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Next element in wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
/* Special element values.  They are small integers cast to element
   pointers, stored in weight lists as placeholders and recognized by
   comparison; they must never be dereferenced.  */
#define ELEMENT_ELLIPSIS2	((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3	((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4	((struct element_t *) 3)
/* Data type for a collating symbol.  */
struct symbol_t
{
  /* Name of the symbol.  */
  const char *name;

  /* Element representing the symbol in the order list; NULL until one
     has been assigned.  */
  struct element_t *order;

  /* Where does the definition come from.  */
  const char *file;
  size_t line;
};
162 /* Sparse table of struct element_t *. */
163 #define TABLE wchead_table
164 #define ELEMENT struct element_t *
165 #define DEFAULT NULL
166 #define ITERATE
167 #define NO_FINALIZE
168 #include "3level.h"
170 /* Sparse table of int32_t. */
171 #define TABLE collidx_table
172 #define ELEMENT int32_t
173 #define DEFAULT 0
174 #include "3level.h"
176 /* Sparse table of uint32_t. */
177 #define TABLE collseq_table
178 #define ELEMENT uint32_t
179 #define DEFAULT ~((uint32_t) 0)
180 #include "3level.h"
183 /* The real definition of the struct for the LC_COLLATE locale. */
184 struct locale_collate_t
186 int col_weight_max;
187 int cur_weight_max;
189 /* List of known scripts. */
190 struct section_list *known_sections;
191 /* List of used sections. */
192 struct section_list *sections;
193 /* Current section using definition. */
194 struct section_list *current_section;
195 /* There always can be an unnamed section. */
196 struct section_list unnamed_section;
197 /* To make handling of errors easier we have another section. */
198 struct section_list error_section;
199 /* Sometimes we are defining the values for collating symbols before
200 the first actual section. */
201 struct section_list symbol_section;
203 /* Start of the order list. */
204 struct element_t *start;
206 /* The undefined element. */
207 struct element_t undefined;
209 /* This is the cursor for `reorder_after' insertions. */
210 struct element_t *cursor;
212 /* This value is used when handling ellipsis. */
213 struct element_t ellipsis_weight;
215 /* Known collating elements. */
216 hash_table elem_table;
218 /* Known collating symbols. */
219 hash_table sym_table;
221 /* Known collation sequences. */
222 hash_table seq_table;
224 struct obstack mempool;
226 /* The LC_COLLATE category is a bit special as it is sometimes possible
227 that the definitions from more than one input file contains information.
228 Therefore we keep all relevant input in a list. */
229 struct locale_collate_t *next;
231 /* Arrays with heads of the list for each of the leading bytes in
232 the multibyte sequences. */
233 struct element_t *mbheads[256];
235 /* Arrays with heads of the list for each of the leading bytes in
236 the multibyte sequences. */
237 struct wchead_table wcheads;
239 /* The arrays with the collation sequence order. */
240 unsigned char mbseqorder[256];
241 struct collseq_table wcseqorder;
/* Global state shared while reading all LC_COLLATE category
   descriptions in all files.  */

/* Number of sorting rules (weight levels) of every collation entry.
   Zero until the first direction specification has been read.  */
static uint32_t nrules;
/* We need UTF-8 encoding of numbers.

   Store the encoding of VAL at BUF and return the number of bytes
   written, which is between 1 and 6.  No terminating NUL byte is
   added.  Every value below 0x80, negative values included, is stored
   as one byte.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      /* A sequence of STEP bytes can hold 5 * STEP + 1 value bits.  */
      for (step = 2; step < 6; ++step)
        if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
          break;
      retval = step;

      /* The lead byte starts with STEP one bits followed by a zero bit.
         An unsigned constant is shifted here because right-shifting the
         negative value `~0xff' is implementation-defined.  */
      *buf = (unsigned char) (0xff00u >> step);
      --step;

      /* Fill the continuation bytes from the last one to the first.  */
      do
        {
          buf[step] = 0x80 | (val & 0x3f);
          val >>= 6;
        }
      while (--step > 0);

      /* What is left of VAL belongs into the lead byte.  */
      *buf |= val;
    }

  return retval;
}
286 static struct section_list *
287 make_seclist_elem (struct locale_collate_t *collate, const char *string,
288 struct section_list *next)
290 struct section_list *newp;
292 newp = (struct section_list *) obstack_alloc (&collate->mempool,
293 sizeof (*newp));
294 newp->next = next;
295 newp->name = string;
296 newp->first = NULL;
297 newp->last = NULL;
299 return newp;
303 static struct element_t *
304 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
305 const uint32_t *wcs, const char *name, size_t namelen,
306 int is_character)
308 struct element_t *newp;
310 newp = (struct element_t *) obstack_alloc (&collate->mempool,
311 sizeof (*newp));
312 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
313 name, namelen);
314 if (mbs != NULL)
316 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
317 newp->nmbs = mbslen;
319 else
321 newp->mbs = NULL;
322 newp->nmbs = 0;
324 if (wcs != NULL)
326 size_t nwcs = wcslen ((wchar_t *) wcs);
327 uint32_t zero = 0;
328 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
329 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
330 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
331 newp->nwcs = nwcs;
333 else
335 newp->wcs = NULL;
336 newp->nwcs = 0;
338 newp->mborder = NULL;
339 newp->wcorder = 0;
340 newp->used_in_level = 0;
341 newp->is_character = is_character;
343 /* Will be assigned later. XXX */
344 newp->mbseqorder = 0;
345 newp->wcseqorder = 0;
347 /* Will be allocated later. */
348 newp->weights = NULL;
350 newp->file = NULL;
351 newp->line = 0;
353 newp->section = collate->current_section;
355 newp->last = NULL;
356 newp->next = NULL;
358 newp->mbnext = NULL;
359 newp->mblast = NULL;
361 newp->wcnext = NULL;
362 newp->wclast = NULL;
364 return newp;
368 static struct symbol_t *
369 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
371 struct symbol_t *newp;
373 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
375 newp->name = obstack_copy0 (&collate->mempool, name, len);
376 newp->order = NULL;
378 newp->file = NULL;
379 newp->line = 0;
381 return newp;
385 /* Test whether this name is already defined somewhere. */
386 static int
387 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
388 const struct charmap_t *charmap,
389 struct repertoire_t *repertoire, const char *symbol,
390 size_t symbol_len)
392 void *ignore = NULL;
394 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
396 lr_error (ldfile, _("`%.*s' already defined in charmap"),
397 (int) symbol_len, symbol);
398 return 1;
401 if (repertoire != NULL
402 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
403 == 0))
405 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
406 (int) symbol_len, symbol);
407 return 1;
410 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
412 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
413 (int) symbol_len, symbol);
414 return 1;
417 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
419 lr_error (ldfile, _("`%.*s' already defined as collating element"),
420 (int) symbol_len, symbol);
421 return 1;
424 return 0;
428 /* Read the direction specification. */
429 static void
430 read_directions (struct linereader *ldfile, struct token *arg,
431 const struct charmap_t *charmap,
432 struct repertoire_t *repertoire, struct localedef_t *result)
434 int cnt = 0;
435 int max = nrules ?: 10;
436 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
437 int warned = 0;
438 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
440 while (1)
442 int valid = 0;
444 if (arg->tok == tok_forward)
446 if (rules[cnt] & sort_backward)
448 if (! warned)
450 lr_error (ldfile, _("\
451 %s: `forward' and `backward' are mutually excluding each other"),
452 "LC_COLLATE");
453 warned = 1;
456 else if (rules[cnt] & sort_forward)
458 if (! warned)
460 lr_error (ldfile, _("\
461 %s: `%s' mentioned more than once in definition of weight %d"),
462 "LC_COLLATE", "forward", cnt + 1);
465 else
466 rules[cnt] |= sort_forward;
468 valid = 1;
470 else if (arg->tok == tok_backward)
472 if (rules[cnt] & sort_forward)
474 if (! warned)
476 lr_error (ldfile, _("\
477 %s: `forward' and `backward' are mutually excluding each other"),
478 "LC_COLLATE");
479 warned = 1;
482 else if (rules[cnt] & sort_backward)
484 if (! warned)
486 lr_error (ldfile, _("\
487 %s: `%s' mentioned more than once in definition of weight %d"),
488 "LC_COLLATE", "backward", cnt + 1);
491 else
492 rules[cnt] |= sort_backward;
494 valid = 1;
496 else if (arg->tok == tok_position)
498 if (rules[cnt] & sort_position)
500 if (! warned)
502 lr_error (ldfile, _("\
503 %s: `%s' mentioned more than once in definition of weight %d"),
504 "LC_COLLATE", "position", cnt + 1);
507 else
508 rules[cnt] |= sort_position;
510 valid = 1;
513 if (valid)
514 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
516 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
517 || arg->tok == tok_semicolon)
519 if (! valid && ! warned)
521 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
522 warned = 1;
525 /* See whether we have to increment the counter. */
526 if (arg->tok != tok_comma && rules[cnt] != 0)
528 /* Add the default `forward' if we have seen only `position'. */
529 if (rules[cnt] == sort_position)
530 rules[cnt] = sort_position | sort_forward;
532 ++cnt;
535 if (arg->tok == tok_eof || arg->tok == tok_eol)
536 /* End of line or file, so we exit the loop. */
537 break;
539 if (nrules == 0)
541 /* See whether we have enough room in the array. */
542 if (cnt == max)
544 max += 10;
545 rules = (enum coll_sort_rule *) xrealloc (rules,
547 * sizeof (*rules));
548 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
551 else
553 if (cnt == nrules)
555 /* There must not be any more rule. */
556 if (! warned)
558 lr_error (ldfile, _("\
559 %s: too many rules; first entry only had %d"),
560 "LC_COLLATE", nrules);
561 warned = 1;
564 lr_ignore_rest (ldfile, 0);
565 break;
569 else
571 if (! warned)
573 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
574 warned = 1;
578 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
581 if (nrules == 0)
583 /* Now we know how many rules we have. */
584 nrules = cnt;
585 rules = (enum coll_sort_rule *) xrealloc (rules,
586 nrules * sizeof (*rules));
588 else
590 if (cnt < nrules)
592 /* Not enough rules in this specification. */
593 if (! warned)
594 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
597 rules[cnt] = sort_forward;
598 while (++cnt < nrules);
602 collate->current_section->rules = rules;
606 static struct element_t *
607 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
608 const char *str, size_t len)
610 void *result = NULL;
612 /* Search for the entries among the collation sequences already define. */
613 if (find_entry (&collate->seq_table, str, len, &result) != 0)
615 /* Nope, not define yet. So we see whether it is a
616 collation symbol. */
617 void *ptr;
619 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
621 /* It's a collation symbol. */
622 struct symbol_t *sym = (struct symbol_t *) ptr;
623 result = sym->order;
625 if (result == NULL)
626 result = sym->order = new_element (collate, NULL, 0, NULL,
627 NULL, 0, 0);
629 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
631 /* It's also no collation element. So it is a character
632 element defined later. */
633 result = new_element (collate, NULL, 0, NULL, str, len, 1);
634 /* Insert it into the sequence table. */
635 insert_entry (&collate->seq_table, str, len, result);
639 return (struct element_t *) result;
643 static void
644 unlink_element (struct locale_collate_t *collate)
646 if (collate->cursor == collate->start)
648 assert (collate->cursor->next == NULL);
649 assert (collate->cursor->last == NULL);
650 collate->cursor = NULL;
652 else
654 if (collate->cursor->next != NULL)
655 collate->cursor->next->last = collate->cursor->last;
656 if (collate->cursor->last != NULL)
657 collate->cursor->last->next = collate->cursor->next;
658 collate->cursor = collate->cursor->last;
663 static void
664 insert_weights (struct linereader *ldfile, struct element_t *elem,
665 const struct charmap_t *charmap,
666 struct repertoire_t *repertoire, struct localedef_t *result,
667 enum token_t ellipsis)
669 int weight_cnt;
670 struct token *arg;
671 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
673 /* Initialize all the fields. */
674 elem->file = ldfile->fname;
675 elem->line = ldfile->lineno;
677 elem->last = collate->cursor;
678 elem->next = collate->cursor ? collate->cursor->next : NULL;
679 if (collate->cursor != NULL && collate->cursor->next != NULL)
680 collate->cursor->next->last = elem;
681 if (collate->cursor != NULL)
682 collate->cursor->next = elem;
683 if (collate->start == NULL)
685 assert (collate->cursor == NULL);
686 collate->start = elem;
689 elem->section = collate->current_section;
691 if (collate->current_section->first == NULL)
692 collate->current_section->first = elem;
693 if (collate->current_section->last == collate->cursor)
694 collate->current_section->last = elem;
696 collate->cursor = elem;
698 elem->weights = (struct element_list_t *)
699 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
700 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
702 weight_cnt = 0;
704 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
707 if (arg->tok == tok_eof || arg->tok == tok_eol)
708 break;
710 if (arg->tok == tok_ignore)
712 /* The weight for this level has to be ignored. We use the
713 null pointer to indicate this. */
714 elem->weights[weight_cnt].w = (struct element_t **)
715 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
716 elem->weights[weight_cnt].w[0] = NULL;
717 elem->weights[weight_cnt].cnt = 1;
719 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
721 char ucs4str[10];
722 struct element_t *val;
723 char *symstr;
724 size_t symlen;
726 if (arg->tok == tok_bsymbol)
728 symstr = arg->val.str.startmb;
729 symlen = arg->val.str.lenmb;
731 else
733 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
734 symstr = ucs4str;
735 symlen = 9;
738 val = find_element (ldfile, collate, symstr, symlen);
739 if (val == NULL)
740 break;
742 elem->weights[weight_cnt].w = (struct element_t **)
743 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
744 elem->weights[weight_cnt].w[0] = val;
745 elem->weights[weight_cnt].cnt = 1;
747 else if (arg->tok == tok_string)
749 /* Split the string up in the individual characters and put
750 the element definitions in the list. */
751 const char *cp = arg->val.str.startmb;
752 int cnt = 0;
753 struct element_t *charelem;
754 struct element_t **weights = NULL;
755 int max = 0;
757 if (*cp == '\0')
759 lr_error (ldfile, _("%s: empty weight string not allowed"),
760 "LC_COLLATE");
761 lr_ignore_rest (ldfile, 0);
762 break;
767 if (*cp == '<')
769 /* Ahh, it's a bsymbol or an UCS4 value. If it's
770 the latter we have to unify the name. */
771 const char *startp = ++cp;
772 size_t len;
774 while (*cp != '>')
776 if (*cp == ldfile->escape_char)
777 ++cp;
778 if (*cp == '\0')
779 /* It's a syntax error. */
780 goto syntax;
782 ++cp;
785 if (cp - startp == 5 && startp[0] == 'U'
786 && isxdigit (startp[1]) && isxdigit (startp[2])
787 && isxdigit (startp[3]) && isxdigit (startp[4]))
789 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
790 char *newstr;
792 newstr = (char *) xmalloc (10);
793 snprintf (newstr, 10, "U%08X", ucs4);
794 startp = newstr;
796 len = 9;
798 else
799 len = cp - startp;
801 charelem = find_element (ldfile, collate, startp, len);
802 ++cp;
804 else
806 /* People really shouldn't use characters directly in
807 the string. Especially since it's not really clear
808 what this means. We interpret all characters in the
809 string as if that would be bsymbols. Otherwise we
810 would have to match back to bsymbols somehow and this
811 is normally not what people normally expect. */
812 charelem = find_element (ldfile, collate, cp++, 1);
815 if (charelem == NULL)
817 /* We ignore the rest of the line. */
818 lr_ignore_rest (ldfile, 0);
819 break;
822 /* Add the pointer. */
823 if (cnt >= max)
825 struct element_t **newp;
826 max += 10;
827 newp = (struct element_t **)
828 alloca (max * sizeof (struct element_t *));
829 memcpy (newp, weights, cnt * sizeof (struct element_t *));
830 weights = newp;
832 weights[cnt++] = charelem;
834 while (*cp != '\0');
836 /* Now store the information. */
837 elem->weights[weight_cnt].w = (struct element_t **)
838 obstack_alloc (&collate->mempool,
839 cnt * sizeof (struct element_t *));
840 memcpy (elem->weights[weight_cnt].w, weights,
841 cnt * sizeof (struct element_t *));
842 elem->weights[weight_cnt].cnt = cnt;
844 /* We don't need the string anymore. */
845 free (arg->val.str.startmb);
847 else if (ellipsis != tok_none
848 && (arg->tok == tok_ellipsis2
849 || arg->tok == tok_ellipsis3
850 || arg->tok == tok_ellipsis4))
852 /* It must be the same ellipsis as used in the initial column. */
853 if (arg->tok != ellipsis)
854 lr_error (ldfile, _("\
855 %s: weights must use the same ellipsis symbol as the name"),
856 "LC_COLLATE");
858 /* The weight for this level will depend on the element
859 iterating over the range. Put a placeholder. */
860 elem->weights[weight_cnt].w = (struct element_t **)
861 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
862 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
863 elem->weights[weight_cnt].cnt = 1;
865 else
867 syntax:
868 /* It's a syntax error. */
869 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
870 lr_ignore_rest (ldfile, 0);
871 break;
874 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
875 /* This better should be the end of the line or a semicolon. */
876 if (arg->tok == tok_semicolon)
877 /* OK, ignore this and read the next token. */
878 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
879 else if (arg->tok != tok_eof && arg->tok != tok_eol)
881 /* It's a syntax error. */
882 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
883 lr_ignore_rest (ldfile, 0);
884 break;
887 while (++weight_cnt < nrules);
889 if (weight_cnt < nrules)
891 /* This means the rest of the line uses the current element as
892 the weight. */
895 elem->weights[weight_cnt].w = (struct element_t **)
896 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
897 if (ellipsis == tok_none)
898 elem->weights[weight_cnt].w[0] = elem;
899 else
900 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
901 elem->weights[weight_cnt].cnt = 1;
903 while (++weight_cnt < nrules);
905 else
907 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
909 /* Too many rule values. */
910 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
911 lr_ignore_rest (ldfile, 0);
913 else
914 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
919 static int
920 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
921 const struct charmap_t *charmap, struct repertoire_t *repertoire,
922 struct localedef_t *result)
924 /* First find out what kind of symbol this is. */
925 struct charseq *seq;
926 uint32_t wc;
927 struct element_t *elem = NULL;
928 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
930 /* Try to find the character in the charmap. */
931 seq = charmap_find_value (charmap, symstr, symlen);
933 /* Determine the wide character. */
934 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
936 wc = repertoire_find_value (repertoire, symstr, symlen);
937 if (seq != NULL)
938 seq->ucs4 = wc;
940 else
941 wc = seq->ucs4;
943 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
945 /* It's no character, so look through the collation elements and
946 symbol list. */
947 void *ptr = elem;
948 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
950 void *result;
951 struct symbol_t *sym = NULL;
953 /* It's also collation element. Therefore it's either a
954 collating symbol or it's a character which is not
955 supported by the character set. In the later case we
956 simply create a dummy entry. */
957 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
959 /* It's a collation symbol. */
960 sym = (struct symbol_t *) result;
962 elem = sym->order;
965 if (elem == NULL)
967 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
969 if (sym != NULL)
970 sym->order = elem;
971 else
972 /* Enter a fake element in the sequence table. This
973 won't cause anything in the output since there is
974 no multibyte or wide character associated with
975 it. */
976 insert_entry (&collate->seq_table, symstr, symlen, elem);
979 else
980 /* Copy the result back. */
981 elem = ptr;
983 else
985 /* Otherwise the symbols stands for a character. */
986 void *ptr = elem;
987 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
989 uint32_t wcs[2] = { wc, 0 };
991 /* We have to allocate an entry. */
992 elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
993 seq != NULL ? seq->nbytes : 0,
994 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
995 symstr, symlen, 1);
997 /* And add it to the table. */
998 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
999 /* This cannot happen. */
1000 assert (! "Internal error");
1002 else
1004 /* Copy the result back. */
1005 elem = ptr;
1007 /* Maybe the character was used before the definition. In this case
1008 we have to insert the byte sequences now. */
1009 if (elem->mbs == NULL && seq != NULL)
1011 elem->mbs = obstack_copy0 (&collate->mempool,
1012 seq->bytes, seq->nbytes);
1013 elem->nmbs = seq->nbytes;
1016 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1018 uint32_t wcs[2] = { wc, 0 };
1020 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1021 elem->nwcs = 1;
1026 /* Test whether this element is not already in the list. */
1027 if (elem->next != NULL || elem == collate->cursor)
1029 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1030 (int) symlen, symstr, elem->file, elem->line);
1031 lr_ignore_rest (ldfile, 0);
1032 return 1;
1035 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1037 return 0;
1041 static void
1042 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1043 enum token_t ellipsis, const struct charmap_t *charmap,
1044 struct repertoire_t *repertoire,
1045 struct localedef_t *result)
1047 struct element_t *startp;
1048 struct element_t *endp;
1049 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1051 /* Unlink the entry added for the ellipsis. */
1052 unlink_element (collate);
1053 startp = collate->cursor;
1055 /* Process and add the end-entry. */
1056 if (symstr != NULL
1057 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1058 /* Something went wrong with inserting the to-value. This means
1059 we cannot process the ellipsis. */
1060 return;
1062 /* Reset the cursor. */
1063 collate->cursor = startp;
1065 /* Now we have to handle many different situations:
1066 - we have to distinguish between the three different ellipsis forms
1067 - the is the ellipsis at the beginning, in the middle, or at the end.
1069 endp = collate->cursor->next;
1070 assert (symstr == NULL || endp != NULL);
1072 /* XXX The following is probably very wrong since also collating symbols
1073 can appear in ranges. But do we want/can refine the test for that? */
1074 #if 0
1075 /* Both, the start and the end symbol, must stand for characters. */
1076 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1077 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1079 lr_error (ldfile, _("\
1080 %s: the start and the end symbol of a range must stand for characters"),
1081 "LC_COLLATE");
1082 return;
1084 #endif
1086 if (ellipsis == tok_ellipsis3)
1088 /* One requirement we make here: the length of the byte
1089 sequences for the first and end character must be the same.
1090 This is mainly to prevent unwanted effects and this is often
1091 not what is wanted. */
1092 size_t len = (startp->mbs != NULL ? startp->nmbs
1093 : (endp->mbs != NULL ? endp->nmbs : 0));
1094 char mbcnt[len + 1];
1095 char mbend[len + 1];
1097 /* Well, this should be caught somewhere else already. Just to
1098 make sure. */
1099 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1100 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1102 if (startp != NULL && endp != NULL
1103 && startp->mbs != NULL && endp->mbs != NULL
1104 && startp->nmbs != endp->nmbs)
1106 lr_error (ldfile, _("\
1107 %s: byte sequences of first and last character must have the same length"),
1108 "LC_COLLATE");
1109 return;
1112 /* Determine whether we have to generate multibyte sequences. */
1113 if ((startp == NULL || startp->mbs != NULL)
1114 && (endp == NULL || endp->mbs != NULL))
1116 int cnt;
1117 int ret;
1119 /* Prepare the beginning byte sequence. This is either from the
1120 beginning byte sequence or it is all nulls if it was an
1121 initial ellipsis. */
1122 if (startp == NULL || startp->mbs == NULL)
1123 memset (mbcnt, '\0', len);
1124 else
1126 memcpy (mbcnt, startp->mbs, len);
1128 /* And increment it so that the value is the first one we will
1129 try to insert. */
1130 for (cnt = len - 1; cnt >= 0; --cnt)
1131 if (++mbcnt[cnt] != '\0')
1132 break;
1134 mbcnt[len] = '\0';
1136 /* And the end sequence. */
1137 if (endp == NULL || endp->mbs == NULL)
1138 memset (mbend, '\0', len);
1139 else
1140 memcpy (mbend, endp->mbs, len);
1141 mbend[len] = '\0';
1143 /* Test whether we have a correct range. */
1144 ret = memcmp (mbcnt, mbend, len);
1145 if (ret >= 0)
1147 if (ret > 0)
1148 lr_error (ldfile, _("%s: byte sequence of first character of \
1149 sequence is not lower than that of the last character"), "LC_COLLATE");
1150 return;
1153 /* Generate the byte sequences data. */
1154 while (1)
1156 struct charseq *seq;
1158 /* Quite a bit of work ahead. We have to find the character
1159 definition for the byte sequence and then determine the
1160 wide character belonging to it. */
1161 seq = charmap_find_symbol (charmap, mbcnt, len);
1162 if (seq != NULL)
1164 struct element_t *elem;
1165 size_t namelen;
1167 /* I don't this this can ever happen. */
1168 assert (seq->name != NULL);
1169 namelen = strlen (seq->name);
1171 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1172 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1173 namelen);
1175 /* Now we are ready to insert the new value in the
1176 sequence. Find out whether the element is
1177 already known. */
1178 void *ptr;
1179 if (find_entry (&collate->seq_table, seq->name, namelen,
1180 &ptr) != 0)
1182 uint32_t wcs[2] = { seq->ucs4, 0 };
1184 /* We have to allocate an entry. */
1185 elem = new_element (collate, mbcnt, len,
1186 seq->ucs4 == ILLEGAL_CHAR_VALUE
1187 ? NULL : wcs, seq->name,
1188 namelen, 1);
1190 /* And add it to the table. */
1191 if (insert_entry (&collate->seq_table, seq->name,
1192 namelen, elem) != 0)
1193 /* This cannot happen. */
1194 assert (! "Internal error");
1196 else
1197 /* Copy the result. */
1198 elem = ptr;
1200 /* Test whether this element is not already in the list. */
1201 if (elem->next != NULL || (collate->cursor != NULL
1202 && elem->next == collate->cursor))
1204 lr_error (ldfile, _("\
1205 order for `%.*s' already defined at %s:%Zu"),
1206 (int) namelen, seq->name,
1207 elem->file, elem->line);
1208 goto increment;
1211 /* Enqueue the new element. */
1212 elem->last = collate->cursor;
1213 if (collate->cursor == NULL)
1214 elem->next = NULL;
1215 else
1217 elem->next = collate->cursor->next;
1218 elem->last->next = elem;
1219 if (elem->next != NULL)
1220 elem->next->last = elem;
1222 if (collate->start == NULL)
1224 assert (collate->cursor == NULL);
1225 collate->start = elem;
1227 collate->cursor = elem;
1229 /* Add the weight value. We take them from the
1230 `ellipsis_weights' member of `collate'. */
1231 elem->weights = (struct element_list_t *)
1232 obstack_alloc (&collate->mempool,
1233 nrules * sizeof (struct element_list_t));
1234 for (cnt = 0; cnt < nrules; ++cnt)
1235 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1236 && (collate->ellipsis_weight.weights[cnt].w[0]
1237 == ELEMENT_ELLIPSIS2))
1239 elem->weights[cnt].w = (struct element_t **)
1240 obstack_alloc (&collate->mempool,
1241 sizeof (struct element_t *));
1242 elem->weights[cnt].w[0] = elem;
1243 elem->weights[cnt].cnt = 1;
1245 else
1247 /* Simply use the weight from `ellipsis_weight'. */
1248 elem->weights[cnt].w =
1249 collate->ellipsis_weight.weights[cnt].w;
1250 elem->weights[cnt].cnt =
1251 collate->ellipsis_weight.weights[cnt].cnt;
1255 /* Increment for the next round. */
1256 increment:
1257 for (cnt = len - 1; cnt >= 0; --cnt)
1258 if (++mbcnt[cnt] != '\0')
1259 break;
1261 /* Find out whether this was all. */
1262 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1263 /* Yep, that's all. */
1264 break;
1268 else
1270 /* For symbolic range we naturally must have a beginning and an
1271 end specified by the user. */
1272 if (startp == NULL)
1273 lr_error (ldfile, _("\
1274 %s: symbolic range ellipsis must not directly follow `order_start'"),
1275 "LC_COLLATE");
1276 else if (endp == NULL)
1277 lr_error (ldfile, _("\
1278 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1279 "LC_COLLATE");
1280 else
1282 /* Determine the range. To do so we have to determine the
1283 common prefix of both names and then the numeric
1284 values of both ends. */
1285 size_t lenfrom = strlen (startp->name);
1286 size_t lento = strlen (endp->name);
1287 char buf[lento + 1];
1288 int preflen = 0;
1289 long int from;
1290 long int to;
1291 char *cp;
1292 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1294 if (lenfrom != lento)
1296 invalid_range:
1297 lr_error (ldfile, _("\
1298 `%s' and `%.*s' are not valid names for symbolic range"),
1299 startp->name, (int) lento, endp->name);
1300 return;
1303 while (startp->name[preflen] == endp->name[preflen])
1304 if (startp->name[preflen] == '\0')
1305 /* Nothing to be done. The start and end point are identical
1306 and while inserting the end point we have already given
1307 the user an error message. */
1308 return;
1309 else
1310 ++preflen;
1312 errno = 0;
1313 from = strtol (startp->name + preflen, &cp, base);
1314 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1315 goto invalid_range;
1317 errno = 0;
1318 to = strtol (endp->name + preflen, &cp, base);
1319 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1320 goto invalid_range;
1322 /* Copy the prefix. */
1323 memcpy (buf, startp->name, preflen);
1325 /* Loop over all values. */
1326 for (++from; from < to; ++from)
1328 struct element_t *elem = NULL;
1329 struct charseq *seq;
1330 uint32_t wc;
1331 int cnt;
1333 /* Generate the name. */
1334 sprintf (buf + preflen, base == 10 ? "%ld" : "%lX", from);
1336 /* Look whether this name is already defined. */
1337 void *ptr;
1338 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1340 /* Copy back the result. */
1341 elem = ptr;
1343 if (elem->next != NULL || (collate->cursor != NULL
1344 && elem->next == collate->cursor))
1346 lr_error (ldfile, _("\
1347 %s: order for `%.*s' already defined at %s:%Zu"),
1348 "LC_COLLATE", (int) lenfrom, buf,
1349 elem->file, elem->line);
1350 continue;
1353 if (elem->name == NULL)
1355 lr_error (ldfile, _("%s: `%s' must be a character"),
1356 "LC_COLLATE", buf);
1357 continue;
1361 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1363 /* Search for a character of this name. */
1364 seq = charmap_find_value (charmap, buf, lenfrom);
1365 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1367 wc = repertoire_find_value (repertoire, buf, lenfrom);
1369 if (seq != NULL)
1370 seq->ucs4 = wc;
1372 else
1373 wc = seq->ucs4;
1375 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1376 /* We don't know anything about a character with this
1377 name. XXX Should we warn? */
1378 continue;
1380 if (elem == NULL)
1382 uint32_t wcs[2] = { wc, 0 };
1384 /* We have to allocate an entry. */
1385 elem = new_element (collate,
1386 seq != NULL ? seq->bytes : NULL,
1387 seq != NULL ? seq->nbytes : 0,
1388 wc == ILLEGAL_CHAR_VALUE
1389 ? NULL : wcs, buf, lenfrom, 1);
1391 else
1393 /* Update the element. */
1394 if (seq != NULL)
1396 elem->mbs = obstack_copy0 (&collate->mempool,
1397 seq->bytes, seq->nbytes);
1398 elem->nmbs = seq->nbytes;
1401 if (wc != ILLEGAL_CHAR_VALUE)
1403 uint32_t zero = 0;
1405 obstack_grow (&collate->mempool,
1406 &wc, sizeof (uint32_t));
1407 obstack_grow (&collate->mempool,
1408 &zero, sizeof (uint32_t));
1409 elem->wcs = obstack_finish (&collate->mempool);
1410 elem->nwcs = 1;
1414 elem->file = ldfile->fname;
1415 elem->line = ldfile->lineno;
1416 elem->section = collate->current_section;
1419 /* Enqueue the new element. */
1420 elem->last = collate->cursor;
1421 elem->next = collate->cursor->next;
1422 elem->last->next = elem;
1423 if (elem->next != NULL)
1424 elem->next->last = elem;
1425 collate->cursor = elem;
1427 /* Now add the weights. They come from the `ellipsis_weights'
1428 member of `collate'. */
1429 elem->weights = (struct element_list_t *)
1430 obstack_alloc (&collate->mempool,
1431 nrules * sizeof (struct element_list_t));
1432 for (cnt = 0; cnt < nrules; ++cnt)
1433 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1434 && (collate->ellipsis_weight.weights[cnt].w[0]
1435 == ELEMENT_ELLIPSIS2))
1437 elem->weights[cnt].w = (struct element_t **)
1438 obstack_alloc (&collate->mempool,
1439 sizeof (struct element_t *));
1440 elem->weights[cnt].w[0] = elem;
1441 elem->weights[cnt].cnt = 1;
1443 else
1445 /* Simply use the weight from `ellipsis_weight'. */
1446 elem->weights[cnt].w =
1447 collate->ellipsis_weight.weights[cnt].w;
1448 elem->weights[cnt].cnt =
1449 collate->ellipsis_weight.weights[cnt].cnt;
1457 static void
1458 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1459 struct localedef_t *copy_locale, int ignore_content)
1461 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1463 struct locale_collate_t *collate;
1465 if (copy_locale == NULL)
1467 collate = locale->categories[LC_COLLATE].collate =
1468 (struct locale_collate_t *)
1469 xcalloc (1, sizeof (struct locale_collate_t));
1471 /* Init the various data structures. */
1472 init_hash (&collate->elem_table, 100);
1473 init_hash (&collate->sym_table, 100);
1474 init_hash (&collate->seq_table, 500);
1475 obstack_init (&collate->mempool);
1477 collate->col_weight_max = -1;
1479 else
1480 /* Reuse the copy_locale's data structures. */
1481 collate = locale->categories[LC_COLLATE].collate =
1482 copy_locale->categories[LC_COLLATE].collate;
1485 ldfile->translate_strings = 0;
1486 ldfile->return_widestr = 0;
1490 void
1491 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1493 /* Now is the time when we can assign the individual collation
1494 values for all the symbols. We have possibly different values
1495 for the wide- and the multibyte-character symbols. This is done
1496 since it might make a difference in the encoding if there is in
1497 some cases no multibyte-character but there are wide-characters.
1498 (The other way around it is not important since theencoded
1499 collation value in the wide-character case is 32 bits wide and
1500 therefore requires no encoding).
1502 The lowest collation value assigned is 2. Zero is reserved for
1503 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1504 functions and 1 is used to separate the individual passes for the
1505 different rules.
1507 We also have to construct is list with all the bytes/words which
1508 can come first in a sequence, followed by all the elements which
1509 also start with this byte/word. The order is reverse which has
1510 among others the important effect that longer strings are located
1511 first in the list. This is required for the output data since
1512 the algorithm used in `strcoll' etc depends on this.
1514 The multibyte case is easy. We simply sort into an array with
1515 256 elements. */
1516 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1517 int mbact[nrules];
1518 int wcact;
1519 int mbseqact;
1520 int wcseqact;
1521 struct element_t *runp;
1522 int i;
1523 int need_undefined = 0;
1524 struct section_list *sect;
1525 int ruleidx;
1526 int nr_wide_elems = 0;
1528 if (collate == NULL)
1530 /* No data, no check. */
1531 if (! be_quiet)
1532 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1533 "LC_COLLATE"));
1534 return;
1537 /* If this assertion is hit change the type in `element_t'. */
1538 assert (nrules <= sizeof (runp->used_in_level) * 8);
1540 /* Make sure that the `position' rule is used either in all sections
1541 or in none. */
1542 for (i = 0; i < nrules; ++i)
1543 for (sect = collate->sections; sect != NULL; sect = sect->next)
1544 if (sect->rules != NULL
1545 && ((sect->rules[i] & sort_position)
1546 != (collate->sections->rules[i] & sort_position)))
1548 WITH_CUR_LOCALE (error (0, 0, _("\
1549 %s: `position' must be used for a specific level in all sections or none"),
1550 "LC_COLLATE"));
1551 break;
1554 /* Find out which elements are used at which level. At the same
1555 time we find out whether we have any undefined symbols. */
1556 runp = collate->start;
1557 while (runp != NULL)
1559 if (runp->mbs != NULL)
1561 for (i = 0; i < nrules; ++i)
1563 int j;
1565 for (j = 0; j < runp->weights[i].cnt; ++j)
1566 /* A NULL pointer as the weight means IGNORE. */
1567 if (runp->weights[i].w[j] != NULL)
1569 if (runp->weights[i].w[j]->weights == NULL)
1571 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1572 runp->line,
1573 _("symbol `%s' not defined"),
1574 runp->weights[i].w[j]->name));
1576 need_undefined = 1;
1577 runp->weights[i].w[j] = &collate->undefined;
1579 else
1580 /* Set the bit for the level. */
1581 runp->weights[i].w[j]->used_in_level |= 1 << i;
1586 /* Up to the next entry. */
1587 runp = runp->next;
1590 /* Walk through the list of defined sequences and assign weights. Also
1591 create the data structure which will allow generating the single byte
1592 character based tables.
1594 Since at each time only the weights for each of the rules are
1595 only compared to other weights for this rule it is possible to
1596 assign more compact weight values than simply counting all
1597 weights in sequence. We can assign weights from 3, one for each
1598 rule individually and only for those elements, which are actually
1599 used for this rule.
1601 Why is this important? It is not for the wide char table. But
1602 it is for the singlebyte output since here larger numbers have to
1603 be encoded to make it possible to emit the value as a byte
1604 string. */
1605 for (i = 0; i < nrules; ++i)
1606 mbact[i] = 2;
1607 wcact = 2;
1608 mbseqact = 0;
1609 wcseqact = 0;
1610 runp = collate->start;
1611 while (runp != NULL)
1613 /* Determine the order. */
1614 if (runp->used_in_level != 0)
1616 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1617 nrules * sizeof (int));
1619 for (i = 0; i < nrules; ++i)
1620 if ((runp->used_in_level & (1 << i)) != 0)
1621 runp->mborder[i] = mbact[i]++;
1622 else
1623 runp->mborder[i] = 0;
1626 if (runp->mbs != NULL)
1628 struct element_t **eptr;
1629 struct element_t *lastp = NULL;
1631 /* Find the point where to insert in the list. */
1632 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1633 while (*eptr != NULL)
1635 if ((*eptr)->nmbs < runp->nmbs)
1636 break;
1638 if ((*eptr)->nmbs == runp->nmbs)
1640 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1642 if (c == 0)
1644 /* This should not happen. It means that we have
1645 to symbols with the same byte sequence. It is
1646 of course an error. */
1647 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1648 (*eptr)->line,
1649 _("\
1650 symbol `%s' has the same encoding as"), (*eptr)->name);
1651 error_at_line (0, 0, runp->file,
1652 runp->line,
1653 _("symbol `%s'"),
1654 runp->name));
1655 goto dont_insert;
1657 else if (c < 0)
1658 /* Insert it here. */
1659 break;
1662 /* To the next entry. */
1663 lastp = *eptr;
1664 eptr = &(*eptr)->mbnext;
1667 /* Set the pointers. */
1668 runp->mbnext = *eptr;
1669 runp->mblast = lastp;
1670 if (*eptr != NULL)
1671 (*eptr)->mblast = runp;
1672 *eptr = runp;
1673 dont_insert:
1677 if (runp->used_in_level)
1679 runp->wcorder = wcact++;
1681 /* We take the opportunity to count the elements which have
1682 wide characters. */
1683 ++nr_wide_elems;
1686 if (runp->is_character)
1688 if (runp->nmbs == 1)
1689 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1691 runp->wcseqorder = wcseqact++;
1693 else if (runp->mbs != NULL && runp->weights != NULL)
1694 /* This is for collation elements. */
1695 runp->wcseqorder = wcseqact++;
1697 /* Up to the next entry. */
1698 runp = runp->next;
1701 /* Find out whether any of the `mbheads' entries is unset. In this
1702 case we use the UNDEFINED entry. */
1703 for (i = 1; i < 256; ++i)
1704 if (collate->mbheads[i] == NULL)
1706 need_undefined = 1;
1707 collate->mbheads[i] = &collate->undefined;
1710 /* Now to the wide character case. */
1711 collate->wcheads.p = 6;
1712 collate->wcheads.q = 10;
1713 wchead_table_init (&collate->wcheads);
1715 collate->wcseqorder.p = 6;
1716 collate->wcseqorder.q = 10;
1717 collseq_table_init (&collate->wcseqorder);
1719 /* Start adding. */
1720 runp = collate->start;
1721 while (runp != NULL)
1723 if (runp->wcs != NULL)
1725 struct element_t *e;
1726 struct element_t **eptr;
1727 struct element_t *lastp;
1729 /* Insert the collation sequence value. */
1730 if (runp->is_character)
1731 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1732 runp->wcseqorder);
1734 /* Find the point where to insert in the list. */
1735 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1736 eptr = &e;
1737 lastp = NULL;
1738 while (*eptr != NULL)
1740 if ((*eptr)->nwcs < runp->nwcs)
1741 break;
1743 if ((*eptr)->nwcs == runp->nwcs)
1745 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1746 (wchar_t *) runp->wcs, runp->nwcs);
1748 if (c == 0)
1750 /* This should not happen. It means that we have
1751 two symbols with the same byte sequence. It is
1752 of course an error. */
1753 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1754 (*eptr)->line,
1755 _("\
1756 symbol `%s' has the same encoding as"), (*eptr)->name);
1757 error_at_line (0, 0, runp->file,
1758 runp->line,
1759 _("symbol `%s'"),
1760 runp->name));
1761 goto dont_insertwc;
1763 else if (c < 0)
1764 /* Insert it here. */
1765 break;
1768 /* To the next entry. */
1769 lastp = *eptr;
1770 eptr = &(*eptr)->wcnext;
1773 /* Set the pointers. */
1774 runp->wcnext = *eptr;
1775 runp->wclast = lastp;
1776 if (*eptr != NULL)
1777 (*eptr)->wclast = runp;
1778 *eptr = runp;
1779 if (eptr == &e)
1780 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1781 dont_insertwc:
1785 /* Up to the next entry. */
1786 runp = runp->next;
1789 collseq_table_finalize (&collate->wcseqorder);
1791 /* Now determine whether the UNDEFINED entry is needed and if yes,
1792 whether it was defined. */
1793 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1794 if (collate->undefined.file == NULL)
1796 if (need_undefined)
1798 /* This seems not to be enforced by recent standards. Don't
1799 emit an error, simply append UNDEFINED at the end. */
1800 if (0)
1801 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1803 /* Add UNDEFINED at the end. */
1804 collate->undefined.mborder =
1805 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1807 for (i = 0; i < nrules; ++i)
1808 collate->undefined.mborder[i] = mbact[i]++;
1811 /* In any case we will need the definition for the wide character
1812 case. But we will not complain that it is missing since the
1813 specification strangely enough does not seem to account for
1814 this. */
1815 collate->undefined.wcorder = wcact++;
1818 /* Finally, try to unify the rules for the sections. Whenever the rules
1819 for a section are the same as those for another section give the
1820 ruleset the same index. Since there are never many section we can
1821 use an O(n^2) algorithm here. */
1822 sect = collate->sections;
1823 while (sect != NULL && sect->rules == NULL)
1824 sect = sect->next;
1826 /* Bail out if we have no sections because of earlier errors. */
1827 if (sect == NULL)
1829 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1830 _("too many errors; giving up")));
1831 return;
1834 ruleidx = 0;
1837 struct section_list *osect = collate->sections;
1839 while (osect != sect)
1840 if (osect->rules != NULL
1841 && memcmp (osect->rules, sect->rules, nrules) == 0)
1842 break;
1843 else
1844 osect = osect->next;
1846 if (osect == sect)
1847 sect->ruleidx = ruleidx++;
1848 else
1849 sect->ruleidx = osect->ruleidx;
1851 /* Next section. */
1853 sect = sect->next;
1854 while (sect != NULL && sect->rules == NULL);
1856 while (sect != NULL);
1857 /* We are currently not prepared for more than 128 rulesets. But this
1858 should never really be a problem. */
1859 assert (ruleidx <= 128);
1863 static int32_t
1864 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1865 struct element_t *elem)
1867 size_t cnt;
1868 int32_t retval;
1870 /* Optimize the use of UNDEFINED. */
1871 if (elem == &collate->undefined)
1872 /* The weights are already inserted. */
1873 return 0;
1875 /* This byte can start exactly one collation element and this is
1876 a single byte. We can directly give the index to the weights. */
1877 retval = obstack_object_size (pool);
1879 /* Construct the weight. */
1880 for (cnt = 0; cnt < nrules; ++cnt)
1882 char buf[elem->weights[cnt].cnt * 7];
1883 int len = 0;
1884 int i;
1886 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1887 /* Encode the weight value. We do nothing for IGNORE entries. */
1888 if (elem->weights[cnt].w[i] != NULL)
1889 len += utf8_encode (&buf[len],
1890 elem->weights[cnt].w[i]->mborder[cnt]);
1892 /* And add the buffer content. */
1893 obstack_1grow (pool, len);
1894 obstack_grow (pool, buf, len);
1897 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1901 static int32_t
1902 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1903 struct element_t *elem)
1905 size_t cnt;
1906 int32_t retval;
1908 /* Optimize the use of UNDEFINED. */
1909 if (elem == &collate->undefined)
1910 /* The weights are already inserted. */
1911 return 0;
1913 /* This byte can start exactly one collation element and this is
1914 a single byte. We can directly give the index to the weights. */
1915 retval = obstack_object_size (pool) / sizeof (int32_t);
1917 /* Construct the weight. */
1918 for (cnt = 0; cnt < nrules; ++cnt)
1920 int32_t buf[elem->weights[cnt].cnt];
1921 int i;
1922 int32_t j;
1924 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1925 if (elem->weights[cnt].w[i] != NULL)
1926 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1928 /* And add the buffer content. */
1929 obstack_int32_grow (pool, j);
1931 obstack_grow (pool, buf, j * sizeof (int32_t));
1934 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1938 void
1939 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
1940 const char *output_path)
1942 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1943 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1944 struct iovec iov[2 + nelems];
1945 struct locale_file data;
1946 uint32_t idx[nelems];
1947 size_t cnt;
1948 size_t ch;
1949 int32_t tablemb[256];
1950 struct obstack weightpool;
1951 struct obstack extrapool;
1952 struct obstack indirectpool;
1953 struct section_list *sect;
1954 struct collidx_table tablewc;
1955 uint32_t elem_size;
1956 uint32_t *elem_table;
1957 int i;
1958 struct element_t *runp;
1960 data.magic = LIMAGIC (LC_COLLATE);
1961 data.n = nelems;
1962 iov[0].iov_base = (void *) &data;
1963 iov[0].iov_len = sizeof (data);
1965 iov[1].iov_base = (void *) idx;
1966 iov[1].iov_len = sizeof (idx);
1968 idx[0] = iov[0].iov_len + iov[1].iov_len;
1969 cnt = 0;
1971 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1972 iov[2 + cnt].iov_base = &nrules;
1973 iov[2 + cnt].iov_len = sizeof (uint32_t);
1974 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1975 ++cnt;
1977 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
1978 if (collate == NULL)
1980 int32_t dummy = 0;
1982 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1984 /* The words have to be handled specially. */
1985 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
1987 iov[2 + cnt].iov_base = &dummy;
1988 iov[2 + cnt].iov_len = sizeof (int32_t);
1990 else
1992 iov[2 + cnt].iov_base = NULL;
1993 iov[2 + cnt].iov_len = 0;
1996 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1997 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1998 ++cnt;
2001 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2003 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2005 return;
2008 obstack_init (&weightpool);
2009 obstack_init (&extrapool);
2010 obstack_init (&indirectpool);
2012 /* Since we are using the sign of an integer to mark indirection the
2013 offsets in the arrays we are indirectly referring to must not be
2014 zero since -0 == 0. Therefore we add a bit of dummy content. */
2015 obstack_int32_grow (&extrapool, 0);
2016 obstack_int32_grow (&indirectpool, 0);
2018 /* Prepare the ruleset table. */
2019 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2020 if (sect->rules != NULL && sect->ruleidx == i)
2022 int j;
2024 obstack_make_room (&weightpool, nrules);
2026 for (j = 0; j < nrules; ++j)
2027 obstack_1grow_fast (&weightpool, sect->rules[j]);
2028 ++i;
2030 /* And align the output. */
2031 i = (nrules * i) % __alignof__ (int32_t);
2032 if (i > 0)
2034 obstack_1grow (&weightpool, '\0');
2035 while (++i < __alignof__ (int32_t));
2037 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2038 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2039 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2040 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2041 ++cnt;
2043 /* Generate the 8-bit table. Walk through the lists of sequences
2044 starting with the same byte and add them one after the other to
2045 the table. In case we have more than one sequence starting with
2046 the same byte we have to use extra indirection.
2048 First add a record for the NUL byte. This entry will never be used
2049 so it does not matter. */
2050 tablemb[0] = 0;
2052 /* Now insert the `UNDEFINED' value if it is used. Since this value
2053 will probably be used more than once it is good to store the
2054 weights only once. */
2055 if (collate->undefined.used_in_level != 0)
2056 output_weight (&weightpool, collate, &collate->undefined);
2058 for (ch = 1; ch < 256; ++ch)
2059 if (collate->mbheads[ch]->mbnext == NULL
2060 && collate->mbheads[ch]->nmbs <= 1)
2062 tablemb[ch] = output_weight (&weightpool, collate,
2063 collate->mbheads[ch]);
2065 else
2067 /* The entries in the list are sorted by length and then
2068 alphabetically. This is the order in which we will add the
2069 elements to the collation table. This allows simply walking
2070 the table in sequence and stopping at the first matching
2071 entry. Since the longer sequences are coming first in the
2072 list they have the possibility to match first, just as it
2073 has to be. In the worst case we are walking to the end of
2074 the list where we put, if no singlebyte sequence is defined
2075 in the locale definition, the weights for UNDEFINED.
2077 To reduce the length of the search list we compress them a bit.
2078 This happens by collecting sequences of consecutive byte
2079 sequences in one entry (having a begin and an end byte sequence)
2080 and add only one index into the weight table. We can find the
2081 consecutive entries since they are also consecutive in the list. */
2082 struct element_t *runp = collate->mbheads[ch];
2083 struct element_t *lastp;
2085 assert ((obstack_object_size (&extrapool)
2086 & (__alignof__ (int32_t) - 1)) == 0);
2088 tablemb[ch] = -obstack_object_size (&extrapool);
2092 /* Store the current index in the weight table. We know that
2093 the current position in the `extrapool' is aligned on a
2094 32-bit address. */
2095 int32_t weightidx;
2096 int added;
2098 /* Find out whether this is a single entry or we have more than
2099 one consecutive entry. */
2100 if (runp->mbnext != NULL
2101 && runp->nmbs == runp->mbnext->nmbs
2102 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2103 && (runp->mbs[runp->nmbs - 1]
2104 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2106 int i;
2107 struct element_t *series_startp = runp;
2108 struct element_t *curp;
2110 /* Compute how much space we will need. */
2111 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2112 + __alignof__ (int32_t) - 1)
2113 & ~(__alignof__ (int32_t) - 1));
2114 assert ((obstack_object_size (&extrapool)
2115 & (__alignof__ (int32_t) - 1)) == 0);
2116 obstack_make_room (&extrapool, added);
2118 /* More than one consecutive entry. We mark this by having
2119 a negative index into the indirect table. */
2120 obstack_int32_grow_fast (&extrapool,
2121 -(obstack_object_size (&indirectpool)
2122 / sizeof (int32_t)));
2124 /* Now search first the end of the series. */
2126 runp = runp->mbnext;
2127 while (runp->mbnext != NULL
2128 && runp->nmbs == runp->mbnext->nmbs
2129 && memcmp (runp->mbs, runp->mbnext->mbs,
2130 runp->nmbs - 1) == 0
2131 && (runp->mbs[runp->nmbs - 1]
2132 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2134 /* Now walk backward from here to the beginning. */
2135 curp = runp;
2137 assert (runp->nmbs <= 256);
2138 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2139 for (i = 1; i < curp->nmbs; ++i)
2140 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2142 /* Now find the end of the consecutive sequence and
2143 add all the indices in the indirect pool. */
2146 weightidx = output_weight (&weightpool, collate, curp);
2147 obstack_int32_grow (&indirectpool, weightidx);
2149 curp = curp->mblast;
2151 while (curp != series_startp);
2153 /* Add the final weight. */
2154 weightidx = output_weight (&weightpool, collate, curp);
2155 obstack_int32_grow (&indirectpool, weightidx);
2157 /* And add the end byte sequence. Without length this
2158 time. */
2159 for (i = 1; i < curp->nmbs; ++i)
2160 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2162 else
2164 /* A single entry. Simply add the index and the length and
2165 string (except for the first character which is already
2166 tested for). */
2167 int i;
2169 /* Output the weight info. */
2170 weightidx = output_weight (&weightpool, collate, runp);
2172 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2173 + __alignof__ (int32_t) - 1)
2174 & ~(__alignof__ (int32_t) - 1));
2175 assert ((obstack_object_size (&extrapool)
2176 & (__alignof__ (int32_t) - 1)) == 0);
2177 obstack_make_room (&extrapool, added);
2179 obstack_int32_grow_fast (&extrapool, weightidx);
2180 assert (runp->nmbs <= 256);
2181 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2183 for (i = 1; i < runp->nmbs; ++i)
2184 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2187 /* Add alignment bytes if necessary. */
2188 while ((obstack_object_size (&extrapool)
2189 & (__alignof__ (int32_t) - 1)) != 0)
2190 obstack_1grow_fast (&extrapool, '\0');
2192 /* Next entry. */
2193 lastp = runp;
2194 runp = runp->mbnext;
2196 while (runp != NULL);
2198 assert ((obstack_object_size (&extrapool)
2199 & (__alignof__ (int32_t) - 1)) == 0);
2201 /* If the final entry in the list is not a single character we
2202 add an UNDEFINED entry here. */
2203 if (lastp->nmbs != 1)
2205 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2206 & ~(__alignof__ (int32_t) - 1));
2207 obstack_make_room (&extrapool, added);
2209 obstack_int32_grow_fast (&extrapool, 0);
2210 /* XXX What rule? We just pick the first. */
2211 obstack_1grow_fast (&extrapool, 0);
2212 /* Length is zero. */
2213 obstack_1grow_fast (&extrapool, 0);
2215 /* Add alignment bytes if necessary. */
2216 while ((obstack_object_size (&extrapool)
2217 & (__alignof__ (int32_t) - 1)) != 0)
2218 obstack_1grow_fast (&extrapool, '\0');
2222 /* Add padding to the tables if necessary. */
2223 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2224 != 0)
2225 obstack_1grow (&weightpool, 0);
2227 /* Now add the four tables. */
2228 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2229 iov[2 + cnt].iov_base = tablemb;
2230 iov[2 + cnt].iov_len = sizeof (tablemb);
2231 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2232 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2233 ++cnt;
2235 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2236 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2237 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2238 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2239 ++cnt;
2241 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2242 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2243 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2244 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2245 ++cnt;
2247 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2248 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2249 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2250 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2251 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2252 ++cnt;
2255 /* Now the same for the wide character table. We need to store some
2256 more information here. */
2257 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2258 iov[2 + cnt].iov_base = NULL;
2259 iov[2 + cnt].iov_len = 0;
2260 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2261 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2262 ++cnt;
2264 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2265 iov[2 + cnt].iov_base = NULL;
2266 iov[2 + cnt].iov_len = 0;
2267 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2268 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2269 ++cnt;
2271 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2272 iov[2 + cnt].iov_base = NULL;
2273 iov[2 + cnt].iov_len = 0;
2274 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2275 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2276 ++cnt;
2278 /* Since we are using the sign of an integer to mark indirection the
2279 offsets in the arrays we are indirectly referring to must not be
2280 zero since -0 == 0. Therefore we add a bit of dummy content. */
2281 obstack_int32_grow (&extrapool, 0);
2282 obstack_int32_grow (&indirectpool, 0);
2284 /* Now insert the `UNDEFINED' value if it is used. Since this value
2285 will probably be used more than once it is good to store the
2286 weights only once. */
2287 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2288 abort ();
2290 /* Generate the table. Walk through the lists of sequences starting
2291 with the same wide character and add them one after the other to
2292 the table. In case we have more than one sequence starting with
2293 the same byte we have to use extra indirection. */
2295 auto void add_to_tablewc (uint32_t ch, struct element_t *runp);
2297 void add_to_tablewc (uint32_t ch, struct element_t *runp)
2299 if (runp->wcnext == NULL && runp->nwcs == 1)
2301 int32_t weigthidx = output_weightwc (&weightpool, collate, runp);
2302 collidx_table_add (&tablewc, ch, weigthidx);
2304 else
2306 /* As for the singlebyte table, we recognize sequences and
2307 compress them. */
2308 struct element_t *lastp;
2310 collidx_table_add (&tablewc, ch,
2311 -(obstack_object_size (&extrapool) / sizeof (uint32_t)));
2315 /* Store the current index in the weight table. We know that
2316 the current position in the `extrapool' is aligned on a
2317 32-bit address. */
2318 int32_t weightidx;
2319 int added;
2321 /* Find out whether this is a single entry or we have more than
2322 one consecutive entry. */
2323 if (runp->wcnext != NULL
2324 && runp->nwcs == runp->wcnext->nwcs
2325 && wmemcmp ((wchar_t *) runp->wcs,
2326 (wchar_t *)runp->wcnext->wcs,
2327 runp->nwcs - 1) == 0
2328 && (runp->wcs[runp->nwcs - 1]
2329 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2331 int i;
2332 struct element_t *series_startp = runp;
2333 struct element_t *curp;
2335 /* Now add first the initial byte sequence. */
2336 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2337 if (sizeof (int32_t) == sizeof (int))
2338 obstack_make_room (&extrapool, added);
2340 /* More than one consecutive entry. We mark this by having
2341 a negative index into the indirect table. */
2342 obstack_int32_grow_fast (&extrapool,
2343 -(obstack_object_size (&indirectpool)
2344 / sizeof (int32_t)));
2345 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2348 runp = runp->wcnext;
2349 while (runp->wcnext != NULL
2350 && runp->nwcs == runp->wcnext->nwcs
2351 && wmemcmp ((wchar_t *) runp->wcs,
2352 (wchar_t *)runp->wcnext->wcs,
2353 runp->nwcs - 1) == 0
2354 && (runp->wcs[runp->nwcs - 1]
2355 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2357 /* Now walk backward from here to the beginning. */
2358 curp = runp;
2360 for (i = 1; i < runp->nwcs; ++i)
2361 obstack_int32_grow_fast (&extrapool, curp->wcs[i]);
2363 /* Now find the end of the consecutive sequence and
2364 add all the indeces in the indirect pool. */
2367 weightidx = output_weightwc (&weightpool, collate,
2368 curp);
2369 obstack_int32_grow (&indirectpool, weightidx);
2371 curp = curp->wclast;
2373 while (curp != series_startp);
2375 /* Add the final weight. */
2376 weightidx = output_weightwc (&weightpool, collate, curp);
2377 obstack_int32_grow (&indirectpool, weightidx);
2379 /* And add the end byte sequence. Without length this
2380 time. */
2381 for (i = 1; i < curp->nwcs; ++i)
2382 obstack_int32_grow (&extrapool, curp->wcs[i]);
2384 else
2386 /* A single entry. Simply add the index and the length and
2387 string (except for the first character which is already
2388 tested for). */
2389 int i;
2391 /* Output the weight info. */
2392 weightidx = output_weightwc (&weightpool, collate, runp);
2394 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2395 if (sizeof (int) == sizeof (int32_t))
2396 obstack_make_room (&extrapool, added);
2398 obstack_int32_grow_fast (&extrapool, weightidx);
2399 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2400 for (i = 1; i < runp->nwcs; ++i)
2401 obstack_int32_grow_fast (&extrapool, runp->wcs[i]);
2404 /* Next entry. */
2405 lastp = runp;
2406 runp = runp->wcnext;
2408 while (runp != NULL);
2412 tablewc.p = 6;
2413 tablewc.q = 10;
2414 collidx_table_init (&tablewc);
2416 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2418 collidx_table_finalize (&tablewc);
2421 /* Now add the four tables. */
2422 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2423 iov[2 + cnt].iov_base = tablewc.result;
2424 iov[2 + cnt].iov_len = tablewc.result_size;
2425 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2426 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2427 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2428 ++cnt;
2430 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2431 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2432 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2433 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2434 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2435 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2436 ++cnt;
2438 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2439 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2440 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2441 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2442 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2443 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2444 ++cnt;
2446 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2447 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2448 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2449 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2450 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2451 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2452 ++cnt;
2455 /* Finally write the table with collation element names out. It is
2456 a hash table with a simple function which gets the name of the
2457 character as the input. One character might have many names. The
2458 value associated with the name is an index into the weight table
2459 where we are then interested in the first-level weight value.
2461 To determine how large the table should be we are counting the
2462 elements we have to put in. Since we are using internal chaining
2463 using a secondary hash function we have to make the table a bit
2464 larger to avoid extremely long search times. We can achieve
2465 good results with a 40% larger table than there are entries. */
2466 elem_size = 0;
2467 runp = collate->start;
2468 while (runp != NULL)
2470 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2471 /* Yep, the element really counts. */
2472 ++elem_size;
2474 runp = runp->next;
2476 /* Add 40% and find the next prime number. */
2477 elem_size = next_prime (elem_size * 1.4);
2479 /* Allocate the table. Each entry consists of two words: the hash
2480 value and an index in a secondary table which provides the index
2481 into the weight table and the string itself (so that a match can
2482 be determined). */
2483 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2484 elem_size * 2 * sizeof (uint32_t));
2485 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2487 /* Now add the elements. */
2488 runp = collate->start;
2489 while (runp != NULL)
2491 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2493 /* Compute the hash value of the name. */
2494 uint32_t namelen = strlen (runp->name);
2495 uint32_t hash = elem_hash (runp->name, namelen);
2496 size_t idx = hash % elem_size;
2497 size_t start_idx = idx;
2499 if (elem_table[idx * 2] != 0)
2501 /* The spot is already taken. Try iterating using the value
2502 from the secondary hashing function. */
2503 size_t iter = hash % (elem_size - 2) + 1;
2507 idx += iter;
2508 if (idx >= elem_size)
2509 idx -= elem_size;
2510 assert (idx != start_idx);
2512 while (elem_table[idx * 2] != 0);
2514 /* This is the spot where we will insert the value. */
2515 elem_table[idx * 2] = hash;
2516 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2518 /* Then the string itself including length. */
2519 obstack_1grow (&extrapool, namelen);
2520 obstack_grow (&extrapool, runp->name, namelen);
2522 /* And the multibyte representation. */
2523 obstack_1grow (&extrapool, runp->nmbs);
2524 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2526 /* And align again to 32 bits. */
2527 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2528 obstack_grow (&extrapool, "\0\0",
2529 (sizeof (int32_t)
2530 - ((1 + namelen + 1 + runp->nmbs)
2531 % sizeof (int32_t))));
2533 /* Now some 32-bit values: multibyte collation sequence,
2534 wide char string (including length), and wide char
2535 collation sequence. */
2536 obstack_int32_grow (&extrapool, runp->mbseqorder);
2538 obstack_int32_grow (&extrapool, runp->nwcs);
2539 obstack_grow (&extrapool, runp->wcs,
2540 runp->nwcs * sizeof (uint32_t));
2542 obstack_int32_grow (&extrapool, runp->wcseqorder);
2545 runp = runp->next;
2548 /* Prepare to write out this data. */
2549 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2550 iov[2 + cnt].iov_base = &elem_size;
2551 iov[2 + cnt].iov_len = sizeof (int32_t);
2552 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2553 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2554 ++cnt;
2556 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2557 iov[2 + cnt].iov_base = elem_table;
2558 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2559 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2560 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2561 ++cnt;
2563 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2564 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2565 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2566 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2567 ++cnt;
2569 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2570 iov[2 + cnt].iov_base = collate->mbseqorder;
2571 iov[2 + cnt].iov_len = 256;
2572 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2573 ++cnt;
2575 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2576 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2577 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2578 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2579 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2580 ++cnt;
2582 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2583 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2584 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2585 ++cnt;
2587 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2589 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2591 obstack_free (&weightpool, NULL);
2592 obstack_free (&extrapool, NULL);
2593 obstack_free (&indirectpool, NULL);
2597 void
2598 collate_read (struct linereader *ldfile, struct localedef_t *result,
2599 const struct charmap_t *charmap, const char *repertoire_name,
2600 int ignore_content)
2602 struct repertoire_t *repertoire = NULL;
2603 struct locale_collate_t *collate;
2604 struct token *now;
2605 struct token *arg = NULL;
2606 enum token_t nowtok;
2607 enum token_t was_ellipsis = tok_none;
2608 struct localedef_t *copy_locale = NULL;
2609 /* Parsing state:
2610 0 - start
2611 1 - between `order-start' and `order-end'
2612 2 - after `order-end'
2613 3 - after `reorder-after', waiting for `reorder-end'
2614 4 - after `reorder-end'
2615 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2616 6 - after `reorder-sections-end'
2618 int state = 0;
2620 /* Get the repertoire we have to use. */
2621 if (repertoire_name != NULL)
2622 repertoire = repertoire_read (repertoire_name);
2624 /* The rest of the line containing `LC_COLLATE' must be free. */
2625 lr_ignore_rest (ldfile, 1);
2629 now = lr_token (ldfile, charmap, result, NULL, verbose);
2630 nowtok = now->tok;
2632 while (nowtok == tok_eol);
2634 if (nowtok == tok_copy)
2636 state = 2;
2637 now = lr_token (ldfile, charmap, result, NULL, verbose);
2638 if (now->tok != tok_string)
2640 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2642 skip_category:
2644 now = lr_token (ldfile, charmap, result, NULL, verbose);
2645 while (now->tok != tok_eof && now->tok != tok_end);
2647 if (now->tok != tok_eof
2648 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2649 now->tok == tok_eof))
2650 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2651 else if (now->tok != tok_lc_collate)
2653 lr_error (ldfile, _("\
2654 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2655 lr_ignore_rest (ldfile, 0);
2657 else
2658 lr_ignore_rest (ldfile, 1);
2660 return;
2663 if (! ignore_content)
2665 /* Get the locale definition. */
2666 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2667 repertoire_name, charmap, NULL);
2668 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2670 /* Not yet loaded. So do it now. */
2671 if (locfile_read (copy_locale, charmap) != 0)
2672 goto skip_category;
2675 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2676 return;
2679 lr_ignore_rest (ldfile, 1);
2681 now = lr_token (ldfile, charmap, result, NULL, verbose);
2682 nowtok = now->tok;
2685 /* Prepare the data structures. */
2686 collate_startup (ldfile, result, copy_locale, ignore_content);
2687 collate = result->categories[LC_COLLATE].collate;
2689 while (1)
2691 char ucs4buf[10];
2692 char *symstr;
2693 size_t symlen;
2695 /* Of course we don't proceed beyond the end of file. */
2696 if (nowtok == tok_eof)
2697 break;
2699 /* Ignore empty lines. */
2700 if (nowtok == tok_eol)
2702 now = lr_token (ldfile, charmap, result, NULL, verbose);
2703 nowtok = now->tok;
2704 continue;
2707 switch (nowtok)
2709 case tok_copy:
2710 /* Allow copying other locales. */
2711 now = lr_token (ldfile, charmap, result, NULL, verbose);
2712 if (now->tok != tok_string)
2713 goto err_label;
2715 if (! ignore_content)
2716 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2717 charmap, result);
2719 lr_ignore_rest (ldfile, 1);
2720 break;
2722 case tok_coll_weight_max:
2723 /* Ignore the rest of the line if we don't need the input of
2724 this line. */
2725 if (ignore_content)
2727 lr_ignore_rest (ldfile, 0);
2728 break;
2731 if (state != 0)
2732 goto err_label;
2734 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2735 if (arg->tok != tok_number)
2736 goto err_label;
2737 if (collate->col_weight_max != -1)
2738 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2739 "LC_COLLATE", "col_weight_max");
2740 else
2741 collate->col_weight_max = arg->val.num;
2742 lr_ignore_rest (ldfile, 1);
2743 break;
2745 case tok_section_symbol:
2746 /* Ignore the rest of the line if we don't need the input of
2747 this line. */
2748 if (ignore_content)
2750 lr_ignore_rest (ldfile, 0);
2751 break;
2754 if (state != 0)
2755 goto err_label;
2757 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2758 if (arg->tok != tok_bsymbol)
2759 goto err_label;
2760 else if (!ignore_content)
2762 /* Check whether this section is already known. */
2763 struct section_list *known = collate->sections;
2764 while (known != NULL)
2766 if (strcmp (known->name, arg->val.str.startmb) == 0)
2767 break;
2768 known = known->next;
2771 if (known != NULL)
2773 lr_error (ldfile,
2774 _("%s: duplicate declaration of section `%s'"),
2775 "LC_COLLATE", arg->val.str.startmb);
2776 free (arg->val.str.startmb);
2778 else
2779 collate->sections = make_seclist_elem (collate,
2780 arg->val.str.startmb,
2781 collate->sections);
2783 lr_ignore_rest (ldfile, known == NULL);
2785 else
2787 free (arg->val.str.startmb);
2788 lr_ignore_rest (ldfile, 0);
2790 break;
2792 case tok_collating_element:
2793 /* Ignore the rest of the line if we don't need the input of
2794 this line. */
2795 if (ignore_content)
2797 lr_ignore_rest (ldfile, 0);
2798 break;
2801 if (state != 0 && state != 2)
2802 goto err_label;
2804 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2805 if (arg->tok != tok_bsymbol)
2806 goto err_label;
2807 else
2809 const char *symbol = arg->val.str.startmb;
2810 size_t symbol_len = arg->val.str.lenmb;
2812 /* Next the `from' keyword. */
2813 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2814 if (arg->tok != tok_from)
2816 free ((char *) symbol);
2817 goto err_label;
2820 ldfile->return_widestr = 1;
2821 ldfile->translate_strings = 1;
2823 /* Finally the string with the replacement. */
2824 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2826 ldfile->return_widestr = 0;
2827 ldfile->translate_strings = 0;
2829 if (arg->tok != tok_string)
2830 goto err_label;
2832 if (!ignore_content && symbol != NULL)
2834 /* The name is already defined. */
2835 if (check_duplicate (ldfile, collate, charmap,
2836 repertoire, symbol, symbol_len))
2837 goto col_elem_free;
2839 if (arg->val.str.startmb != NULL)
2840 insert_entry (&collate->elem_table, symbol, symbol_len,
2841 new_element (collate,
2842 arg->val.str.startmb,
2843 arg->val.str.lenmb - 1,
2844 arg->val.str.startwc,
2845 symbol, symbol_len, 0));
2847 else
2849 col_elem_free:
2850 if (symbol != NULL)
2851 free ((char *) symbol);
2852 if (arg->val.str.startmb != NULL)
2853 free (arg->val.str.startmb);
2854 if (arg->val.str.startwc != NULL)
2855 free (arg->val.str.startwc);
2857 lr_ignore_rest (ldfile, 1);
2859 break;
2861 case tok_collating_symbol:
2862 /* Ignore the rest of the line if we don't need the input of
2863 this line. */
2864 if (ignore_content)
2866 lr_ignore_rest (ldfile, 0);
2867 break;
2870 if (state != 0 && state != 2)
2871 goto err_label;
2873 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2874 if (arg->tok != tok_bsymbol)
2875 goto err_label;
2876 else
2878 char *symbol = arg->val.str.startmb;
2879 size_t symbol_len = arg->val.str.lenmb;
2880 char *endsymbol = NULL;
2881 size_t endsymbol_len = 0;
2882 enum token_t ellipsis = tok_none;
2884 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2885 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2887 ellipsis = arg->tok;
2889 arg = lr_token (ldfile, charmap, result, repertoire,
2890 verbose);
2891 if (arg->tok != tok_bsymbol)
2893 free (symbol);
2894 goto err_label;
2897 endsymbol = arg->val.str.startmb;
2898 endsymbol_len = arg->val.str.lenmb;
2900 lr_ignore_rest (ldfile, 1);
2902 else if (arg->tok != tok_eol)
2904 free (symbol);
2905 goto err_label;
2908 if (!ignore_content)
2910 if (symbol == NULL
2911 || (ellipsis != tok_none && endsymbol == NULL))
2913 lr_error (ldfile, _("\
2914 %s: unknown character in collating symbol name"),
2915 "LC_COLLATE");
2916 goto col_sym_free;
2918 else if (ellipsis == tok_none)
2920 /* A single symbol, no ellipsis. */
2921 if (check_duplicate (ldfile, collate, charmap,
2922 repertoire, symbol, symbol_len))
2923 /* The name is already defined. */
2924 goto col_sym_free;
2926 insert_entry (&collate->sym_table, symbol, symbol_len,
2927 new_symbol (collate, symbol, symbol_len));
2929 else if (symbol_len != endsymbol_len)
2931 col_sym_inv_range:
2932 lr_error (ldfile,
2933 _("invalid names for character range"));
2934 goto col_sym_free;
2936 else
2938 /* Oh my, we have to handle an ellipsis. First, as
2939 usual, determine the common prefix and then
2940 convert the rest into a range. */
2941 size_t prefixlen;
2942 unsigned long int from;
2943 unsigned long int to;
2944 char *endp;
2946 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2947 if (symbol[prefixlen] != endsymbol[prefixlen])
2948 break;
2950 /* Convert the rest into numbers. */
2951 symbol[symbol_len] = '\0';
2952 from = strtoul (&symbol[prefixlen], &endp,
2953 ellipsis == tok_ellipsis2 ? 16 : 10);
2954 if (*endp != '\0')
2955 goto col_sym_inv_range;
2957 endsymbol[symbol_len] = '\0';
2958 to = strtoul (&endsymbol[prefixlen], &endp,
2959 ellipsis == tok_ellipsis2 ? 16 : 10);
2960 if (*endp != '\0')
2961 goto col_sym_inv_range;
2963 if (from > to)
2964 goto col_sym_inv_range;
2966 /* Now loop over all entries. */
2967 while (from <= to)
2969 char *symbuf;
2971 symbuf = (char *) obstack_alloc (&collate->mempool,
2972 symbol_len + 1);
2974 /* Create the name. */
2975 sprintf (symbuf,
2976 ellipsis == tok_ellipsis2
2977 ? "%.*s%.*lX" : "%.*s%.*lu",
2978 (int) prefixlen, symbol,
2979 (int) (symbol_len - prefixlen), from);
2981 if (check_duplicate (ldfile, collate, charmap,
2982 repertoire, symbuf, symbol_len))
2983 /* The name is already defined. */
2984 goto col_sym_free;
2986 insert_entry (&collate->sym_table, symbuf,
2987 symbol_len,
2988 new_symbol (collate, symbuf,
2989 symbol_len));
2991 /* Increment the counter. */
2992 ++from;
2995 goto col_sym_free;
2998 else
3000 col_sym_free:
3001 if (symbol != NULL)
3002 free (symbol);
3003 if (endsymbol != NULL)
3004 free (endsymbol);
3007 break;
3009 case tok_symbol_equivalence:
3010 /* Ignore the rest of the line if we don't need the input of
3011 this line. */
3012 if (ignore_content)
3014 lr_ignore_rest (ldfile, 0);
3015 break;
3018 if (state != 0)
3019 goto err_label;
3021 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3022 if (arg->tok != tok_bsymbol)
3023 goto err_label;
3024 else
3026 const char *newname = arg->val.str.startmb;
3027 size_t newname_len = arg->val.str.lenmb;
3028 const char *symname;
3029 size_t symname_len;
3030 void *symval; /* Actually struct symbol_t* */
3032 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3033 if (arg->tok != tok_bsymbol)
3035 if (newname != NULL)
3036 free ((char *) newname);
3037 goto err_label;
3040 symname = arg->val.str.startmb;
3041 symname_len = arg->val.str.lenmb;
3043 if (newname == NULL)
3045 lr_error (ldfile, _("\
3046 %s: unknown character in equivalent definition name"),
3047 "LC_COLLATE");
3049 sym_equiv_free:
3050 if (newname != NULL)
3051 free ((char *) newname);
3052 if (symname != NULL)
3053 free ((char *) symname);
3054 break;
3056 if (symname == NULL)
3058 lr_error (ldfile, _("\
3059 %s: unknown character in equivalent definition value"),
3060 "LC_COLLATE");
3061 goto sym_equiv_free;
3064 /* See whether the symbol name is already defined. */
3065 if (find_entry (&collate->sym_table, symname, symname_len,
3066 &symval) != 0)
3068 lr_error (ldfile, _("\
3069 %s: unknown symbol `%s' in equivalent definition"),
3070 "LC_COLLATE", symname);
3071 goto sym_equiv_free;
3074 if (insert_entry (&collate->sym_table,
3075 newname, newname_len, symval) < 0)
3077 lr_error (ldfile, _("\
3078 error while adding equivalent collating symbol"));
3079 goto sym_equiv_free;
3082 free ((char *) symname);
3084 lr_ignore_rest (ldfile, 1);
3085 break;
3087 case tok_script:
3088 /* We get told about the scripts we know. */
3089 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3090 if (arg->tok != tok_bsymbol)
3091 goto err_label;
3092 else
3094 struct section_list *runp = collate->known_sections;
3095 char *name;
3097 while (runp != NULL)
3098 if (strncmp (runp->name, arg->val.str.startmb,
3099 arg->val.str.lenmb) == 0
3100 && runp->name[arg->val.str.lenmb] == '\0')
3101 break;
3102 else
3103 runp = runp->def_next;
3105 if (runp != NULL)
3107 lr_error (ldfile, _("duplicate definition of script `%s'"),
3108 runp->name);
3109 lr_ignore_rest (ldfile, 0);
3110 break;
3113 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3114 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3115 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3116 name[arg->val.str.lenmb] = '\0';
3117 runp->name = name;
3119 runp->def_next = collate->known_sections;
3120 collate->known_sections = runp;
3122 lr_ignore_rest (ldfile, 1);
3123 break;
3125 case tok_order_start:
3126 /* Ignore the rest of the line if we don't need the input of
3127 this line. */
3128 if (ignore_content)
3130 lr_ignore_rest (ldfile, 0);
3131 break;
3134 if (state != 0 && state != 1)
3135 goto err_label;
3136 state = 1;
3138 /* The 14652 draft does not specify whether all `order_start' lines
3139 must contain the same number of sort-rules, but 14651 does. So
3140 we require this here as well. */
3141 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3142 if (arg->tok == tok_bsymbol)
3144 /* This better should be a section name. */
3145 struct section_list *sp = collate->known_sections;
3146 while (sp != NULL
3147 && (sp->name == NULL
3148 || strncmp (sp->name, arg->val.str.startmb,
3149 arg->val.str.lenmb) != 0
3150 || sp->name[arg->val.str.lenmb] != '\0'))
3151 sp = sp->def_next;
3153 if (sp == NULL)
3155 lr_error (ldfile, _("\
3156 %s: unknown section name `%s'"),
3157 "LC_COLLATE", arg->val.str.startmb);
3158 /* We use the error section. */
3159 collate->current_section = &collate->error_section;
3161 if (collate->error_section.first == NULL)
3163 /* Insert &collate->error_section at the end of
3164 the collate->sections list. */
3165 if (collate->sections == NULL)
3166 collate->sections = &collate->error_section;
3167 else
3169 sp = collate->sections;
3170 while (sp->next != NULL)
3171 sp = sp->next;
3173 sp->next = &collate->error_section;
3175 collate->error_section.next = NULL;
3178 else
3180 /* One should not be allowed to open the same
3181 section twice. */
3182 if (sp->first != NULL)
3183 lr_error (ldfile, _("\
3184 %s: multiple order definitions for section `%s'"),
3185 "LC_COLLATE", sp->name);
3186 else
3188 /* Insert sp in the collate->sections list,
3189 right after collate->current_section. */
3190 if (collate->current_section == NULL)
3191 collate->current_section = sp;
3192 else
3194 sp->next = collate->current_section->next;
3195 collate->current_section->next = sp;
3199 /* Next should come the end of the line or a semicolon. */
3200 arg = lr_token (ldfile, charmap, result, repertoire,
3201 verbose);
3202 if (arg->tok == tok_eol)
3204 uint32_t cnt;
3206 /* This means we have exactly one rule: `forward'. */
3207 if (nrules > 1)
3208 lr_error (ldfile, _("\
3209 %s: invalid number of sorting rules"),
3210 "LC_COLLATE");
3211 else
3212 nrules = 1;
3213 sp->rules = obstack_alloc (&collate->mempool,
3214 (sizeof (enum coll_sort_rule)
3215 * nrules));
3216 for (cnt = 0; cnt < nrules; ++cnt)
3217 sp->rules[cnt] = sort_forward;
3219 /* Next line. */
3220 break;
3223 /* Get the next token. */
3224 arg = lr_token (ldfile, charmap, result, repertoire,
3225 verbose);
3228 else
3230 /* There is no section symbol. Therefore we use the unnamed
3231 section. */
3232 collate->current_section = &collate->unnamed_section;
3234 if (collate->unnamed_section.first != NULL)
3235 lr_error (ldfile, _("\
3236 %s: multiple order definitions for unnamed section"),
3237 "LC_COLLATE");
3238 else
3240 /* Insert &collate->unnamed_section at the beginning of
3241 the collate->sections list. */
3242 collate->unnamed_section.next = collate->sections;
3243 collate->sections = &collate->unnamed_section;
3247 /* Now read the direction names. */
3248 read_directions (ldfile, arg, charmap, repertoire, result);
3250 /* From now we need the strings untranslated. */
3251 ldfile->translate_strings = 0;
3252 break;
3254 case tok_order_end:
3255 /* Ignore the rest of the line if we don't need the input of
3256 this line. */
3257 if (ignore_content)
3259 lr_ignore_rest (ldfile, 0);
3260 break;
3263 if (state != 1)
3264 goto err_label;
3266 /* Handle ellipsis at end of list. */
3267 if (was_ellipsis != tok_none)
3269 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3270 repertoire, result);
3271 was_ellipsis = tok_none;
3274 state = 2;
3275 lr_ignore_rest (ldfile, 1);
3276 break;
3278 case tok_reorder_after:
3279 /* Ignore the rest of the line if we don't need the input of
3280 this line. */
3281 if (ignore_content)
3283 lr_ignore_rest (ldfile, 0);
3284 break;
3287 if (state == 1)
3289 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3290 "LC_COLLATE");
3291 state = 2;
3293 /* Handle ellipsis at end of list. */
3294 if (was_ellipsis != tok_none)
3296 handle_ellipsis (ldfile, arg->val.str.startmb,
3297 arg->val.str.lenmb, was_ellipsis, charmap,
3298 repertoire, result);
3299 was_ellipsis = tok_none;
3302 else if (state != 2 && state != 3)
3303 goto err_label;
3304 state = 3;
3306 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3307 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3309 /* Find this symbol in the sequence table. */
3310 char ucsbuf[10];
3311 char *startmb;
3312 size_t lenmb;
3313 struct element_t *insp;
3314 int no_error = 1;
3315 void *ptr;
3317 if (arg->tok == tok_bsymbol)
3319 startmb = arg->val.str.startmb;
3320 lenmb = arg->val.str.lenmb;
3322 else
3324 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3325 startmb = ucsbuf;
3326 lenmb = 9;
3329 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3330 /* Yes, the symbol exists. Simply point the cursor
3331 to it. */
3332 collate->cursor = (struct element_t *) ptr;
3333 else
3335 struct symbol_t *symbp;
3336 void *ptr;
3338 if (find_entry (&collate->sym_table, startmb, lenmb,
3339 &ptr) == 0)
3341 symbp = ptr;
3343 if (symbp->order->last != NULL
3344 || symbp->order->next != NULL)
3345 collate->cursor = symbp->order;
3346 else
3348 /* This is a collating symbol but its position
3349 is not yet defined. */
3350 lr_error (ldfile, _("\
3351 %s: order for collating symbol %.*s not yet defined"),
3352 "LC_COLLATE", (int) lenmb, startmb);
3353 collate->cursor = NULL;
3354 no_error = 0;
3357 else if (find_entry (&collate->elem_table, startmb, lenmb,
3358 &ptr) == 0)
3360 insp = (struct element_t *) ptr;
3362 if (insp->last != NULL || insp->next != NULL)
3363 collate->cursor = insp;
3364 else
3366 /* This is a collating element but its position
3367 is not yet defined. */
3368 lr_error (ldfile, _("\
3369 %s: order for collating element %.*s not yet defined"),
3370 "LC_COLLATE", (int) lenmb, startmb);
3371 collate->cursor = NULL;
3372 no_error = 0;
3375 else
3377 /* This is bad. The symbol after which we have to
3378 insert does not exist. */
3379 lr_error (ldfile, _("\
3380 %s: cannot reorder after %.*s: symbol not known"),
3381 "LC_COLLATE", (int) lenmb, startmb);
3382 collate->cursor = NULL;
3383 no_error = 0;
3387 lr_ignore_rest (ldfile, no_error);
3389 else
3390 /* This must not happen. */
3391 goto err_label;
3392 break;
3394 case tok_reorder_end:
3395 /* Ignore the rest of the line if we don't need the input of
3396 this line. */
3397 if (ignore_content)
3398 break;
3400 if (state != 3)
3401 goto err_label;
3402 state = 4;
3403 lr_ignore_rest (ldfile, 1);
3404 break;
3406 case tok_reorder_sections_after:
3407 /* Ignore the rest of the line if we don't need the input of
3408 this line. */
3409 if (ignore_content)
3411 lr_ignore_rest (ldfile, 0);
3412 break;
3415 if (state == 1)
3417 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3418 "LC_COLLATE");
3419 state = 2;
3421 /* Handle ellipsis at end of list. */
3422 if (was_ellipsis != tok_none)
3424 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3425 repertoire, result);
3426 was_ellipsis = tok_none;
3429 else if (state == 3)
3431 WITH_CUR_LOCALE (error (0, 0, _("\
3432 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3433 state = 4;
3435 else if (state != 2 && state != 4)
3436 goto err_label;
3437 state = 5;
3439 /* Get the name of the sections we are adding after. */
3440 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3441 if (arg->tok == tok_bsymbol)
3443 /* Now find a section with this name. */
3444 struct section_list *runp = collate->sections;
3446 while (runp != NULL)
3448 if (runp->name != NULL
3449 && strlen (runp->name) == arg->val.str.lenmb
3450 && memcmp (runp->name, arg->val.str.startmb,
3451 arg->val.str.lenmb) == 0)
3452 break;
3454 runp = runp->next;
3457 if (runp != NULL)
3458 collate->current_section = runp;
3459 else
3461 /* This is bad. The section after which we have to
3462 reorder does not exist. Therefore we cannot
3463 process the whole rest of this reorder
3464 specification. */
3465 lr_error (ldfile, _("%s: section `%.*s' not known"),
3466 "LC_COLLATE", (int) arg->val.str.lenmb,
3467 arg->val.str.startmb);
3471 lr_ignore_rest (ldfile, 0);
3473 now = lr_token (ldfile, charmap, result, NULL, verbose);
3475 while (now->tok == tok_reorder_sections_after
3476 || now->tok == tok_reorder_sections_end
3477 || now->tok == tok_end);
3479 /* Process the token we just saw. */
3480 nowtok = now->tok;
3481 continue;
3484 else
3485 /* This must not happen. */
3486 goto err_label;
3487 break;
3489 case tok_reorder_sections_end:
3490 /* Ignore the rest of the line if we don't need the input of
3491 this line. */
3492 if (ignore_content)
3493 break;
3495 if (state != 5)
3496 goto err_label;
3497 state = 6;
3498 lr_ignore_rest (ldfile, 1);
3499 break;
3501 case tok_bsymbol:
3502 case tok_ucs4:
3503 /* Ignore the rest of the line if we don't need the input of
3504 this line. */
3505 if (ignore_content)
3507 lr_ignore_rest (ldfile, 0);
3508 break;
3511 if (state != 0 && state != 1 && state != 3 && state != 5)
3512 goto err_label;
3514 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3515 goto err_label;
3517 if (nowtok == tok_ucs4)
3519 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3520 symstr = ucs4buf;
3521 symlen = 9;
3523 else if (arg != NULL)
3525 symstr = arg->val.str.startmb;
3526 symlen = arg->val.str.lenmb;
3528 else
3530 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3531 (int) ldfile->token.val.str.lenmb,
3532 ldfile->token.val.str.startmb);
3533 break;
3536 struct element_t *seqp;
3537 if (state == 0)
3539 /* We are outside an `order_start' region. This means
3540 we must only accept definitions of values for
3541 collation symbols since these are purely abstract
3542 values and don't need directions associated. */
3543 void *ptr;
3545 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3547 seqp = ptr;
3549 /* It's already defined. First check whether this
3550 is really a collating symbol. */
3551 if (seqp->is_character)
3552 goto err_label;
3554 goto move_entry;
3556 else
3558 void *result;
3560 if (find_entry (&collate->sym_table, symstr, symlen,
3561 &result) != 0)
3562 /* No collating symbol, it's an error. */
3563 goto err_label;
3565 /* Maybe this is the first time we define a symbol
3566 value and it is before the first actual section. */
3567 if (collate->sections == NULL)
3568 collate->sections = collate->current_section =
3569 &collate->symbol_section;
3572 if (was_ellipsis != tok_none)
3575 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3576 charmap, repertoire, result);
3578 /* Remember that we processed the ellipsis. */
3579 was_ellipsis = tok_none;
3581 /* And don't add the value a second time. */
3582 break;
3585 else if (state == 3)
3587 /* It is possible that we already have this collation sequence.
3588 In this case we move the entry. */
3589 void *sym;
3590 void *ptr;
3592 /* If the symbol after which we have to insert was not found
3593 ignore all entries. */
3594 if (collate->cursor == NULL)
3596 lr_ignore_rest (ldfile, 0);
3597 break;
3600 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3602 seqp = (struct element_t *) ptr;
3603 goto move_entry;
3606 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3607 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3608 goto move_entry;
3610 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3611 && (seqp = (struct element_t *) ptr,
3612 seqp->last != NULL || seqp->next != NULL
3613 || (collate->start != NULL && seqp == collate->start)))
3615 move_entry:
3616 /* Remove the entry from the old position. */
3617 if (seqp->last == NULL)
3618 collate->start = seqp->next;
3619 else
3620 seqp->last->next = seqp->next;
3621 if (seqp->next != NULL)
3622 seqp->next->last = seqp->last;
3624 /* We also have to check whether this entry is the
3625 first or last of a section. */
3626 if (seqp->section->first == seqp)
3628 if (seqp->section->first == seqp->section->last)
3629 /* This section has no content anymore. */
3630 seqp->section->first = seqp->section->last = NULL;
3631 else
3632 seqp->section->first = seqp->next;
3634 else if (seqp->section->last == seqp)
3635 seqp->section->last = seqp->last;
3637 /* Now insert it in the new place. */
3638 insert_weights (ldfile, seqp, charmap, repertoire, result,
3639 tok_none);
3640 break;
3643 /* Otherwise we just add a new entry. */
3645 else if (state == 5)
3647 /* We are reordering sections. Find the named section. */
3648 struct section_list *runp = collate->sections;
3649 struct section_list *prevp = NULL;
3651 while (runp != NULL)
3653 if (runp->name != NULL
3654 && strlen (runp->name) == symlen
3655 && memcmp (runp->name, symstr, symlen) == 0)
3656 break;
3658 prevp = runp;
3659 runp = runp->next;
3662 if (runp == NULL)
3664 lr_error (ldfile, _("%s: section `%.*s' not known"),
3665 "LC_COLLATE", (int) symlen, symstr);
3666 lr_ignore_rest (ldfile, 0);
3668 else
3670 if (runp != collate->current_section)
3672 /* Remove the named section from the old place and
3673 insert it in the new one. */
3674 prevp->next = runp->next;
3676 runp->next = collate->current_section->next;
3677 collate->current_section->next = runp;
3678 collate->current_section = runp;
3681 /* Process the rest of the line which might change
3682 the collation rules. */
3683 arg = lr_token (ldfile, charmap, result, repertoire,
3684 verbose);
3685 if (arg->tok != tok_eof && arg->tok != tok_eol)
3686 read_directions (ldfile, arg, charmap, repertoire,
3687 result);
3689 break;
3691 else if (was_ellipsis != tok_none)
3693 /* Using the information in the `ellipsis_weight'
3694 element and this and the last value we have to handle
3695 the ellipsis now. */
3696 assert (state == 1);
3698 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3699 repertoire, result);
3701 /* Remember that we processed the ellipsis. */
3702 was_ellipsis = tok_none;
3704 /* And don't add the value a second time. */
3705 break;
3708 /* Now insert in the new place. */
3709 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3710 break;
3712 case tok_undefined:
3713 /* Ignore the rest of the line if we don't need the input of
3714 this line. */
3715 if (ignore_content)
3717 lr_ignore_rest (ldfile, 0);
3718 break;
3721 if (state != 1)
3722 goto err_label;
3724 if (was_ellipsis != tok_none)
3726 lr_error (ldfile,
3727 _("%s: cannot have `%s' as end of ellipsis range"),
3728 "LC_COLLATE", "UNDEFINED");
3730 unlink_element (collate);
3731 was_ellipsis = tok_none;
3734 /* See whether UNDEFINED already appeared somewhere. */
3735 if (collate->undefined.next != NULL
3736 || &collate->undefined == collate->cursor)
3738 lr_error (ldfile,
3739 _("%s: order for `%.*s' already defined at %s:%Zu"),
3740 "LC_COLLATE", 9, "UNDEFINED",
3741 collate->undefined.file,
3742 collate->undefined.line);
3743 lr_ignore_rest (ldfile, 0);
3745 else
3746 /* Parse the weights. */
3747 insert_weights (ldfile, &collate->undefined, charmap,
3748 repertoire, result, tok_none);
3749 break;
3751 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3752 case tok_ellipsis3: /* absolute ellipsis */
3753 case tok_ellipsis4: /* symbolic decimal ellipsis */
3754 /* This is the symbolic (decimal or hexadecimal) or absolute
3755 ellipsis. */
3756 if (was_ellipsis != tok_none)
3757 goto err_label;
3759 if (state != 0 && state != 1 && state != 3)
3760 goto err_label;
3762 was_ellipsis = nowtok;
3764 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3765 repertoire, result, nowtok);
3766 break;
3768 case tok_end:
3769 /* Next we assume `LC_COLLATE'. */
3770 if (!ignore_content)
3772 if (state == 0)
3773 /* We must either see a copy statement or have
3774 ordering values. */
3775 lr_error (ldfile,
3776 _("%s: empty category description not allowed"),
3777 "LC_COLLATE");
3778 else if (state == 1)
3780 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3781 "LC_COLLATE");
3783 /* Handle ellipsis at end of list. */
3784 if (was_ellipsis != tok_none)
3786 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3787 repertoire, result);
3788 was_ellipsis = tok_none;
3791 else if (state == 3)
3792 WITH_CUR_LOCALE (error (0, 0, _("\
3793 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3794 else if (state == 5)
3795 WITH_CUR_LOCALE (error (0, 0, _("\
3796 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3798 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3799 if (arg->tok == tok_eof)
3800 break;
3801 if (arg->tok == tok_eol)
3802 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3803 else if (arg->tok != tok_lc_collate)
3804 lr_error (ldfile, _("\
3805 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3806 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3807 return;
3809 default:
3810 err_label:
3811 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3814 /* Prepare for the next round. */
3815 now = lr_token (ldfile, charmap, result, NULL, verbose);
3816 nowtok = now->tok;
3819 /* When we come here we reached the end of the file. */
3820 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");