(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[glibc.git] / locale / programs / ld-collate.c
blob6d0d03c23577a3ff25d333e6160ce1c01c71a817
1 /* Copyright (C) 1995-2002, 2003 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, write to the Free
17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18 02111-1307 USA. */
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
24 #include <errno.h>
25 #include <error.h>
26 #include <stdlib.h>
27 #include <wchar.h>
28 #include <sys/param.h>
30 #include "localedef.h"
31 #include "charmap.h"
32 #include "localeinfo.h"
33 #include "linereader.h"
34 #include "locfile.h"
35 #include "elem-hash.h"
37 /* Uncomment the following line in the production version. */
38 /* #define NDEBUG 1 */
39 #include <assert.h>
41 #define obstack_chunk_alloc malloc
42 #define obstack_chunk_free free
/* Append the 32-bit value DATA to the object currently being grown in
   OBSTACK, enlarging the object as needed.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) != sizeof (int))
    {
      /* `int' does not have 32 bits here, so the int helper cannot be
         used; append the raw bytes of DATA instead.  */
      obstack_grow (obstack, &data, sizeof (int32_t));
    }
  else
    {
      obstack_int_grow (obstack, data);
    }
}
/* Variant of obstack_int32_grow built on obstack_int_grow_fast.  When
   `int' has 32 bits DATA is appended with the fast primitive; otherwise
   the bytes are appended with the ordinary obstack_grow.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  if (sizeof (int32_t) != sizeof (int))
    {
      /* No fast primitive of the right width; fall back to the
         ordinary byte-wise grow.  */
      obstack_grow (obstack, &data, sizeof (int32_t));
    }
  else
    {
      obstack_int_grow_fast (obstack, data);
    }
}
/* Forward declaration.  */
struct element_t;

/* Description of one section of the collation definition.  Every
   section is a member of up to two singly linked lists and records the
   span of the order list that belongs to it.  */
struct section_list
{
  struct section_list *def_next;  /* Next entry of the known_sections
                                     list.  */
  struct section_list *next;      /* Next entry of the sections list.  */
  const char *name;               /* Section name.  */
  struct element_t *first;        /* First element of the section.  */
  struct element_t *last;         /* Last element of the section.  */
  enum coll_sort_rule *rules;     /* Sorting rules of the section.  */
  int ruleidx;                    /* Index of the rule set in the
                                     appropriate section of the output
                                     file.  */
};
struct element_t;

/* Counted array of element pointers; one such list holds the weight of
   an element for a single level.  */
struct element_list_t
{
  int cnt;                 /* Number of entries in W.  */
  struct element_t **w;    /* The entries themselves.  */
};
/* Everything known about one collating element.  */
struct element_t
{
  /* Symbolic name; may be NULL.  */
  const char *name;

  /* Multibyte and wide character representations together with their
     lengths, followed by the collation order values.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* Bit mask with one bit per level; a bit is set when the element is
     used in that level.  Interesting for the singlebyte weight
     computation.

     XXX The type limits the number of levels to 32.  It could be
     widened, but this is unlikely to ever be necessary.  */
  unsigned int used_in_level;

  /* Weights of the element, one list per level.  */
  struct element_list_t *weights;

  /* Nonzero if this is a real character definition.  */
  int is_character;

  /* Position of the character in the sequence; used in range
     expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* File and line of the definition.  */
  const char *file;
  size_t line;

  /* Section the element belongs to.  */
  struct section_list *section;

  /* Predecessor and successor in the order list.  */
  struct element_t *last;
  struct element_t *next;

  /* Links of the multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Links of the wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};

/* Sentinel pointer values standing in for the three ellipsis forms.
   They are only ever compared, never dereferenced.  */
#define ELEMENT_ELLIPSIS2	((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3	((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4	((struct element_t *) 3)
/* Everything known about one collating symbol.  */
struct symbol_t
{
  const char *name;          /* Name of the symbol.  */
  struct element_t *order;   /* Its place in the order list; NULL while
                                no element has been assigned.  */
  const char *file;          /* File and line of the definition.  */
  size_t line;
};
164 /* Sparse table of struct element_t *. */
165 #define TABLE wchead_table
166 #define ELEMENT struct element_t *
167 #define DEFAULT NULL
168 #define ITERATE
169 #define NO_FINALIZE
170 #include "3level.h"
172 /* Sparse table of int32_t. */
173 #define TABLE collidx_table
174 #define ELEMENT int32_t
175 #define DEFAULT 0
176 #include "3level.h"
178 /* Sparse table of uint32_t. */
179 #define TABLE collseq_table
180 #define ELEMENT uint32_t
181 #define DEFAULT ~((uint32_t) 0)
182 #include "3level.h"
185 /* The real definition of the struct for the LC_COLLATE locale. */
186 struct locale_collate_t
188 int col_weight_max;
189 int cur_weight_max;
191 /* List of known scripts. */
192 struct section_list *known_sections;
193 /* List of used sections. */
194 struct section_list *sections;
195 /* Current section using definition. */
196 struct section_list *current_section;
197 /* There always can be an unnamed section. */
198 struct section_list unnamed_section;
199 /* To make handling of errors easier we have another section. */
200 struct section_list error_section;
201 /* Sometimes we are defining the values for collating symbols before
202 the first actual section. */
203 struct section_list symbol_section;
205 /* Start of the order list. */
206 struct element_t *start;
208 /* The undefined element. */
209 struct element_t undefined;
211 /* This is the cursor for `reorder_after' insertions. */
212 struct element_t *cursor;
214 /* This value is used when handling ellipsis. */
215 struct element_t ellipsis_weight;
217 /* Known collating elements. */
218 hash_table elem_table;
220 /* Known collating symbols. */
221 hash_table sym_table;
223 /* Known collation sequences. */
224 hash_table seq_table;
226 struct obstack mempool;
228 /* The LC_COLLATE category is a bit special as it is sometimes possible
229 that the definitions from more than one input file contains information.
230 Therefore we keep all relevant input in a list. */
231 struct locale_collate_t *next;
233 /* Arrays with heads of the list for each of the leading bytes in
234 the multibyte sequences. */
235 struct element_t *mbheads[256];
237 /* Arrays with heads of the list for each of the leading bytes in
238 the multibyte sequences. */
239 struct wchead_table wcheads;
241 /* The arrays with the collation sequence order. */
242 unsigned char mbseqorder[256];
243 struct collseq_table wcseqorder;
247 /* We have a few global variables which are used for reading all
248 LC_COLLATE category descriptions in all files. */
249 static uint32_t nrules;
/* We need UTF-8 encoding of numbers.  Store the encoding of VAL in BUF
   (no terminating NUL is written) and return the number of bytes
   stored.  Values below 0x80 take one byte, everything else between
   two and six bytes.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int nbytes;
  int idx;

  /* Values below 0x80 are stored unchanged in a single byte.  */
  if (val < 0x80)
    {
      buf[0] = (char) val;
      return 1;
    }

  /* Pick the shortest form: a sequence of N bytes has room for
     5 * N + 1 payload bits.  */
  nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t) 0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* The lead byte starts with NBYTES one bits.  */
  buf[0] = (unsigned char) (~0xff >> nbytes);

  /* Fill in the continuation bytes from the end, six bits each.  */
  for (idx = nbytes - 1; idx > 0; --idx)
    {
      buf[idx] = 0x80 | (val & 0x3f);
      val >>= 6;
    }

  /* Whatever is left of VAL belongs into the lead byte.  */
  buf[0] |= val;

  return nbytes;
}
288 static struct section_list *
289 make_seclist_elem (struct locale_collate_t *collate, const char *string,
290 struct section_list *next)
292 struct section_list *newp;
294 newp = (struct section_list *) obstack_alloc (&collate->mempool,
295 sizeof (*newp));
296 newp->next = next;
297 newp->name = string;
298 newp->first = NULL;
299 newp->last = NULL;
301 return newp;
305 static struct element_t *
306 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
307 const uint32_t *wcs, const char *name, size_t namelen,
308 int is_character)
310 struct element_t *newp;
312 newp = (struct element_t *) obstack_alloc (&collate->mempool,
313 sizeof (*newp));
314 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
315 name, namelen);
316 if (mbs != NULL)
318 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
319 newp->nmbs = mbslen;
321 else
323 newp->mbs = NULL;
324 newp->nmbs = 0;
326 if (wcs != NULL)
328 size_t nwcs = wcslen ((wchar_t *) wcs);
329 uint32_t zero = 0;
330 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
331 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
332 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
333 newp->nwcs = nwcs;
335 else
337 newp->wcs = NULL;
338 newp->nwcs = 0;
340 newp->mborder = NULL;
341 newp->wcorder = 0;
342 newp->used_in_level = 0;
343 newp->is_character = is_character;
345 /* Will be assigned later. XXX */
346 newp->mbseqorder = 0;
347 newp->wcseqorder = 0;
349 /* Will be allocated later. */
350 newp->weights = NULL;
352 newp->file = NULL;
353 newp->line = 0;
355 newp->section = collate->current_section;
357 newp->last = NULL;
358 newp->next = NULL;
360 newp->mbnext = NULL;
361 newp->mblast = NULL;
363 newp->wcnext = NULL;
364 newp->wclast = NULL;
366 return newp;
370 static struct symbol_t *
371 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
373 struct symbol_t *newp;
375 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
377 newp->name = obstack_copy0 (&collate->mempool, name, len);
378 newp->order = NULL;
380 newp->file = NULL;
381 newp->line = 0;
383 return newp;
387 /* Test whether this name is already defined somewhere. */
388 static int
389 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
390 const struct charmap_t *charmap,
391 struct repertoire_t *repertoire, const char *symbol,
392 size_t symbol_len)
394 void *ignore = NULL;
396 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
398 lr_error (ldfile, _("`%.*s' already defined in charmap"),
399 (int) symbol_len, symbol);
400 return 1;
403 if (repertoire != NULL
404 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
405 == 0))
407 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
408 (int) symbol_len, symbol);
409 return 1;
412 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
414 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
415 (int) symbol_len, symbol);
416 return 1;
419 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
421 lr_error (ldfile, _("`%.*s' already defined as collating element"),
422 (int) symbol_len, symbol);
423 return 1;
426 return 0;
430 /* Read the direction specification. */
431 static void
432 read_directions (struct linereader *ldfile, struct token *arg,
433 const struct charmap_t *charmap,
434 struct repertoire_t *repertoire, struct localedef_t *result)
436 int cnt = 0;
437 int max = nrules ?: 10;
438 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
439 int warned = 0;
440 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
442 while (1)
444 int valid = 0;
446 if (arg->tok == tok_forward)
448 if (rules[cnt] & sort_backward)
450 if (! warned)
452 lr_error (ldfile, _("\
453 %s: `forward' and `backward' are mutually excluding each other"),
454 "LC_COLLATE");
455 warned = 1;
458 else if (rules[cnt] & sort_forward)
460 if (! warned)
462 lr_error (ldfile, _("\
463 %s: `%s' mentioned more than once in definition of weight %d"),
464 "LC_COLLATE", "forward", cnt + 1);
467 else
468 rules[cnt] |= sort_forward;
470 valid = 1;
472 else if (arg->tok == tok_backward)
474 if (rules[cnt] & sort_forward)
476 if (! warned)
478 lr_error (ldfile, _("\
479 %s: `forward' and `backward' are mutually excluding each other"),
480 "LC_COLLATE");
481 warned = 1;
484 else if (rules[cnt] & sort_backward)
486 if (! warned)
488 lr_error (ldfile, _("\
489 %s: `%s' mentioned more than once in definition of weight %d"),
490 "LC_COLLATE", "backward", cnt + 1);
493 else
494 rules[cnt] |= sort_backward;
496 valid = 1;
498 else if (arg->tok == tok_position)
500 if (rules[cnt] & sort_position)
502 if (! warned)
504 lr_error (ldfile, _("\
505 %s: `%s' mentioned more than once in definition of weight %d"),
506 "LC_COLLATE", "position", cnt + 1);
509 else
510 rules[cnt] |= sort_position;
512 valid = 1;
515 if (valid)
516 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
518 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
519 || arg->tok == tok_semicolon)
521 if (! valid && ! warned)
523 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
524 warned = 1;
527 /* See whether we have to increment the counter. */
528 if (arg->tok != tok_comma && rules[cnt] != 0)
530 /* Add the default `forward' if we have seen only `position'. */
531 if (rules[cnt] == sort_position)
532 rules[cnt] = sort_position | sort_forward;
534 ++cnt;
537 if (arg->tok == tok_eof || arg->tok == tok_eol)
538 /* End of line or file, so we exit the loop. */
539 break;
541 if (nrules == 0)
543 /* See whether we have enough room in the array. */
544 if (cnt == max)
546 max += 10;
547 rules = (enum coll_sort_rule *) xrealloc (rules,
549 * sizeof (*rules));
550 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
553 else
555 if (cnt == nrules)
557 /* There must not be any more rule. */
558 if (! warned)
560 lr_error (ldfile, _("\
561 %s: too many rules; first entry only had %d"),
562 "LC_COLLATE", nrules);
563 warned = 1;
566 lr_ignore_rest (ldfile, 0);
567 break;
571 else
573 if (! warned)
575 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
576 warned = 1;
580 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
583 if (nrules == 0)
585 /* Now we know how many rules we have. */
586 nrules = cnt;
587 rules = (enum coll_sort_rule *) xrealloc (rules,
588 nrules * sizeof (*rules));
590 else
592 if (cnt < nrules)
594 /* Not enough rules in this specification. */
595 if (! warned)
596 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
599 rules[cnt] = sort_forward;
600 while (++cnt < nrules);
604 collate->current_section->rules = rules;
608 static struct element_t *
609 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
610 const char *str, size_t len)
612 void *result = NULL;
614 /* Search for the entries among the collation sequences already define. */
615 if (find_entry (&collate->seq_table, str, len, &result) != 0)
617 /* Nope, not define yet. So we see whether it is a
618 collation symbol. */
619 void *ptr;
621 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
623 /* It's a collation symbol. */
624 struct symbol_t *sym = (struct symbol_t *) ptr;
625 result = sym->order;
627 if (result == NULL)
628 result = sym->order = new_element (collate, NULL, 0, NULL,
629 NULL, 0, 0);
631 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
633 /* It's also no collation element. So it is a character
634 element defined later. */
635 result = new_element (collate, NULL, 0, NULL, str, len, 1);
636 /* Insert it into the sequence table. */
637 insert_entry (&collate->seq_table, str, len, result);
641 return (struct element_t *) result;
645 static void
646 unlink_element (struct locale_collate_t *collate)
648 if (collate->cursor == collate->start)
650 assert (collate->cursor->next == NULL);
651 assert (collate->cursor->last == NULL);
652 collate->cursor = NULL;
654 else
656 if (collate->cursor->next != NULL)
657 collate->cursor->next->last = collate->cursor->last;
658 if (collate->cursor->last != NULL)
659 collate->cursor->last->next = collate->cursor->next;
660 collate->cursor = collate->cursor->last;
665 static void
666 insert_weights (struct linereader *ldfile, struct element_t *elem,
667 const struct charmap_t *charmap,
668 struct repertoire_t *repertoire, struct localedef_t *result,
669 enum token_t ellipsis)
671 int weight_cnt;
672 struct token *arg;
673 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
675 /* Initialize all the fields. */
676 elem->file = ldfile->fname;
677 elem->line = ldfile->lineno;
679 elem->last = collate->cursor;
680 elem->next = collate->cursor ? collate->cursor->next : NULL;
681 if (collate->cursor != NULL && collate->cursor->next != NULL)
682 collate->cursor->next->last = elem;
683 if (collate->cursor != NULL)
684 collate->cursor->next = elem;
685 if (collate->start == NULL)
687 assert (collate->cursor == NULL);
688 collate->start = elem;
691 elem->section = collate->current_section;
693 if (collate->current_section->first == NULL)
694 collate->current_section->first = elem;
695 if (collate->current_section->last == collate->cursor)
696 collate->current_section->last = elem;
698 collate->cursor = elem;
700 elem->weights = (struct element_list_t *)
701 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
702 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
704 weight_cnt = 0;
706 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
709 if (arg->tok == tok_eof || arg->tok == tok_eol)
710 break;
712 if (arg->tok == tok_ignore)
714 /* The weight for this level has to be ignored. We use the
715 null pointer to indicate this. */
716 elem->weights[weight_cnt].w = (struct element_t **)
717 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
718 elem->weights[weight_cnt].w[0] = NULL;
719 elem->weights[weight_cnt].cnt = 1;
721 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
723 char ucs4str[10];
724 struct element_t *val;
725 char *symstr;
726 size_t symlen;
728 if (arg->tok == tok_bsymbol)
730 symstr = arg->val.str.startmb;
731 symlen = arg->val.str.lenmb;
733 else
735 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
736 symstr = ucs4str;
737 symlen = 9;
740 val = find_element (ldfile, collate, symstr, symlen);
741 if (val == NULL)
742 break;
744 elem->weights[weight_cnt].w = (struct element_t **)
745 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
746 elem->weights[weight_cnt].w[0] = val;
747 elem->weights[weight_cnt].cnt = 1;
749 else if (arg->tok == tok_string)
751 /* Split the string up in the individual characters and put
752 the element definitions in the list. */
753 const char *cp = arg->val.str.startmb;
754 int cnt = 0;
755 struct element_t *charelem;
756 struct element_t **weights = NULL;
757 int max = 0;
759 if (*cp == '\0')
761 lr_error (ldfile, _("%s: empty weight string not allowed"),
762 "LC_COLLATE");
763 lr_ignore_rest (ldfile, 0);
764 break;
769 if (*cp == '<')
771 /* Ahh, it's a bsymbol or an UCS4 value. If it's
772 the latter we have to unify the name. */
773 const char *startp = ++cp;
774 size_t len;
776 while (*cp != '>')
778 if (*cp == ldfile->escape_char)
779 ++cp;
780 if (*cp == '\0')
781 /* It's a syntax error. */
782 goto syntax;
784 ++cp;
787 if (cp - startp == 5 && startp[0] == 'U'
788 && isxdigit (startp[1]) && isxdigit (startp[2])
789 && isxdigit (startp[3]) && isxdigit (startp[4]))
791 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
792 char *newstr;
794 newstr = (char *) xmalloc (10);
795 snprintf (newstr, 10, "U%08X", ucs4);
796 startp = newstr;
798 len = 9;
800 else
801 len = cp - startp;
803 charelem = find_element (ldfile, collate, startp, len);
804 ++cp;
806 else
808 /* People really shouldn't use characters directly in
809 the string. Especially since it's not really clear
810 what this means. We interpret all characters in the
811 string as if that would be bsymbols. Otherwise we
812 would have to match back to bsymbols somehow and this
813 is normally not what people normally expect. */
814 charelem = find_element (ldfile, collate, cp++, 1);
817 if (charelem == NULL)
819 /* We ignore the rest of the line. */
820 lr_ignore_rest (ldfile, 0);
821 break;
824 /* Add the pointer. */
825 if (cnt >= max)
827 struct element_t **newp;
828 max += 10;
829 newp = (struct element_t **)
830 alloca (max * sizeof (struct element_t *));
831 memcpy (newp, weights, cnt * sizeof (struct element_t *));
832 weights = newp;
834 weights[cnt++] = charelem;
836 while (*cp != '\0');
838 /* Now store the information. */
839 elem->weights[weight_cnt].w = (struct element_t **)
840 obstack_alloc (&collate->mempool,
841 cnt * sizeof (struct element_t *));
842 memcpy (elem->weights[weight_cnt].w, weights,
843 cnt * sizeof (struct element_t *));
844 elem->weights[weight_cnt].cnt = cnt;
846 /* We don't need the string anymore. */
847 free (arg->val.str.startmb);
849 else if (ellipsis != tok_none
850 && (arg->tok == tok_ellipsis2
851 || arg->tok == tok_ellipsis3
852 || arg->tok == tok_ellipsis4))
854 /* It must be the same ellipsis as used in the initial column. */
855 if (arg->tok != ellipsis)
856 lr_error (ldfile, _("\
857 %s: weights must use the same ellipsis symbol as the name"),
858 "LC_COLLATE");
860 /* The weight for this level will depend on the element
861 iterating over the range. Put a placeholder. */
862 elem->weights[weight_cnt].w = (struct element_t **)
863 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
864 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
865 elem->weights[weight_cnt].cnt = 1;
867 else
869 syntax:
870 /* It's a syntax error. */
871 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
872 lr_ignore_rest (ldfile, 0);
873 break;
876 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
877 /* This better should be the end of the line or a semicolon. */
878 if (arg->tok == tok_semicolon)
879 /* OK, ignore this and read the next token. */
880 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
881 else if (arg->tok != tok_eof && arg->tok != tok_eol)
883 /* It's a syntax error. */
884 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
885 lr_ignore_rest (ldfile, 0);
886 break;
889 while (++weight_cnt < nrules);
891 if (weight_cnt < nrules)
893 /* This means the rest of the line uses the current element as
894 the weight. */
897 elem->weights[weight_cnt].w = (struct element_t **)
898 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
899 if (ellipsis == tok_none)
900 elem->weights[weight_cnt].w[0] = elem;
901 else
902 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
903 elem->weights[weight_cnt].cnt = 1;
905 while (++weight_cnt < nrules);
907 else
909 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
911 /* Too many rule values. */
912 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
913 lr_ignore_rest (ldfile, 0);
915 else
916 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
921 static int
922 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
923 const struct charmap_t *charmap, struct repertoire_t *repertoire,
924 struct localedef_t *result)
926 /* First find out what kind of symbol this is. */
927 struct charseq *seq;
928 uint32_t wc;
929 struct element_t *elem = NULL;
930 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
932 /* Try to find the character in the charmap. */
933 seq = charmap_find_value (charmap, symstr, symlen);
935 /* Determine the wide character. */
936 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
938 wc = repertoire_find_value (repertoire, symstr, symlen);
939 if (seq != NULL)
940 seq->ucs4 = wc;
942 else
943 wc = seq->ucs4;
945 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
947 /* It's no character, so look through the collation elements and
948 symbol list. */
949 void *ptr = elem;
950 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
952 void *result;
953 struct symbol_t *sym = NULL;
955 /* It's also collation element. Therefore it's either a
956 collating symbol or it's a character which is not
957 supported by the character set. In the later case we
958 simply create a dummy entry. */
959 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
961 /* It's a collation symbol. */
962 sym = (struct symbol_t *) result;
964 elem = sym->order;
967 if (elem == NULL)
969 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
971 if (sym != NULL)
972 sym->order = elem;
973 else
974 /* Enter a fake element in the sequence table. This
975 won't cause anything in the output since there is
976 no multibyte or wide character associated with
977 it. */
978 insert_entry (&collate->seq_table, symstr, symlen, elem);
981 else
982 /* Copy the result back. */
983 elem = ptr;
985 else
987 /* Otherwise the symbols stands for a character. */
988 void *ptr = elem;
989 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
991 uint32_t wcs[2] = { wc, 0 };
993 /* We have to allocate an entry. */
994 elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
995 seq != NULL ? seq->nbytes : 0,
996 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
997 symstr, symlen, 1);
999 /* And add it to the table. */
1000 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1001 /* This cannot happen. */
1002 assert (! "Internal error");
1004 else
1006 /* Copy the result back. */
1007 elem = ptr;
1009 /* Maybe the character was used before the definition. In this case
1010 we have to insert the byte sequences now. */
1011 if (elem->mbs == NULL && seq != NULL)
1013 elem->mbs = obstack_copy0 (&collate->mempool,
1014 seq->bytes, seq->nbytes);
1015 elem->nmbs = seq->nbytes;
1018 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1020 uint32_t wcs[2] = { wc, 0 };
1022 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1023 elem->nwcs = 1;
1028 /* Test whether this element is not already in the list. */
1029 if (elem->next != NULL || elem == collate->cursor)
1031 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1032 (int) symlen, symstr, elem->file, elem->line);
1033 lr_ignore_rest (ldfile, 0);
1034 return 1;
1037 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1039 return 0;
1043 static void
1044 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1045 enum token_t ellipsis, const struct charmap_t *charmap,
1046 struct repertoire_t *repertoire,
1047 struct localedef_t *result)
1049 struct element_t *startp;
1050 struct element_t *endp;
1051 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1053 /* Unlink the entry added for the ellipsis. */
1054 unlink_element (collate);
1055 startp = collate->cursor;
1057 /* Process and add the end-entry. */
1058 if (symstr != NULL
1059 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1060 /* Something went wrong with inserting the to-value. This means
1061 we cannot process the ellipsis. */
1062 return;
1064 /* Reset the cursor. */
1065 collate->cursor = startp;
1067 /* Now we have to handle many different situations:
1068 - we have to distinguish between the three different ellipsis forms
1069 - the is the ellipsis at the beginning, in the middle, or at the end.
1071 endp = collate->cursor->next;
1072 assert (symstr == NULL || endp != NULL);
1074 /* XXX The following is probably very wrong since also collating symbols
1075 can appear in ranges. But do we want/can refine the test for that? */
1076 #if 0
1077 /* Both, the start and the end symbol, must stand for characters. */
1078 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1079 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1081 lr_error (ldfile, _("\
1082 %s: the start and the end symbol of a range must stand for characters"),
1083 "LC_COLLATE");
1084 return;
1086 #endif
1088 if (ellipsis == tok_ellipsis3)
1090 /* One requirement we make here: the length of the byte
1091 sequences for the first and end character must be the same.
1092 This is mainly to prevent unwanted effects and this is often
1093 not what is wanted. */
1094 size_t len = (startp->mbs != NULL ? startp->nmbs
1095 : (endp->mbs != NULL ? endp->nmbs : 0));
1096 char mbcnt[len + 1];
1097 char mbend[len + 1];
1099 /* Well, this should be caught somewhere else already. Just to
1100 make sure. */
1101 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1102 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1104 if (startp != NULL && endp != NULL
1105 && startp->mbs != NULL && endp->mbs != NULL
1106 && startp->nmbs != endp->nmbs)
1108 lr_error (ldfile, _("\
1109 %s: byte sequences of first and last character must have the same length"),
1110 "LC_COLLATE");
1111 return;
1114 /* Determine whether we have to generate multibyte sequences. */
1115 if ((startp == NULL || startp->mbs != NULL)
1116 && (endp == NULL || endp->mbs != NULL))
1118 int cnt;
1119 int ret;
1121 /* Prepare the beginning byte sequence. This is either from the
1122 beginning byte sequence or it is all nulls if it was an
1123 initial ellipsis. */
1124 if (startp == NULL || startp->mbs == NULL)
1125 memset (mbcnt, '\0', len);
1126 else
1128 memcpy (mbcnt, startp->mbs, len);
1130 /* And increment it so that the value is the first one we will
1131 try to insert. */
1132 for (cnt = len - 1; cnt >= 0; --cnt)
1133 if (++mbcnt[cnt] != '\0')
1134 break;
1136 mbcnt[len] = '\0';
1138 /* And the end sequence. */
1139 if (endp == NULL || endp->mbs == NULL)
1140 memset (mbend, '\0', len);
1141 else
1142 memcpy (mbend, endp->mbs, len);
1143 mbend[len] = '\0';
1145 /* Test whether we have a correct range. */
1146 ret = memcmp (mbcnt, mbend, len);
1147 if (ret >= 0)
1149 if (ret > 0)
1150 lr_error (ldfile, _("%s: byte sequence of first character of \
1151 sequence is not lower than that of the last character"), "LC_COLLATE");
1152 return;
1155 /* Generate the byte sequences data. */
1156 while (1)
1158 struct charseq *seq;
1160 /* Quite a bit of work ahead. We have to find the character
1161 definition for the byte sequence and then determine the
1162 wide character belonging to it. */
1163 seq = charmap_find_symbol (charmap, mbcnt, len);
1164 if (seq != NULL)
1166 struct element_t *elem;
1167 size_t namelen;
1169 /* I don't this this can ever happen. */
1170 assert (seq->name != NULL);
1171 namelen = strlen (seq->name);
1173 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1174 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1175 namelen);
1177 /* Now we are ready to insert the new value in the
1178 sequence. Find out whether the element is
1179 already known. */
1180 void *ptr;
1181 if (find_entry (&collate->seq_table, seq->name, namelen,
1182 &ptr) != 0)
1184 uint32_t wcs[2] = { seq->ucs4, 0 };
1186 /* We have to allocate an entry. */
1187 elem = new_element (collate, mbcnt, len,
1188 seq->ucs4 == ILLEGAL_CHAR_VALUE
1189 ? NULL : wcs, seq->name,
1190 namelen, 1);
1192 /* And add it to the table. */
1193 if (insert_entry (&collate->seq_table, seq->name,
1194 namelen, elem) != 0)
1195 /* This cannot happen. */
1196 assert (! "Internal error");
1198 else
1199 /* Copy the result. */
1200 elem = ptr;
1202 /* Test whether this element is not already in the list. */
1203 if (elem->next != NULL || (collate->cursor != NULL
1204 && elem->next == collate->cursor))
1206 lr_error (ldfile, _("\
1207 order for `%.*s' already defined at %s:%Zu"),
1208 (int) namelen, seq->name,
1209 elem->file, elem->line);
1210 goto increment;
1213 /* Enqueue the new element. */
1214 elem->last = collate->cursor;
1215 if (collate->cursor == NULL)
1216 elem->next = NULL;
1217 else
1219 elem->next = collate->cursor->next;
1220 elem->last->next = elem;
1221 if (elem->next != NULL)
1222 elem->next->last = elem;
1224 if (collate->start == NULL)
1226 assert (collate->cursor == NULL);
1227 collate->start = elem;
1229 collate->cursor = elem;
1231 /* Add the weight value. We take them from the
1232 `ellipsis_weights' member of `collate'. */
1233 elem->weights = (struct element_list_t *)
1234 obstack_alloc (&collate->mempool,
1235 nrules * sizeof (struct element_list_t));
1236 for (cnt = 0; cnt < nrules; ++cnt)
1237 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1238 && (collate->ellipsis_weight.weights[cnt].w[0]
1239 == ELEMENT_ELLIPSIS2))
1241 elem->weights[cnt].w = (struct element_t **)
1242 obstack_alloc (&collate->mempool,
1243 sizeof (struct element_t *));
1244 elem->weights[cnt].w[0] = elem;
1245 elem->weights[cnt].cnt = 1;
1247 else
1249 /* Simply use the weight from `ellipsis_weight'. */
1250 elem->weights[cnt].w =
1251 collate->ellipsis_weight.weights[cnt].w;
1252 elem->weights[cnt].cnt =
1253 collate->ellipsis_weight.weights[cnt].cnt;
1257 /* Increment for the next round. */
1258 increment:
1259 for (cnt = len - 1; cnt >= 0; --cnt)
1260 if (++mbcnt[cnt] != '\0')
1261 break;
1263 /* Find out whether this was all. */
1264 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1265 /* Yep, that's all. */
1266 break;
1270 else
1272 /* For symbolic range we naturally must have a beginning and an
1273 end specified by the user. */
1274 if (startp == NULL)
1275 lr_error (ldfile, _("\
1276 %s: symbolic range ellipsis must not directly follow `order_start'"),
1277 "LC_COLLATE");
1278 else if (endp == NULL)
1279 lr_error (ldfile, _("\
1280 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1281 "LC_COLLATE");
1282 else
1284 /* Determine the range. To do so we have to determine the
1285 common prefix of the both names and then the numeric
1286 values of both ends. */
1287 size_t lenfrom = strlen (startp->name);
1288 size_t lento = strlen (endp->name);
1289 char buf[lento + 1];
1290 int preflen = 0;
1291 long int from;
1292 long int to;
1293 char *cp;
1294 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1296 if (lenfrom != lento)
1298 invalid_range:
1299 lr_error (ldfile, _("\
1300 `%s' and `%.*s' are no valid names for symbolic range"),
1301 startp->name, (int) lento, endp->name);
1302 return;
1305 while (startp->name[preflen] == endp->name[preflen])
1306 if (startp->name[preflen] == '\0')
1307 /* Nothing to be done. The start and end point are identical
1308 and while inserting the end point we have already given
1309 the user an error message. */
1310 return;
1311 else
1312 ++preflen;
1314 errno = 0;
1315 from = strtol (startp->name + preflen, &cp, base);
1316 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1317 goto invalid_range;
1319 errno = 0;
1320 to = strtol (endp->name + preflen, &cp, base);
1321 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1322 goto invalid_range;
1324 /* Copy the prefix. */
1325 memcpy (buf, startp->name, preflen);
1327 /* Loop over all values. */
1328 for (++from; from < to; ++from)
1330 struct element_t *elem = NULL;
1331 struct charseq *seq;
1332 uint32_t wc;
1333 int cnt;
1335 /* Generate the name. */
1336 sprintf (buf + preflen, base == 10 ? "%ld" : "%lX", from);
1338 /* Look whether this name is already defined. */
1339 void *ptr;
1340 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1342 /* Copy back the result. */
1343 elem = ptr;
1345 if (elem->next != NULL || (collate->cursor != NULL
1346 && elem->next == collate->cursor))
1348 lr_error (ldfile, _("\
1349 %s: order for `%.*s' already defined at %s:%Zu"),
1350 "LC_COLLATE", (int) lenfrom, buf,
1351 elem->file, elem->line);
1352 continue;
1355 if (elem->name == NULL)
1357 lr_error (ldfile, _("%s: `%s' must be a character"),
1358 "LC_COLLATE", buf);
1359 continue;
1363 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1365 /* Search for a character of this name. */
1366 seq = charmap_find_value (charmap, buf, lenfrom);
1367 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1369 wc = repertoire_find_value (repertoire, buf, lenfrom);
1371 if (seq != NULL)
1372 seq->ucs4 = wc;
1374 else
1375 wc = seq->ucs4;
1377 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1378 /* We don't know anything about a character with this
1379 name. XXX Should we warn? */
1380 continue;
1382 if (elem == NULL)
1384 uint32_t wcs[2] = { wc, 0 };
1386 /* We have to allocate an entry. */
1387 elem = new_element (collate,
1388 seq != NULL ? seq->bytes : NULL,
1389 seq != NULL ? seq->nbytes : 0,
1390 wc == ILLEGAL_CHAR_VALUE
1391 ? NULL : wcs, buf, lenfrom, 1);
1393 else
1395 /* Update the element. */
1396 if (seq != NULL)
1398 elem->mbs = obstack_copy0 (&collate->mempool,
1399 seq->bytes, seq->nbytes);
1400 elem->nmbs = seq->nbytes;
1403 if (wc != ILLEGAL_CHAR_VALUE)
1405 uint32_t zero = 0;
1407 obstack_grow (&collate->mempool,
1408 &wc, sizeof (uint32_t));
1409 obstack_grow (&collate->mempool,
1410 &zero, sizeof (uint32_t));
1411 elem->wcs = obstack_finish (&collate->mempool);
1412 elem->nwcs = 1;
1416 elem->file = ldfile->fname;
1417 elem->line = ldfile->lineno;
1418 elem->section = collate->current_section;
1421 /* Enqueue the new element. */
1422 elem->last = collate->cursor;
1423 elem->next = collate->cursor->next;
1424 elem->last->next = elem;
1425 if (elem->next != NULL)
1426 elem->next->last = elem;
1427 collate->cursor = elem;
1429 /* Now add the weights. They come from the `ellipsis_weights'
1430 member of `collate'. */
1431 elem->weights = (struct element_list_t *)
1432 obstack_alloc (&collate->mempool,
1433 nrules * sizeof (struct element_list_t));
1434 for (cnt = 0; cnt < nrules; ++cnt)
1435 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1436 && (collate->ellipsis_weight.weights[cnt].w[0]
1437 == ELEMENT_ELLIPSIS2))
1439 elem->weights[cnt].w = (struct element_t **)
1440 obstack_alloc (&collate->mempool,
1441 sizeof (struct element_t *));
1442 elem->weights[cnt].w[0] = elem;
1443 elem->weights[cnt].cnt = 1;
1445 else
1447 /* Simply use the weight from `ellipsis_weight'. */
1448 elem->weights[cnt].w =
1449 collate->ellipsis_weight.weights[cnt].w;
1450 elem->weights[cnt].cnt =
1451 collate->ellipsis_weight.weights[cnt].cnt;
1459 static void
1460 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1461 struct localedef_t *copy_locale, int ignore_content)
1463 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1465 struct locale_collate_t *collate;
1467 if (copy_locale == NULL)
1469 collate = locale->categories[LC_COLLATE].collate =
1470 (struct locale_collate_t *)
1471 xcalloc (1, sizeof (struct locale_collate_t));
1473 /* Init the various data structures. */
1474 init_hash (&collate->elem_table, 100);
1475 init_hash (&collate->sym_table, 100);
1476 init_hash (&collate->seq_table, 500);
1477 obstack_init (&collate->mempool);
1479 collate->col_weight_max = -1;
1481 else
1482 /* Reuse the copy_locale's data structures. */
1483 collate = locale->categories[LC_COLLATE].collate =
1484 copy_locale->categories[LC_COLLATE].collate;
1487 ldfile->translate_strings = 0;
1488 ldfile->return_widestr = 0;
1492 void
1493 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1495 /* Now is the time when we can assign the individual collation
1496 values for all the symbols. We have possibly different values
1497 for the wide- and the multibyte-character symbols. This is done
1498 since it might make a difference in the encoding if there is in
1499 some cases no multibyte-character but there are wide-characters.
1500 (The other way around it is not important since theencoded
1501 collation value in the wide-character case is 32 bits wide and
1502 therefore requires no encoding).
1504 The lowest collation value assigned is 2. Zero is reserved for
1505 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1506 functions and 1 is used to separate the individual passes for the
1507 different rules.
1509 We also have to construct is list with all the bytes/words which
1510 can come first in a sequence, followed by all the elements which
1511 also start with this byte/word. The order is reverse which has
1512 among others the important effect that longer strings are located
1513 first in the list. This is required for the output data since
1514 the algorithm used in `strcoll' etc depends on this.
1516 The multibyte case is easy. We simply sort into an array with
1517 256 elements. */
1518 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1519 int mbact[nrules];
1520 int wcact;
1521 int mbseqact;
1522 int wcseqact;
1523 struct element_t *runp;
1524 int i;
1525 int need_undefined = 0;
1526 struct section_list *sect;
1527 int ruleidx;
1528 int nr_wide_elems = 0;
1530 if (collate == NULL)
1532 /* No data, no check. */
1533 if (! be_quiet)
1534 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1535 "LC_COLLATE"));
1536 return;
1539 /* If this assertion is hit change the type in `element_t'. */
1540 assert (nrules <= sizeof (runp->used_in_level) * 8);
1542 /* Make sure that the `position' rule is used either in all sections
1543 or in none. */
1544 for (i = 0; i < nrules; ++i)
1545 for (sect = collate->sections; sect != NULL; sect = sect->next)
1546 if (sect->rules != NULL
1547 && ((sect->rules[i] & sort_position)
1548 != (collate->sections->rules[i] & sort_position)))
1550 WITH_CUR_LOCALE (error (0, 0, _("\
1551 %s: `position' must be used for a specific level in all sections or none"),
1552 "LC_COLLATE"));
1553 break;
1556 /* Find out which elements are used at which level. At the same
1557 time we find out whether we have any undefined symbols. */
1558 runp = collate->start;
1559 while (runp != NULL)
1561 if (runp->mbs != NULL)
1563 for (i = 0; i < nrules; ++i)
1565 int j;
1567 for (j = 0; j < runp->weights[i].cnt; ++j)
1568 /* A NULL pointer as the weight means IGNORE. */
1569 if (runp->weights[i].w[j] != NULL)
1571 if (runp->weights[i].w[j]->weights == NULL)
1573 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1574 runp->line,
1575 _("symbol `%s' not defined"),
1576 runp->weights[i].w[j]->name));
1578 need_undefined = 1;
1579 runp->weights[i].w[j] = &collate->undefined;
1581 else
1582 /* Set the bit for the level. */
1583 runp->weights[i].w[j]->used_in_level |= 1 << i;
1588 /* Up to the next entry. */
1589 runp = runp->next;
1592 /* Walk through the list of defined sequences and assign weights. Also
1593 create the data structure which will allow generating the single byte
1594 character based tables.
1596 Since at each time only the weights for each of the rules are
1597 only compared to other weights for this rule it is possible to
1598 assign more compact weight values than simply counting all
1599 weights in sequence. We can assign weights from 3, one for each
1600 rule individually and only for those elements, which are actually
1601 used for this rule.
1603 Why is this important? It is not for the wide char table. But
1604 it is for the singlebyte output since here larger numbers have to
1605 be encoded to make it possible to emit the value as a byte
1606 string. */
1607 for (i = 0; i < nrules; ++i)
1608 mbact[i] = 2;
1609 wcact = 2;
1610 mbseqact = 0;
1611 wcseqact = 0;
1612 runp = collate->start;
1613 while (runp != NULL)
1615 /* Determine the order. */
1616 if (runp->used_in_level != 0)
1618 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1619 nrules * sizeof (int));
1621 for (i = 0; i < nrules; ++i)
1622 if ((runp->used_in_level & (1 << i)) != 0)
1623 runp->mborder[i] = mbact[i]++;
1624 else
1625 runp->mborder[i] = 0;
1628 if (runp->mbs != NULL)
1630 struct element_t **eptr;
1631 struct element_t *lastp = NULL;
1633 /* Find the point where to insert in the list. */
1634 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1635 while (*eptr != NULL)
1637 if ((*eptr)->nmbs < runp->nmbs)
1638 break;
1640 if ((*eptr)->nmbs == runp->nmbs)
1642 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1644 if (c == 0)
1646 /* This should not happen. It means that we have
1647 to symbols with the same byte sequence. It is
1648 of course an error. */
1649 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1650 (*eptr)->line,
1651 _("\
1652 symbol `%s' has the same encoding as"), (*eptr)->name);
1653 error_at_line (0, 0, runp->file,
1654 runp->line,
1655 _("symbol `%s'"),
1656 runp->name));
1657 goto dont_insert;
1659 else if (c < 0)
1660 /* Insert it here. */
1661 break;
1664 /* To the next entry. */
1665 lastp = *eptr;
1666 eptr = &(*eptr)->mbnext;
1669 /* Set the pointers. */
1670 runp->mbnext = *eptr;
1671 runp->mblast = lastp;
1672 if (*eptr != NULL)
1673 (*eptr)->mblast = runp;
1674 *eptr = runp;
1675 dont_insert:
1679 if (runp->used_in_level)
1681 runp->wcorder = wcact++;
1683 /* We take the opportunity to count the elements which have
1684 wide characters. */
1685 ++nr_wide_elems;
1688 if (runp->is_character)
1690 if (runp->nmbs == 1)
1691 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1693 runp->wcseqorder = wcseqact++;
1695 else if (runp->mbs != NULL && runp->weights != NULL)
1696 /* This is for collation elements. */
1697 runp->wcseqorder = wcseqact++;
1699 /* Up to the next entry. */
1700 runp = runp->next;
1703 /* Find out whether any of the `mbheads' entries is unset. In this
1704 case we use the UNDEFINED entry. */
1705 for (i = 1; i < 256; ++i)
1706 if (collate->mbheads[i] == NULL)
1708 need_undefined = 1;
1709 collate->mbheads[i] = &collate->undefined;
1712 /* Now to the wide character case. */
1713 collate->wcheads.p = 6;
1714 collate->wcheads.q = 10;
1715 wchead_table_init (&collate->wcheads);
1717 collate->wcseqorder.p = 6;
1718 collate->wcseqorder.q = 10;
1719 collseq_table_init (&collate->wcseqorder);
1721 /* Start adding. */
1722 runp = collate->start;
1723 while (runp != NULL)
1725 if (runp->wcs != NULL)
1727 struct element_t *e;
1728 struct element_t **eptr;
1729 struct element_t *lastp;
1731 /* Insert the collation sequence value. */
1732 if (runp->is_character)
1733 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1734 runp->wcseqorder);
1736 /* Find the point where to insert in the list. */
1737 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1738 eptr = &e;
1739 lastp = NULL;
1740 while (*eptr != NULL)
1742 if ((*eptr)->nwcs < runp->nwcs)
1743 break;
1745 if ((*eptr)->nwcs == runp->nwcs)
1747 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1748 (wchar_t *) runp->wcs, runp->nwcs);
1750 if (c == 0)
1752 /* This should not happen. It means that we have
1753 two symbols with the same byte sequence. It is
1754 of course an error. */
1755 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1756 (*eptr)->line,
1757 _("\
1758 symbol `%s' has the same encoding as"), (*eptr)->name);
1759 error_at_line (0, 0, runp->file,
1760 runp->line,
1761 _("symbol `%s'"),
1762 runp->name));
1763 goto dont_insertwc;
1765 else if (c < 0)
1766 /* Insert it here. */
1767 break;
1770 /* To the next entry. */
1771 lastp = *eptr;
1772 eptr = &(*eptr)->wcnext;
1775 /* Set the pointers. */
1776 runp->wcnext = *eptr;
1777 runp->wclast = lastp;
1778 if (*eptr != NULL)
1779 (*eptr)->wclast = runp;
1780 *eptr = runp;
1781 if (eptr == &e)
1782 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1783 dont_insertwc:
1787 /* Up to the next entry. */
1788 runp = runp->next;
1791 collseq_table_finalize (&collate->wcseqorder);
1793 /* Now determine whether the UNDEFINED entry is needed and if yes,
1794 whether it was defined. */
1795 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1796 if (collate->undefined.file == NULL)
1798 if (need_undefined)
1800 /* This seems not to be enforced by recent standards. Don't
1801 emit an error, simply append UNDEFINED at the end. */
1802 if (0)
1803 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1805 /* Add UNDEFINED at the end. */
1806 collate->undefined.mborder =
1807 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1809 for (i = 0; i < nrules; ++i)
1810 collate->undefined.mborder[i] = mbact[i]++;
1813 /* In any case we will need the definition for the wide character
1814 case. But we will not complain that it is missing since the
1815 specification strangely enough does not seem to account for
1816 this. */
1817 collate->undefined.wcorder = wcact++;
1820 /* Finally, try to unify the rules for the sections. Whenever the rules
1821 for a section are the same as those for another section give the
1822 ruleset the same index. Since there are never many section we can
1823 use an O(n^2) algorithm here. */
1824 sect = collate->sections;
1825 while (sect != NULL && sect->rules == NULL)
1826 sect = sect->next;
1828 /* Bail out if we have no sections because of earlier errors. */
1829 if (sect == NULL)
1831 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1832 _("too many errors; giving up")));
1833 return;
1836 ruleidx = 0;
1839 struct section_list *osect = collate->sections;
1841 while (osect != sect)
1842 if (osect->rules != NULL
1843 && memcmp (osect->rules, sect->rules, nrules) == 0)
1844 break;
1845 else
1846 osect = osect->next;
1848 if (osect == sect)
1849 sect->ruleidx = ruleidx++;
1850 else
1851 sect->ruleidx = osect->ruleidx;
1853 /* Next section. */
1855 sect = sect->next;
1856 while (sect != NULL && sect->rules == NULL);
1858 while (sect != NULL);
1859 /* We are currently not prepared for more than 128 rulesets. But this
1860 should never really be a problem. */
1861 assert (ruleidx <= 128);
1865 static int32_t
1866 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1867 struct element_t *elem)
1869 size_t cnt;
1870 int32_t retval;
1872 /* Optimize the use of UNDEFINED. */
1873 if (elem == &collate->undefined)
1874 /* The weights are already inserted. */
1875 return 0;
1877 /* This byte can start exactly one collation element and this is
1878 a single byte. We can directly give the index to the weights. */
1879 retval = obstack_object_size (pool);
1881 /* Construct the weight. */
1882 for (cnt = 0; cnt < nrules; ++cnt)
1884 char buf[elem->weights[cnt].cnt * 7];
1885 int len = 0;
1886 int i;
1888 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1889 /* Encode the weight value. We do nothing for IGNORE entries. */
1890 if (elem->weights[cnt].w[i] != NULL)
1891 len += utf8_encode (&buf[len],
1892 elem->weights[cnt].w[i]->mborder[cnt]);
1894 /* And add the buffer content. */
1895 obstack_1grow (pool, len);
1896 obstack_grow (pool, buf, len);
1899 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1903 static int32_t
1904 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1905 struct element_t *elem)
1907 size_t cnt;
1908 int32_t retval;
1910 /* Optimize the use of UNDEFINED. */
1911 if (elem == &collate->undefined)
1912 /* The weights are already inserted. */
1913 return 0;
1915 /* This byte can start exactly one collation element and this is
1916 a single byte. We can directly give the index to the weights. */
1917 retval = obstack_object_size (pool) / sizeof (int32_t);
1919 /* Construct the weight. */
1920 for (cnt = 0; cnt < nrules; ++cnt)
1922 int32_t buf[elem->weights[cnt].cnt];
1923 int i;
1924 int32_t j;
1926 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1927 if (elem->weights[cnt].w[i] != NULL)
1928 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1930 /* And add the buffer content. */
1931 obstack_int32_grow (pool, j);
1933 obstack_grow (pool, buf, j * sizeof (int32_t));
1936 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1940 void
1941 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
1942 const char *output_path)
1944 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1945 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1946 struct iovec iov[2 + nelems];
1947 struct locale_file data;
1948 uint32_t idx[nelems];
1949 size_t cnt;
1950 size_t ch;
1951 int32_t tablemb[256];
1952 struct obstack weightpool;
1953 struct obstack extrapool;
1954 struct obstack indirectpool;
1955 struct section_list *sect;
1956 struct collidx_table tablewc;
1957 uint32_t elem_size;
1958 uint32_t *elem_table;
1959 int i;
1960 struct element_t *runp;
1962 data.magic = LIMAGIC (LC_COLLATE);
1963 data.n = nelems;
1964 iov[0].iov_base = (void *) &data;
1965 iov[0].iov_len = sizeof (data);
1967 iov[1].iov_base = (void *) idx;
1968 iov[1].iov_len = sizeof (idx);
1970 idx[0] = iov[0].iov_len + iov[1].iov_len;
1971 cnt = 0;
1973 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1974 iov[2 + cnt].iov_base = &nrules;
1975 iov[2 + cnt].iov_len = sizeof (uint32_t);
1976 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1977 ++cnt;
1979 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
1980 if (collate == NULL)
1982 int32_t dummy = 0;
1984 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1986 /* The words have to be handled specially. */
1987 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
1989 iov[2 + cnt].iov_base = &dummy;
1990 iov[2 + cnt].iov_len = sizeof (int32_t);
1992 else
1994 iov[2 + cnt].iov_base = NULL;
1995 iov[2 + cnt].iov_len = 0;
1998 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1999 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2000 ++cnt;
2003 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2005 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2007 return;
2010 obstack_init (&weightpool);
2011 obstack_init (&extrapool);
2012 obstack_init (&indirectpool);
2014 /* Since we are using the sign of an integer to mark indirection the
2015 offsets in the arrays we are indirectly referring to must not be
2016 zero since -0 == 0. Therefore we add a bit of dummy content. */
2017 obstack_int32_grow (&extrapool, 0);
2018 obstack_int32_grow (&indirectpool, 0);
2020 /* Prepare the ruleset table. */
2021 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2022 if (sect->rules != NULL && sect->ruleidx == i)
2024 int j;
2026 obstack_make_room (&weightpool, nrules);
2028 for (j = 0; j < nrules; ++j)
2029 obstack_1grow_fast (&weightpool, sect->rules[j]);
2030 ++i;
2032 /* And align the output. */
2033 i = (nrules * i) % __alignof__ (int32_t);
2034 if (i > 0)
2036 obstack_1grow (&weightpool, '\0');
2037 while (++i < __alignof__ (int32_t));
2039 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2040 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2041 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2042 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2043 ++cnt;
2045 /* Generate the 8-bit table. Walk through the lists of sequences
2046 starting with the same byte and add them one after the other to
2047 the table. In case we have more than one sequence starting with
2048 the same byte we have to use extra indirection.
2050 First add a record for the NUL byte. This entry will never be used
2051 so it does not matter. */
2052 tablemb[0] = 0;
2054 /* Now insert the `UNDEFINED' value if it is used. Since this value
2055 will probably be used more than once it is good to store the
2056 weights only once. */
2057 if (collate->undefined.used_in_level != 0)
2058 output_weight (&weightpool, collate, &collate->undefined);
2060 for (ch = 1; ch < 256; ++ch)
2061 if (collate->mbheads[ch]->mbnext == NULL
2062 && collate->mbheads[ch]->nmbs <= 1)
2064 tablemb[ch] = output_weight (&weightpool, collate,
2065 collate->mbheads[ch]);
2067 else
2069 /* The entries in the list are sorted by length and then
2070 alphabetically. This is the order in which we will add the
2071 elements to the collation table. This allows simply walking
2072 the table in sequence and stopping at the first matching
2073 entry. Since the longer sequences are coming first in the
2074 list they have the possibility to match first, just as it
2075 has to be. In the worst case we are walking to the end of
2076 the list where we put, if no singlebyte sequence is defined
2077 in the locale definition, the weights for UNDEFINED.
2079 To reduce the length of the search list we compress them a bit.
2080 This happens by collecting sequences of consecutive byte
2081 sequences in one entry (having and begin and end byte sequence)
2082 and add only one index into the weight table. We can find the
2083 consecutive entries since they are also consecutive in the list. */
2084 struct element_t *runp = collate->mbheads[ch];
2085 struct element_t *lastp;
2087 assert ((obstack_object_size (&extrapool)
2088 & (__alignof__ (int32_t) - 1)) == 0);
2090 tablemb[ch] = -obstack_object_size (&extrapool);
2094 /* Store the current index in the weight table. We know that
2095 the current position in the `extrapool' is aligned on a
2096 32-bit address. */
2097 int32_t weightidx;
2098 int added;
2100 /* Find out whether this is a single entry or we have more than
2101 one consecutive entry. */
2102 if (runp->mbnext != NULL
2103 && runp->nmbs == runp->mbnext->nmbs
2104 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2105 && (runp->mbs[runp->nmbs - 1]
2106 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2108 int i;
2109 struct element_t *series_startp = runp;
2110 struct element_t *curp;
2112 /* Compute how much space we will need. */
2113 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2114 + __alignof__ (int32_t) - 1)
2115 & ~(__alignof__ (int32_t) - 1));
2116 assert ((obstack_object_size (&extrapool)
2117 & (__alignof__ (int32_t) - 1)) == 0);
2118 obstack_make_room (&extrapool, added);
2120 /* More than one consecutive entry. We mark this by having
2121 a negative index into the indirect table. */
2122 obstack_int32_grow_fast (&extrapool,
2123 -(obstack_object_size (&indirectpool)
2124 / sizeof (int32_t)));
2126 /* Now search first the end of the series. */
2128 runp = runp->mbnext;
2129 while (runp->mbnext != NULL
2130 && runp->nmbs == runp->mbnext->nmbs
2131 && memcmp (runp->mbs, runp->mbnext->mbs,
2132 runp->nmbs - 1) == 0
2133 && (runp->mbs[runp->nmbs - 1]
2134 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2136 /* Now walk backward from here to the beginning. */
2137 curp = runp;
2139 assert (runp->nmbs <= 256);
2140 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2141 for (i = 1; i < curp->nmbs; ++i)
2142 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2144 /* Now find the end of the consecutive sequence and
2145 add all the indices in the indirect pool. */
2148 weightidx = output_weight (&weightpool, collate, curp);
2149 obstack_int32_grow (&indirectpool, weightidx);
2151 curp = curp->mblast;
2153 while (curp != series_startp);
2155 /* Add the final weight. */
2156 weightidx = output_weight (&weightpool, collate, curp);
2157 obstack_int32_grow (&indirectpool, weightidx);
2159 /* And add the end byte sequence. Without length this
2160 time. */
2161 for (i = 1; i < curp->nmbs; ++i)
2162 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2164 else
2166 /* A single entry. Simply add the index and the length and
2167 string (except for the first character which is already
2168 tested for). */
2169 int i;
2171 /* Output the weight info. */
2172 weightidx = output_weight (&weightpool, collate, runp);
2174 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2175 + __alignof__ (int32_t) - 1)
2176 & ~(__alignof__ (int32_t) - 1));
2177 assert ((obstack_object_size (&extrapool)
2178 & (__alignof__ (int32_t) - 1)) == 0);
2179 obstack_make_room (&extrapool, added);
2181 obstack_int32_grow_fast (&extrapool, weightidx);
2182 assert (runp->nmbs <= 256);
2183 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2185 for (i = 1; i < runp->nmbs; ++i)
2186 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2189 /* Add alignment bytes if necessary. */
2190 while ((obstack_object_size (&extrapool)
2191 & (__alignof__ (int32_t) - 1)) != 0)
2192 obstack_1grow_fast (&extrapool, '\0');
2194 /* Next entry. */
2195 lastp = runp;
2196 runp = runp->mbnext;
2198 while (runp != NULL);
2200 assert ((obstack_object_size (&extrapool)
2201 & (__alignof__ (int32_t) - 1)) == 0);
2203 /* If the final entry in the list is not a single character we
2204 add an UNDEFINED entry here. */
2205 if (lastp->nmbs != 1)
2207 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2208 & ~(__alignof__ (int32_t) - 1));
2209 obstack_make_room (&extrapool, added);
2211 obstack_int32_grow_fast (&extrapool, 0);
2212 /* XXX What rule? We just pick the first. */
2213 obstack_1grow_fast (&extrapool, 0);
2214 /* Length is zero. */
2215 obstack_1grow_fast (&extrapool, 0);
2217 /* Add alignment bytes if necessary. */
2218 while ((obstack_object_size (&extrapool)
2219 & (__alignof__ (int32_t) - 1)) != 0)
2220 obstack_1grow_fast (&extrapool, '\0');
2224 /* Add padding to the tables if necessary. */
2225 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2226 != 0)
2227 obstack_1grow (&weightpool, 0);
2229 /* Now add the four tables. */
2230 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2231 iov[2 + cnt].iov_base = tablemb;
2232 iov[2 + cnt].iov_len = sizeof (tablemb);
2233 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2234 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2235 ++cnt;
2237 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2238 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2239 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2240 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2241 ++cnt;
2243 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2244 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2245 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2246 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2247 ++cnt;
2249 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2250 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2251 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2252 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2253 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2254 ++cnt;
2257 /* Now the same for the wide character table. We need to store some
2258 more information here. */
2259 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2260 iov[2 + cnt].iov_base = NULL;
2261 iov[2 + cnt].iov_len = 0;
2262 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2263 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2264 ++cnt;
2266 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2267 iov[2 + cnt].iov_base = NULL;
2268 iov[2 + cnt].iov_len = 0;
2269 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2270 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2271 ++cnt;
2273 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2274 iov[2 + cnt].iov_base = NULL;
2275 iov[2 + cnt].iov_len = 0;
2276 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2277 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2278 ++cnt;
2280 /* Since we are using the sign of an integer to mark indirection the
2281 offsets in the arrays we are indirectly referring to must not be
2282 zero since -0 == 0. Therefore we add a bit of dummy content. */
2283 obstack_int32_grow (&extrapool, 0);
2284 obstack_int32_grow (&indirectpool, 0);
2286 /* Now insert the `UNDEFINED' value if it is used. Since this value
2287 will probably be used more than once it is good to store the
2288 weights only once. */
2289 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2290 abort ();
2292 /* Generate the table. Walk through the lists of sequences starting
2293 with the same wide character and add them one after the other to
2294 the table. In case we have more than one sequence starting with
2295 the same byte we have to use extra indirection. */
2297 auto void add_to_tablewc (uint32_t ch, struct element_t *runp);
2299 void add_to_tablewc (uint32_t ch, struct element_t *runp)
2301 if (runp->wcnext == NULL && runp->nwcs == 1)
2303 int32_t weigthidx = output_weightwc (&weightpool, collate, runp);
2304 collidx_table_add (&tablewc, ch, weigthidx);
2306 else
2308 /* As for the singlebyte table, we recognize sequences and
2309 compress them. */
2310 struct element_t *lastp;
2312 collidx_table_add (&tablewc, ch,
2313 -(obstack_object_size (&extrapool) / sizeof (uint32_t)));
2317 /* Store the current index in the weight table. We know that
2318 the current position in the `extrapool' is aligned on a
2319 32-bit address. */
2320 int32_t weightidx;
2321 int added;
2323 /* Find out whether this is a single entry or we have more than
2324 one consecutive entry. */
2325 if (runp->wcnext != NULL
2326 && runp->nwcs == runp->wcnext->nwcs
2327 && wmemcmp ((wchar_t *) runp->wcs,
2328 (wchar_t *)runp->wcnext->wcs,
2329 runp->nwcs - 1) == 0
2330 && (runp->wcs[runp->nwcs - 1]
2331 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2333 int i;
2334 struct element_t *series_startp = runp;
2335 struct element_t *curp;
2337 /* Now add first the initial byte sequence. */
2338 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2339 if (sizeof (int32_t) == sizeof (int))
2340 obstack_make_room (&extrapool, added);
2342 /* More than one consecutive entry. We mark this by having
2343 a negative index into the indirect table. */
2344 obstack_int32_grow_fast (&extrapool,
2345 -(obstack_object_size (&indirectpool)
2346 / sizeof (int32_t)));
2347 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2350 runp = runp->wcnext;
2351 while (runp->wcnext != NULL
2352 && runp->nwcs == runp->wcnext->nwcs
2353 && wmemcmp ((wchar_t *) runp->wcs,
2354 (wchar_t *)runp->wcnext->wcs,
2355 runp->nwcs - 1) == 0
2356 && (runp->wcs[runp->nwcs - 1]
2357 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2359 /* Now walk backward from here to the beginning. */
2360 curp = runp;
2362 for (i = 1; i < runp->nwcs; ++i)
2363 obstack_int32_grow_fast (&extrapool, curp->wcs[i]);
2365 /* Now find the end of the consecutive sequence and
2366 add all the indeces in the indirect pool. */
2369 weightidx = output_weightwc (&weightpool, collate,
2370 curp);
2371 obstack_int32_grow (&indirectpool, weightidx);
2373 curp = curp->wclast;
2375 while (curp != series_startp);
2377 /* Add the final weight. */
2378 weightidx = output_weightwc (&weightpool, collate, curp);
2379 obstack_int32_grow (&indirectpool, weightidx);
2381 /* And add the end byte sequence. Without length this
2382 time. */
2383 for (i = 1; i < curp->nwcs; ++i)
2384 obstack_int32_grow (&extrapool, curp->wcs[i]);
2386 else
2388 /* A single entry. Simply add the index and the length and
2389 string (except for the first character which is already
2390 tested for). */
2391 int i;
2393 /* Output the weight info. */
2394 weightidx = output_weightwc (&weightpool, collate, runp);
2396 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2397 if (sizeof (int) == sizeof (int32_t))
2398 obstack_make_room (&extrapool, added);
2400 obstack_int32_grow_fast (&extrapool, weightidx);
2401 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2402 for (i = 1; i < runp->nwcs; ++i)
2403 obstack_int32_grow_fast (&extrapool, runp->wcs[i]);
2406 /* Next entry. */
2407 lastp = runp;
2408 runp = runp->wcnext;
2410 while (runp != NULL);
2414 tablewc.p = 6;
2415 tablewc.q = 10;
2416 collidx_table_init (&tablewc);
2418 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2420 collidx_table_finalize (&tablewc);
2423 /* Now add the four tables. */
2424 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2425 iov[2 + cnt].iov_base = tablewc.result;
2426 iov[2 + cnt].iov_len = tablewc.result_size;
2427 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2428 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2429 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2430 ++cnt;
2432 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2433 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2434 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2435 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2436 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2437 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2438 ++cnt;
2440 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2441 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2442 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2443 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2444 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2445 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2446 ++cnt;
2448 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2449 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2450 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2451 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2452 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2453 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2454 ++cnt;
2457 /* Finally write the table with collation element names out. It is
2458 a hash table with a simple function which gets the name of the
2459 character as the input. One character might have many names. The
2460 value associated with the name is an index into the weight table
2461 where we are then interested in the first-level weight value.
2463 To determine how large the table should be we count the
2464 elements we have to put in. Since we are using internal chaining
2465 using a secondary hash function we have to make the table a bit
2466 larger to avoid extremely long search times. We can achieve
2467 good results with a 40% larger table than there are entries. */
2468 elem_size = 0;
2469 runp = collate->start;
2470 while (runp != NULL)
2472 if (runp->mbs != NULL && runp->weights != NULL)
2473 /* Yep, the element really counts. */
2474 ++elem_size;
2476 runp = runp->next;
2478 /* Add 40% and find the next prime number. */
2479 elem_size = MIN (next_prime (elem_size * 1.4), 257);
2481 /* Allocate the table. Each entry consists of two words: the hash
2482 value and an index in a secondary table which provides the index
2483 into the weight table and the string itself (so that a match can
2484 be determined). */
2485 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2486 elem_size * 2 * sizeof (uint32_t));
2487 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2489 /* Now add the elements. */
2490 runp = collate->start;
2491 while (runp != NULL)
2493 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2495 /* Compute the hash value of the name. */
2496 uint32_t namelen = strlen (runp->name);
2497 uint32_t hash = elem_hash (runp->name, namelen);
2498 size_t idx = hash % elem_size;
2500 if (elem_table[idx * 2] != 0)
2502 /* The spot is already taken. Try iterating using the value
2503 from the secondary hashing function. */
2504 size_t iter = hash % (elem_size - 2);
2508 idx += iter;
2509 if (idx >= elem_size)
2510 idx -= elem_size;
2512 while (elem_table[idx * 2] != 0);
2514 /* This is the spot where we will insert the value. */
2515 elem_table[idx * 2] = hash;
2516 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2518 /* Then the string itself including length. */
2519 obstack_1grow (&extrapool, namelen);
2520 obstack_grow (&extrapool, runp->name, namelen);
2522 /* And the multibyte representation. */
2523 obstack_1grow (&extrapool, runp->nmbs);
2524 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2526 /* And align again to 32 bits. */
2527 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2528 obstack_grow (&extrapool, "\0\0",
2529 (sizeof (int32_t)
2530 - ((1 + namelen + 1 + runp->nmbs)
2531 % sizeof (int32_t))));
2533 /* Now some 32-bit values: multibyte collation sequence,
2534 wide char string (including length), and wide char
2535 collation sequence. */
2536 obstack_int32_grow (&extrapool, runp->mbseqorder);
2538 obstack_int32_grow (&extrapool, runp->nwcs);
2539 obstack_grow (&extrapool, runp->wcs,
2540 runp->nwcs * sizeof (uint32_t));
2542 obstack_int32_grow (&extrapool, runp->wcseqorder);
2545 runp = runp->next;
2548 /* Prepare to write out this data. */
2549 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2550 iov[2 + cnt].iov_base = &elem_size;
2551 iov[2 + cnt].iov_len = sizeof (int32_t);
2552 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2553 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2554 ++cnt;
2556 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2557 iov[2 + cnt].iov_base = elem_table;
2558 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2559 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2560 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2561 ++cnt;
2563 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2564 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2565 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2566 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2567 ++cnt;
2569 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2570 iov[2 + cnt].iov_base = collate->mbseqorder;
2571 iov[2 + cnt].iov_len = 256;
2572 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2573 ++cnt;
2575 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2576 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2577 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2578 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2579 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2580 ++cnt;
2582 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2583 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2584 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2585 ++cnt;
2587 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2589 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2591 obstack_free (&weightpool, NULL);
2592 obstack_free (&extrapool, NULL);
2593 obstack_free (&indirectpool, NULL);
2597 void
2598 collate_read (struct linereader *ldfile, struct localedef_t *result,
2599 const struct charmap_t *charmap, const char *repertoire_name,
2600 int ignore_content)
2602 struct repertoire_t *repertoire = NULL;
2603 struct locale_collate_t *collate;
2604 struct token *now;
2605 struct token *arg = NULL;
2606 enum token_t nowtok;
2607 enum token_t was_ellipsis = tok_none;
2608 struct localedef_t *copy_locale = NULL;
2609 /* Parsing state:
2610 0 - start
2611 1 - between `order-start' and `order-end'
2612 2 - after `order-end'
2613 3 - after `reorder-after', waiting for `reorder-end'
2614 4 - after `reorder-end'
2615 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2616 6 - after `reorder-sections-end' */
2618 int state = 0;
2620 /* Get the repertoire we have to use. */
2621 if (repertoire_name != NULL)
2622 repertoire = repertoire_read (repertoire_name);
2624 /* The rest of the line containing `LC_COLLATE' must be free. */
2625 lr_ignore_rest (ldfile, 1);
2629 now = lr_token (ldfile, charmap, result, NULL, verbose);
2630 nowtok = now->tok;
2632 while (nowtok == tok_eol);
2634 if (nowtok == tok_copy)
2636 state = 2;
2637 now = lr_token (ldfile, charmap, result, NULL, verbose);
2638 if (now->tok != tok_string)
2640 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2642 skip_category:
2644 now = lr_token (ldfile, charmap, result, NULL, verbose);
2645 while (now->tok != tok_eof && now->tok != tok_end);
2647 if (now->tok != tok_eof
2648 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2649 now->tok == tok_eof))
2650 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2651 else if (now->tok != tok_lc_collate)
2653 lr_error (ldfile, _("\
2654 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2655 lr_ignore_rest (ldfile, 0);
2657 else
2658 lr_ignore_rest (ldfile, 1);
2660 return;
2663 if (! ignore_content)
2665 /* Get the locale definition. */
2666 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2667 repertoire_name, charmap, NULL);
2668 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2670 /* Not yet loaded. So do it now. */
2671 if (locfile_read (copy_locale, charmap) != 0)
2672 goto skip_category;
2676 lr_ignore_rest (ldfile, 1);
2678 now = lr_token (ldfile, charmap, result, NULL, verbose);
2679 nowtok = now->tok;
2682 /* Prepare the data structures. */
2683 collate_startup (ldfile, result, copy_locale, ignore_content);
2684 collate = result->categories[LC_COLLATE].collate;
2686 while (1)
2688 char ucs4buf[10];
2689 char *symstr;
2690 size_t symlen;
2692 /* Of course we don't proceed beyond the end of file. */
2693 if (nowtok == tok_eof)
2694 break;
2696 /* Ignore empty lines. */
2697 if (nowtok == tok_eol)
2699 now = lr_token (ldfile, charmap, result, NULL, verbose);
2700 nowtok = now->tok;
2701 continue;
2704 switch (nowtok)
2706 case tok_copy:
2707 /* Allow copying other locales. */
2708 now = lr_token (ldfile, charmap, result, NULL, verbose);
2709 if (now->tok != tok_string)
2710 goto err_label;
2712 if (! ignore_content)
2713 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2714 charmap, result);
2716 lr_ignore_rest (ldfile, 1);
2717 break;
2719 case tok_coll_weight_max:
2720 /* Ignore the rest of the line if we don't need the input of
2721 this line. */
2722 if (ignore_content)
2724 lr_ignore_rest (ldfile, 0);
2725 break;
2728 if (state != 0)
2729 goto err_label;
2731 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2732 if (arg->tok != tok_number)
2733 goto err_label;
2734 if (collate->col_weight_max != -1)
2735 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2736 "LC_COLLATE", "col_weight_max");
2737 else
2738 collate->col_weight_max = arg->val.num;
2739 lr_ignore_rest (ldfile, 1);
2740 break;
2742 case tok_section_symbol:
2743 /* Ignore the rest of the line if we don't need the input of
2744 this line. */
2745 if (ignore_content)
2747 lr_ignore_rest (ldfile, 0);
2748 break;
2751 if (state != 0)
2752 goto err_label;
2754 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2755 if (arg->tok != tok_bsymbol)
2756 goto err_label;
2757 else if (!ignore_content)
2759 /* Check whether this section is already known. */
2760 struct section_list *known = collate->sections;
2761 while (known != NULL)
2763 if (strcmp (known->name, arg->val.str.startmb) == 0)
2764 break;
2765 known = known->next;
2768 if (known != NULL)
2770 lr_error (ldfile,
2771 _("%s: duplicate declaration of section `%s'"),
2772 "LC_COLLATE", arg->val.str.startmb);
2773 free (arg->val.str.startmb);
2775 else
2776 collate->sections = make_seclist_elem (collate,
2777 arg->val.str.startmb,
2778 collate->sections);
2780 lr_ignore_rest (ldfile, known == NULL);
2782 else
2784 free (arg->val.str.startmb);
2785 lr_ignore_rest (ldfile, 0);
2787 break;
2789 case tok_collating_element:
2790 /* Ignore the rest of the line if we don't need the input of
2791 this line. */
2792 if (ignore_content)
2794 lr_ignore_rest (ldfile, 0);
2795 break;
2798 if (state != 0 && state != 2)
2799 goto err_label;
2801 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2802 if (arg->tok != tok_bsymbol)
2803 goto err_label;
2804 else
2806 const char *symbol = arg->val.str.startmb;
2807 size_t symbol_len = arg->val.str.lenmb;
2809 /* Next the `from' keyword. */
2810 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2811 if (arg->tok != tok_from)
2813 free ((char *) symbol);
2814 goto err_label;
2817 ldfile->return_widestr = 1;
2818 ldfile->translate_strings = 1;
2820 /* Finally the string with the replacement. */
2821 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2823 ldfile->return_widestr = 0;
2824 ldfile->translate_strings = 0;
2826 if (arg->tok != tok_string)
2827 goto err_label;
2829 if (!ignore_content && symbol != NULL)
2831 /* The name is already defined. */
2832 if (check_duplicate (ldfile, collate, charmap,
2833 repertoire, symbol, symbol_len))
2834 goto col_elem_free;
2836 if (arg->val.str.startmb != NULL)
2837 insert_entry (&collate->elem_table, symbol, symbol_len,
2838 new_element (collate,
2839 arg->val.str.startmb,
2840 arg->val.str.lenmb - 1,
2841 arg->val.str.startwc,
2842 symbol, symbol_len, 0));
2844 else
2846 col_elem_free:
2847 if (symbol != NULL)
2848 free ((char *) symbol);
2849 if (arg->val.str.startmb != NULL)
2850 free (arg->val.str.startmb);
2851 if (arg->val.str.startwc != NULL)
2852 free (arg->val.str.startwc);
2854 lr_ignore_rest (ldfile, 1);
2856 break;
2858 case tok_collating_symbol:
2859 /* Ignore the rest of the line if we don't need the input of
2860 this line. */
2861 if (ignore_content)
2863 lr_ignore_rest (ldfile, 0);
2864 break;
2867 if (state != 0 && state != 2)
2868 goto err_label;
2870 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2871 if (arg->tok != tok_bsymbol)
2872 goto err_label;
2873 else
2875 char *symbol = arg->val.str.startmb;
2876 size_t symbol_len = arg->val.str.lenmb;
2877 char *endsymbol = NULL;
2878 size_t endsymbol_len = 0;
2879 enum token_t ellipsis = tok_none;
2881 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2882 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2884 ellipsis = arg->tok;
2886 arg = lr_token (ldfile, charmap, result, repertoire,
2887 verbose);
2888 if (arg->tok != tok_bsymbol)
2890 free (symbol);
2891 goto err_label;
2894 endsymbol = arg->val.str.startmb;
2895 endsymbol_len = arg->val.str.lenmb;
2897 lr_ignore_rest (ldfile, 1);
2899 else if (arg->tok != tok_eol)
2901 free (symbol);
2902 goto err_label;
2905 if (!ignore_content)
2907 if (symbol == NULL
2908 || (ellipsis != tok_none && endsymbol == NULL))
2910 lr_error (ldfile, _("\
2911 %s: unknown character in collating symbol name"),
2912 "LC_COLLATE");
2913 goto col_sym_free;
2915 else if (ellipsis == tok_none)
2917 /* A single symbol, no ellipsis. */
2918 if (check_duplicate (ldfile, collate, charmap,
2919 repertoire, symbol, symbol_len))
2920 /* The name is already defined. */
2921 goto col_sym_free;
2923 insert_entry (&collate->sym_table, symbol, symbol_len,
2924 new_symbol (collate, symbol, symbol_len));
2926 else if (symbol_len != endsymbol_len)
2928 col_sym_inv_range:
2929 lr_error (ldfile,
2930 _("invalid names for character range"));
2931 goto col_sym_free;
2933 else
2935 /* Oh my, we have to handle an ellipsis. First, as
2936 usual, determine the common prefix and then
2937 convert the rest into a range. */
2938 size_t prefixlen;
2939 unsigned long int from;
2940 unsigned long int to;
2941 char *endp;
2943 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2944 if (symbol[prefixlen] != endsymbol[prefixlen])
2945 break;
2947 /* Convert the rest into numbers. */
2948 symbol[symbol_len] = '\0';
2949 from = strtoul (&symbol[prefixlen], &endp,
2950 ellipsis == tok_ellipsis2 ? 16 : 10);
2951 if (*endp != '\0')
2952 goto col_sym_inv_range;
2954 endsymbol[symbol_len] = '\0';
2955 to = strtoul (&endsymbol[prefixlen], &endp,
2956 ellipsis == tok_ellipsis2 ? 16 : 10);
2957 if (*endp != '\0')
2958 goto col_sym_inv_range;
2960 if (from > to)
2961 goto col_sym_inv_range;
2963 /* Now loop over all entries. */
2964 while (from <= to)
2966 char *symbuf;
2968 symbuf = (char *) obstack_alloc (&collate->mempool,
2969 symbol_len + 1);
2971 /* Create the name. */
2972 sprintf (symbuf,
2973 ellipsis == tok_ellipsis2
2974 ? "%.*s%.*lX" : "%.*s%.*lu",
2975 (int) prefixlen, symbol,
2976 (int) (symbol_len - prefixlen), from);
2978 if (check_duplicate (ldfile, collate, charmap,
2979 repertoire, symbuf, symbol_len))
2980 /* The name is already defined. */
2981 goto col_sym_free;
2983 insert_entry (&collate->sym_table, symbuf,
2984 symbol_len,
2985 new_symbol (collate, symbuf,
2986 symbol_len));
2988 /* Increment the counter. */
2989 ++from;
2992 goto col_sym_free;
2995 else
2997 col_sym_free:
2998 if (symbol != NULL)
2999 free (symbol);
3000 if (endsymbol != NULL)
3001 free (endsymbol);
3004 break;
3006 case tok_symbol_equivalence:
3007 /* Ignore the rest of the line if we don't need the input of
3008 this line. */
3009 if (ignore_content)
3011 lr_ignore_rest (ldfile, 0);
3012 break;
3015 if (state != 0)
3016 goto err_label;
3018 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3019 if (arg->tok != tok_bsymbol)
3020 goto err_label;
3021 else
3023 const char *newname = arg->val.str.startmb;
3024 size_t newname_len = arg->val.str.lenmb;
3025 const char *symname;
3026 size_t symname_len;
3027 void *symval; /* Actually struct symbol_t* */
3029 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3030 if (arg->tok != tok_bsymbol)
3032 if (newname != NULL)
3033 free ((char *) newname);
3034 goto err_label;
3037 symname = arg->val.str.startmb;
3038 symname_len = arg->val.str.lenmb;
3040 if (newname == NULL)
3042 lr_error (ldfile, _("\
3043 %s: unknown character in equivalent definition name"),
3044 "LC_COLLATE");
3046 sym_equiv_free:
3047 if (newname != NULL)
3048 free ((char *) newname);
3049 if (symname != NULL)
3050 free ((char *) symname);
3051 break;
3053 if (symname == NULL)
3055 lr_error (ldfile, _("\
3056 %s: unknown character in equivalent definition value"),
3057 "LC_COLLATE");
3058 goto sym_equiv_free;
3061 /* See whether the symbol name is already defined. */
3062 if (find_entry (&collate->sym_table, symname, symname_len,
3063 &symval) != 0)
3065 lr_error (ldfile, _("\
3066 %s: unknown symbol `%s' in equivalent definition"),
3067 "LC_COLLATE", symname);
3068 goto col_sym_free;
3071 if (insert_entry (&collate->sym_table,
3072 newname, newname_len, symval) < 0)
3074 lr_error (ldfile, _("\
3075 error while adding equivalent collating symbol"));
3076 goto sym_equiv_free;
3079 free ((char *) symname);
3081 lr_ignore_rest (ldfile, 1);
3082 break;
3084 case tok_script:
3085 /* We get told about the scripts we know. */
3086 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3087 if (arg->tok != tok_bsymbol)
3088 goto err_label;
3089 else
3091 struct section_list *runp = collate->known_sections;
3092 char *name;
3094 while (runp != NULL)
3095 if (strncmp (runp->name, arg->val.str.startmb,
3096 arg->val.str.lenmb) == 0
3097 && runp->name[arg->val.str.lenmb] == '\0')
3098 break;
3099 else
3100 runp = runp->def_next;
3102 if (runp != NULL)
3104 lr_error (ldfile, _("duplicate definition of script `%s'"),
3105 runp->name);
3106 lr_ignore_rest (ldfile, 0);
3107 break;
3110 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3111 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3112 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3113 name[arg->val.str.lenmb] = '\0';
3114 runp->name = name;
3116 runp->def_next = collate->known_sections;
3117 collate->known_sections = runp;
3119 lr_ignore_rest (ldfile, 1);
3120 break;
3122 case tok_order_start:
3123 /* Ignore the rest of the line if we don't need the input of
3124 this line. */
3125 if (ignore_content)
3127 lr_ignore_rest (ldfile, 0);
3128 break;
3131 if (state != 0 && state != 1)
3132 goto err_label;
3133 state = 1;
3135 /* The 14652 draft does not specify whether all `order_start' lines
3136 must contain the same number of sort-rules, but 14651 does. So
3137 we require this here as well. */
3138 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3139 if (arg->tok == tok_bsymbol)
3141 /* This better should be a section name. */
3142 struct section_list *sp = collate->known_sections;
3143 while (sp != NULL
3144 && (sp->name == NULL
3145 || strncmp (sp->name, arg->val.str.startmb,
3146 arg->val.str.lenmb) != 0
3147 || sp->name[arg->val.str.lenmb] != '\0'))
3148 sp = sp->def_next;
3150 if (sp == NULL)
3152 lr_error (ldfile, _("\
3153 %s: unknown section name `%s'"),
3154 "LC_COLLATE", arg->val.str.startmb);
3155 /* We use the error section. */
3156 collate->current_section = &collate->error_section;
3158 if (collate->error_section.first == NULL)
3160 /* Insert &collate->error_section at the end of
3161 the collate->sections list. */
3162 if (collate->sections == NULL)
3163 collate->sections = &collate->error_section;
3164 else
3166 sp = collate->sections;
3167 while (sp->next != NULL)
3168 sp = sp->next;
3170 sp->next = &collate->error_section;
3172 collate->error_section.next = NULL;
3175 else
3177 /* One should not be allowed to open the same
3178 section twice. */
3179 if (sp->first != NULL)
3180 lr_error (ldfile, _("\
3181 %s: multiple order definitions for section `%s'"),
3182 "LC_COLLATE", sp->name);
3183 else
3185 /* Insert sp in the collate->sections list,
3186 right after collate->current_section. */
3187 if (collate->current_section == NULL)
3188 collate->current_section = sp;
3189 else
3191 sp->next = collate->current_section->next;
3192 collate->current_section->next = sp;
3196 /* Next should come the end of the line or a semicolon. */
3197 arg = lr_token (ldfile, charmap, result, repertoire,
3198 verbose);
3199 if (arg->tok == tok_eol)
3201 uint32_t cnt;
3203 /* This means we have exactly one rule: `forward'. */
3204 if (nrules > 1)
3205 lr_error (ldfile, _("\
3206 %s: invalid number of sorting rules"),
3207 "LC_COLLATE");
3208 else
3209 nrules = 1;
3210 sp->rules = obstack_alloc (&collate->mempool,
3211 (sizeof (enum coll_sort_rule)
3212 * nrules));
3213 for (cnt = 0; cnt < nrules; ++cnt)
3214 sp->rules[cnt] = sort_forward;
3216 /* Next line. */
3217 break;
3220 /* Get the next token. */
3221 arg = lr_token (ldfile, charmap, result, repertoire,
3222 verbose);
3225 else
3227 /* There is no section symbol. Therefore we use the unnamed
3228 section. */
3229 collate->current_section = &collate->unnamed_section;
3231 if (collate->unnamed_section.first != NULL)
3232 lr_error (ldfile, _("\
3233 %s: multiple order definitions for unnamed section"),
3234 "LC_COLLATE");
3235 else
3237 /* Insert &collate->unnamed_section at the beginning of
3238 the collate->sections list. */
3239 collate->unnamed_section.next = collate->sections;
3240 collate->sections = &collate->unnamed_section;
3244 /* Now read the direction names. */
3245 read_directions (ldfile, arg, charmap, repertoire, result);
3247 /* From now we need the strings untranslated. */
3248 ldfile->translate_strings = 0;
3249 break;
3251 case tok_order_end:
3252 /* Ignore the rest of the line if we don't need the input of
3253 this line. */
3254 if (ignore_content)
3256 lr_ignore_rest (ldfile, 0);
3257 break;
3260 if (state != 1)
3261 goto err_label;
3263 /* Handle ellipsis at end of list. */
3264 if (was_ellipsis != tok_none)
3266 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3267 repertoire, result);
3268 was_ellipsis = tok_none;
3271 state = 2;
3272 lr_ignore_rest (ldfile, 1);
3273 break;
3275 case tok_reorder_after:
3276 /* Ignore the rest of the line if we don't need the input of
3277 this line. */
3278 if (ignore_content)
3280 lr_ignore_rest (ldfile, 0);
3281 break;
3284 if (state == 1)
3286 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3287 "LC_COLLATE");
3288 state = 2;
3290 /* Handle ellipsis at end of list. */
3291 if (was_ellipsis != tok_none)
3293 handle_ellipsis (ldfile, arg->val.str.startmb,
3294 arg->val.str.lenmb, was_ellipsis, charmap,
3295 repertoire, result);
3296 was_ellipsis = tok_none;
3299 else if (state != 2 && state != 3)
3300 goto err_label;
3301 state = 3;
3303 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3304 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3306 /* Find this symbol in the sequence table. */
3307 char ucsbuf[10];
3308 char *startmb;
3309 size_t lenmb;
3310 struct element_t *insp;
3311 int no_error = 1;
3312 void *ptr;
3314 if (arg->tok == tok_bsymbol)
3316 startmb = arg->val.str.startmb;
3317 lenmb = arg->val.str.lenmb;
3319 else
3321 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3322 startmb = ucsbuf;
3323 lenmb = 9;
3326 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3327 /* Yes, the symbol exists. Simply point the cursor
3328 to it. */
3329 collate->cursor = (struct element_t *) ptr;
3330 else
3332 struct symbol_t *symbp;
3333 void *ptr;
3335 if (find_entry (&collate->sym_table, startmb, lenmb,
3336 &ptr) == 0)
3338 symbp = ptr;
3340 if (symbp->order->last != NULL
3341 || symbp->order->next != NULL)
3342 collate->cursor = symbp->order;
3343 else
3345 /* This is a collating symbol but its position
3346 is not yet defined. */
3347 lr_error (ldfile, _("\
3348 %s: order for collating symbol %.*s not yet defined"),
3349 "LC_COLLATE", (int) lenmb, startmb);
3350 collate->cursor = NULL;
3351 no_error = 0;
3354 else if (find_entry (&collate->elem_table, startmb, lenmb,
3355 &ptr) == 0)
3357 insp = (struct element_t *) ptr;
3359 if (insp->last != NULL || insp->next != NULL)
3360 collate->cursor = insp;
3361 else
3363 /* This is a collating element but its position
3364 is not yet defined. */
3365 lr_error (ldfile, _("\
3366 %s: order for collating element %.*s not yet defined"),
3367 "LC_COLLATE", (int) lenmb, startmb);
3368 collate->cursor = NULL;
3369 no_error = 0;
3372 else
3374 /* This is bad. The symbol after which we have to
3375 insert does not exist. */
3376 lr_error (ldfile, _("\
3377 %s: cannot reorder after %.*s: symbol not known"),
3378 "LC_COLLATE", (int) lenmb, startmb);
3379 collate->cursor = NULL;
3380 no_error = 0;
3384 lr_ignore_rest (ldfile, no_error);
3386 else
3387 /* This must not happen. */
3388 goto err_label;
3389 break;
3391 case tok_reorder_end:
3392 /* Ignore the rest of the line if we don't need the input of
3393 this line. */
3394 if (ignore_content)
3395 break;
3397 if (state != 3)
3398 goto err_label;
3399 state = 4;
3400 lr_ignore_rest (ldfile, 1);
3401 break;
3403 case tok_reorder_sections_after:
3404 /* Ignore the rest of the line if we don't need the input of
3405 this line. */
3406 if (ignore_content)
3408 lr_ignore_rest (ldfile, 0);
3409 break;
3412 if (state == 1)
3414 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3415 "LC_COLLATE");
3416 state = 2;
3418 /* Handle ellipsis at end of list. */
3419 if (was_ellipsis != tok_none)
3421 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3422 repertoire, result);
3423 was_ellipsis = tok_none;
3426 else if (state == 3)
3428 WITH_CUR_LOCALE (error (0, 0, _("\
3429 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3430 state = 4;
3432 else if (state != 2 && state != 4)
3433 goto err_label;
3434 state = 5;
3436 /* Get the name of the sections we are adding after. */
3437 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3438 if (arg->tok == tok_bsymbol)
3440 /* Now find a section with this name. */
3441 struct section_list *runp = collate->sections;
3443 while (runp != NULL)
3445 if (runp->name != NULL
3446 && strlen (runp->name) == arg->val.str.lenmb
3447 && memcmp (runp->name, arg->val.str.startmb,
3448 arg->val.str.lenmb) == 0)
3449 break;
3451 runp = runp->next;
3454 if (runp != NULL)
3455 collate->current_section = runp;
3456 else
3458 /* This is bad. The section after which we have to
3459 reorder does not exist. Therefore we cannot
3460 process the whole rest of this reorder
3461 specification. */
3462 lr_error (ldfile, _("%s: section `%.*s' not known"),
3463 "LC_COLLATE", (int) arg->val.str.lenmb,
3464 arg->val.str.startmb);
3468 lr_ignore_rest (ldfile, 0);
3470 now = lr_token (ldfile, charmap, result, NULL, verbose);
3472 while (now->tok == tok_reorder_sections_after
3473 || now->tok == tok_reorder_sections_end
3474 || now->tok == tok_end);
3476 /* Process the token we just saw. */
3477 nowtok = now->tok;
3478 continue;
3481 else
3482 /* This must not happen. */
3483 goto err_label;
3484 break;
3486 case tok_reorder_sections_end:
3487 /* Ignore the rest of the line if we don't need the input of
3488 this line. */
3489 if (ignore_content)
3490 break;
3492 if (state != 5)
3493 goto err_label;
3494 state = 6;
3495 lr_ignore_rest (ldfile, 1);
3496 break;
3498 case tok_bsymbol:
3499 case tok_ucs4:
3500 /* Ignore the rest of the line if we don't need the input of
3501 this line. */
3502 if (ignore_content)
3504 lr_ignore_rest (ldfile, 0);
3505 break;
3508 if (state != 0 && state != 1 && state != 3 && state != 5)
3509 goto err_label;
3511 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3512 goto err_label;
3514 if (nowtok == tok_ucs4)
3516 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3517 symstr = ucs4buf;
3518 symlen = 9;
3520 else if (arg != NULL)
3522 symstr = arg->val.str.startmb;
3523 symlen = arg->val.str.lenmb;
3525 else
3527 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3528 (int) ldfile->token.val.str.lenmb,
3529 ldfile->token.val.str.startmb);
3530 break;
3533 if (state == 0)
3535 /* We are outside an `order_start' region. This means
3536 we must only accept definitions of values for
3537 collation symbols since these are purely abstract
3538 values and don't need directions associated. */
3539 struct element_t *seqp;
3540 void *ptr;
3542 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3544 seqp = ptr;
3546 /* It's already defined. First check whether this
3547 is really a collating symbol. */
3548 if (seqp->is_character)
3549 goto err_label;
3551 goto move_entry;
3553 else
3555 void *result;
3557 if (find_entry (&collate->sym_table, symstr, symlen,
3558 &result) != 0)
3559 /* No collating symbol, it's an error. */
3560 goto err_label;
3562 /* Maybe this is the first time we define a symbol
3563 value and it is before the first actual section. */
3564 if (collate->sections == NULL)
3565 collate->sections = collate->current_section =
3566 &collate->symbol_section;
3569 if (was_ellipsis != tok_none)
3572 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3573 charmap, repertoire, result);
3575 /* Remember that we processed the ellipsis. */
3576 was_ellipsis = tok_none;
3578 /* And don't add the value a second time. */
3579 break;
3582 else if (state == 3)
3584 /* It is possible that we already have this collation sequence.
3585 In this case we move the entry. */
3586 struct element_t *seqp = NULL;
3587 void *sym;
3588 void *ptr;
3590 /* If the symbol after which we have to insert was not found
3591 ignore all entries. */
3592 if (collate->cursor == NULL)
3594 lr_ignore_rest (ldfile, 0);
3595 break;
3598 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3600 seqp = (struct element_t *) ptr;
3601 goto move_entry;
3604 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3605 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3606 goto move_entry;
3608 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3609 && (seqp = (struct element_t *) ptr,
3610 seqp->last != NULL || seqp->next != NULL
3611 || (collate->start != NULL && seqp == collate->start)))
3613 move_entry:
3614 /* Remove the entry from the old position. */
3615 if (seqp->last == NULL)
3616 collate->start = seqp->next;
3617 else
3618 seqp->last->next = seqp->next;
3619 if (seqp->next != NULL)
3620 seqp->next->last = seqp->last;
3622 /* We also have to check whether this entry is the
3623 first or last of a section. */
3624 if (seqp->section->first == seqp)
3626 if (seqp->section->first == seqp->section->last)
3627 /* This section has no content anymore. */
3628 seqp->section->first = seqp->section->last = NULL;
3629 else
3630 seqp->section->first = seqp->next;
3632 else if (seqp->section->last == seqp)
3633 seqp->section->last = seqp->last;
3635 /* Now insert it in the new place. */
3636 insert_weights (ldfile, seqp, charmap, repertoire, result,
3637 tok_none);
3638 break;
3641 /* Otherwise we just add a new entry. */
3643 else if (state == 5)
3645 /* We are reordering sections. Find the named section. */
3646 struct section_list *runp = collate->sections;
3647 struct section_list *prevp = NULL;
3649 while (runp != NULL)
3651 if (runp->name != NULL
3652 && strlen (runp->name) == symlen
3653 && memcmp (runp->name, symstr, symlen) == 0)
3654 break;
3656 prevp = runp;
3657 runp = runp->next;
3660 if (runp == NULL)
3662 lr_error (ldfile, _("%s: section `%.*s' not known"),
3663 "LC_COLLATE", (int) symlen, symstr);
3664 lr_ignore_rest (ldfile, 0);
3666 else
3668 if (runp != collate->current_section)
3670 /* Remove the named section from the old place and
3671 insert it in the new one. */
3672 prevp->next = runp->next;
3674 runp->next = collate->current_section->next;
3675 collate->current_section->next = runp;
3676 collate->current_section = runp;
3679 /* Process the rest of the line which might change
3680 the collation rules. */
3681 arg = lr_token (ldfile, charmap, result, repertoire,
3682 verbose);
3683 if (arg->tok != tok_eof && arg->tok != tok_eol)
3684 read_directions (ldfile, arg, charmap, repertoire,
3685 result);
3687 break;
3689 else if (was_ellipsis != tok_none)
3691 /* Using the information in the `ellipsis_weight'
3692 element and this and the last value we have to handle
3693 the ellipsis now. */
3694 assert (state == 1);
3696 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3697 repertoire, result);
3699 /* Remember that we processed the ellipsis. */
3700 was_ellipsis = tok_none;
3702 /* And don't add the value a second time. */
3703 break;
3706 /* Now insert in the new place. */
3707 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3708 break;
3710 case tok_undefined:
3711 /* Ignore the rest of the line if we don't need the input of
3712 this line. */
3713 if (ignore_content)
3715 lr_ignore_rest (ldfile, 0);
3716 break;
3719 if (state != 1)
3720 goto err_label;
3722 if (was_ellipsis != tok_none)
3724 lr_error (ldfile,
3725 _("%s: cannot have `%s' as end of ellipsis range"),
3726 "LC_COLLATE", "UNDEFINED");
3728 unlink_element (collate);
3729 was_ellipsis = tok_none;
3732 /* See whether UNDEFINED already appeared somewhere. */
3733 if (collate->undefined.next != NULL
3734 || &collate->undefined == collate->cursor)
3736 lr_error (ldfile,
3737 _("%s: order for `%.*s' already defined at %s:%Zu"),
3738 "LC_COLLATE", 9, "UNDEFINED",
3739 collate->undefined.file,
3740 collate->undefined.line);
3741 lr_ignore_rest (ldfile, 0);
3743 else
3744 /* Parse the weights. */
3745 insert_weights (ldfile, &collate->undefined, charmap,
3746 repertoire, result, tok_none);
3747 break;
3749 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3750 case tok_ellipsis3: /* absolute ellipsis */
3751 case tok_ellipsis4: /* symbolic decimal ellipsis */
3752 /* This is the symbolic (decimal or hexadecimal) or absolute
3753 ellipsis. */
3754 if (was_ellipsis != tok_none)
3755 goto err_label;
3757 if (state != 0 && state != 1 && state != 3)
3758 goto err_label;
3760 was_ellipsis = nowtok;
3762 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3763 repertoire, result, nowtok);
3764 break;
3766 case tok_end:
3767 /* Next we assume `LC_COLLATE'. */
3768 if (!ignore_content)
3770 if (state == 0)
3771 /* We must either see a copy statement or have
3772 ordering values. */
3773 lr_error (ldfile,
3774 _("%s: empty category description not allowed"),
3775 "LC_COLLATE");
3776 else if (state == 1)
3778 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3779 "LC_COLLATE");
3781 /* Handle ellipsis at end of list. */
3782 if (was_ellipsis != tok_none)
3784 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3785 repertoire, result);
3786 was_ellipsis = tok_none;
3789 else if (state == 3)
3790 WITH_CUR_LOCALE (error (0, 0, _("\
3791 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3792 else if (state == 5)
3793 WITH_CUR_LOCALE (error (0, 0, _("\
3794 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3796 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3797 if (arg->tok == tok_eof)
3798 break;
3799 if (arg->tok == tok_eol)
3800 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3801 else if (arg->tok != tok_lc_collate)
3802 lr_error (ldfile, _("\
3803 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3804 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3805 return;
3807 default:
3808 err_label:
3809 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3812 /* Prepare for the next round. */
3813 now = lr_token (ldfile, charmap, result, NULL, verbose);
3814 nowtok = now->tok;
3817 /* When we come here we reached the end of the file. */
3818 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");