* sysdeps/unix/sysv/linux/powerpc/lowlevellock.h
[glibc.git] / locale / programs / ld-collate.c
blob3d1199d3724bc2277d388efe24cf1784b65f7d6f
1 /* Copyright (C) 1995-2003, 2005, 2006, 2007 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
23 #include <errno.h>
24 #include <error.h>
25 #include <stdlib.h>
26 #include <wchar.h>
27 #include <sys/param.h>
29 #include "localedef.h"
30 #include "charmap.h"
31 #include "localeinfo.h"
32 #include "linereader.h"
33 #include "locfile.h"
34 #include "elem-hash.h"
36 /* Uncomment the following line in the production version. */
37 /* #define NDEBUG 1 */
38 #include <assert.h>
40 #define obstack_chunk_alloc malloc
41 #define obstack_chunk_free free
43 static inline void
44 __attribute ((always_inline))
45 obstack_int32_grow (struct obstack *obstack, int32_t data)
47 if (sizeof (int32_t) == sizeof (int))
48 obstack_int_grow (obstack, data);
49 else
50 obstack_grow (obstack, &data, sizeof (int32_t));
53 static inline void
54 __attribute ((always_inline))
55 obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
57 if (sizeof (int32_t) == sizeof (int))
58 obstack_int_grow_fast (obstack, data);
59 else
60 obstack_grow (obstack, &data, sizeof (int32_t));
63 /* Forward declaration. */
64 struct element_t;
/* One section of the collation definition.  A node can be chained
   through two independent links (`def_next' and `next') and records
   the first and last element that were inserted while the section
   was current (see insert_weights).  */
66 /* Data type for list of strings. */
67 struct section_list
69 /* Successor in the known_sections list. */
70 struct section_list *def_next;
71 /* Successor in the sections list. */
72 struct section_list *next;
73 /* Name of the section. */
74 const char *name;
75 /* First element of this section. */
76 struct element_t *first;
77 /* Last element of this section. */
78 struct element_t *last;
/* read_directions stores the array it builds here (one entry per
   weight level).  */
79 /* These are the rules for this section. */
80 enum coll_sort_rule *rules;
81 /* Index of the rule set in the appropriate section of the output file. */
82 int ruleidx;
85 struct element_t;
/* Weight list for one level of one element: `cnt' pointers stored in
   `w'.  insert_weights stores a single NULL pointer for an ignored
   level and one of the ELEMENT_ELLIPSIS* placeholders for a weight
   that depends on the range element.  */
87 struct element_list_t
89 /* Number of elements. */
90 int cnt;
/* Array of `cnt' element pointers.  */
92 struct element_t **w;
95 /* Data type for collating element. */
96 struct element_t
/* Name of the element; new_element stores a NUL-terminated copy or
   NULL when no name was given.  */
98 const char *name;
/* Multibyte sequence (copied by new_element, NULL if unknown) and its
   length in bytes.  */
100 const char *mbs;
101 size_t nmbs;
/* Zero-terminated wide character string (NULL if unknown) and its
   length without the terminator.  */
102 const uint32_t *wcs;
103 size_t nwcs;
/* Initialized to NULL / 0 by new_element; presumably filled in when
   the output order is computed -- not visible in this part of the
   file.  */
104 int *mborder;
105 int wcorder;
107 /* The following is a bit mask which bits are set if this element is
108 used in the appropriate level. Interesting for the singlebyte
109 weight computation.
111 XXX The type here restricts the number of levels to 32. It could
112 be changed if necessary but I doubt this is necessary. */
113 unsigned int used_in_level;
/* Array with `nrules' entries, allocated by insert_weights; NULL until
   the element has been given an order.  */
115 struct element_list_t *weights;
117 /* Nonzero if this is a real character definition. */
118 int is_character;
120 /* Order of the character in the sequence. This information will
121 be used in range expressions. */
122 int mbseqorder;
123 int wcseqorder;
/* insert_weights records the current file name and line number of the
   line reader here; both are used in "already defined" diagnostics.  */
125 /* Where does the definition come from. */
126 const char *file;
127 size_t line;
129 /* Which section does this belong to. */
130 struct section_list *section;
/* Note: `last' is the predecessor, `next' the successor.  */
132 /* Predecessor and successor in the order list. */
133 struct element_t *last;
134 struct element_t *next;
136 /* Next element in multibyte output list. */
137 struct element_t *mbnext;
138 struct element_t *mblast;
140 /* Next element in wide character output list. */
141 struct element_t *wcnext;
142 struct element_t *wclast;
/* Placeholder pointers (never dereferenced) that can be stored in a
   weight list instead of a real element.  NOTE(review): in the code
   visible here only ELEMENT_ELLIPSIS2 is ever stored or compared,
   regardless of which ellipsis token was used -- confirm whether the
   other two are used further down in the file.  */
145 /* Special element value. */
146 #define ELEMENT_ELLIPSIS2 ((struct element_t *) 1)
147 #define ELEMENT_ELLIPSIS3 ((struct element_t *) 2)
148 #define ELEMENT_ELLIPSIS4 ((struct element_t *) 3)
150 /* Data type for collating symbol. */
151 struct symbol_t
/* NUL-terminated copy of the symbol name (see new_symbol).  */
153 const char *name;
/* NULL until an element is created for the symbol; find_element and
   insert_value create it lazily.  */
155 /* Point to place in the order list. */
156 struct element_t *order;
/* Initialized to NULL / 0 by new_symbol.  */
158 /* Where does the definition come from. */
159 const char *file;
160 size_t line;
/* "3level.h" is included three times, each time parameterized through
   the TABLE / ELEMENT / DEFAULT macros defined just before the
   include.  Presumably the header defines `struct TABLE' plus its
   access functions and undefines the macros again -- confirm against
   3level.h.  */
163 /* Sparse table of struct element_t *. */
164 #define TABLE wchead_table
165 #define ELEMENT struct element_t *
166 #define DEFAULT NULL
167 #define ITERATE
168 #define NO_FINALIZE
169 #include "3level.h"
171 /* Sparse table of int32_t. */
172 #define TABLE collidx_table
173 #define ELEMENT int32_t
174 #define DEFAULT 0
175 #include "3level.h"
/* The default value here is all bits set.  */
177 /* Sparse table of uint32_t. */
178 #define TABLE collseq_table
179 #define ELEMENT uint32_t
180 #define DEFAULT ~((uint32_t) 0)
181 #include "3level.h"
184 /* The real definition of the struct for the LC_COLLATE locale. */
185 struct locale_collate_t
187 int col_weight_max;
188 int cur_weight_max;
190 /* List of known scripts. */
191 struct section_list *known_sections;
192 /* List of used sections. */
193 struct section_list *sections;
/* New elements are assigned to this section (new_element,
   insert_weights) and read_directions stores its rules here.  */
194 /* Current section using definition. */
195 struct section_list *current_section;
196 /* There always can be an unnamed section. */
197 struct section_list unnamed_section;
198 /* To make handling of errors easier we have another section. */
199 struct section_list error_section;
200 /* Sometimes we are defining the values for collating symbols before
201 the first actual section. */
202 struct section_list symbol_section;
204 /* Start of the order list. */
205 struct element_t *start;
207 /* The undefined element. */
208 struct element_t undefined;
/* insert_weights links every new element directly after the cursor
   and then moves the cursor to the new element.  */
210 /* This is the cursor for `reorder_after' insertions. */
211 struct element_t *cursor;
213 /* This value is used when handling ellipsis. */
214 struct element_t ellipsis_weight;
216 /* Known collating elements. */
217 hash_table elem_table;
219 /* Known collating symbols. */
220 hash_table sym_table;
222 /* Known collation sequences. */
223 hash_table seq_table;
/* All elements, symbols, names and weight arrays are allocated from
   this obstack.  */
225 struct obstack mempool;
227 /* The LC_COLLATE category is a bit special as it is sometimes possible
228 that the definitions from more than one input file contains information.
229 Therefore we keep all relevant input in a list. */
230 struct locale_collate_t *next;
232 /* Arrays with heads of the list for each of the leading bytes in
233 the multibyte sequences. */
234 struct element_t *mbheads[256];
236 /* Sparse table with the heads of the lists for the wide character
237 sequences (presumably indexed by the first wide character). */
238 struct wchead_table wcheads;
240 /* The arrays with the collation sequence order. */
241 unsigned char mbseqorder[256];
242 struct collseq_table wcseqorder;
246 /* We have a few global variables which are used for reading all
247 LC_COLLATE category descriptions in all files. */
/* Number of weight levels.  Zero until read_directions has parsed the
   first direction list; every later list is checked against it.  */
248 static uint32_t nrules;
/* We need UTF-8 encoding of numbers.

   Store the (up to six byte, pre-RFC 3629) UTF-8 encoding of VAL in BUF
   and return the number of bytes written.  BUF must have room for six
   bytes; no terminating NUL is written.  Values below 0x80 -- which, as
   before, includes negative values -- are stored as one byte.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int retval;

  if (val < 0x80)
    {
      *buf++ = (char) val;
      retval = 1;
    }
  else
    {
      int step;

      /* A sequence of STEP bytes can hold 5 * STEP + 1 payload bits.  */
      for (step = 2; step < 6; ++step)
	if ((val & (~(uint32_t) 0 << (5 * step + 1))) == 0)
	  break;
      retval = step;

      /* Lead byte: STEP one bits followed by a zero bit.  The shift is
	 done on an unsigned value; the previous `~0xff >> step' shifted
	 a negative int, which is implementation-defined.  */
      *buf = (unsigned char) (0xff00u >> step);
      --step;

      /* Fill in the continuation bytes from the last one backwards.  */
      do
	{
	  buf[step] = 0x80 | (val & 0x3f);
	  val >>= 6;
	}
      while (--step > 0);

      /* The remaining high bits go into the lead byte.  */
      *buf |= val;
    }

  return retval;
}
287 static struct section_list *
288 make_seclist_elem (struct locale_collate_t *collate, const char *string,
289 struct section_list *next)
291 struct section_list *newp;
293 newp = (struct section_list *) obstack_alloc (&collate->mempool,
294 sizeof (*newp));
295 newp->next = next;
296 newp->name = string;
297 newp->first = NULL;
298 newp->last = NULL;
300 return newp;
304 static struct element_t *
305 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
306 const uint32_t *wcs, const char *name, size_t namelen,
307 int is_character)
309 struct element_t *newp;
311 newp = (struct element_t *) obstack_alloc (&collate->mempool,
312 sizeof (*newp));
313 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
314 name, namelen);
315 if (mbs != NULL)
317 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
318 newp->nmbs = mbslen;
320 else
322 newp->mbs = NULL;
323 newp->nmbs = 0;
325 if (wcs != NULL)
327 size_t nwcs = wcslen ((wchar_t *) wcs);
328 uint32_t zero = 0;
329 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
330 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
331 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
332 newp->nwcs = nwcs;
334 else
336 newp->wcs = NULL;
337 newp->nwcs = 0;
339 newp->mborder = NULL;
340 newp->wcorder = 0;
341 newp->used_in_level = 0;
342 newp->is_character = is_character;
344 /* Will be assigned later. XXX */
345 newp->mbseqorder = 0;
346 newp->wcseqorder = 0;
348 /* Will be allocated later. */
349 newp->weights = NULL;
351 newp->file = NULL;
352 newp->line = 0;
354 newp->section = collate->current_section;
356 newp->last = NULL;
357 newp->next = NULL;
359 newp->mbnext = NULL;
360 newp->mblast = NULL;
362 newp->wcnext = NULL;
363 newp->wclast = NULL;
365 return newp;
369 static struct symbol_t *
370 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
372 struct symbol_t *newp;
374 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
376 newp->name = obstack_copy0 (&collate->mempool, name, len);
377 newp->order = NULL;
379 newp->file = NULL;
380 newp->line = 0;
382 return newp;
386 /* Test whether this name is already defined somewhere. */
387 static int
388 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
389 const struct charmap_t *charmap,
390 struct repertoire_t *repertoire, const char *symbol,
391 size_t symbol_len)
393 void *ignore = NULL;
395 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
397 lr_error (ldfile, _("`%.*s' already defined in charmap"),
398 (int) symbol_len, symbol);
399 return 1;
402 if (repertoire != NULL
403 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
404 == 0))
406 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
407 (int) symbol_len, symbol);
408 return 1;
411 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
413 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
414 (int) symbol_len, symbol);
415 return 1;
418 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
420 lr_error (ldfile, _("`%.*s' already defined as collating element"),
421 (int) symbol_len, symbol);
422 return 1;
425 return 0;
/* Parse the list of sorting directions that starts with token ARG and
   store the resulting rule array in the current section of the
   LC_COLLATE data in RESULT.  Each level is a comma separated set of
   `forward', `backward' and `position'; levels are separated by
   semicolons.  The first list ever parsed determines the global
   `nrules'; later lists are cut off at, or padded with `forward' up
   to, that number.  Errors are reported through LDFILE, at most once
   per call for most kinds of problems (see the notes below).
   NOTE(review): brace-only lines are missing from this copy of the
   file, so nesting has to be inferred from the statements.  */
429 /* Read the direction specification. */
430 static void
431 read_directions (struct linereader *ldfile, struct token *arg,
432 const struct charmap_t *charmap,
433 struct repertoire_t *repertoire, struct localedef_t *result)
435 int cnt = 0;
/* `a ?: b' is the GNU shorthand for `a ? a : b'.  */
436 int max = nrules ?: 10;
/* NOTE(review): the calloc result is used without a NULL check, while
   later growth goes through xrealloc.  */
437 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
438 int warned = 0;
439 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
441 while (1)
443 int valid = 0;
445 if (arg->tok == tok_forward)
447 if (rules[cnt] & sort_backward)
449 if (! warned)
451 lr_error (ldfile, _("\
452 %s: `forward' and `backward' are mutually excluding each other"),
453 "LC_COLLATE");
454 warned = 1;
457 else if (rules[cnt] & sort_forward)
/* NOTE(review): unlike the other diagnostics, the "mentioned more than
   once" ones do not set `warned' -- confirm this is intended.  */
459 if (! warned)
461 lr_error (ldfile, _("\
462 %s: `%s' mentioned more than once in definition of weight %d"),
463 "LC_COLLATE", "forward", cnt + 1);
466 else
467 rules[cnt] |= sort_forward;
469 valid = 1;
471 else if (arg->tok == tok_backward)
473 if (rules[cnt] & sort_forward)
475 if (! warned)
477 lr_error (ldfile, _("\
478 %s: `forward' and `backward' are mutually excluding each other"),
479 "LC_COLLATE");
480 warned = 1;
483 else if (rules[cnt] & sort_backward)
485 if (! warned)
487 lr_error (ldfile, _("\
488 %s: `%s' mentioned more than once in definition of weight %d"),
489 "LC_COLLATE", "backward", cnt + 1);
492 else
493 rules[cnt] |= sort_backward;
495 valid = 1;
497 else if (arg->tok == tok_position)
499 if (rules[cnt] & sort_position)
501 if (! warned)
503 lr_error (ldfile, _("\
504 %s: `%s' mentioned more than once in definition of weight %d"),
505 "LC_COLLATE", "position", cnt + 1);
508 else
509 rules[cnt] |= sort_position;
511 valid = 1;
/* A keyword was consumed; look at the separator that follows it.  */
514 if (valid)
515 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
517 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
518 || arg->tok == tok_semicolon)
520 if (! valid && ! warned)
522 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
523 warned = 1;
/* A comma continues the current level; anything else ends it, provided
   at least one keyword was seen for it.  */
526 /* See whether we have to increment the counter. */
527 if (arg->tok != tok_comma && rules[cnt] != 0)
529 /* Add the default `forward' if we have seen only `position'. */
530 if (rules[cnt] == sort_position)
531 rules[cnt] = sort_position | sort_forward;
533 ++cnt;
536 if (arg->tok == tok_eof || arg->tok == tok_eol)
537 /* End of line or file, so we exit the loop. */
538 break;
/* First list ever: the array grows on demand in steps of ten.  */
540 if (nrules == 0)
542 /* See whether we have enough room in the array. */
543 if (cnt == max)
545 max += 10;
/* NOTE(review): one line of the xrealloc call (embedded line 547) is
   missing from this copy; given the memset below it presumably reads
   `max'.  */
546 rules = (enum coll_sort_rule *) xrealloc (rules,
548 * sizeof (*rules));
549 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
552 else
/* NOTE(review): `cnt' is int and `nrules' is uint32_t, so this and the
   later comparisons are done in unsigned arithmetic; `nrules' is also
   printed with %d.  */
554 if (cnt == nrules)
556 /* There must not be any more rule. */
557 if (! warned)
559 lr_error (ldfile, _("\
560 %s: too many rules; first entry only had %d"),
561 "LC_COLLATE", nrules);
562 warned = 1;
565 lr_ignore_rest (ldfile, 0);
566 break;
570 else
572 if (! warned)
574 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
575 warned = 1;
579 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
582 if (nrules == 0)
584 /* Now we know how many rules we have. */
585 nrules = cnt;
/* Shrink the array to the number of levels actually seen.
   NOTE(review): if no level was parsed this requests zero bytes.  */
586 rules = (enum coll_sort_rule *) xrealloc (rules,
587 nrules * sizeof (*rules));
589 else
591 if (cnt < nrules)
593 /* Not enough rules in this specification. */
594 if (! warned)
595 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
/* Pad the missing levels with `forward' (the `do' line of this loop is
   missing from this copy).  */
598 rules[cnt] = sort_forward;
599 while (++cnt < nrules);
603 collate->current_section->rules = rules;
607 static struct element_t *
608 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
609 const char *str, size_t len)
611 void *result = NULL;
613 /* Search for the entries among the collation sequences already define. */
614 if (find_entry (&collate->seq_table, str, len, &result) != 0)
616 /* Nope, not define yet. So we see whether it is a
617 collation symbol. */
618 void *ptr;
620 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
622 /* It's a collation symbol. */
623 struct symbol_t *sym = (struct symbol_t *) ptr;
624 result = sym->order;
626 if (result == NULL)
627 result = sym->order = new_element (collate, NULL, 0, NULL,
628 NULL, 0, 0);
630 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
632 /* It's also no collation element. So it is a character
633 element defined later. */
634 result = new_element (collate, NULL, 0, NULL, str, len, 1);
635 /* Insert it into the sequence table. */
636 insert_entry (&collate->seq_table, str, len, result);
640 return (struct element_t *) result;
644 static void
645 unlink_element (struct locale_collate_t *collate)
647 if (collate->cursor == collate->start)
649 assert (collate->cursor->next == NULL);
650 assert (collate->cursor->last == NULL);
651 collate->cursor = NULL;
653 else
655 if (collate->cursor->next != NULL)
656 collate->cursor->next->last = collate->cursor->last;
657 if (collate->cursor->last != NULL)
658 collate->cursor->last->next = collate->cursor->next;
659 collate->cursor = collate->cursor->last;
/* Link ELEM into the order list directly after the cursor, make it the
   new cursor, and read its weights from the rest of the current input
   line.  One weight list is stored per level (`nrules' levels).  A
   level can be IGNORE, a symbol or UCS4 value, a string of symbols, or
   -- if ELLIPSIS is not tok_none -- an ellipsis.  Levels not given on
   the line default to the element itself (or to the ellipsis
   placeholder when ELLIPSIS is not tok_none).
   NOTE(review): `do' and brace-only lines are missing from this copy
   of the file; the loops they belong to are identified below by their
   `while' lines.  */
664 static void
665 insert_weights (struct linereader *ldfile, struct element_t *elem,
666 const struct charmap_t *charmap,
667 struct repertoire_t *repertoire, struct localedef_t *result,
668 enum token_t ellipsis)
670 int weight_cnt;
671 struct token *arg;
672 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
674 /* Initialize all the fields. */
675 elem->file = ldfile->fname;
676 elem->line = ldfile->lineno;
/* Doubly linked insertion of ELEM between the cursor and its
   successor.  */
678 elem->last = collate->cursor;
679 elem->next = collate->cursor ? collate->cursor->next : NULL;
680 if (collate->cursor != NULL && collate->cursor->next != NULL)
681 collate->cursor->next->last = elem;
682 if (collate->cursor != NULL)
683 collate->cursor->next = elem;
684 if (collate->start == NULL)
686 assert (collate->cursor == NULL);
687 collate->start = elem;
690 elem->section = collate->current_section;
/* The section's `last' only advances when ELEM is appended right
   behind the previous last element.  */
692 if (collate->current_section->first == NULL)
693 collate->current_section->first = elem;
694 if (collate->current_section->last == collate->cursor)
695 collate->current_section->last = elem;
697 collate->cursor = elem;
/* One (zeroed) weight list per level.  */
699 elem->weights = (struct element_list_t *)
700 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
701 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
703 weight_cnt = 0;
705 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
/* Here starts the per-level loop that ends with
   `while (++weight_cnt < nrules)' below.  Since the condition is
   tested after the body, the body runs at least once.  */
708 if (arg->tok == tok_eof || arg->tok == tok_eol)
709 break;
711 if (arg->tok == tok_ignore)
713 /* The weight for this level has to be ignored. We use the
714 null pointer to indicate this. */
715 elem->weights[weight_cnt].w = (struct element_t **)
716 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
717 elem->weights[weight_cnt].w[0] = NULL;
718 elem->weights[weight_cnt].cnt = 1;
720 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
722 char ucs4str[10];
723 struct element_t *val;
724 char *symstr;
725 size_t symlen;
727 if (arg->tok == tok_bsymbol)
729 symstr = arg->val.str.startmb;
730 symlen = arg->val.str.lenmb;
732 else
/* UCS4 values are looked up under the name "U" followed by eight
   upper-case hex digits, i.e. nine characters.  */
734 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
735 symstr = ucs4str;
736 symlen = 9;
739 val = find_element (ldfile, collate, symstr, symlen);
740 if (val == NULL)
741 break;
743 elem->weights[weight_cnt].w = (struct element_t **)
744 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
745 elem->weights[weight_cnt].w[0] = val;
746 elem->weights[weight_cnt].cnt = 1;
748 else if (arg->tok == tok_string)
750 /* Split the string up in the individual characters and put
751 the element definitions in the list. */
752 const char *cp = arg->val.str.startmb;
753 int cnt = 0;
754 struct element_t *charelem;
755 struct element_t **weights = NULL;
756 int max = 0;
758 if (*cp == '\0')
760 lr_error (ldfile, _("%s: empty weight string not allowed"),
761 "LC_COLLATE");
762 lr_ignore_rest (ldfile, 0);
763 break;
/* Here starts the loop over the string which ends with
   `while (*cp != '\0')' below.  */
768 if (*cp == '<')
770 /* Ahh, it's a bsymbol or an UCS4 value. If it's
771 the latter we have to unify the name. */
772 const char *startp = ++cp;
773 size_t len;
/* Find the closing `>', skipping over escaped characters.  */
775 while (*cp != '>')
777 if (*cp == ldfile->escape_char)
778 ++cp;
779 if (*cp == '\0')
780 /* It's a syntax error. */
781 goto syntax;
783 ++cp;
/* A name of the form Uxxxx (four hex digits) is converted to the
   eight digit form.
   NOTE(review): isxdigit is called with plain `char' values, and the
   buffer allocated with xmalloc below is not freed anywhere in the
   visible code.  */
786 if (cp - startp == 5 && startp[0] == 'U'
787 && isxdigit (startp[1]) && isxdigit (startp[2])
788 && isxdigit (startp[3]) && isxdigit (startp[4]))
790 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
791 char *newstr;
793 newstr = (char *) xmalloc (10);
794 snprintf (newstr, 10, "U%08X", ucs4);
795 startp = newstr;
797 len = 9;
799 else
800 len = cp - startp;
802 charelem = find_element (ldfile, collate, startp, len);
803 ++cp;
805 else
807 /* People really shouldn't use characters directly in
808 the string. Especially since it's not really clear
809 what this means. We interpret all characters in the
810 string as if that would be bsymbols. Otherwise we
811 would have to match back to bsymbols somehow and this
812 is normally not what people normally expect. */
813 charelem = find_element (ldfile, collate, cp++, 1);
816 if (charelem == NULL)
818 /* We ignore the rest of the line. */
819 lr_ignore_rest (ldfile, 0);
820 break;
/* The temporary pointer array lives on the stack and grows in steps of
   ten.  NOTE(review): every growth step calls alloca again inside the
   loop, and the first memcpy is passed a NULL source with length
   zero.  */
823 /* Add the pointer. */
824 if (cnt >= max)
826 struct element_t **newp;
827 max += 10;
828 newp = (struct element_t **)
829 alloca (max * sizeof (struct element_t *));
830 memcpy (newp, weights, cnt * sizeof (struct element_t *));
831 weights = newp;
833 weights[cnt++] = charelem;
835 while (*cp != '\0');
837 /* Now store the information. */
838 elem->weights[weight_cnt].w = (struct element_t **)
839 obstack_alloc (&collate->mempool,
840 cnt * sizeof (struct element_t *));
841 memcpy (elem->weights[weight_cnt].w, weights,
842 cnt * sizeof (struct element_t *));
843 elem->weights[weight_cnt].cnt = cnt;
845 /* We don't need the string anymore. */
846 free (arg->val.str.startmb);
848 else if (ellipsis != tok_none
849 && (arg->tok == tok_ellipsis2
850 || arg->tok == tok_ellipsis3
851 || arg->tok == tok_ellipsis4))
853 /* It must be the same ellipsis as used in the initial column. */
854 if (arg->tok != ellipsis)
855 lr_error (ldfile, _("\
856 %s: weights must use the same ellipsis symbol as the name"),
857 "LC_COLLATE");
/* NOTE(review): the placeholder is ELEMENT_ELLIPSIS2 for all three
   kinds of ellipsis.  */
859 /* The weight for this level will depend on the element
860 iterating over the range. Put a placeholder. */
861 elem->weights[weight_cnt].w = (struct element_t **)
862 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
863 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
864 elem->weights[weight_cnt].cnt = 1;
866 else
868 syntax:
869 /* It's a syntax error. */
870 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
871 lr_ignore_rest (ldfile, 0);
872 break;
875 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
876 /* This better should be the end of the line or a semicolon. */
877 if (arg->tok == tok_semicolon)
878 /* OK, ignore this and read the next token. */
879 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
880 else if (arg->tok != tok_eof && arg->tok != tok_eol)
882 /* It's a syntax error. */
883 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
884 lr_ignore_rest (ldfile, 0);
885 break;
/* End of the per-level loop.  `weight_cnt' is int, `nrules' is
   uint32_t.  */
888 while (++weight_cnt < nrules);
890 if (weight_cnt < nrules)
892 /* This means the rest of the line uses the current element as
893 the weight. */
/* Fill every remaining level (loop ends with the `while' below).  */
896 elem->weights[weight_cnt].w = (struct element_t **)
897 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
898 if (ellipsis == tok_none)
899 elem->weights[weight_cnt].w[0] = elem;
900 else
901 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
902 elem->weights[weight_cnt].cnt = 1;
904 while (++weight_cnt < nrules);
906 else
/* All levels were given.  Anything that still looks like a weight is
   an error; otherwise the rest of the line is skipped if the current
   token is not already the end of line or file.  */
908 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
910 /* Too many rule values. */
911 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
912 lr_ignore_rest (ldfile, 0);
914 else
915 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
/* Handle an order line whose first column is the name SYMSTR (SYMLEN
   bytes): find or create the element the name stands for and hand it
   to insert_weights, which links it into the order list and reads the
   weights.  Returns 0 on success and 1 if the element already has a
   place in the order (an error is reported and the rest of the line
   is skipped).  */
920 static int
921 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
922 const struct charmap_t *charmap, struct repertoire_t *repertoire,
923 struct localedef_t *result)
925 /* First find out what kind of symbol this is. */
926 struct charseq *seq;
927 uint32_t wc;
928 struct element_t *elem = NULL;
929 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
931 /* Try to find the character in the charmap. */
932 seq = charmap_find_value (charmap, symstr, symlen);
/* The repertoire is only consulted when the charmap entry has no wide
   character value yet; the result is cached in the charmap entry.  */
934 /* Determine the wide character. */
935 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
937 wc = repertoire_find_value (repertoire, symstr, symlen);
938 if (seq != NULL)
939 seq->ucs4 = wc;
941 else
942 wc = seq->ucs4;
944 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
946 /* It's no character, so look through the collation elements and
947 symbol list. */
948 void *ptr = elem;
949 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
/* NOTE(review): this local `result' shadows the function parameter of
   the same name until the end of this branch.  */
951 void *result;
952 struct symbol_t *sym = NULL;
954 /* It's also no collation element. Therefore it's either a
955 collating symbol or it's a character which is not
956 supported by the character set. In the latter case we
957 simply create a dummy entry. */
958 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
960 /* It's a collation symbol. */
961 sym = (struct symbol_t *) result;
963 elem = sym->order;
966 if (elem == NULL)
968 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
970 if (sym != NULL)
971 sym->order = elem;
972 else
973 /* Enter a fake element in the sequence table. This
974 won't cause anything in the output since there is
975 no multibyte or wide character associated with
976 it. */
977 insert_entry (&collate->seq_table, symstr, symlen, elem);
980 else
981 /* Copy the result back. */
982 elem = ptr;
984 else
986 /* Otherwise the symbols stands for a character. */
987 void *ptr = elem;
988 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
990 uint32_t wcs[2] = { wc, 0 };
992 /* We have to allocate an entry. */
993 elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
994 seq != NULL ? seq->nbytes : 0,
995 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
996 symstr, symlen, 1);
998 /* And add it to the table. */
999 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
1000 /* This cannot happen. */
1001 assert (! "Internal error");
1003 else
1005 /* Copy the result back. */
1006 elem = ptr;
1008 /* Maybe the character was used before the definition. In this case
1009 we have to insert the byte sequences now. */
1010 if (elem->mbs == NULL && seq != NULL)
1012 elem->mbs = obstack_copy0 (&collate->mempool,
1013 seq->bytes, seq->nbytes);
1014 elem->nmbs = seq->nbytes;
1017 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1019 uint32_t wcs[2] = { wc, 0 };
1021 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1022 elem->nwcs = 1;
/* An element is part of the order list if it has a successor or is the
   cursor, i.e. the most recently inserted element.  */
1027 /* Test whether this element is not already in the list. */
1028 if (elem->next != NULL || elem == collate->cursor)
1030 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1031 (int) symlen, symstr, elem->file, elem->line);
1032 lr_ignore_rest (ldfile, 0);
1033 return 1;
1036 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1038 return 0;
1042 static void
1043 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1044 enum token_t ellipsis, const struct charmap_t *charmap,
1045 struct repertoire_t *repertoire,
1046 struct localedef_t *result)
1048 struct element_t *startp;
1049 struct element_t *endp;
1050 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1052 /* Unlink the entry added for the ellipsis. */
1053 unlink_element (collate);
1054 startp = collate->cursor;
1056 /* Process and add the end-entry. */
1057 if (symstr != NULL
1058 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1059 /* Something went wrong with inserting the to-value. This means
1060 we cannot process the ellipsis. */
1061 return;
1063 /* Reset the cursor. */
1064 collate->cursor = startp;
1066 /* Now we have to handle many different situations:
1067 - we have to distinguish between the three different ellipsis forms
1068 - the is the ellipsis at the beginning, in the middle, or at the end.
1070 endp = collate->cursor->next;
1071 assert (symstr == NULL || endp != NULL);
1073 /* XXX The following is probably very wrong since also collating symbols
1074 can appear in ranges. But do we want/can refine the test for that? */
1075 #if 0
1076 /* Both, the start and the end symbol, must stand for characters. */
1077 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1078 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1080 lr_error (ldfile, _("\
1081 %s: the start and the end symbol of a range must stand for characters"),
1082 "LC_COLLATE");
1083 return;
1085 #endif
1087 if (ellipsis == tok_ellipsis3)
1089 /* One requirement we make here: the length of the byte
1090 sequences for the first and end character must be the same.
1091 This is mainly to prevent unwanted effects and this is often
1092 not what is wanted. */
1093 size_t len = (startp->mbs != NULL ? startp->nmbs
1094 : (endp->mbs != NULL ? endp->nmbs : 0));
1095 char mbcnt[len + 1];
1096 char mbend[len + 1];
1098 /* Well, this should be caught somewhere else already. Just to
1099 make sure. */
1100 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1101 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1103 if (startp != NULL && endp != NULL
1104 && startp->mbs != NULL && endp->mbs != NULL
1105 && startp->nmbs != endp->nmbs)
1107 lr_error (ldfile, _("\
1108 %s: byte sequences of first and last character must have the same length"),
1109 "LC_COLLATE");
1110 return;
1113 /* Determine whether we have to generate multibyte sequences. */
1114 if ((startp == NULL || startp->mbs != NULL)
1115 && (endp == NULL || endp->mbs != NULL))
1117 int cnt;
1118 int ret;
1120 /* Prepare the beginning byte sequence. This is either from the
1121 beginning byte sequence or it is all nulls if it was an
1122 initial ellipsis. */
1123 if (startp == NULL || startp->mbs == NULL)
1124 memset (mbcnt, '\0', len);
1125 else
1127 memcpy (mbcnt, startp->mbs, len);
1129 /* And increment it so that the value is the first one we will
1130 try to insert. */
1131 for (cnt = len - 1; cnt >= 0; --cnt)
1132 if (++mbcnt[cnt] != '\0')
1133 break;
1135 mbcnt[len] = '\0';
1137 /* And the end sequence. */
1138 if (endp == NULL || endp->mbs == NULL)
1139 memset (mbend, '\0', len);
1140 else
1141 memcpy (mbend, endp->mbs, len);
1142 mbend[len] = '\0';
1144 /* Test whether we have a correct range. */
1145 ret = memcmp (mbcnt, mbend, len);
1146 if (ret >= 0)
1148 if (ret > 0)
1149 lr_error (ldfile, _("%s: byte sequence of first character of \
1150 range is not lower than that of the last character"), "LC_COLLATE");
1151 return;
1154 /* Generate the byte sequences data. */
1155 while (1)
1157 struct charseq *seq;
1159 /* Quite a bit of work ahead. We have to find the character
1160 definition for the byte sequence and then determine the
1161 wide character belonging to it. */
1162 seq = charmap_find_symbol (charmap, mbcnt, len);
1163 if (seq != NULL)
1165 struct element_t *elem;
1166 size_t namelen;
1168 /* I don't think this can ever happen. */
1169 assert (seq->name != NULL);
1170 namelen = strlen (seq->name);
1172 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1173 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1174 namelen);
1176 /* Now we are ready to insert the new value in the
1177 sequence. Find out whether the element is
1178 already known. */
1179 void *ptr;
1180 if (find_entry (&collate->seq_table, seq->name, namelen,
1181 &ptr) != 0)
1183 uint32_t wcs[2] = { seq->ucs4, 0 };
1185 /* We have to allocate an entry. */
1186 elem = new_element (collate, mbcnt, len,
1187 seq->ucs4 == ILLEGAL_CHAR_VALUE
1188 ? NULL : wcs, seq->name,
1189 namelen, 1);
1191 /* And add it to the table. */
1192 if (insert_entry (&collate->seq_table, seq->name,
1193 namelen, elem) != 0)
1194 /* This cannot happen. */
1195 assert (! "Internal error");
1197 else
1198 /* Copy the result. */
1199 elem = ptr;
1201 /* Test whether this element is not already in the list. */
1202 if (elem->next != NULL || (collate->cursor != NULL
1203 && elem->next == collate->cursor))
1205 lr_error (ldfile, _("\
1206 order for `%.*s' already defined at %s:%Zu"),
1207 (int) namelen, seq->name,
1208 elem->file, elem->line);
1209 goto increment;
1212 /* Enqueue the new element. */
1213 elem->last = collate->cursor;
1214 if (collate->cursor == NULL)
1215 elem->next = NULL;
1216 else
1218 elem->next = collate->cursor->next;
1219 elem->last->next = elem;
1220 if (elem->next != NULL)
1221 elem->next->last = elem;
1223 if (collate->start == NULL)
1225 assert (collate->cursor == NULL);
1226 collate->start = elem;
1228 collate->cursor = elem;
1230 /* Add the weight value. We take them from the
1231 `ellipsis_weights' member of `collate'. */
1232 elem->weights = (struct element_list_t *)
1233 obstack_alloc (&collate->mempool,
1234 nrules * sizeof (struct element_list_t));
1235 for (cnt = 0; cnt < nrules; ++cnt)
1236 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1237 && (collate->ellipsis_weight.weights[cnt].w[0]
1238 == ELEMENT_ELLIPSIS2))
1240 elem->weights[cnt].w = (struct element_t **)
1241 obstack_alloc (&collate->mempool,
1242 sizeof (struct element_t *));
1243 elem->weights[cnt].w[0] = elem;
1244 elem->weights[cnt].cnt = 1;
1246 else
1248 /* Simply use the weight from `ellipsis_weight'. */
1249 elem->weights[cnt].w =
1250 collate->ellipsis_weight.weights[cnt].w;
1251 elem->weights[cnt].cnt =
1252 collate->ellipsis_weight.weights[cnt].cnt;
1256 /* Increment for the next round. */
1257 increment:
1258 for (cnt = len - 1; cnt >= 0; --cnt)
1259 if (++mbcnt[cnt] != '\0')
1260 break;
1262 /* Find out whether this was all. */
1263 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1264 /* Yep, that's all. */
1265 break;
1269 else
1271 /* For symbolic range we naturally must have a beginning and an
1272 end specified by the user. */
1273 if (startp == NULL)
1274 lr_error (ldfile, _("\
1275 %s: symbolic range ellipsis must not directly follow `order_start'"),
1276 "LC_COLLATE");
1277 else if (endp == NULL)
1278 lr_error (ldfile, _("\
1279 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1280 "LC_COLLATE");
1281 else
1283 /* Determine the range. To do so we have to determine the
1284 common prefix of the both names and then the numeric
1285 values of both ends. */
1286 size_t lenfrom = strlen (startp->name);
1287 size_t lento = strlen (endp->name);
1288 char buf[lento + 1];
1289 int preflen = 0;
1290 long int from;
1291 long int to;
1292 char *cp;
1293 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1295 if (lenfrom != lento)
1297 invalid_range:
1298 lr_error (ldfile, _("\
1299 `%s' and `%.*s' are not valid names for symbolic range"),
1300 startp->name, (int) lento, endp->name);
1301 return;
1304 while (startp->name[preflen] == endp->name[preflen])
1305 if (startp->name[preflen] == '\0')
1306 /* Nothing to be done. The start and end point are identical
1307 and while inserting the end point we have already given
1308 the user an error message. */
1309 return;
1310 else
1311 ++preflen;
1313 errno = 0;
1314 from = strtol (startp->name + preflen, &cp, base);
1315 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1316 goto invalid_range;
1318 errno = 0;
1319 to = strtol (endp->name + preflen, &cp, base);
1320 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1321 goto invalid_range;
1323 /* Copy the prefix. */
1324 memcpy (buf, startp->name, preflen);
1326 /* Loop over all values. */
1327 for (++from; from < to; ++from)
1329 struct element_t *elem = NULL;
1330 struct charseq *seq;
1331 uint32_t wc;
1332 int cnt;
1334 /* Generate the name. */
1335 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1336 (int) (lenfrom - preflen), from);
1338 /* Look whether this name is already defined. */
1339 void *ptr;
1340 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1342 /* Copy back the result. */
1343 elem = ptr;
1345 if (elem->next != NULL || (collate->cursor != NULL
1346 && elem->next == collate->cursor))
1348 lr_error (ldfile, _("\
1349 %s: order for `%.*s' already defined at %s:%Zu"),
1350 "LC_COLLATE", (int) lenfrom, buf,
1351 elem->file, elem->line);
1352 continue;
1355 if (elem->name == NULL)
1357 lr_error (ldfile, _("%s: `%s' must be a character"),
1358 "LC_COLLATE", buf);
1359 continue;
1363 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1365 /* Search for a character of this name. */
1366 seq = charmap_find_value (charmap, buf, lenfrom);
1367 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1369 wc = repertoire_find_value (repertoire, buf, lenfrom);
1371 if (seq != NULL)
1372 seq->ucs4 = wc;
1374 else
1375 wc = seq->ucs4;
1377 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1378 /* We don't know anything about a character with this
1379 name. XXX Should we warn? */
1380 continue;
1382 if (elem == NULL)
1384 uint32_t wcs[2] = { wc, 0 };
1386 /* We have to allocate an entry. */
1387 elem = new_element (collate,
1388 seq != NULL ? seq->bytes : NULL,
1389 seq != NULL ? seq->nbytes : 0,
1390 wc == ILLEGAL_CHAR_VALUE
1391 ? NULL : wcs, buf, lenfrom, 1);
1393 else
1395 /* Update the element. */
1396 if (seq != NULL)
1398 elem->mbs = obstack_copy0 (&collate->mempool,
1399 seq->bytes, seq->nbytes);
1400 elem->nmbs = seq->nbytes;
1403 if (wc != ILLEGAL_CHAR_VALUE)
1405 uint32_t zero = 0;
1407 obstack_grow (&collate->mempool,
1408 &wc, sizeof (uint32_t));
1409 obstack_grow (&collate->mempool,
1410 &zero, sizeof (uint32_t));
1411 elem->wcs = obstack_finish (&collate->mempool);
1412 elem->nwcs = 1;
1416 elem->file = ldfile->fname;
1417 elem->line = ldfile->lineno;
1418 elem->section = collate->current_section;
1421 /* Enqueue the new element. */
1422 elem->last = collate->cursor;
1423 elem->next = collate->cursor->next;
1424 elem->last->next = elem;
1425 if (elem->next != NULL)
1426 elem->next->last = elem;
1427 collate->cursor = elem;
1429 /* Now add the weights. They come from the `ellipsis_weights'
1430 member of `collate'. */
1431 elem->weights = (struct element_list_t *)
1432 obstack_alloc (&collate->mempool,
1433 nrules * sizeof (struct element_list_t));
1434 for (cnt = 0; cnt < nrules; ++cnt)
1435 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1436 && (collate->ellipsis_weight.weights[cnt].w[0]
1437 == ELEMENT_ELLIPSIS2))
1439 elem->weights[cnt].w = (struct element_t **)
1440 obstack_alloc (&collate->mempool,
1441 sizeof (struct element_t *));
1442 elem->weights[cnt].w[0] = elem;
1443 elem->weights[cnt].cnt = 1;
1445 else
1447 /* Simply use the weight from `ellipsis_weight'. */
1448 elem->weights[cnt].w =
1449 collate->ellipsis_weight.weights[cnt].w;
1450 elem->weights[cnt].cnt =
1451 collate->ellipsis_weight.weights[cnt].cnt;
1459 static void
1460 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1461 struct localedef_t *copy_locale, int ignore_content)
1463 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1465 struct locale_collate_t *collate;
1467 if (copy_locale == NULL)
1469 collate = locale->categories[LC_COLLATE].collate =
1470 (struct locale_collate_t *)
1471 xcalloc (1, sizeof (struct locale_collate_t));
1473 /* Init the various data structures. */
1474 init_hash (&collate->elem_table, 100);
1475 init_hash (&collate->sym_table, 100);
1476 init_hash (&collate->seq_table, 500);
1477 obstack_init (&collate->mempool);
1479 collate->col_weight_max = -1;
1481 else
1482 /* Reuse the copy_locale's data structures. */
1483 collate = locale->categories[LC_COLLATE].collate =
1484 copy_locale->categories[LC_COLLATE].collate;
1487 ldfile->translate_strings = 0;
1488 ldfile->return_widestr = 0;
1492 void
1493 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1495 /* Now is the time when we can assign the individual collation
1496 values for all the symbols. We have possibly different values
1497 for the wide- and the multibyte-character symbols. This is done
1498 since it might make a difference in the encoding if there is in
1499 some cases no multibyte-character but there are wide-characters.
1500 (The other way around it is not important since theencoded
1501 collation value in the wide-character case is 32 bits wide and
1502 therefore requires no encoding).
1504 The lowest collation value assigned is 2. Zero is reserved for
1505 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1506 functions and 1 is used to separate the individual passes for the
1507 different rules.
1509 We also have to construct is list with all the bytes/words which
1510 can come first in a sequence, followed by all the elements which
1511 also start with this byte/word. The order is reverse which has
1512 among others the important effect that longer strings are located
1513 first in the list. This is required for the output data since
1514 the algorithm used in `strcoll' etc depends on this.
1516 The multibyte case is easy. We simply sort into an array with
1517 256 elements. */
1518 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1519 int mbact[nrules];
1520 int wcact;
1521 int mbseqact;
1522 int wcseqact;
1523 struct element_t *runp;
1524 int i;
1525 int need_undefined = 0;
1526 struct section_list *sect;
1527 int ruleidx;
1528 int nr_wide_elems = 0;
1530 if (collate == NULL)
1532 /* No data, no check. */
1533 if (! be_quiet)
1534 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1535 "LC_COLLATE"));
1536 return;
1539 /* If this assertion is hit change the type in `element_t'. */
1540 assert (nrules <= sizeof (runp->used_in_level) * 8);
1542 /* Make sure that the `position' rule is used either in all sections
1543 or in none. */
1544 for (i = 0; i < nrules; ++i)
1545 for (sect = collate->sections; sect != NULL; sect = sect->next)
1546 if (sect->rules != NULL
1547 && ((sect->rules[i] & sort_position)
1548 != (collate->sections->rules[i] & sort_position)))
1550 WITH_CUR_LOCALE (error (0, 0, _("\
1551 %s: `position' must be used for a specific level in all sections or none"),
1552 "LC_COLLATE"));
1553 break;
1556 /* Find out which elements are used at which level. At the same
1557 time we find out whether we have any undefined symbols. */
1558 runp = collate->start;
1559 while (runp != NULL)
1561 if (runp->mbs != NULL)
1563 for (i = 0; i < nrules; ++i)
1565 int j;
1567 for (j = 0; j < runp->weights[i].cnt; ++j)
1568 /* A NULL pointer as the weight means IGNORE. */
1569 if (runp->weights[i].w[j] != NULL)
1571 if (runp->weights[i].w[j]->weights == NULL)
1573 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1574 runp->line,
1575 _("symbol `%s' not defined"),
1576 runp->weights[i].w[j]->name));
1578 need_undefined = 1;
1579 runp->weights[i].w[j] = &collate->undefined;
1581 else
1582 /* Set the bit for the level. */
1583 runp->weights[i].w[j]->used_in_level |= 1 << i;
1588 /* Up to the next entry. */
1589 runp = runp->next;
1592 /* Walk through the list of defined sequences and assign weights. Also
1593 create the data structure which will allow generating the single byte
1594 character based tables.
1596 Since at each time only the weights for each of the rules are
1597 only compared to other weights for this rule it is possible to
1598 assign more compact weight values than simply counting all
1599 weights in sequence. We can assign weights from 3, one for each
1600 rule individually and only for those elements, which are actually
1601 used for this rule.
1603 Why is this important? It is not for the wide char table. But
1604 it is for the singlebyte output since here larger numbers have to
1605 be encoded to make it possible to emit the value as a byte
1606 string. */
1607 for (i = 0; i < nrules; ++i)
1608 mbact[i] = 2;
1609 wcact = 2;
1610 mbseqact = 0;
1611 wcseqact = 0;
1612 runp = collate->start;
1613 while (runp != NULL)
1615 /* Determine the order. */
1616 if (runp->used_in_level != 0)
1618 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1619 nrules * sizeof (int));
1621 for (i = 0; i < nrules; ++i)
1622 if ((runp->used_in_level & (1 << i)) != 0)
1623 runp->mborder[i] = mbact[i]++;
1624 else
1625 runp->mborder[i] = 0;
1628 if (runp->mbs != NULL)
1630 struct element_t **eptr;
1631 struct element_t *lastp = NULL;
1633 /* Find the point where to insert in the list. */
1634 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1635 while (*eptr != NULL)
1637 if ((*eptr)->nmbs < runp->nmbs)
1638 break;
1640 if ((*eptr)->nmbs == runp->nmbs)
1642 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1644 if (c == 0)
1646 /* This should not happen. It means that we have
1647 to symbols with the same byte sequence. It is
1648 of course an error. */
1649 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1650 (*eptr)->line,
1651 _("\
1652 symbol `%s' has the same encoding as"), (*eptr)->name);
1653 error_at_line (0, 0, runp->file,
1654 runp->line,
1655 _("symbol `%s'"),
1656 runp->name));
1657 goto dont_insert;
1659 else if (c < 0)
1660 /* Insert it here. */
1661 break;
1664 /* To the next entry. */
1665 lastp = *eptr;
1666 eptr = &(*eptr)->mbnext;
1669 /* Set the pointers. */
1670 runp->mbnext = *eptr;
1671 runp->mblast = lastp;
1672 if (*eptr != NULL)
1673 (*eptr)->mblast = runp;
1674 *eptr = runp;
1675 dont_insert:
1679 if (runp->used_in_level)
1681 runp->wcorder = wcact++;
1683 /* We take the opportunity to count the elements which have
1684 wide characters. */
1685 ++nr_wide_elems;
1688 if (runp->is_character)
1690 if (runp->nmbs == 1)
1691 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1693 runp->wcseqorder = wcseqact++;
1695 else if (runp->mbs != NULL && runp->weights != NULL)
1696 /* This is for collation elements. */
1697 runp->wcseqorder = wcseqact++;
1699 /* Up to the next entry. */
1700 runp = runp->next;
1703 /* Find out whether any of the `mbheads' entries is unset. In this
1704 case we use the UNDEFINED entry. */
1705 for (i = 1; i < 256; ++i)
1706 if (collate->mbheads[i] == NULL)
1708 need_undefined = 1;
1709 collate->mbheads[i] = &collate->undefined;
1712 /* Now to the wide character case. */
1713 collate->wcheads.p = 6;
1714 collate->wcheads.q = 10;
1715 wchead_table_init (&collate->wcheads);
1717 collate->wcseqorder.p = 6;
1718 collate->wcseqorder.q = 10;
1719 collseq_table_init (&collate->wcseqorder);
1721 /* Start adding. */
1722 runp = collate->start;
1723 while (runp != NULL)
1725 if (runp->wcs != NULL)
1727 struct element_t *e;
1728 struct element_t **eptr;
1729 struct element_t *lastp;
1731 /* Insert the collation sequence value. */
1732 if (runp->is_character)
1733 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1734 runp->wcseqorder);
1736 /* Find the point where to insert in the list. */
1737 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1738 eptr = &e;
1739 lastp = NULL;
1740 while (*eptr != NULL)
1742 if ((*eptr)->nwcs < runp->nwcs)
1743 break;
1745 if ((*eptr)->nwcs == runp->nwcs)
1747 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1748 (wchar_t *) runp->wcs, runp->nwcs);
1750 if (c == 0)
1752 /* This should not happen. It means that we have
1753 two symbols with the same byte sequence. It is
1754 of course an error. */
1755 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1756 (*eptr)->line,
1757 _("\
1758 symbol `%s' has the same encoding as"), (*eptr)->name);
1759 error_at_line (0, 0, runp->file,
1760 runp->line,
1761 _("symbol `%s'"),
1762 runp->name));
1763 goto dont_insertwc;
1765 else if (c < 0)
1766 /* Insert it here. */
1767 break;
1770 /* To the next entry. */
1771 lastp = *eptr;
1772 eptr = &(*eptr)->wcnext;
1775 /* Set the pointers. */
1776 runp->wcnext = *eptr;
1777 runp->wclast = lastp;
1778 if (*eptr != NULL)
1779 (*eptr)->wclast = runp;
1780 *eptr = runp;
1781 if (eptr == &e)
1782 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1783 dont_insertwc:
1787 /* Up to the next entry. */
1788 runp = runp->next;
1791 collseq_table_finalize (&collate->wcseqorder);
1793 /* Now determine whether the UNDEFINED entry is needed and if yes,
1794 whether it was defined. */
1795 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1796 if (collate->undefined.file == NULL)
1798 if (need_undefined)
1800 /* This seems not to be enforced by recent standards. Don't
1801 emit an error, simply append UNDEFINED at the end. */
1802 if (0)
1803 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1805 /* Add UNDEFINED at the end. */
1806 collate->undefined.mborder =
1807 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1809 for (i = 0; i < nrules; ++i)
1810 collate->undefined.mborder[i] = mbact[i]++;
1813 /* In any case we will need the definition for the wide character
1814 case. But we will not complain that it is missing since the
1815 specification strangely enough does not seem to account for
1816 this. */
1817 collate->undefined.wcorder = wcact++;
1820 /* Finally, try to unify the rules for the sections. Whenever the rules
1821 for a section are the same as those for another section give the
1822 ruleset the same index. Since there are never many section we can
1823 use an O(n^2) algorithm here. */
1824 sect = collate->sections;
1825 while (sect != NULL && sect->rules == NULL)
1826 sect = sect->next;
1828 /* Bail out if we have no sections because of earlier errors. */
1829 if (sect == NULL)
1831 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1832 _("too many errors; giving up")));
1833 return;
1836 ruleidx = 0;
1839 struct section_list *osect = collate->sections;
1841 while (osect != sect)
1842 if (osect->rules != NULL
1843 && memcmp (osect->rules, sect->rules, nrules) == 0)
1844 break;
1845 else
1846 osect = osect->next;
1848 if (osect == sect)
1849 sect->ruleidx = ruleidx++;
1850 else
1851 sect->ruleidx = osect->ruleidx;
1853 /* Next section. */
1855 sect = sect->next;
1856 while (sect != NULL && sect->rules == NULL);
1858 while (sect != NULL);
1859 /* We are currently not prepared for more than 128 rulesets. But this
1860 should never really be a problem. */
1861 assert (ruleidx <= 128);
1865 static int32_t
1866 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1867 struct element_t *elem)
1869 size_t cnt;
1870 int32_t retval;
1872 /* Optimize the use of UNDEFINED. */
1873 if (elem == &collate->undefined)
1874 /* The weights are already inserted. */
1875 return 0;
1877 /* This byte can start exactly one collation element and this is
1878 a single byte. We can directly give the index to the weights. */
1879 retval = obstack_object_size (pool);
1881 /* Construct the weight. */
1882 for (cnt = 0; cnt < nrules; ++cnt)
1884 char buf[elem->weights[cnt].cnt * 7];
1885 int len = 0;
1886 int i;
1888 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1889 /* Encode the weight value. We do nothing for IGNORE entries. */
1890 if (elem->weights[cnt].w[i] != NULL)
1891 len += utf8_encode (&buf[len],
1892 elem->weights[cnt].w[i]->mborder[cnt]);
1894 /* And add the buffer content. */
1895 obstack_1grow (pool, len);
1896 obstack_grow (pool, buf, len);
1899 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1903 static int32_t
1904 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1905 struct element_t *elem)
1907 size_t cnt;
1908 int32_t retval;
1910 /* Optimize the use of UNDEFINED. */
1911 if (elem == &collate->undefined)
1912 /* The weights are already inserted. */
1913 return 0;
1915 /* This byte can start exactly one collation element and this is
1916 a single byte. We can directly give the index to the weights. */
1917 retval = obstack_object_size (pool) / sizeof (int32_t);
1919 /* Construct the weight. */
1920 for (cnt = 0; cnt < nrules; ++cnt)
1922 int32_t buf[elem->weights[cnt].cnt];
1923 int i;
1924 int32_t j;
1926 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1927 if (elem->weights[cnt].w[i] != NULL)
1928 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1930 /* And add the buffer content. */
1931 obstack_int32_grow (pool, j);
1933 obstack_grow (pool, buf, j * sizeof (int32_t));
1936 return retval | ((elem->section->ruleidx & 0x7f) << 24);
/* State consulted by `add_to_tablewc', which receives only a character
   and an element as arguments.  If localedef is ever threaded, this
   would need to be a __thread variable.  */
static struct
{
  /* Pool handed to `output_weightwc' for the weight records.  */
  struct obstack *weightpool;
  /* Pool receiving the entries for multi-element lists.  */
  struct obstack *extrapool;
  /* Pool receiving the weight indices of consecutive series.  */
  struct obstack *indpool;
  /* Collation data handed to `output_weightwc'.  */
  struct locale_collate_t *collate;
  /* Table the per-character index is added to.  */
  struct collidx_table *tablewc;
} atwc;

static void add_to_tablewc (uint32_t ch, struct element_t *runp);
1951 static void
1952 add_to_tablewc (uint32_t ch, struct element_t *runp)
1954 if (runp->wcnext == NULL && runp->nwcs == 1)
1956 int32_t weigthidx = output_weightwc (atwc.weightpool, atwc.collate,
1957 runp);
1958 collidx_table_add (atwc.tablewc, ch, weigthidx);
1960 else
1962 /* As for the singlebyte table, we recognize sequences and
1963 compress them. */
1964 struct element_t *lastp;
1966 collidx_table_add (atwc.tablewc, ch,
1967 -(obstack_object_size (atwc.extrapool)
1968 / sizeof (uint32_t)));
1972 /* Store the current index in the weight table. We know that
1973 the current position in the `extrapool' is aligned on a
1974 32-bit address. */
1975 int32_t weightidx;
1976 int added;
1978 /* Find out wether this is a single entry or we have more than
1979 one consecutive entry. */
1980 if (runp->wcnext != NULL
1981 && runp->nwcs == runp->wcnext->nwcs
1982 && wmemcmp ((wchar_t *) runp->wcs,
1983 (wchar_t *)runp->wcnext->wcs,
1984 runp->nwcs - 1) == 0
1985 && (runp->wcs[runp->nwcs - 1]
1986 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
1988 int i;
1989 struct element_t *series_startp = runp;
1990 struct element_t *curp;
1992 /* Now add first the initial byte sequence. */
1993 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
1994 if (sizeof (int32_t) == sizeof (int))
1995 obstack_make_room (atwc.extrapool, added);
1997 /* More than one consecutive entry. We mark this by having
1998 a negative index into the indirect table. */
1999 obstack_int32_grow_fast (atwc.extrapool,
2000 -(obstack_object_size (atwc.indpool)
2001 / sizeof (int32_t)));
2002 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2005 runp = runp->wcnext;
2006 while (runp->wcnext != NULL
2007 && runp->nwcs == runp->wcnext->nwcs
2008 && wmemcmp ((wchar_t *) runp->wcs,
2009 (wchar_t *)runp->wcnext->wcs,
2010 runp->nwcs - 1) == 0
2011 && (runp->wcs[runp->nwcs - 1]
2012 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2014 /* Now walk backward from here to the beginning. */
2015 curp = runp;
2017 for (i = 1; i < runp->nwcs; ++i)
2018 obstack_int32_grow_fast (atwc.extrapool, curp->wcs[i]);
2020 /* Now find the end of the consecutive sequence and
2021 add all the indeces in the indirect pool. */
2024 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2025 curp);
2026 obstack_int32_grow (atwc.indpool, weightidx);
2028 curp = curp->wclast;
2030 while (curp != series_startp);
2032 /* Add the final weight. */
2033 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2034 curp);
2035 obstack_int32_grow (atwc.indpool, weightidx);
2037 /* And add the end byte sequence. Without length this
2038 time. */
2039 for (i = 1; i < curp->nwcs; ++i)
2040 obstack_int32_grow (atwc.extrapool, curp->wcs[i]);
2042 else
2044 /* A single entry. Simply add the index and the length and
2045 string (except for the first character which is already
2046 tested for). */
2047 int i;
2049 /* Output the weight info. */
2050 weightidx = output_weightwc (atwc.weightpool, atwc.collate,
2051 runp);
2053 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2054 if (sizeof (int) == sizeof (int32_t))
2055 obstack_make_room (atwc.extrapool, added);
2057 obstack_int32_grow_fast (atwc.extrapool, weightidx);
2058 obstack_int32_grow_fast (atwc.extrapool, runp->nwcs - 1);
2059 for (i = 1; i < runp->nwcs; ++i)
2060 obstack_int32_grow_fast (atwc.extrapool, runp->wcs[i]);
2063 /* Next entry. */
2064 lastp = runp;
2065 runp = runp->wcnext;
2067 while (runp != NULL);
2071 void
2072 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
2073 const char *output_path)
2075 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
2076 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
2077 struct iovec iov[2 + nelems];
2078 struct locale_file data;
2079 uint32_t idx[nelems];
2080 size_t cnt;
2081 size_t ch;
2082 int32_t tablemb[256];
2083 struct obstack weightpool;
2084 struct obstack extrapool;
2085 struct obstack indirectpool;
2086 struct section_list *sect;
2087 struct collidx_table tablewc;
2088 uint32_t elem_size;
2089 uint32_t *elem_table;
2090 int i;
2091 struct element_t *runp;
2093 data.magic = LIMAGIC (LC_COLLATE);
2094 data.n = nelems;
2095 iov[0].iov_base = (void *) &data;
2096 iov[0].iov_len = sizeof (data);
2098 iov[1].iov_base = (void *) idx;
2099 iov[1].iov_len = sizeof (idx);
2101 idx[0] = iov[0].iov_len + iov[1].iov_len;
2102 cnt = 0;
2104 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
2105 iov[2 + cnt].iov_base = &nrules;
2106 iov[2 + cnt].iov_len = sizeof (uint32_t);
2107 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2108 ++cnt;
2110 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
2111 if (collate == NULL)
2113 int32_t dummy = 0;
2115 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
2117 /* The words have to be handled specially. */
2118 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
2120 iov[2 + cnt].iov_base = &dummy;
2121 iov[2 + cnt].iov_len = sizeof (int32_t);
2123 else
2125 iov[2 + cnt].iov_base = NULL;
2126 iov[2 + cnt].iov_len = 0;
2129 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
2130 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2131 ++cnt;
2134 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2136 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2138 return;
2141 obstack_init (&weightpool);
2142 obstack_init (&extrapool);
2143 obstack_init (&indirectpool);
2145 /* Since we are using the sign of an integer to mark indirection the
2146 offsets in the arrays we are indirectly referring to must not be
2147 zero since -0 == 0. Therefore we add a bit of dummy content. */
2148 obstack_int32_grow (&extrapool, 0);
2149 obstack_int32_grow (&indirectpool, 0);
2151 /* Prepare the ruleset table. */
2152 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2153 if (sect->rules != NULL && sect->ruleidx == i)
2155 int j;
2157 obstack_make_room (&weightpool, nrules);
2159 for (j = 0; j < nrules; ++j)
2160 obstack_1grow_fast (&weightpool, sect->rules[j]);
2161 ++i;
2163 /* And align the output. */
2164 i = (nrules * i) % __alignof__ (int32_t);
2165 if (i > 0)
2167 obstack_1grow (&weightpool, '\0');
2168 while (++i < __alignof__ (int32_t));
2170 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2171 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2172 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2173 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2174 ++cnt;
2176 /* Generate the 8-bit table. Walk through the lists of sequences
2177 starting with the same byte and add them one after the other to
2178 the table. In case we have more than one sequence starting with
2179 the same byte we have to use extra indirection.
2181 First add a record for the NUL byte. This entry will never be used
2182 so it does not matter. */
2183 tablemb[0] = 0;
2185 /* Now insert the `UNDEFINED' value if it is used. Since this value
2186 will probably be used more than once it is good to store the
2187 weights only once. */
2188 if (collate->undefined.used_in_level != 0)
2189 output_weight (&weightpool, collate, &collate->undefined);
2191 for (ch = 1; ch < 256; ++ch)
2192 if (collate->mbheads[ch]->mbnext == NULL
2193 && collate->mbheads[ch]->nmbs <= 1)
2195 tablemb[ch] = output_weight (&weightpool, collate,
2196 collate->mbheads[ch]);
2198 else
2200 /* The entries in the list are sorted by length and then
2201 alphabetically. This is the order in which we will add the
2202 elements to the collation table. This allows simply walking
2203 the table in sequence and stopping at the first matching
2204 entry. Since the longer sequences are coming first in the
2205 list they have the possibility to match first, just as it
2206 has to be. In the worst case we are walking to the end of
2207 the list where we put, if no singlebyte sequence is defined
2208 in the locale definition, the weights for UNDEFINED.
2210 To reduce the length of the search list we compress them a bit.
2211 This happens by collecting sequences of consecutive byte
2212 sequences in one entry (having a begin and an end byte sequence)
2213 and add only one index into the weight table. We can find the
2214 consecutive entries since they are also consecutive in the list. */
2215 struct element_t *runp = collate->mbheads[ch];
2216 struct element_t *lastp;
2218 assert ((obstack_object_size (&extrapool)
2219 & (__alignof__ (int32_t) - 1)) == 0);
2221 tablemb[ch] = -obstack_object_size (&extrapool);
2225 /* Store the current index in the weight table. We know that
2226 the current position in the `extrapool' is aligned on a
2227 32-bit address. */
2228 int32_t weightidx;
2229 int added;
2231 /* Find out whether this is a single entry or we have more than
2232 one consecutive entry. */
2233 if (runp->mbnext != NULL
2234 && runp->nmbs == runp->mbnext->nmbs
2235 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2236 && (runp->mbs[runp->nmbs - 1]
2237 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2239 int i;
2240 struct element_t *series_startp = runp;
2241 struct element_t *curp;
2243 /* Compute how much space we will need. */
2244 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2245 + __alignof__ (int32_t) - 1)
2246 & ~(__alignof__ (int32_t) - 1));
2247 assert ((obstack_object_size (&extrapool)
2248 & (__alignof__ (int32_t) - 1)) == 0);
2249 obstack_make_room (&extrapool, added);
2251 /* More than one consecutive entry. We mark this by having
2252 a negative index into the indirect table. */
2253 obstack_int32_grow_fast (&extrapool,
2254 -(obstack_object_size (&indirectpool)
2255 / sizeof (int32_t)));
2257 /* Now search first the end of the series. */
2259 runp = runp->mbnext;
2260 while (runp->mbnext != NULL
2261 && runp->nmbs == runp->mbnext->nmbs
2262 && memcmp (runp->mbs, runp->mbnext->mbs,
2263 runp->nmbs - 1) == 0
2264 && (runp->mbs[runp->nmbs - 1]
2265 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2267 /* Now walk backward from here to the beginning. */
2268 curp = runp;
2270 assert (runp->nmbs <= 256);
2271 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2272 for (i = 1; i < curp->nmbs; ++i)
2273 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2275 /* Now find the end of the consecutive sequence and
2276 add all the indices in the indirect pool. */
2279 weightidx = output_weight (&weightpool, collate, curp);
2280 obstack_int32_grow (&indirectpool, weightidx);
2282 curp = curp->mblast;
2284 while (curp != series_startp);
2286 /* Add the final weight. */
2287 weightidx = output_weight (&weightpool, collate, curp);
2288 obstack_int32_grow (&indirectpool, weightidx);
2290 /* And add the end byte sequence. Without length this
2291 time. */
2292 for (i = 1; i < curp->nmbs; ++i)
2293 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2295 else
2297 /* A single entry. Simply add the index and the length and
2298 string (except for the first character which is already
2299 tested for). */
2300 int i;
2302 /* Output the weight info. */
2303 weightidx = output_weight (&weightpool, collate, runp);
2305 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2306 + __alignof__ (int32_t) - 1)
2307 & ~(__alignof__ (int32_t) - 1));
2308 assert ((obstack_object_size (&extrapool)
2309 & (__alignof__ (int32_t) - 1)) == 0);
2310 obstack_make_room (&extrapool, added);
2312 obstack_int32_grow_fast (&extrapool, weightidx);
2313 assert (runp->nmbs <= 256);
2314 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2316 for (i = 1; i < runp->nmbs; ++i)
2317 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2320 /* Add alignment bytes if necessary. */
2321 while ((obstack_object_size (&extrapool)
2322 & (__alignof__ (int32_t) - 1)) != 0)
2323 obstack_1grow_fast (&extrapool, '\0');
2325 /* Next entry. */
2326 lastp = runp;
2327 runp = runp->mbnext;
2329 while (runp != NULL);
2331 assert ((obstack_object_size (&extrapool)
2332 & (__alignof__ (int32_t) - 1)) == 0);
2334 /* If the final entry in the list is not a single character we
2335 add an UNDEFINED entry here. */
2336 if (lastp->nmbs != 1)
2338 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2339 & ~(__alignof__ (int32_t) - 1));
2340 obstack_make_room (&extrapool, added);
2342 obstack_int32_grow_fast (&extrapool, 0);
2343 /* XXX What rule? We just pick the first. */
2344 obstack_1grow_fast (&extrapool, 0);
2345 /* Length is zero. */
2346 obstack_1grow_fast (&extrapool, 0);
2348 /* Add alignment bytes if necessary. */
2349 while ((obstack_object_size (&extrapool)
2350 & (__alignof__ (int32_t) - 1)) != 0)
2351 obstack_1grow_fast (&extrapool, '\0');
2355 /* Add padding to the tables if necessary. */
2356 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2357 != 0)
2358 obstack_1grow (&weightpool, 0);
2360 /* Now add the four tables. */
2361 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2362 iov[2 + cnt].iov_base = tablemb;
2363 iov[2 + cnt].iov_len = sizeof (tablemb);
2364 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2365 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2366 ++cnt;
2368 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2369 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2370 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2371 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2372 ++cnt;
2374 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2375 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2376 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2377 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2378 ++cnt;
2380 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2381 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2382 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2383 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2384 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2385 ++cnt;
2388 /* Now the same for the wide character table. We need to store some
2389 more information here. */
2390 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2391 iov[2 + cnt].iov_base = NULL;
2392 iov[2 + cnt].iov_len = 0;
2393 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2394 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2395 ++cnt;
2397 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2398 iov[2 + cnt].iov_base = NULL;
2399 iov[2 + cnt].iov_len = 0;
2400 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2401 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2402 ++cnt;
2404 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2405 iov[2 + cnt].iov_base = NULL;
2406 iov[2 + cnt].iov_len = 0;
2407 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2408 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2409 ++cnt;
2411 /* Since we are using the sign of an integer to mark indirection the
2412 offsets in the arrays we are indirectly referring to must not be
2413 zero since -0 == 0. Therefore we add a bit of dummy content. */
2414 obstack_int32_grow (&extrapool, 0);
2415 obstack_int32_grow (&indirectpool, 0);
2417 /* Now insert the `UNDEFINED' value if it is used. Since this value
2418 will probably be used more than once it is good to store the
2419 weights only once. */
2420 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2421 abort ();
2423 /* Generate the table. Walk through the lists of sequences starting
2424 with the same wide character and add them one after the other to
2425 the table. In case we have more than one sequence starting with
2426 the same byte we have to use extra indirection. */
2427 tablewc.p = 6;
2428 tablewc.q = 10;
2429 collidx_table_init (&tablewc);
2431 atwc.weightpool = &weightpool;
2432 atwc.extrapool = &extrapool;
2433 atwc.indpool = &indirectpool;
2434 atwc.collate = collate;
2435 atwc.tablewc = &tablewc;
2437 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2439 memset (&atwc, 0, sizeof (atwc));
2441 collidx_table_finalize (&tablewc);
2443 /* Now add the four tables. */
2444 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2445 iov[2 + cnt].iov_base = tablewc.result;
2446 iov[2 + cnt].iov_len = tablewc.result_size;
2447 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2448 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2449 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2450 ++cnt;
2452 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2453 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2454 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2455 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2456 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2457 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2458 ++cnt;
2460 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2461 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2462 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2463 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2464 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2465 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2466 ++cnt;
2468 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2469 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2470 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2471 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2472 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2473 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2474 ++cnt;
2477 /* Finally write the table with collation element names out. It is
2478 a hash table with a simple function which gets the name of the
2479 character as the input. One character might have many names. The
2480 value associated with the name is an index into the weight table
2481 where we are then interested in the first-level weight value.
2483 To determine how large the table should be we are counting the
2484 elements we have to put in. Since we are using internal chaining
2485 using a secondary hash function we have to make the table a bit
2486 larger to avoid extremely long search times. We can achieve
2487 good results with a 40% larger table than there are entries. */
2488 elem_size = 0;
2489 runp = collate->start;
2490 while (runp != NULL)
2492 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2493 /* Yep, the element really counts. */
2494 ++elem_size;
2496 runp = runp->next;
2498 /* Add 40% and find the next prime number. */
2499 elem_size = next_prime (elem_size * 1.4);
2501 /* Allocate the table. Each entry consists of two words: the hash
2502 value and an index in a secondary table which provides the index
2503 into the weight table and the string itself (so that a match can
2504 be determined). */
2505 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2506 elem_size * 2 * sizeof (uint32_t));
2507 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2509 /* Now add the elements. */
2510 runp = collate->start;
2511 while (runp != NULL)
2513 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2515 /* Compute the hash value of the name. */
2516 uint32_t namelen = strlen (runp->name);
2517 uint32_t hash = elem_hash (runp->name, namelen);
2518 size_t idx = hash % elem_size;
2519 size_t start_idx = idx;
2521 if (elem_table[idx * 2] != 0)
2523 /* The spot is already taken. Try iterating using the value
2524 from the secondary hashing function. */
2525 size_t iter = hash % (elem_size - 2) + 1;
2529 idx += iter;
2530 if (idx >= elem_size)
2531 idx -= elem_size;
2532 assert (idx != start_idx);
2534 while (elem_table[idx * 2] != 0);
2536 /* This is the spot where we will insert the value. */
2537 elem_table[idx * 2] = hash;
2538 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2540 /* Then the string itself including length. */
2541 obstack_1grow (&extrapool, namelen);
2542 obstack_grow (&extrapool, runp->name, namelen);
2544 /* And the multibyte representation. */
2545 obstack_1grow (&extrapool, runp->nmbs);
2546 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2548 /* And align again to 32 bits. */
2549 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2550 obstack_grow (&extrapool, "\0\0",
2551 (sizeof (int32_t)
2552 - ((1 + namelen + 1 + runp->nmbs)
2553 % sizeof (int32_t))));
2555 /* Now some 32-bit values: multibyte collation sequence,
2556 wide char string (including length), and wide char
2557 collation sequence. */
2558 obstack_int32_grow (&extrapool, runp->mbseqorder);
2560 obstack_int32_grow (&extrapool, runp->nwcs);
2561 obstack_grow (&extrapool, runp->wcs,
2562 runp->nwcs * sizeof (uint32_t));
2564 obstack_int32_grow (&extrapool, runp->wcseqorder);
2567 runp = runp->next;
2570 /* Prepare to write out this data. */
2571 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2572 iov[2 + cnt].iov_base = &elem_size;
2573 iov[2 + cnt].iov_len = sizeof (int32_t);
2574 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2575 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2576 ++cnt;
2578 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2579 iov[2 + cnt].iov_base = elem_table;
2580 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2581 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2582 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2583 ++cnt;
2585 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2586 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2587 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2588 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2589 ++cnt;
2591 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2592 iov[2 + cnt].iov_base = collate->mbseqorder;
2593 iov[2 + cnt].iov_len = 256;
2594 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2595 ++cnt;
2597 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2598 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2599 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2600 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2601 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2602 ++cnt;
2604 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2605 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2606 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2607 ++cnt;
2609 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2611 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2613 obstack_free (&weightpool, NULL);
2614 obstack_free (&extrapool, NULL);
2615 obstack_free (&indirectpool, NULL);
2619 void
2620 collate_read (struct linereader *ldfile, struct localedef_t *result,
2621 const struct charmap_t *charmap, const char *repertoire_name,
2622 int ignore_content)
2624 struct repertoire_t *repertoire = NULL;
2625 struct locale_collate_t *collate;
2626 struct token *now;
2627 struct token *arg = NULL;
2628 enum token_t nowtok;
2629 enum token_t was_ellipsis = tok_none;
2630 struct localedef_t *copy_locale = NULL;
2631 /* Parsing state:
2632 0 - start
2633 1 - between `order-start' and `order-end'
2634 2 - after `order-end'
2635 3 - after `reorder-after', waiting for `reorder-end'
2636 4 - after `reorder-end'
2637 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2638 6 - after `reorder-sections-end'
2640 int state = 0;
2642 /* Get the repertoire we have to use. */
2643 if (repertoire_name != NULL)
2644 repertoire = repertoire_read (repertoire_name);
2646 /* The rest of the line containing `LC_COLLATE' must be free. */
2647 lr_ignore_rest (ldfile, 1);
2651 now = lr_token (ldfile, charmap, result, NULL, verbose);
2652 nowtok = now->tok;
2654 while (nowtok == tok_eol);
2656 if (nowtok == tok_copy)
2658 state = 2;
2659 now = lr_token (ldfile, charmap, result, NULL, verbose);
2660 if (now->tok != tok_string)
2662 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2664 skip_category:
2666 now = lr_token (ldfile, charmap, result, NULL, verbose);
2667 while (now->tok != tok_eof && now->tok != tok_end);
2669 if (now->tok != tok_eof
2670 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2671 now->tok == tok_eof))
2672 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2673 else if (now->tok != tok_lc_collate)
2675 lr_error (ldfile, _("\
2676 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2677 lr_ignore_rest (ldfile, 0);
2679 else
2680 lr_ignore_rest (ldfile, 1);
2682 return;
2685 if (! ignore_content)
2687 /* Get the locale definition. */
2688 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2689 repertoire_name, charmap, NULL);
2690 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2692 /* Not yet loaded. So do it now. */
2693 if (locfile_read (copy_locale, charmap) != 0)
2694 goto skip_category;
2697 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2698 return;
2701 lr_ignore_rest (ldfile, 1);
2703 now = lr_token (ldfile, charmap, result, NULL, verbose);
2704 nowtok = now->tok;
2707 /* Prepare the data structures. */
2708 collate_startup (ldfile, result, copy_locale, ignore_content);
2709 collate = result->categories[LC_COLLATE].collate;
2711 while (1)
2713 char ucs4buf[10];
2714 char *symstr;
2715 size_t symlen;
2717 /* Of course we don't proceed beyond the end of file. */
2718 if (nowtok == tok_eof)
2719 break;
2721 /* Ignore empty lines. */
2722 if (nowtok == tok_eol)
2724 now = lr_token (ldfile, charmap, result, NULL, verbose);
2725 nowtok = now->tok;
2726 continue;
2729 switch (nowtok)
2731 case tok_copy:
2732 /* Allow copying other locales. */
2733 now = lr_token (ldfile, charmap, result, NULL, verbose);
2734 if (now->tok != tok_string)
2735 goto err_label;
2737 if (! ignore_content)
2738 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2739 charmap, result);
2741 lr_ignore_rest (ldfile, 1);
2742 break;
2744 case tok_coll_weight_max:
2745 /* Ignore the rest of the line if we don't need the input of
2746 this line. */
2747 if (ignore_content)
2749 lr_ignore_rest (ldfile, 0);
2750 break;
2753 if (state != 0)
2754 goto err_label;
2756 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2757 if (arg->tok != tok_number)
2758 goto err_label;
2759 if (collate->col_weight_max != -1)
2760 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2761 "LC_COLLATE", "col_weight_max");
2762 else
2763 collate->col_weight_max = arg->val.num;
2764 lr_ignore_rest (ldfile, 1);
2765 break;
2767 case tok_section_symbol:
2768 /* Ignore the rest of the line if we don't need the input of
2769 this line. */
2770 if (ignore_content)
2772 lr_ignore_rest (ldfile, 0);
2773 break;
2776 if (state != 0)
2777 goto err_label;
2779 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2780 if (arg->tok != tok_bsymbol)
2781 goto err_label;
2782 else if (!ignore_content)
2784 /* Check whether this section is already known. */
2785 struct section_list *known = collate->sections;
2786 while (known != NULL)
2788 if (strcmp (known->name, arg->val.str.startmb) == 0)
2789 break;
2790 known = known->next;
2793 if (known != NULL)
2795 lr_error (ldfile,
2796 _("%s: duplicate declaration of section `%s'"),
2797 "LC_COLLATE", arg->val.str.startmb);
2798 free (arg->val.str.startmb);
2800 else
2801 collate->sections = make_seclist_elem (collate,
2802 arg->val.str.startmb,
2803 collate->sections);
2805 lr_ignore_rest (ldfile, known == NULL);
2807 else
2809 free (arg->val.str.startmb);
2810 lr_ignore_rest (ldfile, 0);
2812 break;
2814 case tok_collating_element:
2815 /* Ignore the rest of the line if we don't need the input of
2816 this line. */
2817 if (ignore_content)
2819 lr_ignore_rest (ldfile, 0);
2820 break;
2823 if (state != 0 && state != 2)
2824 goto err_label;
2826 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2827 if (arg->tok != tok_bsymbol)
2828 goto err_label;
2829 else
2831 const char *symbol = arg->val.str.startmb;
2832 size_t symbol_len = arg->val.str.lenmb;
2834 /* Next the `from' keyword. */
2835 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2836 if (arg->tok != tok_from)
2838 free ((char *) symbol);
2839 goto err_label;
2842 ldfile->return_widestr = 1;
2843 ldfile->translate_strings = 1;
2845 /* Finally the string with the replacement. */
2846 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2848 ldfile->return_widestr = 0;
2849 ldfile->translate_strings = 0;
2851 if (arg->tok != tok_string)
2852 goto err_label;
2854 if (!ignore_content && symbol != NULL)
2856 /* The name is already defined. */
2857 if (check_duplicate (ldfile, collate, charmap,
2858 repertoire, symbol, symbol_len))
2859 goto col_elem_free;
2861 if (arg->val.str.startmb != NULL)
2862 insert_entry (&collate->elem_table, symbol, symbol_len,
2863 new_element (collate,
2864 arg->val.str.startmb,
2865 arg->val.str.lenmb - 1,
2866 arg->val.str.startwc,
2867 symbol, symbol_len, 0));
2869 else
2871 col_elem_free:
2872 if (symbol != NULL)
2873 free ((char *) symbol);
2874 if (arg->val.str.startmb != NULL)
2875 free (arg->val.str.startmb);
2876 if (arg->val.str.startwc != NULL)
2877 free (arg->val.str.startwc);
2879 lr_ignore_rest (ldfile, 1);
2881 break;
2883 case tok_collating_symbol:
2884 /* Ignore the rest of the line if we don't need the input of
2885 this line. */
2886 if (ignore_content)
2888 lr_ignore_rest (ldfile, 0);
2889 break;
2892 if (state != 0 && state != 2)
2893 goto err_label;
2895 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2896 if (arg->tok != tok_bsymbol)
2897 goto err_label;
2898 else
2900 char *symbol = arg->val.str.startmb;
2901 size_t symbol_len = arg->val.str.lenmb;
2902 char *endsymbol = NULL;
2903 size_t endsymbol_len = 0;
2904 enum token_t ellipsis = tok_none;
2906 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2907 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2909 ellipsis = arg->tok;
2911 arg = lr_token (ldfile, charmap, result, repertoire,
2912 verbose);
2913 if (arg->tok != tok_bsymbol)
2915 free (symbol);
2916 goto err_label;
2919 endsymbol = arg->val.str.startmb;
2920 endsymbol_len = arg->val.str.lenmb;
2922 lr_ignore_rest (ldfile, 1);
2924 else if (arg->tok != tok_eol)
2926 free (symbol);
2927 goto err_label;
2930 if (!ignore_content)
2932 if (symbol == NULL
2933 || (ellipsis != tok_none && endsymbol == NULL))
2935 lr_error (ldfile, _("\
2936 %s: unknown character in collating symbol name"),
2937 "LC_COLLATE");
2938 goto col_sym_free;
2940 else if (ellipsis == tok_none)
2942 /* A single symbol, no ellipsis. */
2943 if (check_duplicate (ldfile, collate, charmap,
2944 repertoire, symbol, symbol_len))
2945 /* The name is already defined. */
2946 goto col_sym_free;
2948 insert_entry (&collate->sym_table, symbol, symbol_len,
2949 new_symbol (collate, symbol, symbol_len));
2951 else if (symbol_len != endsymbol_len)
2953 col_sym_inv_range:
2954 lr_error (ldfile,
2955 _("invalid names for character range"));
2956 goto col_sym_free;
2958 else
2960 /* Oh my, we have to handle an ellipsis. First, as
2961 usual, determine the common prefix and then
2962 convert the rest into a range. */
2963 size_t prefixlen;
2964 unsigned long int from;
2965 unsigned long int to;
2966 char *endp;
2968 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2969 if (symbol[prefixlen] != endsymbol[prefixlen])
2970 break;
2972 /* Convert the rest into numbers. */
2973 symbol[symbol_len] = '\0';
2974 from = strtoul (&symbol[prefixlen], &endp,
2975 ellipsis == tok_ellipsis2 ? 16 : 10);
2976 if (*endp != '\0')
2977 goto col_sym_inv_range;
2979 endsymbol[symbol_len] = '\0';
2980 to = strtoul (&endsymbol[prefixlen], &endp,
2981 ellipsis == tok_ellipsis2 ? 16 : 10);
2982 if (*endp != '\0')
2983 goto col_sym_inv_range;
2985 if (from > to)
2986 goto col_sym_inv_range;
2988 /* Now loop over all entries. */
2989 while (from <= to)
2991 char *symbuf;
2993 symbuf = (char *) obstack_alloc (&collate->mempool,
2994 symbol_len + 1);
2996 /* Create the name. */
2997 sprintf (symbuf,
2998 ellipsis == tok_ellipsis2
2999 ? "%.*s%.*lX" : "%.*s%.*lu",
3000 (int) prefixlen, symbol,
3001 (int) (symbol_len - prefixlen), from);
3003 if (check_duplicate (ldfile, collate, charmap,
3004 repertoire, symbuf, symbol_len))
3005 /* The name is already defined. */
3006 goto col_sym_free;
3008 insert_entry (&collate->sym_table, symbuf,
3009 symbol_len,
3010 new_symbol (collate, symbuf,
3011 symbol_len));
3013 /* Increment the counter. */
3014 ++from;
3017 goto col_sym_free;
3020 else
3022 col_sym_free:
3023 if (symbol != NULL)
3024 free (symbol);
3025 if (endsymbol != NULL)
3026 free (endsymbol);
3029 break;
3031 case tok_symbol_equivalence:
3032 /* Ignore the rest of the line if we don't need the input of
3033 this line. */
3034 if (ignore_content)
3036 lr_ignore_rest (ldfile, 0);
3037 break;
3040 if (state != 0)
3041 goto err_label;
3043 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3044 if (arg->tok != tok_bsymbol)
3045 goto err_label;
3046 else
3048 const char *newname = arg->val.str.startmb;
3049 size_t newname_len = arg->val.str.lenmb;
3050 const char *symname;
3051 size_t symname_len;
3052 void *symval; /* Actually struct symbol_t* */
3054 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3055 if (arg->tok != tok_bsymbol)
3057 if (newname != NULL)
3058 free ((char *) newname);
3059 goto err_label;
3062 symname = arg->val.str.startmb;
3063 symname_len = arg->val.str.lenmb;
3065 if (newname == NULL)
3067 lr_error (ldfile, _("\
3068 %s: unknown character in equivalent definition name"),
3069 "LC_COLLATE");
3071 sym_equiv_free:
3072 if (newname != NULL)
3073 free ((char *) newname);
3074 if (symname != NULL)
3075 free ((char *) symname);
3076 break;
3078 if (symname == NULL)
3080 lr_error (ldfile, _("\
3081 %s: unknown character in equivalent definition value"),
3082 "LC_COLLATE");
3083 goto sym_equiv_free;
3086 /* See whether the symbol name is already defined. */
3087 if (find_entry (&collate->sym_table, symname, symname_len,
3088 &symval) != 0)
3090 lr_error (ldfile, _("\
3091 %s: unknown symbol `%s' in equivalent definition"),
3092 "LC_COLLATE", symname);
3093 goto sym_equiv_free;
3096 if (insert_entry (&collate->sym_table,
3097 newname, newname_len, symval) < 0)
3099 lr_error (ldfile, _("\
3100 error while adding equivalent collating symbol"));
3101 goto sym_equiv_free;
3104 free ((char *) symname);
3106 lr_ignore_rest (ldfile, 1);
3107 break;
3109 case tok_script:
3110 /* We get told about the scripts we know. */
3111 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3112 if (arg->tok != tok_bsymbol)
3113 goto err_label;
3114 else
3116 struct section_list *runp = collate->known_sections;
3117 char *name;
3119 while (runp != NULL)
3120 if (strncmp (runp->name, arg->val.str.startmb,
3121 arg->val.str.lenmb) == 0
3122 && runp->name[arg->val.str.lenmb] == '\0')
3123 break;
3124 else
3125 runp = runp->def_next;
3127 if (runp != NULL)
3129 lr_error (ldfile, _("duplicate definition of script `%s'"),
3130 runp->name);
3131 lr_ignore_rest (ldfile, 0);
3132 break;
3135 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3136 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3137 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3138 name[arg->val.str.lenmb] = '\0';
3139 runp->name = name;
3141 runp->def_next = collate->known_sections;
3142 collate->known_sections = runp;
3144 lr_ignore_rest (ldfile, 1);
3145 break;
3147 case tok_order_start:
3148 /* Ignore the rest of the line if we don't need the input of
3149 this line. */
3150 if (ignore_content)
3152 lr_ignore_rest (ldfile, 0);
3153 break;
3156 if (state != 0 && state != 1 && state != 2)
3157 goto err_label;
3158 state = 1;
3160 /* The 14652 draft does not specify whether all `order_start' lines
3161 must contain the same number of sort-rules, but 14651 does. So
3162 we require this here as well. */
3163 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3164 if (arg->tok == tok_bsymbol)
3166 /* This better should be a section name. */
3167 struct section_list *sp = collate->known_sections;
3168 while (sp != NULL
3169 && (sp->name == NULL
3170 || strncmp (sp->name, arg->val.str.startmb,
3171 arg->val.str.lenmb) != 0
3172 || sp->name[arg->val.str.lenmb] != '\0'))
3173 sp = sp->def_next;
3175 if (sp == NULL)
3177 lr_error (ldfile, _("\
3178 %s: unknown section name `%.*s'"),
3179 "LC_COLLATE", (int) arg->val.str.lenmb,
3180 arg->val.str.startmb);
3181 /* We use the error section. */
3182 collate->current_section = &collate->error_section;
3184 if (collate->error_section.first == NULL)
3186 /* Insert &collate->error_section at the end of
3187 the collate->sections list. */
3188 if (collate->sections == NULL)
3189 collate->sections = &collate->error_section;
3190 else
3192 sp = collate->sections;
3193 while (sp->next != NULL)
3194 sp = sp->next;
3196 sp->next = &collate->error_section;
3198 collate->error_section.next = NULL;
3201 else
3203 /* One should not be allowed to open the same
3204 section twice. */
3205 if (sp->first != NULL)
3206 lr_error (ldfile, _("\
3207 %s: multiple order definitions for section `%s'"),
3208 "LC_COLLATE", sp->name);
3209 else
3211 /* Insert sp in the collate->sections list,
3212 right after collate->current_section. */
3213 if (collate->current_section == NULL)
3214 collate->current_section = sp;
3215 else
3217 sp->next = collate->current_section->next;
3218 collate->current_section->next = sp;
3222 /* Next should come the end of the line or a semicolon. */
3223 arg = lr_token (ldfile, charmap, result, repertoire,
3224 verbose);
3225 if (arg->tok == tok_eol)
3227 uint32_t cnt;
3229 /* This means we have exactly one rule: `forward'. */
3230 if (nrules > 1)
3231 lr_error (ldfile, _("\
3232 %s: invalid number of sorting rules"),
3233 "LC_COLLATE");
3234 else
3235 nrules = 1;
3236 sp->rules = obstack_alloc (&collate->mempool,
3237 (sizeof (enum coll_sort_rule)
3238 * nrules));
3239 for (cnt = 0; cnt < nrules; ++cnt)
3240 sp->rules[cnt] = sort_forward;
3242 /* Next line. */
3243 break;
3246 /* Get the next token. */
3247 arg = lr_token (ldfile, charmap, result, repertoire,
3248 verbose);
3251 else
3253 /* There is no section symbol. Therefore we use the unnamed
3254 section. */
3255 collate->current_section = &collate->unnamed_section;
3257 if (collate->unnamed_section.first != NULL)
3258 lr_error (ldfile, _("\
3259 %s: multiple order definitions for unnamed section"),
3260 "LC_COLLATE");
3261 else
3263 /* Insert &collate->unnamed_section at the beginning of
3264 the collate->sections list. */
3265 collate->unnamed_section.next = collate->sections;
3266 collate->sections = &collate->unnamed_section;
3270 /* Now read the direction names. */
3271 read_directions (ldfile, arg, charmap, repertoire, result);
3273 /* From now we need the strings untranslated. */
3274 ldfile->translate_strings = 0;
3275 break;
3277 case tok_order_end:
3278 /* Ignore the rest of the line if we don't need the input of
3279 this line. */
3280 if (ignore_content)
3282 lr_ignore_rest (ldfile, 0);
3283 break;
3286 if (state != 1)
3287 goto err_label;
3289 /* Handle ellipsis at end of list. */
3290 if (was_ellipsis != tok_none)
3292 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3293 repertoire, result);
3294 was_ellipsis = tok_none;
3297 state = 2;
3298 lr_ignore_rest (ldfile, 1);
3299 break;
3301 case tok_reorder_after:
3302 /* Ignore the rest of the line if we don't need the input of
3303 this line. */
3304 if (ignore_content)
3306 lr_ignore_rest (ldfile, 0);
3307 break;
3310 if (state == 1)
3312 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3313 "LC_COLLATE");
3314 state = 2;
3316 /* Handle ellipsis at end of list. */
3317 if (was_ellipsis != tok_none)
3319 handle_ellipsis (ldfile, arg->val.str.startmb,
3320 arg->val.str.lenmb, was_ellipsis, charmap,
3321 repertoire, result);
3322 was_ellipsis = tok_none;
3325 else if (state != 2 && state != 3)
3326 goto err_label;
3327 state = 3;
3329 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3330 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3332 /* Find this symbol in the sequence table. */
3333 char ucsbuf[10];
3334 char *startmb;
3335 size_t lenmb;
3336 struct element_t *insp;
3337 int no_error = 1;
3338 void *ptr;
3340 if (arg->tok == tok_bsymbol)
3342 startmb = arg->val.str.startmb;
3343 lenmb = arg->val.str.lenmb;
3345 else
3347 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3348 startmb = ucsbuf;
3349 lenmb = 9;
3352 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3353 /* Yes, the symbol exists. Simply point the cursor
3354 to it. */
3355 collate->cursor = (struct element_t *) ptr;
3356 else
3358 struct symbol_t *symbp;
3359 void *ptr;
3361 if (find_entry (&collate->sym_table, startmb, lenmb,
3362 &ptr) == 0)
3364 symbp = ptr;
3366 if (symbp->order->last != NULL
3367 || symbp->order->next != NULL)
3368 collate->cursor = symbp->order;
3369 else
3371 /* This is a collating symbol but its position
3372 is not yet defined. */
3373 lr_error (ldfile, _("\
3374 %s: order for collating symbol %.*s not yet defined"),
3375 "LC_COLLATE", (int) lenmb, startmb);
3376 collate->cursor = NULL;
3377 no_error = 0;
3380 else if (find_entry (&collate->elem_table, startmb, lenmb,
3381 &ptr) == 0)
3383 insp = (struct element_t *) ptr;
3385 if (insp->last != NULL || insp->next != NULL)
3386 collate->cursor = insp;
3387 else
3389 /* This is a collating element but its position
3390 is not yet defined. */
3391 lr_error (ldfile, _("\
3392 %s: order for collating element %.*s not yet defined"),
3393 "LC_COLLATE", (int) lenmb, startmb);
3394 collate->cursor = NULL;
3395 no_error = 0;
3398 else
3400 /* This is bad. The symbol after which we have to
3401 insert does not exist. */
3402 lr_error (ldfile, _("\
3403 %s: cannot reorder after %.*s: symbol not known"),
3404 "LC_COLLATE", (int) lenmb, startmb);
3405 collate->cursor = NULL;
3406 no_error = 0;
3410 lr_ignore_rest (ldfile, no_error);
3412 else
3413 /* This must not happen. */
3414 goto err_label;
3415 break;
3417 case tok_reorder_end:
3418 /* Ignore the rest of the line if we don't need the input of
3419 this line. */
3420 if (ignore_content)
3421 break;
3423 if (state != 3)
3424 goto err_label;
3425 state = 4;
3426 lr_ignore_rest (ldfile, 1);
3427 break;
3429 case tok_reorder_sections_after:
3430 /* Ignore the rest of the line if we don't need the input of
3431 this line. */
3432 if (ignore_content)
3434 lr_ignore_rest (ldfile, 0);
3435 break;
3438 if (state == 1)
3440 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3441 "LC_COLLATE");
3442 state = 2;
3444 /* Handle ellipsis at end of list. */
3445 if (was_ellipsis != tok_none)
3447 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3448 repertoire, result);
3449 was_ellipsis = tok_none;
3452 else if (state == 3)
3454 WITH_CUR_LOCALE (error (0, 0, _("\
3455 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3456 state = 4;
3458 else if (state != 2 && state != 4)
3459 goto err_label;
3460 state = 5;
3462 /* Get the name of the sections we are adding after. */
3463 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3464 if (arg->tok == tok_bsymbol)
3466 /* Now find a section with this name. */
3467 struct section_list *runp = collate->sections;
3469 while (runp != NULL)
3471 if (runp->name != NULL
3472 && strlen (runp->name) == arg->val.str.lenmb
3473 && memcmp (runp->name, arg->val.str.startmb,
3474 arg->val.str.lenmb) == 0)
3475 break;
3477 runp = runp->next;
3480 if (runp != NULL)
3481 collate->current_section = runp;
3482 else
3484 /* This is bad. The section after which we have to
3485 reorder does not exist. Therefore we cannot
3486 process the whole rest of this reorder
3487 specification. */
3488 lr_error (ldfile, _("%s: section `%.*s' not known"),
3489 "LC_COLLATE", (int) arg->val.str.lenmb,
3490 arg->val.str.startmb);
3494 lr_ignore_rest (ldfile, 0);
3496 now = lr_token (ldfile, charmap, result, NULL, verbose);
3498 while (now->tok == tok_reorder_sections_after
3499 || now->tok == tok_reorder_sections_end
3500 || now->tok == tok_end);
3502 /* Process the token we just saw. */
3503 nowtok = now->tok;
3504 continue;
3507 else
3508 /* This must not happen. */
3509 goto err_label;
3510 break;
3512 case tok_reorder_sections_end:
3513 /* Ignore the rest of the line if we don't need the input of
3514 this line. */
3515 if (ignore_content)
3516 break;
3518 if (state != 5)
3519 goto err_label;
3520 state = 6;
3521 lr_ignore_rest (ldfile, 1);
3522 break;
3524 case tok_bsymbol:
3525 case tok_ucs4:
3526 /* Ignore the rest of the line if we don't need the input of
3527 this line. */
3528 if (ignore_content)
3530 lr_ignore_rest (ldfile, 0);
3531 break;
3534 if (state != 0 && state != 1 && state != 3 && state != 5)
3535 goto err_label;
3537 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3538 goto err_label;
3540 if (nowtok == tok_ucs4)
3542 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3543 symstr = ucs4buf;
3544 symlen = 9;
3546 else if (arg != NULL)
3548 symstr = arg->val.str.startmb;
3549 symlen = arg->val.str.lenmb;
3551 else
3553 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3554 (int) ldfile->token.val.str.lenmb,
3555 ldfile->token.val.str.startmb);
3556 break;
3559 struct element_t *seqp;
3560 if (state == 0)
3562 /* We are outside an `order_start' region. This means
3563 we must only accept definitions of values for
3564 collation symbols since these are purely abstract
3565 values and don't need directions associated. */
3566 void *ptr;
3568 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3570 seqp = ptr;
3572 /* It's already defined. First check whether this
3573 is really a collating symbol. */
3574 if (seqp->is_character)
3575 goto err_label;
3577 goto move_entry;
3579 else
3581 void *result;
3583 if (find_entry (&collate->sym_table, symstr, symlen,
3584 &result) != 0)
3585 /* No collating symbol, it's an error. */
3586 goto err_label;
3588 /* Maybe this is the first time we define a symbol
3589 value and it is before the first actual section. */
3590 if (collate->sections == NULL)
3591 collate->sections = collate->current_section =
3592 &collate->symbol_section;
3595 if (was_ellipsis != tok_none)
3597 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3598 charmap, repertoire, result);
3600 /* Remember that we processed the ellipsis. */
3601 was_ellipsis = tok_none;
3603 /* And don't add the value a second time. */
3604 break;
3607 else if (state == 3)
3609 /* It is possible that we already have this collation sequence.
3610 In this case we move the entry. */
3611 void *sym;
3612 void *ptr;
3614 /* If the symbol after which we have to insert was not found
3615 ignore all entries. */
3616 if (collate->cursor == NULL)
3618 lr_ignore_rest (ldfile, 0);
3619 break;
3622 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3624 seqp = (struct element_t *) ptr;
3625 goto move_entry;
3628 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3629 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3630 goto move_entry;
3632 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3633 && (seqp = (struct element_t *) ptr,
3634 seqp->last != NULL || seqp->next != NULL
3635 || (collate->start != NULL && seqp == collate->start)))
3637 move_entry:
3638 /* Remove the entry from the old position. */
3639 if (seqp->last == NULL)
3640 collate->start = seqp->next;
3641 else
3642 seqp->last->next = seqp->next;
3643 if (seqp->next != NULL)
3644 seqp->next->last = seqp->last;
3646 /* We also have to check whether this entry is the
3647 first or last of a section. */
3648 if (seqp->section->first == seqp)
3650 if (seqp->section->first == seqp->section->last)
3651 /* This section has no content anymore. */
3652 seqp->section->first = seqp->section->last = NULL;
3653 else
3654 seqp->section->first = seqp->next;
3656 else if (seqp->section->last == seqp)
3657 seqp->section->last = seqp->last;
3659 /* Now insert it in the new place. */
3660 insert_weights (ldfile, seqp, charmap, repertoire, result,
3661 tok_none);
3662 break;
3665 /* Otherwise we just add a new entry. */
3667 else if (state == 5)
3669 /* We are reordering sections. Find the named section. */
3670 struct section_list *runp = collate->sections;
3671 struct section_list *prevp = NULL;
3673 while (runp != NULL)
3675 if (runp->name != NULL
3676 && strlen (runp->name) == symlen
3677 && memcmp (runp->name, symstr, symlen) == 0)
3678 break;
3680 prevp = runp;
3681 runp = runp->next;
3684 if (runp == NULL)
3686 lr_error (ldfile, _("%s: section `%.*s' not known"),
3687 "LC_COLLATE", (int) symlen, symstr);
3688 lr_ignore_rest (ldfile, 0);
3690 else
3692 if (runp != collate->current_section)
3694 /* Remove the named section from the old place and
3695 insert it in the new one. */
3696 prevp->next = runp->next;
3698 runp->next = collate->current_section->next;
3699 collate->current_section->next = runp;
3700 collate->current_section = runp;
3703 /* Process the rest of the line which might change
3704 the collation rules. */
3705 arg = lr_token (ldfile, charmap, result, repertoire,
3706 verbose);
3707 if (arg->tok != tok_eof && arg->tok != tok_eol)
3708 read_directions (ldfile, arg, charmap, repertoire,
3709 result);
3711 break;
3713 else if (was_ellipsis != tok_none)
3715 /* Using the information in the `ellipsis_weight'
3716 element and this and the last value we have to handle
3717 the ellipsis now. */
3718 assert (state == 1);
3720 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3721 repertoire, result);
3723 /* Remember that we processed the ellipsis. */
3724 was_ellipsis = tok_none;
3726 /* And don't add the value a second time. */
3727 break;
3730 /* Now insert in the new place. */
3731 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3732 break;
3734 case tok_undefined:
3735 /* Ignore the rest of the line if we don't need the input of
3736 this line. */
3737 if (ignore_content)
3739 lr_ignore_rest (ldfile, 0);
3740 break;
3743 if (state != 1)
3744 goto err_label;
3746 if (was_ellipsis != tok_none)
3748 lr_error (ldfile,
3749 _("%s: cannot have `%s' as end of ellipsis range"),
3750 "LC_COLLATE", "UNDEFINED");
3752 unlink_element (collate);
3753 was_ellipsis = tok_none;
3756 /* See whether UNDEFINED already appeared somewhere. */
3757 if (collate->undefined.next != NULL
3758 || &collate->undefined == collate->cursor)
3760 lr_error (ldfile,
3761 _("%s: order for `%.*s' already defined at %s:%Zu"),
3762 "LC_COLLATE", 9, "UNDEFINED",
3763 collate->undefined.file,
3764 collate->undefined.line);
3765 lr_ignore_rest (ldfile, 0);
3767 else
3768 /* Parse the weights. */
3769 insert_weights (ldfile, &collate->undefined, charmap,
3770 repertoire, result, tok_none);
3771 break;
3773 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3774 case tok_ellipsis3: /* absolute ellipsis */
3775 case tok_ellipsis4: /* symbolic decimal ellipsis */
3776 /* This is the symbolic (decimal or hexadecimal) or absolute
3777 ellipsis. */
3778 if (was_ellipsis != tok_none)
3779 goto err_label;
3781 if (state != 0 && state != 1 && state != 3)
3782 goto err_label;
3784 was_ellipsis = nowtok;
3786 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3787 repertoire, result, nowtok);
3788 break;
3790 case tok_end:
3791 /* Next we assume `LC_COLLATE'. */
3792 if (!ignore_content)
3794 if (state == 0)
3795 /* We must either see a copy statement or have
3796 ordering values. */
3797 lr_error (ldfile,
3798 _("%s: empty category description not allowed"),
3799 "LC_COLLATE");
3800 else if (state == 1)
3802 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3803 "LC_COLLATE");
3805 /* Handle ellipsis at end of list. */
3806 if (was_ellipsis != tok_none)
3808 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3809 repertoire, result);
3810 was_ellipsis = tok_none;
3813 else if (state == 3)
3814 WITH_CUR_LOCALE (error (0, 0, _("\
3815 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3816 else if (state == 5)
3817 WITH_CUR_LOCALE (error (0, 0, _("\
3818 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3820 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3821 if (arg->tok == tok_eof)
3822 break;
3823 if (arg->tok == tok_eol)
3824 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3825 else if (arg->tok != tok_lc_collate)
3826 lr_error (ldfile, _("\
3827 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3828 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3829 return;
3831 default:
3832 err_label:
3833 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3836 /* Prepare for the next round. */
3837 now = lr_token (ldfile, charmap, result, NULL, verbose);
3838 nowtok = now->tok;
3841 /* When we come here we reached the end of the file. */
3842 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");