* pthread_getattr_np.c (pthread_getattr_np): Clear cpuset and
[glibc.git] / locale / programs / ld-collate.c
blob56f3180b10a5631cf8f6fc742de657efdb2edfc9
/* Copyright (C) 1995-2003, 2005, 2006, 2007 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
22 #include <errno.h>
23 #include <error.h>
24 #include <stdlib.h>
25 #include <wchar.h>
26 #include <sys/param.h>
28 #include "localedef.h"
29 #include "charmap.h"
30 #include "localeinfo.h"
31 #include "linereader.h"
32 #include "locfile.h"
33 #include "elem-hash.h"
35 /* Uncomment the following line in the production version. */
36 /* #define NDEBUG 1 */
37 #include <assert.h>
/* Obstack chunks are obtained from and returned to the standard
   allocator.  */
#define obstack_chunk_free free
#define obstack_chunk_alloc malloc
/* Append the 32-bit value DATA to the object currently growing in
   OBSTACK.  When `int' has exactly 32 bits the dedicated integer
   primitive is used, otherwise the bytes of DATA are appended.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow (struct obstack *obstack, int32_t data)
{
  if (sizeof (int) != sizeof (int32_t))
    obstack_grow (obstack, &data, sizeof (int32_t));
  else
    obstack_int_grow (obstack, data);
}
/* Variant of obstack_int32_grow which uses the unchecked integer
   primitive when `int' has exactly 32 bits; the caller must then have
   made sure there is room.  For any other size of `int' the checked
   obstack_grow is used.  */
static inline void
__attribute ((always_inline))
obstack_int32_grow_fast (struct obstack *obstack, int32_t data)
{
  if (sizeof (int) != sizeof (int32_t))
    obstack_grow (obstack, &data, sizeof (int32_t));
  else
    obstack_int_grow_fast (obstack, data);
}
/* The element type is defined further down.  */
struct element_t;

/* Description of one section.  An entry can be chained into two
   independent lists (see `def_next' and `next').  */
struct section_list
{
  /* Link for the known_sections list.  */
  struct section_list *def_next;
  /* Link for the sections list.  */
  struct section_list *next;
  /* The section's name.  */
  const char *name;
  /* The section's first element.  */
  struct element_t *first;
  /* The section's last element.  */
  struct element_t *last;
  /* Sorting rules which apply to this section.  */
  enum coll_sort_rule *rules;
  /* Index of the rule set in the appropriate section of the output
     file.  */
  int ruleidx;
};
struct element_t;

/* A counted vector of element pointers.  */
struct element_list_t
{
  /* How many entries W has.  */
  int cnt;

  /* The entries themselves.  */
  struct element_t **w;
};
/* Everything known about one collating element.  */
struct element_t
{
  const char *name;

  /* Multibyte and wide character representation with their lengths,
     followed by the associated order values.  */
  const char *mbs;
  size_t nmbs;
  const uint32_t *wcs;
  size_t nwcs;
  int *mborder;
  int wcorder;

  /* Bit mask with one bit per level; a bit is set if this element is
     used in that level.  Interesting for the singlebyte weight
     computation.

     XXX Being an `unsigned int' this limits the number of levels
     to 32.  The type could be widened but this is hardly ever
     necessary.  */
  unsigned int used_in_level;

  /* The weights of this element.  */
  struct element_list_t *weights;

  /* Nonzero for a real character definition.  */
  int is_character;

  /* Position of the character in the sequence, needed for range
     expressions.  */
  int mbseqorder;
  int wcseqorder;

  /* Location of the definition.  */
  const char *file;
  size_t line;

  /* Section this element is a member of.  */
  struct section_list *section;

  /* Neighbours in the order list: predecessor, then successor.  */
  struct element_t *last;
  struct element_t *next;

  /* Links for the multibyte output list.  */
  struct element_t *mbnext;
  struct element_t *mblast;

  /* Links for the wide character output list.  */
  struct element_t *wcnext;
  struct element_t *wclast;
};
/* Reserved pointer values which are used in place of a real element.  */
#define ELEMENT_ELLIPSIS2 ((struct element_t *) 1)
#define ELEMENT_ELLIPSIS3 ((struct element_t *) 2)
#define ELEMENT_ELLIPSIS4 ((struct element_t *) 3)
/* Everything known about one collating symbol.  */
struct symbol_t
{
  const char *name;

  /* The symbol's place in the order list.  */
  struct element_t *order;

  /* Location of the definition.  */
  const char *file;
  size_t line;
};
162 /* Sparse table of struct element_t *. */
163 #define TABLE wchead_table
164 #define ELEMENT struct element_t *
165 #define DEFAULT NULL
166 #define ITERATE
167 #define NO_FINALIZE
168 #include "3level.h"
170 /* Sparse table of int32_t. */
171 #define TABLE collidx_table
172 #define ELEMENT int32_t
173 #define DEFAULT 0
174 #include "3level.h"
176 /* Sparse table of uint32_t. */
177 #define TABLE collseq_table
178 #define ELEMENT uint32_t
179 #define DEFAULT ~((uint32_t) 0)
180 #include "3level.h"
183 /* The real definition of the struct for the LC_COLLATE locale. */
184 struct locale_collate_t
186 int col_weight_max;
187 int cur_weight_max;
189 /* List of known scripts. */
190 struct section_list *known_sections;
191 /* List of used sections. */
192 struct section_list *sections;
193 /* Current section using definition. */
194 struct section_list *current_section;
195 /* There always can be an unnamed section. */
196 struct section_list unnamed_section;
197 /* To make handling of errors easier we have another section. */
198 struct section_list error_section;
199 /* Sometimes we are defining the values for collating symbols before
200 the first actual section. */
201 struct section_list symbol_section;
203 /* Start of the order list. */
204 struct element_t *start;
206 /* The undefined element. */
207 struct element_t undefined;
209 /* This is the cursor for `reorder_after' insertions. */
210 struct element_t *cursor;
212 /* This value is used when handling ellipsis. */
213 struct element_t ellipsis_weight;
215 /* Known collating elements. */
216 hash_table elem_table;
218 /* Known collating symbols. */
219 hash_table sym_table;
221 /* Known collation sequences. */
222 hash_table seq_table;
224 struct obstack mempool;
226 /* The LC_COLLATE category is a bit special as it is sometimes possible
227 that the definitions from more than one input file contains information.
228 Therefore we keep all relevant input in a list. */
229 struct locale_collate_t *next;
231 /* Arrays with heads of the list for each of the leading bytes in
232 the multibyte sequences. */
233 struct element_t *mbheads[256];
235 /* Arrays with heads of the list for each of the leading bytes in
236 the multibyte sequences. */
237 struct wchead_table wcheads;
239 /* The arrays with the collation sequence order. */
240 unsigned char mbseqorder[256];
241 struct collseq_table wcseqorder;
/* State shared by all LC_COLLATE descriptions read from all files:
   the number of sorting rules.  It stays zero until the first
   direction specification has been read.  */
static uint32_t nrules;
/* Store the UTF-8 encoding of VAL in BUF and return the number of
   bytes written (at most six).  No terminating NUL byte is added.  */
static inline int
__attribute ((always_inline))
utf8_encode (char *buf, int val)
{
  int nbytes;
  int idx;

  /* Values below 0x80 are represented by themselves.  */
  if (val < 0x80)
    {
      buf[0] = (char) val;
      return 1;
    }

  /* Find the shortest encoding which can hold all bits of VAL.  */
  nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t)0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* The lead byte starts with NBYTES one bits.  */
  buf[0] = (unsigned char) (~0xff >> nbytes);

  /* Fill in the continuation bytes, last one first.  */
  for (idx = nbytes - 1; idx > 0; --idx)
    {
      buf[idx] = 0x80 | (val & 0x3f);
      val >>= 6;
    }

  /* What is left of VAL goes into the lead byte.  */
  buf[0] |= val;

  return nbytes;
}
286 static struct section_list *
287 make_seclist_elem (struct locale_collate_t *collate, const char *string,
288 struct section_list *next)
290 struct section_list *newp;
292 newp = (struct section_list *) obstack_alloc (&collate->mempool,
293 sizeof (*newp));
294 newp->next = next;
295 newp->name = string;
296 newp->first = NULL;
297 newp->last = NULL;
299 return newp;
303 static struct element_t *
304 new_element (struct locale_collate_t *collate, const char *mbs, size_t mbslen,
305 const uint32_t *wcs, const char *name, size_t namelen,
306 int is_character)
308 struct element_t *newp;
310 newp = (struct element_t *) obstack_alloc (&collate->mempool,
311 sizeof (*newp));
312 newp->name = name == NULL ? NULL : obstack_copy0 (&collate->mempool,
313 name, namelen);
314 if (mbs != NULL)
316 newp->mbs = obstack_copy0 (&collate->mempool, mbs, mbslen);
317 newp->nmbs = mbslen;
319 else
321 newp->mbs = NULL;
322 newp->nmbs = 0;
324 if (wcs != NULL)
326 size_t nwcs = wcslen ((wchar_t *) wcs);
327 uint32_t zero = 0;
328 obstack_grow (&collate->mempool, wcs, nwcs * sizeof (uint32_t));
329 obstack_grow (&collate->mempool, &zero, sizeof (uint32_t));
330 newp->wcs = (uint32_t *) obstack_finish (&collate->mempool);
331 newp->nwcs = nwcs;
333 else
335 newp->wcs = NULL;
336 newp->nwcs = 0;
338 newp->mborder = NULL;
339 newp->wcorder = 0;
340 newp->used_in_level = 0;
341 newp->is_character = is_character;
343 /* Will be assigned later. XXX */
344 newp->mbseqorder = 0;
345 newp->wcseqorder = 0;
347 /* Will be allocated later. */
348 newp->weights = NULL;
350 newp->file = NULL;
351 newp->line = 0;
353 newp->section = collate->current_section;
355 newp->last = NULL;
356 newp->next = NULL;
358 newp->mbnext = NULL;
359 newp->mblast = NULL;
361 newp->wcnext = NULL;
362 newp->wclast = NULL;
364 return newp;
368 static struct symbol_t *
369 new_symbol (struct locale_collate_t *collate, const char *name, size_t len)
371 struct symbol_t *newp;
373 newp = (struct symbol_t *) obstack_alloc (&collate->mempool, sizeof (*newp));
375 newp->name = obstack_copy0 (&collate->mempool, name, len);
376 newp->order = NULL;
378 newp->file = NULL;
379 newp->line = 0;
381 return newp;
385 /* Test whether this name is already defined somewhere. */
386 static int
387 check_duplicate (struct linereader *ldfile, struct locale_collate_t *collate,
388 const struct charmap_t *charmap,
389 struct repertoire_t *repertoire, const char *symbol,
390 size_t symbol_len)
392 void *ignore = NULL;
394 if (find_entry (&charmap->char_table, symbol, symbol_len, &ignore) == 0)
396 lr_error (ldfile, _("`%.*s' already defined in charmap"),
397 (int) symbol_len, symbol);
398 return 1;
401 if (repertoire != NULL
402 && (find_entry (&repertoire->char_table, symbol, symbol_len, &ignore)
403 == 0))
405 lr_error (ldfile, _("`%.*s' already defined in repertoire"),
406 (int) symbol_len, symbol);
407 return 1;
410 if (find_entry (&collate->sym_table, symbol, symbol_len, &ignore) == 0)
412 lr_error (ldfile, _("`%.*s' already defined as collating symbol"),
413 (int) symbol_len, symbol);
414 return 1;
417 if (find_entry (&collate->elem_table, symbol, symbol_len, &ignore) == 0)
419 lr_error (ldfile, _("`%.*s' already defined as collating element"),
420 (int) symbol_len, symbol);
421 return 1;
424 return 0;
428 /* Read the direction specification. */
429 static void
430 read_directions (struct linereader *ldfile, struct token *arg,
431 const struct charmap_t *charmap,
432 struct repertoire_t *repertoire, struct localedef_t *result)
434 int cnt = 0;
435 int max = nrules ?: 10;
436 enum coll_sort_rule *rules = calloc (max, sizeof (*rules));
437 int warned = 0;
438 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
440 while (1)
442 int valid = 0;
444 if (arg->tok == tok_forward)
446 if (rules[cnt] & sort_backward)
448 if (! warned)
450 lr_error (ldfile, _("\
451 %s: `forward' and `backward' are mutually excluding each other"),
452 "LC_COLLATE");
453 warned = 1;
456 else if (rules[cnt] & sort_forward)
458 if (! warned)
460 lr_error (ldfile, _("\
461 %s: `%s' mentioned more than once in definition of weight %d"),
462 "LC_COLLATE", "forward", cnt + 1);
465 else
466 rules[cnt] |= sort_forward;
468 valid = 1;
470 else if (arg->tok == tok_backward)
472 if (rules[cnt] & sort_forward)
474 if (! warned)
476 lr_error (ldfile, _("\
477 %s: `forward' and `backward' are mutually excluding each other"),
478 "LC_COLLATE");
479 warned = 1;
482 else if (rules[cnt] & sort_backward)
484 if (! warned)
486 lr_error (ldfile, _("\
487 %s: `%s' mentioned more than once in definition of weight %d"),
488 "LC_COLLATE", "backward", cnt + 1);
491 else
492 rules[cnt] |= sort_backward;
494 valid = 1;
496 else if (arg->tok == tok_position)
498 if (rules[cnt] & sort_position)
500 if (! warned)
502 lr_error (ldfile, _("\
503 %s: `%s' mentioned more than once in definition of weight %d"),
504 "LC_COLLATE", "position", cnt + 1);
507 else
508 rules[cnt] |= sort_position;
510 valid = 1;
513 if (valid)
514 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
516 if (arg->tok == tok_eof || arg->tok == tok_eol || arg->tok == tok_comma
517 || arg->tok == tok_semicolon)
519 if (! valid && ! warned)
521 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
522 warned = 1;
525 /* See whether we have to increment the counter. */
526 if (arg->tok != tok_comma && rules[cnt] != 0)
528 /* Add the default `forward' if we have seen only `position'. */
529 if (rules[cnt] == sort_position)
530 rules[cnt] = sort_position | sort_forward;
532 ++cnt;
535 if (arg->tok == tok_eof || arg->tok == tok_eol)
536 /* End of line or file, so we exit the loop. */
537 break;
539 if (nrules == 0)
541 /* See whether we have enough room in the array. */
542 if (cnt == max)
544 max += 10;
545 rules = (enum coll_sort_rule *) xrealloc (rules,
547 * sizeof (*rules));
548 memset (&rules[cnt], '\0', (max - cnt) * sizeof (*rules));
551 else
553 if (cnt == nrules)
555 /* There must not be any more rule. */
556 if (! warned)
558 lr_error (ldfile, _("\
559 %s: too many rules; first entry only had %d"),
560 "LC_COLLATE", nrules);
561 warned = 1;
564 lr_ignore_rest (ldfile, 0);
565 break;
569 else
571 if (! warned)
573 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
574 warned = 1;
578 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
581 if (nrules == 0)
583 /* Now we know how many rules we have. */
584 nrules = cnt;
585 rules = (enum coll_sort_rule *) xrealloc (rules,
586 nrules * sizeof (*rules));
588 else
590 if (cnt < nrules)
592 /* Not enough rules in this specification. */
593 if (! warned)
594 lr_error (ldfile, _("%s: not enough sorting rules"), "LC_COLLATE");
597 rules[cnt] = sort_forward;
598 while (++cnt < nrules);
602 collate->current_section->rules = rules;
606 static struct element_t *
607 find_element (struct linereader *ldfile, struct locale_collate_t *collate,
608 const char *str, size_t len)
610 void *result = NULL;
612 /* Search for the entries among the collation sequences already define. */
613 if (find_entry (&collate->seq_table, str, len, &result) != 0)
615 /* Nope, not define yet. So we see whether it is a
616 collation symbol. */
617 void *ptr;
619 if (find_entry (&collate->sym_table, str, len, &ptr) == 0)
621 /* It's a collation symbol. */
622 struct symbol_t *sym = (struct symbol_t *) ptr;
623 result = sym->order;
625 if (result == NULL)
626 result = sym->order = new_element (collate, NULL, 0, NULL,
627 NULL, 0, 0);
629 else if (find_entry (&collate->elem_table, str, len, &result) != 0)
631 /* It's also no collation element. So it is a character
632 element defined later. */
633 result = new_element (collate, NULL, 0, NULL, str, len, 1);
634 /* Insert it into the sequence table. */
635 insert_entry (&collate->seq_table, str, len, result);
639 return (struct element_t *) result;
643 static void
644 unlink_element (struct locale_collate_t *collate)
646 if (collate->cursor == collate->start)
648 assert (collate->cursor->next == NULL);
649 assert (collate->cursor->last == NULL);
650 collate->cursor = NULL;
652 else
654 if (collate->cursor->next != NULL)
655 collate->cursor->next->last = collate->cursor->last;
656 if (collate->cursor->last != NULL)
657 collate->cursor->last->next = collate->cursor->next;
658 collate->cursor = collate->cursor->last;
663 static void
664 insert_weights (struct linereader *ldfile, struct element_t *elem,
665 const struct charmap_t *charmap,
666 struct repertoire_t *repertoire, struct localedef_t *result,
667 enum token_t ellipsis)
669 int weight_cnt;
670 struct token *arg;
671 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
673 /* Initialize all the fields. */
674 elem->file = ldfile->fname;
675 elem->line = ldfile->lineno;
677 elem->last = collate->cursor;
678 elem->next = collate->cursor ? collate->cursor->next : NULL;
679 if (collate->cursor != NULL && collate->cursor->next != NULL)
680 collate->cursor->next->last = elem;
681 if (collate->cursor != NULL)
682 collate->cursor->next = elem;
683 if (collate->start == NULL)
685 assert (collate->cursor == NULL);
686 collate->start = elem;
689 elem->section = collate->current_section;
691 if (collate->current_section->first == NULL)
692 collate->current_section->first = elem;
693 if (collate->current_section->last == collate->cursor)
694 collate->current_section->last = elem;
696 collate->cursor = elem;
698 elem->weights = (struct element_list_t *)
699 obstack_alloc (&collate->mempool, nrules * sizeof (struct element_list_t));
700 memset (elem->weights, '\0', nrules * sizeof (struct element_list_t));
702 weight_cnt = 0;
704 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
707 if (arg->tok == tok_eof || arg->tok == tok_eol)
708 break;
710 if (arg->tok == tok_ignore)
712 /* The weight for this level has to be ignored. We use the
713 null pointer to indicate this. */
714 elem->weights[weight_cnt].w = (struct element_t **)
715 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
716 elem->weights[weight_cnt].w[0] = NULL;
717 elem->weights[weight_cnt].cnt = 1;
719 else if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
721 char ucs4str[10];
722 struct element_t *val;
723 char *symstr;
724 size_t symlen;
726 if (arg->tok == tok_bsymbol)
728 symstr = arg->val.str.startmb;
729 symlen = arg->val.str.lenmb;
731 else
733 snprintf (ucs4str, sizeof (ucs4str), "U%08X", arg->val.ucs4);
734 symstr = ucs4str;
735 symlen = 9;
738 val = find_element (ldfile, collate, symstr, symlen);
739 if (val == NULL)
740 break;
742 elem->weights[weight_cnt].w = (struct element_t **)
743 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
744 elem->weights[weight_cnt].w[0] = val;
745 elem->weights[weight_cnt].cnt = 1;
747 else if (arg->tok == tok_string)
749 /* Split the string up in the individual characters and put
750 the element definitions in the list. */
751 const char *cp = arg->val.str.startmb;
752 int cnt = 0;
753 struct element_t *charelem;
754 struct element_t **weights = NULL;
755 int max = 0;
757 if (*cp == '\0')
759 lr_error (ldfile, _("%s: empty weight string not allowed"),
760 "LC_COLLATE");
761 lr_ignore_rest (ldfile, 0);
762 break;
767 if (*cp == '<')
769 /* Ahh, it's a bsymbol or an UCS4 value. If it's
770 the latter we have to unify the name. */
771 const char *startp = ++cp;
772 size_t len;
774 while (*cp != '>')
776 if (*cp == ldfile->escape_char)
777 ++cp;
778 if (*cp == '\0')
779 /* It's a syntax error. */
780 goto syntax;
782 ++cp;
785 if (cp - startp == 5 && startp[0] == 'U'
786 && isxdigit (startp[1]) && isxdigit (startp[2])
787 && isxdigit (startp[3]) && isxdigit (startp[4]))
789 unsigned int ucs4 = strtoul (startp + 1, NULL, 16);
790 char *newstr;
792 newstr = (char *) xmalloc (10);
793 snprintf (newstr, 10, "U%08X", ucs4);
794 startp = newstr;
796 len = 9;
798 else
799 len = cp - startp;
801 charelem = find_element (ldfile, collate, startp, len);
802 ++cp;
804 else
806 /* People really shouldn't use characters directly in
807 the string. Especially since it's not really clear
808 what this means. We interpret all characters in the
809 string as if that would be bsymbols. Otherwise we
810 would have to match back to bsymbols somehow and this
811 is normally not what people normally expect. */
812 charelem = find_element (ldfile, collate, cp++, 1);
815 if (charelem == NULL)
817 /* We ignore the rest of the line. */
818 lr_ignore_rest (ldfile, 0);
819 break;
822 /* Add the pointer. */
823 if (cnt >= max)
825 struct element_t **newp;
826 max += 10;
827 newp = (struct element_t **)
828 alloca (max * sizeof (struct element_t *));
829 memcpy (newp, weights, cnt * sizeof (struct element_t *));
830 weights = newp;
832 weights[cnt++] = charelem;
834 while (*cp != '\0');
836 /* Now store the information. */
837 elem->weights[weight_cnt].w = (struct element_t **)
838 obstack_alloc (&collate->mempool,
839 cnt * sizeof (struct element_t *));
840 memcpy (elem->weights[weight_cnt].w, weights,
841 cnt * sizeof (struct element_t *));
842 elem->weights[weight_cnt].cnt = cnt;
844 /* We don't need the string anymore. */
845 free (arg->val.str.startmb);
847 else if (ellipsis != tok_none
848 && (arg->tok == tok_ellipsis2
849 || arg->tok == tok_ellipsis3
850 || arg->tok == tok_ellipsis4))
852 /* It must be the same ellipsis as used in the initial column. */
853 if (arg->tok != ellipsis)
854 lr_error (ldfile, _("\
855 %s: weights must use the same ellipsis symbol as the name"),
856 "LC_COLLATE");
858 /* The weight for this level will depend on the element
859 iterating over the range. Put a placeholder. */
860 elem->weights[weight_cnt].w = (struct element_t **)
861 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
862 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
863 elem->weights[weight_cnt].cnt = 1;
865 else
867 syntax:
868 /* It's a syntax error. */
869 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
870 lr_ignore_rest (ldfile, 0);
871 break;
874 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
875 /* This better should be the end of the line or a semicolon. */
876 if (arg->tok == tok_semicolon)
877 /* OK, ignore this and read the next token. */
878 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
879 else if (arg->tok != tok_eof && arg->tok != tok_eol)
881 /* It's a syntax error. */
882 lr_error (ldfile, _("%s: syntax error"), "LC_COLLATE");
883 lr_ignore_rest (ldfile, 0);
884 break;
887 while (++weight_cnt < nrules);
889 if (weight_cnt < nrules)
891 /* This means the rest of the line uses the current element as
892 the weight. */
895 elem->weights[weight_cnt].w = (struct element_t **)
896 obstack_alloc (&collate->mempool, sizeof (struct element_t *));
897 if (ellipsis == tok_none)
898 elem->weights[weight_cnt].w[0] = elem;
899 else
900 elem->weights[weight_cnt].w[0] = ELEMENT_ELLIPSIS2;
901 elem->weights[weight_cnt].cnt = 1;
903 while (++weight_cnt < nrules);
905 else
907 if (arg->tok == tok_ignore || arg->tok == tok_bsymbol)
909 /* Too many rule values. */
910 lr_error (ldfile, _("%s: too many values"), "LC_COLLATE");
911 lr_ignore_rest (ldfile, 0);
913 else
914 lr_ignore_rest (ldfile, arg->tok != tok_eol && arg->tok != tok_eof);
919 static int
920 insert_value (struct linereader *ldfile, const char *symstr, size_t symlen,
921 const struct charmap_t *charmap, struct repertoire_t *repertoire,
922 struct localedef_t *result)
924 /* First find out what kind of symbol this is. */
925 struct charseq *seq;
926 uint32_t wc;
927 struct element_t *elem = NULL;
928 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
930 /* Try to find the character in the charmap. */
931 seq = charmap_find_value (charmap, symstr, symlen);
933 /* Determine the wide character. */
934 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
936 wc = repertoire_find_value (repertoire, symstr, symlen);
937 if (seq != NULL)
938 seq->ucs4 = wc;
940 else
941 wc = seq->ucs4;
943 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
945 /* It's no character, so look through the collation elements and
946 symbol list. */
947 void *ptr = elem;
948 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) != 0)
950 void *result;
951 struct symbol_t *sym = NULL;
953 /* It's also collation element. Therefore it's either a
954 collating symbol or it's a character which is not
955 supported by the character set. In the later case we
956 simply create a dummy entry. */
957 if (find_entry (&collate->sym_table, symstr, symlen, &result) == 0)
959 /* It's a collation symbol. */
960 sym = (struct symbol_t *) result;
962 elem = sym->order;
965 if (elem == NULL)
967 elem = new_element (collate, NULL, 0, NULL, symstr, symlen, 0);
969 if (sym != NULL)
970 sym->order = elem;
971 else
972 /* Enter a fake element in the sequence table. This
973 won't cause anything in the output since there is
974 no multibyte or wide character associated with
975 it. */
976 insert_entry (&collate->seq_table, symstr, symlen, elem);
979 else
980 /* Copy the result back. */
981 elem = ptr;
983 else
985 /* Otherwise the symbols stands for a character. */
986 void *ptr = elem;
987 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) != 0)
989 uint32_t wcs[2] = { wc, 0 };
991 /* We have to allocate an entry. */
992 elem = new_element (collate, seq != NULL ? seq->bytes : NULL,
993 seq != NULL ? seq->nbytes : 0,
994 wc == ILLEGAL_CHAR_VALUE ? NULL : wcs,
995 symstr, symlen, 1);
997 /* And add it to the table. */
998 if (insert_entry (&collate->seq_table, symstr, symlen, elem) != 0)
999 /* This cannot happen. */
1000 assert (! "Internal error");
1002 else
1004 /* Copy the result back. */
1005 elem = ptr;
1007 /* Maybe the character was used before the definition. In this case
1008 we have to insert the byte sequences now. */
1009 if (elem->mbs == NULL && seq != NULL)
1011 elem->mbs = obstack_copy0 (&collate->mempool,
1012 seq->bytes, seq->nbytes);
1013 elem->nmbs = seq->nbytes;
1016 if (elem->wcs == NULL && wc != ILLEGAL_CHAR_VALUE)
1018 uint32_t wcs[2] = { wc, 0 };
1020 elem->wcs = obstack_copy (&collate->mempool, wcs, sizeof (wcs));
1021 elem->nwcs = 1;
1026 /* Test whether this element is not already in the list. */
1027 if (elem->next != NULL || elem == collate->cursor)
1029 lr_error (ldfile, _("order for `%.*s' already defined at %s:%Zu"),
1030 (int) symlen, symstr, elem->file, elem->line);
1031 lr_ignore_rest (ldfile, 0);
1032 return 1;
1035 insert_weights (ldfile, elem, charmap, repertoire, result, tok_none);
1037 return 0;
1041 static void
1042 handle_ellipsis (struct linereader *ldfile, const char *symstr, size_t symlen,
1043 enum token_t ellipsis, const struct charmap_t *charmap,
1044 struct repertoire_t *repertoire,
1045 struct localedef_t *result)
1047 struct element_t *startp;
1048 struct element_t *endp;
1049 struct locale_collate_t *collate = result->categories[LC_COLLATE].collate;
1051 /* Unlink the entry added for the ellipsis. */
1052 unlink_element (collate);
1053 startp = collate->cursor;
1055 /* Process and add the end-entry. */
1056 if (symstr != NULL
1057 && insert_value (ldfile, symstr, symlen, charmap, repertoire, result))
1058 /* Something went wrong with inserting the to-value. This means
1059 we cannot process the ellipsis. */
1060 return;
1062 /* Reset the cursor. */
1063 collate->cursor = startp;
1065 /* Now we have to handle many different situations:
1066 - we have to distinguish between the three different ellipsis forms
1067 - the is the ellipsis at the beginning, in the middle, or at the end.
1069 endp = collate->cursor->next;
1070 assert (symstr == NULL || endp != NULL);
1072 /* XXX The following is probably very wrong since also collating symbols
1073 can appear in ranges. But do we want/can refine the test for that? */
1074 #if 0
1075 /* Both, the start and the end symbol, must stand for characters. */
1076 if ((startp != NULL && (startp->name == NULL || ! startp->is_character))
1077 || (endp != NULL && (endp->name == NULL|| ! endp->is_character)))
1079 lr_error (ldfile, _("\
1080 %s: the start and the end symbol of a range must stand for characters"),
1081 "LC_COLLATE");
1082 return;
1084 #endif
1086 if (ellipsis == tok_ellipsis3)
1088 /* One requirement we make here: the length of the byte
1089 sequences for the first and end character must be the same.
1090 This is mainly to prevent unwanted effects and this is often
1091 not what is wanted. */
1092 size_t len = (startp->mbs != NULL ? startp->nmbs
1093 : (endp->mbs != NULL ? endp->nmbs : 0));
1094 char mbcnt[len + 1];
1095 char mbend[len + 1];
1097 /* Well, this should be caught somewhere else already. Just to
1098 make sure. */
1099 assert (startp == NULL || startp->wcs == NULL || startp->wcs[1] == 0);
1100 assert (endp == NULL || endp->wcs == NULL || endp->wcs[1] == 0);
1102 if (startp != NULL && endp != NULL
1103 && startp->mbs != NULL && endp->mbs != NULL
1104 && startp->nmbs != endp->nmbs)
1106 lr_error (ldfile, _("\
1107 %s: byte sequences of first and last character must have the same length"),
1108 "LC_COLLATE");
1109 return;
1112 /* Determine whether we have to generate multibyte sequences. */
1113 if ((startp == NULL || startp->mbs != NULL)
1114 && (endp == NULL || endp->mbs != NULL))
1116 int cnt;
1117 int ret;
1119 /* Prepare the beginning byte sequence. This is either from the
1120 beginning byte sequence or it is all nulls if it was an
1121 initial ellipsis. */
1122 if (startp == NULL || startp->mbs == NULL)
1123 memset (mbcnt, '\0', len);
1124 else
1126 memcpy (mbcnt, startp->mbs, len);
1128 /* And increment it so that the value is the first one we will
1129 try to insert. */
1130 for (cnt = len - 1; cnt >= 0; --cnt)
1131 if (++mbcnt[cnt] != '\0')
1132 break;
1134 mbcnt[len] = '\0';
1136 /* And the end sequence. */
1137 if (endp == NULL || endp->mbs == NULL)
1138 memset (mbend, '\0', len);
1139 else
1140 memcpy (mbend, endp->mbs, len);
1141 mbend[len] = '\0';
1143 /* Test whether we have a correct range. */
1144 ret = memcmp (mbcnt, mbend, len);
1145 if (ret >= 0)
1147 if (ret > 0)
1148 lr_error (ldfile, _("%s: byte sequence of first character of \
1149 range is not lower than that of the last character"), "LC_COLLATE");
1150 return;
1153 /* Generate the byte sequences data. */
1154 while (1)
1156 struct charseq *seq;
1158 /* Quite a bit of work ahead. We have to find the character
1159 definition for the byte sequence and then determine the
1160 wide character belonging to it. */
1161 seq = charmap_find_symbol (charmap, mbcnt, len);
1162 if (seq != NULL)
1164 struct element_t *elem;
1165 size_t namelen;
1167 /* I don't think this can ever happen. */
1168 assert (seq->name != NULL);
1169 namelen = strlen (seq->name);
1171 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1172 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1173 namelen);
1175 /* Now we are ready to insert the new value in the
1176 sequence. Find out whether the element is
1177 already known. */
1178 void *ptr;
1179 if (find_entry (&collate->seq_table, seq->name, namelen,
1180 &ptr) != 0)
1182 uint32_t wcs[2] = { seq->ucs4, 0 };
1184 /* We have to allocate an entry. */
1185 elem = new_element (collate, mbcnt, len,
1186 seq->ucs4 == ILLEGAL_CHAR_VALUE
1187 ? NULL : wcs, seq->name,
1188 namelen, 1);
1190 /* And add it to the table. */
1191 if (insert_entry (&collate->seq_table, seq->name,
1192 namelen, elem) != 0)
1193 /* This cannot happen. */
1194 assert (! "Internal error");
1196 else
1197 /* Copy the result. */
1198 elem = ptr;
1200 /* Test whether this element is not already in the list. */
1201 if (elem->next != NULL || (collate->cursor != NULL
1202 && elem->next == collate->cursor))
1204 lr_error (ldfile, _("\
1205 order for `%.*s' already defined at %s:%Zu"),
1206 (int) namelen, seq->name,
1207 elem->file, elem->line);
1208 goto increment;
1211 /* Enqueue the new element. */
1212 elem->last = collate->cursor;
1213 if (collate->cursor == NULL)
1214 elem->next = NULL;
1215 else
1217 elem->next = collate->cursor->next;
1218 elem->last->next = elem;
1219 if (elem->next != NULL)
1220 elem->next->last = elem;
1222 if (collate->start == NULL)
1224 assert (collate->cursor == NULL);
1225 collate->start = elem;
1227 collate->cursor = elem;
1229 /* Add the weight value. We take them from the
1230 `ellipsis_weights' member of `collate'. */
1231 elem->weights = (struct element_list_t *)
1232 obstack_alloc (&collate->mempool,
1233 nrules * sizeof (struct element_list_t));
1234 for (cnt = 0; cnt < nrules; ++cnt)
1235 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1236 && (collate->ellipsis_weight.weights[cnt].w[0]
1237 == ELEMENT_ELLIPSIS2))
1239 elem->weights[cnt].w = (struct element_t **)
1240 obstack_alloc (&collate->mempool,
1241 sizeof (struct element_t *));
1242 elem->weights[cnt].w[0] = elem;
1243 elem->weights[cnt].cnt = 1;
1245 else
1247 /* Simply use the weight from `ellipsis_weight'. */
1248 elem->weights[cnt].w =
1249 collate->ellipsis_weight.weights[cnt].w;
1250 elem->weights[cnt].cnt =
1251 collate->ellipsis_weight.weights[cnt].cnt;
1255 /* Increment for the next round. */
1256 increment:
1257 for (cnt = len - 1; cnt >= 0; --cnt)
1258 if (++mbcnt[cnt] != '\0')
1259 break;
1261 /* Find out whether this was all. */
1262 if (cnt < 0 || memcmp (mbcnt, mbend, len) >= 0)
1263 /* Yep, that's all. */
1264 break;
1268 else
1270 /* For symbolic range we naturally must have a beginning and an
1271 end specified by the user. */
1272 if (startp == NULL)
1273 lr_error (ldfile, _("\
1274 %s: symbolic range ellipsis must not directly follow `order_start'"),
1275 "LC_COLLATE");
1276 else if (endp == NULL)
1277 lr_error (ldfile, _("\
1278 %s: symbolic range ellipsis must not be directly followed by `order_end'"),
1279 "LC_COLLATE");
1280 else
1282 /* Determine the range. To do so we have to determine the
1283 common prefix of the both names and then the numeric
1284 values of both ends. */
1285 size_t lenfrom = strlen (startp->name);
1286 size_t lento = strlen (endp->name);
1287 char buf[lento + 1];
1288 int preflen = 0;
1289 long int from;
1290 long int to;
1291 char *cp;
1292 int base = ellipsis == tok_ellipsis2 ? 16 : 10;
1294 if (lenfrom != lento)
1296 invalid_range:
1297 lr_error (ldfile, _("\
1298 `%s' and `%.*s' are not valid names for symbolic range"),
1299 startp->name, (int) lento, endp->name);
1300 return;
1303 while (startp->name[preflen] == endp->name[preflen])
1304 if (startp->name[preflen] == '\0')
1305 /* Nothing to be done. The start and end point are identical
1306 and while inserting the end point we have already given
1307 the user an error message. */
1308 return;
1309 else
1310 ++preflen;
1312 errno = 0;
1313 from = strtol (startp->name + preflen, &cp, base);
1314 if ((from == UINT_MAX && errno == ERANGE) || *cp != '\0')
1315 goto invalid_range;
1317 errno = 0;
1318 to = strtol (endp->name + preflen, &cp, base);
1319 if ((to == UINT_MAX && errno == ERANGE) || *cp != '\0')
1320 goto invalid_range;
1322 /* Copy the prefix. */
1323 memcpy (buf, startp->name, preflen);
1325 /* Loop over all values. */
1326 for (++from; from < to; ++from)
1328 struct element_t *elem = NULL;
1329 struct charseq *seq;
1330 uint32_t wc;
1331 int cnt;
1333 /* Generate the name. */
1334 sprintf (buf + preflen, base == 10 ? "%0*ld" : "%0*lX",
1335 (int) (lenfrom - preflen), from);
1337 /* Look whether this name is already defined. */
1338 void *ptr;
1339 if (find_entry (&collate->seq_table, buf, symlen, &ptr) == 0)
1341 /* Copy back the result. */
1342 elem = ptr;
1344 if (elem->next != NULL || (collate->cursor != NULL
1345 && elem->next == collate->cursor))
1347 lr_error (ldfile, _("\
1348 %s: order for `%.*s' already defined at %s:%Zu"),
1349 "LC_COLLATE", (int) lenfrom, buf,
1350 elem->file, elem->line);
1351 continue;
1354 if (elem->name == NULL)
1356 lr_error (ldfile, _("%s: `%s' must be a character"),
1357 "LC_COLLATE", buf);
1358 continue;
1362 if (elem == NULL || (elem->mbs == NULL && elem->wcs == NULL))
1364 /* Search for a character of this name. */
1365 seq = charmap_find_value (charmap, buf, lenfrom);
1366 if (seq == NULL || seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1368 wc = repertoire_find_value (repertoire, buf, lenfrom);
1370 if (seq != NULL)
1371 seq->ucs4 = wc;
1373 else
1374 wc = seq->ucs4;
1376 if (wc == ILLEGAL_CHAR_VALUE && seq == NULL)
1377 /* We don't know anything about a character with this
1378 name. XXX Should we warn? */
1379 continue;
1381 if (elem == NULL)
1383 uint32_t wcs[2] = { wc, 0 };
1385 /* We have to allocate an entry. */
1386 elem = new_element (collate,
1387 seq != NULL ? seq->bytes : NULL,
1388 seq != NULL ? seq->nbytes : 0,
1389 wc == ILLEGAL_CHAR_VALUE
1390 ? NULL : wcs, buf, lenfrom, 1);
1392 else
1394 /* Update the element. */
1395 if (seq != NULL)
1397 elem->mbs = obstack_copy0 (&collate->mempool,
1398 seq->bytes, seq->nbytes);
1399 elem->nmbs = seq->nbytes;
1402 if (wc != ILLEGAL_CHAR_VALUE)
1404 uint32_t zero = 0;
1406 obstack_grow (&collate->mempool,
1407 &wc, sizeof (uint32_t));
1408 obstack_grow (&collate->mempool,
1409 &zero, sizeof (uint32_t));
1410 elem->wcs = obstack_finish (&collate->mempool);
1411 elem->nwcs = 1;
1415 elem->file = ldfile->fname;
1416 elem->line = ldfile->lineno;
1417 elem->section = collate->current_section;
1420 /* Enqueue the new element. */
1421 elem->last = collate->cursor;
1422 elem->next = collate->cursor->next;
1423 elem->last->next = elem;
1424 if (elem->next != NULL)
1425 elem->next->last = elem;
1426 collate->cursor = elem;
1428 /* Now add the weights. They come from the `ellipsis_weights'
1429 member of `collate'. */
1430 elem->weights = (struct element_list_t *)
1431 obstack_alloc (&collate->mempool,
1432 nrules * sizeof (struct element_list_t));
1433 for (cnt = 0; cnt < nrules; ++cnt)
1434 if (collate->ellipsis_weight.weights[cnt].cnt == 1
1435 && (collate->ellipsis_weight.weights[cnt].w[0]
1436 == ELEMENT_ELLIPSIS2))
1438 elem->weights[cnt].w = (struct element_t **)
1439 obstack_alloc (&collate->mempool,
1440 sizeof (struct element_t *));
1441 elem->weights[cnt].w[0] = elem;
1442 elem->weights[cnt].cnt = 1;
1444 else
1446 /* Simply use the weight from `ellipsis_weight'. */
1447 elem->weights[cnt].w =
1448 collate->ellipsis_weight.weights[cnt].w;
1449 elem->weights[cnt].cnt =
1450 collate->ellipsis_weight.weights[cnt].cnt;
1458 static void
1459 collate_startup (struct linereader *ldfile, struct localedef_t *locale,
1460 struct localedef_t *copy_locale, int ignore_content)
1462 if (!ignore_content && locale->categories[LC_COLLATE].collate == NULL)
1464 struct locale_collate_t *collate;
1466 if (copy_locale == NULL)
1468 collate = locale->categories[LC_COLLATE].collate =
1469 (struct locale_collate_t *)
1470 xcalloc (1, sizeof (struct locale_collate_t));
1472 /* Init the various data structures. */
1473 init_hash (&collate->elem_table, 100);
1474 init_hash (&collate->sym_table, 100);
1475 init_hash (&collate->seq_table, 500);
1476 obstack_init (&collate->mempool);
1478 collate->col_weight_max = -1;
1480 else
1481 /* Reuse the copy_locale's data structures. */
1482 collate = locale->categories[LC_COLLATE].collate =
1483 copy_locale->categories[LC_COLLATE].collate;
1486 ldfile->translate_strings = 0;
1487 ldfile->return_widestr = 0;
1491 void
1492 collate_finish (struct localedef_t *locale, const struct charmap_t *charmap)
1494 /* Now is the time when we can assign the individual collation
1495 values for all the symbols. We have possibly different values
1496 for the wide- and the multibyte-character symbols. This is done
1497 since it might make a difference in the encoding if there is in
1498 some cases no multibyte-character but there are wide-characters.
1499 (The other way around it is not important since theencoded
1500 collation value in the wide-character case is 32 bits wide and
1501 therefore requires no encoding).
1503 The lowest collation value assigned is 2. Zero is reserved for
1504 the NUL byte terminating the strings in the `strxfrm'/`wcsxfrm'
1505 functions and 1 is used to separate the individual passes for the
1506 different rules.
1508 We also have to construct is list with all the bytes/words which
1509 can come first in a sequence, followed by all the elements which
1510 also start with this byte/word. The order is reverse which has
1511 among others the important effect that longer strings are located
1512 first in the list. This is required for the output data since
1513 the algorithm used in `strcoll' etc depends on this.
1515 The multibyte case is easy. We simply sort into an array with
1516 256 elements. */
1517 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1518 int mbact[nrules];
1519 int wcact;
1520 int mbseqact;
1521 int wcseqact;
1522 struct element_t *runp;
1523 int i;
1524 int need_undefined = 0;
1525 struct section_list *sect;
1526 int ruleidx;
1527 int nr_wide_elems = 0;
1529 if (collate == NULL)
1531 /* No data, no check. */
1532 if (! be_quiet)
1533 WITH_CUR_LOCALE (error (0, 0, _("No definition for %s category found"),
1534 "LC_COLLATE"));
1535 return;
1538 /* If this assertion is hit change the type in `element_t'. */
1539 assert (nrules <= sizeof (runp->used_in_level) * 8);
1541 /* Make sure that the `position' rule is used either in all sections
1542 or in none. */
1543 for (i = 0; i < nrules; ++i)
1544 for (sect = collate->sections; sect != NULL; sect = sect->next)
1545 if (sect->rules != NULL
1546 && ((sect->rules[i] & sort_position)
1547 != (collate->sections->rules[i] & sort_position)))
1549 WITH_CUR_LOCALE (error (0, 0, _("\
1550 %s: `position' must be used for a specific level in all sections or none"),
1551 "LC_COLLATE"));
1552 break;
1555 /* Find out which elements are used at which level. At the same
1556 time we find out whether we have any undefined symbols. */
1557 runp = collate->start;
1558 while (runp != NULL)
1560 if (runp->mbs != NULL)
1562 for (i = 0; i < nrules; ++i)
1564 int j;
1566 for (j = 0; j < runp->weights[i].cnt; ++j)
1567 /* A NULL pointer as the weight means IGNORE. */
1568 if (runp->weights[i].w[j] != NULL)
1570 if (runp->weights[i].w[j]->weights == NULL)
1572 WITH_CUR_LOCALE (error_at_line (0, 0, runp->file,
1573 runp->line,
1574 _("symbol `%s' not defined"),
1575 runp->weights[i].w[j]->name));
1577 need_undefined = 1;
1578 runp->weights[i].w[j] = &collate->undefined;
1580 else
1581 /* Set the bit for the level. */
1582 runp->weights[i].w[j]->used_in_level |= 1 << i;
1587 /* Up to the next entry. */
1588 runp = runp->next;
1591 /* Walk through the list of defined sequences and assign weights. Also
1592 create the data structure which will allow generating the single byte
1593 character based tables.
1595 Since at each time only the weights for each of the rules are
1596 only compared to other weights for this rule it is possible to
1597 assign more compact weight values than simply counting all
1598 weights in sequence. We can assign weights from 3, one for each
1599 rule individually and only for those elements, which are actually
1600 used for this rule.
1602 Why is this important? It is not for the wide char table. But
1603 it is for the singlebyte output since here larger numbers have to
1604 be encoded to make it possible to emit the value as a byte
1605 string. */
1606 for (i = 0; i < nrules; ++i)
1607 mbact[i] = 2;
1608 wcact = 2;
1609 mbseqact = 0;
1610 wcseqact = 0;
1611 runp = collate->start;
1612 while (runp != NULL)
1614 /* Determine the order. */
1615 if (runp->used_in_level != 0)
1617 runp->mborder = (int *) obstack_alloc (&collate->mempool,
1618 nrules * sizeof (int));
1620 for (i = 0; i < nrules; ++i)
1621 if ((runp->used_in_level & (1 << i)) != 0)
1622 runp->mborder[i] = mbact[i]++;
1623 else
1624 runp->mborder[i] = 0;
1627 if (runp->mbs != NULL)
1629 struct element_t **eptr;
1630 struct element_t *lastp = NULL;
1632 /* Find the point where to insert in the list. */
1633 eptr = &collate->mbheads[((unsigned char *) runp->mbs)[0]];
1634 while (*eptr != NULL)
1636 if ((*eptr)->nmbs < runp->nmbs)
1637 break;
1639 if ((*eptr)->nmbs == runp->nmbs)
1641 int c = memcmp ((*eptr)->mbs, runp->mbs, runp->nmbs);
1643 if (c == 0)
1645 /* This should not happen. It means that we have
1646 to symbols with the same byte sequence. It is
1647 of course an error. */
1648 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1649 (*eptr)->line,
1650 _("\
1651 symbol `%s' has the same encoding as"), (*eptr)->name);
1652 error_at_line (0, 0, runp->file,
1653 runp->line,
1654 _("symbol `%s'"),
1655 runp->name));
1656 goto dont_insert;
1658 else if (c < 0)
1659 /* Insert it here. */
1660 break;
1663 /* To the next entry. */
1664 lastp = *eptr;
1665 eptr = &(*eptr)->mbnext;
1668 /* Set the pointers. */
1669 runp->mbnext = *eptr;
1670 runp->mblast = lastp;
1671 if (*eptr != NULL)
1672 (*eptr)->mblast = runp;
1673 *eptr = runp;
1674 dont_insert:
1678 if (runp->used_in_level)
1680 runp->wcorder = wcact++;
1682 /* We take the opportunity to count the elements which have
1683 wide characters. */
1684 ++nr_wide_elems;
1687 if (runp->is_character)
1689 if (runp->nmbs == 1)
1690 collate->mbseqorder[((unsigned char *) runp->mbs)[0]] = mbseqact++;
1692 runp->wcseqorder = wcseqact++;
1694 else if (runp->mbs != NULL && runp->weights != NULL)
1695 /* This is for collation elements. */
1696 runp->wcseqorder = wcseqact++;
1698 /* Up to the next entry. */
1699 runp = runp->next;
1702 /* Find out whether any of the `mbheads' entries is unset. In this
1703 case we use the UNDEFINED entry. */
1704 for (i = 1; i < 256; ++i)
1705 if (collate->mbheads[i] == NULL)
1707 need_undefined = 1;
1708 collate->mbheads[i] = &collate->undefined;
1711 /* Now to the wide character case. */
1712 collate->wcheads.p = 6;
1713 collate->wcheads.q = 10;
1714 wchead_table_init (&collate->wcheads);
1716 collate->wcseqorder.p = 6;
1717 collate->wcseqorder.q = 10;
1718 collseq_table_init (&collate->wcseqorder);
1720 /* Start adding. */
1721 runp = collate->start;
1722 while (runp != NULL)
1724 if (runp->wcs != NULL)
1726 struct element_t *e;
1727 struct element_t **eptr;
1728 struct element_t *lastp;
1730 /* Insert the collation sequence value. */
1731 if (runp->is_character)
1732 collseq_table_add (&collate->wcseqorder, runp->wcs[0],
1733 runp->wcseqorder);
1735 /* Find the point where to insert in the list. */
1736 e = wchead_table_get (&collate->wcheads, runp->wcs[0]);
1737 eptr = &e;
1738 lastp = NULL;
1739 while (*eptr != NULL)
1741 if ((*eptr)->nwcs < runp->nwcs)
1742 break;
1744 if ((*eptr)->nwcs == runp->nwcs)
1746 int c = wmemcmp ((wchar_t *) (*eptr)->wcs,
1747 (wchar_t *) runp->wcs, runp->nwcs);
1749 if (c == 0)
1751 /* This should not happen. It means that we have
1752 two symbols with the same byte sequence. It is
1753 of course an error. */
1754 WITH_CUR_LOCALE (error_at_line (0, 0, (*eptr)->file,
1755 (*eptr)->line,
1756 _("\
1757 symbol `%s' has the same encoding as"), (*eptr)->name);
1758 error_at_line (0, 0, runp->file,
1759 runp->line,
1760 _("symbol `%s'"),
1761 runp->name));
1762 goto dont_insertwc;
1764 else if (c < 0)
1765 /* Insert it here. */
1766 break;
1769 /* To the next entry. */
1770 lastp = *eptr;
1771 eptr = &(*eptr)->wcnext;
1774 /* Set the pointers. */
1775 runp->wcnext = *eptr;
1776 runp->wclast = lastp;
1777 if (*eptr != NULL)
1778 (*eptr)->wclast = runp;
1779 *eptr = runp;
1780 if (eptr == &e)
1781 wchead_table_add (&collate->wcheads, runp->wcs[0], e);
1782 dont_insertwc:
1786 /* Up to the next entry. */
1787 runp = runp->next;
1790 collseq_table_finalize (&collate->wcseqorder);
1792 /* Now determine whether the UNDEFINED entry is needed and if yes,
1793 whether it was defined. */
1794 collate->undefined.used_in_level = need_undefined ? ~0ul : 0;
1795 if (collate->undefined.file == NULL)
1797 if (need_undefined)
1799 /* This seems not to be enforced by recent standards. Don't
1800 emit an error, simply append UNDEFINED at the end. */
1801 if (0)
1802 WITH_CUR_LOCALE (error (0, 0, _("no definition of `UNDEFINED'")));
1804 /* Add UNDEFINED at the end. */
1805 collate->undefined.mborder =
1806 (int *) obstack_alloc (&collate->mempool, nrules * sizeof (int));
1808 for (i = 0; i < nrules; ++i)
1809 collate->undefined.mborder[i] = mbact[i]++;
1812 /* In any case we will need the definition for the wide character
1813 case. But we will not complain that it is missing since the
1814 specification strangely enough does not seem to account for
1815 this. */
1816 collate->undefined.wcorder = wcact++;
1819 /* Finally, try to unify the rules for the sections. Whenever the rules
1820 for a section are the same as those for another section give the
1821 ruleset the same index. Since there are never many section we can
1822 use an O(n^2) algorithm here. */
1823 sect = collate->sections;
1824 while (sect != NULL && sect->rules == NULL)
1825 sect = sect->next;
1827 /* Bail out if we have no sections because of earlier errors. */
1828 if (sect == NULL)
1830 WITH_CUR_LOCALE (error (EXIT_FAILURE, 0,
1831 _("too many errors; giving up")));
1832 return;
1835 ruleidx = 0;
1838 struct section_list *osect = collate->sections;
1840 while (osect != sect)
1841 if (osect->rules != NULL
1842 && memcmp (osect->rules, sect->rules, nrules) == 0)
1843 break;
1844 else
1845 osect = osect->next;
1847 if (osect == sect)
1848 sect->ruleidx = ruleidx++;
1849 else
1850 sect->ruleidx = osect->ruleidx;
1852 /* Next section. */
1854 sect = sect->next;
1855 while (sect != NULL && sect->rules == NULL);
1857 while (sect != NULL);
1858 /* We are currently not prepared for more than 128 rulesets. But this
1859 should never really be a problem. */
1860 assert (ruleidx <= 128);
1864 static int32_t
1865 output_weight (struct obstack *pool, struct locale_collate_t *collate,
1866 struct element_t *elem)
1868 size_t cnt;
1869 int32_t retval;
1871 /* Optimize the use of UNDEFINED. */
1872 if (elem == &collate->undefined)
1873 /* The weights are already inserted. */
1874 return 0;
1876 /* This byte can start exactly one collation element and this is
1877 a single byte. We can directly give the index to the weights. */
1878 retval = obstack_object_size (pool);
1880 /* Construct the weight. */
1881 for (cnt = 0; cnt < nrules; ++cnt)
1883 char buf[elem->weights[cnt].cnt * 7];
1884 int len = 0;
1885 int i;
1887 for (i = 0; i < elem->weights[cnt].cnt; ++i)
1888 /* Encode the weight value. We do nothing for IGNORE entries. */
1889 if (elem->weights[cnt].w[i] != NULL)
1890 len += utf8_encode (&buf[len],
1891 elem->weights[cnt].w[i]->mborder[cnt]);
1893 /* And add the buffer content. */
1894 obstack_1grow (pool, len);
1895 obstack_grow (pool, buf, len);
1898 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1902 static int32_t
1903 output_weightwc (struct obstack *pool, struct locale_collate_t *collate,
1904 struct element_t *elem)
1906 size_t cnt;
1907 int32_t retval;
1909 /* Optimize the use of UNDEFINED. */
1910 if (elem == &collate->undefined)
1911 /* The weights are already inserted. */
1912 return 0;
1914 /* This byte can start exactly one collation element and this is
1915 a single byte. We can directly give the index to the weights. */
1916 retval = obstack_object_size (pool) / sizeof (int32_t);
1918 /* Construct the weight. */
1919 for (cnt = 0; cnt < nrules; ++cnt)
1921 int32_t buf[elem->weights[cnt].cnt];
1922 int i;
1923 int32_t j;
1925 for (i = 0, j = 0; i < elem->weights[cnt].cnt; ++i)
1926 if (elem->weights[cnt].w[i] != NULL)
1927 buf[j++] = elem->weights[cnt].w[i]->wcorder;
1929 /* And add the buffer content. */
1930 obstack_int32_grow (pool, j);
1932 obstack_grow (pool, buf, j * sizeof (int32_t));
1935 return retval | ((elem->section->ruleidx & 0x7f) << 24);
1939 void
1940 collate_output (struct localedef_t *locale, const struct charmap_t *charmap,
1941 const char *output_path)
1943 struct locale_collate_t *collate = locale->categories[LC_COLLATE].collate;
1944 const size_t nelems = _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE);
1945 struct iovec iov[2 + nelems];
1946 struct locale_file data;
1947 uint32_t idx[nelems];
1948 size_t cnt;
1949 size_t ch;
1950 int32_t tablemb[256];
1951 struct obstack weightpool;
1952 struct obstack extrapool;
1953 struct obstack indirectpool;
1954 struct section_list *sect;
1955 struct collidx_table tablewc;
1956 uint32_t elem_size;
1957 uint32_t *elem_table;
1958 int i;
1959 struct element_t *runp;
1961 data.magic = LIMAGIC (LC_COLLATE);
1962 data.n = nelems;
1963 iov[0].iov_base = (void *) &data;
1964 iov[0].iov_len = sizeof (data);
1966 iov[1].iov_base = (void *) idx;
1967 iov[1].iov_len = sizeof (idx);
1969 idx[0] = iov[0].iov_len + iov[1].iov_len;
1970 cnt = 0;
1972 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_NRULES));
1973 iov[2 + cnt].iov_base = &nrules;
1974 iov[2 + cnt].iov_len = sizeof (uint32_t);
1975 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1976 ++cnt;
1978 /* If we have no LC_COLLATE data emit only the number of rules as zero. */
1979 if (collate == NULL)
1981 int32_t dummy = 0;
1983 while (cnt < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1985 /* The words have to be handled specially. */
1986 if (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB))
1988 iov[2 + cnt].iov_base = &dummy;
1989 iov[2 + cnt].iov_len = sizeof (int32_t);
1991 else
1993 iov[2 + cnt].iov_base = NULL;
1994 iov[2 + cnt].iov_len = 0;
1997 if (cnt + 1 < _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE))
1998 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
1999 ++cnt;
2002 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2004 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2006 return;
2009 obstack_init (&weightpool);
2010 obstack_init (&extrapool);
2011 obstack_init (&indirectpool);
2013 /* Since we are using the sign of an integer to mark indirection the
2014 offsets in the arrays we are indirectly referring to must not be
2015 zero since -0 == 0. Therefore we add a bit of dummy content. */
2016 obstack_int32_grow (&extrapool, 0);
2017 obstack_int32_grow (&indirectpool, 0);
2019 /* Prepare the ruleset table. */
2020 for (sect = collate->sections, i = 0; sect != NULL; sect = sect->next)
2021 if (sect->rules != NULL && sect->ruleidx == i)
2023 int j;
2025 obstack_make_room (&weightpool, nrules);
2027 for (j = 0; j < nrules; ++j)
2028 obstack_1grow_fast (&weightpool, sect->rules[j]);
2029 ++i;
2031 /* And align the output. */
2032 i = (nrules * i) % __alignof__ (int32_t);
2033 if (i > 0)
2035 obstack_1grow (&weightpool, '\0');
2036 while (++i < __alignof__ (int32_t));
2038 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_RULESETS));
2039 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2040 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2041 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2042 ++cnt;
2044 /* Generate the 8-bit table. Walk through the lists of sequences
2045 starting with the same byte and add them one after the other to
2046 the table. In case we have more than one sequence starting with
2047 the same byte we have to use extra indirection.
2049 First add a record for the NUL byte. This entry will never be used
2050 so it does not matter. */
2051 tablemb[0] = 0;
2053 /* Now insert the `UNDEFINED' value if it is used. Since this value
2054 will probably be used more than once it is good to store the
2055 weights only once. */
2056 if (collate->undefined.used_in_level != 0)
2057 output_weight (&weightpool, collate, &collate->undefined);
2059 for (ch = 1; ch < 256; ++ch)
2060 if (collate->mbheads[ch]->mbnext == NULL
2061 && collate->mbheads[ch]->nmbs <= 1)
2063 tablemb[ch] = output_weight (&weightpool, collate,
2064 collate->mbheads[ch]);
2066 else
2068 /* The entries in the list are sorted by length and then
2069 alphabetically. This is the order in which we will add the
2070 elements to the collation table. This allows simply walking
2071 the table in sequence and stopping at the first matching
2072 entry. Since the longer sequences are coming first in the
2073 list they have the possibility to match first, just as it
2074 has to be. In the worst case we are walking to the end of
2075 the list where we put, if no singlebyte sequence is defined
2076 in the locale definition, the weights for UNDEFINED.
2078 To reduce the length of the search list we compress them a bit.
2079 This happens by collecting sequences of consecutive byte
2080 sequences in one entry (having and begin and end byte sequence)
2081 and add only one index into the weight table. We can find the
2082 consecutive entries since they are also consecutive in the list. */
2083 struct element_t *runp = collate->mbheads[ch];
2084 struct element_t *lastp;
2086 assert ((obstack_object_size (&extrapool)
2087 & (__alignof__ (int32_t) - 1)) == 0);
2089 tablemb[ch] = -obstack_object_size (&extrapool);
2093 /* Store the current index in the weight table. We know that
2094 the current position in the `extrapool' is aligned on a
2095 32-bit address. */
2096 int32_t weightidx;
2097 int added;
2099 /* Find out whether this is a single entry or we have more than
2100 one consecutive entry. */
2101 if (runp->mbnext != NULL
2102 && runp->nmbs == runp->mbnext->nmbs
2103 && memcmp (runp->mbs, runp->mbnext->mbs, runp->nmbs - 1) == 0
2104 && (runp->mbs[runp->nmbs - 1]
2105 == runp->mbnext->mbs[runp->nmbs - 1] + 1))
2107 int i;
2108 struct element_t *series_startp = runp;
2109 struct element_t *curp;
2111 /* Compute how much space we will need. */
2112 added = ((sizeof (int32_t) + 1 + 2 * (runp->nmbs - 1)
2113 + __alignof__ (int32_t) - 1)
2114 & ~(__alignof__ (int32_t) - 1));
2115 assert ((obstack_object_size (&extrapool)
2116 & (__alignof__ (int32_t) - 1)) == 0);
2117 obstack_make_room (&extrapool, added);
2119 /* More than one consecutive entry. We mark this by having
2120 a negative index into the indirect table. */
2121 obstack_int32_grow_fast (&extrapool,
2122 -(obstack_object_size (&indirectpool)
2123 / sizeof (int32_t)));
2125 /* Now search first the end of the series. */
2127 runp = runp->mbnext;
2128 while (runp->mbnext != NULL
2129 && runp->nmbs == runp->mbnext->nmbs
2130 && memcmp (runp->mbs, runp->mbnext->mbs,
2131 runp->nmbs - 1) == 0
2132 && (runp->mbs[runp->nmbs - 1]
2133 == runp->mbnext->mbs[runp->nmbs - 1] + 1));
2135 /* Now walk backward from here to the beginning. */
2136 curp = runp;
2138 assert (runp->nmbs <= 256);
2139 obstack_1grow_fast (&extrapool, curp->nmbs - 1);
2140 for (i = 1; i < curp->nmbs; ++i)
2141 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2143 /* Now find the end of the consecutive sequence and
2144 add all the indeces in the indirect pool. */
2147 weightidx = output_weight (&weightpool, collate, curp);
2148 obstack_int32_grow (&indirectpool, weightidx);
2150 curp = curp->mblast;
2152 while (curp != series_startp);
2154 /* Add the final weight. */
2155 weightidx = output_weight (&weightpool, collate, curp);
2156 obstack_int32_grow (&indirectpool, weightidx);
2158 /* And add the end byte sequence. Without length this
2159 time. */
2160 for (i = 1; i < curp->nmbs; ++i)
2161 obstack_1grow_fast (&extrapool, curp->mbs[i]);
2163 else
2165 /* A single entry. Simply add the index and the length and
2166 string (except for the first character which is already
2167 tested for). */
2168 int i;
2170 /* Output the weight info. */
2171 weightidx = output_weight (&weightpool, collate, runp);
2173 added = ((sizeof (int32_t) + 1 + runp->nmbs - 1
2174 + __alignof__ (int32_t) - 1)
2175 & ~(__alignof__ (int32_t) - 1));
2176 assert ((obstack_object_size (&extrapool)
2177 & (__alignof__ (int32_t) - 1)) == 0);
2178 obstack_make_room (&extrapool, added);
2180 obstack_int32_grow_fast (&extrapool, weightidx);
2181 assert (runp->nmbs <= 256);
2182 obstack_1grow_fast (&extrapool, runp->nmbs - 1);
2184 for (i = 1; i < runp->nmbs; ++i)
2185 obstack_1grow_fast (&extrapool, runp->mbs[i]);
2188 /* Add alignment bytes if necessary. */
2189 while ((obstack_object_size (&extrapool)
2190 & (__alignof__ (int32_t) - 1)) != 0)
2191 obstack_1grow_fast (&extrapool, '\0');
2193 /* Next entry. */
2194 lastp = runp;
2195 runp = runp->mbnext;
2197 while (runp != NULL);
2199 assert ((obstack_object_size (&extrapool)
2200 & (__alignof__ (int32_t) - 1)) == 0);
2202 /* If the final entry in the list is not a single character we
2203 add an UNDEFINED entry here. */
2204 if (lastp->nmbs != 1)
2206 int added = ((sizeof (int32_t) + 1 + 1 + __alignof__ (int32_t) - 1)
2207 & ~(__alignof__ (int32_t) - 1));
2208 obstack_make_room (&extrapool, added);
2210 obstack_int32_grow_fast (&extrapool, 0);
2211 /* XXX What rule? We just pick the first. */
2212 obstack_1grow_fast (&extrapool, 0);
2213 /* Length is zero. */
2214 obstack_1grow_fast (&extrapool, 0);
2216 /* Add alignment bytes if necessary. */
2217 while ((obstack_object_size (&extrapool)
2218 & (__alignof__ (int32_t) - 1)) != 0)
2219 obstack_1grow_fast (&extrapool, '\0');
2223 /* Add padding to the tables if necessary. */
2224 while ((obstack_object_size (&weightpool) & (__alignof__ (int32_t) - 1))
2225 != 0)
2226 obstack_1grow (&weightpool, 0);
2228 /* Now add the four tables. */
2229 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEMB));
2230 iov[2 + cnt].iov_base = tablemb;
2231 iov[2 + cnt].iov_len = sizeof (tablemb);
2232 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2233 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2234 ++cnt;
2236 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTMB));
2237 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2238 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2239 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2240 ++cnt;
2242 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAMB));
2243 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2244 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2245 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2246 ++cnt;
2248 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTMB));
2249 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2250 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2251 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2252 assert ((iov[2 + cnt].iov_len & (__alignof__ (int32_t) - 1)) == 0);
2253 ++cnt;
2256 /* Now the same for the wide character table. We need to store some
2257 more information here. */
2258 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP1));
2259 iov[2 + cnt].iov_base = NULL;
2260 iov[2 + cnt].iov_len = 0;
2261 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2262 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2263 ++cnt;
2265 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP2));
2266 iov[2 + cnt].iov_base = NULL;
2267 iov[2 + cnt].iov_len = 0;
2268 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2269 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2270 ++cnt;
2272 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_GAP3));
2273 iov[2 + cnt].iov_base = NULL;
2274 iov[2 + cnt].iov_len = 0;
2275 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2276 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2277 ++cnt;
2279 /* Since we are using the sign of an integer to mark indirection the
2280 offsets in the arrays we are indirectly referring to must not be
2281 zero since -0 == 0. Therefore we add a bit of dummy content. */
2282 obstack_int32_grow (&extrapool, 0);
2283 obstack_int32_grow (&indirectpool, 0);
2285 /* Now insert the `UNDEFINED' value if it is used. Since this value
2286 will probably be used more than once it is good to store the
2287 weights only once. */
2288 if (output_weightwc (&weightpool, collate, &collate->undefined) != 0)
2289 abort ();
2291 /* Generate the table. Walk through the lists of sequences starting
2292 with the same wide character and add them one after the other to
2293 the table. In case we have more than one sequence starting with
2294 the same byte we have to use extra indirection. */
2296 auto void add_to_tablewc (uint32_t ch, struct element_t *runp);
2298 void add_to_tablewc (uint32_t ch, struct element_t *runp)
2300 if (runp->wcnext == NULL && runp->nwcs == 1)
2302 int32_t weigthidx = output_weightwc (&weightpool, collate, runp);
2303 collidx_table_add (&tablewc, ch, weigthidx);
2305 else
2307 /* As for the singlebyte table, we recognize sequences and
2308 compress them. */
2309 struct element_t *lastp;
2311 collidx_table_add (&tablewc, ch,
2312 -(obstack_object_size (&extrapool) / sizeof (uint32_t)));
2316 /* Store the current index in the weight table. We know that
2317 the current position in the `extrapool' is aligned on a
2318 32-bit address. */
2319 int32_t weightidx;
2320 int added;
2322 /* Find out whether this is a single entry or we have more than
2323 one consecutive entry. */
2324 if (runp->wcnext != NULL
2325 && runp->nwcs == runp->wcnext->nwcs
2326 && wmemcmp ((wchar_t *) runp->wcs,
2327 (wchar_t *)runp->wcnext->wcs,
2328 runp->nwcs - 1) == 0
2329 && (runp->wcs[runp->nwcs - 1]
2330 == runp->wcnext->wcs[runp->nwcs - 1] + 1))
2332 int i;
2333 struct element_t *series_startp = runp;
2334 struct element_t *curp;
2336 /* Now add first the initial byte sequence. */
2337 added = (1 + 1 + 2 * (runp->nwcs - 1)) * sizeof (int32_t);
2338 if (sizeof (int32_t) == sizeof (int))
2339 obstack_make_room (&extrapool, added);
2341 /* More than one consecutive entry. We mark this by having
2342 a negative index into the indirect table. */
2343 obstack_int32_grow_fast (&extrapool,
2344 -(obstack_object_size (&indirectpool)
2345 / sizeof (int32_t)));
2346 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2349 runp = runp->wcnext;
2350 while (runp->wcnext != NULL
2351 && runp->nwcs == runp->wcnext->nwcs
2352 && wmemcmp ((wchar_t *) runp->wcs,
2353 (wchar_t *)runp->wcnext->wcs,
2354 runp->nwcs - 1) == 0
2355 && (runp->wcs[runp->nwcs - 1]
2356 == runp->wcnext->wcs[runp->nwcs - 1] + 1));
2358 /* Now walk backward from here to the beginning. */
2359 curp = runp;
2361 for (i = 1; i < runp->nwcs; ++i)
2362 obstack_int32_grow_fast (&extrapool, curp->wcs[i]);
2364 /* Now find the end of the consecutive sequence and
2365 add all the indeces in the indirect pool. */
2368 weightidx = output_weightwc (&weightpool, collate,
2369 curp);
2370 obstack_int32_grow (&indirectpool, weightidx);
2372 curp = curp->wclast;
2374 while (curp != series_startp);
2376 /* Add the final weight. */
2377 weightidx = output_weightwc (&weightpool, collate, curp);
2378 obstack_int32_grow (&indirectpool, weightidx);
2380 /* And add the end byte sequence. Without length this
2381 time. */
2382 for (i = 1; i < curp->nwcs; ++i)
2383 obstack_int32_grow (&extrapool, curp->wcs[i]);
2385 else
2387 /* A single entry. Simply add the index and the length and
2388 string (except for the first character which is already
2389 tested for). */
2390 int i;
2392 /* Output the weight info. */
2393 weightidx = output_weightwc (&weightpool, collate, runp);
2395 added = (1 + 1 + runp->nwcs - 1) * sizeof (int32_t);
2396 if (sizeof (int) == sizeof (int32_t))
2397 obstack_make_room (&extrapool, added);
2399 obstack_int32_grow_fast (&extrapool, weightidx);
2400 obstack_int32_grow_fast (&extrapool, runp->nwcs - 1);
2401 for (i = 1; i < runp->nwcs; ++i)
2402 obstack_int32_grow_fast (&extrapool, runp->wcs[i]);
2405 /* Next entry. */
2406 lastp = runp;
2407 runp = runp->wcnext;
2409 while (runp != NULL);
2413 tablewc.p = 6;
2414 tablewc.q = 10;
2415 collidx_table_init (&tablewc);
2417 wchead_table_iterate (&collate->wcheads, add_to_tablewc);
2419 collidx_table_finalize (&tablewc);
2422 /* Now add the four tables. */
2423 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_TABLEWC));
2424 iov[2 + cnt].iov_base = tablewc.result;
2425 iov[2 + cnt].iov_len = tablewc.result_size;
2426 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2427 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2428 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2429 ++cnt;
2431 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_WEIGHTWC));
2432 iov[2 + cnt].iov_len = obstack_object_size (&weightpool);
2433 iov[2 + cnt].iov_base = obstack_finish (&weightpool);
2434 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2435 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2436 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2437 ++cnt;
2439 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_EXTRAWC));
2440 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2441 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2442 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2443 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2444 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2445 ++cnt;
2447 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_INDIRECTWC));
2448 iov[2 + cnt].iov_len = obstack_object_size (&indirectpool);
2449 iov[2 + cnt].iov_base = obstack_finish (&indirectpool);
2450 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2451 assert (iov[2 + cnt].iov_len % sizeof (int32_t) == 0);
2452 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2453 ++cnt;
2456 /* Finally write the table with collation element names out. It is
2457 a hash table with a simple function which gets the name of the
2458 character as the input. One character might have many names. The
2459 value associated with the name is an index into the weight table
2460 where we are then interested in the first-level weight value.
2462 To determine how large the table should be we are counting the
2463 elements we have to put in. Since we are using internal chaining
2464 using a secondary hash function we have to make the table a bit
2465 larger to avoid extremely long search times. We can achieve
2466 good results with a 40% larger table than there are entries. */
2467 elem_size = 0;
2468 runp = collate->start;
2469 while (runp != NULL)
2471 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2472 /* Yep, the element really counts. */
2473 ++elem_size;
2475 runp = runp->next;
2477 /* Add 40% and find the next prime number. */
2478 elem_size = next_prime (elem_size * 1.4);
2480 /* Allocate the table. Each entry consists of two words: the hash
2481 value and an index in a secondary table which provides the index
2482 into the weight table and the string itself (so that a match can
2483 be determined). */
2484 elem_table = (uint32_t *) obstack_alloc (&extrapool,
2485 elem_size * 2 * sizeof (uint32_t));
2486 memset (elem_table, '\0', elem_size * 2 * sizeof (uint32_t));
2488 /* Now add the elements. */
2489 runp = collate->start;
2490 while (runp != NULL)
2492 if (runp->mbs != NULL && runp->weights != NULL && !runp->is_character)
2494 /* Compute the hash value of the name. */
2495 uint32_t namelen = strlen (runp->name);
2496 uint32_t hash = elem_hash (runp->name, namelen);
2497 size_t idx = hash % elem_size;
2498 size_t start_idx = idx;
2500 if (elem_table[idx * 2] != 0)
2502 /* The spot is already taken. Try iterating using the value
2503 from the secondary hashing function. */
2504 size_t iter = hash % (elem_size - 2) + 1;
2508 idx += iter;
2509 if (idx >= elem_size)
2510 idx -= elem_size;
2511 assert (idx != start_idx);
2513 while (elem_table[idx * 2] != 0);
2515 /* This is the spot where we will insert the value. */
2516 elem_table[idx * 2] = hash;
2517 elem_table[idx * 2 + 1] = obstack_object_size (&extrapool);
2519 /* Then the string itself including length. */
2520 obstack_1grow (&extrapool, namelen);
2521 obstack_grow (&extrapool, runp->name, namelen);
2523 /* And the multibyte representation. */
2524 obstack_1grow (&extrapool, runp->nmbs);
2525 obstack_grow (&extrapool, runp->mbs, runp->nmbs);
2527 /* And align again to 32 bits. */
2528 if ((1 + namelen + 1 + runp->nmbs) % sizeof (int32_t) != 0)
2529 obstack_grow (&extrapool, "\0\0",
2530 (sizeof (int32_t)
2531 - ((1 + namelen + 1 + runp->nmbs)
2532 % sizeof (int32_t))));
2534 /* Now some 32-bit values: multibyte collation sequence,
2535 wide char string (including length), and wide char
2536 collation sequence. */
2537 obstack_int32_grow (&extrapool, runp->mbseqorder);
2539 obstack_int32_grow (&extrapool, runp->nwcs);
2540 obstack_grow (&extrapool, runp->wcs,
2541 runp->nwcs * sizeof (uint32_t));
2543 obstack_int32_grow (&extrapool, runp->wcseqorder);
2546 runp = runp->next;
2549 /* Prepare to write out this data. */
2550 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_HASH_SIZEMB));
2551 iov[2 + cnt].iov_base = &elem_size;
2552 iov[2 + cnt].iov_len = sizeof (int32_t);
2553 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2554 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2555 ++cnt;
2557 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_TABLEMB));
2558 iov[2 + cnt].iov_base = elem_table;
2559 iov[2 + cnt].iov_len = elem_size * 2 * sizeof (int32_t);
2560 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2561 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2562 ++cnt;
2564 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_SYMB_EXTRAMB));
2565 iov[2 + cnt].iov_len = obstack_object_size (&extrapool);
2566 iov[2 + cnt].iov_base = obstack_finish (&extrapool);
2567 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2568 ++cnt;
2570 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQMB));
2571 iov[2 + cnt].iov_base = collate->mbseqorder;
2572 iov[2 + cnt].iov_len = 256;
2573 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2574 ++cnt;
2576 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_COLLSEQWC));
2577 iov[2 + cnt].iov_base = collate->wcseqorder.result;
2578 iov[2 + cnt].iov_len = collate->wcseqorder.result_size;
2579 idx[1 + cnt] = idx[cnt] + iov[2 + cnt].iov_len;
2580 assert (idx[cnt] % __alignof__ (int32_t) == 0);
2581 ++cnt;
2583 assert (cnt == _NL_ITEM_INDEX (_NL_COLLATE_CODESET));
2584 iov[2 + cnt].iov_base = (void *) charmap->code_set_name;
2585 iov[2 + cnt].iov_len = strlen (iov[2 + cnt].iov_base) + 1;
2586 ++cnt;
2588 assert (cnt == _NL_ITEM_INDEX (_NL_NUM_LC_COLLATE));
2590 write_locale_data (output_path, LC_COLLATE, "LC_COLLATE", 2 + cnt, iov);
2592 obstack_free (&weightpool, NULL);
2593 obstack_free (&extrapool, NULL);
2594 obstack_free (&indirectpool, NULL);
2598 void
2599 collate_read (struct linereader *ldfile, struct localedef_t *result,
2600 const struct charmap_t *charmap, const char *repertoire_name,
2601 int ignore_content)
2603 struct repertoire_t *repertoire = NULL;
2604 struct locale_collate_t *collate;
2605 struct token *now;
2606 struct token *arg = NULL;
2607 enum token_t nowtok;
2608 enum token_t was_ellipsis = tok_none;
2609 struct localedef_t *copy_locale = NULL;
2610 /* Parsing state:
2611 0 - start
2612 1 - between `order-start' and `order-end'
2613 2 - after `order-end'
2614 3 - after `reorder-after', waiting for `reorder-end'
2615 4 - after `reorder-end'
2616 5 - after `reorder-sections-after', waiting for `reorder-sections-end'
2617 6 - after `reorder-sections-end'
2619 int state = 0;
2621 /* Get the repertoire we have to use. */
2622 if (repertoire_name != NULL)
2623 repertoire = repertoire_read (repertoire_name);
2625 /* The rest of the line containing `LC_COLLATE' must be free. */
2626 lr_ignore_rest (ldfile, 1);
2630 now = lr_token (ldfile, charmap, result, NULL, verbose);
2631 nowtok = now->tok;
2633 while (nowtok == tok_eol);
2635 if (nowtok == tok_copy)
2637 state = 2;
2638 now = lr_token (ldfile, charmap, result, NULL, verbose);
2639 if (now->tok != tok_string)
2641 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
2643 skip_category:
2645 now = lr_token (ldfile, charmap, result, NULL, verbose);
2646 while (now->tok != tok_eof && now->tok != tok_end);
2648 if (now->tok != tok_eof
2649 || (now = lr_token (ldfile, charmap, result, NULL, verbose),
2650 now->tok == tok_eof))
2651 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");
2652 else if (now->tok != tok_lc_collate)
2654 lr_error (ldfile, _("\
2655 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
2656 lr_ignore_rest (ldfile, 0);
2658 else
2659 lr_ignore_rest (ldfile, 1);
2661 return;
2664 if (! ignore_content)
2666 /* Get the locale definition. */
2667 copy_locale = load_locale (LC_COLLATE, now->val.str.startmb,
2668 repertoire_name, charmap, NULL);
2669 if ((copy_locale->avail & COLLATE_LOCALE) == 0)
2671 /* Not yet loaded. So do it now. */
2672 if (locfile_read (copy_locale, charmap) != 0)
2673 goto skip_category;
2676 if (copy_locale->categories[LC_COLLATE].collate == NULL)
2677 return;
2680 lr_ignore_rest (ldfile, 1);
2682 now = lr_token (ldfile, charmap, result, NULL, verbose);
2683 nowtok = now->tok;
2686 /* Prepare the data structures. */
2687 collate_startup (ldfile, result, copy_locale, ignore_content);
2688 collate = result->categories[LC_COLLATE].collate;
2690 while (1)
2692 char ucs4buf[10];
2693 char *symstr;
2694 size_t symlen;
2696 /* Of course we don't proceed beyond the end of file. */
2697 if (nowtok == tok_eof)
2698 break;
2700 /* Ignore empty lines. */
2701 if (nowtok == tok_eol)
2703 now = lr_token (ldfile, charmap, result, NULL, verbose);
2704 nowtok = now->tok;
2705 continue;
2708 switch (nowtok)
2710 case tok_copy:
2711 /* Allow copying other locales. */
2712 now = lr_token (ldfile, charmap, result, NULL, verbose);
2713 if (now->tok != tok_string)
2714 goto err_label;
2716 if (! ignore_content)
2717 load_locale (LC_COLLATE, now->val.str.startmb, repertoire_name,
2718 charmap, result);
2720 lr_ignore_rest (ldfile, 1);
2721 break;
2723 case tok_coll_weight_max:
2724 /* Ignore the rest of the line if we don't need the input of
2725 this line. */
2726 if (ignore_content)
2728 lr_ignore_rest (ldfile, 0);
2729 break;
2732 if (state != 0)
2733 goto err_label;
2735 arg = lr_token (ldfile, charmap, result, NULL, verbose);
2736 if (arg->tok != tok_number)
2737 goto err_label;
2738 if (collate->col_weight_max != -1)
2739 lr_error (ldfile, _("%s: duplicate definition of `%s'"),
2740 "LC_COLLATE", "col_weight_max");
2741 else
2742 collate->col_weight_max = arg->val.num;
2743 lr_ignore_rest (ldfile, 1);
2744 break;
2746 case tok_section_symbol:
2747 /* Ignore the rest of the line if we don't need the input of
2748 this line. */
2749 if (ignore_content)
2751 lr_ignore_rest (ldfile, 0);
2752 break;
2755 if (state != 0)
2756 goto err_label;
2758 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2759 if (arg->tok != tok_bsymbol)
2760 goto err_label;
2761 else if (!ignore_content)
2763 /* Check whether this section is already known. */
2764 struct section_list *known = collate->sections;
2765 while (known != NULL)
2767 if (strcmp (known->name, arg->val.str.startmb) == 0)
2768 break;
2769 known = known->next;
2772 if (known != NULL)
2774 lr_error (ldfile,
2775 _("%s: duplicate declaration of section `%s'"),
2776 "LC_COLLATE", arg->val.str.startmb);
2777 free (arg->val.str.startmb);
2779 else
2780 collate->sections = make_seclist_elem (collate,
2781 arg->val.str.startmb,
2782 collate->sections);
2784 lr_ignore_rest (ldfile, known == NULL);
2786 else
2788 free (arg->val.str.startmb);
2789 lr_ignore_rest (ldfile, 0);
2791 break;
2793 case tok_collating_element:
2794 /* Ignore the rest of the line if we don't need the input of
2795 this line. */
2796 if (ignore_content)
2798 lr_ignore_rest (ldfile, 0);
2799 break;
2802 if (state != 0 && state != 2)
2803 goto err_label;
2805 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2806 if (arg->tok != tok_bsymbol)
2807 goto err_label;
2808 else
2810 const char *symbol = arg->val.str.startmb;
2811 size_t symbol_len = arg->val.str.lenmb;
2813 /* Next the `from' keyword. */
2814 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2815 if (arg->tok != tok_from)
2817 free ((char *) symbol);
2818 goto err_label;
2821 ldfile->return_widestr = 1;
2822 ldfile->translate_strings = 1;
2824 /* Finally the string with the replacement. */
2825 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2827 ldfile->return_widestr = 0;
2828 ldfile->translate_strings = 0;
2830 if (arg->tok != tok_string)
2831 goto err_label;
2833 if (!ignore_content && symbol != NULL)
2835 /* The name is already defined. */
2836 if (check_duplicate (ldfile, collate, charmap,
2837 repertoire, symbol, symbol_len))
2838 goto col_elem_free;
2840 if (arg->val.str.startmb != NULL)
2841 insert_entry (&collate->elem_table, symbol, symbol_len,
2842 new_element (collate,
2843 arg->val.str.startmb,
2844 arg->val.str.lenmb - 1,
2845 arg->val.str.startwc,
2846 symbol, symbol_len, 0));
2848 else
2850 col_elem_free:
2851 if (symbol != NULL)
2852 free ((char *) symbol);
2853 if (arg->val.str.startmb != NULL)
2854 free (arg->val.str.startmb);
2855 if (arg->val.str.startwc != NULL)
2856 free (arg->val.str.startwc);
2858 lr_ignore_rest (ldfile, 1);
2860 break;
2862 case tok_collating_symbol:
2863 /* Ignore the rest of the line if we don't need the input of
2864 this line. */
2865 if (ignore_content)
2867 lr_ignore_rest (ldfile, 0);
2868 break;
2871 if (state != 0 && state != 2)
2872 goto err_label;
2874 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2875 if (arg->tok != tok_bsymbol)
2876 goto err_label;
2877 else
2879 char *symbol = arg->val.str.startmb;
2880 size_t symbol_len = arg->val.str.lenmb;
2881 char *endsymbol = NULL;
2882 size_t endsymbol_len = 0;
2883 enum token_t ellipsis = tok_none;
2885 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
2886 if (arg->tok == tok_ellipsis2 || arg->tok == tok_ellipsis4)
2888 ellipsis = arg->tok;
2890 arg = lr_token (ldfile, charmap, result, repertoire,
2891 verbose);
2892 if (arg->tok != tok_bsymbol)
2894 free (symbol);
2895 goto err_label;
2898 endsymbol = arg->val.str.startmb;
2899 endsymbol_len = arg->val.str.lenmb;
2901 lr_ignore_rest (ldfile, 1);
2903 else if (arg->tok != tok_eol)
2905 free (symbol);
2906 goto err_label;
2909 if (!ignore_content)
2911 if (symbol == NULL
2912 || (ellipsis != tok_none && endsymbol == NULL))
2914 lr_error (ldfile, _("\
2915 %s: unknown character in collating symbol name"),
2916 "LC_COLLATE");
2917 goto col_sym_free;
2919 else if (ellipsis == tok_none)
2921 /* A single symbol, no ellipsis. */
2922 if (check_duplicate (ldfile, collate, charmap,
2923 repertoire, symbol, symbol_len))
2924 /* The name is already defined. */
2925 goto col_sym_free;
2927 insert_entry (&collate->sym_table, symbol, symbol_len,
2928 new_symbol (collate, symbol, symbol_len));
2930 else if (symbol_len != endsymbol_len)
2932 col_sym_inv_range:
2933 lr_error (ldfile,
2934 _("invalid names for character range"));
2935 goto col_sym_free;
2937 else
2939 /* Oh my, we have to handle an ellipsis. First, as
2940 usual, determine the common prefix and then
2941 convert the rest into a range. */
2942 size_t prefixlen;
2943 unsigned long int from;
2944 unsigned long int to;
2945 char *endp;
2947 for (prefixlen = 0; prefixlen < symbol_len; ++prefixlen)
2948 if (symbol[prefixlen] != endsymbol[prefixlen])
2949 break;
2951 /* Convert the rest into numbers. */
2952 symbol[symbol_len] = '\0';
2953 from = strtoul (&symbol[prefixlen], &endp,
2954 ellipsis == tok_ellipsis2 ? 16 : 10);
2955 if (*endp != '\0')
2956 goto col_sym_inv_range;
2958 endsymbol[symbol_len] = '\0';
2959 to = strtoul (&endsymbol[prefixlen], &endp,
2960 ellipsis == tok_ellipsis2 ? 16 : 10);
2961 if (*endp != '\0')
2962 goto col_sym_inv_range;
2964 if (from > to)
2965 goto col_sym_inv_range;
2967 /* Now loop over all entries. */
2968 while (from <= to)
2970 char *symbuf;
2972 symbuf = (char *) obstack_alloc (&collate->mempool,
2973 symbol_len + 1);
2975 /* Create the name. */
2976 sprintf (symbuf,
2977 ellipsis == tok_ellipsis2
2978 ? "%.*s%.*lX" : "%.*s%.*lu",
2979 (int) prefixlen, symbol,
2980 (int) (symbol_len - prefixlen), from);
2982 if (check_duplicate (ldfile, collate, charmap,
2983 repertoire, symbuf, symbol_len))
2984 /* The name is already defined. */
2985 goto col_sym_free;
2987 insert_entry (&collate->sym_table, symbuf,
2988 symbol_len,
2989 new_symbol (collate, symbuf,
2990 symbol_len));
2992 /* Increment the counter. */
2993 ++from;
2996 goto col_sym_free;
2999 else
3001 col_sym_free:
3002 if (symbol != NULL)
3003 free (symbol);
3004 if (endsymbol != NULL)
3005 free (endsymbol);
3008 break;
3010 case tok_symbol_equivalence:
3011 /* Ignore the rest of the line if we don't need the input of
3012 this line. */
3013 if (ignore_content)
3015 lr_ignore_rest (ldfile, 0);
3016 break;
3019 if (state != 0)
3020 goto err_label;
3022 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3023 if (arg->tok != tok_bsymbol)
3024 goto err_label;
3025 else
3027 const char *newname = arg->val.str.startmb;
3028 size_t newname_len = arg->val.str.lenmb;
3029 const char *symname;
3030 size_t symname_len;
3031 void *symval; /* Actually struct symbol_t* */
3033 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3034 if (arg->tok != tok_bsymbol)
3036 if (newname != NULL)
3037 free ((char *) newname);
3038 goto err_label;
3041 symname = arg->val.str.startmb;
3042 symname_len = arg->val.str.lenmb;
3044 if (newname == NULL)
3046 lr_error (ldfile, _("\
3047 %s: unknown character in equivalent definition name"),
3048 "LC_COLLATE");
3050 sym_equiv_free:
3051 if (newname != NULL)
3052 free ((char *) newname);
3053 if (symname != NULL)
3054 free ((char *) symname);
3055 break;
3057 if (symname == NULL)
3059 lr_error (ldfile, _("\
3060 %s: unknown character in equivalent definition value"),
3061 "LC_COLLATE");
3062 goto sym_equiv_free;
3065 /* See whether the symbol name is already defined. */
3066 if (find_entry (&collate->sym_table, symname, symname_len,
3067 &symval) != 0)
3069 lr_error (ldfile, _("\
3070 %s: unknown symbol `%s' in equivalent definition"),
3071 "LC_COLLATE", symname);
3072 goto sym_equiv_free;
3075 if (insert_entry (&collate->sym_table,
3076 newname, newname_len, symval) < 0)
3078 lr_error (ldfile, _("\
3079 error while adding equivalent collating symbol"));
3080 goto sym_equiv_free;
3083 free ((char *) symname);
3085 lr_ignore_rest (ldfile, 1);
3086 break;
3088 case tok_script:
3089 /* We get told about the scripts we know. */
3090 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3091 if (arg->tok != tok_bsymbol)
3092 goto err_label;
3093 else
3095 struct section_list *runp = collate->known_sections;
3096 char *name;
3098 while (runp != NULL)
3099 if (strncmp (runp->name, arg->val.str.startmb,
3100 arg->val.str.lenmb) == 0
3101 && runp->name[arg->val.str.lenmb] == '\0')
3102 break;
3103 else
3104 runp = runp->def_next;
3106 if (runp != NULL)
3108 lr_error (ldfile, _("duplicate definition of script `%s'"),
3109 runp->name);
3110 lr_ignore_rest (ldfile, 0);
3111 break;
3114 runp = (struct section_list *) xcalloc (1, sizeof (*runp));
3115 name = (char *) xmalloc (arg->val.str.lenmb + 1);
3116 memcpy (name, arg->val.str.startmb, arg->val.str.lenmb);
3117 name[arg->val.str.lenmb] = '\0';
3118 runp->name = name;
3120 runp->def_next = collate->known_sections;
3121 collate->known_sections = runp;
3123 lr_ignore_rest (ldfile, 1);
3124 break;
3126 case tok_order_start:
3127 /* Ignore the rest of the line if we don't need the input of
3128 this line. */
3129 if (ignore_content)
3131 lr_ignore_rest (ldfile, 0);
3132 break;
3135 if (state != 0 && state != 1 && state != 2)
3136 goto err_label;
3137 state = 1;
3139 /* The 14652 draft does not specify whether all `order_start' lines
3140 must contain the same number of sort-rules, but 14651 does. So
3141 we require this here as well. */
3142 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3143 if (arg->tok == tok_bsymbol)
3145 /* This better should be a section name. */
3146 struct section_list *sp = collate->known_sections;
3147 while (sp != NULL
3148 && (sp->name == NULL
3149 || strncmp (sp->name, arg->val.str.startmb,
3150 arg->val.str.lenmb) != 0
3151 || sp->name[arg->val.str.lenmb] != '\0'))
3152 sp = sp->def_next;
3154 if (sp == NULL)
3156 lr_error (ldfile, _("\
3157 %s: unknown section name `%.*s'"),
3158 "LC_COLLATE", (int) arg->val.str.lenmb,
3159 arg->val.str.startmb);
3160 /* We use the error section. */
3161 collate->current_section = &collate->error_section;
3163 if (collate->error_section.first == NULL)
3165 /* Insert &collate->error_section at the end of
3166 the collate->sections list. */
3167 if (collate->sections == NULL)
3168 collate->sections = &collate->error_section;
3169 else
3171 sp = collate->sections;
3172 while (sp->next != NULL)
3173 sp = sp->next;
3175 sp->next = &collate->error_section;
3177 collate->error_section.next = NULL;
3180 else
3182 /* One should not be allowed to open the same
3183 section twice. */
3184 if (sp->first != NULL)
3185 lr_error (ldfile, _("\
3186 %s: multiple order definitions for section `%s'"),
3187 "LC_COLLATE", sp->name);
3188 else
3190 /* Insert sp in the collate->sections list,
3191 right after collate->current_section. */
3192 if (collate->current_section == NULL)
3193 collate->current_section = sp;
3194 else
3196 sp->next = collate->current_section->next;
3197 collate->current_section->next = sp;
3201 /* Next should come the end of the line or a semicolon. */
3202 arg = lr_token (ldfile, charmap, result, repertoire,
3203 verbose);
3204 if (arg->tok == tok_eol)
3206 uint32_t cnt;
3208 /* This means we have exactly one rule: `forward'. */
3209 if (nrules > 1)
3210 lr_error (ldfile, _("\
3211 %s: invalid number of sorting rules"),
3212 "LC_COLLATE");
3213 else
3214 nrules = 1;
3215 sp->rules = obstack_alloc (&collate->mempool,
3216 (sizeof (enum coll_sort_rule)
3217 * nrules));
3218 for (cnt = 0; cnt < nrules; ++cnt)
3219 sp->rules[cnt] = sort_forward;
3221 /* Next line. */
3222 break;
3225 /* Get the next token. */
3226 arg = lr_token (ldfile, charmap, result, repertoire,
3227 verbose);
3230 else
3232 /* There is no section symbol. Therefore we use the unnamed
3233 section. */
3234 collate->current_section = &collate->unnamed_section;
3236 if (collate->unnamed_section.first != NULL)
3237 lr_error (ldfile, _("\
3238 %s: multiple order definitions for unnamed section"),
3239 "LC_COLLATE");
3240 else
3242 /* Insert &collate->unnamed_section at the beginning of
3243 the collate->sections list. */
3244 collate->unnamed_section.next = collate->sections;
3245 collate->sections = &collate->unnamed_section;
3249 /* Now read the direction names. */
3250 read_directions (ldfile, arg, charmap, repertoire, result);
3252 /* From now we need the strings untranslated. */
3253 ldfile->translate_strings = 0;
3254 break;
3256 case tok_order_end:
3257 /* Ignore the rest of the line if we don't need the input of
3258 this line. */
3259 if (ignore_content)
3261 lr_ignore_rest (ldfile, 0);
3262 break;
3265 if (state != 1)
3266 goto err_label;
3268 /* Handle ellipsis at end of list. */
3269 if (was_ellipsis != tok_none)
3271 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3272 repertoire, result);
3273 was_ellipsis = tok_none;
3276 state = 2;
3277 lr_ignore_rest (ldfile, 1);
3278 break;
3280 case tok_reorder_after:
3281 /* Ignore the rest of the line if we don't need the input of
3282 this line. */
3283 if (ignore_content)
3285 lr_ignore_rest (ldfile, 0);
3286 break;
3289 if (state == 1)
3291 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3292 "LC_COLLATE");
3293 state = 2;
3295 /* Handle ellipsis at end of list. */
3296 if (was_ellipsis != tok_none)
3298 handle_ellipsis (ldfile, arg->val.str.startmb,
3299 arg->val.str.lenmb, was_ellipsis, charmap,
3300 repertoire, result);
3301 was_ellipsis = tok_none;
3304 else if (state != 2 && state != 3)
3305 goto err_label;
3306 state = 3;
3308 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3309 if (arg->tok == tok_bsymbol || arg->tok == tok_ucs4)
3311 /* Find this symbol in the sequence table. */
3312 char ucsbuf[10];
3313 char *startmb;
3314 size_t lenmb;
3315 struct element_t *insp;
3316 int no_error = 1;
3317 void *ptr;
3319 if (arg->tok == tok_bsymbol)
3321 startmb = arg->val.str.startmb;
3322 lenmb = arg->val.str.lenmb;
3324 else
3326 sprintf (ucsbuf, "U%08X", arg->val.ucs4);
3327 startmb = ucsbuf;
3328 lenmb = 9;
3331 if (find_entry (&collate->seq_table, startmb, lenmb, &ptr) == 0)
3332 /* Yes, the symbol exists. Simply point the cursor
3333 to it. */
3334 collate->cursor = (struct element_t *) ptr;
3335 else
3337 struct symbol_t *symbp;
3338 void *ptr;
3340 if (find_entry (&collate->sym_table, startmb, lenmb,
3341 &ptr) == 0)
3343 symbp = ptr;
3345 if (symbp->order->last != NULL
3346 || symbp->order->next != NULL)
3347 collate->cursor = symbp->order;
3348 else
3350 /* This is a collating symbol but its position
3351 is not yet defined. */
3352 lr_error (ldfile, _("\
3353 %s: order for collating symbol %.*s not yet defined"),
3354 "LC_COLLATE", (int) lenmb, startmb);
3355 collate->cursor = NULL;
3356 no_error = 0;
3359 else if (find_entry (&collate->elem_table, startmb, lenmb,
3360 &ptr) == 0)
3362 insp = (struct element_t *) ptr;
3364 if (insp->last != NULL || insp->next != NULL)
3365 collate->cursor = insp;
3366 else
3368 /* This is a collating element but its position
3369 is not yet defined. */
3370 lr_error (ldfile, _("\
3371 %s: order for collating element %.*s not yet defined"),
3372 "LC_COLLATE", (int) lenmb, startmb);
3373 collate->cursor = NULL;
3374 no_error = 0;
3377 else
3379 /* This is bad. The symbol after which we have to
3380 insert does not exist. */
3381 lr_error (ldfile, _("\
3382 %s: cannot reorder after %.*s: symbol not known"),
3383 "LC_COLLATE", (int) lenmb, startmb);
3384 collate->cursor = NULL;
3385 no_error = 0;
3389 lr_ignore_rest (ldfile, no_error);
3391 else
3392 /* This must not happen. */
3393 goto err_label;
3394 break;
3396 case tok_reorder_end:
3397 /* Ignore the rest of the line if we don't need the input of
3398 this line. */
3399 if (ignore_content)
3400 break;
3402 if (state != 3)
3403 goto err_label;
3404 state = 4;
3405 lr_ignore_rest (ldfile, 1);
3406 break;
3408 case tok_reorder_sections_after:
3409 /* Ignore the rest of the line if we don't need the input of
3410 this line. */
3411 if (ignore_content)
3413 lr_ignore_rest (ldfile, 0);
3414 break;
3417 if (state == 1)
3419 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3420 "LC_COLLATE");
3421 state = 2;
3423 /* Handle ellipsis at end of list. */
3424 if (was_ellipsis != tok_none)
3426 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3427 repertoire, result);
3428 was_ellipsis = tok_none;
3431 else if (state == 3)
3433 WITH_CUR_LOCALE (error (0, 0, _("\
3434 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3435 state = 4;
3437 else if (state != 2 && state != 4)
3438 goto err_label;
3439 state = 5;
3441 /* Get the name of the sections we are adding after. */
3442 arg = lr_token (ldfile, charmap, result, repertoire, verbose);
3443 if (arg->tok == tok_bsymbol)
3445 /* Now find a section with this name. */
3446 struct section_list *runp = collate->sections;
3448 while (runp != NULL)
3450 if (runp->name != NULL
3451 && strlen (runp->name) == arg->val.str.lenmb
3452 && memcmp (runp->name, arg->val.str.startmb,
3453 arg->val.str.lenmb) == 0)
3454 break;
3456 runp = runp->next;
3459 if (runp != NULL)
3460 collate->current_section = runp;
3461 else
3463 /* This is bad. The section after which we have to
3464 reorder does not exist. Therefore we cannot
3465 process the whole rest of this reorder
3466 specification. */
3467 lr_error (ldfile, _("%s: section `%.*s' not known"),
3468 "LC_COLLATE", (int) arg->val.str.lenmb,
3469 arg->val.str.startmb);
3473 lr_ignore_rest (ldfile, 0);
3475 now = lr_token (ldfile, charmap, result, NULL, verbose);
3477 while (now->tok == tok_reorder_sections_after
3478 || now->tok == tok_reorder_sections_end
3479 || now->tok == tok_end);
3481 /* Process the token we just saw. */
3482 nowtok = now->tok;
3483 continue;
3486 else
3487 /* This must not happen. */
3488 goto err_label;
3489 break;
3491 case tok_reorder_sections_end:
3492 /* Ignore the rest of the line if we don't need the input of
3493 this line. */
3494 if (ignore_content)
3495 break;
3497 if (state != 5)
3498 goto err_label;
3499 state = 6;
3500 lr_ignore_rest (ldfile, 1);
3501 break;
3503 case tok_bsymbol:
3504 case tok_ucs4:
3505 /* Ignore the rest of the line if we don't need the input of
3506 this line. */
3507 if (ignore_content)
3509 lr_ignore_rest (ldfile, 0);
3510 break;
3513 if (state != 0 && state != 1 && state != 3 && state != 5)
3514 goto err_label;
3516 if ((state == 0 || state == 5) && nowtok == tok_ucs4)
3517 goto err_label;
3519 if (nowtok == tok_ucs4)
3521 snprintf (ucs4buf, sizeof (ucs4buf), "U%08X", now->val.ucs4);
3522 symstr = ucs4buf;
3523 symlen = 9;
3525 else if (arg != NULL)
3527 symstr = arg->val.str.startmb;
3528 symlen = arg->val.str.lenmb;
3530 else
3532 lr_error (ldfile, _("%s: bad symbol <%.*s>"), "LC_COLLATE",
3533 (int) ldfile->token.val.str.lenmb,
3534 ldfile->token.val.str.startmb);
3535 break;
3538 struct element_t *seqp;
3539 if (state == 0)
3541 /* We are outside an `order_start' region. This means
3542 we must only accept definitions of values for
3543 collation symbols since these are purely abstract
3544 values and don't need directions associated. */
3545 void *ptr;
3547 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3549 seqp = ptr;
3551 /* It's already defined. First check whether this
3552 is really a collating symbol. */
3553 if (seqp->is_character)
3554 goto err_label;
3556 goto move_entry;
3558 else
3560 void *result;
3562 if (find_entry (&collate->sym_table, symstr, symlen,
3563 &result) != 0)
3564 /* No collating symbol, it's an error. */
3565 goto err_label;
3567 /* Maybe this is the first time we define a symbol
3568 value and it is before the first actual section. */
3569 if (collate->sections == NULL)
3570 collate->sections = collate->current_section =
3571 &collate->symbol_section;
3574 if (was_ellipsis != tok_none)
3576 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis,
3577 charmap, repertoire, result);
3579 /* Remember that we processed the ellipsis. */
3580 was_ellipsis = tok_none;
3582 /* And don't add the value a second time. */
3583 break;
3586 else if (state == 3)
3588 /* It is possible that we already have this collation sequence.
3589 In this case we move the entry. */
3590 void *sym;
3591 void *ptr;
3593 /* If the symbol after which we have to insert was not found
3594 ignore all entries. */
3595 if (collate->cursor == NULL)
3597 lr_ignore_rest (ldfile, 0);
3598 break;
3601 if (find_entry (&collate->seq_table, symstr, symlen, &ptr) == 0)
3603 seqp = (struct element_t *) ptr;
3604 goto move_entry;
3607 if (find_entry (&collate->sym_table, symstr, symlen, &sym) == 0
3608 && (seqp = ((struct symbol_t *) sym)->order) != NULL)
3609 goto move_entry;
3611 if (find_entry (&collate->elem_table, symstr, symlen, &ptr) == 0
3612 && (seqp = (struct element_t *) ptr,
3613 seqp->last != NULL || seqp->next != NULL
3614 || (collate->start != NULL && seqp == collate->start)))
3616 move_entry:
3617 /* Remove the entry from the old position. */
3618 if (seqp->last == NULL)
3619 collate->start = seqp->next;
3620 else
3621 seqp->last->next = seqp->next;
3622 if (seqp->next != NULL)
3623 seqp->next->last = seqp->last;
3625 /* We also have to check whether this entry is the
3626 first or last of a section. */
3627 if (seqp->section->first == seqp)
3629 if (seqp->section->first == seqp->section->last)
3630 /* This section has no content anymore. */
3631 seqp->section->first = seqp->section->last = NULL;
3632 else
3633 seqp->section->first = seqp->next;
3635 else if (seqp->section->last == seqp)
3636 seqp->section->last = seqp->last;
3638 /* Now insert it in the new place. */
3639 insert_weights (ldfile, seqp, charmap, repertoire, result,
3640 tok_none);
3641 break;
3644 /* Otherwise we just add a new entry. */
3646 else if (state == 5)
3648 /* We are reordering sections. Find the named section. */
3649 struct section_list *runp = collate->sections;
3650 struct section_list *prevp = NULL;
3652 while (runp != NULL)
3654 if (runp->name != NULL
3655 && strlen (runp->name) == symlen
3656 && memcmp (runp->name, symstr, symlen) == 0)
3657 break;
3659 prevp = runp;
3660 runp = runp->next;
3663 if (runp == NULL)
3665 lr_error (ldfile, _("%s: section `%.*s' not known"),
3666 "LC_COLLATE", (int) symlen, symstr);
3667 lr_ignore_rest (ldfile, 0);
3669 else
3671 if (runp != collate->current_section)
3673 /* Remove the named section from the old place and
3674 insert it in the new one. */
3675 prevp->next = runp->next;
3677 runp->next = collate->current_section->next;
3678 collate->current_section->next = runp;
3679 collate->current_section = runp;
3682 /* Process the rest of the line which might change
3683 the collation rules. */
3684 arg = lr_token (ldfile, charmap, result, repertoire,
3685 verbose);
3686 if (arg->tok != tok_eof && arg->tok != tok_eol)
3687 read_directions (ldfile, arg, charmap, repertoire,
3688 result);
3690 break;
3692 else if (was_ellipsis != tok_none)
3694 /* Using the information in the `ellipsis_weight'
3695 element and this and the last value we have to handle
3696 the ellipsis now. */
3697 assert (state == 1);
3699 handle_ellipsis (ldfile, symstr, symlen, was_ellipsis, charmap,
3700 repertoire, result);
3702 /* Remember that we processed the ellipsis. */
3703 was_ellipsis = tok_none;
3705 /* And don't add the value a second time. */
3706 break;
3709 /* Now insert in the new place. */
3710 insert_value (ldfile, symstr, symlen, charmap, repertoire, result);
3711 break;
3713 case tok_undefined:
3714 /* Ignore the rest of the line if we don't need the input of
3715 this line. */
3716 if (ignore_content)
3718 lr_ignore_rest (ldfile, 0);
3719 break;
3722 if (state != 1)
3723 goto err_label;
3725 if (was_ellipsis != tok_none)
3727 lr_error (ldfile,
3728 _("%s: cannot have `%s' as end of ellipsis range"),
3729 "LC_COLLATE", "UNDEFINED");
3731 unlink_element (collate);
3732 was_ellipsis = tok_none;
3735 /* See whether UNDEFINED already appeared somewhere. */
3736 if (collate->undefined.next != NULL
3737 || &collate->undefined == collate->cursor)
3739 lr_error (ldfile,
3740 _("%s: order for `%.*s' already defined at %s:%Zu"),
3741 "LC_COLLATE", 9, "UNDEFINED",
3742 collate->undefined.file,
3743 collate->undefined.line);
3744 lr_ignore_rest (ldfile, 0);
3746 else
3747 /* Parse the weights. */
3748 insert_weights (ldfile, &collate->undefined, charmap,
3749 repertoire, result, tok_none);
3750 break;
3752 case tok_ellipsis2: /* symbolic hexadecimal ellipsis */
3753 case tok_ellipsis3: /* absolute ellipsis */
3754 case tok_ellipsis4: /* symbolic decimal ellipsis */
3755 /* This is the symbolic (decimal or hexadecimal) or absolute
3756 ellipsis. */
3757 if (was_ellipsis != tok_none)
3758 goto err_label;
3760 if (state != 0 && state != 1 && state != 3)
3761 goto err_label;
3763 was_ellipsis = nowtok;
3765 insert_weights (ldfile, &collate->ellipsis_weight, charmap,
3766 repertoire, result, nowtok);
3767 break;
3769 case tok_end:
3770 /* Next we assume `LC_COLLATE'. */
3771 if (!ignore_content)
3773 if (state == 0)
3774 /* We must either see a copy statement or have
3775 ordering values. */
3776 lr_error (ldfile,
3777 _("%s: empty category description not allowed"),
3778 "LC_COLLATE");
3779 else if (state == 1)
3781 lr_error (ldfile, _("%s: missing `order_end' keyword"),
3782 "LC_COLLATE");
3784 /* Handle ellipsis at end of list. */
3785 if (was_ellipsis != tok_none)
3787 handle_ellipsis (ldfile, NULL, 0, was_ellipsis, charmap,
3788 repertoire, result);
3789 was_ellipsis = tok_none;
3792 else if (state == 3)
3793 WITH_CUR_LOCALE (error (0, 0, _("\
3794 %s: missing `reorder-end' keyword"), "LC_COLLATE"));
3795 else if (state == 5)
3796 WITH_CUR_LOCALE (error (0, 0, _("\
3797 %s: missing `reorder-sections-end' keyword"), "LC_COLLATE"));
3799 arg = lr_token (ldfile, charmap, result, NULL, verbose);
3800 if (arg->tok == tok_eof)
3801 break;
3802 if (arg->tok == tok_eol)
3803 lr_error (ldfile, _("%s: incomplete `END' line"), "LC_COLLATE");
3804 else if (arg->tok != tok_lc_collate)
3805 lr_error (ldfile, _("\
3806 %1$s: definition does not end with `END %1$s'"), "LC_COLLATE");
3807 lr_ignore_rest (ldfile, arg->tok == tok_lc_collate);
3808 return;
3810 default:
3811 err_label:
3812 SYNTAX_ERROR (_("%s: syntax error"), "LC_COLLATE");
3815 /* Prepare for the next round. */
3816 now = lr_token (ldfile, charmap, result, NULL, verbose);
3817 nowtok = now->tok;
3820 /* When we come here we reached the end of the file. */
3821 lr_error (ldfile, _("%s: premature end of file"), "LC_COLLATE");