Merge commit '0b905b49d460a57773d88d714cd880ffe0182b7c'
[unleashed.git] / bin / localedef / collate.c
blob3f76b61a3617dd4b5962deb1a50bed03294e9313
1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
17 * LC_COLLATE database generation routines for localedef.
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <errno.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <sys/avl.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <wchar.h>
29 #include <widec.h>
30 #include <limits.h>
31 #include "localedef.h"
32 #include "parser.h"
33 #include "collatefile.h"
36 * Design notes.
38 * It will be extremely helpful to the reader if they have access to
39 * the localedef and locale file format specifications available.
40 * Latest versions of these are available from www.opengroup.org.
42 * The design for the collation code is a bit complex. The goal is a
43 * single collation database as described in collate.h (in
44 * libc/port/locale). However, there are some other tidbits:
46 * a) The substitution entries are now a directly indexable array. A
47 * priority elsewhere in the table is taken as an index into the
48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49 * set. (The bit is cleared and the result is the index into the
50 * table.
52 * b) We eliminate duplicate entries into the substitution table.
53 * This saves a lot of space.
55 * c) The priorities for each level are "compressed", so that each
56 * sorting level has consecutively numbered priorities starting at 1.
57 * (O is reserved for the ignore priority.) This means sort levels
58 * which only have a few distinct priorities can represent the
59 * priority level in fewer bits, which makes the strxfrm output
60 * smaller.
62 * d) We record the total number of priorities so that strxfrm can
63 * figure out how many bytes to expand a numeric priority into.
65 * e) For the UNDEFINED pass (the last pass), we record the maximum
66 * number of bits needed to uniquely prioritize these entries, so that
67 * the last pass can also use smaller strxfrm output when possible.
69 * f) Priorities with the sign bit set are verboten. This works out
70 * because no active character set needs that bit to carry significant
71 * information once the character is in wide form.
73 * To process the entire data to make the database, we actually run
74 * multiple passes over the data.
76 * The first pass, which is done at parse time, identifies elements,
77 * substitutions, and such, and records them in priority order. As
78 * some priorities can refer to other priorities, using forward
79 * references, we use a table of references indicating whether the
80 * priority's value has been resolved, or whether it is still a
81 * reference.
83 * The second pass walks over all the items in priority order, noting
84 * that they are used directly, and not just an indirect reference.
85 * This is done by creating a "weight" structure for the item. The
86 * weights are stashed in an AVL tree sorted by relative "priority".
88 * The third pass walks over all the weight structures, in priority
89 * order, and assigns a new monotonically increasing (per sort level)
90 * weight value to them. These are the values that will actually be
91 * written to the file.
93 * The fourth pass just writes the data out.
97 * In order to resolve the priorities, we create a table of priorities.
98 * Entries in the table can be in one of three states.
100 * UNKNOWN is for newly allocated entries, and indicates that nothing
101 * is known about the priority. (For example, when new entries are created
102 * for collating-symbols, this is the value assigned for them until the
103 * collating symbol's order has been determined.
105 * RESOLVED is used for an entry where the priority indicates the final
106 * numeric weight.
108 * REFER is used for entries that reference other entries. Typically
109 * this is used for forward references. A collating-symbol can never
110 * have this value.
112 * The "pass" field is used during final resolution to aid in detection
113 * of referencing loops. (For example <A> depends on <B>, but <B> has its
114 * priority dependent on <A>.)
116 typedef enum {
117 UNKNOWN, /* priority is totally unknown */
118 RESOLVED, /* priority value fully resolved */
119 REFER /* priority is a reference (index) */
120 } res_t;
122 typedef struct weight {
123 int32_t pri;
124 int opt;
125 avl_node_t avl;
126 } weight_t;
128 typedef struct priority {
129 res_t res;
130 int32_t pri;
131 int pass;
132 int lineno;
133 } collpri_t;
135 #define NUM_WT collinfo.directive_count
138 * These are the abstract collating symbols, which are just a symbolic
139 * way to reference a priority.
141 struct collsym {
142 char *name;
143 int32_t ref;
144 avl_node_t avl;
148 * These are also abstract collating symbols, but we allow them to have
149 * different priorities at different levels.
151 typedef struct collundef {
152 char *name;
153 int32_t ref[COLL_WEIGHTS_MAX];
154 avl_node_t avl;
155 } collundef_t;
158 * These are called "chains" in libc. This records the fact that two
159 * more characters should be treated as a single collating entity when
160 * they appear together. For example, in Spanish <C><h> gets collated
161 * as a character between <C> and <D>.
163 struct collelem {
164 char *symbol;
165 wchar_t *expand;
166 int32_t ref[COLL_WEIGHTS_MAX];
167 avl_node_t avl_bysymbol;
168 avl_node_t avl_byexpand;
172 * Individual characters have a sequence of weights as well.
174 typedef struct collchar {
175 wchar_t wc;
176 int32_t ref[COLL_WEIGHTS_MAX];
177 avl_node_t avl;
178 } collchar_t;
181 * Substitution entries. The key is itself a priority. Note that
182 * when we create one of these, we *automatically* wind up with a
183 * fully resolved priority for the key, because creation of
184 * substitutions creates a resolved priority at the same time.
186 typedef struct {
187 int32_t key;
188 int32_t ref[COLLATE_STR_LEN];
189 avl_node_t avl;
190 avl_node_t avl_ref;
191 } subst_t;
193 static avl_tree_t collsyms;
194 static avl_tree_t collundefs;
195 static avl_tree_t elem_by_symbol;
196 static avl_tree_t elem_by_expand;
197 static avl_tree_t collchars;
198 static avl_tree_t substs[COLL_WEIGHTS_MAX];
199 static avl_tree_t substs_ref[COLL_WEIGHTS_MAX];
200 static avl_tree_t weights[COLL_WEIGHTS_MAX];
201 static int32_t nweight[COLL_WEIGHTS_MAX];
204 * This is state tracking for the ellipsis token. Note that we start
205 * the initial values so that the ellipsis logic will think we got a
206 * magic starting value of NUL. It starts at minus one because the
207 * starting point is exclusive -- i.e. the starting point is not
208 * itself handled by the ellipsis code.
210 static int currorder = EOF;
211 static int lastorder = EOF;
212 static collelem_t *currelem;
213 static collchar_t *currchar;
214 static collundef_t *currundef;
215 static wchar_t ellipsis_start = 0;
216 static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
219 * We keep a running tally of weights.
221 static int nextpri = 1;
222 static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
225 * This array collects up the weights for each level.
227 static int32_t order_weights[COLL_WEIGHTS_MAX];
228 static int curr_weight = 0;
229 static int32_t subst_weights[COLLATE_STR_LEN];
230 static int curr_subst = 0;
233 * Some initial priority values.
235 static int32_t pri_undefined[COLL_WEIGHTS_MAX];
236 static int32_t pri_ignore;
238 static collate_info_t collinfo;
240 static collpri_t *prilist = NULL;
241 static int numpri = 0;
242 static int maxpri = 0;
244 static void start_order(int);
246 static int32_t
247 new_pri(void)
249 int i;
251 if (numpri >= maxpri) {
252 maxpri = maxpri ? maxpri * 2 : 1024;
253 prilist = reallocarray(prilist, maxpri, sizeof (collpri_t));
254 if (prilist == NULL) {
255 errf(_("out of memory"));
256 return (-1);
258 for (i = numpri; i < maxpri; i++) {
259 prilist[i].res = UNKNOWN;
260 prilist[i].pri = 0;
261 prilist[i].pass = 0;
264 return (numpri++);
267 static collpri_t *
268 get_pri(int32_t ref)
270 if ((ref < 0) || (ref > numpri)) {
271 INTERR;
272 return (NULL);
274 return (&prilist[ref]);
277 static void
278 set_pri(int32_t ref, int32_t v, res_t res)
280 collpri_t *pri;
282 pri = get_pri(ref);
284 if ((res == REFER) && ((v < 0) || (v >= numpri))) {
285 INTERR;
288 /* Resolve self references */
289 if ((res == REFER) && (ref == v)) {
290 v = nextpri;
291 res = RESOLVED;
294 if (pri->res != UNKNOWN) {
295 warn(_("repeated item in order list (first on %d)"),
296 pri->lineno);
297 return;
299 pri->lineno = lineno;
300 pri->pri = v;
301 pri->res = res;
304 static int32_t
305 resolve_pri(int32_t ref)
307 collpri_t *pri;
308 static int32_t pass = 0;
310 pri = get_pri(ref);
311 pass++;
312 while (pri->res == REFER) {
313 if (pri->pass == pass) {
314 /* report a line with the circular symbol */
315 lineno = pri->lineno;
316 errf(_("circular reference in order list"));
317 return (-1);
319 if ((pri->pri < 0) || (pri->pri >= numpri)) {
320 INTERR;
321 return (-1);
323 pri->pass = pass;
324 pri = &prilist[pri->pri];
327 if (pri->res == UNKNOWN) {
328 return (-1);
330 if (pri->res != RESOLVED)
331 INTERR;
333 return (pri->pri);
336 static int
337 weight_compare(const void *n1, const void *n2)
339 int32_t k1 = ((const weight_t *)n1)->pri;
340 int32_t k2 = ((const weight_t *)n2)->pri;
342 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
345 static int
346 collsym_compare(const void *n1, const void *n2)
348 const collsym_t *c1 = n1;
349 const collsym_t *c2 = n2;
350 int rv;
352 rv = strcmp(c1->name, c2->name);
353 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
356 static int
357 collundef_compare(const void *n1, const void *n2)
359 const collundef_t *c1 = n1;
360 const collundef_t *c2 = n2;
361 int rv;
363 rv = strcmp(c1->name, c2->name);
364 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
367 static int
368 element_compare_symbol(const void *n1, const void *n2)
370 const collelem_t *c1 = n1;
371 const collelem_t *c2 = n2;
372 int rv;
374 rv = strcmp(c1->symbol, c2->symbol);
375 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
378 static int
379 element_compare_expand(const void *n1, const void *n2)
381 const collelem_t *c1 = n1;
382 const collelem_t *c2 = n2;
383 int rv;
385 rv = wcscmp(c1->expand, c2->expand);
386 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
389 static int
390 collchar_compare(const void *n1, const void *n2)
392 wchar_t k1 = ((const collchar_t *)n1)->wc;
393 wchar_t k2 = ((const collchar_t *)n2)->wc;
395 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
398 static int
399 subst_compare(const void *n1, const void *n2)
401 int32_t k1 = ((const subst_t *)n1)->key;
402 int32_t k2 = ((const subst_t *)n2)->key;
404 return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
407 static int
408 subst_compare_ref(const void *n1, const void *n2)
410 int32_t *c1 = ((subst_t *)n1)->ref;
411 int32_t *c2 = ((subst_t *)n2)->ref;
412 int rv;
414 rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
415 return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
418 void
419 init_collate(void)
421 int i;
423 avl_create(&collsyms, collsym_compare, sizeof (collsym_t),
424 offsetof(collsym_t, avl));
426 avl_create(&collundefs, collundef_compare, sizeof (collsym_t),
427 offsetof(collundef_t, avl));
429 avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t),
430 offsetof(collelem_t, avl_bysymbol));
431 avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t),
432 offsetof(collelem_t, avl_byexpand));
434 avl_create(&collchars, collchar_compare, sizeof (collchar_t),
435 offsetof(collchar_t, avl));
437 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
438 avl_create(&substs[i], subst_compare, sizeof (subst_t),
439 offsetof(subst_t, avl));
440 avl_create(&substs_ref[i], subst_compare_ref,
441 sizeof (subst_t), offsetof(subst_t, avl_ref));
442 avl_create(&weights[i], weight_compare, sizeof (weight_t),
443 offsetof(weight_t, avl));
444 nweight[i] = 1;
447 (void) memset(&collinfo, 0, sizeof (collinfo));
449 /* allocate some initial priorities */
450 pri_ignore = new_pri();
452 set_pri(pri_ignore, 0, RESOLVED);
454 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
455 pri_undefined[i] = new_pri();
457 /* we will override this later */
458 set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN);
462 void
463 define_collsym(char *name)
465 collsym_t *sym;
466 avl_index_t where;
468 if ((sym = calloc(sizeof (*sym), 1)) == NULL) {
469 errf(_("out of memory"));
470 return;
472 sym->name = name;
473 sym->ref = new_pri();
475 if (avl_find(&collsyms, sym, &where) != NULL) {
477 * This should never happen because we are only called
478 * for undefined symbols.
480 INTERR;
481 return;
483 avl_insert(&collsyms, sym, where);
486 collsym_t *
487 lookup_collsym(char *name)
489 collsym_t srch;
491 srch.name = name;
492 return (avl_find(&collsyms, &srch, NULL));
495 collelem_t *
496 lookup_collelem(char *symbol)
498 collelem_t srch;
500 srch.symbol = symbol;
501 return (avl_find(&elem_by_symbol, &srch, NULL));
504 static collundef_t *
505 get_collundef(char *name)
507 collundef_t srch;
508 collundef_t *ud;
509 avl_index_t where;
510 int i;
512 srch.name = name;
513 if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) {
514 if (((ud = calloc(sizeof (*ud), 1)) == NULL) ||
515 ((ud->name = strdup(name)) == NULL)) {
516 errf(_("out of memory"));
517 return (NULL);
519 for (i = 0; i < NUM_WT; i++) {
520 ud->ref[i] = new_pri();
522 avl_insert(&collundefs, ud, where);
524 add_charmap_undefined(name);
525 return (ud);
528 static collchar_t *
529 get_collchar(wchar_t wc, int create)
531 collchar_t srch;
532 collchar_t *cc;
533 avl_index_t where;
534 int i;
536 srch.wc = wc;
537 cc = avl_find(&collchars, &srch, &where);
538 if ((cc == NULL) && create) {
539 if ((cc = calloc(sizeof (*cc), 1)) == NULL) {
540 errf(_("out of memory"));
541 return (NULL);
543 for (i = 0; i < NUM_WT; i++) {
544 cc->ref[i] = new_pri();
546 cc->wc = wc;
547 avl_insert(&collchars, cc, where);
549 return (cc);
552 void
553 end_order_collsym(collsym_t *sym)
555 start_order(T_COLLSYM);
556 /* update the weight */
558 set_pri(sym->ref, nextpri, RESOLVED);
559 nextpri++;
562 void
563 end_order(void)
565 int i;
566 int32_t pri;
567 int32_t ref;
568 collpri_t *p;
570 /* advance the priority/weight */
571 pri = nextpri;
573 switch (currorder) {
574 case T_CHAR:
575 for (i = 0; i < NUM_WT; i++) {
576 if (((ref = order_weights[i]) < 0) ||
577 ((p = get_pri(ref)) == NULL) ||
578 (p->pri == -1)) {
579 /* unspecified weight is a self reference */
580 set_pri(currchar->ref[i], pri, RESOLVED);
581 } else {
582 set_pri(currchar->ref[i], ref, REFER);
584 order_weights[i] = -1;
587 /* leave a cookie trail in case next symbol is ellipsis */
588 ellipsis_start = currchar->wc + 1;
589 currchar = NULL;
590 break;
592 case T_ELLIPSIS:
593 /* save off the weights were we can find them */
594 for (i = 0; i < NUM_WT; i++) {
595 ellipsis_weights[i] = order_weights[i];
596 order_weights[i] = -1;
598 break;
600 case T_COLLELEM:
601 if (currelem == NULL) {
602 INTERR;
603 } else {
604 for (i = 0; i < NUM_WT; i++) {
606 if (((ref = order_weights[i]) < 0) ||
607 ((p = get_pri(ref)) == NULL) ||
608 (p->pri == -1)) {
609 set_pri(currelem->ref[i], pri,
610 RESOLVED);
611 } else {
612 set_pri(currelem->ref[i], ref, REFER);
614 order_weights[i] = -1;
617 break;
619 case T_UNDEFINED:
620 for (i = 0; i < NUM_WT; i++) {
621 if (((ref = order_weights[i]) < 0) ||
622 ((p = get_pri(ref)) == NULL) ||
623 (p->pri == -1)) {
624 set_pri(pri_undefined[i], -1, RESOLVED);
625 } else {
626 set_pri(pri_undefined[i], ref, REFER);
628 order_weights[i] = -1;
630 break;
632 case T_SYMBOL:
633 for (i = 0; i < NUM_WT; i++) {
634 if (((ref = order_weights[i]) < 0) ||
635 ((p = get_pri(ref)) == NULL) ||
636 (p->pri == -1)) {
637 set_pri(currundef->ref[i], pri, RESOLVED);
638 } else {
639 set_pri(currundef->ref[i], ref, REFER);
641 order_weights[i] = -1;
643 break;
645 default:
646 INTERR;
649 nextpri++;
652 static void
653 start_order(int type)
655 int i;
657 lastorder = currorder;
658 currorder = type;
660 /* this is used to protect ELLIPSIS processing */
661 if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) {
662 errf(_("character value expected"));
665 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
666 order_weights[i] = -1;
668 curr_weight = 0;
671 void
672 start_order_undefined(void)
674 start_order(T_UNDEFINED);
677 void
678 start_order_symbol(char *name)
680 currundef = get_collundef(name);
681 start_order(T_SYMBOL);
684 void
685 start_order_char(wchar_t wc)
687 collchar_t *cc;
688 int32_t ref;
690 start_order(T_CHAR);
693 * If we last saw an ellipsis, then we need to close the range.
694 * Handle that here. Note that we have to be careful because the
695 * items *inside* the range are treated exclusiveley to the items
696 * outside of the range. The ends of the range can have quite
697 * different weights than the range members.
699 if (lastorder == T_ELLIPSIS) {
700 int i;
702 if (wc < ellipsis_start) {
703 errf(_("malformed range!"));
704 return;
706 while (ellipsis_start < wc) {
708 * pick all of the saved weights for the
709 * ellipsis. note that -1 encodes for the
710 * ellipsis itself, which means to take the
711 * current relative priority.
713 if ((cc = get_collchar(ellipsis_start, 1)) == NULL) {
714 INTERR;
715 return;
717 for (i = 0; i < NUM_WT; i++) {
718 collpri_t *p;
719 if (((ref = ellipsis_weights[i]) == -1) ||
720 ((p = get_pri(ref)) == NULL) ||
721 (p->pri == -1)) {
722 set_pri(cc->ref[i], nextpri, RESOLVED);
723 } else {
724 set_pri(cc->ref[i], ref, REFER);
726 ellipsis_weights[i] = 0;
728 ellipsis_start++;
729 nextpri++;
733 currchar = get_collchar(wc, 1);
736 void
737 start_order_collelem(collelem_t *e)
739 start_order(T_COLLELEM);
740 currelem = e;
743 void
744 start_order_ellipsis(void)
746 int i;
748 start_order(T_ELLIPSIS);
750 if (lastorder != T_CHAR) {
751 errf(_("illegal starting point for range"));
752 return;
755 for (i = 0; i < NUM_WT; i++) {
756 ellipsis_weights[i] = order_weights[i];
760 void
761 define_collelem(char *name, wchar_t *wcs)
763 collelem_t *e;
764 avl_index_t where1;
765 avl_index_t where2;
766 int i;
768 if (wcslen(wcs) >= COLLATE_STR_LEN) {
769 errf(_("expanded collation element too long"));
770 return;
773 if ((e = calloc(sizeof (*e), 1)) == NULL) {
774 errf(_("out of memory"));
775 return;
777 e->expand = wcs;
778 e->symbol = name;
781 * This is executed before the order statement, so we don't
782 * know how many priorities we *really* need. We allocate one
783 * for each possible weight. Not a big deal, as collating-elements
784 * prove to be quite rare.
786 for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
787 e->ref[i] = new_pri();
790 /* A character sequence can only reduce to one element. */
791 if ((avl_find(&elem_by_symbol, e, &where1) != NULL) ||
792 (avl_find(&elem_by_expand, e, &where2) != NULL)) {
793 errf(_("duplicate collating element definition"));
794 return;
796 avl_insert(&elem_by_symbol, e, where1);
797 avl_insert(&elem_by_expand, e, where2);
800 void
801 add_order_bit(int kw)
803 uint8_t bit = DIRECTIVE_UNDEF;
805 switch (kw) {
806 case T_FORWARD:
807 bit = DIRECTIVE_FORWARD;
808 break;
809 case T_BACKWARD:
810 bit = DIRECTIVE_BACKWARD;
811 break;
812 case T_POSITION:
813 bit = DIRECTIVE_POSITION;
814 break;
815 default:
816 INTERR;
817 break;
819 collinfo.directive[collinfo.directive_count] |= bit;
822 void
823 add_order_directive(void)
825 if (collinfo.directive_count >= COLL_WEIGHTS_MAX) {
826 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX);
828 collinfo.directive_count++;
831 static void
832 add_order_pri(int32_t ref)
834 if (curr_weight >= NUM_WT) {
835 errf(_("too many weights (max %d)"), NUM_WT);
836 return;
838 order_weights[curr_weight] = ref;
839 curr_weight++;
842 void
843 add_order_collsym(collsym_t *s)
845 add_order_pri(s->ref);
848 void
849 add_order_char(wchar_t wc)
851 collchar_t *cc;
853 if ((cc = get_collchar(wc, 1)) == NULL) {
854 INTERR;
855 return;
858 add_order_pri(cc->ref[curr_weight]);
861 void
862 add_order_collelem(collelem_t *e)
864 add_order_pri(e->ref[curr_weight]);
867 void
868 add_order_ignore(void)
870 add_order_pri(pri_ignore);
873 void
874 add_order_symbol(char *sym)
876 collundef_t *c;
877 if ((c = get_collundef(sym)) == NULL) {
878 INTERR;
879 return;
881 add_order_pri(c->ref[curr_weight]);
884 void
885 add_order_ellipsis(void)
887 /* special 0 value indicates self reference */
888 add_order_pri(0);
891 void
892 add_order_subst(void)
894 subst_t srch;
895 subst_t *s;
896 avl_index_t where;
897 int i;
899 (void) memset(&srch, 0, sizeof (srch));
900 for (i = 0; i < curr_subst; i++) {
901 srch.ref[i] = subst_weights[i];
902 subst_weights[i] = 0;
904 s = avl_find(&substs_ref[curr_weight], &srch, &where);
906 if (s == NULL) {
907 if ((s = calloc(sizeof (*s), 1)) == NULL) {
908 errf(_("out of memory"));
909 return;
911 s->key = new_pri();
914 * We use a self reference for our key, but we set a
915 * high bit to indicate that this is a substitution
916 * reference. This will expedite table lookups later,
917 * and prevent table lookups for situations that don't
918 * require it. (In short, its a big win, because we
919 * can skip a lot of binary searching.)
921 set_pri(s->key,
922 (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
923 RESOLVED);
924 nextsubst[curr_weight] += 1;
926 for (i = 0; i < curr_subst; i++) {
927 s->ref[i] = srch.ref[i];
930 avl_insert(&substs_ref[curr_weight], s, where);
932 if (avl_find(&substs[curr_weight], s, &where) != NULL) {
933 INTERR;
934 return;
936 avl_insert(&substs[curr_weight], s, where);
938 curr_subst = 0;
942 * We are using the current (unique) priority as a search key
943 * in the substitution table.
945 add_order_pri(s->key);
948 static void
949 add_subst_pri(int32_t ref)
951 if (curr_subst >= COLLATE_STR_LEN) {
952 errf(_("substitution string is too long"));
953 return;
955 subst_weights[curr_subst] = ref;
956 curr_subst++;
959 void
960 add_subst_char(wchar_t wc)
962 collchar_t *cc;
965 if (((cc = get_collchar(wc, 1)) == NULL) ||
966 (cc->wc != wc)) {
967 INTERR;
968 return;
970 /* we take the weight for the character at that position */
971 add_subst_pri(cc->ref[curr_weight]);
974 void
975 add_subst_collelem(collelem_t *e)
977 add_subst_pri(e->ref[curr_weight]);
980 void
981 add_subst_collsym(collsym_t *s)
983 add_subst_pri(s->ref);
986 void
987 add_subst_symbol(char *ptr)
989 collundef_t *cu;
991 if ((cu = get_collundef(ptr)) != NULL) {
992 add_subst_pri(cu->ref[curr_weight]);
996 void
997 add_weight(int32_t ref, int pass)
999 weight_t srch;
1000 weight_t *w;
1001 avl_index_t where;
1003 srch.pri = resolve_pri(ref);
1005 /* No translation of ignores */
1006 if (srch.pri == 0)
1007 return;
1009 /* Substitution priorities are not weights */
1010 if (srch.pri & COLLATE_SUBST_PRIORITY)
1011 return;
1013 if (avl_find(&weights[pass], &srch, &where) != NULL)
1014 return;
1016 if ((w = calloc(sizeof (*w), 1)) == NULL) {
1017 errf(_("out of memory"));
1018 return;
1020 w->pri = srch.pri;
1021 avl_insert(&weights[pass], w, where);
1024 void
1025 add_weights(int32_t *refs)
1027 int i;
1028 for (i = 0; i < NUM_WT; i++) {
1029 add_weight(refs[i], i);
1033 int32_t
1034 get_weight(int32_t ref, int pass)
1036 weight_t srch;
1037 weight_t *w;
1038 int32_t pri;
1040 pri = resolve_pri(ref);
1041 if (pri & COLLATE_SUBST_PRIORITY) {
1042 return (pri);
1044 if (pri <= 0) {
1045 return (pri);
1047 srch.pri = pri;
1048 if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
1049 INTERR;
1050 return (-1);
1052 return (w->opt);
1055 void
1056 dump_collate(void)
1058 FILE *f;
1059 int i, j, n;
1060 size_t sz;
1061 int32_t pri;
1062 collelem_t *ce;
1063 collchar_t *cc;
1064 subst_t *sb;
1065 char vers[COLLATE_STR_LEN];
1066 collate_char_t chars[UCHAR_MAX + 1];
1067 collate_large_t *large;
1068 collate_subst_t *subst[COLL_WEIGHTS_MAX];
1069 collate_chain_t *chain;
1072 * We have to run throught a preliminary pass to identify all the
1073 * weights that we use for each sorting level.
1075 for (i = 0; i < NUM_WT; i++) {
1076 add_weight(pri_ignore, i);
1078 for (i = 0; i < NUM_WT; i++) {
1079 for (sb = avl_first(&substs[i]); sb;
1080 sb = AVL_NEXT(&substs[i], sb)) {
1081 for (j = 0; sb->ref[j]; j++) {
1082 add_weight(sb->ref[j], i);
1086 for (ce = avl_first(&elem_by_expand);
1087 ce != NULL;
1088 ce = AVL_NEXT(&elem_by_expand, ce)) {
1089 add_weights(ce->ref);
1091 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1092 add_weights(cc->ref);
1096 * Now we walk the entire set of weights, removing the gaps
1097 * in the weights. This gives us optimum usage. The walk
1098 * occurs in priority.
1100 for (i = 0; i < NUM_WT; i++) {
1101 weight_t *w;
1102 for (w = avl_first(&weights[i]); w;
1103 w = AVL_NEXT(&weights[i], w)) {
1104 w->opt = nweight[i];
1105 nweight[i] += 1;
1109 (void) memset(&chars, 0, sizeof (chars));
1110 (void) memset(vers, 0, COLLATE_STR_LEN);
1111 (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
1114 * We need to make sure we arrange for the UNDEFINED field
1115 * to show up. Also, set the total weight counts.
1117 for (i = 0; i < NUM_WT; i++) {
1118 if (resolve_pri(pri_undefined[i]) == -1) {
1119 set_pri(pri_undefined[i], -1, RESOLVED);
1120 /* they collate at the end of everything else */
1121 collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
1123 collinfo.pri_count[i] = nweight[i];
1126 collinfo.pri_count[NUM_WT] = max_wide();
1127 collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
1128 collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
1131 * Ordinary character priorities
1133 for (i = 0; i <= UCHAR_MAX; i++) {
1134 if ((cc = get_collchar(i, 0)) != NULL) {
1135 for (j = 0; j < NUM_WT; j++) {
1136 chars[i].pri[j] = get_weight(cc->ref[j], j);
1138 } else {
1139 for (j = 0; j < NUM_WT; j++) {
1140 chars[i].pri[j] =
1141 get_weight(pri_undefined[j], j);
1144 * Per POSIX, for undefined characters, we
1145 * also have to add a last item, which is the
1146 * character code.
1148 chars[i].pri[NUM_WT] = i;
1153 * Substitution tables
1155 for (i = 0; i < NUM_WT; i++) {
1156 collate_subst_t *st = NULL;
1157 n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
1158 if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
1159 errf(_("out of memory"));
1160 return;
1162 n = 0;
1163 for (sb = avl_first(&substs[i]); sb;
1164 sb = AVL_NEXT(&substs[i], sb)) {
1165 if ((st[n].key = resolve_pri(sb->key)) < 0) {
1166 /* by definition these resolve! */
1167 INTERR;
1169 if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
1170 INTERR;
1172 for (j = 0; sb->ref[j]; j++) {
1173 st[n].pri[j] = get_weight(sb->ref[j], i);
1175 n++;
1177 if (n != collinfo.subst_count[i])
1178 INTERR;
1179 subst[i] = st;
1184 * Chains, i.e. collating elements
1186 collinfo.chain_count = avl_numnodes(&elem_by_expand);
1187 chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
1188 if (chain == NULL) {
1189 errf(_("out of memory"));
1190 return;
1192 for (n = 0, ce = avl_first(&elem_by_expand);
1193 ce != NULL;
1194 ce = AVL_NEXT(&elem_by_expand, ce), n++) {
1195 (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
1196 for (i = 0; i < NUM_WT; i++) {
1197 chain[n].pri[i] = get_weight(ce->ref[i], i);
1200 if (n != collinfo.chain_count)
1201 INTERR;
1204 * Large (> UCHAR_MAX) character priorities
1206 large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
1207 if (large == NULL) {
1208 errf(_("out of memory"));
1209 return;
1212 i = 0;
1213 for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
1214 int undef = 0;
1215 /* we already gathered those */
1216 if (cc->wc <= UCHAR_MAX)
1217 continue;
1218 for (j = 0; j < NUM_WT; j++) {
1219 if ((pri = get_weight(cc->ref[j], j)) < 0) {
1220 undef = 1;
1222 if (undef && (pri >= 0)) {
1223 /* if undefined, then all priorities are */
1224 INTERR;
1225 } else {
1226 large[i].pri.pri[j] = pri;
1229 if (!undef) {
1230 large[i].val = cc->wc;
1231 collinfo.large_count = i++;
1235 if ((f = open_category()) == NULL) {
1236 return;
1239 /* Time to write the entire data set out */
1241 if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
1242 (wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
1243 (wr_category(&chars, sizeof (chars), f) < 0)) {
1244 return;
1247 for (i = 0; i < NUM_WT; i++) {
1248 sz = sizeof (collate_subst_t) * collinfo.subst_count[i];
1249 if (wr_category(subst[i], sz, f) < 0) {
1250 return;
1253 sz = sizeof (collate_chain_t) * collinfo.chain_count;
1254 if (wr_category(chain, sz, f) < 0) {
1255 return;
1257 sz = sizeof (collate_large_t) * collinfo.large_count;
1258 if (wr_category(large, sz, f) < 0) {
1259 return;
1262 close_category(f);