2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
17 * LC_COLLATE database generation routines for localedef.
24 #include <sys/types.h>
31 #include "localedef.h"
33 #include "collatefile.h"
38 * It will be extremely helpful to the reader if they have access to
39 * the localedef and locale file format specifications available.
40 * Latest versions of these are available from www.opengroup.org.
42 * The design for the collation code is a bit complex. The goal is a
43 * single collation database as described in collate.h (in
44 * libc/port/locale). However, there are some other tidbits:
46 * a) The substitution entries are now a directly indexable array. A
47 * priority elsewhere in the table is taken as an index into the
48 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
49 * set. (The bit is cleared and the result is the index into the
52 * b) We eliminate duplicate entries into the substitution table.
53 * This saves a lot of space.
55 * c) The priorities for each level are "compressed", so that each
56 * sorting level has consecutively numbered priorities starting at 1.
57 * (O is reserved for the ignore priority.) This means sort levels
58 * which only have a few distinct priorities can represent the
59 * priority level in fewer bits, which makes the strxfrm output
62 * d) We record the total number of priorities so that strxfrm can
63 * figure out how many bytes to expand a numeric priority into.
65 * e) For the UNDEFINED pass (the last pass), we record the maximum
66 * number of bits needed to uniquely prioritize these entries, so that
67 * the last pass can also use smaller strxfrm output when possible.
69 * f) Priorities with the sign bit set are verboten. This works out
70 * because no active character set needs that bit to carry significant
71 * information once the character is in wide form.
73 * To process the entire data to make the database, we actually run
74 * multiple passes over the data.
76 * The first pass, which is done at parse time, identifies elements,
77 * substitutions, and such, and records them in priority order. As
78 * some priorities can refer to other priorities, using forward
79 * references, we use a table of references indicating whether the
80 * priority's value has been resolved, or whether it is still a
83 * The second pass walks over all the items in priority order, noting
84 * that they are used directly, and not just an indirect reference.
85 * This is done by creating a "weight" structure for the item. The
86 * weights are stashed in an AVL tree sorted by relative "priority".
88 * The third pass walks over all the weight structures, in priority
89 * order, and assigns a new monotonically increasing (per sort level)
90 * weight value to them. These are the values that will actually be
91 * written to the file.
93 * The fourth pass just writes the data out.
97 * In order to resolve the priorities, we create a table of priorities.
98 * Entries in the table can be in one of three states.
100 * UNKNOWN is for newly allocated entries, and indicates that nothing
101 * is known about the priority. (For example, when new entries are created
102 * for collating-symbols, this is the value assigned for them until the
103 * collating symbol's order has been determined.
105 * RESOLVED is used for an entry where the priority indicates the final
108 * REFER is used for entries that reference other entries. Typically
109 * this is used for forward references. A collating-symbol can never
112 * The "pass" field is used during final resolution to aid in detection
113 * of referencing loops. (For example <A> depends on <B>, but <B> has its
114 * priority dependent on <A>.)
117 UNKNOWN
, /* priority is totally unknown */
118 RESOLVED
, /* priority value fully resolved */
119 REFER
/* priority is a reference (index) */
122 typedef struct weight
{
128 typedef struct priority
{
135 #define NUM_WT collinfo.directive_count
138 * These are the abstract collating symbols, which are just a symbolic
139 * way to reference a priority.
148 * These are also abstract collating symbols, but we allow them to have
149 * different priorities at different levels.
151 typedef struct collundef
{
153 int32_t ref
[COLL_WEIGHTS_MAX
];
158 * These are called "chains" in libc. This records the fact that two
159 * more characters should be treated as a single collating entity when
160 * they appear together. For example, in Spanish <C><h> gets collated
161 * as a character between <C> and <D>.
166 int32_t ref
[COLL_WEIGHTS_MAX
];
167 avl_node_t avl_bysymbol
;
168 avl_node_t avl_byexpand
;
172 * Individual characters have a sequence of weights as well.
174 typedef struct collchar
{
176 int32_t ref
[COLL_WEIGHTS_MAX
];
181 * Substitution entries. The key is itself a priority. Note that
182 * when we create one of these, we *automatically* wind up with a
183 * fully resolved priority for the key, because creation of
184 * substitutions creates a resolved priority at the same time.
188 int32_t ref
[COLLATE_STR_LEN
];
193 static avl_tree_t collsyms
;
194 static avl_tree_t collundefs
;
195 static avl_tree_t elem_by_symbol
;
196 static avl_tree_t elem_by_expand
;
197 static avl_tree_t collchars
;
198 static avl_tree_t substs
[COLL_WEIGHTS_MAX
];
199 static avl_tree_t substs_ref
[COLL_WEIGHTS_MAX
];
200 static avl_tree_t weights
[COLL_WEIGHTS_MAX
];
201 static int32_t nweight
[COLL_WEIGHTS_MAX
];
204 * This is state tracking for the ellipsis token. Note that we start
205 * the initial values so that the ellipsis logic will think we got a
206 * magic starting value of NUL. It starts at minus one because the
207 * starting point is exclusive -- i.e. the starting point is not
208 * itself handled by the ellipsis code.
210 static int currorder
= EOF
;
211 static int lastorder
= EOF
;
212 static collelem_t
*currelem
;
213 static collchar_t
*currchar
;
214 static collundef_t
*currundef
;
215 static wchar_t ellipsis_start
= 0;
216 static int32_t ellipsis_weights
[COLL_WEIGHTS_MAX
];
219 * We keep a running tally of weights.
221 static int nextpri
= 1;
222 static int nextsubst
[COLL_WEIGHTS_MAX
] = { 0 };
225 * This array collects up the weights for each level.
227 static int32_t order_weights
[COLL_WEIGHTS_MAX
];
228 static int curr_weight
= 0;
229 static int32_t subst_weights
[COLLATE_STR_LEN
];
230 static int curr_subst
= 0;
233 * Some initial priority values.
235 static int32_t pri_undefined
[COLL_WEIGHTS_MAX
];
236 static int32_t pri_ignore
;
238 static collate_info_t collinfo
;
240 static collpri_t
*prilist
= NULL
;
241 static int numpri
= 0;
242 static int maxpri
= 0;
244 static void start_order(int);
251 if (numpri
>= maxpri
) {
252 maxpri
= maxpri
? maxpri
* 2 : 1024;
253 prilist
= reallocarray(prilist
, maxpri
, sizeof (collpri_t
));
254 if (prilist
== NULL
) {
255 errf(_("out of memory"));
258 for (i
= numpri
; i
< maxpri
; i
++) {
259 prilist
[i
].res
= UNKNOWN
;
270 if ((ref
< 0) || (ref
> numpri
)) {
274 return (&prilist
[ref
]);
278 set_pri(int32_t ref
, int32_t v
, res_t res
)
284 if ((res
== REFER
) && ((v
< 0) || (v
>= numpri
))) {
288 /* Resolve self references */
289 if ((res
== REFER
) && (ref
== v
)) {
294 if (pri
->res
!= UNKNOWN
) {
295 warn(_("repeated item in order list (first on %d)"),
299 pri
->lineno
= lineno
;
305 resolve_pri(int32_t ref
)
308 static int32_t pass
= 0;
312 while (pri
->res
== REFER
) {
313 if (pri
->pass
== pass
) {
314 /* report a line with the circular symbol */
315 lineno
= pri
->lineno
;
316 errf(_("circular reference in order list"));
319 if ((pri
->pri
< 0) || (pri
->pri
>= numpri
)) {
324 pri
= &prilist
[pri
->pri
];
327 if (pri
->res
== UNKNOWN
) {
330 if (pri
->res
!= RESOLVED
)
337 weight_compare(const void *n1
, const void *n2
)
339 int32_t k1
= ((const weight_t
*)n1
)->pri
;
340 int32_t k2
= ((const weight_t
*)n2
)->pri
;
342 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
346 collsym_compare(const void *n1
, const void *n2
)
348 const collsym_t
*c1
= n1
;
349 const collsym_t
*c2
= n2
;
352 rv
= strcmp(c1
->name
, c2
->name
);
353 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
357 collundef_compare(const void *n1
, const void *n2
)
359 const collundef_t
*c1
= n1
;
360 const collundef_t
*c2
= n2
;
363 rv
= strcmp(c1
->name
, c2
->name
);
364 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
368 element_compare_symbol(const void *n1
, const void *n2
)
370 const collelem_t
*c1
= n1
;
371 const collelem_t
*c2
= n2
;
374 rv
= strcmp(c1
->symbol
, c2
->symbol
);
375 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
379 element_compare_expand(const void *n1
, const void *n2
)
381 const collelem_t
*c1
= n1
;
382 const collelem_t
*c2
= n2
;
385 rv
= wcscmp(c1
->expand
, c2
->expand
);
386 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
390 collchar_compare(const void *n1
, const void *n2
)
392 wchar_t k1
= ((const collchar_t
*)n1
)->wc
;
393 wchar_t k2
= ((const collchar_t
*)n2
)->wc
;
395 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
399 subst_compare(const void *n1
, const void *n2
)
401 int32_t k1
= ((const subst_t
*)n1
)->key
;
402 int32_t k2
= ((const subst_t
*)n2
)->key
;
404 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
408 subst_compare_ref(const void *n1
, const void *n2
)
410 int32_t *c1
= ((subst_t
*)n1
)->ref
;
411 int32_t *c2
= ((subst_t
*)n2
)->ref
;
414 rv
= wcscmp((wchar_t *)c1
, (wchar_t *)c2
);
415 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
423 avl_create(&collsyms
, collsym_compare
, sizeof (collsym_t
),
424 offsetof(collsym_t
, avl
));
426 avl_create(&collundefs
, collundef_compare
, sizeof (collsym_t
),
427 offsetof(collundef_t
, avl
));
429 avl_create(&elem_by_symbol
, element_compare_symbol
, sizeof (collelem_t
),
430 offsetof(collelem_t
, avl_bysymbol
));
431 avl_create(&elem_by_expand
, element_compare_expand
, sizeof (collelem_t
),
432 offsetof(collelem_t
, avl_byexpand
));
434 avl_create(&collchars
, collchar_compare
, sizeof (collchar_t
),
435 offsetof(collchar_t
, avl
));
437 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
438 avl_create(&substs
[i
], subst_compare
, sizeof (subst_t
),
439 offsetof(subst_t
, avl
));
440 avl_create(&substs_ref
[i
], subst_compare_ref
,
441 sizeof (subst_t
), offsetof(subst_t
, avl_ref
));
442 avl_create(&weights
[i
], weight_compare
, sizeof (weight_t
),
443 offsetof(weight_t
, avl
));
447 (void) memset(&collinfo
, 0, sizeof (collinfo
));
449 /* allocate some initial priorities */
450 pri_ignore
= new_pri();
452 set_pri(pri_ignore
, 0, RESOLVED
);
454 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
455 pri_undefined
[i
] = new_pri();
457 /* we will override this later */
458 set_pri(pri_undefined
[i
], COLLATE_MAX_PRIORITY
, UNKNOWN
);
463 define_collsym(char *name
)
468 if ((sym
= calloc(sizeof (*sym
), 1)) == NULL
) {
469 errf(_("out of memory"));
473 sym
->ref
= new_pri();
475 if (avl_find(&collsyms
, sym
, &where
) != NULL
) {
477 * This should never happen because we are only called
478 * for undefined symbols.
483 avl_insert(&collsyms
, sym
, where
);
487 lookup_collsym(char *name
)
492 return (avl_find(&collsyms
, &srch
, NULL
));
496 lookup_collelem(char *symbol
)
500 srch
.symbol
= symbol
;
501 return (avl_find(&elem_by_symbol
, &srch
, NULL
));
505 get_collundef(char *name
)
513 if ((ud
= avl_find(&collundefs
, &srch
, &where
)) == NULL
) {
514 if (((ud
= calloc(sizeof (*ud
), 1)) == NULL
) ||
515 ((ud
->name
= strdup(name
)) == NULL
)) {
516 errf(_("out of memory"));
519 for (i
= 0; i
< NUM_WT
; i
++) {
520 ud
->ref
[i
] = new_pri();
522 avl_insert(&collundefs
, ud
, where
);
524 add_charmap_undefined(name
);
529 get_collchar(wchar_t wc
, int create
)
537 cc
= avl_find(&collchars
, &srch
, &where
);
538 if ((cc
== NULL
) && create
) {
539 if ((cc
= calloc(sizeof (*cc
), 1)) == NULL
) {
540 errf(_("out of memory"));
543 for (i
= 0; i
< NUM_WT
; i
++) {
544 cc
->ref
[i
] = new_pri();
547 avl_insert(&collchars
, cc
, where
);
553 end_order_collsym(collsym_t
*sym
)
555 start_order(T_COLLSYM
);
556 /* update the weight */
558 set_pri(sym
->ref
, nextpri
, RESOLVED
);
570 /* advance the priority/weight */
575 for (i
= 0; i
< NUM_WT
; i
++) {
576 if (((ref
= order_weights
[i
]) < 0) ||
577 ((p
= get_pri(ref
)) == NULL
) ||
579 /* unspecified weight is a self reference */
580 set_pri(currchar
->ref
[i
], pri
, RESOLVED
);
582 set_pri(currchar
->ref
[i
], ref
, REFER
);
584 order_weights
[i
] = -1;
587 /* leave a cookie trail in case next symbol is ellipsis */
588 ellipsis_start
= currchar
->wc
+ 1;
593 /* save off the weights were we can find them */
594 for (i
= 0; i
< NUM_WT
; i
++) {
595 ellipsis_weights
[i
] = order_weights
[i
];
596 order_weights
[i
] = -1;
601 if (currelem
== NULL
) {
604 for (i
= 0; i
< NUM_WT
; i
++) {
606 if (((ref
= order_weights
[i
]) < 0) ||
607 ((p
= get_pri(ref
)) == NULL
) ||
609 set_pri(currelem
->ref
[i
], pri
,
612 set_pri(currelem
->ref
[i
], ref
, REFER
);
614 order_weights
[i
] = -1;
620 for (i
= 0; i
< NUM_WT
; i
++) {
621 if (((ref
= order_weights
[i
]) < 0) ||
622 ((p
= get_pri(ref
)) == NULL
) ||
624 set_pri(pri_undefined
[i
], -1, RESOLVED
);
626 set_pri(pri_undefined
[i
], ref
, REFER
);
628 order_weights
[i
] = -1;
633 for (i
= 0; i
< NUM_WT
; i
++) {
634 if (((ref
= order_weights
[i
]) < 0) ||
635 ((p
= get_pri(ref
)) == NULL
) ||
637 set_pri(currundef
->ref
[i
], pri
, RESOLVED
);
639 set_pri(currundef
->ref
[i
], ref
, REFER
);
641 order_weights
[i
] = -1;
653 start_order(int type
)
657 lastorder
= currorder
;
660 /* this is used to protect ELLIPSIS processing */
661 if ((lastorder
== T_ELLIPSIS
) && (type
!= T_CHAR
)) {
662 errf(_("character value expected"));
665 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
666 order_weights
[i
] = -1;
672 start_order_undefined(void)
674 start_order(T_UNDEFINED
);
678 start_order_symbol(char *name
)
680 currundef
= get_collundef(name
);
681 start_order(T_SYMBOL
);
685 start_order_char(wchar_t wc
)
693 * If we last saw an ellipsis, then we need to close the range.
694 * Handle that here. Note that we have to be careful because the
695 * items *inside* the range are treated exclusiveley to the items
696 * outside of the range. The ends of the range can have quite
697 * different weights than the range members.
699 if (lastorder
== T_ELLIPSIS
) {
702 if (wc
< ellipsis_start
) {
703 errf(_("malformed range!"));
706 while (ellipsis_start
< wc
) {
708 * pick all of the saved weights for the
709 * ellipsis. note that -1 encodes for the
710 * ellipsis itself, which means to take the
711 * current relative priority.
713 if ((cc
= get_collchar(ellipsis_start
, 1)) == NULL
) {
717 for (i
= 0; i
< NUM_WT
; i
++) {
719 if (((ref
= ellipsis_weights
[i
]) == -1) ||
720 ((p
= get_pri(ref
)) == NULL
) ||
722 set_pri(cc
->ref
[i
], nextpri
, RESOLVED
);
724 set_pri(cc
->ref
[i
], ref
, REFER
);
726 ellipsis_weights
[i
] = 0;
733 currchar
= get_collchar(wc
, 1);
737 start_order_collelem(collelem_t
*e
)
739 start_order(T_COLLELEM
);
744 start_order_ellipsis(void)
748 start_order(T_ELLIPSIS
);
750 if (lastorder
!= T_CHAR
) {
751 errf(_("illegal starting point for range"));
755 for (i
= 0; i
< NUM_WT
; i
++) {
756 ellipsis_weights
[i
] = order_weights
[i
];
761 define_collelem(char *name
, wchar_t *wcs
)
768 if (wcslen(wcs
) >= COLLATE_STR_LEN
) {
769 errf(_("expanded collation element too long"));
773 if ((e
= calloc(sizeof (*e
), 1)) == NULL
) {
774 errf(_("out of memory"));
781 * This is executed before the order statement, so we don't
782 * know how many priorities we *really* need. We allocate one
783 * for each possible weight. Not a big deal, as collating-elements
784 * prove to be quite rare.
786 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
787 e
->ref
[i
] = new_pri();
790 /* A character sequence can only reduce to one element. */
791 if ((avl_find(&elem_by_symbol
, e
, &where1
) != NULL
) ||
792 (avl_find(&elem_by_expand
, e
, &where2
) != NULL
)) {
793 errf(_("duplicate collating element definition"));
796 avl_insert(&elem_by_symbol
, e
, where1
);
797 avl_insert(&elem_by_expand
, e
, where2
);
801 add_order_bit(int kw
)
803 uint8_t bit
= DIRECTIVE_UNDEF
;
807 bit
= DIRECTIVE_FORWARD
;
810 bit
= DIRECTIVE_BACKWARD
;
813 bit
= DIRECTIVE_POSITION
;
819 collinfo
.directive
[collinfo
.directive_count
] |= bit
;
823 add_order_directive(void)
825 if (collinfo
.directive_count
>= COLL_WEIGHTS_MAX
) {
826 errf(_("too many directives (max %d)"), COLL_WEIGHTS_MAX
);
828 collinfo
.directive_count
++;
832 add_order_pri(int32_t ref
)
834 if (curr_weight
>= NUM_WT
) {
835 errf(_("too many weights (max %d)"), NUM_WT
);
838 order_weights
[curr_weight
] = ref
;
843 add_order_collsym(collsym_t
*s
)
845 add_order_pri(s
->ref
);
849 add_order_char(wchar_t wc
)
853 if ((cc
= get_collchar(wc
, 1)) == NULL
) {
858 add_order_pri(cc
->ref
[curr_weight
]);
862 add_order_collelem(collelem_t
*e
)
864 add_order_pri(e
->ref
[curr_weight
]);
868 add_order_ignore(void)
870 add_order_pri(pri_ignore
);
874 add_order_symbol(char *sym
)
877 if ((c
= get_collundef(sym
)) == NULL
) {
881 add_order_pri(c
->ref
[curr_weight
]);
885 add_order_ellipsis(void)
887 /* special 0 value indicates self reference */
892 add_order_subst(void)
899 (void) memset(&srch
, 0, sizeof (srch
));
900 for (i
= 0; i
< curr_subst
; i
++) {
901 srch
.ref
[i
] = subst_weights
[i
];
902 subst_weights
[i
] = 0;
904 s
= avl_find(&substs_ref
[curr_weight
], &srch
, &where
);
907 if ((s
= calloc(sizeof (*s
), 1)) == NULL
) {
908 errf(_("out of memory"));
914 * We use a self reference for our key, but we set a
915 * high bit to indicate that this is a substitution
916 * reference. This will expedite table lookups later,
917 * and prevent table lookups for situations that don't
918 * require it. (In short, its a big win, because we
919 * can skip a lot of binary searching.)
922 (nextsubst
[curr_weight
] | COLLATE_SUBST_PRIORITY
),
924 nextsubst
[curr_weight
] += 1;
926 for (i
= 0; i
< curr_subst
; i
++) {
927 s
->ref
[i
] = srch
.ref
[i
];
930 avl_insert(&substs_ref
[curr_weight
], s
, where
);
932 if (avl_find(&substs
[curr_weight
], s
, &where
) != NULL
) {
936 avl_insert(&substs
[curr_weight
], s
, where
);
942 * We are using the current (unique) priority as a search key
943 * in the substitution table.
945 add_order_pri(s
->key
);
949 add_subst_pri(int32_t ref
)
951 if (curr_subst
>= COLLATE_STR_LEN
) {
952 errf(_("substitution string is too long"));
955 subst_weights
[curr_subst
] = ref
;
960 add_subst_char(wchar_t wc
)
965 if (((cc
= get_collchar(wc
, 1)) == NULL
) ||
970 /* we take the weight for the character at that position */
971 add_subst_pri(cc
->ref
[curr_weight
]);
975 add_subst_collelem(collelem_t
*e
)
977 add_subst_pri(e
->ref
[curr_weight
]);
981 add_subst_collsym(collsym_t
*s
)
983 add_subst_pri(s
->ref
);
987 add_subst_symbol(char *ptr
)
991 if ((cu
= get_collundef(ptr
)) != NULL
) {
992 add_subst_pri(cu
->ref
[curr_weight
]);
997 add_weight(int32_t ref
, int pass
)
1003 srch
.pri
= resolve_pri(ref
);
1005 /* No translation of ignores */
1009 /* Substitution priorities are not weights */
1010 if (srch
.pri
& COLLATE_SUBST_PRIORITY
)
1013 if (avl_find(&weights
[pass
], &srch
, &where
) != NULL
)
1016 if ((w
= calloc(sizeof (*w
), 1)) == NULL
) {
1017 errf(_("out of memory"));
1021 avl_insert(&weights
[pass
], w
, where
);
1025 add_weights(int32_t *refs
)
1028 for (i
= 0; i
< NUM_WT
; i
++) {
1029 add_weight(refs
[i
], i
);
1034 get_weight(int32_t ref
, int pass
)
1040 pri
= resolve_pri(ref
);
1041 if (pri
& COLLATE_SUBST_PRIORITY
) {
1048 if ((w
= avl_find(&weights
[pass
], &srch
, NULL
)) == NULL
) {
1065 char vers
[COLLATE_STR_LEN
];
1066 collate_char_t chars
[UCHAR_MAX
+ 1];
1067 collate_large_t
*large
;
1068 collate_subst_t
*subst
[COLL_WEIGHTS_MAX
];
1069 collate_chain_t
*chain
;
1072 * We have to run throught a preliminary pass to identify all the
1073 * weights that we use for each sorting level.
1075 for (i
= 0; i
< NUM_WT
; i
++) {
1076 add_weight(pri_ignore
, i
);
1078 for (i
= 0; i
< NUM_WT
; i
++) {
1079 for (sb
= avl_first(&substs
[i
]); sb
;
1080 sb
= AVL_NEXT(&substs
[i
], sb
)) {
1081 for (j
= 0; sb
->ref
[j
]; j
++) {
1082 add_weight(sb
->ref
[j
], i
);
1086 for (ce
= avl_first(&elem_by_expand
);
1088 ce
= AVL_NEXT(&elem_by_expand
, ce
)) {
1089 add_weights(ce
->ref
);
1091 for (cc
= avl_first(&collchars
); cc
; cc
= AVL_NEXT(&collchars
, cc
)) {
1092 add_weights(cc
->ref
);
1096 * Now we walk the entire set of weights, removing the gaps
1097 * in the weights. This gives us optimum usage. The walk
1098 * occurs in priority.
1100 for (i
= 0; i
< NUM_WT
; i
++) {
1102 for (w
= avl_first(&weights
[i
]); w
;
1103 w
= AVL_NEXT(&weights
[i
], w
)) {
1104 w
->opt
= nweight
[i
];
1109 (void) memset(&chars
, 0, sizeof (chars
));
1110 (void) memset(vers
, 0, COLLATE_STR_LEN
);
1111 (void) strlcpy(vers
, COLLATE_VERSION
, sizeof (vers
));
1114 * We need to make sure we arrange for the UNDEFINED field
1115 * to show up. Also, set the total weight counts.
1117 for (i
= 0; i
< NUM_WT
; i
++) {
1118 if (resolve_pri(pri_undefined
[i
]) == -1) {
1119 set_pri(pri_undefined
[i
], -1, RESOLVED
);
1120 /* they collate at the end of everything else */
1121 collinfo
.undef_pri
[i
] = COLLATE_MAX_PRIORITY
;
1123 collinfo
.pri_count
[i
] = nweight
[i
];
1126 collinfo
.pri_count
[NUM_WT
] = max_wide();
1127 collinfo
.undef_pri
[NUM_WT
] = COLLATE_MAX_PRIORITY
;
1128 collinfo
.directive
[NUM_WT
] = DIRECTIVE_UNDEFINED
;
1131 * Ordinary character priorities
1133 for (i
= 0; i
<= UCHAR_MAX
; i
++) {
1134 if ((cc
= get_collchar(i
, 0)) != NULL
) {
1135 for (j
= 0; j
< NUM_WT
; j
++) {
1136 chars
[i
].pri
[j
] = get_weight(cc
->ref
[j
], j
);
1139 for (j
= 0; j
< NUM_WT
; j
++) {
1141 get_weight(pri_undefined
[j
], j
);
1144 * Per POSIX, for undefined characters, we
1145 * also have to add a last item, which is the
1148 chars
[i
].pri
[NUM_WT
] = i
;
1153 * Substitution tables
1155 for (i
= 0; i
< NUM_WT
; i
++) {
1156 collate_subst_t
*st
= NULL
;
1157 n
= collinfo
.subst_count
[i
] = avl_numnodes(&substs
[i
]);
1158 if ((st
= calloc(sizeof (collate_subst_t
) * n
, 1)) == NULL
) {
1159 errf(_("out of memory"));
1163 for (sb
= avl_first(&substs
[i
]); sb
;
1164 sb
= AVL_NEXT(&substs
[i
], sb
)) {
1165 if ((st
[n
].key
= resolve_pri(sb
->key
)) < 0) {
1166 /* by definition these resolve! */
1169 if (st
[n
].key
!= (n
| COLLATE_SUBST_PRIORITY
)) {
1172 for (j
= 0; sb
->ref
[j
]; j
++) {
1173 st
[n
].pri
[j
] = get_weight(sb
->ref
[j
], i
);
1177 if (n
!= collinfo
.subst_count
[i
])
1184 * Chains, i.e. collating elements
1186 collinfo
.chain_count
= avl_numnodes(&elem_by_expand
);
1187 chain
= calloc(sizeof (collate_chain_t
), collinfo
.chain_count
);
1188 if (chain
== NULL
) {
1189 errf(_("out of memory"));
1192 for (n
= 0, ce
= avl_first(&elem_by_expand
);
1194 ce
= AVL_NEXT(&elem_by_expand
, ce
), n
++) {
1195 (void) wsncpy(chain
[n
].str
, ce
->expand
, COLLATE_STR_LEN
);
1196 for (i
= 0; i
< NUM_WT
; i
++) {
1197 chain
[n
].pri
[i
] = get_weight(ce
->ref
[i
], i
);
1200 if (n
!= collinfo
.chain_count
)
1204 * Large (> UCHAR_MAX) character priorities
1206 large
= calloc(sizeof (collate_large_t
) * avl_numnodes(&collchars
), 1);
1207 if (large
== NULL
) {
1208 errf(_("out of memory"));
1213 for (cc
= avl_first(&collchars
); cc
; cc
= AVL_NEXT(&collchars
, cc
)) {
1215 /* we already gathered those */
1216 if (cc
->wc
<= UCHAR_MAX
)
1218 for (j
= 0; j
< NUM_WT
; j
++) {
1219 if ((pri
= get_weight(cc
->ref
[j
], j
)) < 0) {
1222 if (undef
&& (pri
>= 0)) {
1223 /* if undefined, then all priorities are */
1226 large
[i
].pri
.pri
[j
] = pri
;
1230 large
[i
].val
= cc
->wc
;
1231 collinfo
.large_count
= i
++;
1235 if ((f
= open_category()) == NULL
) {
1239 /* Time to write the entire data set out */
1241 if ((wr_category(vers
, COLLATE_STR_LEN
, f
) < 0) ||
1242 (wr_category(&collinfo
, sizeof (collinfo
), f
) < 0) ||
1243 (wr_category(&chars
, sizeof (chars
), f
) < 0)) {
1247 for (i
= 0; i
< NUM_WT
; i
++) {
1248 sz
= sizeof (collate_subst_t
) * collinfo
.subst_count
[i
];
1249 if (wr_category(subst
[i
], sz
, f
) < 0) {
1253 sz
= sizeof (collate_chain_t
) * collinfo
.chain_count
;
1254 if (wr_category(chain
, sz
, f
) < 0) {
1257 sz
= sizeof (collate_large_t
) * collinfo
.large_count
;
1258 if (wr_category(large
, sz
, f
) < 0) {