2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright 2015 John Marino <draco@marino.st>
5 * This source code is derived from the illumos localedef command, and
6 * provided under BSD-style license terms by Nexenta Systems, Inc.
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in the
16 * documentation and/or other materials provided with the distribution.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
32 * LC_COLLATE database generation routines for localedef.
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
37 #include <sys/types.h>
48 #include "localedef.h"
55 * It will be extremely helpful to the reader if they have access to
56 * the localedef and locale file format specifications available.
57 * Latest versions of these are available from www.opengroup.org.
59 * The design for the collation code is a bit complex. The goal is a
60 * single collation database as described in collate.h (in
61 * libc/port/locale). However, there are some other tidbits:
63 * a) The substitution entries are now a directly indexable array. A
64 * priority elsewhere in the table is taken as an index into the
65 * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
66 * set. (The bit is cleared and the result is the index into the
69 * b) We eliminate duplicate entries into the substitution table.
70 * This saves a lot of space.
72 * c) The priorities for each level are "compressed", so that each
73 * sorting level has consecutively numbered priorities starting at 1.
74 * (O is reserved for the ignore priority.) This means sort levels
75 * which only have a few distinct priorities can represent the
76 * priority level in fewer bits, which makes the strxfrm output
79 * d) We record the total number of priorities so that strxfrm can
80 * figure out how many bytes to expand a numeric priority into.
82 * e) For the UNDEFINED pass (the last pass), we record the maximum
83 * number of bits needed to uniquely prioritize these entries, so that
84 * the last pass can also use smaller strxfrm output when possible.
86 * f) Priorities with the sign bit set are verboten. This works out
87 * because no active character set needs that bit to carry significant
88 * information once the character is in wide form.
90 * To process the entire data to make the database, we actually run
91 * multiple passes over the data.
93 * The first pass, which is done at parse time, identifies elements,
94 * substitutions, and such, and records them in priority order. As
95 * some priorities can refer to other priorities, using forward
96 * references, we use a table of references indicating whether the
97 * priority's value has been resolved, or whether it is still a
100 * The second pass walks over all the items in priority order, noting
101 * that they are used directly, and not just an indirect reference.
102 * This is done by creating a "weight" structure for the item. The
103 * weights are stashed in an RB tree sorted by relative "priority".
105 * The third pass walks over all the weight structures, in priority
106 * order, and assigns a new monotonically increasing (per sort level)
107 * weight value to them. These are the values that will actually be
108 * written to the file.
110 * The fourth pass just writes the data out.
114 * In order to resolve the priorities, we create a table of priorities.
115 * Entries in the table can be in one of three states.
117 * UNKNOWN is for newly allocated entries, and indicates that nothing
118 * is known about the priority. (For example, when new entries are created
119 * for collating-symbols, this is the value assigned for them until the
120 * collating symbol's order has been determined.
122 * RESOLVED is used for an entry where the priority indicates the final
125 * REFER is used for entries that reference other entries. Typically
126 * this is used for forward references. A collating-symbol can never
129 * The "pass" field is used during final resolution to aid in detection
130 * of referencing loops. (For example <A> depends on <B>, but <B> has its
131 * priority dependent on <A>.)
134 UNKNOWN
, /* priority is totally unknown */
135 RESOLVED
, /* priority value fully resolved */
136 REFER
/* priority is a reference (index) */
139 typedef struct weight
{
142 RB_ENTRY(weight
) entry
;
145 typedef struct priority
{
152 #define NUM_WT collinfo.directive_count
155 * These are the abstract collating symbols, which are just a symbolic
156 * way to reference a priority.
161 RB_ENTRY(collsym
) entry
;
165 * These are also abstract collating symbols, but we allow them to have
166 * different priorities at different levels.
168 typedef struct collundef
{
170 int32_t ref
[COLL_WEIGHTS_MAX
];
171 RB_ENTRY(collundef
) entry
;
175 * These are called "chains" in libc. This records the fact that two
176 * more characters should be treated as a single collating entity when
177 * they appear together. For example, in Spanish <C><h> gets collated
178 * as a character between <C> and <D>.
183 int32_t ref
[COLL_WEIGHTS_MAX
];
184 RB_ENTRY(collelem
) rb_bysymbol
;
185 RB_ENTRY(collelem
) rb_byexpand
;
189 * Individual characters have a sequence of weights as well.
191 typedef struct collchar
{
193 int32_t ref
[COLL_WEIGHTS_MAX
];
194 RB_ENTRY(collchar
) entry
;
198 * Substitution entries. The key is itself a priority. Note that
199 * when we create one of these, we *automatically* wind up with a
200 * fully resolved priority for the key, because creation of
201 * substitutions creates a resolved priority at the same time.
203 typedef struct subst
{
205 int32_t ref
[COLLATE_STR_LEN
];
206 RB_ENTRY(subst
) entry
;
207 RB_ENTRY(subst
) entry_ref
;
210 static RB_HEAD(collsyms
, collsym
) collsyms
;
211 static RB_HEAD(collundefs
, collundef
) collundefs
;
212 static RB_HEAD(elem_by_symbol
, collelem
) elem_by_symbol
;
213 static RB_HEAD(elem_by_expand
, collelem
) elem_by_expand
;
214 static RB_HEAD(collchars
, collchar
) collchars
;
215 static RB_HEAD(substs
, subst
) substs
[COLL_WEIGHTS_MAX
];
216 static RB_HEAD(substs_ref
, subst
) substs_ref
[COLL_WEIGHTS_MAX
];
217 static RB_HEAD(weights
, weight
) weights
[COLL_WEIGHTS_MAX
];
218 static int32_t nweight
[COLL_WEIGHTS_MAX
];
221 * This is state tracking for the ellipsis token. Note that we start
222 * the initial values so that the ellipsis logic will think we got a
223 * magic starting value of NUL. It starts at minus one because the
224 * starting point is exclusive -- i.e. the starting point is not
225 * itself handled by the ellipsis code.
227 static int currorder
= EOF
;
228 static int lastorder
= EOF
;
229 static collelem_t
*currelem
;
230 static collchar_t
*currchar
;
231 static collundef_t
*currundef
;
232 static wchar_t ellipsis_start
= 0;
233 static int32_t ellipsis_weights
[COLL_WEIGHTS_MAX
];
236 * We keep a running tally of weights.
238 static int nextpri
= 1;
239 static int nextsubst
[COLL_WEIGHTS_MAX
] = { 0 };
242 * This array collects up the weights for each level.
244 static int32_t order_weights
[COLL_WEIGHTS_MAX
];
245 static int curr_weight
= 0;
246 static int32_t subst_weights
[COLLATE_STR_LEN
];
247 static int curr_subst
= 0;
250 * Some initial priority values.
252 static int32_t pri_undefined
[COLL_WEIGHTS_MAX
];
253 static int32_t pri_ignore
;
255 static collate_info_t collinfo
;
257 static collpri_t
*prilist
= NULL
;
258 static int numpri
= 0;
259 static int maxpri
= 0;
261 static void start_order(int);
268 if (numpri
>= maxpri
) {
269 maxpri
= maxpri
? maxpri
* 2 : 1024;
270 prilist
= realloc(prilist
, sizeof (collpri_t
) * maxpri
);
271 if (prilist
== NULL
) {
272 fprintf(stderr
,"out of memory");
275 for (i
= numpri
; i
< maxpri
; i
++) {
276 prilist
[i
].res
= UNKNOWN
;
287 if ((ref
< 0) || (ref
> numpri
)) {
291 return (&prilist
[ref
]);
295 set_pri(int32_t ref
, int32_t v
, res_t res
)
301 if ((res
== REFER
) && ((v
< 0) || (v
>= numpri
))) {
305 /* Resolve self references */
306 if ((res
== REFER
) && (ref
== v
)) {
311 if (pri
->res
!= UNKNOWN
) {
312 warn("repeated item in order list (first on %d)",
316 pri
->lineno
= lineno
;
322 resolve_pri(int32_t ref
)
325 static int32_t pass
= 0;
329 while (pri
->res
== REFER
) {
330 if (pri
->pass
== pass
) {
331 /* report a line with the circular symbol */
332 lineno
= pri
->lineno
;
333 fprintf(stderr
,"circular reference in order list");
336 if ((pri
->pri
< 0) || (pri
->pri
>= numpri
)) {
341 pri
= &prilist
[pri
->pri
];
344 if (pri
->res
== UNKNOWN
) {
347 if (pri
->res
!= RESOLVED
)
354 weight_compare(const void *n1
, const void *n2
)
356 int32_t k1
= ((const weight_t
*)n1
)->pri
;
357 int32_t k2
= ((const weight_t
*)n2
)->pri
;
359 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
362 RB_GENERATE_STATIC(weights
, weight
, entry
, weight_compare
);
365 collsym_compare(const void *n1
, const void *n2
)
367 const collsym_t
*c1
= n1
;
368 const collsym_t
*c2
= n2
;
371 rv
= strcmp(c1
->name
, c2
->name
);
372 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
375 RB_GENERATE_STATIC(collsyms
, collsym
, entry
, collsym_compare
);
378 collundef_compare(const void *n1
, const void *n2
)
380 const collundef_t
*c1
= n1
;
381 const collundef_t
*c2
= n2
;
384 rv
= strcmp(c1
->name
, c2
->name
);
385 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
388 RB_GENERATE_STATIC(collundefs
, collundef
, entry
, collundef_compare
);
391 element_compare_symbol(const void *n1
, const void *n2
)
393 const collelem_t
*c1
= n1
;
394 const collelem_t
*c2
= n2
;
397 rv
= strcmp(c1
->symbol
, c2
->symbol
);
398 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
401 RB_GENERATE_STATIC(elem_by_symbol
, collelem
, rb_bysymbol
, element_compare_symbol
);
404 element_compare_expand(const void *n1
, const void *n2
)
406 const collelem_t
*c1
= n1
;
407 const collelem_t
*c2
= n2
;
410 rv
= wcscmp(c1
->expand
, c2
->expand
);
411 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
414 RB_GENERATE_STATIC(elem_by_expand
, collelem
, rb_byexpand
, element_compare_expand
);
417 collchar_compare(const void *n1
, const void *n2
)
419 wchar_t k1
= ((const collchar_t
*)n1
)->wc
;
420 wchar_t k2
= ((const collchar_t
*)n2
)->wc
;
422 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
425 RB_GENERATE_STATIC(collchars
, collchar
, entry
, collchar_compare
);
428 subst_compare(const void *n1
, const void *n2
)
430 int32_t k1
= ((const subst_t
*)n1
)->key
;
431 int32_t k2
= ((const subst_t
*)n2
)->key
;
433 return (k1
< k2
? -1 : k1
> k2
? 1 : 0);
436 RB_GENERATE_STATIC(substs
, subst
, entry
, subst_compare
);
439 subst_compare_ref(const void *n1
, const void *n2
)
441 const wchar_t *c1
= ((const subst_t
*)n1
)->ref
;
442 const wchar_t *c2
= ((const subst_t
*)n2
)->ref
;
446 return ((rv
< 0) ? -1 : (rv
> 0) ? 1 : 0);
449 RB_GENERATE_STATIC(substs_ref
, subst
, entry_ref
, subst_compare_ref
);
458 RB_INIT(&collundefs
);
460 RB_INIT(&elem_by_symbol
);
462 RB_INIT(&elem_by_expand
);
466 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
468 RB_INIT(&substs_ref
[i
]);
469 RB_INIT(&weights
[i
]);
473 (void) memset(&collinfo
, 0, sizeof (collinfo
));
475 /* allocate some initial priorities */
476 pri_ignore
= new_pri();
478 set_pri(pri_ignore
, 0, RESOLVED
);
480 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
481 pri_undefined
[i
] = new_pri();
483 /* we will override this later */
484 set_pri(pri_undefined
[i
], COLLATE_MAX_PRIORITY
, UNKNOWN
);
489 define_collsym(char *name
)
493 if ((sym
= calloc(sizeof (*sym
), 1)) == NULL
) {
494 fprintf(stderr
,"out of memory");
498 sym
->ref
= new_pri();
500 if (RB_FIND(collsyms
, &collsyms
, sym
) != NULL
) {
502 * This should never happen because we are only called
503 * for undefined symbols.
509 RB_INSERT(collsyms
, &collsyms
, sym
);
513 lookup_collsym(char *name
)
518 return (RB_FIND(collsyms
, &collsyms
, &srch
));
522 lookup_collelem(char *symbol
)
526 srch
.symbol
= symbol
;
527 return (RB_FIND(elem_by_symbol
, &elem_by_symbol
, &srch
));
531 get_collundef(char *name
)
538 if ((ud
= RB_FIND(collundefs
, &collundefs
, &srch
)) == NULL
) {
539 if (((ud
= calloc(sizeof (*ud
), 1)) == NULL
) ||
540 ((ud
->name
= strdup(name
)) == NULL
)) {
541 fprintf(stderr
,"out of memory");
545 for (i
= 0; i
< NUM_WT
; i
++) {
546 ud
->ref
[i
] = new_pri();
548 RB_INSERT(collundefs
, &collundefs
, ud
);
550 add_charmap_undefined(name
);
555 get_collchar(wchar_t wc
, int create
)
562 cc
= RB_FIND(collchars
, &collchars
, &srch
);
563 if ((cc
== NULL
) && create
) {
564 if ((cc
= calloc(sizeof (*cc
), 1)) == NULL
) {
565 fprintf(stderr
, "out of memory");
568 for (i
= 0; i
< NUM_WT
; i
++) {
569 cc
->ref
[i
] = new_pri();
572 RB_INSERT(collchars
, &collchars
, cc
);
578 end_order_collsym(collsym_t
*sym
)
580 start_order(T_COLLSYM
);
581 /* update the weight */
583 set_pri(sym
->ref
, nextpri
, RESOLVED
);
595 /* advance the priority/weight */
600 for (i
= 0; i
< NUM_WT
; i
++) {
601 if (((ref
= order_weights
[i
]) < 0) ||
602 ((p
= get_pri(ref
)) == NULL
) ||
604 /* unspecified weight is a self reference */
605 set_pri(currchar
->ref
[i
], pri
, RESOLVED
);
607 set_pri(currchar
->ref
[i
], ref
, REFER
);
609 order_weights
[i
] = -1;
612 /* leave a cookie trail in case next symbol is ellipsis */
613 ellipsis_start
= currchar
->wc
+ 1;
618 /* save off the weights were we can find them */
619 for (i
= 0; i
< NUM_WT
; i
++) {
620 ellipsis_weights
[i
] = order_weights
[i
];
621 order_weights
[i
] = -1;
626 if (currelem
== NULL
) {
629 for (i
= 0; i
< NUM_WT
; i
++) {
631 if (((ref
= order_weights
[i
]) < 0) ||
632 ((p
= get_pri(ref
)) == NULL
) ||
634 set_pri(currelem
->ref
[i
], pri
,
637 set_pri(currelem
->ref
[i
], ref
, REFER
);
639 order_weights
[i
] = -1;
645 for (i
= 0; i
< NUM_WT
; i
++) {
646 if (((ref
= order_weights
[i
]) < 0) ||
647 ((p
= get_pri(ref
)) == NULL
) ||
649 set_pri(pri_undefined
[i
], -1, RESOLVED
);
651 set_pri(pri_undefined
[i
], ref
, REFER
);
653 order_weights
[i
] = -1;
658 for (i
= 0; i
< NUM_WT
; i
++) {
659 if (((ref
= order_weights
[i
]) < 0) ||
660 ((p
= get_pri(ref
)) == NULL
) ||
662 set_pri(currundef
->ref
[i
], pri
, RESOLVED
);
664 set_pri(currundef
->ref
[i
], ref
, REFER
);
666 order_weights
[i
] = -1;
678 start_order(int type
)
682 lastorder
= currorder
;
685 /* this is used to protect ELLIPSIS processing */
686 if ((lastorder
== T_ELLIPSIS
) && (type
!= T_CHAR
)) {
687 fprintf(stderr
, "character value expected");
690 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
691 order_weights
[i
] = -1;
697 start_order_undefined(void)
699 start_order(T_UNDEFINED
);
703 start_order_symbol(char *name
)
705 currundef
= get_collundef(name
);
706 start_order(T_SYMBOL
);
710 start_order_char(wchar_t wc
)
718 * If we last saw an ellipsis, then we need to close the range.
719 * Handle that here. Note that we have to be careful because the
720 * items *inside* the range are treated exclusiveley to the items
721 * outside of the range. The ends of the range can have quite
722 * different weights than the range members.
724 if (lastorder
== T_ELLIPSIS
) {
727 if (wc
< ellipsis_start
) {
728 fprintf(stderr
, "malformed range!");
731 while (ellipsis_start
< wc
) {
733 * pick all of the saved weights for the
734 * ellipsis. note that -1 encodes for the
735 * ellipsis itself, which means to take the
736 * current relative priority.
738 if ((cc
= get_collchar(ellipsis_start
, 1)) == NULL
) {
742 for (i
= 0; i
< NUM_WT
; i
++) {
744 if (((ref
= ellipsis_weights
[i
]) == -1) ||
745 ((p
= get_pri(ref
)) == NULL
) ||
747 set_pri(cc
->ref
[i
], nextpri
, RESOLVED
);
749 set_pri(cc
->ref
[i
], ref
, REFER
);
751 ellipsis_weights
[i
] = 0;
758 currchar
= get_collchar(wc
, 1);
762 start_order_collelem(collelem_t
*e
)
764 start_order(T_COLLELEM
);
769 start_order_ellipsis(void)
773 start_order(T_ELLIPSIS
);
775 if (lastorder
!= T_CHAR
) {
776 fprintf(stderr
, "illegal starting point for range");
780 for (i
= 0; i
< NUM_WT
; i
++) {
781 ellipsis_weights
[i
] = order_weights
[i
];
786 define_collelem(char *name
, wchar_t *wcs
)
791 if (wcslen(wcs
) >= COLLATE_STR_LEN
) {
792 fprintf(stderr
,"expanded collation element too long");
796 if ((e
= calloc(sizeof (*e
), 1)) == NULL
) {
797 fprintf(stderr
, "out of memory");
804 * This is executed before the order statement, so we don't
805 * know how many priorities we *really* need. We allocate one
806 * for each possible weight. Not a big deal, as collating-elements
807 * prove to be quite rare.
809 for (i
= 0; i
< COLL_WEIGHTS_MAX
; i
++) {
810 e
->ref
[i
] = new_pri();
813 /* A character sequence can only reduce to one element. */
814 if ((RB_FIND(elem_by_symbol
, &elem_by_symbol
, e
) != NULL
) ||
815 (RB_FIND(elem_by_expand
, &elem_by_expand
, e
) != NULL
)) {
816 fprintf(stderr
, "duplicate collating element definition");
820 RB_INSERT(elem_by_symbol
, &elem_by_symbol
, e
);
821 RB_INSERT(elem_by_expand
, &elem_by_expand
, e
);
825 add_order_bit(int kw
)
827 uint8_t bit
= DIRECTIVE_UNDEF
;
831 bit
= DIRECTIVE_FORWARD
;
834 bit
= DIRECTIVE_BACKWARD
;
837 bit
= DIRECTIVE_POSITION
;
843 collinfo
.directive
[collinfo
.directive_count
] |= bit
;
847 add_order_directive(void)
849 if (collinfo
.directive_count
>= COLL_WEIGHTS_MAX
) {
850 fprintf(stderr
,"too many directives (max %d)", COLL_WEIGHTS_MAX
);
852 collinfo
.directive_count
++;
856 add_order_pri(int32_t ref
)
858 if (curr_weight
>= NUM_WT
) {
859 fprintf(stderr
,"too many weights (max %d)", NUM_WT
);
862 order_weights
[curr_weight
] = ref
;
867 add_order_collsym(collsym_t
*s
)
869 add_order_pri(s
->ref
);
873 add_order_char(wchar_t wc
)
877 if ((cc
= get_collchar(wc
, 1)) == NULL
) {
882 add_order_pri(cc
->ref
[curr_weight
]);
886 add_order_collelem(collelem_t
*e
)
888 add_order_pri(e
->ref
[curr_weight
]);
892 add_order_ignore(void)
894 add_order_pri(pri_ignore
);
898 add_order_symbol(char *sym
)
901 if ((c
= get_collundef(sym
)) == NULL
) {
905 add_order_pri(c
->ref
[curr_weight
]);
909 add_order_ellipsis(void)
911 /* special NULL value indicates self reference */
916 add_order_subst(void)
922 (void) memset(&srch
, 0, sizeof (srch
));
923 for (i
= 0; i
< curr_subst
; i
++) {
924 srch
.ref
[i
] = subst_weights
[i
];
925 subst_weights
[i
] = 0;
927 s
= RB_FIND(substs_ref
, &substs_ref
[curr_weight
], &srch
);
930 if ((s
= calloc(sizeof (*s
), 1)) == NULL
) {
931 fprintf(stderr
,"out of memory");
937 * We use a self reference for our key, but we set a
938 * high bit to indicate that this is a substitution
939 * reference. This will expedite table lookups later,
940 * and prevent table lookups for situations that don't
941 * require it. (In short, its a big win, because we
942 * can skip a lot of binary searching.)
945 (nextsubst
[curr_weight
] | COLLATE_SUBST_PRIORITY
),
947 nextsubst
[curr_weight
] += 1;
949 for (i
= 0; i
< curr_subst
; i
++) {
950 s
->ref
[i
] = srch
.ref
[i
];
953 RB_INSERT(substs_ref
, &substs_ref
[curr_weight
], s
);
955 if (RB_FIND(substs
, &substs
[curr_weight
], s
) != NULL
) {
959 RB_INSERT(substs
, &substs
[curr_weight
], s
);
965 * We are using the current (unique) priority as a search key
966 * in the substitution table.
968 add_order_pri(s
->key
);
972 add_subst_pri(int32_t ref
)
974 if (curr_subst
>= COLLATE_STR_LEN
) {
975 fprintf(stderr
,"substitution string is too long");
978 subst_weights
[curr_subst
] = ref
;
983 add_subst_char(wchar_t wc
)
988 if (((cc
= get_collchar(wc
, 1)) == NULL
) ||
993 /* we take the weight for the character at that position */
994 add_subst_pri(cc
->ref
[curr_weight
]);
998 add_subst_collelem(collelem_t
*e
)
1000 add_subst_pri(e
->ref
[curr_weight
]);
1004 add_subst_collsym(collsym_t
*s
)
1006 add_subst_pri(s
->ref
);
1010 add_subst_symbol(char *ptr
)
1014 if ((cu
= get_collundef(ptr
)) != NULL
) {
1015 add_subst_pri(cu
->ref
[curr_weight
]);
1020 add_weight(int32_t ref
, int pass
)
1025 srch
.pri
= resolve_pri(ref
);
1027 /* No translation of ignores */
1031 /* Substitution priorities are not weights */
1032 if (srch
.pri
& COLLATE_SUBST_PRIORITY
)
1035 if (RB_FIND(weights
, &weights
[pass
], &srch
) != NULL
)
1038 if ((w
= calloc(sizeof (*w
), 1)) == NULL
) {
1039 fprintf(stderr
, "out of memory");
1043 RB_INSERT(weights
, &weights
[pass
], w
);
1047 add_weights(int32_t *refs
)
1050 for (i
= 0; i
< NUM_WT
; i
++) {
1051 add_weight(refs
[i
], i
);
1056 get_weight(int32_t ref
, int pass
)
1062 pri
= resolve_pri(ref
);
1063 if (pri
& COLLATE_SUBST_PRIORITY
) {
1070 if ((w
= RB_FIND(weights
, &weights
[pass
], &srch
)) == NULL
) {
1078 wsncpy(wchar_t *s1
, const wchar_t *s2
, size_t n
)
1083 while (--n
> 0 && (*s1
++ = *s2
++) != 0)
1091 #define RB_COUNT(x, name, head, cnt) do { \
1093 RB_FOREACH(x, name, (head)) { \
1098 #define RB_NUMNODES(type, name, head, cnt) do { \
1101 RB_FOREACH(t, name, head) { \
1116 char vers
[COLLATE_STR_LEN
];
1117 collate_char_t chars
[UCHAR_MAX
+ 1];
1118 collate_large_t
*large
;
1119 collate_subst_t
*subst
[COLL_WEIGHTS_MAX
];
1120 collate_chain_t
*chain
;
1123 * We have to run through a preliminary pass to identify all the
1124 * weights that we use for each sorting level.
1126 for (i
= 0; i
< NUM_WT
; i
++) {
1127 add_weight(pri_ignore
, i
);
1129 for (i
= 0; i
< NUM_WT
; i
++) {
1130 RB_FOREACH(sb
, substs
, &substs
[i
]) {
1131 for (j
= 0; sb
->ref
[j
]; j
++) {
1132 add_weight(sb
->ref
[j
], i
);
1136 RB_FOREACH(ce
, elem_by_expand
, &elem_by_expand
) {
1137 add_weights(ce
->ref
);
1139 RB_FOREACH(cc
, collchars
, &collchars
) {
1140 add_weights(cc
->ref
);
1144 * Now we walk the entire set of weights, removing the gaps
1145 * in the weights. This gives us optimum usage. The walk
1146 * occurs in priority.
1148 for (i
= 0; i
< NUM_WT
; i
++) {
1150 RB_FOREACH(w
, weights
, &weights
[i
]) {
1151 w
->opt
= nweight
[i
];
1156 (void) memset(&chars
, 0, sizeof (chars
));
1157 (void) memset(vers
, 0, COLLATE_STR_LEN
);
1158 (void) strlcpy(vers
, COLLATE_VERSION
, sizeof (vers
));
1161 * We need to make sure we arrange for the UNDEFINED field
1162 * to show up. Also, set the total weight counts.
1164 for (i
= 0; i
< NUM_WT
; i
++) {
1165 if (resolve_pri(pri_undefined
[i
]) == -1) {
1166 set_pri(pri_undefined
[i
], -1, RESOLVED
);
1167 /* they collate at the end of everything else */
1168 collinfo
.undef_pri
[i
] = COLLATE_MAX_PRIORITY
;
1170 collinfo
.pri_count
[i
] = nweight
[i
];
1173 collinfo
.pri_count
[NUM_WT
] = max_wide();
1174 collinfo
.undef_pri
[NUM_WT
] = COLLATE_MAX_PRIORITY
;
1175 collinfo
.directive
[NUM_WT
] = DIRECTIVE_UNDEFINED
;
1178 * Ordinary character priorities
1180 for (i
= 0; i
<= UCHAR_MAX
; i
++) {
1181 if ((cc
= get_collchar(i
, 0)) != NULL
) {
1182 for (j
= 0; j
< NUM_WT
; j
++) {
1183 chars
[i
].pri
[j
] = get_weight(cc
->ref
[j
], j
);
1186 for (j
= 0; j
< NUM_WT
; j
++) {
1188 get_weight(pri_undefined
[j
], j
);
1191 * Per POSIX, for undefined characters, we
1192 * also have to add a last item, which is the
1195 chars
[i
].pri
[NUM_WT
] = i
;
1200 * Substitution tables
1202 for (i
= 0; i
< NUM_WT
; i
++) {
1203 collate_subst_t
*st
= NULL
;
1205 RB_COUNT(temp
, substs
, &substs
[i
], n
);
1206 collinfo
.subst_count
[i
] = n
;
1207 if ((st
= calloc(sizeof (collate_subst_t
) * n
, 1)) == NULL
) {
1208 fprintf(stderr
, "out of memory");
1212 RB_FOREACH(sb
, substs
, &substs
[i
]) {
1213 if ((st
[n
].key
= resolve_pri(sb
->key
)) < 0) {
1214 /* by definition these resolve! */
1217 if (st
[n
].key
!= (n
| COLLATE_SUBST_PRIORITY
)) {
1220 for (j
= 0; sb
->ref
[j
]; j
++) {
1221 st
[n
].pri
[j
] = get_weight(sb
->ref
[j
], i
);
1225 if (n
!= collinfo
.subst_count
[i
])
1232 * Chains, i.e. collating elements
1234 RB_NUMNODES(collelem_t
, elem_by_expand
, &elem_by_expand
,
1235 collinfo
.chain_count
);
1236 chain
= calloc(sizeof (collate_chain_t
), collinfo
.chain_count
);
1237 if (chain
== NULL
) {
1238 fprintf(stderr
, "out of memory");
1242 RB_FOREACH(ce
, elem_by_expand
, &elem_by_expand
) {
1243 (void) wsncpy(chain
[n
].str
, ce
->expand
, COLLATE_STR_LEN
);
1244 for (i
= 0; i
< NUM_WT
; i
++) {
1245 chain
[n
].pri
[i
] = get_weight(ce
->ref
[i
], i
);
1249 if (n
!= collinfo
.chain_count
)
1253 * Large (> UCHAR_MAX) character priorities
1255 RB_NUMNODES(collchar_t
, collchars
, &collchars
, n
);
1256 large
= calloc(n
, sizeof (collate_large_t
));
1257 if (large
== NULL
) {
1258 fprintf(stderr
, "out of memory");
1263 RB_FOREACH(cc
, collchars
, &collchars
) {
1265 /* we already gathered those */
1266 if (cc
->wc
<= UCHAR_MAX
)
1268 for (j
= 0; j
< NUM_WT
; j
++) {
1269 if ((pri
= get_weight(cc
->ref
[j
], j
)) < 0) {
1272 if (undef
&& (pri
>= 0)) {
1273 /* if undefined, then all priorities are */
1276 large
[i
].pri
.pri
[j
] = pri
;
1280 large
[i
].val
= cc
->wc
;
1281 collinfo
.large_count
= i
++;
1285 if ((f
= open_category()) == NULL
) {
1289 /* Time to write the entire data set out */
1291 if ((wr_category(vers
, COLLATE_STR_LEN
, f
) < 0) ||
1292 (wr_category(&collinfo
, sizeof (collinfo
), f
) < 0) ||
1293 (wr_category(&chars
, sizeof (chars
), f
) < 0)) {
1297 for (i
= 0; i
< NUM_WT
; i
++) {
1298 sz
= sizeof (collate_subst_t
) * collinfo
.subst_count
[i
];
1299 if (wr_category(subst
[i
], sz
, f
) < 0) {
1303 sz
= sizeof (collate_chain_t
) * collinfo
.chain_count
;
1304 if (wr_category(chain
, sz
, f
) < 0) {
1307 sz
= sizeof (collate_large_t
) * collinfo
.large_count
;
1308 if (wr_category(large
, sz
, f
) < 0) {