2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
3 * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua>
4 * at Electronni Visti IA, Kiev, Ukraine.
7 * Copyright (c) 2011 The FreeBSD Foundation
9 * Portions of this software were developed by David Chisnall
10 * under sponsorship from the FreeBSD Foundation.
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * Adapted to xlocale by John Marino <draco@marino.st>
36 #include "namespace.h"
45 #include <sys/types.h>
48 #include "un-namespace.h"
51 #include "setlocale.h"
54 struct xlocale_collate __xlocale_global_collate
= {
55 {{0}, "C"}, 1, 0, 0, 0
58 struct xlocale_collate __xlocale_C_collate
= {
59 {{0}, "C"}, 1, 0, 0, 0
62 #include "libc_private.h"
65 __collate_load_tables_l(const char *encoding
, struct xlocale_collate
*table
);
68 destruct_collate(void *t
)
70 struct xlocale_collate
*table
= t
;
71 if (table
->map
&& (table
->maplen
> 0)) {
72 (void) munmap(table
->map
, table
->maplen
);
78 __collate_load(const char *encoding
, __unused locale_t unused
)
80 if (strcmp(encoding
, "C") == 0 || strcmp(encoding
, "POSIX") == 0) {
81 return &__xlocale_C_collate
;
83 struct xlocale_collate
*table
= calloc(sizeof(struct xlocale_collate
), 1);
84 table
->header
.header
.destructor
= destruct_collate
;
85 // FIXME: Make sure that _LDP_CACHE is never returned. We should be doing
86 // the caching outside of this section
87 if (__collate_load_tables_l(encoding
, table
) != _LDP_LOADED
) {
88 xlocale_release(table
);
95 * Load the collation tables for the specified encoding into the global table.
98 __collate_load_tables(const char *encoding
)
100 int ret
= __collate_load_tables_l(encoding
, &__xlocale_global_collate
);
105 __collate_load_tables_l(const char *encoding
, struct xlocale_collate
*table
)
111 collate_info_t
*info
;
115 table
->__collate_load_error
= 1;
117 /* 'encoding' must be already checked. */
118 if (strcmp(encoding
, "C") == 0 || strcmp(encoding
, "POSIX") == 0) {
122 (void) snprintf(buf
, sizeof (buf
), "%s/%s/LC_COLLATE",
123 _PathLocale
, encoding
);
125 if ((fd
= _open(buf
, O_RDONLY
)) < 0)
127 if (_fstat(fd
, &sbuf
) < 0) {
131 if (sbuf
.st_size
< (COLLATE_STR_LEN
+ sizeof (info
))) {
136 map
= mmap(NULL
, sbuf
.st_size
, PROT_READ
, MAP_PRIVATE
, fd
, 0);
138 if ((TMP
= map
) == NULL
) {
142 if (strncmp(TMP
, COLLATE_VERSION
, COLLATE_STR_LEN
) != 0) {
143 (void) munmap(map
, sbuf
.st_size
);
147 TMP
+= COLLATE_STR_LEN
;
150 TMP
+= sizeof (*info
);
152 if ((info
->directive_count
< 1) ||
153 (info
->directive_count
>= COLL_WEIGHTS_MAX
) ||
154 ((chains
= info
->chain_count
) < 0)) {
155 (void) munmap(map
, sbuf
.st_size
);
160 i
= (sizeof (collate_char_t
) * (UCHAR_MAX
+ 1)) +
161 (sizeof (collate_chain_t
) * chains
) +
162 (sizeof (collate_large_t
) * info
->large_count
);
163 for (z
= 0; z
< info
->directive_count
; z
++) {
164 i
+= sizeof (collate_subst_t
) * info
->subst_count
[z
];
166 if (i
!= (sbuf
.st_size
- (TMP
- map
))) {
167 (void) munmap(map
, sbuf
.st_size
);
173 table
->char_pri_table
= (void *)TMP
;
174 TMP
+= sizeof (collate_char_t
) * (UCHAR_MAX
+ 1);
176 for (z
= 0; z
< info
->directive_count
; z
++) {
177 if (info
->subst_count
[z
] > 0) {
178 table
->subst_table
[z
] = (void *)TMP
;
179 TMP
+= info
->subst_count
[z
] * sizeof (collate_subst_t
);
181 table
->subst_table
[z
] = NULL
;
186 table
->chain_pri_table
= (void *)TMP
;
187 TMP
+= chains
* sizeof (collate_chain_t
);
189 table
->chain_pri_table
= NULL
;
190 if (info
->large_count
> 0)
191 table
->large_pri_table
= (void *)TMP
;
193 table
->large_pri_table
= NULL
;
195 table
->__collate_load_error
= 0;
196 return (_LDP_LOADED
);
199 static const int32_t *
200 substsearch(struct xlocale_collate
*table
, const wchar_t key
, int pass
)
202 const collate_subst_t
*p
;
203 int n
= table
->info
->subst_count
[pass
];
208 if (pass
>= table
->info
->directive_count
)
211 if (!(key
& COLLATE_SUBST_PRIORITY
))
214 p
= table
->subst_table
[pass
] + (key
& ~COLLATE_SUBST_PRIORITY
);
215 assert(p
->key
== key
);
219 static collate_chain_t
*
220 chainsearch(struct xlocale_collate
*table
, const wchar_t *key
, int *len
)
223 int high
= table
->info
->chain_count
- 1;;
226 collate_chain_t
*tab
= table
->chain_pri_table
;
231 while (low
<= high
) {
232 next
= (low
+ high
) / 2;
234 compar
= *key
- *p
->str
;
236 l
= wcsnlen(p
->str
, COLLATE_STR_LEN
);
237 compar
= wcsncmp(key
, p
->str
, l
);
251 static collate_large_t
*
252 largesearch(struct xlocale_collate
*table
, const wchar_t key
)
255 int high
= table
->info
->large_count
- 1;
258 collate_large_t
*tab
= table
->large_pri_table
;
263 while (low
<= high
) {
264 next
= (low
+ high
) / 2;
266 compar
= key
- p
->val
;
278 _collate_lookup(struct xlocale_collate
*table
, const wchar_t *t
, int *len
,
279 int *pri
, int which
, const int **state
)
282 collate_large_t
*match
;
287 * If this is the "last" pass for the UNDEFINED, then
288 * we just return the priority itself.
290 if (which
>= table
->info
->directive_count
) {
298 * If we have remaining substitution data from a previous
299 * call, consume it first.
301 if ((sptr
= *state
) != NULL
) {
304 if ((sptr
== *state
) || (sptr
== NULL
))
312 /* No active substitutions */
316 * Check for composites such as diphthongs that collate as a
317 * single element (aka chains or collating-elements).
319 if (((p2
= chainsearch(table
, t
, &l
)) != NULL
) &&
320 ((p
= p2
->pri
[which
]) >= 0)) {
325 } else if (*t
<= UCHAR_MAX
) {
328 * Character is a small (8-bit) character.
329 * We just look these up directly for speed.
331 *pri
= table
->char_pri_table
[*t
].pri
[which
];
333 } else if ((table
->info
->large_count
> 0) &&
334 ((match
= largesearch(table
, *t
)) != NULL
)) {
337 * Character was found in the extended table.
339 *pri
= match
->pri
.pri
[which
];
343 * Character lacks a specific definition.
345 if (table
->info
->directive
[which
] & DIRECTIVE_UNDEFINED
) {
346 /* Mask off sign bit to prevent ordering confusion. */
347 *pri
= (*t
& COLLATE_MAX_PRIORITY
);
349 *pri
= table
->info
->undef_pri
[which
];
351 /* No substitutions for undefined characters! */
356 * Try substituting (expanding) the character. We are
357 * currently doing this *after* the chain compression. I
358 * think it should not matter, but this way might be slightly
361 * We do this after the priority search, as this will help us
362 * to identify a single key value. In order for this to work,
363 * it's important that the priority assigned to a given element
364 * to be substituted be unique for that level. The localedef
365 * code ensures this for us.
367 if ((sptr
= substsearch(table
, *pri
, which
)) != NULL
) {
368 if ((*pri
= *sptr
) > 0) {
370 *state
= *sptr
? sptr
: NULL
;
377 * This is the meaty part of wcsxfrm & strxfrm. Note that it does
378 * NOT NULL terminate. That is left to the caller.
381 _collate_wxfrm(struct xlocale_collate
*table
, const wchar_t *src
, wchar_t *xf
,
390 const int32_t *state
;
393 int ndir
= table
->info
->directive_count
;
397 for (pass
= 0; pass
<= ndir
; pass
++) {
402 /* insert level separator from the previous pass */
410 /* special pass for undefined */
412 direc
= DIRECTIVE_FORWARD
| DIRECTIVE_UNDEFINED
;
414 direc
= table
->info
->directive
[pass
];
419 if (direc
& DIRECTIVE_BACKWARD
) {
423 if ((tr
= wcsdup(t
)) == NULL
) {
428 fp
= tr
+ wcslen(tr
) - 1;
434 t
= (const wchar_t *)tr
;
437 if (direc
& DIRECTIVE_POSITION
) {
438 while (*t
|| state
) {
439 _collate_lookup(table
, t
, &len
, &pri
, pass
, &state
);
447 pri
= COLLATE_MAX_PRIORITY
;
457 while (*t
|| state
) {
458 _collate_lookup(table
, t
, &len
, &pri
, pass
, &state
);
482 return ((size_t)(-1));
486 * In the non-POSIX case, we transform each character into a string of
487 * characters representing the character's priority. Since char is usually
488 * signed, we are limited by 7 bits per byte. To avoid zero, we need to add
489 * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6
492 * It turns out that we sometimes have real priorities that are
493 * 31-bits wide. (But: be careful using priorities where the high
494 * order bit is set -- i.e. the priority is negative. The sort order
495 * may be surprising!)
497 * TODO: This would be a good area to optimize somewhat. It turns out
498 * that real priorities *except for the last UNDEFINED pass* are generally
499 * very small. We need the localedef code to precalculate the max
500 * priority for us, and ideally also give us a mask, and then we could
501 * severely limit what we expand to.
504 #define XFRM_OFFSET ('0') /* make all printable characters */
506 #define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
507 #define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
510 xfrm(struct xlocale_collate
*table
, unsigned char *p
, int pri
, int pass
)
512 /* we use unsigned to ensure zero fill on right shift */
513 uint32_t val
= (uint32_t)table
->info
->pri_count
[pass
];
517 *p
= (pri
& XFRM_MASK
) + XFRM_OFFSET
;
527 _collate_sxfrm(struct xlocale_collate
*table
, const wchar_t *src
, char *xf
,
536 const int32_t *state
;
540 uint8_t buf
[XFRM_BYTES
];
541 int ndir
= table
->info
->directive_count
;
545 for (pass
= 0; pass
<= ndir
; pass
++) {
550 /* insert level separator from the previous pass */
558 /* special pass for undefined */
560 direc
= DIRECTIVE_FORWARD
| DIRECTIVE_UNDEFINED
;
562 direc
= table
->info
->directive
[pass
];
567 if (direc
& DIRECTIVE_BACKWARD
) {
571 if ((tr
= wcsdup(t
)) == NULL
) {
576 fp
= tr
+ wcslen(tr
) - 1;
582 t
= (const wchar_t *)tr
;
585 if (direc
& DIRECTIVE_POSITION
) {
586 while (*t
|| state
) {
588 _collate_lookup(table
, t
, &len
, &pri
, pass
, &state
);
596 pri
= COLLATE_MAX_PRIORITY
;
599 b
= xfrm(table
, buf
, pri
, pass
);
613 while (*t
|| state
) {
614 _collate_lookup(table
, t
, &len
, &pri
, pass
, &state
);
625 b
= xfrm(table
, buf
, pri
, pass
);
646 return ((size_t)(-1));
650 * __collate_equiv_value returns the primary collation value for the given
651 * collating symbol specified by str and len. Zero or negative is returned
652 * if the collating symbol was not found. This function is used by bracket
653 * code in the TRE regex library.
656 __collate_equiv_value(locale_t locale
, const wchar_t *str
, size_t len
)
660 if (len
< 1 || len
>= COLLATE_STR_LEN
)
664 struct xlocale_collate
*table
=
665 (struct xlocale_collate
*)locale
->components
[XLC_COLLATE
];
667 if (table
->__collate_load_error
)
668 return ((len
== 1 && *str
<= UCHAR_MAX
) ? *str
: -1);
672 if (*str
<= UCHAR_MAX
)
673 e
= table
->char_pri_table
[*str
].pri
[0];
674 else if (table
->info
->large_count
> 0) {
675 collate_large_t
*match_large
;
676 match_large
= largesearch(table
, *str
);
678 e
= match_large
->pri
.pri
[0];
682 return (e
> 0 ? e
: 0);
684 if (table
->info
->chain_count
> 0) {
685 wchar_t name
[COLLATE_STR_LEN
];
686 collate_chain_t
*match_chain
;
689 wcsncpy (name
, str
, len
);
691 match_chain
= chainsearch(table
, name
, &clen
);
693 e
= match_chain
->pri
[0];
696 return (e
< 0 ? -e
: e
);