libdl: first execute all destructors, then munmap library
[uclibc-ng.git] / libc / string / _collate.c
blobd487c8ba3a76756fb4c82be6da95ab92bcfc90c4
1 /*
2 * Copyright (C) 2002 Manuel Novoa III
3 * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
5 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
6 */
8 /* Dec 20, 2002
9 * Initial test implementation of strcoll, strxfrm, wcscoll, and wcsxfrm.
10 * The code needs to be cleaned up a good bit, but I'd like to see people
11 * test it out.
15 #include "_string.h"
16 #include <ctype.h>
17 #include <locale.h>
18 #include <stdlib.h>
19 #include <errno.h>
20 #include <assert.h>
22 #ifdef __UCLIBC_HAS_LOCALE__
23 #if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l)
25 #ifdef L_strxfrm
26 #ifndef WANT_WIDE
27 #error WANT_WIDE should be defined for L_strxfrm
28 #endif
29 #ifdef L_wcsxfrm
30 #error L_wcsxfrm already defined for L_strxfrm
31 #endif
32 #endif /* L_strxfrm */
34 #if defined(L_strxfrm) || defined(L_strxfrm_l)
36 #define wcscoll strcoll
37 #define wcscoll_l strcoll_l
38 #define wcsxfrm strxfrm
39 #define wcsxfrm_l strxfrm_l
41 #undef WANT_WIDE
42 #undef Wvoid
43 #undef Wchar
44 #undef Wuchar
45 #undef Wint
47 #define Wchar char
49 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) */
51 #if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE)
54 int wcscoll (const Wchar *s0, const Wchar *s1)
56 return wcscoll_l(s0, s1, __UCLIBC_CURLOCALE );
58 libc_hidden_def(wcscoll)
61 size_t wcsxfrm(Wchar *__restrict ws1, const Wchar *__restrict ws2, size_t n)
63 return wcsxfrm_l(ws1, ws2, n, __UCLIBC_CURLOCALE );
66 #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
69 #if 0
70 #define CUR_COLLATE (&__UCLIBC_CURLOCALE->collate)
71 #else
72 #define CUR_COLLATE (& __LOCALE_PTR->collate)
73 #endif
75 #define MAX_PENDING 8
77 typedef struct {
78 const Wchar *s;
79 const Wchar *eob; /* end of backward */
81 __uwchar_t weight;
82 __uwchar_t ui_weight; /* undefined or invalid */
83 int colitem;
84 int weightidx;
85 int rule;
86 size_t position;
87 /* should be wchar_t. if wchar < 0 do EILSEQ? */
88 __uwchar_t *cip;
89 __uwchar_t ci_pending[MAX_PENDING]; /* nul-terminated */
91 char *back_buf;
92 char *bbe; /* end of back_buf (actual last... not 1 past end) */
93 char *bp; /* ptr into backbuf, NULL if not in backward mode */
94 char ibb[128];
95 size_t bb_size;
97 int ru_pushed;
98 } col_state_t;
101 #define WEIGHT_MASK 0x3fffU
102 #define RULE_MASK 0xc000U
104 #define RULE_FORWARD (1 << 14)
105 #define RULE_POSITION (1 << 15)
107 #define UI_IDX (WEIGHT_MASK-6)
108 #define POSIT_IDX (WEIGHT_MASK-5)
109 #define RANGE_IDX (WEIGHT_MASK-4)
110 #define UNDEF_IDX (WEIGHT_MASK-3)
111 #define INVAL_IDX (WEIGHT_MASK-2)
112 #define DITTO_IDX (WEIGHT_MASK-1)
115 #undef TRACE
116 #if 0
117 #define TRACE(X) printf X
118 #else
119 #define TRACE(X) ((void)0)
120 #endif
122 static int lookup(wchar_t wc __LOCALE_PARAM )
124 unsigned int sc, n, i0, i1;
126 if (((__uwchar_t) wc) > 0xffffU) {
127 return 0;
130 sc = wc & CUR_COLLATE->ti_mask;
131 wc >>= CUR_COLLATE->ti_shift;
132 n = wc & CUR_COLLATE->ii_mask;
133 wc >>= CUR_COLLATE->ii_shift;
135 i0 = CUR_COLLATE->wcs2colidt_tbl[wc];
136 i0 <<= CUR_COLLATE->ii_shift;
137 i1 = CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + i0 + n];
138 i1 <<= CUR_COLLATE->ti_shift;
139 return CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + CUR_COLLATE->ti_len + i1 + sc];
143 static void init_col_state(col_state_t *cs, const Wchar *wcs)
145 memset(cs, 0, sizeof(col_state_t));
146 cs->s = wcs;
147 cs->bp = cs->back_buf = cs->ibb;
148 cs->bb_size = 128;
149 cs->bbe = cs->back_buf + (cs->bb_size -1);
152 static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM )
154 int r, w, ru, ri, popping_backup_stack;
155 ssize_t n;
156 const uint16_t *p;
157 #ifdef WANT_WIDE
158 #define WC (*cs->s)
159 #define N (1)
160 #else /* WANT_WIDE */
161 wchar_t WC;
162 size_t n0, nx = 0;
163 #define N n0
165 #endif /* WANT_WIDE */
167 do {
169 if (cs->ru_pushed) {
170 ru = cs->ru_pushed;
171 TRACE(("ru_pushed = %d\n", ru));
172 cs->ru_pushed = 0;
173 goto POSITION_SKIP;
176 if (cs->cip) { /* possible pending weight */
177 if ((r = *(cs->cip++)) == 0) {
178 cs->cip = NULL;
179 continue;
181 cs->weightidx = r & WEIGHT_MASK;
182 assert(cs->weightidx);
183 /* assert(cs->weightidx != WEIGHT_MASK); */
184 } else { /* get the next collation item from the string */
185 TRACE(("clearing popping flag\n"));
186 popping_backup_stack = 0;
188 IGNORE_LOOP:
189 /* keep first pos as 0 for a sentinal */
190 if (*cs->bp) { /* pending backward chars */
191 POP_BACKUP:
192 popping_backup_stack = 1;
193 TRACE(("setting popping flag\n"));
194 n = 0;
195 if (*cs->bp > 0) { /* singles pending */
196 cs->s -= 1;
197 if ((*cs->bp -= 1) == 0) {
198 cs->bp -= 1;
200 } else { /* last was a multi */
201 cs->s += *cs->bp;
202 cs->bp -= 1;
204 } else if (!*cs->s) { /* not in backward mode and end of string */
205 cs->weight = 0;
206 return;
207 } else {
208 cs->position += 1;
211 BACK_LOOP:
212 #ifdef WANT_WIDE
213 n = 1;
214 cs->colitem = r = lookup(*cs->s __LOCALE_ARG );
215 #else /* WANT_WIDE */
216 n = n0 = __locale_mbrtowc_l(&WC, cs->s, __LOCALE_PTR);
217 if (n < 0) {
218 __set_errno(EILSEQ);
219 cs->weight = 0;
220 return;
222 cs->colitem = r = lookup(WC __LOCALE_ARG );
223 #endif /* WANT_WIDE */
225 TRACE((" r=%d WC=%#lx\n", r, (unsigned long)(WC)));
227 if (r > CUR_COLLATE->max_col_index) { /* starting char for one or more sequences */
228 p = CUR_COLLATE->multistart_tbl;
229 p += p[r-CUR_COLLATE->max_col_index -1];
230 do {
231 n = N;
232 r = *p++;
233 do {
234 if (!*p) { /* found it */
235 cs->colitem = r;
236 TRACE((" found multi %d\n", n));
237 goto FOUND;
239 #ifdef WANT_WIDE
240 /* the lookup check here is safe since we're assured that *p is a valid colidx */
241 if (!cs->s[n] || (lookup(cs->s[n] __LOCALE_ARG ) != *p)) {
242 do {} while (*p++);
243 break;
245 ++p;
246 ++n;
247 #else /* WANT_WIDE */
248 if (cs->s[n]) {
249 nx = __locale_mbrtowc_l(&WC, cs->s + n, __LOCALE_PTR);
250 if (nx < 0) {
251 __set_errno(EILSEQ);
252 cs->weight = 0;
253 return;
256 if (!cs->s[n] || (lookup(WC __LOCALE_ARG ) != *p)) {
257 do {} while (*p++);
258 break;
260 ++p;
261 n += nx; /* Only gets here if cs->s[n] != 0, so nx is set. */
262 #endif /* WANT_WIDE */
263 } while (1);
264 } while (1);
265 } else if (r == 0) { /* illegal, undefined, or part of a range */
266 if ((CUR_COLLATE->range_count)
267 && (((__uwchar_t)(WC - CUR_COLLATE->range_low)) <= CUR_COLLATE->range_count)
268 ) { /* part of a range */
269 /* Note: cs->colitem = 0 already. */
270 TRACE((" found range\n"));
271 ru = CUR_COLLATE->ruletable[CUR_COLLATE->range_rule_offset*CUR_COLLATE->MAX_WEIGHTS + pass];
272 assert((ru & WEIGHT_MASK) != DITTO_IDX);
273 if ((ru & WEIGHT_MASK) == WEIGHT_MASK) {
274 ru = (ru & RULE_MASK) | RANGE_IDX;
275 cs->weight = CUR_COLLATE->range_base_weight + (WC - CUR_COLLATE->range_low);
277 goto RANGE_SKIP_TO;
278 } else if (((__uwchar_t)(WC)) <= 0x7fffffffUL) { /* legal but undefined */
279 UNDEFINED:
280 /* Note: cs->colitem = 0 already. */
281 ri = CUR_COLLATE->undefined_idx;
282 assert(ri != 0); /* implicit undefined isn't supported */
284 TRACE((" found explicit UNDEFINED\n"));
285 if (CUR_COLLATE->num_weights == 1) {
286 TRACE((" single weight UNDEFINED\n"));
287 cs->weightidx = RANGE_IDX;
288 cs->weight = ri;
289 cs->s += n;
290 goto PROCESS_WEIGHT;
293 ri = CUR_COLLATE->index2ruleidx[ri - 1];
294 ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];
295 assert((ru & WEIGHT_MASK) != WEIGHT_MASK); /* TODO: handle ".." */
296 if ((ru & WEIGHT_MASK) == DITTO_IDX) {
297 cs->colitem = CUR_COLLATE->undefined_idx;
299 goto RANGE_SKIP_TO;
300 } else { /* illegal */
301 TRACE((" found illegal\n"));
302 __set_errno(EINVAL);
303 /* We put all illegals in the same equiv class with maximal weight,
304 * and ignore them after the first pass. */
305 if (pass > 0) {
306 cs->s += n;
307 goto IGNORE_LOOP;
309 ru = (RULE_FORWARD | RANGE_IDX);
310 cs->weight = 0xffffU;
311 goto RANGE_SKIP_TO;
313 } else if (CUR_COLLATE->num_weights == 1) {
314 TRACE((" single weight\n"));
315 cs->weightidx = RANGE_IDX;
316 cs->weight = cs->colitem;
317 cs->s += n;
318 goto PROCESS_WEIGHT;
319 } else {
320 TRACE((" normal\n"));
323 /* if we get here, it is a normal char either singlely weighted, undefined, or in a range */
324 FOUND:
325 ri = CUR_COLLATE->index2ruleidx[cs->colitem - 1];
326 TRACE((" ri=%d ", ri));
327 if (!ri) {
328 TRACE(("NOT IN THIS LOCALE\n"));
329 goto UNDEFINED;
331 ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];
333 RANGE_SKIP_TO:
335 /* if (!(ru & WEIGHT_MASK)) { */
336 /* TRACE(("IGNORE\n")); */
337 /* cs->s += n; */
338 /* continue; */
339 /* } */
342 TRACE((" rule = %#x weight = %#x popping = %d s = %p eob = %p\n",
343 ru & RULE_MASK, ru & WEIGHT_MASK, popping_backup_stack,
344 cs->s, cs->eob));
345 /* now we need to check if we're going backwards... */
347 if (!popping_backup_stack) {
348 if (!(ru & RULE_MASK)) { /* backward */
349 TRACE(("backwards\n"));
350 assert(cs->bp <= cs->bbe);
351 if (cs->bp == cs->bbe) {
352 if (cs->back_buf == cs->ibb) { /* was using internal buffer */
353 cs->bp = malloc(cs->bb_size + 128);
354 if (!cs->bp) {
355 /* __set_errno(ENOMEM); */
356 cs->weight = 0;
357 return;
359 memcpy(cs->bp, cs->back_buf, cs->bb_size);
361 } else {
362 cs->bp = realloc(cs->back_buf, cs->bb_size + 128);
363 if (!cs->bp) {
364 /* __set_errno(ENOMEM); */
365 cs->weight = 0;
366 return;
369 cs->bb_size += 128;
370 cs->bbe = cs->bp + (cs->bbe - cs->back_buf);
371 cs->back_buf = cs->bp;
372 cs->bp = cs->bbe;
375 if (n==1) { /* single char */
376 if (*cs->bp && (((unsigned char)(*cs->bp)) < CHAR_MAX)) {
377 *cs->bp += 1; /* increment last single's count */
378 } else { /* last was a multi, or just starting */
379 if (!cs->bp) {
380 cs->bp = cs->back_buf;
381 } else {
382 assert(cs->bp < cs->bbe);
383 ++cs->bp;
385 *cs->bp = 1;
387 } else { /* multichar */
388 assert(n>1);
389 assert(cs->bp < cs->bbe);
390 *++cs->bp = -n;
392 cs->s += n;
393 if (*cs->s) {
394 goto BACK_LOOP;
396 /* end-of-string so start popping */
397 cs->eob = cs->s;
398 TRACE(("popping\n"));
399 goto POP_BACKUP;
400 } else if (*cs->bp) { /* was going backward but this element isn't */
401 /* discard current and use previous backward element */
402 assert(!cs->cip);
403 cs->eob = cs->s;
404 TRACE(("popping\n"));
405 goto POP_BACKUP;
406 } else { /* was and still going forward */
407 TRACE(("forwards\n"));
408 if ((ru & (RULE_POSITION|WEIGHT_MASK)) > RULE_POSITION) {
409 assert(ru & WEIGHT_MASK);
410 cs->ru_pushed = ru;
411 cs->weight = cs->position;
412 cs->position = 0; /* reset to reduce size for strcoll? */
413 cs->s += n;
414 cs->weightidx = RANGE_IDX;
415 goto PROCESS_WEIGHT;
418 } else { /* popping backwards stack */
419 TRACE(("popping (continued)\n"));
420 if (!*cs->bp) {
421 cs->s = cs->eob;
423 cs->s -= n;
426 cs->s += n;
427 POSITION_SKIP:
428 cs->weightidx = ru & WEIGHT_MASK;
429 cs->rule = ru & RULE_MASK;
432 if (!cs->weightidx) { /* ignore */
433 continue;
436 PROCESS_WEIGHT:
437 assert(cs->weightidx);
440 if (((unsigned int)(cs->weightidx - UI_IDX)) <= (INVAL_IDX-UI_IDX)) {
441 if (cs->weightidx == UI_IDX) {
442 cs->weight = cs->ui_weight;
444 return;
447 assert(cs->weightidx != WEIGHT_MASK);
448 if (cs->weightidx == DITTO_IDX) { /* want the weight of the current collating item */
449 TRACE(("doing ditto\n"));
450 w = CUR_COLLATE->index2weight[cs->colitem -1];
451 } else if (cs->weightidx <= CUR_COLLATE->max_col_index) { /* normal */
452 TRACE(("doing normal\n"));
453 w = CUR_COLLATE->index2weight[cs->weightidx -1];
454 } else { /* a string */
455 TRACE(("doing string\n"));
456 assert(!(cs->weightidx & RULE_MASK));
457 /* note: iso14561 allows null string here */
458 p = CUR_COLLATE->weightstr + (cs->weightidx - (CUR_COLLATE->max_col_index + 2));
459 if (*p & WEIGHT_MASK) {
460 r = 0;
461 do {
462 assert(r < MAX_PENDING);
463 cs->ci_pending[r++] = *p++;
464 } while (*p & WEIGHT_MASK);
465 cs->cip = cs->ci_pending;
467 continue;
470 cs->weight = w;
471 return;
472 } while (1);
475 int __XL_NPP(wcscoll) (const Wchar *s0, const Wchar *s1 __LOCALE_PARAM )
477 col_state_t ws[2];
478 int pass;
480 if (!CUR_COLLATE->num_weights) { /* C locale */
481 #ifdef WANT_WIDE
482 return wcscmp(s0, s1);
483 #else
484 return strcmp(s0, s1);
485 #endif
488 pass = 0;
489 do { /* loop through the weights levels */
490 init_col_state(ws, s0);
491 init_col_state(ws+1, s1);
492 do { /* loop through the strings */
493 /* for each string, get the next weight */
494 next_weight(ws, pass __LOCALE_ARG );
495 next_weight(ws+1, pass __LOCALE_ARG );
496 TRACE(("w0=%lu w1=%lu\n",
497 (unsigned long) ws[0].weight,
498 (unsigned long) ws[1].weight));
500 if (ws[0].weight != ws[1].weight) {
501 return ws[0].weight - ws[1].weight;
503 } while (ws[0].weight);
504 } while (++pass < CUR_COLLATE->num_weights);
506 return 0;
508 libc_hidden_def(__XL_NPP(wcscoll))
510 #ifdef WANT_WIDE
512 size_t __XL_NPP(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2,
513 size_t n __LOCALE_PARAM )
515 col_state_t cs;
516 size_t count;
517 int pass;
519 if (!CUR_COLLATE->num_weights) { /* C locale */
520 return __wcslcpy(ws1, ws2, n);
523 count = pass = 0;
524 do { /* loop through the weights levels */
525 init_col_state(&cs, ws2);
526 do { /* loop through the string */
527 next_weight(&cs, pass __LOCALE_ARG );
528 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));
529 if (count < n) {
530 ws1[count] = cs.weight +1;
532 ++count;
533 TRACE(("--------------------------------------------\n"));
534 } while (cs.weight);
535 if (count <= n) { /* overwrite the trailing 0 end-of-pass marker */
536 ws1[count-1] = 1;
538 TRACE(("-------------------- pass %d --------------------\n", pass));
539 } while (++pass < CUR_COLLATE->num_weights);
540 if (count <= n) { /* oops... change it back */
541 ws1[count-1] = 0;
543 return count-1;
545 #if defined L_strxfrm_l || defined L_wcsxfrm_l
546 libc_hidden_def(__XL_NPP(wcsxfrm))
547 #endif
549 #else /* WANT_WIDE */
551 static const unsigned long bound[] = {
552 1UL << 7,
553 1UL << 11,
554 1UL << 16,
555 1UL << 21,
556 1UL << 26,
559 static unsigned char first[] = {
560 0x0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
563 /* Use an extension of UTF-8 to store a 32 bit val in max 6 bytes. */
565 static size_t store(unsigned char *s, size_t count, size_t n, __uwchar_t weight)
567 int i, r;
569 i = 0;
570 do {
571 if (weight < bound[i]) {
572 break;
574 } while (++i < sizeof(bound)/sizeof(bound[0]));
576 r = i+1;
577 if (i + count < n) {
578 s += count;
579 s[0] = first[i];
580 while (i) {
581 s[i] = 0x80 | (weight & 0x3f);
582 weight >>= 6;
583 --i;
585 s[0] |= weight;
588 return r;
591 size_t __XL_NPP(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_t n
592 __LOCALE_PARAM )
594 col_state_t cs;
595 size_t count, inc;
596 int pass;
598 if (!CUR_COLLATE->num_weights) { /* C locale */
599 return strlcpy(ws1, ws2, n);
602 inc = count = pass = 0;
603 do { /* loop through the weights levels */
604 init_col_state(&cs, ws2);
605 do { /* loop through the string */
606 next_weight(&cs, pass __LOCALE_ARG );
607 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));
608 inc = store((unsigned char *)ws1, count, n, cs.weight + 1);
609 count += inc;
610 TRACE(("--------------------------------------------\n"));
611 } while (cs.weight);
612 /* overwrite the trailing 0 end-of-pass marker */
613 assert(inc == 1);
614 if (count <= n) {
615 ws1[count-1] = 1;
617 TRACE(("-------------------- pass %d --------------------\n", pass));
618 } while (++pass < CUR_COLLATE->num_weights);
619 if (count <= n) { /* oops... change it back */
620 ws1[count-1] = 0;
622 return count-1;
624 #ifdef L_strxfrm_l
625 libc_hidden_def(__XL_NPP(strxfrm))
626 #endif
628 #endif /* WANT_WIDE */
630 #endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
632 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) */
634 #endif /* __UCLIBC_HAS_LOCALE__ */
635 /**********************************************************************/