*: remove __UCLIBC_CURLOCALE_DATA, __UCLIBC_CURLOCALE_DATA.x
[uclibc-ng.git] / libc / string / _collate.c
blob9cc9ed0554c1fc2788b0fe1859a4845ec25de85c
1 /*
2 * Copyright (C) 2002 Manuel Novoa III
3 * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
5 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
6 */
8 /* Dec 20, 2002
9 * Initial test implementation of strcoll, strxfrm, wcscoll, and wcsxfrm.
10 * The code needs to be cleaned up a good bit, but I'd like to see people
11 * test it out.
15 #include "_string.h"
16 #include <ctype.h>
17 #include <locale.h>
18 #include <stdlib.h>
19 #include <errno.h>
20 #include <assert.h>
22 /* Experimentally off - libc_hidden_proto(memset) */
23 /* Experimentally off - libc_hidden_proto(memcpy) */
24 /* Experimentally off - libc_hidden_proto(strlcpy) */
25 /* Experimentally off - libc_hidden_proto(strcmp) */
26 #ifdef WANT_WIDE
27 /* libc_hidden_proto(wcsxfrm) */
28 /* libc_hidden_proto(wcscmp) */
29 #endif
31 #ifdef __UCLIBC_HAS_LOCALE__
32 #if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l)
34 #ifdef L_strxfrm
35 #ifndef WANT_WIDE
36 #error WANT_WIDE should be defined for L_strxfrm
37 #endif
38 #ifdef L_wcsxfrm
39 #error L_wcsxfrm already defined for L_strxfrm
40 #endif
41 #endif /* L_strxfrm */
43 #if defined(L_strxfrm) || defined(L_strxfrm_l)
45 #define wcscoll strcoll
46 #define wcscoll_l strcoll_l
47 #define wcsxfrm strxfrm
48 #define wcsxfrm_l strxfrm_l
50 #undef WANT_WIDE
51 #undef Wvoid
52 #undef Wchar
53 #undef Wuchar
54 #undef Wint
56 #define Wchar char
58 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) */
60 #if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE)
62 /* libc_hidden_proto(wcscoll_l) */
64 /* libc_hidden_proto(wcscoll) */
65 int wcscoll (const Wchar *s0, const Wchar *s1)
67 return wcscoll_l(s0, s1, __UCLIBC_CURLOCALE );
69 libc_hidden_def(wcscoll)
71 /* libc_hidden_proto(wcsxfrm_l) */
73 /* libc_hidden_proto(wcsxfrm) */
74 size_t wcsxfrm(Wchar *__restrict ws1, const Wchar *__restrict ws2, size_t n)
76 return wcsxfrm_l(ws1, ws2, n, __UCLIBC_CURLOCALE );
78 libc_hidden_def(wcsxfrm)
80 #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
83 #if 0
84 #define CUR_COLLATE (&__UCLIBC_CURLOCALE->collate)
85 #else
86 #define CUR_COLLATE (& __LOCALE_PTR->collate)
87 #endif
89 #define MAX_PENDING 8
91 typedef struct {
92 const Wchar *s;
93 const Wchar *eob; /* end of backward */
95 __uwchar_t weight;
96 __uwchar_t ui_weight; /* undefined or invalid */
97 int colitem;
98 int weightidx;
99 int rule;
100 size_t position;
101 /* should be wchar_t. if wchar < 0 do EILSEQ? */
102 __uwchar_t *cip;
103 __uwchar_t ci_pending[MAX_PENDING]; /* nul-terminated */
105 char *back_buf;
106 char *bbe; /* end of back_buf (actual last... not 1 past end) */
107 char *bp; /* ptr into backbuf, NULL if not in backward mode */
108 char ibb[128];
109 size_t bb_size;
111 int ru_pushed;
112 } col_state_t;
115 #define WEIGHT_MASK 0x3fffU
116 #define RULE_MASK 0xc000U
118 #define RULE_FORWARD (1 << 14)
119 #define RULE_POSITION (1 << 15)
121 #define UI_IDX (WEIGHT_MASK-6)
122 #define POSIT_IDX (WEIGHT_MASK-5)
123 #define RANGE_IDX (WEIGHT_MASK-4)
124 #define UNDEF_IDX (WEIGHT_MASK-3)
125 #define INVAL_IDX (WEIGHT_MASK-2)
126 #define DITTO_IDX (WEIGHT_MASK-1)
129 #undef TRACE
130 #if 0
131 #define TRACE(X) printf X
132 #else
133 #define TRACE(X) ((void)0)
134 #endif
136 static int lookup(wchar_t wc __LOCALE_PARAM )
138 unsigned int sc, n, i0, i1;
140 if (((__uwchar_t) wc) > 0xffffU) {
141 return 0;
144 sc = wc & CUR_COLLATE->ti_mask;
145 wc >>= CUR_COLLATE->ti_shift;
146 n = wc & CUR_COLLATE->ii_mask;
147 wc >>= CUR_COLLATE->ii_shift;
149 i0 = CUR_COLLATE->wcs2colidt_tbl[wc];
150 i0 <<= CUR_COLLATE->ii_shift;
151 i1 = CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + i0 + n];
152 i1 <<= CUR_COLLATE->ti_shift;
153 return CUR_COLLATE->wcs2colidt_tbl[CUR_COLLATE->ii_len + CUR_COLLATE->ti_len + i1 + sc];
157 static void init_col_state(col_state_t *cs, const Wchar *wcs)
159 memset(cs, 0, sizeof(col_state_t));
160 cs->s = wcs;
161 cs->bp = cs->back_buf = cs->ibb;
162 cs->bb_size = 128;
163 cs->bbe = cs->back_buf + (cs->bb_size -1);
166 static void next_weight(col_state_t *cs, int pass __LOCALE_PARAM )
168 int r, w, ru, ri, popping_backup_stack;
169 ssize_t n;
170 const uint16_t *p;
171 #ifdef WANT_WIDE
172 #define WC (*cs->s)
173 #define N (1)
174 #else /* WANT_WIDE */
175 wchar_t WC;
176 size_t n0, nx;
177 #define N n0
179 #endif /* WANT_WIDE */
181 do {
183 if (cs->ru_pushed) {
184 ru = cs->ru_pushed;
185 TRACE(("ru_pushed = %d\n", ru));
186 cs->ru_pushed = 0;
187 goto POSITION_SKIP;
190 #ifdef __UCLIBC_MJN3_ONLY__
191 #warning should we walk pendings backwards?
192 #endif
193 if (cs->cip) { /* possible pending weight */
194 if ((r = *(cs->cip++)) == 0) {
195 cs->cip = NULL;
196 continue;
198 cs->weightidx = r & WEIGHT_MASK;
199 assert(cs->weightidx);
200 /* assert(cs->weightidx != WEIGHT_MASK); */
201 } else { /* get the next collation item from the string */
202 TRACE(("clearing popping flag\n"));
203 popping_backup_stack = 0;
205 IGNORE_LOOP:
206 /* keep first pos as 0 for a sentinal */
207 if (*cs->bp) { /* pending backward chars */
208 POP_BACKUP:
209 popping_backup_stack = 1;
210 TRACE(("setting popping flag\n"));
211 n = 0;
212 if (*cs->bp > 0) { /* singles pending */
213 cs->s -= 1;
214 if ((*cs->bp -= 1) == 0) {
215 cs->bp -= 1;
217 } else { /* last was a multi */
218 cs->s += *cs->bp;
219 cs->bp -= 1;
221 } else if (!*cs->s) { /* not in backward mode and end of string */
222 cs->weight = 0;
223 return;
224 } else {
225 cs->position += 1;
228 BACK_LOOP:
229 #ifdef WANT_WIDE
230 n = 1;
231 cs->colitem = r = lookup(*cs->s __LOCALE_ARG );
232 #else /* WANT_WIDE */
233 n = n0 = __locale_mbrtowc_l(&WC, cs->s, __LOCALE_PTR);
234 if (n < 0) {
235 __set_errno(EILSEQ);
236 cs->weight = 0;
237 return;
239 cs->colitem = r = lookup(WC __LOCALE_ARG );
240 #endif /* WANT_WIDE */
242 TRACE((" r=%d WC=%#lx\n", r, (unsigned long)(WC)));
244 if (r > CUR_COLLATE->max_col_index) { /* starting char for one or more sequences */
245 p = CUR_COLLATE->multistart_tbl;
246 p += p[r-CUR_COLLATE->max_col_index -1];
247 do {
248 n = N;
249 r = *p++;
250 do {
251 if (!*p) { /* found it */
252 cs->colitem = r;
253 TRACE((" found multi %d\n", n));
254 goto FOUND;
256 #ifdef WANT_WIDE
257 /* the lookup check here is safe since we're assured that *p is a valid colidx */
258 if (!cs->s[n] || (lookup(cs->s[n] __LOCALE_ARG ) != *p)) {
259 do {} while (*p++);
260 break;
262 ++p;
263 ++n;
264 #else /* WANT_WIDE */
265 if (cs->s[n]) {
266 nx = __locale_mbrtowc_l(&WC, cs->s + n, __LOCALE_PTR);
267 if (nx < 0) {
268 __set_errno(EILSEQ);
269 cs->weight = 0;
270 return;
273 if (!cs->s[n] || (lookup(WC __LOCALE_ARG ) != *p)) {
274 do {} while (*p++);
275 break;
277 ++p;
278 n += nx; /* Only gets here if cs->s[n] != 0, so nx is set. */
279 #endif /* WANT_WIDE */
280 } while (1);
281 } while (1);
282 } else if (r == 0) { /* illegal, undefined, or part of a range */
283 if ((CUR_COLLATE->range_count)
284 #ifdef __UCLIBC_MJN3_ONLY__
285 #warning .. need to introduce range as a collating item?
286 #endif
287 && (((__uwchar_t)(WC - CUR_COLLATE->range_low)) <= CUR_COLLATE->range_count)
288 ) { /* part of a range */
289 /* Note: cs->colitem = 0 already. */
290 TRACE((" found range\n"));
291 ru = CUR_COLLATE->ruletable[CUR_COLLATE->range_rule_offset*CUR_COLLATE->MAX_WEIGHTS + pass];
292 assert((ru & WEIGHT_MASK) != DITTO_IDX);
293 if ((ru & WEIGHT_MASK) == WEIGHT_MASK) {
294 ru = (ru & RULE_MASK) | RANGE_IDX;
295 cs->weight = CUR_COLLATE->range_base_weight + (WC - CUR_COLLATE->range_low);
297 goto RANGE_SKIP_TO;
298 } else if (((__uwchar_t)(WC)) <= 0x7fffffffUL) { /* legal but undefined */
299 UNDEFINED:
300 /* Note: cs->colitem = 0 already. */
301 ri = CUR_COLLATE->undefined_idx;
302 assert(ri != 0); /* implicit undefined isn't supported */
304 TRACE((" found explicit UNDEFINED\n"));
305 #ifdef __UCLIBC_MJN3_ONLY__
306 #warning right now single weight locales do not support ..
307 #endif
308 if (CUR_COLLATE->num_weights == 1) {
309 TRACE((" single weight UNDEFINED\n"));
310 cs->weightidx = RANGE_IDX;
311 cs->weight = ri;
312 cs->s += n;
313 goto PROCESS_WEIGHT;
316 ri = CUR_COLLATE->index2ruleidx[ri - 1];
317 ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];
318 assert((ru & WEIGHT_MASK) != WEIGHT_MASK); /* TODO: handle ".." */
319 if ((ru & WEIGHT_MASK) == DITTO_IDX) {
320 cs->colitem = CUR_COLLATE->undefined_idx;
322 goto RANGE_SKIP_TO;
323 } else { /* illegal */
324 TRACE((" found illegal\n"));
325 __set_errno(EINVAL);
326 /* We put all illegals in the same equiv class with maximal weight,
327 * and ignore them after the first pass. */
328 if (pass > 0) {
329 cs->s += n;
330 goto IGNORE_LOOP;
332 ru = (RULE_FORWARD | RANGE_IDX);
333 cs->weight = 0xffffU;
334 goto RANGE_SKIP_TO;
336 } else if (CUR_COLLATE->num_weights == 1) {
337 TRACE((" single weight\n"));
338 cs->weightidx = RANGE_IDX;
339 cs->weight = cs->colitem;
340 cs->s += n;
341 goto PROCESS_WEIGHT;
342 } else {
343 TRACE((" normal\n"));
346 /* if we get here, it is a normal char either singlely weighted, undefined, or in a range */
347 FOUND:
348 ri = CUR_COLLATE->index2ruleidx[cs->colitem - 1];
349 TRACE((" ri=%d ", ri));
350 #ifdef __UCLIBC_MJN3_ONLY__
351 #warning make sure this is correct
352 #endif
353 if (!ri) {
354 TRACE(("NOT IN THIS LOCALE\n"));
355 goto UNDEFINED;
357 ru = CUR_COLLATE->ruletable[ri * CUR_COLLATE->MAX_WEIGHTS + pass];
359 RANGE_SKIP_TO:
361 #ifdef __UCLIBC_MJN3_ONLY__
362 #warning ignoreables probably should not interrupt backwards processing, but this is wrong
363 #endif
364 /* if (!(ru & WEIGHT_MASK)) { */
365 /* TRACE(("IGNORE\n")); */
366 /* cs->s += n; */
367 /* continue; */
368 /* } */
371 TRACE((" rule = %#x weight = %#x popping = %d s = %p eob = %p\n",
372 ru & RULE_MASK, ru & WEIGHT_MASK, popping_backup_stack,
373 cs->s, cs->eob));
374 /* now we need to check if we're going backwards... */
376 if (!popping_backup_stack) {
377 if (!(ru & RULE_MASK)) { /* backward */
378 TRACE(("backwards\n"));
379 assert(cs->bp <= cs->bbe);
380 if (cs->bp == cs->bbe) {
381 if (cs->back_buf == cs->ibb) { /* was using internal buffer */
382 cs->bp = malloc(cs->bb_size + 128);
383 if (!cs->bp) {
384 __set_errno(ENOMEM);
385 #ifdef __UCLIBC_MJN3_ONLY__
386 #warning what to do here?
387 #endif
388 cs->weight = 0;
389 return;
391 memcpy(cs->bp, cs->back_buf, cs->bb_size);
393 } else {
394 cs->bp = realloc(cs->back_buf, cs->bb_size + 128);
395 if (!cs->bp) {
396 __set_errno(ENOMEM);
397 #ifdef __UCLIBC_MJN3_ONLY__
398 #warning what to do here?
399 #endif
400 cs->weight = 0;
401 return;
404 cs->bb_size += 128;
405 cs->bbe = cs->bp + (cs->bbe - cs->back_buf);
406 cs->back_buf = cs->bp;
407 cs->bp = cs->bbe;
410 if (n==1) { /* single char */
411 if (*cs->bp && (((unsigned char)(*cs->bp)) < CHAR_MAX)) {
412 *cs->bp += 1; /* increment last single's count */
413 } else { /* last was a multi, or just starting */
414 if (!cs->bp) {
415 cs->bp = cs->back_buf;
416 } else {
417 assert(cs->bp < cs->bbe);
418 ++cs->bp;
420 *cs->bp = 1;
422 } else { /* multichar */
423 assert(n>1);
424 assert(cs->bp < cs->bbe);
425 *++cs->bp = -n;
427 cs->s += n;
428 if (*cs->s) {
429 goto BACK_LOOP;
431 /* end-of-string so start popping */
432 cs->eob = cs->s;
433 TRACE(("popping\n"));
434 goto POP_BACKUP;
435 } else if (*cs->bp) { /* was going backward but this element isn't */
436 /* discard current and use previous backward element */
437 assert(!cs->cip);
438 cs->eob = cs->s;
439 TRACE(("popping\n"));
440 goto POP_BACKUP;
441 } else { /* was and still going forward */
442 TRACE(("forwards\n"));
443 if ((ru & (RULE_POSITION|WEIGHT_MASK)) > RULE_POSITION) {
444 assert(ru & WEIGHT_MASK);
445 cs->ru_pushed = ru;
446 cs->weight = cs->position;
447 #ifdef __UCLIBC_MJN3_ONLY__
448 #warning devel code
449 #endif
450 cs->position = 0; /* reset to reduce size for strcoll? */
451 cs->s += n;
452 cs->weightidx = RANGE_IDX;
453 goto PROCESS_WEIGHT;
456 } else { /* popping backwards stack */
457 TRACE(("popping (continued)\n"));
458 if (!*cs->bp) {
459 cs->s = cs->eob;
461 cs->s -= n;
464 cs->s += n;
465 POSITION_SKIP:
466 cs->weightidx = ru & WEIGHT_MASK;
467 cs->rule = ru & RULE_MASK;
470 #ifdef __UCLIBC_MJN3_ONLY__
471 #warning for pending we only want the weight... _not_ the rule
472 #endif
473 if (!cs->weightidx) { /* ignore */
474 continue;
477 PROCESS_WEIGHT:
478 assert(cs->weightidx);
481 if (((unsigned int)(cs->weightidx - UI_IDX)) <= (INVAL_IDX-UI_IDX)) {
482 if (cs->weightidx == UI_IDX) {
483 cs->weight = cs->ui_weight;
485 return;
488 assert(cs->weightidx != WEIGHT_MASK);
489 if (cs->weightidx == DITTO_IDX) { /* want the weight of the current collating item */
490 TRACE(("doing ditto\n"));
491 w = CUR_COLLATE->index2weight[cs->colitem -1];
492 } else if (cs->weightidx <= CUR_COLLATE->max_col_index) { /* normal */
493 TRACE(("doing normal\n"));
494 w = CUR_COLLATE->index2weight[cs->weightidx -1];
495 } else { /* a string */
496 TRACE(("doing string\n"));
497 assert(!(cs->weightidx & RULE_MASK));
498 /* note: iso14561 allows null string here */
499 p = CUR_COLLATE->weightstr + (cs->weightidx - (CUR_COLLATE->max_col_index + 2));
500 if (*p & WEIGHT_MASK) {
501 r = 0;
502 do {
503 assert(r < MAX_PENDING);
504 cs->ci_pending[r++] = *p++;
505 } while (*p & WEIGHT_MASK);
506 cs->cip = cs->ci_pending;
508 continue;
511 cs->weight = w;
512 return;
513 } while (1);
516 libc_hidden_proto(__XL_NPP(wcscoll))
517 int __XL_NPP(wcscoll) (const Wchar *s0, const Wchar *s1 __LOCALE_PARAM )
519 col_state_t ws[2];
520 int pass;
522 if (!CUR_COLLATE->num_weights) { /* C locale */
523 #ifdef WANT_WIDE
524 return wcscmp(s0, s1);
525 #else /* WANT_WIDE */
526 return strcmp(s0, s1);
527 #endif /* WANT_WIDE */
530 pass = 0;
531 do { /* loop through the weights levels */
532 init_col_state(ws, s0);
533 init_col_state(ws+1, s1);
534 do { /* loop through the strings */
535 /* for each string, get the next weight */
536 next_weight(ws, pass __LOCALE_ARG );
537 next_weight(ws+1, pass __LOCALE_ARG );
538 TRACE(("w0=%lu w1=%lu\n",
539 (unsigned long) ws[0].weight,
540 (unsigned long) ws[1].weight));
542 if (ws[0].weight != ws[1].weight) {
543 return ws[0].weight - ws[1].weight;
545 } while (ws[0].weight);
546 } while (++pass < CUR_COLLATE->num_weights);
548 return 0;
550 libc_hidden_def(__XL_NPP(wcscoll))
552 #ifdef WANT_WIDE
554 extern size_t __wcslcpy(wchar_t *__restrict dst,
555 const wchar_t *__restrict src, size_t n);
557 libc_hidden_proto(__XL_NPP(wcsxfrm))
558 size_t __XL_NPP(wcsxfrm)(wchar_t *__restrict ws1, const wchar_t *__restrict ws2,
559 size_t n __LOCALE_PARAM )
561 col_state_t cs;
562 size_t count;
563 int pass;
565 if (!CUR_COLLATE->num_weights) { /* C locale */
566 return __wcslcpy(ws1, ws2, n);
569 #ifdef __UCLIBC_MJN3_ONLY__
570 #warning handle empty string as a special case
571 #endif
573 count = pass = 0;
574 do { /* loop through the weights levels */
575 init_col_state(&cs, ws2);
576 do { /* loop through the string */
577 next_weight(&cs, pass __LOCALE_ARG );
578 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));
579 if (count < n) {
580 ws1[count] = cs.weight +1;
582 ++count;
583 TRACE(("--------------------------------------------\n"));
584 } while (cs.weight);
585 if (count <= n) { /* overwrite the trailing 0 end-of-pass marker */
586 ws1[count-1] = 1;
588 TRACE(("-------------------- pass %d --------------------\n", pass));
589 } while (++pass < CUR_COLLATE->num_weights);
590 if (count <= n) { /* oops... change it back */
591 ws1[count-1] = 0;
593 return count-1;
595 libc_hidden_def(__XL_NPP(wcsxfrm))
597 #else /* WANT_WIDE */
599 static const unsigned long bound[] = {
600 1UL << 7,
601 1UL << 11,
602 1UL << 16,
603 1UL << 21,
604 1UL << 26,
607 static unsigned char first[] = {
608 0x0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
611 /* Use an extension of UTF-8 to store a 32 bit val in max 6 bytes. */
613 static size_t store(unsigned char *s, size_t count, size_t n, __uwchar_t weight)
615 int i, r;
617 i = 0;
618 do {
619 if (weight < bound[i]) {
620 break;
622 } while (++i < sizeof(bound)/sizeof(bound[0]));
624 r = i+1;
625 if (i + count < n) {
626 s += count;
627 s[0] = first[i];
628 while (i) {
629 s[i] = 0x80 | (weight & 0x3f);
630 weight >>= 6;
631 --i;
633 s[0] |= weight;
636 return r;
639 libc_hidden_proto(__XL_NPP(strxfrm))
640 size_t __XL_NPP(strxfrm)(char *__restrict ws1, const char *__restrict ws2, size_t n
641 __LOCALE_PARAM )
643 col_state_t cs;
644 size_t count, inc;
645 int pass;
647 if (!CUR_COLLATE->num_weights) { /* C locale */
648 return strlcpy(ws1, ws2, n);
651 #ifdef __UCLIBC_MJN3_ONLY__
652 #warning handle empty string as a special case
653 #endif
655 inc = count = pass = 0;
656 do { /* loop through the weights levels */
657 init_col_state(&cs, ws2);
658 do { /* loop through the string */
659 next_weight(&cs, pass __LOCALE_ARG );
660 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs.weight, (unsigned long) cs.weight));
661 inc = store((unsigned char *)ws1, count, n, cs.weight + 1);
662 count += inc;
663 TRACE(("--------------------------------------------\n"));
664 } while (cs.weight);
665 /* overwrite the trailing 0 end-of-pass marker */
666 assert(inc == 1);
667 if (count <= n) {
668 ws1[count-1] = 1;
670 TRACE(("-------------------- pass %d --------------------\n", pass));
671 } while (++pass < CUR_COLLATE->num_weights);
672 if (count <= n) { /* oops... change it back */
673 ws1[count-1] = 0;
675 return count-1;
677 libc_hidden_def(__XL_NPP(strxfrm))
679 #endif /* WANT_WIDE */
681 #endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
683 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) */
685 #endif /* __UCLIBC_HAS_LOCALE__ */
686 /**********************************************************************/