2 * Copyright (C) 2002 Manuel Novoa III
3 * Copyright (C) 2000-2005 Erik Andersen <andersen@uclibc.org>
5 * Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball.
9 * Initial test implementation of strcoll, strxfrm, wcscoll, and wcsxfrm.
10 * The code needs to be cleaned up a good bit, but I'd like to see people
22 /* Experimentally off - libc_hidden_proto(memset) */
23 /* Experimentally off - libc_hidden_proto(memcpy) */
24 /* Experimentally off - libc_hidden_proto(strlcpy) */
25 /* Experimentally off - libc_hidden_proto(strcmp) */
27 /* libc_hidden_proto(wcsxfrm) */
28 /* libc_hidden_proto(wcscmp) */
31 #ifdef __UCLIBC_HAS_LOCALE__
32 #if defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l)
36 #error WANT_WIDE should be defined for L_strxfrm
39 #error L_wcsxfrm already defined for L_strxfrm
41 #endif /* L_strxfrm */
43 #if defined(L_strxfrm) || defined(L_strxfrm_l)
45 #define wcscoll strcoll
46 #define wcscoll_l strcoll_l
47 #define wcsxfrm strxfrm
48 #define wcsxfrm_l strxfrm_l
58 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) */
60 #if defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE)
62 /* libc_hidden_proto(wcscoll_l) */
64 /* libc_hidden_proto(wcscoll) */
65 int wcscoll (const Wchar
*s0
, const Wchar
*s1
)
67 return wcscoll_l(s0
, s1
, __UCLIBC_CURLOCALE
);
69 libc_hidden_def(wcscoll
)
71 /* libc_hidden_proto(wcsxfrm_l) */
73 /* libc_hidden_proto(wcsxfrm) */
74 size_t wcsxfrm(Wchar
*__restrict ws1
, const Wchar
*__restrict ws2
, size_t n
)
76 return wcsxfrm_l(ws1
, ws2
, n
, __UCLIBC_CURLOCALE
);
78 libc_hidden_def(wcsxfrm
)
80 #else /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
84 #define CUR_COLLATE (&__UCLIBC_CURLOCALE->collate)
86 #define CUR_COLLATE (& __LOCALE_PTR->collate)
93 const Wchar
*eob
; /* end of backward */
96 __uwchar_t ui_weight
; /* undefined or invalid */
101 /* should be wchar_t. if wchar < 0 do EILSEQ? */
103 __uwchar_t ci_pending
[MAX_PENDING
]; /* nul-terminated */
106 char *bbe
; /* end of back_buf (actual last... not 1 past end) */
107 char *bp
; /* ptr into backbuf, NULL if not in backward mode */
115 #define WEIGHT_MASK 0x3fffU
116 #define RULE_MASK 0xc000U
118 #define RULE_FORWARD (1 << 14)
119 #define RULE_POSITION (1 << 15)
121 #define UI_IDX (WEIGHT_MASK-6)
122 #define POSIT_IDX (WEIGHT_MASK-5)
123 #define RANGE_IDX (WEIGHT_MASK-4)
124 #define UNDEF_IDX (WEIGHT_MASK-3)
125 #define INVAL_IDX (WEIGHT_MASK-2)
126 #define DITTO_IDX (WEIGHT_MASK-1)
131 #define TRACE(X) printf X
133 #define TRACE(X) ((void)0)
136 static int lookup(wchar_t wc __LOCALE_PARAM
)
138 unsigned int sc
, n
, i0
, i1
;
140 if (((__uwchar_t
) wc
) > 0xffffU
) {
144 sc
= wc
& CUR_COLLATE
->ti_mask
;
145 wc
>>= CUR_COLLATE
->ti_shift
;
146 n
= wc
& CUR_COLLATE
->ii_mask
;
147 wc
>>= CUR_COLLATE
->ii_shift
;
149 i0
= CUR_COLLATE
->wcs2colidt_tbl
[wc
];
150 i0
<<= CUR_COLLATE
->ii_shift
;
151 i1
= CUR_COLLATE
->wcs2colidt_tbl
[CUR_COLLATE
->ii_len
+ i0
+ n
];
152 i1
<<= CUR_COLLATE
->ti_shift
;
153 return CUR_COLLATE
->wcs2colidt_tbl
[CUR_COLLATE
->ii_len
+ CUR_COLLATE
->ti_len
+ i1
+ sc
];
157 static void init_col_state(col_state_t
*cs
, const Wchar
*wcs
)
159 memset(cs
, 0, sizeof(col_state_t
));
161 cs
->bp
= cs
->back_buf
= cs
->ibb
;
163 cs
->bbe
= cs
->back_buf
+ (cs
->bb_size
-1);
166 static void next_weight(col_state_t
*cs
, int pass __LOCALE_PARAM
)
168 int r
, w
, ru
, ri
, popping_backup_stack
;
174 #else /* WANT_WIDE */
179 #endif /* WANT_WIDE */
185 TRACE(("ru_pushed = %d\n", ru
));
190 #ifdef __UCLIBC_MJN3_ONLY__
191 #warning should we walk pendings backwards?
193 if (cs
->cip
) { /* possible pending weight */
194 if ((r
= *(cs
->cip
++)) == 0) {
198 cs
->weightidx
= r
& WEIGHT_MASK
;
199 assert(cs
->weightidx
);
200 /* assert(cs->weightidx != WEIGHT_MASK); */
201 } else { /* get the next collation item from the string */
202 TRACE(("clearing popping flag\n"));
203 popping_backup_stack
= 0;
206 /* keep first pos as 0 for a sentinal */
207 if (*cs
->bp
) { /* pending backward chars */
209 popping_backup_stack
= 1;
210 TRACE(("setting popping flag\n"));
212 if (*cs
->bp
> 0) { /* singles pending */
214 if ((*cs
->bp
-= 1) == 0) {
217 } else { /* last was a multi */
221 } else if (!*cs
->s
) { /* not in backward mode and end of string */
231 cs
->colitem
= r
= lookup(*cs
->s __LOCALE_ARG
);
232 #else /* WANT_WIDE */
233 n
= n0
= __locale_mbrtowc_l(&WC
, cs
->s
, __LOCALE_PTR
);
239 cs
->colitem
= r
= lookup(WC __LOCALE_ARG
);
240 #endif /* WANT_WIDE */
242 TRACE((" r=%d WC=%#lx\n", r
, (unsigned long)(WC
)));
244 if (r
> CUR_COLLATE
->max_col_index
) { /* starting char for one or more sequences */
245 p
= CUR_COLLATE
->multistart_tbl
;
246 p
+= p
[r
-CUR_COLLATE
->max_col_index
-1];
251 if (!*p
) { /* found it */
253 TRACE((" found multi %d\n", n
));
257 /* the lookup check here is safe since we're assured that *p is a valid colidx */
258 if (!cs
->s
[n
] || (lookup(cs
->s
[n
] __LOCALE_ARG
) != *p
)) {
264 #else /* WANT_WIDE */
266 nx
= __locale_mbrtowc_l(&WC
, cs
->s
+ n
, __LOCALE_PTR
);
273 if (!cs
->s
[n
] || (lookup(WC __LOCALE_ARG
) != *p
)) {
278 n
+= nx
; /* Only gets here if cs->s[n] != 0, so nx is set. */
279 #endif /* WANT_WIDE */
282 } else if (r
== 0) { /* illegal, undefined, or part of a range */
283 if ((CUR_COLLATE
->range_count
)
284 #ifdef __UCLIBC_MJN3_ONLY__
285 #warning .. need to introduce range as a collating item?
287 && (((__uwchar_t
)(WC
- CUR_COLLATE
->range_low
)) <= CUR_COLLATE
->range_count
)
288 ) { /* part of a range */
289 /* Note: cs->colitem = 0 already. */
290 TRACE((" found range\n"));
291 ru
= CUR_COLLATE
->ruletable
[CUR_COLLATE
->range_rule_offset
*CUR_COLLATE
->MAX_WEIGHTS
+ pass
];
292 assert((ru
& WEIGHT_MASK
) != DITTO_IDX
);
293 if ((ru
& WEIGHT_MASK
) == WEIGHT_MASK
) {
294 ru
= (ru
& RULE_MASK
) | RANGE_IDX
;
295 cs
->weight
= CUR_COLLATE
->range_base_weight
+ (WC
- CUR_COLLATE
->range_low
);
298 } else if (((__uwchar_t
)(WC
)) <= 0x7fffffffUL
) { /* legal but undefined */
300 /* Note: cs->colitem = 0 already. */
301 ri
= CUR_COLLATE
->undefined_idx
;
302 assert(ri
!= 0); /* implicit undefined isn't supported */
304 TRACE((" found explicit UNDEFINED\n"));
305 #ifdef __UCLIBC_MJN3_ONLY__
306 #warning right now single weight locales do not support ..
308 if (CUR_COLLATE
->num_weights
== 1) {
309 TRACE((" single weight UNDEFINED\n"));
310 cs
->weightidx
= RANGE_IDX
;
316 ri
= CUR_COLLATE
->index2ruleidx
[ri
- 1];
317 ru
= CUR_COLLATE
->ruletable
[ri
* CUR_COLLATE
->MAX_WEIGHTS
+ pass
];
318 assert((ru
& WEIGHT_MASK
) != WEIGHT_MASK
); /* TODO: handle ".." */
319 if ((ru
& WEIGHT_MASK
) == DITTO_IDX
) {
320 cs
->colitem
= CUR_COLLATE
->undefined_idx
;
323 } else { /* illegal */
324 TRACE((" found illegal\n"));
326 /* We put all illegals in the same equiv class with maximal weight,
327 * and ignore them after the first pass. */
332 ru
= (RULE_FORWARD
| RANGE_IDX
);
333 cs
->weight
= 0xffffU
;
336 } else if (CUR_COLLATE
->num_weights
== 1) {
337 TRACE((" single weight\n"));
338 cs
->weightidx
= RANGE_IDX
;
339 cs
->weight
= cs
->colitem
;
343 TRACE((" normal\n"));
346 /* if we get here, it is a normal char either singlely weighted, undefined, or in a range */
348 ri
= CUR_COLLATE
->index2ruleidx
[cs
->colitem
- 1];
349 TRACE((" ri=%d ", ri
));
350 #ifdef __UCLIBC_MJN3_ONLY__
351 #warning make sure this is correct
354 TRACE(("NOT IN THIS LOCALE\n"));
357 ru
= CUR_COLLATE
->ruletable
[ri
* CUR_COLLATE
->MAX_WEIGHTS
+ pass
];
361 #ifdef __UCLIBC_MJN3_ONLY__
362 #warning ignoreables probably should not interrupt backwards processing, but this is wrong
364 /* if (!(ru & WEIGHT_MASK)) { */
365 /* TRACE(("IGNORE\n")); */
371 TRACE((" rule = %#x weight = %#x popping = %d s = %p eob = %p\n",
372 ru
& RULE_MASK
, ru
& WEIGHT_MASK
, popping_backup_stack
,
374 /* now we need to check if we're going backwards... */
376 if (!popping_backup_stack
) {
377 if (!(ru
& RULE_MASK
)) { /* backward */
378 TRACE(("backwards\n"));
379 assert(cs
->bp
<= cs
->bbe
);
380 if (cs
->bp
== cs
->bbe
) {
381 if (cs
->back_buf
== cs
->ibb
) { /* was using internal buffer */
382 cs
->bp
= malloc(cs
->bb_size
+ 128);
385 #ifdef __UCLIBC_MJN3_ONLY__
386 #warning what to do here?
391 memcpy(cs
->bp
, cs
->back_buf
, cs
->bb_size
);
394 cs
->bp
= realloc(cs
->back_buf
, cs
->bb_size
+ 128);
397 #ifdef __UCLIBC_MJN3_ONLY__
398 #warning what to do here?
405 cs
->bbe
= cs
->bp
+ (cs
->bbe
- cs
->back_buf
);
406 cs
->back_buf
= cs
->bp
;
410 if (n
==1) { /* single char */
411 if (*cs
->bp
&& (((unsigned char)(*cs
->bp
)) < CHAR_MAX
)) {
412 *cs
->bp
+= 1; /* increment last single's count */
413 } else { /* last was a multi, or just starting */
415 cs
->bp
= cs
->back_buf
;
417 assert(cs
->bp
< cs
->bbe
);
422 } else { /* multichar */
424 assert(cs
->bp
< cs
->bbe
);
431 /* end-of-string so start popping */
433 TRACE(("popping\n"));
435 } else if (*cs
->bp
) { /* was going backward but this element isn't */
436 /* discard current and use previous backward element */
439 TRACE(("popping\n"));
441 } else { /* was and still going forward */
442 TRACE(("forwards\n"));
443 if ((ru
& (RULE_POSITION
|WEIGHT_MASK
)) > RULE_POSITION
) {
444 assert(ru
& WEIGHT_MASK
);
446 cs
->weight
= cs
->position
;
447 #ifdef __UCLIBC_MJN3_ONLY__
450 cs
->position
= 0; /* reset to reduce size for strcoll? */
452 cs
->weightidx
= RANGE_IDX
;
456 } else { /* popping backwards stack */
457 TRACE(("popping (continued)\n"));
466 cs
->weightidx
= ru
& WEIGHT_MASK
;
467 cs
->rule
= ru
& RULE_MASK
;
470 #ifdef __UCLIBC_MJN3_ONLY__
471 #warning for pending we only want the weight... _not_ the rule
473 if (!cs
->weightidx
) { /* ignore */
478 assert(cs
->weightidx
);
481 if (((unsigned int)(cs
->weightidx
- UI_IDX
)) <= (INVAL_IDX
-UI_IDX
)) {
482 if (cs
->weightidx
== UI_IDX
) {
483 cs
->weight
= cs
->ui_weight
;
488 assert(cs
->weightidx
!= WEIGHT_MASK
);
489 if (cs
->weightidx
== DITTO_IDX
) { /* want the weight of the current collating item */
490 TRACE(("doing ditto\n"));
491 w
= CUR_COLLATE
->index2weight
[cs
->colitem
-1];
492 } else if (cs
->weightidx
<= CUR_COLLATE
->max_col_index
) { /* normal */
493 TRACE(("doing normal\n"));
494 w
= CUR_COLLATE
->index2weight
[cs
->weightidx
-1];
495 } else { /* a string */
496 TRACE(("doing string\n"));
497 assert(!(cs
->weightidx
& RULE_MASK
));
498 /* note: iso14561 allows null string here */
499 p
= CUR_COLLATE
->weightstr
+ (cs
->weightidx
- (CUR_COLLATE
->max_col_index
+ 2));
500 if (*p
& WEIGHT_MASK
) {
503 assert(r
< MAX_PENDING
);
504 cs
->ci_pending
[r
++] = *p
++;
505 } while (*p
& WEIGHT_MASK
);
506 cs
->cip
= cs
->ci_pending
;
516 libc_hidden_proto(__XL_NPP(wcscoll
))
517 int __XL_NPP(wcscoll
) (const Wchar
*s0
, const Wchar
*s1 __LOCALE_PARAM
)
522 if (!CUR_COLLATE
->num_weights
) { /* C locale */
524 return wcscmp(s0
, s1
);
525 #else /* WANT_WIDE */
526 return strcmp(s0
, s1
);
527 #endif /* WANT_WIDE */
531 do { /* loop through the weights levels */
532 init_col_state(ws
, s0
);
533 init_col_state(ws
+1, s1
);
534 do { /* loop through the strings */
535 /* for each string, get the next weight */
536 next_weight(ws
, pass __LOCALE_ARG
);
537 next_weight(ws
+1, pass __LOCALE_ARG
);
538 TRACE(("w0=%lu w1=%lu\n",
539 (unsigned long) ws
[0].weight
,
540 (unsigned long) ws
[1].weight
));
542 if (ws
[0].weight
!= ws
[1].weight
) {
543 return ws
[0].weight
- ws
[1].weight
;
545 } while (ws
[0].weight
);
546 } while (++pass
< CUR_COLLATE
->num_weights
);
550 libc_hidden_def(__XL_NPP(wcscoll
))
554 extern size_t __wcslcpy(wchar_t *__restrict dst
,
555 const wchar_t *__restrict src
, size_t n
);
557 libc_hidden_proto(__XL_NPP(wcsxfrm
))
558 size_t __XL_NPP(wcsxfrm
)(wchar_t *__restrict ws1
, const wchar_t *__restrict ws2
,
559 size_t n __LOCALE_PARAM
)
565 if (!CUR_COLLATE
->num_weights
) { /* C locale */
566 return __wcslcpy(ws1
, ws2
, n
);
569 #ifdef __UCLIBC_MJN3_ONLY__
570 #warning handle empty string as a special case
574 do { /* loop through the weights levels */
575 init_col_state(&cs
, ws2
);
576 do { /* loop through the string */
577 next_weight(&cs
, pass __LOCALE_ARG
);
578 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs
.weight
, (unsigned long) cs
.weight
));
580 ws1
[count
] = cs
.weight
+1;
583 TRACE(("--------------------------------------------\n"));
585 if (count
<= n
) { /* overwrite the trailing 0 end-of-pass marker */
588 TRACE(("-------------------- pass %d --------------------\n", pass
));
589 } while (++pass
< CUR_COLLATE
->num_weights
);
590 if (count
<= n
) { /* oops... change it back */
595 libc_hidden_def(__XL_NPP(wcsxfrm
))
597 #else /* WANT_WIDE */
599 static const unsigned long bound
[] = {
607 static unsigned char first
[] = {
608 0x0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
611 /* Use an extension of UTF-8 to store a 32 bit val in max 6 bytes. */
613 static size_t store(unsigned char *s
, size_t count
, size_t n
, __uwchar_t weight
)
619 if (weight
< bound
[i
]) {
622 } while (++i
< sizeof(bound
)/sizeof(bound
[0]));
629 s
[i
] = 0x80 | (weight
& 0x3f);
639 libc_hidden_proto(__XL_NPP(strxfrm
))
640 size_t __XL_NPP(strxfrm
)(char *__restrict ws1
, const char *__restrict ws2
, size_t n
647 if (!CUR_COLLATE
->num_weights
) { /* C locale */
648 return strlcpy(ws1
, ws2
, n
);
651 #ifdef __UCLIBC_MJN3_ONLY__
652 #warning handle empty string as a special case
655 inc
= count
= pass
= 0;
656 do { /* loop through the weights levels */
657 init_col_state(&cs
, ws2
);
658 do { /* loop through the string */
659 next_weight(&cs
, pass __LOCALE_ARG
);
660 TRACE(("weight=%lu (%#lx)\n", (unsigned long) cs
.weight
, (unsigned long) cs
.weight
));
661 inc
= store((unsigned char *)ws1
, count
, n
, cs
.weight
+ 1);
663 TRACE(("--------------------------------------------\n"));
665 /* overwrite the trailing 0 end-of-pass marker */
670 TRACE(("-------------------- pass %d --------------------\n", pass
));
671 } while (++pass
< CUR_COLLATE
->num_weights
);
672 if (count
<= n
) { /* oops... change it back */
677 libc_hidden_def(__XL_NPP(strxfrm
))
679 #endif /* WANT_WIDE */
681 #endif /* defined(__UCLIBC_HAS_XLOCALE__) && !defined(__UCLIBC_DO_XLOCALE) */
683 #endif /* defined(L_strxfrm) || defined(L_strxfrm_l) || defined(L_wcsxfrm) || defined(L_wcsxfrm_l) */
685 #endif /* __UCLIBC_HAS_LOCALE__ */
686 /**********************************************************************/