/* Len_Map is indexed by a byte value and yields the length that the UTF-8
 * routines in this file associate with that byte when it starts a sequence:
 *
 *    0x00          0
 *    0x01 - 0x7F   1
 *    0x80 - 0xBF   0  (the backward skipper steps over bytes mapped to 0)
 *    0xC0 - 0xDF   2
 *    0xE0 - 0xEF   3
 *    0xF0 - 0xF7   4
 *    0xF8 - 0xFB   5
 *    0xFC - 0xFD   6
 *    0xFE, 0xFF    1
 *
 * The table is only ever read, so it is declared const.
 */
static const unsigned char Len_Map[256] =
{
   0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* - 31 */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* - 63 */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* - 95 */
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* - 127 */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* - 159 */
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* - 191 */
   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* - 223 */
   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1  /* - 255 */
};
20 * Also note that the code positions U+D800 to U+DFFF (UTF-16 surrogates)
21 * as well as U+FFFE and U+FFFF must not occur in normal UTF-8 or UCS-4
22 * data. UTF-8 decoders should treat them like malformed or overlong
23 * sequences for safety reasons.
/* True when w is a code point that must not appear in decoded data: a
 * UTF-16 surrogate (U+D800 - U+DFFF), U+FFFE, or U+FFFF.
 *
 * Every use of the argument is parenthesized so that an expression argument
 * (e.g. `a | b`) is compared as a whole.  The argument is still evaluated
 * more than once, so it must not have side effects.
 */
#define IS_ILLEGAL_UNICODE(w) \
   ((((w) >= 0xD800) && ((w) <= 0xDFFF)) || ((w) == 0xFFFE) || ((w) == 0xFFFF))
29 static int is_invalid_or_overlong_utf8 (SLuchar_Type
*u
, unsigned int len
)
32 unsigned char ch
, ch1
;
34 /* Check for invalid sequences */
35 for (i
= 1; i
< len
; i
++)
37 if ((u
[i
] & 0xC0) != 0x80)
41 /* Illegal (overlong) sequences */
42 /* 1100000x (10xxxxxx) */
43 /* 11100000 100xxxxx (10xxxxxx) */
44 /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */
45 /* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) */
46 /* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) */
48 if ((ch
== 0xC0) || (ch
== 0xC1))
52 if (((ch1
& ch
) == 0x80)
61 /* D800 is encoded as 0xED 0xA0 0x80 and DFFF as 0xED 0xBF 0xBF */
63 && ((ch1
>= 0xA0) && (ch1
<= 0xBF))
64 && (u
[2] >= 0x80) && (u
[2] <= 0xBF))
66 /* Now FFFE and FFFF */
69 && ((u
[2] == 0xBE) || (u
[2] == 0xBF)))
75 /* This function assumes that the necessary checks have been made to ensure
76 * a valid UTF-8 encoded character is present.
79 static SLwchar_Type
fast_utf8_decode (SLuchar_Type
*u
, unsigned int len
)
81 static unsigned char masks
[7] =
83 0, 0, 0x1F, 0xF, 0x7, 0x3, 0x1
88 w
= (*u
& masks
[len
]);
93 w
= (w
<< 6)| (u
[0] & 0x3F);
99 unsigned char *SLutf8_skip_char (unsigned char *s
, unsigned char *smax
)
113 if (is_invalid_or_overlong_utf8 (s
, len
))
119 SLuchar_Type
*SLutf8_skip_chars (SLuchar_Type
*s
, SLuchar_Type
*smax
,
120 unsigned int num
, unsigned int *dnum
,
121 int ignore_combining
)
126 while ((n
< num
) && (s
< smax
))
128 unsigned int len
= Len_Map
[*s
];
144 if (is_invalid_or_overlong_utf8 (s
, len
))
151 if (ignore_combining
)
153 SLwchar_Type w
= fast_utf8_decode (s
, len
);
154 if (0 != SLwchar_wcwidth (w
))
164 if (ignore_combining
)
169 unsigned int nconsumed
;
170 if (NULL
== SLutf8_decode (s
, smax
, &w
, &nconsumed
))
173 if (0 != SLwchar_wcwidth (w
))
186 SLuchar_Type
*SLutf8_bskip_chars (SLuchar_Type
*smin
, SLuchar_Type
*s
,
187 unsigned int num
, unsigned int *dnum
,
188 int ignore_combining
)
191 SLuchar_Type
*smax
= s
;
194 while ((n
< num
) && (s
> smin
))
210 && (Len_Map
[ch
] == 0)
211 && (dn
< SLUTF8_MAX_MBLEN
))
220 /* Invalid sequence */
232 if ((NULL
== (s1
= SLutf8_decode (s
, smax
, &w
, NULL
)))
235 /* This means we backed up over an invalid sequence */
236 dn
= (unsigned int) (smax
- s
);
243 if ((ignore_combining
== 0)
244 || (0 != SLwchar_wcwidth (w
)))
256 SLuchar_Type
*SLutf8_bskip_char (SLuchar_Type
*smin
, SLuchar_Type
*s
)
264 s
= SLutf8_bskip_chars (smin
, s
+1, 1, &dn
, 0);
270 /* This function counts the number of wide characters in a UTF-8 encoded
271 * string. Each byte in an invalid sequence is counted as a single character.
272 * If the string contains illegal values, the bytes making up the character are
273 * counted as 1 character.
275 unsigned int SLutf8_strlen (SLuchar_Type
*s
, int ignore_combining
)
277 unsigned int count
, len
;
282 len
= strlen ((char *)s
);
283 (void) SLutf8_skip_chars (s
, s
+ len
, len
, &count
, ignore_combining
);
289 * This function returns NULL if the input does not correspond to a valid
290 * UTF-8 sequence, otherwise, it returns the position of the next character
293 unsigned char *SLutf8_decode (unsigned char *u
, unsigned char *umax
,
294 SLwchar_Type
*wp
, unsigned int *nconsumedp
)
303 if (nconsumedp
!= NULL
)
311 if (nconsumedp
!= NULL
) *nconsumedp
= 1;
318 /* should not happen--- code here for completeness */
319 if (nconsumedp
!= NULL
) *nconsumedp
= 1;
324 if (nconsumedp
!= NULL
) *nconsumedp
= 1; /* (unsigned int) (umax - u); */
328 if (is_invalid_or_overlong_utf8 (u
, len
))
330 if (nconsumedp
!= NULL
)
336 if (nconsumedp
!= NULL
)
339 *wp
= w
= fast_utf8_decode (u
, len
);
341 if (IS_ILLEGAL_UNICODE(w
))
348 /* Encode the wide character returning a pointer to the end of the
349 * utf8 of the encoded multi-byte character. This function will also encode
350 * illegal unicode values. It returns NULL if buflen is too small.
351 * Otherwise, it returns a pointer at the end of the last encoded byte.
352 * It does not null terminate the encoded string.
354 SLuchar_Type
*SLutf8_encode (SLwchar_Type w
, SLuchar_Type
*u
, unsigned int ulen
)
356 SLuchar_Type
*umax
= u
+ ulen
;
358 /* U-00000000 - U-0000007F: 0xxxxxxx */
364 *u
++ = (unsigned char) w
;
368 /* U-00000080 - U-000007FF: 110xxxxx 10xxxxxx */
374 *u
++ = (w
>> 6) | 0xC0;
375 *u
++ = (w
& 0x3F) | 0x80;
379 /* First bad character starts at 0xD800 */
381 /* Allow illegal values to be encoded */
384 *if (IS_ILLEGAL_UNICODE(w))
388 /* U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx */
393 *u
++ = (w
>> 12 ) | 0xE0;
397 /* U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
402 *u
++ = (w
>> 18) | 0xF0;
406 /* U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
411 *u
++ = (w
>> 24) | 0xF8;
415 /* U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
420 *u
++ = (w
>> 30) | 0xFC;
427 finish_5
: *u
++ = ((w
>> 24) & 0x3F)|0x80;
428 finish_4
: *u
++ = ((w
>> 18) & 0x3F)|0x80;
429 finish_3
: *u
++ = ((w
>> 12) & 0x3F)|0x80;
430 finish_2
: *u
++ = ((w
>> 6) & 0x3F)|0x80;
431 *u
++ = (w
& 0x3F)|0x80;
436 /* Like SLutf8_encode, but null terminates the result.
437 * At least SLUTF8_MAX_MBLEN+1 bytes assumed.
439 SLuchar_Type
*SLutf8_encode_null_terminate (SLwchar_Type w
, SLuchar_Type
*u
)
443 p
= SLutf8_encode (w
, u
, SLUTF8_MAX_MBLEN
);
450 int SLutf8_decode_bytes (SLuchar_Type
*u
, SLuchar_Type
*umax
,
451 unsigned char *b
, unsigned int *np
)
460 if (0 == (*u
& 0x80))
466 if (NULL
== (u
= SLutf8_decode (u
, umax
, &w
, NULL
)))
467 return -1; /* FIXME: HANDLE ERROR */
472 sprintf (bmax
, "<U+%04X>", w
);
473 bmax
+= strlen (bmax
);
476 /* FIXME: HANDLE ERROR */
487 /* UTF-8 Encode the bytes between b and bmax storing the results in the
488 * buffer defined by u and umax, returning the position following the
489 * last encoded character. Upon return, *np is set to the number of bytes
490 * successfully encoded.
492 SLuchar_Type
*SLutf8_encode_bytes (unsigned char *b
, unsigned char *bmax
,
493 SLuchar_Type
*u
, unsigned int ulen
,
496 unsigned char *bstart
= b
;
497 SLuchar_Type
*umax
= u
+ ulen
;
503 if (0 == (*b
& 0x80))
512 if (NULL
== (u1
= SLutf8_encode (*b
, u
, umax
- u
)))
526 static SLuchar_Type
*xform_utf8 (SLuchar_Type
*u
, SLuchar_Type
*umax
,
527 SLwchar_Type (*fun
)(SLwchar_Type
))
529 SLuchar_Type
*buf
, *p
;
530 unsigned int malloced_len
, len
;
543 unsigned int nconsumed
;
545 if (malloced_len
<= len
+ SLUTF8_MAX_MBLEN
)
547 SLuchar_Type
*newbuf
;
548 malloced_len
+= 1 + (umax
- u
) + SLUTF8_MAX_MBLEN
;
550 newbuf
= (SLuchar_Type
*)SLrealloc ((char *)buf
, malloced_len
);
553 SLfree ((char *)buf
);
563 p
= (SLuchar_Type
*) SLang_create_nslstring ((char *)buf
, len
);
564 SLfree ((char *)buf
);
568 if (NULL
== (u1
= SLutf8_decode (u
, umax
, &w
, &nconsumed
)))
570 /* Invalid sequence */
571 memcpy ((char *) p
, u
, nconsumed
);
580 p1
= SLutf8_encode ((*fun
)(w
), p
, malloced_len
);
583 SLfree ((char *)buf
);
584 SLang_verror (SL_INTERNAL_ERROR
, "SLutf8_encode returned NULL");
595 /* Returns an uppercased version of a UTF-8 encoded string. Illegal or
596 * invalid sequences will be returned as-is. This function returns
599 SLuchar_Type
*SLutf8_strup (SLuchar_Type
*u
, SLuchar_Type
*umax
)
601 return xform_utf8 (u
, umax
, SLwchar_toupper
);
604 /* Returns a lowercased version of a UTF-8 encoded string. Illegal or
605 * invalid sequences will be returned as-is. This function returns
608 SLuchar_Type
*SLutf8_strlo (SLuchar_Type
*u
, SLuchar_Type
*umax
)
610 return xform_utf8 (u
, umax
, SLwchar_tolower
);
613 int SLutf8_compare (SLuchar_Type
*a
, SLuchar_Type
*amax
,
614 SLuchar_Type
*b
, SLuchar_Type
*bmax
,
618 while (nchars
&& (a
< amax
) && (b
< bmax
))
620 SLwchar_Type cha
, chb
;
626 cha
= (SLwchar_Type
) *a
++;
631 aok
= (NULL
!= SLutf8_decode (a
, amax
, &cha
, &na
));
637 chb
= (SLwchar_Type
) *b
++;
642 bok
= (NULL
!= SLutf8_decode (b
, bmax
, &chb
, &nb
));
652 cha
= SLwchar_toupper (cha
);
653 chb
= SLwchar_toupper (chb
);
673 if ((a
>= amax
) && (b
>= bmax
))
683 /* Returns an SLstring */
684 SLstr_Type
*SLutf8_subst_wchar (SLuchar_Type
*u
, SLuchar_Type
*umax
,
685 SLwchar_Type wch
, unsigned int pos
,
686 int ignore_combining
)
688 SLuchar_Type
*a
, *a1
, *b
;
690 SLuchar_Type buf
[SLUTF8_MAX_MBLEN
+1];
692 unsigned int n1
, n2
, n3
, len
;
694 a
= SLutf8_skip_chars (u
, umax
, pos
, &dpos
, ignore_combining
);
696 if ((dpos
!= pos
) || (a
== umax
))
698 SLang_verror (SL_INDEX_ERROR
, "Specified character position is invalid for string");
702 a1
= SLutf8_skip_chars (a
, umax
, 1, NULL
, ignore_combining
);
704 b
= SLutf8_encode (wch
, buf
, SLUTF8_MAX_MBLEN
);
707 SLang_verror (SL_UNICODE_ERROR
, "Unable to encode wchar 0x%lX", (unsigned long)wch
);
715 c
= _pSLallocate_slstring (len
);
719 memcpy (c
, (char *)u
, n1
);
720 memcpy (c
+n1
, (char *)buf
, n2
);
721 memcpy (c
+n1
+n2
, (char *)a1
, n3
);
724 /* No need to worry about this failing-- it frees its argument */
725 return _pSLcreate_via_alloced_slstring (c
, len
);
729 /* utf8 buffer assumed to be at least SLUTF8_MAX_MBLEN+1 bytes. Result will be
730 * null terminated. Returns position of NEXT character.
733 SLuchar_Type
*SLutf8_extract_utf8_char (SLuchar_Type
*u
,
739 u1
= SLutf8_skip_char (u
, umax
);
740 memcpy ((char *)utf8
, u
, u1
-u
);
748 /* These routines depend upon the value of the _pSLinterp_UTF8_Mode variable.
749 * They also generate slang errors upon error.
751 SLuchar_Type
*_pSLinterp_decode_wchar (SLuchar_Type
*u
,
755 if (_pSLinterp_UTF8_Mode
== 0)
758 *chp
= (SLwchar_Type
) *u
++;
762 if (NULL
== (u
= SLutf8_decode (u
, umax
, chp
, NULL
)))
763 SLang_verror (SL_INVALID_UTF8
, "Invalid UTF-8 encoded string");
768 /* At least SLUTF8_MAX_MBLEN+1 bytes assumed-- null terminates result.
769 * Upon success, it returns a pointer to the _end_ of the encoded character
771 SLuchar_Type
*_pSLinterp_encode_wchar (SLwchar_Type wch
, SLuchar_Type
*u
, unsigned int *encoded_len
)
775 if (_pSLinterp_UTF8_Mode
== 0)
778 *u
++ = (SLuchar_Type
) wch
;
783 if (NULL
== (u1
= SLutf8_encode_null_terminate (wch
, u
)))
785 SLang_verror (SL_UNICODE_ERROR
, "Unable to encode character 0x%lX", (unsigned long)wch
);
789 *encoded_len
= (unsigned int) (u1
- u
);
794 int main (int argc
, char **argv
)
796 unsigned char *s
, *smax
;
807 char *long_tests
[] =
818 while ((s
= (unsigned char *) *t
++) != NULL
)
820 smax
= s
+ strlen ((char *)s
);
826 if (NULL
== (s
= SLutf8_to_wc (s
, smax
, &w
)))
828 fprintf (stderr
, "SLutf8_to_wc failed\n");
833 fprintf (stdout
, " 0x%X", w
);
836 fprintf (stdout
, "\n");