2 // Copyright (c) 2003 by Digital Mars
4 // written by Walter Bright
5 // http://www.digitalmars.com
6 // License for redistribution is by either the Artistic License
7 // in artistic.txt, or the GNU General Public License in gnu.txt.
8 // See the included readme.txt for details.
10 // Description of UTF-8 at:
11 // http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
18 int utf_isValidDchar(dchar_t c
)
21 (c
> 0xDFFF && c
<= 0x10FFFF && c
!= 0xFFFE && c
!= 0xFFFF);
24 /********************************************
25 * Decode a single UTF-8 character sequence.
28 * !=NULL error message string
31 char *utf_decodeChar(unsigned char *s
, size_t len
, size_t *pidx
, dchar_t
*presult
)
35 unsigned char u
= s
[i
];
37 assert(i
>= 0 && i
< len
);
43 /* The following encodings are valid, except for the 5 and 6 byte
47 * 1110xxxx 10xxxxxx 10xxxxxx
48 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
49 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
50 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
55 goto Lerr
; // only do the first 4 of 6 encodings
56 if (((u
<< n
) & 0x80) == 0)
64 // Pick off (7 - n) significant bits of B from first byte of octet
65 V
= (dchar_t
)(u
& ((1 << (7 - n
)) - 1));
67 if (i
+ (n
- 1) >= len
)
68 goto Lerr
; // off end of string
70 /* The following combinations are overlong, and illegal:
72 * 11100000 100xxxxx (10xxxxxx)
73 * 11110000 1000xxxx (10xxxxxx 10xxxxxx)
74 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
75 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
78 if ((u
& 0xFE) == 0xC0 ||
79 (u
== 0xE0 && (u2
& 0xE0) == 0x80) ||
80 (u
== 0xF0 && (u2
& 0xF0) == 0x80) ||
81 (u
== 0xF8 && (u2
& 0xF8) == 0x80) ||
82 (u
== 0xFC && (u2
& 0xFC) == 0x80))
83 goto Lerr
; // overlong combination
85 for (unsigned j
= 1; j
!= n
; j
++)
88 if ((u
& 0xC0) != 0x80)
89 goto Lerr
; // trailing bytes are 10xxxxxx
90 V
= (V
<< 6) | (u
& 0x3F);
92 if (!utf_isValidDchar(V
))
102 assert(utf_isValidDchar(V
));
108 *presult
= (dchar_t
) s
[i
];
110 return "invalid UTF-8 sequence";
113 /***************************************************
114 * Validate a UTF-8 string.
117 * !=NULL error message string
120 char *utf_validateString(unsigned char *s
, size_t len
)
126 for (idx
= 0; idx
< len
; )
128 err
= utf_decodeChar(s
, len
, &idx
, &dc
);
136 /********************************************
137 * Decode a single UTF-16 character sequence.
140 * !=NULL error message string
144 char *utf_decodeWchar(unsigned short *s
, size_t len
, size_t *pidx
, dchar_t
*presult
)
150 assert(i
>= 0 && i
< len
);
152 { if (u
>= 0xD800 && u
<= 0xDBFF)
156 { msg
= "surrogate UTF-16 high value past end of string";
160 if (u2
< 0xDC00 || u2
> 0xDFFF)
161 { msg
= "surrogate UTF-16 low value out of range";
164 u
= ((u
- 0xD7C0) << 10) + (u2
- 0xDC00);
167 else if (u
>= 0xDC00 && u
<= 0xDFFF)
168 { msg
= "unpaired surrogate UTF-16 value";
171 else if (u
== 0xFFFE || u
== 0xFFFF)
172 { msg
= "illegal UTF-16 value";
183 assert(utf_isValidDchar(u
));
185 *presult
= (dchar_t
)u
;
189 *presult
= (dchar_t
)s
[i
];