/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */
/*
 * Encoding detection and decoding entry points defined in this file.
 * The looks_* functions take a byte buffer and its length; the decode_*
 * functions take a pointer to the first byte of one encoded character.
 */
int looks_utf8(const char *ibuf, size_t nbytes);
int looks_utf16(const char *ibuf, size_t nbytes);
int decode_utf8(const char *ibuf);
int decode_utf16(const char *ibuf, int bigend);
/* Byte classes stored in text_chars[]. */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *	Class (F, T, I or X) of every possible byte value, indexed by the
 *	byte itself.  One row per high nibble, sixteen entries per row.
 *	The table is only ever read, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	/*                                       DEL */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
44 * Decide whether some text looks like UTF-8. Returns:
47 * 0: uses odd control characters, so doesn't look like text
49 * 2: definitely UTF-8 text (valid high-bit set bytes)
51 * Based on RFC 3629. UTF-8 with BOM is not accepted.
53 * PUBLIC: int looks_utf8(const char *, size_t);
56 looks_utf8(const char *ibuf
, size_t nbytes
)
58 const u_char
*buf
= (u_char
*)ibuf
;
61 int gotone
= 0, ctrl
= 0;
63 for (i
= 0; i
< nbytes
; i
++) {
64 if ((buf
[i
] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
66 * Even if the whole file is valid UTF-8 sequences,
67 * still reject it if it uses weird control characters.
70 if (text_chars
[buf
[i
]] != T
)
72 } else if ((buf
[i
] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
74 } else { /* 11xxxxxx begins UTF-8 */
77 if ((buf
[i
] & 0x20) == 0) /* 110xxxxx */
78 if (buf
[i
] > 0xC1) /* C0, C1 */
81 else if ((buf
[i
] & 0x10) == 0) /* 1110xxxx */
83 else if ((buf
[i
] & 0x08) == 0) /* 11110xxx */
86 else return -1; /* F5, F6, F7 */
88 return -1; /* F8~FF */
90 for (n
= 0; n
< following
; n
++) {
95 if ((buf
[i
] & 0xc0) != 0x80) /* 10xxxxxx */
103 return ctrl
? 0 : (gotone
? 2 : 1);
108 * Decide whether some text looks like UTF-16. Returns:
111 * 1: Little-endian UTF-16
112 * 2: Big-endian UTF-16
114 * PUBLIC: int looks_utf16(const char *, size_t);
117 looks_utf16(const char *ibuf
, size_t nbytes
)
119 const u_char
*buf
= (u_char
*)ibuf
;
129 bom
= buf
[0] << 8 ^ buf
[1];
132 else if (bom
== 0xFEFF)
137 for (i
= 2; i
+ 1 < nbytes
; i
+= 2) {
139 c
= buf
[i
] << 8 ^ buf
[i
+ 1];
141 c
= buf
[i
] ^ buf
[i
+ 1] << 8;
144 if (c
< 0xD800 || c
> 0xDFFF)
145 if (c
< 128 && text_chars
[c
] != T
)
155 else if (c
< 0xDC00 || c
> 0xDFFF)
/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader.
 *
 *	Based on RFC 3629, but without error detection: the trailing bytes
 *	are read and combined without being validated, so ibuf must hold
 *	as many bytes as the leading byte announces (up to four).
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int u = -1;

	if ((buf[0] & 0x80) == 0) {		/* 0xxxxxxx: ASCII */
		u = buf[0];
	} else if ((buf[0] & 0x40) == 0) {	/* 10xxxxxx: not a leader */
		/* leave u at -1 */
	} else if ((buf[0] & 0x20) == 0) {	/* 110xxxxx: 2 bytes */
		u = (buf[0] ^ 0xC0) <<  6 ^ (buf[1] ^ 0x80);
	} else if ((buf[0] & 0x10) == 0) {	/* 1110xxxx: 3 bytes */
		u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) <<  6
		  ^ (buf[2] ^ 0x80);
	} else if ((buf[0] & 0x08) == 0) {	/* 11110xxx: 4 bytes */
		u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
		  ^ (buf[2] ^ 0x80) <<  6 ^ (buf[3] ^ 0x80);
	}
	/* 11111xxx is never a leader either; u stays -1. */

	return u;
}
/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	Returns -1 if the first unsigned integer is invalid, i.e. it is a
 *	low surrogate (0xDC00-0xDFFF).
 *
 *	bigend selects the byte order: non-zero for big-endian, zero for
 *	little-endian.  No error detection on supplementary bytes: when the
 *	first unit is a high surrogate, the next two bytes are read and
 *	combined without being validated.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = (unsigned int)buf[0] << 8 ^ buf[1];
	else
		w1 = buf[0] ^ (unsigned int)buf[1] << 8;

	/* Outside the surrogate range the unit is the code point itself. */
	if (w1 < 0xD800 || w1 > 0xDFFF)
		return (int)w1;

	/* A low surrogate cannot start a character. */
	if (w1 > 0xDBFF)
		return -1;

	if (bigend)
		w2 = (unsigned int)buf[2] << 8 ^ buf[3];
	else
		w2 = buf[2] ^ (unsigned int)buf[3] << 8;

	/* Ten payload bits from each surrogate, offset by 0x10000. */
	return (int)(((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000);
}