rtld - do not allow both dynamic DTV index and static TLS offset
[dragonfly.git] / contrib / nvi2 / common / encoding.c
bloba1ff1c1021354a210ee5ee86e540c81fd49d0e45
1 /*-
2 * Copyright (c) 2011, 2012
3 * Zhihao Yuan. All rights reserved.
5 * See the LICENSE file for redistribution information.
6 */
8 #include <sys/types.h>
/*
 * Forward declarations for the encoding helpers defined in this file.
 * The detectors take a buffer plus its length in bytes; the decoders
 * take a pointer to the first byte of a single encoded character.
 */
int	looks_utf8(const char *ibuf, size_t nbytes);	/* classify as UTF-8 */
int	looks_utf16(const char *ibuf, size_t nbytes);	/* classify as UTF-16 */
int	decode_utf8(const char *ibuf);			/* one UTF-8 char */
int	decode_utf16(const char *ibuf, int bigend);	/* one UTF-16 char */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Per-byte classification table, indexed by the byte value.  The
 * detectors below only ask whether an entry in the 7-bit range is T;
 * anything else in that range counts as an "odd control character".
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *	Decide whether some text looks like UTF-8. Returns:
 *
 *	   -1: invalid UTF-8
 *	    0: uses odd control characters, so doesn't look like text
 *	    1: 7-bit text
 *	    2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *	Based on RFC 3629. UTF-8 with BOM is not accepted.
 *
 *	Besides the lead-byte checks (C0, C1 and F5..FF are never valid),
 *	the second byte of a sequence is range-checked so that overlong
 *	forms (E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and
 *	values above U+10FFFF (F4 90..BF) are reported as invalid.
 *
 *	A multi-byte sequence cut off by the end of the buffer is not an
 *	error; it is simply not counted as evidence of UTF-8.
 *
 * PUBLIC: int looks_utf8(const char *, size_t);
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		unsigned char lead = buf[i];
		unsigned char lo = 0x80, hi = 0xBF; /* bounds for next byte */
		int following;

		if ((lead & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[lead] != T)
				ctrl = 1;
			continue;
		}

		/*
		 * 80..BF can never start a sequence, C0/C1 would only encode
		 * overlong ASCII, and F5..FF are outside RFC 3629.
		 */
		if (lead < 0xC2 || lead > 0xF4)
			return -1;

		if (lead < 0xE0) {		/* 110xxxxx */
			following = 1;
		} else if (lead < 0xF0) {	/* 1110xxxx */
			following = 2;
			if (lead == 0xE0)
				lo = 0xA0;	/* below is overlong */
			else if (lead == 0xED)
				hi = 0x9F;	/* above is a surrogate */
		} else {			/* 11110xxx */
			following = 3;
			if (lead == 0xF0)
				lo = 0x90;	/* below is overlong */
			else if (lead == 0xF4)
				hi = 0x8F;	/* above is > U+10FFFF */
		}

		for (; following > 0; following--) {
			if (++i >= nbytes)
				goto done;	/* truncated, tolerate */

			if (buf[i] < lo || buf[i] > hi)
				return -1;

			/* Only the second byte has a narrowed range. */
			lo = 0x80;
			hi = 0xBF;
		}

		gotone = 1;
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *	Decide whether some text looks like UTF-16. Returns:
 *
 *	    0: invalid UTF-16
 *	    1: Little-endian UTF-16
 *	    2: Big-endian UTF-16
 *
 *	The buffer must start with a byte order mark.  Code units in the
 *	7-bit range must be plain-text characters, a high surrogate must
 *	be followed by a low surrogate, and a low surrogate may not stand
 *	alone.  A trailing odd byte, or a high surrogate that is the last
 *	code unit in the buffer, is tolerated.
 *
 * PUBLIC: int looks_utf16(const char *, size_t);
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int bigend;
	size_t i;
	unsigned int c;
	int bom;
	int following = 0;	/* set while a low surrogate is expected */

	if (nbytes < 2)
		return 0;

	bom = buf[0] << 8 ^ buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = buf[i] << 8 ^ buf[i + 1];
		else
			c = buf[i] ^ buf[i + 1] << 8;

		if (following) {
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/*
			 * The pair is complete.  Without this reset every
			 * later code unit would have to be a low surrogate.
			 */
			following = 0;
			continue;
		}

		if (c >= 0xD800 && c <= 0xDBFF) {	/* high surrogate */
			following = 1;
			continue;
		}
		if (c >= 0xDC00 && c <= 0xDFFF)		/* stray low surrogate */
			return 0;
		if (c < 128 && text_chars[c] != T)
			return 0;
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X
/*
 * decode_utf8 --
 *	Decode one UTF-8 character from a byte string into its Unicode
 *	code point.  Returns -1 if the first byte is not a UTF-8 leader.
 *
 *	Based on RFC 3629, but without error detection: the trailing
 *	bytes are read and folded in unchecked, so the caller must supply
 *	as many bytes as the lead byte announces.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const unsigned char lead = p[0];

	if (lead < 0x80)		/* 0xxxxxxx: plain ASCII */
		return lead;
	if (lead < 0xC0)		/* 10xxxxxx: continuation byte */
		return -1;
	if (lead < 0xE0)		/* 110xxxxx: two bytes */
		return ((lead ^ 0xC0) << 6) ^ (p[1] ^ 0x80);
	if (lead < 0xF0)		/* 1110xxxx: three bytes */
		return ((lead ^ 0xE0) << 12) ^ ((p[1] ^ 0x80) << 6) ^
		    (p[2] ^ 0x80);
	if (lead < 0xF8)		/* 11110xxx: four bytes */
		return ((lead ^ 0xF0) << 18) ^ ((p[1] ^ 0x80) << 12) ^
		    ((p[2] ^ 0x80) << 6) ^ (p[3] ^ 0x80);

	return -1;			/* 11111xxx: never a leader */
}
/*
 * utf16_unit --
 *	Assemble one 16-bit code unit from two bytes in the requested
 *	byte order (non-zero bigend means most significant byte first).
 */
static unsigned int
utf16_unit(const unsigned char *p, int bigend)
{
	if (bigend)
		return p[0] << 8 ^ p[1];
	return p[0] ^ p[1] << 8;
}

/*
 * decode_utf16 --
 *	Decode one UTF-16 character from a byte string into its Unicode
 *	code point.  Returns -1 if the first code unit is invalid, i.e.
 *	a low surrogate.
 *
 *	No error detection on supplementary bytes: when the first code
 *	unit is a high surrogate, the next two bytes are read and folded
 *	in without checking that they form a low surrogate.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *bytes = (const unsigned char *)ibuf;
	unsigned int lead, trail;

	lead = utf16_unit(bytes, bigend);

	/* Anything outside the surrogate range stands for itself. */
	if (lead < 0xD800 || lead > 0xDFFF)
		return lead;

	/* A low surrogate cannot start a character. */
	if (lead > 0xDBFF)
		return -1;

	trail = utf16_unit(bytes + 2, bigend);
	return (((lead ^ 0xD800) << 10) ^ (trail ^ 0xDC00)) + 0x10000;
}