Bring in an errno.9 manual page (based on NetBSD's).
[dragonfly.git] / contrib / nvi2 / common / encoding.c
blob7bdcf7069238c96c4d177aa08839083a9882c1f9
1 /*-
2 * Copyright (c) 2011, 2012
3 * Zhihao Yuan. All rights reserved.
5 * See the LICENSE file for redistribution information.
6 */
#if !defined(lint)
/* Version-control identification string; compiled out when `lint' is defined. */
static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
#endif /* !lint */
12 #include <sys/types.h>
/* Forward declarations for the four encoding helpers defined in this file. */
int	looks_utf8(const char *ibuf, size_t nbytes);
int	looks_utf16(const char *ibuf, size_t nbytes);
int	decode_utf8(const char *ibuf);
int	decode_utf16(const char *ibuf, int bigend);
/*
 * One-letter classes used only to keep the table below readable.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *	Per-byte classification table, indexed by the unsigned byte value.
 *	The table is only ever read, so it is declared const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
47 * looks_utf8 --
48 * Decide whether some text looks like UTF-8. Returns:
50 * -1: invalid UTF-8
51 * 0: uses odd control characters, so doesn't look like text
52 * 1: 7-bit text
53 * 2: definitely UTF-8 text (valid high-bit set bytes)
55 * Based on RFC 3629. UTF-8 with BOM is not accepted.
57 * PUBLIC: int looks_utf8(const char *, size_t);
59 int
60 looks_utf8(const char *ibuf, size_t nbytes)
62 const u_char *buf = (u_char *)ibuf;
63 size_t i;
64 int n;
65 int gotone = 0, ctrl = 0;
67 for (i = 0; i < nbytes; i++) {
68 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
70 * Even if the whole file is valid UTF-8 sequences,
71 * still reject it if it uses weird control characters.
74 if (text_chars[buf[i]] != T)
75 ctrl = 1;
76 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
77 return -1;
78 } else { /* 11xxxxxx begins UTF-8 */
79 int following;
81 if ((buf[i] & 0x20) == 0) /* 110xxxxx */
82 if (buf[i] > 0xC1) /* C0, C1 */
83 following = 1;
84 else return -1;
85 else if ((buf[i] & 0x10) == 0) /* 1110xxxx */
86 following = 2;
87 else if ((buf[i] & 0x08) == 0) /* 11110xxx */
88 if (buf[i] < 0xF5)
89 following = 3;
90 else return -1; /* F5, F6, F7 */
91 else
92 return -1; /* F8~FF */
94 for (n = 0; n < following; n++) {
95 i++;
96 if (i >= nbytes)
97 goto done;
99 if (buf[i] & 0x40) /* 10xxxxxx */
100 return -1;
103 gotone = 1;
106 done:
107 return ctrl ? 0 : (gotone ? 2 : 1);
111 * looks_utf16 --
112 * Decide whether some text looks like UTF-16. Returns:
114 * 0: invalid UTF-16
115 * 1: Little-endian UTF-16
116 * 2: Big-endian UTF-16
118 * PUBLIC: int looks_utf16(const char *, size_t);
121 looks_utf16(const char *ibuf, size_t nbytes)
123 const u_char *buf = (u_char *)ibuf;
124 int bigend;
125 size_t i;
126 unsigned int c;
127 int bom;
128 int following = 0;
130 if (nbytes < 2)
131 return 0;
133 bom = buf[0] << 8 ^ buf[1];
134 if (bom == 0xFFFE)
135 bigend = 0;
136 else if (bom == 0xFEFF)
137 bigend = 1;
138 else
139 return 0;
141 for (i = 2; i + 1 < nbytes; i += 2) {
142 if (bigend)
143 c = buf[i] << 8 ^ buf[i + 1];
144 else
145 c = buf[i] ^ buf[i + 1] << 8;
147 if (!following)
148 if (c < 0xD800 || c > 0xDFFF)
149 if (c < 128 && text_chars[c] != T)
150 return 0;
151 else
152 following = 0;
153 else if (c > 0xDBFF)
154 return 0;
155 else {
156 following = 1;
157 continue;
159 else if (c < 0xDC00 || c > 0xDFFF)
160 return 0;
163 return 1 + bigend;
/* Retire the one-letter table shorthands so they cannot leak into later code. */
#undef X
#undef I
#undef T
#undef F
/*
 * decode_utf8 --
 *	Turn the UTF-8 sequence starting at ibuf into a Unicode code point.
 *	Returns -1 when the first byte cannot start a sequence (10xxxxxx,
 *	or 0xF8 and above).
 *
 *	Based on RFC 3629, but without error detection: trailing bytes are
 *	folded in as-is, and one to three bytes past ibuf[0] are read
 *	depending on the lead byte.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const int lead = p[0];

	if (lead < 0x80)			/* 0xxxxxxx: ASCII */
		return lead;
	if (lead < 0xC0)			/* 10xxxxxx: not a leader */
		return -1;
	if (lead < 0xE0)			/* 110xxxxx: two bytes */
		return ((lead ^ 0xC0) << 6) ^ (p[1] ^ 0x80);
	if (lead < 0xF0)			/* 1110xxxx: three bytes */
		return ((lead ^ 0xE0) << 12) ^ ((p[1] ^ 0x80) << 6) ^
		    (p[2] ^ 0x80);
	if (lead < 0xF8)			/* 11110xxx: four bytes */
		return ((lead ^ 0xF0) << 18) ^ ((p[1] ^ 0x80) << 12) ^
		    ((p[2] ^ 0x80) << 6) ^ (p[3] ^ 0x80);

	return -1;				/* 0xF8..0xFF */
}
/*
 * decode_utf16 --
 *	Turn the UTF-16 code unit(s) starting at ibuf into a Unicode code
 *	point.  A non-zero bigend selects big-endian byte order, zero
 *	selects little-endian.  Returns -1 when the first code unit is a
 *	low surrogate (DC00..DFFF).
 *
 *	No error detection on supplementary bytes: after a high surrogate
 *	the next code unit is combined without being validated.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const int hi = bigend ? 0 : 1;	/* offset of the high-order byte */
	const int lo = 1 - hi;		/* offset of the low-order byte */
	unsigned int lead, trail;

	lead = (p[hi] << 8) ^ p[lo];

	/* Anything outside the surrogate range stands for itself. */
	if (lead < 0xD800 || lead > 0xDFFF)
		return lead;

	/* A low surrogate cannot come first. */
	if (lead > 0xDBFF)
		return -1;

	trail = (p[2 + hi] << 8) ^ p[2 + lo];
	return (((lead ^ 0xD800) << 10) ^ (trail ^ 0xDC00)) + 0x10000;
}