update to debhelper v9
[moreutils.git] / isutf8.c
blob971112457c834ca97fdecb5880dc7c749dc8ca44
/*
 * isutf8.c - do the input files look like valid utf-8 byte streams?
 *
 * Copyright (C) 2005 Lars Wirzenius
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #include <assert.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <errno.h>
25 #include <string.h>
26 #include <getopt.h>
/* Program version string printed by usage(). */
#define VERSION "1.1"

/*
 * Code to indicate an invalid UTF8 character.
 *
 * This is a macro with an unsigned long type rather than an enumeration
 * constant: the value 0xffffffff does not fit in a 32-bit int, and ISO C
 * (before C23) requires every enumerator to be representable as an int.
 * The value itself is unchanged.  It can never collide with a decoded
 * character, because the longest sequence handled in this file (6 bytes)
 * carries at most 31 bits of payload.
 */
#define INVALID_CHAR 0xffffffffUL
/*
 * Write the shortest UTF8 encoding of the value 'u' into 'buf' and
 * return how many bytes were written (1 to 6).
 *
 * Returns -1, leaving 'buf' untouched, when 'u' is larger than
 * 0x7FFFFFFF (the most a 6-byte sequence can hold) or when the
 * encoding would need more than 'maxbytes' bytes.
 */
static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
{
	/* limit[k] is the largest value that fits in a (k + 1)-byte sequence. */
	static const unsigned long limit[] = {
		0x0000007FUL,
		0x000007FFUL,
		0x0000FFFFUL,
		0x001FFFFFUL,
		0x03FFFFFFUL,
		0x7FFFFFFFUL,
	};
	enum { NLIMITS = sizeof(limit) / sizeof(limit[0]) };
	unsigned char lead_mask;
	int len;
	int pos;

	if (u > limit[NLIMITS - 1])
		return -1;

	/* The range check above guarantees this stops with len <= NLIMITS. */
	len = 1;
	while (u > limit[len - 1])
		++len;

	if ((size_t) len > maxbytes)
		return -1;

	/* Single-byte values are stored as they are. */
	if (len == 1) {
		buf[0] = (unsigned char) u;
		return len;
	}

	/* Fill the continuation bytes from the end, six payload bits each. */
	for (pos = len - 1; pos > 0; --pos) {
		buf[pos] = (unsigned char) (0x80 | (u & 0x3f));
		u >>= 6;
	}

	/* The lead byte has 'len' one bits on top, then the leftover bits. */
	lead_mask = (unsigned char) ~(0xFF >> len);
	buf[0] = (unsigned char) (lead_mask | u);

	return len;
}
/*
 * Count the consecutive one bits at the top of a byte, starting at
 * bit 7 and working downwards.  For example 0xC3 gives 2, 0x41 gives 0
 * and 0xFF gives 8.  'c' is expected to be a byte value (0..255).
 */
static int high_ones(int c)
{
	int count = 0;

	/* Each shift moves the next lower bit up into the 0x80 position. */
	while ((c & 0x80) != 0) {
		++count;
		c <<= 1;
	}

	return count;
}
104 * Decode a UTF8 character from an array of bytes. Return character code.
105 * Upon error, return INVALID_CHAR.
107 static unsigned long decodeutf8(unsigned char *buf, int nbytes)
109 unsigned long u;
110 int i, j;
112 if (nbytes <= 0)
113 return INVALID_CHAR;
115 if (nbytes == 1) {
116 if (buf[0] >= 0x80)
117 return INVALID_CHAR;
118 return buf[0];
121 i = high_ones(buf[0]);
122 if (i != nbytes)
123 return INVALID_CHAR;
124 u = buf[0] & (0xff >> i);
125 for (j = 1; j < nbytes; ++j) {
126 if ((buf[j] & 0xC0) != 0x80)
127 return INVALID_CHAR;
128 u = (u << 6) | (buf[j] & 0x3f);
131 /* Conforming UTF-8 cannot contain codes 0xd800–0xdfff (UTF-16
132 surrogates) as well as 0xfffe and 0xffff. */
133 if (u >= 0xD800 && u <= 0xDFFF)
134 return INVALID_CHAR;
135 if (u == 0xFFFE || u == 0xFFFF)
136 return INVALID_CHAR;
138 return u;
143 * Determine if the contents of an open file form a valid UTF8 byte stream.
144 * Do this by collecting bytes for a character into a buffer and then
145 * decode the bytes and re-encode them and compare that they are identical
146 * to the original bytes. If any step fails, return 0 for error. If EOF
147 * is reached, return 1 for OK.
149 static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
150 enum { MAX_UTF8_BYTES = 6 };
151 unsigned char buf[MAX_UTF8_BYTES];
152 unsigned char buf2[MAX_UTF8_BYTES];
153 int nbytes, nbytes2;
154 int c;
155 unsigned long code;
156 unsigned long line, col, byteoff;
158 nbytes = 0;
159 line = 1;
160 col = 1;
161 byteoff = 0;
163 for (;;) {
164 c = getc(file);
166 if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
167 /* New char starts, deal with previous one. */
168 if (nbytes > 0) {
169 code = decodeutf8(buf, nbytes);
170 if (code == INVALID_CHAR)
171 goto error;
172 nbytes2 = encodeutf8(code, buf2,
173 MAX_UTF8_BYTES);
174 if (nbytes != nbytes2 ||
175 memcmp(buf, buf2, nbytes) != 0)
176 goto error;
177 ++col;
179 nbytes = 0;
180 /* If it's UTF8, start collecting again. */
181 if (c != EOF && c >= 0x80)
182 buf[nbytes++] = c;
183 } else {
184 /* This is a continuation byte, append to buffer. */
185 if (nbytes == MAX_UTF8_BYTES)
186 goto error;
187 buf[nbytes++] = c;
190 if (c == EOF)
191 break;
192 else if (c == '\n') {
193 ++line;
194 byteoff = 0;
195 col = 1;
196 } else
197 ++byteoff;
200 if (nbytes != 0)
201 goto error;
203 return 1;
205 error:
206 if (!quiet) {
207 printf("%s: line %lu, char %lu, byte offset %lu: "
208 "invalid UTF-8 code\n", filename, line, col, byteoff);
210 return 0;
214 static void usage(const char *program_name) {
215 printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
216 program_name);
217 printf("Check whether input files are valid UTF-8.\n");
218 printf("This is version %s.\n", VERSION);
/*
 * Entry point.  Parses -h/--help and -q/--quiet, then checks every file
 * named on the command line, or stdin when no file is given.
 *
 * Exit status is 0 when every input was opened and found to be valid,
 * and EXIT_FAILURE otherwise (including on an unrecognised option).
 */
int main(int argc, char **argv) {
	int quiet = 0;
	struct option long_opts[] = {
		{ "help", no_argument, NULL, 'h' },
		/* getopt_long stores 1 in 'quiet' itself and returns 0. */
		{ "quiet", no_argument, &quiet, 1 },
		{ 0, 0, 0, 0 }
	};
	int all_valid = 1;
	int choice;
	char **arg;

	for (;;) {
		choice = getopt_long(argc, argv, "hq", long_opts, NULL);
		if (choice == -1)
			break;

		if (choice == 0) {
			/* Long option that only set a flag; nothing to do. */
			continue;
		} else if (choice == 'h') {
			usage(argv[0]);
			exit(0);
		} else if (choice == 'q') {
			quiet = 1;
		} else if (choice == '?') {
			/* getopt_long has already reported the problem. */
			exit(EXIT_FAILURE);
		} else {
			abort();
		}
	}

	if (optind == argc) {
		all_valid = is_utf8_byte_stream(stdin, "stdin", quiet);
	} else {
		/* Keep going after a failure so every file gets checked. */
		for (arg = argv + optind; arg < argv + argc; ++arg) {
			FILE *in = fopen(*arg, "r");

			if (in == NULL) {
				fprintf(stderr, "isutf8: %s: error %d: %s\n",
					*arg, errno,
					strerror(errno));
				all_valid = 0;
				continue;
			}

			if (! is_utf8_byte_stream(in, *arg, quiet))
				all_valid = 0;
			(void) fclose(in);
		}
	}

	exit(all_valid ? 0 : EXIT_FAILURE);
}