2 * isutf8.c - do the input files look like valid utf-8 byte streams?
4 * Copyright (C) 2005 Lars Wirzenius
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
33 * Code to indicate an invalid UTF8 character.
35 enum { INVALID_CHAR
= 0xffffffff };
39 * Produce shortest UTF8 encoding of a 31-bit value in 'u', returning it
40 * in the array 'buf'. Return the number of bytes in the encoded value.
41 * If the value is too large (more than 31 bits or would take more than
42 * 'maxbytes' bytes), return -1.
44 static int encodeutf8(unsigned long u
, unsigned char *buf
, size_t maxbytes
)
57 static const int ntab
= sizeof(tab
) / sizeof(tab
[0]);
60 if (u
> tab
[ntab
-1].max
)
63 for (i
= 0; i
< ntab
; ++i
) {
69 if (tab
[i
].nbytes
> maxbytes
)
72 if (tab
[i
].nbytes
== 1) { /* Special case */
75 for (j
= tab
[i
].nbytes
-1; j
> 0; --j
) {
76 buf
[j
] = 0x80 | (u
& 0x3f);
80 unsigned char mask
= ~(0xFF >> tab
[i
].nbytes
);
89 * Return number of ones at the top of a byte.
91 * I'm pretty sure there is a fancy trick to do this without a loop,
92 * but I'm too tired to figure it out now. --liw
94 static int high_ones(int c
) {
97 for (n
= 0; (c
& 0x80) == 0x80; c
<<= 1)
104 * Decode a UTF8 character from an array of bytes. Return character code.
105 * Upon error, return INVALID_CHAR.
107 static unsigned long decodeutf8(unsigned char *buf
, int nbytes
)
121 i
= high_ones(buf
[0]);
124 u
= buf
[0] & (0xff >> i
);
125 for (j
= 1; j
< nbytes
; ++j
) {
126 if ((buf
[j
] & 0xC0) != 0x80)
128 u
= (u
<< 6) | (buf
[j
] & 0x3f);
135 * Determine if the contents of an open file form a valid UTF8 byte stream.
136 * Do this by collecting the bytes of one character into a buffer, then
137 * decoding them, re-encoding the result, and checking that it is identical
138 * to the original bytes. If any step fails, return 0 for error. If EOF
139 * is reached, return 1 for OK.
141 static int is_utf8_byte_stream(FILE *file
, char *filename
, int quiet
) {
142 enum { MAX_UTF8_BYTES
= 6 };
143 unsigned char buf
[MAX_UTF8_BYTES
];
144 unsigned char buf2
[MAX_UTF8_BYTES
];
148 unsigned long line
, col
, byteoff
;
158 if (c
== EOF
|| c
< 0x80 || (c
& 0xC0) != 0x80) {
159 /* New char starts, deal with previous one. */
161 code
= decodeutf8(buf
, nbytes
);
162 if (code
== INVALID_CHAR
)
164 nbytes2
= encodeutf8(code
, buf2
,
166 if (nbytes
!= nbytes2
||
167 memcmp(buf
, buf2
, nbytes
) != 0)
172 /* If it's UTF8, start collecting again. */
173 if (c
!= EOF
&& c
>= 0x80)
176 /* This is a continuation byte, append to buffer. */
177 if (nbytes
== MAX_UTF8_BYTES
)
184 else if (c
== '\n') {
199 printf("%s: line %lu, char %lu, byte offset %lu: "
200 "invalid UTF-8 code\n", filename
, line
, col
, byteoff
);
206 static void usage(const char *program_name
) {
207 printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
209 printf("Check whether input files are valid UTF-8.\n");
210 printf("This is version %s.\n", VERSION
);
214 int main(int argc
, char **argv
) {
219 struct option options
[] = {
220 { "help", no_argument
, NULL
, 'h' },
221 { "quiet", no_argument
, &quiet
, 1 },
227 while ((opt
= getopt_long(argc
, argv
, "hq", options
, NULL
)) != -1) {
250 ok
= is_utf8_byte_stream(stdin
, "stdin", quiet
);
253 for (i
= optind
; i
< argc
; ++i
) {
254 file
= fopen(argv
[i
], "r");
256 fprintf(stderr
, "isutf8: %s: error %d: %s\n",
261 if (! is_utf8_byte_stream(file
, argv
[i
], quiet
))