2 * isutf8.c - do the input files look like valid utf-8 byte streams?
4 * Copyright (C) 2005 Lars Wirzenius
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
30 * I'm pretty sure there is a fancy trick to do this without a loop,
31 * but I'm too tired to figure it out now. --liw
33 static int high_ones(int c
) {
36 for (n
= 0; (c
& 0x80) == 0x80; c
<<= 1)
41 static int is_utf8_byte_stream(FILE *file
, char *filename
, int quiet
) {
42 int c
, n
, remaining_bytes
;
43 unsigned long line
, col
;
48 while ((c
= getc(file
)) != EOF
) {
50 if (remaining_bytes
> 0) {
53 if (remaining_bytes
== 0)
58 /* 7-bit character, skip, but adjust position */
65 goto error
; /* wrong place for continuation byte */
67 remaining_bytes
= n
- 1; /* start of multi-byte sequence */
69 if (remaining_bytes
> 0)
75 printf("%s: line %lu, col %lu: invalid UTF-8 code\n",
81 static void usage(const char *program_name
) {
82 printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", program_name
);
83 printf("Check whether input files are valid UTF-8.\n");
84 printf("This is version %s.\n", VERSION
);
87 int main(int argc
, char **argv
) {
92 struct option options
[] = {
93 { "help", no_argument
, NULL
, 'h' },
94 { "quiet", no_argument
, &quiet
, 1 },
100 while ((opt
= getopt_long(argc
, argv
, "hq", options
, NULL
)) != -1) {
123 ok
= is_utf8_byte_stream(stdin
, "stdin", quiet
);
126 for (i
= optind
; i
< argc
; ++i
) {
127 file
= fopen(argv
[i
], "r");
129 fprintf(stderr
, "isutf8: %s: error %d: %s\n",
130 argv
[i
], errno
, strerror(errno
));
133 ok
= is_utf8_byte_stream(file
, argv
[i
], quiet
) && ok
;