* ifdata: robustness patch from Adam Lackorzynski, in particular deal with
[moreutils.git] / isutf8.c
blob59749c0f49701106ac0aaa949678a8cbbf9de0bf
1 /*
2 * isutf8.c - do the input files look like valid utf-8 byte streams?
3 *
4 * Copyright (C) 2005 Lars Wirzenius
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <errno.h>
24 #include <string.h>
25 #include <getopt.h>
27 #define VERSION "1.0"
/*
 * Count the consecutive 1 bits at the top of the low byte of c.
 *
 * For a UTF-8 byte this classifies it: 0 means a 7-bit character,
 * 1 means a continuation byte, and 2..4 means the lead byte of a
 * sequence of that many bytes. Larger results (up to 8 for 0xFF)
 * never occur in valid UTF-8.
 *
 * Only the low 8 bits of c are examined, and the work is done on an
 * unsigned value so that no negative int is ever left-shifted.
 */
static int high_ones(int c) {
	unsigned int bits = (unsigned int) c & 0xFFu;
	int n = 0;

	while (bits & 0x80u) {
		++n;
		bits = (bits << 1) & 0xFFu;
	}
	return n;
}

/*
 * Check whether everything that can be read from file is well-formed
 * UTF-8.
 *
 * Beyond the lead/continuation byte structure, this rejects sequences
 * longer than four bytes, overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF.
 *
 * file     - stream to read until EOF; it is not closed here
 * filename - name used in diagnostics
 * quiet    - if non-zero, do not print the position of invalid input
 *
 * Returns 1 if the stream is valid UTF-8, 0 if it is not or if reading
 * failed. The position of the first invalid sequence is printed to
 * stdout unless quiet is set; read errors are reported on stderr.
 */
static int is_utf8_byte_stream(FILE *file, const char *filename, int quiet) {
	/* Smallest code point that genuinely needs 2, 3 or 4 bytes. */
	static const unsigned long min_code_point[] = {
		0x80UL, 0x800UL, 0x10000UL
	};
	int c, n, remaining_bytes;
	unsigned long line, col;
	unsigned long code_point, min_allowed;

	remaining_bytes = 0;
	code_point = 0;
	min_allowed = 0;
	line = 1;
	col = 1;
	while ((c = getc(file)) != EOF) {
		n = high_ones(c);
		if (remaining_bytes > 0) {
			if (n != 1)
				goto error; /* expected a continuation byte */
			code_point = (code_point << 6) | (unsigned long) (c & 0x3F);
			--remaining_bytes;
			if (remaining_bytes == 0) {
				/* Sequence complete: validate the decoded value. */
				if (code_point < min_allowed)
					goto error; /* overlong encoding */
				if (code_point >= 0xD800UL && code_point <= 0xDFFFUL)
					goto error; /* UTF-16 surrogate */
				if (code_point > 0x10FFFFUL)
					goto error; /* beyond the Unicode range */
				++col;
			}
		} else if (n == 0) {
			/* 7-bit character, skip, but adjust position */
			if (c == '\n') {
				++line;
				col = 1;
			} else
				++col;
		} else if (n == 1) {
			goto error; /* wrong place for continuation byte */
		} else if (n > 4) {
			goto error; /* 5+ byte forms, 0xFE and 0xFF are not UTF-8 */
		} else {
			/* start of multi-byte sequence: keep the payload bits */
			remaining_bytes = n - 1;
			code_point = (unsigned long) c & (0x7FUL >> n);
			min_allowed = min_code_point[n - 2];
		}
	}
	if (ferror(file)) {
		/* getc() returns EOF on failure too; do not call that "valid" */
		fprintf(stderr, "isutf8: %s: error %d: %s\n",
			filename, errno, strerror(errno));
		return 0;
	}
	if (remaining_bytes > 0)
		goto error; /* input ended in the middle of a sequence */
	return 1;

error:
	if (!quiet) {
		printf("%s: line %lu, col %lu: invalid UTF-8 code\n",
			filename, line, col);
	}
	return 0;
}
81 static void usage(const char *program_name) {
82 printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n", program_name);
83 printf("Check whether input files are valid UTF-8.\n");
84 printf("This is version %s.\n", VERSION);
/*
 * Entry point.
 *
 * Options: -h / --help prints usage and exits successfully;
 * -q / --quiet suppresses the per-file position diagnostics.
 *
 * With no file arguments, standard input is checked. Otherwise every
 * named file is checked, even after an earlier one has failed.
 *
 * Exit status is 0 if all inputs were valid UTF-8, and EXIT_FAILURE if
 * any input was invalid, could not be opened, or the options were bad.
 */
int main(int argc, char **argv) {
	int i, ok;
	FILE *file;

	int quiet;
	struct option options[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "quiet", no_argument, &quiet, 1 },
		/*
		 * getopt_long() finds the end of the table by an all-zero
		 * entry; without it an unknown long option makes it read
		 * past the array.
		 */
		{ NULL, 0, NULL, 0 },
	};
	int opt;

	quiet = 0;

	while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) {
		switch (opt) {
		case 0:
			/* --quiet: getopt_long() has already stored 1 in quiet */
			break;

		case 'h':
			usage(argv[0]);
			exit(EXIT_SUCCESS);

		case 'q':
			quiet = 1;
			break;

		case '?':
			/* getopt_long() has already printed a diagnostic */
			exit(EXIT_FAILURE);

		default:
			abort();
		}
	}

	if (optind == argc)
		ok = is_utf8_byte_stream(stdin, "stdin", quiet);
	else {
		ok = 1;
		for (i = optind; i < argc; ++i) {
			file = fopen(argv[i], "r");
			if (file == NULL) {
				fprintf(stderr, "isutf8: %s: error %d: %s\n",
					argv[i], errno, strerror(errno));
				ok = 0;
			} else {
				/* Check first, then combine, so every file is examined. */
				ok = is_utf8_byte_stream(file, argv[i], quiet) && ok;
				(void) fclose(file);
			}
		}
	}

	if (ok)
		exit(EXIT_SUCCESS);
	exit(EXIT_FAILURE);
}