isutf8.c

   1 /*
   2  * isutf8.c - do the input files look like valid utf-8 byte streams?
   3  *
   4  * Copyright (C) 2005  Lars Wirzenius
   5  *
   6  * This program is free software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License as published by
   8  * the Free Software Foundation; either version 2 of the License, or
   9  * (at your option) any later version.
  10  *
  11  * This program is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  * GNU General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU General Public License
  17  * along with this program; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
  19  */
  20
  21 #include <assert.h>
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24 #include <errno.h>
  25 #include <string.h>
  26 #include <getopt.h>
  27
  28
   29 #define VERSION "1.1" /* Program version string; printed by usage(). */
  30
  31
  32 /*
  33  * Code to indicate an invalid UTF8 character.
  34  */
  35 enum { INVALID_CHAR = 0xffffffff };
  36
  37
  38 /*
  39  * Produce shortest UTF8 encoding of a 31-bit value in 'u', returning it
  40  * in the array 'buf'. Return the number of bytes in the encoded value.
  41  * If the value is too large (more than 32 bits or would take more than
  42  * 'maxbytes' bytes), return -1.
  43  */
  44 static int encodeutf8(unsigned long u, unsigned char *buf, size_t maxbytes)
  45 {
  46         static const struct {
  47             int nbytes;
  48             unsigned long max;
  49         } tab[] = {
  50             { 1, 0x0000007F },
  51             { 2, 0x000007FF },
  52             { 3, 0x0000FFFF },
  53             { 4, 0x001FFFFF },
  54             { 5, 0x03FFFFFF },
  55             { 6, 0x7FFFFFFF },
  56         };
  57         static const int ntab = sizeof(tab) / sizeof(tab[0]);
  58         int i, j;
  59
  60         if (u > tab[ntab-1].max)
  61                 return -1;
  62
  63         for (i = 0; i < ntab; ++i) {
  64                 if (u <= tab[i].max)
  65                     break;
  66         }
  67         assert(i < ntab);
  68
  69         if (tab[i].nbytes > maxbytes)
  70                 return -1;
  71
  72         if (tab[i].nbytes == 1) { /* Special case */
  73                 buf[0] = u;
  74         } else {
  75                 for (j = tab[i].nbytes-1; j > 0; --j) {
  76                         buf[j] = 0x80 | (u & 0x3f);
  77                         u >>= 6;
  78                 }
  79
  80                 unsigned char mask = ~(0xFF >> tab[i].nbytes);
  81                 buf[0] = mask | u;
  82         }
  83
  84         return tab[i].nbytes;
  85 }
  86
  87
  88 /*
  89  * Return number of ones at the top of a byte.
  90  *
  91  * I'm pretty sure there is a fancy trick to do this without a loop,
  92  * but I'm too tired to figure it out now. --liw
  93  */
  94 static int high_ones(int c) {
  95         int n;
  96
  97         for (n = 0; (c & 0x80) == 0x80; c <<= 1)
  98                 ++n;
  99         return n;
 100 }
 101
 102
 103 /*
 104  * Decode a UTF8 character from an array of bytes. Return character code.
 105  * Upon error, return INVALID_CHAR.
 106  */
 107 static unsigned long decodeutf8(unsigned char *buf, int nbytes)
 108 {
 109         unsigned long u;
 110         int i, j;
 111
 112         if (nbytes <= 0)
 113                 return INVALID_CHAR;
 114
 115         if (nbytes == 1) {
 116                 if (buf[0] >= 0x80)
 117                         return INVALID_CHAR;
 118                 return buf[0];
 119         }
 120
 121         i = high_ones(buf[0]);
 122         if (i != nbytes)
 123                 return INVALID_CHAR;
 124         u = buf[0] & (0xff >> i);
 125         for (j = 1; j < nbytes; ++j) {
 126                 if ((buf[j] & 0xC0) != 0x80)
 127                             return INVALID_CHAR;
 128                 u = (u << 6) | (buf[j] & 0x3f);
 129         }
 130         return u;
 131 }
 132
 133
 134 /*
 135  * Determine if the contents of an open file form a valid UTF8 byte stream.
 136  * Do this by collecting bytes for a character into a buffer and then
 137  * decode the bytes and re-encode them and compare that they are identical
 138  * to the original bytes. If any step fails, return 0 for error. If EOF
 139  * is reached, return 1 for OK.
 140  */
 141 static int is_utf8_byte_stream(FILE *file, char *filename, int quiet) {
 142         enum { MAX_UTF8_BYTES = 6 };
 143         unsigned char buf[MAX_UTF8_BYTES];
 144         unsigned char buf2[MAX_UTF8_BYTES];
 145         int nbytes, nbytes2;
 146         int c;
 147         unsigned long code;
 148         unsigned long line, col, byteoff;
 149
 150         nbytes = 0;
 151         line = 1;
 152         col = 1;
 153         byteoff = 0;
 154
 155         for (;;) {
 156                 c = getc(file);
 157
 158                 if (c == EOF || c < 0x80 || (c & 0xC0) != 0x80) {
 159                         /* New char starts, deal with previous one. */
 160                         if (nbytes > 0) {
 161                                 code = decodeutf8(buf, nbytes);
 162                                 if (code == INVALID_CHAR)
 163                                         goto error;
 164                                 nbytes2 = encodeutf8(code, buf2,
 165                                                      MAX_UTF8_BYTES);
 166                                 if (nbytes != nbytes2 ||
 167                                     memcmp(buf, buf2, nbytes) != 0)
 168                                         goto error;
 169                                 ++col;
 170                         }
 171                         nbytes = 0;
 172                         /* If it's UTF8, start collecting again. */
 173                         if (c != EOF && c >= 0x80)
 174                                 buf[nbytes++] = c;
 175                 } else {
 176                         /* This is a continuation byte, append to buffer. */
 177                         if (nbytes == MAX_UTF8_BYTES)
 178                                 goto error;
 179                         buf[nbytes++] = c;
 180                 }
 181
 182                 if (c == EOF)
 183                         break;
 184                 else if (c == '\n') {
 185                         ++line;
 186                         byteoff = 0;
 187                         col = 1;
 188                 } else
 189                         ++byteoff;
 190         }
 191
 192         if (nbytes != 0)
 193                 goto error;
 194
 195         return 1;
 196
 197 error:
 198         if (!quiet) {
 199                 printf("%s: line %lu, char %lu, byte offset %lu: "
 200                        "invalid UTF-8 code\n", filename, line, col, byteoff);
 201         }
 202         return 0;
 203 }
 204
 205
 206 static void usage(const char *program_name) {
 207         printf("Usage: %s [-hq] [--help] [--quiet] [file ...]\n",
 208                program_name);
 209         printf("Check whether input files are valid UTF-8.\n");
 210         printf("This is version %s.\n", VERSION);
 211 }
 212
 213
 214 int main(int argc, char **argv) {
 215         int i, ok;
 216         FILE *file;
 217
 218         int quiet;
 219         struct option options[] = {
 220                 { "help", no_argument, NULL, 'h' },
 221                 { "quiet", no_argument, &quiet, 1 },
 222         };
 223         int opt;
 224
 225         quiet = 0;
 226
 227         while ((opt = getopt_long(argc, argv, "hq", options, NULL)) != -1) {
 228                 switch (opt) {
 229                 case 0:
 230                         break;
 231
 232                 case 'h':
 233                         usage(argv[0]);
 234                         exit(0);
 235                         break;
 236
 237                 case 'q':
 238                         quiet = 1;
 239                         break;
 240
 241                 case '?':
 242                         exit(EXIT_FAILURE);
 243
 244                 default:
 245                         abort();
 246                 }
 247         }
 248
 249         if (optind == argc)
 250                 ok = is_utf8_byte_stream(stdin, "stdin", quiet);
 251         else {
 252                 ok = 1;
 253                 for (i = optind; i < argc; ++i) {
 254                         file = fopen(argv[i], "r");
 255                         if (file == NULL) {
 256                                 fprintf(stderr, "isutf8: %s: error %d: %s\n",
 257                                                 argv[i], errno,
 258                                                 strerror(errno));
 259                                 ok = 0;
 260                         } else {
 261                                 if (! is_utf8_byte_stream(file, argv[i], quiet))
 262                                     ok = 0;
 263                                 (void) fclose(file);
 264                         }
 265                 }
 266         }
 267
 268         if (ok)
 269                 exit(0);
 270         exit(EXIT_FAILURE);
 271 }