utils/iconv.c

   1
   2 /*  Copyright (C) 2002, 2003, 2004     Manuel Novoa III
   3  *
   4  *  This library is free software; you can redistribute it and/or
   5  *  modify it under the terms of the GNU Library General Public
   6  *  License as published by the Free Software Foundation; either
   7  *  version 2 of the License, or (at your option) any later version.
   8  *
   9  *  This library is distributed in the hope that it will be useful,
  10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  *  Library General Public License for more details.
  13  *
  14  *  You should have received a copy of the GNU Library General Public
  15  *  License along with this library; if not, see
  16  *  <http://www.gnu.org/licenses/>.
  17  */
  18
  19 /*  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!
  20  *
  21  *  Besides uClibc, I'm using this code in my libc for elks, which is
  22  *  a 16-bit environment with a fairly limited compiler.  It would make
  23  *  things much easier for me if this file isn't modified unnecessarily.
  24  *  In particular, please put any new or replacement functions somewhere
  25  *  else, and modify the makefile to use your version instead.
  26  *  Thanks.  Manuel
  27  *
  28  *  ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION!   ATTENTION! */
  29
  30
  31 /* May 23, 2002     Initial Notes:
  32  *
  33  * I'm still tweaking this stuff, but it passes the tests I've thrown
  34  * at it, and Erik needs it for the gcc port.  The glibc extension
  35  * __wcsnrtombs() hasn't been tested, as I didn't find a test for it
  36  * in the glibc source.  I also need to fix the behavior of
  37  * _wchar_utf8sntowcs() if the max number of wchars to convert is 0.
  38  *
  39  * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt
  40  * file on my platform (x86) show about 5-10% faster conversion speed than
  41  * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with
  42  * individual mbrtowc()/wcrtomb() calls.
  43  *
  44  * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled
  45  * as a fail-safe UTF-8 decoder appropriate for a terminal, etc.  which
  46  * needs to deal gracefully with whatever is sent to it.  In that mode,
  47  * it passes Markus Kuhn's UTF-8-test.txt stress test.  I plan to add
  48  * an arg to force that behavior, so the interface will be changing.
  49  *
  50  * I need to fix the error checking for 16-bit wide chars.  This isn't
  51  * an issue for uClibc, but may be for ELKS.  I'm currently not sure
   52  * if I'll use 16-bit, 32-bit, or configurable wchars in ELKS.
  53  *
  54  * July 1, 2002
  55  *
  56  * Fixed _wchar_utf8sntowcs() for the max number of wchars == 0 case.
  57  * Fixed nul-char bug in btowc(), and another in __mbsnrtowcs() for 8-bit
  58  *    locales.
  59  * Enabled building of a C/POSIX-locale-only version, so full locale support
  60  *    no longer needs to be enabled.
  61  *
  62  * Nov 4, 2002
  63  *
  64  * Fixed a bug in _wchar_wcsntoutf8s().  Don't store wcs position if dst is NULL.
  65  * Also, introduce an awful hack into _wchar_wcsntoutf8s() and wcsrtombs() in
  66  *   order to support %ls in printf.  See comments below for details.
  67  * Change behaviour of wc<->mb functions when in the C locale.  Now they do
  68  *   a 1-1 map for the range 0x80-UCHAR_MAX.  This is for backwards compatibility
   69  *   and consistency with the stds requirements that a printf format string be
   70  *   a valid multibyte string beginning and ending in its initial shift state.
  71  *
  72  * Nov 5, 2002
  73  *
  74  * Forgot to change btowc and wctob when I changed the wc<->mb functions yesterday.
  75  *
  76  * Nov 7, 2002
  77  *
  78  * Add wcwidth and wcswidth, based on Markus Kuhn's wcwidth of 2002-05-08.
  79  *   Added some size/speed optimizations and integrated it into my locale
  80  *   framework.  Minimally tested at the moment, but the stub C-locale
  81  *   version (which most people would probably be using) should be fine.
  82  *
  83  * Nov 21, 2002
  84  *
  85  * Revert the wc<->mb changes from earlier this month involving the C-locale.
  86  * Add a couple of ugly hacks to support *wprintf.
  87  * Add a mini iconv() and iconv implementation (requires locale support).
  88  *
  89  * Aug 1, 2003
  90  * Bug fix for mbrtowc.
  91  *
  92  * Aug 18, 2003
  93  * Bug fix: _wchar_utf8sntowcs and _wchar_wcsntoutf8s now set errno if EILSEQ.
  94  *
  95  * Feb 11, 2004
  96  * Bug fix: Fix size check for remaining output space in iconv().
  97  *
  98  * Manuel
  99  */
 100
 101 /* keep libgen before string.h - and porting.h to use the
 102  * XPG version of basename */
 103 #include <libgen.h>
 104 #include "porting.h"
 105 #include <string.h>
 106 #include <iconv.h>
 107 #include <stdarg.h>
 108 #include <wchar.h>
 109 #include "wchar.c" /* for _UC_iconv_t and __iconv_codesets */
 110
      /* Table of built-in codeset names; main() walks it for the -l option.
       * It has internal linkage when L_iconv_main is defined and is an
       * external reference otherwise. */
  111 #ifdef L_iconv_main
  112 static
  113 #else
  114 extern
  115 #endif
  116 const unsigned char __iconv_codesets[];
  117
      /* Sizes of the input and output conversion buffers used by main(). */
  118 #define IBUF BUFSIZ
  119 #define OBUF BUFSIZ
  120
      /* argv[0] as received by main(); printed as the prefix of error messages. */
  121 static char *progname;
      /* Nonzero (set by the -s option) makes error_msg() exit without printing. */
  122 static int hide_errors;
 123
 124 static void error_msg(const char *fmt, ...)
 125          __attribute__ ((noreturn, format (printf, 1, 2)));
 126
 127 static void error_msg(const char *fmt, ...)
 128 {
 129         va_list arg;
 130
 131         if (!hide_errors) {
 132                 fprintf(stderr, "%s: ", progname);
 133                 va_start(arg, fmt);
 134                 vfprintf(stderr, fmt, arg);
 135                 va_end(arg);
 136         }
 137
 138         exit(EXIT_FAILURE);
 139 }
 140
 141 int main(int argc, char **argv)
 142 {
 143         FILE *ifile;
 144         FILE *ofile = stdout;
 145         const char *p;
 146         const char *s;
 147         static const char opt_chars[] = "tfocsl";
 148                                       /* 012345 */
 149         const char *opts[sizeof(opt_chars)]; /* last is infile name */
 150         iconv_t ic;
 151         char ibuf[IBUF];
 152         char obuf[OBUF];
 153         char *pi;
 154         char *po;
 155         size_t ni, no, r, pos;
 156
 157         hide_errors = 0;
 158
 159         for (s = opt_chars ; *s ; s++) {
 160                 opts[ s - opt_chars ] = NULL;
 161         }
 162
 163         progname = *argv;
 164         while (--argc) {
 165                 p = *++argv;
 166                 if ((*p != '-') || (*++p == 0)) {
 167                         break;
 168                 }
 169                 do {
 170                         if ((s = strchr(opt_chars,*p)) == NULL) {
 171                         USAGE:
 172                                 s = basename(progname);
 173                                 fprintf(stderr,
 174                                                 "%s [-cs] -f fromcode -t tocode [-o outputfile] [inputfile ...]\n"
 175                                                 "  or\n%s -l\n", s, s);
 176                                 return EXIT_FAILURE;
 177                         }
 178                         if ((s - opt_chars) < 3) {
 179                                 if ((--argc == 0) || opts[s - opt_chars]) {
 180                                         goto USAGE;
 181                                 }
 182                                 opts[s - opt_chars] = *++argv;
 183                         } else {
 184                                 opts[s - opt_chars] = p;
 185                         }
 186                 } while (*++p);
 187         }
 188
 189         if (opts[5]) {                          /* -l */
 190                 fprintf(stderr, "Recognized codesets:\n");
 191                 for (s = (char *)__iconv_codesets ; *s ; s += *s) {
 192                         fprintf(stderr,"  %s\n", s+2);
 193                 }
 194                 s = __LOCALE_DATA_CODESET_LIST;
 195                 do {
 196                         fprintf(stderr,"  %s\n", __LOCALE_DATA_CODESET_LIST+ (unsigned char)(*s));
 197                 } while (*++s);
 198
 199                 return EXIT_SUCCESS;
 200         }
 201
 202         if (opts[4]) {
 203                 hide_errors = 1;
 204         }
 205
 206         if (!opts[0] || !opts[1]) {
 207                 goto USAGE;
 208         }
 209         if ((ic = iconv_open(opts[0],opts[1])) == ((iconv_t)(-1))) {
 210                 error_msg( "unsupported codeset in %s -> %s conversion\n", opts[1], opts[0]);
 211         }
 212         if (opts[3]) {                          /* -c */
 213                 ((_UC_iconv_t *) ic)->skip_invalid_input = 1;
 214         }
 215
 216         if ((s = opts[2]) != NULL) {
 217                 if (!(ofile = fopen(s, "w"))) {
 218                         error_msg( "couldn't open %s for writing\n", s);
 219                 }
 220         }
 221
 222         pos = ni = 0;
 223         do {
 224                 if (!argc || ((**argv == '-') && !((*argv)[1]))) {
 225                         ifile = stdin;          /* we don't check for duplicates */
 226                 } else if (!(ifile = fopen(*argv, "r"))) {
 227                         error_msg( "couldn't open %s for reading\n", *argv);
 228                 }
 229
 230                 while ((r = fread(ibuf + ni, 1, IBUF - ni, ifile)) > 0) {
 231                         pos += r;
 232                         ni += r;
 233                         no = OBUF;
 234                         pi = ibuf;
 235                         po = obuf;
 236                         if ((r = iconv(ic, &pi, &ni, &po, &no)) == ((size_t)(-1))) {
 237                                 if ((errno != EINVAL) && (errno != E2BIG)) {
 238                                         error_msg( "iconv failed at pos %lu : %m\n", (unsigned long) (pos - ni));
 239                                 }
 240                         }
 241                         if ((r = OBUF - no) > 0) {
 242                                 if (fwrite(obuf, 1, OBUF - no, ofile) < r) {
 243                                         error_msg( "write error\n");
 244                                 }
 245                         }
 246                         if (ni) {                       /* still bytes in buffer! */
 247                                 memmove(ibuf, pi, ni);
 248                         }
 249                 }
 250
 251                 if (ferror(ifile)) {
 252                         error_msg( "read error\n");
 253                 }
 254
 255                 ++argv;
 256
 257                 if (ifile != stdin) {
 258                         fclose(ifile);
 259                 }
 260
 261         } while (--argc > 0);
 262
 263         iconv_close(ic);
 264
 265         if (ni) {
 266                 error_msg( "incomplete sequence\n");
 267         }
 268
 269         return (((_UC_iconv_t *) ic)->skip_invalid_input < 2)
 270                 ? EXIT_SUCCESS : EXIT_FAILURE;
 271 }