2.5-18.1
[glibc.git] / iconv / iconv_prog.c
blob77829971ad9a11d09546a280629bb0e38e93b1b8
1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998-2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License version 2 as
8 published by the Free Software Foundation.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
19 #include <argp.h>
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <error.h>
24 #include <fcntl.h>
25 #include <iconv.h>
26 #include <langinfo.h>
27 #include <locale.h>
28 #include <search.h>
29 #include <stdbool.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include <libintl.h>
35 #ifdef _POSIX_MAPPED_FILES
36 # include <sys/mman.h>
37 #endif
38 #include <charmap.h>
39 #include <gconv_int.h>
40 #include "iconv_prog.h"
41 #include "iconvconfig.h"
43 /* Get libc version number. */
44 #include "../version.h"
46 #define PACKAGE _libc_intl_domainname
49 /* Name and version of program. */
50 static void print_version (FILE *stream, struct argp_state *state);
51 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
53 #define OPT_VERBOSE 1000
54 #define OPT_LIST 'l'
56 /* Definitions of arguments for argp functions. */
57 static const struct argp_option options[] =
59 { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
60 { "from-code", 'f', "NAME", 0, N_("encoding of original text") },
61 { "to-code", 't', "NAME", 0, N_("encoding for output") },
62 { NULL, 0, NULL, 0, N_("Information:") },
63 { "list", 'l', NULL, 0, N_("list all known coded character sets") },
64 { NULL, 0, NULL, 0, N_("Output control:") },
65 { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
66 { "output", 'o', "FILE", 0, N_("output file") },
67 { "silent", 's', NULL, 0, N_("suppress warnings") },
68 { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
69 { NULL, 0, NULL, 0, NULL }
72 /* Short description of program. */
73 static const char doc[] = N_("\
74 Convert encoding of given files from one encoding to another.");
76 /* Strings for arguments in help texts. */
77 static const char args_doc[] = N_("[FILE...]");
79 /* Prototype for option handler. */
80 static error_t parse_opt (int key, char *arg, struct argp_state *state);
82 /* Function to print some extra text in the help message. */
83 static char *more_help (int key, const char *text, void *input);
85 /* Data structure to communicate with argp functions. */
86 static struct argp argp =
88 options, parse_opt, args_doc, doc, NULL, more_help
91 /* Code sets to convert from and to respectively. An empty string as the
92 default causes the 'iconv_open' function to look up the charset of the
93 currently selected locale and use it. */
94 static const char *from_code = "";
95 static const char *to_code = "";
97 /* File to write output to. If NULL write to stdout. */
98 static const char *output_file;
100 /* Nonzero if verbose ouput is wanted. */
101 int verbose;
103 /* Nonzero if list of all coded character sets is wanted. */
104 static int list;
106 /* If nonzero omit invalid character from output. */
107 int omit_invalid;
109 /* Prototypes for the functions doing the actual work. */
110 static int process_block (iconv_t cd, char *addr, size_t len, FILE *output);
111 static int process_fd (iconv_t cd, int fd, FILE *output);
112 static int process_file (iconv_t cd, FILE *input, FILE *output);
113 static void print_known_names (void) internal_function;
117 main (int argc, char *argv[])
119 int status = EXIT_SUCCESS;
120 int remaining;
121 FILE *output;
122 iconv_t cd;
123 const char *orig_to_code;
124 struct charmap_t *from_charmap = NULL;
125 struct charmap_t *to_charmap = NULL;
127 /* Set locale via LC_ALL. */
128 setlocale (LC_ALL, "");
130 /* Set the text message domain. */
131 textdomain (_libc_intl_domainname);
133 /* Parse and process arguments. */
134 argp_parse (&argp, argc, argv, 0, &remaining, NULL);
136 /* List all coded character sets if wanted. */
137 if (list)
139 print_known_names ();
140 exit (EXIT_SUCCESS);
143 /* If we have to ignore errors make sure we use the appropriate name for
144 the to-character-set. */
145 orig_to_code = to_code;
146 if (omit_invalid)
148 const char *errhand = strchrnul (to_code, '/');
149 int nslash = 2;
150 char *newp;
151 char *cp;
153 if (*errhand == '/')
155 --nslash;
156 errhand = strchrnul (errhand, '/');
158 if (*errhand == '/')
160 --nslash;
161 errhand = strchr (errhand, '\0');
165 newp = (char *) alloca (errhand - to_code + nslash + 7 + 1);
166 cp = mempcpy (newp, to_code, errhand - to_code);
167 while (nslash-- > 0)
168 *cp++ = '/';
169 if (cp[-1] != '/')
170 *cp++ = ',';
171 memcpy (cp, "IGNORE", sizeof ("IGNORE"));
173 to_code = newp;
176 /* POSIX 1003.2b introduces a silly thing: the arguments to -t anf -f
177 can be file names of charmaps. In this case iconv will have to read
178 those charmaps and use them to do the conversion. But there are
179 holes in the specification. There is nothing said that if -f is a
180 charmap filename that -t must be, too. And vice versa. There is
181 also no word about the symbolic names used. What if they don't
182 match? */
183 if (strchr (from_code, '/') != NULL)
184 /* The from-name might be a charmap file name. Try reading the
185 file. */
186 from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
188 if (strchr (orig_to_code, '/') != NULL)
189 /* The to-name might be a charmap file name. Try reading the
190 file. */
191 to_charmap = charmap_read (orig_to_code, /*0, 1,*/1, 0, 0, 0);
194 /* Determine output file. */
195 if (output_file != NULL && strcmp (output_file, "-") != 0)
197 output = fopen (output_file, "w");
198 if (output == NULL)
199 error (EXIT_FAILURE, errno, _("cannot open output file"));
201 else
202 output = stdout;
204 /* At this point we have to handle two cases. The first one is
205 where a charmap is used for the from- or to-charset, or both. We
206 handle this special since it is very different from the sane way of
207 doing things. The other case allows converting using the iconv()
208 function. */
209 if (from_charmap != NULL || to_charmap != NULL)
210 /* Construct the conversion table and do the conversion. */
211 status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
212 argc, remaining, argv, output);
213 else
215 /* Let's see whether we have these coded character sets. */
216 cd = iconv_open (to_code, from_code);
217 if (cd == (iconv_t) -1)
219 if (errno == EINVAL)
221 /* Try to be nice with the user and tell her which of the
222 two encoding names is wrong. This is possible because
223 all supported encodings can be converted from/to Unicode,
224 in other words, because the graph of encodings is
225 connected. */
226 bool from_wrong =
227 (iconv_open ("UTF-8", from_code) == (iconv_t) -1
228 && errno == EINVAL);
229 bool to_wrong =
230 (iconv_open (to_code, "UTF-8") == (iconv_t) -1
231 && errno == EINVAL);
232 const char *from_pretty =
233 (from_code[0] ? from_code : nl_langinfo (CODESET));
234 const char *to_pretty =
235 (orig_to_code[0] ? orig_to_code : nl_langinfo (CODESET));
237 if (from_wrong)
239 if (to_wrong)
240 error (0, 0,
241 _("\
242 conversions from `%s' and to `%s' are not supported"),
243 from_pretty, to_pretty);
244 else
245 error (0, 0,
246 _("conversion from `%s' is not supported"),
247 from_pretty);
249 else
251 if (to_wrong)
252 error (0, 0,
253 _("conversion to `%s' is not supported"),
254 to_pretty);
255 else
256 error (0, 0,
257 _("conversion from `%s' to `%s' is not supported"),
258 from_pretty, to_pretty);
261 argp_help (&argp, stderr, ARGP_HELP_SEE,
262 program_invocation_short_name);
263 exit (1);
265 else
266 error (EXIT_FAILURE, errno,
267 _("failed to start conversion processing"));
270 /* Now process the remaining files. Write them to stdout or the file
271 specified with the `-o' parameter. If we have no file given as
272 the parameter process all from stdin. */
273 if (remaining == argc)
275 if (process_file (cd, stdin, output) != 0)
276 status = EXIT_FAILURE;
278 else
281 #ifdef _POSIX_MAPPED_FILES
282 struct stat st;
283 char *addr;
284 #endif
285 int fd, ret;
287 if (verbose)
288 fprintf (stderr, "%s:\n", argv[remaining]);
289 if (strcmp (argv[remaining], "-") == 0)
290 fd = 0;
291 else
293 fd = open (argv[remaining], O_RDONLY);
295 if (fd == -1)
297 error (0, errno, _("cannot open input file `%s'"),
298 argv[remaining]);
299 status = EXIT_FAILURE;
300 continue;
304 #ifdef _POSIX_MAPPED_FILES
305 /* We have possibilities for reading the input file. First try
306 to mmap() it since this will provide the fastest solution. */
307 if (fstat (fd, &st) == 0
308 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
309 fd, 0)) != MAP_FAILED))
311 /* Yes, we can use mmap(). The descriptor is not needed
312 anymore. */
313 if (close (fd) != 0)
314 error (EXIT_FAILURE, errno,
315 _("error while closing input `%s'"),
316 argv[remaining]);
318 ret = process_block (cd, addr, st.st_size, output);
320 /* We don't need the input data anymore. */
321 munmap ((void *) addr, st.st_size);
323 if (ret != 0)
325 status = EXIT_FAILURE;
327 if (ret < 0)
328 /* We cannot go on with producing output since it might
329 lead to problem because the last output might leave
330 the output stream in an undefined state. */
331 break;
334 else
335 #endif /* _POSIX_MAPPED_FILES */
337 /* Read the file in pieces. */
338 ret = process_fd (cd, fd, output);
340 /* Now close the file. */
341 close (fd);
343 if (ret != 0)
345 /* Something went wrong. */
346 status = EXIT_FAILURE;
348 if (ret < 0)
349 /* We cannot go on with producing output since it might
350 lead to problem because the last output might leave
351 the output stream in an undefined state. */
352 break;
356 while (++remaining < argc);
359 /* Close the output file now. */
360 if (fclose (output))
361 error (EXIT_FAILURE, errno, _("error while closing output file"));
363 return status;
367 /* Handle program arguments. */
368 static error_t
369 parse_opt (int key, char *arg, struct argp_state *state)
371 switch (key)
373 case 'f':
374 from_code = arg;
375 break;
376 case 't':
377 to_code = arg;
378 break;
379 case 'o':
380 output_file = arg;
381 break;
382 case 's':
383 /* Nothing, for now at least. We are not giving out any information
384 about missing character or so. */
385 break;
386 case 'c':
387 /* Omit invalid characters from output. */
388 omit_invalid = 1;
389 break;
390 case OPT_VERBOSE:
391 verbose = 1;
392 break;
393 case OPT_LIST:
394 list = 1;
395 break;
396 default:
397 return ARGP_ERR_UNKNOWN;
399 return 0;
/* Help filter for argp.  For ARGP_KEY_HELP_EXTRA a newly allocated
   copy of the (translated) bug reporting note is returned; for every
   other KEY the TEXT passed in is returned unchanged.  INPUT is not
   used.  */
static char *
more_help (int key, const char *text, void *input)
{
  if (key == ARGP_KEY_HELP_EXTRA)
    /* We print some extra information.  */
    return strdup (gettext ("\
For bug reporting instructions, please see:\n\
<http://www.gnu.org/software/libc/bugs.html>.\n"));

  return (char *) text;
}
420 /* Print the version information. */
421 static void
422 print_version (FILE *stream, struct argp_state *state)
424 fprintf (stream, "iconv (GNU %s) %s\n", PACKAGE, VERSION);
425 fprintf (stream, gettext ("\
426 Copyright (C) %s Free Software Foundation, Inc.\n\
427 This is free software; see the source for copying conditions. There is NO\n\
428 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
429 "), "2006");
430 fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
434 static int
435 process_block (iconv_t cd, char *addr, size_t len, FILE *output)
437 #define OUTBUF_SIZE 32768
438 const char *start = addr;
439 char outbuf[OUTBUF_SIZE];
440 char *outptr;
441 size_t outlen;
442 size_t n;
443 int ret = 0;
445 while (len > 0)
447 outptr = outbuf;
448 outlen = OUTBUF_SIZE;
449 n = iconv (cd, &addr, &len, &outptr, &outlen);
451 if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
453 ret = 1;
454 if (len == 0)
455 n = 0;
456 else
457 errno = E2BIG;
460 if (outptr != outbuf)
462 /* We have something to write out. */
463 int errno_save = errno;
465 if (fwrite (outbuf, 1, outptr - outbuf, output)
466 < (size_t) (outptr - outbuf)
467 || ferror (output))
469 /* Error occurred while printing the result. */
470 error (0, 0, _("\
471 conversion stopped due to problem in writing the output"));
472 return -1;
475 errno = errno_save;
478 if (n != (size_t) -1)
480 /* All the input test is processed. For state-dependent
481 character sets we have to flush the state now. */
482 outptr = outbuf;
483 outlen = OUTBUF_SIZE;
484 n = iconv (cd, NULL, NULL, &outptr, &outlen);
486 if (outptr != outbuf)
488 /* We have something to write out. */
489 int errno_save = errno;
491 if (fwrite (outbuf, 1, outptr - outbuf, output)
492 < (size_t) (outptr - outbuf)
493 || ferror (output))
495 /* Error occurred while printing the result. */
496 error (0, 0, _("\
497 conversion stopped due to problem in writing the output"));
498 return -1;
501 errno = errno_save;
504 if (n != (size_t) -1)
505 break;
507 if (omit_invalid && errno == EILSEQ)
509 ret = 1;
510 break;
514 if (errno != E2BIG)
516 /* iconv() ran into a problem. */
517 switch (errno)
519 case EILSEQ:
520 if (! omit_invalid)
521 error (0, 0, _("illegal input sequence at position %ld"),
522 (long int) (addr - start));
523 break;
524 case EINVAL:
525 error (0, 0, _("\
526 incomplete character or shift sequence at end of buffer"));
527 break;
528 case EBADF:
529 error (0, 0, _("internal error (illegal descriptor)"));
530 break;
531 default:
532 error (0, 0, _("unknown iconv() error %d"), errno);
533 break;
536 return -1;
540 return ret;
/* Read everything available from the descriptor FD and convert it with
   CD, writing the result to OUTPUT.  Returns what process_block
   returns, or -1 if reading the input or growing the buffer fails (a
   message has been printed then).  */
static int
process_fd (iconv_t cd, int fd, FILE *output)
{
  /* We have a problem with reading from a descriptor since we must not
     provide the iconv() function an incomplete character or shift
     sequence at the end of the buffer.  Since we have to deal with
     arbitrary encodings we must read the whole text in a buffer and
     process it in one step.  */
  enum { GROW_STEP = 32768 };

  /* The buffer is kept between calls so that later calls can reuse
     it.  */
  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  for (;;)
    {
      ssize_t n;

      if (actlen == maxlen)
        {
          /* The buffer is full (or does not exist yet).  Increase it.  */
          char *new_inbuf = realloc (inbuf, maxlen + GROW_STEP);

          if (new_inbuf == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }
          inbuf = new_inbuf;
          maxlen += GROW_STEP;
        }

      /* Always read behind the data already in the buffer.  The read
         position must be derived from INBUF on every call; a position
         that starts out as NULL makes every call after the first one
         hand read() a null pointer.  */
      n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
        /* No more text to read.  */
        break;

      if (n == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output);
}
/* Convert the contents of the stream INPUT with CD and write the
   result to OUTPUT.  The work is done on the descriptor underlying
   INPUT.  Returns what process_fd returns.  */
static int
process_file (iconv_t cd, FILE *input, FILE *output)
{
  /* Bypassing the stream should be safe since we use this function
     only for `stdin' and we haven't read anything so far.  */
  int fd = fileno (input);

  return process_fd (cd, fd, output);
}
/* State used while printing all known character sets/encodings.  */

/* Root of the tsearch tree in which the names are collected.  */
static void *printlist = NULL;
/* Output column counter maintained by do_print_human.  */
static size_t column = 0;
/* Set by do_print_human once the first name has been printed.  */
static int not_first = 0;
637 static void
638 insert_print_list (const void *nodep, VISIT value, int level)
640 if (value == leaf || value == postorder)
642 const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
643 tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
647 static void
648 do_print_human (const void *nodep, VISIT value, int level)
650 if (value == leaf || value == postorder)
652 const char *s = *(const char **) nodep;
653 size_t len = strlen (s);
654 size_t cnt;
656 while (len > 0 && s[len - 1] == '/')
657 --len;
659 for (cnt = 0; cnt < len; ++cnt)
660 if (isalnum (s[cnt]))
661 break;
662 if (cnt == len)
663 return;
665 if (not_first)
667 putchar (',');
668 ++column;
670 if (column > 2 && column + len > 77)
672 fputs ("\n ", stdout);
673 column = 2;
675 else
677 putchar (' ');
678 ++column;
681 else
682 not_first = 1;
684 fwrite (s, len, 1, stdout);
685 column += len;
/* twalk callback printing the visited name on a line of its own on
   stdout.  Acts only on the `leaf' and `postorder' visits.  LEVEL is
   not used.  */
static void
do_print (const void *nodep, VISIT value, int level)
{
  if (value != leaf && value != postorder)
    return;

  const char *name = *(const char **) nodep;

  puts (name);
}
700 static void
701 internal_function
702 add_known_names (struct gconv_module *node)
704 if (node->left != NULL)
705 add_known_names (node->left);
706 if (node->right != NULL)
707 add_known_names (node->right);
710 if (strcmp (node->from_string, "INTERNAL"))
711 tsearch (node->from_string, &printlist,
712 (__compar_fn_t) strverscmp);
713 if (strcmp (node->to_string, "INTERNAL") != 0)
714 tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
716 node = node->same;
718 while (node != NULL);
722 static void
723 insert_cache (void)
725 const struct gconvcache_header *header;
726 const char *strtab;
727 const struct hash_entry *hashtab;
728 size_t cnt;
730 header = (const struct gconvcache_header *) __gconv_get_cache ();
731 strtab = (char *) header + header->string_offset;
732 hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
734 for (cnt = 0; cnt < header->hash_size; ++cnt)
735 if (hashtab[cnt].string_offset != 0)
737 const char *str = strtab + hashtab[cnt].string_offset;
739 if (strcmp (str, "INTERNAL") != 0)
740 tsearch (str, &printlist, (__compar_fn_t) strverscmp);
745 static void
746 internal_function
747 print_known_names (void)
749 iconv_t h;
750 void *cache;
752 /* We must initialize the internal databases first. */
753 h = iconv_open ("L1", "L1");
754 iconv_close (h);
756 /* See whether we have a cache. */
757 cache = __gconv_get_cache ();
758 if (cache != NULL)
759 /* Yep, use only this information. */
760 insert_cache ();
761 else
763 struct gconv_module *modules;
765 /* No, then use the information read from the gconv-modules file.
766 First add the aliases. */
767 twalk (__gconv_get_alias_db (), insert_print_list);
769 /* Add the from- and to-names from the known modules. */
770 modules = __gconv_get_modules_db ();
771 if (modules != NULL)
772 add_known_names (modules);
775 bool human_readable = isatty (fileno (stdout));
777 if (human_readable)
778 fputs (_("\
779 The following list contain all the coded character sets known. This does\n\
780 not necessarily mean that all combinations of these names can be used for\n\
781 the FROM and TO command line parameters. One coded character set can be\n\
782 listed with several different names (aliases).\n\n "), stdout);
784 /* Now print the collected names. */
785 column = 2;
786 twalk (printlist, human_readable ? do_print_human : do_print);
788 if (human_readable && column != 0)
789 puts ("");