1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998-2004, 2005, 2006 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License version 2 as
8 published by the Free Software Foundation.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
35 #ifdef _POSIX_MAPPED_FILES
36 # include <sys/mman.h>
39 #include <gconv_int.h>
40 #include "iconv_prog.h"
41 #include "iconvconfig.h"
43 /* Get libc version number. */
44 #include "../version.h"
46 #define PACKAGE _libc_intl_domainname
49 /* Name and version of program. */
50 static void print_version (FILE *stream
, struct argp_state
*state
);
51 void (*argp_program_version_hook
) (FILE *, struct argp_state
*) = print_version
;
53 #define OPT_VERBOSE 1000
56 /* Definitions of arguments for argp functions. */
57 static const struct argp_option options
[] =
59 { NULL
, 0, NULL
, 0, N_("Input/Output format specification:") },
60 { "from-code", 'f', "NAME", 0, N_("encoding of original text") },
61 { "to-code", 't', "NAME", 0, N_("encoding for output") },
62 { NULL
, 0, NULL
, 0, N_("Information:") },
63 { "list", 'l', NULL
, 0, N_("list all known coded character sets") },
64 { NULL
, 0, NULL
, 0, N_("Output control:") },
65 { NULL
, 'c', NULL
, 0, N_("omit invalid characters from output") },
66 { "output", 'o', "FILE", 0, N_("output file") },
67 { "silent", 's', NULL
, 0, N_("suppress warnings") },
68 { "verbose", OPT_VERBOSE
, NULL
, 0, N_("print progress information") },
69 { NULL
, 0, NULL
, 0, NULL
}
72 /* Short description of program. */
73 static const char doc
[] = N_("\
74 Convert encoding of given files from one encoding to another.");
76 /* Strings for arguments in help texts. */
77 static const char args_doc
[] = N_("[FILE...]");
79 /* Prototype for option handler. */
80 static error_t
parse_opt (int key
, char *arg
, struct argp_state
*state
);
82 /* Function to print some extra text in the help message. */
83 static char *more_help (int key
, const char *text
, void *input
);
85 /* Data structure to communicate with argp functions. */
86 static struct argp argp
=
88 options
, parse_opt
, args_doc
, doc
, NULL
, more_help
91 /* Code sets to convert from and to respectively. An empty string as the
92 default causes the 'iconv_open' function to look up the charset of the
93 currently selected locale and use it. */
94 static const char *from_code
= "";
95 static const char *to_code
= "";
97 /* File to write output to. If NULL write to stdout. */
98 static const char *output_file
;
100 /* Nonzero if verbose output is wanted. */
103 /* Nonzero if list of all coded character sets is wanted. */
106 /* If nonzero, omit invalid characters from output. */
109 /* Prototypes for the functions doing the actual work. */
110 static int process_block (iconv_t cd
, char *addr
, size_t len
, FILE *output
);
111 static int process_fd (iconv_t cd
, int fd
, FILE *output
);
112 static int process_file (iconv_t cd
, FILE *input
, FILE *output
);
113 static void print_known_names (void) internal_function
;
117 main (int argc
, char *argv
[])
119 int status
= EXIT_SUCCESS
;
123 const char *orig_to_code
;
124 struct charmap_t
*from_charmap
= NULL
;
125 struct charmap_t
*to_charmap
= NULL
;
127 /* Set locale via LC_ALL. */
128 setlocale (LC_ALL
, "");
130 /* Set the text message domain. */
131 textdomain (_libc_intl_domainname
);
133 /* Parse and process arguments. */
134 argp_parse (&argp
, argc
, argv
, 0, &remaining
, NULL
);
136 /* List all coded character sets if wanted. */
139 print_known_names ();
143 /* If we have to ignore errors make sure we use the appropriate name for
144 the to-character-set. */
145 orig_to_code
= to_code
;
148 const char *errhand
= strchrnul (to_code
, '/');
156 errhand
= strchrnul (errhand
, '/');
161 errhand
= strchr (errhand
, '\0');
165 newp
= (char *) alloca (errhand
- to_code
+ nslash
+ 7 + 1);
166 cp
= mempcpy (newp
, to_code
, errhand
- to_code
);
171 memcpy (cp
, "IGNORE", sizeof ("IGNORE"));
176 /* POSIX 1003.2b introduces a silly thing: the arguments to -t and -f
177 can be file names of charmaps. In this case iconv will have to read
178 those charmaps and use them to do the conversion. But there are
179 holes in the specification. There is nothing said that if -f is a
180 charmap filename that -t must be, too. And vice versa. There is
181 also no word about the symbolic names used. What if they don't
183 if (strchr (from_code
, '/') != NULL
)
184 /* The from-name might be a charmap file name. Try reading the
186 from_charmap
= charmap_read (from_code
, /*0, 1*/1, 0, 0);
188 if (strchr (orig_to_code
, '/') != NULL
)
189 /* The to-name might be a charmap file name. Try reading the
191 to_charmap
= charmap_read (orig_to_code
, /*0, 1,*/1,0, 0);
194 /* Determine output file. */
195 if (output_file
!= NULL
&& strcmp (output_file
, "-") != 0)
197 output
= fopen (output_file
, "w");
199 error (EXIT_FAILURE
, errno
, _("cannot open output file"));
204 /* At this point we have to handle two cases. The first one is
205 where a charmap is used for the from- or to-charset, or both. We
206 handle this special since it is very different from the sane way of
207 doing things. The other case allows converting using the iconv()
209 if (from_charmap
!= NULL
|| to_charmap
!= NULL
)
210 /* Construct the conversion table and do the conversion. */
211 status
= charmap_conversion (from_code
, from_charmap
, to_code
, to_charmap
,
212 argc
, remaining
, argv
, output
);
215 /* Let's see whether we have these coded character sets. */
216 cd
= iconv_open (to_code
, from_code
);
217 if (cd
== (iconv_t
) -1)
221 /* Try to be nice with the user and tell her which of the
222 two encoding names is wrong. This is possible because
223 all supported encodings can be converted from/to Unicode,
224 in other words, because the graph of encodings is
227 (iconv_open ("UTF-8", from_code
) == (iconv_t
) -1
230 (iconv_open (to_code
, "UTF-8") == (iconv_t
) -1
232 const char *from_pretty
=
233 (from_code
[0] ? from_code
: nl_langinfo (CODESET
));
234 const char *to_pretty
=
235 (orig_to_code
[0] ? orig_to_code
: nl_langinfo (CODESET
));
242 conversion from `%s' and to `%s' are not supported"),
243 from_pretty
, to_pretty
);
246 _("conversion from `%s' is not supported"),
253 _("conversion to `%s' is not supported"),
257 _("conversion from `%s' to `%s' is not supported"),
258 from_pretty
, to_pretty
);
261 argp_help (&argp
, stderr
, ARGP_HELP_SEE
,
262 program_invocation_short_name
);
266 error (EXIT_FAILURE
, errno
,
267 _("failed to start conversion processing"));
270 /* Now process the remaining files. Write them to stdout or the file
271 specified with the `-o' parameter. If we have no file given as
272 the parameter process all from stdin. */
273 if (remaining
== argc
)
275 if (process_file (cd
, stdin
, output
) != 0)
276 status
= EXIT_FAILURE
;
281 #ifdef _POSIX_MAPPED_FILES
288 fprintf (stderr
, "%s:\n", argv
[remaining
]);
289 if (strcmp (argv
[remaining
], "-") == 0)
293 fd
= open (argv
[remaining
], O_RDONLY
);
297 error (0, errno
, _("cannot open input file `%s'"),
299 status
= EXIT_FAILURE
;
304 #ifdef _POSIX_MAPPED_FILES
305 /* We have possibilities for reading the input file. First try
306 to mmap() it since this will provide the fastest solution. */
307 if (fstat (fd
, &st
) == 0
308 && ((addr
= mmap (NULL
, st
.st_size
, PROT_READ
, MAP_PRIVATE
,
309 fd
, 0)) != MAP_FAILED
))
311 /* Yes, we can use mmap(). The descriptor is not needed
314 error (EXIT_FAILURE
, errno
,
315 _("error while closing input `%s'"),
318 ret
= process_block (cd
, addr
, st
.st_size
, output
);
320 /* We don't need the input data anymore. */
321 munmap ((void *) addr
, st
.st_size
);
325 status
= EXIT_FAILURE
;
328 /* We cannot go on with producing output since it might
329 lead to problems because the last output might leave
330 the output stream in an undefined state. */
335 #endif /* _POSIX_MAPPED_FILES */
337 /* Read the file in pieces. */
338 ret
= process_fd (cd
, fd
, output
);
340 /* Now close the file. */
345 /* Something went wrong. */
346 status
= EXIT_FAILURE
;
349 /* We cannot go on with producing output since it might
350 lead to problems because the last output might leave
351 the output stream in an undefined state. */
356 while (++remaining
< argc
);
359 /* Close the output file now. */
361 error (EXIT_FAILURE
, errno
, _("error while closing output file"));
367 /* Handle program arguments. */
369 parse_opt (int key
, char *arg
, struct argp_state
*state
)
383 /* Nothing, for now at least. We are not giving out any information
384 about missing characters or so. */
387 /* Omit invalid characters from output. */
397 return ARGP_ERR_UNKNOWN
;
404 more_help (int key
, const char *text
, void *input
)
408 case ARGP_KEY_HELP_EXTRA
:
409 /* We print some extra information. */
410 return strdup (gettext ("\
411 For bug reporting instructions, please see:\n\
412 <http://www.gnu.org/software/libc/bugs.html>.\n"));
416 return (char *) text
;
420 /* Print the version information. */
422 print_version (FILE *stream
, struct argp_state
*state
)
424 fprintf (stream
, "iconv (GNU %s) %s\n", PACKAGE
, VERSION
);
425 fprintf (stream
, gettext ("\
426 Copyright (C) %s Free Software Foundation, Inc.\n\
427 This is free software; see the source for copying conditions. There is NO\n\
428 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
430 fprintf (stream
, gettext ("Written by %s.\n"), "Ulrich Drepper");
435 process_block (iconv_t cd
, char *addr
, size_t len
, FILE *output
)
437 #define OUTBUF_SIZE 32768
438 const char *start
= addr
;
439 char outbuf
[OUTBUF_SIZE
];
448 outlen
= OUTBUF_SIZE
;
449 n
= iconv (cd
, &addr
, &len
, &outptr
, &outlen
);
451 if (n
== (size_t) -1 && omit_invalid
&& errno
== EILSEQ
)
460 if (outptr
!= outbuf
)
462 /* We have something to write out. */
463 int errno_save
= errno
;
465 if (fwrite (outbuf
, 1, outptr
- outbuf
, output
)
466 < (size_t) (outptr
- outbuf
)
469 /* Error occurred while printing the result. */
471 conversion stopped due to problem in writing the output"));
478 if (n
!= (size_t) -1)
480 /* All the input text is processed. For state-dependent
481 character sets we have to flush the state now. */
483 outlen
= OUTBUF_SIZE
;
484 n
= iconv (cd
, NULL
, NULL
, &outptr
, &outlen
);
486 if (outptr
!= outbuf
)
488 /* We have something to write out. */
489 int errno_save
= errno
;
491 if (fwrite (outbuf
, 1, outptr
- outbuf
, output
)
492 < (size_t) (outptr
- outbuf
)
495 /* Error occurred while printing the result. */
497 conversion stopped due to problem in writing the output"));
504 if (n
!= (size_t) -1)
507 if (omit_invalid
&& errno
== EILSEQ
)
516 /* iconv() ran into a problem. */
521 error (0, 0, _("illegal input sequence at position %ld"),
522 (long int) (addr
- start
));
526 incomplete character or shift sequence at end of buffer"));
529 error (0, 0, _("internal error (illegal descriptor)"));
532 error (0, 0, _("unknown iconv() error %d"), errno
);
545 process_fd (iconv_t cd
, int fd
, FILE *output
)
547 /* We have a problem with reading from a descriptor since we must not
548 provide the iconv() function an incomplete character or shift
549 sequence at the end of the buffer. Since we have to deal with
550 arbitrary encodings we must read the whole text in a buffer and
551 process it in one step. */
552 static char *inbuf
= NULL
;
553 static size_t maxlen
= 0;
557 while (actlen
< maxlen
)
559 ssize_t n
= read (fd
, inptr
, maxlen
- actlen
);
562 /* No more text to read. */
567 /* Error while reading. */
568 error (0, errno
, _("error while reading the input"));
576 if (actlen
== maxlen
)
582 /* Increase the buffer. */
583 new_inbuf
= (char *) realloc (inbuf
, maxlen
+ 32768);
584 if (new_inbuf
== NULL
)
586 error (0, errno
, _("unable to allocate buffer for input"));
591 inptr
= inbuf
+ actlen
;
595 n
= read (fd
, inptr
, maxlen
- actlen
);
598 /* No more text to read. */
603 /* Error while reading. */
604 error (0, errno
, _("error while reading the input"));
611 while (actlen
< maxlen
);
614 /* Break again so we leave both loops. */
618 /* Now we have all the input in the buffer. Process it in one run. */
619 return process_block (cd
, inbuf
, actlen
, output
);
624 process_file (iconv_t cd
, FILE *input
, FILE *output
)
626 /* This should be safe since we use this function only for `stdin' and
627 we haven't read anything so far. */
628 return process_fd (cd
, fileno (input
), output
);
632 /* Print all known character sets/encodings. */
633 static void *printlist
;
634 static size_t column
;
635 static int not_first
;
638 insert_print_list (const void *nodep
, VISIT value
, int level
)
640 if (value
== leaf
|| value
== postorder
)
642 const struct gconv_alias
*s
= *(const struct gconv_alias
**) nodep
;
643 tsearch (s
->fromname
, &printlist
, (__compar_fn_t
) strverscmp
);
648 do_print_human (const void *nodep
, VISIT value
, int level
)
650 if (value
== leaf
|| value
== postorder
)
652 const char *s
= *(const char **) nodep
;
653 size_t len
= strlen (s
);
656 while (len
> 0 && s
[len
- 1] == '/')
659 for (cnt
= 0; cnt
< len
; ++cnt
)
660 if (isalnum (s
[cnt
]))
670 if (column
> 2 && column
+ len
> 77)
672 fputs ("\n ", stdout
);
684 fwrite (s
, len
, 1, stdout
);
690 do_print (const void *nodep
, VISIT value
, int level
)
692 if (value
== leaf
|| value
== postorder
)
694 const char *s
= *(const char **) nodep
;
702 add_known_names (struct gconv_module
*node
)
704 if (node
->left
!= NULL
)
705 add_known_names (node
->left
);
706 if (node
->right
!= NULL
)
707 add_known_names (node
->right
);
710 if (strcmp (node
->from_string
, "INTERNAL"))
711 tsearch (node
->from_string
, &printlist
,
712 (__compar_fn_t
) strverscmp
);
713 if (strcmp (node
->to_string
, "INTERNAL") != 0)
714 tsearch (node
->to_string
, &printlist
, (__compar_fn_t
) strverscmp
);
718 while (node
!= NULL
);
725 const struct gconvcache_header
*header
;
727 const struct hash_entry
*hashtab
;
730 header
= (const struct gconvcache_header
*) __gconv_get_cache ();
731 strtab
= (char *) header
+ header
->string_offset
;
732 hashtab
= (struct hash_entry
*) ((char *) header
+ header
->hash_offset
);
734 for (cnt
= 0; cnt
< header
->hash_size
; ++cnt
)
735 if (hashtab
[cnt
].string_offset
!= 0)
737 const char *str
= strtab
+ hashtab
[cnt
].string_offset
;
739 if (strcmp (str
, "INTERNAL") != 0)
740 tsearch (str
, &printlist
, (__compar_fn_t
) strverscmp
);
747 print_known_names (void)
752 /* We must initialize the internal databases first. */
753 h
= iconv_open ("L1", "L1");
756 /* See whether we have a cache. */
757 cache
= __gconv_get_cache ();
759 /* Yep, use only this information. */
763 struct gconv_module
*modules
;
765 /* No, then use the information read from the gconv-modules file.
766 First add the aliases. */
767 twalk (__gconv_get_alias_db (), insert_print_list
);
769 /* Add the from- and to-names from the known modules. */
770 modules
= __gconv_get_modules_db ();
772 add_known_names (modules
);
775 bool human_readable
= isatty (fileno (stdout
));
779 The following list contain all the coded character sets known. This does\n\
780 not necessarily mean that all combinations of these names can be used for\n\
781 the FROM and TO command line parameters. One coded character set can be\n\
782 listed with several different names (aliases).\n\n "), stdout
);
784 /* Now print the collected names. */
786 twalk (printlist
, human_readable
? do_print_human
: do_print
);
788 if (human_readable
&& column
!= 0)