Update install.texi, and regenerate INSTALL.
[glibc.git] / iconv / iconv_prog.c
blobc5dbb18519c2d4dab2667273a4e1497a4607fb3e
1 /* Convert text in given files from the specified from-set to the to-set.
2 Copyright (C) 1998-2021 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published
8 by the Free Software Foundation; version 2 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, see <https://www.gnu.org/licenses/>. */
19 #include <argp.h>
20 #include <assert.h>
21 #include <ctype.h>
22 #include <errno.h>
23 #include <error.h>
24 #include <fcntl.h>
25 #include <iconv.h>
26 #include <langinfo.h>
27 #include <locale.h>
28 #include <search.h>
29 #include <stdbool.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <unistd.h>
34 #include <libintl.h>
35 #ifdef _POSIX_MAPPED_FILES
36 # include <sys/mman.h>
37 #endif
38 #include <charmap.h>
39 #include <gconv_int.h>
40 #include "iconv_prog.h"
41 #include "iconvconfig.h"
42 #include "gconv_charset.h"
44 /* Get libc version number. */
45 #include "../version.h"
/* Alias for the libc message domain name, the same name main passes to
   textdomain.  */
47 #define PACKAGE _libc_intl_domainname
50 /* Name and version of program. */
51 static void print_version (FILE *stream, struct argp_state *state);
52 void (*argp_program_version_hook) (FILE *, struct argp_state *) = print_version;
/* Keys passed to parse_opt.  OPT_VERBOSE is listed in the table below with
   a long name only; OPT_LIST is the character 'l'.  */
54 #define OPT_VERBOSE 1000
55 #define OPT_LIST 'l'
57 /* Definitions of arguments for argp functions. Entries that have neither
a name nor a key only carry a heading text; the all-NULL entry ends the
table. */
58 static const struct argp_option options[] =
60 { NULL, 0, NULL, 0, N_("Input/Output format specification:") },
61 { "from-code", 'f', N_("NAME"), 0, N_("encoding of original text") },
62 { "to-code", 't', N_("NAME"), 0, N_("encoding for output") },
63 { NULL, 0, NULL, 0, N_("Information:") },
64 { "list", 'l', NULL, 0, N_("list all known coded character sets") },
65 { NULL, 0, NULL, 0, N_("Output control:") },
66 { NULL, 'c', NULL, 0, N_("omit invalid characters from output") },
67 { "output", 'o', N_("FILE"), 0, N_("output file") },
68 { "silent", 's', NULL, 0, N_("suppress warnings") },
69 { "verbose", OPT_VERBOSE, NULL, 0, N_("print progress information") },
70 { NULL, 0, NULL, 0, NULL }
73 /* Short description of program. */
74 static const char doc[] = N_("\
75 Convert encoding of given files from one encoding to another.");
77 /* Strings for arguments in help texts. */
78 static const char args_doc[] = N_("[FILE...]");
80 /* Prototype for option handler. */
81 static error_t parse_opt (int key, char *arg, struct argp_state *state);
83 /* Function to print some extra text in the help message. */
84 static char *more_help (int key, const char *text, void *input);
86 /* Data structure to communicate with argp functions. It ties together the
option table, the option handler, the help texts and the extra-help hook
declared above. */
87 static struct argp argp =
89 options, parse_opt, args_doc, doc, NULL, more_help
92 /* Code sets to convert from and to respectively. An empty string as the
93 default causes the 'iconv_open' function to look up the charset of the
94 currently selected locale and use it. Set by the -f and -t options. */
95 static const char *from_code = "";
96 static const char *to_code = "";
98 /* File to write output to. If NULL write to stdout. Set by -o. */
99 static const char *output_file;
101 /* Nonzero if list of all coded character sets is wanted. Set by -l. */
102 static int list;
104 /* If nonzero omit invalid character from output. Set by -c. */
105 int omit_invalid;
107 /* Prototypes for the functions doing the actual work. */
108 static int process_block (iconv_t cd, char *addr, size_t len, FILE **output,
109 const char *output_file);
110 static int process_fd (iconv_t cd, int fd, FILE **output,
111 const char *output_file);
112 static int process_file (iconv_t cd, FILE *input, FILE **output,
113 const char *output_file);
114 static void print_known_names (void);
/* Program entry point. Parses the command line, then either lists the
   known character sets, converts through charmap files, or converts
   through the gconv machinery. Every remaining argument is treated as an
   input file ("-" selects descriptor 0); with no file arguments stdin is
   converted. The result is `status', which starts as EXIT_SUCCESS and
   becomes EXIT_FAILURE whenever an input cannot be opened or a processing
   function reports a non-zero result. */
118 main (int argc, char *argv[])
120 int status = EXIT_SUCCESS;
121 int remaining;
122 __gconv_t cd;
123 struct charmap_t *from_charmap = NULL;
124 struct charmap_t *to_charmap = NULL;
126 /* Set locale via LC_ALL. */
127 setlocale (LC_ALL, "");
129 /* Set the text message domain. */
130 textdomain (_libc_intl_domainname);
132 /* Parse and process arguments. `remaining' receives the index of the
first non-option argument. */
133 argp_parse (&argp, argc, argv, 0, &remaining, NULL);
135 /* List all coded character sets if wanted. */
136 if (list)
138 print_known_names ();
139 exit (EXIT_SUCCESS);
142 /* POSIX 1003.2b introduces a silly thing: the arguments to -t and -f
143 can be file names of charmaps. In this case iconv will have to read
144 those charmaps and use them to do the conversion. But there are
145 holes in the specification. There is nothing said that if -f is a
146 charmap filename that -t must be, too. And vice versa. There is
147 also no word about the symbolic names used. What if they don't
148 match? */
149 if (strchr (from_code, '/') != NULL)
150 /* The from-name might be a charmap file name. Try reading the
151 file. */
152 from_charmap = charmap_read (from_code, /*0, 1*/1, 0, 0, 0);
154 if (strchr (to_code, '/') != NULL)
155 /* The to-name might be a charmap file name. Try reading the
156 file. */
157 to_charmap = charmap_read (to_code, /*0, 1,*/1, 0, 0, 0);
160 /* At this point we have to handle two cases. The first one is
161 where a charmap is used for the from- or to-charset, or both. We
162 handle this special since it is very different from the sane way of
163 doing things. The other case allows converting using the iconv()
164 function. */
165 if (from_charmap != NULL || to_charmap != NULL)
166 /* Construct the conversion table and do the conversion. */
167 status = charmap_conversion (from_code, from_charmap, to_code, to_charmap,
168 argc, remaining, argv, output_file);
169 else
171 struct gconv_spec conv_spec;
172 int res;
174 if (__gconv_create_spec (&conv_spec, from_code, to_code) == NULL)
176 error (EXIT_FAILURE, errno,
177 _("failed to start conversion processing"));
/* NOTE(review): error () is called with EXIT_FAILURE as its status just
   above; per the glibc documentation a non-zero status makes error ()
   terminate the program, so this exit looks unreachable -- confirm. */
178 exit (1);
/* The -c option is forwarded to the converter through the spec. */
181 if (omit_invalid)
182 conv_spec.ignore = true;
184 /* Let's see whether we have these coded character sets. */
185 res = __gconv_open (&conv_spec, &cd, 0);
/* The spec is released right after the open attempt, before its result
   is examined. */
187 __gconv_destroy_spec (&conv_spec);
189 if (res != __GCONV_OK)
191 if (errno == EINVAL)
193 /* Try to be nice with the user and tell her which of the
194 two encoding names is wrong. This is possible because
195 all supported encodings can be converted from/to Unicode,
196 in other words, because the graph of encodings is
197 connected. */
198 bool from_wrong =
199 (iconv_open ("UTF-8", from_code) == (iconv_t) -1
200 && errno == EINVAL);
201 bool to_wrong =
202 (iconv_open (to_code, "UTF-8") == (iconv_t) -1
203 && errno == EINVAL);
/* An empty code name stands for the locale's charset, so show that name
   in the diagnostic instead of an empty string. */
204 const char *from_pretty =
205 (from_code[0] ? from_code : nl_langinfo (CODESET));
206 const char *to_pretty =
207 (to_code[0] ? to_code : nl_langinfo (CODESET));
209 if (from_wrong)
211 if (to_wrong)
212 error (0, 0,
213 _("\
214 conversions from `%s' and to `%s' are not supported"),
215 from_pretty, to_pretty);
216 else
217 error (0, 0,
218 _("conversion from `%s' is not supported"),
219 from_pretty);
221 else
223 if (to_wrong)
224 error (0, 0,
225 _("conversion to `%s' is not supported"),
226 to_pretty);
227 else
228 error (0, 0,
229 _("conversion from `%s' to `%s' is not supported"),
230 from_pretty, to_pretty);
233 argp_help (&argp, stderr, ARGP_HELP_SEE,
234 program_invocation_short_name);
235 exit (1);
237 else
238 error (EXIT_FAILURE, errno,
239 _("failed to start conversion processing"));
242 /* The output file. Will be opened when we are ready to produce
243 output. */
244 FILE *output = NULL;
246 /* Now process the remaining files. Write them to stdout or the file
247 specified with the `-o' parameter. If we have no file given as
248 the parameter process all from stdin. */
249 if (remaining == argc)
251 if (process_file (cd, stdin, &output, output_file) != 0)
252 status = EXIT_FAILURE;
254 else
257 #ifdef _POSIX_MAPPED_FILES
258 struct stat64 st;
259 char *addr;
260 #endif
261 int fd, ret;
263 if (verbose)
264 fprintf (stderr, "%s:\n", argv[remaining]);
/* The argument "-" selects descriptor 0 instead of opening a file. */
265 if (strcmp (argv[remaining], "-") == 0)
266 fd = 0;
267 else
269 fd = open (argv[remaining], O_RDONLY);
271 if (fd == -1)
273 error (0, errno, _("cannot open input file `%s'"),
274 argv[remaining]);
275 status = EXIT_FAILURE;
276 continue;
280 #ifdef _POSIX_MAPPED_FILES
281 /* We have possibilities for reading the input file. First try
282 to mmap() it since this will provide the fastest solution. If
fstat64 or mmap fails, the read()-based path below is used instead. */
283 if (fstat64 (fd, &st) == 0
284 && ((addr = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE,
285 fd, 0)) != MAP_FAILED))
287 /* Yes, we can use mmap(). The descriptor is not needed
288 anymore. */
289 if (close (fd) != 0)
290 error (EXIT_FAILURE, errno,
291 _("error while closing input `%s'"),
292 argv[remaining]);
294 ret = process_block (cd, addr, st.st_size, &output,
295 output_file);
297 /* We don't need the input data anymore. */
298 munmap ((void *) addr, st.st_size);
300 if (ret != 0)
302 status = EXIT_FAILURE;
304 if (ret < 0)
305 /* We cannot go on with producing output since it might
306 lead to problem because the last output might leave
307 the output stream in an undefined state. */
308 break;
311 else
312 #endif /* _POSIX_MAPPED_FILES */
314 /* Read the file in pieces. */
315 ret = process_fd (cd, fd, &output, output_file);
317 /* Now close the file. */
318 close (fd);
320 if (ret != 0)
322 /* Something went wrong. */
323 status = EXIT_FAILURE;
325 if (ret < 0)
326 /* We cannot go on with producing output since it might
327 lead to problem because the last output might leave
328 the output stream in an undefined state. */
329 break;
333 while (++remaining < argc);
335 /* Close the output file now. A failing fclose is reported as a fatal
error. */
336 if (output != NULL && fclose (output))
337 error (EXIT_FAILURE, errno, _("error while closing output file"));
340 return status;
344 /* Handle program arguments. */
345 static error_t
346 parse_opt (int key, char *arg, struct argp_state *state)
348 switch (key)
350 case 'f':
351 from_code = arg;
352 break;
353 case 't':
354 to_code = arg;
355 break;
356 case 'o':
357 output_file = arg;
358 break;
359 case 's':
360 /* Nothing, for now at least. We are not giving out any information
361 about missing character or so. */
362 break;
363 case 'c':
364 /* Omit invalid characters from output. */
365 omit_invalid = 1;
366 break;
367 case OPT_VERBOSE:
368 verbose = 1;
369 break;
370 case OPT_LIST:
371 list = 1;
372 break;
373 default:
374 return ARGP_ERR_UNKNOWN;
376 return 0;
380 static char *
381 more_help (int key, const char *text, void *input)
383 char *tp = NULL;
384 switch (key)
386 case ARGP_KEY_HELP_EXTRA:
387 /* We print some extra information. */
388 if (asprintf (&tp, gettext ("\
389 For bug reporting instructions, please see:\n\
390 %s.\n"), REPORT_BUGS_TO) < 0)
391 return NULL;
392 return tp;
393 default:
394 break;
396 return (char *) text;
400 /* Print the version information. */
401 static void
402 print_version (FILE *stream, struct argp_state *state)
404 fprintf (stream, "iconv %s%s\n", PKGVERSION, VERSION);
405 fprintf (stream, gettext ("\
406 Copyright (C) %s Free Software Foundation, Inc.\n\
407 This is free software; see the source for copying conditions. There is NO\n\
408 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n\
409 "), "2021");
410 fprintf (stream, gettext ("Written by %s.\n"), "Ulrich Drepper");
/* Write the bytes in [OUTBUF, OUTPTR) to the output stream.

   *OUTPUT is opened on first use: OUTPUT_FILE is opened for writing
   unless it is NULL or "-", in which case stdout is used.  Failing to
   open the file is fatal.  Returns 0 on success, with errno restored to
   its value on entry, and -1 after reporting a write error.  */
static int
write_output (const char *outbuf, const char *outptr, FILE **output,
              const char *output_file)
{
  const int saved_errno = errno;
  const size_t nbytes = (size_t) (outptr - outbuf);

  if (*output == NULL)
    {
      /* Determine output file.  */
      bool use_stdout = output_file == NULL || strcmp (output_file, "-") == 0;

      if (use_stdout)
        *output = stdout;
      else
        {
          *output = fopen (output_file, "w");
          if (*output == NULL)
            error (EXIT_FAILURE, errno, _("cannot open output file"));
        }
    }

  if (fwrite (outbuf, 1, nbytes, *output) < nbytes || ferror (*output))
    {
      /* Error occurred while printing the result.  */
      error (0, 0, _("\
conversion stopped due to problem in writing the output"));
      return -1;
    }

  errno = saved_errno;
  return 0;
}
/* Convert the LEN bytes starting at ADDR with descriptor CD and hand the
   result to write_output in pieces of at most OUTBUF_SIZE bytes.
   Returns 0 on success, 1 if invalid input was skipped because
   omit_invalid is set, and -1 after a conversion error has been reported
   or when write_output failed. */
449 static int
450 process_block (iconv_t cd, char *addr, size_t len, FILE **output,
451 const char *output_file)
453 #define OUTBUF_SIZE 32768
454 const char *start = addr;
455 char outbuf[OUTBUF_SIZE];
456 char *outptr;
457 size_t outlen;
458 size_t n;
459 int ret = 0;
461 while (len > 0)
463 outptr = outbuf;
464 outlen = OUTBUF_SIZE;
465 n = iconv (cd, &addr, &len, &outptr, &outlen);
/* With omit_invalid set an EILSEQ failure is not fatal: remember it in
   the result and either treat the call as complete (no input left) or
   set errno to E2BIG so that the loop goes around again. */
467 if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
469 ret = 1;
470 if (len == 0)
471 n = 0;
472 else
473 errno = E2BIG;
/* Write whatever this call produced, even if it ended with an error. */
476 if (outptr != outbuf)
478 ret = write_output (outbuf, outptr, output, output_file);
479 if (ret != 0)
480 break;
483 if (n != (size_t) -1)
485 /* All the input text is processed. For state-dependent
486 character sets we have to flush the state now. */
487 outptr = outbuf;
488 outlen = OUTBUF_SIZE;
489 n = iconv (cd, NULL, NULL, &outptr, &outlen);
491 if (outptr != outbuf)
493 ret = write_output (outbuf, outptr, output, output_file);
494 if (ret != 0)
495 break;
498 if (n != (size_t) -1)
499 break;
501 if (omit_invalid && errno == EILSEQ)
503 ret = 1;
504 break;
/* E2BIG only means the output buffer was full (or was set above to keep
   going); every other errno value is reported and ends the conversion. */
508 if (errno != E2BIG)
510 /* iconv() ran into a problem. */
511 switch (errno)
513 case EILSEQ:
514 if (! omit_invalid)
515 error (0, 0, _("illegal input sequence at position %ld"),
516 (long int) (addr - start));
517 break;
518 case EINVAL:
519 error (0, 0, _("\
520 incomplete character or shift sequence at end of buffer"));
521 break;
522 case EBADF:
523 error (0, 0, _("internal error (illegal descriptor)"));
524 break;
525 default:
526 error (0, 0, _("unknown iconv() error %d"), errno);
527 break;
530 return -1;
534 return ret;
/* Read everything available from descriptor FD and convert it with CD.

   We have a problem with reading from a descriptor since we must not
   provide the iconv() function an incomplete character or shift
   sequence at the end of the buffer.  Since we have to deal with
   arbitrary encodings we must read the whole text in a buffer and
   process it in one step.

   The buffer is static and is kept, at its current size, across calls.
   Every read therefore targets INBUF + ACTLEN: the previous code started
   from a null pointer, so a second call (with the buffer already
   allocated) passed NULL to read().

   Returns the result of process_block, or -1 after reporting a read or
   allocation failure.  */
static int
process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
{
  /* Amount by which the input buffer grows each time it is full.  */
  enum { INPUT_CHUNK = 32768 };

  static char *inbuf = NULL;
  static size_t maxlen = 0;
  size_t actlen = 0;

  while (1)
    {
      if (actlen == maxlen)
        {
          /* Increase the buffer.  Keep the old pointer until realloc has
             succeeded so the buffer is not lost on failure.  */
          char *new_inbuf = realloc (inbuf, maxlen + INPUT_CHUNK);
          if (new_inbuf == NULL)
            {
              error (0, errno, _("unable to allocate buffer for input"));
              return -1;
            }
          inbuf = new_inbuf;
          maxlen += INPUT_CHUNK;
        }

      ssize_t n = read (fd, inbuf + actlen, maxlen - actlen);

      if (n == 0)
        /* No more text to read.  */
        break;

      if (n == -1)
        {
          /* Error while reading.  */
          error (0, errno, _("error while reading the input"));
          return -1;
        }

      actlen += n;
    }

  /* Now we have all the input in the buffer.  Process it in one run.  */
  return process_block (cd, inbuf, actlen, output, output_file);
}
/* Convert the contents of the stdio stream INPUT by handing its
   underlying descriptor to process_fd; the result of process_fd is
   returned unchanged.  */
static int
process_file (iconv_t cd, FILE *input, FILE **output, const char *output_file)
{
  /* Bypassing the stream should be safe since this function is only used
     for `stdin' and nothing has been read from it so far.  */
  const int fd = fileno (input);

  return process_fd (cd, fd, output, output_file);
}
626 /* Print all known character sets/encodings. */
/* Root of the tsearch tree in which the names to be listed are
   collected. */
627 static void *printlist;
/* Output column reached so far; maintained by do_print_human. */
628 static size_t column;
/* Set by do_print_human once the first name has been printed, so that
   later names are preceded by a separator. */
629 static int not_first;
631 static void
632 insert_print_list (const void *nodep, VISIT value, int level)
634 if (value == leaf || value == postorder)
636 const struct gconv_alias *s = *(const struct gconv_alias **) nodep;
637 tsearch (s->fromname, &printlist, (__compar_fn_t) strverscmp);
641 static void
642 do_print_human (const void *nodep, VISIT value, int level)
644 if (value == leaf || value == postorder)
646 const char *s = *(const char **) nodep;
647 size_t len = strlen (s);
648 size_t cnt;
650 while (len > 0 && s[len - 1] == '/')
651 --len;
653 for (cnt = 0; cnt < len; ++cnt)
654 if (isalnum (s[cnt]))
655 break;
656 if (cnt == len)
657 return;
659 if (not_first)
661 putchar (',');
662 ++column;
664 if (column > 2 && column + len > 77)
666 fputs ("\n ", stdout);
667 column = 2;
669 else
671 putchar (' ');
672 ++column;
675 else
676 not_first = 1;
678 fwrite (s, len, 1, stdout);
679 column += len;
/* twalk callback used when the list does not go to a terminal: prints
   the visited name on a line of its own.  Acts on leaves and on the
   postorder visit of inner nodes only.  LEVEL is not used.  */
static void
do_print (const void *nodep, VISIT value, int level)
{
  if (value != leaf && value != postorder)
    return;

  const char *name = *(const char **) nodep;
  puts (name);
}
694 static void
695 add_known_names (struct gconv_module *node)
697 if (node->left != NULL)
698 add_known_names (node->left);
699 if (node->right != NULL)
700 add_known_names (node->right);
703 if (strcmp (node->from_string, "INTERNAL") != 0)
704 tsearch (node->from_string, &printlist, (__compar_fn_t) strverscmp);
705 if (strcmp (node->to_string, "INTERNAL") != 0)
706 tsearch (node->to_string, &printlist, (__compar_fn_t) strverscmp);
708 node = node->same;
710 while (node != NULL);
714 static void
715 insert_cache (void)
717 const struct gconvcache_header *header;
718 const char *strtab;
719 const struct hash_entry *hashtab;
720 size_t cnt;
722 header = (const struct gconvcache_header *) __gconv_get_cache ();
723 strtab = (char *) header + header->string_offset;
724 hashtab = (struct hash_entry *) ((char *) header + header->hash_offset);
726 for (cnt = 0; cnt < header->hash_size; ++cnt)
727 if (hashtab[cnt].string_offset != 0)
729 const char *str = strtab + hashtab[cnt].string_offset;
731 if (strcmp (str, "INTERNAL") != 0)
732 tsearch (str, &printlist, (__compar_fn_t) strverscmp);
737 static void
738 print_known_names (void)
740 iconv_t h;
741 void *cache;
743 /* We must initialize the internal databases first. */
744 h = iconv_open ("L1", "L1");
745 iconv_close (h);
747 /* See whether we have a cache. */
748 cache = __gconv_get_cache ();
749 if (cache != NULL)
750 /* Yep, use only this information. */
751 insert_cache ();
752 else
754 struct gconv_module *modules;
756 /* No, then use the information read from the gconv-modules file.
757 First add the aliases. */
758 twalk (__gconv_get_alias_db (), insert_print_list);
760 /* Add the from- and to-names from the known modules. */
761 modules = __gconv_get_modules_db ();
762 if (modules != NULL)
763 add_known_names (modules);
766 bool human_readable = isatty (fileno (stdout));
768 if (human_readable)
769 fputs (_("\
770 The following list contains all the coded character sets known. This does\n\
771 not necessarily mean that all combinations of these names can be used for\n\
772 the FROM and TO command line parameters. One coded character set can be\n\
773 listed with several different names (aliases).\n\n "), stdout);
775 /* Now print the collected names. */
776 column = 2;
777 twalk (printlist, human_readable ? do_print_human : do_print);
779 if (human_readable && column != 0)
780 puts ("");