Remove support for the beos file format
[binutils-gdb.git] / binutils / strings.c
blobec02e1d5fcea2b7cbb8ba13e701153276613407f
/* strings -- print the strings of printable characters in files
   Copyright (C) 1993-2024 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA.  */
/* Usage: strings [options] file...

   Options:
   --all
   -a
   -            Scan each file in its entirety.

   --data
   -d           Scan only the initialized data section(s) of object files.

   --print-file-name
   -f           Print the name of the file before each string.

   --bytes=min-len
   -n min-len
   -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
                that are followed by a NUL or a non-displayable character.
                Default is 4.

   --radix={o,x,d}
   -t {o,x,d}   Print the offset within the file before each string,
                in octal/hex/decimal.

   --include-all-whitespace
   -w           By default tab and space are the only whitespace included in
                graphic char sequences.  This option considers all of
                isspace() valid.

   -o           Like -to.  (Some other implementations have -o like -to,
                others like -td.  We chose one arbitrarily.)

   --encoding={s,S,b,l,B,L}
   -e {s,S,b,l,B,L}
                Select character encoding: 7-bit-character, 8-bit-character,
                bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
                littleendian 32-bit.

   --target=BFDNAME
   -T {bfdname}
                Specify a non-default object file format.

   --unicode={default|locale|invalid|hex|escape|highlight}
   -U {d|l|i|x|e|h}
                Determine how to handle UTF-8 unicode characters.  The default
                is no special treatment.  All other versions of this option
                only apply if the encoding is valid and enabling the option
                implies --encoding=S.
                The 'locale' option displays the characters according to the
                current locale.  The 'invalid' option treats them as
                non-string characters.  The 'hex' option displays them as hex
                byte sequences.  The 'escape' option displays them as escape
                sequences and the 'highlight' option displays them as
                coloured escape sequences.

   --output-separator=sep_string
   -s sep_string  String used to separate parsed strings in output.
                Default is newline.

   --help
   -h           Print the usage message on the standard output.

   --version
   -V
   -v           Print the program version number.

   Written by Richard Stallman <rms@gnu.ai.mit.edu>
   and David MacKenzie <djm@gnu.ai.mit.edu>.  */
#include "sysdep.h"
#include "bfd.h"
#include "getopt.h"
#include "libiberty.h"
#include "safe-ctype.h"
#include "bucomm.h"
/* String equality helper: non-zero when A and B compare equal with strcmp.
   Only defined here if no earlier header already provided it.  */
#ifndef streq
#define streq(a,b) (strcmp ((a),(b)) == 0)
#endif
/* How UTF-8 encoded characters are to be treated, as selected by the
   -U/--unicode option.  unicode_default means no special treatment.  */
typedef enum unicode_display_type
{
  unicode_default = 0,
  unicode_locale,
  unicode_escape,
  unicode_hex,
  unicode_highlight,
  unicode_invalid
} unicode_display_type;

/* The treatment currently in force.  */
static unicode_display_type unicode_display = unicode_default;
/* Non-zero if C (an int or long holding one decoded character) counts as
   part of a string: it must lie in 0..255 and be a tab, a printable
   character, any byte above 127 when the encoding is 'S', or any
   whitespace character when include_all_whitespace is set.
   Note: C is evaluated several times, and the macro reads the file-scope
   variables `encoding' and `include_all_whitespace'.  */
#define STRING_ISGRAPHIC(c) \
      (   (c) >= 0 \
       && (c) <= 255 \
       && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \
           || (include_all_whitespace && ISSPACE (c))) \
      )
#ifndef errno
extern int errno;
#endif

/* The BFD section flags that identify an initialized data section.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Radix for printing addresses (must be 8, 10 or 16).  */
static int address_radix;

/* Minimum length of sequence of graphic chars to trigger output.  */
static unsigned int string_min;

/* Whether or not we include all whitespace as a graphic char.  */
static bool include_all_whitespace;

/* TRUE means print address within file for each string.  */
static bool print_addresses;

/* TRUE means print filename for each string.  */
static bool print_filenames;

/* TRUE means for object files scan only the data section.  */
static bool datasection_only;

/* The BFD object file format.  */
static char *target;

/* The character encoding format (one of s, S, b, l, B, L) and the
   number of bytes one character occupies in that encoding.  */
static char encoding;
static int encoding_bytes;

/* Output string used to separate parsed strings.  NULL means newline.  */
static char *output_separator;
151 static struct option long_options[] =
153 {"all", no_argument, NULL, 'a'},
154 {"bytes", required_argument, NULL, 'n'},
155 {"data", no_argument, NULL, 'd'},
156 {"encoding", required_argument, NULL, 'e'},
157 {"help", no_argument, NULL, 'h'},
158 {"include-all-whitespace", no_argument, NULL, 'w'},
159 {"output-separator", required_argument, NULL, 's'},
160 {"print-file-name", no_argument, NULL, 'f'},
161 {"radix", required_argument, NULL, 't'},
162 {"target", required_argument, NULL, 'T'},
163 {"unicode", required_argument, NULL, 'U'},
164 {"version", no_argument, NULL, 'v'},
165 {NULL, 0, NULL, 0}
168 static bool strings_file (char *);
169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
172 int main (int, char **);
174 static void
175 set_string_min (const char * arg)
177 char *s;
178 unsigned long l = strtoul (arg, &s, 0);
180 if (s != NULL && *s != 0)
181 fatal (_("invalid integer argument %s"), arg);
183 string_min = (unsigned int) l;
185 if (l != (unsigned long) string_min)
186 fatal (_("minimum string length is too big: %s"), arg);
188 if (string_min < 1)
189 fatal (_("minimum string length is too small: %s"), arg);
191 /* PR 30595: Look for minimum string lengths that overflow an 'int'. */
192 if (string_min + 1 == 0)
193 fatal (_("minimum string length %s is too big"), arg);
195 /* FIXME: Should we warn for unreasonably large minimum
196 string lengths, even if technically they will work ? */
200 main (int argc, char **argv)
202 int optc;
203 int exit_status = 0;
204 bool files_given = false;
205 int numeric_opt = 0;
207 setlocale (LC_ALL, "");
208 bindtextdomain (PACKAGE, LOCALEDIR);
209 textdomain (PACKAGE);
211 program_name = argv[0];
212 xmalloc_set_program_name (program_name);
213 bfd_set_error_program_name (program_name);
215 expandargv (&argc, &argv);
217 string_min = 4;
218 include_all_whitespace = false;
219 print_addresses = false;
220 print_filenames = false;
221 if (DEFAULT_STRINGS_ALL)
222 datasection_only = false;
223 else
224 datasection_only = true;
225 target = NULL;
226 encoding = 's';
227 output_separator = NULL;
229 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
230 long_options, (int *) 0)) != EOF)
232 switch (optc)
234 case 'a':
235 datasection_only = false;
236 break;
238 case 'd':
239 datasection_only = true;
240 break;
242 case 'f':
243 print_filenames = true;
244 break;
246 case 'H':
247 case 'h':
248 usage (stdout, 0);
250 case 'n':
251 set_string_min (optarg);
252 break;
254 case 'w':
255 include_all_whitespace = true;
256 break;
258 case 'o':
259 print_addresses = true;
260 address_radix = 8;
261 break;
263 case 't':
264 print_addresses = true;
265 if (optarg[1] != '\0')
266 usage (stderr, 1);
267 switch (optarg[0])
269 case 'o':
270 address_radix = 8;
271 break;
273 case 'd':
274 address_radix = 10;
275 break;
277 case 'x':
278 address_radix = 16;
279 break;
281 default:
282 usage (stderr, 1);
284 break;
286 case 'T':
287 target = optarg;
288 break;
290 case 'e':
291 if (optarg[1] != '\0')
292 usage (stderr, 1);
293 encoding = optarg[0];
294 break;
296 case 's':
297 output_separator = optarg;
298 break;
300 case 'U':
301 if (streq (optarg, "default") || streq (optarg, "d"))
302 unicode_display = unicode_default;
303 else if (streq (optarg, "locale") || streq (optarg, "l"))
304 unicode_display = unicode_locale;
305 else if (streq (optarg, "escape") || streq (optarg, "e"))
306 unicode_display = unicode_escape;
307 else if (streq (optarg, "invalid") || streq (optarg, "i"))
308 unicode_display = unicode_invalid;
309 else if (streq (optarg, "hex") || streq (optarg, "x"))
310 unicode_display = unicode_hex;
311 else if (streq (optarg, "highlight") || streq (optarg, "h"))
312 unicode_display = unicode_highlight;
313 else
314 fatal (_("invalid argument to -U/--unicode: %s"), optarg);
315 break;
317 case 'V':
318 case 'v':
319 print_version ("strings");
320 break;
322 case '?':
323 usage (stderr, 1);
325 default:
326 numeric_opt = optind;
327 break;
331 if (unicode_display != unicode_default)
332 encoding = 'S';
334 if (numeric_opt != 0)
335 set_string_min (argv[numeric_opt - 1] + 1);
337 switch (encoding)
339 case 'S':
340 case 's':
341 encoding_bytes = 1;
342 break;
343 case 'b':
344 case 'l':
345 encoding_bytes = 2;
346 break;
347 case 'B':
348 case 'L':
349 encoding_bytes = 4;
350 break;
351 default:
352 usage (stderr, 1);
355 if (bfd_init () != BFD_INIT_MAGIC)
356 fatal (_("fatal error: libbfd ABI mismatch"));
357 set_default_bfd_target ();
359 if (optind >= argc)
361 datasection_only = false;
362 SET_BINARY (fileno (stdin));
363 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
364 files_given = true;
366 else
368 for (; optind < argc; ++optind)
370 if (streq (argv[optind], "-"))
371 datasection_only = false;
372 else
374 files_given = true;
375 exit_status |= !strings_file (argv[optind]);
380 if (!files_given)
381 usage (stderr, 1);
383 return (exit_status);
386 /* Scan section SECT of the file ABFD, whose printable name is
387 FILENAME. If it contains initialized data set GOT_A_SECTION and
388 print the strings in it. */
390 static void
391 strings_a_section (bfd *abfd, asection *sect, const char *filename,
392 bool *got_a_section)
394 bfd_size_type sectsize;
395 bfd_byte *mem;
397 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
398 return;
400 sectsize = bfd_section_size (sect);
401 if (sectsize == 0)
402 return;
404 if (!bfd_malloc_and_get_section (abfd, sect, &mem))
406 non_fatal (_("%s: Reading section %s failed: %s"),
407 filename, sect->name, bfd_errmsg (bfd_get_error ()));
408 return;
411 *got_a_section = true;
412 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
413 free (mem);
416 /* Scan all of the sections in FILE, and print the strings
417 in the initialized data section(s).
419 Return TRUE if successful,
420 FALSE if not (such as if FILE is not an object file). */
422 static bool
423 strings_object_file (const char *file)
425 bfd *abfd;
426 asection *s;
427 bool got_a_section;
429 abfd = bfd_openr (file, target);
431 if (abfd == NULL)
432 /* Treat the file as a non-object file. */
433 return false;
435 /* This call is mainly for its side effect of reading in the sections.
436 We follow the traditional behavior of `strings' in that we don't
437 complain if we don't recognize a file to be an object file. */
438 if (!bfd_check_format (abfd, bfd_object))
440 bfd_close (abfd);
441 return false;
444 got_a_section = false;
445 for (s = abfd->sections; s != NULL; s = s->next)
446 strings_a_section (abfd, s, file, &got_a_section);
448 if (!bfd_close (abfd))
450 bfd_nonfatal (file);
451 return false;
454 return got_a_section;
457 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
459 static bool
460 strings_file (char *file)
462 struct stat st;
464 /* get_file_size does not support non-S_ISREG files. */
466 if (stat (file, &st) < 0)
468 if (errno == ENOENT)
469 non_fatal (_("'%s': No such file"), file);
470 else
471 non_fatal (_("Warning: could not locate '%s'. reason: %s"),
472 file, strerror (errno));
473 return false;
475 else if (S_ISDIR (st.st_mode))
477 non_fatal (_("Warning: '%s' is a directory"), file);
478 return false;
481 /* If we weren't told to scan the whole file,
482 try to open it as an object file and only look at
483 initialized data sections. If that fails, fall back to the
484 whole file. */
485 if (!datasection_only || !strings_object_file (file))
487 FILE *stream;
489 stream = fopen (file, FOPEN_RB);
490 if (stream == NULL)
492 fprintf (stderr, "%s: ", program_name);
493 perror (file);
494 return false;
497 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
499 if (fclose (stream) == EOF)
501 fprintf (stderr, "%s: ", program_name);
502 perror (file);
503 return false;
507 return true;
510 /* Read the next character, return EOF if none available.
511 Assume that STREAM is positioned so that the next byte read
512 is at address ADDRESS in the file.
514 If STREAM is NULL, do not read from it.
515 The caller can supply a buffer of characters
516 to be processed before the data in STREAM.
517 MAGIC is the address of the buffer and
518 MAGICCOUNT is how many characters are in it. */
520 static long
521 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
523 int c, i;
524 long r = 0;
526 for (i = 0; i < encoding_bytes; i++)
528 if (*magiccount)
530 (*magiccount)--;
531 c = *(*magic)++;
533 else
535 if (stream == NULL)
536 return EOF;
538 /* Only use getc_unlocked if we found a declaration for it.
539 Otherwise, libc is not thread safe by default, and we
540 should not use it. */
542 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
543 c = getc_unlocked (stream);
544 #else
545 c = getc (stream);
546 #endif
547 if (c == EOF)
548 return EOF;
551 (*address)++;
552 r = (r << 8) | (c & 0xff);
555 switch (encoding)
557 default:
558 break;
559 case 'l':
560 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
561 break;
562 case 'L':
563 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
564 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
565 break;
568 return r;
571 /* Throw away one byte of a (possibly) multi-byte char C, updating
572 address and buffer to suit. */
574 static void
575 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
577 static char tmp[4];
579 if (encoding_bytes > 1)
581 *address -= encoding_bytes - 1;
583 if (*magiccount == 0)
585 /* If no magic buffer exists, use temp buffer. */
586 switch (encoding)
588 default:
589 break;
590 case 'b':
591 tmp[0] = c & 0xff;
592 *magiccount = 1;
593 break;
594 case 'l':
595 tmp[0] = (c >> 8) & 0xff;
596 *magiccount = 1;
597 break;
598 case 'B':
599 tmp[0] = (c >> 16) & 0xff;
600 tmp[1] = (c >> 8) & 0xff;
601 tmp[2] = c & 0xff;
602 *magiccount = 3;
603 break;
604 case 'L':
605 tmp[0] = (c >> 8) & 0xff;
606 tmp[1] = (c >> 16) & 0xff;
607 tmp[2] = (c >> 24) & 0xff;
608 *magiccount = 3;
609 break;
611 *magic = tmp;
613 else
615 /* If magic buffer exists, rewind. */
616 *magic -= encoding_bytes - 1;
617 *magiccount += encoding_bytes - 1;
622 static void
623 print_filename_and_address (const char * filename, file_ptr address)
625 if (print_filenames)
626 printf ("%s: ", filename);
628 if (! print_addresses)
629 return;
631 switch (address_radix)
633 case 8:
634 if (sizeof (address) > sizeof (long))
636 #ifndef __MSVCRT__
637 printf ("%7llo ", (unsigned long long) address);
638 #else
639 printf ("%7I64o ", (unsigned long long) address);
640 #endif
642 else
643 printf ("%7lo ", (unsigned long) address);
644 break;
646 case 10:
647 if (sizeof (address) > sizeof (long))
649 #ifndef __MSVCRT__
650 printf ("%7llu ", (unsigned long long) address);
651 #else
652 printf ("%7I64d ", (unsigned long long) address);
653 #endif
655 else
656 printf ("%7ld ", (long) address);
657 break;
659 case 16:
660 if (sizeof (address) > sizeof (long))
662 #ifndef __MSVCRT__
663 printf ("%7llx ", (unsigned long long) address);
664 #else
665 printf ("%7I64x ", (unsigned long long) address);
666 #endif
668 else
669 printf ("%7lx ", (unsigned long) address);
670 break;
/* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding.
   If the encoding is valid then returns the number of bytes it uses.

   BUFLEN is the number of bytes available at BUFFER; nothing beyond
   it is examined.  Only multi-byte sequences (2, 3 or 4 bytes) are
   recognised, so a plain ASCII byte yields 0.  The check is
   structural: a lead byte followed by the right number of 10xxxxxx
   continuation bytes.  Overlong forms and surrogates are not
   rejected.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  if (buflen == 0)
    return 0;

  /* Below 0xc0 is ASCII or a stray continuation byte.  0xf8 and above
     do not match any 2, 3 or 4 byte lead pattern.  */
  if (buffer[0] < 0xc0 || buffer[0] >= 0xf8)
    return 0;

  if (buflen < 2)
    return 0;

  if ((buffer[1] & 0xc0) != 0x80)
    return 0;

  /* 110xxxxx: two byte sequence.  */
  if ((buffer[0] & 0x20) == 0)
    return 2;

  if (buflen < 3)
    return 0;

  if ((buffer[2] & 0xc0) != 0x80)
    return 0;

  /* 1110xxxx: three byte sequence.  */
  if ((buffer[0] & 0x10) == 0)
    return 3;

  if (buflen < 4)
    return 0;

  if ((buffer[3] & 0xc0) != 0x80)
    return 0;

  /* 11110xxx: four byte sequence.  */
  return 4;
}
710 /* Display a UTF-8 encoded character in BUFFER according to the setting
711 of unicode_display. The character is known to be valid.
712 Returns the number of bytes consumed. */
714 static unsigned int
715 display_utf8_char (const unsigned char * buffer)
717 unsigned int j;
718 unsigned int utf8_len;
720 switch (buffer[0] & 0x30)
722 case 0x00:
723 case 0x10:
724 utf8_len = 2;
725 break;
726 case 0x20:
727 utf8_len = 3;
728 break;
729 default:
730 utf8_len = 4;
733 switch (unicode_display)
735 default:
736 fprintf (stderr, "ICE: unexpected unicode display type\n");
737 break;
739 case unicode_escape:
740 case unicode_highlight:
741 if (unicode_display == unicode_highlight && isatty (1))
742 printf ("\x1B[31;47m"); /* Red. */
744 switch (utf8_len)
746 case 2:
747 printf ("\\u%02x%02x",
748 ((buffer[0] & 0x1c) >> 2),
749 ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
750 break;
752 case 3:
753 printf ("\\u%02x%02x",
754 ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
755 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
756 break;
758 case 4:
759 printf ("\\u%02x%02x%02x",
760 ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
761 ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
762 ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
763 break;
764 default:
765 /* URG. */
766 break;
769 if (unicode_display == unicode_highlight && isatty (1))
770 printf ("\033[0m"); /* Default colour. */
771 break;
773 case unicode_hex:
774 putchar ('<');
775 printf ("0x");
776 for (j = 0; j < utf8_len; j++)
777 printf ("%02x", buffer [j]);
778 putchar ('>');
779 break;
781 case unicode_locale:
782 printf ("%.1s", buffer);
783 break;
786 return utf8_len;
789 /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
790 according to the setting of the unicode_display variable. The buffer
791 contains BUFLEN bytes.
793 Display the characters as if they started at ADDRESS and are contained in
794 FILENAME. */
796 static void
797 print_unicode_buffer (const char * filename,
798 file_ptr address,
799 const unsigned char * buffer,
800 unsigned long buflen)
802 /* Paranoia checks... */
803 if (filename == NULL
804 || buffer == NULL
805 || unicode_display == unicode_default
806 || encoding != 'S'
807 || encoding_bytes != 1)
809 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
810 return;
813 if (buflen == 0)
814 return;
816 /* We must only display strings that are at least string_min *characters*
817 long. So we scan the buffer in two stages. First we locate the start
818 of a potential string. Then we walk along it until we have found
819 string_min characters. Then we go back to the start point and start
820 displaying characters according to the unicode_display setting. */
822 unsigned long start_point = 0;
823 unsigned long i = 0;
824 unsigned int char_len = 1;
825 unsigned int num_found = 0;
827 for (i = 0; i < buflen; i += char_len)
829 int c = buffer[i];
831 char_len = 1;
833 /* Find the first potential character of a string. */
834 if (! STRING_ISGRAPHIC (c))
836 num_found = 0;
837 continue;
840 if (c > 126)
842 if (c < 0xc0)
844 num_found = 0;
845 continue;
848 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
850 char_len = 1;
851 num_found = 0;
852 continue;
855 if (unicode_display == unicode_invalid)
857 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
858 num_found = 0;
859 continue;
863 if (num_found == 0)
864 /* We have found a potential starting point for a string. */
865 start_point = i;
867 ++ num_found;
869 if (num_found >= string_min)
870 break;
873 if (num_found < string_min)
874 return;
876 print_filename_and_address (filename, address + start_point);
878 /* We have found string_min characters. Display them and any
879 more that follow. */
880 for (i = start_point; i < buflen; i += char_len)
882 int c = buffer[i];
884 char_len = 1;
886 if (! STRING_ISGRAPHIC (c))
887 break;
888 else if (c < 127)
889 putchar (c);
890 else if (! is_valid_utf8 (buffer + i, buflen - i))
891 break;
892 else if (unicode_display == unicode_invalid)
893 break;
894 else
895 char_len = display_utf8_char (buffer + i);
898 if (output_separator)
899 fputs (output_separator, stdout);
900 else
901 putchar ('\n');
903 /* FIXME: Using tail recursion here is lazy programming... */
904 print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
/* Return the next byte for the unicode stream scanner.

   If any bytes are waiting in PUTBACK (*NUM_PUTBACK > 0) the most
   recently stored one is removed and returned, and STREAM is not
   touched.  Otherwise a byte is read from STREAM and *NUM_READ is
   incremented; the result of the read (possibly EOF) is returned.  */

static int
get_unicode_byte (FILE *          stream,
                  unsigned char * putback,
                  unsigned int *  num_putback,
                  unsigned int *  num_read)
{
  if (* num_putback > 0)
    {
      * num_putback = * num_putback - 1;
      return putback [* num_putback];
    }

  * num_read = * num_read + 1;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  return getc_unlocked (stream);
#else
  return getc (stream);
#endif
}
928 /* Helper function for print_unicode_stream. */
930 static void
931 print_unicode_stream_body (const char * filename,
932 file_ptr address,
933 FILE * stream,
934 unsigned char * putback_buf,
935 unsigned int num_putback,
936 unsigned char * print_buf)
938 /* It would be nice if we could just read the stream into a buffer
939 and then process if with print_unicode_buffer. But the input
940 might be huge or it might time-locked (eg stdin). So instead
941 we go one byte at a time... */
943 file_ptr start_point = 0;
944 unsigned int num_read = 0;
945 unsigned int num_chars = 0;
946 unsigned int num_print = 0;
947 int c = 0;
949 /* Find a series of string_min characters. Put them into print_buf. */
952 if (num_chars >= string_min)
953 break;
955 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
956 if (c == EOF)
957 break;
959 if (! STRING_ISGRAPHIC (c))
961 num_chars = num_print = 0;
962 continue;
965 if (num_chars == 0)
966 start_point = num_read - 1;
968 if (c < 127)
970 print_buf[num_print] = c;
971 num_chars ++;
972 num_print ++;
973 continue;
976 if (c < 0xc0)
978 num_chars = num_print = 0;
979 continue;
982 /* We *might* have a UTF-8 sequence. Time to start peeking. */
983 char utf8[4];
985 utf8[0] = c;
986 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
987 if (c == EOF)
988 break;
989 utf8[1] = c;
991 if ((utf8[1] & 0xc0) != 0x80)
993 /* Invalid UTF-8. */
994 putback_buf[num_putback++] = utf8[1];
995 num_chars = num_print = 0;
996 continue;
998 else if ((utf8[0] & 0x20) == 0)
1000 /* A valid 2-byte UTF-8 encoding. */
1001 if (unicode_display == unicode_invalid)
1003 putback_buf[num_putback++] = utf8[1];
1004 num_chars = num_print = 0;
1006 else
1008 print_buf[num_print ++] = utf8[0];
1009 print_buf[num_print ++] = utf8[1];
1010 num_chars ++;
1012 continue;
1015 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1016 if (c == EOF)
1017 break;
1018 utf8[2] = c;
1020 if ((utf8[2] & 0xc0) != 0x80)
1022 /* Invalid UTF-8. */
1023 putback_buf[num_putback++] = utf8[2];
1024 putback_buf[num_putback++] = utf8[1];
1025 num_chars = num_print = 0;
1026 continue;
1028 else if ((utf8[0] & 0x10) == 0)
1030 /* A valid 3-byte UTF-8 encoding. */
1031 if (unicode_display == unicode_invalid)
1033 putback_buf[num_putback++] = utf8[2];
1034 putback_buf[num_putback++] = utf8[1];
1035 num_chars = num_print = 0;
1037 else
1039 print_buf[num_print ++] = utf8[0];
1040 print_buf[num_print ++] = utf8[1];
1041 print_buf[num_print ++] = utf8[2];
1042 num_chars ++;
1044 continue;
1047 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1048 if (c == EOF)
1049 break;
1050 utf8[3] = c;
1052 if ((utf8[3] & 0xc0) != 0x80)
1054 /* Invalid UTF-8. */
1055 putback_buf[num_putback++] = utf8[3];
1056 putback_buf[num_putback++] = utf8[2];
1057 putback_buf[num_putback++] = utf8[1];
1058 num_chars = num_print = 0;
1060 /* We have a valid 4-byte UTF-8 encoding. */
1061 else if (unicode_display == unicode_invalid)
1063 putback_buf[num_putback++] = utf8[3];
1064 putback_buf[num_putback++] = utf8[1];
1065 putback_buf[num_putback++] = utf8[2];
1066 num_chars = num_print = 0;
1068 else
1070 print_buf[num_print ++] = utf8[0];
1071 print_buf[num_print ++] = utf8[1];
1072 print_buf[num_print ++] = utf8[2];
1073 print_buf[num_print ++] = utf8[3];
1074 num_chars ++;
1077 while (1);
1079 if (num_chars >= string_min)
1081 /* We know that we have string_min valid characters in print_buf,
1082 and there may be more to come in the stream. Start displaying
1083 them. */
1085 print_filename_and_address (filename, address + start_point);
1087 unsigned int i;
1088 for (i = 0; i < num_print;)
1090 if (print_buf[i] < 127)
1091 putchar (print_buf[i++]);
1092 else
1093 i += display_utf8_char (print_buf + i);
1096 /* OK so now we have to start read unchecked bytes. */
1098 /* Find a series of string_min characters. Put them into print_buf. */
1101 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1102 if (c == EOF)
1103 break;
1105 if (! STRING_ISGRAPHIC (c))
1106 break;
1108 if (c < 127)
1110 putchar (c);
1111 continue;
1114 if (c < 0xc0)
1115 break;
1117 /* We *might* have a UTF-8 sequence. Time to start peeking. */
1118 unsigned char utf8[4];
1120 utf8[0] = c;
1121 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1122 if (c == EOF)
1123 break;
1124 utf8[1] = c;
1126 if ((utf8[1] & 0xc0) != 0x80)
1128 /* Invalid UTF-8. */
1129 putback_buf[num_putback++] = utf8[1];
1130 break;
1132 else if ((utf8[0] & 0x20) == 0)
1134 /* Valid 2-byte UTF-8. */
1135 if (unicode_display == unicode_invalid)
1137 putback_buf[num_putback++] = utf8[1];
1138 break;
1140 else
1142 (void) display_utf8_char (utf8);
1143 continue;
1147 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1148 if (c == EOF)
1149 break;
1150 utf8[2] = c;
1152 if ((utf8[2] & 0xc0) != 0x80)
1154 /* Invalid UTF-8. */
1155 putback_buf[num_putback++] = utf8[2];
1156 putback_buf[num_putback++] = utf8[1];
1157 break;
1159 else if ((utf8[0] & 0x10) == 0)
1161 /* Valid 3-byte UTF-8. */
1162 if (unicode_display == unicode_invalid)
1164 putback_buf[num_putback++] = utf8[2];
1165 putback_buf[num_putback++] = utf8[1];
1166 break;
1168 else
1170 (void) display_utf8_char (utf8);
1171 continue;
1175 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1176 if (c == EOF)
1177 break;
1178 utf8[3] = c;
1180 if ((utf8[3] & 0xc0) != 0x80)
1182 /* Invalid UTF-8. */
1183 putback_buf[num_putback++] = utf8[3];
1184 putback_buf[num_putback++] = utf8[2];
1185 putback_buf[num_putback++] = utf8[1];
1186 break;
1188 else if (unicode_display == unicode_invalid)
1190 putback_buf[num_putback++] = utf8[3];
1191 putback_buf[num_putback++] = utf8[2];
1192 putback_buf[num_putback++] = utf8[1];
1193 break;
1195 else
1196 /* A valid 4-byte UTF-8 encoding. */
1197 (void) display_utf8_char (utf8);
1199 while (1);
1201 if (output_separator)
1202 fputs (output_separator, stdout);
1203 else
1204 putchar ('\n');
1207 if (c != EOF)
1208 /* FIXME: Using tail recursion here is lazy, but it works. */
1209 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1212 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1213 encountered according to the setting of the unicode_display variable.
1214 The stream is positioned at ADDRESS and is attached to FILENAME. */
1216 static void
1217 print_unicode_stream (const char * filename,
1218 file_ptr address,
1219 FILE * stream)
1221 /* Paranoia checks... */
1222 if (filename == NULL
1223 || stream == NULL
1224 || unicode_display == unicode_default
1225 || encoding != 'S'
1226 || encoding_bytes != 1)
1228 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1229 return;
1232 /* Allocate space for string_min 4-byte utf-8 characters. */
1233 size_t amt = string_min;
1234 amt = (4 * amt) + 1;
1235 unsigned char * print_buf = xmalloc (amt);
1236 /* We should never have to put back more than 4 bytes. */
1237 unsigned char putback_buf[5];
1238 unsigned int num_putback = 0;
1240 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1241 free (print_buf);
1244 /* Find the strings in file FILENAME, read from STREAM.
1245 Assume that STREAM is positioned so that the next byte read
1246 is at address ADDRESS in the file.
1248 If STREAM is NULL, do not read from it.
1249 The caller can supply a buffer of characters
1250 to be processed before the data in STREAM.
1251 MAGIC is the address of the buffer and
1252 MAGICCOUNT is how many characters are in it.
1253 Those characters come at address ADDRESS and the data in STREAM follow. */
1255 static void
1256 print_strings (const char *filename, FILE *stream, file_ptr address,
1257 int magiccount, char *magic)
1259 if (unicode_display != unicode_default)
1261 if (magic != NULL)
1262 print_unicode_buffer (filename, address,
1263 (const unsigned char *) magic, magiccount);
1265 if (stream != NULL)
1266 print_unicode_stream (filename, address, stream);
1267 return;
1270 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1272 while (1)
1274 file_ptr start;
1275 unsigned int i;
1276 long c;
1278 /* See if the next `string_min' chars are all graphic chars. */
1279 tryline:
1280 start = address;
1281 for (i = 0; i < string_min; i++)
1283 c = get_char (stream, &address, &magiccount, &magic);
1284 if (c == EOF)
1286 free (buf);
1287 return;
1290 if (! STRING_ISGRAPHIC (c))
1292 /* Found a non-graphic. Try again starting with next byte. */
1293 unget_part_char (c, &address, &magiccount, &magic);
1294 goto tryline;
1296 buf[i] = c;
1299 /* We found a run of `string_min' graphic characters. Print up
1300 to the next non-graphic character. */
1301 print_filename_and_address (filename, start);
1303 buf[i] = '\0';
1304 fputs (buf, stdout);
1306 while (1)
1308 c = get_char (stream, &address, &magiccount, &magic);
1309 if (c == EOF)
1310 break;
1311 if (! STRING_ISGRAPHIC (c))
1313 unget_part_char (c, &address, &magiccount, &magic);
1314 break;
1316 putchar (c);
1319 if (output_separator)
1320 fputs (output_separator, stdout);
1321 else
1322 putchar ('\n');
1324 free (buf);
1327 static void
1328 usage (FILE *stream, int status)
1330 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1331 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1332 fprintf (stream, _(" The options are:\n"));
1334 if (DEFAULT_STRINGS_ALL)
1335 fprintf (stream, _("\
1336 -a - --all Scan the entire file, not just the data section [default]\n\
1337 -d --data Only scan the data sections in the file\n"));
1338 else
1339 fprintf (stream, _("\
1340 -a - --all Scan the entire file, not just the data section\n\
1341 -d --data Only scan the data sections in the file [default]\n"));
1343 fprintf (stream, _("\
1344 -f --print-file-name Print the name of the file before each string\n\
1345 -n <number> Locate & print any sequence of at least <number>\n\
1346 --bytes=<number> displayable characters. (The default is 4).\n\
1347 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
1348 -w --include-all-whitespace Include all whitespace as valid string characters\n\
1349 -o An alias for --radix=o\n\
1350 -T --target=<BFDNAME> Specify the binary file format\n\
1351 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1352 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1353 --unicode={default|show|invalid|hex|escape|highlight}\n\
1354 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
1355 -s --output-separator=<string> String used to separate strings in output.\n\
1356 @<file> Read options from <file>\n\
1357 -h --help Display this information\n\
1358 -v -V --version Print the program's version number\n"));
1359 list_supported_targets (program_name, stream);
1360 if (REPORT_BUGS_TO[0] && status == 0)
1361 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1362 exit (status);