sort: pacify GCC 12 false positive
[coreutils.git] / src / tr.c
blob0bfe8024b8dd606996cddde7168a565d669f7c4a
1 /* tr -- a filter to translate characters
2 Copyright (C) 1991-2022 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Jim Meyering */
19 #include <config.h>
21 #include <stdio.h>
22 #include <assert.h>
23 #include <sys/types.h>
24 #include <getopt.h>
26 #include "system.h"
27 #include "die.h"
28 #include "error.h"
29 #include "fadvise.h"
30 #include "quote.h"
31 #include "safe-read.h"
32 #include "xbinary-io.h"
33 #include "xstrtol.h"
35 /* The official name of this program (e.g., no 'g' prefix). */
36 #define PROGRAM_NAME "tr"
38 #define AUTHORS proper_name ("Jim Meyering")
40 enum { N_CHARS = UCHAR_MAX + 1 };
42 /* An unsigned integer type big enough to hold a repeat count or an
43 unsigned character. POSIX requires support for repeat counts as
44 high as 2**31 - 1. Since repeat counts might need to expand to
45 match the length of an argument string, we need at least size_t to
46 avoid arbitrary internal limits. It doesn't cost much to use
47 uintmax_t, though. */
48 typedef uintmax_t count;
50 /* The value for Spec_list->state that indicates to
51 get_next that it should initialize the tail pointer.
52 Its value should be as large as possible to avoid conflict
53 a valid value for the state field -- and that may be as
54 large as any valid repeat_count. */
55 #define BEGIN_STATE (UINTMAX_MAX - 1)
57 /* The value for Spec_list->state that indicates to
58 get_next that the element pointed to by Spec_list->tail is
59 being considered for the first time on this pass through the
60 list -- it indicates that get_next should make any necessary
61 initializations. */
62 #define NEW_ELEMENT (BEGIN_STATE + 1)
64 /* The maximum possible repeat count. Due to how the states are
65 implemented, it can be as much as BEGIN_STATE. */
66 #define REPEAT_COUNT_MAXIMUM BEGIN_STATE
68 /* The following (but not CC_NO_CLASS) are indices into the array of
69 valid character class strings. */
70 enum Char_class
72 CC_ALNUM = 0, CC_ALPHA = 1, CC_BLANK = 2, CC_CNTRL = 3,
73 CC_DIGIT = 4, CC_GRAPH = 5, CC_LOWER = 6, CC_PRINT = 7,
74 CC_PUNCT = 8, CC_SPACE = 9, CC_UPPER = 10, CC_XDIGIT = 11,
75 CC_NO_CLASS = 9999
78 /* Character class to which a character (returned by get_next) belonged;
79 but it is set only if the construct from which the character was obtained
80 was one of the character classes [:upper:] or [:lower:]. The value
81 is used only when translating and then, only to make sure that upper
82 and lower class constructs have the same relative positions in string1
83 and string2. */
84 enum Upper_Lower_class
86 UL_LOWER,
87 UL_UPPER,
88 UL_NONE
91 /* The type of a List_element. See build_spec_list for more details. */
92 enum Range_element_type
94 RE_NORMAL_CHAR,
95 RE_RANGE,
96 RE_CHAR_CLASS,
97 RE_EQUIV_CLASS,
98 RE_REPEATED_CHAR
101 /* One construct in one of tr's argument strings.
102 For example, consider the POSIX version of the classic tr command:
103 tr -cs 'a-zA-Z_' '[\n*]'
104 String1 has 3 constructs, two of which are ranges (a-z and A-Z),
105 and a single normal character, '_'. String2 has one construct. */
106 struct List_element
108 enum Range_element_type type;
109 struct List_element *next;
110 union
112 unsigned char normal_char;
113 struct /* unnamed */
115 unsigned char first_char;
116 unsigned char last_char;
118 range;
119 enum Char_class char_class;
120 unsigned char equiv_code;
121 struct /* unnamed */
123 unsigned char the_repeated_char;
124 count repeat_count;
126 repeated_char;
131 /* Each of tr's argument strings is parsed into a form that is easier
132 to work with: a linked list of constructs (struct List_element).
133 Each Spec_list structure also encapsulates various attributes of
134 the corresponding argument string. The attributes are used mainly
135 to verify that the strings are valid in the context of any options
136 specified (like -s, -d, or -c). The main exception is the member
137 'tail', which is first used to construct the list. After construction,
138 it is used by get_next to save its state when traversing the list.
139 The member 'state' serves a similar function. */
140 struct Spec_list
142 /* Points to the head of the list of range elements.
143 The first struct is a dummy; its members are never used. */
144 struct List_element *head;
146 /* When appending, points to the last element. When traversing via
147 get_next(), points to the element to process next. Setting
148 Spec_list.state to the value BEGIN_STATE before calling get_next
149 signals get_next to initialize tail to point to head->next. */
150 struct List_element *tail;
152 /* Used to save state between calls to get_next. */
153 count state;
155 /* Length, in the sense that length ('a-z[:digit:]123abc')
156 is 42 ( = 26 + 10 + 6). */
157 count length;
159 /* The number of [c*] and [c*0] constructs that appear in this spec. */
160 size_t n_indefinite_repeats;
162 /* If n_indefinite_repeats is nonzero, this points to the List_element
163 corresponding to the last [c*] or [c*0] construct encountered in
164 this spec. Otherwise it is undefined. */
165 struct List_element *indefinite_repeat_element;
167 /* True if this spec contains at least one equivalence
168 class construct e.g. [=c=]. */
169 bool has_equiv_class;
171 /* True if this spec contains at least one character class
172 construct. E.g. [:digit:]. */
173 bool has_char_class;
175 /* True if this spec contains at least one of the character class
176 constructs (all but upper and lower) that aren't allowed in s2. */
177 bool has_restricted_char_class;
180 /* A representation for escaped string1 or string2. As a string is parsed,
181 any backslash-escaped characters (other than octal or \a, \b, \f, \n,
182 etc.) are marked as such in this structure by setting the corresponding
183 entry in the ESCAPED vector. */
184 struct E_string
186 char *s;
187 bool *escaped;
188 size_t len;
191 /* Return nonzero if the Ith character of escaped string ES matches C
192 and is not escaped itself. */
193 static inline bool
194 es_match (struct E_string const *es, size_t i, char c)
196 return es->s[i] == c && !es->escaped[i];
199 /* When true, each sequence in the input of a repeated character
200 (call it c) is replaced (in the output) by a single occurrence of c
201 for every c in the squeeze set. */
202 static bool squeeze_repeats = false;
204 /* When true, removes characters in the delete set from input. */
205 static bool delete = false;
207 /* Use the complement of set1 in place of set1. */
208 static bool complement = false;
210 /* When tr is performing translation and string1 is longer than string2,
211 POSIX says that the result is unspecified. That gives the implementor
212 of a POSIX conforming version of tr two reasonable choices for the
213 semantics of this case.
215 * The BSD tr pads string2 to the length of string1 by
216 repeating the last character in string2.
218 * System V tr ignores characters in string1 that have no
219 corresponding character in string2. That is, string1 is effectively
220 truncated to the length of string2.
222 When nonzero, this flag causes GNU tr to imitate the behavior
223 of System V tr when translating with string1 longer than string2.
224 The default is to emulate BSD tr. This flag is ignored in modes where
225 no translation is performed. Emulating the System V tr
226 in this exceptional case causes the relatively common BSD idiom:
228 tr -cs A-Za-z0-9 '\012'
230 to break (it would convert only zero bytes, rather than all
231 non-alphanumerics, to newlines).
233 WARNING: This switch does not provide general BSD or System V
234 compatibility. For example, it doesn't disable the interpretation
235 of the POSIX constructs [:alpha:], [=c=], and [c*10], so if by
236 some unfortunate coincidence you use such constructs in scripts
237 expecting to use some other version of tr, the scripts will break. */
238 static bool truncate_set1 = false;
240 /* An alias for (!delete && non_option_args == 2).
241 It is set in main and used there and in validate(). */
242 static bool translating;
244 static char io_buf[BUFSIZ];
246 static char const *const char_class_name[] =
248 "alnum", "alpha", "blank", "cntrl", "digit", "graph",
249 "lower", "print", "punct", "space", "upper", "xdigit"
252 /* Array of boolean values. A character 'c' is a member of the
253 squeeze set if and only if in_squeeze_set[c] is true. The squeeze
254 set is defined by the last (possibly, the only) string argument
255 on the command line when the squeeze option is given. */
256 static bool in_squeeze_set[N_CHARS];
258 /* Array of boolean values. A character 'c' is a member of the
259 delete set if and only if in_delete_set[c] is true. The delete
260 set is defined by the first (or only) string argument on the
261 command line when the delete option is given. */
262 static bool in_delete_set[N_CHARS];
264 /* Array of character values defining the translation (if any) that
265 tr is to perform. Translation is performed only when there are
266 two specification strings and the delete switch is not given. */
267 static char xlate[N_CHARS];
269 static struct option const long_options[] =
271 {"complement", no_argument, NULL, 'c'},
272 {"delete", no_argument, NULL, 'd'},
273 {"squeeze-repeats", no_argument, NULL, 's'},
274 {"truncate-set1", no_argument, NULL, 't'},
275 {GETOPT_HELP_OPTION_DECL},
276 {GETOPT_VERSION_OPTION_DECL},
277 {NULL, 0, NULL, 0}
280 void
281 usage (int status)
283 if (status != EXIT_SUCCESS)
284 emit_try_help ();
285 else
287 printf (_("\
288 Usage: %s [OPTION]... STRING1 [STRING2]\n\
290 program_name);
291 fputs (_("\
292 Translate, squeeze, and/or delete characters from standard input,\n\
293 writing to standard output. STRING1 and STRING2 specify arrays of\n\
294 characters ARRAY1 and ARRAY2 that control the action.\n\
296 -c, -C, --complement use the complement of ARRAY1\n\
297 -d, --delete delete characters in ARRAY1, do not translate\n\
298 -s, --squeeze-repeats replace each sequence of a repeated character\n\
299 that is listed in the last specified ARRAY,\n\
300 with a single occurrence of that character\n\
301 -t, --truncate-set1 first truncate ARRAY1 to length of ARRAY2\n\
302 "), stdout);
303 fputs (HELP_OPTION_DESCRIPTION, stdout);
304 fputs (VERSION_OPTION_DESCRIPTION, stdout);
305 fputs (_("\
307 ARRAYs are specified as strings of characters. Most represent themselves.\n\
308 Interpreted sequences are:\n\
310 \\NNN character with octal value NNN (1 to 3 octal digits)\n\
311 \\\\ backslash\n\
312 \\a audible BEL\n\
313 \\b backspace\n\
314 \\f form feed\n\
315 \\n new line\n\
316 \\r return\n\
317 \\t horizontal tab\n\
318 "), stdout);
319 fputs (_("\
320 \\v vertical tab\n\
321 CHAR1-CHAR2 all characters from CHAR1 to CHAR2 in ascending order\n\
322 [CHAR*] in ARRAY2, copies of CHAR until length of ARRAY1\n\
323 [CHAR*REPEAT] REPEAT copies of CHAR, REPEAT octal if starting with 0\n\
324 [:alnum:] all letters and digits\n\
325 [:alpha:] all letters\n\
326 [:blank:] all horizontal whitespace\n\
327 [:cntrl:] all control characters\n\
328 [:digit:] all digits\n\
329 "), stdout);
330 fputs (_("\
331 [:graph:] all printable characters, not including space\n\
332 [:lower:] all lower case letters\n\
333 [:print:] all printable characters, including space\n\
334 [:punct:] all punctuation characters\n\
335 [:space:] all horizontal or vertical whitespace\n\
336 [:upper:] all upper case letters\n\
337 [:xdigit:] all hexadecimal digits\n\
338 [=CHAR=] all characters which are equivalent to CHAR\n\
339 "), stdout);
340 fputs (_("\
342 Translation occurs if -d is not given and both STRING1 and STRING2 appear.\n\
343 -t may be used only when translating. ARRAY2 is extended to length of\n\
344 ARRAY1 by repeating its last character as necessary. Excess characters\n\
345 of ARRAY2 are ignored. Character classes expand in unspecified order;\n\
346 while translating, [:lower:] and [:upper:] may be used in pairs to\n\
347 specify case conversion. Squeezing occurs after translation or deletion.\n\
348 "), stdout);
349 emit_ancillary_info (PROGRAM_NAME);
351 exit (status);
354 /* Return nonzero if the character C is a member of the
355 equivalence class containing the character EQUIV_CLASS. */
357 static inline bool
358 is_equiv_class_member (unsigned char equiv_class, unsigned char c)
360 return (equiv_class == c);
363 /* Return true if the character C is a member of the
364 character class CHAR_CLASS. */
366 ATTRIBUTE_PURE
367 static bool
368 is_char_class_member (enum Char_class char_class, unsigned char c)
370 int result;
372 switch (char_class)
374 case CC_ALNUM:
375 result = isalnum (c);
376 break;
377 case CC_ALPHA:
378 result = isalpha (c);
379 break;
380 case CC_BLANK:
381 result = isblank (c);
382 break;
383 case CC_CNTRL:
384 result = iscntrl (c);
385 break;
386 case CC_DIGIT:
387 result = isdigit (c);
388 break;
389 case CC_GRAPH:
390 result = isgraph (c);
391 break;
392 case CC_LOWER:
393 result = islower (c);
394 break;
395 case CC_PRINT:
396 result = isprint (c);
397 break;
398 case CC_PUNCT:
399 result = ispunct (c);
400 break;
401 case CC_SPACE:
402 result = isspace (c);
403 break;
404 case CC_UPPER:
405 result = isupper (c);
406 break;
407 case CC_XDIGIT:
408 result = isxdigit (c);
409 break;
410 default:
411 abort ();
414 return !! result;
417 static void
418 es_free (struct E_string *es)
420 free (es->s);
421 free (es->escaped);
424 /* Perform the first pass over each range-spec argument S, converting all
425 \c and \ddd escapes to their one-byte representations. If an invalid
426 quote sequence is found print an error message and return false;
427 Otherwise set *ES to the resulting string and return true.
428 The resulting array of characters may contain zero-bytes;
429 however, on input, S is assumed to be null-terminated, and hence
430 cannot contain actual (non-escaped) zero bytes. */
432 static bool
433 unquote (char const *s, struct E_string *es)
435 size_t len = strlen (s);
437 es->s = xmalloc (len);
438 es->escaped = xcalloc (len, sizeof es->escaped[0]);
440 unsigned int j = 0;
441 for (unsigned int i = 0; s[i]; i++)
443 unsigned char c;
444 int oct_digit;
446 switch (s[i])
448 case '\\':
449 es->escaped[j] = true;
450 switch (s[i + 1])
452 case '\\':
453 c = '\\';
454 break;
455 case 'a':
456 c = '\a';
457 break;
458 case 'b':
459 c = '\b';
460 break;
461 case 'f':
462 c = '\f';
463 break;
464 case 'n':
465 c = '\n';
466 break;
467 case 'r':
468 c = '\r';
469 break;
470 case 't':
471 c = '\t';
472 break;
473 case 'v':
474 c = '\v';
475 break;
476 case '0':
477 case '1':
478 case '2':
479 case '3':
480 case '4':
481 case '5':
482 case '6':
483 case '7':
484 c = s[i + 1] - '0';
485 oct_digit = s[i + 2] - '0';
486 if (0 <= oct_digit && oct_digit <= 7)
488 c = 8 * c + oct_digit;
489 ++i;
490 oct_digit = s[i + 2] - '0';
491 if (0 <= oct_digit && oct_digit <= 7)
493 if (8 * c + oct_digit < N_CHARS)
495 c = 8 * c + oct_digit;
496 ++i;
498 else
500 /* A 3-digit octal number larger than \377 won't
501 fit in 8 bits. So we stop when adding the
502 next digit would put us over the limit and
503 give a warning about the ambiguity. POSIX
504 isn't clear on this, and we interpret this
505 lack of clarity as meaning the resulting behavior
506 is undefined, which means we're allowed to issue
507 a warning. */
508 error (0, 0, _("warning: the ambiguous octal escape\
509 \\%c%c%c is being\n\tinterpreted as the 2-byte sequence \\0%c%c, %c"),
510 s[i], s[i + 1], s[i + 2],
511 s[i], s[i + 1], s[i + 2]);
515 break;
516 case '\0':
517 error (0, 0, _("warning: an unescaped backslash "
518 "at end of string is not portable"));
519 /* POSIX is not clear about this. */
520 es->escaped[j] = false;
521 i--;
522 c = '\\';
523 break;
524 default:
525 c = s[i + 1];
526 break;
528 ++i;
529 es->s[j++] = c;
530 break;
531 default:
532 es->s[j++] = s[i];
533 break;
536 es->len = j;
537 return true;
540 /* If CLASS_STR is a valid character class string, return its index
541 in the global char_class_name array. Otherwise, return CC_NO_CLASS. */
543 ATTRIBUTE_PURE
544 static enum Char_class
545 look_up_char_class (char const *class_str, size_t len)
547 enum Char_class i;
549 for (i = 0; i < ARRAY_CARDINALITY (char_class_name); i++)
550 if (STREQ_LEN (class_str, char_class_name[i], len)
551 && strlen (char_class_name[i]) == len)
552 return i;
553 return CC_NO_CLASS;
556 /* Return a newly allocated string with a printable version of C.
557 This function is used solely for formatting error messages. */
559 static char *
560 make_printable_char (unsigned char c)
562 char *buf = xmalloc (5);
564 if (isprint (c))
566 buf[0] = c;
567 buf[1] = '\0';
569 else
571 sprintf (buf, "\\%03o", c);
573 return buf;
576 /* Return a newly allocated copy of S which is suitable for printing.
577 LEN is the number of characters in S. Most non-printing
578 (isprint) characters are represented by a backslash followed by
579 3 octal digits. However, the characters represented by \c escapes
580 where c is one of [abfnrtv] are represented by their 2-character \c
581 sequences. This function is used solely for printing error messages. */
583 static char *
584 make_printable_str (char const *s, size_t len)
586 /* Worst case is that every character expands to a backslash
587 followed by a 3-character octal escape sequence. */
588 char *printable_buf = xnmalloc (len + 1, 4);
589 char *p = printable_buf;
591 for (size_t i = 0; i < len; i++)
593 char buf[5];
594 char const *tmp = NULL;
595 unsigned char c = s[i];
597 switch (c)
599 case '\\':
600 tmp = "\\";
601 break;
602 case '\a':
603 tmp = "\\a";
604 break;
605 case '\b':
606 tmp = "\\b";
607 break;
608 case '\f':
609 tmp = "\\f";
610 break;
611 case '\n':
612 tmp = "\\n";
613 break;
614 case '\r':
615 tmp = "\\r";
616 break;
617 case '\t':
618 tmp = "\\t";
619 break;
620 case '\v':
621 tmp = "\\v";
622 break;
623 default:
624 if (isprint (c))
626 buf[0] = c;
627 buf[1] = '\0';
629 else
630 sprintf (buf, "\\%03o", c);
631 tmp = buf;
632 break;
634 p = stpcpy (p, tmp);
636 return printable_buf;
639 /* Append a newly allocated structure representing a
640 character C to the specification list LIST. */
642 static void
643 append_normal_char (struct Spec_list *list, unsigned char c)
645 struct List_element *new = xmalloc (sizeof *new);
646 new->next = NULL;
647 new->type = RE_NORMAL_CHAR;
648 new->u.normal_char = c;
649 assert (list->tail);
650 list->tail->next = new;
651 list->tail = new;
654 /* Append a newly allocated structure representing the range
655 of characters from FIRST to LAST to the specification list LIST.
656 Return false if LAST precedes FIRST in the collating sequence,
657 true otherwise. This means that '[c-c]' is acceptable. */
659 static bool
660 append_range (struct Spec_list *list, unsigned char first, unsigned char last)
662 if (last < first)
664 char *tmp1 = make_printable_char (first);
665 char *tmp2 = make_printable_char (last);
667 error (0, 0,
668 _("range-endpoints of '%s-%s' are in reverse collating sequence order"),
669 tmp1, tmp2);
670 free (tmp1);
671 free (tmp2);
672 return false;
674 struct List_element *new = xmalloc (sizeof *new);
675 new->next = NULL;
676 new->type = RE_RANGE;
677 new->u.range.first_char = first;
678 new->u.range.last_char = last;
679 assert (list->tail);
680 list->tail->next = new;
681 list->tail = new;
682 return true;
685 /* If CHAR_CLASS_STR is a valid character class string, append a
686 newly allocated structure representing that character class to the end
687 of the specification list LIST and return true. If CHAR_CLASS_STR is not
688 a valid string return false. */
690 static bool
691 append_char_class (struct Spec_list *list,
692 char const *char_class_str, size_t len)
694 enum Char_class char_class = look_up_char_class (char_class_str, len);
695 if (char_class == CC_NO_CLASS)
696 return false;
697 struct List_element *new = xmalloc (sizeof *new);
698 new->next = NULL;
699 new->type = RE_CHAR_CLASS;
700 new->u.char_class = char_class;
701 assert (list->tail);
702 list->tail->next = new;
703 list->tail = new;
704 return true;
707 /* Append a newly allocated structure representing a [c*n]
708 repeated character construct to the specification list LIST.
709 THE_CHAR is the single character to be repeated, and REPEAT_COUNT
710 is a non-negative repeat count. */
712 static void
713 append_repeated_char (struct Spec_list *list, unsigned char the_char,
714 count repeat_count)
716 struct List_element *new = xmalloc (sizeof *new);
717 new->next = NULL;
718 new->type = RE_REPEATED_CHAR;
719 new->u.repeated_char.the_repeated_char = the_char;
720 new->u.repeated_char.repeat_count = repeat_count;
721 assert (list->tail);
722 list->tail->next = new;
723 list->tail = new;
726 /* Given a string, EQUIV_CLASS_STR, from a [=str=] context and
727 the length of that string, LEN, if LEN is exactly one, append
728 a newly allocated structure representing the specified
729 equivalence class to the specification list, LIST and return true.
730 If LEN is not 1, return false. */
732 static bool
733 append_equiv_class (struct Spec_list *list,
734 char const *equiv_class_str, size_t len)
736 if (len != 1)
737 return false;
739 struct List_element *new = xmalloc (sizeof *new);
740 new->next = NULL;
741 new->type = RE_EQUIV_CLASS;
742 new->u.equiv_code = *equiv_class_str;
743 assert (list->tail);
744 list->tail->next = new;
745 list->tail = new;
746 return true;
749 /* Search forward starting at START_IDX for the 2-char sequence
750 (PRE_BRACKET_CHAR,']') in the string P of length P_LEN. If such
751 a sequence is found, set *RESULT_IDX to the index of the first
752 character and return true. Otherwise return false. P may contain
753 zero bytes. */
755 static bool
756 find_closing_delim (const struct E_string *es, size_t start_idx,
757 char pre_bracket_char, size_t *result_idx)
759 for (size_t i = start_idx; i < es->len - 1; i++)
760 if (es->s[i] == pre_bracket_char && es->s[i + 1] == ']'
761 && !es->escaped[i] && !es->escaped[i + 1])
763 *result_idx = i;
764 return true;
766 return false;
769 /* Parse the bracketed repeat-char syntax. If the P_LEN characters
770 beginning with P[ START_IDX ] comprise a valid [c*n] construct,
771 then set *CHAR_TO_REPEAT, *REPEAT_COUNT, and *CLOSING_BRACKET_IDX
772 and return zero. If the second character following
773 the opening bracket is not '*' or if no closing bracket can be
774 found, return -1. If a closing bracket is found and the
775 second char is '*', but the string between the '*' and ']' isn't
776 empty, an octal number, or a decimal number, print an error message
777 and return -2. */
779 static int
780 find_bracketed_repeat (const struct E_string *es, size_t start_idx,
781 unsigned char *char_to_repeat, count *repeat_count,
782 size_t *closing_bracket_idx)
784 assert (start_idx + 1 < es->len);
785 if (!es_match (es, start_idx + 1, '*'))
786 return -1;
788 for (size_t i = start_idx + 2; i < es->len && !es->escaped[i]; i++)
790 if (es->s[i] == ']')
792 size_t digit_str_len = i - start_idx - 2;
794 *char_to_repeat = es->s[start_idx];
795 if (digit_str_len == 0)
797 /* We've matched [c*] -- no explicit repeat count. */
798 *repeat_count = 0;
800 else
802 /* Here, we have found [c*s] where s should be a string
803 of octal (if it starts with '0') or decimal digits. */
804 char const *digit_str = &es->s[start_idx + 2];
805 char *d_end;
806 if ((xstrtoumax (digit_str, &d_end, *digit_str == '0' ? 8 : 10,
807 repeat_count, NULL)
808 != LONGINT_OK)
809 || REPEAT_COUNT_MAXIMUM < *repeat_count
810 || digit_str + digit_str_len != d_end)
812 char *tmp = make_printable_str (digit_str, digit_str_len);
813 error (0, 0,
814 _("invalid repeat count %s in [c*n] construct"),
815 quote (tmp));
816 free (tmp);
817 return -2;
820 *closing_bracket_idx = i;
821 return 0;
824 return -1; /* No bracket found. */
827 /* Return true if the string at ES->s[IDX] matches the regular
828 expression '\*[0-9]*\]', false otherwise. The string does not
829 match if any of its characters are escaped. */
831 ATTRIBUTE_PURE
832 static bool
833 star_digits_closebracket (const struct E_string *es, size_t idx)
835 if (!es_match (es, idx, '*'))
836 return false;
838 for (size_t i = idx + 1; i < es->len; i++)
839 if (!ISDIGIT (to_uchar (es->s[i])) || es->escaped[i])
840 return es_match (es, i, ']');
841 return false;
844 /* Convert string UNESCAPED_STRING (which has been preprocessed to
845 convert backslash-escape sequences) of length LEN characters into
846 a linked list of the following 5 types of constructs:
847 - [:str:] Character class where 'str' is one of the 12 valid strings.
848 - [=c=] Equivalence class where 'c' is any single character.
849 - [c*n] Repeat the single character 'c' 'n' times. n may be omitted.
850 However, if 'n' is present, it must be a non-negative octal or
851 decimal integer.
852 - r-s Range of characters from 'r' to 's'. The second endpoint must
853 not precede the first in the current collating sequence.
854 - c Any other character is interpreted as itself. */
856 static bool
857 build_spec_list (const struct E_string *es, struct Spec_list *result)
859 char const *p = es->s;
861 /* The main for-loop below recognizes the 4 multi-character constructs.
862 A character that matches (in its context) none of the multi-character
863 constructs is classified as 'normal'. Since all multi-character
864 constructs have at least 3 characters, any strings of length 2 or
865 less are composed solely of normal characters. Hence, the index of
866 the outer for-loop runs only as far as LEN-2. */
867 size_t i;
868 for (i = 0; i + 2 < es->len; /* empty */)
870 if (es_match (es, i, '['))
872 bool matched_multi_char_construct;
873 size_t closing_bracket_idx;
874 unsigned char char_to_repeat;
875 count repeat_count;
876 int err;
878 matched_multi_char_construct = true;
879 if (es_match (es, i + 1, ':') || es_match (es, i + 1, '='))
881 size_t closing_delim_idx;
883 if (find_closing_delim (es, i + 2, p[i + 1], &closing_delim_idx))
885 size_t opnd_str_len = closing_delim_idx - 1 - (i + 2) + 1;
886 char const *opnd_str = p + i + 2;
888 if (opnd_str_len == 0)
890 if (p[i + 1] == ':')
891 error (0, 0, _("missing character class name '[::]'"));
892 else
893 error (0, 0,
894 _("missing equivalence class character '[==]'"));
895 return false;
898 if (p[i + 1] == ':')
900 /* FIXME: big comment. */
901 if (!append_char_class (result, opnd_str, opnd_str_len))
903 if (star_digits_closebracket (es, i + 2))
904 goto try_bracketed_repeat;
905 else
907 char *tmp = make_printable_str (opnd_str,
908 opnd_str_len);
909 error (0, 0, _("invalid character class %s"),
910 quote (tmp));
911 free (tmp);
912 return false;
916 else
918 /* FIXME: big comment. */
919 if (!append_equiv_class (result, opnd_str, opnd_str_len))
921 if (star_digits_closebracket (es, i + 2))
922 goto try_bracketed_repeat;
923 else
925 char *tmp = make_printable_str (opnd_str,
926 opnd_str_len);
927 error (0, 0,
928 _("%s: equivalence class operand must be a single character"),
929 tmp);
930 free (tmp);
931 return false;
936 i = closing_delim_idx + 2;
937 continue;
939 /* Else fall through. This could be [:*] or [=*]. */
942 try_bracketed_repeat:
944 /* Determine whether this is a bracketed repeat range
945 matching the RE \[.\*(dec_or_oct_number)?\]. */
946 err = find_bracketed_repeat (es, i + 1, &char_to_repeat,
947 &repeat_count,
948 &closing_bracket_idx);
949 if (err == 0)
951 append_repeated_char (result, char_to_repeat, repeat_count);
952 i = closing_bracket_idx + 1;
954 else if (err == -1)
956 matched_multi_char_construct = false;
958 else
960 /* Found a string that looked like [c*n] but the
961 numeric part was invalid. */
962 return false;
965 if (matched_multi_char_construct)
966 continue;
968 /* We reach this point if P does not match [:str:], [=c=],
969 [c*n], or [c*]. Now, see if P looks like a range '[-c'
970 (from '[' to 'c'). */
973 /* Look ahead one char for ranges like a-z. */
974 if (es_match (es, i + 1, '-'))
976 if (!append_range (result, p[i], p[i + 2]))
977 return false;
978 i += 3;
980 else
982 append_normal_char (result, p[i]);
983 ++i;
987 /* Now handle the (2 or fewer) remaining characters p[i]..p[es->len - 1]. */
988 for (; i < es->len; i++)
989 append_normal_char (result, p[i]);
991 return true;
994 /* Advance past the current construct.
995 S->tail must be non-NULL. */
996 static void
997 skip_construct (struct Spec_list *s)
999 s->tail = s->tail->next;
1000 s->state = NEW_ELEMENT;
1003 /* Given a Spec_list S (with its saved state implicit in the values
1004 of its members 'tail' and 'state'), return the next single character
1005 in the expansion of S's constructs. If the last character of S was
1006 returned on the previous call or if S was empty, this function
1007 returns -1. For example, successive calls to get_next where S
1008 represents the spec-string 'a-d[y*3]' will return the sequence
1009 of values a, b, c, d, y, y, y, -1. Finally, if the construct from
1010 which the returned character comes is [:upper:] or [:lower:], the
1011 parameter CLASS is given a value to indicate which it was. Otherwise
1012 CLASS is set to UL_NONE. This value is used only when constructing
1013 the translation table to verify that any occurrences of upper and
1014 lower class constructs in the spec-strings appear in the same relative
1015 positions. */
1017 static int
1018 get_next (struct Spec_list *s, enum Upper_Lower_class *class)
1020 struct List_element *p;
1021 int return_val;
1022 int i;
1024 if (class)
1025 *class = UL_NONE;
1027 if (s->state == BEGIN_STATE)
1029 s->tail = s->head->next;
1030 s->state = NEW_ELEMENT;
1033 p = s->tail;
1034 if (p == NULL)
1035 return -1;
1037 switch (p->type)
1039 case RE_NORMAL_CHAR:
1040 return_val = p->u.normal_char;
1041 s->state = NEW_ELEMENT;
1042 s->tail = p->next;
1043 break;
1045 case RE_RANGE:
1046 if (s->state == NEW_ELEMENT)
1047 s->state = p->u.range.first_char;
1048 else
1049 ++(s->state);
1050 return_val = s->state;
1051 if (s->state == p->u.range.last_char)
1053 s->tail = p->next;
1054 s->state = NEW_ELEMENT;
1056 break;
1058 case RE_CHAR_CLASS:
1059 if (class)
1061 switch (p->u.char_class)
1063 case CC_LOWER:
1064 *class = UL_LOWER;
1065 break;
1066 case CC_UPPER:
1067 *class = UL_UPPER;
1068 break;
1069 default:
1070 break;
1074 if (s->state == NEW_ELEMENT)
1076 for (i = 0; i < N_CHARS; i++)
1077 if (is_char_class_member (p->u.char_class, i))
1078 break;
1079 assert (i < N_CHARS);
1080 s->state = i;
1082 assert (is_char_class_member (p->u.char_class, s->state));
1083 return_val = s->state;
1084 for (i = s->state + 1; i < N_CHARS; i++)
1085 if (is_char_class_member (p->u.char_class, i))
1086 break;
1087 if (i < N_CHARS)
1088 s->state = i;
1089 else
1091 s->tail = p->next;
1092 s->state = NEW_ELEMENT;
1094 break;
1096 case RE_EQUIV_CLASS:
1097 /* FIXME: this assumes that each character is alone in its own
1098 equivalence class (which appears to be correct for my
1099 LC_COLLATE. But I don't know of any function that allows
1100 one to determine a character's equivalence class. */
1102 return_val = p->u.equiv_code;
1103 s->state = NEW_ELEMENT;
1104 s->tail = p->next;
1105 break;
1107 case RE_REPEATED_CHAR:
1108 /* Here, a repeat count of n == 0 means don't repeat at all. */
1109 if (p->u.repeated_char.repeat_count == 0)
1111 s->tail = p->next;
1112 s->state = NEW_ELEMENT;
1113 return_val = get_next (s, class);
1115 else
1117 if (s->state == NEW_ELEMENT)
1119 s->state = 0;
1121 ++(s->state);
1122 return_val = p->u.repeated_char.the_repeated_char;
1123 if (s->state == p->u.repeated_char.repeat_count)
1125 s->tail = p->next;
1126 s->state = NEW_ELEMENT;
1129 break;
1131 default:
1132 abort ();
1135 return return_val;
1138 /* This is a minor kludge. This function is called from
1139 get_spec_stats to determine the cardinality of a set derived
1140 from a complemented string. It's a kludge in that some of the
1141 same operations are (duplicated) performed in set_initialize. */
1143 static int
1144 card_of_complement (struct Spec_list *s)
1146 int c;
1147 int cardinality = N_CHARS;
1148 bool in_set[N_CHARS] = { 0, };
1150 s->state = BEGIN_STATE;
1151 while ((c = get_next (s, NULL)) != -1)
1153 cardinality -= (!in_set[c]);
1154 in_set[c] = true;
1156 return cardinality;
1159 /* Discard the lengths associated with a case conversion,
1160 as using the actual number of upper or lower case characters
1161 is problematic when they don't match in some locales.
1162 Also ensure the case conversion classes in string2 are
1163 aligned correctly with those in string1.
1164 Note POSIX says the behavior of 'tr "[:upper:]" "[:upper:]"'
1165 is undefined. Therefore we allow it (unlike Solaris)
1166 and treat it as a no-op. */
1168 static void
1169 validate_case_classes (struct Spec_list *s1, struct Spec_list *s2)
1171 size_t n_upper = 0;
1172 size_t n_lower = 0;
1173 int c1 = 0;
1174 int c2 = 0;
1175 count old_s1_len = s1->length;
1176 count old_s2_len = s2->length;
1177 struct List_element *s1_tail = s1->tail;
1178 struct List_element *s2_tail = s2->tail;
1179 bool s1_new_element = true;
1180 bool s2_new_element = true;
1182 if (complement || !s2->has_char_class)
1183 return;
1185 for (int i = 0; i < N_CHARS; i++)
1187 if (isupper (i))
1188 n_upper++;
1189 if (islower (i))
1190 n_lower++;
1193 s1->state = BEGIN_STATE;
1194 s2->state = BEGIN_STATE;
1196 while (c1 != -1 && c2 != -1)
1198 enum Upper_Lower_class class_s1, class_s2;
1200 c1 = get_next (s1, &class_s1);
1201 c2 = get_next (s2, &class_s2);
1203 /* If c2 transitions to a new case class, then
1204 c1 must also transition at the same time. */
1205 if (s2_new_element && class_s2 != UL_NONE
1206 && !(s1_new_element && class_s1 != UL_NONE))
1207 die (EXIT_FAILURE, 0,
1208 _("misaligned [:upper:] and/or [:lower:] construct"));
1210 /* If case converting, quickly skip over the elements. */
1211 if (class_s2 != UL_NONE)
1213 skip_construct (s1);
1214 skip_construct (s2);
1215 /* Discount insignificant/problematic lengths. */
1216 s1->length -= (class_s1 == UL_UPPER ? n_upper : n_lower) - 1;
1217 s2->length -= (class_s2 == UL_UPPER ? n_upper : n_lower) - 1;
1220 s1_new_element = s1->state == NEW_ELEMENT; /* Next element is new. */
1221 s2_new_element = s2->state == NEW_ELEMENT; /* Next element is new. */
1224 assert (old_s1_len >= s1->length && old_s2_len >= s2->length);
1226 s1->tail = s1_tail;
1227 s2->tail = s2_tail;
1230 /* Gather statistics about the spec-list S in preparation for the tests
1231 in validate that determine the consistency of the specs. This function
1232 is called at most twice; once for string1, and again for any string2.
1233 LEN_S1 < 0 indicates that this is the first call and that S represents
1234 string1. When LEN_S1 >= 0, it is the length of the expansion of the
1235 constructs in string1, and we can use its value to resolve any
1236 indefinite repeat construct in S (which represents string2). Hence,
1237 this function has the side-effect that it converts a valid [c*]
1238 construct in string2 to [c*n] where n is large enough (or 0) to give
1239 string2 the same length as string1. For example, with the command
1240 tr a-z 'A[\n*]Z' on the second call to get_spec_stats, LEN_S1 would
1241 be 26 and S (representing string2) would be converted to 'A[\n*24]Z'. */
1243 static void
1244 get_spec_stats (struct Spec_list *s)
1246 struct List_element *p;
1247 count length = 0;
1249 s->n_indefinite_repeats = 0;
1250 s->has_equiv_class = false;
1251 s->has_restricted_char_class = false;
1252 s->has_char_class = false;
1253 for (p = s->head->next; p; p = p->next)
1255 count len = 0;
1256 count new_length;
1258 switch (p->type)
1260 case RE_NORMAL_CHAR:
1261 len = 1;
1262 break;
1264 case RE_RANGE:
1265 assert (p->u.range.last_char >= p->u.range.first_char);
1266 len = p->u.range.last_char - p->u.range.first_char + 1;
1267 break;
1269 case RE_CHAR_CLASS:
1270 s->has_char_class = true;
1271 for (int i = 0; i < N_CHARS; i++)
1272 if (is_char_class_member (p->u.char_class, i))
1273 ++len;
1274 switch (p->u.char_class)
1276 case CC_UPPER:
1277 case CC_LOWER:
1278 break;
1279 default:
1280 s->has_restricted_char_class = true;
1281 break;
1283 break;
1285 case RE_EQUIV_CLASS:
1286 for (int i = 0; i < N_CHARS; i++)
1287 if (is_equiv_class_member (p->u.equiv_code, i))
1288 ++len;
1289 s->has_equiv_class = true;
1290 break;
1292 case RE_REPEATED_CHAR:
1293 if (p->u.repeated_char.repeat_count > 0)
1294 len = p->u.repeated_char.repeat_count;
1295 else
1297 s->indefinite_repeat_element = p;
1298 ++(s->n_indefinite_repeats);
1300 break;
1302 default:
1303 abort ();
1306 /* Check for arithmetic overflow in computing length. Also, reject
1307 any length greater than the maximum repeat count, in case the
1308 length is later used to compute the repeat count for an
1309 indefinite element. */
1310 new_length = length + len;
1311 if (! (length <= new_length && new_length <= REPEAT_COUNT_MAXIMUM))
1312 die (EXIT_FAILURE, 0, _("too many characters in set"));
1313 length = new_length;
1316 s->length = length;
1319 static void
1320 get_s1_spec_stats (struct Spec_list *s1)
1322 get_spec_stats (s1);
1323 if (complement)
1324 s1->length = card_of_complement (s1);
1327 static void
1328 get_s2_spec_stats (struct Spec_list *s2, count len_s1)
1330 get_spec_stats (s2);
1331 if (len_s1 >= s2->length && s2->n_indefinite_repeats == 1)
1333 s2->indefinite_repeat_element->u.repeated_char.repeat_count =
1334 len_s1 - s2->length;
1335 s2->length = len_s1;
1339 static void
1340 spec_init (struct Spec_list *spec_list)
1342 struct List_element *new = xmalloc (sizeof *new);
1343 spec_list->head = spec_list->tail = new;
1344 spec_list->head->next = NULL;
1347 /* This function makes two passes over the argument string S. The first
1348 one converts all \c and \ddd escapes to their one-byte representations.
1349 The second constructs a linked specification list, SPEC_LIST, of the
1350 characters and constructs that comprise the argument string. If either
1351 of these passes detects an error, this function returns false. */
1353 static bool
1354 parse_str (char const *s, struct Spec_list *spec_list)
1356 struct E_string es;
1357 bool ok = unquote (s, &es) && build_spec_list (&es, spec_list);
1358 es_free (&es);
1359 return ok;
1362 /* Given two specification lists, S1 and S2, and assuming that
1363 S1->length > S2->length, append a single [c*n] element to S2 where c
1364 is the last character in the expansion of S2 and n is the difference
1365 between the two lengths.
1366 Upon successful completion, S2->length is set to S1->length. The only
1367 way this function can fail to make S2 as long as S1 is when S2 has
1368 zero-length, since in that case, there is no last character to repeat.
1369 So S2->length is required to be at least 1. */
1371 static void
1372 string2_extend (const struct Spec_list *s1, struct Spec_list *s2)
1374 struct List_element *p;
1375 unsigned char char_to_repeat;
1377 assert (translating);
1378 assert (s1->length > s2->length);
1379 assert (s2->length > 0);
1381 p = s2->tail;
1382 switch (p->type)
1384 case RE_NORMAL_CHAR:
1385 char_to_repeat = p->u.normal_char;
1386 break;
1387 case RE_RANGE:
1388 char_to_repeat = p->u.range.last_char;
1389 break;
1390 case RE_CHAR_CLASS:
1391 /* Note BSD allows extending of classes in string2. For example:
1392 tr '[:upper:]0-9' '[:lower:]'
1393 That's not portable however, contradicts POSIX and is dependent
1394 on your collating sequence. */
1395 die (EXIT_FAILURE, 0,
1396 _("when translating with string1 longer than string2,\nthe\
1397 latter string must not end with a character class"));
1399 case RE_REPEATED_CHAR:
1400 char_to_repeat = p->u.repeated_char.the_repeated_char;
1401 break;
1403 case RE_EQUIV_CLASS:
1404 /* This shouldn't happen, because validate exits with an error
1405 if it finds an equiv class in string2 when translating. */
1406 abort ();
1408 default:
1409 abort ();
1412 append_repeated_char (s2, char_to_repeat, s1->length - s2->length);
1413 s2->length = s1->length;
1416 /* Return true if S is a non-empty list in which exactly one
1417 character (but potentially, many instances of it) appears.
1418 E.g., [X*] or xxxxxxxx. */
1420 static bool
1421 homogeneous_spec_list (struct Spec_list *s)
1423 int b, c;
1425 s->state = BEGIN_STATE;
1427 if ((b = get_next (s, NULL)) == -1)
1428 return false;
1430 while ((c = get_next (s, NULL)) != -1)
1431 if (c != b)
1432 return false;
1434 return true;
1437 /* Die with an error message if S1 and S2 describe strings that
1438 are not valid with the given command line switches.
1439 A side effect of this function is that if a valid [c*] or
1440 [c*0] construct appears in string2, it is converted to [c*n]
1441 with a value for n that makes s2->length == s1->length. By
1442 the same token, if the --truncate-set1 option is not
1443 given, S2 may be extended. */
1445 static void
1446 validate (struct Spec_list *s1, struct Spec_list *s2)
1448 get_s1_spec_stats (s1);
1449 if (s1->n_indefinite_repeats > 0)
1451 die (EXIT_FAILURE, 0,
1452 _("the [c*] repeat construct may not appear in string1"));
1455 if (s2)
1457 get_s2_spec_stats (s2, s1->length);
1459 if (s2->n_indefinite_repeats > 1)
1461 die (EXIT_FAILURE, 0,
1462 _("only one [c*] repeat construct may appear in string2"));
1465 if (translating)
1467 if (s2->has_equiv_class)
1469 die (EXIT_FAILURE, 0,
1470 _("[=c=] expressions may not appear in string2\
1471 when translating"));
1474 if (s2->has_restricted_char_class)
1476 die (EXIT_FAILURE, 0,
1477 _("when translating, the only character classes that may\
1478 appear in\nstring2 are 'upper' and 'lower'"));
1481 validate_case_classes (s1, s2);
1483 if (s1->length > s2->length)
1485 if (!truncate_set1)
1487 /* string2 must be non-empty unless --truncate-set1 is
1488 given or string1 is empty. */
1490 if (s2->length == 0)
1491 die (EXIT_FAILURE, 0,
1492 _("when not truncating set1, string2 must be non-empty"));
1493 string2_extend (s1, s2);
1497 if (complement && s1->has_char_class
1498 && ! (s2->length == s1->length && homogeneous_spec_list (s2)))
1500 die (EXIT_FAILURE, 0,
1501 _("when translating with complemented character classes,\
1502 \nstring2 must map all characters in the domain to one"));
1505 else
1506 /* Not translating. */
1508 if (s2->n_indefinite_repeats > 0)
1509 die (EXIT_FAILURE, 0,
1510 _("the [c*] construct may appear in string2 only\
1511 when translating"));
1516 /* Read buffers of SIZE bytes via the function READER (if READER is
1517 NULL, read from stdin) until EOF. When non-NULL, READER is either
1518 read_and_delete or read_and_xlate. After each buffer is read, it is
1519 processed and written to stdout. The buffers are processed so that
1520 multiple consecutive occurrences of the same character in the input
1521 stream are replaced by a single occurrence of that character if the
1522 character is in the squeeze set. */
1524 static void
1525 squeeze_filter (char *buf, size_t size, size_t (*reader) (char *, size_t))
1527 /* A value distinct from any character that may have been stored in a
1528 buffer as the result of a block-read in the function squeeze_filter. */
1529 const int NOT_A_CHAR = INT_MAX;
1531 int char_to_squeeze = NOT_A_CHAR;
1532 size_t i = 0;
1533 size_t nr = 0;
1535 while (true)
1537 if (i >= nr)
1539 nr = reader (buf, size);
1540 if (nr == 0)
1541 break;
1542 i = 0;
1545 size_t begin = i;
1547 if (char_to_squeeze == NOT_A_CHAR)
1549 size_t out_len;
1550 /* Here, by being a little tricky, we can get a significant
1551 performance increase in most cases when the input is
1552 reasonably large. Since tr will modify the input only
1553 if two consecutive (and identical) input characters are
1554 in the squeeze set, we can step by two through the data
1555 when searching for a character in the squeeze set. This
1556 means there may be a little more work in a few cases and
1557 perhaps twice as much work in the worst cases where most
1558 of the input is removed by squeezing repeats. But most
1559 uses of this functionality seem to remove less than 20-30%
1560 of the input. */
1561 for (; i < nr && !in_squeeze_set[to_uchar (buf[i])]; i += 2)
1562 continue;
1564 /* There is a special case when i == nr and we've just
1565 skipped a character (the last one in buf) that is in
1566 the squeeze set. */
1567 if (i == nr && in_squeeze_set[to_uchar (buf[i - 1])])
1568 --i;
1570 if (i >= nr)
1571 out_len = nr - begin;
1572 else
1574 char_to_squeeze = buf[i];
1575 /* We're about to output buf[begin..i]. */
1576 out_len = i - begin + 1;
1578 /* But since we stepped by 2 in the loop above,
1579 out_len may be one too large. */
1580 if (i > 0 && buf[i - 1] == char_to_squeeze)
1581 --out_len;
1583 /* Advance i to the index of first character to be
1584 considered when looking for a char different from
1585 char_to_squeeze. */
1586 ++i;
1588 if (out_len > 0
1589 && fwrite (&buf[begin], 1, out_len, stdout) != out_len)
1590 die (EXIT_FAILURE, errno, _("write error"));
1593 if (char_to_squeeze != NOT_A_CHAR)
1595 /* Advance i to index of first char != char_to_squeeze
1596 (or to nr if all the rest of the characters in this
1597 buffer are the same as char_to_squeeze). */
1598 for (; i < nr && buf[i] == char_to_squeeze; i++)
1599 continue;
1600 if (i < nr)
1601 char_to_squeeze = NOT_A_CHAR;
1602 /* If (i >= nr) we've squeezed the last character in this buffer.
1603 So now we have to read a new buffer and continue comparing
1604 characters against char_to_squeeze. */
1609 static size_t
1610 plain_read (char *buf, size_t size)
1612 size_t nr = safe_read (STDIN_FILENO, buf, size);
1613 if (nr == SAFE_READ_ERROR)
1614 die (EXIT_FAILURE, errno, _("read error"));
1615 return nr;
1618 /* Read buffers of SIZE bytes from stdin until one is found that
1619 contains at least one character not in the delete set. Store
1620 in the array BUF, all characters from that buffer that are not
1621 in the delete set, and return the number of characters saved
1622 or 0 upon EOF. */
1624 static size_t
1625 read_and_delete (char *buf, size_t size)
1627 size_t n_saved;
1629 /* This enclosing do-while loop is to make sure that
1630 we don't return zero (indicating EOF) when we've
1631 just deleted all the characters in a buffer. */
1634 size_t nr = plain_read (buf, size);
1636 if (nr == 0)
1637 return 0;
1639 /* This first loop may be a waste of code, but gives much
1640 better performance when no characters are deleted in
1641 the beginning of a buffer. It just avoids the copying
1642 of buf[i] into buf[n_saved] when it would be a NOP. */
1644 size_t i;
1645 for (i = 0; i < nr && !in_delete_set[to_uchar (buf[i])]; i++)
1646 continue;
1647 n_saved = i;
1649 for (++i; i < nr; i++)
1650 if (!in_delete_set[to_uchar (buf[i])])
1651 buf[n_saved++] = buf[i];
1653 while (n_saved == 0);
1655 return n_saved;
1658 /* Read at most SIZE bytes from stdin into the array BUF. Then
1659 perform the in-place and one-to-one mapping specified by the global
1660 array 'xlate'. Return the number of characters read, or 0 upon EOF. */
1662 static size_t
1663 read_and_xlate (char *buf, size_t size)
1665 size_t bytes_read = plain_read (buf, size);
1667 for (size_t i = 0; i < bytes_read; i++)
1668 buf[i] = xlate[to_uchar (buf[i])];
1670 return bytes_read;
1673 /* Initialize a boolean membership set, IN_SET, with the character
1674 values obtained by traversing the linked list of constructs S
1675 using the function 'get_next'. IN_SET is expected to have been
1676 initialized to all zeros by the caller. If COMPLEMENT_THIS_SET
1677 is true the resulting set is complemented. */
1679 static void
1680 set_initialize (struct Spec_list *s, bool complement_this_set, bool *in_set)
1682 int c;
1684 s->state = BEGIN_STATE;
1685 while ((c = get_next (s, NULL)) != -1)
1686 in_set[c] = true;
1687 if (complement_this_set)
1688 for (size_t i = 0; i < N_CHARS; i++)
1689 in_set[i] = (!in_set[i]);
1693 main (int argc, char **argv)
1695 int c;
1696 int non_option_args;
1697 int min_operands;
1698 int max_operands;
1699 struct Spec_list buf1, buf2;
1700 struct Spec_list *s1 = &buf1;
1701 struct Spec_list *s2 = &buf2;
1703 initialize_main (&argc, &argv);
1704 set_program_name (argv[0]);
1705 setlocale (LC_ALL, "");
1706 bindtextdomain (PACKAGE, LOCALEDIR);
1707 textdomain (PACKAGE);
1709 atexit (close_stdout);
1711 while ((c = getopt_long (argc, argv, "+AcCdst", long_options, NULL)) != -1)
1713 switch (c)
1715 case 'A':
1716 /* Undocumented option, for compatibility with AIX. */
1717 setlocale (LC_COLLATE, "C");
1718 setlocale (LC_CTYPE, "C");
1719 break;
1721 case 'c':
1722 case 'C':
1723 complement = true;
1724 break;
1726 case 'd':
1727 delete = true;
1728 break;
1730 case 's':
1731 squeeze_repeats = true;
1732 break;
1734 case 't':
1735 truncate_set1 = true;
1736 break;
1738 case_GETOPT_HELP_CHAR;
1740 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1742 default:
1743 usage (EXIT_FAILURE);
1744 break;
1748 non_option_args = argc - optind;
1749 translating = (non_option_args == 2 && !delete);
1750 min_operands = 1 + (delete == squeeze_repeats);
1751 max_operands = 1 + (delete <= squeeze_repeats);
1753 if (non_option_args < min_operands)
1755 if (non_option_args == 0)
1756 error (0, 0, _("missing operand"));
1757 else
1759 error (0, 0, _("missing operand after %s"), quote (argv[argc - 1]));
1760 fprintf (stderr, "%s\n",
1761 _(squeeze_repeats
1762 ? N_("Two strings must be given when "
1763 "both deleting and squeezing repeats.")
1764 : N_("Two strings must be given when translating.")));
1766 usage (EXIT_FAILURE);
1769 if (max_operands < non_option_args)
1771 error (0, 0, _("extra operand %s"), quote (argv[optind + max_operands]));
1772 if (non_option_args == 2)
1773 fprintf (stderr, "%s\n",
1774 _("Only one string may be given when "
1775 "deleting without squeezing repeats."));
1776 usage (EXIT_FAILURE);
1779 spec_init (s1);
1780 if (!parse_str (argv[optind], s1))
1781 main_exit (EXIT_FAILURE);
1783 if (non_option_args == 2)
1785 spec_init (s2);
1786 if (!parse_str (argv[optind + 1], s2))
1787 main_exit (EXIT_FAILURE);
1789 else
1790 s2 = NULL;
1792 validate (s1, s2);
1794 /* Use binary I/O, since 'tr' is sometimes used to transliterate
1795 non-printable characters, or characters which are stripped away
1796 by text-mode reads (like CR and ^Z). */
1797 xset_binary_mode (STDIN_FILENO, O_BINARY);
1798 xset_binary_mode (STDOUT_FILENO, O_BINARY);
1799 fadvise (stdin, FADVISE_SEQUENTIAL);
1801 if (squeeze_repeats && non_option_args == 1)
1803 set_initialize (s1, complement, in_squeeze_set);
1804 squeeze_filter (io_buf, sizeof io_buf, plain_read);
1806 else if (delete && non_option_args == 1)
1808 set_initialize (s1, complement, in_delete_set);
1810 while (true)
1812 size_t nr = read_and_delete (io_buf, sizeof io_buf);
1813 if (nr == 0)
1814 break;
1815 if (fwrite (io_buf, 1, nr, stdout) != nr)
1816 die (EXIT_FAILURE, errno, _("write error"));
1819 else if (squeeze_repeats && delete && non_option_args == 2)
1821 set_initialize (s1, complement, in_delete_set);
1822 set_initialize (s2, false, in_squeeze_set);
1823 squeeze_filter (io_buf, sizeof io_buf, read_and_delete);
1825 else if (translating)
1827 if (complement)
1829 bool *in_s1 = in_delete_set;
1831 set_initialize (s1, false, in_s1);
1832 s2->state = BEGIN_STATE;
1833 for (int i = 0; i < N_CHARS; i++)
1834 xlate[i] = i;
1835 for (int i = 0; i < N_CHARS; i++)
1837 if (!in_s1[i])
1839 int ch = get_next (s2, NULL);
1840 assert (ch != -1 || truncate_set1);
1841 if (ch == -1)
1843 /* This will happen when tr is invoked like e.g.
1844 tr -cs A-Za-z0-9 '\012'. */
1845 break;
1847 xlate[i] = ch;
1851 else
1853 int c1, c2;
1854 enum Upper_Lower_class class_s1;
1855 enum Upper_Lower_class class_s2;
1857 for (int i = 0; i < N_CHARS; i++)
1858 xlate[i] = i;
1859 s1->state = BEGIN_STATE;
1860 s2->state = BEGIN_STATE;
1861 while (true)
1863 c1 = get_next (s1, &class_s1);
1864 c2 = get_next (s2, &class_s2);
1866 if (class_s1 == UL_LOWER && class_s2 == UL_UPPER)
1868 for (int i = 0; i < N_CHARS; i++)
1869 if (islower (i))
1870 xlate[i] = toupper (i);
1872 else if (class_s1 == UL_UPPER && class_s2 == UL_LOWER)
1874 for (int i = 0; i < N_CHARS; i++)
1875 if (isupper (i))
1876 xlate[i] = tolower (i);
1878 else
1880 /* The following should have been checked by validate... */
1881 if (c1 == -1 || c2 == -1)
1882 break;
1883 xlate[c1] = c2;
1886 /* When case-converting, skip the elements as an optimization. */
1887 if (class_s2 != UL_NONE)
1889 skip_construct (s1);
1890 skip_construct (s2);
1893 assert (c1 == -1 || truncate_set1);
1895 if (squeeze_repeats)
1897 set_initialize (s2, false, in_squeeze_set);
1898 squeeze_filter (io_buf, sizeof io_buf, read_and_xlate);
1900 else
1902 while (true)
1904 size_t bytes_read = read_and_xlate (io_buf, sizeof io_buf);
1905 if (bytes_read == 0)
1906 break;
1907 if (fwrite (io_buf, 1, bytes_read, stdout) != bytes_read)
1908 die (EXIT_FAILURE, errno, _("write error"));
1913 if (close (STDIN_FILENO) != 0)
1914 die (EXIT_FAILURE, errno, _("standard input"));
1916 main_exit (EXIT_SUCCESS);