maint: spelling fixes, including author names
[coreutils.git] / src / uniq.c
blobfab04ded8c671d8f13d7c1c0202a3ee3cc531980
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <getopt.h>
22 #include <sys/types.h>
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "fadvise.h"
28 #include "posixver.h"
29 #include "stdio--.h"
30 #include "xstrtol.h"
31 #include "memcasecmp.h"
32 #include "quote.h"
/* Canonical name of this program, without any installation prefix
   such as 'g'.  It is handed to the version and ancillary-info
   helpers below.  */
#define PROGRAM_NAME "uniq"

/* Author list handed to the --version handler.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
/* Exchange the two 'struct linebuffer *' lvalues A and B.
   Each argument is expanded more than once, so neither may have
   side effects.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *swap_hold_ = (A);      \
      (A) = (B);                                \
      (B) = swap_hold_;                         \
    }                                           \
  while (0)
/* How many leading fields of each line are left out of the comparison.  */
static size_t skip_fields;

/* How many further chars are left out once the fields have been skipped.  */
static size_t skip_chars;

/* Upper bound on the number of chars that take part in the comparison.  */
static size_t check_chars;
enum countmode
{
  count_occurrences,    /* -c: each output line is preceded by its count.  */
  count_none            /* Default: no counts are printed.  */
};

/* Selects whether output lines are preceded by the number of times
   they occurred in the input.  */
static enum countmode countmode;
/* Output selection.  Three classes of line exist: a line that is not
   repeated, the first line of a group of repeated lines, and the
   second and later lines of such a group.  One flag per class.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* When true, comparisons disregard differences in case.  */
static bool ignore_case;
80 enum delimit_method
82 /* No delimiters output. --all-repeated[=none] */
83 DM_NONE,
85 /* Delimiter precedes all groups. --all-repeated=prepend */
86 DM_PREPEND,
88 /* Delimit all groups. --all-repeated=separate */
89 DM_SEPARATE
92 static char const *const delimit_method_string[] =
94 "none", "prepend", "separate", nullptr
97 static enum delimit_method const delimit_method_map[] =
99 DM_NONE, DM_PREPEND, DM_SEPARATE
102 /* Select whether/how to delimit groups of duplicate lines. */
103 static enum delimit_method delimit_groups;
105 enum grouping_method
107 /* No grouping, when "--group" isn't used */
108 GM_NONE,
110 /* Delimiter precedes all groups. --group=prepend */
111 GM_PREPEND,
113 /* Delimiter follows all groups. --group=append */
114 GM_APPEND,
116 /* Delimiter between groups. --group[=separate] */
117 GM_SEPARATE,
119 /* Delimiter before and after each group. --group=both */
120 GM_BOTH
123 static char const *const grouping_method_string[] =
125 "prepend", "append", "separate", "both", nullptr
128 static enum grouping_method const grouping_method_map[] =
130 GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
133 static enum grouping_method grouping = GM_NONE;
/* Codes for long options that have no single-character equivalent.
   They start above CHAR_MAX so they cannot collide with any option
   character.  */
enum
{
  GROUP_OPTION = CHAR_MAX + 1
};
140 static struct option const longopts[] =
142 {"count", no_argument, nullptr, 'c'},
143 {"repeated", no_argument, nullptr, 'd'},
144 {"all-repeated", optional_argument, nullptr, 'D'},
145 {"group", optional_argument, nullptr, GROUP_OPTION},
146 {"ignore-case", no_argument, nullptr, 'i'},
147 {"unique", no_argument, nullptr, 'u'},
148 {"skip-fields", required_argument, nullptr, 'f'},
149 {"skip-chars", required_argument, nullptr, 's'},
150 {"check-chars", required_argument, nullptr, 'w'},
151 {"zero-terminated", no_argument, nullptr, 'z'},
152 {GETOPT_HELP_OPTION_DECL},
153 {GETOPT_VERSION_OPTION_DECL},
154 {nullptr, 0, nullptr, 0}
/* Print usage information and terminate the program with STATUS.
   When STATUS is not EXIT_SUCCESS only the short hint produced by
   emit_try_help is printed; otherwise the full help text is written
   to standard output.  This function does not return: it always ends
   by calling exit (STATUS).

   NOTE(review): in this copy of the file the embedded line numbering
   skips values, so brace-only lines and short string-continuation
   lines appear to be missing, and column padding inside the help
   text looks collapsed.  Restore the text from the upstream file
   before building -- TODO confirm against upstream.  */
157 void
158 usage (int status)
160 if (status != EXIT_SUCCESS)
161 emit_try_help ();
162 else
/* Full help: synopsis, description, then one fputs per option group.  */
164 printf (_("\
165 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
167 program_name);
168 fputs (_("\
169 Filter adjacent matching lines from INPUT (or standard input),\n\
170 writing to OUTPUT (or standard output).\n\
172 With no options, matching lines are merged to the first occurrence.\n\
173 "), stdout);
175 emit_mandatory_arg_note ();
177 fputs (_("\
178 -c, --count prefix lines by the number of occurrences\n\
179 -d, --repeated only print duplicate lines, one for each group\n\
180 "), stdout);
181 fputs (_("\
182 -D print all duplicate lines\n\
183 --all-repeated[=METHOD] like -D, but allow separating groups\n\
184 with an empty line;\n\
185 METHOD={none(default),prepend,separate}\n\
186 "), stdout);
187 fputs (_("\
188 -f, --skip-fields=N avoid comparing the first N fields\n\
189 "), stdout);
190 fputs (_("\
191 --group[=METHOD] show all items, separating groups with an empty line;\n\
192 METHOD={separate(default),prepend,append,both}\n\
193 "), stdout);
194 fputs (_("\
195 -i, --ignore-case ignore differences in case when comparing\n\
196 -s, --skip-chars=N avoid comparing the first N characters\n\
197 -u, --unique only print unique lines\n\
198 "), stdout);
199 fputs (_("\
200 -z, --zero-terminated line delimiter is NUL, not newline\n\
201 "), stdout);
202 fputs (_("\
203 -w, --check-chars=N compare no more than N characters in lines\n\
204 "), stdout);
205 fputs (HELP_OPTION_DESCRIPTION, stdout);
206 fputs (VERSION_OPTION_DESCRIPTION, stdout);
207 fputs (_("\
209 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
210 characters. Fields are skipped before chars.\n\
211 "), stdout);
212 fputs (_("\
214 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
215 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
216 "), stdout);
217 emit_ancillary_info (PROGRAM_NAME);
219 exit (status);
/* Return true when the POSIX version reported by posix2_version lies
   in the half-open range [200112, 200809).  main uses this to decide
   whether a "+N" operand may be read as the obsolete skip-chars
   form.  */
static bool
strict_posix2 (void)
{
  int const version = posix2_version ();

  if (version < 200112)
    return false;
  return version < 200809;
}
229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
230 invalid. Silently convert too-large values to SIZE_MAX. */
232 static size_t
233 size_opt (char const *opt, char const *msgid)
235 uintmax_t size;
237 switch (xstrtoumax (opt, nullptr, 10, &size, ""))
239 case LONGINT_OK:
240 case LONGINT_OVERFLOW:
241 break;
243 default:
244 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
247 return MIN (size, SIZE_MAX);
250 /* Given a linebuffer LINE,
251 return a pointer to the beginning of the line's field to be compared. */
253 ATTRIBUTE_PURE
254 static char *
255 find_field (struct linebuffer const *line)
257 size_t count;
258 char const *lp = line->buffer;
259 size_t size = line->length - 1;
260 size_t i = 0;
262 for (count = 0; count < skip_fields && i < size; count++)
264 while (i < size && field_sep (lp[i]))
265 i++;
266 while (i < size && !field_sep (lp[i]))
267 i++;
270 i += MIN (skip_chars, size - i);
272 return line->buffer + i;
275 /* Return false if two strings OLD and NEW match, true if not.
276 OLD and NEW point not to the beginnings of the lines
277 but rather to the beginnings of the fields to compare.
278 OLDLEN and NEWLEN are their lengths. */
280 static bool
281 different (char *old, char *new, size_t oldlen, size_t newlen)
283 if (check_chars < oldlen)
284 oldlen = check_chars;
285 if (check_chars < newlen)
286 newlen = check_chars;
288 if (ignore_case)
289 return oldlen != newlen || memcasecmp (old, new, oldlen);
290 else
291 return oldlen != newlen || memcmp (old, new, oldlen);
294 /* Output the line in linebuffer LINE to standard output
295 provided that the switches say it should be output.
296 MATCH is true if the line matches the previous line.
297 If requested, print the number of times it occurred, as well;
298 LINECOUNT + 1 is the number of times that the line occurred. */
300 static void
301 writeline (struct linebuffer const *line,
302 bool match, uintmax_t linecount)
304 if (! (linecount == 0 ? output_unique
305 : !match ? output_first_repeated
306 : output_later_repeated))
307 return;
309 if (countmode == count_occurrences)
310 printf ("%7" PRIuMAX " ", linecount + 1);
312 if (fwrite (line->buffer, sizeof (char), line->length, stdout)
313 != line->length)
314 write_error ();
317 /* Process input file INFILE with output to OUTFILE.
318 If either is "-", use the standard I/O stream for it instead. */
320 static void
321 check_file (char const *infile, char const *outfile, char delimiter)
323 struct linebuffer lb1, lb2;
324 struct linebuffer *thisline, *prevline;
326 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
327 error (EXIT_FAILURE, errno, "%s", quotef (infile));
328 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
329 error (EXIT_FAILURE, errno, "%s", quotef (outfile));
331 fadvise (stdin, FADVISE_SEQUENTIAL);
333 thisline = &lb1;
334 prevline = &lb2;
336 initbuffer (thisline);
337 initbuffer (prevline);
339 /* The duplication in the following 'if' and 'else' blocks is an
340 optimization to distinguish between when we can print input
341 lines immediately (1. & 2.) or not.
343 1. --group => all input lines are printed.
344 checking for unique/duplicated lines is used only for printing
345 group separators.
347 2. The default case in which none of these options has been specified:
348 --count, --repeated, --all-repeated, --unique
349 In the default case, this optimization lets uniq output each different
350 line right away, without waiting to see if the next one is different.
352 3. All other cases.
354 if (output_unique && output_first_repeated && countmode == count_none)
356 char *prevfield = nullptr;
357 size_t prevlen;
358 bool first_group_printed = false;
360 while (!feof (stdin))
362 char *thisfield;
363 size_t thislen;
364 bool new_group;
366 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
367 break;
369 thisfield = find_field (thisline);
370 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
372 new_group = (!prevfield
373 || different (thisfield, prevfield, thislen, prevlen));
375 if (new_group && grouping != GM_NONE
376 && (grouping == GM_PREPEND || grouping == GM_BOTH
377 || (first_group_printed && (grouping == GM_APPEND
378 || grouping == GM_SEPARATE))))
379 putchar (delimiter);
381 if (new_group || grouping != GM_NONE)
383 if (fwrite (thisline->buffer, sizeof (char), thisline->length,
384 stdout) != thisline->length)
385 write_error ();
387 SWAP_LINES (prevline, thisline);
388 prevfield = thisfield;
389 prevlen = thislen;
390 first_group_printed = true;
393 if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
394 putchar (delimiter);
396 else
398 char *prevfield;
399 size_t prevlen;
400 uintmax_t match_count = 0;
401 bool first_delimiter = true;
403 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
404 goto closefiles;
405 prevfield = find_field (prevline);
406 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
408 while (!feof (stdin))
410 bool match;
411 char *thisfield;
412 size_t thislen;
413 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
415 if (ferror (stdin))
416 goto closefiles;
417 break;
419 thisfield = find_field (thisline);
420 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
421 match = !different (thisfield, prevfield, thislen, prevlen);
422 match_count += match;
424 if (match_count == UINTMAX_MAX)
426 if (count_occurrences)
427 error (EXIT_FAILURE, 0, _("too many repeated lines"));
428 match_count--;
431 if (delimit_groups != DM_NONE)
433 if (!match)
435 if (match_count) /* a previous match */
436 first_delimiter = false; /* Only used when DM_SEPARATE */
438 else if (match_count == 1)
440 if ((delimit_groups == DM_PREPEND)
441 || (delimit_groups == DM_SEPARATE
442 && !first_delimiter))
443 putchar (delimiter);
447 if (!match || output_later_repeated)
449 writeline (prevline, match, match_count);
450 SWAP_LINES (prevline, thisline);
451 prevfield = thisfield;
452 prevlen = thislen;
453 if (!match)
454 match_count = 0;
458 writeline (prevline, false, match_count);
461 closefiles:
462 if (ferror (stdin) || fclose (stdin) != 0)
463 error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
465 /* stdout is handled via the atexit-invoked close_stdout function. */
467 free (lb1.buffer);
468 free (lb2.buffer);
/* Which form of the skip-fields option was seen most recently while
   parsing the command line.  */
enum Skip_field_option_type
{
  SFO_NONE,             /* No skip-fields option seen so far.  */
  SFO_OBSOLETE,         /* The obsolete digit form, e.g. -1.  */
  SFO_NEW               /* The -f N / --skip-fields=N form.  */
};
479 main (int argc, char **argv)
481 int optc = 0;
482 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
483 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
484 unsigned int nfiles = 0;
485 char const *file[2];
486 char delimiter = '\n'; /* change with --zero-terminated, -z */
487 bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */
489 file[0] = file[1] = "-";
490 initialize_main (&argc, &argv);
491 set_program_name (argv[0]);
492 setlocale (LC_ALL, "");
493 bindtextdomain (PACKAGE, LOCALEDIR);
494 textdomain (PACKAGE);
496 atexit (close_stdout);
498 skip_chars = 0;
499 skip_fields = 0;
500 check_chars = SIZE_MAX;
501 output_unique = output_first_repeated = true;
502 output_later_repeated = false;
503 countmode = count_none;
504 delimit_groups = DM_NONE;
506 while (true)
508 /* Parse an operand with leading "+" as a file after "--" was
509 seen; or if pedantic and a file was seen; or if not
510 obsolete. */
512 if (optc == -1
513 || (posixly_correct && nfiles != 0)
514 || ((optc = getopt_long (argc, argv,
515 "-0123456789Dcdf:is:uw:z",
516 longopts, nullptr))
517 == -1))
519 if (argc <= optind)
520 break;
521 if (nfiles == 2)
523 error (0, 0, _("extra operand %s"), quote (argv[optind]));
524 usage (EXIT_FAILURE);
526 file[nfiles++] = argv[optind++];
528 else switch (optc)
530 case 1:
532 uintmax_t size;
533 if (optarg[0] == '+'
534 && ! strict_posix2 ()
535 && xstrtoumax (optarg, nullptr, 10, &size, "") == LONGINT_OK
536 && size <= SIZE_MAX)
537 skip_chars = size;
538 else if (nfiles == 2)
540 error (0, 0, _("extra operand %s"), quote (optarg));
541 usage (EXIT_FAILURE);
543 else
544 file[nfiles++] = optarg;
546 break;
548 case '0':
549 case '1':
550 case '2':
551 case '3':
552 case '4':
553 case '5':
554 case '6':
555 case '7':
556 case '8':
557 case '9':
559 if (skip_field_option_type == SFO_NEW)
560 skip_fields = 0;
562 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
563 skip_fields = SIZE_MAX;
565 skip_field_option_type = SFO_OBSOLETE;
567 break;
569 case 'c':
570 countmode = count_occurrences;
571 output_option_used = true;
572 break;
574 case 'd':
575 output_unique = false;
576 output_option_used = true;
577 break;
579 case 'D':
580 output_unique = false;
581 output_later_repeated = true;
582 if (optarg == nullptr)
583 delimit_groups = DM_NONE;
584 else
585 delimit_groups = XARGMATCH ("--all-repeated", optarg,
586 delimit_method_string,
587 delimit_method_map);
588 output_option_used = true;
589 break;
591 case GROUP_OPTION:
592 if (optarg == nullptr)
593 grouping = GM_SEPARATE;
594 else
595 grouping = XARGMATCH ("--group", optarg,
596 grouping_method_string,
597 grouping_method_map);
598 break;
600 case 'f':
601 skip_field_option_type = SFO_NEW;
602 skip_fields = size_opt (optarg,
603 N_("invalid number of fields to skip"));
604 break;
606 case 'i':
607 ignore_case = true;
608 break;
610 case 's':
611 skip_chars = size_opt (optarg,
612 N_("invalid number of bytes to skip"));
613 break;
615 case 'u':
616 output_first_repeated = false;
617 output_option_used = true;
618 break;
620 case 'w':
621 check_chars = size_opt (optarg,
622 N_("invalid number of bytes to compare"));
623 break;
625 case 'z':
626 delimiter = '\0';
627 break;
629 case_GETOPT_HELP_CHAR;
631 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
633 default:
634 usage (EXIT_FAILURE);
638 /* Note we could allow --group with -D at least, and that would
639 avoid the need to specify a grouping method to --all-repeated.
640 It was thought best to avoid deprecating those parameters though
641 and keep --group separate to other options. */
642 if (grouping != GM_NONE && output_option_used)
644 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
645 usage (EXIT_FAILURE);
648 if (grouping != GM_NONE && countmode != count_none)
650 error (0, 0,
651 _("grouping and printing repeat counts is meaningless"));
652 usage (EXIT_FAILURE);
655 if (countmode == count_occurrences && output_later_repeated)
657 error (0, 0,
658 _("printing all duplicated lines and repeat counts is meaningless"));
659 usage (EXIT_FAILURE);
662 check_file (file[0], file[1], delimiter);
664 return EXIT_SUCCESS;