doc: sort: give example for sorting on the last field
[coreutils.git] / src / uniq.c
blobabaf9e04509517fe271b1130c4fdbe01230489d4
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2024 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <getopt.h>
22 #include <sys/types.h>
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "fadvise.h"
28 #include "mcel.h"
29 #include "posixver.h"
30 #include "skipchars.h"
31 #include "stdio--.h"
32 #include "xstrtol.h"
33 #include "memcasecmp.h"
34 #include "quote.h"
36 /* The official name of this program (e.g., no 'g' prefix). */
37 #define PROGRAM_NAME "uniq"
39 #define AUTHORS \
40 proper_name ("Richard M. Stallman"), \
41 proper_name ("David MacKenzie")
43 static void
44 swap_lines (struct linebuffer **a, struct linebuffer **b)
46 struct linebuffer *tmp = *a;
47 *a = *b;
48 *b = tmp;
51 /* Number of fields to skip on each line when doing comparisons. */
52 static idx_t skip_fields = false;
54 /* Number of chars to skip after skipping any fields. */
55 static idx_t skip_chars = false;
57 /* Number of chars to compare. */
58 static idx_t check_chars = IDX_MAX;
60 /* Whether to precede the output lines with a count of the number of
61 times they occurred in the input. */
62 static bool count_occurrences = false;
64 /* Which lines to output: unique lines, the first of a group of
65 repeated lines, and the second and subsequent of a group of
66 repeated lines. */
67 static bool output_unique = true;
68 static bool output_first_repeated = true;
69 static bool output_later_repeated = false;
71 /* If true, ignore case when comparing. */
72 static bool ignore_case;
74 enum delimit_method
76 /* No delimiters output. --all-repeated[=none] */
77 DM_NONE,
79 /* Delimiter precedes all groups. --all-repeated=prepend */
80 DM_PREPEND,
82 /* Delimit all groups. --all-repeated=separate */
83 DM_SEPARATE
86 static char const *const delimit_method_string[] =
88 "none", "prepend", "separate", nullptr
91 static enum delimit_method const delimit_method_map[] =
93 DM_NONE, DM_PREPEND, DM_SEPARATE
96 /* Select whether/how to delimit groups of duplicate lines. */
97 static enum delimit_method delimit_groups = DM_NONE;
99 enum grouping_method
101 /* No grouping, when "--group" isn't used */
102 GM_NONE,
104 /* Delimiter precedes all groups. --group=prepend */
105 GM_PREPEND,
107 /* Delimiter follows all groups. --group=append */
108 GM_APPEND,
110 /* Delimiter between groups. --group[=separate] */
111 GM_SEPARATE,
113 /* Delimiter before and after each group. --group=both */
114 GM_BOTH
117 static char const *const grouping_method_string[] =
119 "prepend", "append", "separate", "both", nullptr
122 static enum grouping_method const grouping_method_map[] =
124 GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
127 static enum grouping_method grouping = GM_NONE;
129 enum
131 GROUP_OPTION = CHAR_MAX + 1
134 static struct option const longopts[] =
136 {"count", no_argument, nullptr, 'c'},
137 {"repeated", no_argument, nullptr, 'd'},
138 {"all-repeated", optional_argument, nullptr, 'D'},
139 {"group", optional_argument, nullptr, GROUP_OPTION},
140 {"ignore-case", no_argument, nullptr, 'i'},
141 {"unique", no_argument, nullptr, 'u'},
142 {"skip-fields", required_argument, nullptr, 'f'},
143 {"skip-chars", required_argument, nullptr, 's'},
144 {"check-chars", required_argument, nullptr, 'w'},
145 {"zero-terminated", no_argument, nullptr, 'z'},
146 {GETOPT_HELP_OPTION_DECL},
147 {GETOPT_VERSION_OPTION_DECL},
148 {nullptr, 0, nullptr, 0}
151 void
152 usage (int status)
154 if (status != EXIT_SUCCESS)
155 emit_try_help ();
156 else
158 printf (_("\
159 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
161 program_name);
162 fputs (_("\
163 Filter adjacent matching lines from INPUT (or standard input),\n\
164 writing to OUTPUT (or standard output).\n\
166 With no options, matching lines are merged to the first occurrence.\n\
167 "), stdout);
169 emit_mandatory_arg_note ();
171 fputs (_("\
172 -c, --count prefix lines by the number of occurrences\n\
173 -d, --repeated only print duplicate lines, one for each group\n\
174 "), stdout);
175 fputs (_("\
176 -D print all duplicate lines\n\
177 --all-repeated[=METHOD] like -D, but allow separating groups\n\
178 with an empty line;\n\
179 METHOD={none(default),prepend,separate}\n\
180 "), stdout);
181 fputs (_("\
182 -f, --skip-fields=N avoid comparing the first N fields\n\
183 "), stdout);
184 fputs (_("\
185 --group[=METHOD] show all items, separating groups with an empty line;\n\
186 METHOD={separate(default),prepend,append,both}\n\
187 "), stdout);
188 fputs (_("\
189 -i, --ignore-case ignore differences in case when comparing\n\
190 -s, --skip-chars=N avoid comparing the first N characters\n\
191 -u, --unique only print unique lines\n\
192 "), stdout);
193 fputs (_("\
194 -z, --zero-terminated line delimiter is NUL, not newline\n\
195 "), stdout);
196 fputs (_("\
197 -w, --check-chars=N compare no more than N characters in lines\n\
198 "), stdout);
199 fputs (HELP_OPTION_DESCRIPTION, stdout);
200 fputs (VERSION_OPTION_DESCRIPTION, stdout);
201 fputs (_("\
203 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
204 characters. Fields are skipped before chars.\n\
205 "), stdout);
206 fputs (_("\
208 'uniq' does not detect repeated lines unless they are adjacent.\n\
209 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
210 "), stdout);
211 emit_ancillary_info (PROGRAM_NAME);
213 exit (status);
216 static bool
217 strict_posix2 (void)
219 int posix_ver = posix2_version ();
220 return 200112 <= posix_ver && posix_ver < 200809;
223 /* Convert OPT to idx_t, reporting an error using MSGID if OPT is
224 invalid. Silently convert too-large values to IDX_MAX. */
226 static idx_t
227 size_opt (char const *opt, char const *msgid)
229 intmax_t size;
230 if (LONGINT_OVERFLOW < xstrtoimax (opt, nullptr, 10, &size, "")
231 || size < 0)
232 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
233 return MIN (size, IDX_MAX);
236 static bool
237 newline_or_blank (mcel_t g)
239 return g.ch == '\n' || c32isblank (g.ch);
242 /* Given a linebuffer LINE,
243 return a pointer to the beginning of the line's field to be compared.
244 Store into *PLEN the length in bytes of FIELD. */
246 static char *
247 find_field (struct linebuffer const *line, idx_t *plen)
249 char *lp = line->buffer;
250 char const *lim = lp + line->length - 1;
252 for (idx_t i = skip_fields; 0 < i && lp < lim; i--)
254 lp = skip_buf_matching (lp, lim, newline_or_blank, true);
255 lp = skip_buf_matching (lp, lim, newline_or_blank, false);
258 for (idx_t i = skip_chars; 0 < i && lp < lim; i--)
259 lp += mcel_scan (lp, lim).len;
261 /* Compute the length in bytes cheaply if possible; otherwise, scan. */
262 idx_t len;
263 if (lim - lp <= check_chars)
264 len = lim - lp;
265 else if (MB_CUR_MAX <= 1)
266 len = check_chars;
267 else
269 char *ep = lp;
270 for (idx_t i = check_chars; 0 < i && lp < lim; i--)
271 ep += mcel_scan (lp, lim).len;
272 len = ep - lp;
275 *plen = len;
276 return lp;
279 /* Return false if two strings OLD and NEW match, true if not.
280 OLD and NEW point not to the beginnings of the lines
281 but rather to the beginnings of the fields to compare.
282 OLDLEN and NEWLEN are their lengths. */
284 static bool
285 different (char *old, char *new, idx_t oldlen, idx_t newlen)
287 if (ignore_case)
288 return oldlen != newlen || memcasecmp (old, new, oldlen);
289 else
290 return oldlen != newlen || memcmp (old, new, oldlen);
293 /* Output the line in linebuffer LINE to standard output
294 provided that the switches say it should be output.
295 MATCH is true if the line matches the previous line.
296 If requested, print the number of times it occurred, as well;
297 LINECOUNT + 1 is the number of times that the line occurred. */
299 static void
300 writeline (struct linebuffer const *line,
301 bool match, intmax_t linecount)
303 if (! (linecount == 0 ? output_unique
304 : !match ? output_first_repeated
305 : output_later_repeated))
306 return;
308 if (count_occurrences)
309 printf ("%7jd ", linecount + 1);
311 if (fwrite (line->buffer, sizeof (char), line->length, stdout)
312 != line->length)
313 write_error ();
316 /* Process input file INFILE with output to OUTFILE.
317 If either is "-", use the standard I/O stream for it instead. */
319 static void
320 check_file (char const *infile, char const *outfile, char delimiter)
322 struct linebuffer lb1, lb2;
323 struct linebuffer *thisline, *prevline;
325 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
326 error (EXIT_FAILURE, errno, "%s", quotef (infile));
327 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
328 error (EXIT_FAILURE, errno, "%s", quotef (outfile));
330 fadvise (stdin, FADVISE_SEQUENTIAL);
332 thisline = &lb1;
333 prevline = &lb2;
335 initbuffer (thisline);
336 initbuffer (prevline);
338 /* The duplication in the following 'if' and 'else' blocks is an
339 optimization to distinguish between when we can print input
340 lines immediately (1. & 2.) or not.
342 1. --group => all input lines are printed.
343 checking for unique/duplicated lines is used only for printing
344 group separators.
346 2. The default case in which none of these options has been specified:
347 --count, --repeated, --all-repeated, --unique
348 In the default case, this optimization lets uniq output each different
349 line right away, without waiting to see if the next one is different.
351 3. All other cases.
353 if (output_unique && output_first_repeated && !count_occurrences)
355 char *prevfield = nullptr;
356 idx_t prevlen;
357 bool first_group_printed = false;
359 while (!feof (stdin)
360 && readlinebuffer_delim (thisline, stdin, delimiter) != 0)
362 idx_t thislen;
363 char *thisfield = find_field (thisline, &thislen);
364 bool new_group = (!prevfield
365 || different (thisfield, prevfield,
366 thislen, prevlen));
368 if (new_group && grouping != GM_NONE
369 && (grouping == GM_PREPEND || grouping == GM_BOTH
370 || (first_group_printed && (grouping == GM_APPEND
371 || grouping == GM_SEPARATE))))
372 putchar (delimiter);
374 if (new_group || grouping != GM_NONE)
376 if (fwrite (thisline->buffer, sizeof (char), thisline->length,
377 stdout) != thisline->length)
378 write_error ();
380 swap_lines (&prevline, &thisline);
381 prevfield = thisfield;
382 prevlen = thislen;
383 first_group_printed = true;
386 if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
387 putchar (delimiter);
389 else
391 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
392 goto closefiles;
394 idx_t prevlen;
395 char *prevfield = find_field (prevline, &prevlen);
396 intmax_t match_count = 0;
397 bool first_delimiter = true;
399 while (!feof (stdin))
401 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
403 if (ferror (stdin))
404 goto closefiles;
405 break;
407 idx_t thislen;
408 char *thisfield = find_field (thisline, &thislen);
409 bool match = !different (thisfield, prevfield, thislen, prevlen);
410 match_count += match;
412 if (match_count == INTMAX_MAX)
414 if (count_occurrences)
415 error (EXIT_FAILURE, 0, _("too many repeated lines"));
416 match_count--;
419 if (delimit_groups != DM_NONE)
421 if (!match)
423 if (match_count) /* a previous match */
424 first_delimiter = false; /* Only used when DM_SEPARATE */
426 else if (match_count == 1)
428 if ((delimit_groups == DM_PREPEND)
429 || (delimit_groups == DM_SEPARATE
430 && !first_delimiter))
431 putchar (delimiter);
435 if (!match || output_later_repeated)
437 writeline (prevline, match, match_count);
438 swap_lines (&prevline, &thisline);
439 prevfield = thisfield;
440 prevlen = thislen;
441 if (!match)
442 match_count = 0;
446 writeline (prevline, false, match_count);
449 closefiles:
450 if (ferror (stdin) || fclose (stdin) != 0)
451 error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
453 /* stdout is handled via the atexit-invoked close_stdout function. */
455 free (lb1.buffer);
456 free (lb2.buffer);
459 enum Skip_field_option_type
461 SFO_NONE,
462 SFO_OBSOLETE,
463 SFO_NEW
467 main (int argc, char **argv)
469 int optc = 0;
470 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
471 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
472 int nfiles = 0;
473 char const *file[2];
474 char delimiter = '\n'; /* change with --zero-terminated, -z */
475 bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */
477 file[0] = file[1] = "-";
478 initialize_main (&argc, &argv);
479 set_program_name (argv[0]);
480 setlocale (LC_ALL, "");
481 bindtextdomain (PACKAGE, LOCALEDIR);
482 textdomain (PACKAGE);
484 atexit (close_stdout);
486 while (true)
488 /* Parse an operand with leading "+" as a file after "--" was
489 seen; or if pedantic and a file was seen; or if not
490 obsolete. */
492 if (optc == -1
493 || (posixly_correct && nfiles != 0)
494 || ((optc = getopt_long (argc, argv,
495 "-0123456789Dcdf:is:uw:z",
496 longopts, nullptr))
497 == -1))
499 if (argc <= optind)
500 break;
501 if (nfiles == 2)
503 error (0, 0, _("extra operand %s"), quote (argv[optind]));
504 usage (EXIT_FAILURE);
506 file[nfiles++] = argv[optind++];
508 else switch (optc)
510 case 1:
512 intmax_t size;
513 if (optarg[0] == '+'
514 && ! strict_posix2 ()
515 && (xstrtoimax (optarg, nullptr, 10, &size, "")
516 <= LONGINT_OVERFLOW))
517 skip_chars = MIN (size, IDX_MAX);
518 else if (nfiles == 2)
520 error (0, 0, _("extra operand %s"), quote (optarg));
521 usage (EXIT_FAILURE);
523 else
524 file[nfiles++] = optarg;
526 break;
528 case '0':
529 case '1':
530 case '2':
531 case '3':
532 case '4':
533 case '5':
534 case '6':
535 case '7':
536 case '8':
537 case '9':
539 if (skip_field_option_type == SFO_NEW)
540 skip_fields = 0;
542 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0'))
543 skip_fields = IDX_MAX;
545 skip_field_option_type = SFO_OBSOLETE;
547 break;
549 case 'c':
550 count_occurrences = true;
551 output_option_used = true;
552 break;
554 case 'd':
555 output_unique = false;
556 output_option_used = true;
557 break;
559 case 'D':
560 output_unique = false;
561 output_later_repeated = true;
562 if (optarg == nullptr)
563 delimit_groups = DM_NONE;
564 else
565 delimit_groups = XARGMATCH ("--all-repeated", optarg,
566 delimit_method_string,
567 delimit_method_map);
568 output_option_used = true;
569 break;
571 case GROUP_OPTION:
572 if (optarg == nullptr)
573 grouping = GM_SEPARATE;
574 else
575 grouping = XARGMATCH ("--group", optarg,
576 grouping_method_string,
577 grouping_method_map);
578 break;
580 case 'f':
581 skip_field_option_type = SFO_NEW;
582 skip_fields = size_opt (optarg,
583 N_("invalid number of fields to skip"));
584 break;
586 case 'i':
587 ignore_case = true;
588 break;
590 case 's':
591 skip_chars = size_opt (optarg,
592 N_("invalid number of bytes to skip"));
593 break;
595 case 'u':
596 output_first_repeated = false;
597 output_option_used = true;
598 break;
600 case 'w':
601 check_chars = size_opt (optarg,
602 N_("invalid number of bytes to compare"));
603 break;
605 case 'z':
606 delimiter = '\0';
607 break;
609 case_GETOPT_HELP_CHAR;
611 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
613 default:
614 usage (EXIT_FAILURE);
618 /* Note we could allow --group with -D at least, and that would
619 avoid the need to specify a grouping method to --all-repeated.
620 It was thought best to avoid deprecating those parameters though
621 and keep --group separate to other options. */
622 if (grouping != GM_NONE && output_option_used)
624 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
625 usage (EXIT_FAILURE);
628 if (grouping != GM_NONE && count_occurrences)
630 error (0, 0,
631 _("grouping and printing repeat counts is meaningless"));
632 usage (EXIT_FAILURE);
635 if (count_occurrences && output_later_repeated)
637 error (0, 0,
638 _("printing all duplicated lines and repeat counts is meaningless"));
639 usage (EXIT_FAILURE);
642 check_file (file[0], file[1], delimiter);
644 return EXIT_SUCCESS;