1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
22 #include <sys/types.h>
26 #include "linebuffer.h"
33 #include "memcasecmp.h"
36 /* The official name of this program (e.g., no 'g' prefix). */
37 #define PROGRAM_NAME "uniq"
40 proper_name ("Richard M. Stallman"), \
41 proper_name ("David MacKenzie")
43 #define SWAP_LINES(A, B) \
46 struct linebuffer *_tmp; \
53 /* Number of fields to skip on each line when doing comparisons. */
54 static size_t skip_fields
;
56 /* Number of chars to skip after skipping any fields. */
57 static size_t skip_chars
;
59 /* Number of chars to compare. */
60 static size_t check_chars
;
64 count_occurrences
, /* -c Print count before output lines. */
65 count_none
/* Default. Do not print counts. */
68 /* Whether and how to precede the output lines with a count of the number of
69 times they occurred in the input. */
70 static enum countmode countmode
;
72 /* Which lines to output: unique lines, the first of a group of
73 repeated lines, and the second and subsequent ones of a group of
75 static bool output_unique
;
76 static bool output_first_repeated
;
77 static bool output_later_repeated
;
79 /* If true, ignore case when comparing. */
80 static bool ignore_case
;
84 /* No delimiters output. --all-repeated[=none] */
87 /* Delimiter precedes all groups. --all-repeated=prepend */
90 /* Delimit all groups. --all-repeated=separate */
94 static char const *const delimit_method_string
[] =
96 "none", "prepend", "separate", nullptr
99 static enum delimit_method
const delimit_method_map
[] =
101 DM_NONE
, DM_PREPEND
, DM_SEPARATE
104 /* Select whether/how to delimit groups of duplicate lines. */
105 static enum delimit_method delimit_groups
;
109 /* No grouping, when "--group" isn't used */
112 /* Delimiter precedes all groups. --group=prepend */
115 /* Delimiter follows all groups. --group=append */
118 /* Delimiter between groups. --group[=separate] */
121 /* Delimiter before and after each group. --group=both */
125 static char const *const grouping_method_string
[] =
127 "prepend", "append", "separate", "both", nullptr
130 static enum grouping_method
const grouping_method_map
[] =
132 GM_PREPEND
, GM_APPEND
, GM_SEPARATE
, GM_BOTH
135 static enum grouping_method grouping
= GM_NONE
;
139 GROUP_OPTION
= CHAR_MAX
+ 1
142 static struct option
const longopts
[] =
144 {"count", no_argument
, nullptr, 'c'},
145 {"repeated", no_argument
, nullptr, 'd'},
146 {"all-repeated", optional_argument
, nullptr, 'D'},
147 {"group", optional_argument
, nullptr, GROUP_OPTION
},
148 {"ignore-case", no_argument
, nullptr, 'i'},
149 {"unique", no_argument
, nullptr, 'u'},
150 {"skip-fields", required_argument
, nullptr, 'f'},
151 {"skip-chars", required_argument
, nullptr, 's'},
152 {"check-chars", required_argument
, nullptr, 'w'},
153 {"zero-terminated", no_argument
, nullptr, 'z'},
154 {GETOPT_HELP_OPTION_DECL
},
155 {GETOPT_VERSION_OPTION_DECL
},
156 {nullptr, 0, nullptr, 0}
162 if (status
!= EXIT_SUCCESS
)
167 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
171 Filter adjacent matching lines from INPUT (or standard input),\n\
172 writing to OUTPUT (or standard output).\n\
174 With no options, matching lines are merged to the first occurrence.\n\
177 emit_mandatory_arg_note ();
180 -c, --count prefix lines by the number of occurrences\n\
181 -d, --repeated only print duplicate lines, one for each group\n\
184 -D print all duplicate lines\n\
185 --all-repeated[=METHOD] like -D, but allow separating groups\n\
186 with an empty line;\n\
187 METHOD={none(default),prepend,separate}\n\
190 -f, --skip-fields=N avoid comparing the first N fields\n\
193 --group[=METHOD] show all items, separating groups with an empty line;\n\
194 METHOD={separate(default),prepend,append,both}\n\
197 -i, --ignore-case ignore differences in case when comparing\n\
198 -s, --skip-chars=N avoid comparing the first N characters\n\
199 -u, --unique only print unique lines\n\
202 -z, --zero-terminated line delimiter is NUL, not newline\n\
205 -w, --check-chars=N compare no more than N characters in lines\n\
207 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
208 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
211 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
212 characters. Fields are skipped before chars.\n\
216 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
217 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
219 emit_ancillary_info (PROGRAM_NAME
);
227 int posix_ver
= posix2_version ();
228 return 200112 <= posix_ver
&& posix_ver
< 200809;
231 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
232 invalid. Silently convert too-large values to SIZE_MAX. */
235 size_opt (char const *opt
, char const *msgid
)
239 switch (xstrtoumax (opt
, nullptr, 10, &size
, ""))
242 case LONGINT_OVERFLOW
:
246 die (EXIT_FAILURE
, 0, "%s: %s", opt
, _(msgid
));
249 return MIN (size
, SIZE_MAX
);
252 /* Given a linebuffer LINE,
253 return a pointer to the beginning of the line's field to be compared. */
257 find_field (struct linebuffer
const *line
)
260 char const *lp
= line
->buffer
;
261 size_t size
= line
->length
- 1;
264 for (count
= 0; count
< skip_fields
&& i
< size
; count
++)
266 while (i
< size
&& field_sep (lp
[i
]))
268 while (i
< size
&& !field_sep (lp
[i
]))
272 i
+= MIN (skip_chars
, size
- i
);
274 return line
->buffer
+ i
;
277 /* Return false if two strings OLD and NEW match, true if not.
278 OLD and NEW point not to the beginnings of the lines
279 but rather to the beginnings of the fields to compare.
280 OLDLEN and NEWLEN are their lengths. */
283 different (char *old
, char *new, size_t oldlen
, size_t newlen
)
285 if (check_chars
< oldlen
)
286 oldlen
= check_chars
;
287 if (check_chars
< newlen
)
288 newlen
= check_chars
;
291 return oldlen
!= newlen
|| memcasecmp (old
, new, oldlen
);
293 return oldlen
!= newlen
|| memcmp (old
, new, oldlen
);
296 /* Output the line in linebuffer LINE to standard output
297 provided that the switches say it should be output.
298 MATCH is true if the line matches the previous line.
299 If requested, print the number of times it occurred, as well;
300 LINECOUNT + 1 is the number of times that the line occurred. */
303 writeline (struct linebuffer
const *line
,
304 bool match
, uintmax_t linecount
)
306 if (! (linecount
== 0 ? output_unique
307 : !match
? output_first_repeated
308 : output_later_repeated
))
311 if (countmode
== count_occurrences
)
312 printf ("%7" PRIuMAX
" ", linecount
+ 1);
314 fwrite (line
->buffer
, sizeof (char), line
->length
, stdout
);
317 /* Process input file INFILE with output to OUTFILE.
318 If either is "-", use the standard I/O stream for it instead. */
321 check_file (char const *infile
, char const *outfile
, char delimiter
)
323 struct linebuffer lb1
, lb2
;
324 struct linebuffer
*thisline
, *prevline
;
326 if (! (STREQ (infile
, "-") || freopen (infile
, "r", stdin
)))
327 die (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
328 if (! (STREQ (outfile
, "-") || freopen (outfile
, "w", stdout
)))
329 die (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
331 fadvise (stdin
, FADVISE_SEQUENTIAL
);
336 initbuffer (thisline
);
337 initbuffer (prevline
);
339 /* The duplication in the following 'if' and 'else' blocks is an
340 optimization that separates the cases where input lines can be
341 printed immediately (1. & 2.) from those where they cannot.
343 1. --group => all input lines are printed.
344 checking for unique/duplicated lines is used only for printing
347 2. The default case in which none of these options has been specified:
348 --count, --repeated, --all-repeated, --unique
349 In the default case, this optimization lets uniq output each different
350 line right away, without waiting to see if the next one is different.
354 if (output_unique
&& output_first_repeated
&& countmode
== count_none
)
356 char *prevfield
= nullptr;
358 bool first_group_printed
= false;
360 while (!feof (stdin
))
366 if (readlinebuffer_delim (thisline
, stdin
, delimiter
) == 0)
369 thisfield
= find_field (thisline
);
370 thislen
= thisline
->length
- 1 - (thisfield
- thisline
->buffer
);
372 new_group
= (!prevfield
373 || different (thisfield
, prevfield
, thislen
, prevlen
));
375 if (new_group
&& grouping
!= GM_NONE
376 && (grouping
== GM_PREPEND
|| grouping
== GM_BOTH
377 || (first_group_printed
&& (grouping
== GM_APPEND
378 || grouping
== GM_SEPARATE
))))
381 if (new_group
|| grouping
!= GM_NONE
)
383 fwrite (thisline
->buffer
, sizeof (char),
384 thisline
->length
, stdout
);
386 SWAP_LINES (prevline
, thisline
);
387 prevfield
= thisfield
;
389 first_group_printed
= true;
392 if ((grouping
== GM_BOTH
|| grouping
== GM_APPEND
) && first_group_printed
)
399 uintmax_t match_count
= 0;
400 bool first_delimiter
= true;
402 if (readlinebuffer_delim (prevline
, stdin
, delimiter
) == 0)
404 prevfield
= find_field (prevline
);
405 prevlen
= prevline
->length
- 1 - (prevfield
- prevline
->buffer
);
407 while (!feof (stdin
))
412 if (readlinebuffer_delim (thisline
, stdin
, delimiter
) == 0)
418 thisfield
= find_field (thisline
);
419 thislen
= thisline
->length
- 1 - (thisfield
- thisline
->buffer
);
420 match
= !different (thisfield
, prevfield
, thislen
, prevlen
);
421 match_count
+= match
;
423 if (match_count
== UINTMAX_MAX
)
425 if (count_occurrences
)
426 die (EXIT_FAILURE
, 0, _("too many repeated lines"));
430 if (delimit_groups
!= DM_NONE
)
434 if (match_count
) /* a previous match */
435 first_delimiter
= false; /* Only used when DM_SEPARATE */
437 else if (match_count
== 1)
439 if ((delimit_groups
== DM_PREPEND
)
440 || (delimit_groups
== DM_SEPARATE
441 && !first_delimiter
))
446 if (!match
|| output_later_repeated
)
448 writeline (prevline
, match
, match_count
);
449 SWAP_LINES (prevline
, thisline
);
450 prevfield
= thisfield
;
457 writeline (prevline
, false, match_count
);
461 if (ferror (stdin
) || fclose (stdin
) != 0)
462 die (EXIT_FAILURE
, errno
, _("error reading %s"), quoteaf (infile
));
464 /* stdout is handled via the atexit-invoked close_stdout function. */
470 enum Skip_field_option_type
478 main (int argc
, char **argv
)
481 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != nullptr);
482 enum Skip_field_option_type skip_field_option_type
= SFO_NONE
;
483 unsigned int nfiles
= 0;
485 char delimiter
= '\n'; /* change with --zero-terminated, -z */
486 bool output_option_used
= false; /* if true, one of -u/-d/-D/-c was used */
488 file
[0] = file
[1] = "-";
489 initialize_main (&argc
, &argv
);
490 set_program_name (argv
[0]);
491 setlocale (LC_ALL
, "");
492 bindtextdomain (PACKAGE
, LOCALEDIR
);
493 textdomain (PACKAGE
);
495 atexit (close_stdout
);
499 check_chars
= SIZE_MAX
;
500 output_unique
= output_first_repeated
= true;
501 output_later_repeated
= false;
502 countmode
= count_none
;
503 delimit_groups
= DM_NONE
;
507 /* Parse an operand with leading "+" as a file after "--" was
508 seen; or if pedantic and a file was seen; or if not
512 || (posixly_correct
&& nfiles
!= 0)
513 || ((optc
= getopt_long (argc
, argv
,
514 "-0123456789Dcdf:is:uw:z",
522 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
523 usage (EXIT_FAILURE
);
525 file
[nfiles
++] = argv
[optind
++];
533 && ! strict_posix2 ()
534 && xstrtoumax (optarg
, nullptr, 10, &size
, "") == LONGINT_OK
537 else if (nfiles
== 2)
539 error (0, 0, _("extra operand %s"), quote (optarg
));
540 usage (EXIT_FAILURE
);
543 file
[nfiles
++] = optarg
;
558 if (skip_field_option_type
== SFO_NEW
)
561 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields
, optc
- '0', size_t))
562 skip_fields
= SIZE_MAX
;
564 skip_field_option_type
= SFO_OBSOLETE
;
569 countmode
= count_occurrences
;
570 output_option_used
= true;
574 output_unique
= false;
575 output_option_used
= true;
579 output_unique
= false;
580 output_later_repeated
= true;
581 if (optarg
== nullptr)
582 delimit_groups
= DM_NONE
;
584 delimit_groups
= XARGMATCH ("--all-repeated", optarg
,
585 delimit_method_string
,
587 output_option_used
= true;
591 if (optarg
== nullptr)
592 grouping
= GM_SEPARATE
;
594 grouping
= XARGMATCH ("--group", optarg
,
595 grouping_method_string
,
596 grouping_method_map
);
600 skip_field_option_type
= SFO_NEW
;
601 skip_fields
= size_opt (optarg
,
602 N_("invalid number of fields to skip"));
610 skip_chars
= size_opt (optarg
,
611 N_("invalid number of bytes to skip"));
615 output_first_repeated
= false;
616 output_option_used
= true;
620 check_chars
= size_opt (optarg
,
621 N_("invalid number of bytes to compare"));
628 case_GETOPT_HELP_CHAR
;
630 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
633 usage (EXIT_FAILURE
);
637 /* Note we could allow --group with -D at least, and that would
638 avoid the need to specify a grouping method to --all-repeated.
639 It was thought best to avoid deprecating those parameters though
640 and keep --group separate from other options. */
641 if (grouping
!= GM_NONE
&& output_option_used
)
643 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
644 usage (EXIT_FAILURE
);
647 if (grouping
!= GM_NONE
&& countmode
!= count_none
)
650 _("grouping and printing repeat counts is meaningless"));
651 usage (EXIT_FAILURE
);
654 if (countmode
== count_occurrences
&& output_later_repeated
)
657 _("printing all duplicated lines and repeat counts is meaningless"));
658 usage (EXIT_FAILURE
);
661 check_file (file
[0], file
[1], delimiter
);