1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2023 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
22 #include <sys/types.h>
26 #include "linebuffer.h"
31 #include "memcasecmp.h"
34 /* The official name of this program (e.g., no 'g' prefix). */
35 #define PROGRAM_NAME "uniq"
38 proper_name ("Richard M. Stallman"), \
39 proper_name ("David MacKenzie")
41 #define SWAP_LINES(A, B) \
44 struct linebuffer *_tmp; \
51 /* Number of fields to skip on each line when doing comparisons. */
52 static size_t skip_fields
;
54 /* Number of chars to skip after skipping any fields. */
55 static size_t skip_chars
;
57 /* Number of chars to compare. */
58 static size_t check_chars
;
62 count_occurrences
, /* -c Print count before output lines. */
63 count_none
/* Default. Do not print counts. */
66 /* Whether and how to precede the output lines with a count of the number of
67 times they occurred in the input. */
68 static enum countmode countmode
;
70 /* Which lines to output: unique lines, the first of a group of
71 repeated lines, and the second and subsequent of a group of
73 static bool output_unique
;
74 static bool output_first_repeated
;
75 static bool output_later_repeated
;
77 /* If true, ignore case when comparing. */
78 static bool ignore_case
;
82 /* No delimiters output. --all-repeated[=none] */
85 /* Delimiter precedes all groups. --all-repeated=prepend */
88 /* Delimit all groups. --all-repeated=separate */
92 static char const *const delimit_method_string
[] =
94 "none", "prepend", "separate", nullptr
97 static enum delimit_method
const delimit_method_map
[] =
99 DM_NONE
, DM_PREPEND
, DM_SEPARATE
102 /* Select whether/how to delimit groups of duplicate lines. */
103 static enum delimit_method delimit_groups
;
107 /* No grouping, when "--group" isn't used */
110 /* Delimiter precedes all groups. --group=prepend */
113 /* Delimiter follows all groups. --group=append */
116 /* Delimiter between groups. --group[=separate] */
119 /* Delimiter before and after each group. --group=both */
123 static char const *const grouping_method_string
[] =
125 "prepend", "append", "separate", "both", nullptr
128 static enum grouping_method
const grouping_method_map
[] =
130 GM_PREPEND
, GM_APPEND
, GM_SEPARATE
, GM_BOTH
133 static enum grouping_method grouping
= GM_NONE
;
137 GROUP_OPTION
= CHAR_MAX
+ 1
140 static struct option
const longopts
[] =
142 {"count", no_argument
, nullptr, 'c'},
143 {"repeated", no_argument
, nullptr, 'd'},
144 {"all-repeated", optional_argument
, nullptr, 'D'},
145 {"group", optional_argument
, nullptr, GROUP_OPTION
},
146 {"ignore-case", no_argument
, nullptr, 'i'},
147 {"unique", no_argument
, nullptr, 'u'},
148 {"skip-fields", required_argument
, nullptr, 'f'},
149 {"skip-chars", required_argument
, nullptr, 's'},
150 {"check-chars", required_argument
, nullptr, 'w'},
151 {"zero-terminated", no_argument
, nullptr, 'z'},
152 {GETOPT_HELP_OPTION_DECL
},
153 {GETOPT_VERSION_OPTION_DECL
},
154 {nullptr, 0, nullptr, 0}
160 if (status
!= EXIT_SUCCESS
)
165 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
169 Filter adjacent matching lines from INPUT (or standard input),\n\
170 writing to OUTPUT (or standard output).\n\
172 With no options, matching lines are merged to the first occurrence.\n\
175 emit_mandatory_arg_note ();
178 -c, --count prefix lines by the number of occurrences\n\
179 -d, --repeated only print duplicate lines, one for each group\n\
182 -D print all duplicate lines\n\
183 --all-repeated[=METHOD] like -D, but allow separating groups\n\
184 with an empty line;\n\
185 METHOD={none(default),prepend,separate}\n\
188 -f, --skip-fields=N avoid comparing the first N fields\n\
191 --group[=METHOD] show all items, separating groups with an empty line;\n\
192 METHOD={separate(default),prepend,append,both}\n\
195 -i, --ignore-case ignore differences in case when comparing\n\
196 -s, --skip-chars=N avoid comparing the first N characters\n\
197 -u, --unique only print unique lines\n\
200 -z, --zero-terminated line delimiter is NUL, not newline\n\
203 -w, --check-chars=N compare no more than N characters in lines\n\
205 fputs (HELP_OPTION_DESCRIPTION
, stdout
);
206 fputs (VERSION_OPTION_DESCRIPTION
, stdout
);
209 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
210 characters. Fields are skipped before chars.\n\
214 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
215 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
217 emit_ancillary_info (PROGRAM_NAME
);
225 int posix_ver
= posix2_version ();
226 return 200112 <= posix_ver
&& posix_ver
< 200809;
229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
230 invalid. Silently convert too-large values to SIZE_MAX. */
233 size_opt (char const *opt
, char const *msgid
)
237 switch (xstrtoumax (opt
, nullptr, 10, &size
, ""))
240 case LONGINT_OVERFLOW
:
244 error (EXIT_FAILURE
, 0, "%s: %s", opt
, _(msgid
));
247 return MIN (size
, SIZE_MAX
);
250 /* Given a linebuffer LINE,
251 return a pointer to the beginning of the line's field to be compared. */
255 find_field (struct linebuffer
const *line
)
258 char const *lp
= line
->buffer
;
259 size_t size
= line
->length
- 1;
262 for (count
= 0; count
< skip_fields
&& i
< size
; count
++)
264 while (i
< size
&& field_sep (lp
[i
]))
266 while (i
< size
&& !field_sep (lp
[i
]))
270 i
+= MIN (skip_chars
, size
- i
);
272 return line
->buffer
+ i
;
275 /* Return false if two strings OLD and NEW match, true if not.
276 OLD and NEW point not to the beginnings of the lines
277 but rather to the beginnings of the fields to compare.
278 OLDLEN and NEWLEN are their lengths. */
281 different (char *old
, char *new, size_t oldlen
, size_t newlen
)
283 if (check_chars
< oldlen
)
284 oldlen
= check_chars
;
285 if (check_chars
< newlen
)
286 newlen
= check_chars
;
289 return oldlen
!= newlen
|| memcasecmp (old
, new, oldlen
);
291 return oldlen
!= newlen
|| memcmp (old
, new, oldlen
);
294 /* Output the line in linebuffer LINE to standard output
295 provided that the switches say it should be output.
296 MATCH is true if the line matches the previous line.
297 If requested, print the number of times it occurred, as well;
298 LINECOUNT + 1 is the number of times that the line occurred. */
301 writeline (struct linebuffer
const *line
,
302 bool match
, uintmax_t linecount
)
304 if (! (linecount
== 0 ? output_unique
305 : !match
? output_first_repeated
306 : output_later_repeated
))
309 if (countmode
== count_occurrences
)
310 printf ("%7" PRIuMAX
" ", linecount
+ 1);
312 fwrite (line
->buffer
, sizeof (char), line
->length
, stdout
);
315 /* Process input file INFILE with output to OUTFILE.
316 If either is "-", use the standard I/O stream for it instead. */
319 check_file (char const *infile
, char const *outfile
, char delimiter
)
321 struct linebuffer lb1
, lb2
;
322 struct linebuffer
*thisline
, *prevline
;
324 if (! (STREQ (infile
, "-") || freopen (infile
, "r", stdin
)))
325 error (EXIT_FAILURE
, errno
, "%s", quotef (infile
));
326 if (! (STREQ (outfile
, "-") || freopen (outfile
, "w", stdout
)))
327 error (EXIT_FAILURE
, errno
, "%s", quotef (outfile
));
329 fadvise (stdin
, FADVISE_SEQUENTIAL
);
334 initbuffer (thisline
);
335 initbuffer (prevline
);
337 /* The duplication in the following 'if' and 'else' blocks is an
338 optimization to distinguish between when we can print input
339 lines immediately (1. & 2.) or not.
341 1. --group => all input lines are printed.
342 checking for unique/duplicated lines is used only for printing
345 2. The default case in which none of these options has been specified:
346 --count, --repeated, --all-repeated, --unique
347 In the default case, this optimization lets uniq output each different
348 line right away, without waiting to see if the next one is different.
352 if (output_unique
&& output_first_repeated
&& countmode
== count_none
)
354 char *prevfield
= nullptr;
356 bool first_group_printed
= false;
358 while (!feof (stdin
))
364 if (readlinebuffer_delim (thisline
, stdin
, delimiter
) == 0)
367 thisfield
= find_field (thisline
);
368 thislen
= thisline
->length
- 1 - (thisfield
- thisline
->buffer
);
370 new_group
= (!prevfield
371 || different (thisfield
, prevfield
, thislen
, prevlen
));
373 if (new_group
&& grouping
!= GM_NONE
374 && (grouping
== GM_PREPEND
|| grouping
== GM_BOTH
375 || (first_group_printed
&& (grouping
== GM_APPEND
376 || grouping
== GM_SEPARATE
))))
379 if (new_group
|| grouping
!= GM_NONE
)
381 fwrite (thisline
->buffer
, sizeof (char),
382 thisline
->length
, stdout
);
384 SWAP_LINES (prevline
, thisline
);
385 prevfield
= thisfield
;
387 first_group_printed
= true;
390 if ((grouping
== GM_BOTH
|| grouping
== GM_APPEND
) && first_group_printed
)
397 uintmax_t match_count
= 0;
398 bool first_delimiter
= true;
400 if (readlinebuffer_delim (prevline
, stdin
, delimiter
) == 0)
402 prevfield
= find_field (prevline
);
403 prevlen
= prevline
->length
- 1 - (prevfield
- prevline
->buffer
);
405 while (!feof (stdin
))
410 if (readlinebuffer_delim (thisline
, stdin
, delimiter
) == 0)
416 thisfield
= find_field (thisline
);
417 thislen
= thisline
->length
- 1 - (thisfield
- thisline
->buffer
);
418 match
= !different (thisfield
, prevfield
, thislen
, prevlen
);
419 match_count
+= match
;
421 if (match_count
== UINTMAX_MAX
)
423 if (count_occurrences
)
424 error (EXIT_FAILURE
, 0, _("too many repeated lines"));
428 if (delimit_groups
!= DM_NONE
)
432 if (match_count
) /* a previous match */
433 first_delimiter
= false; /* Only used when DM_SEPARATE */
435 else if (match_count
== 1)
437 if ((delimit_groups
== DM_PREPEND
)
438 || (delimit_groups
== DM_SEPARATE
439 && !first_delimiter
))
444 if (!match
|| output_later_repeated
)
446 writeline (prevline
, match
, match_count
);
447 SWAP_LINES (prevline
, thisline
);
448 prevfield
= thisfield
;
455 writeline (prevline
, false, match_count
);
459 if (ferror (stdin
) || fclose (stdin
) != 0)
460 error (EXIT_FAILURE
, errno
, _("error reading %s"), quoteaf (infile
));
462 /* stdout is handled via the atexit-invoked close_stdout function. */
468 enum Skip_field_option_type
476 main (int argc
, char **argv
)
479 bool posixly_correct
= (getenv ("POSIXLY_CORRECT") != nullptr);
480 enum Skip_field_option_type skip_field_option_type
= SFO_NONE
;
481 unsigned int nfiles
= 0;
483 char delimiter
= '\n'; /* change with --zero-terminated, -z */
484 bool output_option_used
= false; /* if true, one of -u/-d/-D/-c was used */
486 file
[0] = file
[1] = "-";
487 initialize_main (&argc
, &argv
);
488 set_program_name (argv
[0]);
489 setlocale (LC_ALL
, "");
490 bindtextdomain (PACKAGE
, LOCALEDIR
);
491 textdomain (PACKAGE
);
493 atexit (close_stdout
);
497 check_chars
= SIZE_MAX
;
498 output_unique
= output_first_repeated
= true;
499 output_later_repeated
= false;
500 countmode
= count_none
;
501 delimit_groups
= DM_NONE
;
505 /* Parse an operand with leading "+" as a file after "--" was
506 seen; or if pedantic and a file was seen; or if not
510 || (posixly_correct
&& nfiles
!= 0)
511 || ((optc
= getopt_long (argc
, argv
,
512 "-0123456789Dcdf:is:uw:z",
520 error (0, 0, _("extra operand %s"), quote (argv
[optind
]));
521 usage (EXIT_FAILURE
);
523 file
[nfiles
++] = argv
[optind
++];
531 && ! strict_posix2 ()
532 && xstrtoumax (optarg
, nullptr, 10, &size
, "") == LONGINT_OK
535 else if (nfiles
== 2)
537 error (0, 0, _("extra operand %s"), quote (optarg
));
538 usage (EXIT_FAILURE
);
541 file
[nfiles
++] = optarg
;
556 if (skip_field_option_type
== SFO_NEW
)
559 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields
, optc
- '0', size_t))
560 skip_fields
= SIZE_MAX
;
562 skip_field_option_type
= SFO_OBSOLETE
;
567 countmode
= count_occurrences
;
568 output_option_used
= true;
572 output_unique
= false;
573 output_option_used
= true;
577 output_unique
= false;
578 output_later_repeated
= true;
579 if (optarg
== nullptr)
580 delimit_groups
= DM_NONE
;
582 delimit_groups
= XARGMATCH ("--all-repeated", optarg
,
583 delimit_method_string
,
585 output_option_used
= true;
589 if (optarg
== nullptr)
590 grouping
= GM_SEPARATE
;
592 grouping
= XARGMATCH ("--group", optarg
,
593 grouping_method_string
,
594 grouping_method_map
);
598 skip_field_option_type
= SFO_NEW
;
599 skip_fields
= size_opt (optarg
,
600 N_("invalid number of fields to skip"));
608 skip_chars
= size_opt (optarg
,
609 N_("invalid number of bytes to skip"));
613 output_first_repeated
= false;
614 output_option_used
= true;
618 check_chars
= size_opt (optarg
,
619 N_("invalid number of bytes to compare"));
626 case_GETOPT_HELP_CHAR
;
628 case_GETOPT_VERSION_CHAR (PROGRAM_NAME
, AUTHORS
);
631 usage (EXIT_FAILURE
);
635 /* Note we could allow --group with -D at least, and that would
636 avoid the need to specify a grouping method to --all-repeated.
637 It was thought best to avoid deprecating those parameters though
638 and keep --group separate to other options. */
639 if (grouping
!= GM_NONE
&& output_option_used
)
641 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
642 usage (EXIT_FAILURE
);
645 if (grouping
!= GM_NONE
&& countmode
!= count_none
)
648 _("grouping and printing repeat counts is meaningless"));
649 usage (EXIT_FAILURE
);
652 if (countmode
== count_occurrences
&& output_later_repeated
)
655 _("printing all duplicated lines and repeat counts is meaningless"));
656 usage (EXIT_FAILURE
);
659 check_file (file
[0], file
[1], delimiter
);