doc: add NEWS entries for recent changes
[coreutils.git] / src / uniq.c
blob87a0c9301a3b974f47c88bf69ffdd2d22ca175bd
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2016 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <getopt.h>
22 #include <sys/types.h>
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "die.h"
28 #include "error.h"
29 #include "fadvise.h"
30 #include "hard-locale.h"
31 #include "posixver.h"
32 #include "stdio--.h"
33 #include "xmemcoll.h"
34 #include "xstrtol.h"
35 #include "memcasecmp.h"
36 #include "quote.h"
38 /* The official name of this program (e.g., no 'g' prefix). */
39 #define PROGRAM_NAME "uniq"
41 #define AUTHORS \
42 proper_name ("Richard M. Stallman"), \
43 proper_name ("David MacKenzie")
/* Exchange the two 'struct linebuffer *' lvalues A and B through a
   temporary.  Only the pointers are swapped; the buffers they point to
   are not copied.  Wrapped in do/while (0) so that it can be used as a
   single statement. */
45 #define SWAP_LINES(A, B) \
46 do \
47 { \
48 struct linebuffer *_tmp; \
49 _tmp = (A); \
50 (A) = (B); \
51 (B) = _tmp; \
52 } \
53 while (0)
55 /* True if the LC_COLLATE locale is hard. */
56 static bool hard_LC_COLLATE;
58 /* Number of fields to skip on each line when doing comparisons. */
59 static size_t skip_fields;
61 /* Number of chars to skip after skipping any fields. */
62 static size_t skip_chars;
64 /* Number of chars to compare. */
65 static size_t check_chars;
/* Whether a repeat count is printed in front of each output line.
   Note that count_occurrences is the first enumerator and therefore has
   the value 0: always compare a variable of this type against it rather
   than testing the enumerator itself. */
67 enum countmode
69 count_occurrences, /* -c Print count before output lines. */
70 count_none /* Default. Do not print counts. */
73 /* Whether and how to precede the output lines with a count of the number of
74 times they occurred in the input. */
75 static enum countmode countmode;
77 /* Which lines to output: unique lines, the first of a group of
78 repeated lines, and the second and subsequent lines of a group of
79 repeated lines. */
80 static bool output_unique;
81 static bool output_first_repeated;
82 static bool output_later_repeated;
84 /* If true, ignore case when comparing. */
85 static bool ignore_case;
/* How groups of duplicate lines are delimited in the output when
   --all-repeated[=METHOD] is in effect. */
87 enum delimit_method
89 /* No delimiters output. --all-repeated[=none] */
90 DM_NONE,
92 /* Delimiter precedes all groups. --all-repeated=prepend */
93 DM_PREPEND,
95 /* Delimit all groups. --all-repeated=separate */
96 DM_SEPARATE
/* METHOD spellings accepted by --all-repeated=METHOD (NULL-terminated),
   followed by the enum values listed in the same order.  main passes
   both arrays together to XARGMATCH to translate the option argument. */
99 static char const *const delimit_method_string[] =
101 "none", "prepend", "separate", NULL
104 static enum delimit_method const delimit_method_map[] =
106 DM_NONE, DM_PREPEND, DM_SEPARATE
109 /* Select whether/how to delimit groups of duplicate lines. */
110 static enum delimit_method delimit_groups;
/* Where the group delimiter is written when --group[=METHOD] is in
   effect. */
112 enum grouping_method
114 /* No grouping, when "--group" isn't used */
115 GM_NONE,
117 /* Delimiter precedes all groups. --group=prepend */
118 GM_PREPEND,
120 /* Delimiter follows all groups. --group=append */
121 GM_APPEND,
123 /* Delimiter between groups. --group[=separate] */
124 GM_SEPARATE,
126 /* Delimiter before and after each group. --group=both */
127 GM_BOTH
/* METHOD spellings accepted by --group=METHOD (NULL-terminated),
   followed by the enum values listed in the same order.  main passes
   both arrays together to XARGMATCH to translate the option argument.
   GM_NONE has no spelling: it only means that --group was not given. */
130 static char const *const grouping_method_string[] =
132 "prepend", "append", "separate", "both", NULL
135 static enum grouping_method const grouping_method_map[] =
137 GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
/* The grouping method selected on the command line; GM_NONE unless
   --group is used. */
140 static enum grouping_method grouping = GM_NONE;
/* Value returned by getopt_long for --group, which has no short option.
   CHAR_MAX + 1 cannot collide with any short option character. */
142 enum
144 GROUP_OPTION = CHAR_MAX + 1
/* Long option table handed to getopt_long in main.  Each entry maps a
   long name to the value main's switch dispatches on; the all-zero
   entry terminates the table. */
147 static struct option const longopts[] =
149 {"count", no_argument, NULL, 'c'},
150 {"repeated", no_argument, NULL, 'd'},
151 {"all-repeated", optional_argument, NULL, 'D'},
152 {"group", optional_argument, NULL, GROUP_OPTION},
153 {"ignore-case", no_argument, NULL, 'i'},
154 {"unique", no_argument, NULL, 'u'},
155 {"skip-fields", required_argument, NULL, 'f'},
156 {"skip-chars", required_argument, NULL, 's'},
157 {"check-chars", required_argument, NULL, 'w'},
158 {"zero-terminated", no_argument, NULL, 'z'},
159 {GETOPT_HELP_OPTION_DECL},
160 {GETOPT_VERSION_OPTION_DECL},
161 {NULL, 0, NULL, 0}
/* Print usage information and exit with STATUS.
   When STATUS is not EXIT_SUCCESS only the short "try --help" hint is
   emitted (via emit_try_help); otherwise the full option summary is
   written to standard output.  This function does not return: it always
   ends by calling exit (STATUS). */
164 void
165 usage (int status)
167 if (status != EXIT_SUCCESS)
168 emit_try_help ();
169 else
171 printf (_("\
172 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
174 program_name);
175 fputs (_("\
176 Filter adjacent matching lines from INPUT (or standard input),\n\
177 writing to OUTPUT (or standard output).\n\
179 With no options, matching lines are merged to the first occurrence.\n\
180 "), stdout);
182 emit_mandatory_arg_note ();
184 fputs (_("\
185 -c, --count prefix lines by the number of occurrences\n\
186 -d, --repeated only print duplicate lines, one for each group\n\
187 "), stdout);
188 fputs (_("\
189 -D print all duplicate lines\n\
190 --all-repeated[=METHOD] like -D, but allow separating groups\n\
191 with an empty line;\n\
192 METHOD={none(default),prepend,separate}\n\
193 "), stdout);
194 fputs (_("\
195 -f, --skip-fields=N avoid comparing the first N fields\n\
196 "), stdout);
197 fputs (_("\
198 --group[=METHOD] show all items, separating groups with an empty line;\n\
199 METHOD={separate(default),prepend,append,both}\n\
200 "), stdout);
201 fputs (_("\
202 -i, --ignore-case ignore differences in case when comparing\n\
203 -s, --skip-chars=N avoid comparing the first N characters\n\
204 -u, --unique only print unique lines\n\
205 "), stdout);
206 fputs (_("\
207 -z, --zero-terminated line delimiter is NUL, not newline\n\
208 "), stdout);
209 fputs (_("\
210 -w, --check-chars=N compare no more than N characters in lines\n\
211 "), stdout);
212 fputs (HELP_OPTION_DESCRIPTION, stdout);
213 fputs (VERSION_OPTION_DESCRIPTION, stdout);
214 fputs (_("\
216 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
217 characters. Fields are skipped before chars.\n\
218 "), stdout);
219 fputs (_("\
221 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
222 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
223 Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
224 "), stdout);
225 emit_ancillary_info (PROGRAM_NAME);
227 exit (status);
/* Return true if the POSIX version reported by posix2_version lies in
   the half-open range [200112, 200809).  main uses this to decide
   whether the obsolete "+N" operand syntax is still accepted. */

static bool
strict_posix2 (void)
{
  int const version = posix2_version ();

  if (version < 200112)
    return false;
  return version < 200809;
}
237 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
238 invalid. Silently convert too-large values to SIZE_MAX. */
240 static size_t
241 size_opt (char const *opt, char const *msgid)
243 unsigned long int size;
244 verify (SIZE_MAX <= ULONG_MAX);
246 switch (xstrtoul (opt, NULL, 10, &size, ""))
248 case LONGINT_OK:
249 case LONGINT_OVERFLOW:
250 break;
252 default:
253 die (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
256 return MIN (size, SIZE_MAX);
259 /* Given a linebuffer LINE,
260 return a pointer to the beginning of the line's field to be compared. */
262 static char * _GL_ATTRIBUTE_PURE
263 find_field (struct linebuffer const *line)
265 size_t count;
266 char const *lp = line->buffer;
267 size_t size = line->length - 1;
268 size_t i = 0;
270 for (count = 0; count < skip_fields && i < size; count++)
272 while (i < size && field_sep (lp[i]))
273 i++;
274 while (i < size && !field_sep (lp[i]))
275 i++;
278 i += MIN (skip_chars, size - i);
280 return line->buffer + i;
283 /* Return false if two strings OLD and NEW match, true if not.
284 OLD and NEW point not to the beginnings of the lines
285 but rather to the beginnings of the fields to compare.
286 OLDLEN and NEWLEN are their lengths. */
288 static bool
289 different (char *old, char *new, size_t oldlen, size_t newlen)
291 if (check_chars < oldlen)
292 oldlen = check_chars;
293 if (check_chars < newlen)
294 newlen = check_chars;
296 if (ignore_case)
298 /* FIXME: This should invoke strcoll somehow. */
299 return oldlen != newlen || memcasecmp (old, new, oldlen);
301 else if (hard_LC_COLLATE)
302 return xmemcoll (old, oldlen, new, newlen) != 0;
303 else
304 return oldlen != newlen || memcmp (old, new, oldlen);
307 /* Output the line in linebuffer LINE to standard output
308 provided that the switches say it should be output.
309 MATCH is true if the line matches the previous line.
310 If requested, print the number of times it occurred, as well;
311 LINECOUNT + 1 is the number of times that the line occurred. */
313 static void
314 writeline (struct linebuffer const *line,
315 bool match, uintmax_t linecount)
317 if (! (linecount == 0 ? output_unique
318 : !match ? output_first_repeated
319 : output_later_repeated))
320 return;
322 if (countmode == count_occurrences)
323 printf ("%7" PRIuMAX " ", linecount + 1);
325 fwrite (line->buffer, sizeof (char), line->length, stdout);
328 /* Process input file INFILE with output to OUTFILE.
329 If either is "-", use the standard I/O stream for it instead. */
331 static void
332 check_file (const char *infile, const char *outfile, char delimiter)
334 struct linebuffer lb1, lb2;
335 struct linebuffer *thisline, *prevline;
337 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
338 die (EXIT_FAILURE, errno, "%s", quotef (infile));
339 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
340 die (EXIT_FAILURE, errno, "%s", quotef (outfile));
342 fadvise (stdin, FADVISE_SEQUENTIAL);
344 thisline = &lb1;
345 prevline = &lb2;
347 initbuffer (thisline);
348 initbuffer (prevline);
350 /* The duplication in the following 'if' and 'else' blocks is an
351 optimization to distinguish between when we can print input
352 lines immediately (1. & 2.) or not.
354 1. --group => all input lines are printed.
355 checking for unique/duplicated lines is used only for printing
356 group separators.
358 2. The default case in which none of these options has been specified:
359 --count, --repeated, --all-repeated, --unique
360 In the default case, this optimization lets uniq output each different
361 line right away, without waiting to see if the next one is different.
363 3. All other cases.
365 if (output_unique && output_first_repeated && countmode == count_none)
367 char *prevfield IF_LINT ( = NULL);
368 size_t prevlen IF_LINT ( = 0);
369 bool first_group_printed = false;
371 while (!feof (stdin))
373 char *thisfield;
374 size_t thislen;
375 bool new_group;
377 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
378 break;
380 thisfield = find_field (thisline);
381 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
383 new_group = (prevline->length == 0
384 || different (thisfield, prevfield, thislen, prevlen));
386 if (new_group && grouping != GM_NONE
387 && (grouping == GM_PREPEND || grouping == GM_BOTH
388 || (first_group_printed && (grouping == GM_APPEND
389 || grouping == GM_SEPARATE))))
390 putchar (delimiter);
392 if (new_group || grouping != GM_NONE)
394 fwrite (thisline->buffer, sizeof (char),
395 thisline->length, stdout);
397 SWAP_LINES (prevline, thisline);
398 prevfield = thisfield;
399 prevlen = thislen;
400 first_group_printed = true;
403 if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
404 putchar (delimiter);
406 else
408 char *prevfield;
409 size_t prevlen;
410 uintmax_t match_count = 0;
411 bool first_delimiter = true;
413 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
414 goto closefiles;
415 prevfield = find_field (prevline);
416 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
418 while (!feof (stdin))
420 bool match;
421 char *thisfield;
422 size_t thislen;
423 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
425 if (ferror (stdin))
426 goto closefiles;
427 break;
429 thisfield = find_field (thisline);
430 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
431 match = !different (thisfield, prevfield, thislen, prevlen);
432 match_count += match;
434 if (match_count == UINTMAX_MAX)
436 if (count_occurrences)
437 die (EXIT_FAILURE, 0, _("too many repeated lines"));
438 match_count--;
441 if (delimit_groups != DM_NONE)
443 if (!match)
445 if (match_count) /* a previous match */
446 first_delimiter = false; /* Only used when DM_SEPARATE */
448 else if (match_count == 1)
450 if ((delimit_groups == DM_PREPEND)
451 || (delimit_groups == DM_SEPARATE
452 && !first_delimiter))
453 putchar (delimiter);
457 if (!match || output_later_repeated)
459 writeline (prevline, match, match_count);
460 SWAP_LINES (prevline, thisline);
461 prevfield = thisfield;
462 prevlen = thislen;
463 if (!match)
464 match_count = 0;
468 writeline (prevline, false, match_count);
471 closefiles:
472 if (ferror (stdin) || fclose (stdin) != 0)
473 die (EXIT_FAILURE, 0, _("error reading %s"), quoteaf (infile));
475 /* stdout is handled via the atexit-invoked close_stdout function. */
477 free (lb1.buffer);
478 free (lb2.buffer);
/* Which syntax most recently set skip_fields while main parses the
   command line: none yet, the obsolete "-N" digit form, or the
   "-f N" / "--skip-fields=N" form.  main consults it so that digits
   following a -f option start a fresh number instead of being appended
   to the value -f supplied. */
481 enum Skip_field_option_type
483 SFO_NONE,
484 SFO_OBSOLETE,
485 SFO_NEW
/* Program entry point.  Parse the options and up to two file operands
   (INPUT then OUTPUT, both defaulting to "-"), reject option
   combinations that make no sense together, then filter the input with
   check_file.  Returns EXIT_SUCCESS; failures exit through usage or
   die. */
489 main (int argc, char **argv)
491 int optc = 0;
492 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
493 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
494 unsigned int nfiles = 0;
495 char const *file[2];
496 char delimiter = '\n'; /* change with --zero-terminated, -z */
497 bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */
499 file[0] = file[1] = "-";
500 initialize_main (&argc, &argv);
501 set_program_name (argv[0]);
502 setlocale (LC_ALL, "");
503 bindtextdomain (PACKAGE, LOCALEDIR);
504 textdomain (PACKAGE);
505 hard_LC_COLLATE = hard_locale (LC_COLLATE);
507 atexit (close_stdout);
/* Defaults: compare whole lines, and print unique lines plus the first
   line of each repeated group, without counts or group delimiters. */
509 skip_chars = 0;
510 skip_fields = 0;
511 check_chars = SIZE_MAX;
512 output_unique = output_first_repeated = true;
513 output_later_repeated = false;
514 countmode = count_none;
515 delimit_groups = DM_NONE;
517 while (true)
519 /* Parse an operand with leading "+" as a file after "--" was
520 seen; or if pedantic and a file was seen; or if not
521 obsolete. */
/* The order of the tests below matters: getopt_long is called only when
   neither of the first two conditions already holds, so once it has
   returned -1 every remaining argument is taken as a file operand. */
523 if (optc == -1
524 || (posixly_correct && nfiles != 0)
525 || ((optc = getopt_long (argc, argv,
526 "-0123456789Dcdf:is:uw:z", longopts, NULL))
527 == -1))
529 if (argc <= optind)
530 break;
531 if (nfiles == 2)
533 error (0, 0, _("extra operand %s"), quote (argv[optind]));
534 usage (EXIT_FAILURE);
536 file[nfiles++] = argv[optind++];
538 else switch (optc)
/* Code 1 is a non-option argument: the option string starts with '-',
   which asks GNU getopt to return operands in order with this code.
   An operand of the form "+N" is the obsolete spelling of
   --skip-chars=N and is honored unless strict_posix2 () holds;
   anything else is a file name. */
540 case 1:
542 unsigned long int size;
543 if (optarg[0] == '+'
544 && ! strict_posix2 ()
545 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
546 && size <= SIZE_MAX)
547 skip_chars = size;
548 else if (nfiles == 2)
550 error (0, 0, _("extra operand %s"), quote (optarg));
551 usage (EXIT_FAILURE);
553 else
554 file[nfiles++] = optarg;
556 break;
/* Obsolete "-N" form of --skip-fields=N, delivered one digit at a time.
   Digits seen right after a -f option start a new number.  When
   DECIMAL_DIGIT_ACCUMULATE reports failure (presumably overflow -- TODO
   confirm against its definition) the value is pinned at SIZE_MAX. */
558 case '0':
559 case '1':
560 case '2':
561 case '3':
562 case '4':
563 case '5':
564 case '6':
565 case '7':
566 case '8':
567 case '9':
569 if (skip_field_option_type == SFO_NEW)
570 skip_fields = 0;
572 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
573 skip_fields = SIZE_MAX;
575 skip_field_option_type = SFO_OBSOLETE;
577 break;
579 case 'c':
580 countmode = count_occurrences;
581 output_option_used = true;
582 break;
584 case 'd':
585 output_unique = false;
586 output_option_used = true;
587 break;
589 case 'D':
590 output_unique = false;
591 output_later_repeated = true;
592 if (optarg == NULL)
593 delimit_groups = DM_NONE;
594 else
595 delimit_groups = XARGMATCH ("--all-repeated", optarg,
596 delimit_method_string,
597 delimit_method_map);
598 output_option_used = true;
599 break;
601 case GROUP_OPTION:
602 if (optarg == NULL)
603 grouping = GM_SEPARATE;
604 else
605 grouping = XARGMATCH ("--group", optarg,
606 grouping_method_string,
607 grouping_method_map);
608 break;
610 case 'f':
611 skip_field_option_type = SFO_NEW;
612 skip_fields = size_opt (optarg,
613 N_("invalid number of fields to skip"));
614 break;
616 case 'i':
617 ignore_case = true;
618 break;
620 case 's':
621 skip_chars = size_opt (optarg,
622 N_("invalid number of bytes to skip"));
623 break;
625 case 'u':
626 output_first_repeated = false;
627 output_option_used = true;
628 break;
630 case 'w':
631 check_chars = size_opt (optarg,
632 N_("invalid number of bytes to compare"));
633 break;
635 case 'z':
636 delimiter = '\0';
637 break;
639 case_GETOPT_HELP_CHAR;
641 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
643 default:
644 usage (EXIT_FAILURE);
648 /* Note we could allow --group with -D at least, and that would
649 avoid the need to specify a grouping method to --all-repeated.
650 It was thought best to avoid deprecating those parameters though
651 and keep --group separate to other options. */
652 if (grouping != GM_NONE && output_option_used)
654 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
655 usage (EXIT_FAILURE);
/* NOTE(review): this check looks unreachable.  countmode only changes
   in the -c case, which also sets output_option_used, so the check
   above has already called usage (which exits) whenever this condition
   would hold. */
658 if (grouping != GM_NONE && countmode != count_none)
660 error (0, 0,
661 _("grouping and printing repeat counts is meaningless"));
662 usage (EXIT_FAILURE);
665 if (countmode == count_occurrences && output_later_repeated)
667 error (0, 0,
668 _("printing all duplicated lines and repeat counts is meaningless"));
669 usage (EXIT_FAILURE);
672 check_file (file[0], file[1], delimiter);
674 return EXIT_SUCCESS;