shred: increase I/O block size for periodic pattern case
[coreutils.git] / src / uniq.c
blob530f7164184dda1ca40b3b549811e97096b62855
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 1986-2013 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <getopt.h>
22 #include <sys/types.h>
24 #include "system.h"
25 #include "argmatch.h"
26 #include "linebuffer.h"
27 #include "error.h"
28 #include "fadvise.h"
29 #include "hard-locale.h"
30 #include "posixver.h"
31 #include "quote.h"
32 #include "stdio--.h"
33 #include "xmemcoll.h"
34 #include "xstrtol.h"
35 #include "memcasecmp.h"
/* The official name of this program (e.g., no 'g' prefix).
   main hands it to the --version handler.  */
#define PROGRAM_NAME "uniq"

/* Author list that main hands to the --version handler.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
/* Exchange the two 'struct linebuffer *' lvalues A and B.
   Each argument is expanded twice, so pass only plain lvalues
   without side effects.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *swap_hold_ = (A);      \
      (A) = (B);                                \
      (B) = swap_hold_;                         \
    }                                           \
  while (0)
/* Set in main from hard_locale (LC_COLLATE).  When true, fields are
   compared with xmemcoll instead of memcmp.  */
static bool hard_LC_COLLATE;

/* How many leading fields of each line are skipped before comparing
   (-f, or the obsolete -N form).  */
static size_t skip_fields;

/* How many further characters are skipped once the fields have been
   skipped (-s, or the obsolete +N form).  */
static size_t skip_chars;

/* Upper bound on the number of characters compared (-w).
   main sets it to SIZE_MAX when -w is not given.  */
static size_t check_chars;
/* Whether output lines are prefixed with their occurrence count.
   Note that count_occurrences is the first enumerator and so has the
   value 0: always compare it against the 'countmode' variable, never
   test it on its own as a boolean.  */
enum countmode
{
  count_occurrences,    /* -c: print the count before each output line.  */
  count_none            /* Default: print no counts.  */
};

/* Whether and how to precede the output lines with a count of the
   number of times they occurred in the input.  main starts it at
   count_none; -c switches it to count_occurrences.  */
static enum countmode countmode;
/* Which lines to output.  */

/* Lines that occur exactly once (cleared by -d and -D).  */
static bool output_unique;

/* The first line of a group of repeated lines (cleared by -u).  */
static bool output_first_repeated;

/* The second and subsequent lines of a group of repeated lines
   (set by -D).  */
static bool output_later_repeated;

/* If true, ignore case when comparing (-i).  */
static bool ignore_case;
/* How --all-repeated delimits groups of duplicate lines.  */
enum delimit_method
{
  /* --all-repeated[=none]: no delimiters are output.  */
  DM_NONE,

  /* --all-repeated=prepend: a delimiter precedes all groups.  */
  DM_PREPEND,

  /* --all-repeated=separate: delimiters separate the groups.  */
  DM_SEPARATE
};

/* METHOD spellings accepted by --all-repeated, NULL-terminated.
   Entry I corresponds to delimit_method_map[I].  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};

/* Enumerators matching delimit_method_string, index for index.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
/* How --group delimits groups of matching lines.  */
enum grouping_method
{
  /* No grouping: "--group" was not used.  */
  GM_NONE,

  /* --group=prepend: a delimiter precedes all groups.  */
  GM_PREPEND,

  /* --group=append: a delimiter follows all groups.  */
  GM_APPEND,

  /* --group[=separate]: a delimiter between groups.  */
  GM_SEPARATE,

  /* --group=both: a delimiter before and after each group.  */
  GM_BOTH
};

/* METHOD spellings accepted by --group, NULL-terminated.
   Entry I corresponds to grouping_method_map[I].  */
static char const *const grouping_method_string[] =
{
  "prepend", "append", "separate", "both", NULL
};

/* Enumerators matching grouping_method_string, index for index.  */
static enum grouping_method const grouping_method_map[] =
{
  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
};

/* Grouping method in effect; stays GM_NONE unless --group is given.  */
static enum grouping_method grouping = GM_NONE;
141 enum
143 GROUP_OPTION = CHAR_MAX + 1
146 static struct option const longopts[] =
148 {"count", no_argument, NULL, 'c'},
149 {"repeated", no_argument, NULL, 'd'},
150 {"all-repeated", optional_argument, NULL, 'D'},
151 {"group", optional_argument, NULL, GROUP_OPTION},
152 {"ignore-case", no_argument, NULL, 'i'},
153 {"unique", no_argument, NULL, 'u'},
154 {"skip-fields", required_argument, NULL, 'f'},
155 {"skip-chars", required_argument, NULL, 's'},
156 {"check-chars", required_argument, NULL, 'w'},
157 {"zero-terminated", no_argument, NULL, 'z'},
158 {GETOPT_HELP_OPTION_DECL},
159 {GETOPT_VERSION_OPTION_DECL},
160 {NULL, 0, NULL, 0}
/* Print usage information and exit with STATUS; this function does
   not return.

   When STATUS is not EXIT_SUCCESS, only the short "try --help" hint is
   emitted via emit_try_help.  Otherwise the full help text is written
   to standard output: the synopsis, the description, the
   mandatory-argument note, one entry per option, the standard
   --help/--version entries, and the closing notes, followed by
   emit_ancillary_info.

   NOTE(review): the string literals below look as if they lost their
   column alignment and their blank-line ("\n\") continuation lines
   when this file was extracted; confirm against the upstream file
   before relying on or editing the help text.  */
163 void
164 usage (int status)
166 if (status != EXIT_SUCCESS)
167 emit_try_help ();
168 else
170 printf (_("\
171 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
173 program_name);
174 fputs (_("\
175 Filter adjacent matching lines from INPUT (or standard input),\n\
176 writing to OUTPUT (or standard output).\n\
178 With no options, matching lines are merged to the first occurrence.\n\
179 "), stdout);
181 emit_mandatory_arg_note ();
183 fputs (_("\
184 -c, --count prefix lines by the number of occurrences\n\
185 -d, --repeated only print duplicate lines, one for each group\n\
186 "), stdout);
187 fputs (_("\
188 -D, --all-repeated[=METHOD] print all duplicate lines\n\
189 groups can be delimited with an empty line\n\
190 METHOD={none(default),prepend,separate}\n\
191 "), stdout);
192 fputs (_("\
193 -f, --skip-fields=N avoid comparing the first N fields\n\
194 "), stdout);
195 fputs (_("\
196 --group[=METHOD] show all items, separating groups with an empty line\n\
197 METHOD={separate(default),prepend,append,both}\n\
198 "), stdout);
199 fputs (_("\
200 -i, --ignore-case ignore differences in case when comparing\n\
201 -s, --skip-chars=N avoid comparing the first N characters\n\
202 -u, --unique only print unique lines\n\
203 -z, --zero-terminated end lines with 0 byte, not newline\n\
204 "), stdout);
205 fputs (_("\
206 -w, --check-chars=N compare no more than N characters in lines\n\
207 "), stdout);
208 fputs (HELP_OPTION_DESCRIPTION, stdout);
209 fputs (VERSION_OPTION_DESCRIPTION, stdout);
210 fputs (_("\
212 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
213 characters. Fields are skipped before chars.\n\
214 "), stdout);
215 fputs (_("\
217 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
218 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
219 Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
220 "), stdout);
221 emit_ancillary_info ();
223 exit (status);
226 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
227 invalid. Silently convert too-large values to SIZE_MAX. */
229 static size_t
230 size_opt (char const *opt, char const *msgid)
232 unsigned long int size;
233 verify (SIZE_MAX <= ULONG_MAX);
235 switch (xstrtoul (opt, NULL, 10, &size, ""))
237 case LONGINT_OK:
238 case LONGINT_OVERFLOW:
239 break;
241 default:
242 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
245 return MIN (size, SIZE_MAX);
248 /* Given a linebuffer LINE,
249 return a pointer to the beginning of the line's field to be compared. */
251 static char * _GL_ATTRIBUTE_PURE
252 find_field (struct linebuffer const *line)
254 size_t count;
255 char const *lp = line->buffer;
256 size_t size = line->length - 1;
257 size_t i = 0;
259 for (count = 0; count < skip_fields && i < size; count++)
261 while (i < size && isblank (to_uchar (lp[i])))
262 i++;
263 while (i < size && !isblank (to_uchar (lp[i])))
264 i++;
267 i += MIN (skip_chars, size - i);
269 return line->buffer + i;
272 /* Return false if two strings OLD and NEW match, true if not.
273 OLD and NEW point not to the beginnings of the lines
274 but rather to the beginnings of the fields to compare.
275 OLDLEN and NEWLEN are their lengths. */
277 static bool
278 different (char *old, char *new, size_t oldlen, size_t newlen)
280 if (check_chars < oldlen)
281 oldlen = check_chars;
282 if (check_chars < newlen)
283 newlen = check_chars;
285 if (ignore_case)
287 /* FIXME: This should invoke strcoll somehow. */
288 return oldlen != newlen || memcasecmp (old, new, oldlen);
290 else if (hard_LC_COLLATE)
291 return xmemcoll (old, oldlen, new, newlen) != 0;
292 else
293 return oldlen != newlen || memcmp (old, new, oldlen);
296 /* Output the line in linebuffer LINE to standard output
297 provided that the switches say it should be output.
298 MATCH is true if the line matches the previous line.
299 If requested, print the number of times it occurred, as well;
300 LINECOUNT + 1 is the number of times that the line occurred. */
302 static void
303 writeline (struct linebuffer const *line,
304 bool match, uintmax_t linecount)
306 if (! (linecount == 0 ? output_unique
307 : !match ? output_first_repeated
308 : output_later_repeated))
309 return;
311 if (countmode == count_occurrences)
312 printf ("%7" PRIuMAX " ", linecount + 1);
314 fwrite (line->buffer, sizeof (char), line->length, stdout);
317 /* Process input file INFILE with output to OUTFILE.
318 If either is "-", use the standard I/O stream for it instead. */
320 static void
321 check_file (const char *infile, const char *outfile, char delimiter)
323 struct linebuffer lb1, lb2;
324 struct linebuffer *thisline, *prevline;
326 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
327 error (EXIT_FAILURE, errno, "%s", infile);
328 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
329 error (EXIT_FAILURE, errno, "%s", outfile);
331 fadvise (stdin, FADVISE_SEQUENTIAL);
333 thisline = &lb1;
334 prevline = &lb2;
336 initbuffer (thisline);
337 initbuffer (prevline);
339 /* The duplication in the following 'if' and 'else' blocks is an
340 optimization to distinguish between when we can print input
341 lines immediately (1. & 2.) or not.
343 1. --group => all input lines are printed.
344 checking for unique/duplicated lines is used only for printing
345 group separators.
347 2. The default case in which none of these options has been specified:
348 --count, --repeated, --all-repeated, --unique
349 In the default case, this optimization lets uniq output each different
350 line right away, without waiting to see if the next one is different.
352 3. All other cases.
354 if (output_unique && output_first_repeated && countmode == count_none)
356 char *prevfield IF_LINT ( = NULL);
357 size_t prevlen IF_LINT ( = 0);
358 bool first_group_printed = false;
360 while (!feof (stdin))
362 char *thisfield;
363 size_t thislen;
364 bool new_group;
366 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
367 break;
369 thisfield = find_field (thisline);
370 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
372 new_group = (prevline->length == 0
373 || different (thisfield, prevfield, thislen, prevlen));
375 if (new_group && grouping != GM_NONE
376 && (grouping == GM_PREPEND || grouping == GM_BOTH
377 || (first_group_printed && (grouping == GM_APPEND
378 || grouping == GM_SEPARATE))))
379 putchar (delimiter);
381 if (new_group || grouping != GM_NONE)
383 fwrite (thisline->buffer, sizeof (char),
384 thisline->length, stdout);
386 SWAP_LINES (prevline, thisline);
387 prevfield = thisfield;
388 prevlen = thislen;
389 first_group_printed = true;
392 if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
393 putchar (delimiter);
395 else
397 char *prevfield;
398 size_t prevlen;
399 uintmax_t match_count = 0;
400 bool first_delimiter = true;
402 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
403 goto closefiles;
404 prevfield = find_field (prevline);
405 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
407 while (!feof (stdin))
409 bool match;
410 char *thisfield;
411 size_t thislen;
412 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
414 if (ferror (stdin))
415 goto closefiles;
416 break;
418 thisfield = find_field (thisline);
419 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
420 match = !different (thisfield, prevfield, thislen, prevlen);
421 match_count += match;
423 if (match_count == UINTMAX_MAX)
425 if (count_occurrences)
426 error (EXIT_FAILURE, 0, _("too many repeated lines"));
427 match_count--;
430 if (delimit_groups != DM_NONE)
432 if (!match)
434 if (match_count) /* a previous match */
435 first_delimiter = false; /* Only used when DM_SEPARATE */
437 else if (match_count == 1)
439 if ((delimit_groups == DM_PREPEND)
440 || (delimit_groups == DM_SEPARATE
441 && !first_delimiter))
442 putchar (delimiter);
446 if (!match || output_later_repeated)
448 writeline (prevline, match, match_count);
449 SWAP_LINES (prevline, thisline);
450 prevfield = thisfield;
451 prevlen = thislen;
452 if (!match)
453 match_count = 0;
457 writeline (prevline, false, match_count);
460 closefiles:
461 if (ferror (stdin) || fclose (stdin) != 0)
462 error (EXIT_FAILURE, 0, _("error reading %s"), infile);
464 /* stdout is handled via the atexit-invoked close_stdout function. */
466 free (lb1.buffer);
467 free (lb2.buffer);
/* Which syntax most recently supplied the skip-fields count, so that
   main can tell a fresh obsolete -N from further digits of one.  */
enum Skip_field_option_type
{
  SFO_NONE,             /* No skip-fields option seen yet.  */
  SFO_OBSOLETE,         /* Last given as obsolete digit options (-N).  */
  SFO_NEW               /* Last given as -f N / --skip-fields=N.  */
};
478 main (int argc, char **argv)
480 int optc = 0;
481 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
482 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
483 int nfiles = 0;
484 char const *file[2];
485 char delimiter = '\n'; /* change with --zero-terminated, -z */
486 bool output_option_used = false; /* if true, one of -u/-d/-D/-c was used */
488 file[0] = file[1] = "-";
489 initialize_main (&argc, &argv);
490 set_program_name (argv[0]);
491 setlocale (LC_ALL, "");
492 bindtextdomain (PACKAGE, LOCALEDIR);
493 textdomain (PACKAGE);
494 hard_LC_COLLATE = hard_locale (LC_COLLATE);
496 atexit (close_stdout);
498 skip_chars = 0;
499 skip_fields = 0;
500 check_chars = SIZE_MAX;
501 output_unique = output_first_repeated = true;
502 output_later_repeated = false;
503 countmode = count_none;
504 delimit_groups = DM_NONE;
506 while (true)
508 /* Parse an operand with leading "+" as a file after "--" was
509 seen; or if pedantic and a file was seen; or if not
510 obsolete. */
512 if (optc == -1
513 || (posixly_correct && nfiles != 0)
514 || ((optc = getopt_long (argc, argv,
515 "-0123456789Dcdf:is:uw:z", longopts, NULL))
516 == -1))
518 if (argc <= optind)
519 break;
520 if (nfiles == 2)
522 error (0, 0, _("extra operand %s"), quote (argv[optind]));
523 usage (EXIT_FAILURE);
525 file[nfiles++] = argv[optind++];
527 else switch (optc)
529 case 1:
531 unsigned long int size;
532 if (optarg[0] == '+'
533 && posix2_version () < 200112
534 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
535 && size <= SIZE_MAX)
536 skip_chars = size;
537 else if (nfiles == 2)
539 error (0, 0, _("extra operand %s"), quote (optarg));
540 usage (EXIT_FAILURE);
542 else
543 file[nfiles++] = optarg;
545 break;
547 case '0':
548 case '1':
549 case '2':
550 case '3':
551 case '4':
552 case '5':
553 case '6':
554 case '7':
555 case '8':
556 case '9':
558 if (skip_field_option_type == SFO_NEW)
559 skip_fields = 0;
561 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
562 skip_fields = SIZE_MAX;
564 skip_field_option_type = SFO_OBSOLETE;
566 break;
568 case 'c':
569 countmode = count_occurrences;
570 output_option_used = true;
571 break;
573 case 'd':
574 output_unique = false;
575 output_option_used = true;
576 break;
578 case 'D':
579 output_unique = false;
580 output_later_repeated = true;
581 if (optarg == NULL)
582 delimit_groups = DM_NONE;
583 else
584 delimit_groups = XARGMATCH ("--all-repeated", optarg,
585 delimit_method_string,
586 delimit_method_map);
587 output_option_used = true;
588 break;
590 case GROUP_OPTION:
591 if (optarg == NULL)
592 grouping = GM_SEPARATE;
593 else
594 grouping = XARGMATCH ("--group", optarg,
595 grouping_method_string,
596 grouping_method_map);
597 break;
599 case 'f':
600 skip_field_option_type = SFO_NEW;
601 skip_fields = size_opt (optarg,
602 N_("invalid number of fields to skip"));
603 break;
605 case 'i':
606 ignore_case = true;
607 break;
609 case 's':
610 skip_chars = size_opt (optarg,
611 N_("invalid number of bytes to skip"));
612 break;
614 case 'u':
615 output_first_repeated = false;
616 output_option_used = true;
617 break;
619 case 'w':
620 check_chars = size_opt (optarg,
621 N_("invalid number of bytes to compare"));
622 break;
624 case 'z':
625 delimiter = '\0';
626 break;
628 case_GETOPT_HELP_CHAR;
630 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
632 default:
633 usage (EXIT_FAILURE);
637 /* Note we could allow --group with -D at least, and that would
638 avoid the need to specify a grouping method to --all-repeated.
639 It was thought best to avoid deprecating those parameters though
640 and keep --group separate to other options. */
641 if (grouping != GM_NONE && output_option_used)
643 error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
644 usage (EXIT_FAILURE);
647 if (grouping != GM_NONE && countmode != count_none)
649 error (0, 0,
650 _("grouping and printing repeat counts is meaningless"));
651 usage (EXIT_FAILURE);
654 if (countmode == count_occurrences && output_later_repeated)
656 error (0, 0,
657 _("printing all duplicated lines and repeat counts is meaningless"));
658 usage (EXIT_FAILURE);
661 check_file (file[0], file[1], delimiter);
663 exit (EXIT_SUCCESS);