build: update gnulib submodule to latest, for fewer compiler warnings
[coreutils.git] / src / uniq.c
blob7509bfce9cd9b492673985ffcd95f19fa2874657
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <stdio.h>
22 #include <getopt.h>
23 #include <sys/types.h>
25 #include "system.h"
26 #include "argmatch.h"
27 #include "linebuffer.h"
28 #include "error.h"
29 #include "hard-locale.h"
30 #include "posixver.h"
31 #include "quote.h"
32 #include "xmemcoll.h"
33 #include "xstrtol.h"
34 #include "memcasecmp.h"
/* Canonical name of this program, i.e. without any installation
   prefix such as `g'.  */
#define PROGRAM_NAME "uniq"

/* Author list; main passes it to case_GETOPT_VERSION_CHAR.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), proper_name ("David MacKenzie")
/* Exchange the two `struct linebuffer *' lvalues A and B.  Each
   argument is expanded twice, so neither may have side effects.  */
#define SWAP_LINES(A, B)                        \
  do                                            \
    {                                           \
      struct linebuffer *swap_hold = (A);       \
      (A) = (B);                                \
      (B) = swap_hold;                          \
    }                                           \
  while (0)
/* Set in main from hard_locale (LC_COLLATE).  When true (and case is
   not being ignored) lines are compared with xmemcoll instead of
   memcmp.  */
static bool hard_LC_COLLATE;

/* How many leading fields of each line are left out of the
   comparison.  */
static size_t skip_fields;

/* How many further characters are left out once the fields have been
   skipped.  */
static size_t skip_chars;

/* Upper bound on the number of characters compared.  */
static size_t check_chars;
/* The ways an output line may be prefixed.  */
enum countmode
{
  /* -c: print the occurrence count in front of each output line.  */
  count_occurrences,

  /* The default: print no count.  */
  count_none
};

/* Whether, and how, each output line is preceded by the number of
   times it occurred in the input.  main sets this to count_none
   before parsing options.  */
static enum countmode countmode;
/* Output selection.  Three classes of line can be printed
   independently: a line that occurs only once, the first line of a
   group of repeated lines, and the second and subsequent lines of
   such a group.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* When true, comparisons disregard differences in case.  */
static bool ignore_case;
/* How groups of duplicate lines are set apart in --all-repeated
   output.  */
enum delimit_method
{
  /* --all-repeated[=none]: emit no delimiters.  */
  DM_NONE,

  /* --all-repeated=prepend: a delimiter goes before every group.  */
  DM_PREPEND,

  /* --all-repeated=separate: a delimiter goes between groups.  */
  DM_SEPARATE
};

/* Spellings accepted as the --all-repeated argument, NULL-terminated.
   The entries parallel delimit_method_map below.  */
static char const *const delimit_method_string[] =
{
  "none",
  "prepend",
  "separate",
  NULL
};

/* Value selected by the spelling at the same index in
   delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE,
  DM_PREPEND,
  DM_SEPARATE
};

/* The delimiting method currently in effect.  */
static enum delimit_method delimit_groups;
110 static struct option const longopts[] =
112 {"count", no_argument, NULL, 'c'},
113 {"repeated", no_argument, NULL, 'd'},
114 {"all-repeated", optional_argument, NULL, 'D'},
115 {"ignore-case", no_argument, NULL, 'i'},
116 {"unique", no_argument, NULL, 'u'},
117 {"skip-fields", required_argument, NULL, 'f'},
118 {"skip-chars", required_argument, NULL, 's'},
119 {"check-chars", required_argument, NULL, 'w'},
120 {"zero-terminated", no_argument, NULL, 'z'},
121 {GETOPT_HELP_OPTION_DECL},
122 {GETOPT_VERSION_OPTION_DECL},
123 {NULL, 0, NULL, 0}
/* Print usage information and terminate the program with exit (STATUS).
   When STATUS is not EXIT_SUCCESS, only a one-line hint pointing at
   --help is written to stderr; otherwise the full help text is written
   to stdout, followed by emit_ancillary_info ().
   NOTE(review): the line numbers embedded in this listing skip 136,
   141, 143, 168 and 173, all inside the string literals below, so part
   of the help text is not visible here -- confirm the exact strings
   against the upstream file before editing them.  */
126 void
127 usage (int status)
129 if (status != EXIT_SUCCESS)
130 fprintf (stderr, _("Try `%s --help' for more information.\n"),
131 program_name);
132 else
134 printf (_("\
135 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
137 program_name);
138 fputs (_("\
139 Filter adjacent matching lines from INPUT (or standard input),\n\
140 writing to OUTPUT (or standard output).\n\
142 With no options, matching lines are merged to the first occurrence.\n\
144 "), stdout);
145 fputs (_("\
146 Mandatory arguments to long options are mandatory for short options too.\n\
147 "), stdout);
148 fputs (_("\
149 -c, --count prefix lines by the number of occurrences\n\
150 -d, --repeated only print duplicate lines\n\
151 "), stdout);
152 fputs (_("\
153 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
154 delimit-method={none(default),prepend,separate}\n\
155 Delimiting is done with blank lines.\n\
156 -f, --skip-fields=N avoid comparing the first N fields\n\
157 -i, --ignore-case ignore differences in case when comparing\n\
158 -s, --skip-chars=N avoid comparing the first N characters\n\
159 -u, --unique only print unique lines\n\
160 -z, --zero-terminated end lines with 0 byte, not newline\n\
161 "), stdout);
162 fputs (_("\
163 -w, --check-chars=N compare no more than N characters in lines\n\
164 "), stdout);
165 fputs (HELP_OPTION_DESCRIPTION, stdout);
166 fputs (VERSION_OPTION_DESCRIPTION, stdout);
167 fputs (_("\
169 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
170 characters. Fields are skipped before chars.\n\
171 "), stdout);
172 fputs (_("\
174 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
175 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
176 Also, comparisons honor the rules specified by `LC_COLLATE'.\n\
177 "), stdout);
178 emit_ancillary_info ();
180 exit (status);
183 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
184 invalid. Silently convert too-large values to SIZE_MAX. */
186 static size_t
187 size_opt (char const *opt, char const *msgid)
189 unsigned long int size;
190 verify (SIZE_MAX <= ULONG_MAX);
192 switch (xstrtoul (opt, NULL, 10, &size, ""))
194 case LONGINT_OK:
195 case LONGINT_OVERFLOW:
196 break;
198 default:
199 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
202 return MIN (size, SIZE_MAX);
205 /* Given a linebuffer LINE,
206 return a pointer to the beginning of the line's field to be compared. */
208 static char *
209 find_field (struct linebuffer const *line)
211 size_t count;
212 char const *lp = line->buffer;
213 size_t size = line->length - 1;
214 size_t i = 0;
216 for (count = 0; count < skip_fields; count++)
218 while (i < size && isblank (to_uchar (lp[i])))
219 i++;
220 while (i < size && !isblank (to_uchar (lp[i])))
221 i++;
224 for (count = 0; count < skip_chars && i < size; count++)
225 i++;
227 return line->buffer + i;
230 /* Return false if two strings OLD and NEW match, true if not.
231 OLD and NEW point not to the beginnings of the lines
232 but rather to the beginnings of the fields to compare.
233 OLDLEN and NEWLEN are their lengths. */
235 static bool
236 different (char *old, char *new, size_t oldlen, size_t newlen)
238 if (check_chars < oldlen)
239 oldlen = check_chars;
240 if (check_chars < newlen)
241 newlen = check_chars;
243 if (ignore_case)
245 /* FIXME: This should invoke strcoll somehow. */
246 return oldlen != newlen || memcasecmp (old, new, oldlen);
248 else if (hard_LC_COLLATE)
249 return xmemcoll (old, oldlen, new, newlen) != 0;
250 else
251 return oldlen != newlen || memcmp (old, new, oldlen);
254 /* Output the line in linebuffer LINE to standard output
255 provided that the switches say it should be output.
256 MATCH is true if the line matches the previous line.
257 If requested, print the number of times it occurred, as well;
258 LINECOUNT + 1 is the number of times that the line occurred. */
260 static void
261 writeline (struct linebuffer const *line,
262 bool match, uintmax_t linecount)
264 if (! (linecount == 0 ? output_unique
265 : !match ? output_first_repeated
266 : output_later_repeated))
267 return;
269 if (countmode == count_occurrences)
270 printf ("%7" PRIuMAX " ", linecount + 1);
272 fwrite (line->buffer, sizeof (char), line->length, stdout);
275 /* Process input file INFILE with output to OUTFILE.
276 If either is "-", use the standard I/O stream for it instead. */
278 static void
279 check_file (const char *infile, const char *outfile, char delimiter)
281 struct linebuffer lb1, lb2;
282 struct linebuffer *thisline, *prevline;
284 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
285 error (EXIT_FAILURE, errno, "%s", infile);
286 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
287 error (EXIT_FAILURE, errno, "%s", outfile);
289 thisline = &lb1;
290 prevline = &lb2;
292 initbuffer (thisline);
293 initbuffer (prevline);
295 /* The duplication in the following `if' and `else' blocks is an
296 optimization to distinguish the common case (in which none of
297 the following options has been specified: --count, -repeated,
298 --all-repeated, --unique) from the others. In the common case,
299 this optimization lets uniq output each different line right away,
300 without waiting to see if the next one is different. */
302 if (output_unique && output_first_repeated && countmode == count_none)
304 char *prevfield IF_LINT (= NULL);
305 size_t prevlen IF_LINT (= 0);
307 while (!feof (stdin))
309 char *thisfield;
310 size_t thislen;
311 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
312 break;
313 thisfield = find_field (thisline);
314 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
315 if (prevline->length == 0
316 || different (thisfield, prevfield, thislen, prevlen))
318 fwrite (thisline->buffer, sizeof (char),
319 thisline->length, stdout);
321 SWAP_LINES (prevline, thisline);
322 prevfield = thisfield;
323 prevlen = thislen;
327 else
329 char *prevfield;
330 size_t prevlen;
331 uintmax_t match_count = 0;
332 bool first_delimiter = true;
334 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
335 goto closefiles;
336 prevfield = find_field (prevline);
337 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
339 while (!feof (stdin))
341 bool match;
342 char *thisfield;
343 size_t thislen;
344 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
346 if (ferror (stdin))
347 goto closefiles;
348 break;
350 thisfield = find_field (thisline);
351 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
352 match = !different (thisfield, prevfield, thislen, prevlen);
353 match_count += match;
355 if (match_count == UINTMAX_MAX)
357 if (count_occurrences)
358 error (EXIT_FAILURE, 0, _("too many repeated lines"));
359 match_count--;
362 if (delimit_groups != DM_NONE)
364 if (!match)
366 if (match_count) /* a previous match */
367 first_delimiter = false; /* Only used when DM_SEPARATE */
369 else if (match_count == 1)
371 if ((delimit_groups == DM_PREPEND)
372 || (delimit_groups == DM_SEPARATE
373 && !first_delimiter))
374 putchar (delimiter);
378 if (!match || output_later_repeated)
380 writeline (prevline, match, match_count);
381 SWAP_LINES (prevline, thisline);
382 prevfield = thisfield;
383 prevlen = thislen;
384 if (!match)
385 match_count = 0;
389 writeline (prevline, false, match_count);
392 closefiles:
393 if (ferror (stdin) || fclose (stdin) != 0)
394 error (EXIT_FAILURE, 0, _("error reading %s"), infile);
396 /* stdout is handled via the atexit-invoked close_stdout function. */
398 free (lb1.buffer);
399 free (lb2.buffer);
/* Which syntax, if any, was last used on the command line to give the
   number of fields to skip.  main uses this to decide whether a digit
   option starts a new number or extends the one being accumulated.  */
enum Skip_field_option_type
{
  /* No field-skipping option seen yet.  */
  SFO_NONE,

  /* Last given through digit options (-0 ... -9).  */
  SFO_OBSOLETE,

  /* Last given through -f / --skip-fields.  */
  SFO_NEW
};
410 main (int argc, char **argv)
412 int optc = 0;
413 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
414 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
415 int nfiles = 0;
416 char const *file[2];
417 char delimiter = '\n'; /* change with --zero-terminated, -z */
419 file[0] = file[1] = "-";
420 initialize_main (&argc, &argv);
421 set_program_name (argv[0]);
422 setlocale (LC_ALL, "");
423 bindtextdomain (PACKAGE, LOCALEDIR);
424 textdomain (PACKAGE);
425 hard_LC_COLLATE = hard_locale (LC_COLLATE);
427 atexit (close_stdout);
429 skip_chars = 0;
430 skip_fields = 0;
431 check_chars = SIZE_MAX;
432 output_unique = output_first_repeated = true;
433 output_later_repeated = false;
434 countmode = count_none;
435 delimit_groups = DM_NONE;
437 for (;;)
439 /* Parse an operand with leading "+" as a file after "--" was
440 seen; or if pedantic and a file was seen; or if not
441 obsolete. */
443 if (optc == -1
444 || (posixly_correct && nfiles != 0)
445 || ((optc = getopt_long (argc, argv,
446 "-0123456789Dcdf:is:uw:z", longopts, NULL))
447 == -1))
449 if (argc <= optind)
450 break;
451 if (nfiles == 2)
453 error (0, 0, _("extra operand %s"), quote (argv[optind]));
454 usage (EXIT_FAILURE);
456 file[nfiles++] = argv[optind++];
458 else switch (optc)
460 case 1:
462 unsigned long int size;
463 if (optarg[0] == '+'
464 && posix2_version () < 200112
465 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
466 && size <= SIZE_MAX)
467 skip_chars = size;
468 else if (nfiles == 2)
470 error (0, 0, _("extra operand %s"), quote (optarg));
471 usage (EXIT_FAILURE);
473 else
474 file[nfiles++] = optarg;
476 break;
478 case '0':
479 case '1':
480 case '2':
481 case '3':
482 case '4':
483 case '5':
484 case '6':
485 case '7':
486 case '8':
487 case '9':
489 if (skip_field_option_type == SFO_NEW)
490 skip_fields = 0;
492 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
493 skip_fields = SIZE_MAX;
495 skip_field_option_type = SFO_OBSOLETE;
497 break;
499 case 'c':
500 countmode = count_occurrences;
501 break;
503 case 'd':
504 output_unique = false;
505 break;
507 case 'D':
508 output_unique = false;
509 output_later_repeated = true;
510 if (optarg == NULL)
511 delimit_groups = DM_NONE;
512 else
513 delimit_groups = XARGMATCH ("--all-repeated", optarg,
514 delimit_method_string,
515 delimit_method_map);
516 break;
518 case 'f':
519 skip_field_option_type = SFO_NEW;
520 skip_fields = size_opt (optarg,
521 N_("invalid number of fields to skip"));
522 break;
524 case 'i':
525 ignore_case = true;
526 break;
528 case 's':
529 skip_chars = size_opt (optarg,
530 N_("invalid number of bytes to skip"));
531 break;
533 case 'u':
534 output_first_repeated = false;
535 break;
537 case 'w':
538 check_chars = size_opt (optarg,
539 N_("invalid number of bytes to compare"));
540 break;
542 case 'z':
543 delimiter = '\0';
544 break;
546 case_GETOPT_HELP_CHAR;
548 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
550 default:
551 usage (EXIT_FAILURE);
555 if (countmode == count_occurrences && output_later_repeated)
557 error (0, 0,
558 _("printing all duplicated lines and repeat counts is meaningless"));
559 usage (EXIT_FAILURE);
562 check_file (file[0], file[1], delimiter);
564 exit (EXIT_SUCCESS);