global: convert indentation-TABs to spaces
[coreutils.git] / src / uniq.c
blob13aaebccc3b6ce8d7c70454c2141ac90bb38d985
1 /* uniq -- remove duplicate lines from a sorted file
2 Copyright (C) 86, 91, 1995-2009 Free Software Foundation, Inc.
4 This program is free software: you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation, either version 3 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program. If not, see <http://www.gnu.org/licenses/>. */
17 /* Written by Richard M. Stallman and David MacKenzie. */
19 #include <config.h>
21 #include <stdio.h>
22 #include <getopt.h>
23 #include <sys/types.h>
25 #include "system.h"
26 #include "argmatch.h"
27 #include "linebuffer.h"
28 #include "error.h"
29 #include "posixver.h"
30 #include "quote.h"
31 #include "xmemcoll.h"
32 #include "xstrtol.h"
33 #include "memcasecmp.h"
/* Official program name, without any installation prefix such as `g'.
   main hands it to the --version handling.  */
#define PROGRAM_NAME "uniq"

/* Author list that main hands to the --version handling together with
   PROGRAM_NAME.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
/* Exchange the two `struct linebuffer *' lvalues A and B.
   Each argument is evaluated twice, so neither may have side effects.  */
#define SWAP_LINES(A, B)                                \
  do                                                    \
    {                                                   \
      struct linebuffer *swap_hold_ = (A);              \
      (A) = (B);                                        \
      (B) = swap_hold_;                                 \
    }                                                   \
  while (0)
/* True if the LC_COLLATE locale is hard.  Set in main from
   hard_locale (LC_COLLATE); when true (and case is not being ignored)
   lines are compared with xmemcoll rather than memcmp.  */
static bool hard_LC_COLLATE;

/* How many leading fields of each line are left out of the comparison.  */
static size_t skip_fields;

/* How many further chars are left out once the fields have been skipped.  */
static size_t skip_chars;

/* Upper bound on the number of chars that are compared.  */
static size_t check_chars;
/* Whether output lines are preceded by an occurrence count.
   Note that count_occurrences has the value 0, so it is only meaningful
   when compared against the `countmode' variable; the bare enumerator
   is always false in a truth test.  */
enum countmode
{
  count_occurrences,    /* -c: print the count before each output line.  */
  count_none            /* Default: print no counts.  */
};

/* The count mode in effect: whether and how output lines are preceded
   by the number of times they occurred in the input.  */
static enum countmode countmode;
/* Output selection.  Each line belongs to one of three classes, and is
   printed only if the flag for its class is set: a line that was not
   repeated, the first line of a group of repeated lines, or the second
   and subsequent lines of such a group.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* When true, comparisons disregard differences in letter case.  */
static bool ignore_case;
/* How groups of duplicate lines are set apart under --all-repeated.  */
enum delimit_method
{
  /* --all-repeated[=none]: no delimiters are output.  */
  DM_NONE,

  /* --all-repeated=prepend: a delimiter precedes every group.  */
  DM_PREPEND,

  /* --all-repeated=separate: groups are delimited from one another.  */
  DM_SEPARATE
};

/* Spellings accepted as the --all-repeated argument, NULL-terminated.
   Entry I here selects entry I of delimit_method_map, so the two tables
   must be kept in the same order.  */
static char const *const delimit_method_string[] =
{
  "none",
  "prepend",
  "separate",
  NULL
};

static enum delimit_method const delimit_method_map[] =
{
  DM_NONE,
  DM_PREPEND,
  DM_SEPARATE
};

/* Whether, and how, groups of duplicate lines are delimited in this run.  */
static enum delimit_method delimit_groups;
109 static struct option const longopts[] =
111 {"count", no_argument, NULL, 'c'},
112 {"repeated", no_argument, NULL, 'd'},
113 {"all-repeated", optional_argument, NULL, 'D'},
114 {"ignore-case", no_argument, NULL, 'i'},
115 {"unique", no_argument, NULL, 'u'},
116 {"skip-fields", required_argument, NULL, 'f'},
117 {"skip-chars", required_argument, NULL, 's'},
118 {"check-chars", required_argument, NULL, 'w'},
119 {"zero-terminated", no_argument, NULL, 'z'},
120 {GETOPT_HELP_OPTION_DECL},
121 {GETOPT_VERSION_OPTION_DECL},
122 {NULL, 0, NULL, 0}
/* Print usage information and exit with STATUS.
   When STATUS is not EXIT_SUCCESS, only a one-line hint pointing at
   --help is written to stderr; otherwise the full help text is written
   to stdout.  In both cases the function ends by calling exit (STATUS),
   so it does not return to its caller.

   NOTE(review): the embedded line numbers skip values inside this
   function (e.g. 131 -> 133, 134 -> 136), so this copy appears to have
   lost some very short lines (braces, string continuation lines) --
   confirm against the upstream file before editing the help text.  */
125 void
126 usage (int status)
/* Failure path: short hint on stderr instead of the full help.  */
128 if (status != EXIT_SUCCESS)
129 fprintf (stderr, _("Try `%s --help' for more information.\n"),
130 program_name);
131 else
/* Success path: synopsis first, with the program name substituted.  */
133 printf (_("\
134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
136 program_name);
/* The help text is emitted in several separately translated pieces.  */
137 fputs (_("\
138 Filter adjacent matching lines from INPUT (or standard input),\n\
139 writing to OUTPUT (or standard output).\n\
141 With no options, matching lines are merged to the first occurrence.\n\
143 "), stdout);
144 fputs (_("\
145 Mandatory arguments to long options are mandatory for short options too.\n\
146 "), stdout);
147 fputs (_("\
148 -c, --count prefix lines by the number of occurrences\n\
149 -d, --repeated only print duplicate lines\n\
150 "), stdout);
151 fputs (_("\
152 -D, --all-repeated[=delimit-method] print all duplicate lines\n\
153 delimit-method={none(default),prepend,separate}\n\
154 Delimiting is done with blank lines.\n\
155 -f, --skip-fields=N avoid comparing the first N fields\n\
156 -i, --ignore-case ignore differences in case when comparing\n\
157 -s, --skip-chars=N avoid comparing the first N characters\n\
158 -u, --unique only print unique lines\n\
159 -z, --zero-terminated end lines with 0 byte, not newline\n\
160 "), stdout);
161 fputs (_("\
162 -w, --check-chars=N compare no more than N characters in lines\n\
163 "), stdout);
/* Descriptions of --help and --version come from shared macros.  */
164 fputs (HELP_OPTION_DESCRIPTION, stdout);
165 fputs (VERSION_OPTION_DESCRIPTION, stdout);
166 fputs (_("\
168 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
169 characters. Fields are skipped before chars.\n\
170 "), stdout);
171 fputs (_("\
173 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
174 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
175 Also, comparisons honor the rules specified by `LC_COLLATE'.\n\
176 "), stdout);
177 emit_bug_reporting_address ();
/* Reached on both paths: terminate the program with STATUS.  */
179 exit (status);
182 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
183 invalid. Silently convert too-large values to SIZE_MAX. */
185 static size_t
186 size_opt (char const *opt, char const *msgid)
188 unsigned long int size;
189 verify (SIZE_MAX <= ULONG_MAX);
191 switch (xstrtoul (opt, NULL, 10, &size, ""))
193 case LONGINT_OK:
194 case LONGINT_OVERFLOW:
195 break;
197 default:
198 error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
201 return MIN (size, SIZE_MAX);
204 /* Given a linebuffer LINE,
205 return a pointer to the beginning of the line's field to be compared. */
207 static char *
208 find_field (struct linebuffer const *line)
210 size_t count;
211 char const *lp = line->buffer;
212 size_t size = line->length - 1;
213 size_t i = 0;
215 for (count = 0; count < skip_fields; count++)
217 while (i < size && isblank (to_uchar (lp[i])))
218 i++;
219 while (i < size && !isblank (to_uchar (lp[i])))
220 i++;
223 for (count = 0; count < skip_chars && i < size; count++)
224 i++;
226 return line->buffer + i;
229 /* Return false if two strings OLD and NEW match, true if not.
230 OLD and NEW point not to the beginnings of the lines
231 but rather to the beginnings of the fields to compare.
232 OLDLEN and NEWLEN are their lengths. */
234 static bool
235 different (char *old, char *new, size_t oldlen, size_t newlen)
237 if (check_chars < oldlen)
238 oldlen = check_chars;
239 if (check_chars < newlen)
240 newlen = check_chars;
242 if (ignore_case)
244 /* FIXME: This should invoke strcoll somehow. */
245 return oldlen != newlen || memcasecmp (old, new, oldlen);
247 else if (hard_LC_COLLATE)
248 return xmemcoll (old, oldlen, new, newlen) != 0;
249 else
250 return oldlen != newlen || memcmp (old, new, oldlen);
253 /* Output the line in linebuffer LINE to standard output
254 provided that the switches say it should be output.
255 MATCH is true if the line matches the previous line.
256 If requested, print the number of times it occurred, as well;
257 LINECOUNT + 1 is the number of times that the line occurred. */
259 static void
260 writeline (struct linebuffer const *line,
261 bool match, uintmax_t linecount)
263 if (! (linecount == 0 ? output_unique
264 : !match ? output_first_repeated
265 : output_later_repeated))
266 return;
268 if (countmode == count_occurrences)
269 printf ("%7" PRIuMAX " ", linecount + 1);
271 fwrite (line->buffer, sizeof (char), line->length, stdout);
274 /* Process input file INFILE with output to OUTFILE.
275 If either is "-", use the standard I/O stream for it instead. */
277 static void
278 check_file (const char *infile, const char *outfile, char delimiter)
280 struct linebuffer lb1, lb2;
281 struct linebuffer *thisline, *prevline;
283 if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
284 error (EXIT_FAILURE, errno, "%s", infile);
285 if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
286 error (EXIT_FAILURE, errno, "%s", outfile);
288 thisline = &lb1;
289 prevline = &lb2;
291 initbuffer (thisline);
292 initbuffer (prevline);
294 /* The duplication in the following `if' and `else' blocks is an
295 optimization to distinguish the common case (in which none of
296 the following options has been specified: --count, -repeated,
297 --all-repeated, --unique) from the others. In the common case,
298 this optimization lets uniq output each different line right away,
299 without waiting to see if the next one is different. */
301 if (output_unique && output_first_repeated && countmode == count_none)
303 char *prevfield IF_LINT (= NULL);
304 size_t prevlen IF_LINT (= 0);
306 while (!feof (stdin))
308 char *thisfield;
309 size_t thislen;
310 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
311 break;
312 thisfield = find_field (thisline);
313 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
314 if (prevline->length == 0
315 || different (thisfield, prevfield, thislen, prevlen))
317 fwrite (thisline->buffer, sizeof (char),
318 thisline->length, stdout);
320 SWAP_LINES (prevline, thisline);
321 prevfield = thisfield;
322 prevlen = thislen;
326 else
328 char *prevfield;
329 size_t prevlen;
330 uintmax_t match_count = 0;
331 bool first_delimiter = true;
333 if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
334 goto closefiles;
335 prevfield = find_field (prevline);
336 prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
338 while (!feof (stdin))
340 bool match;
341 char *thisfield;
342 size_t thislen;
343 if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
345 if (ferror (stdin))
346 goto closefiles;
347 break;
349 thisfield = find_field (thisline);
350 thislen = thisline->length - 1 - (thisfield - thisline->buffer);
351 match = !different (thisfield, prevfield, thislen, prevlen);
352 match_count += match;
354 if (match_count == UINTMAX_MAX)
356 if (count_occurrences)
357 error (EXIT_FAILURE, 0, _("too many repeated lines"));
358 match_count--;
361 if (delimit_groups != DM_NONE)
363 if (!match)
365 if (match_count) /* a previous match */
366 first_delimiter = false; /* Only used when DM_SEPARATE */
368 else if (match_count == 1)
370 if ((delimit_groups == DM_PREPEND)
371 || (delimit_groups == DM_SEPARATE
372 && !first_delimiter))
373 putchar (delimiter);
377 if (!match || output_later_repeated)
379 writeline (prevline, match, match_count);
380 SWAP_LINES (prevline, thisline);
381 prevfield = thisfield;
382 prevlen = thislen;
383 if (!match)
384 match_count = 0;
388 writeline (prevline, false, match_count);
391 closefiles:
392 if (ferror (stdin) || fclose (stdin) != 0)
393 error (EXIT_FAILURE, 0, _("error reading %s"), infile);
395 /* stdout is handled via the atexit-invoked close_stdout function. */
397 free (lb1.buffer);
398 free (lb2.buffer);
/* Which kind of skip-fields option main has seen most recently.  It lets
   a run of obsolete digit options that follows -f start again from zero
   instead of appending digits to the -f value.  */
enum Skip_field_option_type
{
  SFO_NONE,             /* No skip-fields option seen so far.  */
  SFO_OBSOLETE,         /* The latest one was an obsolete digit option.  */
  SFO_NEW               /* The latest one was -f / --skip-fields.  */
};
409 main (int argc, char **argv)
411 int optc = 0;
412 bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
413 enum Skip_field_option_type skip_field_option_type = SFO_NONE;
414 int nfiles = 0;
415 char const *file[2];
416 char delimiter = '\n'; /* change with --zero-terminated, -z */
418 file[0] = file[1] = "-";
419 initialize_main (&argc, &argv);
420 set_program_name (argv[0]);
421 setlocale (LC_ALL, "");
422 bindtextdomain (PACKAGE, LOCALEDIR);
423 textdomain (PACKAGE);
424 hard_LC_COLLATE = hard_locale (LC_COLLATE);
426 atexit (close_stdout);
428 skip_chars = 0;
429 skip_fields = 0;
430 check_chars = SIZE_MAX;
431 output_unique = output_first_repeated = true;
432 output_later_repeated = false;
433 countmode = count_none;
434 delimit_groups = DM_NONE;
436 for (;;)
438 /* Parse an operand with leading "+" as a file after "--" was
439 seen; or if pedantic and a file was seen; or if not
440 obsolete. */
442 if (optc == -1
443 || (posixly_correct && nfiles != 0)
444 || ((optc = getopt_long (argc, argv,
445 "-0123456789Dcdf:is:uw:z", longopts, NULL))
446 == -1))
448 if (argc <= optind)
449 break;
450 if (nfiles == 2)
452 error (0, 0, _("extra operand %s"), quote (argv[optind]));
453 usage (EXIT_FAILURE);
455 file[nfiles++] = argv[optind++];
457 else switch (optc)
459 case 1:
461 unsigned long int size;
462 if (optarg[0] == '+'
463 && posix2_version () < 200112
464 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
465 && size <= SIZE_MAX)
466 skip_chars = size;
467 else if (nfiles == 2)
469 error (0, 0, _("extra operand %s"), quote (optarg));
470 usage (EXIT_FAILURE);
472 else
473 file[nfiles++] = optarg;
475 break;
477 case '0':
478 case '1':
479 case '2':
480 case '3':
481 case '4':
482 case '5':
483 case '6':
484 case '7':
485 case '8':
486 case '9':
488 if (skip_field_option_type == SFO_NEW)
489 skip_fields = 0;
491 if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
492 skip_fields = SIZE_MAX;
494 skip_field_option_type = SFO_OBSOLETE;
496 break;
498 case 'c':
499 countmode = count_occurrences;
500 break;
502 case 'd':
503 output_unique = false;
504 break;
506 case 'D':
507 output_unique = false;
508 output_later_repeated = true;
509 if (optarg == NULL)
510 delimit_groups = DM_NONE;
511 else
512 delimit_groups = XARGMATCH ("--all-repeated", optarg,
513 delimit_method_string,
514 delimit_method_map);
515 break;
517 case 'f':
518 skip_field_option_type = SFO_NEW;
519 skip_fields = size_opt (optarg,
520 N_("invalid number of fields to skip"));
521 break;
523 case 'i':
524 ignore_case = true;
525 break;
527 case 's':
528 skip_chars = size_opt (optarg,
529 N_("invalid number of bytes to skip"));
530 break;
532 case 'u':
533 output_first_repeated = false;
534 break;
536 case 'w':
537 check_chars = size_opt (optarg,
538 N_("invalid number of bytes to compare"));
539 break;
541 case 'z':
542 delimiter = '\0';
543 break;
545 case_GETOPT_HELP_CHAR;
547 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
549 default:
550 usage (EXIT_FAILURE);
554 if (countmode == count_occurrences && output_later_repeated)
556 error (0, 0,
557 _("printing all duplicated lines and repeat counts is meaningless"));
558 usage (EXIT_FAILURE);
561 check_file (file[0], file[1], delimiter);
563 exit (EXIT_SUCCESS);