src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986-2016 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "die.h"
  28 #include "error.h"
  29 #include "fadvise.h"
  30 #include "hard-locale.h"
  31 #include "posixver.h"
  32 #include "stdio--.h"
  33 #include "xmemcoll.h"
  34 #include "xstrtol.h"
  35 #include "memcasecmp.h"
  36 #include "quote.h"
  37
  38 /* The official name of this program (e.g., no 'g' prefix).  */
  39 #define PROGRAM_NAME "uniq"
  40
  41 #define AUTHORS \
  42   proper_name ("Richard M. Stallman"), \
  43   proper_name ("David MacKenzie")
  44
  45 #define SWAP_LINES(A, B)                        \
  46   do                                            \
  47     {                                           \
  48       struct linebuffer *_tmp;                  \
  49       _tmp = (A);                               \
  50       (A) = (B);                                \
  51       (B) = _tmp;                               \
  52     }                                           \
  53   while (0)
  54
  55 /* True if the LC_COLLATE locale is hard.  */
  56 static bool hard_LC_COLLATE;
  57
  58 /* Number of fields to skip on each line when doing comparisons. */
  59 static size_t skip_fields;
  60
  61 /* Number of chars to skip after skipping any fields. */
  62 static size_t skip_chars;
  63
  64 /* Number of chars to compare. */
  65 static size_t check_chars;
  66
  67 enum countmode
  68 {
  69   count_occurrences,            /* -c Print count before output lines. */
  70   count_none                    /* Default.  Do not print counts. */
  71 };
  72
  73 /* Whether and how to precede the output lines with a count of the number of
  74    times they occurred in the input. */
  75 static enum countmode countmode;
  76
  77 /* Which lines to output: unique lines, the first of a group of
  78    repeated lines, and the second and subsequent lines of a group of
  79    repeated lines.  */
  80 static bool output_unique;
  81 static bool output_first_repeated;
  82 static bool output_later_repeated;
  83
  84 /* If true, ignore case when comparing.  */
  85 static bool ignore_case;
  86
  87 enum delimit_method
  88 {
  89   /* No delimiters output.  --all-repeated[=none] */
  90   DM_NONE,
  91
  92   /* Delimiter precedes all groups.  --all-repeated=prepend */
  93   DM_PREPEND,
  94
  95   /* Delimit all groups.  --all-repeated=separate */
  96   DM_SEPARATE
  97 };
  98
  99 static char const *const delimit_method_string[] =
 100 {
 101   "none", "prepend", "separate", NULL
 102 };
 103
 104 static enum delimit_method const delimit_method_map[] =
 105 {
 106   DM_NONE, DM_PREPEND, DM_SEPARATE
 107 };
 108
 109 /* Select whether/how to delimit groups of duplicate lines.  */
 110 static enum delimit_method delimit_groups;
 111
 112 enum grouping_method
 113 {
 114   /* No grouping, when "--group" isn't used */
 115   GM_NONE,
 116
 117   /* Delimiter preceges all groups.  --group=prepend */
 118   GM_PREPEND,
 119
 120   /* Delimiter follows all groups.   --group=append */
 121   GM_APPEND,
 122
 123   /* Delimiter between groups.    --group[=separate] */
 124   GM_SEPARATE,
 125
 126   /* Delimiter before and after each group. --group=both */
 127   GM_BOTH
 128 };
 129
 130 static char const *const grouping_method_string[] =
 131 {
 132   "prepend", "append", "separate", "both", NULL
 133 };
 134
 135 static enum grouping_method const grouping_method_map[] =
 136 {
 137   GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
 138 };
 139
 140 static enum grouping_method grouping = GM_NONE;
 141
 142 enum
 143 {
 144   GROUP_OPTION = CHAR_MAX + 1
 145 };
 146
 147 static struct option const longopts[] =
 148 {
 149   {"count", no_argument, NULL, 'c'},
 150   {"repeated", no_argument, NULL, 'd'},
 151   {"all-repeated", optional_argument, NULL, 'D'},
 152   {"group", optional_argument, NULL, GROUP_OPTION},
 153   {"ignore-case", no_argument, NULL, 'i'},
 154   {"unique", no_argument, NULL, 'u'},
 155   {"skip-fields", required_argument, NULL, 'f'},
 156   {"skip-chars", required_argument, NULL, 's'},
 157   {"check-chars", required_argument, NULL, 'w'},
 158   {"zero-terminated", no_argument, NULL, 'z'},
 159   {GETOPT_HELP_OPTION_DECL},
 160   {GETOPT_VERSION_OPTION_DECL},
 161   {NULL, 0, NULL, 0}
 162 };
 163
 164 void
 165 usage (int status)
 166 {
 167   if (status != EXIT_SUCCESS)
 168     emit_try_help ();
 169   else
 170     {
 171       printf (_("\
 172 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 173 "),
 174               program_name);
 175       fputs (_("\
 176 Filter adjacent matching lines from INPUT (or standard input),\n\
 177 writing to OUTPUT (or standard output).\n\
 178 \n\
 179 With no options, matching lines are merged to the first occurrence.\n\
 180 "), stdout);
 181
 182       emit_mandatory_arg_note ();
 183
 184      fputs (_("\
 185   -c, --count           prefix lines by the number of occurrences\n\
 186   -d, --repeated        only print duplicate lines, one for each group\n\
 187 "), stdout);
 188      fputs (_("\
 189   -D                    print all duplicate lines\n\
 190       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
 191                                  with an empty line;\n\
 192                                  METHOD={none(default),prepend,separate}\n\
 193 "), stdout);
 194      fputs (_("\
 195   -f, --skip-fields=N   avoid comparing the first N fields\n\
 196 "), stdout);
 197      fputs (_("\
 198       --group[=METHOD]  show all items, separating groups with an empty line;\n\
 199                           METHOD={separate(default),prepend,append,both}\n\
 200 "), stdout);
 201      fputs (_("\
 202   -i, --ignore-case     ignore differences in case when comparing\n\
 203   -s, --skip-chars=N    avoid comparing the first N characters\n\
 204   -u, --unique          only print unique lines\n\
 205 "), stdout);
 206       fputs (_("\
 207   -z, --zero-terminated     line delimiter is NUL, not newline\n\
 208 "), stdout);
 209      fputs (_("\
 210   -w, --check-chars=N   compare no more than N characters in lines\n\
 211 "), stdout);
 212      fputs (HELP_OPTION_DESCRIPTION, stdout);
 213      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 214      fputs (_("\
 215 \n\
 216 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 217 characters.  Fields are skipped before chars.\n\
 218 "), stdout);
 219      fputs (_("\
 220 \n\
 221 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 222 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
 223 Also, comparisons honor the rules specified by 'LC_COLLATE'.\n\
 224 "), stdout);
 225       emit_ancillary_info (PROGRAM_NAME);
 226     }
 227   exit (status);
 228 }
 229
 230 static bool
 231 strict_posix2 (void)
 232 {
 233   int posix_ver = posix2_version ();
 234   return 200112 <= posix_ver && posix_ver < 200809;
 235 }
 236
 237 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 238    invalid.  Silently convert too-large values to SIZE_MAX.  */
 239
 240 static size_t
 241 size_opt (char const *opt, char const *msgid)
 242 {
 243   unsigned long int size;
 244   verify (SIZE_MAX <= ULONG_MAX);
 245
 246   switch (xstrtoul (opt, NULL, 10, &size, ""))
 247     {
 248     case LONGINT_OK:
 249     case LONGINT_OVERFLOW:
 250       break;
 251
 252     default:
 253       die (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 254     }
 255
 256   return MIN (size, SIZE_MAX);
 257 }
 258
 259 /* Given a linebuffer LINE,
 260    return a pointer to the beginning of the line's field to be compared. */
 261
 262 static char * _GL_ATTRIBUTE_PURE
 263 find_field (struct linebuffer const *line)
 264 {
 265   size_t count;
 266   char const *lp = line->buffer;
 267   size_t size = line->length - 1;
 268   size_t i = 0;
 269
 270   for (count = 0; count < skip_fields && i < size; count++)
 271     {
 272       while (i < size && field_sep (lp[i]))
 273         i++;
 274       while (i < size && !field_sep (lp[i]))
 275         i++;
 276     }
 277
 278   i += MIN (skip_chars, size - i);
 279
 280   return line->buffer + i;
 281 }
 282
 283 /* Return false if two strings OLD and NEW match, true if not.
 284    OLD and NEW point not to the beginnings of the lines
 285    but rather to the beginnings of the fields to compare.
 286    OLDLEN and NEWLEN are their lengths. */
 287
 288 static bool
 289 different (char *old, char *new, size_t oldlen, size_t newlen)
 290 {
 291   if (check_chars < oldlen)
 292     oldlen = check_chars;
 293   if (check_chars < newlen)
 294     newlen = check_chars;
 295
 296   if (ignore_case)
 297     {
 298       /* FIXME: This should invoke strcoll somehow.  */
 299       return oldlen != newlen || memcasecmp (old, new, oldlen);
 300     }
 301   else if (hard_LC_COLLATE)
 302     return xmemcoll (old, oldlen, new, newlen) != 0;
 303   else
 304     return oldlen != newlen || memcmp (old, new, oldlen);
 305 }
 306
 307 /* Output the line in linebuffer LINE to standard output
 308    provided that the switches say it should be output.
 309    MATCH is true if the line matches the previous line.
 310    If requested, print the number of times it occurred, as well;
 311    LINECOUNT + 1 is the number of times that the line occurred. */
 312
 313 static void
 314 writeline (struct linebuffer const *line,
 315            bool match, uintmax_t linecount)
 316 {
 317   if (! (linecount == 0 ? output_unique
 318          : !match ? output_first_repeated
 319          : output_later_repeated))
 320     return;
 321
 322   if (countmode == count_occurrences)
 323     printf ("%7" PRIuMAX " ", linecount + 1);
 324
 325   fwrite (line->buffer, sizeof (char), line->length, stdout);
 326 }
 327
 328 /* Process input file INFILE with output to OUTFILE.
 329    If either is "-", use the standard I/O stream for it instead. */
 330
 331 static void
 332 check_file (const char *infile, const char *outfile, char delimiter)
 333 {
 334   struct linebuffer lb1, lb2;
 335   struct linebuffer *thisline, *prevline;
 336
 337   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 338     die (EXIT_FAILURE, errno, "%s", quotef (infile));
 339   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 340     die (EXIT_FAILURE, errno, "%s", quotef (outfile));
 341
 342   fadvise (stdin, FADVISE_SEQUENTIAL);
 343
 344   thisline = &lb1;
 345   prevline = &lb2;
 346
 347   initbuffer (thisline);
 348   initbuffer (prevline);
 349
 350   /* The duplication in the following 'if' and 'else' blocks is an
 351      optimization to distinguish between when we can print input
 352      lines immediately (1. & 2.) or not.
 353
 354      1. --group => all input lines are printed.
 355         checking for unique/duplicated lines is used only for printing
 356         group separators.
 357
 358      2. The default case in which none of these options has been specified:
 359           --count, --repeated,  --all-repeated, --unique
 360         In the default case, this optimization lets uniq output each different
 361         line right away, without waiting to see if the next one is different.
 362
 363      3. All other cases.
 364   */
 365   if (output_unique && output_first_repeated && countmode == count_none)
 366     {
 367       char *prevfield IF_LINT ( = NULL);
 368       size_t prevlen IF_LINT ( = 0);
 369       bool first_group_printed = false;
 370
 371       while (!feof (stdin))
 372         {
 373           char *thisfield;
 374           size_t thislen;
 375           bool new_group;
 376
 377           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 378             break;
 379
 380           thisfield = find_field (thisline);
 381           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 382
 383           new_group = (prevline->length == 0
 384                        || different (thisfield, prevfield, thislen, prevlen));
 385
 386           if (new_group && grouping != GM_NONE
 387               && (grouping == GM_PREPEND || grouping == GM_BOTH
 388                   || (first_group_printed && (grouping == GM_APPEND
 389                                               || grouping == GM_SEPARATE))))
 390             putchar (delimiter);
 391
 392           if (new_group || grouping != GM_NONE)
 393             {
 394               fwrite (thisline->buffer, sizeof (char),
 395                       thisline->length, stdout);
 396
 397               SWAP_LINES (prevline, thisline);
 398               prevfield = thisfield;
 399               prevlen = thislen;
 400               first_group_printed = true;
 401             }
 402         }
 403       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
 404         putchar (delimiter);
 405     }
 406   else
 407     {
 408       char *prevfield;
 409       size_t prevlen;
 410       uintmax_t match_count = 0;
 411       bool first_delimiter = true;
 412
 413       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 414         goto closefiles;
 415       prevfield = find_field (prevline);
 416       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 417
 418       while (!feof (stdin))
 419         {
 420           bool match;
 421           char *thisfield;
 422           size_t thislen;
 423           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 424             {
 425               if (ferror (stdin))
 426                 goto closefiles;
 427               break;
 428             }
 429           thisfield = find_field (thisline);
 430           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 431           match = !different (thisfield, prevfield, thislen, prevlen);
 432           match_count += match;
 433
 434           if (match_count == UINTMAX_MAX)
 435             {
 436               if (count_occurrences)
 437                 die (EXIT_FAILURE, 0, _("too many repeated lines"));
 438               match_count--;
 439             }
 440
 441           if (delimit_groups != DM_NONE)
 442             {
 443               if (!match)
 444                 {
 445                   if (match_count) /* a previous match */
 446                     first_delimiter = false; /* Only used when DM_SEPARATE */
 447                 }
 448               else if (match_count == 1)
 449                 {
 450                   if ((delimit_groups == DM_PREPEND)
 451                       || (delimit_groups == DM_SEPARATE
 452                           && !first_delimiter))
 453                     putchar (delimiter);
 454                 }
 455             }
 456
 457           if (!match || output_later_repeated)
 458             {
 459               writeline (prevline, match, match_count);
 460               SWAP_LINES (prevline, thisline);
 461               prevfield = thisfield;
 462               prevlen = thislen;
 463               if (!match)
 464                 match_count = 0;
 465             }
 466         }
 467
 468       writeline (prevline, false, match_count);
 469     }
 470
 471  closefiles:
 472   if (ferror (stdin) || fclose (stdin) != 0)
 473     die (EXIT_FAILURE, 0, _("error reading %s"), quoteaf (infile));
 474
 475   /* stdout is handled via the atexit-invoked close_stdout function.  */
 476
 477   free (lb1.buffer);
 478   free (lb2.buffer);
 479 }
 480
 481 enum Skip_field_option_type
 482   {
 483     SFO_NONE,
 484     SFO_OBSOLETE,
 485     SFO_NEW
 486   };
 487
 488 int
 489 main (int argc, char **argv)
 490 {
 491   int optc = 0;
 492   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 493   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 494   unsigned int nfiles = 0;
 495   char const *file[2];
 496   char delimiter = '\n';        /* change with --zero-terminated, -z */
 497   bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
 498
 499   file[0] = file[1] = "-";
 500   initialize_main (&argc, &argv);
 501   set_program_name (argv[0]);
 502   setlocale (LC_ALL, "");
 503   bindtextdomain (PACKAGE, LOCALEDIR);
 504   textdomain (PACKAGE);
 505   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 506
 507   atexit (close_stdout);
 508
 509   skip_chars = 0;
 510   skip_fields = 0;
 511   check_chars = SIZE_MAX;
 512   output_unique = output_first_repeated = true;
 513   output_later_repeated = false;
 514   countmode = count_none;
 515   delimit_groups = DM_NONE;
 516
 517   while (true)
 518     {
 519       /* Parse an operand with leading "+" as a file after "--" was
 520          seen; or if pedantic and a file was seen; or if not
 521          obsolete.  */
 522
 523       if (optc == -1
 524           || (posixly_correct && nfiles != 0)
 525           || ((optc = getopt_long (argc, argv,
 526                                    "-0123456789Dcdf:is:uw:z", longopts, NULL))
 527               == -1))
 528         {
 529           if (argc <= optind)
 530             break;
 531           if (nfiles == 2)
 532             {
 533               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 534               usage (EXIT_FAILURE);
 535             }
 536           file[nfiles++] = argv[optind++];
 537         }
 538       else switch (optc)
 539         {
 540         case 1:
 541           {
 542             unsigned long int size;
 543             if (optarg[0] == '+'
 544                 && ! strict_posix2 ()
 545                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 546                 && size <= SIZE_MAX)
 547               skip_chars = size;
 548             else if (nfiles == 2)
 549               {
 550                 error (0, 0, _("extra operand %s"), quote (optarg));
 551                 usage (EXIT_FAILURE);
 552               }
 553             else
 554               file[nfiles++] = optarg;
 555           }
 556           break;
 557
 558         case '0':
 559         case '1':
 560         case '2':
 561         case '3':
 562         case '4':
 563         case '5':
 564         case '6':
 565         case '7':
 566         case '8':
 567         case '9':
 568           {
 569             if (skip_field_option_type == SFO_NEW)
 570               skip_fields = 0;
 571
 572             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 573               skip_fields = SIZE_MAX;
 574
 575             skip_field_option_type = SFO_OBSOLETE;
 576           }
 577           break;
 578
 579         case 'c':
 580           countmode = count_occurrences;
 581           output_option_used = true;
 582           break;
 583
 584         case 'd':
 585           output_unique = false;
 586           output_option_used = true;
 587           break;
 588
 589         case 'D':
 590           output_unique = false;
 591           output_later_repeated = true;
 592           if (optarg == NULL)
 593             delimit_groups = DM_NONE;
 594           else
 595             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 596                                         delimit_method_string,
 597                                         delimit_method_map);
 598           output_option_used = true;
 599           break;
 600
 601         case GROUP_OPTION:
 602           if (optarg == NULL)
 603             grouping = GM_SEPARATE;
 604           else
 605             grouping = XARGMATCH ("--group", optarg,
 606                                   grouping_method_string,
 607                                   grouping_method_map);
 608           break;
 609
 610         case 'f':
 611           skip_field_option_type = SFO_NEW;
 612           skip_fields = size_opt (optarg,
 613                                   N_("invalid number of fields to skip"));
 614           break;
 615
 616         case 'i':
 617           ignore_case = true;
 618           break;
 619
 620         case 's':
 621           skip_chars = size_opt (optarg,
 622                                  N_("invalid number of bytes to skip"));
 623           break;
 624
 625         case 'u':
 626           output_first_repeated = false;
 627           output_option_used = true;
 628           break;
 629
 630         case 'w':
 631           check_chars = size_opt (optarg,
 632                                   N_("invalid number of bytes to compare"));
 633           break;
 634
 635         case 'z':
 636           delimiter = '\0';
 637           break;
 638
 639         case_GETOPT_HELP_CHAR;
 640
 641         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 642
 643         default:
 644           usage (EXIT_FAILURE);
 645         }
 646     }
 647
 648   /* Note we could allow --group with -D at least, and that would
 649      avoid the need to specify a grouping method to --all-repeated.
 650      It was thought best to avoid deprecating those parameters though
 651      and keep --group separate to other options.  */
 652   if (grouping != GM_NONE && output_option_used)
 653     {
 654       error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
 655       usage (EXIT_FAILURE);
 656     }
 657
 658   if (grouping != GM_NONE && countmode != count_none)
 659     {
 660       error (0, 0,
 661            _("grouping and printing repeat counts is meaningless"));
 662       usage (EXIT_FAILURE);
 663     }
 664
 665   if (countmode == count_occurrences && output_later_repeated)
 666     {
 667       error (0, 0,
 668            _("printing all duplicated lines and repeat counts is meaningless"));
 669       usage (EXIT_FAILURE);
 670     }
 671
 672   check_file (file[0], file[1], delimiter);
 673
 674   return EXIT_SUCCESS;
 675 }