src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986-2023 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "fadvise.h"
  28 #include "posixver.h"
  29 #include "stdio--.h"
  30 #include "xstrtol.h"
  31 #include "memcasecmp.h"
  32 #include "quote.h"
  33
  34 /* The official name of this program (e.g., no 'g' prefix).  */
  35 #define PROGRAM_NAME "uniq"
  36
  37 #define AUTHORS \
  38   proper_name ("Richard M. Stallman"), \
  39   proper_name ("David MacKenzie")
  40
  41 #define SWAP_LINES(A, B)                        \
  42   do                                            \
  43     {                                           \
  44       struct linebuffer *_tmp;                  \
  45       _tmp = (A);                               \
  46       (A) = (B);                                \
  47       (B) = _tmp;                               \
  48     }                                           \
  49   while (0)
  50
  51 /* Number of fields to skip on each line when doing comparisons. */
  52 static size_t skip_fields;
  53
  54 /* Number of chars to skip after skipping any fields. */
  55 static size_t skip_chars;
  56
  57 /* Number of chars to compare. */
  58 static size_t check_chars;
  59
  60 enum countmode
  61 {
  62   count_occurrences,            /* -c Print count before output lines. */
  63   count_none                    /* Default.  Do not print counts. */
  64 };
  65
  66 /* Whether and how to precede the output lines with a count of the number of
  67    times they occurred in the input. */
  68 static enum countmode countmode;
  69
  70 /* Which lines to output: unique lines, the first of a group of
  71    repeated lines, and the second and subsequented of a group of
  72    repeated lines.  */
  73 static bool output_unique;
  74 static bool output_first_repeated;
  75 static bool output_later_repeated;
  76
  77 /* If true, ignore case when comparing.  */
  78 static bool ignore_case;
  79
  80 enum delimit_method
  81 {
  82   /* No delimiters output.  --all-repeated[=none] */
  83   DM_NONE,
  84
  85   /* Delimiter precedes all groups.  --all-repeated=prepend */
  86   DM_PREPEND,
  87
  88   /* Delimit all groups.  --all-repeated=separate */
  89   DM_SEPARATE
  90 };
  91
  92 static char const *const delimit_method_string[] =
  93 {
  94   "none", "prepend", "separate", nullptr
  95 };
  96
  97 static enum delimit_method const delimit_method_map[] =
  98 {
  99   DM_NONE, DM_PREPEND, DM_SEPARATE
 100 };
 101
 102 /* Select whether/how to delimit groups of duplicate lines.  */
 103 static enum delimit_method delimit_groups;
 104
 105 enum grouping_method
 106 {
 107   /* No grouping, when "--group" isn't used */
 108   GM_NONE,
 109
 110   /* Delimiter precedes all groups.  --group=prepend */
 111   GM_PREPEND,
 112
 113   /* Delimiter follows all groups.   --group=append */
 114   GM_APPEND,
 115
 116   /* Delimiter between groups.    --group[=separate] */
 117   GM_SEPARATE,
 118
 119   /* Delimiter before and after each group. --group=both */
 120   GM_BOTH
 121 };
 122
 123 static char const *const grouping_method_string[] =
 124 {
 125   "prepend", "append", "separate", "both", nullptr
 126 };
 127
 128 static enum grouping_method const grouping_method_map[] =
 129 {
 130   GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
 131 };
 132
 133 static enum grouping_method grouping = GM_NONE;
 134
 135 enum
 136 {
 137   GROUP_OPTION = CHAR_MAX + 1
 138 };
 139
 140 static struct option const longopts[] =
 141 {
 142   {"count", no_argument, nullptr, 'c'},
 143   {"repeated", no_argument, nullptr, 'd'},
 144   {"all-repeated", optional_argument, nullptr, 'D'},
 145   {"group", optional_argument, nullptr, GROUP_OPTION},
 146   {"ignore-case", no_argument, nullptr, 'i'},
 147   {"unique", no_argument, nullptr, 'u'},
 148   {"skip-fields", required_argument, nullptr, 'f'},
 149   {"skip-chars", required_argument, nullptr, 's'},
 150   {"check-chars", required_argument, nullptr, 'w'},
 151   {"zero-terminated", no_argument, nullptr, 'z'},
 152   {GETOPT_HELP_OPTION_DECL},
 153   {GETOPT_VERSION_OPTION_DECL},
 154   {nullptr, 0, nullptr, 0}
 155 };
 156
 157 void
 158 usage (int status)
 159 {
 160   if (status != EXIT_SUCCESS)
 161     emit_try_help ();
 162   else
 163     {
 164       printf (_("\
 165 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 166 "),
 167               program_name);
 168       fputs (_("\
 169 Filter adjacent matching lines from INPUT (or standard input),\n\
 170 writing to OUTPUT (or standard output).\n\
 171 \n\
 172 With no options, matching lines are merged to the first occurrence.\n\
 173 "), stdout);
 174
 175       emit_mandatory_arg_note ();
 176
 177      fputs (_("\
 178   -c, --count           prefix lines by the number of occurrences\n\
 179   -d, --repeated        only print duplicate lines, one for each group\n\
 180 "), stdout);
 181      fputs (_("\
 182   -D                    print all duplicate lines\n\
 183       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
 184                                  with an empty line;\n\
 185                                  METHOD={none(default),prepend,separate}\n\
 186 "), stdout);
 187      fputs (_("\
 188   -f, --skip-fields=N   avoid comparing the first N fields\n\
 189 "), stdout);
 190      fputs (_("\
 191       --group[=METHOD]  show all items, separating groups with an empty line;\n\
 192                           METHOD={separate(default),prepend,append,both}\n\
 193 "), stdout);
 194      fputs (_("\
 195   -i, --ignore-case     ignore differences in case when comparing\n\
 196   -s, --skip-chars=N    avoid comparing the first N characters\n\
 197   -u, --unique          only print unique lines\n\
 198 "), stdout);
 199       fputs (_("\
 200   -z, --zero-terminated     line delimiter is NUL, not newline\n\
 201 "), stdout);
 202      fputs (_("\
 203   -w, --check-chars=N   compare no more than N characters in lines\n\
 204 "), stdout);
 205      fputs (HELP_OPTION_DESCRIPTION, stdout);
 206      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 207      fputs (_("\
 208 \n\
 209 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 210 characters.  Fields are skipped before chars.\n\
 211 "), stdout);
 212      fputs (_("\
 213 \n\
 214 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 215 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
 216 "), stdout);
 217       emit_ancillary_info (PROGRAM_NAME);
 218     }
 219   exit (status);
 220 }
 221
 222 static bool
 223 strict_posix2 (void)
 224 {
 225   int posix_ver = posix2_version ();
 226   return 200112 <= posix_ver && posix_ver < 200809;
 227 }
 228
 229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 230    invalid.  Silently convert too-large values to SIZE_MAX.  */
 231
 232 static size_t
 233 size_opt (char const *opt, char const *msgid)
 234 {
 235   uintmax_t size;
 236
 237   switch (xstrtoumax (opt, nullptr, 10, &size, ""))
 238     {
 239     case LONGINT_OK:
 240     case LONGINT_OVERFLOW:
 241       break;
 242
 243     default:
 244       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 245     }
 246
 247   return MIN (size, SIZE_MAX);
 248 }
 249
 250 /* Given a linebuffer LINE,
 251    return a pointer to the beginning of the line's field to be compared. */
 252
 253 ATTRIBUTE_PURE
 254 static char *
 255 find_field (struct linebuffer const *line)
 256 {
 257   size_t count;
 258   char const *lp = line->buffer;
 259   size_t size = line->length - 1;
 260   size_t i = 0;
 261
 262   for (count = 0; count < skip_fields && i < size; count++)
 263     {
 264       while (i < size && field_sep (lp[i]))
 265         i++;
 266       while (i < size && !field_sep (lp[i]))
 267         i++;
 268     }
 269
 270   i += MIN (skip_chars, size - i);
 271
 272   return line->buffer + i;
 273 }
 274
 275 /* Return false if two strings OLD and NEW match, true if not.
 276    OLD and NEW point not to the beginnings of the lines
 277    but rather to the beginnings of the fields to compare.
 278    OLDLEN and NEWLEN are their lengths. */
 279
 280 static bool
 281 different (char *old, char *new, size_t oldlen, size_t newlen)
 282 {
 283   if (check_chars < oldlen)
 284     oldlen = check_chars;
 285   if (check_chars < newlen)
 286     newlen = check_chars;
 287
 288   if (ignore_case)
 289     return oldlen != newlen || memcasecmp (old, new, oldlen);
 290   else
 291     return oldlen != newlen || memcmp (old, new, oldlen);
 292 }
 293
 294 /* Output the line in linebuffer LINE to standard output
 295    provided that the switches say it should be output.
 296    MATCH is true if the line matches the previous line.
 297    If requested, print the number of times it occurred, as well;
 298    LINECOUNT + 1 is the number of times that the line occurred. */
 299
 300 static void
 301 writeline (struct linebuffer const *line,
 302            bool match, uintmax_t linecount)
 303 {
 304   if (! (linecount == 0 ? output_unique
 305          : !match ? output_first_repeated
 306          : output_later_repeated))
 307     return;
 308
 309   if (countmode == count_occurrences)
 310     printf ("%7" PRIuMAX " ", linecount + 1);
 311
 312   if (fwrite (line->buffer, sizeof (char), line->length, stdout)
 313       != line->length)
 314     write_error ();
 315 }
 316
 317 /* Process input file INFILE with output to OUTFILE.
 318    If either is "-", use the standard I/O stream for it instead. */
 319
 320 static void
 321 check_file (char const *infile, char const *outfile, char delimiter)
 322 {
 323   struct linebuffer lb1, lb2;
 324   struct linebuffer *thisline, *prevline;
 325
 326   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 327     error (EXIT_FAILURE, errno, "%s", quotef (infile));
 328   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 329     error (EXIT_FAILURE, errno, "%s", quotef (outfile));
 330
 331   fadvise (stdin, FADVISE_SEQUENTIAL);
 332
 333   thisline = &lb1;
 334   prevline = &lb2;
 335
 336   initbuffer (thisline);
 337   initbuffer (prevline);
 338
 339   /* The duplication in the following 'if' and 'else' blocks is an
 340      optimization to distinguish between when we can print input
 341      lines immediately (1. & 2.) or not.
 342
 343      1. --group => all input lines are printed.
 344         checking for unique/duplicated lines is used only for printing
 345         group separators.
 346
 347      2. The default case in which none of these options has been specified:
 348           --count, --repeated,  --all-repeated, --unique
 349         In the default case, this optimization lets uniq output each different
 350         line right away, without waiting to see if the next one is different.
 351
 352      3. All other cases.
 353   */
 354   if (output_unique && output_first_repeated && countmode == count_none)
 355     {
 356       char *prevfield = nullptr;
 357       size_t prevlen;
 358       bool first_group_printed = false;
 359
 360       while (!feof (stdin))
 361         {
 362           char *thisfield;
 363           size_t thislen;
 364           bool new_group;
 365
 366           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 367             break;
 368
 369           thisfield = find_field (thisline);
 370           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 371
 372           new_group = (!prevfield
 373                        || different (thisfield, prevfield, thislen, prevlen));
 374
 375           if (new_group && grouping != GM_NONE
 376               && (grouping == GM_PREPEND || grouping == GM_BOTH
 377                   || (first_group_printed && (grouping == GM_APPEND
 378                                               || grouping == GM_SEPARATE))))
 379             putchar (delimiter);
 380
 381           if (new_group || grouping != GM_NONE)
 382             {
 383               if (fwrite (thisline->buffer, sizeof (char), thisline->length,
 384                   stdout) != thisline->length)
 385                 write_error ();
 386
 387               SWAP_LINES (prevline, thisline);
 388               prevfield = thisfield;
 389               prevlen = thislen;
 390               first_group_printed = true;
 391             }
 392         }
 393       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
 394         putchar (delimiter);
 395     }
 396   else
 397     {
 398       char *prevfield;
 399       size_t prevlen;
 400       uintmax_t match_count = 0;
 401       bool first_delimiter = true;
 402
 403       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 404         goto closefiles;
 405       prevfield = find_field (prevline);
 406       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 407
 408       while (!feof (stdin))
 409         {
 410           bool match;
 411           char *thisfield;
 412           size_t thislen;
 413           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 414             {
 415               if (ferror (stdin))
 416                 goto closefiles;
 417               break;
 418             }
 419           thisfield = find_field (thisline);
 420           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 421           match = !different (thisfield, prevfield, thislen, prevlen);
 422           match_count += match;
 423
 424           if (match_count == UINTMAX_MAX)
 425             {
 426               if (count_occurrences)
 427                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 428               match_count--;
 429             }
 430
 431           if (delimit_groups != DM_NONE)
 432             {
 433               if (!match)
 434                 {
 435                   if (match_count) /* a previous match */
 436                     first_delimiter = false; /* Only used when DM_SEPARATE */
 437                 }
 438               else if (match_count == 1)
 439                 {
 440                   if ((delimit_groups == DM_PREPEND)
 441                       || (delimit_groups == DM_SEPARATE
 442                           && !first_delimiter))
 443                     putchar (delimiter);
 444                 }
 445             }
 446
 447           if (!match || output_later_repeated)
 448             {
 449               writeline (prevline, match, match_count);
 450               SWAP_LINES (prevline, thisline);
 451               prevfield = thisfield;
 452               prevlen = thislen;
 453               if (!match)
 454                 match_count = 0;
 455             }
 456         }
 457
 458       writeline (prevline, false, match_count);
 459     }
 460
 461  closefiles:
 462   if (ferror (stdin) || fclose (stdin) != 0)
 463     error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
 464
 465   /* stdout is handled via the atexit-invoked close_stdout function.  */
 466
 467   free (lb1.buffer);
 468   free (lb2.buffer);
 469 }
 470
 471 enum Skip_field_option_type
 472   {
 473     SFO_NONE,
 474     SFO_OBSOLETE,
 475     SFO_NEW
 476   };
 477
 478 int
 479 main (int argc, char **argv)
 480 {
 481   int optc = 0;
 482   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
 483   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 484   unsigned int nfiles = 0;
 485   char const *file[2];
 486   char delimiter = '\n';        /* change with --zero-terminated, -z */
 487   bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
 488
 489   file[0] = file[1] = "-";
 490   initialize_main (&argc, &argv);
 491   set_program_name (argv[0]);
 492   setlocale (LC_ALL, "");
 493   bindtextdomain (PACKAGE, LOCALEDIR);
 494   textdomain (PACKAGE);
 495
 496   atexit (close_stdout);
 497
 498   skip_chars = 0;
 499   skip_fields = 0;
 500   check_chars = SIZE_MAX;
 501   output_unique = output_first_repeated = true;
 502   output_later_repeated = false;
 503   countmode = count_none;
 504   delimit_groups = DM_NONE;
 505
 506   while (true)
 507     {
 508       /* Parse an operand with leading "+" as a file after "--" was
 509          seen; or if pedantic and a file was seen; or if not
 510          obsolete.  */
 511
 512       if (optc == -1
 513           || (posixly_correct && nfiles != 0)
 514           || ((optc = getopt_long (argc, argv,
 515                                    "-0123456789Dcdf:is:uw:z",
 516                                    longopts, nullptr))
 517               == -1))
 518         {
 519           if (argc <= optind)
 520             break;
 521           if (nfiles == 2)
 522             {
 523               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 524               usage (EXIT_FAILURE);
 525             }
 526           file[nfiles++] = argv[optind++];
 527         }
 528       else switch (optc)
 529         {
 530         case 1:
 531           {
 532             uintmax_t size;
 533             if (optarg[0] == '+'
 534                 && ! strict_posix2 ()
 535                 && xstrtoumax (optarg, nullptr, 10, &size, "") == LONGINT_OK
 536                 && size <= SIZE_MAX)
 537               skip_chars = size;
 538             else if (nfiles == 2)
 539               {
 540                 error (0, 0, _("extra operand %s"), quote (optarg));
 541                 usage (EXIT_FAILURE);
 542               }
 543             else
 544               file[nfiles++] = optarg;
 545           }
 546           break;
 547
 548         case '0':
 549         case '1':
 550         case '2':
 551         case '3':
 552         case '4':
 553         case '5':
 554         case '6':
 555         case '7':
 556         case '8':
 557         case '9':
 558           {
 559             if (skip_field_option_type == SFO_NEW)
 560               skip_fields = 0;
 561
 562             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 563               skip_fields = SIZE_MAX;
 564
 565             skip_field_option_type = SFO_OBSOLETE;
 566           }
 567           break;
 568
 569         case 'c':
 570           countmode = count_occurrences;
 571           output_option_used = true;
 572           break;
 573
 574         case 'd':
 575           output_unique = false;
 576           output_option_used = true;
 577           break;
 578
 579         case 'D':
 580           output_unique = false;
 581           output_later_repeated = true;
 582           if (optarg == nullptr)
 583             delimit_groups = DM_NONE;
 584           else
 585             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 586                                         delimit_method_string,
 587                                         delimit_method_map);
 588           output_option_used = true;
 589           break;
 590
 591         case GROUP_OPTION:
 592           if (optarg == nullptr)
 593             grouping = GM_SEPARATE;
 594           else
 595             grouping = XARGMATCH ("--group", optarg,
 596                                   grouping_method_string,
 597                                   grouping_method_map);
 598           break;
 599
 600         case 'f':
 601           skip_field_option_type = SFO_NEW;
 602           skip_fields = size_opt (optarg,
 603                                   N_("invalid number of fields to skip"));
 604           break;
 605
 606         case 'i':
 607           ignore_case = true;
 608           break;
 609
 610         case 's':
 611           skip_chars = size_opt (optarg,
 612                                  N_("invalid number of bytes to skip"));
 613           break;
 614
 615         case 'u':
 616           output_first_repeated = false;
 617           output_option_used = true;
 618           break;
 619
 620         case 'w':
 621           check_chars = size_opt (optarg,
 622                                   N_("invalid number of bytes to compare"));
 623           break;
 624
 625         case 'z':
 626           delimiter = '\0';
 627           break;
 628
 629         case_GETOPT_HELP_CHAR;
 630
 631         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 632
 633         default:
 634           usage (EXIT_FAILURE);
 635         }
 636     }
 637
 638   /* Note we could allow --group with -D at least, and that would
 639      avoid the need to specify a grouping method to --all-repeated.
 640      It was thought best to avoid deprecating those parameters though
 641      and keep --group separate to other options.  */
 642   if (grouping != GM_NONE && output_option_used)
 643     {
 644       error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
 645       usage (EXIT_FAILURE);
 646     }
 647
 648   if (grouping != GM_NONE && countmode != count_none)
 649     {
 650       error (0, 0,
 651            _("grouping and printing repeat counts is meaningless"));
 652       usage (EXIT_FAILURE);
 653     }
 654
 655   if (countmode == count_occurrences && output_later_repeated)
 656     {
 657       error (0, 0,
 658            _("printing all duplicated lines and repeat counts is meaningless"));
 659       usage (EXIT_FAILURE);
 660     }
 661
 662   check_file (file[0], file[1], delimiter);
 663
 664   return EXIT_SUCCESS;
 665 }