src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986-2023 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "fadvise.h"
  28 #include "posixver.h"
  29 #include "stdio--.h"
  30 #include "xstrtol.h"
  31 #include "memcasecmp.h"
  32 #include "quote.h"
  33
/* The official name of this program (e.g., no 'g' prefix).
   Used for the --version output and by emit_ancillary_info in usage.  */
#define PROGRAM_NAME "uniq"

/* Comma-separated author list, expanded as the trailing arguments of
   case_GETOPT_VERSION_CHAR in main.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
  40
  41 #define SWAP_LINES(A, B)                        \
  42   do                                            \
  43     {                                           \
  44       struct linebuffer *_tmp;                  \
  45       _tmp = (A);                               \
  46       (A) = (B);                                \
  47       (B) = _tmp;                               \
  48     }                                           \
  49   while (0)
  50
/* Number of fields to skip on each line when doing comparisons.
   Set from -f/--skip-fields or the obsolete -N digit options.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set from -s/--skip-chars or the obsolete +N operand.  */
static size_t skip_chars;

/* Number of chars to compare.  main initializes this to SIZE_MAX,
   i.e. no limit, unless -w/--check-chars is given.  */
static size_t check_chars;

/* Note: count_occurrences is the first enumerator and so has the
   value 0.  Always compare the COUNTMODE variable against these
   constants; never test an enumerator itself for truth.  */
enum countmode
{
  count_occurrences,            /* -c Print count before output lines. */
  count_none                    /* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input. */
static enum countmode countmode;

/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent of a group of
   repeated lines.  writeline consults exactly one of these flags for
   each candidate line.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  Set by -i/--ignore-case.  */
static bool ignore_case;
  79
/* How groups of duplicate lines are delimited for -D/--all-repeated.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimit all groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* METHOD keywords accepted by --all-repeated=METHOD, terminated by a
   null pointer.  Kept in the same order as delimit_method_map; both
   tables are handed to XARGMATCH in main.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", nullptr
};

/* Values corresponding, entry by entry, to delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
 104
/* Where group delimiters are written when --group is in effect.  */
enum grouping_method
{
  /* No grouping, when "--group" isn't used */
  GM_NONE,

  /* Delimiter precedes all groups.  --group=prepend */
  GM_PREPEND,

  /* Delimiter follows all groups.   --group=append */
  GM_APPEND,

  /* Delimiter between groups.    --group[=separate] */
  GM_SEPARATE,

  /* Delimiter before and after each group. --group=both */
  GM_BOTH
};

/* METHOD keywords accepted by --group=METHOD, terminated by a null
   pointer.  GM_NONE deliberately has no keyword.  Kept in the same
   order as grouping_method_map; both tables are handed to XARGMATCH
   in main.  */
static char const *const grouping_method_string[] =
{
  "prepend", "append", "separate", "both", nullptr
};

/* Values corresponding, entry by entry, to grouping_method_string.  */
static enum grouping_method const grouping_method_map[] =
{
  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
};

/* The grouping method in effect; GM_NONE unless --group was given.  */
static enum grouping_method grouping = GM_NONE;
 134
/* Option codes for long options that have no single-character
   equivalent.  Values start above CHAR_MAX so they cannot collide
   with any short option character.  */
enum
{
  GROUP_OPTION = CHAR_MAX + 1
};

/* Long options understood by getopt_long in main.  Each entry's last
   field is the code that main's switch dispatches on.  */
static struct option const longopts[] =
{
  {"count", no_argument, nullptr, 'c'},
  {"repeated", no_argument, nullptr, 'd'},
  {"all-repeated", optional_argument, nullptr, 'D'},
  {"group", optional_argument, nullptr, GROUP_OPTION},
  {"ignore-case", no_argument, nullptr, 'i'},
  {"unique", no_argument, nullptr, 'u'},
  {"skip-fields", required_argument, nullptr, 'f'},
  {"skip-chars", required_argument, nullptr, 's'},
  {"check-chars", required_argument, nullptr, 'w'},
  {"zero-terminated", no_argument, nullptr, 'z'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {nullptr, 0, nullptr, 0}      /* Terminating all-zero entry.  */
};
 156
 157 void
 158 usage (int status)
 159 {
 160   if (status != EXIT_SUCCESS)
 161     emit_try_help ();
 162   else
 163     {
 164       printf (_("\
 165 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 166 "),
 167               program_name);
 168       fputs (_("\
 169 Filter adjacent matching lines from INPUT (or standard input),\n\
 170 writing to OUTPUT (or standard output).\n\
 171 \n\
 172 With no options, matching lines are merged to the first occurrence.\n\
 173 "), stdout);
 174
 175       emit_mandatory_arg_note ();
 176
 177      fputs (_("\
 178   -c, --count           prefix lines by the number of occurrences\n\
 179   -d, --repeated        only print duplicate lines, one for each group\n\
 180 "), stdout);
 181      fputs (_("\
 182   -D                    print all duplicate lines\n\
 183       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
 184                                  with an empty line;\n\
 185                                  METHOD={none(default),prepend,separate}\n\
 186 "), stdout);
 187      fputs (_("\
 188   -f, --skip-fields=N   avoid comparing the first N fields\n\
 189 "), stdout);
 190      fputs (_("\
 191       --group[=METHOD]  show all items, separating groups with an empty line;\n\
 192                           METHOD={separate(default),prepend,append,both}\n\
 193 "), stdout);
 194      fputs (_("\
 195   -i, --ignore-case     ignore differences in case when comparing\n\
 196   -s, --skip-chars=N    avoid comparing the first N characters\n\
 197   -u, --unique          only print unique lines\n\
 198 "), stdout);
 199       fputs (_("\
 200   -z, --zero-terminated     line delimiter is NUL, not newline\n\
 201 "), stdout);
 202      fputs (_("\
 203   -w, --check-chars=N   compare no more than N characters in lines\n\
 204 "), stdout);
 205      fputs (HELP_OPTION_DESCRIPTION, stdout);
 206      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 207      fputs (_("\
 208 \n\
 209 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 210 characters.  Fields are skipped before chars.\n\
 211 "), stdout);
 212      fputs (_("\
 213 \n\
 214 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 215 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
 216 "), stdout);
 217       emit_ancillary_info (PROGRAM_NAME);
 218     }
 219   exit (status);
 220 }
 221
 222 static bool
 223 strict_posix2 (void)
 224 {
 225   int posix_ver = posix2_version ();
 226   return 200112 <= posix_ver && posix_ver < 200809;
 227 }
 228
 229 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 230    invalid.  Silently convert too-large values to SIZE_MAX.  */
 231
 232 static size_t
 233 size_opt (char const *opt, char const *msgid)
 234 {
 235   uintmax_t size;
 236
 237   switch (xstrtoumax (opt, nullptr, 10, &size, ""))
 238     {
 239     case LONGINT_OK:
 240     case LONGINT_OVERFLOW:
 241       break;
 242
 243     default:
 244       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 245     }
 246
 247   return MIN (size, SIZE_MAX);
 248 }
 249
 250 /* Given a linebuffer LINE,
 251    return a pointer to the beginning of the line's field to be compared. */
 252
 253 ATTRIBUTE_PURE
 254 static char *
 255 find_field (struct linebuffer const *line)
 256 {
 257   size_t count;
 258   char const *lp = line->buffer;
 259   size_t size = line->length - 1;
 260   size_t i = 0;
 261
 262   for (count = 0; count < skip_fields && i < size; count++)
 263     {
 264       while (i < size && field_sep (lp[i]))
 265         i++;
 266       while (i < size && !field_sep (lp[i]))
 267         i++;
 268     }
 269
 270   i += MIN (skip_chars, size - i);
 271
 272   return line->buffer + i;
 273 }
 274
 275 /* Return false if two strings OLD and NEW match, true if not.
 276    OLD and NEW point not to the beginnings of the lines
 277    but rather to the beginnings of the fields to compare.
 278    OLDLEN and NEWLEN are their lengths. */
 279
 280 static bool
 281 different (char *old, char *new, size_t oldlen, size_t newlen)
 282 {
 283   if (check_chars < oldlen)
 284     oldlen = check_chars;
 285   if (check_chars < newlen)
 286     newlen = check_chars;
 287
 288   if (ignore_case)
 289     return oldlen != newlen || memcasecmp (old, new, oldlen);
 290   else
 291     return oldlen != newlen || memcmp (old, new, oldlen);
 292 }
 293
 294 /* Output the line in linebuffer LINE to standard output
 295    provided that the switches say it should be output.
 296    MATCH is true if the line matches the previous line.
 297    If requested, print the number of times it occurred, as well;
 298    LINECOUNT + 1 is the number of times that the line occurred. */
 299
 300 static void
 301 writeline (struct linebuffer const *line,
 302            bool match, uintmax_t linecount)
 303 {
 304   if (! (linecount == 0 ? output_unique
 305          : !match ? output_first_repeated
 306          : output_later_repeated))
 307     return;
 308
 309   if (countmode == count_occurrences)
 310     printf ("%7" PRIuMAX " ", linecount + 1);
 311
 312   fwrite (line->buffer, sizeof (char), line->length, stdout);
 313 }
 314
 315 /* Process input file INFILE with output to OUTFILE.
 316    If either is "-", use the standard I/O stream for it instead. */
 317
 318 static void
 319 check_file (char const *infile, char const *outfile, char delimiter)
 320 {
 321   struct linebuffer lb1, lb2;
 322   struct linebuffer *thisline, *prevline;
 323
 324   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 325     error (EXIT_FAILURE, errno, "%s", quotef (infile));
 326   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 327     error (EXIT_FAILURE, errno, "%s", quotef (outfile));
 328
 329   fadvise (stdin, FADVISE_SEQUENTIAL);
 330
 331   thisline = &lb1;
 332   prevline = &lb2;
 333
 334   initbuffer (thisline);
 335   initbuffer (prevline);
 336
 337   /* The duplication in the following 'if' and 'else' blocks is an
 338      optimization to distinguish between when we can print input
 339      lines immediately (1. & 2.) or not.
 340
 341      1. --group => all input lines are printed.
 342         checking for unique/duplicated lines is used only for printing
 343         group separators.
 344
 345      2. The default case in which none of these options has been specified:
 346           --count, --repeated,  --all-repeated, --unique
 347         In the default case, this optimization lets uniq output each different
 348         line right away, without waiting to see if the next one is different.
 349
 350      3. All other cases.
 351   */
 352   if (output_unique && output_first_repeated && countmode == count_none)
 353     {
 354       char *prevfield = nullptr;
 355       size_t prevlen;
 356       bool first_group_printed = false;
 357
 358       while (!feof (stdin))
 359         {
 360           char *thisfield;
 361           size_t thislen;
 362           bool new_group;
 363
 364           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 365             break;
 366
 367           thisfield = find_field (thisline);
 368           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 369
 370           new_group = (!prevfield
 371                        || different (thisfield, prevfield, thislen, prevlen));
 372
 373           if (new_group && grouping != GM_NONE
 374               && (grouping == GM_PREPEND || grouping == GM_BOTH
 375                   || (first_group_printed && (grouping == GM_APPEND
 376                                               || grouping == GM_SEPARATE))))
 377             putchar (delimiter);
 378
 379           if (new_group || grouping != GM_NONE)
 380             {
 381               fwrite (thisline->buffer, sizeof (char),
 382                       thisline->length, stdout);
 383
 384               SWAP_LINES (prevline, thisline);
 385               prevfield = thisfield;
 386               prevlen = thislen;
 387               first_group_printed = true;
 388             }
 389         }
 390       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
 391         putchar (delimiter);
 392     }
 393   else
 394     {
 395       char *prevfield;
 396       size_t prevlen;
 397       uintmax_t match_count = 0;
 398       bool first_delimiter = true;
 399
 400       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 401         goto closefiles;
 402       prevfield = find_field (prevline);
 403       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 404
 405       while (!feof (stdin))
 406         {
 407           bool match;
 408           char *thisfield;
 409           size_t thislen;
 410           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 411             {
 412               if (ferror (stdin))
 413                 goto closefiles;
 414               break;
 415             }
 416           thisfield = find_field (thisline);
 417           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 418           match = !different (thisfield, prevfield, thislen, prevlen);
 419           match_count += match;
 420
 421           if (match_count == UINTMAX_MAX)
 422             {
 423               if (count_occurrences)
 424                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 425               match_count--;
 426             }
 427
 428           if (delimit_groups != DM_NONE)
 429             {
 430               if (!match)
 431                 {
 432                   if (match_count) /* a previous match */
 433                     first_delimiter = false; /* Only used when DM_SEPARATE */
 434                 }
 435               else if (match_count == 1)
 436                 {
 437                   if ((delimit_groups == DM_PREPEND)
 438                       || (delimit_groups == DM_SEPARATE
 439                           && !first_delimiter))
 440                     putchar (delimiter);
 441                 }
 442             }
 443
 444           if (!match || output_later_repeated)
 445             {
 446               writeline (prevline, match, match_count);
 447               SWAP_LINES (prevline, thisline);
 448               prevfield = thisfield;
 449               prevlen = thislen;
 450               if (!match)
 451                 match_count = 0;
 452             }
 453         }
 454
 455       writeline (prevline, false, match_count);
 456     }
 457
 458  closefiles:
 459   if (ferror (stdin) || fclose (stdin) != 0)
 460     error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
 461
 462   /* stdout is handled via the atexit-invoked close_stdout function.  */
 463
 464   free (lb1.buffer);
 465   free (lb2.buffer);
 466 }
 467
/* Records which syntax most recently set SKIP_FIELDS while main parses
   options, so that obsolete digit options start a fresh number after
   a -f option instead of appending digits to its value.  */
enum Skip_field_option_type
  {
    SFO_NONE,           /* No skip-fields option seen yet.  */
    SFO_OBSOLETE,       /* Last set by obsolete -N digit options.  */
    SFO_NEW             /* Last set by -f/--skip-fields.  */
  };
 474
/* Entry point: parse options and up to two file operands (INPUT and
   OUTPUT, each defaulting to "-"), reject incompatible option
   combinations, then filter the input with check_file.  */
int
main (int argc, char **argv)
{
  int optc = 0;
  bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
  enum Skip_field_option_type skip_field_option_type = SFO_NONE;
  unsigned int nfiles = 0;
  char const *file[2];
  char delimiter = '\n';        /* change with --zero-terminated, -z */
  bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */

  file[0] = file[1] = "-";
  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  /* Defaults: compare whole lines and print one copy of every group.  */
  skip_chars = 0;
  skip_fields = 0;
  check_chars = SIZE_MAX;
  output_unique = output_first_repeated = true;
  output_later_repeated = false;
  countmode = count_none;
  delimit_groups = DM_NONE;

  while (true)
    {
      /* Parse an operand with leading "+" as a file after "--" was
         seen; or if pedantic and a file was seen; or if not
         obsolete.  */

      /* Once getopt_long has returned -1, OPTC stays -1 and the
         short-circuit below means it is never called again; every
         remaining argument is then taken as a file operand.  */
      if (optc == -1
          || (posixly_correct && nfiles != 0)
          || ((optc = getopt_long (argc, argv,
                                   "-0123456789Dcdf:is:uw:z",
                                   longopts, nullptr))
              == -1))
        {
          if (argc <= optind)
            break;
          if (nfiles == 2)
            {
              error (0, 0, _("extra operand %s"), quote (argv[optind]));
              usage (EXIT_FAILURE);
            }
          file[nfiles++] = argv[optind++];
        }
      else switch (optc)
        {
        /* The leading '-' in the option string makes getopt_long
           return each non-option argument as option code 1.  */
        case 1:
          {
            uintmax_t size;
            /* Obsolete "+N" syntax for skipping N chars; anything else
               is a file operand.  */
            if (optarg[0] == '+'
                && ! strict_posix2 ()
                && xstrtoumax (optarg, nullptr, 10, &size, "") == LONGINT_OK
                && size <= SIZE_MAX)
              skip_chars = size;
            else if (nfiles == 2)
              {
                error (0, 0, _("extra operand %s"), quote (optarg));
                usage (EXIT_FAILURE);
              }
            else
              file[nfiles++] = optarg;
          }
          break;

        /* Obsolete "-N" syntax for skipping N fields; each digit
           arrives as its own option character.  */
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          {
            /* Digits following a -f option begin a new number.  */
            if (skip_field_option_type == SFO_NEW)
              skip_fields = 0;

            /* Saturate when the accumulate macro reports failure
               (presumably overflow of size_t -- confirm in system.h).  */
            if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
              skip_fields = SIZE_MAX;

            skip_field_option_type = SFO_OBSOLETE;
          }
          break;

        case 'c':
          countmode = count_occurrences;
          output_option_used = true;
          break;

        case 'd':
          output_unique = false;
          output_option_used = true;
          break;

        case 'D':
          output_unique = false;
          output_later_repeated = true;
          /* -D and bare --all-repeated carry no METHOD argument.  */
          if (optarg == nullptr)
            delimit_groups = DM_NONE;
          else
            delimit_groups = XARGMATCH ("--all-repeated", optarg,
                                        delimit_method_string,
                                        delimit_method_map);
          output_option_used = true;
          break;

        case GROUP_OPTION:
          /* Bare --group means --group=separate.  */
          if (optarg == nullptr)
            grouping = GM_SEPARATE;
          else
            grouping = XARGMATCH ("--group", optarg,
                                  grouping_method_string,
                                  grouping_method_map);
          break;

        case 'f':
          skip_field_option_type = SFO_NEW;
          skip_fields = size_opt (optarg,
                                  N_("invalid number of fields to skip"));
          break;

        case 'i':
          ignore_case = true;
          break;

        case 's':
          skip_chars = size_opt (optarg,
                                 N_("invalid number of bytes to skip"));
          break;

        case 'u':
          output_first_repeated = false;
          output_option_used = true;
          break;

        case 'w':
          check_chars = size_opt (optarg,
                                  N_("invalid number of bytes to compare"));
          break;

        case 'z':
          delimiter = '\0';
          break;

        case_GETOPT_HELP_CHAR;

        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

        default:
          usage (EXIT_FAILURE);
        }
    }

  /* Note we could allow --group with -D at least, and that would
     avoid the need to specify a grouping method to --all-repeated.
     It was thought best to avoid deprecating those parameters though
     and keep --group separate to other options.  */
  if (grouping != GM_NONE && output_option_used)
    {
      error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
      usage (EXIT_FAILURE);
    }

  /* NOTE(review): -c also sets output_option_used, so the check above
     already rejects --group combined with -c; this test looks
     unreachable.  */
  if (grouping != GM_NONE && countmode != count_none)
    {
      error (0, 0,
           _("grouping and printing repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }

  if (countmode == count_occurrences && output_later_repeated)
    {
      error (0, 0,
           _("printing all duplicated lines and repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }

  check_file (file[0], file[1], delimiter);

  return EXIT_SUCCESS;
}