src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986-2024 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "fadvise.h"
  28 #include "mcel.h"
  29 #include "posixver.h"
  30 #include "skipchars.h"
  31 #include "stdio--.h"
  32 #include "xstrtol.h"
  33 #include "memcasecmp.h"
  34 #include "quote.h"
  35
  36 /* The official name of this program (e.g., no 'g' prefix).  */
  37 #define PROGRAM_NAME "uniq"
  38
  39 #define AUTHORS \
  40   proper_name ("Richard M. Stallman"), \
  41   proper_name ("David MacKenzie")
  42
  43 static void
  44 swap_lines (struct linebuffer **a, struct linebuffer **b)
  45 {
  46   struct linebuffer *tmp = *a;
  47   *a = *b;
  48   *b = tmp;
  49 }
  50
  51 /* Number of fields to skip on each line when doing comparisons. */
  52 static idx_t skip_fields = false;
  53
  54 /* Number of chars to skip after skipping any fields. */
  55 static idx_t skip_chars = false;
  56
  57 /* Number of chars to compare. */
  58 static idx_t check_chars = IDX_MAX;
  59
  60 /* Whether to precede the output lines with a count of the number of
  61    times they occurred in the input. */
  62 static bool count_occurrences = false;
  63
  64 /* Which lines to output: unique lines, the first of a group of
  65    repeated lines, and the second and subsequent of a group of
  66    repeated lines.  */
  67 static bool output_unique = true;
  68 static bool output_first_repeated = true;
  69 static bool output_later_repeated = false;
  70
  71 /* If true, ignore case when comparing.  */
  72 static bool ignore_case;
  73
  74 enum delimit_method
  75 {
  76   /* No delimiters output.  --all-repeated[=none] */
  77   DM_NONE,
  78
  79   /* Delimiter precedes all groups.  --all-repeated=prepend */
  80   DM_PREPEND,
  81
  82   /* Delimit all groups.  --all-repeated=separate */
  83   DM_SEPARATE
  84 };
  85
  86 static char const *const delimit_method_string[] =
  87 {
  88   "none", "prepend", "separate", nullptr
  89 };
  90
  91 static enum delimit_method const delimit_method_map[] =
  92 {
  93   DM_NONE, DM_PREPEND, DM_SEPARATE
  94 };
  95
  96 /* Select whether/how to delimit groups of duplicate lines.  */
  97 static enum delimit_method delimit_groups = DM_NONE;
  98
  99 enum grouping_method
 100 {
 101   /* No grouping, when "--group" isn't used */
 102   GM_NONE,
 103
 104   /* Delimiter precedes all groups.  --group=prepend */
 105   GM_PREPEND,
 106
 107   /* Delimiter follows all groups.   --group=append */
 108   GM_APPEND,
 109
 110   /* Delimiter between groups.    --group[=separate] */
 111   GM_SEPARATE,
 112
 113   /* Delimiter before and after each group. --group=both */
 114   GM_BOTH
 115 };
 116
 117 static char const *const grouping_method_string[] =
 118 {
 119   "prepend", "append", "separate", "both", nullptr
 120 };
 121
 122 static enum grouping_method const grouping_method_map[] =
 123 {
 124   GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
 125 };
 126
 127 static enum grouping_method grouping = GM_NONE;
 128
 129 enum
 130 {
 131   GROUP_OPTION = CHAR_MAX + 1
 132 };
 133
 134 static struct option const longopts[] =
 135 {
 136   {"count", no_argument, nullptr, 'c'},
 137   {"repeated", no_argument, nullptr, 'd'},
 138   {"all-repeated", optional_argument, nullptr, 'D'},
 139   {"group", optional_argument, nullptr, GROUP_OPTION},
 140   {"ignore-case", no_argument, nullptr, 'i'},
 141   {"unique", no_argument, nullptr, 'u'},
 142   {"skip-fields", required_argument, nullptr, 'f'},
 143   {"skip-chars", required_argument, nullptr, 's'},
 144   {"check-chars", required_argument, nullptr, 'w'},
 145   {"zero-terminated", no_argument, nullptr, 'z'},
 146   {GETOPT_HELP_OPTION_DECL},
 147   {GETOPT_VERSION_OPTION_DECL},
 148   {nullptr, 0, nullptr, 0}
 149 };
 150
 151 void
 152 usage (int status)
 153 {
 154   if (status != EXIT_SUCCESS)
 155     emit_try_help ();
 156   else
 157     {
 158       printf (_("\
 159 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 160 "),
 161               program_name);
 162       fputs (_("\
 163 Filter adjacent matching lines from INPUT (or standard input),\n\
 164 writing to OUTPUT (or standard output).\n\
 165 \n\
 166 With no options, matching lines are merged to the first occurrence.\n\
 167 "), stdout);
 168
 169       emit_mandatory_arg_note ();
 170
 171      fputs (_("\
 172   -c, --count           prefix lines by the number of occurrences\n\
 173   -d, --repeated        only print duplicate lines, one for each group\n\
 174 "), stdout);
 175      fputs (_("\
 176   -D                    print all duplicate lines\n\
 177       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
 178                                  with an empty line;\n\
 179                                  METHOD={none(default),prepend,separate}\n\
 180 "), stdout);
 181      fputs (_("\
 182   -f, --skip-fields=N   avoid comparing the first N fields\n\
 183 "), stdout);
 184      fputs (_("\
 185       --group[=METHOD]  show all items, separating groups with an empty line;\n\
 186                           METHOD={separate(default),prepend,append,both}\n\
 187 "), stdout);
 188      fputs (_("\
 189   -i, --ignore-case     ignore differences in case when comparing\n\
 190   -s, --skip-chars=N    avoid comparing the first N characters\n\
 191   -u, --unique          only print unique lines\n\
 192 "), stdout);
 193       fputs (_("\
 194   -z, --zero-terminated     line delimiter is NUL, not newline\n\
 195 "), stdout);
 196      fputs (_("\
 197   -w, --check-chars=N   compare no more than N characters in lines\n\
 198 "), stdout);
 199      fputs (HELP_OPTION_DESCRIPTION, stdout);
 200      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 201      fputs (_("\
 202 \n\
 203 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 204 characters.  Fields are skipped before chars.\n\
 205 "), stdout);
 206      fputs (_("\
 207 \n\
 208 'uniq' does not detect repeated lines unless they are adjacent.\n\
 209 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
 210 "), stdout);
 211       emit_ancillary_info (PROGRAM_NAME);
 212     }
 213   exit (status);
 214 }
 215
 216 static bool
 217 strict_posix2 (void)
 218 {
 219   int posix_ver = posix2_version ();
 220   return 200112 <= posix_ver && posix_ver < 200809;
 221 }
 222
 223 /* Convert OPT to idx_t, reporting an error using MSGID if OPT is
 224    invalid.  Silently convert too-large values to IDX_MAX.  */
 225
 226 static idx_t
 227 size_opt (char const *opt, char const *msgid)
 228 {
 229   intmax_t size;
 230   if (LONGINT_OVERFLOW < xstrtoimax (opt, nullptr, 10, &size, "")
 231       || size < 0)
 232     error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 233   return MIN (size, IDX_MAX);
 234 }
 235
 236 static bool
 237 newline_or_blank (mcel_t g)
 238 {
 239   return g.ch == '\n' || c32isblank (g.ch);
 240 }
 241
 242 /* Given a linebuffer LINE,
 243    return a pointer to the beginning of the line's field to be compared.
 244    Store into *PLEN the length in bytes of FIELD.  */
 245
 246 static char *
 247 find_field (struct linebuffer const *line, idx_t *plen)
 248 {
 249   char *lp = line->buffer;
 250   char const *lim = lp + line->length - 1;
 251
 252   for (idx_t i = skip_fields; 0 < i && lp < lim; i--)
 253     {
 254       lp = skip_buf_matching (lp, lim, newline_or_blank, true);
 255       lp = skip_buf_matching (lp, lim, newline_or_blank, false);
 256     }
 257
 258   for (idx_t i = skip_chars; 0 < i && lp < lim; i--)
 259     lp += mcel_scan (lp, lim).len;
 260
 261   /* Compute the length in bytes cheaply if possible; otherwise, scan.  */
 262   idx_t len;
 263   if (lim - lp <= check_chars)
 264     len = lim - lp;
 265   else if (MB_CUR_MAX <= 1)
 266     len = check_chars;
 267   else
 268     {
 269       char *ep = lp;
 270       for (idx_t i = check_chars; 0 < i && lp < lim; i--)
 271         ep += mcel_scan (lp, lim).len;
 272       len = ep - lp;
 273     }
 274
 275   *plen = len;
 276   return lp;
 277 }
 278
 279 /* Return false if two strings OLD and NEW match, true if not.
 280    OLD and NEW point not to the beginnings of the lines
 281    but rather to the beginnings of the fields to compare.
 282    OLDLEN and NEWLEN are their lengths. */
 283
 284 static bool
 285 different (char *old, char *new, idx_t oldlen, idx_t newlen)
 286 {
 287   if (ignore_case)
 288     return oldlen != newlen || memcasecmp (old, new, oldlen);
 289   else
 290     return oldlen != newlen || memcmp (old, new, oldlen);
 291 }
 292
 293 /* Output the line in linebuffer LINE to standard output
 294    provided that the switches say it should be output.
 295    MATCH is true if the line matches the previous line.
 296    If requested, print the number of times it occurred, as well;
 297    LINECOUNT + 1 is the number of times that the line occurred. */
 298
 299 static void
 300 writeline (struct linebuffer const *line,
 301            bool match, intmax_t linecount)
 302 {
 303   if (! (linecount == 0 ? output_unique
 304          : !match ? output_first_repeated
 305          : output_later_repeated))
 306     return;
 307
 308   if (count_occurrences)
 309     printf ("%7jd ", linecount + 1);
 310
 311   if (fwrite (line->buffer, sizeof (char), line->length, stdout)
 312       != line->length)
 313     write_error ();
 314 }
 315
 316 /* Process input file INFILE with output to OUTFILE.
 317    If either is "-", use the standard I/O stream for it instead. */
 318
 319 static void
 320 check_file (char const *infile, char const *outfile, char delimiter)
 321 {
 322   struct linebuffer lb1, lb2;
 323   struct linebuffer *thisline, *prevline;
 324
 325   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 326     error (EXIT_FAILURE, errno, "%s", quotef (infile));
 327   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 328     error (EXIT_FAILURE, errno, "%s", quotef (outfile));
 329
 330   fadvise (stdin, FADVISE_SEQUENTIAL);
 331
 332   thisline = &lb1;
 333   prevline = &lb2;
 334
 335   initbuffer (thisline);
 336   initbuffer (prevline);
 337
 338   /* The duplication in the following 'if' and 'else' blocks is an
 339      optimization to distinguish between when we can print input
 340      lines immediately (1. & 2.) or not.
 341
 342      1. --group => all input lines are printed.
 343         checking for unique/duplicated lines is used only for printing
 344         group separators.
 345
 346      2. The default case in which none of these options has been specified:
 347           --count, --repeated,  --all-repeated, --unique
 348         In the default case, this optimization lets uniq output each different
 349         line right away, without waiting to see if the next one is different.
 350
 351      3. All other cases.
 352   */
 353   if (output_unique && output_first_repeated && !count_occurrences)
 354     {
 355       char *prevfield = nullptr;
 356       idx_t prevlen;
 357       bool first_group_printed = false;
 358
 359       while (!feof (stdin)
 360              && readlinebuffer_delim (thisline, stdin, delimiter) != 0)
 361         {
 362           idx_t thislen;
 363           char *thisfield = find_field (thisline, &thislen);
 364           bool new_group = (!prevfield
 365                             || different (thisfield, prevfield,
 366                                           thislen, prevlen));
 367
 368           if (new_group && grouping != GM_NONE
 369               && (grouping == GM_PREPEND || grouping == GM_BOTH
 370                   || (first_group_printed && (grouping == GM_APPEND
 371                                               || grouping == GM_SEPARATE))))
 372             putchar (delimiter);
 373
 374           if (new_group || grouping != GM_NONE)
 375             {
 376               if (fwrite (thisline->buffer, sizeof (char), thisline->length,
 377                   stdout) != thisline->length)
 378                 write_error ();
 379
 380               swap_lines (&prevline, &thisline);
 381               prevfield = thisfield;
 382               prevlen = thislen;
 383               first_group_printed = true;
 384             }
 385         }
 386       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
 387         putchar (delimiter);
 388     }
 389   else
 390     {
 391       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 392         goto closefiles;
 393
 394       idx_t prevlen;
 395       char *prevfield = find_field (prevline, &prevlen);
 396       intmax_t match_count = 0;
 397       bool first_delimiter = true;
 398
 399       while (!feof (stdin))
 400         {
 401           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 402             {
 403               if (ferror (stdin))
 404                 goto closefiles;
 405               break;
 406             }
 407           idx_t thislen;
 408           char *thisfield = find_field (thisline, &thislen);
 409           bool match = !different (thisfield, prevfield, thislen, prevlen);
 410           match_count += match;
 411
 412           if (match_count == INTMAX_MAX)
 413             {
 414               if (count_occurrences)
 415                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 416               match_count--;
 417             }
 418
 419           if (delimit_groups != DM_NONE)
 420             {
 421               if (!match)
 422                 {
 423                   if (match_count) /* a previous match */
 424                     first_delimiter = false; /* Only used when DM_SEPARATE */
 425                 }
 426               else if (match_count == 1)
 427                 {
 428                   if ((delimit_groups == DM_PREPEND)
 429                       || (delimit_groups == DM_SEPARATE
 430                           && !first_delimiter))
 431                     putchar (delimiter);
 432                 }
 433             }
 434
 435           if (!match || output_later_repeated)
 436             {
 437               writeline (prevline, match, match_count);
 438               swap_lines (&prevline, &thisline);
 439               prevfield = thisfield;
 440               prevlen = thislen;
 441               if (!match)
 442                 match_count = 0;
 443             }
 444         }
 445
 446       writeline (prevline, false, match_count);
 447     }
 448
 449  closefiles:
 450   if (ferror (stdin) || fclose (stdin) != 0)
 451     error (EXIT_FAILURE, errno, _("error reading %s"), quoteaf (infile));
 452
 453   /* stdout is handled via the atexit-invoked close_stdout function.  */
 454
 455   free (lb1.buffer);
 456   free (lb2.buffer);
 457 }
 458
 459 enum Skip_field_option_type
 460   {
 461     SFO_NONE,
 462     SFO_OBSOLETE,
 463     SFO_NEW
 464   };
 465
 466 int
 467 main (int argc, char **argv)
 468 {
 469   int optc = 0;
 470   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr);
 471   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 472   int nfiles = 0;
 473   char const *file[2];
 474   char delimiter = '\n';        /* change with --zero-terminated, -z */
 475   bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */
 476
 477   file[0] = file[1] = "-";
 478   initialize_main (&argc, &argv);
 479   set_program_name (argv[0]);
 480   setlocale (LC_ALL, "");
 481   bindtextdomain (PACKAGE, LOCALEDIR);
 482   textdomain (PACKAGE);
 483
 484   atexit (close_stdout);
 485
 486   while (true)
 487     {
 488       /* Parse an operand with leading "+" as a file after "--" was
 489          seen; or if pedantic and a file was seen; or if not
 490          obsolete.  */
 491
 492       if (optc == -1
 493           || (posixly_correct && nfiles != 0)
 494           || ((optc = getopt_long (argc, argv,
 495                                    "-0123456789Dcdf:is:uw:z",
 496                                    longopts, nullptr))
 497               == -1))
 498         {
 499           if (argc <= optind)
 500             break;
 501           if (nfiles == 2)
 502             {
 503               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 504               usage (EXIT_FAILURE);
 505             }
 506           file[nfiles++] = argv[optind++];
 507         }
 508       else switch (optc)
 509         {
 510         case 1:
 511           {
 512             intmax_t size;
 513             if (optarg[0] == '+'
 514                 && ! strict_posix2 ()
 515                 && (xstrtoimax (optarg, nullptr, 10, &size, "")
 516                     <= LONGINT_OVERFLOW))
 517               skip_chars = MIN (size, IDX_MAX);
 518             else if (nfiles == 2)
 519               {
 520                 error (0, 0, _("extra operand %s"), quote (optarg));
 521                 usage (EXIT_FAILURE);
 522               }
 523             else
 524               file[nfiles++] = optarg;
 525           }
 526           break;
 527
 528         case '0':
 529         case '1':
 530         case '2':
 531         case '3':
 532         case '4':
 533         case '5':
 534         case '6':
 535         case '7':
 536         case '8':
 537         case '9':
 538           {
 539             if (skip_field_option_type == SFO_NEW)
 540               skip_fields = 0;
 541
 542             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0'))
 543               skip_fields = IDX_MAX;
 544
 545             skip_field_option_type = SFO_OBSOLETE;
 546           }
 547           break;
 548
 549         case 'c':
 550           count_occurrences = true;
 551           output_option_used = true;
 552           break;
 553
 554         case 'd':
 555           output_unique = false;
 556           output_option_used = true;
 557           break;
 558
 559         case 'D':
 560           output_unique = false;
 561           output_later_repeated = true;
 562           if (optarg == nullptr)
 563             delimit_groups = DM_NONE;
 564           else
 565             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 566                                         delimit_method_string,
 567                                         delimit_method_map);
 568           output_option_used = true;
 569           break;
 570
 571         case GROUP_OPTION:
 572           if (optarg == nullptr)
 573             grouping = GM_SEPARATE;
 574           else
 575             grouping = XARGMATCH ("--group", optarg,
 576                                   grouping_method_string,
 577                                   grouping_method_map);
 578           break;
 579
 580         case 'f':
 581           skip_field_option_type = SFO_NEW;
 582           skip_fields = size_opt (optarg,
 583                                   N_("invalid number of fields to skip"));
 584           break;
 585
 586         case 'i':
 587           ignore_case = true;
 588           break;
 589
 590         case 's':
 591           skip_chars = size_opt (optarg,
 592                                  N_("invalid number of bytes to skip"));
 593           break;
 594
 595         case 'u':
 596           output_first_repeated = false;
 597           output_option_used = true;
 598           break;
 599
 600         case 'w':
 601           check_chars = size_opt (optarg,
 602                                   N_("invalid number of bytes to compare"));
 603           break;
 604
 605         case 'z':
 606           delimiter = '\0';
 607           break;
 608
 609         case_GETOPT_HELP_CHAR;
 610
 611         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 612
 613         default:
 614           usage (EXIT_FAILURE);
 615         }
 616     }
 617
 618   /* Note we could allow --group with -D at least, and that would
 619      avoid the need to specify a grouping method to --all-repeated.
 620      It was thought best to avoid deprecating those parameters though
 621      and keep --group separate to other options.  */
 622   if (grouping != GM_NONE && output_option_used)
 623     {
 624       error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
 625       usage (EXIT_FAILURE);
 626     }
 627
 628   if (grouping != GM_NONE && count_occurrences)
 629     {
 630       error (0, 0,
 631            _("grouping and printing repeat counts is meaningless"));
 632       usage (EXIT_FAILURE);
 633     }
 634
 635   if (count_occurrences && output_later_repeated)
 636     {
 637       error (0, 0,
 638            _("printing all duplicated lines and repeat counts is meaningless"));
 639       usage (EXIT_FAILURE);
 640     }
 641
 642   check_file (file[0], file[1], delimiter);
 643
 644   return EXIT_SUCCESS;
 645 }