src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 1986-2023 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18
  19 #include <config.h>
  20
  21 #include <getopt.h>
  22 #include <sys/types.h>
  23
  24 #include "system.h"
  25 #include "argmatch.h"
  26 #include "linebuffer.h"
  27 #include "die.h"
  28 #include "error.h"
  29 #include "fadvise.h"
  30 #include "posixver.h"
  31 #include "stdio--.h"
  32 #include "xstrtol.h"
  33 #include "memcasecmp.h"
  34 #include "quote.h"
  35
  36 /* The official name of this program (e.g., no 'g' prefix).  */
  37 #define PROGRAM_NAME "uniq"
  38
  39 #define AUTHORS \
  40   proper_name ("Richard M. Stallman"), \
  41   proper_name ("David MacKenzie")
  42
  43 #define SWAP_LINES(A, B)                        \
  44   do                                            \
  45     {                                           \
  46       struct linebuffer *_tmp;                  \
  47       _tmp = (A);                               \
  48       (A) = (B);                                \
  49       (B) = _tmp;                               \
  50     }                                           \
  51   while (0)
  52
/* Number of fields to skip on each line when doing comparisons.
   Set in main from -f/--skip-fields or from the obsolete digit
   options (-0 ... -9); main resets it to 0 before parsing.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set in main from -s/--skip-chars or from an obsolete "+N" operand;
   main resets it to 0 before parsing.  */
static size_t skip_chars;

/* Number of chars to compare.
   Set in main from -w/--check-chars; main initializes it to SIZE_MAX,
   so by default the comparison length is not limited.  */
static size_t check_chars;
  61
/* How (and whether) to print occurrence counts.
   Note that count_occurrences is the first enumerator and therefore
   has the value 0: always compare the 'countmode' variable against it
   rather than testing the enumerator itself for truth.  */
enum countmode
{
  count_occurrences,            /* -c Print count before output lines. */
  count_none                    /* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input. */
static enum countmode countmode;
  71
/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent lines of a group of
   repeated lines.
   main starts with output_unique and output_first_repeated true and
   output_later_repeated false; -d and -D clear output_unique,
   -u clears output_first_repeated, and -D sets output_later_repeated.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  Set by -i/--ignore-case.  */
static bool ignore_case;
  81
/* How groups of duplicate lines are delimited for -D/--all-repeated.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimit all groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* Argument spellings accepted by --all-repeated=METHOD, NULL-terminated.
   main passes this table together with delimit_method_map to XARGMATCH;
   the entries are listed in the same order in both tables.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};

/* Enum values corresponding, position by position, to the names in
   delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
 106
/* Where group delimiters are written when --group is in effect.  */
enum grouping_method
{
  /* No grouping, when "--group" isn't used */
  GM_NONE,

  /* Delimiter precedes all groups.  --group=prepend */
  GM_PREPEND,

  /* Delimiter follows all groups.   --group=append */
  GM_APPEND,

  /* Delimiter between groups.    --group[=separate] */
  GM_SEPARATE,

  /* Delimiter before and after each group. --group=both */
  GM_BOTH
};

/* Argument spellings accepted by --group=METHOD, NULL-terminated.
   GM_NONE has no spelling: it only means that --group was not given.
   main passes this table together with grouping_method_map to
   XARGMATCH; the entries are listed in the same order in both tables.  */
static char const *const grouping_method_string[] =
{
  "prepend", "append", "separate", "both", NULL
};

/* Enum values corresponding, position by position, to the names in
   grouping_method_string.  */
static enum grouping_method const grouping_method_map[] =
{
  GM_PREPEND, GM_APPEND, GM_SEPARATE, GM_BOTH
};

/* The grouping method selected on the command line; GM_NONE unless
   --group was given.  */
static enum grouping_method grouping = GM_NONE;
 136
/* Option code for --group, which has no single-letter equivalent.
   CHAR_MAX + 1 keeps it distinct from every character option code.  */
enum
{
  GROUP_OPTION = CHAR_MAX + 1
};

/* Long options handed to getopt_long in main.  Each entry yields the
   code of the matching 'case' in main's option switch.  */
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"group", optional_argument, NULL, GROUP_OPTION},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {"zero-terminated", no_argument, NULL, 'z'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
 158
 159 void
 160 usage (int status)
 161 {
 162   if (status != EXIT_SUCCESS)
 163     emit_try_help ();
 164   else
 165     {
 166       printf (_("\
 167 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 168 "),
 169               program_name);
 170       fputs (_("\
 171 Filter adjacent matching lines from INPUT (or standard input),\n\
 172 writing to OUTPUT (or standard output).\n\
 173 \n\
 174 With no options, matching lines are merged to the first occurrence.\n\
 175 "), stdout);
 176
 177       emit_mandatory_arg_note ();
 178
 179      fputs (_("\
 180   -c, --count           prefix lines by the number of occurrences\n\
 181   -d, --repeated        only print duplicate lines, one for each group\n\
 182 "), stdout);
 183      fputs (_("\
 184   -D                    print all duplicate lines\n\
 185       --all-repeated[=METHOD]  like -D, but allow separating groups\n\
 186                                  with an empty line;\n\
 187                                  METHOD={none(default),prepend,separate}\n\
 188 "), stdout);
 189      fputs (_("\
 190   -f, --skip-fields=N   avoid comparing the first N fields\n\
 191 "), stdout);
 192      fputs (_("\
 193       --group[=METHOD]  show all items, separating groups with an empty line;\n\
 194                           METHOD={separate(default),prepend,append,both}\n\
 195 "), stdout);
 196      fputs (_("\
 197   -i, --ignore-case     ignore differences in case when comparing\n\
 198   -s, --skip-chars=N    avoid comparing the first N characters\n\
 199   -u, --unique          only print unique lines\n\
 200 "), stdout);
 201       fputs (_("\
 202   -z, --zero-terminated     line delimiter is NUL, not newline\n\
 203 "), stdout);
 204      fputs (_("\
 205   -w, --check-chars=N   compare no more than N characters in lines\n\
 206 "), stdout);
 207      fputs (HELP_OPTION_DESCRIPTION, stdout);
 208      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 209      fputs (_("\
 210 \n\
 211 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 212 characters.  Fields are skipped before chars.\n\
 213 "), stdout);
 214      fputs (_("\
 215 \n\
 216 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 217 You may want to sort the input first, or use 'sort -u' without 'uniq'.\n\
 218 "), stdout);
 219       emit_ancillary_info (PROGRAM_NAME);
 220     }
 221   exit (status);
 222 }
 223
 224 static bool
 225 strict_posix2 (void)
 226 {
 227   int posix_ver = posix2_version ();
 228   return 200112 <= posix_ver && posix_ver < 200809;
 229 }
 230
 231 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 232    invalid.  Silently convert too-large values to SIZE_MAX.  */
 233
 234 static size_t
 235 size_opt (char const *opt, char const *msgid)
 236 {
 237   uintmax_t size;
 238
 239   switch (xstrtoumax (opt, NULL, 10, &size, ""))
 240     {
 241     case LONGINT_OK:
 242     case LONGINT_OVERFLOW:
 243       break;
 244
 245     default:
 246       die (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 247     }
 248
 249   return MIN (size, SIZE_MAX);
 250 }
 251
 252 /* Given a linebuffer LINE,
 253    return a pointer to the beginning of the line's field to be compared. */
 254
 255 ATTRIBUTE_PURE
 256 static char *
 257 find_field (struct linebuffer const *line)
 258 {
 259   size_t count;
 260   char const *lp = line->buffer;
 261   size_t size = line->length - 1;
 262   size_t i = 0;
 263
 264   for (count = 0; count < skip_fields && i < size; count++)
 265     {
 266       while (i < size && field_sep (lp[i]))
 267         i++;
 268       while (i < size && !field_sep (lp[i]))
 269         i++;
 270     }
 271
 272   i += MIN (skip_chars, size - i);
 273
 274   return line->buffer + i;
 275 }
 276
 277 /* Return false if two strings OLD and NEW match, true if not.
 278    OLD and NEW point not to the beginnings of the lines
 279    but rather to the beginnings of the fields to compare.
 280    OLDLEN and NEWLEN are their lengths. */
 281
 282 static bool
 283 different (char *old, char *new, size_t oldlen, size_t newlen)
 284 {
 285   if (check_chars < oldlen)
 286     oldlen = check_chars;
 287   if (check_chars < newlen)
 288     newlen = check_chars;
 289
 290   if (ignore_case)
 291     return oldlen != newlen || memcasecmp (old, new, oldlen);
 292   else
 293     return oldlen != newlen || memcmp (old, new, oldlen);
 294 }
 295
 296 /* Output the line in linebuffer LINE to standard output
 297    provided that the switches say it should be output.
 298    MATCH is true if the line matches the previous line.
 299    If requested, print the number of times it occurred, as well;
 300    LINECOUNT + 1 is the number of times that the line occurred. */
 301
 302 static void
 303 writeline (struct linebuffer const *line,
 304            bool match, uintmax_t linecount)
 305 {
 306   if (! (linecount == 0 ? output_unique
 307          : !match ? output_first_repeated
 308          : output_later_repeated))
 309     return;
 310
 311   if (countmode == count_occurrences)
 312     printf ("%7" PRIuMAX " ", linecount + 1);
 313
 314   fwrite (line->buffer, sizeof (char), line->length, stdout);
 315 }
 316
 317 /* Process input file INFILE with output to OUTFILE.
 318    If either is "-", use the standard I/O stream for it instead. */
 319
 320 static void
 321 check_file (char const *infile, char const *outfile, char delimiter)
 322 {
 323   struct linebuffer lb1, lb2;
 324   struct linebuffer *thisline, *prevline;
 325
 326   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 327     die (EXIT_FAILURE, errno, "%s", quotef (infile));
 328   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 329     die (EXIT_FAILURE, errno, "%s", quotef (outfile));
 330
 331   fadvise (stdin, FADVISE_SEQUENTIAL);
 332
 333   thisline = &lb1;
 334   prevline = &lb2;
 335
 336   initbuffer (thisline);
 337   initbuffer (prevline);
 338
 339   /* The duplication in the following 'if' and 'else' blocks is an
 340      optimization to distinguish between when we can print input
 341      lines immediately (1. & 2.) or not.
 342
 343      1. --group => all input lines are printed.
 344         checking for unique/duplicated lines is used only for printing
 345         group separators.
 346
 347      2. The default case in which none of these options has been specified:
 348           --count, --repeated,  --all-repeated, --unique
 349         In the default case, this optimization lets uniq output each different
 350         line right away, without waiting to see if the next one is different.
 351
 352      3. All other cases.
 353   */
 354   if (output_unique && output_first_repeated && countmode == count_none)
 355     {
 356       char *prevfield = NULL;
 357       size_t prevlen;
 358       bool first_group_printed = false;
 359
 360       while (!feof (stdin))
 361         {
 362           char *thisfield;
 363           size_t thislen;
 364           bool new_group;
 365
 366           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 367             break;
 368
 369           thisfield = find_field (thisline);
 370           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 371
 372           new_group = (!prevfield
 373                        || different (thisfield, prevfield, thislen, prevlen));
 374
 375           if (new_group && grouping != GM_NONE
 376               && (grouping == GM_PREPEND || grouping == GM_BOTH
 377                   || (first_group_printed && (grouping == GM_APPEND
 378                                               || grouping == GM_SEPARATE))))
 379             putchar (delimiter);
 380
 381           if (new_group || grouping != GM_NONE)
 382             {
 383               fwrite (thisline->buffer, sizeof (char),
 384                       thisline->length, stdout);
 385
 386               SWAP_LINES (prevline, thisline);
 387               prevfield = thisfield;
 388               prevlen = thislen;
 389               first_group_printed = true;
 390             }
 391         }
 392       if ((grouping == GM_BOTH || grouping == GM_APPEND) && first_group_printed)
 393         putchar (delimiter);
 394     }
 395   else
 396     {
 397       char *prevfield;
 398       size_t prevlen;
 399       uintmax_t match_count = 0;
 400       bool first_delimiter = true;
 401
 402       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 403         goto closefiles;
 404       prevfield = find_field (prevline);
 405       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 406
 407       while (!feof (stdin))
 408         {
 409           bool match;
 410           char *thisfield;
 411           size_t thislen;
 412           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 413             {
 414               if (ferror (stdin))
 415                 goto closefiles;
 416               break;
 417             }
 418           thisfield = find_field (thisline);
 419           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 420           match = !different (thisfield, prevfield, thislen, prevlen);
 421           match_count += match;
 422
 423           if (match_count == UINTMAX_MAX)
 424             {
 425               if (count_occurrences)
 426                 die (EXIT_FAILURE, 0, _("too many repeated lines"));
 427               match_count--;
 428             }
 429
 430           if (delimit_groups != DM_NONE)
 431             {
 432               if (!match)
 433                 {
 434                   if (match_count) /* a previous match */
 435                     first_delimiter = false; /* Only used when DM_SEPARATE */
 436                 }
 437               else if (match_count == 1)
 438                 {
 439                   if ((delimit_groups == DM_PREPEND)
 440                       || (delimit_groups == DM_SEPARATE
 441                           && !first_delimiter))
 442                     putchar (delimiter);
 443                 }
 444             }
 445
 446           if (!match || output_later_repeated)
 447             {
 448               writeline (prevline, match, match_count);
 449               SWAP_LINES (prevline, thisline);
 450               prevfield = thisfield;
 451               prevlen = thislen;
 452               if (!match)
 453                 match_count = 0;
 454             }
 455         }
 456
 457       writeline (prevline, false, match_count);
 458     }
 459
 460  closefiles:
 461   if (ferror (stdin) || fclose (stdin) != 0)
 462     die (EXIT_FAILURE, 0, _("error reading %s"), quoteaf (infile));
 463
 464   /* stdout is handled via the atexit-invoked close_stdout function.  */
 465
 466   free (lb1.buffer);
 467   free (lb2.buffer);
 468 }
 469
/* Records which form of the skip-fields option main saw most recently,
   so that obsolete digit options following a -f option start a fresh
   number instead of appending digits to the -f value.  */
enum Skip_field_option_type
  {
    SFO_NONE,           /* No skip-fields option seen so far.  */
    SFO_OBSOLETE,       /* Last seen: obsolete digit option (-0 ... -9).  */
    SFO_NEW             /* Last seen: -f N / --skip-fields=N.  */
  };
 476
/* Entry point: parse the command line into the file-scope option
   variables, reject incompatible option combinations, then run
   check_file on the (at most two) file operands, each of which
   defaults to "-".  */
int
main (int argc, char **argv)
{
  int optc = 0;
  bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
  enum Skip_field_option_type skip_field_option_type = SFO_NONE;
  unsigned int nfiles = 0;
  char const *file[2];
  char delimiter = '\n';        /* change with --zero-terminated, -z */
  bool output_option_used = false;   /* if true, one of -u/-d/-D/-c was used */

  file[0] = file[1] = "-";
  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  skip_chars = 0;
  skip_fields = 0;
  check_chars = SIZE_MAX;
  output_unique = output_first_repeated = true;
  output_later_repeated = false;
  countmode = count_none;
  delimit_groups = DM_NONE;

  while (true)
    {
      /* Parse an operand with leading "+" as a file after "--" was
         seen; or if pedantic and a file was seen; or if not
         obsolete.  */

      /* The order of the tests matters: once getopt_long has returned
         -1, OPTC stays -1 and getopt_long is not called again, so all
         remaining arguments are consumed as file operands here.  */
      if (optc == -1
          || (posixly_correct && nfiles != 0)
          || ((optc = getopt_long (argc, argv,
                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
              == -1))
        {
          if (argc <= optind)
            break;
          if (nfiles == 2)
            {
              error (0, 0, _("extra operand %s"), quote (argv[optind]));
              usage (EXIT_FAILURE);
            }
          file[nfiles++] = argv[optind++];
        }
      else switch (optc)
        {
        /* The leading '-' in the option string makes getopt_long hand
           back each non-option argument as option code 1, with the
           argument text in OPTARG.  */
        case 1:
          {
            uintmax_t size;
            /* Obsolete "+N" form of --skip-chars=N; anything else is
               a file operand.  */
            if (optarg[0] == '+'
                && ! strict_posix2 ()
                && xstrtoumax (optarg, NULL, 10, &size, "") == LONGINT_OK
                && size <= SIZE_MAX)
              skip_chars = size;
            else if (nfiles == 2)
              {
                error (0, 0, _("extra operand %s"), quote (optarg));
                usage (EXIT_FAILURE);
              }
            else
              file[nfiles++] = optarg;
          }
          break;

        /* Obsolete "-N" form of --skip-fields=N, delivered one digit
           at a time.  */
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          {
            /* Digits that follow a -f option start a new number.  */
            if (skip_field_option_type == SFO_NEW)
              skip_fields = 0;

            /* Append the digit; on overflow saturate at SIZE_MAX.  */
            if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
              skip_fields = SIZE_MAX;

            skip_field_option_type = SFO_OBSOLETE;
          }
          break;

        case 'c':
          countmode = count_occurrences;
          output_option_used = true;
          break;

        case 'd':
          output_unique = false;
          output_option_used = true;
          break;

        case 'D':
          output_unique = false;
          output_later_repeated = true;
          /* Plain -D (no METHOD argument) means no group delimiters.  */
          if (optarg == NULL)
            delimit_groups = DM_NONE;
          else
            delimit_groups = XARGMATCH ("--all-repeated", optarg,
                                        delimit_method_string,
                                        delimit_method_map);
          output_option_used = true;
          break;

        case GROUP_OPTION:
          /* --group without a METHOD defaults to "separate".  */
          if (optarg == NULL)
            grouping = GM_SEPARATE;
          else
            grouping = XARGMATCH ("--group", optarg,
                                  grouping_method_string,
                                  grouping_method_map);
          break;

        case 'f':
          skip_field_option_type = SFO_NEW;
          skip_fields = size_opt (optarg,
                                  N_("invalid number of fields to skip"));
          break;

        case 'i':
          ignore_case = true;
          break;

        case 's':
          skip_chars = size_opt (optarg,
                                 N_("invalid number of bytes to skip"));
          break;

        case 'u':
          output_first_repeated = false;
          output_option_used = true;
          break;

        case 'w':
          check_chars = size_opt (optarg,
                                  N_("invalid number of bytes to compare"));
          break;

        case 'z':
          delimiter = '\0';
          break;

        case_GETOPT_HELP_CHAR;

        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

        default:
          usage (EXIT_FAILURE);
        }
    }

  /* Note we could allow --group with -D at least, and that would
     avoid the need to specify a grouping method to --all-repeated.
     It was thought best to avoid deprecating those parameters though
     and keep --group separate to other options.  */
  if (grouping != GM_NONE && output_option_used)
    {
      error (0, 0, _("--group is mutually exclusive with -c/-d/-D/-u"));
      usage (EXIT_FAILURE);
    }

  /* NOTE(review): -c sets output_option_used, so the check above
     already rejects --group together with -c; this test looks
     unreachable as the code stands.  */
  if (grouping != GM_NONE && countmode != count_none)
    {
      error (0, 0,
           _("grouping and printing repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }

  if (countmode == count_occurrences && output_later_repeated)
    {
      error (0, 0,
           _("printing all duplicated lines and repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }

  check_file (file[0], file[1], delimiter);

  return EXIT_SUCCESS;
}