src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18 \f
  19 #include <config.h>
  20
  21 #include <stdio.h>
  22 #include <getopt.h>
  23 #include <sys/types.h>
  24
  25 #include "system.h"
  26 #include "argmatch.h"
  27 #include "linebuffer.h"
  28 #include "error.h"
  29 #include "posixver.h"
  30 #include "quote.h"
  31 #include "xmemcoll.h"
  32 #include "xstrtol.h"
  33 #include "memcasecmp.h"
  34
  35 /* The official name of this program (e.g., no `g' prefix).  */
  36 #define PROGRAM_NAME "uniq"
  37
  38 #define AUTHORS \
  39   proper_name ("Richard M. Stallman"), \
  40   proper_name ("David MacKenzie")
  41
  42 #define SWAP_LINES(A, B)                        \
  43   do                                            \
  44     {                                           \
  45       struct linebuffer *_tmp;                  \
  46       _tmp = (A);                               \
  47       (A) = (B);                                \
  48       (B) = _tmp;                               \
  49     }                                           \
  50   while (0)
  51
  52 /* True if the LC_COLLATE locale is hard.  */
  53 static bool hard_LC_COLLATE;
  54
  55 /* Number of fields to skip on each line when doing comparisons. */
  56 static size_t skip_fields;
  57
  58 /* Number of chars to skip after skipping any fields. */
  59 static size_t skip_chars;
  60
  61 /* Number of chars to compare. */
  62 static size_t check_chars;
  63
  64 enum countmode
  65 {
  66   count_occurrences,            /* -c Print count before output lines. */
  67   count_none                    /* Default.  Do not print counts. */
  68 };
  69
  70 /* Whether and how to precede the output lines with a count of the number of
  71    times they occurred in the input. */
  72 static enum countmode countmode;
  73
  74 /* Which lines to output: unique lines, the first of a group of
  75    repeated lines, and the second and subsequented of a group of
  76    repeated lines.  */
  77 static bool output_unique;
  78 static bool output_first_repeated;
  79 static bool output_later_repeated;
  80
  81 /* If true, ignore case when comparing.  */
  82 static bool ignore_case;
  83
  84 enum delimit_method
  85 {
  86   /* No delimiters output.  --all-repeated[=none] */
  87   DM_NONE,
  88
  89   /* Delimiter precedes all groups.  --all-repeated=prepend */
  90   DM_PREPEND,
  91
  92   /* Delimit all groups.  --all-repeated=separate */
  93   DM_SEPARATE
  94 };
  95
  96 static char const *const delimit_method_string[] =
  97 {
  98   "none", "prepend", "separate", NULL
  99 };
 100
 101 static enum delimit_method const delimit_method_map[] =
 102 {
 103   DM_NONE, DM_PREPEND, DM_SEPARATE
 104 };
 105
 106 /* Select whether/how to delimit groups of duplicate lines.  */
 107 static enum delimit_method delimit_groups;
 108
 109 static struct option const longopts[] =
 110 {
 111   {"count", no_argument, NULL, 'c'},
 112   {"repeated", no_argument, NULL, 'd'},
 113   {"all-repeated", optional_argument, NULL, 'D'},
 114   {"ignore-case", no_argument, NULL, 'i'},
 115   {"unique", no_argument, NULL, 'u'},
 116   {"skip-fields", required_argument, NULL, 'f'},
 117   {"skip-chars", required_argument, NULL, 's'},
 118   {"check-chars", required_argument, NULL, 'w'},
 119   {"zero-terminated", no_argument, NULL, 'z'},
 120   {GETOPT_HELP_OPTION_DECL},
 121   {GETOPT_VERSION_OPTION_DECL},
 122   {NULL, 0, NULL, 0}
 123 };
 124
 125 void
 126 usage (int status)
 127 {
 128   if (status != EXIT_SUCCESS)
 129     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 130              program_name);
 131   else
 132     {
 133       printf (_("\
 134 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 135 "),
 136               program_name);
 137       fputs (_("\
 138 Filter adjacent matching lines from INPUT (or standard input),\n\
 139 writing to OUTPUT (or standard output).\n\
 140 \n\
 141 With no options, matching lines are merged to the first occurrence.\n\
 142 \n\
 143 "), stdout);
 144      fputs (_("\
 145 Mandatory arguments to long options are mandatory for short options too.\n\
 146 "), stdout);
 147      fputs (_("\
 148   -c, --count           prefix lines by the number of occurrences\n\
 149   -d, --repeated        only print duplicate lines\n\
 150 "), stdout);
 151      fputs (_("\
 152   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
 153                         delimit-method={none(default),prepend,separate}\n\
 154                         Delimiting is done with blank lines.\n\
 155   -f, --skip-fields=N   avoid comparing the first N fields\n\
 156   -i, --ignore-case     ignore differences in case when comparing\n\
 157   -s, --skip-chars=N    avoid comparing the first N characters\n\
 158   -u, --unique          only print unique lines\n\
 159   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 160 "), stdout);
 161      fputs (_("\
 162   -w, --check-chars=N   compare no more than N characters in lines\n\
 163 "), stdout);
 164      fputs (HELP_OPTION_DESCRIPTION, stdout);
 165      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 166      fputs (_("\
 167 \n\
 168 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 169 characters.  Fields are skipped before chars.\n\
 170 "), stdout);
 171      fputs (_("\
 172 \n\
 173 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 174 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
 175 Also, comparisons honor the rules specified by `LC_COLLATE'.\n\
 176 "), stdout);
 177       emit_bug_reporting_address ();
 178     }
 179   exit (status);
 180 }
 181
 182 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 183    invalid.  Silently convert too-large values to SIZE_MAX.  */
 184
 185 static size_t
 186 size_opt (char const *opt, char const *msgid)
 187 {
 188   unsigned long int size;
 189   verify (SIZE_MAX <= ULONG_MAX);
 190
 191   switch (xstrtoul (opt, NULL, 10, &size, ""))
 192     {
 193     case LONGINT_OK:
 194     case LONGINT_OVERFLOW:
 195       break;
 196
 197     default:
 198       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 199     }
 200
 201   return MIN (size, SIZE_MAX);
 202 }
 203
 204 /* Given a linebuffer LINE,
 205    return a pointer to the beginning of the line's field to be compared. */
 206
 207 static char *
 208 find_field (struct linebuffer const *line)
 209 {
 210   size_t count;
 211   char const *lp = line->buffer;
 212   size_t size = line->length - 1;
 213   size_t i = 0;
 214
 215   for (count = 0; count < skip_fields; count++)
 216     {
 217       while (i < size && isblank (to_uchar (lp[i])))
 218         i++;
 219       while (i < size && !isblank (to_uchar (lp[i])))
 220         i++;
 221     }
 222
 223   for (count = 0; count < skip_chars && i < size; count++)
 224     i++;
 225
 226   return line->buffer + i;
 227 }
 228
 229 /* Return false if two strings OLD and NEW match, true if not.
 230    OLD and NEW point not to the beginnings of the lines
 231    but rather to the beginnings of the fields to compare.
 232    OLDLEN and NEWLEN are their lengths. */
 233
 234 static bool
 235 different (char *old, char *new, size_t oldlen, size_t newlen)
 236 {
 237   if (check_chars < oldlen)
 238     oldlen = check_chars;
 239   if (check_chars < newlen)
 240     newlen = check_chars;
 241
 242   if (ignore_case)
 243     {
 244       /* FIXME: This should invoke strcoll somehow.  */
 245       return oldlen != newlen || memcasecmp (old, new, oldlen);
 246     }
 247   else if (hard_LC_COLLATE)
 248     return xmemcoll (old, oldlen, new, newlen) != 0;
 249   else
 250     return oldlen != newlen || memcmp (old, new, oldlen);
 251 }
 252
 253 /* Output the line in linebuffer LINE to standard output
 254    provided that the switches say it should be output.
 255    MATCH is true if the line matches the previous line.
 256    If requested, print the number of times it occurred, as well;
 257    LINECOUNT + 1 is the number of times that the line occurred. */
 258
 259 static void
 260 writeline (struct linebuffer const *line,
 261            bool match, uintmax_t linecount)
 262 {
 263   if (! (linecount == 0 ? output_unique
 264          : !match ? output_first_repeated
 265          : output_later_repeated))
 266     return;
 267
 268   if (countmode == count_occurrences)
 269     printf ("%7" PRIuMAX " ", linecount + 1);
 270
 271   fwrite (line->buffer, sizeof (char), line->length, stdout);
 272 }
 273
 274 /* Process input file INFILE with output to OUTFILE.
 275    If either is "-", use the standard I/O stream for it instead. */
 276
 277 static void
 278 check_file (const char *infile, const char *outfile, char delimiter)
 279 {
 280   struct linebuffer lb1, lb2;
 281   struct linebuffer *thisline, *prevline;
 282
 283   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 284     error (EXIT_FAILURE, errno, "%s", infile);
 285   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 286     error (EXIT_FAILURE, errno, "%s", outfile);
 287
 288   thisline = &lb1;
 289   prevline = &lb2;
 290
 291   initbuffer (thisline);
 292   initbuffer (prevline);
 293
 294   /* The duplication in the following `if' and `else' blocks is an
 295      optimization to distinguish the common case (in which none of
 296      the following options has been specified: --count, -repeated,
 297      --all-repeated, --unique) from the others.  In the common case,
 298      this optimization lets uniq output each different line right away,
 299      without waiting to see if the next one is different.  */
 300
 301   if (output_unique && output_first_repeated && countmode == count_none)
 302     {
 303       char *prevfield IF_LINT (= NULL);
 304       size_t prevlen IF_LINT (= 0);
 305
 306       while (!feof (stdin))
 307         {
 308           char *thisfield;
 309           size_t thislen;
 310           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 311             break;
 312           thisfield = find_field (thisline);
 313           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 314           if (prevline->length == 0
 315               || different (thisfield, prevfield, thislen, prevlen))
 316             {
 317               fwrite (thisline->buffer, sizeof (char),
 318                       thisline->length, stdout);
 319
 320               SWAP_LINES (prevline, thisline);
 321               prevfield = thisfield;
 322               prevlen = thislen;
 323             }
 324         }
 325     }
 326   else
 327     {
 328       char *prevfield;
 329       size_t prevlen;
 330       uintmax_t match_count = 0;
 331       bool first_delimiter = true;
 332
 333       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 334         goto closefiles;
 335       prevfield = find_field (prevline);
 336       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 337
 338       while (!feof (stdin))
 339         {
 340           bool match;
 341           char *thisfield;
 342           size_t thislen;
 343           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 344             {
 345               if (ferror (stdin))
 346                 goto closefiles;
 347               break;
 348             }
 349           thisfield = find_field (thisline);
 350           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 351           match = !different (thisfield, prevfield, thislen, prevlen);
 352           match_count += match;
 353
 354           if (match_count == UINTMAX_MAX)
 355             {
 356               if (count_occurrences)
 357                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 358               match_count--;
 359             }
 360
 361           if (delimit_groups != DM_NONE)
 362             {
 363               if (!match)
 364                 {
 365                   if (match_count) /* a previous match */
 366                     first_delimiter = false; /* Only used when DM_SEPARATE */
 367                 }
 368               else if (match_count == 1)
 369                 {
 370                   if ((delimit_groups == DM_PREPEND)
 371                       || (delimit_groups == DM_SEPARATE
 372                           && !first_delimiter))
 373                     putchar (delimiter);
 374                 }
 375             }
 376
 377           if (!match || output_later_repeated)
 378             {
 379               writeline (prevline, match, match_count);
 380               SWAP_LINES (prevline, thisline);
 381               prevfield = thisfield;
 382               prevlen = thislen;
 383               if (!match)
 384                 match_count = 0;
 385             }
 386         }
 387
 388       writeline (prevline, false, match_count);
 389     }
 390
 391  closefiles:
 392   if (ferror (stdin) || fclose (stdin) != 0)
 393     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 394
 395   /* stdout is handled via the atexit-invoked close_stdout function.  */
 396
 397   free (lb1.buffer);
 398   free (lb2.buffer);
 399 }
 400
 401 enum Skip_field_option_type
 402   {
 403     SFO_NONE,
 404     SFO_OBSOLETE,
 405     SFO_NEW
 406   };
 407
 408 int
 409 main (int argc, char **argv)
 410 {
 411   int optc = 0;
 412   bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
 413   enum Skip_field_option_type skip_field_option_type = SFO_NONE;
 414   int nfiles = 0;
 415   char const *file[2];
 416   char delimiter = '\n';        /* change with --zero-terminated, -z */
 417
 418   file[0] = file[1] = "-";
 419   initialize_main (&argc, &argv);
 420   set_program_name (argv[0]);
 421   setlocale (LC_ALL, "");
 422   bindtextdomain (PACKAGE, LOCALEDIR);
 423   textdomain (PACKAGE);
 424   hard_LC_COLLATE = hard_locale (LC_COLLATE);
 425
 426   atexit (close_stdout);
 427
 428   skip_chars = 0;
 429   skip_fields = 0;
 430   check_chars = SIZE_MAX;
 431   output_unique = output_first_repeated = true;
 432   output_later_repeated = false;
 433   countmode = count_none;
 434   delimit_groups = DM_NONE;
 435
 436   for (;;)
 437     {
 438       /* Parse an operand with leading "+" as a file after "--" was
 439          seen; or if pedantic and a file was seen; or if not
 440          obsolete.  */
 441
 442       if (optc == -1
 443           || (posixly_correct && nfiles != 0)
 444           || ((optc = getopt_long (argc, argv,
 445                                    "-0123456789Dcdf:is:uw:z", longopts, NULL))
 446               == -1))
 447         {
 448           if (argc <= optind)
 449             break;
 450           if (nfiles == 2)
 451             {
 452               error (0, 0, _("extra operand %s"), quote (argv[optind]));
 453               usage (EXIT_FAILURE);
 454             }
 455           file[nfiles++] = argv[optind++];
 456         }
 457       else switch (optc)
 458         {
 459         case 1:
 460           {
 461             unsigned long int size;
 462             if (optarg[0] == '+'
 463                 && posix2_version () < 200112
 464                 && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
 465                 && size <= SIZE_MAX)
 466               skip_chars = size;
 467             else if (nfiles == 2)
 468               {
 469                 error (0, 0, _("extra operand %s"), quote (optarg));
 470                 usage (EXIT_FAILURE);
 471               }
 472             else
 473               file[nfiles++] = optarg;
 474           }
 475           break;
 476
 477         case '0':
 478         case '1':
 479         case '2':
 480         case '3':
 481         case '4':
 482         case '5':
 483         case '6':
 484         case '7':
 485         case '8':
 486         case '9':
 487           {
 488             if (skip_field_option_type == SFO_NEW)
 489               skip_fields = 0;
 490
 491             if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
 492               skip_fields = SIZE_MAX;
 493
 494             skip_field_option_type = SFO_OBSOLETE;
 495           }
 496           break;
 497
 498         case 'c':
 499           countmode = count_occurrences;
 500           break;
 501
 502         case 'd':
 503           output_unique = false;
 504           break;
 505
 506         case 'D':
 507           output_unique = false;
 508           output_later_repeated = true;
 509           if (optarg == NULL)
 510             delimit_groups = DM_NONE;
 511           else
 512             delimit_groups = XARGMATCH ("--all-repeated", optarg,
 513                                         delimit_method_string,
 514                                         delimit_method_map);
 515           break;
 516
 517         case 'f':
 518           skip_field_option_type = SFO_NEW;
 519           skip_fields = size_opt (optarg,
 520                                   N_("invalid number of fields to skip"));
 521           break;
 522
 523         case 'i':
 524           ignore_case = true;
 525           break;
 526
 527         case 's':
 528           skip_chars = size_opt (optarg,
 529                                  N_("invalid number of bytes to skip"));
 530           break;
 531
 532         case 'u':
 533           output_first_repeated = false;
 534           break;
 535
 536         case 'w':
 537           check_chars = size_opt (optarg,
 538                                   N_("invalid number of bytes to compare"));
 539           break;
 540
 541         case 'z':
 542           delimiter = '\0';
 543           break;
 544
 545         case_GETOPT_HELP_CHAR;
 546
 547         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
 548
 549         default:
 550           usage (EXIT_FAILURE);
 551         }
 552     }
 553
 554   if (countmode == count_occurrences && output_later_repeated)
 555     {
 556       error (0, 0,
 557            _("printing all duplicated lines and repeat counts is meaningless"));
 558       usage (EXIT_FAILURE);
 559     }
 560
 561   check_file (file[0], file[1], delimiter);
 562
 563   exit (EXIT_SUCCESS);
 564 }