src/uniq.c

   1 /* uniq -- remove duplicate lines from a sorted file
   2    Copyright (C) 86, 91, 1995-2009 Free Software Foundation, Inc.
   3
   4    This program is free software: you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published by
   6    the Free Software Foundation, either version 3 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
  16
  17 /* Written by Richard M. Stallman and David MacKenzie. */
  18 \f
  19 #include <config.h>
  20
  21 #include <stdio.h>
  22 #include <getopt.h>
  23 #include <sys/types.h>
  24
  25 #include "system.h"
  26 #include "argmatch.h"
  27 #include "linebuffer.h"
  28 #include "error.h"
  29 #include "hard-locale.h"
  30 #include "posixver.h"
  31 #include "quote.h"
  32 #include "xmemcoll.h"
  33 #include "xstrtol.h"
  34 #include "memcasecmp.h"
  35
/* The official name of this program (e.g., no `g' prefix).
   Passed to case_GETOPT_VERSION_CHAR in main.  */
#define PROGRAM_NAME "uniq"

/* Author list, passed to case_GETOPT_VERSION_CHAR in main.  */
#define AUTHORS \
  proper_name ("Richard M. Stallman"), \
  proper_name ("David MacKenzie")
  42
  43 #define SWAP_LINES(A, B)                        \
  44   do                                            \
  45     {                                           \
  46       struct linebuffer *_tmp;                  \
  47       _tmp = (A);                               \
  48       (A) = (B);                                \
  49       (B) = _tmp;                               \
  50     }                                           \
  51   while (0)
  52
/* True if the LC_COLLATE locale is hard.  Set in main from
   hard_locale (LC_COLLATE); when true (and case is not ignored),
   different () compares with xmemcoll instead of memcmp.  */
static bool hard_LC_COLLATE;

/* Number of fields to skip on each line when doing comparisons.
   Set by -f N or by the obsolete -N digit options.  */
static size_t skip_fields;

/* Number of chars to skip after skipping any fields.
   Set by -s N or by the obsolete +N operand.  */
static size_t skip_chars;

/* Number of chars to compare.  Set by -w N; main initializes it to
   SIZE_MAX, in which case different () leaves the lengths unclamped.  */
static size_t check_chars;

/* Note: count_occurrences is the first enumerator and therefore has the
   value 0.  Always compare `countmode' against it explicitly; testing
   the bare constant as a truth value is always false.  */
enum countmode
{
  count_occurrences,            /* -c Print count before output lines. */
  count_none                    /* Default.  Do not print counts. */
};

/* Whether and how to precede the output lines with a count of the number of
   times they occurred in the input. */
static enum countmode countmode;

/* Which lines to output: unique lines, the first of a group of
   repeated lines, and the second and subsequent lines of a group of
   repeated lines.
   main starts with output_unique and output_first_repeated true and
   output_later_repeated false; -d clears output_unique, -u clears
   output_first_repeated, and -D clears output_unique and sets
   output_later_repeated.  */
static bool output_unique;
static bool output_first_repeated;
static bool output_later_repeated;

/* If true, ignore case when comparing.  Set by -i.  */
static bool ignore_case;
  84
/* How groups of duplicate lines are delimited under --all-repeated.  */
enum delimit_method
{
  /* No delimiters output.  --all-repeated[=none] */
  DM_NONE,

  /* Delimiter precedes all groups.  --all-repeated=prepend */
  DM_PREPEND,

  /* Delimit all groups.  --all-repeated=separate */
  DM_SEPARATE
};

/* Argument names accepted by --all-repeated=METHOD.  This table and
   delimit_method_map below are passed together to XARGMATCH in main;
   presumably entry I of the map is the value selected by name I, so
   keep the two tables in the same order.  */
static char const *const delimit_method_string[] =
{
  "none", "prepend", "separate", NULL
};

/* Values corresponding to the names in delimit_method_string.  */
static enum delimit_method const delimit_method_map[] =
{
  DM_NONE, DM_PREPEND, DM_SEPARATE
};

/* Select whether/how to delimit groups of duplicate lines.  */
static enum delimit_method delimit_groups;
 109
/* Long options handed to getopt_long in main.  Each entry maps the long
   name to the short-option character handled by main's switch;
   --all-repeated is the only option whose argument is optional.  */
static struct option const longopts[] =
{
  {"count", no_argument, NULL, 'c'},
  {"repeated", no_argument, NULL, 'd'},
  {"all-repeated", optional_argument, NULL, 'D'},
  {"ignore-case", no_argument, NULL, 'i'},
  {"unique", no_argument, NULL, 'u'},
  {"skip-fields", required_argument, NULL, 'f'},
  {"skip-chars", required_argument, NULL, 's'},
  {"check-chars", required_argument, NULL, 'w'},
  {"zero-terminated", no_argument, NULL, 'z'},
  {GETOPT_HELP_OPTION_DECL},
  {GETOPT_VERSION_OPTION_DECL},
  {NULL, 0, NULL, 0}
};
 125
 126 void
 127 usage (int status)
 128 {
 129   if (status != EXIT_SUCCESS)
 130     fprintf (stderr, _("Try `%s --help' for more information.\n"),
 131              program_name);
 132   else
 133     {
 134       printf (_("\
 135 Usage: %s [OPTION]... [INPUT [OUTPUT]]\n\
 136 "),
 137               program_name);
 138       fputs (_("\
 139 Filter adjacent matching lines from INPUT (or standard input),\n\
 140 writing to OUTPUT (or standard output).\n\
 141 \n\
 142 With no options, matching lines are merged to the first occurrence.\n\
 143 \n\
 144 "), stdout);
 145      fputs (_("\
 146 Mandatory arguments to long options are mandatory for short options too.\n\
 147 "), stdout);
 148      fputs (_("\
 149   -c, --count           prefix lines by the number of occurrences\n\
 150   -d, --repeated        only print duplicate lines\n\
 151 "), stdout);
 152      fputs (_("\
 153   -D, --all-repeated[=delimit-method]  print all duplicate lines\n\
 154                         delimit-method={none(default),prepend,separate}\n\
 155                         Delimiting is done with blank lines.\n\
 156   -f, --skip-fields=N   avoid comparing the first N fields\n\
 157   -i, --ignore-case     ignore differences in case when comparing\n\
 158   -s, --skip-chars=N    avoid comparing the first N characters\n\
 159   -u, --unique          only print unique lines\n\
 160   -z, --zero-terminated  end lines with 0 byte, not newline\n\
 161 "), stdout);
 162      fputs (_("\
 163   -w, --check-chars=N   compare no more than N characters in lines\n\
 164 "), stdout);
 165      fputs (HELP_OPTION_DESCRIPTION, stdout);
 166      fputs (VERSION_OPTION_DESCRIPTION, stdout);
 167      fputs (_("\
 168 \n\
 169 A field is a run of blanks (usually spaces and/or TABs), then non-blank\n\
 170 characters.  Fields are skipped before chars.\n\
 171 "), stdout);
 172      fputs (_("\
 173 \n\
 174 Note: 'uniq' does not detect repeated lines unless they are adjacent.\n\
 175 You may want to sort the input first, or use `sort -u' without `uniq'.\n\
 176 Also, comparisons honor the rules specified by `LC_COLLATE'.\n\
 177 "), stdout);
 178       emit_ancillary_info ();
 179     }
 180   exit (status);
 181 }
 182
 183 /* Convert OPT to size_t, reporting an error using MSGID if OPT is
 184    invalid.  Silently convert too-large values to SIZE_MAX.  */
 185
 186 static size_t
 187 size_opt (char const *opt, char const *msgid)
 188 {
 189   unsigned long int size;
 190   verify (SIZE_MAX <= ULONG_MAX);
 191
 192   switch (xstrtoul (opt, NULL, 10, &size, ""))
 193     {
 194     case LONGINT_OK:
 195     case LONGINT_OVERFLOW:
 196       break;
 197
 198     default:
 199       error (EXIT_FAILURE, 0, "%s: %s", opt, _(msgid));
 200     }
 201
 202   return MIN (size, SIZE_MAX);
 203 }
 204
 205 /* Given a linebuffer LINE,
 206    return a pointer to the beginning of the line's field to be compared. */
 207
 208 static char *
 209 find_field (struct linebuffer const *line)
 210 {
 211   size_t count;
 212   char const *lp = line->buffer;
 213   size_t size = line->length - 1;
 214   size_t i = 0;
 215
 216   for (count = 0; count < skip_fields; count++)
 217     {
 218       while (i < size && isblank (to_uchar (lp[i])))
 219         i++;
 220       while (i < size && !isblank (to_uchar (lp[i])))
 221         i++;
 222     }
 223
 224   for (count = 0; count < skip_chars && i < size; count++)
 225     i++;
 226
 227   return line->buffer + i;
 228 }
 229
 230 /* Return false if two strings OLD and NEW match, true if not.
 231    OLD and NEW point not to the beginnings of the lines
 232    but rather to the beginnings of the fields to compare.
 233    OLDLEN and NEWLEN are their lengths. */
 234
 235 static bool
 236 different (char *old, char *new, size_t oldlen, size_t newlen)
 237 {
 238   if (check_chars < oldlen)
 239     oldlen = check_chars;
 240   if (check_chars < newlen)
 241     newlen = check_chars;
 242
 243   if (ignore_case)
 244     {
 245       /* FIXME: This should invoke strcoll somehow.  */
 246       return oldlen != newlen || memcasecmp (old, new, oldlen);
 247     }
 248   else if (hard_LC_COLLATE)
 249     return xmemcoll (old, oldlen, new, newlen) != 0;
 250   else
 251     return oldlen != newlen || memcmp (old, new, oldlen);
 252 }
 253
 254 /* Output the line in linebuffer LINE to standard output
 255    provided that the switches say it should be output.
 256    MATCH is true if the line matches the previous line.
 257    If requested, print the number of times it occurred, as well;
 258    LINECOUNT + 1 is the number of times that the line occurred. */
 259
 260 static void
 261 writeline (struct linebuffer const *line,
 262            bool match, uintmax_t linecount)
 263 {
 264   if (! (linecount == 0 ? output_unique
 265          : !match ? output_first_repeated
 266          : output_later_repeated))
 267     return;
 268
 269   if (countmode == count_occurrences)
 270     printf ("%7" PRIuMAX " ", linecount + 1);
 271
 272   fwrite (line->buffer, sizeof (char), line->length, stdout);
 273 }
 274
 275 /* Process input file INFILE with output to OUTFILE.
 276    If either is "-", use the standard I/O stream for it instead. */
 277
 278 static void
 279 check_file (const char *infile, const char *outfile, char delimiter)
 280 {
 281   struct linebuffer lb1, lb2;
 282   struct linebuffer *thisline, *prevline;
 283
 284   if (! (STREQ (infile, "-") || freopen (infile, "r", stdin)))
 285     error (EXIT_FAILURE, errno, "%s", infile);
 286   if (! (STREQ (outfile, "-") || freopen (outfile, "w", stdout)))
 287     error (EXIT_FAILURE, errno, "%s", outfile);
 288
 289   thisline = &lb1;
 290   prevline = &lb2;
 291
 292   initbuffer (thisline);
 293   initbuffer (prevline);
 294
 295   /* The duplication in the following `if' and `else' blocks is an
 296      optimization to distinguish the common case (in which none of
 297      the following options has been specified: --count, -repeated,
 298      --all-repeated, --unique) from the others.  In the common case,
 299      this optimization lets uniq output each different line right away,
 300      without waiting to see if the next one is different.  */
 301
 302   if (output_unique && output_first_repeated && countmode == count_none)
 303     {
 304       char *prevfield IF_LINT (= NULL);
 305       size_t prevlen IF_LINT (= 0);
 306
 307       while (!feof (stdin))
 308         {
 309           char *thisfield;
 310           size_t thislen;
 311           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 312             break;
 313           thisfield = find_field (thisline);
 314           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 315           if (prevline->length == 0
 316               || different (thisfield, prevfield, thislen, prevlen))
 317             {
 318               fwrite (thisline->buffer, sizeof (char),
 319                       thisline->length, stdout);
 320
 321               SWAP_LINES (prevline, thisline);
 322               prevfield = thisfield;
 323               prevlen = thislen;
 324             }
 325         }
 326     }
 327   else
 328     {
 329       char *prevfield;
 330       size_t prevlen;
 331       uintmax_t match_count = 0;
 332       bool first_delimiter = true;
 333
 334       if (readlinebuffer_delim (prevline, stdin, delimiter) == 0)
 335         goto closefiles;
 336       prevfield = find_field (prevline);
 337       prevlen = prevline->length - 1 - (prevfield - prevline->buffer);
 338
 339       while (!feof (stdin))
 340         {
 341           bool match;
 342           char *thisfield;
 343           size_t thislen;
 344           if (readlinebuffer_delim (thisline, stdin, delimiter) == 0)
 345             {
 346               if (ferror (stdin))
 347                 goto closefiles;
 348               break;
 349             }
 350           thisfield = find_field (thisline);
 351           thislen = thisline->length - 1 - (thisfield - thisline->buffer);
 352           match = !different (thisfield, prevfield, thislen, prevlen);
 353           match_count += match;
 354
 355           if (match_count == UINTMAX_MAX)
 356             {
 357               if (count_occurrences)
 358                 error (EXIT_FAILURE, 0, _("too many repeated lines"));
 359               match_count--;
 360             }
 361
 362           if (delimit_groups != DM_NONE)
 363             {
 364               if (!match)
 365                 {
 366                   if (match_count) /* a previous match */
 367                     first_delimiter = false; /* Only used when DM_SEPARATE */
 368                 }
 369               else if (match_count == 1)
 370                 {
 371                   if ((delimit_groups == DM_PREPEND)
 372                       || (delimit_groups == DM_SEPARATE
 373                           && !first_delimiter))
 374                     putchar (delimiter);
 375                 }
 376             }
 377
 378           if (!match || output_later_repeated)
 379             {
 380               writeline (prevline, match, match_count);
 381               SWAP_LINES (prevline, thisline);
 382               prevfield = thisfield;
 383               prevlen = thislen;
 384               if (!match)
 385                 match_count = 0;
 386             }
 387         }
 388
 389       writeline (prevline, false, match_count);
 390     }
 391
 392  closefiles:
 393   if (ferror (stdin) || fclose (stdin) != 0)
 394     error (EXIT_FAILURE, 0, _("error reading %s"), infile);
 395
 396   /* stdout is handled via the atexit-invoked close_stdout function.  */
 397
 398   free (lb1.buffer);
 399   free (lb2.buffer);
 400 }
 401
/* Records which form of the skip-fields option main saw most recently,
   so that a run of obsolete -N digits following a -f option starts
   accumulating from zero instead of appending to -f's value.  */
enum Skip_field_option_type
  {
    SFO_NONE,           /* No skip-fields option seen yet.  */
    SFO_OBSOLETE,       /* Last seen: an obsolete -N digit option.  */
    SFO_NEW             /* Last seen: -f N / --skip-fields=N.  */
  };
 408
/* Entry point: set up the locale and defaults, parse options and up to
   two file operands (INPUT and OUTPUT, each defaulting to "-"), reject
   the -c/-D combination, then run check_file and exit successfully.  */
int
main (int argc, char **argv)
{
  int optc = 0;
  bool posixly_correct = (getenv ("POSIXLY_CORRECT") != NULL);
  enum Skip_field_option_type skip_field_option_type = SFO_NONE;
  int nfiles = 0;
  char const *file[2];
  char delimiter = '\n';        /* change with --zero-terminated, -z */

  file[0] = file[1] = "-";
  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);
  hard_LC_COLLATE = hard_locale (LC_COLLATE);

  atexit (close_stdout);

  /* Defaults: compare whole lines, print unique lines and the first
     line of each repeated group, no counts, no group delimiters.  */
  skip_chars = 0;
  skip_fields = 0;
  check_chars = SIZE_MAX;
  output_unique = output_first_repeated = true;
  output_later_repeated = false;
  countmode = count_none;
  delimit_groups = DM_NONE;

  for (;;)
    {
      /* Parse an operand with leading "+" as a file after "--" was
         seen; or if pedantic and a file was seen; or if not
         obsolete.  */

      /* The tests run left to right, so getopt_long is no longer
         called once it has returned -1, or once a file has been seen
         under POSIXLY_CORRECT; the remaining arguments are then taken
         as file operands.  */
      if (optc == -1
          || (posixly_correct && nfiles != 0)
          || ((optc = getopt_long (argc, argv,
                                   "-0123456789Dcdf:is:uw:z", longopts, NULL))
              == -1))
        {
          if (argc <= optind)
            break;
          if (nfiles == 2)
            {
              error (0, 0, _("extra operand %s"), quote (argv[optind]));
              usage (EXIT_FAILURE);
            }
          file[nfiles++] = argv[optind++];
        }
      else switch (optc)
        {
        /* A non-option argument, delivered as option code 1 because the
           option string begins with '-'.  It is either an obsolete +N
           skip-chars request or a file operand.  */
        case 1:
          {
            unsigned long int size;
            if (optarg[0] == '+'
                && posix2_version () < 200112
                && xstrtoul (optarg, NULL, 10, &size, "") == LONGINT_OK
                && size <= SIZE_MAX)
              skip_chars = size;
            else if (nfiles == 2)
              {
                error (0, 0, _("extra operand %s"), quote (optarg));
                usage (EXIT_FAILURE);
              }
            else
              file[nfiles++] = optarg;
          }
          break;

        /* Obsolete -N skip-fields syntax, accumulated one digit at a
           time.  */
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          {
            /* Digits after a -f option start a fresh number.  */
            if (skip_field_option_type == SFO_NEW)
              skip_fields = 0;

            /* If accumulation fails (presumably on overflow), saturate.  */
            if (!DECIMAL_DIGIT_ACCUMULATE (skip_fields, optc - '0', size_t))
              skip_fields = SIZE_MAX;

            skip_field_option_type = SFO_OBSOLETE;
          }
          break;

        case 'c':
          countmode = count_occurrences;
          break;

        case 'd':
          output_unique = false;
          break;

        case 'D':
          output_unique = false;
          output_later_repeated = true;
          /* The delimit-method argument is optional.  */
          if (optarg == NULL)
            delimit_groups = DM_NONE;
          else
            delimit_groups = XARGMATCH ("--all-repeated", optarg,
                                        delimit_method_string,
                                        delimit_method_map);
          break;

        case 'f':
          skip_field_option_type = SFO_NEW;
          skip_fields = size_opt (optarg,
                                  N_("invalid number of fields to skip"));
          break;

        case 'i':
          ignore_case = true;
          break;

        case 's':
          skip_chars = size_opt (optarg,
                                 N_("invalid number of bytes to skip"));
          break;

        case 'u':
          output_first_repeated = false;
          break;

        case 'w':
          check_chars = size_opt (optarg,
                                  N_("invalid number of bytes to compare"));
          break;

        case 'z':
          delimiter = '\0';
          break;

        case_GETOPT_HELP_CHAR;

        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

        default:
          usage (EXIT_FAILURE);
        }
    }

  /* -c together with -D is rejected.  */
  if (countmode == count_occurrences && output_later_repeated)
    {
      error (0, 0,
           _("printing all duplicated lines and repeat counts is meaningless"));
      usage (EXIT_FAILURE);
    }

  check_file (file[0], file[1], delimiter);

  exit (EXIT_SUCCESS);
}