src/ptx.c

   1 /* Permuted index for GNU, with keywords in their context.
   2    Copyright (C) 1990-2022 Free Software Foundation, Inc.
   3    François Pinard <pinard@iro.umontreal.ca>, 1988.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  17
  18    François Pinard <pinard@iro.umontreal.ca> */
  19
  20 #include <config.h>
  21
  22 #include <getopt.h>
  23 #include <sys/types.h>
  24 #include "system.h"
  25 #include "die.h"
  26 #include <regex.h>
  27 #include "argmatch.h"
  28 #include "error.h"
  29 #include "fadvise.h"
  30 #include "quote.h"
  31 #include "read-file.h"
  32 #include "stdio--.h"
  33 #include "xstrtol.h"
  34
  35 /* The official name of this program (e.g., no 'g' prefix).  */
  36 #define PROGRAM_NAME "ptx"
  37
  38 /* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
  39    if "ç" (c-with-cedilla) is available in the translation's character
  40    set and encoding.  */
  41 #define AUTHORS proper_name_utf8 ("F. Pinard", "Fran\xc3\xa7ois Pinard")
  42
/* Number of possible characters in a byte.  This is the size of the
   per-character tables declared in this file (case folding, word
   fastmap, edited flags).  */
#define CHAR_SET_SIZE 256

/* True if C is an octal digit, '0' through '7'.  C is evaluated twice.  */
#define ISODIGIT(C) ((C) >= '0' && (C) <= '7')

/* Numeric value of the hexadecimal digit C.  No validation is done:
   any C outside 'a'-'f' and 'A'-'F' is converted as if it were a
   decimal digit, so the caller must check C with isxdigit first.
   C is evaluated several times.  */
#define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
                     : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')

/* Numeric value of the octal digit C.  C is not validated; check it
   with ISODIGIT first.  */
#define OCTTOBIN(C) ((C) - '0')
  50
  51 /* Debugging the memory allocator.  */
  52
  53 #if WITH_DMALLOC
  54 # define MALLOC_FUNC_CHECK 1
  55 # include <dmalloc.h>
  56 #endif
  57
  58 /* Global definitions.  */
  59
  60 /* FIXME: There are many unchecked integer overflows in this file,
  61    and in theory they could cause this command to have undefined
  62    behavior given large inputs or options.  This command should
  63    diagnose any such overflow and exit.  */
  64
  65 /* Program options.  */
  66
/* Output formats.  UNKNOWN_FORMAT is the value output_format starts
   with, i.e., no format has been selected yet.  */
enum Format
{
  UNKNOWN_FORMAT,               /* output format still unknown */
  DUMB_FORMAT,                  /* output for a dumb terminal */
  ROFF_FORMAT,                  /* output for 'troff' or 'nroff' */
  TEX_FORMAT                    /* output for 'TeX' or 'LaTeX' */
};
  74
/* General and output options, shown with their default values.  */

static bool gnu_extensions = true;      /* trigger all GNU extensions */
static bool auto_reference = false;     /* refs are 'file_name:line_number:' */
static bool input_reference = false;    /* refs at beginning of input lines */
static bool right_reference = false;    /* output refs after right context  */
static ptrdiff_t line_width = 72;       /* output line width in characters */
static ptrdiff_t gap_size = 3;  /* number of spaces between output fields */
static char const *truncation_string = "/";
                                /* string used to mark line truncations */
static char const *macro_name = "xx";   /* macro name for roff or TeX output */
static enum Format output_format = UNKNOWN_FORMAT;
                                /* output format */

/* Input analysis options.  initialize_regex builds the default word
   fastmap only while break_file is still NULL.  */

static bool ignore_case = false;        /* fold lower to upper for sorting */
static char const *break_file = NULL;   /* name of the 'Break chars' file */
static char const *only_file = NULL;    /* name of the 'Only words' file */
static char const *ignore_file = NULL;  /* name of the 'Ignore words' file */
  91
/* Options that use regular expressions.  */
struct regex_data
{
  /* The original regular expression, as a string.  NULL when no
     expression is in use: initialize_regex and SKIP_SOMETHING test
     this field before using PATTERN.  */
  char const *string;

  /* The compiled regular expression, and its fastmap.  compile_regex
     compiles STRING into PATTERN and makes PATTERN.fastmap point at
     FASTMAP.  */
  struct re_pattern_buffer pattern;
  char fastmap[UCHAR_MAX + 1];
};

static struct regex_data context_regex; /* end of context */
static struct regex_data word_regex;    /* keyword */
 105
/* A BLOCK delimits a region in memory of arbitrary size, like the copy
   of a whole file.  A WORD is similar, except it is intended for smaller
   regions and records a length instead of an end pointer.  A WORD_TABLE
   may contain several WORDs.  None of these regions is required to be
   NUL-terminated.  */

typedef struct
  {
    char *start;                /* pointer to beginning of region */
    char *end;                  /* pointer to end + 1 of region */
  }
BLOCK;

typedef struct
  {
    char *start;                /* pointer to beginning of region */
    ptrdiff_t size;             /* length of the region */
  }
WORD;

/* digest_word_file fills such a table and sorts it with compare_words,
   which is the order search_table relies on.  */
typedef struct
  {
    WORD *start;                /* array of WORDs */
    size_t alloc;               /* allocated length */
    ptrdiff_t length;           /* number of used entries */
  }
WORD_TABLE;
 131
/* Pattern description tables.  */

/* For each character, provide its folded equivalent.  The table is
   filled by initialize_regex, with toupper, only when ignore_case is
   set; it is also given to the regex compiler as translate table.  */
static unsigned char folded_chars[CHAR_SET_SIZE];

/* End of context pattern register indices.  Filled by the re_search
   call in find_occurs_in_text.  */
static struct re_registers context_regs;

/* Keyword pattern register indices.  */
static struct re_registers word_regs;

/* A word characters fastmap is used only when no word regexp has been
   provided.  A word is then made up of a sequence of one or more characters
   allowed by the fastmap.  Contains !0 if character allowed in word.  Not
   only this is faster in most cases, but it simplifies the implementation
   of the Break files.  The table is set up either by digest_break_file
   or by initialize_regex.  */
static char word_fastmap[CHAR_SET_SIZE];

/* Maximum length of any word read.  */
static ptrdiff_t maximum_word_length;

/* Maximum width of any reference used.  */
static ptrdiff_t reference_max_width;

/* Ignore and Only word tables.  */

static WORD_TABLE ignore_table; /* table of words to ignore */
static WORD_TABLE only_table;           /* table of words to select */

/* Source text table, and scanning macros.  */

static int number_input_files;  /* number of text input files */
static intmax_t total_line_count;       /* total number of lines seen so far */
static char const **input_file_name;    /* array of text input file names */
static intmax_t *file_line_count;       /* array of line count values at end */

/* One BLOCK per input file; find_occurs_in_text indexes this array
   with its FILE_INDEX argument.  */
static BLOCK *text_buffers;     /* files to study */
 169
/* Cursor-advancing helpers.  Each macro expands to a bare statement
   (it is not wrapped in do ... while (0)) and evaluates its arguments
   several times: pass plain pointer variables, and use the macro only
   as a complete statement followed by a semicolon.  */

/* SKIP_NON_WHITE used only for getting or skipping the reference.
   It advances CURSOR to the next white space character, or to LIMIT.  */

#define SKIP_NON_WHITE(cursor, limit) \
  while (cursor < limit && ! isspace (to_uchar (*cursor)))              \
    cursor++

/* Advance CURSOR over white space, stopping at LIMIT.  */

#define SKIP_WHITE(cursor, limit) \
  while (cursor < limit && isspace (to_uchar (*cursor)))                \
    cursor++

/* Move CURSOR backwards over the white space preceding it, without
   going before START.  */

#define SKIP_WHITE_BACKWARDS(cursor, start) \
  while (cursor > start && isspace (to_uchar (cursor[-1])))             \
    cursor--

/* Advance CURSOR past one item.  When a word regexp is in use, skip
   what re_match matches at CURSOR, or a single byte if nothing matches
   (a -2 result is reported through matcher_error).  Otherwise skip a
   whole run of word_fastmap characters, or a single non-word byte.

   NOTE: the fastmap branch reads *CURSOR before comparing CURSOR with
   LIMIT, so CURSOR < LIMIT must hold on entry.  A zero-length regexp
   match leaves CURSOR unchanged.  Because the expansion is an if/else
   chain, do not use it as the unbraced body of an 'if' that has its
   own 'else'.  */

#define SKIP_SOMETHING(cursor, limit) \
  if (word_regex.string)                                                \
    {                                                                   \
      regoff_t count;                                                   \
      count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
      if (count == -2)                                                  \
        matcher_error ();                                               \
      cursor += count == -1 ? 1 : count;                                \
    }                                                                   \
  else if (word_fastmap[to_uchar (*cursor)])                            \
    while (cursor < limit && word_fastmap[to_uchar (*cursor)])          \
      cursor++;                                                         \
  else                                                                  \
    cursor++
 198
/* Occurrences table.

   The keyword pointer (key.start) provides the central word, which is
   surrounded by a left context and a right context.  The keyword
   pointer and its length (key.size) allow full 8-bit character keys,
   even including NULs.  At other places in this program, the name
   'keyafter' refers to the keyword followed by its right context.

   The left context does not extend, towards the beginning of the file,
   further than a distance given by the 'left' value.  This value is
   relative to the keyword beginning, it is usually negative.  This
   ensures that, except for white space, we will never have to backward
   scan the source text, when it is time to generate the final output
   lines.

   The right context, indirectly attainable through the keyword end, does
   not extend, towards the end of the file, further than a distance given
   by the 'right' value.  This value is relative to the keyword
   beginning, it is usually positive.

   When automatic references are used, the 'reference' value is the
   overall line number in all input files read so far; in this case, it
   is of type intmax_t.  When input references are used, the 'reference'
   value indicates the distance between the keyword beginning and the
   start of the reference field, and it fits in ptrdiff_t and is usually
   negative.  */

typedef struct
  {
    WORD key;                   /* description of the keyword */
    ptrdiff_t left;             /* distance to left context start */
    ptrdiff_t right;            /* distance to right context end */
    intmax_t reference;         /* reference descriptor */
    int file_index;             /* corresponding file  */
  }
OCCURS;
 235
/* The various OCCURS tables are indexed by the language.  For the time
   being, there is no such multiple language support, so each array has
   a single element and only index 0 is used.  */

static OCCURS *occurs_table[1]; /* all words retained from the read text */
static size_t occurs_alloc[1];  /* allocated size of occurs_table */
static ptrdiff_t number_of_occurs[1]; /* number of used slots in occurs_table */
 242
 243
/* Communication among output routines.  */

/* Indicate if special output processing is requested for each character.
   Indexed by character value.  */
static char edited_flag[CHAR_SET_SIZE];

/* Half of line width, reference excluded.  */
static ptrdiff_t half_line_width;

/* Maximum width of before field.  */
static ptrdiff_t before_max_width;

/* Maximum width of keyword-and-after field.  */
static ptrdiff_t keyafter_max_width;

/* Length of string that flags truncation.  */
static ptrdiff_t truncation_string_length;

/* When context is limited by lines, wraparound may happen on final output:
   the 'head' pointer gives access to some supplementary left context which
   will be seen at the end of the output line, the 'tail' pointer gives
   access to some supplementary right context which will be seen at the
   beginning of the output line.  Each field below is a BLOCK, i.e., a
   start pointer and an end + 1 pointer, paired with a flag telling
   whether a truncation mark goes next to it.  */

static BLOCK tail;              /* tail field */
static bool tail_truncation;    /* flag truncation after the tail field */

static BLOCK before;            /* before field */
static bool before_truncation;  /* flag truncation before the before field */

static BLOCK keyafter;          /* keyword-and-after field */
static bool keyafter_truncation; /* flag truncation after the keyafter field */

static BLOCK head;              /* head field */
static bool head_truncation;    /* flag truncation before the head field */

static BLOCK reference;         /* reference field for input reference mode */
 280
 281 /* Miscellaneous routines.  */
 282
 283 /* Diagnose an error in the regular expression matcher.  Then exit.  */
 284
 285 static void
 286 matcher_error (void)
 287 {
 288   die (EXIT_FAILURE, errno, _("error in regular expression matcher"));
 289 }
 290
 291 /*------------------------------------------------------.
 292 | Duplicate string STRING, while evaluating \-escapes.  |
 293 `------------------------------------------------------*/
 294
 295 /* Loosely adapted from GNU sh-utils printf.c code.  */
 296
 297 static char *
 298 copy_unescaped_string (char const *string)
 299 {
 300   char *result;                 /* allocated result */
 301   char *cursor;                 /* cursor in result */
 302   int value;                    /* value of \nnn escape */
 303   int length;                   /* length of \nnn escape */
 304
 305   result = xmalloc (strlen (string) + 1);
 306   cursor = result;
 307
 308   while (*string)
 309     {
 310       if (*string == '\\')
 311         {
 312           string++;
 313           switch (*string)
 314             {
 315             case 'x':           /* \xhhh escape, 3 chars maximum */
 316               value = 0;
 317               for (length = 0, string++;
 318                    length < 3 && isxdigit (to_uchar (*string));
 319                    length++, string++)
 320                 value = value * 16 + HEXTOBIN (*string);
 321               if (length == 0)
 322                 {
 323                   *cursor++ = '\\';
 324                   *cursor++ = 'x';
 325                 }
 326               else
 327                 *cursor++ = value;
 328               break;
 329
 330             case '0':           /* \0ooo escape, 3 chars maximum */
 331               value = 0;
 332               for (length = 0, string++;
 333                    length < 3 && ISODIGIT (*string);
 334                    length++, string++)
 335                 value = value * 8 + OCTTOBIN (*string);
 336               *cursor++ = value;
 337               break;
 338
 339             case 'a':           /* alert */
 340 #if __STDC__
 341               *cursor++ = '\a';
 342 #else
 343               *cursor++ = 7;
 344 #endif
 345               string++;
 346               break;
 347
 348             case 'b':           /* backspace */
 349               *cursor++ = '\b';
 350               string++;
 351               break;
 352
 353             case 'c':           /* cancel the rest of the output */
 354               while (*string)
 355                 string++;
 356               break;
 357
 358             case 'f':           /* form feed */
 359               *cursor++ = '\f';
 360               string++;
 361               break;
 362
 363             case 'n':           /* new line */
 364               *cursor++ = '\n';
 365               string++;
 366               break;
 367
 368             case 'r':           /* carriage return */
 369               *cursor++ = '\r';
 370               string++;
 371               break;
 372
 373             case 't':           /* horizontal tab */
 374               *cursor++ = '\t';
 375               string++;
 376               break;
 377
 378             case 'v':           /* vertical tab */
 379 #if __STDC__
 380               *cursor++ = '\v';
 381 #else
 382               *cursor++ = 11;
 383 #endif
 384               string++;
 385               break;
 386
 387             case '\0':          /* lone backslash at end of string */
 388               /* ignore it */
 389               break;
 390
 391             default:
 392               *cursor++ = '\\';
 393               *cursor++ = *string++;
 394               break;
 395             }
 396         }
 397       else
 398         *cursor++ = *string++;
 399     }
 400
 401   *cursor = '\0';
 402   return result;
 403 }
 404
 405 /*--------------------------------------------------------------------------.
 406 | Compile the regex represented by REGEX, diagnose and abort if any error.  |
 407 `--------------------------------------------------------------------------*/
 408
 409 static void
 410 compile_regex (struct regex_data *regex)
 411 {
 412   struct re_pattern_buffer *pattern = &regex->pattern;
 413   char const *string = regex->string;
 414   char const *message;
 415
 416   pattern->buffer = NULL;
 417   pattern->allocated = 0;
 418   pattern->fastmap = regex->fastmap;
 419   pattern->translate = ignore_case ? folded_chars : NULL;
 420
 421   message = re_compile_pattern (string, strlen (string), pattern);
 422   if (message)
 423     die (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
 424
 425   /* The fastmap should be compiled before 're_match'.  The following
 426      call is not mandatory, because 're_search' is always called sooner,
 427      and it compiles the fastmap if this has not been done yet.  */
 428
 429   re_compile_fastmap (pattern);
 430 }
 431
 432 /*------------------------------------------------------------------------.
 433 | This will initialize various tables for pattern match and compiles some |
 434 | regexps.                                                                |
 435 `------------------------------------------------------------------------*/
 436
 437 static void
 438 initialize_regex (void)
 439 {
 440   int character;                /* character value */
 441
 442   /* Initialize the case folding table.  */
 443
 444   if (ignore_case)
 445     for (character = 0; character < CHAR_SET_SIZE; character++)
 446       folded_chars[character] = toupper (character);
 447
 448   /* Unless the user already provided a description of the end of line or
 449      end of sentence sequence, select an end of line sequence to compile.
 450      If the user provided an empty definition, thus disabling end of line
 451      or sentence feature, make it NULL to speed up tests.  If GNU
 452      extensions are enabled, use end of sentence like in GNU emacs.  If
 453      disabled, use end of lines.  */
 454
 455   if (context_regex.string)
 456     {
 457       if (!*context_regex.string)
 458         context_regex.string = NULL;
 459     }
 460   else if (gnu_extensions && !input_reference)
 461     context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\|  \\)[ \t\n]*";
 462   else
 463     context_regex.string = "\n";
 464
 465   if (context_regex.string)
 466     compile_regex (&context_regex);
 467
 468   /* If the user has already provided a non-empty regexp to describe
 469      words, compile it.  Else, unless this has already been done through
 470      a user provided Break character file, construct a fastmap of
 471      characters that may appear in a word.  If GNU extensions enabled,
 472      include only letters of the underlying character set.  If disabled,
 473      include almost everything, even punctuations; stop only on white
 474      space.  */
 475
 476   if (word_regex.string)
 477     compile_regex (&word_regex);
 478   else if (!break_file)
 479     {
 480       if (gnu_extensions)
 481         {
 482
 483           /* Simulate \w+.  */
 484
 485           for (character = 0; character < CHAR_SET_SIZE; character++)
 486             word_fastmap[character] = !! isalpha (character);
 487         }
 488       else
 489         {
 490
 491           /* Simulate [^ \t\n]+.  */
 492
 493           memset (word_fastmap, 1, CHAR_SET_SIZE);
 494           word_fastmap[' '] = 0;
 495           word_fastmap['\t'] = 0;
 496           word_fastmap['\n'] = 0;
 497         }
 498     }
 499 }
 500
 501 /*------------------------------------------------------------------------.
 502 | This routine will attempt to swallow a whole file name FILE_NAME into a |
 503 | contiguous region of memory and return a description of it into BLOCK.  |
 504 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-".     |
 505 |                                                                         |
 506 | Previously, in some cases, white space compression was attempted while  |
 507 | inputting text.  This was defeating some regexps like default end of    |
 508 | sentence, which checks for two consecutive spaces.  If white space      |
 509 | compression is ever reinstated, it should be in output routines.        |
 510 `------------------------------------------------------------------------*/
 511
 512 static void
 513 swallow_file_in_memory (char const *file_name, BLOCK *block)
 514 {
 515   size_t used_length;           /* used length in memory buffer */
 516
 517   /* As special cases, a file name which is NULL or "-" indicates standard
 518      input, which is already opened.  In all other cases, open the file from
 519      its name.  */
 520   bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
 521   if (using_stdin)
 522     block->start = fread_file (stdin, 0, &used_length);
 523   else
 524     block->start = read_file (file_name, 0, &used_length);
 525
 526   if (!block->start)
 527     die (EXIT_FAILURE, errno, "%s", quotef (using_stdin ? "-" : file_name));
 528
 529   if (using_stdin)
 530     clearerr (stdin);
 531
 532   block->end = block->start + used_length;
 533 }
 534
 535 /* Sort and search routines.  */
 536
 537 /*--------------------------------------------------------------------------.
 538 | Compare two words, FIRST and SECOND, and return 0 if they are identical.  |
 539 | Return less than 0 if the first word goes before the second; return       |
 540 | greater than 0 if the first word goes after the second.                   |
 541 |                                                                           |
 542 | If a word is indeed a prefix of the other, the shorter should go first.   |
 543 `--------------------------------------------------------------------------*/
 544
 545 static int
 546 compare_words (const void *void_first, const void *void_second)
 547 {
 548 #define first ((const WORD *) void_first)
 549 #define second ((const WORD *) void_second)
 550   ptrdiff_t length;             /* minimum of two lengths */
 551   ptrdiff_t counter;            /* cursor in words */
 552   int value;                    /* value of comparison */
 553
 554   length = first->size < second->size ? first->size : second->size;
 555
 556   if (ignore_case)
 557     {
 558       for (counter = 0; counter < length; counter++)
 559         {
 560           value = (folded_chars [to_uchar (first->start[counter])]
 561                    - folded_chars [to_uchar (second->start[counter])]);
 562           if (value != 0)
 563             return value;
 564         }
 565     }
 566   else
 567     {
 568       for (counter = 0; counter < length; counter++)
 569         {
 570           value = (to_uchar (first->start[counter])
 571                    - to_uchar (second->start[counter]));
 572           if (value != 0)
 573             return value;
 574         }
 575     }
 576
 577   return first->size < second->size ? -1 : first->size > second->size;
 578 #undef first
 579 #undef second
 580 }
 581
 582 /*-----------------------------------------------------------------------.
 583 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
 584 | go first.  In case of a tie, preserve the original order through a     |
 585 | pointer comparison.                                                    |
 586 `-----------------------------------------------------------------------*/
 587
 588 static int
 589 compare_occurs (const void *void_first, const void *void_second)
 590 {
 591 #define first ((const OCCURS *) void_first)
 592 #define second ((const OCCURS *) void_second)
 593   int value;
 594
 595   value = compare_words (&first->key, &second->key);
 596   return (value ? value
 597           : first->key.start < second->key.start ? -1
 598           : first->key.start > second->key.start);
 599 #undef first
 600 #undef second
 601 }
 602
 603 /* True if WORD appears in TABLE.  Uses a binary search.  */
 604
 605 ATTRIBUTE_PURE
 606 static bool
 607 search_table (WORD *word, WORD_TABLE *table)
 608 {
 609   ptrdiff_t lowest;             /* current lowest possible index */
 610   ptrdiff_t highest;            /* current highest possible index */
 611   ptrdiff_t middle;             /* current middle index */
 612   int value;                    /* value from last comparison */
 613
 614   lowest = 0;
 615   highest = table->length - 1;
 616   while (lowest <= highest)
 617     {
 618       middle = (lowest + highest) / 2;
 619       value = compare_words (word, table->start + middle);
 620       if (value < 0)
 621         highest = middle - 1;
 622       else if (value > 0)
 623         lowest = middle + 1;
 624       else
 625         return true;
 626     }
 627   return false;
 628 }
 629
 630 /*---------------------------------------------------------------------.
 631 | Sort the whole occurs table in memory.  Presumably, 'qsort' does not |
 632 | take intermediate copies or table elements, so the sort will be      |
 633 | stabilized throughout the comparison routine.                        |
 634 `---------------------------------------------------------------------*/
 635
 636 static void
 637 sort_found_occurs (void)
 638 {
 639
 640   /* Only one language for the time being.  */
 641   if (number_of_occurs[0])
 642     qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
 643            compare_occurs);
 644 }
 645
 646 /* Parameter files reading routines.  */
 647
 648 /*----------------------------------------------------------------------.
 649 | Read a file named FILE_NAME, containing a set of break characters.    |
 650 | Build a content to the array word_fastmap in which all characters are |
 651 | allowed except those found in the file.  Characters may be repeated.  |
 652 `----------------------------------------------------------------------*/
 653
 654 static void
 655 digest_break_file (char const *file_name)
 656 {
 657   BLOCK file_contents;          /* to receive a copy of the file */
 658   char *cursor;                 /* cursor in file copy */
 659
 660   swallow_file_in_memory (file_name, &file_contents);
 661
 662   /* Make the fastmap and record the file contents in it.  */
 663
 664   memset (word_fastmap, 1, CHAR_SET_SIZE);
 665   for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
 666     word_fastmap[to_uchar (*cursor)] = 0;
 667
 668   if (!gnu_extensions)
 669     {
 670
 671       /* If GNU extensions are enabled, the only way to avoid newline as
 672          a break character is to write all the break characters in the
 673          file with no newline at all, not even at the end of the file.
 674          If disabled, spaces, tabs and newlines are always considered as
 675          break characters even if not included in the break file.  */
 676
 677       word_fastmap[' '] = 0;
 678       word_fastmap['\t'] = 0;
 679       word_fastmap['\n'] = 0;
 680     }
 681
 682   /* Return the space of the file, which is no more required.  */
 683
 684   free (file_contents.start);
 685 }
 686
 687 /*-----------------------------------------------------------------------.
 688 | Read a file named FILE_NAME, containing one word per line, then        |
 689 | construct in TABLE a table of WORD descriptors for them.  The routine  |
 690 | swallows the whole file in memory; this is at the expense of space     |
 691 | needed for newlines, which are useless; however, the reading is fast.  |
 692 `-----------------------------------------------------------------------*/
 693
 694 static void
 695 digest_word_file (char const *file_name, WORD_TABLE *table)
 696 {
 697   BLOCK file_contents;          /* to receive a copy of the file */
 698   char *cursor;                 /* cursor in file copy */
 699   char *word_start;             /* start of the current word */
 700
 701   swallow_file_in_memory (file_name, &file_contents);
 702
 703   table->start = NULL;
 704   table->alloc = 0;
 705   table->length = 0;
 706
 707   /* Read the whole file.  */
 708
 709   cursor = file_contents.start;
 710   while (cursor < file_contents.end)
 711     {
 712
 713       /* Read one line, and save the word in contains.  */
 714
 715       word_start = cursor;
 716       while (cursor < file_contents.end && *cursor != '\n')
 717         cursor++;
 718
 719       /* Record the word in table if it is not empty.  */
 720
 721       if (cursor > word_start)
 722         {
 723           if (table->length == table->alloc)
 724             table->start = x2nrealloc (table->start, &table->alloc,
 725                                        sizeof *table->start);
 726           table->start[table->length].start = word_start;
 727           table->start[table->length].size = cursor - word_start;
 728           table->length++;
 729         }
 730
 731       /* This test allows for an incomplete line at end of file.  */
 732
 733       if (cursor < file_contents.end)
 734         cursor++;
 735     }
 736
 737   /* Finally, sort all the words read.  */
 738
 739   qsort (table->start, table->length, sizeof table->start[0], compare_words);
 740 }
 741
 742 /* Keyword recognition and selection.  */
 743
 744 /*----------------------------------------------------------------------.
 745 | For each keyword in the source text, constructs an OCCURS structure.  |
 746 `----------------------------------------------------------------------*/
 747
 748 static void
 749 find_occurs_in_text (int file_index)
 750 {
 751   char *cursor;                 /* for scanning the source text */
 752   char *scan;                   /* for scanning the source text also */
 753   char *line_start;             /* start of the current input line */
 754   char *line_scan;              /* newlines scanned until this point */
 755   ptrdiff_t reference_length;   /* length of reference in input mode */
 756   WORD possible_key;            /* possible key, to ease searches */
 757   OCCURS *occurs_cursor;        /* current OCCURS under construction */
 758
 759   char *context_start;          /* start of left context */
 760   char *context_end;            /* end of right context */
 761   char *word_start;             /* start of word */
 762   char *word_end;               /* end of word */
 763   char *next_context_start;     /* next start of left context */
 764
 765   const BLOCK *text_buffer = &text_buffers[file_index];
 766
 767   /* reference_length is always used within 'if (input_reference)'.
 768      However, GNU C diagnoses that it may be used uninitialized.  The
 769      following assignment is merely to shut it up.  */
 770
 771   reference_length = 0;
 772
 773   /* Tracking where lines start is helpful for reference processing.  In
 774      auto reference mode, this allows counting lines.  In input reference
 775      mode, this permits finding the beginning of the references.
 776
 777      The first line begins with the file, skip immediately this very first
 778      reference in input reference mode, to help further rejection any word
 779      found inside it.  Also, unconditionally assigning these variable has
 780      the happy effect of shutting up lint.  */
 781
 782   line_start = text_buffer->start;
 783   line_scan = line_start;
 784   if (input_reference)
 785     {
 786       SKIP_NON_WHITE (line_scan, text_buffer->end);
 787       reference_length = line_scan - line_start;
 788       SKIP_WHITE (line_scan, text_buffer->end);
 789     }
 790
 791   /* Process the whole buffer, one line or one sentence at a time.  */
 792
 793   for (cursor = text_buffer->start;
 794        cursor < text_buffer->end;
 795        cursor = next_context_start)
 796     {
 797
 798       /* 'context_start' gets initialized before the processing of each
 799          line, or once for the whole buffer if no end of line or sentence
 800          sequence separator.  */
 801
 802       context_start = cursor;
 803
 804       /* If an end of line or end of sentence sequence is defined and
 805          non-empty, 'next_context_start' will be recomputed to be the end of
 806          each line or sentence, before each one is processed.  If no such
 807          sequence, then 'next_context_start' is set at the end of the whole
 808          buffer, which is then considered to be a single line or sentence.
 809          This test also accounts for the case of an incomplete line or
 810          sentence at the end of the buffer.  */
 811
 812       next_context_start = text_buffer->end;
 813       if (context_regex.string)
 814         switch (re_search (&context_regex.pattern, cursor,
 815                            text_buffer->end - cursor,
 816                            0, text_buffer->end - cursor, &context_regs))
 817           {
 818           case -2:
 819             matcher_error ();
 820
 821           case -1:
 822             break;
 823
 824           case 0:
 825             die (EXIT_FAILURE, 0,
 826                  _("error: regular expression has a match of length zero: %s"),
 827                  quote (context_regex.string));
 828
 829           default:
 830             next_context_start = cursor + context_regs.end[0];
 831             break;
 832           }
 833
 834       /* Include the separator into the right context, but not any suffix
 835          white space in this separator; this insures it will be seen in
 836          output and will not take more space than necessary.  */
 837
 838       context_end = next_context_start;
 839       SKIP_WHITE_BACKWARDS (context_end, context_start);
 840
 841       /* Read and process a single input line or sentence, one word at a
 842          time.  */
 843
 844       while (true)
 845         {
 846           if (word_regex.string)
 847
 848             /* If a word regexp has been compiled, use it to skip at the
 849                beginning of the next word.  If there is no such word, exit
 850                the loop.  */
 851
 852             {
 853               regoff_t r = re_search (&word_regex.pattern, cursor,
 854                                       context_end - cursor,
 855                                       0, context_end - cursor, &word_regs);
 856               if (r == -2)
 857                 matcher_error ();
 858               if (r == -1)
 859                 break;
 860               word_start = cursor + word_regs.start[0];
 861               word_end = cursor + word_regs.end[0];
 862             }
 863           else
 864
 865             /* Avoid re_search and use the fastmap to skip to the
 866                beginning of the next word.  If there is no more word in
 867                the buffer, exit the loop.  */
 868
 869             {
 870               scan = cursor;
 871               while (scan < context_end
 872                      && !word_fastmap[to_uchar (*scan)])
 873                 scan++;
 874
 875               if (scan == context_end)
 876                 break;
 877
 878               word_start = scan;
 879
 880               while (scan < context_end
 881                      && word_fastmap[to_uchar (*scan)])
 882                 scan++;
 883
 884               word_end = scan;
 885             }
 886
 887           /* Skip right to the beginning of the found word.  */
 888
 889           cursor = word_start;
 890
 891           /* Skip any zero length word.  Just advance a single position,
 892              then go fetch the next word.  */
 893
 894           if (word_end == word_start)
 895             {
 896               cursor++;
 897               continue;
 898             }
 899
 900           /* This is a genuine, non empty word, so save it as a possible
 901              key.  Then skip over it.  Also, maintain the maximum length of
 902              all words read so far.  It is mandatory to take the maximum
 903              length of all words in the file, without considering if they
 904              are actually kept or rejected, because backward jumps at output
 905              generation time may fall in *any* word.  */
 906
 907           possible_key.start = cursor;
 908           possible_key.size = word_end - word_start;
 909           cursor += possible_key.size;
 910
 911           if (possible_key.size > maximum_word_length)
 912             maximum_word_length = possible_key.size;
 913
 914           /* In input reference mode, update 'line_start' from its previous
 915              value.  Count the lines just in case auto reference mode is
 916              also selected. If it happens that the word just matched is
 917              indeed part of a reference; just ignore it.  */
 918
 919           if (input_reference)
 920             {
 921               while (line_scan < possible_key.start)
 922                 if (*line_scan == '\n')
 923                   {
 924                     total_line_count++;
 925                     line_scan++;
 926                     line_start = line_scan;
 927                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 928                     reference_length = line_scan - line_start;
 929                   }
 930                 else
 931                   line_scan++;
 932               if (line_scan > possible_key.start)
 933                 continue;
 934             }
 935
 936           /* Ignore the word if an 'Ignore words' table exists and if it is
 937              part of it.  Also ignore the word if an 'Only words' table and
 938              if it is *not* part of it.
 939
 940              It is allowed that both tables be used at once, even if this
 941              may look strange for now.  Just ignore a word that would appear
 942              in both.  If regexps are eventually implemented for these
 943              tables, the Ignore table could then reject words that would
 944              have been previously accepted by the Only table.  */
 945
 946           if (ignore_file && search_table (&possible_key, &ignore_table))
 947             continue;
 948           if (only_file && !search_table (&possible_key, &only_table))
 949             continue;
 950
 951           /* A non-empty word has been found.  First of all, insure
 952              proper allocation of the next OCCURS, and make a pointer to
 953              where it will be constructed.  */
 954
 955           if (number_of_occurs[0] == occurs_alloc[0])
 956             occurs_table[0] = x2nrealloc (occurs_table[0],
 957                                           &occurs_alloc[0],
 958                                           sizeof *occurs_table[0]);
 959           occurs_cursor = occurs_table[0] + number_of_occurs[0];
 960
 961           /* Define the reference field, if any.  */
 962
 963           if (auto_reference)
 964             {
 965
 966               /* While auto referencing, update 'line_start' from its
 967                  previous value, counting lines as we go.  If input
 968                  referencing at the same time, 'line_start' has been
 969                  advanced earlier, and the following loop is never really
 970                  executed.  */
 971
 972               while (line_scan < possible_key.start)
 973                 if (*line_scan == '\n')
 974                   {
 975                     total_line_count++;
 976                     line_scan++;
 977                     line_start = line_scan;
 978                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 979                   }
 980                 else
 981                   line_scan++;
 982
 983               occurs_cursor->reference = total_line_count;
 984             }
 985           else if (input_reference)
 986             {
 987
 988               /* If only input referencing, 'line_start' has been computed
 989                  earlier to detect the case the word matched would be part
 990                  of the reference.  The reference position is simply the
 991                  value of 'line_start'.  */
 992
 993               occurs_cursor->reference = line_start - possible_key.start;
 994               if (reference_length > reference_max_width)
 995                 reference_max_width = reference_length;
 996             }
 997
 998           /* Exclude the reference from the context in simple cases.  */
 999
1000           if (input_reference && line_start == context_start)
1001             {
1002               SKIP_NON_WHITE (context_start, context_end);
1003               SKIP_WHITE (context_start, context_end);
1004             }
1005
1006           /* Completes the OCCURS structure.  */
1007
1008           occurs_cursor->key = possible_key;
1009           occurs_cursor->left = context_start - possible_key.start;
1010           occurs_cursor->right = context_end - possible_key.start;
1011           occurs_cursor->file_index = file_index;
1012
1013           number_of_occurs[0]++;
1014         }
1015     }
1016 }
1017
1018 /* Formatting and actual output - service routines.  */
1019
/*-------------------------------------------------------------------.
| Write NUMBER space characters to stdout.  A NUMBER that is zero or |
| negative writes nothing.                                           |
`-------------------------------------------------------------------*/

static void
print_spaces (ptrdiff_t number)
{
  ptrdiff_t remaining = number;

  while (0 < remaining)
    {
      putchar (' ');
      remaining--;
    }
}
1030
1031 /*-------------------------------------.
1032 | Prints the field provided by FIELD.  |
1033 `-------------------------------------*/
1034
1035 static void
1036 print_field (BLOCK field)
1037 {
1038   char *cursor;                 /* Cursor in field to print */
1039
1040   /* Whitespace is not really compressed.  Instead, each white space
1041      character (tab, vt, ht etc.) is printed as one single space.  */
1042
1043   for (cursor = field.start; cursor < field.end; cursor++)
1044     {
1045       unsigned char character = *cursor;
1046       if (edited_flag[character])
1047         {
1048           /* Handle cases which are specific to 'roff' or TeX.  All
1049              white space processing is done as the default case of
1050              this switch.  */
1051
1052           switch (character)
1053             {
1054             case '"':
1055               /* In roff output format, double any quote.  */
1056               putchar ('"');
1057               putchar ('"');
1058               break;
1059
1060             case '$':
1061             case '%':
1062             case '&':
1063             case '#':
1064             case '_':
1065               /* In TeX output format, precede these with a backslash.  */
1066               putchar ('\\');
1067               putchar (character);
1068               break;
1069
1070             case '{':
1071             case '}':
1072               /* In TeX output format, precede these with a backslash and
1073                  force mathematical mode.  */
1074               printf ("$\\%c$", character);
1075               break;
1076
1077             case '\\':
1078               /* In TeX output mode, request production of a backslash.  */
1079               fputs ("\\backslash{}", stdout);
1080               break;
1081
1082             default:
1083               /* Any other flagged character produces a single space.  */
1084               putchar (' ');
1085             }
1086         }
1087       else
1088         putchar (*cursor);
1089     }
1090 }
1091
1092 /* Formatting and actual output - planning routines.  */
1093
1094 /*--------------------------------------------------------------------.
1095 | From information collected from command line options and input file |
1096 | readings, compute and fix some output parameter values.             |
1097 `--------------------------------------------------------------------*/
1098
1099 static void
1100 fix_output_parameters (void)
1101 {
1102   size_t file_index;            /* index in text input file arrays */
1103   intmax_t line_ordinal;        /* line ordinal value for reference */
1104   ptrdiff_t reference_width;    /* width for the whole reference */
1105   int character;                /* character ordinal */
1106   char const *cursor;           /* cursor in some constant strings */
1107
1108   /* In auto reference mode, the maximum width of this field is
1109      precomputed and subtracted from the overall line width.  Add one for
1110      the column which separate the file name from the line number.  */
1111
1112   if (auto_reference)
1113     {
1114       reference_max_width = 0;
1115       for (file_index = 0; file_index < number_input_files; file_index++)
1116         {
1117           line_ordinal = file_line_count[file_index] + 1;
1118           if (file_index > 0)
1119             line_ordinal -= file_line_count[file_index - 1];
1120           char ordinal_string[INT_BUFSIZE_BOUND (intmax_t)];
1121           reference_width = sprintf (ordinal_string, "%"PRIdMAX, line_ordinal);
1122           if (input_file_name[file_index])
1123             reference_width += strlen (input_file_name[file_index]);
1124           if (reference_width > reference_max_width)
1125             reference_max_width = reference_width;
1126         }
1127       reference_max_width++;
1128       reference.start = xmalloc (reference_max_width + 1);
1129     }
1130
1131   /* If the reference appears to the left of the output line, reserve some
1132      space for it right away, including one gap size.  */
1133
1134   if ((auto_reference || input_reference) && !right_reference)
1135     line_width -= reference_max_width + gap_size;
1136   if (line_width < 0)
1137     line_width = 0;
1138
1139   /* The output lines, minimally, will contain from left to right a left
1140      context, a gap, and a keyword followed by the right context with no
1141      special intervening gap.  Half of the line width is dedicated to the
1142      left context and the gap, the other half is dedicated to the keyword
1143      and the right context; these values are computed once and for all here.
1144      There also are tail and head wrap around fields, used when the keyword
1145      is near the beginning or the end of the line, or when some long word
1146      cannot fit in, but leave place from wrapped around shorter words.  The
1147      maximum width of these fields are recomputed separately for each line,
1148      on a case by case basis.  It is worth noting that it cannot happen that
1149      both the tail and head fields are used at once.  */
1150
1151   half_line_width = line_width / 2;
1152   before_max_width = half_line_width - gap_size;
1153   keyafter_max_width = half_line_width;
1154
1155   /* If truncation_string is the empty string, make it NULL to speed up
1156      tests.  In this case, truncation_string_length will never get used, so
1157      there is no need to set it.  */
1158
1159   if (truncation_string && *truncation_string)
1160     truncation_string_length = strlen (truncation_string);
1161   else
1162     truncation_string = NULL;
1163
1164   if (gnu_extensions)
1165     {
1166
1167       /* When flagging truncation at the left of the keyword, the
1168          truncation mark goes at the beginning of the before field,
1169          unless there is a head field, in which case the mark goes at the
1170          left of the head field.  When flagging truncation at the right
1171          of the keyword, the mark goes at the end of the keyafter field,
1172          unless there is a tail field, in which case the mark goes at the
1173          end of the tail field.  Only eight combination cases could arise
1174          for truncation marks:
1175
1176          . None.
1177          . One beginning the before field.
1178          . One beginning the head field.
1179          . One ending the keyafter field.
1180          . One ending the tail field.
1181          . One beginning the before field, another ending the keyafter field.
1182          . One ending the tail field, another beginning the before field.
1183          . One ending the keyafter field, another beginning the head field.
1184
1185          So, there is at most two truncation marks, which could appear both
1186          on the left side of the center of the output line, both on the
1187          right side, or one on either side.  */
1188
1189       before_max_width -= 2 * truncation_string_length;
1190       if (before_max_width < 0)
1191         before_max_width = 0;
1192       keyafter_max_width -= 2 * truncation_string_length;
1193     }
1194   else
1195     {
1196
1197       /* I never figured out exactly how UNIX' ptx plans the output width
1198          of its various fields.  If GNU extensions are disabled, do not
1199          try computing the field widths correctly; instead, use the
1200          following formula, which does not completely imitate UNIX' ptx,
1201          but almost.  */
1202
1203       keyafter_max_width -= 2 * truncation_string_length + 1;
1204     }
1205
1206   /* Compute which characters need special output processing.  Initialize
1207      by flagging any white space character.  Some systems do not consider
1208      form feed as a space character, but we do.  */
1209
1210   for (character = 0; character < CHAR_SET_SIZE; character++)
1211     edited_flag[character] = !! isspace (character);
1212   edited_flag['\f'] = 1;
1213
1214   /* Complete the special character flagging according to selected output
1215      format.  */
1216
1217   switch (output_format)
1218     {
1219     case UNKNOWN_FORMAT:
1220       /* Should never happen.  */
1221
1222     case DUMB_FORMAT:
1223       break;
1224
1225     case ROFF_FORMAT:
1226
1227       /* 'Quote' characters should be doubled.  */
1228
1229       edited_flag['"'] = 1;
1230       break;
1231
1232     case TEX_FORMAT:
1233
1234       /* Various characters need special processing.  */
1235
1236       for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1237         edited_flag[to_uchar (*cursor)] = 1;
1238
1239       break;
1240     }
1241 }
1242
1243 /*------------------------------------------------------------------.
1244 | Compute the position and length of all the output fields, given a |
1245 | pointer to some OCCURS.                                           |
1246 `------------------------------------------------------------------*/
1247
1248 static void
1249 define_all_fields (OCCURS *occurs)
1250 {
1251   ptrdiff_t tail_max_width;     /* allowable width of tail field */
1252   ptrdiff_t head_max_width;     /* allowable width of head field */
1253   char *cursor;                 /* running cursor in source text */
1254   char *left_context_start;     /* start of left context */
1255   char *right_context_end;      /* end of right context */
1256   char *left_field_start;       /* conservative start for 'head'/'before' */
1257   char const *file_name;        /* file name for reference */
1258   intmax_t line_ordinal;        /* line ordinal for reference */
1259   char const *buffer_start;     /* start of buffered file for this occurs */
1260   char const *buffer_end;       /* end of buffered file for this occurs */
1261
1262   /* Define 'keyafter', start of left context and end of right context.
1263      'keyafter' starts at the saved position for keyword and extend to the
1264      right from the end of the keyword, eating separators or full words, but
1265      not beyond maximum allowed width for 'keyafter' field or limit for the
1266      right context.  Suffix spaces will be removed afterwards.  */
1267
1268   keyafter.start = occurs->key.start;
1269   keyafter.end = keyafter.start + occurs->key.size;
1270   left_context_start = keyafter.start + occurs->left;
1271   right_context_end = keyafter.start + occurs->right;
1272
1273   buffer_start = text_buffers[occurs->file_index].start;
1274   buffer_end = text_buffers[occurs->file_index].end;
1275
1276   cursor = keyafter.end;
1277   while (cursor < right_context_end
1278          && cursor <= keyafter.start + keyafter_max_width)
1279     {
1280       keyafter.end = cursor;
1281       SKIP_SOMETHING (cursor, right_context_end);
1282     }
1283   if (cursor <= keyafter.start + keyafter_max_width)
1284     keyafter.end = cursor;
1285
1286   keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1287
1288   SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1289
1290   /* When the left context is wide, it might take some time to catch up from
1291      the left context boundary to the beginning of the 'head' or 'before'
1292      fields.  So, in this case, to speed the catchup, we jump back from the
1293      keyword, using some secure distance, possibly falling in the middle of
1294      a word.  A secure backward jump would be at least half the maximum
1295      width of a line, plus the size of the longest word met in the whole
1296      input.  We conclude this backward jump by a skip forward of at least
1297      one word.  In this manner, we should not inadvertently accept only part
1298      of a word.  From the reached point, when it will be time to fix the
1299      beginning of 'head' or 'before' fields, we will skip forward words or
1300      delimiters until we get sufficiently near.  */
1301
1302   if (-occurs->left > half_line_width + maximum_word_length)
1303     {
1304       left_field_start
1305         = keyafter.start - (half_line_width + maximum_word_length);
1306       SKIP_SOMETHING (left_field_start, keyafter.start);
1307     }
1308   else
1309     left_field_start = keyafter.start + occurs->left;
1310
1311   /* 'before' certainly ends at the keyword, but not including separating
1312      spaces.  It starts after than the saved value for the left context, by
1313      advancing it until it falls inside the maximum allowed width for the
1314      before field.  There will be no prefix spaces either.  'before' only
1315      advances by skipping single separators or whole words. */
1316
1317   before.start = left_field_start;
1318   before.end = keyafter.start;
1319   SKIP_WHITE_BACKWARDS (before.end, before.start);
1320
1321   while (before.start + before_max_width < before.end)
1322     SKIP_SOMETHING (before.start, before.end);
1323
1324   if (truncation_string)
1325     {
1326       cursor = before.start;
1327       SKIP_WHITE_BACKWARDS (cursor, buffer_start);
1328       before_truncation = cursor > left_context_start;
1329     }
1330   else
1331     before_truncation = false;
1332
1333   SKIP_WHITE (before.start, buffer_end);
1334
1335   /* The tail could not take more columns than what has been left in the
1336      left context field, and a gap is mandatory.  It starts after the
1337      right context, and does not contain prefixed spaces.  It ends at
1338      the end of line, the end of buffer or when the tail field is full,
1339      whichever comes first.  It cannot contain only part of a word, and
1340      has no suffixed spaces.  */
1341
1342   tail_max_width
1343     = before_max_width - (before.end - before.start) - gap_size;
1344
1345   if (tail_max_width > 0)
1346     {
1347       tail.start = keyafter.end;
1348       SKIP_WHITE (tail.start, buffer_end);
1349
1350       tail.end = tail.start;
1351       cursor = tail.end;
1352       while (cursor < right_context_end
1353              && cursor < tail.start + tail_max_width)
1354         {
1355           tail.end = cursor;
1356           SKIP_SOMETHING (cursor, right_context_end);
1357         }
1358
1359       if (cursor < tail.start + tail_max_width)
1360         tail.end = cursor;
1361
1362       if (tail.end > tail.start)
1363         {
1364           keyafter_truncation = false;
1365           tail_truncation = truncation_string && tail.end < right_context_end;
1366         }
1367       else
1368         tail_truncation = false;
1369
1370       SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1371     }
1372   else
1373     {
1374
1375       /* No place left for a tail field.  */
1376
1377       tail.start = NULL;
1378       tail.end = NULL;
1379       tail_truncation = false;
1380     }
1381
1382   /* 'head' could not take more columns than what has been left in the right
1383      context field, and a gap is mandatory.  It ends before the left
1384      context, and does not contain suffixed spaces.  Its pointer is advanced
1385      until the head field has shrunk to its allowed width.  It cannot
1386      contain only part of a word, and has no suffixed spaces.  */
1387
1388   head_max_width
1389     = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1390
1391   if (head_max_width > 0)
1392     {
1393       head.end = before.start;
1394       SKIP_WHITE_BACKWARDS (head.end, buffer_start);
1395
1396       head.start = left_field_start;
1397       while (head.start + head_max_width < head.end)
1398         SKIP_SOMETHING (head.start, head.end);
1399
1400       if (head.end > head.start)
1401         {
1402           before_truncation = false;
1403           head_truncation = (truncation_string
1404                              && head.start > left_context_start);
1405         }
1406       else
1407         head_truncation = false;
1408
1409       SKIP_WHITE (head.start, head.end);
1410     }
1411   else
1412     {
1413
1414       /* No place left for a head field.  */
1415
1416       head.start = NULL;
1417       head.end = NULL;
1418       head_truncation = false;
1419     }
1420
1421   if (auto_reference)
1422     {
1423
1424       /* Construct the reference text in preallocated space from the file
1425          name and the line number.  Standard input yields an empty file name.
1426          Ensure line numbers are 1 based, even if they are computed 0 based.  */
1427
1428       file_name = input_file_name[occurs->file_index];
1429       if (!file_name)
1430         file_name = "";
1431
1432       line_ordinal = occurs->reference + 1;
1433       if (occurs->file_index > 0)
1434         line_ordinal -= file_line_count[occurs->file_index - 1];
1435
1436       char *file_end = stpcpy (reference.start, file_name);
1437       reference.end = file_end + sprintf (file_end, ":%"PRIdMAX, line_ordinal);
1438     }
1439   else if (input_reference)
1440     {
1441
1442       /* Reference starts at saved position for reference and extends right
1443          until some white space is met.  */
1444
1445       reference.start = keyafter.start + occurs->reference;
1446       reference.end = reference.start;
1447       SKIP_NON_WHITE (reference.end, right_context_end);
1448     }
1449 }
1450
1451 /* Formatting and actual output - control routines.  */
1452
1453 /*----------------------------------------------------------------------.
1454 | Output the current output fields as one line for 'troff' or 'nroff'.  |
1455 `----------------------------------------------------------------------*/
1456
1457 static void
1458 output_one_roff_line (void)
1459 {
1460   /* Output the 'tail' field.  */
1461
1462   printf (".%s \"", macro_name);
1463   print_field (tail);
1464   if (tail_truncation)
1465     fputs (truncation_string, stdout);
1466   putchar ('"');
1467
1468   /* Output the 'before' field.  */
1469
1470   fputs (" \"", stdout);
1471   if (before_truncation)
1472     fputs (truncation_string, stdout);
1473   print_field (before);
1474   putchar ('"');
1475
1476   /* Output the 'keyafter' field.  */
1477
1478   fputs (" \"", stdout);
1479   print_field (keyafter);
1480   if (keyafter_truncation)
1481     fputs (truncation_string, stdout);
1482   putchar ('"');
1483
1484   /* Output the 'head' field.  */
1485
1486   fputs (" \"", stdout);
1487   if (head_truncation)
1488     fputs (truncation_string, stdout);
1489   print_field (head);
1490   putchar ('"');
1491
1492   /* Conditionally output the 'reference' field.  */
1493
1494   if (auto_reference || input_reference)
1495     {
1496       fputs (" \"", stdout);
1497       print_field (reference);
1498       putchar ('"');
1499     }
1500
1501   putchar ('\n');
1502 }
1503
1504 /*---------------------------------------------------------.
1505 | Output the current output fields as one line for 'TeX'.  |
1506 `---------------------------------------------------------*/
1507
1508 static void
1509 output_one_tex_line (void)
1510 {
1511   BLOCK key;                    /* key field, isolated */
1512   BLOCK after;                  /* after field, isolated */
1513   char *cursor;                 /* running cursor in source text */
1514
1515   printf ("\\%s ", macro_name);
1516   putchar ('{');
1517   print_field (tail);
1518   fputs ("}{", stdout);
1519   print_field (before);
1520   fputs ("}{", stdout);
1521   key.start = keyafter.start;
1522   after.end = keyafter.end;
1523   cursor = keyafter.start;
1524   SKIP_SOMETHING (cursor, keyafter.end);
1525   key.end = cursor;
1526   after.start = cursor;
1527   print_field (key);
1528   fputs ("}{", stdout);
1529   print_field (after);
1530   fputs ("}{", stdout);
1531   print_field (head);
1532   putchar ('}');
1533   if (auto_reference || input_reference)
1534     {
1535       putchar ('{');
1536       print_field (reference);
1537       putchar ('}');
1538     }
1539   putchar ('\n');
1540 }
1541
1542 /*-------------------------------------------------------------------.
1543 | Output the current output fields as one line for a dumb terminal.  |
1544 `-------------------------------------------------------------------*/
1545
1546 static void
1547 output_one_dumb_line (void)
1548 {
1549   if (!right_reference)
1550     {
1551       if (auto_reference)
1552         {
1553
1554           /* Output the 'reference' field, in such a way that GNU emacs
1555              next-error will handle it.  The ending colon is taken from the
1556              gap which follows.  */
1557
1558           print_field (reference);
1559           putchar (':');
1560           print_spaces (reference_max_width
1561                         + gap_size
1562                         - (reference.end - reference.start)
1563                         - 1);
1564         }
1565       else
1566         {
1567
1568           /* Output the 'reference' field and its following gap.  */
1569
1570           print_field (reference);
1571           print_spaces (reference_max_width
1572                         + gap_size
1573                         - (reference.end - reference.start));
1574         }
1575     }
1576
1577   if (tail.start < tail.end)
1578     {
1579       /* Output the 'tail' field.  */
1580
1581       print_field (tail);
1582       if (tail_truncation)
1583         fputs (truncation_string, stdout);
1584
1585       print_spaces (half_line_width - gap_size
1586                     - (before.end - before.start)
1587                     - (before_truncation ? truncation_string_length : 0)
1588                     - (tail.end - tail.start)
1589                     - (tail_truncation ? truncation_string_length : 0));
1590     }
1591   else
1592     print_spaces (half_line_width - gap_size
1593                   - (before.end - before.start)
1594                   - (before_truncation ? truncation_string_length : 0));
1595
1596   /* Output the 'before' field.  */
1597
1598   if (before_truncation)
1599     fputs (truncation_string, stdout);
1600   print_field (before);
1601
1602   print_spaces (gap_size);
1603
1604   /* Output the 'keyafter' field.  */
1605
1606   print_field (keyafter);
1607   if (keyafter_truncation)
1608     fputs (truncation_string, stdout);
1609
1610   if (head.start < head.end)
1611     {
1612       /* Output the 'head' field.  */
1613
1614       print_spaces (half_line_width
1615                     - (keyafter.end - keyafter.start)
1616                     - (keyafter_truncation ? truncation_string_length : 0)
1617                     - (head.end - head.start)
1618                     - (head_truncation ? truncation_string_length : 0));
1619       if (head_truncation)
1620         fputs (truncation_string, stdout);
1621       print_field (head);
1622     }
1623   else
1624
1625     if ((auto_reference || input_reference) && right_reference)
1626       print_spaces (half_line_width
1627                     - (keyafter.end - keyafter.start)
1628                     - (keyafter_truncation ? truncation_string_length : 0));
1629
1630   if ((auto_reference || input_reference) && right_reference)
1631     {
1632       /* Output the 'reference' field.  */
1633
1634       print_spaces (gap_size);
1635       print_field (reference);
1636     }
1637
1638   putchar ('\n');
1639 }
1640
1641 /*------------------------------------------------------------------------.
1642 | Scan the whole occurs table and, for each entry, output one line in the |
1643 | appropriate format.                                                     |
1644 `------------------------------------------------------------------------*/
1645
1646 static void
1647 generate_all_output (void)
1648 {
1649   ptrdiff_t occurs_index;       /* index of keyword entry being processed */
1650   OCCURS *occurs_cursor;        /* current keyword entry being processed */
1651
1652   /* The following assignments are useful to provide default values in case
1653      line contexts or references are not used, in which case these variables
1654      would never be computed.  */
1655
1656   tail.start = NULL;
1657   tail.end = NULL;
1658   tail_truncation = false;
1659
1660   head.start = NULL;
1661   head.end = NULL;
1662   head_truncation = false;
1663
1664   /* Loop over all keyword occurrences.  */
1665
1666   occurs_cursor = occurs_table[0];
1667
1668   for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1669     {
1670       /* Compute the exact size of every field and whenever truncation flags
1671          are present or not.  */
1672
1673       define_all_fields (occurs_cursor);
1674
1675       /* Produce one output line according to selected format.  */
1676
1677       switch (output_format)
1678         {
1679         case UNKNOWN_FORMAT:
1680           /* Should never happen.  */
1681
1682         case DUMB_FORMAT:
1683           output_one_dumb_line ();
1684           break;
1685
1686         case ROFF_FORMAT:
1687           output_one_roff_line ();
1688           break;
1689
1690         case TEX_FORMAT:
1691           output_one_tex_line ();
1692           break;
1693         }
1694
1695       /* Advance the cursor into the occurs table.  */
1696
1697       occurs_cursor++;
1698     }
1699 }
1700
1701 /* Option decoding and main program.  */
1702
1703 /*------------------------------------------------------.
1704 | Print program identification and options, then exit.  |
1705 `------------------------------------------------------*/
1706
1707 void
1708 usage (int status)
1709 {
1710   if (status != EXIT_SUCCESS)
1711     emit_try_help ();
1712   else
1713     {
1714       printf (_("\
1715 Usage: %s [OPTION]... [INPUT]...   (without -G)\n\
1716   or:  %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1717               program_name, program_name);
1718       fputs (_("\
1719 Output a permuted index, including context, of the words in the input files.\n\
1720 "), stdout);
1721
1722       emit_stdin_note ();
1723       emit_mandatory_arg_note ();
1724
1725       fputs (_("\
1726   -A, --auto-reference           output automatically generated references\n\
1727   -G, --traditional              behave more like System V 'ptx'\n\
1728 "), stdout);
1729       fputs (_("\
1730   -F, --flag-truncation=STRING   use STRING for flagging line truncations.\n\
1731                                  The default is '/'\n\
1732 "), stdout);
1733       fputs (_("\
1734   -M, --macro-name=STRING        macro name to use instead of 'xx'\n\
1735   -O, --format=roff              generate output as roff directives\n\
1736   -R, --right-side-refs          put references at right, not counted in -w\n\
1737   -S, --sentence-regexp=REGEXP   for end of lines or end of sentences\n\
1738   -T, --format=tex               generate output as TeX directives\n\
1739 "), stdout);
1740       fputs (_("\
1741   -W, --word-regexp=REGEXP       use REGEXP to match each keyword\n\
1742   -b, --break-file=FILE          word break characters in this FILE\n\
1743   -f, --ignore-case              fold lower case to upper case for sorting\n\
1744   -g, --gap-size=NUMBER          gap size in columns between output fields\n\
1745   -i, --ignore-file=FILE         read ignore word list from FILE\n\
1746   -o, --only-file=FILE           read only word list from this FILE\n\
1747 "), stdout);
1748       fputs (_("\
1749   -r, --references               first field of each line is a reference\n\
1750   -t, --typeset-mode               - not implemented -\n\
1751   -w, --width=NUMBER             output width in columns, reference excluded\n\
1752 "), stdout);
1753       fputs (HELP_OPTION_DESCRIPTION, stdout);
1754       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1755       emit_ancillary_info (PROGRAM_NAME);
1756     }
1757   exit (status);
1758 }
1759
1760 /*----------------------------------------------------------------------.
1761 | Main program.  Decode ARGC arguments passed through the ARGV array of |
1762 | strings, then launch execution.                                       |
1763 `----------------------------------------------------------------------*/
1764
1765 /* Long options equivalences.  */
1766 static struct option const long_options[] =
1767 {
1768   {"auto-reference", no_argument, NULL, 'A'},
1769   {"break-file", required_argument, NULL, 'b'},
1770   {"flag-truncation", required_argument, NULL, 'F'},
1771   {"ignore-case", no_argument, NULL, 'f'},
1772   {"gap-size", required_argument, NULL, 'g'},
1773   {"ignore-file", required_argument, NULL, 'i'},
1774   {"macro-name", required_argument, NULL, 'M'},
1775   {"only-file", required_argument, NULL, 'o'},
1776   {"references", no_argument, NULL, 'r'},
1777   {"right-side-refs", no_argument, NULL, 'R'},
1778   {"format", required_argument, NULL, 10},
1779   {"sentence-regexp", required_argument, NULL, 'S'},
1780   {"traditional", no_argument, NULL, 'G'},
1781   {"typeset-mode", no_argument, NULL, 't'},
1782   {"width", required_argument, NULL, 'w'},
1783   {"word-regexp", required_argument, NULL, 'W'},
1784   {GETOPT_HELP_OPTION_DECL},
1785   {GETOPT_VERSION_OPTION_DECL},
1786   {NULL, 0, NULL, 0},
1787 };
1788
/* Format names accepted as the argument of --format; NULL-terminated.  */
static char const *const format_args[] = { "roff", "tex", NULL };
1793
1794 static enum Format const format_vals[] =
1795 {
1796   ROFF_FORMAT, TEX_FORMAT
1797 };
1798
1799 int
1800 main (int argc, char **argv)
1801 {
1802   int optchar;                  /* argument character */
1803   int file_index;               /* index in text input file arrays */
1804
1805   /* Decode program options.  */
1806
1807   initialize_main (&argc, &argv);
1808   set_program_name (argv[0]);
1809   setlocale (LC_ALL, "");
1810   bindtextdomain (PACKAGE, LOCALEDIR);
1811   textdomain (PACKAGE);
1812
1813   atexit (close_stdout);
1814
1815 #if HAVE_SETCHRCLASS
1816   setchrclass (NULL);
1817 #endif
1818
1819   while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1820                                 long_options, NULL),
1821          optchar != EOF)
1822     {
1823       switch (optchar)
1824         {
1825         default:
1826           usage (EXIT_FAILURE);
1827
1828         case 'G':
1829           gnu_extensions = false;
1830           break;
1831
1832         case 'b':
1833           break_file = optarg;
1834           break;
1835
1836         case 'f':
1837           ignore_case = true;
1838           break;
1839
1840         case 'g':
1841           {
1842             intmax_t tmp;
1843             if (! (xstrtoimax (optarg, NULL, 0, &tmp, "") == LONGINT_OK
1844                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1845               die (EXIT_FAILURE, 0, _("invalid gap width: %s"),
1846                    quote (optarg));
1847             gap_size = tmp;
1848             break;
1849           }
1850
1851         case 'i':
1852           ignore_file = optarg;
1853           break;
1854
1855         case 'o':
1856           only_file = optarg;
1857           break;
1858
1859         case 'r':
1860           input_reference = true;
1861           break;
1862
1863         case 't':
1864           /* Yet to understand...  */
1865           break;
1866
1867         case 'w':
1868           {
1869             intmax_t tmp;
1870             if (! (xstrtoimax (optarg, NULL, 0, &tmp, "") == LONGINT_OK
1871                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1872               die (EXIT_FAILURE, 0, _("invalid line width: %s"),
1873                    quote (optarg));
1874             line_width = tmp;
1875             break;
1876           }
1877
1878         case 'A':
1879           auto_reference = true;
1880           break;
1881
1882         case 'F':
1883           truncation_string = copy_unescaped_string (optarg);
1884           break;
1885
1886         case 'M':
1887           macro_name = optarg;
1888           break;
1889
1890         case 'O':
1891           output_format = ROFF_FORMAT;
1892           break;
1893
1894         case 'R':
1895           right_reference = true;
1896           break;
1897
1898         case 'S':
1899           context_regex.string = copy_unescaped_string (optarg);
1900           break;
1901
1902         case 'T':
1903           output_format = TEX_FORMAT;
1904           break;
1905
1906         case 'W':
1907           word_regex.string = copy_unescaped_string (optarg);
1908           if (!*word_regex.string)
1909             word_regex.string = NULL;
1910           break;
1911
1912         case 10:
1913           output_format = XARGMATCH ("--format", optarg,
1914                                      format_args, format_vals);
1915           break;
1916
1917         case_GETOPT_HELP_CHAR;
1918
1919         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1920         }
1921     }
1922
1923   /* Process remaining arguments.  If GNU extensions are enabled, process
1924      all arguments as input parameters.  If disabled, accept at most two
1925      arguments, the second of which is an output parameter.  */
1926
1927   if (optind == argc)
1928     {
1929
1930       /* No more argument simply means: read standard input.  */
1931
1932       input_file_name = xmalloc (sizeof *input_file_name);
1933       file_line_count = xmalloc (sizeof *file_line_count);
1934       text_buffers =    xmalloc (sizeof *text_buffers);
1935       number_input_files = 1;
1936       input_file_name[0] = NULL;
1937     }
1938   else if (gnu_extensions)
1939     {
1940       number_input_files = argc - optind;
1941       input_file_name = xnmalloc (number_input_files, sizeof *input_file_name);
1942       file_line_count = xnmalloc (number_input_files, sizeof *file_line_count);
1943       text_buffers    = xnmalloc (number_input_files, sizeof *text_buffers);
1944
1945       for (file_index = 0; file_index < number_input_files; file_index++)
1946         {
1947           if (!*argv[optind] || STREQ (argv[optind], "-"))
1948             input_file_name[file_index] = NULL;
1949           else
1950             input_file_name[file_index] = argv[optind];
1951           optind++;
1952         }
1953     }
1954   else
1955     {
1956
1957       /* There is one necessary input file.  */
1958
1959       number_input_files = 1;
1960       input_file_name = xmalloc (sizeof *input_file_name);
1961       file_line_count = xmalloc (sizeof *file_line_count);
1962       text_buffers    = xmalloc (sizeof *text_buffers);
1963       if (!*argv[optind] || STREQ (argv[optind], "-"))
1964         input_file_name[0] = NULL;
1965       else
1966         input_file_name[0] = argv[optind];
1967       optind++;
1968
1969       /* Redirect standard output, only if requested.  */
1970
1971       if (optind < argc)
1972         {
1973           if (! freopen (argv[optind], "w", stdout))
1974             die (EXIT_FAILURE, errno, "%s", quotef (argv[optind]));
1975           optind++;
1976         }
1977
1978       /* Diagnose any other argument as an error.  */
1979
1980       if (optind < argc)
1981         {
1982           error (0, 0, _("extra operand %s"), quote (argv[optind]));
1983           usage (EXIT_FAILURE);
1984         }
1985     }
1986
1987   /* If the output format has not been explicitly selected, choose dumb
1988      terminal format if GNU extensions are enabled, else 'roff' format.  */
1989
1990   if (output_format == UNKNOWN_FORMAT)
1991     output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
1992
1993   /* Initialize the main tables.  */
1994
1995   initialize_regex ();
1996
1997   /* Read 'Break character' file, if any.  */
1998
1999   if (break_file)
2000     digest_break_file (break_file);
2001
2002   /* Read 'Ignore words' file and 'Only words' files, if any.  If any of
2003      these files is empty, reset the name of the file to NULL, to avoid
2004      unnecessary calls to search_table. */
2005
2006   if (ignore_file)
2007     {
2008       digest_word_file (ignore_file, &ignore_table);
2009       if (ignore_table.length == 0)
2010         ignore_file = NULL;
2011     }
2012
2013   if (only_file)
2014     {
2015       digest_word_file (only_file, &only_table);
2016       if (only_table.length == 0)
2017         only_file = NULL;
2018     }
2019
2020   /* Prepare to study all the input files.  */
2021
2022   number_of_occurs[0] = 0;
2023   total_line_count = 0;
2024   maximum_word_length = 0;
2025   reference_max_width = 0;
2026
2027   for (file_index = 0; file_index < number_input_files; file_index++)
2028     {
2029       BLOCK *text_buffer = text_buffers + file_index;
2030
2031       /* Read the file contents into memory, then study it.  */
2032
2033       swallow_file_in_memory (input_file_name[file_index], text_buffer);
2034       find_occurs_in_text (file_index);
2035
2036       /* Maintain for each file how many lines has been read so far when its
2037          end is reached.  Incrementing the count first is a simple kludge to
2038          handle a possible incomplete line at end of file.  */
2039
2040       total_line_count++;
2041       file_line_count[file_index] = total_line_count;
2042     }
2043
2044   /* Do the output process phase.  */
2045
2046   sort_found_occurs ();
2047   fix_output_parameters ();
2048   generate_all_output ();
2049
2050   /* All done.  */
2051
2052   return EXIT_SUCCESS;
2053 }