src/ptx.c

   1 /* Permuted index for GNU, with keywords in their context.
   2    Copyright (C) 1990-2024 Free Software Foundation, Inc.
   3    François Pinard <pinard@iro.umontreal.ca>, 1988.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  17
  18    François Pinard <pinard@iro.umontreal.ca> */
  19
  20 #include <config.h>
  21
  22 #include <ctype.h>
  23 #include <getopt.h>
  24 #include <sys/types.h>
  25 #include "system.h"
  26 #include <regex.h>
  27 #include "argmatch.h"
  28 #include "c-ctype.h"
  29 #include "fadvise.h"
  30 #include "quote.h"
  31 #include "read-file.h"
  32 #include "stdio--.h"
  33 #include "xstrtol.h"
  34
/* The official name of this program (e.g., no 'g' prefix).  */
#define PROGRAM_NAME "ptx"

/* Author credit.  The first argument is a plain ASCII spelling; the
   second spells the full name with the c-with-cedilla written as its two
   UTF-8 bytes.  */
/* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
   if "ç" (c-with-cedilla) is available in the translation's character
   set and encoding.  */
#define AUTHORS proper_name_lite ("F. Pinard", "Fran\xc3\xa7ois Pinard")

/* Number of possible characters in a byte.  This is the size of the
   per-character tables declared in this file (folded_chars, word_fastmap,
   edited_flag).  */
#define CHAR_SET_SIZE 256
  45
  46 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
  47 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
  48                      : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
  49 #define OCTTOBIN(C) ((C) - '0')
  50
/* Debugging the memory allocator.  When the build sets WITH_DMALLOC to a
   nonzero value, <dmalloc.h> is included with MALLOC_FUNC_CHECK defined
   beforehand.  NOTE(review): MALLOC_FUNC_CHECK presumably turns on
   dmalloc's checking of library function arguments -- confirm against
   the dmalloc documentation.  */

#if WITH_DMALLOC
# define MALLOC_FUNC_CHECK 1
# include <dmalloc.h>
#endif
  57
  58 /* Global definitions.  */
  59
  60 /* FIXME: There are many unchecked integer overflows in this file,
  61    and in theory they could cause this command to have undefined
  62    behavior given large inputs or options.  This command should
  63    diagnose any such overflow and exit.  */
  64
/* Program options.  */

/* Kind of output to generate.  output_format starts as UNKNOWN_FORMAT.  */
enum Format
{
  UNKNOWN_FORMAT,               /* output format still unknown */
  DUMB_FORMAT,                  /* output for a dumb terminal */
  ROFF_FORMAT,                  /* output for 'troff' or 'nroff' */
  TEX_FORMAT                    /* output for 'TeX' or 'LaTeX' */
};

static bool gnu_extensions = true;      /* trigger all GNU extensions */
static bool auto_reference = false;     /* refs are 'file_name:line_number:' */
static bool input_reference = false;    /* refs at beginning of input lines */
static bool right_reference = false;    /* output refs after right context  */
static ptrdiff_t line_width = 72;       /* output line width in characters */
static ptrdiff_t gap_size = 3;  /* number of spaces between output fields */
static char const *truncation_string = "/";
                                /* string used to mark line truncations */
static char const *macro_name = "xx";   /* macro name for roff or TeX output */
static enum Format output_format = UNKNOWN_FORMAT;
                                /* output format */

/* When ignore_case is set, lower case is folded to upper case both when
   comparing words for sorting and, through the regexp translate table,
   when matching regular expressions.  */
static bool ignore_case = false;        /* fold lower to upper for sorting */
static char const *break_file = nullptr; /* name of the 'Break chars' file */
static char const *only_file = nullptr; /* name of the 'Only words' file */
static char const *ignore_file = nullptr; /* name of the 'Ignore words' file */
  91
/* Options that use regular expressions.  A null 'string' member means
   that the corresponding regular expression is not in use; the code
   tests 'string' before touching 'pattern'.  */
struct regex_data
{
  /* The original regular expression, as a string.  */
  char const *string;

  /* The compiled regular expression, and its fastmap.  The fastmap has
     one entry per possible byte value and is attached to 'pattern' by
     compile_regex.  */
  struct re_pattern_buffer pattern;
  char fastmap[UCHAR_MAX + 1];
};

static struct regex_data context_regex; /* end of context */
static struct regex_data word_regex;    /* keyword */
 105
/* A BLOCK delimits a region in memory of arbitrary size, like the copy of
   a whole file; it is described by a pair of pointers.  A WORD is similar,
   except it is intended for smaller regions and is described by a start
   pointer and a length.  A WORD_TABLE is a growable array that may contain
   several WORDs.  */

typedef struct
  {
    char *start;                /* pointer to beginning of region */
    char *end;                  /* pointer to end + 1 of region */
  }
BLOCK;

typedef struct
  {
    char *start;                /* pointer to beginning of region */
    ptrdiff_t size;             /* length of the region */
  }
WORD;

typedef struct
  {
    WORD *start;                /* array of WORDs */
    size_t alloc;               /* allocated length */
    ptrdiff_t length;           /* number of used entries */
  }
WORD_TABLE;
 131
/* Pattern description tables.  */

/* For each character, provide its folded equivalent.  The table is only
   filled in (with upper-case equivalents) when ignore_case is set.  */
static unsigned char folded_chars[CHAR_SET_SIZE];

/* End of context pattern register indices.  */
static struct re_registers context_regs;

/* Keyword pattern register indices.  */
static struct re_registers word_regs;

/* A word characters fastmap is used only when no word regexp has been
   provided.  A word is then made up of a sequence of one or more characters
   allowed by the fastmap.  An entry is nonzero if the character is allowed
   in a word.  Not only is this faster in most cases, but it also simplifies
   the implementation of the Break files.  */
static char word_fastmap[CHAR_SET_SIZE];

/* Maximum length of any word read.  */
static ptrdiff_t maximum_word_length;

/* Maximum width of any reference used.  */
static ptrdiff_t reference_max_width;

/* Ignore and Only word tables.  */

static WORD_TABLE ignore_table; /* table of words to ignore */
static WORD_TABLE only_table;           /* table of words to select */

/* Source text table, and scanning macros.  */

static int number_input_files;  /* number of text input files */
static intmax_t total_line_count;       /* total number of lines seen so far */
static char const **input_file_name;    /* array of text input file names */
static intmax_t *file_line_count;       /* array of line count values at end */

/* One BLOCK per input file, indexed by file index.  */
static BLOCK *text_buffers;     /* files to study */
 168 static BLOCK *text_buffers;     /* files to study */
 169
 170 /* SKIP_NON_WHITE used only for getting or skipping the reference.  */
 171
 172 #define SKIP_NON_WHITE(cursor, limit) \
 173   while (cursor < limit && ! isspace (to_uchar (*cursor)))              \
 174     cursor++
 175
 176 #define SKIP_WHITE(cursor, limit) \
 177   while (cursor < limit && isspace (to_uchar (*cursor)))                \
 178     cursor++
 179
 180 #define SKIP_WHITE_BACKWARDS(cursor, start) \
 181   while (cursor > start && isspace (to_uchar (cursor[-1])))             \
 182     cursor--
 183
 184 #define SKIP_SOMETHING(cursor, limit) \
 185   if (word_regex.string)                                                \
 186     {                                                                   \
 187       regoff_t count;                                                   \
 188       count = re_match (&word_regex.pattern, cursor, limit - cursor,    \
 189                         0, nullptr);                                    \
 190       if (count == -2)                                                  \
 191         matcher_error ();                                               \
 192       cursor += count == -1 ? 1 : count;                                \
 193     }                                                                   \
 194   else if (word_fastmap[to_uchar (*cursor)])                            \
 195     while (cursor < limit && word_fastmap[to_uchar (*cursor)])          \
 196       cursor++;                                                         \
 197   else                                                                  \
 198     cursor++
 199
/* Occurrences table.

   The 'keyword' pointer provides the central word, which is surrounded
   by a left context and a right context.  The 'keyword' and 'length'
   fields allow full 8-bit character keys, even including NULs.  At other
   places in this program, the name 'keyafter' refers to the keyword
   followed by its right context.

   The left context does not extend, towards the beginning of the file,
   further than a distance given by the 'left' value.  This value is
   relative to the keyword beginning, it is usually negative.  This
   ensures that, except for white space, we will never have to backward
   scan the source text, when it is time to generate the final output
   lines.

   The right context, indirectly attainable through the keyword end, does
   not extend, towards the end of the file, further than a distance given
   by the 'right' value.  This value is relative to the keyword
   beginning, it is usually positive.

   When automatic references are used, the 'reference' value is the
   overall line number in all input files read so far, in this case, it
   is of type intmax_t.  When input references are used, the 'reference'
   value indicates the distance between the keyword beginning and the
   start of the reference field, and it fits in ptrdiff_t and is usually
   negative.  */

typedef struct
  {
    WORD key;                   /* description of the keyword */
    ptrdiff_t left;             /* distance to left context start */
    ptrdiff_t right;            /* distance to right context end */
    intmax_t reference;         /* reference descriptor */
    int file_index;             /* corresponding file  */
  }
OCCURS;

/* The various OCCURS tables are indexed by the language.  For the time
   being, there is no such multiple language support, so each array below
   has a single element and only index 0 is used.  */

static OCCURS *occurs_table[1]; /* all words retained from the read text */
static size_t occurs_alloc[1];  /* allocated size of occurs_table */
static ptrdiff_t number_of_occurs[1]; /* number of used slots in occurs_table */
 243
 244
/* Communication among output routines.  */

/* Indicate if special output processing is requested for each character.  */
static char edited_flag[CHAR_SET_SIZE];

/* Half of line width, reference excluded.  */
static ptrdiff_t half_line_width;

/* Maximum width of before field.  */
static ptrdiff_t before_max_width;

/* Maximum width of keyword-and-after field.  */
static ptrdiff_t keyafter_max_width;

/* Length of string that flags truncation.  NOTE(review): presumably the
   length of truncation_string; it is set outside this part of the file.  */
static ptrdiff_t truncation_string_length;

/* When context is limited by lines, wraparound may happen on final output:
   the 'head' pointer gives access to some supplementary left context which
   will be seen at the end of the output line, the 'tail' pointer gives
   access to some supplementary right context which will be seen at the
   beginning of the output line.  Each field below is a BLOCK delimiting
   text, paired with a flag telling whether truncation must be marked on
   the indicated side of that field.  */

static BLOCK tail;              /* tail field */
static bool tail_truncation;    /* flag truncation after the tail field */

static BLOCK before;            /* before field */
static bool before_truncation;  /* flag truncation before the before field */

static BLOCK keyafter;          /* keyword-and-after field */
static bool keyafter_truncation; /* flag truncation after the keyafter field */

static BLOCK head;              /* head field */
static bool head_truncation;    /* flag truncation before the head field */

static BLOCK reference;         /* reference field for input reference mode */
 281
 282 /* Miscellaneous routines.  */
 283
 284 /* Diagnose an error in the regular expression matcher.  Then exit.  */
 285
 286 static void
 287 matcher_error (void)
 288 {
 289   error (EXIT_FAILURE, errno, _("error in regular expression matcher"));
 290 }
 291
 292 /* Unescape STRING in-place.  */
 293
 294 static void
 295 unescape_string (char *string)
 296 {
 297   char *cursor;                 /* cursor in result */
 298   int value;                    /* value of \nnn escape */
 299   int length;                   /* length of \nnn escape */
 300
 301   cursor = string;
 302
 303   while (*string)
 304     {
 305       if (*string == '\\')
 306         {
 307           string++;
 308           switch (*string)
 309             {
 310             case 'x':           /* \xhhh escape, 3 chars maximum */
 311               value = 0;
 312               for (length = 0, string++;
 313                    length < 3 && c_isxdigit (to_uchar (*string));
 314                    length++, string++)
 315                 value = value * 16 + HEXTOBIN (*string);
 316               if (length == 0)
 317                 {
 318                   *cursor++ = '\\';
 319                   *cursor++ = 'x';
 320                 }
 321               else
 322                 *cursor++ = value;
 323               break;
 324
 325             case '0':           /* \0ooo escape, 3 chars maximum */
 326               value = 0;
 327               for (length = 0, string++;
 328                    length < 3 && ISODIGIT (*string);
 329                    length++, string++)
 330                 value = value * 8 + OCTTOBIN (*string);
 331               *cursor++ = value;
 332               break;
 333
 334             case 'a':           /* alert */
 335 #if __STDC__
 336               *cursor++ = '\a';
 337 #else
 338               *cursor++ = 7;
 339 #endif
 340               string++;
 341               break;
 342
 343             case 'b':           /* backspace */
 344               *cursor++ = '\b';
 345               string++;
 346               break;
 347
 348             case 'c':           /* cancel the rest of the output */
 349               while (*string)
 350                 string++;
 351               break;
 352
 353             case 'f':           /* form feed */
 354               *cursor++ = '\f';
 355               string++;
 356               break;
 357
 358             case 'n':           /* new line */
 359               *cursor++ = '\n';
 360               string++;
 361               break;
 362
 363             case 'r':           /* carriage return */
 364               *cursor++ = '\r';
 365               string++;
 366               break;
 367
 368             case 't':           /* horizontal tab */
 369               *cursor++ = '\t';
 370               string++;
 371               break;
 372
 373             case 'v':           /* vertical tab */
 374 #if __STDC__
 375               *cursor++ = '\v';
 376 #else
 377               *cursor++ = 11;
 378 #endif
 379               string++;
 380               break;
 381
 382             case '\0':          /* lone backslash at end of string */
 383               /* ignore it */
 384               break;
 385
 386             default:
 387               *cursor++ = '\\';
 388               *cursor++ = *string++;
 389               break;
 390             }
 391         }
 392       else
 393         *cursor++ = *string++;
 394     }
 395
 396   *cursor = '\0';
 397 }
 398
 399 /*--------------------------------------------------------------------------.
 400 | Compile the regex represented by REGEX, diagnose and abort if any error.  |
 401 `--------------------------------------------------------------------------*/
 402
 403 static void
 404 compile_regex (struct regex_data *regex)
 405 {
 406   struct re_pattern_buffer *pattern = &regex->pattern;
 407   char const *string = regex->string;
 408   char const *message;
 409
 410   pattern->buffer = nullptr;
 411   pattern->allocated = 0;
 412   pattern->fastmap = regex->fastmap;
 413   pattern->translate = ignore_case ? folded_chars : nullptr;
 414
 415   message = re_compile_pattern (string, strlen (string), pattern);
 416   if (message)
 417     error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
 418
 419   /* The fastmap should be compiled before 're_match'.  The following
 420      call is not mandatory, because 're_search' is always called sooner,
 421      and it compiles the fastmap if this has not been done yet.  */
 422
 423   re_compile_fastmap (pattern);
 424 }
 425
 426 /*------------------------------------------------------------------------.
 427 | This will initialize various tables for pattern match and compiles some |
 428 | regexps.                                                                |
 429 `------------------------------------------------------------------------*/
 430
 431 static void
 432 initialize_regex (void)
 433 {
 434   int character;                /* character value */
 435
 436   /* Initialize the case folding table.  */
 437
 438   if (ignore_case)
 439     for (character = 0; character < CHAR_SET_SIZE; character++)
 440       folded_chars[character] = toupper (character);
 441
 442   /* Unless the user already provided a description of the end of line or
 443      end of sentence sequence, select an end of line sequence to compile.
 444      If the user provided an empty definition, thus disabling end of line
 445      or sentence feature, make it null to speed up tests.  If GNU
 446      extensions are enabled, use end of sentence like in GNU emacs.  If
 447      disabled, use end of lines.  */
 448
 449   if (context_regex.string)
 450     {
 451       if (!*context_regex.string)
 452         context_regex.string = nullptr;
 453     }
 454   else if (gnu_extensions && !input_reference)
 455     context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\|  \\)[ \t\n]*";
 456   else
 457     context_regex.string = "\n";
 458
 459   if (context_regex.string)
 460     compile_regex (&context_regex);
 461
 462   /* If the user has already provided a non-empty regexp to describe
 463      words, compile it.  Else, unless this has already been done through
 464      a user provided Break character file, construct a fastmap of
 465      characters that may appear in a word.  If GNU extensions enabled,
 466      include only letters of the underlying character set.  If disabled,
 467      include almost everything, even punctuation; stop only on white
 468      space.  */
 469
 470   if (word_regex.string)
 471     compile_regex (&word_regex);
 472   else if (!break_file)
 473     {
 474       if (gnu_extensions)
 475         {
 476
 477           /* Simulate \w+.  */
 478
 479           for (character = 0; character < CHAR_SET_SIZE; character++)
 480             word_fastmap[character] = !! isalpha (character);
 481         }
 482       else
 483         {
 484
 485           /* Simulate [^ \t\n]+.  */
 486
 487           memset (word_fastmap, 1, CHAR_SET_SIZE);
 488           word_fastmap[' '] = 0;
 489           word_fastmap['\t'] = 0;
 490           word_fastmap['\n'] = 0;
 491         }
 492     }
 493 }
 494
 495 /*------------------------------------------------------------------------.
 496 | This routine will attempt to swallow a whole file name FILE_NAME into a |
 497 | contiguous region of memory and return a description of it into BLOCK.  |
 498 | Standard input is assumed whenever FILE_NAME is null, empty or "-".     |
 499 |                                                                         |
 500 | Previously, in some cases, white space compression was attempted while  |
 501 | inputting text.  This was defeating some regexps like default end of    |
 502 | sentence, which checks for two consecutive spaces.  If white space      |
 503 | compression is ever reinstated, it should be in output routines.        |
 504 `------------------------------------------------------------------------*/
 505
 506 static void
 507 swallow_file_in_memory (char const *file_name, BLOCK *block)
 508 {
 509   size_t used_length;           /* used length in memory buffer */
 510
 511   /* As special cases, a file name which is null or "-" indicates standard
 512      input, which is already opened.  In all other cases, open the file from
 513      its name.  */
 514   bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
 515   if (using_stdin)
 516     block->start = fread_file (stdin, 0, &used_length);
 517   else
 518     block->start = read_file (file_name, 0, &used_length);
 519
 520   if (!block->start)
 521     error (EXIT_FAILURE, errno, "%s", quotef (using_stdin ? "-" : file_name));
 522
 523   if (using_stdin)
 524     clearerr (stdin);
 525
 526   block->end = block->start + used_length;
 527 }
 528
 529 /* Sort and search routines.  */
 530
 531 /*--------------------------------------------------------------------------.
 532 | Compare two words, FIRST and SECOND, and return 0 if they are identical.  |
 533 | Return less than 0 if the first word goes before the second; return       |
 534 | greater than 0 if the first word goes after the second.                   |
 535 |                                                                           |
 536 | If a word is indeed a prefix of the other, the shorter should go first.   |
 537 `--------------------------------------------------------------------------*/
 538
 539 static int
 540 compare_words (const void *void_first, const void *void_second)
 541 {
 542 #define first ((const WORD *) void_first)
 543 #define second ((const WORD *) void_second)
 544   ptrdiff_t length;             /* minimum of two lengths */
 545   ptrdiff_t counter;            /* cursor in words */
 546   int value;                    /* value of comparison */
 547
 548   length = first->size < second->size ? first->size : second->size;
 549
 550   if (ignore_case)
 551     {
 552       for (counter = 0; counter < length; counter++)
 553         {
 554           value = (folded_chars [to_uchar (first->start[counter])]
 555                    - folded_chars [to_uchar (second->start[counter])]);
 556           if (value != 0)
 557             return value;
 558         }
 559     }
 560   else
 561     {
 562       for (counter = 0; counter < length; counter++)
 563         {
 564           value = (to_uchar (first->start[counter])
 565                    - to_uchar (second->start[counter]));
 566           if (value != 0)
 567             return value;
 568         }
 569     }
 570
 571   return (first->size > second->size) - (first->size < second->size);
 572 #undef first
 573 #undef second
 574 }
 575
 576 /*-----------------------------------------------------------------------.
 577 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
 578 | go first.  In case of a tie, preserve the original order through a     |
 579 | pointer comparison.                                                    |
 580 `-----------------------------------------------------------------------*/
 581
 582 static int
 583 compare_occurs (const void *void_first, const void *void_second)
 584 {
 585 #define first ((const OCCURS *) void_first)
 586 #define second ((const OCCURS *) void_second)
 587   int value;
 588
 589   value = compare_words (&first->key, &second->key);
 590   return (value ? value
 591           : ((first->key.start > second->key.start)
 592              - (first->key.start < second->key.start)));
 593 #undef first
 594 #undef second
 595 }
 596
 597 /* True if WORD appears in TABLE.  Uses a binary search.  */
 598
 599 ATTRIBUTE_PURE
 600 static bool
 601 search_table (WORD *word, WORD_TABLE *table)
 602 {
 603   ptrdiff_t lowest;             /* current lowest possible index */
 604   ptrdiff_t highest;            /* current highest possible index */
 605   ptrdiff_t middle;             /* current middle index */
 606   int value;                    /* value from last comparison */
 607
 608   lowest = 0;
 609   highest = table->length - 1;
 610   while (lowest <= highest)
 611     {
 612       middle = (lowest + highest) / 2;
 613       value = compare_words (word, table->start + middle);
 614       if (value < 0)
 615         highest = middle - 1;
 616       else if (value > 0)
 617         lowest = middle + 1;
 618       else
 619         return true;
 620     }
 621   return false;
 622 }
 623
 624 /*---------------------------------------------------------------------.
 625 | Sort the whole occurs table in memory.  Presumably, 'qsort' does not |
 626 | take intermediate copies or table elements, so the sort will be      |
 627 | stabilized throughout the comparison routine.                        |
 628 `---------------------------------------------------------------------*/
 629
 630 static void
 631 sort_found_occurs (void)
 632 {
 633
 634   /* Only one language for the time being.  */
 635   if (number_of_occurs[0])
 636     qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
 637            compare_occurs);
 638 }
 639
 640 /* Parameter files reading routines.  */
 641
 642 /*----------------------------------------------------------------------.
 643 | Read a file named FILE_NAME, containing a set of break characters.    |
 644 | Build a content to the array word_fastmap in which all characters are |
 645 | allowed except those found in the file.  Characters may be repeated.  |
 646 `----------------------------------------------------------------------*/
 647
 648 static void
 649 digest_break_file (char const *file_name)
 650 {
 651   BLOCK file_contents;          /* to receive a copy of the file */
 652   char *cursor;                 /* cursor in file copy */
 653
 654   swallow_file_in_memory (file_name, &file_contents);
 655
 656   /* Make the fastmap and record the file contents in it.  */
 657
 658   memset (word_fastmap, 1, CHAR_SET_SIZE);
 659   for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
 660     word_fastmap[to_uchar (*cursor)] = 0;
 661
 662   if (!gnu_extensions)
 663     {
 664
 665       /* If GNU extensions are enabled, the only way to avoid newline as
 666          a break character is to write all the break characters in the
 667          file with no newline at all, not even at the end of the file.
 668          If disabled, spaces, tabs and newlines are always considered as
 669          break characters even if not included in the break file.  */
 670
 671       word_fastmap[' '] = 0;
 672       word_fastmap['\t'] = 0;
 673       word_fastmap['\n'] = 0;
 674     }
 675
 676   /* Return the space of the file, which is no more required.  */
 677
 678   free (file_contents.start);
 679 }
 680
 681 /*-----------------------------------------------------------------------.
 682 | Read a file named FILE_NAME, containing one word per line, then        |
 683 | construct in TABLE a table of WORD descriptors for them.  The routine  |
 684 | swallows the whole file in memory; this is at the expense of space     |
 685 | needed for newlines, which are useless; however, the reading is fast.  |
 686 `-----------------------------------------------------------------------*/
 687
 688 static void
 689 digest_word_file (char const *file_name, WORD_TABLE *table)
 690 {
 691   BLOCK file_contents;          /* to receive a copy of the file */
 692   char *cursor;                 /* cursor in file copy */
 693   char *word_start;             /* start of the current word */
 694
 695   swallow_file_in_memory (file_name, &file_contents);
 696
 697   table->start = nullptr;
 698   table->alloc = 0;
 699   table->length = 0;
 700
 701   /* Read the whole file.  */
 702
 703   cursor = file_contents.start;
 704   while (cursor < file_contents.end)
 705     {
 706
 707       /* Read one line, and save the word in contains.  */
 708
 709       word_start = cursor;
 710       while (cursor < file_contents.end && *cursor != '\n')
 711         cursor++;
 712
 713       /* Record the word in table if it is not empty.  */
 714
 715       if (cursor > word_start)
 716         {
 717           if (table->length == table->alloc)
 718             table->start = x2nrealloc (table->start, &table->alloc,
 719                                        sizeof *table->start);
 720           table->start[table->length].start = word_start;
 721           table->start[table->length].size = cursor - word_start;
 722           table->length++;
 723         }
 724
 725       /* This test allows for an incomplete line at end of file.  */
 726
 727       if (cursor < file_contents.end)
 728         cursor++;
 729     }
 730
 731   /* Finally, sort all the words read.  */
 732
 733   qsort (table->start, table->length, sizeof table->start[0], compare_words);
 734 }
 735
 736 /* Keyword recognition and selection.  */
 737
 738 /*----------------------------------------------------------------------.
 739 | For each keyword in the source text, constructs an OCCURS structure.  |
 740 `----------------------------------------------------------------------*/
 741
 742 static void
 743 find_occurs_in_text (int file_index)
 744 {
 745   char *cursor;                 /* for scanning the source text */
 746   char *scan;                   /* for scanning the source text also */
 747   char *line_start;             /* start of the current input line */
 748   char *line_scan;              /* newlines scanned until this point */
 749   ptrdiff_t reference_length;   /* length of reference in input mode */
 750   WORD possible_key;            /* possible key, to ease searches */
 751   OCCURS *occurs_cursor;        /* current OCCURS under construction */
 752
 753   char *context_start;          /* start of left context */
 754   char *context_end;            /* end of right context */
 755   char *word_start;             /* start of word */
 756   char *word_end;               /* end of word */
 757   char *next_context_start;     /* next start of left context */
 758
 759   const BLOCK *text_buffer = &text_buffers[file_index];
 760
 761   /* reference_length is always used within 'if (input_reference)'.
 762      However, GNU C diagnoses that it may be used uninitialized.  The
 763      following assignment is merely to shut it up.  */
 764
 765   reference_length = 0;
 766
 767   /* Tracking where lines start is helpful for reference processing.  In
 768      auto reference mode, this allows counting lines.  In input reference
 769      mode, this permits finding the beginning of the references.
 770
 771      The first line begins with the file, skip immediately this very first
 772      reference in input reference mode, to help further rejection any word
 773      found inside it.  Also, unconditionally assigning these variable has
 774      the happy effect of shutting up lint.  */
 775
 776   line_start = text_buffer->start;
 777   line_scan = line_start;
 778   if (input_reference)
 779     {
 780       SKIP_NON_WHITE (line_scan, text_buffer->end);
 781       reference_length = line_scan - line_start;
 782       SKIP_WHITE (line_scan, text_buffer->end);
 783     }
 784
 785   /* Process the whole buffer, one line or one sentence at a time.  */
 786
 787   for (cursor = text_buffer->start;
 788        cursor < text_buffer->end;
 789        cursor = next_context_start)
 790     {
 791
 792       /* 'context_start' gets initialized before the processing of each
 793          line, or once for the whole buffer if no end of line or sentence
 794          sequence separator.  */
 795
 796       context_start = cursor;
 797
 798       /* If an end of line or end of sentence sequence is defined and
 799          non-empty, 'next_context_start' will be recomputed to be the end of
 800          each line or sentence, before each one is processed.  If no such
 801          sequence, then 'next_context_start' is set at the end of the whole
 802          buffer, which is then considered to be a single line or sentence.
 803          This test also accounts for the case of an incomplete line or
 804          sentence at the end of the buffer.  */
 805
 806       next_context_start = text_buffer->end;
 807       if (context_regex.string)
 808         switch (re_search (&context_regex.pattern, cursor,
 809                            text_buffer->end - cursor,
 810                            0, text_buffer->end - cursor, &context_regs))
 811           {
 812           case -2:
 813             matcher_error ();
 814
 815           case -1:
 816             break;
 817
 818           case 0:
 819             error (EXIT_FAILURE, 0,
 820                    _("error: regular expression has a match of length zero:"
 821                      " %s"),
 822                    quote (context_regex.string));
 823
 824           default:
 825             next_context_start = cursor + context_regs.end[0];
 826             break;
 827           }
 828
 829       /* Include the separator into the right context, but not any suffix
 830          white space in this separator; this insures it will be seen in
 831          output and will not take more space than necessary.  */
 832
 833       context_end = next_context_start;
 834       SKIP_WHITE_BACKWARDS (context_end, context_start);
 835
 836       /* Read and process a single input line or sentence, one word at a
 837          time.  */
 838
 839       while (true)
 840         {
 841           if (word_regex.string)
 842
 843             /* If a word regexp has been compiled, use it to skip at the
 844                beginning of the next word.  If there is no such word, exit
 845                the loop.  */
 846
 847             {
 848               regoff_t r = re_search (&word_regex.pattern, cursor,
 849                                       context_end - cursor,
 850                                       0, context_end - cursor, &word_regs);
 851               if (r == -2)
 852                 matcher_error ();
 853               if (r == -1)
 854                 break;
 855               word_start = cursor + word_regs.start[0];
 856               word_end = cursor + word_regs.end[0];
 857             }
 858           else
 859
 860             /* Avoid re_search and use the fastmap to skip to the
 861                beginning of the next word.  If there is no more word in
 862                the buffer, exit the loop.  */
 863
 864             {
 865               scan = cursor;
 866               while (scan < context_end
 867                      && !word_fastmap[to_uchar (*scan)])
 868                 scan++;
 869
 870               if (scan == context_end)
 871                 break;
 872
 873               word_start = scan;
 874
 875               while (scan < context_end
 876                      && word_fastmap[to_uchar (*scan)])
 877                 scan++;
 878
 879               word_end = scan;
 880             }
 881
 882           /* Skip right to the beginning of the found word.  */
 883
 884           cursor = word_start;
 885
 886           /* Skip any zero length word.  Just advance a single position,
 887              then go fetch the next word.  */
 888
 889           if (word_end == word_start)
 890             {
 891               cursor++;
 892               continue;
 893             }
 894
 895           /* This is a genuine, non empty word, so save it as a possible
 896              key.  Then skip over it.  Also, maintain the maximum length of
 897              all words read so far.  It is mandatory to take the maximum
 898              length of all words in the file, without considering if they
 899              are actually kept or rejected, because backward jumps at output
 900              generation time may fall in *any* word.  */
 901
 902           possible_key.start = cursor;
 903           possible_key.size = word_end - word_start;
 904           cursor += possible_key.size;
 905
 906           if (possible_key.size > maximum_word_length)
 907             maximum_word_length = possible_key.size;
 908
 909           /* In input reference mode, update 'line_start' from its previous
 910              value.  Count the lines just in case auto reference mode is
 911              also selected. If it happens that the word just matched is
 912              indeed part of a reference; just ignore it.  */
 913
 914           if (input_reference)
 915             {
 916               while (line_scan < possible_key.start)
 917                 if (*line_scan == '\n')
 918                   {
 919                     total_line_count++;
 920                     line_scan++;
 921                     line_start = line_scan;
 922                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 923                     reference_length = line_scan - line_start;
 924                   }
 925                 else
 926                   line_scan++;
 927               if (line_scan > possible_key.start)
 928                 continue;
 929             }
 930
 931           /* Ignore the word if an 'Ignore words' table exists and if it is
 932              part of it.  Also ignore the word if an 'Only words' table and
 933              if it is *not* part of it.
 934
 935              It is allowed that both tables be used at once, even if this
 936              may look strange for now.  Just ignore a word that would appear
 937              in both.  If regexps are eventually implemented for these
 938              tables, the Ignore table could then reject words that would
 939              have been previously accepted by the Only table.  */
 940
 941           if (ignore_file && search_table (&possible_key, &ignore_table))
 942             continue;
 943           if (only_file && !search_table (&possible_key, &only_table))
 944             continue;
 945
 946           /* A non-empty word has been found.  First of all, insure
 947              proper allocation of the next OCCURS, and make a pointer to
 948              where it will be constructed.  */
 949
 950           if (number_of_occurs[0] == occurs_alloc[0])
 951             occurs_table[0] = x2nrealloc (occurs_table[0],
 952                                           &occurs_alloc[0],
 953                                           sizeof *occurs_table[0]);
 954           occurs_cursor = occurs_table[0] + number_of_occurs[0];
 955
 956           /* Define the reference field, if any.  */
 957
 958           if (auto_reference)
 959             {
 960
 961               /* While auto referencing, update 'line_start' from its
 962                  previous value, counting lines as we go.  If input
 963                  referencing at the same time, 'line_start' has been
 964                  advanced earlier, and the following loop is never really
 965                  executed.  */
 966
 967               while (line_scan < possible_key.start)
 968                 if (*line_scan == '\n')
 969                   {
 970                     total_line_count++;
 971                     line_scan++;
 972                     line_start = line_scan;
 973                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 974                   }
 975                 else
 976                   line_scan++;
 977
 978               occurs_cursor->reference = total_line_count;
 979             }
 980           else if (input_reference)
 981             {
 982
 983               /* If only input referencing, 'line_start' has been computed
 984                  earlier to detect the case the word matched would be part
 985                  of the reference.  The reference position is simply the
 986                  value of 'line_start'.  */
 987
 988               occurs_cursor->reference = line_start - possible_key.start;
 989               if (reference_length > reference_max_width)
 990                 reference_max_width = reference_length;
 991             }
 992
 993           /* Exclude the reference from the context in simple cases.  */
 994
 995           if (input_reference && line_start == context_start)
 996             {
 997               SKIP_NON_WHITE (context_start, context_end);
 998               SKIP_WHITE (context_start, context_end);
 999             }
1000
1001           /* Completes the OCCURS structure.  */
1002
1003           occurs_cursor->key = possible_key;
1004           occurs_cursor->left = context_start - possible_key.start;
1005           occurs_cursor->right = context_end - possible_key.start;
1006           occurs_cursor->file_index = file_index;
1007
1008           number_of_occurs[0]++;
1009         }
1010     }
1011 }
1012
1013 /* Formatting and actual output - service routines.  */
1014
/*-----------------------------------------------------------------.
| Emit NUMBER blanks on stdout; nothing is written when NUMBER is  |
| zero or negative.                                                |
`-----------------------------------------------------------------*/

static void
print_spaces (ptrdiff_t number)
{
  ptrdiff_t remaining = number;

  /* Count down rather than up, so that a non-positive request simply
     never enters the loop.  */
  while (0 < remaining)
    {
      putchar (' ');
      remaining--;
    }
}
1025
1026 /*-------------------------------------.
1027 | Prints the field provided by FIELD.  |
1028 `-------------------------------------*/
1029
1030 static void
1031 print_field (BLOCK field)
1032 {
1033   char *cursor;                 /* Cursor in field to print */
1034
1035   /* Whitespace is not really compressed.  Instead, each white space
1036      character (tab, vt, ht etc.) is printed as one single space.  */
1037
1038   for (cursor = field.start; cursor < field.end; cursor++)
1039     {
1040       unsigned char character = *cursor;
1041       if (edited_flag[character])
1042         {
1043           /* Handle cases which are specific to 'roff' or TeX.  All
1044              white space processing is done as the default case of
1045              this switch.  */
1046
1047           switch (character)
1048             {
1049             case '"':
1050               /* In roff output format, double any quote.  */
1051               putchar ('"');
1052               putchar ('"');
1053               break;
1054
1055             case '$':
1056             case '%':
1057             case '&':
1058             case '#':
1059             case '_':
1060               /* In TeX output format, precede these with a backslash.  */
1061               putchar ('\\');
1062               putchar (character);
1063               break;
1064
1065             case '{':
1066             case '}':
1067               /* In TeX output format, precede these with a backslash and
1068                  force mathematical mode.  */
1069               printf ("$\\%c$", character);
1070               break;
1071
1072             case '\\':
1073               /* In TeX output mode, request production of a backslash.  */
1074               fputs ("\\backslash{}", stdout);
1075               break;
1076
1077             default:
1078               /* Any other flagged character produces a single space.  */
1079               putchar (' ');
1080             }
1081         }
1082       else
1083         putchar (*cursor);
1084     }
1085 }
1086
1087 /* Formatting and actual output - planning routines.  */
1088
1089 /*--------------------------------------------------------------------.
1090 | From information collected from command line options and input file |
1091 | readings, compute and fix some output parameter values.             |
1092 `--------------------------------------------------------------------*/
1093
1094 static void
1095 fix_output_parameters (void)
1096 {
1097   size_t file_index;            /* index in text input file arrays */
1098   intmax_t line_ordinal;        /* line ordinal value for reference */
1099   ptrdiff_t reference_width;    /* width for the whole reference */
1100   int character;                /* character ordinal */
1101   char const *cursor;           /* cursor in some constant strings */
1102
1103   /* In auto reference mode, the maximum width of this field is
1104      precomputed and subtracted from the overall line width.  Add one for
1105      the column which separate the file name from the line number.  */
1106
1107   if (auto_reference)
1108     {
1109       reference_max_width = 0;
1110       for (file_index = 0; file_index < number_input_files; file_index++)
1111         {
1112           line_ordinal = file_line_count[file_index] + 1;
1113           if (file_index > 0)
1114             line_ordinal -= file_line_count[file_index - 1];
1115           char ordinal_string[INT_BUFSIZE_BOUND (intmax_t)];
1116           reference_width = sprintf (ordinal_string, "%jd", line_ordinal);
1117           if (input_file_name[file_index])
1118             reference_width += strlen (input_file_name[file_index]);
1119           if (reference_width > reference_max_width)
1120             reference_max_width = reference_width;
1121         }
1122       reference_max_width++;
1123       reference.start = xmalloc (reference_max_width + 1);
1124     }
1125
1126   /* If the reference appears to the left of the output line, reserve some
1127      space for it right away, including one gap size.  */
1128
1129   if ((auto_reference || input_reference) && !right_reference)
1130     line_width -= reference_max_width + gap_size;
1131   if (line_width < 0)
1132     line_width = 0;
1133
1134   /* The output lines, minimally, will contain from left to right a left
1135      context, a gap, and a keyword followed by the right context with no
1136      special intervening gap.  Half of the line width is dedicated to the
1137      left context and the gap, the other half is dedicated to the keyword
1138      and the right context; these values are computed once and for all here.
1139      There also are tail and head wrap around fields, used when the keyword
1140      is near the beginning or the end of the line, or when some long word
1141      cannot fit in, but leave place from wrapped around shorter words.  The
1142      maximum width of these fields are recomputed separately for each line,
1143      on a case by case basis.  It is worth noting that it cannot happen that
1144      both the tail and head fields are used at once.  */
1145
1146   half_line_width = line_width / 2;
1147   before_max_width = half_line_width - gap_size;
1148   keyafter_max_width = half_line_width;
1149
1150   /* If truncation_string is the empty string, make it null to speed up
1151      tests.  In this case, truncation_string_length will never get used, so
1152      there is no need to set it.  */
1153
1154   if (truncation_string && *truncation_string)
1155     truncation_string_length = strlen (truncation_string);
1156   else
1157     truncation_string = nullptr;
1158
1159   if (gnu_extensions)
1160     {
1161
1162       /* When flagging truncation at the left of the keyword, the
1163          truncation mark goes at the beginning of the before field,
1164          unless there is a head field, in which case the mark goes at the
1165          left of the head field.  When flagging truncation at the right
1166          of the keyword, the mark goes at the end of the keyafter field,
1167          unless there is a tail field, in which case the mark goes at the
1168          end of the tail field.  Only eight combination cases could arise
1169          for truncation marks:
1170
1171          . None.
1172          . One beginning the before field.
1173          . One beginning the head field.
1174          . One ending the keyafter field.
1175          . One ending the tail field.
1176          . One beginning the before field, another ending the keyafter field.
1177          . One ending the tail field, another beginning the before field.
1178          . One ending the keyafter field, another beginning the head field.
1179
1180          So, there is at most two truncation marks, which could appear both
1181          on the left side of the center of the output line, both on the
1182          right side, or one on either side.  */
1183
1184       before_max_width -= 2 * truncation_string_length;
1185       if (before_max_width < 0)
1186         before_max_width = 0;
1187       keyafter_max_width -= 2 * truncation_string_length;
1188     }
1189   else
1190     {
1191
1192       /* I never figured out exactly how UNIX' ptx plans the output width
1193          of its various fields.  If GNU extensions are disabled, do not
1194          try computing the field widths correctly; instead, use the
1195          following formula, which does not completely imitate UNIX' ptx,
1196          but almost.  */
1197
1198       keyafter_max_width -= 2 * truncation_string_length + 1;
1199     }
1200
1201   /* Compute which characters need special output processing.  Initialize
1202      by flagging any white space character.  Some systems do not consider
1203      form feed as a space character, but we do.  */
1204
1205   for (character = 0; character < CHAR_SET_SIZE; character++)
1206     edited_flag[character] = !! isspace (character);
1207   edited_flag['\f'] = 1;
1208
1209   /* Complete the special character flagging according to selected output
1210      format.  */
1211
1212   switch (output_format)
1213     {
1214     case UNKNOWN_FORMAT:
1215       /* Should never happen.  */
1216
1217     case DUMB_FORMAT:
1218       break;
1219
1220     case ROFF_FORMAT:
1221
1222       /* 'Quote' characters should be doubled.  */
1223
1224       edited_flag['"'] = 1;
1225       break;
1226
1227     case TEX_FORMAT:
1228
1229       /* Various characters need special processing.  */
1230
1231       for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1232         edited_flag[to_uchar (*cursor)] = 1;
1233
1234       break;
1235     }
1236 }
1237
1238 /*------------------------------------------------------------------.
1239 | Compute the position and length of all the output fields, given a |
1240 | pointer to some OCCURS.                                           |
1241 `------------------------------------------------------------------*/
1242
1243 static void
1244 define_all_fields (OCCURS *occurs)
1245 {
1246   ptrdiff_t tail_max_width;     /* allowable width of tail field */
1247   ptrdiff_t head_max_width;     /* allowable width of head field */
1248   char *cursor;                 /* running cursor in source text */
1249   char *left_context_start;     /* start of left context */
1250   char *right_context_end;      /* end of right context */
1251   char *left_field_start;       /* conservative start for 'head'/'before' */
1252   char const *file_name;        /* file name for reference */
1253   intmax_t line_ordinal;        /* line ordinal for reference */
1254   char const *buffer_start;     /* start of buffered file for this occurs */
1255   char const *buffer_end;       /* end of buffered file for this occurs */
1256
1257   /* Define 'keyafter', start of left context and end of right context.
1258      'keyafter' starts at the saved position for keyword and extend to the
1259      right from the end of the keyword, eating separators or full words, but
1260      not beyond maximum allowed width for 'keyafter' field or limit for the
1261      right context.  Suffix spaces will be removed afterwards.  */
1262
1263   keyafter.start = occurs->key.start;
1264   keyafter.end = keyafter.start + occurs->key.size;
1265   left_context_start = keyafter.start + occurs->left;
1266   right_context_end = keyafter.start + occurs->right;
1267
1268   buffer_start = text_buffers[occurs->file_index].start;
1269   buffer_end = text_buffers[occurs->file_index].end;
1270
1271   cursor = keyafter.end;
1272   while (cursor < right_context_end
1273          && cursor <= keyafter.start + keyafter_max_width)
1274     {
1275       keyafter.end = cursor;
1276       SKIP_SOMETHING (cursor, right_context_end);
1277     }
1278   if (cursor <= keyafter.start + keyafter_max_width)
1279     keyafter.end = cursor;
1280
1281   keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1282
1283   SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1284
1285   /* When the left context is wide, it might take some time to catch up from
1286      the left context boundary to the beginning of the 'head' or 'before'
1287      fields.  So, in this case, to speed the catchup, we jump back from the
1288      keyword, using some secure distance, possibly falling in the middle of
1289      a word.  A secure backward jump would be at least half the maximum
1290      width of a line, plus the size of the longest word met in the whole
1291      input.  We conclude this backward jump by a skip forward of at least
1292      one word.  In this manner, we should not inadvertently accept only part
1293      of a word.  From the reached point, when it will be time to fix the
1294      beginning of 'head' or 'before' fields, we will skip forward words or
1295      delimiters until we get sufficiently near.  */
1296
1297   if (-occurs->left > half_line_width + maximum_word_length)
1298     {
1299       left_field_start
1300         = keyafter.start - (half_line_width + maximum_word_length);
1301       SKIP_SOMETHING (left_field_start, keyafter.start);
1302     }
1303   else
1304     left_field_start = keyafter.start + occurs->left;
1305
1306   /* 'before' certainly ends at the keyword, but not including separating
1307      spaces.  It starts after than the saved value for the left context, by
1308      advancing it until it falls inside the maximum allowed width for the
1309      before field.  There will be no prefix spaces either.  'before' only
1310      advances by skipping single separators or whole words. */
1311
1312   before.start = left_field_start;
1313   before.end = keyafter.start;
1314   SKIP_WHITE_BACKWARDS (before.end, before.start);
1315
1316   while (before.start + before_max_width < before.end)
1317     SKIP_SOMETHING (before.start, before.end);
1318
1319   if (truncation_string)
1320     {
1321       cursor = before.start;
1322       SKIP_WHITE_BACKWARDS (cursor, buffer_start);
1323       before_truncation = cursor > left_context_start;
1324     }
1325   else
1326     before_truncation = false;
1327
1328   SKIP_WHITE (before.start, buffer_end);
1329
1330   /* The tail could not take more columns than what has been left in the
1331      left context field, and a gap is mandatory.  It starts after the
1332      right context, and does not contain prefixed spaces.  It ends at
1333      the end of line, the end of buffer or when the tail field is full,
1334      whichever comes first.  It cannot contain only part of a word, and
1335      has no suffixed spaces.  */
1336
1337   tail_max_width
1338     = before_max_width - (before.end - before.start) - gap_size;
1339
1340   if (tail_max_width > 0)
1341     {
1342       tail.start = keyafter.end;
1343       SKIP_WHITE (tail.start, buffer_end);
1344
1345       tail.end = tail.start;
1346       cursor = tail.end;
1347       while (cursor < right_context_end
1348              && cursor < tail.start + tail_max_width)
1349         {
1350           tail.end = cursor;
1351           SKIP_SOMETHING (cursor, right_context_end);
1352         }
1353
1354       if (cursor < tail.start + tail_max_width)
1355         tail.end = cursor;
1356
1357       if (tail.end > tail.start)
1358         {
1359           keyafter_truncation = false;
1360           tail_truncation = truncation_string && tail.end < right_context_end;
1361         }
1362       else
1363         tail_truncation = false;
1364
1365       SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1366     }
1367   else
1368     {
1369
1370       /* No place left for a tail field.  */
1371
1372       tail.start = nullptr;
1373       tail.end = nullptr;
1374       tail_truncation = false;
1375     }
1376
1377   /* 'head' could not take more columns than what has been left in the right
1378      context field, and a gap is mandatory.  It ends before the left
1379      context, and does not contain suffixed spaces.  Its pointer is advanced
1380      until the head field has shrunk to its allowed width.  It cannot
1381      contain only part of a word, and has no suffixed spaces.  */
1382
1383   head_max_width
1384     = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1385
1386   if (head_max_width > 0)
1387     {
1388       head.end = before.start;
1389       SKIP_WHITE_BACKWARDS (head.end, buffer_start);
1390
1391       head.start = left_field_start;
1392       while (head.start + head_max_width < head.end)
1393         SKIP_SOMETHING (head.start, head.end);
1394
1395       if (head.end > head.start)
1396         {
1397           before_truncation = false;
1398           head_truncation = (truncation_string
1399                              && head.start > left_context_start);
1400         }
1401       else
1402         head_truncation = false;
1403
1404       SKIP_WHITE (head.start, head.end);
1405     }
1406   else
1407     {
1408
1409       /* No place left for a head field.  */
1410
1411       head.start = nullptr;
1412       head.end = nullptr;
1413       head_truncation = false;
1414     }
1415
1416   if (auto_reference)
1417     {
1418
1419       /* Construct the reference text in preallocated space from the file
1420          name and the line number.  Standard input yields an empty file name.
1421          Ensure line numbers are 1 based, even if they are computed 0 based.  */
1422
1423       file_name = input_file_name[occurs->file_index];
1424       if (!file_name)
1425         file_name = "";
1426
1427       line_ordinal = occurs->reference + 1;
1428       if (occurs->file_index > 0)
1429         line_ordinal -= file_line_count[occurs->file_index - 1];
1430
1431       char *file_end = stpcpy (reference.start, file_name);
1432       reference.end = file_end + sprintf (file_end, ":%jd", line_ordinal);
1433     }
1434   else if (input_reference)
1435     {
1436
1437       /* Reference starts at saved position for reference and extends right
1438          until some white space is met.  */
1439
1440       reference.start = keyafter.start + occurs->reference;
1441       reference.end = reference.start;
1442       SKIP_NON_WHITE (reference.end, right_context_end);
1443     }
1444 }
1445
1446 /* Formatting and actual output - control routines.  */
1447
1448 /*----------------------------------------------------------------------.
1449 | Output the current output fields as one line for 'troff' or 'nroff'.  |
1450 `----------------------------------------------------------------------*/
1451
1452 static void
1453 output_one_roff_line (void)
1454 {
1455   /* Output the 'tail' field.  */
1456
1457   printf (".%s \"", macro_name);
1458   print_field (tail);
1459   if (tail_truncation)
1460     fputs (truncation_string, stdout);
1461   putchar ('"');
1462
1463   /* Output the 'before' field.  */
1464
1465   fputs (" \"", stdout);
1466   if (before_truncation)
1467     fputs (truncation_string, stdout);
1468   print_field (before);
1469   putchar ('"');
1470
1471   /* Output the 'keyafter' field.  */
1472
1473   fputs (" \"", stdout);
1474   print_field (keyafter);
1475   if (keyafter_truncation)
1476     fputs (truncation_string, stdout);
1477   putchar ('"');
1478
1479   /* Output the 'head' field.  */
1480
1481   fputs (" \"", stdout);
1482   if (head_truncation)
1483     fputs (truncation_string, stdout);
1484   print_field (head);
1485   putchar ('"');
1486
1487   /* Conditionally output the 'reference' field.  */
1488
1489   if (auto_reference || input_reference)
1490     {
1491       fputs (" \"", stdout);
1492       print_field (reference);
1493       putchar ('"');
1494     }
1495
1496   putchar ('\n');
1497 }
1498
1499 /*---------------------------------------------------------.
1500 | Output the current output fields as one line for 'TeX'.  |
1501 `---------------------------------------------------------*/
1502
1503 static void
1504 output_one_tex_line (void)
1505 {
1506   BLOCK key;                    /* key field, isolated */
1507   BLOCK after;                  /* after field, isolated */
1508   char *cursor;                 /* running cursor in source text */
1509
1510   printf ("\\%s ", macro_name);
1511   putchar ('{');
1512   print_field (tail);
1513   fputs ("}{", stdout);
1514   print_field (before);
1515   fputs ("}{", stdout);
1516   key.start = keyafter.start;
1517   after.end = keyafter.end;
1518   cursor = keyafter.start;
1519   SKIP_SOMETHING (cursor, keyafter.end);
1520   key.end = cursor;
1521   after.start = cursor;
1522   print_field (key);
1523   fputs ("}{", stdout);
1524   print_field (after);
1525   fputs ("}{", stdout);
1526   print_field (head);
1527   putchar ('}');
1528   if (auto_reference || input_reference)
1529     {
1530       putchar ('{');
1531       print_field (reference);
1532       putchar ('}');
1533     }
1534   putchar ('\n');
1535 }
1536
1537 /*-------------------------------------------------------------------.
1538 | Output the current output fields as one line for a dumb terminal.  |
1539 `-------------------------------------------------------------------*/
1540
1541 static void
1542 output_one_dumb_line (void)
1543 {
1544   if (!right_reference)
1545     {
1546       if (auto_reference)
1547         {
1548
1549           /* Output the 'reference' field, in such a way that GNU emacs
1550              next-error will handle it.  The ending colon is taken from the
1551              gap which follows.  */
1552
1553           print_field (reference);
1554           putchar (':');
1555           print_spaces (reference_max_width
1556                         + gap_size
1557                         - (reference.end - reference.start)
1558                         - 1);
1559         }
1560       else
1561         {
1562
1563           /* Output the 'reference' field and its following gap.  */
1564
1565           print_field (reference);
1566           print_spaces (reference_max_width
1567                         + gap_size
1568                         - (reference.end - reference.start));
1569         }
1570     }
1571
1572   if (tail.start < tail.end)
1573     {
1574       /* Output the 'tail' field.  */
1575
1576       print_field (tail);
1577       if (tail_truncation)
1578         fputs (truncation_string, stdout);
1579
1580       print_spaces (half_line_width - gap_size
1581                     - (before.end - before.start)
1582                     - (before_truncation ? truncation_string_length : 0)
1583                     - (tail.end - tail.start)
1584                     - (tail_truncation ? truncation_string_length : 0));
1585     }
1586   else
1587     print_spaces (half_line_width - gap_size
1588                   - (before.end - before.start)
1589                   - (before_truncation ? truncation_string_length : 0));
1590
1591   /* Output the 'before' field.  */
1592
1593   if (before_truncation)
1594     fputs (truncation_string, stdout);
1595   print_field (before);
1596
1597   print_spaces (gap_size);
1598
1599   /* Output the 'keyafter' field.  */
1600
1601   print_field (keyafter);
1602   if (keyafter_truncation)
1603     fputs (truncation_string, stdout);
1604
1605   if (head.start < head.end)
1606     {
1607       /* Output the 'head' field.  */
1608
1609       print_spaces (half_line_width
1610                     - (keyafter.end - keyafter.start)
1611                     - (keyafter_truncation ? truncation_string_length : 0)
1612                     - (head.end - head.start)
1613                     - (head_truncation ? truncation_string_length : 0));
1614       if (head_truncation)
1615         fputs (truncation_string, stdout);
1616       print_field (head);
1617     }
1618   else
1619
1620     if ((auto_reference || input_reference) && right_reference)
1621       print_spaces (half_line_width
1622                     - (keyafter.end - keyafter.start)
1623                     - (keyafter_truncation ? truncation_string_length : 0));
1624
1625   if ((auto_reference || input_reference) && right_reference)
1626     {
1627       /* Output the 'reference' field.  */
1628
1629       print_spaces (gap_size);
1630       print_field (reference);
1631     }
1632
1633   putchar ('\n');
1634 }
1635
1636 /*------------------------------------------------------------------------.
1637 | Scan the whole occurs table and, for each entry, output one line in the |
1638 | appropriate format.                                                     |
1639 `------------------------------------------------------------------------*/
1640
1641 static void
1642 generate_all_output (void)
1643 {
1644   ptrdiff_t occurs_index;       /* index of keyword entry being processed */
1645   OCCURS *occurs_cursor;        /* current keyword entry being processed */
1646
1647   /* The following assignments are useful to provide default values in case
1648      line contexts or references are not used, in which case these variables
1649      would never be computed.  */
1650
1651   tail.start = nullptr;
1652   tail.end = nullptr;
1653   tail_truncation = false;
1654
1655   head.start = nullptr;
1656   head.end = nullptr;
1657   head_truncation = false;
1658
1659   /* Loop over all keyword occurrences.  */
1660
1661   occurs_cursor = occurs_table[0];
1662
1663   for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1664     {
1665       /* Compute the exact size of every field and whenever truncation flags
1666          are present or not.  */
1667
1668       define_all_fields (occurs_cursor);
1669
1670       /* Produce one output line according to selected format.  */
1671
1672       switch (output_format)
1673         {
1674         case UNKNOWN_FORMAT:
1675           /* Should never happen.  */
1676
1677         case DUMB_FORMAT:
1678           output_one_dumb_line ();
1679           break;
1680
1681         case ROFF_FORMAT:
1682           output_one_roff_line ();
1683           break;
1684
1685         case TEX_FORMAT:
1686           output_one_tex_line ();
1687           break;
1688         }
1689
1690       /* Advance the cursor into the occurs table.  */
1691
1692       occurs_cursor++;
1693     }
1694 }
1695
1696 /* Option decoding and main program.  */
1697
1698 /*------------------------------------------------------.
1699 | Print program identification and options, then exit.  |
1700 `------------------------------------------------------*/
1701
1702 void
1703 usage (int status)
1704 {
1705   if (status != EXIT_SUCCESS)
1706     emit_try_help ();
1707   else
1708     {
1709       printf (_("\
1710 Usage: %s [OPTION]... [INPUT]...   (without -G)\n\
1711   or:  %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1712               program_name, program_name);
1713       fputs (_("\
1714 Output a permuted index, including context, of the words in the input files.\n\
1715 "), stdout);
1716
1717       emit_stdin_note ();
1718       emit_mandatory_arg_note ();
1719
1720       fputs (_("\
1721   -A, --auto-reference           output automatically generated references\n\
1722   -G, --traditional              behave more like System V 'ptx'\n\
1723 "), stdout);
1724       fputs (_("\
1725   -F, --flag-truncation=STRING   use STRING for flagging line truncations.\n\
1726                                  The default is '/'\n\
1727 "), stdout);
1728       fputs (_("\
1729   -M, --macro-name=STRING        macro name to use instead of 'xx'\n\
1730   -O, --format=roff              generate output as roff directives\n\
1731   -R, --right-side-refs          put references at right, not counted in -w\n\
1732   -S, --sentence-regexp=REGEXP   for end of lines or end of sentences\n\
1733   -T, --format=tex               generate output as TeX directives\n\
1734 "), stdout);
1735       fputs (_("\
1736   -W, --word-regexp=REGEXP       use REGEXP to match each keyword\n\
1737   -b, --break-file=FILE          word break characters in this FILE\n\
1738   -f, --ignore-case              fold lower case to upper case for sorting\n\
1739   -g, --gap-size=NUMBER          gap size in columns between output fields\n\
1740   -i, --ignore-file=FILE         read ignore word list from FILE\n\
1741   -o, --only-file=FILE           read only word list from this FILE\n\
1742 "), stdout);
1743       fputs (_("\
1744   -r, --references               first field of each line is a reference\n\
1745   -t, --typeset-mode               - not implemented -\n\
1746   -w, --width=NUMBER             output width in columns, reference excluded\n\
1747 "), stdout);
1748       fputs (HELP_OPTION_DESCRIPTION, stdout);
1749       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1750       emit_ancillary_info (PROGRAM_NAME);
1751     }
1752   exit (status);
1753 }
1754
1755 /*----------------------------------------------------------------------.
1756 | Main program.  Decode ARGC arguments passed through the ARGV array of |
1757 | strings, then launch execution.                                       |
1758 `----------------------------------------------------------------------*/
1759
1760 /* Long options equivalences.  */
1761 static struct option const long_options[] =
1762 {
1763   {"auto-reference", no_argument, nullptr, 'A'},
1764   {"break-file", required_argument, nullptr, 'b'},
1765   {"flag-truncation", required_argument, nullptr, 'F'},
1766   {"ignore-case", no_argument, nullptr, 'f'},
1767   {"gap-size", required_argument, nullptr, 'g'},
1768   {"ignore-file", required_argument, nullptr, 'i'},
1769   {"macro-name", required_argument, nullptr, 'M'},
1770   {"only-file", required_argument, nullptr, 'o'},
1771   {"references", no_argument, nullptr, 'r'},
1772   {"right-side-refs", no_argument, nullptr, 'R'},
1773   {"format", required_argument, nullptr, 10},
1774   {"sentence-regexp", required_argument, nullptr, 'S'},
1775   {"traditional", no_argument, nullptr, 'G'},
1776   {"typeset-mode", no_argument, nullptr, 't'},
1777   {"width", required_argument, nullptr, 'w'},
1778   {"word-regexp", required_argument, nullptr, 'W'},
1779   {GETOPT_HELP_OPTION_DECL},
1780   {GETOPT_VERSION_OPTION_DECL},
1781   {nullptr, 0, nullptr, 0},
1782 };
1783
1784 static char const *const format_args[] =
1785 {
1786   "roff", "tex", nullptr
1787 };
1788
1789 static enum Format const format_vals[] =
1790 {
1791   ROFF_FORMAT, TEX_FORMAT
1792 };
1793
1794 int
1795 main (int argc, char **argv)
1796 {
1797   int optchar;                  /* argument character */
1798   int file_index;               /* index in text input file arrays */
1799
1800   /* Decode program options.  */
1801
1802   initialize_main (&argc, &argv);
1803   set_program_name (argv[0]);
1804   setlocale (LC_ALL, "");
1805   bindtextdomain (PACKAGE, LOCALEDIR);
1806   textdomain (PACKAGE);
1807
1808   atexit (close_stdout);
1809
1810 #if HAVE_SETCHRCLASS
1811   setchrclass (nullptr);
1812 #endif
1813
1814   while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1815                                 long_options, nullptr),
1816          optchar != EOF)
1817     {
1818       switch (optchar)
1819         {
1820         default:
1821           usage (EXIT_FAILURE);
1822
1823         case 'G':
1824           gnu_extensions = false;
1825           break;
1826
1827         case 'b':
1828           break_file = optarg;
1829           break;
1830
1831         case 'f':
1832           ignore_case = true;
1833           break;
1834
1835         case 'g':
1836           {
1837             intmax_t tmp;
1838             if (! (xstrtoimax (optarg, nullptr, 0, &tmp, "") == LONGINT_OK
1839                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1840               error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
1841                      quote (optarg));
1842             gap_size = tmp;
1843             break;
1844           }
1845
1846         case 'i':
1847           ignore_file = optarg;
1848           break;
1849
1850         case 'o':
1851           only_file = optarg;
1852           break;
1853
1854         case 'r':
1855           input_reference = true;
1856           break;
1857
1858         case 't':
1859           /* Yet to understand...  */
1860           break;
1861
1862         case 'w':
1863           {
1864             intmax_t tmp;
1865             if (! (xstrtoimax (optarg, nullptr, 0, &tmp, "") == LONGINT_OK
1866                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1867               error (EXIT_FAILURE, 0, _("invalid line width: %s"),
1868                      quote (optarg));
1869             line_width = tmp;
1870             break;
1871           }
1872
1873         case 'A':
1874           auto_reference = true;
1875           break;
1876
1877         case 'F':
1878           truncation_string = optarg;
1879           unescape_string (optarg);
1880           break;
1881
1882         case 'M':
1883           macro_name = optarg;
1884           break;
1885
1886         case 'O':
1887           output_format = ROFF_FORMAT;
1888           break;
1889
1890         case 'R':
1891           right_reference = true;
1892           break;
1893
1894         case 'S':
1895           context_regex.string = optarg;
1896           unescape_string (optarg);
1897           break;
1898
1899         case 'T':
1900           output_format = TEX_FORMAT;
1901           break;
1902
1903         case 'W':
1904           word_regex.string = optarg;
1905           unescape_string (optarg);
1906           if (!*word_regex.string)
1907             word_regex.string = nullptr;
1908           break;
1909
1910         case 10:
1911           output_format = XARGMATCH ("--format", optarg,
1912                                      format_args, format_vals);
1913           break;
1914
1915         case_GETOPT_HELP_CHAR;
1916
1917         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
1918         }
1919     }
1920
1921   /* Process remaining arguments.  If GNU extensions are enabled, process
1922      all arguments as input parameters.  If disabled, accept at most two
1923      arguments, the second of which is an output parameter.  */
1924
1925   if (optind == argc)
1926     {
1927
1928       /* No more argument simply means: read standard input.  */
1929
1930       input_file_name = xmalloc (sizeof *input_file_name);
1931       file_line_count = xmalloc (sizeof *file_line_count);
1932       text_buffers =    xmalloc (sizeof *text_buffers);
1933       number_input_files = 1;
1934       input_file_name[0] = nullptr;
1935     }
1936   else if (gnu_extensions)
1937     {
1938       number_input_files = argc - optind;
1939       input_file_name = xnmalloc (number_input_files, sizeof *input_file_name);
1940       file_line_count = xnmalloc (number_input_files, sizeof *file_line_count);
1941       text_buffers    = xnmalloc (number_input_files, sizeof *text_buffers);
1942
1943       for (file_index = 0; file_index < number_input_files; file_index++)
1944         {
1945           if (!*argv[optind] || STREQ (argv[optind], "-"))
1946             input_file_name[file_index] = nullptr;
1947           else
1948             input_file_name[file_index] = argv[optind];
1949           optind++;
1950         }
1951     }
1952   else
1953     {
1954
1955       /* There is one necessary input file.  */
1956
1957       number_input_files = 1;
1958       input_file_name = xmalloc (sizeof *input_file_name);
1959       file_line_count = xmalloc (sizeof *file_line_count);
1960       text_buffers    = xmalloc (sizeof *text_buffers);
1961       if (!*argv[optind] || STREQ (argv[optind], "-"))
1962         input_file_name[0] = nullptr;
1963       else
1964         input_file_name[0] = argv[optind];
1965       optind++;
1966
1967       /* Redirect standard output, only if requested.  */
1968
1969       if (optind < argc)
1970         {
1971           if (! freopen (argv[optind], "w", stdout))
1972             error (EXIT_FAILURE, errno, "%s", quotef (argv[optind]));
1973           optind++;
1974         }
1975
1976       /* Diagnose any other argument as an error.  */
1977
1978       if (optind < argc)
1979         {
1980           error (0, 0, _("extra operand %s"), quote (argv[optind]));
1981           usage (EXIT_FAILURE);
1982         }
1983     }
1984
1985   /* If the output format has not been explicitly selected, choose dumb
1986      terminal format if GNU extensions are enabled, else 'roff' format.  */
1987
1988   if (output_format == UNKNOWN_FORMAT)
1989     output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
1990
1991   /* Initialize the main tables.  */
1992
1993   initialize_regex ();
1994
1995   /* Read 'Break character' file, if any.  */
1996
1997   if (break_file)
1998     digest_break_file (break_file);
1999
2000   /* Read 'Ignore words' file and 'Only words' files, if any.  If any of
2001      these files is empty, reset the name of the file to null, to avoid
2002      unnecessary calls to search_table. */
2003
2004   if (ignore_file)
2005     {
2006       digest_word_file (ignore_file, &ignore_table);
2007       if (ignore_table.length == 0)
2008         ignore_file = nullptr;
2009     }
2010
2011   if (only_file)
2012     {
2013       digest_word_file (only_file, &only_table);
2014       if (only_table.length == 0)
2015         only_file = nullptr;
2016     }
2017
2018   /* Prepare to study all the input files.  */
2019
2020   number_of_occurs[0] = 0;
2021   total_line_count = 0;
2022   maximum_word_length = 0;
2023   reference_max_width = 0;
2024
2025   for (file_index = 0; file_index < number_input_files; file_index++)
2026     {
2027       BLOCK *text_buffer = text_buffers + file_index;
2028
2029       /* Read the file contents into memory, then study it.  */
2030
2031       swallow_file_in_memory (input_file_name[file_index], text_buffer);
2032       find_occurs_in_text (file_index);
2033
2034       /* Maintain for each file how many lines has been read so far when its
2035          end is reached.  Incrementing the count first is a simple kludge to
2036          handle a possible incomplete line at end of file.  */
2037
2038       total_line_count++;
2039       file_line_count[file_index] = total_line_count;
2040     }
2041
2042   /* Do the output process phase.  */
2043
2044   sort_found_occurs ();
2045   fix_output_parameters ();
2046   generate_all_output ();
2047
2048   /* All done.  */
2049
2050   return EXIT_SUCCESS;
2051 }