src/ptx.c

   1 /* Permuted index for GNU, with keywords in their context.
   2    Copyright (C) 1990-2019 Free Software Foundation, Inc.
   3    François Pinard <pinard@iro.umontreal.ca>, 1988.
   4
   5    This program is free software: you can redistribute it and/or modify
   6    it under the terms of the GNU General Public License as published by
   7    the Free Software Foundation, either version 3 of the License, or
   8    (at your option) any later version.
   9
  10    This program is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13    GNU General Public License for more details.
  14
  15    You should have received a copy of the GNU General Public License
  16    along with this program.  If not, see <https://www.gnu.org/licenses/>.
  17
  18    François Pinard <pinard@iro.umontreal.ca> */
  19
  20 #include <config.h>
  21
  22 #include <getopt.h>
  23 #include <sys/types.h>
  24 #include "system.h"
  25 #include "die.h"
  26 #include <regex.h>
  27 #include "argmatch.h"
  28 #include "diacrit.h"
  29 #include "error.h"
  30 #include "fadvise.h"
  31 #include "quote.h"
  32 #include "read-file.h"
  33 #include "stdio--.h"
  34 #include "xstrtol.h"
  35
  36 /* The official name of this program (e.g., no 'g' prefix).  */
  37 #define PROGRAM_NAME "ptx"
  38
  39 /* TRANSLATORS: Please translate "F. Pinard" to "François Pinard"
  40    if "ç" (c-with-cedilla) is available in the translation's character
  41    set and encoding.  */
  42 #define AUTHORS proper_name_utf8 ("F. Pinard", "Fran\xc3\xa7ois Pinard")
  43
/* Number of possible characters in a byte.  */
#define CHAR_SET_SIZE 256

/* True if C is an octal digit.  C is evaluated more than once.  */
#define ISODIGIT(C) ((C) >= '0' && (C) <= '7')

/* Numeric value of the hexadecimal digit C.  Anything outside 'a'..'f'
   and 'A'..'F' is handled as if it were a decimal digit; the macro does
   not validate C, and evaluates it more than once.  */
#define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
                     : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')

/* Numeric value of the octal digit C.  The macro does not validate C.  */
#define OCTTOBIN(C) ((C) - '0')
  51
  52 /* Debugging the memory allocator.  */
  53
  54 #if WITH_DMALLOC
  55 # define MALLOC_FUNC_CHECK 1
  56 # include <dmalloc.h>
  57 #endif
  58
  59 /* Global definitions.  */
  60
  61 /* FIXME: There are many unchecked integer overflows in this file,
  62    and in theory they could cause this command to have undefined
  63    behavior given large inputs or options.  This command should
  64    diagnose any such overflow and exit.  */
  65
  66 /* Program options.  */
  67
/* The output formats this program distinguishes.  The one in effect is
   kept in 'output_format', which starts out as UNKNOWN_FORMAT.  */
enum Format
{
  UNKNOWN_FORMAT,               /* output format still unknown */
  DUMB_FORMAT,                  /* output for a dumb terminal */
  ROFF_FORMAT,                  /* output for 'troff' or 'nroff' */
  TEX_FORMAT                    /* output for 'TeX' or 'LaTeX' */
};
  75
/* Settings that control the output, with their default values.  */
static bool gnu_extensions = true;      /* trigger all GNU extensions */
static bool auto_reference = false;     /* refs are 'file_name:line_number:' */
static bool input_reference = false;    /* refs at beginning of input lines */
static bool right_reference = false;    /* output refs after right context  */
static ptrdiff_t line_width = 72;       /* output line width in characters */
static ptrdiff_t gap_size = 3;  /* number of spaces between output fields */
static const char *truncation_string = "/";
                                /* string used to mark line truncations */
static const char *macro_name = "xx";   /* macro name for roff or TeX output */
static enum Format output_format = UNKNOWN_FORMAT;
                                /* output format */

/* Settings that control which words are selected and how they compare.
   When 'ignore_case' is set, 'folded_chars' is filled in and is used both
   for comparing words and as the translate table of compiled regexps.  */
static bool ignore_case = false;        /* fold lower to upper for sorting */
static const char *break_file = NULL;   /* name of the 'Break chars' file */
static const char *only_file = NULL;    /* name of the 'Only words' file */
static const char *ignore_file = NULL;  /* name of the 'Ignore words' file */
  92
/* Options that use regular expressions.  Each one keeps the regexp as a
   string next to its compiled form.  'compile_regex' fills in 'pattern'
   from 'string' and makes 'pattern' use 'fastmap' as its fastmap.  A NULL
   'string' means that the corresponding regexp is not in use.  */
struct regex_data
{
  /* The original regular expression, as a string.  */
  char const *string;

  /* The compiled regular expression, and its fastmap.  */
  struct re_pattern_buffer pattern;
  char fastmap[UCHAR_MAX + 1];
};

static struct regex_data context_regex; /* end of context */
static struct regex_data word_regex;    /* keyword */
 106
/* A BLOCK delimits a region in memory of arbitrary size, like the copy of a
   whole file.  A WORD is similar, except it is intended for smaller regions
   and records a length rather than an end pointer.  A WORD_TABLE may
   contain several WORDs.  */

typedef struct
  {
    char *start;                /* pointer to beginning of region */
    char *end;                  /* pointer to end + 1 of region */
  }
BLOCK;

typedef struct
  {
    char *start;                /* pointer to beginning of region */
    ptrdiff_t size;             /* length of the region */
  }
WORD;

/* 'alloc' counts the entries that 'start' has room for, 'length' the
   entries actually filled in; the table grows when the two are equal.  */
typedef struct
  {
    WORD *start;                /* array of WORDs */
    size_t alloc;               /* allocated length */
    ptrdiff_t length;           /* number of used entries */
  }
WORD_TABLE;
 132
/* Pattern description tables.  */

/* For each character, provide its folded equivalent.  The table is only
   filled in (with the 'toupper' value of each character) when
   'ignore_case' is set.  */
static unsigned char folded_chars[CHAR_SET_SIZE];

/* End of context pattern register indices.  */
static struct re_registers context_regs;

/* Keyword pattern register indices.  */
static struct re_registers word_regs;

/* A word characters fastmap is used only when no word regexp has been
   provided.  A word is then made up of a sequence of one or more characters
   allowed by the fastmap.  Contains !0 if character allowed in word.  Not
   only is this faster in most cases, but it also simplifies the
   implementation of the Break files.  */
static char word_fastmap[CHAR_SET_SIZE];

/* Maximum length of any word read.  */
static ptrdiff_t maximum_word_length;

/* Maximum width of any reference used.  */
static ptrdiff_t reference_max_width;

/* Ignore and Only word tables.  */

static WORD_TABLE ignore_table; /* table of words to ignore */
static WORD_TABLE only_table;           /* table of words to select */

/* Source text table, and scanning macros.  */

static int number_input_files;  /* number of text input files */
static intmax_t total_line_count;       /* total number of lines seen so far */
static const char **input_file_name;    /* array of text input file names */
static intmax_t *file_line_count;       /* array of line count values at end */

static BLOCK *text_buffers;     /* files to study */
 170
/* Scanning macros.  CURSOR must be a modifiable pointer into the text:
   each macro moves it in place, and evaluates its arguments several times.
   The expansions are bare statements without a final semicolon and without
   a 'do ... while (0)' wrapper, so each use has to be a statement of its
   own.  */

/* SKIP_NON_WHITE used only for getting or skipping the reference.
   It advances CURSOR up to the next white space character, or to LIMIT.  */

#define SKIP_NON_WHITE(cursor, limit) \
  while (cursor < limit && ! isspace (to_uchar (*cursor)))              \
    cursor++

/* Advance CURSOR over white space, without going past LIMIT.  */

#define SKIP_WHITE(cursor, limit) \
  while (cursor < limit && isspace (to_uchar (*cursor)))                \
    cursor++

/* Move CURSOR back over the white space that immediately precedes it,
   without going before START.  */

#define SKIP_WHITE_BACKWARDS(cursor, start) \
  while (cursor > start && isspace (to_uchar (cursor[-1])))             \
    cursor--

/* Advance CURSOR over one word, or over one character if no word starts
   at CURSOR.

   When a word regexp is in use, CURSOR advances by the length that
   're_match' reports for a match anchored at CURSOR; a failed match (-1)
   advances by one character, and an internal matcher error (-2) is fatal.
   Otherwise, CURSOR advances over a run of characters allowed by
   'word_fastmap', or over one character if the first one is not allowed.

   The test of the first character against 'word_fastmap' reads *CURSOR
   without comparing CURSOR with LIMIT, so CURSOR must be below LIMIT on
   entry.  The expansion is an if/else chain, so it must not be used as the
   unbraced body of an 'if' that has an 'else' of its own.  */

#define SKIP_SOMETHING(cursor, limit) \
  if (word_regex.string)                                                \
    {                                                                   \
      regoff_t count;                                                   \
      count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
      if (count == -2)                                                  \
        matcher_error ();                                               \
      cursor += count == -1 ? 1 : count;                                \
    }                                                                   \
  else if (word_fastmap[to_uchar (*cursor)])                            \
    while (cursor < limit && word_fastmap[to_uchar (*cursor)])          \
      cursor++;                                                         \
  else                                                                  \
    cursor++
 199
/* Occurrences table.

   The keyword, described by the 'key' field, is the central word, which
   is surrounded by a left context and a right context.  Keeping both a
   pointer ('key.start') and a length ('key.size') allows full 8-bit
   character keys, even including NULs.  At other places in this program,
   the name 'keyafter' refers to the keyword followed by its right
   context.

   The left context does not extend, towards the beginning of the file,
   further than a distance given by the 'left' value.  This value is
   relative to the keyword beginning, it is usually negative.  This
   ensures that, except for white space, we will never have to backward
   scan the source text, when it is time to generate the final output
   lines.

   The right context, indirectly attainable through the keyword end, does
   not extend, towards the end of the file, further than a distance given
   by the 'right' value.  This value is relative to the keyword
   beginning, it is usually positive.

   When automatic references are used, the 'reference' value is the
   overall line number in all input files read so far; in this case, it
   is of type intmax_t.  When input references are used, the 'reference'
   value indicates the distance between the keyword beginning and the
   start of the reference field, and it fits in ptrdiff_t and is usually
   negative.  */

typedef struct
  {
    WORD key;                   /* description of the keyword */
    ptrdiff_t left;             /* distance to left context start */
    ptrdiff_t right;            /* distance to right context end */
    intmax_t reference;         /* reference descriptor */
    int file_index;             /* corresponding file  */
  }
OCCURS;
 236
/* The various OCCURS tables are indexed by the language.  But for the time
   being, there is no such multiple language support, so every array below
   has a single element and only index 0 is used.  */

static OCCURS *occurs_table[1]; /* all words retained from the read text */
static size_t occurs_alloc[1];  /* allocated size of occurs_table */
static ptrdiff_t number_of_occurs[1]; /* number of used slots in occurs_table */


/* Communication among output routines.  */

/* Indicate if special output processing is requested for each character.  */
static char edited_flag[CHAR_SET_SIZE];

/* Half of line width, reference excluded.  */
static ptrdiff_t half_line_width;

/* Maximum width of before field.  */
static ptrdiff_t before_max_width;

/* Maximum width of keyword-and-after field.  */
static ptrdiff_t keyafter_max_width;

/* Length of string that flags truncation.  */
static ptrdiff_t truncation_string_length;

/* When context is limited by lines, wraparound may happen on final output:
   the 'head' pointer gives access to some supplementary left context which
   will be seen at the end of the output line, the 'tail' pointer gives
   access to some supplementary right context which will be seen at the
   beginning of the output line.  Each field below is a BLOCK, paired with
   a flag telling whether a truncation mark goes with it.  */

static BLOCK tail;              /* tail field */
static bool tail_truncation;    /* flag truncation after the tail field */

static BLOCK before;            /* before field */
static bool before_truncation;  /* flag truncation before the before field */

static BLOCK keyafter;          /* keyword-and-after field */
static bool keyafter_truncation; /* flag truncation after the keyafter field */

static BLOCK head;              /* head field */
static bool head_truncation;    /* flag truncation before the head field */

static BLOCK reference;         /* reference field for input reference mode */
 281
 282 /* Miscellaneous routines.  */
 283
/* Diagnose an error in the regular expression matcher.  Then exit.

   This is what the scanning code calls when 're_match' or 're_search'
   returns -2.  The process exits with status EXIT_FAILURE; the value of
   'errno' at the time of the call is handed to 'die' along with the
   message.  */

static void ATTRIBUTE_NORETURN
matcher_error (void)
{
  die (EXIT_FAILURE, errno, _("error in regular expression matcher"));
}
 291
 292 /*------------------------------------------------------.
 293 | Duplicate string STRING, while evaluating \-escapes.  |
 294 `------------------------------------------------------*/
 295
 296 /* Loosely adapted from GNU sh-utils printf.c code.  */
 297
 298 static char *
 299 copy_unescaped_string (const char *string)
 300 {
 301   char *result;                 /* allocated result */
 302   char *cursor;                 /* cursor in result */
 303   int value;                    /* value of \nnn escape */
 304   int length;                   /* length of \nnn escape */
 305
 306   result = xmalloc (strlen (string) + 1);
 307   cursor = result;
 308
 309   while (*string)
 310     {
 311       if (*string == '\\')
 312         {
 313           string++;
 314           switch (*string)
 315             {
 316             case 'x':           /* \xhhh escape, 3 chars maximum */
 317               value = 0;
 318               for (length = 0, string++;
 319                    length < 3 && isxdigit (to_uchar (*string));
 320                    length++, string++)
 321                 value = value * 16 + HEXTOBIN (*string);
 322               if (length == 0)
 323                 {
 324                   *cursor++ = '\\';
 325                   *cursor++ = 'x';
 326                 }
 327               else
 328                 *cursor++ = value;
 329               break;
 330
 331             case '0':           /* \0ooo escape, 3 chars maximum */
 332               value = 0;
 333               for (length = 0, string++;
 334                    length < 3 && ISODIGIT (*string);
 335                    length++, string++)
 336                 value = value * 8 + OCTTOBIN (*string);
 337               *cursor++ = value;
 338               break;
 339
 340             case 'a':           /* alert */
 341 #if __STDC__
 342               *cursor++ = '\a';
 343 #else
 344               *cursor++ = 7;
 345 #endif
 346               string++;
 347               break;
 348
 349             case 'b':           /* backspace */
 350               *cursor++ = '\b';
 351               string++;
 352               break;
 353
 354             case 'c':           /* cancel the rest of the output */
 355               while (*string)
 356                 string++;
 357               break;
 358
 359             case 'f':           /* form feed */
 360               *cursor++ = '\f';
 361               string++;
 362               break;
 363
 364             case 'n':           /* new line */
 365               *cursor++ = '\n';
 366               string++;
 367               break;
 368
 369             case 'r':           /* carriage return */
 370               *cursor++ = '\r';
 371               string++;
 372               break;
 373
 374             case 't':           /* horizontal tab */
 375               *cursor++ = '\t';
 376               string++;
 377               break;
 378
 379             case 'v':           /* vertical tab */
 380 #if __STDC__
 381               *cursor++ = '\v';
 382 #else
 383               *cursor++ = 11;
 384 #endif
 385               string++;
 386               break;
 387
 388             case '\0':          /* lone backslash at end of string */
 389               /* ignore it */
 390               break;
 391
 392             default:
 393               *cursor++ = '\\';
 394               *cursor++ = *string++;
 395               break;
 396             }
 397         }
 398       else
 399         *cursor++ = *string++;
 400     }
 401
 402   *cursor = '\0';
 403   return result;
 404 }
 405
 406 /*--------------------------------------------------------------------------.
 407 | Compile the regex represented by REGEX, diagnose and abort if any error.  |
 408 `--------------------------------------------------------------------------*/
 409
 410 static void
 411 compile_regex (struct regex_data *regex)
 412 {
 413   struct re_pattern_buffer *pattern = &regex->pattern;
 414   char const *string = regex->string;
 415   char const *message;
 416
 417   pattern->buffer = NULL;
 418   pattern->allocated = 0;
 419   pattern->fastmap = regex->fastmap;
 420   pattern->translate = ignore_case ? folded_chars : NULL;
 421
 422   message = re_compile_pattern (string, strlen (string), pattern);
 423   if (message)
 424     die (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
 425
 426   /* The fastmap should be compiled before 're_match'.  The following
 427      call is not mandatory, because 're_search' is always called sooner,
 428      and it compiles the fastmap if this has not been done yet.  */
 429
 430   re_compile_fastmap (pattern);
 431 }
 432
 433 /*------------------------------------------------------------------------.
 434 | This will initialize various tables for pattern match and compiles some |
 435 | regexps.                                                                |
 436 `------------------------------------------------------------------------*/
 437
 438 static void
 439 initialize_regex (void)
 440 {
 441   int character;                /* character value */
 442
 443   /* Initialize the case folding table.  */
 444
 445   if (ignore_case)
 446     for (character = 0; character < CHAR_SET_SIZE; character++)
 447       folded_chars[character] = toupper (character);
 448
 449   /* Unless the user already provided a description of the end of line or
 450      end of sentence sequence, select an end of line sequence to compile.
 451      If the user provided an empty definition, thus disabling end of line
 452      or sentence feature, make it NULL to speed up tests.  If GNU
 453      extensions are enabled, use end of sentence like in GNU emacs.  If
 454      disabled, use end of lines.  */
 455
 456   if (context_regex.string)
 457     {
 458       if (!*context_regex.string)
 459         context_regex.string = NULL;
 460     }
 461   else if (gnu_extensions && !input_reference)
 462     context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\|  \\)[ \t\n]*";
 463   else
 464     context_regex.string = "\n";
 465
 466   if (context_regex.string)
 467     compile_regex (&context_regex);
 468
 469   /* If the user has already provided a non-empty regexp to describe
 470      words, compile it.  Else, unless this has already been done through
 471      a user provided Break character file, construct a fastmap of
 472      characters that may appear in a word.  If GNU extensions enabled,
 473      include only letters of the underlying character set.  If disabled,
 474      include almost everything, even punctuations; stop only on white
 475      space.  */
 476
 477   if (word_regex.string)
 478     compile_regex (&word_regex);
 479   else if (!break_file)
 480     {
 481       if (gnu_extensions)
 482         {
 483
 484           /* Simulate \w+.  */
 485
 486           for (character = 0; character < CHAR_SET_SIZE; character++)
 487             word_fastmap[character] = !! isalpha (character);
 488         }
 489       else
 490         {
 491
 492           /* Simulate [^ \t\n]+.  */
 493
 494           memset (word_fastmap, 1, CHAR_SET_SIZE);
 495           word_fastmap[' '] = 0;
 496           word_fastmap['\t'] = 0;
 497           word_fastmap['\n'] = 0;
 498         }
 499     }
 500 }
 501
 502 /*------------------------------------------------------------------------.
 503 | This routine will attempt to swallow a whole file name FILE_NAME into a |
 504 | contiguous region of memory and return a description of it into BLOCK.  |
 505 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-".     |
 506 |                                                                         |
 507 | Previously, in some cases, white space compression was attempted while  |
 508 | inputting text.  This was defeating some regexps like default end of    |
 509 | sentence, which checks for two consecutive spaces.  If white space      |
 510 | compression is ever reinstated, it should be in output routines.        |
 511 `------------------------------------------------------------------------*/
 512
 513 static void
 514 swallow_file_in_memory (const char *file_name, BLOCK *block)
 515 {
 516   size_t used_length;           /* used length in memory buffer */
 517
 518   /* As special cases, a file name which is NULL or "-" indicates standard
 519      input, which is already opened.  In all other cases, open the file from
 520      its name.  */
 521   bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
 522   if (using_stdin)
 523     block->start = fread_file (stdin, &used_length);
 524   else
 525     block->start = read_file (file_name, &used_length);
 526
 527   if (!block->start)
 528     die (EXIT_FAILURE, errno, "%s", quotef (using_stdin ? "-" : file_name));
 529
 530   block->end = block->start + used_length;
 531 }
 532
 533 /* Sort and search routines.  */
 534
 535 /*--------------------------------------------------------------------------.
 536 | Compare two words, FIRST and SECOND, and return 0 if they are identical.  |
 537 | Return less than 0 if the first word goes before the second; return       |
 538 | greater than 0 if the first word goes after the second.                   |
 539 |                                                                           |
 540 | If a word is indeed a prefix of the other, the shorter should go first.   |
 541 `--------------------------------------------------------------------------*/
 542
 543 static int
 544 compare_words (const void *void_first, const void *void_second)
 545 {
 546 #define first ((const WORD *) void_first)
 547 #define second ((const WORD *) void_second)
 548   ptrdiff_t length;             /* minimum of two lengths */
 549   ptrdiff_t counter;            /* cursor in words */
 550   int value;                    /* value of comparison */
 551
 552   length = first->size < second->size ? first->size : second->size;
 553
 554   if (ignore_case)
 555     {
 556       for (counter = 0; counter < length; counter++)
 557         {
 558           value = (folded_chars [to_uchar (first->start[counter])]
 559                    - folded_chars [to_uchar (second->start[counter])]);
 560           if (value != 0)
 561             return value;
 562         }
 563     }
 564   else
 565     {
 566       for (counter = 0; counter < length; counter++)
 567         {
 568           value = (to_uchar (first->start[counter])
 569                    - to_uchar (second->start[counter]));
 570           if (value != 0)
 571             return value;
 572         }
 573     }
 574
 575   return first->size < second->size ? -1 : first->size > second->size;
 576 #undef first
 577 #undef second
 578 }
 579
 580 /*-----------------------------------------------------------------------.
 581 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
 582 | go first.  In case of a tie, preserve the original order through a     |
 583 | pointer comparison.                                                    |
 584 `-----------------------------------------------------------------------*/
 585
 586 static int
 587 compare_occurs (const void *void_first, const void *void_second)
 588 {
 589 #define first ((const OCCURS *) void_first)
 590 #define second ((const OCCURS *) void_second)
 591   int value;
 592
 593   value = compare_words (&first->key, &second->key);
 594   return (value ? value
 595           : first->key.start < second->key.start ? -1
 596           : first->key.start > second->key.start);
 597 #undef first
 598 #undef second
 599 }
 600
 601 /* True if WORD appears in TABLE.  Uses a binary search.  */
 602
 603 static bool _GL_ATTRIBUTE_PURE
 604 search_table (WORD *word, WORD_TABLE *table)
 605 {
 606   ptrdiff_t lowest;             /* current lowest possible index */
 607   ptrdiff_t highest;            /* current highest possible index */
 608   ptrdiff_t middle;             /* current middle index */
 609   int value;                    /* value from last comparison */
 610
 611   lowest = 0;
 612   highest = table->length - 1;
 613   while (lowest <= highest)
 614     {
 615       middle = (lowest + highest) / 2;
 616       value = compare_words (word, table->start + middle);
 617       if (value < 0)
 618         highest = middle - 1;
 619       else if (value > 0)
 620         lowest = middle + 1;
 621       else
 622         return true;
 623     }
 624   return false;
 625 }
 626
 627 /*---------------------------------------------------------------------.
 628 | Sort the whole occurs table in memory.  Presumably, 'qsort' does not |
 629 | take intermediate copies or table elements, so the sort will be      |
 630 | stabilized throughout the comparison routine.                        |
 631 `---------------------------------------------------------------------*/
 632
 633 static void
 634 sort_found_occurs (void)
 635 {
 636
 637   /* Only one language for the time being.  */
 638   if (number_of_occurs[0])
 639     qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
 640            compare_occurs);
 641 }
 642
 643 /* Parameter files reading routines.  */
 644
 645 /*----------------------------------------------------------------------.
 646 | Read a file named FILE_NAME, containing a set of break characters.    |
 647 | Build a content to the array word_fastmap in which all characters are |
 648 | allowed except those found in the file.  Characters may be repeated.  |
 649 `----------------------------------------------------------------------*/
 650
 651 static void
 652 digest_break_file (const char *file_name)
 653 {
 654   BLOCK file_contents;          /* to receive a copy of the file */
 655   char *cursor;                 /* cursor in file copy */
 656
 657   swallow_file_in_memory (file_name, &file_contents);
 658
 659   /* Make the fastmap and record the file contents in it.  */
 660
 661   memset (word_fastmap, 1, CHAR_SET_SIZE);
 662   for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
 663     word_fastmap[to_uchar (*cursor)] = 0;
 664
 665   if (!gnu_extensions)
 666     {
 667
 668       /* If GNU extensions are enabled, the only way to avoid newline as
 669          a break character is to write all the break characters in the
 670          file with no newline at all, not even at the end of the file.
 671          If disabled, spaces, tabs and newlines are always considered as
 672          break characters even if not included in the break file.  */
 673
 674       word_fastmap[' '] = 0;
 675       word_fastmap['\t'] = 0;
 676       word_fastmap['\n'] = 0;
 677     }
 678
 679   /* Return the space of the file, which is no more required.  */
 680
 681   free (file_contents.start);
 682 }
 683
 684 /*-----------------------------------------------------------------------.
 685 | Read a file named FILE_NAME, containing one word per line, then        |
 686 | construct in TABLE a table of WORD descriptors for them.  The routine  |
 687 | swallows the whole file in memory; this is at the expense of space     |
 688 | needed for newlines, which are useless; however, the reading is fast.  |
 689 `-----------------------------------------------------------------------*/
 690
 691 static void
 692 digest_word_file (const char *file_name, WORD_TABLE *table)
 693 {
 694   BLOCK file_contents;          /* to receive a copy of the file */
 695   char *cursor;                 /* cursor in file copy */
 696   char *word_start;             /* start of the current word */
 697
 698   swallow_file_in_memory (file_name, &file_contents);
 699
 700   table->start = NULL;
 701   table->alloc = 0;
 702   table->length = 0;
 703
 704   /* Read the whole file.  */
 705
 706   cursor = file_contents.start;
 707   while (cursor < file_contents.end)
 708     {
 709
 710       /* Read one line, and save the word in contains.  */
 711
 712       word_start = cursor;
 713       while (cursor < file_contents.end && *cursor != '\n')
 714         cursor++;
 715
 716       /* Record the word in table if it is not empty.  */
 717
 718       if (cursor > word_start)
 719         {
 720           if (table->length == table->alloc)
 721             table->start = x2nrealloc (table->start, &table->alloc,
 722                                        sizeof *table->start);
 723           table->start[table->length].start = word_start;
 724           table->start[table->length].size = cursor - word_start;
 725           table->length++;
 726         }
 727
 728       /* This test allows for an incomplete line at end of file.  */
 729
 730       if (cursor < file_contents.end)
 731         cursor++;
 732     }
 733
 734   /* Finally, sort all the words read.  */
 735
 736   qsort (table->start, table->length, sizeof table->start[0], compare_words);
 737 }
 738
 739 /* Keyword recognition and selection.  */
 740
 741 /*----------------------------------------------------------------------.
 742 | For each keyword in the source text, constructs an OCCURS structure.  |
 743 `----------------------------------------------------------------------*/
 744
 745 static void
 746 find_occurs_in_text (int file_index)
 747 {
 748   char *cursor;                 /* for scanning the source text */
 749   char *scan;                   /* for scanning the source text also */
 750   char *line_start;             /* start of the current input line */
 751   char *line_scan;              /* newlines scanned until this point */
 752   ptrdiff_t reference_length;   /* length of reference in input mode */
 753   WORD possible_key;            /* possible key, to ease searches */
 754   OCCURS *occurs_cursor;        /* current OCCURS under construction */
 755
 756   char *context_start;          /* start of left context */
 757   char *context_end;            /* end of right context */
 758   char *word_start;             /* start of word */
 759   char *word_end;               /* end of word */
 760   char *next_context_start;     /* next start of left context */
 761
 762   const BLOCK *text_buffer = &text_buffers[file_index];
 763
 764   /* reference_length is always used within 'if (input_reference)'.
 765      However, GNU C diagnoses that it may be used uninitialized.  The
 766      following assignment is merely to shut it up.  */
 767
 768   reference_length = 0;
 769
 770   /* Tracking where lines start is helpful for reference processing.  In
 771      auto reference mode, this allows counting lines.  In input reference
 772      mode, this permits finding the beginning of the references.
 773
 774      The first line begins with the file, skip immediately this very first
 775      reference in input reference mode, to help further rejection any word
 776      found inside it.  Also, unconditionally assigning these variable has
 777      the happy effect of shutting up lint.  */
 778
 779   line_start = text_buffer->start;
 780   line_scan = line_start;
 781   if (input_reference)
 782     {
 783       SKIP_NON_WHITE (line_scan, text_buffer->end);
 784       reference_length = line_scan - line_start;
 785       SKIP_WHITE (line_scan, text_buffer->end);
 786     }
 787
 788   /* Process the whole buffer, one line or one sentence at a time.  */
 789
 790   for (cursor = text_buffer->start;
 791        cursor < text_buffer->end;
 792        cursor = next_context_start)
 793     {
 794
 795       /* 'context_start' gets initialized before the processing of each
 796          line, or once for the whole buffer if no end of line or sentence
 797          sequence separator.  */
 798
 799       context_start = cursor;
 800
 801       /* If an end of line or end of sentence sequence is defined and
 802          non-empty, 'next_context_start' will be recomputed to be the end of
 803          each line or sentence, before each one is processed.  If no such
 804          sequence, then 'next_context_start' is set at the end of the whole
 805          buffer, which is then considered to be a single line or sentence.
 806          This test also accounts for the case of an incomplete line or
 807          sentence at the end of the buffer.  */
 808
 809       next_context_start = text_buffer->end;
 810       if (context_regex.string)
 811         switch (re_search (&context_regex.pattern, cursor,
 812                            text_buffer->end - cursor,
 813                            0, text_buffer->end - cursor, &context_regs))
 814           {
 815           case -2:
 816             matcher_error ();
 817
 818           case -1:
 819             break;
 820
 821           case 0:
 822             die (EXIT_FAILURE, 0,
 823                  _("error: regular expression has a match of length zero: %s"),
 824                  quote (context_regex.string));
 825
 826           default:
 827             next_context_start = cursor + context_regs.end[0];
 828             break;
 829           }
 830
 831       /* Include the separator into the right context, but not any suffix
 832          white space in this separator; this insures it will be seen in
 833          output and will not take more space than necessary.  */
 834
 835       context_end = next_context_start;
 836       SKIP_WHITE_BACKWARDS (context_end, context_start);
 837
 838       /* Read and process a single input line or sentence, one word at a
 839          time.  */
 840
 841       while (1)
 842         {
 843           if (word_regex.string)
 844
 845             /* If a word regexp has been compiled, use it to skip at the
 846                beginning of the next word.  If there is no such word, exit
 847                the loop.  */
 848
 849             {
 850               regoff_t r = re_search (&word_regex.pattern, cursor,
 851                                       context_end - cursor,
 852                                       0, context_end - cursor, &word_regs);
 853               if (r == -2)
 854                 matcher_error ();
 855               if (r == -1)
 856                 break;
 857               word_start = cursor + word_regs.start[0];
 858               word_end = cursor + word_regs.end[0];
 859             }
 860           else
 861
 862             /* Avoid re_search and use the fastmap to skip to the
 863                beginning of the next word.  If there is no more word in
 864                the buffer, exit the loop.  */
 865
 866             {
 867               scan = cursor;
 868               while (scan < context_end
 869                      && !word_fastmap[to_uchar (*scan)])
 870                 scan++;
 871
 872               if (scan == context_end)
 873                 break;
 874
 875               word_start = scan;
 876
 877               while (scan < context_end
 878                      && word_fastmap[to_uchar (*scan)])
 879                 scan++;
 880
 881               word_end = scan;
 882             }
 883
 884           /* Skip right to the beginning of the found word.  */
 885
 886           cursor = word_start;
 887
 888           /* Skip any zero length word.  Just advance a single position,
 889              then go fetch the next word.  */
 890
 891           if (word_end == word_start)
 892             {
 893               cursor++;
 894               continue;
 895             }
 896
 897           /* This is a genuine, non empty word, so save it as a possible
 898              key.  Then skip over it.  Also, maintain the maximum length of
 899              all words read so far.  It is mandatory to take the maximum
 900              length of all words in the file, without considering if they
 901              are actually kept or rejected, because backward jumps at output
 902              generation time may fall in *any* word.  */
 903
 904           possible_key.start = cursor;
 905           possible_key.size = word_end - word_start;
 906           cursor += possible_key.size;
 907
 908           if (possible_key.size > maximum_word_length)
 909             maximum_word_length = possible_key.size;
 910
 911           /* In input reference mode, update 'line_start' from its previous
 912              value.  Count the lines just in case auto reference mode is
 913              also selected. If it happens that the word just matched is
 914              indeed part of a reference; just ignore it.  */
 915
 916           if (input_reference)
 917             {
 918               while (line_scan < possible_key.start)
 919                 if (*line_scan == '\n')
 920                   {
 921                     total_line_count++;
 922                     line_scan++;
 923                     line_start = line_scan;
 924                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 925                     reference_length = line_scan - line_start;
 926                   }
 927                 else
 928                   line_scan++;
 929               if (line_scan > possible_key.start)
 930                 continue;
 931             }
 932
 933           /* Ignore the word if an 'Ignore words' table exists and if it is
 934              part of it.  Also ignore the word if an 'Only words' table and
 935              if it is *not* part of it.
 936
 937              It is allowed that both tables be used at once, even if this
 938              may look strange for now.  Just ignore a word that would appear
 939              in both.  If regexps are eventually implemented for these
 940              tables, the Ignore table could then reject words that would
 941              have been previously accepted by the Only table.  */
 942
 943           if (ignore_file && search_table (&possible_key, &ignore_table))
 944             continue;
 945           if (only_file && !search_table (&possible_key, &only_table))
 946             continue;
 947
 948           /* A non-empty word has been found.  First of all, insure
 949              proper allocation of the next OCCURS, and make a pointer to
 950              where it will be constructed.  */
 951
 952           if (number_of_occurs[0] == occurs_alloc[0])
 953             occurs_table[0] = x2nrealloc (occurs_table[0],
 954                                           &occurs_alloc[0],
 955                                           sizeof *occurs_table[0]);
 956           occurs_cursor = occurs_table[0] + number_of_occurs[0];
 957
 958           /* Define the reference field, if any.  */
 959
 960           if (auto_reference)
 961             {
 962
 963               /* While auto referencing, update 'line_start' from its
 964                  previous value, counting lines as we go.  If input
 965                  referencing at the same time, 'line_start' has been
 966                  advanced earlier, and the following loop is never really
 967                  executed.  */
 968
 969               while (line_scan < possible_key.start)
 970                 if (*line_scan == '\n')
 971                   {
 972                     total_line_count++;
 973                     line_scan++;
 974                     line_start = line_scan;
 975                     SKIP_NON_WHITE (line_scan, text_buffer->end);
 976                   }
 977                 else
 978                   line_scan++;
 979
 980               occurs_cursor->reference = total_line_count;
 981             }
 982           else if (input_reference)
 983             {
 984
 985               /* If only input referencing, 'line_start' has been computed
 986                  earlier to detect the case the word matched would be part
 987                  of the reference.  The reference position is simply the
 988                  value of 'line_start'.  */
 989
 990               occurs_cursor->reference = line_start - possible_key.start;
 991               if (reference_length > reference_max_width)
 992                 reference_max_width = reference_length;
 993             }
 994
 995           /* Exclude the reference from the context in simple cases.  */
 996
 997           if (input_reference && line_start == context_start)
 998             {
 999               SKIP_NON_WHITE (context_start, context_end);
1000               SKIP_WHITE (context_start, context_end);
1001             }
1002
1003           /* Completes the OCCURS structure.  */
1004
1005           occurs_cursor->key = possible_key;
1006           occurs_cursor->left = context_start - possible_key.start;
1007           occurs_cursor->right = context_end - possible_key.start;
1008           occurs_cursor->file_index = file_index;
1009
1010           number_of_occurs[0]++;
1011         }
1012     }
1013 }
1014
1015 /* Formatting and actual output - service routines.  */
1016
1017 /*-----------------------------------------.
1018 | Prints some NUMBER of spaces on stdout.  |
1019 `-----------------------------------------*/
1020
/* Write NUMBER space characters to standard output.  Nothing is written
   when NUMBER is zero or negative.  */

static void
print_spaces (ptrdiff_t number)
{
  ptrdiff_t remaining = number;

  while (remaining > 0)
    {
      putchar (' ');
      remaining--;
    }
}
1027
1028 /*-------------------------------------.
1029 | Prints the field provided by FIELD.  |
1030 `-------------------------------------*/
1031
1032 static void
1033 print_field (BLOCK field)
1034 {
1035   char *cursor;                 /* Cursor in field to print */
1036   int base;                     /* Base character, without diacritic */
1037   int diacritic;                /* Diacritic code for the character */
1038
1039   /* Whitespace is not really compressed.  Instead, each white space
1040      character (tab, vt, ht etc.) is printed as one single space.  */
1041
1042   for (cursor = field.start; cursor < field.end; cursor++)
1043     {
1044       unsigned char character = *cursor;
1045       if (edited_flag[character])
1046         {
1047
1048           /* First check if this is a diacriticized character.
1049
1050              This works only for TeX.  I do not know how diacriticized
1051              letters work with 'roff'.  Please someone explain it to me!  */
1052
1053           diacritic = todiac (character);
1054           if (diacritic != 0 && output_format == TEX_FORMAT)
1055             {
1056               base = tobase (character);
1057               switch (diacritic)
1058                 {
1059
1060                 case 1:         /* Latin diphthongs */
1061                   switch (base)
1062                     {
1063                     case 'o':
1064                       fputs ("\\oe{}", stdout);
1065                       break;
1066
1067                     case 'O':
1068                       fputs ("\\OE{}", stdout);
1069                       break;
1070
1071                     case 'a':
1072                       fputs ("\\ae{}", stdout);
1073                       break;
1074
1075                     case 'A':
1076                       fputs ("\\AE{}", stdout);
1077                       break;
1078
1079                     default:
1080                       putchar (' ');
1081                     }
1082                   break;
1083
1084                 case 2:         /* Acute accent */
1085                   printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1086                   break;
1087
1088                 case 3:         /* Grave accent */
1089                   printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1090                   break;
1091
1092                 case 4:         /* Circumflex accent */
1093                   printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1094                   break;
1095
1096                 case 5:         /* Diaeresis */
1097                   printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1098                   break;
1099
1100                 case 6:         /* Tilde accent */
1101                   printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1102                   break;
1103
1104                 case 7:         /* Cedilla */
1105                   printf ("\\c{%c}", base);
1106                   break;
1107
1108                 case 8:         /* Small circle beneath */
1109                   switch (base)
1110                     {
1111                     case 'a':
1112                       fputs ("\\aa{}", stdout);
1113                       break;
1114
1115                     case 'A':
1116                       fputs ("\\AA{}", stdout);
1117                       break;
1118
1119                     default:
1120                       putchar (' ');
1121                     }
1122                   break;
1123
1124                 case 9:         /* Strike through */
1125                   switch (base)
1126                     {
1127                     case 'o':
1128                       fputs ("\\o{}", stdout);
1129                       break;
1130
1131                     case 'O':
1132                       fputs ("\\O{}", stdout);
1133                       break;
1134
1135                     default:
1136                       putchar (' ');
1137                     }
1138                   break;
1139                 }
1140             }
1141           else
1142
1143             /* This is not a diacritic character, so handle cases which are
1144                really specific to 'roff' or TeX.  All white space processing
1145                is done as the default case of this switch.  */
1146
1147             switch (character)
1148               {
1149               case '"':
1150                 /* In roff output format, double any quote.  */
1151                 putchar ('"');
1152                 putchar ('"');
1153                 break;
1154
1155               case '$':
1156               case '%':
1157               case '&':
1158               case '#':
1159               case '_':
1160                 /* In TeX output format, precede these with a backslash.  */
1161                 putchar ('\\');
1162                 putchar (character);
1163                 break;
1164
1165               case '{':
1166               case '}':
1167                 /* In TeX output format, precede these with a backslash and
1168                    force mathematical mode.  */
1169                 printf ("$\\%c$", character);
1170                 break;
1171
1172               case '\\':
1173                 /* In TeX output mode, request production of a backslash.  */
1174                 fputs ("\\backslash{}", stdout);
1175                 break;
1176
1177               default:
1178                 /* Any other flagged character produces a single space.  */
1179                 putchar (' ');
1180               }
1181         }
1182       else
1183         putchar (*cursor);
1184     }
1185 }
1186
1187 /* Formatting and actual output - planning routines.  */
1188
1189 /*--------------------------------------------------------------------.
1190 | From information collected from command line options and input file |
1191 | readings, compute and fix some output parameter values.             |
1192 `--------------------------------------------------------------------*/
1193
1194 static void
1195 fix_output_parameters (void)
1196 {
1197   size_t file_index;            /* index in text input file arrays */
1198   intmax_t line_ordinal;        /* line ordinal value for reference */
1199   ptrdiff_t reference_width;    /* width for the whole reference */
1200   int character;                /* character ordinal */
1201   const char *cursor;           /* cursor in some constant strings */
1202
1203   /* In auto reference mode, the maximum width of this field is
1204      precomputed and subtracted from the overall line width.  Add one for
1205      the column which separate the file name from the line number.  */
1206
1207   if (auto_reference)
1208     {
1209       reference_max_width = 0;
1210       for (file_index = 0; file_index < number_input_files; file_index++)
1211         {
1212           line_ordinal = file_line_count[file_index] + 1;
1213           if (file_index > 0)
1214             line_ordinal -= file_line_count[file_index - 1];
1215           char ordinal_string[INT_BUFSIZE_BOUND (intmax_t)];
1216           reference_width = sprintf (ordinal_string, "%"PRIdMAX, line_ordinal);
1217           if (input_file_name[file_index])
1218             reference_width += strlen (input_file_name[file_index]);
1219           if (reference_width > reference_max_width)
1220             reference_max_width = reference_width;
1221         }
1222       reference_max_width++;
1223       reference.start = xmalloc (reference_max_width + 1);
1224     }
1225
1226   /* If the reference appears to the left of the output line, reserve some
1227      space for it right away, including one gap size.  */
1228
1229   if ((auto_reference || input_reference) && !right_reference)
1230     line_width -= reference_max_width + gap_size;
1231   if (line_width < 0)
1232     line_width = 0;
1233
1234   /* The output lines, minimally, will contain from left to right a left
1235      context, a gap, and a keyword followed by the right context with no
1236      special intervening gap.  Half of the line width is dedicated to the
1237      left context and the gap, the other half is dedicated to the keyword
1238      and the right context; these values are computed once and for all here.
1239      There also are tail and head wrap around fields, used when the keyword
1240      is near the beginning or the end of the line, or when some long word
1241      cannot fit in, but leave place from wrapped around shorter words.  The
1242      maximum width of these fields are recomputed separately for each line,
1243      on a case by case basis.  It is worth noting that it cannot happen that
1244      both the tail and head fields are used at once.  */
1245
1246   half_line_width = line_width / 2;
1247   before_max_width = half_line_width - gap_size;
1248   keyafter_max_width = half_line_width;
1249
1250   /* If truncation_string is the empty string, make it NULL to speed up
1251      tests.  In this case, truncation_string_length will never get used, so
1252      there is no need to set it.  */
1253
1254   if (truncation_string && *truncation_string)
1255     truncation_string_length = strlen (truncation_string);
1256   else
1257     truncation_string = NULL;
1258
1259   if (gnu_extensions)
1260     {
1261
1262       /* When flagging truncation at the left of the keyword, the
1263          truncation mark goes at the beginning of the before field,
1264          unless there is a head field, in which case the mark goes at the
1265          left of the head field.  When flagging truncation at the right
1266          of the keyword, the mark goes at the end of the keyafter field,
1267          unless there is a tail field, in which case the mark goes at the
1268          end of the tail field.  Only eight combination cases could arise
1269          for truncation marks:
1270
1271          . None.
1272          . One beginning the before field.
1273          . One beginning the head field.
1274          . One ending the keyafter field.
1275          . One ending the tail field.
1276          . One beginning the before field, another ending the keyafter field.
1277          . One ending the tail field, another beginning the before field.
1278          . One ending the keyafter field, another beginning the head field.
1279
1280          So, there is at most two truncation marks, which could appear both
1281          on the left side of the center of the output line, both on the
1282          right side, or one on either side.  */
1283
1284       before_max_width -= 2 * truncation_string_length;
1285       if (before_max_width < 0)
1286         before_max_width = 0;
1287       keyafter_max_width -= 2 * truncation_string_length;
1288     }
1289   else
1290     {
1291
1292       /* I never figured out exactly how UNIX' ptx plans the output width
1293          of its various fields.  If GNU extensions are disabled, do not
1294          try computing the field widths correctly; instead, use the
1295          following formula, which does not completely imitate UNIX' ptx,
1296          but almost.  */
1297
1298       keyafter_max_width -= 2 * truncation_string_length + 1;
1299     }
1300
1301   /* Compute which characters need special output processing.  Initialize
1302      by flagging any white space character.  Some systems do not consider
1303      form feed as a space character, but we do.  */
1304
1305   for (character = 0; character < CHAR_SET_SIZE; character++)
1306     edited_flag[character] = !! isspace (character);
1307   edited_flag['\f'] = 1;
1308
1309   /* Complete the special character flagging according to selected output
1310      format.  */
1311
1312   switch (output_format)
1313     {
1314     case UNKNOWN_FORMAT:
1315       /* Should never happen.  */
1316
1317     case DUMB_FORMAT:
1318       break;
1319
1320     case ROFF_FORMAT:
1321
1322       /* 'Quote' characters should be doubled.  */
1323
1324       edited_flag['"'] = 1;
1325       break;
1326
1327     case TEX_FORMAT:
1328
1329       /* Various characters need special processing.  */
1330
1331       for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1332         edited_flag[to_uchar (*cursor)] = 1;
1333
1334       /* Any character with 8th bit set will print to a single space, unless
1335          it is diacriticized.  */
1336
1337       for (character = 0200; character < CHAR_SET_SIZE; character++)
1338         edited_flag[character] = todiac (character) != 0;
1339       break;
1340     }
1341 }
1342
1343 /*------------------------------------------------------------------.
1344 | Compute the position and length of all the output fields, given a |
1345 | pointer to some OCCURS.                                           |
1346 `------------------------------------------------------------------*/
1347
1348 static void
1349 define_all_fields (OCCURS *occurs)
1350 {
1351   ptrdiff_t tail_max_width;     /* allowable width of tail field */
1352   ptrdiff_t head_max_width;     /* allowable width of head field */
1353   char *cursor;                 /* running cursor in source text */
1354   char *left_context_start;     /* start of left context */
1355   char *right_context_end;      /* end of right context */
1356   char *left_field_start;       /* conservative start for 'head'/'before' */
1357   const char *file_name;        /* file name for reference */
1358   intmax_t line_ordinal;        /* line ordinal for reference */
1359   const char *buffer_start;     /* start of buffered file for this occurs */
1360   const char *buffer_end;       /* end of buffered file for this occurs */
1361
1362   /* Define 'keyafter', start of left context and end of right context.
1363      'keyafter' starts at the saved position for keyword and extend to the
1364      right from the end of the keyword, eating separators or full words, but
1365      not beyond maximum allowed width for 'keyafter' field or limit for the
1366      right context.  Suffix spaces will be removed afterwards.  */
1367
1368   keyafter.start = occurs->key.start;
1369   keyafter.end = keyafter.start + occurs->key.size;
1370   left_context_start = keyafter.start + occurs->left;
1371   right_context_end = keyafter.start + occurs->right;
1372
1373   buffer_start = text_buffers[occurs->file_index].start;
1374   buffer_end = text_buffers[occurs->file_index].end;
1375
1376   cursor = keyafter.end;
1377   while (cursor < right_context_end
1378          && cursor <= keyafter.start + keyafter_max_width)
1379     {
1380       keyafter.end = cursor;
1381       SKIP_SOMETHING (cursor, right_context_end);
1382     }
1383   if (cursor <= keyafter.start + keyafter_max_width)
1384     keyafter.end = cursor;
1385
1386   keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1387
1388   SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1389
1390   /* When the left context is wide, it might take some time to catch up from
1391      the left context boundary to the beginning of the 'head' or 'before'
1392      fields.  So, in this case, to speed the catchup, we jump back from the
1393      keyword, using some secure distance, possibly falling in the middle of
1394      a word.  A secure backward jump would be at least half the maximum
1395      width of a line, plus the size of the longest word met in the whole
1396      input.  We conclude this backward jump by a skip forward of at least
1397      one word.  In this manner, we should not inadvertently accept only part
1398      of a word.  From the reached point, when it will be time to fix the
1399      beginning of 'head' or 'before' fields, we will skip forward words or
1400      delimiters until we get sufficiently near.  */
1401
1402   if (-occurs->left > half_line_width + maximum_word_length)
1403     {
1404       left_field_start
1405         = keyafter.start - (half_line_width + maximum_word_length);
1406       SKIP_SOMETHING (left_field_start, keyafter.start);
1407     }
1408   else
1409     left_field_start = keyafter.start + occurs->left;
1410
1411   /* 'before' certainly ends at the keyword, but not including separating
1412      spaces.  It starts after than the saved value for the left context, by
1413      advancing it until it falls inside the maximum allowed width for the
1414      before field.  There will be no prefix spaces either.  'before' only
1415      advances by skipping single separators or whole words. */
1416
1417   before.start = left_field_start;
1418   before.end = keyafter.start;
1419   SKIP_WHITE_BACKWARDS (before.end, before.start);
1420
1421   while (before.start + before_max_width < before.end)
1422     SKIP_SOMETHING (before.start, before.end);
1423
1424   if (truncation_string)
1425     {
1426       cursor = before.start;
1427       SKIP_WHITE_BACKWARDS (cursor, buffer_start);
1428       before_truncation = cursor > left_context_start;
1429     }
1430   else
1431     before_truncation = false;
1432
1433   SKIP_WHITE (before.start, buffer_end);
1434
1435   /* The tail could not take more columns than what has been left in the
1436      left context field, and a gap is mandatory.  It starts after the
1437      right context, and does not contain prefixed spaces.  It ends at
1438      the end of line, the end of buffer or when the tail field is full,
1439      whichever comes first.  It cannot contain only part of a word, and
1440      has no suffixed spaces.  */
1441
1442   tail_max_width
1443     = before_max_width - (before.end - before.start) - gap_size;
1444
1445   if (tail_max_width > 0)
1446     {
1447       tail.start = keyafter.end;
1448       SKIP_WHITE (tail.start, buffer_end);
1449
1450       tail.end = tail.start;
1451       cursor = tail.end;
1452       while (cursor < right_context_end
1453              && cursor < tail.start + tail_max_width)
1454         {
1455           tail.end = cursor;
1456           SKIP_SOMETHING (cursor, right_context_end);
1457         }
1458
1459       if (cursor < tail.start + tail_max_width)
1460         tail.end = cursor;
1461
1462       if (tail.end > tail.start)
1463         {
1464           keyafter_truncation = false;
1465           tail_truncation = truncation_string && tail.end < right_context_end;
1466         }
1467       else
1468         tail_truncation = false;
1469
1470       SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1471     }
1472   else
1473     {
1474
1475       /* No place left for a tail field.  */
1476
1477       tail.start = NULL;
1478       tail.end = NULL;
1479       tail_truncation = false;
1480     }
1481
1482   /* 'head' could not take more columns than what has been left in the right
1483      context field, and a gap is mandatory.  It ends before the left
1484      context, and does not contain suffixed spaces.  Its pointer is advanced
1485      until the head field has shrunk to its allowed width.  It cannot
1486      contain only part of a word, and has no suffixed spaces.  */
1487
1488   head_max_width
1489     = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1490
1491   if (head_max_width > 0)
1492     {
1493       head.end = before.start;
1494       SKIP_WHITE_BACKWARDS (head.end, buffer_start);
1495
1496       head.start = left_field_start;
1497       while (head.start + head_max_width < head.end)
1498         SKIP_SOMETHING (head.start, head.end);
1499
1500       if (head.end > head.start)
1501         {
1502           before_truncation = false;
1503           head_truncation = (truncation_string
1504                              && head.start > left_context_start);
1505         }
1506       else
1507         head_truncation = false;
1508
1509       SKIP_WHITE (head.start, head.end);
1510     }
1511   else
1512     {
1513
1514       /* No place left for a head field.  */
1515
1516       head.start = NULL;
1517       head.end = NULL;
1518       head_truncation = false;
1519     }
1520
1521   if (auto_reference)
1522     {
1523
1524       /* Construct the reference text in preallocated space from the file
1525          name and the line number.  Standard input yields an empty file name.
1526          Ensure line numbers are 1 based, even if they are computed 0 based.  */
1527
1528       file_name = input_file_name[occurs->file_index];
1529       if (!file_name)
1530         file_name = "";
1531
1532       line_ordinal = occurs->reference + 1;
1533       if (occurs->file_index > 0)
1534         line_ordinal -= file_line_count[occurs->file_index - 1];
1535
1536       char *file_end = stpcpy (reference.start, file_name);
1537       reference.end = file_end + sprintf (file_end, ":%"PRIdMAX, line_ordinal);
1538     }
1539   else if (input_reference)
1540     {
1541
1542       /* Reference starts at saved position for reference and extends right
1543          until some white space is met.  */
1544
1545       reference.start = keyafter.start + occurs->reference;
1546       reference.end = reference.start;
1547       SKIP_NON_WHITE (reference.end, right_context_end);
1548     }
1549 }
1550
1551 /* Formatting and actual output - control routines.  */
1552
1553 /*----------------------------------------------------------------------.
1554 | Output the current output fields as one line for 'troff' or 'nroff'.  |
1555 `----------------------------------------------------------------------*/
1556
1557 static void
1558 output_one_roff_line (void)
1559 {
1560   /* Output the 'tail' field.  */
1561
1562   printf (".%s \"", macro_name);
1563   print_field (tail);
1564   if (tail_truncation)
1565     fputs (truncation_string, stdout);
1566   putchar ('"');
1567
1568   /* Output the 'before' field.  */
1569
1570   fputs (" \"", stdout);
1571   if (before_truncation)
1572     fputs (truncation_string, stdout);
1573   print_field (before);
1574   putchar ('"');
1575
1576   /* Output the 'keyafter' field.  */
1577
1578   fputs (" \"", stdout);
1579   print_field (keyafter);
1580   if (keyafter_truncation)
1581     fputs (truncation_string, stdout);
1582   putchar ('"');
1583
1584   /* Output the 'head' field.  */
1585
1586   fputs (" \"", stdout);
1587   if (head_truncation)
1588     fputs (truncation_string, stdout);
1589   print_field (head);
1590   putchar ('"');
1591
1592   /* Conditionally output the 'reference' field.  */
1593
1594   if (auto_reference || input_reference)
1595     {
1596       fputs (" \"", stdout);
1597       print_field (reference);
1598       putchar ('"');
1599     }
1600
1601   putchar ('\n');
1602 }
1603
1604 /*---------------------------------------------------------.
1605 | Output the current output fields as one line for 'TeX'.  |
1606 `---------------------------------------------------------*/
1607
1608 static void
1609 output_one_tex_line (void)
1610 {
1611   BLOCK key;                    /* key field, isolated */
1612   BLOCK after;                  /* after field, isolated */
1613   char *cursor;                 /* running cursor in source text */
1614
1615   printf ("\\%s ", macro_name);
1616   putchar ('{');
1617   print_field (tail);
1618   fputs ("}{", stdout);
1619   print_field (before);
1620   fputs ("}{", stdout);
1621   key.start = keyafter.start;
1622   after.end = keyafter.end;
1623   cursor = keyafter.start;
1624   SKIP_SOMETHING (cursor, keyafter.end);
1625   key.end = cursor;
1626   after.start = cursor;
1627   print_field (key);
1628   fputs ("}{", stdout);
1629   print_field (after);
1630   fputs ("}{", stdout);
1631   print_field (head);
1632   putchar ('}');
1633   if (auto_reference || input_reference)
1634     {
1635       putchar ('{');
1636       print_field (reference);
1637       putchar ('}');
1638     }
1639   putchar ('\n');
1640 }
1641
1642 /*-------------------------------------------------------------------.
1643 | Output the current output fields as one line for a dumb terminal.  |
1644 `-------------------------------------------------------------------*/
1645
1646 static void
1647 output_one_dumb_line (void)
1648 {
1649   if (!right_reference)
1650     {
1651       if (auto_reference)
1652         {
1653
1654           /* Output the 'reference' field, in such a way that GNU emacs
1655              next-error will handle it.  The ending colon is taken from the
1656              gap which follows.  */
1657
1658           print_field (reference);
1659           putchar (':');
1660           print_spaces (reference_max_width
1661                         + gap_size
1662                         - (reference.end - reference.start)
1663                         - 1);
1664         }
1665       else
1666         {
1667
1668           /* Output the 'reference' field and its following gap.  */
1669
1670           print_field (reference);
1671           print_spaces (reference_max_width
1672                         + gap_size
1673                         - (reference.end - reference.start));
1674         }
1675     }
1676
1677   if (tail.start < tail.end)
1678     {
1679       /* Output the 'tail' field.  */
1680
1681       print_field (tail);
1682       if (tail_truncation)
1683         fputs (truncation_string, stdout);
1684
1685       print_spaces (half_line_width - gap_size
1686                     - (before.end - before.start)
1687                     - (before_truncation ? truncation_string_length : 0)
1688                     - (tail.end - tail.start)
1689                     - (tail_truncation ? truncation_string_length : 0));
1690     }
1691   else
1692     print_spaces (half_line_width - gap_size
1693                   - (before.end - before.start)
1694                   - (before_truncation ? truncation_string_length : 0));
1695
1696   /* Output the 'before' field.  */
1697
1698   if (before_truncation)
1699     fputs (truncation_string, stdout);
1700   print_field (before);
1701
1702   print_spaces (gap_size);
1703
1704   /* Output the 'keyafter' field.  */
1705
1706   print_field (keyafter);
1707   if (keyafter_truncation)
1708     fputs (truncation_string, stdout);
1709
1710   if (head.start < head.end)
1711     {
1712       /* Output the 'head' field.  */
1713
1714       print_spaces (half_line_width
1715                     - (keyafter.end - keyafter.start)
1716                     - (keyafter_truncation ? truncation_string_length : 0)
1717                     - (head.end - head.start)
1718                     - (head_truncation ? truncation_string_length : 0));
1719       if (head_truncation)
1720         fputs (truncation_string, stdout);
1721       print_field (head);
1722     }
1723   else
1724
1725     if ((auto_reference || input_reference) && right_reference)
1726       print_spaces (half_line_width
1727                     - (keyafter.end - keyafter.start)
1728                     - (keyafter_truncation ? truncation_string_length : 0));
1729
1730   if ((auto_reference || input_reference) && right_reference)
1731     {
1732       /* Output the 'reference' field.  */
1733
1734       print_spaces (gap_size);
1735       print_field (reference);
1736     }
1737
1738   putchar ('\n');
1739 }
1740
1741 /*------------------------------------------------------------------------.
1742 | Scan the whole occurs table and, for each entry, output one line in the |
1743 | appropriate format.                                                     |
1744 `------------------------------------------------------------------------*/
1745
1746 static void
1747 generate_all_output (void)
1748 {
1749   ptrdiff_t occurs_index;       /* index of keyword entry being processed */
1750   OCCURS *occurs_cursor;        /* current keyword entry being processed */
1751
1752   /* The following assignments are useful to provide default values in case
1753      line contexts or references are not used, in which case these variables
1754      would never be computed.  */
1755
1756   tail.start = NULL;
1757   tail.end = NULL;
1758   tail_truncation = false;
1759
1760   head.start = NULL;
1761   head.end = NULL;
1762   head_truncation = false;
1763
1764   /* Loop over all keyword occurrences.  */
1765
1766   occurs_cursor = occurs_table[0];
1767
1768   for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1769     {
1770       /* Compute the exact size of every field and whenever truncation flags
1771          are present or not.  */
1772
1773       define_all_fields (occurs_cursor);
1774
1775       /* Produce one output line according to selected format.  */
1776
1777       switch (output_format)
1778         {
1779         case UNKNOWN_FORMAT:
1780           /* Should never happen.  */
1781
1782         case DUMB_FORMAT:
1783           output_one_dumb_line ();
1784           break;
1785
1786         case ROFF_FORMAT:
1787           output_one_roff_line ();
1788           break;
1789
1790         case TEX_FORMAT:
1791           output_one_tex_line ();
1792           break;
1793         }
1794
1795       /* Advance the cursor into the occurs table.  */
1796
1797       occurs_cursor++;
1798     }
1799 }
1800
1801 /* Option decoding and main program.  */
1802
1803 /*------------------------------------------------------.
1804 | Print program identification and options, then exit.  |
1805 `------------------------------------------------------*/
1806
1807 void
1808 usage (int status)
1809 {
1810   if (status != EXIT_SUCCESS)
1811     emit_try_help ();
1812   else
1813     {
1814       printf (_("\
1815 Usage: %s [OPTION]... [INPUT]...   (without -G)\n\
1816   or:  %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1817               program_name, program_name);
1818       fputs (_("\
1819 Output a permuted index, including context, of the words in the input files.\n\
1820 "), stdout);
1821
1822       emit_stdin_note ();
1823       emit_mandatory_arg_note ();
1824
1825       fputs (_("\
1826   -A, --auto-reference           output automatically generated references\n\
1827   -G, --traditional              behave more like System V 'ptx'\n\
1828 "), stdout);
1829       fputs (_("\
1830   -F, --flag-truncation=STRING   use STRING for flagging line truncations.\n\
1831                                  The default is '/'\n\
1832 "), stdout);
1833       fputs (_("\
1834   -M, --macro-name=STRING        macro name to use instead of 'xx'\n\
1835   -O, --format=roff              generate output as roff directives\n\
1836   -R, --right-side-refs          put references at right, not counted in -w\n\
1837   -S, --sentence-regexp=REGEXP   for end of lines or end of sentences\n\
1838   -T, --format=tex               generate output as TeX directives\n\
1839 "), stdout);
1840       fputs (_("\
1841   -W, --word-regexp=REGEXP       use REGEXP to match each keyword\n\
1842   -b, --break-file=FILE          word break characters in this FILE\n\
1843   -f, --ignore-case              fold lower case to upper case for sorting\n\
1844   -g, --gap-size=NUMBER          gap size in columns between output fields\n\
1845   -i, --ignore-file=FILE         read ignore word list from FILE\n\
1846   -o, --only-file=FILE           read only word list from this FILE\n\
1847 "), stdout);
1848       fputs (_("\
1849   -r, --references               first field of each line is a reference\n\
1850   -t, --typeset-mode               - not implemented -\n\
1851   -w, --width=NUMBER             output width in columns, reference excluded\n\
1852 "), stdout);
1853       fputs (HELP_OPTION_DESCRIPTION, stdout);
1854       fputs (VERSION_OPTION_DESCRIPTION, stdout);
1855       emit_ancillary_info (PROGRAM_NAME);
1856     }
1857   exit (status);
1858 }
1859
1860 /*----------------------------------------------------------------------.
1861 | Main program.  Decode ARGC arguments passed through the ARGV array of |
1862 | strings, then launch execution.                                       |
1863 `----------------------------------------------------------------------*/
1864
1865 /* Long options equivalences.  */
1866 static struct option const long_options[] =
1867 {
1868   {"auto-reference", no_argument, NULL, 'A'},
1869   {"break-file", required_argument, NULL, 'b'},
1870   {"flag-truncation", required_argument, NULL, 'F'},
1871   {"ignore-case", no_argument, NULL, 'f'},
1872   {"gap-size", required_argument, NULL, 'g'},
1873   {"ignore-file", required_argument, NULL, 'i'},
1874   {"macro-name", required_argument, NULL, 'M'},
1875   {"only-file", required_argument, NULL, 'o'},
1876   {"references", no_argument, NULL, 'r'},
1877   {"right-side-refs", no_argument, NULL, 'R'},
1878   {"format", required_argument, NULL, 10},
1879   {"sentence-regexp", required_argument, NULL, 'S'},
1880   {"traditional", no_argument, NULL, 'G'},
1881   {"typeset-mode", no_argument, NULL, 't'},
1882   {"width", required_argument, NULL, 'w'},
1883   {"word-regexp", required_argument, NULL, 'W'},
1884   {GETOPT_HELP_OPTION_DECL},
1885   {GETOPT_VERSION_OPTION_DECL},
1886   {NULL, 0, NULL, 0},
1887 };
1888
1889 static char const* const format_args[] =
1890 {
1891   "roff", "tex", NULL
1892 };
1893
1894 static enum Format const format_vals[] =
1895 {
1896   ROFF_FORMAT, TEX_FORMAT
1897 };
1898
1899 int
1900 main (int argc, char **argv)
1901 {
1902   int optchar;                  /* argument character */
1903   int file_index;               /* index in text input file arrays */
1904
1905   /* Decode program options.  */
1906
1907   initialize_main (&argc, &argv);
1908   set_program_name (argv[0]);
1909   setlocale (LC_ALL, "");
1910   bindtextdomain (PACKAGE, LOCALEDIR);
1911   textdomain (PACKAGE);
1912
1913   atexit (close_stdout);
1914
1915 #if HAVE_SETCHRCLASS
1916   setchrclass (NULL);
1917 #endif
1918
1919   while (optchar = getopt_long (argc, argv, "AF:GM:ORS:TW:b:i:fg:o:trw:",
1920                                 long_options, NULL),
1921          optchar != EOF)
1922     {
1923       switch (optchar)
1924         {
1925         default:
1926           usage (EXIT_FAILURE);
1927
1928         case 'G':
1929           gnu_extensions = false;
1930           break;
1931
1932         case 'b':
1933           break_file = optarg;
1934           break;
1935
1936         case 'f':
1937           ignore_case = true;
1938           break;
1939
1940         case 'g':
1941           {
1942             intmax_t tmp;
1943             if (! (xstrtoimax (optarg, NULL, 0, &tmp, "") == LONGINT_OK
1944                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1945               die (EXIT_FAILURE, 0, _("invalid gap width: %s"),
1946                    quote (optarg));
1947             gap_size = tmp;
1948             break;
1949           }
1950
1951         case 'i':
1952           ignore_file = optarg;
1953           break;
1954
1955         case 'o':
1956           only_file = optarg;
1957           break;
1958
1959         case 'r':
1960           input_reference = true;
1961           break;
1962
1963         case 't':
1964           /* Yet to understand...  */
1965           break;
1966
1967         case 'w':
1968           {
1969             intmax_t tmp;
1970             if (! (xstrtoimax (optarg, NULL, 0, &tmp, "") == LONGINT_OK
1971                    && 0 < tmp && tmp <= PTRDIFF_MAX))
1972               die (EXIT_FAILURE, 0, _("invalid line width: %s"),
1973                    quote (optarg));
1974             line_width = tmp;
1975             break;
1976           }
1977
1978         case 'A':
1979           auto_reference = true;
1980           break;
1981
1982         case 'F':
1983           truncation_string = copy_unescaped_string (optarg);
1984           break;
1985
1986         case 'M':
1987           macro_name = optarg;
1988           break;
1989
1990         case 'O':
1991           output_format = ROFF_FORMAT;
1992           break;
1993
1994         case 'R':
1995           right_reference = true;
1996           break;
1997
1998         case 'S':
1999           context_regex.string = copy_unescaped_string (optarg);
2000           break;
2001
2002         case 'T':
2003           output_format = TEX_FORMAT;
2004           break;
2005
2006         case 'W':
2007           word_regex.string = copy_unescaped_string (optarg);
2008           if (!*word_regex.string)
2009             word_regex.string = NULL;
2010           break;
2011
2012         case 10:
2013           output_format = XARGMATCH ("--format", optarg,
2014                                      format_args, format_vals);
2015           break;
2016
2017         case_GETOPT_HELP_CHAR;
2018
2019         case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2020         }
2021     }
2022
2023   /* Process remaining arguments.  If GNU extensions are enabled, process
2024      all arguments as input parameters.  If disabled, accept at most two
2025      arguments, the second of which is an output parameter.  */
2026
2027   if (optind == argc)
2028     {
2029
2030       /* No more argument simply means: read standard input.  */
2031
2032       input_file_name = xmalloc (sizeof *input_file_name);
2033       file_line_count = xmalloc (sizeof *file_line_count);
2034       text_buffers =    xmalloc (sizeof *text_buffers);
2035       number_input_files = 1;
2036       input_file_name[0] = NULL;
2037     }
2038   else if (gnu_extensions)
2039     {
2040       number_input_files = argc - optind;
2041       input_file_name = xnmalloc (number_input_files, sizeof *input_file_name);
2042       file_line_count = xnmalloc (number_input_files, sizeof *file_line_count);
2043       text_buffers    = xnmalloc (number_input_files, sizeof *text_buffers);
2044
2045       for (file_index = 0; file_index < number_input_files; file_index++)
2046         {
2047           if (!*argv[optind] || STREQ (argv[optind], "-"))
2048             input_file_name[file_index] = NULL;
2049           else
2050             input_file_name[file_index] = argv[optind];
2051           optind++;
2052         }
2053     }
2054   else
2055     {
2056
2057       /* There is one necessary input file.  */
2058
2059       number_input_files = 1;
2060       input_file_name = xmalloc (sizeof *input_file_name);
2061       file_line_count = xmalloc (sizeof *file_line_count);
2062       text_buffers    = xmalloc (sizeof *text_buffers);
2063       if (!*argv[optind] || STREQ (argv[optind], "-"))
2064         input_file_name[0] = NULL;
2065       else
2066         input_file_name[0] = argv[optind];
2067       optind++;
2068
2069       /* Redirect standard output, only if requested.  */
2070
2071       if (optind < argc)
2072         {
2073           if (! freopen (argv[optind], "w", stdout))
2074             die (EXIT_FAILURE, errno, "%s", quotef (argv[optind]));
2075           optind++;
2076         }
2077
2078       /* Diagnose any other argument as an error.  */
2079
2080       if (optind < argc)
2081         {
2082           error (0, 0, _("extra operand %s"), quote (argv[optind]));
2083           usage (EXIT_FAILURE);
2084         }
2085     }
2086
2087   /* If the output format has not been explicitly selected, choose dumb
2088      terminal format if GNU extensions are enabled, else 'roff' format.  */
2089
2090   if (output_format == UNKNOWN_FORMAT)
2091     output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2092
2093   /* Initialize the main tables.  */
2094
2095   initialize_regex ();
2096
2097   /* Read 'Break character' file, if any.  */
2098
2099   if (break_file)
2100     digest_break_file (break_file);
2101
2102   /* Read 'Ignore words' file and 'Only words' files, if any.  If any of
2103      these files is empty, reset the name of the file to NULL, to avoid
2104      unnecessary calls to search_table. */
2105
2106   if (ignore_file)
2107     {
2108       digest_word_file (ignore_file, &ignore_table);
2109       if (ignore_table.length == 0)
2110         ignore_file = NULL;
2111     }
2112
2113   if (only_file)
2114     {
2115       digest_word_file (only_file, &only_table);
2116       if (only_table.length == 0)
2117         only_file = NULL;
2118     }
2119
2120   /* Prepare to study all the input files.  */
2121
2122   number_of_occurs[0] = 0;
2123   total_line_count = 0;
2124   maximum_word_length = 0;
2125   reference_max_width = 0;
2126
2127   for (file_index = 0; file_index < number_input_files; file_index++)
2128     {
2129       BLOCK *text_buffer = text_buffers + file_index;
2130
2131       /* Read the file in core, then study it.  */
2132
2133       swallow_file_in_memory (input_file_name[file_index], text_buffer);
2134       find_occurs_in_text (file_index);
2135
2136       /* Maintain for each file how many lines has been read so far when its
2137          end is reached.  Incrementing the count first is a simple kludge to
2138          handle a possible incomplete line at end of file.  */
2139
2140       total_line_count++;
2141       file_line_count[file_index] = total_line_count;
2142     }
2143
2144   /* Do the output process phase.  */
2145
2146   sort_found_occurs ();
2147   fix_output_parameters ();
2148   generate_all_output ();
2149
2150   /* All done.  */
2151
2152   return EXIT_SUCCESS;
2153 }