* src/dd.c (flags): noatime and nofollow now depend on
[coreutils/bo.git] / src / ptx.c
blob9c35596fc05160fc717a7e1abea0860268318c50
1 /* Permuted index for GNU, with keywords in their context.
2 Copyright (C) 1990, 1991, 1993, 1998-2006 Free Software Foundation, Inc.
3 François Pinard <pinard@iro.umontreal.ca>, 1988.
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
10 This program is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 François Pinard <pinard@iro.umontreal.ca> */
21 #include <config.h>
23 #include <stdio.h>
24 #include <getopt.h>
25 #include <sys/types.h>
26 #include "system.h"
27 #include "argmatch.h"
28 #include "diacrit.h"
29 #include "error.h"
30 #include "quote.h"
31 #include "quotearg.h"
32 #include "regex.h"
33 #include "xstrtol.h"
35 /* The official name of this program (e.g., no `g' prefix). */
36 #define PROGRAM_NAME "ptx"
38 /* Note to translator: Please translate "F. Pinard" to "François
39 Pinard" if "ç" (c-with-cedilla) is available in the
40 translation's character set and encoding. */
41 #define AUTHORS _("F. Pinard")
43 /* Number of possible characters in a byte. */
44 #define CHAR_SET_SIZE 256
46 #define ISODIGIT(C) ((C) >= '0' && (C) <= '7')
47 #define HEXTOBIN(C) ((C) >= 'a' && (C) <= 'f' ? (C)-'a'+10 \
48 : (C) >= 'A' && (C) <= 'F' ? (C)-'A'+10 : (C)-'0')
49 #define OCTTOBIN(C) ((C) - '0')
51 /* Debugging the memory allocator. */
53 #if WITH_DMALLOC
54 # define MALLOC_FUNC_CHECK 1
55 # include <dmalloc.h>
56 #endif
58 /* Global definitions. */
60 /* FIXME: There are many unchecked integer overflows in this file,
61 that will cause this command to misbehave given large inputs or
62 options. Many of the "int" values below should be "size_t" or
63 something else like that. */
65 /* Reallocation step when swallowing non regular files. The value is not
66 the actual reallocation step, but its base two logarithm. */
67 #define SWALLOW_REALLOC_LOG 12
69 /* Imported from "regex.c". */
70 #define Sword 1
72 /* The name this program was run with. */
73 char *program_name;
75 /* Program options. */
77 enum Format
79 UNKNOWN_FORMAT, /* output format still unknown */
80 DUMB_FORMAT, /* output for a dumb terminal */
81 ROFF_FORMAT, /* output for `troff' or `nroff' */
82 TEX_FORMAT /* output for `TeX' or `LaTeX' */
85 static bool gnu_extensions = true; /* trigger all GNU extensions */
86 static bool auto_reference = false; /* refs are `file_name:line_number:' */
87 static bool input_reference = false; /* refs at beginning of input lines */
88 static bool right_reference = false; /* output refs after right context */
89 static int line_width = 72; /* output line width in characters */
90 static int gap_size = 3; /* number of spaces between output fields */
91 static const char *truncation_string = "/";
92 /* string used to mark line truncations */
93 static const char *macro_name = "xx"; /* macro name for roff or TeX output */
94 static enum Format output_format = UNKNOWN_FORMAT;
95 /* output format */
97 static bool ignore_case = false; /* fold lower to upper for sorting */
98 static const char *break_file = NULL; /* name of the `Break characters' file */
99 static const char *only_file = NULL; /* name of the `Only words' file */
100 static const char *ignore_file = NULL; /* name of the `Ignore words' file */
102 /* Options that use regular expressions. */
103 struct regex_data
105 /* The original regular expression, as a string. */
106 char const *string;
108 /* The compiled regular expression, and its fastmap. */
109 struct re_pattern_buffer pattern;
110 char fastmap[UCHAR_MAX + 1];
113 static struct regex_data context_regex; /* end of context */
114 static struct regex_data word_regex; /* keyword */
116 /* A BLOCK delimit a region in memory of arbitrary size, like the copy of a
117 whole file. A WORD is something smaller, its length should fit in a
118 short integer. A WORD_TABLE may contain several WORDs. */
120 typedef struct
122 char *start; /* pointer to beginning of region */
123 char *end; /* pointer to end + 1 of region */
125 BLOCK;
127 typedef struct
129 char *start; /* pointer to beginning of region */
130 short int size; /* length of the region */
132 WORD;
134 typedef struct
136 WORD *start; /* array of WORDs */
137 size_t alloc; /* allocated length */
138 size_t length; /* number of used entries */
140 WORD_TABLE;
142 /* Pattern description tables. */
144 /* For each character, provide its folded equivalent. */
145 static unsigned char folded_chars[CHAR_SET_SIZE];
147 /* End of context pattern register indices. */
148 static struct re_registers context_regs;
150 /* Keyword pattern register indices. */
151 static struct re_registers word_regs;
153 /* A word characters fastmap is used only when no word regexp has been
154 provided. A word is then made up of a sequence of one or more characters
155 allowed by the fastmap. Contains !0 if character allowed in word. Not
156 only this is faster in most cases, but it simplifies the implementation
157 of the Break files. */
158 static char word_fastmap[CHAR_SET_SIZE];
160 /* Maximum length of any word read. */
161 static int maximum_word_length;
163 /* Maximum width of any reference used. */
164 static int reference_max_width;
166 /* Ignore and Only word tables. */
168 static WORD_TABLE ignore_table; /* table of words to ignore */
169 static WORD_TABLE only_table; /* table of words to select */
171 /* Source text table, and scanning macros. */
173 static int number_input_files; /* number of text input files */
174 static int total_line_count; /* total number of lines seen so far */
175 static const char **input_file_name; /* array of text input file names */
176 static int *file_line_count; /* array of `total_line_count' values at end */
178 static BLOCK text_buffer; /* file to study */
180 /* SKIP_NON_WHITE used only for getting or skipping the reference. */
182 #define SKIP_NON_WHITE(cursor, limit) \
183 while (cursor < limit && ! isspace (to_uchar (*cursor))) \
184 cursor++
186 #define SKIP_WHITE(cursor, limit) \
187 while (cursor < limit && isspace (to_uchar (*cursor))) \
188 cursor++
190 #define SKIP_WHITE_BACKWARDS(cursor, start) \
191 while (cursor > start && isspace (to_uchar (cursor[-1]))) \
192 cursor--
194 #define SKIP_SOMETHING(cursor, limit) \
195 if (word_regex.string) \
197 regoff_t count; \
198 count = re_match (&word_regex.pattern, cursor, limit - cursor, 0, NULL); \
199 if (count == -2) \
200 matcher_error (); \
201 cursor += count == -1 ? 1 : count; \
203 else if (word_fastmap[to_uchar (*cursor)]) \
204 while (cursor < limit && word_fastmap[to_uchar (*cursor)]) \
205 cursor++; \
206 else \
207 cursor++
209 /* Occurrences table.
211 The `keyword' pointer provides the central word, which is surrounded
212 by a left context and a right context. The `keyword' and `length'
213 field allow full 8-bit characters keys, even including NULs. At other
214 places in this program, the name `keyafter' refers to the keyword
215 followed by its right context.
217 The left context does not extend, towards the beginning of the file,
218 further than a distance given by the `left' value. This value is
219 relative to the keyword beginning, it is usually negative. This
220 insures that, except for white space, we will never have to backward
221 scan the source text, when it is time to generate the final output
222 lines.
224 The right context, indirectly attainable through the keyword end, does
225 not extend, towards the end of the file, further than a distance given
226 by the `right' value. This value is relative to the keyword
227 beginning, it is usually positive.
229 When automatic references are used, the `reference' value is the
230 overall line number in all input files read so far, in this case, it
231 is of type (int). When input references are used, the `reference'
232 value indicates the distance between the keyword beginning and the
233 start of the reference field, it is of type (DELTA) and usually
234 negative. */
236 typedef short int DELTA; /* to hold displacement within one context */
238 typedef struct
240 WORD key; /* description of the keyword */
241 DELTA left; /* distance to left context start */
242 DELTA right; /* distance to right context end */
243 int reference; /* reference descriptor */
245 OCCURS;
247 /* The various OCCURS tables are indexed by the language. But the time
248 being, there is no such multiple language support. */
250 static OCCURS *occurs_table[1]; /* all words retained from the read text */
251 static size_t occurs_alloc[1]; /* allocated size of occurs_table */
252 static size_t number_of_occurs[1]; /* number of used slots in occurs_table */
255 /* Communication among output routines. */
257 /* Indicate if special output processing is requested for each character. */
258 static char edited_flag[CHAR_SET_SIZE];
260 static int half_line_width; /* half of line width, reference excluded */
261 static int before_max_width; /* maximum width of before field */
262 static int keyafter_max_width; /* maximum width of keyword-and-after field */
263 static int truncation_string_length;/* length of string used to flag truncation */
265 /* When context is limited by lines, wraparound may happen on final output:
266 the `head' pointer gives access to some supplementary left context which
267 will be seen at the end of the output line, the `tail' pointer gives
268 access to some supplementary right context which will be seen at the
269 beginning of the output line. */
271 static BLOCK tail; /* tail field */
272 static int tail_truncation; /* flag truncation after the tail field */
274 static BLOCK before; /* before field */
275 static int before_truncation; /* flag truncation before the before field */
277 static BLOCK keyafter; /* keyword-and-after field */
278 static int keyafter_truncation; /* flag truncation after the keyafter field */
280 static BLOCK head; /* head field */
281 static int head_truncation; /* flag truncation before the head field */
283 static BLOCK reference; /* reference field for input reference mode */
285 /* Miscellaneous routines. */
287 /* Diagnose an error in the regular expression matcher. Then exit. */
289 static void ATTRIBUTE_NORETURN
290 matcher_error (void)
292 error (0, errno, _("error in regular expression matcher"));
293 exit (EXIT_FAILURE);
296 /*------------------------------------------------------.
297 | Duplicate string STRING, while evaluating \-escapes. |
298 `------------------------------------------------------*/
300 /* Loosely adapted from GNU sh-utils printf.c code. */
302 static char *
303 copy_unescaped_string (const char *string)
305 char *result; /* allocated result */
306 char *cursor; /* cursor in result */
307 int value; /* value of \nnn escape */
308 int length; /* length of \nnn escape */
310 result = xmalloc (strlen (string) + 1);
311 cursor = result;
313 while (*string)
314 if (*string == '\\')
316 string++;
317 switch (*string)
319 case 'x': /* \xhhh escape, 3 chars maximum */
320 value = 0;
321 for (length = 0, string++;
322 length < 3 && isxdigit (to_uchar (*string));
323 length++, string++)
324 value = value * 16 + HEXTOBIN (*string);
325 if (length == 0)
327 *cursor++ = '\\';
328 *cursor++ = 'x';
330 else
331 *cursor++ = value;
332 break;
334 case '0': /* \0ooo escape, 3 chars maximum */
335 value = 0;
336 for (length = 0, string++;
337 length < 3 && ISODIGIT (*string);
338 length++, string++)
339 value = value * 8 + OCTTOBIN (*string);
340 *cursor++ = value;
341 break;
343 case 'a': /* alert */
344 #if __STDC__
345 *cursor++ = '\a';
346 #else
347 *cursor++ = 7;
348 #endif
349 string++;
350 break;
352 case 'b': /* backspace */
353 *cursor++ = '\b';
354 string++;
355 break;
357 case 'c': /* cancel the rest of the output */
358 while (*string)
359 string++;
360 break;
362 case 'f': /* form feed */
363 *cursor++ = '\f';
364 string++;
365 break;
367 case 'n': /* new line */
368 *cursor++ = '\n';
369 string++;
370 break;
372 case 'r': /* carriage return */
373 *cursor++ = '\r';
374 string++;
375 break;
377 case 't': /* horizontal tab */
378 *cursor++ = '\t';
379 string++;
380 break;
382 case 'v': /* vertical tab */
383 #if __STDC__
384 *cursor++ = '\v';
385 #else
386 *cursor++ = 11;
387 #endif
388 string++;
389 break;
391 default:
392 *cursor++ = '\\';
393 *cursor++ = *string++;
394 break;
397 else
398 *cursor++ = *string++;
400 *cursor = '\0';
401 return result;
404 /*--------------------------------------------------------------------------.
405 | Compile the regex represented by REGEX, diagnose and abort if any error. |
406 `--------------------------------------------------------------------------*/
408 static void
409 compile_regex (struct regex_data *regex)
411 struct re_pattern_buffer *pattern = &regex->pattern;
412 char const *string = regex->string;
413 char const *message;
415 pattern->buffer = NULL;
416 pattern->allocated = 0;
417 pattern->fastmap = regex->fastmap;
418 pattern->translate = ignore_case ? folded_chars : NULL;
420 message = re_compile_pattern (string, strlen (string), pattern);
421 if (message)
422 error (EXIT_FAILURE, 0, _("%s (for regexp %s)"), message, quote (string));
424 /* The fastmap should be compiled before `re_match'. The following
425 call is not mandatory, because `re_search' is always called sooner,
426 and it compiles the fastmap if this has not been done yet. */
428 re_compile_fastmap (pattern);
431 /*------------------------------------------------------------------------.
432 | This will initialize various tables for pattern match and compiles some |
433 | regexps. |
434 `------------------------------------------------------------------------*/
436 static void
437 initialize_regex (void)
439 int character; /* character value */
441 /* Initialize the case folding table. */
443 if (ignore_case)
444 for (character = 0; character < CHAR_SET_SIZE; character++)
445 folded_chars[character] = toupper (character);
447 /* Unless the user already provided a description of the end of line or
448 end of sentence sequence, select an end of line sequence to compile.
449 If the user provided an empty definition, thus disabling end of line
450 or sentence feature, make it NULL to speed up tests. If GNU
451 extensions are enabled, use end of sentence like in GNU emacs. If
452 disabled, use end of lines. */
454 if (context_regex.string)
456 if (!*context_regex.string)
457 context_regex.string = NULL;
459 else if (gnu_extensions & !input_reference)
460 context_regex.string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
461 else
462 context_regex.string = "\n";
464 if (context_regex.string)
465 compile_regex (&context_regex);
467 /* If the user has already provided a non-empty regexp to describe
468 words, compile it. Else, unless this has already been done through
469 a user provided Break character file, construct a fastmap of
470 characters that may appear in a word. If GNU extensions enabled,
471 include only letters of the underlying character set. If disabled,
472 include almost everything, even punctuations; stop only on white
473 space. */
475 if (word_regex.string)
476 compile_regex (&word_regex);
477 else if (!break_file)
479 if (gnu_extensions)
482 /* Simulate \w+. */
484 for (character = 0; character < CHAR_SET_SIZE; character++)
485 word_fastmap[character] = !! isalpha (character);
487 else
490 /* Simulate [^ \t\n]+. */
492 memset (word_fastmap, 1, CHAR_SET_SIZE);
493 word_fastmap[' '] = 0;
494 word_fastmap['\t'] = 0;
495 word_fastmap['\n'] = 0;
500 /*------------------------------------------------------------------------.
501 | This routine will attempt to swallow a whole file name FILE_NAME into a |
502 | contiguous region of memory and return a description of it into BLOCK. |
503 | Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
505 | Previously, in some cases, white space compression was attempted while |
506 | inputting text. This was defeating some regexps like default end of |
507 | sentence, which checks for two consecutive spaces. If white space |
508 | compression is ever reinstated, it should be in output routines. |
509 `------------------------------------------------------------------------*/
511 static void
512 swallow_file_in_memory (const char *file_name, BLOCK *block)
514 int file_handle; /* file descriptor number */
515 struct stat stat_block; /* stat block for file */
516 size_t allocated_length; /* allocated length of memory buffer */
517 size_t used_length; /* used length in memory buffer */
518 int read_length; /* number of character gotten on last read */
520 /* As special cases, a file name which is NULL or "-" indicates standard
521 input, which is already opened. In all other cases, open the file from
522 its name. */
523 bool using_stdin = !file_name || !*file_name || STREQ (file_name, "-");
524 if (using_stdin)
525 file_handle = STDIN_FILENO;
526 else
527 if ((file_handle = open (file_name, O_RDONLY)) < 0)
528 error (EXIT_FAILURE, errno, "%s", file_name);
530 /* If the file is a plain, regular file, allocate the memory buffer all at
531 once and swallow the file in one blow. In other cases, read the file
532 repeatedly in smaller chunks until we have it all, reallocating memory
533 once in a while, as we go. */
535 if (fstat (file_handle, &stat_block) < 0)
536 error (EXIT_FAILURE, errno, "%s", file_name);
538 if (S_ISREG (stat_block.st_mode))
540 size_t in_memory_size;
542 block->start = xmalloc ((size_t) stat_block.st_size);
544 if ((in_memory_size = read (file_handle,
545 block->start, (size_t) stat_block.st_size))
546 != stat_block.st_size)
548 #if MSDOS
549 /* On MSDOS, in memory size may be smaller than the file
550 size, because of end of line conversions. But it can
551 never be smaller than half the file size, because the
552 minimum is when all lines are empty and terminated by
553 CR+LF. */
554 if (in_memory_size != (size_t)-1
555 && in_memory_size >= stat_block.st_size / 2)
556 block->start = xrealloc (block->start, in_memory_size);
557 else
558 #endif /* not MSDOS */
560 error (EXIT_FAILURE, errno, "%s", file_name);
562 block->end = block->start + in_memory_size;
564 else
566 block->start = xmalloc ((size_t) 1 << SWALLOW_REALLOC_LOG);
567 used_length = 0;
568 allocated_length = (1 << SWALLOW_REALLOC_LOG);
570 while (read_length = read (file_handle,
571 block->start + used_length,
572 allocated_length - used_length),
573 read_length > 0)
575 used_length += read_length;
576 if (used_length == allocated_length)
578 allocated_length += (1 << SWALLOW_REALLOC_LOG);
579 block->start
580 = xrealloc (block->start, allocated_length);
584 if (read_length < 0)
585 error (EXIT_FAILURE, errno, "%s", file_name);
587 block->end = block->start + used_length;
590 /* Close the file, but only if it was not the standard input. */
592 if (! using_stdin && close (file_handle) != 0)
593 error (EXIT_FAILURE, errno, "%s", file_name);
596 /* Sort and search routines. */
598 /*--------------------------------------------------------------------------.
599 | Compare two words, FIRST and SECOND, and return 0 if they are identical. |
600 | Return less than 0 if the first word goes before the second; return |
601 | greater than 0 if the first word goes after the second. |
603 | If a word is indeed a prefix of the other, the shorter should go first. |
604 `--------------------------------------------------------------------------*/
606 static int
607 compare_words (const void *void_first, const void *void_second)
609 #define first ((const WORD *) void_first)
610 #define second ((const WORD *) void_second)
611 int length; /* minimum of two lengths */
612 int counter; /* cursor in words */
613 int value; /* value of comparison */
615 length = first->size < second->size ? first->size : second->size;
617 if (ignore_case)
619 for (counter = 0; counter < length; counter++)
621 value = (folded_chars [to_uchar (first->start[counter])]
622 - folded_chars [to_uchar (second->start[counter])]);
623 if (value != 0)
624 return value;
627 else
629 for (counter = 0; counter < length; counter++)
631 value = (to_uchar (first->start[counter])
632 - to_uchar (second->start[counter]));
633 if (value != 0)
634 return value;
638 return first->size - second->size;
639 #undef first
640 #undef second
643 /*-----------------------------------------------------------------------.
644 | Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
645 | go first. In case of a tie, preserve the original order through a |
646 | pointer comparison. |
647 `-----------------------------------------------------------------------*/
649 static int
650 compare_occurs (const void *void_first, const void *void_second)
652 #define first ((const OCCURS *) void_first)
653 #define second ((const OCCURS *) void_second)
654 int value;
656 value = compare_words (&first->key, &second->key);
657 return value == 0 ? first->key.start - second->key.start : value;
658 #undef first
659 #undef second
662 /*------------------------------------------------------------.
663 | Return !0 if WORD appears in TABLE. Uses a binary search. |
664 `------------------------------------------------------------*/
666 static int
667 search_table (WORD *word, WORD_TABLE *table)
669 int lowest; /* current lowest possible index */
670 int highest; /* current highest possible index */
671 int middle; /* current middle index */
672 int value; /* value from last comparison */
674 lowest = 0;
675 highest = table->length - 1;
676 while (lowest <= highest)
678 middle = (lowest + highest) / 2;
679 value = compare_words (word, table->start + middle);
680 if (value < 0)
681 highest = middle - 1;
682 else if (value > 0)
683 lowest = middle + 1;
684 else
685 return 1;
687 return 0;
690 /*---------------------------------------------------------------------.
691 | Sort the whole occurs table in memory. Presumably, `qsort' does not |
692 | take intermediate copies or table elements, so the sort will be |
693 | stabilized throughout the comparison routine. |
694 `---------------------------------------------------------------------*/
696 static void
697 sort_found_occurs (void)
700 /* Only one language for the time being. */
702 qsort (occurs_table[0], number_of_occurs[0], sizeof **occurs_table,
703 compare_occurs);
706 /* Parameter files reading routines. */
708 /*----------------------------------------------------------------------.
709 | Read a file named FILE_NAME, containing a set of break characters. |
710 | Build a content to the array word_fastmap in which all characters are |
711 | allowed except those found in the file. Characters may be repeated. |
712 `----------------------------------------------------------------------*/
714 static void
715 digest_break_file (const char *file_name)
717 BLOCK file_contents; /* to receive a copy of the file */
718 char *cursor; /* cursor in file copy */
720 swallow_file_in_memory (file_name, &file_contents);
722 /* Make the fastmap and record the file contents in it. */
724 memset (word_fastmap, 1, CHAR_SET_SIZE);
725 for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
726 word_fastmap[to_uchar (*cursor)] = 0;
728 if (!gnu_extensions)
731 /* If GNU extensions are enabled, the only way to avoid newline as
732 a break character is to write all the break characters in the
733 file with no newline at all, not even at the end of the file.
734 If disabled, spaces, tabs and newlines are always considered as
735 break characters even if not included in the break file. */
737 word_fastmap[' '] = 0;
738 word_fastmap['\t'] = 0;
739 word_fastmap['\n'] = 0;
742 /* Return the space of the file, which is no more required. */
744 free (file_contents.start);
747 /*-----------------------------------------------------------------------.
748 | Read a file named FILE_NAME, containing one word per line, then |
749 | construct in TABLE a table of WORD descriptors for them. The routine |
750 | swallows the whole file in memory; this is at the expense of space |
751 | needed for newlines, which are useless; however, the reading is fast. |
752 `-----------------------------------------------------------------------*/
754 static void
755 digest_word_file (const char *file_name, WORD_TABLE *table)
757 BLOCK file_contents; /* to receive a copy of the file */
758 char *cursor; /* cursor in file copy */
759 char *word_start; /* start of the current word */
761 swallow_file_in_memory (file_name, &file_contents);
763 table->start = NULL;
764 table->alloc = 0;
765 table->length = 0;
767 /* Read the whole file. */
769 cursor = file_contents.start;
770 while (cursor < file_contents.end)
773 /* Read one line, and save the word in contains. */
775 word_start = cursor;
776 while (cursor < file_contents.end && *cursor != '\n')
777 cursor++;
779 /* Record the word in table if it is not empty. */
781 if (cursor > word_start)
783 if (table->length == table->alloc)
785 if ((SIZE_MAX / sizeof *table->start - 1) / 2 < table->alloc)
786 xalloc_die ();
787 table->alloc = table->alloc * 2 + 1;
788 table->start = xrealloc (table->start,
789 table->alloc * sizeof *table->start);
792 table->start[table->length].start = word_start;
793 table->start[table->length].size = cursor - word_start;
794 table->length++;
797 /* This test allows for an incomplete line at end of file. */
799 if (cursor < file_contents.end)
800 cursor++;
803 /* Finally, sort all the words read. */
805 qsort (table->start, table->length, sizeof table->start[0], compare_words);
808 /* Keyword recognition and selection. */
810 /*----------------------------------------------------------------------.
811 | For each keyword in the source text, constructs an OCCURS structure. |
812 `----------------------------------------------------------------------*/
814 static void
815 find_occurs_in_text (void)
817 char *cursor; /* for scanning the source text */
818 char *scan; /* for scanning the source text also */
819 char *line_start; /* start of the current input line */
820 char *line_scan; /* newlines scanned until this point */
821 int reference_length; /* length of reference in input mode */
822 WORD possible_key; /* possible key, to ease searches */
823 OCCURS *occurs_cursor; /* current OCCURS under construction */
825 char *context_start; /* start of left context */
826 char *context_end; /* end of right context */
827 char *word_start; /* start of word */
828 char *word_end; /* end of word */
829 char *next_context_start; /* next start of left context */
831 /* reference_length is always used within `if (input_reference)'.
832 However, GNU C diagnoses that it may be used uninitialized. The
833 following assignment is merely to shut it up. */
835 reference_length = 0;
837 /* Tracking where lines start is helpful for reference processing. In
838 auto reference mode, this allows counting lines. In input reference
839 mode, this permits finding the beginning of the references.
841 The first line begins with the file, skip immediately this very first
842 reference in input reference mode, to help further rejection any word
843 found inside it. Also, unconditionally assigning these variable has
844 the happy effect of shutting up lint. */
846 line_start = text_buffer.start;
847 line_scan = line_start;
848 if (input_reference)
850 SKIP_NON_WHITE (line_scan, text_buffer.end);
851 reference_length = line_scan - line_start;
852 SKIP_WHITE (line_scan, text_buffer.end);
855 /* Process the whole buffer, one line or one sentence at a time. */
857 for (cursor = text_buffer.start;
858 cursor < text_buffer.end;
859 cursor = next_context_start)
862 /* `context_start' gets initialized before the processing of each
863 line, or once for the whole buffer if no end of line or sentence
864 sequence separator. */
866 context_start = cursor;
868 /* If a end of line or end of sentence sequence is defined and
869 non-empty, `next_context_start' will be recomputed to be the end of
870 each line or sentence, before each one is processed. If no such
871 sequence, then `next_context_start' is set at the end of the whole
872 buffer, which is then considered to be a single line or sentence.
873 This test also accounts for the case of an incomplete line or
874 sentence at the end of the buffer. */
876 next_context_start = text_buffer.end;
877 if (context_regex.string)
878 switch (re_search (&context_regex.pattern, cursor,
879 text_buffer.end - cursor,
880 0, text_buffer.end - cursor, &context_regs))
882 case -2:
883 matcher_error ();
885 case -1:
886 break;
888 default:
889 next_context_start = cursor + context_regs.end[0];
890 break;
893 /* Include the separator into the right context, but not any suffix
894 white space in this separator; this insures it will be seen in
895 output and will not take more space than necessary. */
897 context_end = next_context_start;
898 SKIP_WHITE_BACKWARDS (context_end, context_start);
900 /* Read and process a single input line or sentence, one word at a
901 time. */
903 while (1)
905 if (word_regex.string)
907 /* If a word regexp has been compiled, use it to skip at the
908 beginning of the next word. If there is no such word, exit
909 the loop. */
912 regoff_t r = re_search (&word_regex.pattern, cursor,
913 context_end - cursor,
914 0, context_end - cursor, &word_regs);
915 if (r == -2)
916 matcher_error ();
917 if (r == -1)
918 break;
919 word_start = cursor + word_regs.start[0];
920 word_end = cursor + word_regs.end[0];
922 else
924 /* Avoid re_search and use the fastmap to skip to the
925 beginning of the next word. If there is no more word in
926 the buffer, exit the loop. */
929 scan = cursor;
930 while (scan < context_end
931 && !word_fastmap[to_uchar (*scan)])
932 scan++;
934 if (scan == context_end)
935 break;
937 word_start = scan;
939 while (scan < context_end
940 && word_fastmap[to_uchar (*scan)])
941 scan++;
943 word_end = scan;
946 /* Skip right to the beginning of the found word. */
948 cursor = word_start;
950 /* Skip any zero length word. Just advance a single position,
951 then go fetch the next word. */
953 if (word_end == word_start)
955 cursor++;
956 continue;
959 /* This is a genuine, non empty word, so save it as a possible
960 key. Then skip over it. Also, maintain the maximum length of
961 all words read so far. It is mandatory to take the maximum
962 length of all words in the file, without considering if they
963 are actually kept or rejected, because backward jumps at output
964 generation time may fall in *any* word. */
966 possible_key.start = cursor;
967 possible_key.size = word_end - word_start;
968 cursor += possible_key.size;
970 if (possible_key.size > maximum_word_length)
971 maximum_word_length = possible_key.size;
973 /* In input reference mode, update `line_start' from its previous
974 value. Count the lines just in case auto reference mode is
975 also selected. If it happens that the word just matched is
976 indeed part of a reference; just ignore it. */
978 if (input_reference)
980 while (line_scan < possible_key.start)
981 if (*line_scan == '\n')
983 total_line_count++;
984 line_scan++;
985 line_start = line_scan;
986 SKIP_NON_WHITE (line_scan, text_buffer.end);
987 reference_length = line_scan - line_start;
989 else
990 line_scan++;
991 if (line_scan > possible_key.start)
992 continue;
995 /* Ignore the word if an `Ignore words' table exists and if it is
996 part of it. Also ignore the word if an `Only words' table and
997 if it is *not* part of it.
999 It is allowed that both tables be used at once, even if this
1000 may look strange for now. Just ignore a word that would appear
1001 in both. If regexps are eventually implemented for these
1002 tables, the Ignore table could then reject words that would
1003 have been previously accepted by the Only table. */
1005 if (ignore_file && search_table (&possible_key, &ignore_table))
1006 continue;
1007 if (only_file && !search_table (&possible_key, &only_table))
1008 continue;
1010 /* A non-empty word has been found. First of all, insure
1011 proper allocation of the next OCCURS, and make a pointer to
1012 where it will be constructed. */
1014 if (number_of_occurs[0] == occurs_alloc[0])
1016 if ((SIZE_MAX / sizeof *occurs_table[0] - 1) / 2
1017 < occurs_alloc[0])
1018 xalloc_die ();
1019 occurs_alloc[0] = occurs_alloc[0] * 2 + 1;
1020 occurs_table[0] = xrealloc (occurs_table[0],
1021 occurs_alloc[0] * sizeof *occurs_table[0]);
1024 occurs_cursor = occurs_table[0] + number_of_occurs[0];
1026 /* Define the refence field, if any. */
1028 if (auto_reference)
1031 /* While auto referencing, update `line_start' from its
1032 previous value, counting lines as we go. If input
1033 referencing at the same time, `line_start' has been
1034 advanced earlier, and the following loop is never really
1035 executed. */
1037 while (line_scan < possible_key.start)
1038 if (*line_scan == '\n')
1040 total_line_count++;
1041 line_scan++;
1042 line_start = line_scan;
1043 SKIP_NON_WHITE (line_scan, text_buffer.end);
1045 else
1046 line_scan++;
1048 occurs_cursor->reference = total_line_count;
1050 else if (input_reference)
1053 /* If only input referencing, `line_start' has been computed
1054 earlier to detect the case the word matched would be part
1055 of the reference. The reference position is simply the
1056 value of `line_start'. */
1058 occurs_cursor->reference
1059 = (DELTA) (line_start - possible_key.start);
1060 if (reference_length > reference_max_width)
1061 reference_max_width = reference_length;
1064 /* Exclude the reference from the context in simple cases. */
1066 if (input_reference && line_start == context_start)
1068 SKIP_NON_WHITE (context_start, context_end);
1069 SKIP_WHITE (context_start, context_end);
1072 /* Completes the OCCURS structure. */
1074 occurs_cursor->key = possible_key;
1075 occurs_cursor->left = context_start - possible_key.start;
1076 occurs_cursor->right = context_end - possible_key.start;
1078 number_of_occurs[0]++;
1083 /* Formatting and actual output - service routines. */
1085 /*-----------------------------------------.
1086 | Prints some NUMBER of spaces on stdout. |
1087 `-----------------------------------------*/
1089 static void
1090 print_spaces (int number)
1092 int counter;
1094 for (counter = number; counter > 0; counter--)
1095 putchar (' ');
1098 /*-------------------------------------.
1099 | Prints the field provided by FIELD. |
1100 `-------------------------------------*/
1102 static void
1103 print_field (BLOCK field)
1105 char *cursor; /* Cursor in field to print */
1106 int base; /* Base character, without diacritic */
1107 int diacritic; /* Diacritic code for the character */
1109 /* Whitespace is not really compressed. Instead, each white space
1110 character (tab, vt, ht etc.) is printed as one single space. */
1112 for (cursor = field.start; cursor < field.end; cursor++)
1114 unsigned char character = *cursor;
1115 if (edited_flag[character])
1118 /* First check if this is a diacriticized character.
1120 This works only for TeX. I do not know how diacriticized
1121 letters work with `roff'. Please someone explain it to me! */
1123 diacritic = todiac (character);
1124 if (diacritic != 0 && output_format == TEX_FORMAT)
1126 base = tobase (character);
1127 switch (diacritic)
1130 case 1: /* Latin diphthongs */
1131 switch (base)
1133 case 'o':
1134 fputs ("\\oe{}", stdout);
1135 break;
1137 case 'O':
1138 fputs ("\\OE{}", stdout);
1139 break;
1141 case 'a':
1142 fputs ("\\ae{}", stdout);
1143 break;
1145 case 'A':
1146 fputs ("\\AE{}", stdout);
1147 break;
1149 default:
1150 putchar (' ');
1152 break;
1154 case 2: /* Acute accent */
1155 printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
1156 break;
1158 case 3: /* Grave accent */
1159 printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
1160 break;
1162 case 4: /* Circumflex accent */
1163 printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
1164 break;
1166 case 5: /* Diaeresis */
1167 printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
1168 break;
1170 case 6: /* Tilde accent */
1171 printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
1172 break;
1174 case 7: /* Cedilla */
1175 printf ("\\c{%c}", base);
1176 break;
1178 case 8: /* Small circle beneath */
1179 switch (base)
1181 case 'a':
1182 fputs ("\\aa{}", stdout);
1183 break;
1185 case 'A':
1186 fputs ("\\AA{}", stdout);
1187 break;
1189 default:
1190 putchar (' ');
1192 break;
1194 case 9: /* Strike through */
1195 switch (base)
1197 case 'o':
1198 fputs ("\\o{}", stdout);
1199 break;
1201 case 'O':
1202 fputs ("\\O{}", stdout);
1203 break;
1205 default:
1206 putchar (' ');
1208 break;
1211 else
1213 /* This is not a diacritic character, so handle cases which are
1214 really specific to `roff' or TeX. All white space processing
1215 is done as the default case of this switch. */
1217 switch (character)
1219 case '"':
1220 /* In roff output format, double any quote. */
1221 putchar ('"');
1222 putchar ('"');
1223 break;
1225 case '$':
1226 case '%':
1227 case '&':
1228 case '#':
1229 case '_':
1230 /* In TeX output format, precede these with a backslash. */
1231 putchar ('\\');
1232 putchar (character);
1233 break;
1235 case '{':
1236 case '}':
1237 /* In TeX output format, precede these with a backslash and
1238 force mathematical mode. */
1239 printf ("$\\%c$", character);
1240 break;
1242 case '\\':
1243 /* In TeX output mode, request production of a backslash. */
1244 fputs ("\\backslash{}", stdout);
1245 break;
1247 default:
1248 /* Any other flagged character produces a single space. */
1249 putchar (' ');
1252 else
1253 putchar (*cursor);
1257 /* Formatting and actual output - planning routines. */
1259 /*--------------------------------------------------------------------.
1260 | From information collected from command line options and input file |
1261 | readings, compute and fix some output parameter values. |
1262 `--------------------------------------------------------------------*/
1264 static void
1265 fix_output_parameters (void)
1267 int file_index; /* index in text input file arrays */
1268 int line_ordinal; /* line ordinal value for reference */
1269 char ordinal_string[12]; /* edited line ordinal for reference */
1270 int reference_width; /* width for the whole reference */
1271 int character; /* character ordinal */
1272 const char *cursor; /* cursor in some constant strings */
1274 /* In auto reference mode, the maximum width of this field is
1275 precomputed and subtracted from the overall line width. Add one for
1276 the column which separate the file name from the line number. */
1278 if (auto_reference)
1280 reference_max_width = 0;
1281 for (file_index = 0; file_index < number_input_files; file_index++)
1283 line_ordinal = file_line_count[file_index] + 1;
1284 if (file_index > 0)
1285 line_ordinal -= file_line_count[file_index - 1];
1286 sprintf (ordinal_string, "%d", line_ordinal);
1287 reference_width = strlen (ordinal_string);
1288 if (input_file_name[file_index])
1289 reference_width += strlen (input_file_name[file_index]);
1290 if (reference_width > reference_max_width)
1291 reference_max_width = reference_width;
1293 reference_max_width++;
1294 reference.start = xmalloc ((size_t) reference_max_width + 1);
1297 /* If the reference appears to the left of the output line, reserve some
1298 space for it right away, including one gap size. */
1300 if ((auto_reference | input_reference) & !right_reference)
1301 line_width -= reference_max_width + gap_size;
1303 /* The output lines, minimally, will contain from left to right a left
1304 context, a gap, and a keyword followed by the right context with no
1305 special intervening gap. Half of the line width is dedicated to the
1306 left context and the gap, the other half is dedicated to the keyword
1307 and the right context; these values are computed once and for all here.
1308 There also are tail and head wrap around fields, used when the keyword
1309 is near the beginning or the end of the line, or when some long word
1310 cannot fit in, but leave place from wrapped around shorter words. The
1311 maximum width of these fields are recomputed separately for each line,
1312 on a case by case basis. It is worth noting that it cannot happen that
1313 both the tail and head fields are used at once. */
1315 half_line_width = line_width / 2;
1316 before_max_width = half_line_width - gap_size;
1317 keyafter_max_width = half_line_width;
1319 /* If truncation_string is the empty string, make it NULL to speed up
1320 tests. In this case, truncation_string_length will never get used, so
1321 there is no need to set it. */
1323 if (truncation_string && *truncation_string)
1324 truncation_string_length = strlen (truncation_string);
1325 else
1326 truncation_string = NULL;
1328 if (gnu_extensions)
1331 /* When flagging truncation at the left of the keyword, the
1332 truncation mark goes at the beginning of the before field,
1333 unless there is a head field, in which case the mark goes at the
1334 left of the head field. When flagging truncation at the right
1335 of the keyword, the mark goes at the end of the keyafter field,
1336 unless there is a tail field, in which case the mark goes at the
1337 end of the tail field. Only eight combination cases could arise
1338 for truncation marks:
1340 . None.
1341 . One beginning the before field.
1342 . One beginning the head field.
1343 . One ending the keyafter field.
1344 . One ending the tail field.
1345 . One beginning the before field, another ending the keyafter field.
1346 . One ending the tail field, another beginning the before field.
1347 . One ending the keyafter field, another beginning the head field.
1349 So, there is at most two truncation marks, which could appear both
1350 on the left side of the center of the output line, both on the
1351 right side, or one on either side. */
1353 before_max_width -= 2 * truncation_string_length;
1354 keyafter_max_width -= 2 * truncation_string_length;
1356 else
1359 /* I never figured out exactly how UNIX' ptx plans the output width
1360 of its various fields. If GNU extensions are disabled, do not
1361 try computing the field widths correctly; instead, use the
1362 following formula, which does not completely imitate UNIX' ptx,
1363 but almost. */
1365 keyafter_max_width -= 2 * truncation_string_length + 1;
1368 /* Compute which characters need special output processing. Initialize
1369 by flagging any white space character. Some systems do not consider
1370 form feed as a space character, but we do. */
1372 for (character = 0; character < CHAR_SET_SIZE; character++)
1373 edited_flag[character] = !! isspace (character);
1374 edited_flag['\f'] = 1;
1376 /* Complete the special character flagging according to selected output
1377 format. */
1379 switch (output_format)
1381 case UNKNOWN_FORMAT:
1382 /* Should never happen. */
1384 case DUMB_FORMAT:
1385 break;
1387 case ROFF_FORMAT:
1389 /* `Quote' characters should be doubled. */
1391 edited_flag['"'] = 1;
1392 break;
1394 case TEX_FORMAT:
1396 /* Various characters need special processing. */
1398 for (cursor = "$%&#_{}\\"; *cursor; cursor++)
1399 edited_flag[to_uchar (*cursor)] = 1;
1401 /* Any character with 8th bit set will print to a single space, unless
1402 it is diacriticized. */
1404 for (character = 0200; character < CHAR_SET_SIZE; character++)
1405 edited_flag[character] = todiac (character) != 0;
1406 break;
1410 /*------------------------------------------------------------------.
1411 | Compute the position and length of all the output fields, given a |
1412 | pointer to some OCCURS. |
1413 `------------------------------------------------------------------*/
1415 static void
1416 define_all_fields (OCCURS *occurs)
1418 int tail_max_width; /* allowable width of tail field */
1419 int head_max_width; /* allowable width of head field */
1420 char *cursor; /* running cursor in source text */
1421 char *left_context_start; /* start of left context */
1422 char *right_context_end; /* end of right context */
1423 char *left_field_start; /* conservative start for `head'/`before' */
1424 int file_index; /* index in text input file arrays */
1425 const char *file_name; /* file name for reference */
1426 int line_ordinal; /* line ordinal for reference */
1428 /* Define `keyafter', start of left context and end of right context.
1429 `keyafter' starts at the saved position for keyword and extend to the
1430 right from the end of the keyword, eating separators or full words, but
1431 not beyond maximum allowed width for `keyafter' field or limit for the
1432 right context. Suffix spaces will be removed afterwards. */
1434 keyafter.start = occurs->key.start;
1435 keyafter.end = keyafter.start + occurs->key.size;
1436 left_context_start = keyafter.start + occurs->left;
1437 right_context_end = keyafter.start + occurs->right;
1439 cursor = keyafter.end;
1440 while (cursor < right_context_end
1441 && cursor <= keyafter.start + keyafter_max_width)
1443 keyafter.end = cursor;
1444 SKIP_SOMETHING (cursor, right_context_end);
1446 if (cursor <= keyafter.start + keyafter_max_width)
1447 keyafter.end = cursor;
1449 keyafter_truncation = truncation_string && keyafter.end < right_context_end;
1451 SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
1453 /* When the left context is wide, it might take some time to catch up from
1454 the left context boundary to the beginning of the `head' or `before'
1455 fields. So, in this case, to speed the catchup, we jump back from the
1456 keyword, using some secure distance, possibly falling in the middle of
1457 a word. A secure backward jump would be at least half the maximum
1458 width of a line, plus the size of the longest word met in the whole
1459 input. We conclude this backward jump by a skip forward of at least
1460 one word. In this manner, we should not inadvertently accept only part
1461 of a word. From the reached point, when it will be time to fix the
1462 beginning of `head' or `before' fields, we will skip forward words or
1463 delimiters until we get sufficiently near. */
1465 if (-occurs->left > half_line_width + maximum_word_length)
1467 left_field_start
1468 = keyafter.start - (half_line_width + maximum_word_length);
1469 SKIP_SOMETHING (left_field_start, keyafter.start);
1471 else
1472 left_field_start = keyafter.start + occurs->left;
1474 /* `before' certainly ends at the keyword, but not including separating
1475 spaces. It starts after than the saved value for the left context, by
1476 advancing it until it falls inside the maximum allowed width for the
1477 before field. There will be no prefix spaces either. `before' only
1478 advances by skipping single separators or whole words. */
1480 before.start = left_field_start;
1481 before.end = keyafter.start;
1482 SKIP_WHITE_BACKWARDS (before.end, before.start);
1484 while (before.start + before_max_width < before.end)
1485 SKIP_SOMETHING (before.start, before.end);
1487 if (truncation_string)
1489 cursor = before.start;
1490 SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
1491 before_truncation = cursor > left_context_start;
1493 else
1494 before_truncation = 0;
1496 SKIP_WHITE (before.start, text_buffer.end);
1498 /* The tail could not take more columns than what has been left in the
1499 left context field, and a gap is mandatory. It starts after the
1500 right context, and does not contain prefixed spaces. It ends at
1501 the end of line, the end of buffer or when the tail field is full,
1502 whichever comes first. It cannot contain only part of a word, and
1503 has no suffixed spaces. */
1505 tail_max_width
1506 = before_max_width - (before.end - before.start) - gap_size;
1508 if (tail_max_width > 0)
1510 tail.start = keyafter.end;
1511 SKIP_WHITE (tail.start, text_buffer.end);
1513 tail.end = tail.start;
1514 cursor = tail.end;
1515 while (cursor < right_context_end
1516 && cursor < tail.start + tail_max_width)
1518 tail.end = cursor;
1519 SKIP_SOMETHING (cursor, right_context_end);
1522 if (cursor < tail.start + tail_max_width)
1523 tail.end = cursor;
1525 if (tail.end > tail.start)
1527 keyafter_truncation = 0;
1528 tail_truncation = truncation_string && tail.end < right_context_end;
1530 else
1531 tail_truncation = 0;
1533 SKIP_WHITE_BACKWARDS (tail.end, tail.start);
1535 else
1538 /* No place left for a tail field. */
1540 tail.start = NULL;
1541 tail.end = NULL;
1542 tail_truncation = 0;
1545 /* `head' could not take more columns than what has been left in the right
1546 context field, and a gap is mandatory. It ends before the left
1547 context, and does not contain suffixed spaces. Its pointer is advanced
1548 until the head field has shrunk to its allowed width. It cannot
1549 contain only part of a word, and has no suffixed spaces. */
1551 head_max_width
1552 = keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
1554 if (head_max_width > 0)
1556 head.end = before.start;
1557 SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
1559 head.start = left_field_start;
1560 while (head.start + head_max_width < head.end)
1561 SKIP_SOMETHING (head.start, head.end);
1563 if (head.end > head.start)
1565 before_truncation = 0;
1566 head_truncation = (truncation_string
1567 && head.start > left_context_start);
1569 else
1570 head_truncation = 0;
1572 SKIP_WHITE (head.start, head.end);
1574 else
1577 /* No place left for a head field. */
1579 head.start = NULL;
1580 head.end = NULL;
1581 head_truncation = 0;
1584 if (auto_reference)
1587 /* Construct the reference text in preallocated space from the file
1588 name and the line number. Find out in which file the reference
1589 occurred. Standard input yields an empty file name. Insure line
1590 numbers are one based, even if they are computed zero based. */
1592 file_index = 0;
1593 while (file_line_count[file_index] < occurs->reference)
1594 file_index++;
1596 file_name = input_file_name[file_index];
1597 if (!file_name)
1598 file_name = "";
1600 line_ordinal = occurs->reference + 1;
1601 if (file_index > 0)
1602 line_ordinal -= file_line_count[file_index - 1];
1604 sprintf (reference.start, "%s:%d", file_name, line_ordinal);
1605 reference.end = reference.start + strlen (reference.start);
1607 else if (input_reference)
1610 /* Reference starts at saved position for reference and extends right
1611 until some white space is met. */
1613 reference.start = keyafter.start + (DELTA) occurs->reference;
1614 reference.end = reference.start;
1615 SKIP_NON_WHITE (reference.end, right_context_end);
1619 /* Formatting and actual output - control routines. */
1621 /*----------------------------------------------------------------------.
1622 | Output the current output fields as one line for `troff' or `nroff'. |
1623 `----------------------------------------------------------------------*/
1625 static void
1626 output_one_roff_line (void)
1628 /* Output the `tail' field. */
1630 printf (".%s \"", macro_name);
1631 print_field (tail);
1632 if (tail_truncation)
1633 fputs (truncation_string, stdout);
1634 putchar ('"');
1636 /* Output the `before' field. */
1638 fputs (" \"", stdout);
1639 if (before_truncation)
1640 fputs (truncation_string, stdout);
1641 print_field (before);
1642 putchar ('"');
1644 /* Output the `keyafter' field. */
1646 fputs (" \"", stdout);
1647 print_field (keyafter);
1648 if (keyafter_truncation)
1649 fputs (truncation_string, stdout);
1650 putchar ('"');
1652 /* Output the `head' field. */
1654 fputs (" \"", stdout);
1655 if (head_truncation)
1656 fputs (truncation_string, stdout);
1657 print_field (head);
1658 putchar ('"');
1660 /* Conditionally output the `reference' field. */
1662 if (auto_reference | input_reference)
1664 fputs (" \"", stdout);
1665 print_field (reference);
1666 putchar ('"');
1669 putchar ('\n');
1672 /*---------------------------------------------------------.
1673 | Output the current output fields as one line for `TeX'. |
1674 `---------------------------------------------------------*/
1676 static void
1677 output_one_tex_line (void)
1679 BLOCK key; /* key field, isolated */
1680 BLOCK after; /* after field, isolated */
1681 char *cursor; /* running cursor in source text */
1683 printf ("\\%s ", macro_name);
1684 putchar ('{');
1685 print_field (tail);
1686 fputs ("}{", stdout);
1687 print_field (before);
1688 fputs ("}{", stdout);
1689 key.start = keyafter.start;
1690 after.end = keyafter.end;
1691 cursor = keyafter.start;
1692 SKIP_SOMETHING (cursor, keyafter.end);
1693 key.end = cursor;
1694 after.start = cursor;
1695 print_field (key);
1696 fputs ("}{", stdout);
1697 print_field (after);
1698 fputs ("}{", stdout);
1699 print_field (head);
1700 putchar ('}');
1701 if (auto_reference | input_reference)
1703 putchar ('{');
1704 print_field (reference);
1705 putchar ('}');
1707 putchar ('\n');
1710 /*-------------------------------------------------------------------.
1711 | Output the current output fields as one line for a dumb terminal. |
1712 `-------------------------------------------------------------------*/
1714 static void
1715 output_one_dumb_line (void)
1717 if (!right_reference)
1719 if (auto_reference)
1722 /* Output the `reference' field, in such a way that GNU emacs
1723 next-error will handle it. The ending colon is taken from the
1724 gap which follows. */
1726 print_field (reference);
1727 putchar (':');
1728 print_spaces (reference_max_width
1729 + gap_size
1730 - (reference.end - reference.start)
1731 - 1);
1733 else
1736 /* Output the `reference' field and its following gap. */
1738 print_field (reference);
1739 print_spaces (reference_max_width
1740 + gap_size
1741 - (reference.end - reference.start));
1745 if (tail.start < tail.end)
1747 /* Output the `tail' field. */
1749 print_field (tail);
1750 if (tail_truncation)
1751 fputs (truncation_string, stdout);
1753 print_spaces (half_line_width - gap_size
1754 - (before.end - before.start)
1755 - (before_truncation ? truncation_string_length : 0)
1756 - (tail.end - tail.start)
1757 - (tail_truncation ? truncation_string_length : 0));
1759 else
1760 print_spaces (half_line_width - gap_size
1761 - (before.end - before.start)
1762 - (before_truncation ? truncation_string_length : 0));
1764 /* Output the `before' field. */
1766 if (before_truncation)
1767 fputs (truncation_string, stdout);
1768 print_field (before);
1770 print_spaces (gap_size);
1772 /* Output the `keyafter' field. */
1774 print_field (keyafter);
1775 if (keyafter_truncation)
1776 fputs (truncation_string, stdout);
1778 if (head.start < head.end)
1780 /* Output the `head' field. */
1782 print_spaces (half_line_width
1783 - (keyafter.end - keyafter.start)
1784 - (keyafter_truncation ? truncation_string_length : 0)
1785 - (head.end - head.start)
1786 - (head_truncation ? truncation_string_length : 0));
1787 if (head_truncation)
1788 fputs (truncation_string, stdout);
1789 print_field (head);
1791 else
1793 if ((auto_reference | input_reference) & right_reference)
1794 print_spaces (half_line_width
1795 - (keyafter.end - keyafter.start)
1796 - (keyafter_truncation ? truncation_string_length : 0));
1798 if ((auto_reference | input_reference) & right_reference)
1800 /* Output the `reference' field. */
1802 print_spaces (gap_size);
1803 print_field (reference);
1806 putchar ('\n');
1809 /*------------------------------------------------------------------------.
1810 | Scan the whole occurs table and, for each entry, output one line in the |
1811 | appropriate format. |
1812 `------------------------------------------------------------------------*/
1814 static void
1815 generate_all_output (void)
1817 size_t occurs_index; /* index of keyword entry being processed */
1818 OCCURS *occurs_cursor; /* current keyword entry being processed */
1820 /* The following assignments are useful to provide default values in case
1821 line contexts or references are not used, in which case these variables
1822 would never be computed. */
1824 tail.start = NULL;
1825 tail.end = NULL;
1826 tail_truncation = 0;
1828 head.start = NULL;
1829 head.end = NULL;
1830 head_truncation = 0;
1832 /* Loop over all keyword occurrences. */
1834 occurs_cursor = occurs_table[0];
1836 for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
1838 /* Compute the exact size of every field and whenever truncation flags
1839 are present or not. */
1841 define_all_fields (occurs_cursor);
1843 /* Produce one output line according to selected format. */
1845 switch (output_format)
1847 case UNKNOWN_FORMAT:
1848 /* Should never happen. */
1850 case DUMB_FORMAT:
1851 output_one_dumb_line ();
1852 break;
1854 case ROFF_FORMAT:
1855 output_one_roff_line ();
1856 break;
1858 case TEX_FORMAT:
1859 output_one_tex_line ();
1860 break;
1863 /* Advance the cursor into the occurs table. */
1865 occurs_cursor++;
1869 /* Option decoding and main program. */
1871 /*------------------------------------------------------.
1872 | Print program identification and options, then exit. |
1873 `------------------------------------------------------*/
1875 void
1876 usage (int status)
1878 if (status != EXIT_SUCCESS)
1879 fprintf (stderr, _("Try `%s --help' for more information.\n"),
1880 program_name);
1881 else
1883 printf (_("\
1884 Usage: %s [OPTION]... [INPUT]... (without -G)\n\
1885 or: %s -G [OPTION]... [INPUT [OUTPUT]]\n"),
1886 program_name, program_name);
1887 fputs (_("\
1888 Output a permuted index, including context, of the words in the input files.\n\
1890 "), stdout);
1891 fputs (_("\
1892 Mandatory arguments to long options are mandatory for short options too.\n\
1893 "), stdout);
1894 fputs (_("\
1895 -A, --auto-reference output automatically generated references\n\
1896 -G, --traditional behave more like System V `ptx'\n\
1897 -F, --flag-truncation=STRING use STRING for flagging line truncations\n\
1898 "), stdout);
1899 fputs (_("\
1900 -M, --macro-name=STRING macro name to use instead of `xx'\n\
1901 -O, --format=roff generate output as roff directives\n\
1902 -R, --right-side-refs put references at right, not counted in -w\n\
1903 -S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
1904 -T, --format=tex generate output as TeX directives\n\
1905 "), stdout);
1906 fputs (_("\
1907 -W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
1908 -b, --break-file=FILE word break characters in this FILE\n\
1909 -f, --ignore-case fold lower case to upper case for sorting\n\
1910 -g, --gap-size=NUMBER gap size in columns between output fields\n\
1911 -i, --ignore-file=FILE read ignore word list from FILE\n\
1912 -o, --only-file=FILE read only word list from this FILE\n\
1913 "), stdout);
1914 fputs (_("\
1915 -r, --references first field of each line is a reference\n\
1916 -t, --typeset-mode - not implemented -\n\
1917 -w, --width=NUMBER output width in columns, reference excluded\n\
1918 "), stdout);
1919 fputs (HELP_OPTION_DESCRIPTION, stdout);
1920 fputs (VERSION_OPTION_DESCRIPTION, stdout);
1921 fputs (_("\
1923 With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n\
1924 "), stdout);
1925 printf (_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
1927 exit (status);
1930 /*----------------------------------------------------------------------.
1931 | Main program. Decode ARGC arguments passed through the ARGV array of |
1932 | strings, then launch execution. |
1933 `----------------------------------------------------------------------*/
1935 /* Long options equivalences. */
1936 static const struct option long_options[] =
1938 {"auto-reference", no_argument, NULL, 'A'},
1939 {"break-file", required_argument, NULL, 'b'},
1940 {"copyright", no_argument, NULL, 'C'}, /* Deprecated, remove in 2007. */
1941 {"flag-truncation", required_argument, NULL, 'F'},
1942 {"ignore-case", no_argument, NULL, 'f'},
1943 {"gap-size", required_argument, NULL, 'g'},
1944 {"ignore-file", required_argument, NULL, 'i'},
1945 {"macro-name", required_argument, NULL, 'M'},
1946 {"only-file", required_argument, NULL, 'o'},
1947 {"references", no_argument, NULL, 'r'},
1948 {"right-side-refs", no_argument, NULL, 'R'},
1949 {"format", required_argument, NULL, 10},
1950 {"sentence-regexp", required_argument, NULL, 'S'},
1951 {"traditional", no_argument, NULL, 'G'},
1952 {"typeset-mode", no_argument, NULL, 't'},
1953 {"width", required_argument, NULL, 'w'},
1954 {"word-regexp", required_argument, NULL, 'W'},
1955 {GETOPT_HELP_OPTION_DECL},
1956 {GETOPT_VERSION_OPTION_DECL},
1957 {NULL, 0, NULL, 0},
1960 static char const* const format_args[] =
1962 "roff", "tex", NULL
1965 static enum Format const format_vals[] =
1967 ROFF_FORMAT, TEX_FORMAT
1971 main (int argc, char **argv)
1973 int optchar; /* argument character */
1974 int file_index; /* index in text input file arrays */
1976 /* Decode program options. */
1978 initialize_main (&argc, &argv);
1979 program_name = argv[0];
1980 setlocale (LC_ALL, "");
1981 bindtextdomain (PACKAGE, LOCALEDIR);
1982 textdomain (PACKAGE);
1984 atexit (close_stdout);
1986 #if HAVE_SETCHRCLASS
1987 setchrclass (NULL);
1988 #endif
1990 while (optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:",
1991 long_options, NULL),
1992 optchar != EOF)
1994 switch (optchar)
1996 default:
1997 usage (EXIT_FAILURE);
1999 case 'G':
2000 gnu_extensions = false;
2001 break;
2003 case 'b':
2004 break_file = optarg;
2005 break;
2007 case 'f':
2008 ignore_case = true;
2009 break;
2011 case 'g':
2013 unsigned long int tmp_ulong;
2014 if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
2015 || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
2016 error (EXIT_FAILURE, 0, _("invalid gap width: %s"),
2017 quotearg (optarg));
2018 gap_size = tmp_ulong;
2019 break;
2022 case 'i':
2023 ignore_file = optarg;
2024 break;
2026 case 'o':
2027 only_file = optarg;
2028 break;
2030 case 'r':
2031 input_reference = true;
2032 break;
2034 case 't':
2035 /* Yet to understand... */
2036 break;
2038 case 'w':
2040 unsigned long int tmp_ulong;
2041 if (xstrtoul (optarg, NULL, 0, &tmp_ulong, NULL) != LONGINT_OK
2042 || ! (0 < tmp_ulong && tmp_ulong <= INT_MAX))
2043 error (EXIT_FAILURE, 0, _("invalid line width: %s"),
2044 quotearg (optarg));
2045 line_width = tmp_ulong;
2046 break;
2049 case 'A':
2050 auto_reference = true;
2051 break;
2053 case 'F':
2054 truncation_string = copy_unescaped_string (optarg);
2055 break;
2057 case 'M':
2058 macro_name = optarg;
2059 break;
2061 case 'O':
2062 output_format = ROFF_FORMAT;
2063 break;
2065 case 'R':
2066 right_reference = true;
2067 break;
2069 case 'S':
2070 context_regex.string = copy_unescaped_string (optarg);
2071 break;
2073 case 'T':
2074 output_format = TEX_FORMAT;
2075 break;
2077 case 'W':
2078 word_regex.string = copy_unescaped_string (optarg);
2079 if (!*word_regex.string)
2080 word_regex.string = NULL;
2081 break;
2083 case 10:
2084 output_format = XARGMATCH ("--format", optarg,
2085 format_args, format_vals);
2086 case_GETOPT_HELP_CHAR;
2088 case 'C':
2089 error (0, 0, _("\
2090 the --copyright option is deprecated; use --version instead"));
2091 /* fallthrough */
2093 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
2097 /* Process remaining arguments. If GNU extensions are enabled, process
2098 all arguments as input parameters. If disabled, accept at most two
2099 arguments, the second of which is an output parameter. */
2101 if (optind == argc)
2104 /* No more argument simply means: read standard input. */
2106 input_file_name = xmalloc (sizeof *input_file_name);
2107 file_line_count = xmalloc (sizeof *file_line_count);
2108 number_input_files = 1;
2109 input_file_name[0] = NULL;
2111 else if (gnu_extensions)
2113 number_input_files = argc - optind;
2114 input_file_name = xmalloc (number_input_files * sizeof *input_file_name);
2115 file_line_count = xmalloc (number_input_files * sizeof *file_line_count);
2117 for (file_index = 0; file_index < number_input_files; file_index++)
2119 input_file_name[file_index] = argv[optind];
2120 if (!*argv[optind] || STREQ (argv[optind], "-"))
2121 input_file_name[0] = NULL;
2122 else
2123 input_file_name[0] = argv[optind];
2124 optind++;
2127 else
2130 /* There is one necessary input file. */
2132 number_input_files = 1;
2133 input_file_name = xmalloc (sizeof *input_file_name);
2134 file_line_count = xmalloc (sizeof *file_line_count);
2135 if (!*argv[optind] || STREQ (argv[optind], "-"))
2136 input_file_name[0] = NULL;
2137 else
2138 input_file_name[0] = argv[optind];
2139 optind++;
2141 /* Redirect standard output, only if requested. */
2143 if (optind < argc)
2145 if (! freopen (argv[optind], "w", stdout))
2146 error (EXIT_FAILURE, errno, "%s", argv[optind]);
2147 optind++;
2150 /* Diagnose any other argument as an error. */
2152 if (optind < argc)
2154 error (0, 0, _("extra operand %s"), quote (argv[optind]));
2155 usage (EXIT_FAILURE);
2159 /* If the output format has not been explicitly selected, choose dumb
2160 terminal format if GNU extensions are enabled, else `roff' format. */
2162 if (output_format == UNKNOWN_FORMAT)
2163 output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
2165 /* Initialize the main tables. */
2167 initialize_regex ();
2169 /* Read `Break character' file, if any. */
2171 if (break_file)
2172 digest_break_file (break_file);
2174 /* Read `Ignore words' file and `Only words' files, if any. If any of
2175 these files is empty, reset the name of the file to NULL, to avoid
2176 unnecessary calls to search_table. */
2178 if (ignore_file)
2180 digest_word_file (ignore_file, &ignore_table);
2181 if (ignore_table.length == 0)
2182 ignore_file = NULL;
2185 if (only_file)
2187 digest_word_file (only_file, &only_table);
2188 if (only_table.length == 0)
2189 only_file = NULL;
2192 /* Prepare to study all the input files. */
2194 number_of_occurs[0] = 0;
2195 total_line_count = 0;
2196 maximum_word_length = 0;
2197 reference_max_width = 0;
2199 for (file_index = 0; file_index < number_input_files; file_index++)
2202 /* Read the file in core, than study it. */
2204 swallow_file_in_memory (input_file_name[file_index], &text_buffer);
2205 find_occurs_in_text ();
2207 /* Maintain for each file how many lines has been read so far when its
2208 end is reached. Incrementing the count first is a simple kludge to
2209 handle a possible incomplete line at end of file. */
2211 total_line_count++;
2212 file_line_count[file_index] = total_line_count;
2215 /* Do the output process phase. */
2217 sort_found_occurs ();
2218 fix_output_parameters ();
2219 generate_all_output ();
2221 /* All done. */
2223 exit (EXIT_SUCCESS);