groff before CVS: release 1.06
[s-roff.git] / indxbib / indxbib.cc
blobaac5fd097dafb5e369daa461b8b748bce551f47a
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <assert.h>
25 #include <signal.h>
26 #include <errno.h>
28 #include "posix.h"
29 #include "lib.h"
30 #include "errarg.h"
31 #include "error.h"
32 #include "stringclass.h"
33 #include "cset.h"
34 #include "cmap.h"
36 #include "defs.h"
37 #include "index.h"
39 extern "C" {
40 // Sun's stdlib.h fails to declare this.
41 char *mktemp(char *);
44 #define DEFAULT_HASH_TABLE_SIZE 997
45 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
47 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
49 #define MALLOC_OVERHEAD 16
51 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
52 - sizeof(int)) / sizeof(int));
53 struct block {
54 block *next;
55 int used;
56 int v[BLOCK_SIZE];
58 block(block *p = 0) : next(p), used(0) { }
61 struct block;
63 union table_entry {
64 block *ptr;
65 int count;
68 struct word_list {
69 word_list *next;
70 char *str;
71 int len;
72 word_list(const char *, int, word_list *);
75 table_entry *hash_table;
76 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
77 // We make this the same size as hash_table so we only have to do one
78 // mod per key.
79 static word_list **common_words_table = 0;
80 char *key_buffer;
82 FILE *indxfp;
83 int ntags = 0;
84 string filenames;
85 char *temp_index_file = 0;
87 const char *ignore_fields = "XYZ";
88 const char *common_words_file = COMMON_WORDS_FILE;
89 int n_ignore_words = 100;
90 int truncate_len = 6;
91 int shortest_len = 3;
92 int max_keys_per_item = 100;
94 static void usage();
95 static void write_hash_table();
96 static void init_hash_table();
97 static void read_common_words_file();
98 static int store_key(char *s, int len);
99 static void possibly_store_key(char *s, int len);
100 static int do_whole_file(const char *filename);
101 static int do_file(const char *filename);
102 static void store_reference(int filename_index, int pos, int len);
103 static void check_integer_arg(char opt, const char *arg, int min, int *res);
104 static void store_filename(const char *);
105 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
106 static char *get_cwd();
108 extern "C" { void fatal_signal(int); }
110 extern "C" { long dir_name_max(const char *); }
112 int main(int argc, char **argv)
114 program_name = argv[0];
115 static char stderr_buf[BUFSIZ];
116 setbuf(stderr, stderr_buf);
118 const char *basename = 0;
119 typedef int (*parser_t)(const char *);
120 parser_t parser = do_file;
121 const char *directory = 0;
122 const char *foption = 0;
123 int opt;
124 while ((opt = getopt(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw")) != EOF)
125 switch (opt) {
126 case 'c':
127 common_words_file = optarg;
128 break;
129 case 'd':
130 directory = optarg;
131 break;
132 case 'f':
133 foption = optarg;
134 break;
135 case 'h':
136 check_integer_arg('h', optarg, 1, &hash_table_size);
137 if (!is_prime(hash_table_size)) {
138 while (!is_prime(++hash_table_size))
140 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
142 break;
143 case 'i':
144 ignore_fields = optarg;
145 break;
146 case 'k':
147 check_integer_arg('k', optarg, 1, &max_keys_per_item);
148 break;
149 case 'l':
150 check_integer_arg('l', optarg, 0, &shortest_len);
151 break;
152 case 'n':
153 check_integer_arg('n', optarg, 0, &n_ignore_words);
154 break;
155 case 'o':
156 basename = optarg;
157 break;
158 case 't':
159 check_integer_arg('t', optarg, 1, &truncate_len);
160 break;
161 case 'w':
162 parser = do_whole_file;
163 break;
164 case 'v':
166 extern const char *version_string;
167 fprintf(stderr, "GNU indxbib version %s\n", version_string);
168 fflush(stderr);
169 break;
171 case '?':
172 usage();
173 break;
174 default:
175 assert(0);
176 break;
178 if (optind >= argc && foption == 0)
179 fatal("no files and no -f option");
180 if (!directory) {
181 char *path = get_cwd();
182 store_filename(path);
183 a_delete path;
185 else
186 store_filename(directory);
187 init_hash_table();
188 store_filename(common_words_file);
189 store_filename(ignore_fields);
190 key_buffer = new char[truncate_len];
191 read_common_words_file();
192 if (!basename)
193 basename = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
194 const char *p = strrchr(basename, '/');
195 long name_max;
196 if (p) {
197 char *dir = strsave(basename);
198 dir[p - basename] = '\0';
199 name_max = dir_name_max(dir);
200 a_delete dir;
202 else
203 name_max = dir_name_max(".");
204 const char *filename = p ? p + 1 : basename;
205 if (name_max >= 0 && strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
206 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
207 if (p) {
208 p++;
209 temp_index_file = new char[p - basename + sizeof(TEMP_INDEX_TEMPLATE)];
210 memcpy(temp_index_file, basename, p - basename);
211 strcpy(temp_index_file + (p - basename), TEMP_INDEX_TEMPLATE);
213 else {
214 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
216 if (!mktemp(temp_index_file) || !temp_index_file[0])
217 fatal("cannot create file name for temporary file");
218 signal(SIGHUP, fatal_signal);
219 signal(SIGINT, fatal_signal);
220 signal(SIGTERM, fatal_signal);
221 int fd = creat(temp_index_file, S_IRUSR|S_IRGRP|S_IROTH);
222 if (fd < 0)
223 fatal("can't create temporary index file: %1", strerror(errno));
224 indxfp = fdopen(fd, "w");
225 if (indxfp == 0)
226 fatal("fdopen failed");
227 if (fseek(indxfp, sizeof(index_header), 0) < 0)
228 fatal("can't seek past index header: %1", strerror(errno));
229 int failed = 0;
230 if (foption) {
231 FILE *fp = stdin;
232 if (strcmp(foption, "-") != 0) {
233 errno = 0;
234 fp = fopen(foption, "r");
235 if (!fp)
236 fatal("can't open `%1': %2", foption, strerror(errno));
238 string path;
239 int lineno = 1;
240 for (;;) {
241 for (int c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
242 if (c == '\0')
243 error_with_file_and_line(foption, lineno,
244 "nul character in pathname ignored");
245 else
246 path += c;
248 if (path.length() > 0) {
249 path += '\0';
250 if (!(*parser)(path.contents()))
251 failed = 1;
252 path.clear();
254 if (c == EOF)
255 break;
256 lineno++;
258 if (fp != stdin)
259 fclose(fp);
261 for (int i = optind; i < argc; i++)
262 if (!(*parser)(argv[i]))
263 failed = 1;
264 write_hash_table();
265 if (fclose(indxfp) < 0)
266 fatal("error closing temporary index file: %1", strerror(errno));
267 char *index_file = new char[strlen(basename) + sizeof(INDEX_SUFFIX)];
268 strcpy(index_file, basename);
269 strcat(index_file, INDEX_SUFFIX);
270 #ifdef HAVE_RENAME
271 if (rename(temp_index_file, index_file) < 0)
272 fatal("can't rename temporary index file: %1", strerror(errno));
273 #else /* not HAVE_RENAME */
274 signal(SIGHUP, SIG_IGN);
275 signal(SIGINT, SIG_IGN);
276 signal(SIGTERM, SIG_IGN);
277 if (unlink(index_file) < 0) {
278 if (errno != ENOENT)
279 fatal("can't unlink `%1': %2", index_file, strerror(errno));
281 if (link(temp_index_file, index_file) < 0)
282 fatal("can't link temporary index file: %1", strerror(errno));
283 if (unlink(temp_index_file) < 0)
284 fatal("can't unlink temporary index file: %1", strerror(errno));
285 #endif /* not HAVE_RENAME */
286 temp_index_file = 0;
287 exit(failed);
290 static void usage()
292 fprintf(stderr,
293 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
294 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
295 program_name);
296 exit(1);
299 static void check_integer_arg(char opt, const char *arg, int min, int *res)
301 char *ptr;
302 long n = strtol(arg, &ptr, 10);
303 if (n == 0 && ptr == arg)
304 error("argument to -%1 not an integer", opt);
305 else if (n < min)
306 error("argument to -%1 must not be less than %2", opt, min);
307 else {
308 if (n > INT_MAX)
309 error("argument to -%1 greater than maximum integer", opt);
310 else if (*ptr != '\0')
311 error("junk after integer argument to -%1", opt);
312 *res = int(n);
316 static char *get_cwd()
318 char *buf;
319 int size = 12;
321 for (;;) {
322 buf = new char[size];
323 if (getcwd(buf, size))
324 break;
325 if (errno != ERANGE)
326 fatal("cannot get current working directory: %1", strerror(errno));
327 a_delete buf;
328 if (size == INT_MAX)
329 fatal("current working directory longer than INT_MAX");
330 if (size > INT_MAX/2)
331 size = INT_MAX;
332 else
333 size *= 2;
335 return buf;
338 word_list::word_list(const char *s, int n, word_list *p)
339 : next(p), len(n)
341 str = new char[n];
342 memcpy(str, s, n);
345 static void read_common_words_file()
347 if (n_ignore_words <= 0)
348 return;
349 errno = 0;
350 FILE *fp = fopen(common_words_file, "r");
351 if (!fp)
352 fatal("can't open `%1': %2", common_words_file, strerror(errno));
353 common_words_table = new word_list * [hash_table_size];
354 for (int i = 0; i < hash_table_size; i++)
355 common_words_table[i] = 0;
356 int count = 0;
357 int key_len = 0;
358 for (;;) {
359 int c = getc(fp);
360 while (c != EOF && !csalnum(c))
361 c = getc(fp);
362 if (c == EOF)
363 break;
364 do {
365 if (key_len < truncate_len)
366 key_buffer[key_len++] = cmlower(c);
367 c = getc(fp);
368 } while (c != EOF && csalnum(c));
369 if (key_len >= shortest_len) {
370 int h = hash(key_buffer, key_len) % hash_table_size;
371 common_words_table[h] = new word_list(key_buffer, key_len,
372 common_words_table[h]);
374 if (++count >= n_ignore_words)
375 break;
376 key_len = 0;
377 if (c == EOF)
378 break;
380 n_ignore_words = count;
381 fclose(fp);
384 static int do_whole_file(const char *filename)
386 errno = 0;
387 FILE *fp = fopen(filename, "r");
388 if (!fp) {
389 error("can't open `%1': %2", filename, strerror(errno));
390 return 0;
392 int count = 0;
393 int key_len = 0;
394 int c;
395 while ((c = getc(fp)) != EOF) {
396 if (csalnum(c)) {
397 key_len = 1;
398 key_buffer[0] = c;
399 while ((c = getc(fp)) != EOF) {
400 if (!csalnum(c))
401 break;
402 if (key_len < truncate_len)
403 key_buffer[key_len++] = c;
405 if (store_key(key_buffer, key_len)) {
406 if (++count >= max_keys_per_item)
407 break;
409 if (c == EOF)
410 break;
413 store_reference(filenames.length(), 0, 0);
414 store_filename(filename);
415 fclose(fp);
416 return 1;
419 static int do_file(const char *filename)
421 errno = 0;
422 FILE *fp = fopen(filename, "r");
423 if (fp == 0) {
424 error("can't open `%1': %2", filename, strerror(errno));
425 return 0;
427 int filename_index = filenames.length();
428 store_filename(filename);
430 enum {
431 START, // at the start of the file; also in between references
432 BOL, // in the middle of a reference, at the beginning of the line
433 PERCENT, // seen a percent at the beginning of the line
434 IGNORE, // ignoring a field
435 IGNORE_BOL, // at the beginning of a line ignoring a field
436 KEY, // in the middle of a key
437 DISCARD, // after truncate_len bytes of a key
438 MIDDLE // in between keys
439 } state = START;
441 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
442 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
443 // MIDDLE space_count must be 0.
444 int space_count = 0;
445 int byte_count = 0; // bytes read
446 int key_len = 0;
447 int ref_start = -1; // position of start of current reference
448 for (;;) {
449 int c = getc(fp);
450 if (c == EOF)
451 break;
452 byte_count++;
453 switch (state) {
454 case START:
455 if (c == ' ' || c == '\t') {
456 space_count++;
457 break;
459 if (c == '\n') {
460 space_count = 0;
461 break;
463 ref_start = byte_count - space_count - 1;
464 space_count = 0;
465 if (c == '%')
466 state = PERCENT;
467 else if (csalnum(c)) {
468 state = KEY;
469 key_buffer[0] = c;
470 key_len = 1;
472 else
473 state = MIDDLE;
474 break;
475 case BOL:
476 switch (c) {
477 case '%':
478 if (space_count > 0) {
479 space_count = 0;
480 state = MIDDLE;
482 else
483 state = PERCENT;
484 break;
485 case ' ':
486 case '\t':
487 space_count++;
488 break;
489 case '\n':
490 store_reference(filename_index, ref_start,
491 byte_count - 1 - space_count - ref_start);
492 state = START;
493 space_count = 0;
494 break;
495 default:
496 space_count = 0;
497 if (csalnum(c)) {
498 state = KEY;
499 key_buffer[0] = c;
500 key_len = 1;
502 else
503 state = MIDDLE;
505 break;
506 case PERCENT:
507 if (strchr(ignore_fields, c) != 0)
508 state = IGNORE;
509 else if (c == '\n')
510 state = BOL;
511 else
512 state = MIDDLE;
513 break;
514 case IGNORE:
515 if (c == '\n')
516 state = IGNORE_BOL;
517 break;
518 case IGNORE_BOL:
519 switch (c) {
520 case '%':
521 if (space_count > 0) {
522 state = IGNORE;
523 space_count = 0;
525 else
526 state = PERCENT;
527 break;
528 case ' ':
529 case '\t':
530 space_count++;
531 break;
532 case '\n':
533 store_reference(filename_index, ref_start,
534 byte_count - 1 - space_count - ref_start);
535 state = START;
536 space_count = 0;
537 break;
538 default:
539 space_count = 0;
540 state = IGNORE;
542 break;
543 case KEY:
544 if (csalnum(c)) {
545 if (key_len < truncate_len)
546 key_buffer[key_len++] = c;
547 else
548 state = DISCARD;
550 else {
551 possibly_store_key(key_buffer, key_len);
552 key_len = 0;
553 if (c == '\n')
554 state = BOL;
555 else
556 state = MIDDLE;
558 break;
559 case DISCARD:
560 if (!csalnum(c)) {
561 possibly_store_key(key_buffer, key_len);
562 key_len = 0;
563 if (c == '\n')
564 state = BOL;
565 else
566 state = MIDDLE;
568 break;
569 case MIDDLE:
570 if (csalnum(c)) {
571 state = KEY;
572 key_buffer[0] = c;
573 key_len = 1;
575 else if (c == '\n')
576 state = BOL;
577 break;
578 default:
579 assert(0);
582 switch (state) {
583 case START:
584 break;
585 case DISCARD:
586 case KEY:
587 possibly_store_key(key_buffer, key_len);
588 // fall through
589 case BOL:
590 case PERCENT:
591 case IGNORE_BOL:
592 case IGNORE:
593 case MIDDLE:
594 store_reference(filename_index, ref_start,
595 byte_count - ref_start - space_count);
596 break;
597 default:
598 assert(0);
600 fclose(fp);
601 return 1;
604 static void store_reference(int filename_index, int pos, int len)
606 tag t;
607 t.filename_index = filename_index;
608 t.start = pos;
609 t.length = len;
610 fwrite_or_die(&t, sizeof(t), 1, indxfp);
611 ntags++;
614 static void store_filename(const char *fn)
616 filenames += fn;
617 filenames += '\0';
620 static void init_hash_table()
622 hash_table = new table_entry[hash_table_size];
623 for (int i = 0; i < hash_table_size; i++)
624 hash_table[i].ptr = 0;
627 static void possibly_store_key(char *s, int len)
629 static int last_tagno = -1;
630 static int key_count;
631 if (last_tagno != ntags) {
632 last_tagno = ntags;
633 key_count = 0;
635 if (key_count < max_keys_per_item) {
636 if (store_key(s, len))
637 key_count++;
641 static int store_key(char *s, int len)
643 if (len < shortest_len)
644 return 0;
645 int is_number = 1;
646 for (int i = 0; i < len; i++)
647 if (!csdigit(s[i])) {
648 is_number = 0;
649 s[i] = cmlower(s[i]);
651 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
652 return 0;
653 int h = hash(s, len) % hash_table_size;
654 if (common_words_table) {
655 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
656 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
657 return 0;
659 table_entry *pp = hash_table + h;
660 if (!pp->ptr)
661 pp->ptr = new block;
662 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
663 return 1;
664 else if (pp->ptr->used >= BLOCK_SIZE)
665 pp->ptr = new block(pp->ptr);
666 pp->ptr->v[(pp->ptr->used)++] = ntags;
667 return 1;
670 static void write_hash_table()
672 const int minus_one = -1;
673 int li = 0;
674 for (int i = 0; i < hash_table_size; i++) {
675 block *ptr = hash_table[i].ptr;
676 if (!ptr)
677 hash_table[i].count = -1;
678 else {
679 hash_table[i].count = li;
680 block *rev = 0;
681 while (ptr) {
682 block *tem = ptr;
683 ptr = ptr->next;
684 tem->next = rev;
685 rev = tem;
687 while (rev) {
688 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
689 li += rev->used;
690 block *tem = rev;
691 rev = rev->next;
692 delete tem;
694 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
695 li += 1;
698 if (sizeof(table_entry) == sizeof(int))
699 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
700 else {
701 assert(0);
702 // write it out word by word
704 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
705 if (fseek(indxfp, 0, 0) < 0)
706 fatal("error seeking on index file: %1", strerror(errno));
707 index_header h;
708 h.magic = INDEX_MAGIC;
709 h.version = INDEX_VERSION;
710 h.tags_size = ntags;
711 h.lists_size = li;
712 h.table_size = hash_table_size;
713 h.strings_size = filenames.length();
714 h.truncate = truncate_len;
715 h.shortest = shortest_len;
716 h.common = n_ignore_words;
717 fwrite_or_die(&h, sizeof(h), 1, indxfp);
720 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
722 if (fwrite(ptr, size, nitems, fp) != nitems)
723 fatal("fwrite failed: %1", strerror(errno));
726 void fatal_error_exit()
728 if (temp_index_file)
729 unlink(temp_index_file);
730 exit(3);
733 extern "C" {
735 void fatal_signal(int signum)
737 signal(signum, SIG_DFL);
738 if (temp_index_file)
739 unlink(temp_index_file);
740 kill(getpid(), signum);