groff before CVS: release 1.15
[s-roff.git] / indxbib / indxbib.cc
blobfe310c35bb3b71a2f07fd7ed0f8ccff94f9b8b2f
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <assert.h>
25 #include <errno.h>
27 #include "posix.h"
28 #include "lib.h"
29 #include "errarg.h"
30 #include "error.h"
31 #include "stringclass.h"
32 #include "cset.h"
33 #include "cmap.h"
35 #include "defs.h"
36 #include "index.h"
38 extern "C" {
39 // Sun's stdlib.h fails to declare this.
40 char *mktemp(char *);
43 #define DEFAULT_HASH_TABLE_SIZE 997
44 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
46 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
48 #define MALLOC_OVERHEAD 16
50 #ifdef BLOCK_SIZE
51 #undef BLOCK_SIZE
52 #endif
54 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
55 - sizeof(int)) / sizeof(int));
56 struct block {
57 block *next;
58 int used;
59 int v[BLOCK_SIZE];
61 block(block *p = 0) : next(p), used(0) { }
64 struct block;
66 union table_entry {
67 block *ptr;
68 int count;
71 struct word_list {
72 word_list *next;
73 char *str;
74 int len;
75 word_list(const char *, int, word_list *);
78 table_entry *hash_table;
79 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
80 // We make this the same size as hash_table so we only have to do one
81 // mod per key.
82 static word_list **common_words_table = 0;
83 char *key_buffer;
85 FILE *indxfp;
86 int ntags = 0;
87 string filenames;
88 char *temp_index_file = 0;
90 const char *ignore_fields = "XYZ";
91 const char *common_words_file = COMMON_WORDS_FILE;
92 int n_ignore_words = 100;
93 int truncate_len = 6;
94 int shortest_len = 3;
95 int max_keys_per_item = 100;
97 static void usage();
98 static void write_hash_table();
99 static void init_hash_table();
100 static void read_common_words_file();
101 static int store_key(char *s, int len);
102 static void possibly_store_key(char *s, int len);
103 static int do_whole_file(const char *filename);
104 static int do_file(const char *filename);
105 static void store_reference(int filename_index, int pos, int len);
106 static void check_integer_arg(char opt, const char *arg, int min, int *res);
107 static void store_filename(const char *);
108 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
109 static char *get_cwd();
111 extern "C" {
112 void cleanup();
113 long dir_name_max(const char *);
114 void catch_fatal_signals();
115 void ignore_fatal_signals();
118 int main(int argc, char **argv)
120 program_name = argv[0];
121 static char stderr_buf[BUFSIZ];
122 setbuf(stderr, stderr_buf);
124 const char *basename = 0;
125 typedef int (*parser_t)(const char *);
126 parser_t parser = do_file;
127 const char *directory = 0;
128 const char *foption = 0;
129 int opt;
130 while ((opt = getopt(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw")) != EOF)
131 switch (opt) {
132 case 'c':
133 common_words_file = optarg;
134 break;
135 case 'd':
136 directory = optarg;
137 break;
138 case 'f':
139 foption = optarg;
140 break;
141 case 'h':
142 check_integer_arg('h', optarg, 1, &hash_table_size);
143 if (!is_prime(hash_table_size)) {
144 while (!is_prime(++hash_table_size))
146 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
148 break;
149 case 'i':
150 ignore_fields = optarg;
151 break;
152 case 'k':
153 check_integer_arg('k', optarg, 1, &max_keys_per_item);
154 break;
155 case 'l':
156 check_integer_arg('l', optarg, 0, &shortest_len);
157 break;
158 case 'n':
159 check_integer_arg('n', optarg, 0, &n_ignore_words);
160 break;
161 case 'o':
162 basename = optarg;
163 break;
164 case 't':
165 check_integer_arg('t', optarg, 1, &truncate_len);
166 break;
167 case 'w':
168 parser = do_whole_file;
169 break;
170 case 'v':
172 extern const char *version_string;
173 fprintf(stderr, "GNU indxbib version %s\n", version_string);
174 fflush(stderr);
175 break;
177 case '?':
178 usage();
179 break;
180 default:
181 assert(0);
182 break;
184 if (optind >= argc && foption == 0)
185 fatal("no files and no -f option");
186 if (!directory) {
187 char *path = get_cwd();
188 store_filename(path);
189 a_delete path;
191 else
192 store_filename(directory);
193 init_hash_table();
194 store_filename(common_words_file);
195 store_filename(ignore_fields);
196 key_buffer = new char[truncate_len];
197 read_common_words_file();
198 if (!basename)
199 basename = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
200 const char *p = strrchr(basename, '/');
201 long name_max;
202 if (p) {
203 char *dir = strsave(basename);
204 dir[p - basename] = '\0';
205 name_max = dir_name_max(dir);
206 a_delete dir;
208 else
209 name_max = dir_name_max(".");
210 const char *filename = p ? p + 1 : basename;
211 if (name_max >= 0 && strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
212 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
213 if (p) {
214 p++;
215 temp_index_file = new char[p - basename + sizeof(TEMP_INDEX_TEMPLATE)];
216 memcpy(temp_index_file, basename, p - basename);
217 strcpy(temp_index_file + (p - basename), TEMP_INDEX_TEMPLATE);
219 else {
220 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
222 if (!mktemp(temp_index_file) || !temp_index_file[0])
223 fatal("cannot create file name for temporary file");
224 catch_fatal_signals();
225 int fd = creat(temp_index_file, S_IRUSR|S_IRGRP|S_IROTH);
226 if (fd < 0)
227 fatal("can't create temporary index file: %1", strerror(errno));
228 indxfp = fdopen(fd, "w");
229 if (indxfp == 0)
230 fatal("fdopen failed");
231 if (fseek(indxfp, sizeof(index_header), 0) < 0)
232 fatal("can't seek past index header: %1", strerror(errno));
233 int failed = 0;
234 if (foption) {
235 FILE *fp = stdin;
236 if (strcmp(foption, "-") != 0) {
237 errno = 0;
238 fp = fopen(foption, "r");
239 if (!fp)
240 fatal("can't open `%1': %2", foption, strerror(errno));
242 string path;
243 int lineno = 1;
244 for (;;) {
245 int c;
246 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
247 if (c == '\0')
248 error_with_file_and_line(foption, lineno,
249 "nul character in pathname ignored");
250 else
251 path += c;
253 if (path.length() > 0) {
254 path += '\0';
255 if (!(*parser)(path.contents()))
256 failed = 1;
257 path.clear();
259 if (c == EOF)
260 break;
261 lineno++;
263 if (fp != stdin)
264 fclose(fp);
266 for (int i = optind; i < argc; i++)
267 if (!(*parser)(argv[i]))
268 failed = 1;
269 write_hash_table();
270 if (fclose(indxfp) < 0)
271 fatal("error closing temporary index file: %1", strerror(errno));
272 char *index_file = new char[strlen(basename) + sizeof(INDEX_SUFFIX)];
273 strcpy(index_file, basename);
274 strcat(index_file, INDEX_SUFFIX);
275 #ifdef HAVE_RENAME
276 if (rename(temp_index_file, index_file) < 0)
277 fatal("can't rename temporary index file: %1", strerror(errno));
278 #else /* not HAVE_RENAME */
279 ignore_fatal_signals();
280 if (unlink(index_file) < 0) {
281 if (errno != ENOENT)
282 fatal("can't unlink `%1': %2", index_file, strerror(errno));
284 if (link(temp_index_file, index_file) < 0)
285 fatal("can't link temporary index file: %1", strerror(errno));
286 if (unlink(temp_index_file) < 0)
287 fatal("can't unlink temporary index file: %1", strerror(errno));
288 #endif /* not HAVE_RENAME */
289 temp_index_file = 0;
290 return failed;
293 static void usage()
295 fprintf(stderr,
296 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
297 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
298 program_name);
299 exit(1);
302 static void check_integer_arg(char opt, const char *arg, int min, int *res)
304 char *ptr;
305 long n = strtol(arg, &ptr, 10);
306 if (n == 0 && ptr == arg)
307 error("argument to -%1 not an integer", opt);
308 else if (n < min)
309 error("argument to -%1 must not be less than %2", opt, min);
310 else {
311 if (n > INT_MAX)
312 error("argument to -%1 greater than maximum integer", opt);
313 else if (*ptr != '\0')
314 error("junk after integer argument to -%1", opt);
315 *res = int(n);
319 static char *get_cwd()
321 char *buf;
322 int size = 12;
324 for (;;) {
325 buf = new char[size];
326 if (getcwd(buf, size))
327 break;
328 if (errno != ERANGE)
329 fatal("cannot get current working directory: %1", strerror(errno));
330 a_delete buf;
331 if (size == INT_MAX)
332 fatal("current working directory longer than INT_MAX");
333 if (size > INT_MAX/2)
334 size = INT_MAX;
335 else
336 size *= 2;
338 return buf;
341 word_list::word_list(const char *s, int n, word_list *p)
342 : next(p), len(n)
344 str = new char[n];
345 memcpy(str, s, n);
348 static void read_common_words_file()
350 if (n_ignore_words <= 0)
351 return;
352 errno = 0;
353 FILE *fp = fopen(common_words_file, "r");
354 if (!fp)
355 fatal("can't open `%1': %2", common_words_file, strerror(errno));
356 common_words_table = new word_list * [hash_table_size];
357 for (int i = 0; i < hash_table_size; i++)
358 common_words_table[i] = 0;
359 int count = 0;
360 int key_len = 0;
361 for (;;) {
362 int c = getc(fp);
363 while (c != EOF && !csalnum(c))
364 c = getc(fp);
365 if (c == EOF)
366 break;
367 do {
368 if (key_len < truncate_len)
369 key_buffer[key_len++] = cmlower(c);
370 c = getc(fp);
371 } while (c != EOF && csalnum(c));
372 if (key_len >= shortest_len) {
373 int h = hash(key_buffer, key_len) % hash_table_size;
374 common_words_table[h] = new word_list(key_buffer, key_len,
375 common_words_table[h]);
377 if (++count >= n_ignore_words)
378 break;
379 key_len = 0;
380 if (c == EOF)
381 break;
383 n_ignore_words = count;
384 fclose(fp);
387 static int do_whole_file(const char *filename)
389 errno = 0;
390 FILE *fp = fopen(filename, "r");
391 if (!fp) {
392 error("can't open `%1': %2", filename, strerror(errno));
393 return 0;
395 int count = 0;
396 int key_len = 0;
397 int c;
398 while ((c = getc(fp)) != EOF) {
399 if (csalnum(c)) {
400 key_len = 1;
401 key_buffer[0] = c;
402 while ((c = getc(fp)) != EOF) {
403 if (!csalnum(c))
404 break;
405 if (key_len < truncate_len)
406 key_buffer[key_len++] = c;
408 if (store_key(key_buffer, key_len)) {
409 if (++count >= max_keys_per_item)
410 break;
412 if (c == EOF)
413 break;
416 store_reference(filenames.length(), 0, 0);
417 store_filename(filename);
418 fclose(fp);
419 return 1;
422 static int do_file(const char *filename)
424 errno = 0;
425 FILE *fp = fopen(filename, "r");
426 if (fp == 0) {
427 error("can't open `%1': %2", filename, strerror(errno));
428 return 0;
430 int filename_index = filenames.length();
431 store_filename(filename);
433 enum {
434 START, // at the start of the file; also in between references
435 BOL, // in the middle of a reference, at the beginning of the line
436 PERCENT, // seen a percent at the beginning of the line
437 IGNORE, // ignoring a field
438 IGNORE_BOL, // at the beginning of a line ignoring a field
439 KEY, // in the middle of a key
440 DISCARD, // after truncate_len bytes of a key
441 MIDDLE // in between keys
442 } state = START;
444 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
445 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
446 // MIDDLE space_count must be 0.
447 int space_count = 0;
448 int byte_count = 0; // bytes read
449 int key_len = 0;
450 int ref_start = -1; // position of start of current reference
451 for (;;) {
452 int c = getc(fp);
453 if (c == EOF)
454 break;
455 byte_count++;
456 switch (state) {
457 case START:
458 if (c == ' ' || c == '\t') {
459 space_count++;
460 break;
462 if (c == '\n') {
463 space_count = 0;
464 break;
466 ref_start = byte_count - space_count - 1;
467 space_count = 0;
468 if (c == '%')
469 state = PERCENT;
470 else if (csalnum(c)) {
471 state = KEY;
472 key_buffer[0] = c;
473 key_len = 1;
475 else
476 state = MIDDLE;
477 break;
478 case BOL:
479 switch (c) {
480 case '%':
481 if (space_count > 0) {
482 space_count = 0;
483 state = MIDDLE;
485 else
486 state = PERCENT;
487 break;
488 case ' ':
489 case '\t':
490 space_count++;
491 break;
492 case '\n':
493 store_reference(filename_index, ref_start,
494 byte_count - 1 - space_count - ref_start);
495 state = START;
496 space_count = 0;
497 break;
498 default:
499 space_count = 0;
500 if (csalnum(c)) {
501 state = KEY;
502 key_buffer[0] = c;
503 key_len = 1;
505 else
506 state = MIDDLE;
508 break;
509 case PERCENT:
510 if (strchr(ignore_fields, c) != 0)
511 state = IGNORE;
512 else if (c == '\n')
513 state = BOL;
514 else
515 state = MIDDLE;
516 break;
517 case IGNORE:
518 if (c == '\n')
519 state = IGNORE_BOL;
520 break;
521 case IGNORE_BOL:
522 switch (c) {
523 case '%':
524 if (space_count > 0) {
525 state = IGNORE;
526 space_count = 0;
528 else
529 state = PERCENT;
530 break;
531 case ' ':
532 case '\t':
533 space_count++;
534 break;
535 case '\n':
536 store_reference(filename_index, ref_start,
537 byte_count - 1 - space_count - ref_start);
538 state = START;
539 space_count = 0;
540 break;
541 default:
542 space_count = 0;
543 state = IGNORE;
545 break;
546 case KEY:
547 if (csalnum(c)) {
548 if (key_len < truncate_len)
549 key_buffer[key_len++] = c;
550 else
551 state = DISCARD;
553 else {
554 possibly_store_key(key_buffer, key_len);
555 key_len = 0;
556 if (c == '\n')
557 state = BOL;
558 else
559 state = MIDDLE;
561 break;
562 case DISCARD:
563 if (!csalnum(c)) {
564 possibly_store_key(key_buffer, key_len);
565 key_len = 0;
566 if (c == '\n')
567 state = BOL;
568 else
569 state = MIDDLE;
571 break;
572 case MIDDLE:
573 if (csalnum(c)) {
574 state = KEY;
575 key_buffer[0] = c;
576 key_len = 1;
578 else if (c == '\n')
579 state = BOL;
580 break;
581 default:
582 assert(0);
585 switch (state) {
586 case START:
587 break;
588 case DISCARD:
589 case KEY:
590 possibly_store_key(key_buffer, key_len);
591 // fall through
592 case BOL:
593 case PERCENT:
594 case IGNORE_BOL:
595 case IGNORE:
596 case MIDDLE:
597 store_reference(filename_index, ref_start,
598 byte_count - ref_start - space_count);
599 break;
600 default:
601 assert(0);
603 fclose(fp);
604 return 1;
607 static void store_reference(int filename_index, int pos, int len)
609 tag t;
610 t.filename_index = filename_index;
611 t.start = pos;
612 t.length = len;
613 fwrite_or_die(&t, sizeof(t), 1, indxfp);
614 ntags++;
617 static void store_filename(const char *fn)
619 filenames += fn;
620 filenames += '\0';
623 static void init_hash_table()
625 hash_table = new table_entry[hash_table_size];
626 for (int i = 0; i < hash_table_size; i++)
627 hash_table[i].ptr = 0;
630 static void possibly_store_key(char *s, int len)
632 static int last_tagno = -1;
633 static int key_count;
634 if (last_tagno != ntags) {
635 last_tagno = ntags;
636 key_count = 0;
638 if (key_count < max_keys_per_item) {
639 if (store_key(s, len))
640 key_count++;
644 static int store_key(char *s, int len)
646 if (len < shortest_len)
647 return 0;
648 int is_number = 1;
649 for (int i = 0; i < len; i++)
650 if (!csdigit(s[i])) {
651 is_number = 0;
652 s[i] = cmlower(s[i]);
654 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
655 return 0;
656 int h = hash(s, len) % hash_table_size;
657 if (common_words_table) {
658 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
659 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
660 return 0;
662 table_entry *pp = hash_table + h;
663 if (!pp->ptr)
664 pp->ptr = new block;
665 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
666 return 1;
667 else if (pp->ptr->used >= BLOCK_SIZE)
668 pp->ptr = new block(pp->ptr);
669 pp->ptr->v[(pp->ptr->used)++] = ntags;
670 return 1;
673 static void write_hash_table()
675 const int minus_one = -1;
676 int li = 0;
677 for (int i = 0; i < hash_table_size; i++) {
678 block *ptr = hash_table[i].ptr;
679 if (!ptr)
680 hash_table[i].count = -1;
681 else {
682 hash_table[i].count = li;
683 block *rev = 0;
684 while (ptr) {
685 block *tem = ptr;
686 ptr = ptr->next;
687 tem->next = rev;
688 rev = tem;
690 while (rev) {
691 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
692 li += rev->used;
693 block *tem = rev;
694 rev = rev->next;
695 delete tem;
697 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
698 li += 1;
701 if (sizeof(table_entry) == sizeof(int))
702 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
703 else {
704 // write it out word by word
705 for (int i = 0; i < hash_table_size; i++)
706 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
708 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
709 if (fseek(indxfp, 0, 0) < 0)
710 fatal("error seeking on index file: %1", strerror(errno));
711 index_header h;
712 h.magic = INDEX_MAGIC;
713 h.version = INDEX_VERSION;
714 h.tags_size = ntags;
715 h.lists_size = li;
716 h.table_size = hash_table_size;
717 h.strings_size = filenames.length();
718 h.truncate = truncate_len;
719 h.shortest = shortest_len;
720 h.common = n_ignore_words;
721 fwrite_or_die(&h, sizeof(h), 1, indxfp);
724 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
726 if (fwrite(ptr, size, nitems, fp) != nitems)
727 fatal("fwrite failed: %1", strerror(errno));
730 void fatal_error_exit()
732 cleanup();
733 exit(3);
736 extern "C" {
738 void cleanup()
740 if (temp_index_file)
741 unlink(temp_index_file);