* Makefile.in (ENVSETUP): Don't assume POSIX make semantics for
[s-roff.git] / src / utils / indxbib / indxbib.cc
blob3185fe2cda89cea9a1d371f42196101409e0fc30
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <assert.h>
25 #include <errno.h>
27 #include "posix.h"
28 #include "lib.h"
29 #include "errarg.h"
30 #include "error.h"
31 #include "stringclass.h"
32 #include "cset.h"
33 #include "cmap.h"
35 #include "defs.h"
36 #include "index.h"
38 #include "nonposix.h"
40 extern "C" {
41 // Solaris 2.5.1 has these functions,
42 // but its stdlib.h fails to declare them.
43 char *mktemp(char *);
44 int mkstemp(char *);
47 #define DEFAULT_HASH_TABLE_SIZE 997
48 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
50 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
52 #define MALLOC_OVERHEAD 16
54 #ifdef BLOCK_SIZE
55 #undef BLOCK_SIZE
56 #endif
58 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
59 - sizeof(int)) / sizeof(int));
60 struct block {
61 block *next;
62 int used;
63 int v[BLOCK_SIZE];
65 block(block *p = 0) : next(p), used(0) { }
68 struct block;
70 union table_entry {
71 block *ptr;
72 int count;
75 struct word_list {
76 word_list *next;
77 char *str;
78 int len;
79 word_list(const char *, int, word_list *);
82 table_entry *hash_table;
83 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
84 // We make this the same size as hash_table so we only have to do one
85 // mod per key.
86 static word_list **common_words_table = 0;
87 char *key_buffer;
89 FILE *indxfp;
90 int ntags = 0;
91 string filenames;
92 char *temp_index_file = 0;
94 const char *ignore_fields = "XYZ";
95 const char *common_words_file = COMMON_WORDS_FILE;
96 int n_ignore_words = 100;
97 int truncate_len = 6;
98 int shortest_len = 3;
99 int max_keys_per_item = 100;
101 static void usage();
102 static void write_hash_table();
103 static void init_hash_table();
104 static void read_common_words_file();
105 static int store_key(char *s, int len);
106 static void possibly_store_key(char *s, int len);
107 static int do_whole_file(const char *filename);
108 static int do_file(const char *filename);
109 static void store_reference(int filename_index, int pos, int len);
110 static void check_integer_arg(char opt, const char *arg, int min, int *res);
111 static void store_filename(const char *);
112 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
113 static char *get_cwd();
115 extern "C" {
116 void cleanup();
117 long dir_name_max(const char *);
118 void catch_fatal_signals();
119 void ignore_fatal_signals();
122 int main(int argc, char **argv)
124 program_name = argv[0];
125 static char stderr_buf[BUFSIZ];
126 setbuf(stderr, stderr_buf);
128 const char *basename = 0;
129 typedef int (*parser_t)(const char *);
130 parser_t parser = do_file;
131 const char *directory = 0;
132 const char *foption = 0;
133 int opt;
134 while ((opt = getopt(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw")) != EOF)
135 switch (opt) {
136 case 'c':
137 common_words_file = optarg;
138 break;
139 case 'd':
140 directory = optarg;
141 break;
142 case 'f':
143 foption = optarg;
144 break;
145 case 'h':
146 check_integer_arg('h', optarg, 1, &hash_table_size);
147 if (!is_prime(hash_table_size)) {
148 while (!is_prime(++hash_table_size))
150 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
152 break;
153 case 'i':
154 ignore_fields = optarg;
155 break;
156 case 'k':
157 check_integer_arg('k', optarg, 1, &max_keys_per_item);
158 break;
159 case 'l':
160 check_integer_arg('l', optarg, 0, &shortest_len);
161 break;
162 case 'n':
163 check_integer_arg('n', optarg, 0, &n_ignore_words);
164 break;
165 case 'o':
166 basename = optarg;
167 break;
168 case 't':
169 check_integer_arg('t', optarg, 1, &truncate_len);
170 break;
171 case 'w':
172 parser = do_whole_file;
173 break;
174 case 'v':
176 extern const char *Version_string;
177 fprintf(stderr, "GNU indxbib version %s\n", Version_string);
178 fflush(stderr);
179 break;
181 case '?':
182 usage();
183 break;
184 default:
185 assert(0);
186 break;
188 if (optind >= argc && foption == 0)
189 fatal("no files and no -f option");
190 if (!directory) {
191 char *path = get_cwd();
192 store_filename(path);
193 a_delete path;
195 else
196 store_filename(directory);
197 init_hash_table();
198 store_filename(common_words_file);
199 store_filename(ignore_fields);
200 key_buffer = new char[truncate_len];
201 read_common_words_file();
202 if (!basename)
203 basename = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
204 const char *p = strrchr(basename, DIR_SEPS[0]), *p1;
205 const char *sep = &DIR_SEPS[1];
206 while (*sep)
208 p1 = strrchr(basename, *sep);
209 if (p1 && (!p || p1 > p))
210 p = p1;
211 sep++;
213 long name_max;
214 if (p) {
215 char *dir = strsave(basename);
216 dir[p - basename] = '\0';
217 name_max = dir_name_max(dir);
218 a_delete dir;
220 else
221 name_max = dir_name_max(".");
222 const char *filename = p ? p + 1 : basename;
223 if (name_max >= 0 && strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
224 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
225 if (p) {
226 p++;
227 temp_index_file = new char[p - basename + sizeof(TEMP_INDEX_TEMPLATE)];
228 memcpy(temp_index_file, basename, p - basename);
229 strcpy(temp_index_file + (p - basename), TEMP_INDEX_TEMPLATE);
231 else {
232 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
234 #ifndef HAVE_MKSTEMP
235 if (!mktemp(temp_index_file) || !temp_index_file[0])
236 fatal("cannot create file name for temporary file");
237 #endif
238 catch_fatal_signals();
239 #ifdef HAVE_MKSTEMP
240 int fd = mkstemp(temp_index_file);
241 #else
242 int fd = creat(temp_index_file, S_IRUSR|S_IRGRP|S_IROTH);
243 #endif
244 if (fd < 0)
245 fatal("can't create temporary index file: %1", strerror(errno));
246 indxfp = fdopen(fd, FOPEN_WB);
247 if (indxfp == 0)
248 fatal("fdopen failed");
249 if (fseek(indxfp, sizeof(index_header), 0) < 0)
250 fatal("can't seek past index header: %1", strerror(errno));
251 int failed = 0;
252 if (foption) {
253 FILE *fp = stdin;
254 if (strcmp(foption, "-") != 0) {
255 errno = 0;
256 fp = fopen(foption, "r");
257 if (!fp)
258 fatal("can't open `%1': %2", foption, strerror(errno));
260 string path;
261 int lineno = 1;
262 for (;;) {
263 int c;
264 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
265 if (c == '\0')
266 error_with_file_and_line(foption, lineno,
267 "nul character in pathname ignored");
268 else
269 path += c;
271 if (path.length() > 0) {
272 path += '\0';
273 if (!(*parser)(path.contents()))
274 failed = 1;
275 path.clear();
277 if (c == EOF)
278 break;
279 lineno++;
281 if (fp != stdin)
282 fclose(fp);
284 for (int i = optind; i < argc; i++)
285 if (!(*parser)(argv[i]))
286 failed = 1;
287 write_hash_table();
288 if (fclose(indxfp) < 0)
289 fatal("error closing temporary index file: %1", strerror(errno));
290 char *index_file = new char[strlen(basename) + sizeof(INDEX_SUFFIX)];
291 strcpy(index_file, basename);
292 strcat(index_file, INDEX_SUFFIX);
293 #ifdef HAVE_RENAME
294 if (rename(temp_index_file, index_file) < 0)
296 #ifdef __MSDOS__
297 // RENAME could fail on plain MSDOS filesystems because
298 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
299 char *fname = p ? index_file + (p - basename) : 0;
300 char *dot = 0;
302 // Replace the dot with an underscore and try again.
303 if (fname
304 && (dot = strchr(fname, '.')) != 0
305 && strcmp(dot, INDEX_SUFFIX) != 0)
306 *dot = '_';
307 if (rename(temp_index_file, index_file) < 0)
308 #endif
309 fatal("can't rename temporary index file: %1", strerror(errno));
311 #else /* not HAVE_RENAME */
312 ignore_fatal_signals();
313 if (unlink(index_file) < 0) {
314 if (errno != ENOENT)
315 fatal("can't unlink `%1': %2", index_file, strerror(errno));
317 if (link(temp_index_file, index_file) < 0)
318 fatal("can't link temporary index file: %1", strerror(errno));
319 if (unlink(temp_index_file) < 0)
320 fatal("can't unlink temporary index file: %1", strerror(errno));
321 #endif /* not HAVE_RENAME */
322 temp_index_file = 0;
323 return failed;
326 static void usage()
328 fprintf(stderr,
329 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
330 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
331 program_name);
332 exit(1);
335 static void check_integer_arg(char opt, const char *arg, int min, int *res)
337 char *ptr;
338 long n = strtol(arg, &ptr, 10);
339 if (n == 0 && ptr == arg)
340 error("argument to -%1 not an integer", opt);
341 else if (n < min)
342 error("argument to -%1 must not be less than %2", opt, min);
343 else {
344 if (n > INT_MAX)
345 error("argument to -%1 greater than maximum integer", opt);
346 else if (*ptr != '\0')
347 error("junk after integer argument to -%1", opt);
348 *res = int(n);
352 static char *get_cwd()
354 char *buf;
355 int size = 12;
357 for (;;) {
358 buf = new char[size];
359 if (getcwd(buf, size))
360 break;
361 if (errno != ERANGE)
362 fatal("cannot get current working directory: %1", strerror(errno));
363 a_delete buf;
364 if (size == INT_MAX)
365 fatal("current working directory longer than INT_MAX");
366 if (size > INT_MAX/2)
367 size = INT_MAX;
368 else
369 size *= 2;
371 return buf;
374 word_list::word_list(const char *s, int n, word_list *p)
375 : next(p), len(n)
377 str = new char[n];
378 memcpy(str, s, n);
381 static void read_common_words_file()
383 if (n_ignore_words <= 0)
384 return;
385 errno = 0;
386 FILE *fp = fopen(common_words_file, "r");
387 if (!fp)
388 fatal("can't open `%1': %2", common_words_file, strerror(errno));
389 common_words_table = new word_list * [hash_table_size];
390 for (int i = 0; i < hash_table_size; i++)
391 common_words_table[i] = 0;
392 int count = 0;
393 int key_len = 0;
394 for (;;) {
395 int c = getc(fp);
396 while (c != EOF && !csalnum(c))
397 c = getc(fp);
398 if (c == EOF)
399 break;
400 do {
401 if (key_len < truncate_len)
402 key_buffer[key_len++] = cmlower(c);
403 c = getc(fp);
404 } while (c != EOF && csalnum(c));
405 if (key_len >= shortest_len) {
406 int h = hash(key_buffer, key_len) % hash_table_size;
407 common_words_table[h] = new word_list(key_buffer, key_len,
408 common_words_table[h]);
410 if (++count >= n_ignore_words)
411 break;
412 key_len = 0;
413 if (c == EOF)
414 break;
416 n_ignore_words = count;
417 fclose(fp);
420 static int do_whole_file(const char *filename)
422 errno = 0;
423 FILE *fp = fopen(filename, "r");
424 if (!fp) {
425 error("can't open `%1': %2", filename, strerror(errno));
426 return 0;
428 int count = 0;
429 int key_len = 0;
430 int c;
431 while ((c = getc(fp)) != EOF) {
432 if (csalnum(c)) {
433 key_len = 1;
434 key_buffer[0] = c;
435 while ((c = getc(fp)) != EOF) {
436 if (!csalnum(c))
437 break;
438 if (key_len < truncate_len)
439 key_buffer[key_len++] = c;
441 if (store_key(key_buffer, key_len)) {
442 if (++count >= max_keys_per_item)
443 break;
445 if (c == EOF)
446 break;
449 store_reference(filenames.length(), 0, 0);
450 store_filename(filename);
451 fclose(fp);
452 return 1;
455 static int do_file(const char *filename)
457 errno = 0;
458 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
459 // byte counts to be consistent with fseek.
460 FILE *fp = fopen(filename, FOPEN_RB);
461 if (fp == 0) {
462 error("can't open `%1': %2", filename, strerror(errno));
463 return 0;
465 int filename_index = filenames.length();
466 store_filename(filename);
468 enum {
469 START, // at the start of the file; also in between references
470 BOL, // in the middle of a reference, at the beginning of the line
471 PERCENT, // seen a percent at the beginning of the line
472 IGNORE, // ignoring a field
473 IGNORE_BOL, // at the beginning of a line ignoring a field
474 KEY, // in the middle of a key
475 DISCARD, // after truncate_len bytes of a key
476 MIDDLE // in between keys
477 } state = START;
479 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
480 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
481 // MIDDLE space_count must be 0.
482 int space_count = 0;
483 int byte_count = 0; // bytes read
484 int key_len = 0;
485 int ref_start = -1; // position of start of current reference
486 for (;;) {
487 int c = getc(fp);
488 if (c == EOF)
489 break;
490 // We opened the file in binary mode, so we need to skip
491 // every CR character before a Newline.
492 if (c == '\r') {
493 int peek = getc(fp);
494 if (peek = '\n') {
495 byte_count++;
496 c = peek;
498 else
499 ungetc(peek, fp);
501 #if defined(__MSDOS__) || defined(_MSC_VER)
502 else if (c == 0x1a) // ^Z means EOF in text files
503 break;
504 #endif
505 byte_count++;
506 switch (state) {
507 case START:
508 if (c == ' ' || c == '\t') {
509 space_count++;
510 break;
512 if (c == '\n') {
513 space_count = 0;
514 break;
516 ref_start = byte_count - space_count - 1;
517 space_count = 0;
518 if (c == '%')
519 state = PERCENT;
520 else if (csalnum(c)) {
521 state = KEY;
522 key_buffer[0] = c;
523 key_len = 1;
525 else
526 state = MIDDLE;
527 break;
528 case BOL:
529 switch (c) {
530 case '%':
531 if (space_count > 0) {
532 space_count = 0;
533 state = MIDDLE;
535 else
536 state = PERCENT;
537 break;
538 case ' ':
539 case '\t':
540 space_count++;
541 break;
542 case '\n':
543 store_reference(filename_index, ref_start,
544 byte_count - 1 - space_count - ref_start);
545 state = START;
546 space_count = 0;
547 break;
548 default:
549 space_count = 0;
550 if (csalnum(c)) {
551 state = KEY;
552 key_buffer[0] = c;
553 key_len = 1;
555 else
556 state = MIDDLE;
558 break;
559 case PERCENT:
560 if (strchr(ignore_fields, c) != 0)
561 state = IGNORE;
562 else if (c == '\n')
563 state = BOL;
564 else
565 state = MIDDLE;
566 break;
567 case IGNORE:
568 if (c == '\n')
569 state = IGNORE_BOL;
570 break;
571 case IGNORE_BOL:
572 switch (c) {
573 case '%':
574 if (space_count > 0) {
575 state = IGNORE;
576 space_count = 0;
578 else
579 state = PERCENT;
580 break;
581 case ' ':
582 case '\t':
583 space_count++;
584 break;
585 case '\n':
586 store_reference(filename_index, ref_start,
587 byte_count - 1 - space_count - ref_start);
588 state = START;
589 space_count = 0;
590 break;
591 default:
592 space_count = 0;
593 state = IGNORE;
595 break;
596 case KEY:
597 if (csalnum(c)) {
598 if (key_len < truncate_len)
599 key_buffer[key_len++] = c;
600 else
601 state = DISCARD;
603 else {
604 possibly_store_key(key_buffer, key_len);
605 key_len = 0;
606 if (c == '\n')
607 state = BOL;
608 else
609 state = MIDDLE;
611 break;
612 case DISCARD:
613 if (!csalnum(c)) {
614 possibly_store_key(key_buffer, key_len);
615 key_len = 0;
616 if (c == '\n')
617 state = BOL;
618 else
619 state = MIDDLE;
621 break;
622 case MIDDLE:
623 if (csalnum(c)) {
624 state = KEY;
625 key_buffer[0] = c;
626 key_len = 1;
628 else if (c == '\n')
629 state = BOL;
630 break;
631 default:
632 assert(0);
635 switch (state) {
636 case START:
637 break;
638 case DISCARD:
639 case KEY:
640 possibly_store_key(key_buffer, key_len);
641 // fall through
642 case BOL:
643 case PERCENT:
644 case IGNORE_BOL:
645 case IGNORE:
646 case MIDDLE:
647 store_reference(filename_index, ref_start,
648 byte_count - ref_start - space_count);
649 break;
650 default:
651 assert(0);
653 fclose(fp);
654 return 1;
657 static void store_reference(int filename_index, int pos, int len)
659 tag t;
660 t.filename_index = filename_index;
661 t.start = pos;
662 t.length = len;
663 fwrite_or_die(&t, sizeof(t), 1, indxfp);
664 ntags++;
667 static void store_filename(const char *fn)
669 filenames += fn;
670 filenames += '\0';
673 static void init_hash_table()
675 hash_table = new table_entry[hash_table_size];
676 for (int i = 0; i < hash_table_size; i++)
677 hash_table[i].ptr = 0;
680 static void possibly_store_key(char *s, int len)
682 static int last_tagno = -1;
683 static int key_count;
684 if (last_tagno != ntags) {
685 last_tagno = ntags;
686 key_count = 0;
688 if (key_count < max_keys_per_item) {
689 if (store_key(s, len))
690 key_count++;
694 static int store_key(char *s, int len)
696 if (len < shortest_len)
697 return 0;
698 int is_number = 1;
699 for (int i = 0; i < len; i++)
700 if (!csdigit(s[i])) {
701 is_number = 0;
702 s[i] = cmlower(s[i]);
704 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
705 return 0;
706 int h = hash(s, len) % hash_table_size;
707 if (common_words_table) {
708 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
709 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
710 return 0;
712 table_entry *pp = hash_table + h;
713 if (!pp->ptr)
714 pp->ptr = new block;
715 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
716 return 1;
717 else if (pp->ptr->used >= BLOCK_SIZE)
718 pp->ptr = new block(pp->ptr);
719 pp->ptr->v[(pp->ptr->used)++] = ntags;
720 return 1;
723 static void write_hash_table()
725 const int minus_one = -1;
726 int li = 0;
727 for (int i = 0; i < hash_table_size; i++) {
728 block *ptr = hash_table[i].ptr;
729 if (!ptr)
730 hash_table[i].count = -1;
731 else {
732 hash_table[i].count = li;
733 block *rev = 0;
734 while (ptr) {
735 block *tem = ptr;
736 ptr = ptr->next;
737 tem->next = rev;
738 rev = tem;
740 while (rev) {
741 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
742 li += rev->used;
743 block *tem = rev;
744 rev = rev->next;
745 delete tem;
747 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
748 li += 1;
751 if (sizeof(table_entry) == sizeof(int))
752 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
753 else {
754 // write it out word by word
755 for (int i = 0; i < hash_table_size; i++)
756 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
758 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
759 if (fseek(indxfp, 0, 0) < 0)
760 fatal("error seeking on index file: %1", strerror(errno));
761 index_header h;
762 h.magic = INDEX_MAGIC;
763 h.version = INDEX_VERSION;
764 h.tags_size = ntags;
765 h.lists_size = li;
766 h.table_size = hash_table_size;
767 h.strings_size = filenames.length();
768 h.truncate = truncate_len;
769 h.shortest = shortest_len;
770 h.common = n_ignore_words;
771 fwrite_or_die(&h, sizeof(h), 1, indxfp);
774 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
776 if (fwrite(ptr, size, nitems, fp) != nitems)
777 fatal("fwrite failed: %1", strerror(errno));
780 void fatal_error_exit()
782 cleanup();
783 exit(3);
786 extern "C" {
788 void cleanup()
790 if (temp_index_file)
791 unlink(temp_index_file);