* src/utils/indxbib/indxbib.cc (main): Change type of `name_max'
[s-roff.git] / src / utils / indxbib / indxbib.cc
blob4b2477f74bf99806a982ed3f713be421e34e1244
// -*- C++ -*-
/* Copyright (C) 1989-1992, 2000, 2001 Free Software Foundation, Inc.
     Written by James Clark (jjc@jclark.com)

This file is part of groff.

groff is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 2, or (at your option) any later
version.

groff is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License along
with groff; see the file COPYING.  If not, write to the Free Software
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <assert.h>
25 #include <errno.h>
27 #include "posix.h"
28 #include "lib.h"
29 #include "errarg.h"
30 #include "error.h"
31 #include "stringclass.h"
32 #include "cset.h"
33 #include "cmap.h"
35 #include "defs.h"
36 #include "index.h"
38 #include "nonposix.h"
// Version text printed by the -v option; defined elsewhere with C linkage.
extern "C" const char *Version_string;

// Declare mkstemp() ourselves unless HAVE_MKSTEMP_PROTO is defined
// (presumably set by configure when the system headers already
// provide a prototype).
#ifndef HAVE_MKSTEMP_PROTO
extern "C" int mkstemp(char *);
#endif
// Default number of hash buckets; can be overridden with -h.
#define DEFAULT_HASH_TABLE_SIZE 997
// mktemp()/mkstemp() template for the temporary index file.
#define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"

// (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().

#define MALLOC_OVERHEAD 16

// Some system headers define BLOCK_SIZE as a macro; get rid of it so
// that the name can be used for our own constant.
#ifdef BLOCK_SIZE
#undef BLOCK_SIZE
#endif

// Number of ints stored in one `block'.  Chosen so that the two
// bookkeeping members (a pointer and an int), the payload and
// MALLOC_OVERHEAD together add up to 1024 bytes.
const int BLOCK_SIZE =
  (1024 - MALLOC_OVERHEAD - sizeof(struct block *) - sizeof(int))
  / sizeof(int);
61 struct block {
62 block *next;
63 int used;
64 int v[BLOCK_SIZE];
66 block(block *p = 0) : next(p), used(0) { }
69 struct block;
71 union table_entry {
72 block *ptr;
73 int count;
// Singly linked list node holding one common word.  `str' is a
// private copy of the word and is NOT NUL-terminated; its size is
// given by `len'.
struct word_list {
  word_list *next;
  char *str;
  int len;

  // Copy the first N bytes of S and link the new node in front of P.
  word_list(const char *s, int n, word_list *p);
};
// Hash table mapping keys to lists of tag numbers; allocated by
// init_hash_table() with hash_table_size slots.
table_entry *hash_table;
// Number of slots in hash_table (option -h).
int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
// We make this the same size as hash_table so we only have to do one
// mod per key.
static word_list **common_words_table = 0;
// Scratch buffer in which keys are assembled; main() allocates it with
// room for truncate_len characters.
char *key_buffer;

// Stream open on the temporary index file.
FILE *indxfp;
// Number of tags written so far; incremented by store_reference().
int ntags = 0;
// Concatenation of NUL-terminated strings appended by store_filename().
string filenames;
// Name of the temporary index file; cleanup() unlinks it when non-null.
char *temp_index_file = 0;

// Settings that can be changed from the command line.
const char *ignore_fields = "XYZ";                      // -i
const char *common_words_file = COMMON_WORDS_FILE;      // -c
int n_ignore_words = 100;                               // -n
int truncate_len = 6;                                   // -t
int shortest_len = 3;                                   // -l
int max_keys_per_item = 100;                            // -k
102 static void usage(FILE *stream);
103 static void write_hash_table();
104 static void init_hash_table();
105 static void read_common_words_file();
106 static int store_key(char *s, int len);
107 static void possibly_store_key(char *s, int len);
108 static int do_whole_file(const char *filename);
109 static int do_file(const char *filename);
110 static void store_reference(int filename_index, int pos, int len);
111 static void check_integer_arg(char opt, const char *arg, int min, int *res);
112 static void store_filename(const char *);
113 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
114 static char *get_cwd();
116 extern "C" {
117 void cleanup();
118 void catch_fatal_signals();
119 void ignore_fatal_signals();
122 int main(int argc, char **argv)
124 program_name = argv[0];
125 static char stderr_buf[BUFSIZ];
126 setbuf(stderr, stderr_buf);
128 const char *basename = 0;
129 typedef int (*parser_t)(const char *);
130 parser_t parser = do_file;
131 const char *directory = 0;
132 const char *foption = 0;
133 int opt;
134 static const struct option long_options[] = {
135 { "help", no_argument, 0, CHAR_MAX + 1 },
136 { "version", no_argument, 0, 'v' },
137 { NULL, 0, 0, 0 }
139 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
140 long_options, NULL))
141 != EOF)
142 switch (opt) {
143 case 'c':
144 common_words_file = optarg;
145 break;
146 case 'd':
147 directory = optarg;
148 break;
149 case 'f':
150 foption = optarg;
151 break;
152 case 'h':
153 check_integer_arg('h', optarg, 1, &hash_table_size);
154 if (!is_prime(hash_table_size)) {
155 while (!is_prime(++hash_table_size))
157 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
159 break;
160 case 'i':
161 ignore_fields = optarg;
162 break;
163 case 'k':
164 check_integer_arg('k', optarg, 1, &max_keys_per_item);
165 break;
166 case 'l':
167 check_integer_arg('l', optarg, 0, &shortest_len);
168 break;
169 case 'n':
170 check_integer_arg('n', optarg, 0, &n_ignore_words);
171 break;
172 case 'o':
173 basename = optarg;
174 break;
175 case 't':
176 check_integer_arg('t', optarg, 1, &truncate_len);
177 break;
178 case 'w':
179 parser = do_whole_file;
180 break;
181 case 'v':
182 printf("GNU indxbib (groff) version %s\n", Version_string);
183 exit(0);
184 break;
185 case CHAR_MAX + 1: // --help
186 usage(stdout);
187 exit(0);
188 break;
189 case '?':
190 usage(stderr);
191 exit(1);
192 break;
193 default:
194 assert(0);
195 break;
197 if (optind >= argc && foption == 0)
198 fatal("no files and no -f option");
199 if (!directory) {
200 char *path = get_cwd();
201 store_filename(path);
202 a_delete path;
204 else
205 store_filename(directory);
206 init_hash_table();
207 store_filename(common_words_file);
208 store_filename(ignore_fields);
209 key_buffer = new char[truncate_len];
210 read_common_words_file();
211 if (!basename)
212 basename = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
213 const char *p = strrchr(basename, DIR_SEPS[0]), *p1;
214 const char *sep = &DIR_SEPS[1];
215 while (*sep) {
216 p1 = strrchr(basename, *sep);
217 if (p1 && (!p || p1 > p))
218 p = p1;
219 sep++;
221 size_t name_max;
222 if (p) {
223 char *dir = strsave(basename);
224 dir[p - basename] = '\0';
225 name_max = file_name_max(dir);
226 a_delete dir;
228 else
229 name_max = file_name_max(".");
230 const char *filename = p ? p + 1 : basename;
231 if (name_max >= 0 &&
232 long(strlen(filename) + sizeof(INDEX_SUFFIX) - 1) > name_max)
233 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
234 if (p) {
235 p++;
236 temp_index_file = new char[p - basename + sizeof(TEMP_INDEX_TEMPLATE)];
237 memcpy(temp_index_file, basename, p - basename);
238 strcpy(temp_index_file + (p - basename), TEMP_INDEX_TEMPLATE);
240 else {
241 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
243 #ifndef HAVE_MKSTEMP
244 if (!mktemp(temp_index_file) || !temp_index_file[0])
245 fatal("cannot create file name for temporary file");
246 #endif
247 catch_fatal_signals();
248 #ifdef HAVE_MKSTEMP
249 int fd = mkstemp(temp_index_file);
250 #else
251 int fd = creat(temp_index_file, S_IRUSR|S_IRGRP|S_IROTH);
252 #endif
253 if (fd < 0)
254 fatal("can't create temporary index file: %1", strerror(errno));
255 indxfp = fdopen(fd, FOPEN_WB);
256 if (indxfp == 0)
257 fatal("fdopen failed");
258 if (fseek(indxfp, sizeof(index_header), 0) < 0)
259 fatal("can't seek past index header: %1", strerror(errno));
260 int failed = 0;
261 if (foption) {
262 FILE *fp = stdin;
263 if (strcmp(foption, "-") != 0) {
264 errno = 0;
265 fp = fopen(foption, "r");
266 if (!fp)
267 fatal("can't open `%1': %2", foption, strerror(errno));
269 string path;
270 int lineno = 1;
271 for (;;) {
272 int c;
273 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
274 if (c == '\0')
275 error_with_file_and_line(foption, lineno,
276 "nul character in pathname ignored");
277 else
278 path += c;
280 if (path.length() > 0) {
281 path += '\0';
282 if (!(*parser)(path.contents()))
283 failed = 1;
284 path.clear();
286 if (c == EOF)
287 break;
288 lineno++;
290 if (fp != stdin)
291 fclose(fp);
293 for (int i = optind; i < argc; i++)
294 if (!(*parser)(argv[i]))
295 failed = 1;
296 write_hash_table();
297 if (fclose(indxfp) < 0)
298 fatal("error closing temporary index file: %1", strerror(errno));
299 char *index_file = new char[strlen(basename) + sizeof(INDEX_SUFFIX)];
300 strcpy(index_file, basename);
301 strcat(index_file, INDEX_SUFFIX);
302 #ifdef HAVE_RENAME
303 if (rename(temp_index_file, index_file) < 0) {
304 #ifdef __MSDOS__
305 // RENAME could fail on plain MSDOS filesystems because
306 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
307 char *fname = p ? index_file + (p - basename) : 0;
308 char *dot = 0;
310 // Replace the dot with an underscore and try again.
311 if (fname
312 && (dot = strchr(fname, '.')) != 0
313 && strcmp(dot, INDEX_SUFFIX) != 0)
314 *dot = '_';
315 if (rename(temp_index_file, index_file) < 0)
316 #endif
317 fatal("can't rename temporary index file: %1", strerror(errno));
319 #else /* not HAVE_RENAME */
320 ignore_fatal_signals();
321 if (unlink(index_file) < 0) {
322 if (errno != ENOENT)
323 fatal("can't unlink `%1': %2", index_file, strerror(errno));
325 if (link(temp_index_file, index_file) < 0)
326 fatal("can't link temporary index file: %1", strerror(errno));
327 if (unlink(temp_index_file) < 0)
328 fatal("can't unlink temporary index file: %1", strerror(errno));
329 #endif /* not HAVE_RENAME */
330 temp_index_file = 0;
331 return failed;
334 static void usage(FILE *stream)
336 fprintf(stream,
337 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
338 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
339 program_name);
342 static void check_integer_arg(char opt, const char *arg, int min, int *res)
344 char *ptr;
345 long n = strtol(arg, &ptr, 10);
346 if (n == 0 && ptr == arg)
347 error("argument to -%1 not an integer", opt);
348 else if (n < min)
349 error("argument to -%1 must not be less than %2", opt, min);
350 else {
351 if (n > INT_MAX)
352 error("argument to -%1 greater than maximum integer", opt);
353 else if (*ptr != '\0')
354 error("junk after integer argument to -%1", opt);
355 *res = int(n);
359 static char *get_cwd()
361 char *buf;
362 int size = 12;
364 for (;;) {
365 buf = new char[size];
366 if (getcwd(buf, size))
367 break;
368 if (errno != ERANGE)
369 fatal("cannot get current working directory: %1", strerror(errno));
370 a_delete buf;
371 if (size == INT_MAX)
372 fatal("current working directory longer than INT_MAX");
373 if (size > INT_MAX/2)
374 size = INT_MAX;
375 else
376 size *= 2;
378 return buf;
381 word_list::word_list(const char *s, int n, word_list *p)
382 : next(p), len(n)
384 str = new char[n];
385 memcpy(str, s, n);
388 static void read_common_words_file()
390 if (n_ignore_words <= 0)
391 return;
392 errno = 0;
393 FILE *fp = fopen(common_words_file, "r");
394 if (!fp)
395 fatal("can't open `%1': %2", common_words_file, strerror(errno));
396 common_words_table = new word_list * [hash_table_size];
397 for (int i = 0; i < hash_table_size; i++)
398 common_words_table[i] = 0;
399 int count = 0;
400 int key_len = 0;
401 for (;;) {
402 int c = getc(fp);
403 while (c != EOF && !csalnum(c))
404 c = getc(fp);
405 if (c == EOF)
406 break;
407 do {
408 if (key_len < truncate_len)
409 key_buffer[key_len++] = cmlower(c);
410 c = getc(fp);
411 } while (c != EOF && csalnum(c));
412 if (key_len >= shortest_len) {
413 int h = hash(key_buffer, key_len) % hash_table_size;
414 common_words_table[h] = new word_list(key_buffer, key_len,
415 common_words_table[h]);
417 if (++count >= n_ignore_words)
418 break;
419 key_len = 0;
420 if (c == EOF)
421 break;
423 n_ignore_words = count;
424 fclose(fp);
427 static int do_whole_file(const char *filename)
429 errno = 0;
430 FILE *fp = fopen(filename, "r");
431 if (!fp) {
432 error("can't open `%1': %2", filename, strerror(errno));
433 return 0;
435 int count = 0;
436 int key_len = 0;
437 int c;
438 while ((c = getc(fp)) != EOF) {
439 if (csalnum(c)) {
440 key_len = 1;
441 key_buffer[0] = c;
442 while ((c = getc(fp)) != EOF) {
443 if (!csalnum(c))
444 break;
445 if (key_len < truncate_len)
446 key_buffer[key_len++] = c;
448 if (store_key(key_buffer, key_len)) {
449 if (++count >= max_keys_per_item)
450 break;
452 if (c == EOF)
453 break;
456 store_reference(filenames.length(), 0, 0);
457 store_filename(filename);
458 fclose(fp);
459 return 1;
462 static int do_file(const char *filename)
464 errno = 0;
465 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
466 // byte counts to be consistent with fseek.
467 FILE *fp = fopen(filename, FOPEN_RB);
468 if (fp == 0) {
469 error("can't open `%1': %2", filename, strerror(errno));
470 return 0;
472 int filename_index = filenames.length();
473 store_filename(filename);
475 enum {
476 START, // at the start of the file; also in between references
477 BOL, // in the middle of a reference, at the beginning of the line
478 PERCENT, // seen a percent at the beginning of the line
479 IGNORE, // ignoring a field
480 IGNORE_BOL, // at the beginning of a line ignoring a field
481 KEY, // in the middle of a key
482 DISCARD, // after truncate_len bytes of a key
483 MIDDLE // in between keys
484 } state = START;
486 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
487 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
488 // MIDDLE space_count must be 0.
489 int space_count = 0;
490 int byte_count = 0; // bytes read
491 int key_len = 0;
492 int ref_start = -1; // position of start of current reference
493 for (;;) {
494 int c = getc(fp);
495 if (c == EOF)
496 break;
497 // We opened the file in binary mode, so we need to skip
498 // every CR character before a Newline.
499 if (c == '\r') {
500 int peek = getc(fp);
501 if (peek == '\n') {
502 byte_count++;
503 c = peek;
505 else
506 ungetc(peek, fp);
508 #if defined(__MSDOS__) || defined(_MSC_VER)
509 else if (c == 0x1a) // ^Z means EOF in text files
510 break;
511 #endif
512 byte_count++;
513 switch (state) {
514 case START:
515 if (c == ' ' || c == '\t') {
516 space_count++;
517 break;
519 if (c == '\n') {
520 space_count = 0;
521 break;
523 ref_start = byte_count - space_count - 1;
524 space_count = 0;
525 if (c == '%')
526 state = PERCENT;
527 else if (csalnum(c)) {
528 state = KEY;
529 key_buffer[0] = c;
530 key_len = 1;
532 else
533 state = MIDDLE;
534 break;
535 case BOL:
536 switch (c) {
537 case '%':
538 if (space_count > 0) {
539 space_count = 0;
540 state = MIDDLE;
542 else
543 state = PERCENT;
544 break;
545 case ' ':
546 case '\t':
547 space_count++;
548 break;
549 case '\n':
550 store_reference(filename_index, ref_start,
551 byte_count - 1 - space_count - ref_start);
552 state = START;
553 space_count = 0;
554 break;
555 default:
556 space_count = 0;
557 if (csalnum(c)) {
558 state = KEY;
559 key_buffer[0] = c;
560 key_len = 1;
562 else
563 state = MIDDLE;
565 break;
566 case PERCENT:
567 if (strchr(ignore_fields, c) != 0)
568 state = IGNORE;
569 else if (c == '\n')
570 state = BOL;
571 else
572 state = MIDDLE;
573 break;
574 case IGNORE:
575 if (c == '\n')
576 state = IGNORE_BOL;
577 break;
578 case IGNORE_BOL:
579 switch (c) {
580 case '%':
581 if (space_count > 0) {
582 state = IGNORE;
583 space_count = 0;
585 else
586 state = PERCENT;
587 break;
588 case ' ':
589 case '\t':
590 space_count++;
591 break;
592 case '\n':
593 store_reference(filename_index, ref_start,
594 byte_count - 1 - space_count - ref_start);
595 state = START;
596 space_count = 0;
597 break;
598 default:
599 space_count = 0;
600 state = IGNORE;
602 break;
603 case KEY:
604 if (csalnum(c)) {
605 if (key_len < truncate_len)
606 key_buffer[key_len++] = c;
607 else
608 state = DISCARD;
610 else {
611 possibly_store_key(key_buffer, key_len);
612 key_len = 0;
613 if (c == '\n')
614 state = BOL;
615 else
616 state = MIDDLE;
618 break;
619 case DISCARD:
620 if (!csalnum(c)) {
621 possibly_store_key(key_buffer, key_len);
622 key_len = 0;
623 if (c == '\n')
624 state = BOL;
625 else
626 state = MIDDLE;
628 break;
629 case MIDDLE:
630 if (csalnum(c)) {
631 state = KEY;
632 key_buffer[0] = c;
633 key_len = 1;
635 else if (c == '\n')
636 state = BOL;
637 break;
638 default:
639 assert(0);
642 switch (state) {
643 case START:
644 break;
645 case DISCARD:
646 case KEY:
647 possibly_store_key(key_buffer, key_len);
648 // fall through
649 case BOL:
650 case PERCENT:
651 case IGNORE_BOL:
652 case IGNORE:
653 case MIDDLE:
654 store_reference(filename_index, ref_start,
655 byte_count - ref_start - space_count);
656 break;
657 default:
658 assert(0);
660 fclose(fp);
661 return 1;
664 static void store_reference(int filename_index, int pos, int len)
666 tag t;
667 t.filename_index = filename_index;
668 t.start = pos;
669 t.length = len;
670 fwrite_or_die(&t, sizeof(t), 1, indxfp);
671 ntags++;
674 static void store_filename(const char *fn)
676 filenames += fn;
677 filenames += '\0';
680 static void init_hash_table()
682 hash_table = new table_entry[hash_table_size];
683 for (int i = 0; i < hash_table_size; i++)
684 hash_table[i].ptr = 0;
687 static void possibly_store_key(char *s, int len)
689 static int last_tagno = -1;
690 static int key_count;
691 if (last_tagno != ntags) {
692 last_tagno = ntags;
693 key_count = 0;
695 if (key_count < max_keys_per_item) {
696 if (store_key(s, len))
697 key_count++;
701 static int store_key(char *s, int len)
703 if (len < shortest_len)
704 return 0;
705 int is_number = 1;
706 for (int i = 0; i < len; i++)
707 if (!csdigit(s[i])) {
708 is_number = 0;
709 s[i] = cmlower(s[i]);
711 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
712 return 0;
713 int h = hash(s, len) % hash_table_size;
714 if (common_words_table) {
715 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
716 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
717 return 0;
719 table_entry *pp = hash_table + h;
720 if (!pp->ptr)
721 pp->ptr = new block;
722 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
723 return 1;
724 else if (pp->ptr->used >= BLOCK_SIZE)
725 pp->ptr = new block(pp->ptr);
726 pp->ptr->v[(pp->ptr->used)++] = ntags;
727 return 1;
730 static void write_hash_table()
732 const int minus_one = -1;
733 int li = 0;
734 for (int i = 0; i < hash_table_size; i++) {
735 block *ptr = hash_table[i].ptr;
736 if (!ptr)
737 hash_table[i].count = -1;
738 else {
739 hash_table[i].count = li;
740 block *rev = 0;
741 while (ptr) {
742 block *tem = ptr;
743 ptr = ptr->next;
744 tem->next = rev;
745 rev = tem;
747 while (rev) {
748 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
749 li += rev->used;
750 block *tem = rev;
751 rev = rev->next;
752 delete tem;
754 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
755 li += 1;
758 if (sizeof(table_entry) == sizeof(int))
759 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
760 else {
761 // write it out word by word
762 for (int i = 0; i < hash_table_size; i++)
763 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
765 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
766 if (fseek(indxfp, 0, 0) < 0)
767 fatal("error seeking on index file: %1", strerror(errno));
768 index_header h;
769 h.magic = INDEX_MAGIC;
770 h.version = INDEX_VERSION;
771 h.tags_size = ntags;
772 h.lists_size = li;
773 h.table_size = hash_table_size;
774 h.strings_size = filenames.length();
775 h.truncate = truncate_len;
776 h.shortest = shortest_len;
777 h.common = n_ignore_words;
778 fwrite_or_die(&h, sizeof(h), 1, indxfp);
781 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
783 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
784 fatal("fwrite failed: %1", strerror(errno));
787 void fatal_error_exit()
789 cleanup();
790 exit(3);
793 extern "C" {
795 void cleanup()
797 if (temp_index_file)
798 unlink(temp_index_file);