* src/libs/libgroff/glyphuni.cpp (glyph_to_unicode_list),
[s-roff.git] / src / utils / indxbib / indxbib.cpp
blobd8a53b6d26a7f286476ab7963816e26aeae0d060
1 // -*- C++ -*-
2 /* Copyright (C) 1989-1992, 2000, 2001, 2002, 2003, 2004
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING. If not, write to the Free Software
20 Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
22 #include "lib.h"
24 #include <stdlib.h>
25 #include <assert.h>
26 #include <errno.h>
28 #include "posix.h"
29 #include "errarg.h"
30 #include "error.h"
31 #include "stringclass.h"
32 #include "cset.h"
33 #include "cmap.h"
35 #include "defs.h"
36 #include "index.h"
38 #include "nonposix.h"
40 extern "C" const char *Version_string;
42 #define DEFAULT_HASH_TABLE_SIZE 997
43 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
45 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
47 #define MALLOC_OVERHEAD 16
49 #ifdef BLOCK_SIZE
50 #undef BLOCK_SIZE
51 #endif
53 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
54 - sizeof(int)) / sizeof(int));
55 struct block {
56 block *next;
57 int used;
58 int v[BLOCK_SIZE];
60 block(block *p = 0) : next(p), used(0) { }
63 struct block;
65 union table_entry {
66 block *ptr;
67 int count;
70 struct word_list {
71 word_list *next;
72 char *str;
73 int len;
74 word_list(const char *, int, word_list *);
77 table_entry *hash_table;
78 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
79 // We make this the same size as hash_table so we only have to do one
80 // mod per key.
81 static word_list **common_words_table = 0;
82 char *key_buffer;
84 FILE *indxfp;
85 int ntags = 0;
86 string filenames;
87 char *temp_index_file = 0;
89 const char *ignore_fields = "XYZ";
90 const char *common_words_file = COMMON_WORDS_FILE;
91 int n_ignore_words = 100;
92 int truncate_len = 6;
93 int shortest_len = 3;
94 int max_keys_per_item = 100;
96 static void usage(FILE *stream);
97 static void write_hash_table();
98 static void init_hash_table();
99 static void read_common_words_file();
100 static int store_key(char *s, int len);
101 static void possibly_store_key(char *s, int len);
102 static int do_whole_file(const char *filename);
103 static int do_file(const char *filename);
104 static void store_reference(int filename_index, int pos, int len);
105 static void check_integer_arg(char opt, const char *arg, int min, int *res);
106 static void store_filename(const char *);
107 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
108 static char *get_cwd();
110 extern "C" {
111 void cleanup();
112 void catch_fatal_signals();
113 void ignore_fatal_signals();
116 int main(int argc, char **argv)
118 program_name = argv[0];
119 static char stderr_buf[BUFSIZ];
120 setbuf(stderr, stderr_buf);
122 const char *base_name = 0;
123 typedef int (*parser_t)(const char *);
124 parser_t parser = do_file;
125 const char *directory = 0;
126 const char *foption = 0;
127 int opt;
128 static const struct option long_options[] = {
129 { "help", no_argument, 0, CHAR_MAX + 1 },
130 { "version", no_argument, 0, 'v' },
131 { NULL, 0, 0, 0 }
133 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
134 long_options, NULL))
135 != EOF)
136 switch (opt) {
137 case 'c':
138 common_words_file = optarg;
139 break;
140 case 'd':
141 directory = optarg;
142 break;
143 case 'f':
144 foption = optarg;
145 break;
146 case 'h':
147 check_integer_arg('h', optarg, 1, &hash_table_size);
148 if (!is_prime(hash_table_size)) {
149 while (!is_prime(++hash_table_size))
151 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
153 break;
154 case 'i':
155 ignore_fields = optarg;
156 break;
157 case 'k':
158 check_integer_arg('k', optarg, 1, &max_keys_per_item);
159 break;
160 case 'l':
161 check_integer_arg('l', optarg, 0, &shortest_len);
162 break;
163 case 'n':
164 check_integer_arg('n', optarg, 0, &n_ignore_words);
165 break;
166 case 'o':
167 base_name = optarg;
168 break;
169 case 't':
170 check_integer_arg('t', optarg, 1, &truncate_len);
171 break;
172 case 'w':
173 parser = do_whole_file;
174 break;
175 case 'v':
176 printf("GNU indxbib (groff) version %s\n", Version_string);
177 exit(0);
178 break;
179 case CHAR_MAX + 1: // --help
180 usage(stdout);
181 exit(0);
182 break;
183 case '?':
184 usage(stderr);
185 exit(1);
186 break;
187 default:
188 assert(0);
189 break;
191 if (optind >= argc && foption == 0)
192 fatal("no files and no -f option");
193 if (!directory) {
194 char *path = get_cwd();
195 store_filename(path);
196 a_delete path;
198 else
199 store_filename(directory);
200 init_hash_table();
201 store_filename(common_words_file);
202 store_filename(ignore_fields);
203 key_buffer = new char[truncate_len];
204 read_common_words_file();
205 if (!base_name)
206 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
207 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
208 const char *sep = &DIR_SEPS[1];
209 while (*sep) {
210 p1 = strrchr(base_name, *sep);
211 if (p1 && (!p || p1 > p))
212 p = p1;
213 sep++;
215 size_t name_max;
216 if (p) {
217 char *dir = strsave(base_name);
218 dir[p - base_name] = '\0';
219 name_max = file_name_max(dir);
220 a_delete dir;
222 else
223 name_max = file_name_max(".");
224 const char *filename = p ? p + 1 : base_name;
225 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
226 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
227 if (p) {
228 p++;
229 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
230 memcpy(temp_index_file, base_name, p - base_name);
231 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
233 else {
234 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
236 catch_fatal_signals();
237 int fd = mkstemp(temp_index_file);
238 if (fd < 0)
239 fatal("can't create temporary index file: %1", strerror(errno));
240 indxfp = fdopen(fd, FOPEN_WB);
241 if (indxfp == 0)
242 fatal("fdopen failed");
243 if (fseek(indxfp, sizeof(index_header), 0) < 0)
244 fatal("can't seek past index header: %1", strerror(errno));
245 int failed = 0;
246 if (foption) {
247 FILE *fp = stdin;
248 if (strcmp(foption, "-") != 0) {
249 errno = 0;
250 fp = fopen(foption, "r");
251 if (!fp)
252 fatal("can't open `%1': %2", foption, strerror(errno));
254 string path;
255 int lineno = 1;
256 for (;;) {
257 int c;
258 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
259 if (c == '\0')
260 error_with_file_and_line(foption, lineno,
261 "nul character in pathname ignored");
262 else
263 path += c;
265 if (path.length() > 0) {
266 path += '\0';
267 if (!(*parser)(path.contents()))
268 failed = 1;
269 path.clear();
271 if (c == EOF)
272 break;
273 lineno++;
275 if (fp != stdin)
276 fclose(fp);
278 for (int i = optind; i < argc; i++)
279 if (!(*parser)(argv[i]))
280 failed = 1;
281 write_hash_table();
282 if (fclose(indxfp) < 0)
283 fatal("error closing temporary index file: %1", strerror(errno));
284 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
285 strcpy(index_file, base_name);
286 strcat(index_file, INDEX_SUFFIX);
287 #ifdef HAVE_RENAME
288 #ifdef __EMX__
289 if (access(index_file, R_OK) == 0)
290 unlink(index_file);
291 #endif /* __EMX__ */
292 if (rename(temp_index_file, index_file) < 0) {
293 #ifdef __MSDOS__
294 // RENAME could fail on plain MSDOS filesystems because
295 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
296 char *fname = p ? index_file + (p - base_name) : 0;
297 char *dot = 0;
299 // Replace the dot with an underscore and try again.
300 if (fname
301 && (dot = strchr(fname, '.')) != 0
302 && strcmp(dot, INDEX_SUFFIX) != 0)
303 *dot = '_';
304 if (rename(temp_index_file, index_file) < 0)
305 #endif
306 fatal("can't rename temporary index file: %1", strerror(errno));
308 #else /* not HAVE_RENAME */
309 ignore_fatal_signals();
310 if (unlink(index_file) < 0) {
311 if (errno != ENOENT)
312 fatal("can't unlink `%1': %2", index_file, strerror(errno));
314 if (link(temp_index_file, index_file) < 0)
315 fatal("can't link temporary index file: %1", strerror(errno));
316 if (unlink(temp_index_file) < 0)
317 fatal("can't unlink temporary index file: %1", strerror(errno));
318 #endif /* not HAVE_RENAME */
319 temp_index_file = 0;
320 return failed;
323 static void usage(FILE *stream)
325 fprintf(stream,
326 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
327 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
328 program_name);
331 static void check_integer_arg(char opt, const char *arg, int min, int *res)
333 char *ptr;
334 long n = strtol(arg, &ptr, 10);
335 if (n == 0 && ptr == arg)
336 error("argument to -%1 not an integer", opt);
337 else if (n < min)
338 error("argument to -%1 must not be less than %2", opt, min);
339 else {
340 if (n > INT_MAX)
341 error("argument to -%1 greater than maximum integer", opt);
342 else if (*ptr != '\0')
343 error("junk after integer argument to -%1", opt);
344 *res = int(n);
348 static char *get_cwd()
350 char *buf;
351 int size = 12;
353 for (;;) {
354 buf = new char[size];
355 if (getcwd(buf, size))
356 break;
357 if (errno != ERANGE)
358 fatal("cannot get current working directory: %1", strerror(errno));
359 a_delete buf;
360 if (size == INT_MAX)
361 fatal("current working directory longer than INT_MAX");
362 if (size > INT_MAX/2)
363 size = INT_MAX;
364 else
365 size *= 2;
367 return buf;
370 word_list::word_list(const char *s, int n, word_list *p)
371 : next(p), len(n)
373 str = new char[n];
374 memcpy(str, s, n);
377 static void read_common_words_file()
379 if (n_ignore_words <= 0)
380 return;
381 errno = 0;
382 FILE *fp = fopen(common_words_file, "r");
383 if (!fp)
384 fatal("can't open `%1': %2", common_words_file, strerror(errno));
385 common_words_table = new word_list * [hash_table_size];
386 for (int i = 0; i < hash_table_size; i++)
387 common_words_table[i] = 0;
388 int count = 0;
389 int key_len = 0;
390 for (;;) {
391 int c = getc(fp);
392 while (c != EOF && !csalnum(c))
393 c = getc(fp);
394 if (c == EOF)
395 break;
396 do {
397 if (key_len < truncate_len)
398 key_buffer[key_len++] = cmlower(c);
399 c = getc(fp);
400 } while (c != EOF && csalnum(c));
401 if (key_len >= shortest_len) {
402 int h = hash(key_buffer, key_len) % hash_table_size;
403 common_words_table[h] = new word_list(key_buffer, key_len,
404 common_words_table[h]);
406 if (++count >= n_ignore_words)
407 break;
408 key_len = 0;
409 if (c == EOF)
410 break;
412 n_ignore_words = count;
413 fclose(fp);
416 static int do_whole_file(const char *filename)
418 errno = 0;
419 FILE *fp = fopen(filename, "r");
420 if (!fp) {
421 error("can't open `%1': %2", filename, strerror(errno));
422 return 0;
424 int count = 0;
425 int key_len = 0;
426 int c;
427 while ((c = getc(fp)) != EOF) {
428 if (csalnum(c)) {
429 key_len = 1;
430 key_buffer[0] = c;
431 while ((c = getc(fp)) != EOF) {
432 if (!csalnum(c))
433 break;
434 if (key_len < truncate_len)
435 key_buffer[key_len++] = c;
437 if (store_key(key_buffer, key_len)) {
438 if (++count >= max_keys_per_item)
439 break;
441 if (c == EOF)
442 break;
445 store_reference(filenames.length(), 0, 0);
446 store_filename(filename);
447 fclose(fp);
448 return 1;
451 static int do_file(const char *filename)
453 errno = 0;
454 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
455 // byte counts to be consistent with fseek.
456 FILE *fp = fopen(filename, FOPEN_RB);
457 if (fp == 0) {
458 error("can't open `%1': %2", filename, strerror(errno));
459 return 0;
461 int filename_index = filenames.length();
462 store_filename(filename);
464 enum {
465 START, // at the start of the file; also in between references
466 BOL, // in the middle of a reference, at the beginning of the line
467 PERCENT, // seen a percent at the beginning of the line
468 IGNORE, // ignoring a field
469 IGNORE_BOL, // at the beginning of a line ignoring a field
470 KEY, // in the middle of a key
471 DISCARD, // after truncate_len bytes of a key
472 MIDDLE // in between keys
473 } state = START;
475 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
476 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
477 // MIDDLE space_count must be 0.
478 int space_count = 0;
479 int byte_count = 0; // bytes read
480 int key_len = 0;
481 int ref_start = -1; // position of start of current reference
482 for (;;) {
483 int c = getc(fp);
484 if (c == EOF)
485 break;
486 // We opened the file in binary mode, so we need to skip
487 // every CR character before a Newline.
488 if (c == '\r') {
489 int peek = getc(fp);
490 if (peek == '\n') {
491 byte_count++;
492 c = peek;
494 else
495 ungetc(peek, fp);
497 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
498 else if (c == 0x1a) // ^Z means EOF in text files
499 break;
500 #endif
501 byte_count++;
502 switch (state) {
503 case START:
504 if (c == ' ' || c == '\t') {
505 space_count++;
506 break;
508 if (c == '\n') {
509 space_count = 0;
510 break;
512 ref_start = byte_count - space_count - 1;
513 space_count = 0;
514 if (c == '%')
515 state = PERCENT;
516 else if (csalnum(c)) {
517 state = KEY;
518 key_buffer[0] = c;
519 key_len = 1;
521 else
522 state = MIDDLE;
523 break;
524 case BOL:
525 switch (c) {
526 case '%':
527 if (space_count > 0) {
528 space_count = 0;
529 state = MIDDLE;
531 else
532 state = PERCENT;
533 break;
534 case ' ':
535 case '\t':
536 space_count++;
537 break;
538 case '\n':
539 store_reference(filename_index, ref_start,
540 byte_count - 1 - space_count - ref_start);
541 state = START;
542 space_count = 0;
543 break;
544 default:
545 space_count = 0;
546 if (csalnum(c)) {
547 state = KEY;
548 key_buffer[0] = c;
549 key_len = 1;
551 else
552 state = MIDDLE;
554 break;
555 case PERCENT:
556 if (strchr(ignore_fields, c) != 0)
557 state = IGNORE;
558 else if (c == '\n')
559 state = BOL;
560 else
561 state = MIDDLE;
562 break;
563 case IGNORE:
564 if (c == '\n')
565 state = IGNORE_BOL;
566 break;
567 case IGNORE_BOL:
568 switch (c) {
569 case '%':
570 if (space_count > 0) {
571 state = IGNORE;
572 space_count = 0;
574 else
575 state = PERCENT;
576 break;
577 case ' ':
578 case '\t':
579 space_count++;
580 break;
581 case '\n':
582 store_reference(filename_index, ref_start,
583 byte_count - 1 - space_count - ref_start);
584 state = START;
585 space_count = 0;
586 break;
587 default:
588 space_count = 0;
589 state = IGNORE;
591 break;
592 case KEY:
593 if (csalnum(c)) {
594 if (key_len < truncate_len)
595 key_buffer[key_len++] = c;
596 else
597 state = DISCARD;
599 else {
600 possibly_store_key(key_buffer, key_len);
601 key_len = 0;
602 if (c == '\n')
603 state = BOL;
604 else
605 state = MIDDLE;
607 break;
608 case DISCARD:
609 if (!csalnum(c)) {
610 possibly_store_key(key_buffer, key_len);
611 key_len = 0;
612 if (c == '\n')
613 state = BOL;
614 else
615 state = MIDDLE;
617 break;
618 case MIDDLE:
619 if (csalnum(c)) {
620 state = KEY;
621 key_buffer[0] = c;
622 key_len = 1;
624 else if (c == '\n')
625 state = BOL;
626 break;
627 default:
628 assert(0);
631 switch (state) {
632 case START:
633 break;
634 case DISCARD:
635 case KEY:
636 possibly_store_key(key_buffer, key_len);
637 // fall through
638 case BOL:
639 case PERCENT:
640 case IGNORE_BOL:
641 case IGNORE:
642 case MIDDLE:
643 store_reference(filename_index, ref_start,
644 byte_count - ref_start - space_count);
645 break;
646 default:
647 assert(0);
649 fclose(fp);
650 return 1;
653 static void store_reference(int filename_index, int pos, int len)
655 tag t;
656 t.filename_index = filename_index;
657 t.start = pos;
658 t.length = len;
659 fwrite_or_die(&t, sizeof(t), 1, indxfp);
660 ntags++;
663 static void store_filename(const char *fn)
665 filenames += fn;
666 filenames += '\0';
669 static void init_hash_table()
671 hash_table = new table_entry[hash_table_size];
672 for (int i = 0; i < hash_table_size; i++)
673 hash_table[i].ptr = 0;
676 static void possibly_store_key(char *s, int len)
678 static int last_tagno = -1;
679 static int key_count;
680 if (last_tagno != ntags) {
681 last_tagno = ntags;
682 key_count = 0;
684 if (key_count < max_keys_per_item) {
685 if (store_key(s, len))
686 key_count++;
690 static int store_key(char *s, int len)
692 if (len < shortest_len)
693 return 0;
694 int is_number = 1;
695 for (int i = 0; i < len; i++)
696 if (!csdigit(s[i])) {
697 is_number = 0;
698 s[i] = cmlower(s[i]);
700 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
701 return 0;
702 int h = hash(s, len) % hash_table_size;
703 if (common_words_table) {
704 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
705 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
706 return 0;
708 table_entry *pp = hash_table + h;
709 if (!pp->ptr)
710 pp->ptr = new block;
711 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
712 return 1;
713 else if (pp->ptr->used >= BLOCK_SIZE)
714 pp->ptr = new block(pp->ptr);
715 pp->ptr->v[(pp->ptr->used)++] = ntags;
716 return 1;
719 static void write_hash_table()
721 const int minus_one = -1;
722 int li = 0;
723 for (int i = 0; i < hash_table_size; i++) {
724 block *ptr = hash_table[i].ptr;
725 if (!ptr)
726 hash_table[i].count = -1;
727 else {
728 hash_table[i].count = li;
729 block *rev = 0;
730 while (ptr) {
731 block *tem = ptr;
732 ptr = ptr->next;
733 tem->next = rev;
734 rev = tem;
736 while (rev) {
737 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
738 li += rev->used;
739 block *tem = rev;
740 rev = rev->next;
741 delete tem;
743 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
744 li += 1;
747 if (sizeof(table_entry) == sizeof(int))
748 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
749 else {
750 // write it out word by word
751 for (int i = 0; i < hash_table_size; i++)
752 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
754 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
755 if (fseek(indxfp, 0, 0) < 0)
756 fatal("error seeking on index file: %1", strerror(errno));
757 index_header h;
758 h.magic = INDEX_MAGIC;
759 h.version = INDEX_VERSION;
760 h.tags_size = ntags;
761 h.lists_size = li;
762 h.table_size = hash_table_size;
763 h.strings_size = filenames.length();
764 h.truncate = truncate_len;
765 h.shortest = shortest_len;
766 h.common = n_ignore_words;
767 fwrite_or_die(&h, sizeof(h), 1, indxfp);
770 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
772 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
773 fatal("fwrite failed: %1", strerror(errno));
776 void fatal_error_exit()
778 cleanup();
779 exit(3);
782 extern "C" {
784 void cleanup()
786 if (temp_index_file)
787 unlink(temp_index_file);