Sync-to-go: update copyright for 2015
[s-roff.git] / src / ute-indxbib / indxbib.cpp
blobbe07b24d65e3385cd3004e30eca55178b0b1a972
1 /*@
2 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
4 * Copyright (C) 1989 - 1992, 2000 - 2004
5 * Free Software Foundation, Inc.
6 * Written by James Clark (jjc@jclark.com)
8 * This is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
11 * version.
13 * This is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 * for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
23 #include "config.h"
24 #include "indxbib-config.h"
26 #include <sys/types.h>
28 #include <assert.h>
29 #include <errno.h>
30 #include <signal.h>
31 #include <stdlib.h>
32 #include <unistd.h>
34 #include "cmap.h"
35 #include "cset.h"
36 #include "defs.h"
37 #include "errarg.h"
38 #include "error.h"
39 #include "lib.h"
40 #include "nonposix.h"
41 #include "posix.h"
42 #include "stringclass.h"
44 #include "index.h"
46 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc(). TODO
47 #define MALLOC_OVERHEAD 16
49 #ifdef BLOCK_SIZE
50 # undef BLOCK_SIZE
51 #endif
52 const int BLOCK_SIZE = ((1024 - MALLOC_OVERHEAD - sizeof(struct block *)
53 - sizeof(int)) / sizeof(int));
54 class block
56 public:
57 block *next;
58 int used;
59 int v[BLOCK_SIZE];
61 block(block *p = 0) : next(p), used(0) { }
64 union table_entry {
65 block *ptr;
66 int count;
69 class word_list
71 public:
72 word_list *next;
73 char *str;
74 int len;
76 word_list(const char *, int, word_list *);
79 table_entry *hash_table;
80 int hash_table_size = DEFAULT_HASH_TABLE_SIZE;
81 // We make this the same size as hash_table so we only have to do one
82 // mod per key.
83 static word_list **common_words_table = 0;
84 char *key_buffer;
86 FILE *indxfp;
87 int ntags = 0;
88 string filenames;
89 char *temp_index_file = 0;
91 const char *ignore_fields = "XYZ";
92 const char *common_words_file = COMMON_WORDS_FILE;
93 int n_ignore_words = 100;
94 int truncate_len = 6;
95 int shortest_len = 3;
96 int max_keys_per_item = 100;
98 static void usage(FILE *stream);
99 static void write_hash_table();
100 static void init_hash_table();
101 static void read_common_words_file();
102 static int store_key(char *s, int len);
103 static void possibly_store_key(char *s, int len);
104 static int do_whole_file(const char *filename);
105 static int do_file(const char *filename);
106 static void store_reference(int filename_index, int pos, int len);
107 static void check_integer_arg(char opt, const char *arg, int min, int *res);
108 static void store_filename(const char *);
109 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp);
110 static char *get_cwd();
112 static void _cleanup(void);
113 static void _handle_fatal_signal(int signum)
114 static void _catch_fatal_signals(void);
115 #ifndef HAVE_RENAME
116 static void _ignore_fatal_signals(void);
117 #endif
119 int main(int argc, char **argv)
121 program_name = argv[0];
122 static char stderr_buf[BUFSIZ];
123 setbuf(stderr, stderr_buf);
125 const char *base_name = 0;
126 typedef int (*parser_t)(const char *);
127 parser_t parser = do_file;
128 const char *directory = 0;
129 const char *foption = 0;
130 int opt;
131 static const struct option long_options[] = {
132 { "help", no_argument, 0, CHAR_MAX + 1 },
133 { "version", no_argument, 0, 'v' },
134 { NULL, 0, 0, 0 }
136 while ((opt = getopt_long(argc, argv, "c:o:h:i:k:l:t:n:c:d:f:vw",
137 long_options, NULL))
138 != EOF)
139 switch (opt) {
140 case 'c':
141 common_words_file = optarg;
142 break;
143 case 'd':
144 directory = optarg;
145 break;
146 case 'f':
147 foption = optarg;
148 break;
149 case 'h':
150 check_integer_arg('h', optarg, 1, &hash_table_size);
151 if (!is_prime(hash_table_size)) {
152 while (!is_prime(++hash_table_size))
154 warning("%1 not prime: using %2 instead", optarg, hash_table_size);
156 break;
157 case 'i':
158 ignore_fields = optarg;
159 break;
160 case 'k':
161 check_integer_arg('k', optarg, 1, &max_keys_per_item);
162 break;
163 case 'l':
164 check_integer_arg('l', optarg, 0, &shortest_len);
165 break;
166 case 'n':
167 check_integer_arg('n', optarg, 0, &n_ignore_words);
168 break;
169 case 'o':
170 base_name = optarg;
171 break;
172 case 't':
173 check_integer_arg('t', optarg, 1, &truncate_len);
174 break;
175 case 'w':
176 parser = do_whole_file;
177 break;
178 case 'v':
179 printf(L_INDXBIB " (" T_ROFF ") v" VERSION);
180 exit(0);
181 break;
182 case CHAR_MAX + 1: // --help
183 usage(stdout);
184 exit(0);
185 break;
186 case '?':
187 usage(stderr);
188 exit(1);
189 break;
190 default:
191 assert(0);
192 break;
194 if (optind >= argc && foption == 0)
195 fatal("no files and no -f option");
196 if (!directory) {
197 char *path = get_cwd();
198 store_filename(path);
199 a_delete path;
201 else
202 store_filename(directory);
203 init_hash_table();
204 store_filename(common_words_file);
205 store_filename(ignore_fields);
206 key_buffer = new char[truncate_len];
207 read_common_words_file();
208 if (!base_name)
209 base_name = optind < argc ? argv[optind] : DEFAULT_INDEX_NAME;
210 const char *p = strrchr(base_name, DIR_SEPS[0]), *p1;
211 const char *sep = &DIR_SEPS[1];
212 while (*sep) {
213 p1 = strrchr(base_name, *sep);
214 if (p1 && (!p || p1 > p))
215 p = p1;
216 sep++;
218 size_t name_max;
219 if (p) {
220 char *dir = strsave(base_name);
221 dir[p - base_name] = '\0';
222 name_max = file_name_max(dir);
223 a_delete dir;
225 else
226 name_max = file_name_max(".");
227 const char *filename = p ? p + 1 : base_name;
228 if (strlen(filename) + sizeof(INDEX_SUFFIX) - 1 > name_max)
229 fatal("`%1.%2' is too long for a filename", filename, INDEX_SUFFIX);
230 if (p) {
231 p++;
232 temp_index_file = new char[p - base_name + sizeof(TEMP_INDEX_TEMPLATE)];
233 memcpy(temp_index_file, base_name, p - base_name);
234 strcpy(temp_index_file + (p - base_name), TEMP_INDEX_TEMPLATE);
236 else {
237 temp_index_file = strsave(TEMP_INDEX_TEMPLATE);
239 _catch_fatal_signals();
240 int fd = mkstemp(temp_index_file);
241 if (fd < 0)
242 fatal("can't create temporary index file: %1", strerror(errno));
243 indxfp = fdopen(fd, FOPEN_WB);
244 if (indxfp == 0)
245 fatal("fdopen failed");
246 if (fseek(indxfp, sizeof(index_header), 0) < 0)
247 fatal("can't seek past index header: %1", strerror(errno));
248 int failed = 0;
249 if (foption) {
250 FILE *fp = stdin;
251 if (strcmp(foption, "-") != 0) {
252 errno = 0;
253 fp = fopen(foption, "r");
254 if (!fp)
255 fatal("can't open `%1': %2", foption, strerror(errno));
257 string path;
258 int lineno = 1;
259 for (;;) {
260 int c;
261 for (c = getc(fp); c != '\n' && c != EOF; c = getc(fp)) {
262 if (c == '\0')
263 error_with_file_and_line(foption, lineno,
264 "nul character in pathname ignored");
265 else
266 path += c;
268 if (path.length() > 0) {
269 path += '\0';
270 if (!(*parser)(path.contents()))
271 failed = 1;
272 path.clear();
274 if (c == EOF)
275 break;
276 lineno++;
278 if (fp != stdin)
279 fclose(fp);
281 for (int i = optind; i < argc; i++)
282 if (!(*parser)(argv[i]))
283 failed = 1;
284 write_hash_table();
285 if (fclose(indxfp) < 0)
286 fatal("error closing temporary index file: %1", strerror(errno));
287 char *index_file = new char[strlen(base_name) + sizeof(INDEX_SUFFIX)];
288 strcpy(index_file, base_name);
289 strcat(index_file, INDEX_SUFFIX);
290 #ifdef HAVE_RENAME
291 # ifdef __EMX__
292 if (access(index_file, R_OK) == 0)
293 unlink(index_file);
294 # endif /* __EMX__ */
295 if (rename(temp_index_file, index_file) < 0) {
296 # ifdef __MSDOS__
297 // RENAME could fail on plain MSDOS filesystems because
298 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
299 char *fname = p ? index_file + (p - base_name) : 0;
300 char *dot = 0;
302 // Replace the dot with an underscore and try again.
303 if (fname
304 && (dot = strchr(fname, '.')) != 0
305 && strcmp(dot, INDEX_SUFFIX) != 0)
306 *dot = '_';
307 if (rename(temp_index_file, index_file) < 0)
308 # endif
309 fatal("can't rename temporary index file: %1", strerror(errno));
311 #else /* HAVE_RENAME */
312 _ignore_fatal_signals();
313 if (unlink(index_file) < 0) {
314 if (errno != ENOENT)
315 fatal("can't unlink `%1': %2", index_file, strerror(errno));
317 if (link(temp_index_file, index_file) < 0)
318 fatal("can't link temporary index file: %1", strerror(errno));
319 if (unlink(temp_index_file) < 0)
320 fatal("can't unlink temporary index file: %1", strerror(errno));
321 #endif /* HAVE_RENAME */
322 temp_index_file = 0;
323 return failed;
326 static void usage(FILE *stream)
328 fprintf(stream,
329 "Synopsis: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
330 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
331 program_name);
334 static void check_integer_arg(char opt, const char *arg, int min, int *res)
336 char *ptr;
337 long n = strtol(arg, &ptr, 10);
338 if (n == 0 && ptr == arg)
339 error("argument to -%1 not an integer", opt);
340 else if (n < min)
341 error("argument to -%1 must not be less than %2", opt, min);
342 else {
343 if (n > INT_MAX)
344 error("argument to -%1 greater than maximum integer", opt);
345 else if (*ptr != '\0')
346 error("junk after integer argument to -%1", opt);
347 *res = int(n);
351 static char *get_cwd()
353 char *buf;
354 int size = 12;
356 for (;;) {
357 buf = new char[size];
358 if (getcwd(buf, size))
359 break;
360 if (errno != ERANGE)
361 fatal("cannot get current working directory: %1", strerror(errno));
362 a_delete buf;
363 if (size == INT_MAX)
364 fatal("current working directory longer than INT_MAX");
365 if (size > INT_MAX/2)
366 size = INT_MAX;
367 else
368 size *= 2;
370 return buf;
373 word_list::word_list(const char *s, int n, word_list *p)
374 : next(p), len(n)
376 str = new char[n];
377 memcpy(str, s, n);
380 static void read_common_words_file()
382 if (n_ignore_words <= 0)
383 return;
384 errno = 0;
385 FILE *fp = fopen(common_words_file, "r");
386 if (!fp)
387 fatal("can't open `%1': %2", common_words_file, strerror(errno));
388 common_words_table = new word_list * [hash_table_size];
389 for (int i = 0; i < hash_table_size; i++)
390 common_words_table[i] = 0;
391 int count = 0;
392 int key_len = 0;
393 for (;;) {
394 int c = getc(fp);
395 while (c != EOF && !csalnum(c))
396 c = getc(fp);
397 if (c == EOF)
398 break;
399 do {
400 if (key_len < truncate_len)
401 key_buffer[key_len++] = cmlower(c);
402 c = getc(fp);
403 } while (c != EOF && csalnum(c));
404 if (key_len >= shortest_len) {
405 int h = hash(key_buffer, key_len) % hash_table_size;
406 common_words_table[h] = new word_list(key_buffer, key_len,
407 common_words_table[h]);
409 if (++count >= n_ignore_words)
410 break;
411 key_len = 0;
412 if (c == EOF)
413 break;
415 n_ignore_words = count;
416 fclose(fp);
419 static int do_whole_file(const char *filename)
421 errno = 0;
422 FILE *fp = fopen(filename, "r");
423 if (!fp) {
424 error("can't open `%1': %2", filename, strerror(errno));
425 return 0;
427 int count = 0;
428 int key_len = 0;
429 int c;
430 while ((c = getc(fp)) != EOF) {
431 if (csalnum(c)) {
432 key_len = 1;
433 key_buffer[0] = c;
434 while ((c = getc(fp)) != EOF) {
435 if (!csalnum(c))
436 break;
437 if (key_len < truncate_len)
438 key_buffer[key_len++] = c;
440 if (store_key(key_buffer, key_len)) {
441 if (++count >= max_keys_per_item)
442 break;
444 if (c == EOF)
445 break;
448 store_reference(filenames.length(), 0, 0);
449 store_filename(filename);
450 fclose(fp);
451 return 1;
454 static int do_file(const char *filename)
456 errno = 0;
457 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
458 // byte counts to be consistent with fseek.
459 FILE *fp = fopen(filename, FOPEN_RB);
460 if (fp == 0) {
461 error("can't open `%1': %2", filename, strerror(errno));
462 return 0;
464 int filename_index = filenames.length();
465 store_filename(filename);
467 enum {
468 START, // at the start of the file; also in between references
469 BOL, // in the middle of a reference, at the beginning of the line
470 PERCENT, // seen a percent at the beginning of the line
471 IGNORE, // ignoring a field
472 IGNORE_BOL, // at the beginning of a line ignoring a field
473 KEY, // in the middle of a key
474 DISCARD, // after truncate_len bytes of a key
475 MIDDLE // in between keys
476 } state = START;
478 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
479 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
480 // MIDDLE space_count must be 0.
481 int space_count = 0;
482 int byte_count = 0; // bytes read
483 int key_len = 0;
484 int ref_start = -1; // position of start of current reference
485 for (;;) {
486 int c = getc(fp);
487 if (c == EOF)
488 break;
489 // We opened the file in binary mode, so we need to skip
490 // every CR character before a Newline.
491 if (c == '\r') {
492 int peek = getc(fp);
493 if (peek == '\n') {
494 byte_count++;
495 c = peek;
497 else
498 ungetc(peek, fp);
500 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
501 else if (c == 0x1a) // ^Z means EOF in text files
502 break;
503 #endif
504 byte_count++;
505 switch (state) {
506 case START:
507 if (c == ' ' || c == '\t') {
508 space_count++;
509 break;
511 if (c == '\n') {
512 space_count = 0;
513 break;
515 ref_start = byte_count - space_count - 1;
516 space_count = 0;
517 if (c == '%')
518 state = PERCENT;
519 else if (csalnum(c)) {
520 state = KEY;
521 key_buffer[0] = c;
522 key_len = 1;
524 else
525 state = MIDDLE;
526 break;
527 case BOL:
528 switch (c) {
529 case '%':
530 if (space_count > 0) {
531 space_count = 0;
532 state = MIDDLE;
534 else
535 state = PERCENT;
536 break;
537 case ' ':
538 case '\t':
539 space_count++;
540 break;
541 case '\n':
542 store_reference(filename_index, ref_start,
543 byte_count - 1 - space_count - ref_start);
544 state = START;
545 space_count = 0;
546 break;
547 default:
548 space_count = 0;
549 if (csalnum(c)) {
550 state = KEY;
551 key_buffer[0] = c;
552 key_len = 1;
554 else
555 state = MIDDLE;
557 break;
558 case PERCENT:
559 if (strchr(ignore_fields, c) != 0)
560 state = IGNORE;
561 else if (c == '\n')
562 state = BOL;
563 else
564 state = MIDDLE;
565 break;
566 case IGNORE:
567 if (c == '\n')
568 state = IGNORE_BOL;
569 break;
570 case IGNORE_BOL:
571 switch (c) {
572 case '%':
573 if (space_count > 0) {
574 state = IGNORE;
575 space_count = 0;
577 else
578 state = PERCENT;
579 break;
580 case ' ':
581 case '\t':
582 space_count++;
583 break;
584 case '\n':
585 store_reference(filename_index, ref_start,
586 byte_count - 1 - space_count - ref_start);
587 state = START;
588 space_count = 0;
589 break;
590 default:
591 space_count = 0;
592 state = IGNORE;
594 break;
595 case KEY:
596 if (csalnum(c)) {
597 if (key_len < truncate_len)
598 key_buffer[key_len++] = c;
599 else
600 state = DISCARD;
602 else {
603 possibly_store_key(key_buffer, key_len);
604 key_len = 0;
605 if (c == '\n')
606 state = BOL;
607 else
608 state = MIDDLE;
610 break;
611 case DISCARD:
612 if (!csalnum(c)) {
613 possibly_store_key(key_buffer, key_len);
614 key_len = 0;
615 if (c == '\n')
616 state = BOL;
617 else
618 state = MIDDLE;
620 break;
621 case MIDDLE:
622 if (csalnum(c)) {
623 state = KEY;
624 key_buffer[0] = c;
625 key_len = 1;
627 else if (c == '\n')
628 state = BOL;
629 break;
630 default:
631 assert(0);
634 switch (state) {
635 case START:
636 break;
637 case DISCARD:
638 case KEY:
639 possibly_store_key(key_buffer, key_len);
640 // fall through
641 case BOL:
642 case PERCENT:
643 case IGNORE_BOL:
644 case IGNORE:
645 case MIDDLE:
646 store_reference(filename_index, ref_start,
647 byte_count - ref_start - space_count);
648 break;
649 default:
650 assert(0);
652 fclose(fp);
653 return 1;
656 static void store_reference(int filename_index, int pos, int len)
658 tag t;
659 t.filename_index = filename_index;
660 t.start = pos;
661 t.length = len;
662 fwrite_or_die(&t, sizeof(t), 1, indxfp);
663 ntags++;
666 static void store_filename(const char *fn)
668 filenames += fn;
669 filenames += '\0';
672 static void init_hash_table()
674 hash_table = new table_entry[hash_table_size];
675 for (int i = 0; i < hash_table_size; i++)
676 hash_table[i].ptr = 0;
679 static void possibly_store_key(char *s, int len)
681 static int last_tagno = -1;
682 static int key_count;
683 if (last_tagno != ntags) {
684 last_tagno = ntags;
685 key_count = 0;
687 if (key_count < max_keys_per_item) {
688 if (store_key(s, len))
689 key_count++;
693 static int store_key(char *s, int len)
695 if (len < shortest_len)
696 return 0;
697 int is_number = 1;
698 for (int i = 0; i < len; i++)
699 if (!csdigit(s[i])) {
700 is_number = 0;
701 s[i] = cmlower(s[i]);
703 if (is_number && !(len == 4 && s[0] == '1' && s[1] == '9'))
704 return 0;
705 int h = hash(s, len) % hash_table_size;
706 if (common_words_table) {
707 for (word_list *ptr = common_words_table[h]; ptr; ptr = ptr->next)
708 if (len == ptr->len && memcmp(s, ptr->str, len) == 0)
709 return 0;
711 table_entry *pp = hash_table + h;
712 if (!pp->ptr)
713 pp->ptr = new block;
714 else if (pp->ptr->v[pp->ptr->used - 1] == ntags)
715 return 1;
716 else if (pp->ptr->used >= BLOCK_SIZE)
717 pp->ptr = new block(pp->ptr);
718 pp->ptr->v[(pp->ptr->used)++] = ntags;
719 return 1;
722 static void write_hash_table()
724 const int minus_one = -1;
725 int li = 0;
726 for (int i = 0; i < hash_table_size; i++) {
727 block *ptr = hash_table[i].ptr;
728 if (!ptr)
729 hash_table[i].count = -1;
730 else {
731 hash_table[i].count = li;
732 block *rev = 0;
733 while (ptr) {
734 block *tem = ptr;
735 ptr = ptr->next;
736 tem->next = rev;
737 rev = tem;
739 while (rev) {
740 fwrite_or_die(rev->v, sizeof(int), rev->used, indxfp);
741 li += rev->used;
742 block *tem = rev;
743 rev = rev->next;
744 delete tem;
746 fwrite_or_die(&minus_one, sizeof(int), 1, indxfp);
747 li += 1;
750 if (sizeof(table_entry) == sizeof(int))
751 fwrite_or_die(hash_table, sizeof(int), hash_table_size, indxfp);
752 else {
753 // write it out word by word
754 for (int i = 0; i < hash_table_size; i++)
755 fwrite_or_die(&hash_table[i].count, sizeof(int), 1, indxfp);
757 fwrite_or_die(filenames.contents(), 1, filenames.length(), indxfp);
758 if (fseek(indxfp, 0, 0) < 0)
759 fatal("error seeking on index file: %1", strerror(errno));
760 index_header h;
761 h.magic = INDEX_MAGIC;
762 h.version = INDEX_VERSION;
763 h.tags_size = ntags;
764 h.lists_size = li;
765 h.table_size = hash_table_size;
766 h.strings_size = filenames.length();
767 h.truncate = truncate_len;
768 h.shortest = shortest_len;
769 h.common = n_ignore_words;
770 fwrite_or_die(&h, sizeof(h), 1, indxfp);
773 static void fwrite_or_die(const void *ptr, int size, int nitems, FILE *fp)
775 if (fwrite(ptr, size, nitems, fp) != (size_t)nitems)
776 fatal("fwrite failed: %1", strerror(errno));
779 void fatal_error_exit()
781 _cleanup();
782 exit(3);
785 static void
786 _cleanup(void)
788 if (temp_index_file)
789 unlink(temp_index_file);
792 static void
793 _handle_fatal_signal(int signum)
795 signal(signum, SIG_DFL);
796 _cleanup();
797 #ifdef HAVE_KILL
798 kill(getpid(), signum);
799 #else
800 /* MS-DOS and Win32 don't have kill(); the best compromise is
801 probably to use exit() instead. */
802 exit(signum);
803 #endif
806 static void
807 _catch_fatal_signals(void)
809 #ifdef SIGHUP
810 signal(SIGHUP, &_handle_fatal_signal);
811 #endif
812 signal(SIGINT, &_handle_fatal_signal);
813 signal(SIGTERM, &_handle_fatal_signal);
816 #ifndef HAVE_RENAME
817 static void
818 _ignore_fatal_signals()
820 # ifdef SIGHUP
821 signal(SIGHUP, SIG_IGN);
822 # endif
823 signal(SIGINT, SIG_IGN);
824 signal(SIGTERM, SIG_IGN);
826 #endif /* HAVE_RENAME */
828 // s-it2-mode