2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
32 #include "stringclass.h"
40 // Sun's stdlib.h fails to declare this.
// Default value of hash_table_size.
44 #define DEFAULT_HASH_TABLE_SIZE 997
// Template for the temporary index file name; main() passes a copy of it
// (optionally prefixed with a directory) to mktemp().
45 #define TEMP_INDEX_TEMPLATE "indxbibXXXXXX"
47 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc().
49 #define MALLOC_OVERHEAD 16
// Number of ints that fit in 1024 bytes after subtracting MALLOC_OVERHEAD,
// one block pointer and one int.  store_key() treats a block whose `used'
// count has reached this value as full.
51 const int BLOCK_SIZE
= ((1024 - MALLOC_OVERHEAD
- sizeof(struct block
*)
52 - sizeof(int)) / sizeof(int));
// Constructor: chains the new block in front of p (default 0) via `next'
// and starts it empty (used = 0).
// NOTE(review): the enclosing struct declaration and its members are not
// visible in this listing.
58 block(block
*p
= 0) : next(p
), used(0) { }
// Constructor declaration taking a character pointer, an int and a
// word_list pointer; read_common_words_file() calls it with the key text,
// the key length and the current head of a bucket.
// NOTE(review): the enclosing struct declaration is not visible here.
72 word_list(const char *, int, word_list
*);
// File-level state.
// NOTE(review): this listing omits some source lines, so other globals used
// further down (key_buffer, truncate_len, shortest_len, indxfp, filenames,
// ntags, name_max, ...) are declared somewhere not shown here.

// Array of hash_table_size entries, allocated by init_hash_table().
75 table_entry
*hash_table
;
// Number of hash buckets; main() may override it through check_integer_arg('h', ...)
// and then advances it to the next prime if the given value is not prime.
76 int hash_table_size
= DEFAULT_HASH_TABLE_SIZE
;
77 // We make this the same size as hash_table so we only have to do one
// Buckets of common words, allocated by read_common_words_file(); store_key()
// only consults it when it is non-null.
79 static word_list
**common_words_table
= 0;
// Name of the temporary index file; unlinked by fatal_error_exit() and
// fatal_signal().
85 char *temp_index_file
= 0;
// Characters tested with strchr() in do_file(); main() can replace the
// default with an option argument (presumably -i, per the usage text).
87 const char *ignore_fields
= "XYZ";
// File opened by read_common_words_file(); main() can replace the default
// with an option argument (presumably -c, per the usage text).
88 const char *common_words_file
= COMMON_WORDS_FILE
;
// Upper bound on the number of common words read; set via
// check_integer_arg('n', ...) and later overwritten with the count actually read.
89 int n_ignore_words
= 100;
// Upper bound on keys stored per item; set via check_integer_arg('k', ...).
92 int max_keys_per_item
= 100;
// Forward declarations of the file-local helpers defined below.
95 static void write_hash_table();
96 static void init_hash_table();
97 static void read_common_words_file();
98 static int store_key(char *s
, int len
);
99 static void possibly_store_key(char *s
, int len
);
100 static int do_whole_file(const char *filename
);
101 static int do_file(const char *filename
);
102 static void store_reference(int filename_index
, int pos
, int len
);
103 static void check_integer_arg(char opt
, const char *arg
, int min
, int *res
);
104 static void store_filename(const char *);
105 static void fwrite_or_die(const void *ptr
, int size
, int nitems
, FILE *fp
);
106 static char *get_cwd();
// fatal_signal is installed with signal() in main(), hence the C linkage.
108 extern "C" { void fatal_signal(int); }
// Declared here with C linkage; main() uses its result as the limit on the
// length of the index file name.
110 extern "C" { long dir_name_max(const char *); }
// Program entry point.  From the statements visible in this listing it:
//   - parses options with getopt();
//   - records the working directory, `directory', the common-words file name
//     and ignore_fields with store_filename();
//   - allocates key_buffer and loads the common words;
//   - creates a temporary index file and seeks past the header area;
//   - runs the selected parser over names read from the -f file and over the
//     remaining command-line arguments;
//   - closes the temporary file and moves it to basename + INDEX_SUFFIX.
// NOTE(review): many source lines (braces, case labels, conditions, returns)
// are missing from this listing; the control flow between the visible
// statements is not shown and the inline notes below are hedged accordingly.
112 int main(int argc
, char **argv
)
114 program_name
= argv
[0];
// Give stderr a static buffer of BUFSIZ bytes.
115 static char stderr_buf
[BUFSIZ
];
116 setbuf(stderr
, stderr_buf
);
118 const char *basename
= 0;
// `parser' is the routine applied to each input file; it defaults to do_file.
119 typedef int (*parser_t
)(const char *);
120 parser_t parser
= do_file
;
121 const char *directory
= 0;
122 const char *foption
= 0;
// Option loop.  The case labels are not visible here; the option letters
// passed to check_integer_arg() below identify the numeric options.
124 while ((opt
= getopt(argc
, argv
, "c:o:h:i:k:l:t:n:c:d:f:vw")) != EOF
)
127 common_words_file
= optarg
;
// -h: hash table size; a non-prime value is advanced to the next prime and
// a warning is issued.
136 check_integer_arg('h', optarg
, 1, &hash_table_size
);
137 if (!is_prime(hash_table_size
)) {
138 while (!is_prime(++hash_table_size
))
140 warning("%1 not prime: using %2 instead", optarg
, hash_table_size
);
144 ignore_fields
= optarg
;
147 check_integer_arg('k', optarg
, 1, &max_keys_per_item
);
150 check_integer_arg('l', optarg
, 0, &shortest_len
);
153 check_integer_arg('n', optarg
, 0, &n_ignore_words
);
159 check_integer_arg('t', optarg
, 1, &truncate_len
);
// Switch to whole-file indexing (presumably the -w option).
162 parser
= do_whole_file
;
// Print the version banner to stderr (presumably the -v option).
166 extern const char *version_string
;
167 fprintf(stderr
, "GNU indxbib version %s\n", version_string
);
// There must be at least one file argument or an -f option.
178 if (optind
>= argc
&& foption
== 0)
179 fatal("no files and no -f option");
// Record the strings that are stored ahead of the input file names.
181 char *path
= get_cwd();
182 store_filename(path
);
186 store_filename(directory
);
188 store_filename(common_words_file
);
189 store_filename(ignore_fields
);
// key_buffer holds at most truncate_len bytes of a key.
190 key_buffer
= new char[truncate_len
];
191 read_common_words_file();
193 basename
= optind
< argc
? argv
[optind
] : DEFAULT_INDEX_NAME
;
// p points at the last '/' in basename, or is null if there is none.
194 const char *p
= strrchr(basename
, '/');
// Ask dir_name_max() about the directory part of basename ...
197 char *dir
= strsave(basename
);
198 dir
[p
- basename
] = '\0';
199 name_max
= dir_name_max(dir
);
// ... or about "." (presumably when basename contains no '/').
203 name_max
= dir_name_max(".");
204 const char *filename
= p
? p
+ 1 : basename
;
// Refuse an index file name longer than the limit reported above; a
// negative name_max disables the check.
205 if (name_max
>= 0 && strlen(filename
) + sizeof(INDEX_SUFFIX
) - 1 > name_max
)
206 fatal("`%1.%2' is too long for a filename", filename
, INDEX_SUFFIX
);
// Build the temporary file name: the first (p - basename) bytes of basename
// followed by TEMP_INDEX_TEMPLATE ...
209 temp_index_file
= new char[p
- basename
+ sizeof(TEMP_INDEX_TEMPLATE
)];
210 memcpy(temp_index_file
, basename
, p
- basename
);
211 strcpy(temp_index_file
+ (p
- basename
), TEMP_INDEX_TEMPLATE
);
// ... or just a copy of the template (presumably when p is null).
214 temp_index_file
= strsave(TEMP_INDEX_TEMPLATE
);
216 if (!mktemp(temp_index_file
) || !temp_index_file
[0])
217 fatal("cannot create file name for temporary file");
// Install fatal_signal for SIGHUP, SIGINT and SIGTERM; it unlinks the
// temporary file.
218 signal(SIGHUP
, fatal_signal
);
219 signal(SIGINT
, fatal_signal
);
220 signal(SIGTERM
, fatal_signal
);
// The temporary file is created with read permission only for user, group
// and others.
221 int fd
= creat(temp_index_file
, S_IRUSR
|S_IRGRP
|S_IROTH
);
223 fatal("can't create temporary index file: %1", strerror(errno
));
224 indxfp
= fdopen(fd
, "w");
226 fatal("fdopen failed");
// Leave room for the index_header, which write_hash_table() writes after
// seeking back to offset 0.
227 if (fseek(indxfp
, sizeof(index_header
), 0) < 0)
228 fatal("can't seek past index header: %1", strerror(errno
));
// -f handling: a name other than "-" is opened with fopen().
232 if (strcmp(foption
, "-") != 0) {
234 fp
= fopen(foption
, "r");
236 fatal("can't open `%1': %2", foption
, strerror(errno
));
// Read one path per line, character by character.
241 for (int c
= getc(fp
); c
!= '\n' && c
!= EOF
; c
= getc(fp
)) {
243 error_with_file_and_line(foption
, lineno
,
244 "nul character in pathname ignored");
// Non-empty paths are handed to the parser.
248 if (path
.length() > 0) {
250 if (!(*parser
)(path
.contents()))
// Process the file names given on the command line, starting at optind.
261 for (int i
= optind
; i
< argc
; i
++)
262 if (!(*parser
)(argv
[i
]))
265 if (fclose(indxfp
) < 0)
266 fatal("error closing temporary index file: %1", strerror(errno
));
// Final index file name: basename followed by INDEX_SUFFIX.
267 char *index_file
= new char[strlen(basename
) + sizeof(INDEX_SUFFIX
)];
268 strcpy(index_file
, basename
);
269 strcat(index_file
, INDEX_SUFFIX
);
271 if (rename(temp_index_file
, index_file
) < 0)
272 fatal("can't rename temporary index file: %1", strerror(errno
));
273 #else /* not HAVE_RENAME */
// Without rename(): ignore the three signals, then unlink the old index,
// link the temporary file to its name and unlink the temporary name.
274 signal(SIGHUP
, SIG_IGN
);
275 signal(SIGINT
, SIG_IGN
);
276 signal(SIGTERM
, SIG_IGN
);
277 if (unlink(index_file
) < 0) {
279 fatal("can't unlink `%1': %2", index_file
, strerror(errno
));
281 if (link(temp_index_file
, index_file
) < 0)
282 fatal("can't link temporary index file: %1", strerror(errno
));
283 if (unlink(temp_index_file
) < 0)
284 fatal("can't unlink temporary index file: %1", strerror(errno
));
285 #endif /* not HAVE_RENAME */
// Usage message text (the statement that prints it is not visible here).
293 "usage: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
294 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
// Validate the integer argument of option `opt'.
//   opt - option letter, used in the error messages
//   arg - the option's argument text, parsed with strtol() in base 10
//   min - smallest acceptable value (used in the "must not be less than" message)
//   res - destination for the parsed value
// NOTE(review): the conditions guarding the "less than" and "greater than
// maximum integer" errors, and the assignment through res, are on lines
// missing from this listing.
299 static void check_integer_arg(char opt
, const char *arg
, int min
, int *res
)
302 long n
= strtol(arg
, &ptr
, 10);
// strtol consumed nothing: arg does not start with an integer.
303 if (n
== 0 && ptr
== arg
)
304 error("argument to -%1 not an integer", opt
);
306 error("argument to -%1 must not be less than %2", opt
, min
);
309 error("argument to -%1 greater than maximum integer", opt
);
// Characters remain after the parsed number.
310 else if (*ptr
!= '\0')
311 error("junk after integer argument to -%1", opt
);
// Obtain the current working directory in a buffer allocated with new[].
// Failures are reported through fatal().
// NOTE(review): the loop structure, the errno test and how `size' grows are
// on lines missing from this listing; the INT_MAX/2 test presumably guards a
// doubling of size - confirm against the full source.
316 static char *get_cwd()
322 buf
= new char[size
];
323 if (getcwd(buf
, size
))
326 fatal("cannot get current working directory: %1", strerror(errno
));
329 fatal("current working directory longer than INT_MAX");
330 if (size
> INT_MAX
/2)
// Constructor definition for word_list.  Callers pass the word text, its
// length and the next list element.
// NOTE(review): the initializer list and body are not visible in this listing.
338 word_list::word_list(const char *s
, int n
, word_list
*p
)
// Load words from common_words_file into common_words_table.
// Each word is a run of csalnum() characters, lower-cased with cmlower() and
// truncated to truncate_len bytes in key_buffer.  Words of at least
// shortest_len bytes are prepended to the bucket selected by
// hash(word) % hash_table_size.  On completion n_ignore_words is set to the
// number counted.
// NOTE(review): several lines (returns, loop headers, the getc calls) are
// missing from this listing.
345 static void read_common_words_file()
// Nothing to load when no common words are wanted (the action taken is on a
// line not shown; presumably an early return).
347 if (n_ignore_words
<= 0)
350 FILE *fp
= fopen(common_words_file
, "r");
352 fatal("can't open `%1': %2", common_words_file
, strerror(errno
));
// One bucket per hash table slot, all initially empty.
353 common_words_table
= new word_list
* [hash_table_size
];
354 for (int i
= 0; i
< hash_table_size
; i
++)
355 common_words_table
[i
] = 0;
// Skip characters that cannot be part of a word.
360 while (c
!= EOF
&& !csalnum(c
))
// Collect the word; bytes beyond truncate_len are not stored.
365 if (key_len
< truncate_len
)
366 key_buffer
[key_len
++] = cmlower(c
);
368 } while (c
!= EOF
&& csalnum(c
));
369 if (key_len
>= shortest_len
) {
370 int h
= hash(key_buffer
, key_len
) % hash_table_size
;
371 common_words_table
[h
] = new word_list(key_buffer
, key_len
,
372 common_words_table
[h
]);
// Compare the running count with the n_ignore_words limit.
374 if (++count
>= n_ignore_words
)
380 n_ignore_words
= count
;
// Index `filename' as a single item: keys read from the file are passed to
// store_key(), then one reference with position 0 and length 0 is stored
// and the file name is recorded.
// An fopen() failure is reported with error() (not fatal()).
// NOTE(review): the tests that delimit keys, the return statements and the
// fclose call are on lines missing from this listing.
384 static int do_whole_file(const char *filename
)
387 FILE *fp
= fopen(filename
, "r");
389 error("can't open `%1': %2", filename
, strerror(errno
));
395 while ((c
= getc(fp
)) != EOF
) {
399 while ((c
= getc(fp
)) != EOF
) {
// Bytes beyond truncate_len are not stored in key_buffer.
402 if (key_len
< truncate_len
)
403 key_buffer
[key_len
++] = c
;
// Count keys that store_key() accepted and compare with max_keys_per_item.
405 if (store_key(key_buffer
, key_len
)) {
406 if (++count
>= max_keys_per_item
)
// filenames.length() taken before store_filename() is the index under which
// this file's name is recorded.
413 store_reference(filenames
.length(), 0, 0);
414 store_filename(filename
);
// Index the references contained in `filename'.
// The file name is recorded first (its index is filenames.length() before
// the store_filename() call).  The input is then scanned with the state
// machine whose states are listed below: keys are collected in key_buffer
// (at most truncate_len bytes) and passed to possibly_store_key(), and each
// reference's start position and length are passed to store_reference().
// An fopen() failure is reported with error().
// NOTE(review): the switch/case skeleton, the read loop and the return
// statements are on lines missing from this listing, so which state each
// fragment below belongs to cannot be determined here.
419 static int do_file(const char *filename
)
422 FILE *fp
= fopen(filename
, "r");
424 error("can't open `%1': %2", filename
, strerror(errno
));
427 int filename_index
= filenames
.length();
428 store_filename(filename
);
431 START
, // at the start of the file; also in between references
432 BOL
, // in the middle of a reference, at the beginning of the line
433 PERCENT
, // seen a percent at the beginning of the line
434 IGNORE
, // ignoring a field
435 IGNORE_BOL
, // at the beginning of a line ignoring a field
436 KEY
, // in the middle of a key
437 DISCARD
, // after truncate_len bytes of a key
438 MIDDLE
// in between keys
441 // In states START, BOL, IGNORE_BOL, space_count how many spaces at
442 // the beginning have been seen. In states PERCENT, IGNORE, KEY,
443 // MIDDLE space_count must be 0.
445 int byte_count
= 0; // bytes read
447 int ref_start
= -1; // position of start of current reference
455 if (c
== ' ' || c
== '\t') {
// The reference starts before the leading spaces and the current byte.
463 ref_start
= byte_count
- space_count
- 1;
467 else if (csalnum(c
)) {
478 if (space_count
> 0) {
// Length excludes the current byte and the pending leading spaces.
490 store_reference(filename_index
, ref_start
,
491 byte_count
- 1 - space_count
- ref_start
);
// Is c one of the field letters listed in ignore_fields?
507 if (strchr(ignore_fields
, c
) != 0)
521 if (space_count
> 0) {
533 store_reference(filename_index
, ref_start
,
534 byte_count
- 1 - space_count
- ref_start
);
// Bytes beyond truncate_len are not stored in key_buffer.
545 if (key_len
< truncate_len
)
546 key_buffer
[key_len
++] = c
;
551 possibly_store_key(key_buffer
, key_len
);
561 possibly_store_key(key_buffer
, key_len
);
587 possibly_store_key(key_buffer
, key_len
);
// Here the length is computed without the "- 1" used above.
594 store_reference(filename_index
, ref_start
,
595 byte_count
- ref_start
- space_count
);
// Write one reference record to indxfp.
//   filename_index - stored in the record's filename_index field
//   pos, len       - position and length of the reference
// NOTE(review): the declaration of `t' and the assignments of pos and len
// (and any update of ntags) are on lines missing from this listing.
604 static void store_reference(int filename_index
, int pos
, int len
)
607 t
.filename_index
= filename_index
;
610 fwrite_or_die(&t
, sizeof(t
), 1, indxfp
);
// Record the string fn; callers read filenames.length() before calling this
// to obtain the index of the string being added.
// NOTE(review): the body is not visible in this listing.
614 static void store_filename(const char *fn
)
// Allocate hash_table with hash_table_size entries and clear every entry's
// block pointer.
620 static void init_hash_table()
622 hash_table
= new table_entry
[hash_table_size
];
623 for (int i
= 0; i
< hash_table_size
; i
++)
624 hash_table
[i
].ptr
= 0;
// Store the key (s, len) via store_key() unless the current item already
// has max_keys_per_item keys.
// last_tagno and key_count persist across calls; last_tagno is compared
// with ntags to detect a new item.
// NOTE(review): the statements executed when last_tagno != ntags and after
// a successful store_key() are on lines missing from this listing
// (presumably resetting and incrementing key_count respectively).
627 static void possibly_store_key(char *s
, int len
)
629 static int last_tagno
= -1;
630 static int key_count
;
631 if (last_tagno
!= ntags
) {
635 if (key_count
< max_keys_per_item
) {
636 if (store_key(s
, len
))
// Try to add the key (s, len) for the current tag number (ntags) to the
// hash table.  The key is lower-cased in place.
//   s   - key bytes; modified by this function
//   len - number of bytes in s
// Returns an int that callers test for truth to decide whether the key
// counts as stored.
// NOTE(review): the return statements, the is_number bookkeeping and the
// allocation of a bucket's first block are on lines missing from this
// listing.
641 static int store_key(char *s
, int len
)
// Keys shorter than shortest_len take this branch (presumably rejected).
643 if (len
< shortest_len
)
// Lower-case every non-digit character.
646 for (int i
= 0; i
< len
; i
++)
647 if (!csdigit(s
[i
])) {
649 s
[i
] = cmlower(s
[i
]);
// Numeric keys take this branch unless they are four characters long and
// start with "19".
651 if (is_number
&& !(len
== 4 && s
[0] == '1' && s
[1] == '9'))
653 int h
= hash(s
, len
) % hash_table_size
;
// Look the key up among the common words in the same bucket.
654 if (common_words_table
) {
655 for (word_list
*ptr
= common_words_table
[h
]; ptr
; ptr
= ptr
->next
)
656 if (len
== ptr
->len
&& memcmp(s
, ptr
->str
, len
) == 0)
659 table_entry
*pp
= hash_table
+ h
;
// The most recently stored tag in this bucket is already ntags.
662 else if (pp
->ptr
->v
[pp
->ptr
->used
- 1] == ntags
)
// Current block is full: chain a new block in front of it.
664 else if (pp
->ptr
->used
>= BLOCK_SIZE
)
665 pp
->ptr
= new block(pp
->ptr
);
// Append the current tag number to the bucket's head block.
666 pp
->ptr
->v
[(pp
->ptr
->used
)++] = ntags
;
// Write the hash table data, the stored file names and finally the index
// header to indxfp.
// Visible steps: for each bucket, its count field is set to -1 or to `li';
// block contents are written as ints, with a -1 value written via
// minus_one; the table itself is written in a single call when
// sizeof(table_entry) == sizeof(int); the filenames buffer follows; then the
// stream is rewound to offset 0 and the header is filled in and written.
// NOTE(review): the conditions selecting between the visible statements, the
// definitions of `li' and `rev', and the remaining header fields are on
// lines missing from this listing.
670 static void write_hash_table()
672 const int minus_one
= -1;
674 for (int i
= 0; i
< hash_table_size
; i
++) {
675 block
*ptr
= hash_table
[i
].ptr
;
677 hash_table
[i
].count
= -1;
679 hash_table
[i
].count
= li
;
// Write the `used' ints held in this block.
688 fwrite_or_die(rev
->v
, sizeof(int), rev
->used
, indxfp
);
694 fwrite_or_die(&minus_one
, sizeof(int), 1, indxfp
);
// If a table_entry is exactly one int wide the whole table can be written
// in one call.
698 if (sizeof(table_entry
) == sizeof(int))
699 fwrite_or_die(hash_table
, sizeof(int), hash_table_size
, indxfp
);
702 // write it out word by word
704 fwrite_or_die(filenames
.contents(), 1, filenames
.length(), indxfp
);
// Go back to the start of the file, where main() left room for the header.
705 if (fseek(indxfp
, 0, 0) < 0)
706 fatal("error seeking on index file: %1", strerror(errno
));
708 h
.magic
= INDEX_MAGIC
;
709 h
.version
= INDEX_VERSION
;
712 h
.table_size
= hash_table_size
;
713 h
.strings_size
= filenames
.length();
714 h
.truncate
= truncate_len
;
715 h
.shortest
= shortest_len
;
716 h
.common
= n_ignore_words
;
717 fwrite_or_die(&h
, sizeof(h
), 1, indxfp
);
// Write nitems objects of `size' bytes each from ptr to fp with fwrite();
// if fwrite() reports a different item count, call fatal() with the
// strerror(errno) text.
720 static void fwrite_or_die(const void *ptr
, int size
, int nitems
, FILE *fp
)
722 if (fwrite(ptr
, size
, nitems
, fp
) != nitems
)
723 fatal("fwrite failed: %1", strerror(errno
));
// Cleanup hook for fatal errors: removes the temporary index file.
// NOTE(review): the rest of the body (any guard on temp_index_file and the
// exit call) is on lines missing from this listing.
726 void fatal_error_exit()
729 unlink(temp_index_file
);
// Signal handler installed by main() for SIGHUP, SIGINT and SIGTERM.
// Restores the default disposition for signum, removes the temporary index
// file, then sends the same signal to this process again.
// NOTE(review): original line 738 is missing from this listing.
735 void fatal_signal(int signum
)
737 signal(signum
, SIG_DFL
);
739 unlink(temp_index_file
);
740 kill(getpid(), signum
);