2 * Copyright (c) 2014 - 2017 Steffen (Daode) Nurpmeso <steffen@sdaoden.eu>.
4 * Copyright (C) 1989 - 1992, 2000 - 2004
5 * Free Software Foundation, Inc.
6 * Written by James Clark (jjc@jclark.com)
8 * This is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
13 * This is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA.
24 #include "indxbib-config.h"
26 #include <sys/types.h>
42 #include "stringclass.h"
46 // (2^n - MALLOC_OVERHEAD) should be a good argument for malloc(). TODO
47 #define MALLOC_OVERHEAD 16
// Number of int slots held by one block.  Computed as 1024 bytes minus
// MALLOC_OVERHEAD, minus one `struct block *` and one int, divided by
// sizeof(int).  store_key() starts a new block once a block's `used`
// count reaches this value.
// NOTE(review): struct block's layout is not visible here -- presumably
// the pointer and int subtracted are its `next` and `used` members; confirm.
52 const int BLOCK_SIZE
= ((1024 - MALLOC_OVERHEAD
- sizeof(struct block
*)
53 - sizeof(int)) / sizeof(int));
61 block(block
*p
= 0) : next(p
), used(0) { }
76 word_list(const char *, int, word_list
*);
// The key hash table.  init_hash_table() allocates hash_table_size
// entries and clears each entry's ptr; store_key() indexes it with
// hash(s, len) % hash_table_size.
79 table_entry
*hash_table
;
// Number of entries in hash_table.  main() hands its address to
// check_integer_arg('h', ...) and, if the result is not prime, increments
// it until is_prime() accepts it.  Written to the index header as
// table_size.
80 int hash_table_size
= DEFAULT_HASH_TABLE_SIZE
;
81 // We make this the same size as hash_table so we only have to do one
// Buckets of common words, each a word_list chain linked through ->next.
// Null until read_common_words_file() allocates hash_table_size buckets;
// store_key() only searches it when it is non-null.
83 static word_list
**common_words_table
= 0;
// Path of the temporary index file.  main() builds it from
// TEMP_INDEX_TEMPLATE (optionally prefixed with the directory part of the
// base name), passes it to mkstemp(), and finally renames or links it to
// the real index file.  It is also passed to unlink() near the end of the
// file -- presumably the cleanup path; the enclosing function is not
// fully visible here.
89 char *temp_index_file
= 0;
// Set of characters that do_file() tests an input character against with
// strchr(); presumably the letters of the fields to ignore (the usage
// text shows "-i XYZ").  Also recorded via store_filename() in main().
91 const char *ignore_fields
= "XYZ";
// Name of the file opened by read_common_words_file(); main() can
// replace it with an option argument (optarg).
92 const char *common_words_file
= COMMON_WORDS_FILE
;
// Limit on the number of common words read: read_common_words_file()
// compares its running count against it, then overwrites it with the
// count actually read.  Written to the index header as `common`.
93 int n_ignore_words
= 100;
// Cap on the number of keys stored per item: do_whole_file() compares its
// running key count against it, and possibly_store_key() only calls
// store_key() while key_count is below it.  Set by check_integer_arg('k', ...).
96 int max_keys_per_item
= 100;
98 static void usage(FILE *stream
);
99 static void write_hash_table();
100 static void init_hash_table();
101 static void read_common_words_file();
102 static int store_key(char *s
, int len
);
103 static void possibly_store_key(char *s
, int len
);
104 static int do_whole_file(const char *filename
);
105 static int do_file(const char *filename
);
106 static void store_reference(int filename_index
, int pos
, int len
);
107 static void check_integer_arg(char opt
, const char *arg
, int min
, int *res
);
108 static void store_filename(const char *);
109 static void fwrite_or_die(const void *ptr
, int size
, int nitems
, FILE *fp
);
110 static char *get_cwd();
112 static void _cleanup(void);
113 static void _handle_fatal_signal(int signum
)
114 static void _catch_fatal_signals(void);
116 static void _ignore_fatal_signals(void);
119 int main(int argc
, char **argv
)
121 program_name
= argv
[0];
122 static char stderr_buf
[BUFSIZ
];
123 setbuf(stderr
, stderr_buf
);
125 const char *base_name
= 0;
126 typedef int (*parser_t
)(const char *);
127 parser_t parser
= do_file
;
128 const char *directory
= 0;
129 const char *foption
= 0;
131 static const struct option long_options
[] = {
132 { "help", no_argument
, 0, CHAR_MAX
+ 1 },
133 { "version", no_argument
, 0, 'v' },
136 while ((opt
= getopt_long(argc
, argv
, "c:o:h:i:k:l:t:n:c:d:f:vw",
141 common_words_file
= optarg
;
150 check_integer_arg('h', optarg
, 1, &hash_table_size
);
151 if (!is_prime(hash_table_size
)) {
152 while (!is_prime(++hash_table_size
))
154 warning("%1 not prime: using %2 instead", optarg
, hash_table_size
);
158 ignore_fields
= optarg
;
161 check_integer_arg('k', optarg
, 1, &max_keys_per_item
);
164 check_integer_arg('l', optarg
, 0, &shortest_len
);
167 check_integer_arg('n', optarg
, 0, &n_ignore_words
);
173 check_integer_arg('t', optarg
, 1, &truncate_len
);
176 parser
= do_whole_file
;
179 printf(L_INDXBIB
" (" T_ROFF
") v" VERSION
);
182 case CHAR_MAX
+ 1: // --help
194 if (optind
>= argc
&& foption
== 0)
195 fatal("no files and no -f option");
197 char *path
= get_cwd();
198 store_filename(path
);
202 store_filename(directory
);
204 store_filename(common_words_file
);
205 store_filename(ignore_fields
);
206 key_buffer
= new char[truncate_len
];
207 read_common_words_file();
209 base_name
= optind
< argc
? argv
[optind
] : DEFAULT_INDEX_NAME
;
210 const char *p
= strrchr(base_name
, DIR_SEPS
[0]), *p1
;
211 const char *sep
= &DIR_SEPS
[1];
213 p1
= strrchr(base_name
, *sep
);
214 if (p1
&& (!p
|| p1
> p
))
220 char *dir
= strsave(base_name
);
221 dir
[p
- base_name
] = '\0';
222 name_max
= file_name_max(dir
);
226 name_max
= file_name_max(".");
227 const char *filename
= p
? p
+ 1 : base_name
;
228 if (strlen(filename
) + sizeof(INDEX_SUFFIX
) - 1 > name_max
)
229 fatal("`%1.%2' is too long for a filename", filename
, INDEX_SUFFIX
);
232 temp_index_file
= new char[p
- base_name
+ sizeof(TEMP_INDEX_TEMPLATE
)];
233 memcpy(temp_index_file
, base_name
, p
- base_name
);
234 strcpy(temp_index_file
+ (p
- base_name
), TEMP_INDEX_TEMPLATE
);
237 temp_index_file
= strsave(TEMP_INDEX_TEMPLATE
);
239 _catch_fatal_signals();
240 int fd
= mkstemp(temp_index_file
);
242 fatal("can't create temporary index file: %1", strerror(errno
));
243 indxfp
= fdopen(fd
, FOPEN_WB
);
245 fatal("fdopen failed");
246 if (fseek(indxfp
, sizeof(index_header
), 0) < 0)
247 fatal("can't seek past index header: %1", strerror(errno
));
251 if (strcmp(foption
, "-") != 0) {
253 fp
= fopen(foption
, "r");
255 fatal("can't open `%1': %2", foption
, strerror(errno
));
261 for (c
= getc(fp
); c
!= '\n' && c
!= EOF
; c
= getc(fp
)) {
263 error_with_file_and_line(foption
, lineno
,
264 "nul character in pathname ignored");
268 if (path
.length() > 0) {
270 if (!(*parser
)(path
.contents()))
281 for (int i
= optind
; i
< argc
; i
++)
282 if (!(*parser
)(argv
[i
]))
285 if (fclose(indxfp
) < 0)
286 fatal("error closing temporary index file: %1", strerror(errno
));
287 char *index_file
= new char[strlen(base_name
) + sizeof(INDEX_SUFFIX
)];
288 strcpy(index_file
, base_name
);
289 strcat(index_file
, INDEX_SUFFIX
);
292 if (access(index_file
, R_OK
) == 0)
294 # endif /* __EMX__ */
295 if (rename(temp_index_file
, index_file
) < 0) {
297 // RENAME could fail on plain MSDOS filesystems because
298 // INDEX_FILE is an invalid filename, e.g. it has multiple dots.
299 char *fname
= p
? index_file
+ (p
- base_name
) : 0;
302 // Replace the dot with an underscore and try again.
304 && (dot
= strchr(fname
, '.')) != 0
305 && strcmp(dot
, INDEX_SUFFIX
) != 0)
307 if (rename(temp_index_file
, index_file
) < 0)
309 fatal("can't rename temporary index file: %1", strerror(errno
));
311 #else /* HAVE_RENAME */
312 _ignore_fatal_signals();
313 if (unlink(index_file
) < 0) {
315 fatal("can't unlink `%1': %2", index_file
, strerror(errno
));
317 if (link(temp_index_file
, index_file
) < 0)
318 fatal("can't link temporary index file: %1", strerror(errno
));
319 if (unlink(temp_index_file
) < 0)
320 fatal("can't unlink temporary index file: %1", strerror(errno
));
321 #endif /* HAVE_RENAME */
326 static void usage(FILE *stream
)
329 "Synopsis: %s [-vw] [-c file] [-d dir] [-f file] [-h n] [-i XYZ] [-k n]\n"
330 " [-l n] [-n n] [-o base] [-t n] [files...]\n",
334 static void check_integer_arg(char opt
, const char *arg
, int min
, int *res
)
337 long n
= strtol(arg
, &ptr
, 10);
338 if (n
== 0 && ptr
== arg
)
339 error("argument to -%1 not an integer", opt
);
341 error("argument to -%1 must not be less than %2", opt
, min
);
344 error("argument to -%1 greater than maximum integer", opt
);
345 else if (*ptr
!= '\0')
346 error("junk after integer argument to -%1", opt
);
351 static char *get_cwd()
357 buf
= new char[size
];
358 if (getcwd(buf
, size
))
361 fatal("cannot get current working directory: %1", strerror(errno
));
364 fatal("current working directory longer than INT_MAX");
365 if (size
> INT_MAX
/2)
373 word_list::word_list(const char *s
, int n
, word_list
*p
)
380 static void read_common_words_file()
382 if (n_ignore_words
<= 0)
385 FILE *fp
= fopen(common_words_file
, "r");
387 fatal("can't open `%1': %2", common_words_file
, strerror(errno
));
388 common_words_table
= new word_list
* [hash_table_size
];
389 for (int i
= 0; i
< hash_table_size
; i
++)
390 common_words_table
[i
] = 0;
395 while (c
!= EOF
&& !csalnum(c
))
400 if (key_len
< truncate_len
)
401 key_buffer
[key_len
++] = cmlower(c
);
403 } while (c
!= EOF
&& csalnum(c
));
404 if (key_len
>= shortest_len
) {
405 int h
= hash(key_buffer
, key_len
) % hash_table_size
;
406 common_words_table
[h
] = new word_list(key_buffer
, key_len
,
407 common_words_table
[h
]);
409 if (++count
>= n_ignore_words
)
415 n_ignore_words
= count
;
419 static int do_whole_file(const char *filename
)
422 FILE *fp
= fopen(filename
, "r");
424 error("can't open `%1': %2", filename
, strerror(errno
));
430 while ((c
= getc(fp
)) != EOF
) {
434 while ((c
= getc(fp
)) != EOF
) {
437 if (key_len
< truncate_len
)
438 key_buffer
[key_len
++] = c
;
440 if (store_key(key_buffer
, key_len
)) {
441 if (++count
>= max_keys_per_item
)
448 store_reference(filenames
.length(), 0, 0);
449 store_filename(filename
);
454 static int do_file(const char *filename
)
457 // Need binary I/O for MS-DOS/MS-Windows, because indxbib relies on
458 // byte counts to be consistent with fseek.
459 FILE *fp
= fopen(filename
, FOPEN_RB
);
461 error("can't open `%1': %2", filename
, strerror(errno
));
464 int filename_index
= filenames
.length();
465 store_filename(filename
);
468 START
, // at the start of the file; also in between references
469 BOL
, // in the middle of a reference, at the beginning of the line
470 PERCENT
, // seen a percent at the beginning of the line
471 IGNORE
, // ignoring a field
472 IGNORE_BOL
, // at the beginning of a line ignoring a field
473 KEY
, // in the middle of a key
474 DISCARD
, // after truncate_len bytes of a key
475 MIDDLE
// in between keys
478 // In states START, BOL, IGNORE_BOL, space_count is the number of spaces
479 // seen at the beginning of the line. In states PERCENT, IGNORE, KEY,
480 // MIDDLE space_count must be 0.
482 int byte_count
= 0; // bytes read
484 int ref_start
= -1; // position of start of current reference
489 // We opened the file in binary mode, so we need to skip
490 // every CR character before a Newline.
500 #if defined(__MSDOS__) || defined(_MSC_VER) || defined(__EMX__)
501 else if (c
== 0x1a) // ^Z means EOF in text files
507 if (c
== ' ' || c
== '\t') {
515 ref_start
= byte_count
- space_count
- 1;
519 else if (csalnum(c
)) {
530 if (space_count
> 0) {
542 store_reference(filename_index
, ref_start
,
543 byte_count
- 1 - space_count
- ref_start
);
559 if (strchr(ignore_fields
, c
) != 0)
573 if (space_count
> 0) {
585 store_reference(filename_index
, ref_start
,
586 byte_count
- 1 - space_count
- ref_start
);
597 if (key_len
< truncate_len
)
598 key_buffer
[key_len
++] = c
;
603 possibly_store_key(key_buffer
, key_len
);
613 possibly_store_key(key_buffer
, key_len
);
639 possibly_store_key(key_buffer
, key_len
);
646 store_reference(filename_index
, ref_start
,
647 byte_count
- ref_start
- space_count
);
656 static void store_reference(int filename_index
, int pos
, int len
)
659 t
.filename_index
= filename_index
;
662 fwrite_or_die(&t
, sizeof(t
), 1, indxfp
);
666 static void store_filename(const char *fn
)
672 static void init_hash_table()
674 hash_table
= new table_entry
[hash_table_size
];
675 for (int i
= 0; i
< hash_table_size
; i
++)
676 hash_table
[i
].ptr
= 0;
679 static void possibly_store_key(char *s
, int len
)
681 static int last_tagno
= -1;
682 static int key_count
;
683 if (last_tagno
!= ntags
) {
687 if (key_count
< max_keys_per_item
) {
688 if (store_key(s
, len
))
693 static int store_key(char *s
, int len
)
695 if (len
< shortest_len
)
698 for (int i
= 0; i
< len
; i
++)
699 if (!csdigit(s
[i
])) {
701 s
[i
] = cmlower(s
[i
]);
703 if (is_number
&& !(len
== 4 && s
[0] == '1' && s
[1] == '9'))
705 int h
= hash(s
, len
) % hash_table_size
;
706 if (common_words_table
) {
707 for (word_list
*ptr
= common_words_table
[h
]; ptr
; ptr
= ptr
->next
)
708 if (len
== ptr
->len
&& memcmp(s
, ptr
->str
, len
) == 0)
711 table_entry
*pp
= hash_table
+ h
;
714 else if (pp
->ptr
->v
[pp
->ptr
->used
- 1] == ntags
)
716 else if (pp
->ptr
->used
>= BLOCK_SIZE
)
717 pp
->ptr
= new block(pp
->ptr
);
718 pp
->ptr
->v
[(pp
->ptr
->used
)++] = ntags
;
722 static void write_hash_table()
724 const int minus_one
= -1;
726 for (int i
= 0; i
< hash_table_size
; i
++) {
727 block
*ptr
= hash_table
[i
].ptr
;
729 hash_table
[i
].count
= -1;
731 hash_table
[i
].count
= li
;
740 fwrite_or_die(rev
->v
, sizeof(int), rev
->used
, indxfp
);
746 fwrite_or_die(&minus_one
, sizeof(int), 1, indxfp
);
750 if (sizeof(table_entry
) == sizeof(int))
751 fwrite_or_die(hash_table
, sizeof(int), hash_table_size
, indxfp
);
753 // write it out word by word
754 for (int i
= 0; i
< hash_table_size
; i
++)
755 fwrite_or_die(&hash_table
[i
].count
, sizeof(int), 1, indxfp
);
757 fwrite_or_die(filenames
.contents(), 1, filenames
.length(), indxfp
);
758 if (fseek(indxfp
, 0, 0) < 0)
759 fatal("error seeking on index file: %1", strerror(errno
));
761 h
.magic
= INDEX_MAGIC
;
762 h
.version
= INDEX_VERSION
;
765 h
.table_size
= hash_table_size
;
766 h
.strings_size
= filenames
.length();
767 h
.truncate
= truncate_len
;
768 h
.shortest
= shortest_len
;
769 h
.common
= n_ignore_words
;
770 fwrite_or_die(&h
, sizeof(h
), 1, indxfp
);
773 static void fwrite_or_die(const void *ptr
, int size
, int nitems
, FILE *fp
)
775 if (fwrite(ptr
, size
, nitems
, fp
) != (size_t)nitems
)
776 fatal("fwrite failed: %1", strerror(errno
));
779 void fatal_error_exit()
789 unlink(temp_index_file
);
793 _handle_fatal_signal(int signum
)
795 signal(signum
, SIG_DFL
);
798 kill(getpid(), signum
);
800 /* MS-DOS and Win32 don't have kill(); the best compromise is
801 probably to use exit() instead. */
807 _catch_fatal_signals(void)
810 signal(SIGHUP
, &_handle_fatal_signal
);
812 signal(SIGINT
, &_handle_fatal_signal
);
813 signal(SIGTERM
, &_handle_fatal_signal
);
818 _ignore_fatal_signals()
821 signal(SIGHUP
, SIG_IGN
);
823 signal(SIGINT
, SIG_IGN
);
824 signal(SIGTERM
, SIG_IGN
);
826 #endif /* HAVE_RENAME */