groff before CVS: release 1.06
[s-roff.git] / libbib / index.cc
blob8fe5acdae7950ff102557a9e2a0d6ea228389883
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
3 Written by James Clark (jjc@jclark.com)
5 This file is part of groff.
7 groff is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
10 version.
12 groff is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 for more details.
17 You should have received a copy of the GNU General Public License along
18 with groff; see the file COPYING. If not, write to the Free Software
19 Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. */
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdlib.h>
24 #include <errno.h>
26 #include "posix.h"
27 #include "lib.h"
28 #include "cset.h"
29 #include "cmap.h"
30 #include "errarg.h"
31 #include "error.h"
33 #include "refid.h"
34 #include "search.h"
35 #include "index.h"
36 #include "defs.h"
38 // Interface to mmap.
39 extern "C" {
40 void *mapread(int fd, int len);
41 int unmap(void *, int len);
44 const int minus_one = -1;
46 int verify_flag = 0;
48 struct word_list;
50 class index_search_item : public search_item {
51 search_item *out_of_date_files;
52 index_header header;
53 char *buffer;
54 void *map_addr;
55 int map_len;
56 tag *tags;
57 int *table;
58 int *lists;
59 char *pool;
60 char *key_buffer;
61 char *filename_buffer;
62 int filename_buflen;
63 char **common_words_table;
64 int common_words_table_size;
65 const char *ignore_fields;
66 time_t mtime;
68 const char *do_verify();
69 const int *search1(const char **pp, const char *end);
70 const int *search(const char *ptr, int length, int **temp_listp);
71 const char *munge_filename(const char *);
72 void read_common_words_file();
73 void add_out_of_date_file(int fd, const char *filename, int fid);
74 public:
75 index_search_item(const char *, int);
76 ~index_search_item();
77 int load(int fd);
78 search_item_iterator *make_search_item_iterator(const char *);
79 int verify();
80 void check_files();
81 int next_filename_id() const;
82 friend class index_search_item_iterator;
85 class index_search_item_iterator : public search_item_iterator {
86 index_search_item *indx;
87 search_item_iterator *out_of_date_files_iter;
88 search_item *next_out_of_date_file;
89 const int *found_list;
90 int *temp_list;
91 char *buf;
92 int buflen;
93 linear_searcher searcher;
94 char *query;
95 int get_tag(int tagno, const linear_searcher &, const char **, int *,
96 reference_id *);
97 public:
98 index_search_item_iterator(index_search_item *, const char *);
99 ~index_search_item_iterator();
100 int next(const linear_searcher &, const char **, int *, reference_id *);
104 index_search_item::index_search_item(const char *filename, int fid)
105 : search_item(filename, fid), out_of_date_files(0), key_buffer(0),
106 filename_buffer(0), filename_buflen(0), common_words_table(0),
107 map_addr(0), map_len(0), buffer(0)
111 index_search_item::~index_search_item()
113 if (buffer)
114 free(buffer);
115 if (map_addr) {
116 if (unmap(map_addr, map_len) < 0)
117 error("unmap: %1", strerror(errno));
119 while (out_of_date_files) {
120 search_item *tem = out_of_date_files;
121 out_of_date_files = out_of_date_files->next;
122 delete tem;
124 a_delete filename_buffer;
125 a_delete key_buffer;
126 if (common_words_table) {
127 for (int i = 0; i < common_words_table_size; i++)
128 a_delete common_words_table[i];
129 a_delete common_words_table;
133 class file_closer {
134 int *fdp;
135 public:
136 file_closer(int &fd) : fdp(&fd) { }
137 ~file_closer() { close(*fdp); }
140 int index_search_item::load(int fd)
142 file_closer fd_closer(fd); // close fd on return
143 struct stat sb;
144 if (fstat(fd, &sb) < 0) {
145 error("can't fstat `%1': %2", name, strerror(errno));
146 return 0;
148 if (!S_ISREG(sb.st_mode)) {
149 error("`%1' is not a regular file", name);
150 return 0;
152 mtime = sb.st_mtime;
153 int size = int(sb.st_size);
154 char *addr;
155 map_addr = mapread(fd, size);
156 if (map_addr) {
157 addr = (char *)map_addr;
158 map_len = size;
160 else {
161 addr = buffer = (char *)malloc(size);
162 if (buffer == 0) {
163 error("can't allocate buffer for `%1'", name);
164 return 0;
166 char *ptr = buffer;
167 int bytes_to_read = size;
168 while (bytes_to_read > 0) {
169 int nread = read(fd, ptr, bytes_to_read);
170 if (nread == 0) {
171 error("unexpected EOF on `%1'", name);
172 return 0;
174 if (nread < 0) {
175 error("read error on `%1': %2", name, strerror(errno));
176 return 0;
178 bytes_to_read -= nread;
179 ptr += nread;
182 header = *(index_header *)addr;
183 if (header.magic != INDEX_MAGIC) {
184 error("`%1' is not an index file: wrong magic number", name);
185 return 0;
187 if (header.version != INDEX_VERSION) {
188 error("version number in `%1' is wrong: was %2, should be %3",
189 name, header.version, INDEX_VERSION);
190 return 0;
192 int sz = (header.tags_size * sizeof(tag)
193 + header.lists_size * sizeof(int)
194 + header.table_size * sizeof(int)
195 + header.strings_size
196 + sizeof(header));
197 if (sz != size) {
198 error("size of `%1' is wrong: was %2, should be %3",
199 name, size, sz);
200 return 0;
202 tags = (tag *)(addr + sizeof(header));
203 lists = (int *)(tags + header.tags_size);
204 table = (int *)(lists + header.lists_size);
205 pool = (char *)(table + header.table_size);
206 ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
207 key_buffer = new char[header.truncate];
208 read_common_words_file();
209 return 1;
212 const char *index_search_item::do_verify()
214 if (tags == 0)
215 return "not loaded";
216 if (lists[header.lists_size - 1] >= 0)
217 return "last list element not negative";
218 int i;
219 for (i = 0; i < header.table_size; i++) {
220 int li = table[i];
221 if (li >= header.lists_size)
222 return "bad list index";
223 if (li >= 0) {
224 for (int *ptr = lists + li; *ptr >= 0; ptr++) {
225 if (*ptr >= header.tags_size)
226 return "bad tag index";
227 if (*ptr >= ptr[1] && ptr[1] >= 0)
228 return "list not ordered";
232 for (i = 0; i < header.tags_size; i++) {
233 if (tags[i].filename_index >= header.strings_size)
234 return "bad index in tags";
235 if (tags[i].length < 0)
236 return "bad length in tags";
237 if (tags[i].start < 0)
238 return "bad start in tags";
240 if (pool[header.strings_size - 1] != '\0')
241 return "last character in pool not nul";
242 return 0;
245 int index_search_item::verify()
247 const char *reason = do_verify();
248 if (!reason)
249 return 1;
250 error("`%1' is bad: %2", name, reason);
251 return 0;
254 int index_search_item::next_filename_id() const
256 return filename_id + header.strings_size + 1;
259 search_item_iterator *index_search_item::make_search_item_iterator(
260 const char *query)
262 return new index_search_item_iterator(this, query);
265 search_item *make_index_search_item(const char *filename, int fid)
267 char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
268 strcpy(index_filename, filename);
269 strcat(index_filename, INDEX_SUFFIX);
270 int fd = open(index_filename, O_RDONLY);
271 if (fd < 0)
272 return 0;
273 index_search_item *item = new index_search_item(index_filename, fid);
274 a_delete index_filename;
275 if (!item->load(fd)) {
276 close(fd);
277 delete item;
278 return 0;
280 else if (verify_flag && !item->verify()) {
281 delete item;
282 return 0;
284 else {
285 item->check_files();
286 return item;
291 index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
292 const char *q)
293 : indx(ind), buf(0), buflen(0), temp_list(0), query(strsave(q)),
294 searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
295 out_of_date_files_iter(0), next_out_of_date_file(0)
297 found_list = indx->search(q, strlen(q), &temp_list);
298 if (!found_list) {
299 found_list = &minus_one;
300 warning("all keys would have been discarded in constructing index `%1'",
301 indx->name);
305 index_search_item_iterator::~index_search_item_iterator()
307 a_delete temp_list;
308 a_delete buf;
309 a_delete query;
310 delete out_of_date_files_iter;
313 int index_search_item_iterator::next(const linear_searcher &,
314 const char **pp, int *lenp,
315 reference_id *ridp)
317 if (found_list) {
318 for (;;) {
319 int tagno = *found_list;
320 if (tagno == -1)
321 break;
322 found_list++;
323 if (get_tag(tagno, searcher, pp, lenp, ridp))
324 return 1;
326 found_list = 0;
327 next_out_of_date_file = indx->out_of_date_files;
329 while (next_out_of_date_file) {
330 if (out_of_date_files_iter == 0)
331 out_of_date_files_iter
332 = next_out_of_date_file->make_search_item_iterator(query);
333 if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
334 return 1;
335 delete out_of_date_files_iter;
336 out_of_date_files_iter = 0;
337 next_out_of_date_file = next_out_of_date_file->next;
339 return 0;
342 int index_search_item_iterator::get_tag(int tagno,
343 const linear_searcher &searcher,
344 const char **pp, int *lenp,
345 reference_id *ridp)
347 if (tagno < 0 || tagno >= indx->header.tags_size) {
348 error("bad tag number");
349 return 0;
351 tag *tp = indx->tags + tagno;
352 const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
353 int fd = open(filename, O_RDONLY);
354 if (fd < 0) {
355 error("can't open `%1': %2", filename, strerror(errno));
356 return 0;
358 struct stat sb;
359 if (fstat(fd, &sb) < 0) {
360 error("can't fstat: %1", strerror(errno));
361 close(fd);
362 return 0;
364 time_t mtime = sb.st_mtime;
365 if (mtime > indx->mtime) {
366 indx->add_out_of_date_file(fd, filename,
367 indx->filename_id + tp->filename_index);
368 return 0;
370 int res = 0;
371 FILE *fp = fdopen(fd, "r");
372 if (!fp) {
373 error("fdopen failed");
374 close(fd);
375 return 0;
377 if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
378 error("can't seek on `%1': %2", filename, strerror(errno));
379 else {
380 int length = tp->length;
381 int err = 0;
382 if (length == 0) {
383 struct stat sb;
384 if (fstat(fileno(fp), &sb) < 0) {
385 error("can't stat `%1': %2", filename, strerror(errno));
386 err = 1;
388 else if ((sb.st_mode & S_IFMT) != S_IFREG) {
389 error("`%1' is not a regular file", filename);
390 err = 1;
392 else
393 length = int(sb.st_size);
395 if (!err) {
396 if (length + 2 > buflen) {
397 a_delete buf;
398 buflen = length + 2;
399 buf = new char[buflen];
401 if (fread(buf + 1, 1, length, fp) != length)
402 error("fread on `%1' failed: %2", filename, strerror(errno));
403 else {
404 buf[0] = '\n';
405 buf[length + 1] = '\n';
406 res = searcher.search(buf + 1, buf + 2 + length, pp, lenp);
407 if (res && ridp)
408 *ridp = reference_id(indx->filename_id + tp->filename_index,
409 tp->start);
413 fclose(fp);
414 return res;
417 const char *index_search_item::munge_filename(const char *filename)
419 if (filename[0] == '/')
420 return filename;
421 const char *cwd = pool;
422 int need_slash = (cwd[0] != 0 && strchr(cwd, '\0')[-1] != '/');
423 int len = strlen(cwd) + strlen(filename) + need_slash + 1;
424 if (len > filename_buflen) {
425 a_delete filename_buffer;
426 filename_buflen = len;
427 filename_buffer = new char[len];
429 strcpy(filename_buffer, cwd);
430 if (need_slash)
431 strcat(filename_buffer, "/");
432 strcat(filename_buffer, filename);
433 return filename_buffer;
436 const int *index_search_item::search1(const char **pp, const char *end)
438 while (*pp < end && !csalnum(**pp))
439 *pp += 1;
440 if (*pp >= end)
441 return 0;
442 const char *start = *pp;
443 while (*pp < end && csalnum(**pp))
444 *pp += 1;
445 int len = *pp - start;
446 if (len < header.shortest)
447 return 0;
448 if (len > header.truncate)
449 len = header.truncate;
450 int is_number = 1;
451 for (int i = 0; i < len; i++)
452 if (csdigit(start[i]))
453 key_buffer[i] = start[i];
454 else {
455 key_buffer[i] = cmlower(start[i]);
456 is_number = 0;
458 if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
459 return 0;
460 unsigned hc = hash(key_buffer, len);
461 if (common_words_table) {
462 for (int h = hc % common_words_table_size;
463 common_words_table[h];
464 --h) {
465 if (strlen(common_words_table[h]) == len
466 && memcmp(common_words_table[h], key_buffer, len) == 0)
467 return 0;
468 if (h == 0)
469 h = common_words_table_size;
472 int li = table[int(hc % header.table_size)];
473 return li < 0 ? &minus_one : lists + li;
476 static void merge(int *result, const int *s1, const int *s2)
478 for (; *s1 >= 0; s1++) {
479 while (*s2 >= 0 && *s2 < *s1)
480 s2++;
481 if (*s2 == *s1)
482 *result++ = *s2;
484 *result++ = -1;
487 const int *index_search_item::search(const char *ptr, int length,
488 int **temp_listp)
490 const char *end = ptr + length;
491 if (*temp_listp) {
492 a_delete *temp_listp;
493 *temp_listp = 0;
495 const int *first_list = 0;
496 while (ptr < end && (first_list = search1(&ptr, end)) == 0)
498 if (!first_list)
499 return 0;
500 if (*first_list < 0)
501 return first_list;
502 const int *second_list = 0;
503 while (ptr < end && (second_list = search1(&ptr, end)) == 0)
505 if (!second_list)
506 return first_list;
507 if (*second_list < 0)
508 return second_list;
509 for (const int *p = first_list; *p >= 0; p++)
511 int len = p - first_list;
512 for (p = second_list; *p >= 0; p++)
514 if (p - second_list < len)
515 len = p - second_list;
516 int *matches = new int[len + 1];
517 merge(matches, first_list, second_list);
518 while (ptr < end) {
519 const int *list = search1(&ptr, end);
520 if (list != 0) {
521 if (*list < 0) {
522 a_delete matches;
523 return list;
525 merge(matches, matches, list);
526 if (*matches < 0) {
527 a_delete matches;
528 return &minus_one;
532 *temp_listp = matches;
533 return matches;
536 void index_search_item::read_common_words_file()
538 if (header.common <= 0)
539 return;
540 const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
541 errno = 0;
542 FILE *fp = fopen(common_words_file, "r");
543 if (!fp) {
544 error("can't open `%1': %2", common_words_file, strerror(errno));
545 return;
547 common_words_table_size = 2*header.common + 1;
548 while (!is_prime(common_words_table_size))
549 common_words_table_size++;
550 common_words_table = new char *[common_words_table_size];
551 for (int i = 0; i < common_words_table_size; i++)
552 common_words_table[i] = 0;
553 int count = 0;
554 int key_len = 0;
555 for (;;) {
556 int c = getc(fp);
557 while (c != EOF && !csalnum(c))
558 c = getc(fp);
559 if (c == EOF)
560 break;
561 do {
562 if (key_len < header.truncate)
563 key_buffer[key_len++] = cmlower(c);
564 c = getc(fp);
565 } while (c != EOF && csalnum(c));
566 if (key_len >= header.shortest) {
567 int h = hash(key_buffer, key_len) % common_words_table_size;
568 while (common_words_table[h]) {
569 if (h == 0)
570 h = common_words_table_size;
571 --h;
573 common_words_table[h] = new char[key_len + 1];
574 memcpy(common_words_table[h], key_buffer, key_len);
575 common_words_table[h][key_len] = '\0';
577 if (++count >= header.common)
578 break;
579 key_len = 0;
580 if (c == EOF)
581 break;
583 fclose(fp);
586 void index_search_item::add_out_of_date_file(int fd, const char *filename,
587 int fid)
589 for (search_item **pp = &out_of_date_files; *pp; pp = &(*pp)->next)
590 if ((*pp)->is_named(filename))
591 return;
592 *pp = make_linear_search_item(fd, filename, fid);
593 warning("`%1' modified since `%2' created", filename, name);
596 void index_search_item::check_files()
598 const char *pool_end = pool + header.strings_size;
599 for (const char *ptr = strchr(ignore_fields, '\0') + 1;
600 ptr < pool_end;
601 ptr = strchr(ptr, '\0') + 1) {
602 const char *path = munge_filename(ptr);
603 struct stat sb;
604 if (stat(path, &sb) < 0)
605 error("can't stat `%1': %2", path, strerror(errno));
606 else if (sb.st_mtime > mtime) {
607 int fd = open(path, O_RDONLY);
608 if (fd < 0)
609 error("can't open `%1': %2", path, strerror(errno));
610 else
611 add_out_of_date_file(fd, path, filename_id + (ptr - pool));