* All affected files: Update postal address of FSF.
[s-roff.git] / src / libs / libbib / index.cpp
blob6b9b9e350efe60309dd1c9ea881f53d12abc82fd
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2004
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
6 This file is part of groff.
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING. If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "lib.h"
24 #include <stdlib.h>
25 #include <errno.h>
27 #include "posix.h"
28 #include "cset.h"
29 #include "cmap.h"
30 #include "errarg.h"
31 #include "error.h"
33 #include "refid.h"
34 #include "search.h"
35 #include "index.h"
36 #include "defs.h"
38 #include "nonposix.h"
40 // Interface to mmap.
41 extern "C" {
42 void *mapread(int fd, int len);
43 int unmap(void *, int len);
46 #if 0
47 const
48 #endif
49 int minus_one = -1;
51 int verify_flag = 0;
53 struct word_list;
55 class index_search_item : public search_item {
56 search_item *out_of_date_files;
57 index_header header;
58 char *buffer;
59 void *map_addr;
60 int map_len;
61 tag *tags;
62 int *table;
63 int *lists;
64 char *pool;
65 char *key_buffer;
66 char *filename_buffer;
67 int filename_buflen;
68 char **common_words_table;
69 int common_words_table_size;
70 const char *ignore_fields;
71 time_t mtime;
73 const char *do_verify();
74 const int *search1(const char **pp, const char *end);
75 const int *search(const char *ptr, int length, int **temp_listp);
76 const char *munge_filename(const char *);
77 void read_common_words_file();
78 void add_out_of_date_file(int fd, const char *filename, int fid);
79 public:
80 index_search_item(const char *, int);
81 ~index_search_item();
82 int load(int fd);
83 search_item_iterator *make_search_item_iterator(const char *);
84 int verify();
85 void check_files();
86 int next_filename_id() const;
87 friend class index_search_item_iterator;
90 class index_search_item_iterator : public search_item_iterator {
91 index_search_item *indx;
92 search_item_iterator *out_of_date_files_iter;
93 search_item *next_out_of_date_file;
94 const int *found_list;
95 int *temp_list;
96 char *buf;
97 int buflen;
98 linear_searcher searcher;
99 char *query;
100 int get_tag(int tagno, const linear_searcher &, const char **, int *,
101 reference_id *);
102 public:
103 index_search_item_iterator(index_search_item *, const char *);
104 ~index_search_item_iterator();
105 int next(const linear_searcher &, const char **, int *, reference_id *);
109 index_search_item::index_search_item(const char *filename, int fid)
110 : search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
111 map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
112 common_words_table(0)
116 index_search_item::~index_search_item()
118 if (buffer)
119 free(buffer);
120 if (map_addr) {
121 if (unmap(map_addr, map_len) < 0)
122 error("unmap: %1", strerror(errno));
124 while (out_of_date_files) {
125 search_item *tem = out_of_date_files;
126 out_of_date_files = out_of_date_files->next;
127 delete tem;
129 a_delete filename_buffer;
130 a_delete key_buffer;
131 if (common_words_table) {
132 for (int i = 0; i < common_words_table_size; i++)
133 a_delete common_words_table[i];
134 a_delete common_words_table;
138 class file_closer {
139 int *fdp;
140 public:
141 file_closer(int &fd) : fdp(&fd) { }
142 ~file_closer() { close(*fdp); }
145 // Tell the compiler that a variable is intentionally unused.
146 inline void unused(void *) { }
148 int index_search_item::load(int fd)
150 file_closer fd_closer(fd); // close fd on return
151 unused(&fd_closer);
152 struct stat sb;
153 if (fstat(fd, &sb) < 0) {
154 error("can't fstat `%1': %2", name, strerror(errno));
155 return 0;
157 if (!S_ISREG(sb.st_mode)) {
158 error("`%1' is not a regular file", name);
159 return 0;
161 mtime = sb.st_mtime;
162 int size = int(sb.st_size);
163 char *addr;
164 map_addr = mapread(fd, size);
165 if (map_addr) {
166 addr = (char *)map_addr;
167 map_len = size;
169 else {
170 addr = buffer = (char *)malloc(size);
171 if (buffer == 0) {
172 error("can't allocate buffer for `%1'", name);
173 return 0;
175 char *ptr = buffer;
176 int bytes_to_read = size;
177 while (bytes_to_read > 0) {
178 int nread = read(fd, ptr, bytes_to_read);
179 if (nread == 0) {
180 error("unexpected EOF on `%1'", name);
181 return 0;
183 if (nread < 0) {
184 error("read error on `%1': %2", name, strerror(errno));
185 return 0;
187 bytes_to_read -= nread;
188 ptr += nread;
191 header = *(index_header *)addr;
192 if (header.magic != INDEX_MAGIC) {
193 error("`%1' is not an index file: wrong magic number", name);
194 return 0;
196 if (header.version != INDEX_VERSION) {
197 error("version number in `%1' is wrong: was %2, should be %3",
198 name, header.version, INDEX_VERSION);
199 return 0;
201 int sz = (header.tags_size * sizeof(tag)
202 + header.lists_size * sizeof(int)
203 + header.table_size * sizeof(int)
204 + header.strings_size
205 + sizeof(header));
206 if (sz != size) {
207 error("size of `%1' is wrong: was %2, should be %3",
208 name, size, sz);
209 return 0;
211 tags = (tag *)(addr + sizeof(header));
212 lists = (int *)(tags + header.tags_size);
213 table = (int *)(lists + header.lists_size);
214 pool = (char *)(table + header.table_size);
215 ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
216 key_buffer = new char[header.truncate];
217 read_common_words_file();
218 return 1;
221 const char *index_search_item::do_verify()
223 if (tags == 0)
224 return "not loaded";
225 if (lists[header.lists_size - 1] >= 0)
226 return "last list element not negative";
227 int i;
228 for (i = 0; i < header.table_size; i++) {
229 int li = table[i];
230 if (li >= header.lists_size)
231 return "bad list index";
232 if (li >= 0) {
233 for (int *ptr = lists + li; *ptr >= 0; ptr++) {
234 if (*ptr >= header.tags_size)
235 return "bad tag index";
236 if (*ptr >= ptr[1] && ptr[1] >= 0)
237 return "list not ordered";
241 for (i = 0; i < header.tags_size; i++) {
242 if (tags[i].filename_index >= header.strings_size)
243 return "bad index in tags";
244 if (tags[i].length < 0)
245 return "bad length in tags";
246 if (tags[i].start < 0)
247 return "bad start in tags";
249 if (pool[header.strings_size - 1] != '\0')
250 return "last character in pool not nul";
251 return 0;
254 int index_search_item::verify()
256 const char *reason = do_verify();
257 if (!reason)
258 return 1;
259 error("`%1' is bad: %2", name, reason);
260 return 0;
263 int index_search_item::next_filename_id() const
265 return filename_id + header.strings_size + 1;
268 search_item_iterator *index_search_item::make_search_item_iterator(
269 const char *query)
271 return new index_search_item_iterator(this, query);
274 search_item *make_index_search_item(const char *filename, int fid)
276 char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
277 strcpy(index_filename, filename);
278 strcat(index_filename, INDEX_SUFFIX);
279 int fd = open(index_filename, O_RDONLY | O_BINARY);
280 if (fd < 0)
281 return 0;
282 index_search_item *item = new index_search_item(index_filename, fid);
283 a_delete index_filename;
284 if (!item->load(fd)) {
285 close(fd);
286 delete item;
287 return 0;
289 else if (verify_flag && !item->verify()) {
290 delete item;
291 return 0;
293 else {
294 item->check_files();
295 return item;
300 index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
301 const char *q)
302 : indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
303 buf(0), buflen(0),
304 searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
305 query(strsave(q))
307 found_list = indx->search(q, strlen(q), &temp_list);
308 if (!found_list) {
309 found_list = &minus_one;
310 warning("all keys would have been discarded in constructing index `%1'",
311 indx->name);
315 index_search_item_iterator::~index_search_item_iterator()
317 a_delete temp_list;
318 a_delete buf;
319 a_delete query;
320 delete out_of_date_files_iter;
323 int index_search_item_iterator::next(const linear_searcher &,
324 const char **pp, int *lenp,
325 reference_id *ridp)
327 if (found_list) {
328 for (;;) {
329 int tagno = *found_list;
330 if (tagno == -1)
331 break;
332 found_list++;
333 if (get_tag(tagno, searcher, pp, lenp, ridp))
334 return 1;
336 found_list = 0;
337 next_out_of_date_file = indx->out_of_date_files;
339 while (next_out_of_date_file) {
340 if (out_of_date_files_iter == 0)
341 out_of_date_files_iter
342 = next_out_of_date_file->make_search_item_iterator(query);
343 if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
344 return 1;
345 delete out_of_date_files_iter;
346 out_of_date_files_iter = 0;
347 next_out_of_date_file = next_out_of_date_file->next;
349 return 0;
352 int index_search_item_iterator::get_tag(int tagno,
353 const linear_searcher &searchr,
354 const char **pp, int *lenp,
355 reference_id *ridp)
357 if (tagno < 0 || tagno >= indx->header.tags_size) {
358 error("bad tag number");
359 return 0;
361 tag *tp = indx->tags + tagno;
362 const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
363 int fd = open(filename, O_RDONLY | O_BINARY);
364 if (fd < 0) {
365 error("can't open `%1': %2", filename, strerror(errno));
366 return 0;
368 struct stat sb;
369 if (fstat(fd, &sb) < 0) {
370 error("can't fstat: %1", strerror(errno));
371 close(fd);
372 return 0;
374 time_t mtime = sb.st_mtime;
375 if (mtime > indx->mtime) {
376 indx->add_out_of_date_file(fd, filename,
377 indx->filename_id + tp->filename_index);
378 return 0;
380 int res = 0;
381 FILE *fp = fdopen(fd, FOPEN_RB);
382 if (!fp) {
383 error("fdopen failed");
384 close(fd);
385 return 0;
387 if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
388 error("can't seek on `%1': %2", filename, strerror(errno));
389 else {
390 int length = tp->length;
391 int err = 0;
392 if (length == 0) {
393 if (fstat(fileno(fp), &sb) < 0) {
394 error("can't stat `%1': %2", filename, strerror(errno));
395 err = 1;
397 else if (!S_ISREG(sb.st_mode)) {
398 error("`%1' is not a regular file", filename);
399 err = 1;
401 else
402 length = int(sb.st_size);
404 if (!err) {
405 if (length + 2 > buflen) {
406 a_delete buf;
407 buflen = length + 2;
408 buf = new char[buflen];
410 if (fread(buf + 1, 1, length, fp) != (size_t)length)
411 error("fread on `%1' failed: %2", filename, strerror(errno));
412 else {
413 buf[0] = '\n';
414 // Remove the CR characters from CRLF pairs.
415 int sidx = 1, didx = 1;
416 for ( ; sidx < length + 1; sidx++, didx++)
418 if (buf[sidx] == '\r')
420 if (buf[++sidx] != '\n')
421 buf[didx++] = '\r';
422 else
423 length--;
425 if (sidx != didx)
426 buf[didx] = buf[sidx];
428 buf[length + 1] = '\n';
429 res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
430 if (res && ridp)
431 *ridp = reference_id(indx->filename_id + tp->filename_index,
432 tp->start);
436 fclose(fp);
437 return res;
440 const char *index_search_item::munge_filename(const char *filename)
442 if (IS_ABSOLUTE(filename))
443 return filename;
444 const char *cwd = pool;
445 int need_slash = (cwd[0] != 0
446 && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
447 int len = strlen(cwd) + strlen(filename) + need_slash + 1;
448 if (len > filename_buflen) {
449 a_delete filename_buffer;
450 filename_buflen = len;
451 filename_buffer = new char[len];
453 strcpy(filename_buffer, cwd);
454 if (need_slash)
455 strcat(filename_buffer, "/");
456 strcat(filename_buffer, filename);
457 return filename_buffer;
460 const int *index_search_item::search1(const char **pp, const char *end)
462 while (*pp < end && !csalnum(**pp))
463 *pp += 1;
464 if (*pp >= end)
465 return 0;
466 const char *start = *pp;
467 while (*pp < end && csalnum(**pp))
468 *pp += 1;
469 int len = *pp - start;
470 if (len < header.shortest)
471 return 0;
472 if (len > header.truncate)
473 len = header.truncate;
474 int is_number = 1;
475 for (int i = 0; i < len; i++)
476 if (csdigit(start[i]))
477 key_buffer[i] = start[i];
478 else {
479 key_buffer[i] = cmlower(start[i]);
480 is_number = 0;
482 if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
483 return 0;
484 unsigned hc = hash(key_buffer, len);
485 if (common_words_table) {
486 for (int h = hc % common_words_table_size;
487 common_words_table[h];
488 --h) {
489 if (strlen(common_words_table[h]) == (size_t)len
490 && memcmp(common_words_table[h], key_buffer, len) == 0)
491 return 0;
492 if (h == 0)
493 h = common_words_table_size;
496 int li = table[int(hc % header.table_size)];
497 return li < 0 ? &minus_one : lists + li;
500 static void merge(int *result, const int *s1, const int *s2)
502 for (; *s1 >= 0; s1++) {
503 while (*s2 >= 0 && *s2 < *s1)
504 s2++;
505 if (*s2 == *s1)
506 *result++ = *s2;
508 *result++ = -1;
511 const int *index_search_item::search(const char *ptr, int length,
512 int **temp_listp)
514 const char *end = ptr + length;
515 if (*temp_listp) {
516 a_delete *temp_listp;
517 *temp_listp = 0;
519 const int *first_list = 0;
520 while (ptr < end && (first_list = search1(&ptr, end)) == 0)
522 if (!first_list)
523 return 0;
524 if (*first_list < 0)
525 return first_list;
526 const int *second_list = 0;
527 while (ptr < end && (second_list = search1(&ptr, end)) == 0)
529 if (!second_list)
530 return first_list;
531 if (*second_list < 0)
532 return second_list;
533 const int *p;
534 for (p = first_list; *p >= 0; p++)
536 int len = p - first_list;
537 for (p = second_list; *p >= 0; p++)
539 if (p - second_list < len)
540 len = p - second_list;
541 int *matches = new int[len + 1];
542 merge(matches, first_list, second_list);
543 while (ptr < end) {
544 const int *list = search1(&ptr, end);
545 if (list != 0) {
546 if (*list < 0) {
547 a_delete matches;
548 return list;
550 merge(matches, matches, list);
551 if (*matches < 0) {
552 a_delete matches;
553 return &minus_one;
557 *temp_listp = matches;
558 return matches;
561 void index_search_item::read_common_words_file()
563 if (header.common <= 0)
564 return;
565 const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
566 errno = 0;
567 FILE *fp = fopen(common_words_file, "r");
568 if (!fp) {
569 error("can't open `%1': %2", common_words_file, strerror(errno));
570 return;
572 common_words_table_size = 2*header.common + 1;
573 while (!is_prime(common_words_table_size))
574 common_words_table_size++;
575 common_words_table = new char *[common_words_table_size];
576 for (int i = 0; i < common_words_table_size; i++)
577 common_words_table[i] = 0;
578 int count = 0;
579 int key_len = 0;
580 for (;;) {
581 int c = getc(fp);
582 while (c != EOF && !csalnum(c))
583 c = getc(fp);
584 if (c == EOF)
585 break;
586 do {
587 if (key_len < header.truncate)
588 key_buffer[key_len++] = cmlower(c);
589 c = getc(fp);
590 } while (c != EOF && csalnum(c));
591 if (key_len >= header.shortest) {
592 int h = hash(key_buffer, key_len) % common_words_table_size;
593 while (common_words_table[h]) {
594 if (h == 0)
595 h = common_words_table_size;
596 --h;
598 common_words_table[h] = new char[key_len + 1];
599 memcpy(common_words_table[h], key_buffer, key_len);
600 common_words_table[h][key_len] = '\0';
602 if (++count >= header.common)
603 break;
604 key_len = 0;
605 if (c == EOF)
606 break;
608 fclose(fp);
611 void index_search_item::add_out_of_date_file(int fd, const char *filename,
612 int fid)
614 search_item **pp;
615 for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
616 if ((*pp)->is_named(filename))
617 return;
618 *pp = make_linear_search_item(fd, filename, fid);
619 warning("`%1' modified since `%2' created", filename, name);
622 void index_search_item::check_files()
624 const char *pool_end = pool + header.strings_size;
625 for (const char *ptr = strchr(ignore_fields, '\0') + 1;
626 ptr < pool_end;
627 ptr = strchr(ptr, '\0') + 1) {
628 const char *path = munge_filename(ptr);
629 struct stat sb;
630 if (stat(path, &sb) < 0)
631 error("can't stat `%1': %2", path, strerror(errno));
632 else if (sb.st_mtime > mtime) {
633 int fd = open(path, O_RDONLY | O_BINARY);
634 if (fd < 0)
635 error("can't open `%1': %2", path, strerror(errno));
636 else
637 add_out_of_date_file(fd, path, filename_id + (ptr - pool));