Sync-to-go: update copyright for 2015
[s-roff.git] / src / lib-bib / index.cpp
blob16b6861e5607c9207f40ab82e0abe8965317c27a
1 /*@
2 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
4 * Copyright (C) 1989 - 1992, 2001, 2004
5 * Free Software Foundation, Inc.
6 * Written by James Clark (jjc@jclark.com)
8 * This is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
11 * version.
13 * This is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 * for more details.
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
22 #include "config.h"
23 #include "lib.h"
25 #include <stdlib.h>
26 #include <errno.h>
28 #include "cset.h"
29 #include "cmap.h"
30 #include "defs.h"
31 #include "errarg.h"
32 #include "error.h"
33 #include "index.h"
34 #include "nonposix.h"
35 #include "posix.h"
36 #include "refid.h"
37 #include "search.h"
39 #include "bib.h"
41 #if 0
42 const
43 #endif
44 int minus_one = -1;
46 int verify_flag = 0;
48 struct word_list;
50 class index_search_item
51 : public search_item
53 search_item *out_of_date_files;
54 index_header header;
55 char *buffer;
56 void *map_addr;
57 int map_len;
58 tag *tags;
59 int *table;
60 int *lists;
61 char *pool;
62 char *key_buffer;
63 char *filename_buffer;
64 int filename_buflen;
65 char **common_words_table;
66 int common_words_table_size;
67 const char *ignore_fields;
68 time_t mtime;
70 const char *do_verify();
71 const int *search1(const char **pp, const char *end);
72 const int *search(const char *ptr, int length, int **temp_listp);
73 const char *munge_filename(const char *);
74 void read_common_words_file();
75 void add_out_of_date_file(int fd, const char *filename, int fid);
77 public:
78 index_search_item(const char *, int);
79 ~index_search_item();
80 int load(int fd);
81 search_item_iterator *make_search_item_iterator(const char *);
82 int verify();
83 void check_files();
84 int next_filename_id() const;
85 friend class index_search_item_iterator;
88 class index_search_item_iterator
89 : public search_item_iterator
91 index_search_item *indx;
92 search_item_iterator *out_of_date_files_iter;
93 search_item *next_out_of_date_file;
94 const int *found_list;
95 int *temp_list;
96 char *buf;
97 int buflen;
98 linear_searcher searcher;
99 char *query;
100 int get_tag(int tagno, const linear_searcher &, const char **, int *,
101 reference_id *);
103 public:
104 index_search_item_iterator(index_search_item *, const char *);
105 ~index_search_item_iterator();
106 int next(const linear_searcher &, const char **, int *, reference_id *);
110 index_search_item::index_search_item(const char *filename, int fid)
111 : search_item(filename, fid), out_of_date_files(0), buffer(0), map_addr(0),
112 map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
113 common_words_table(0)
117 index_search_item::~index_search_item()
119 if (buffer)
120 free(buffer);
121 if (map_addr) {
122 if (unmap(map_addr, map_len) < 0)
123 error("unmap: %1", strerror(errno));
125 while (out_of_date_files) {
126 search_item *tem = out_of_date_files;
127 out_of_date_files = out_of_date_files->next;
128 delete tem;
130 a_delete filename_buffer;
131 a_delete key_buffer;
132 if (common_words_table) {
133 for (int i = 0; i < common_words_table_size; i++)
134 a_delete common_words_table[i];
135 a_delete common_words_table;
139 class file_closer
141 int *fdp;
143 public:
144 file_closer(int &fd) : fdp(&fd) { }
145 ~file_closer() { close(*fdp); }
148 // Tell the compiler that a variable is intentionally unused.
149 inline void unused(void *) { }
151 int index_search_item::load(int fd)
153 file_closer fd_closer(fd); // close fd on return
154 unused(&fd_closer);
155 struct stat sb;
156 if (fstat(fd, &sb) < 0) {
157 error("can't fstat `%1': %2", name, strerror(errno));
158 return 0;
160 if (!S_ISREG(sb.st_mode)) {
161 error("`%1' is not a regular file", name);
162 return 0;
164 mtime = sb.st_mtime;
165 int size = int(sb.st_size);
166 char *addr;
167 map_addr = mapread(fd, size);
168 if (map_addr) {
169 addr = (char *)map_addr;
170 map_len = size;
172 else {
173 addr = buffer = (char *)malloc(size);
174 if (buffer == 0) {
175 error("can't allocate buffer for `%1'", name);
176 return 0;
178 char *ptr = buffer;
179 int bytes_to_read = size;
180 while (bytes_to_read > 0) {
181 int nread = read(fd, ptr, bytes_to_read);
182 if (nread == 0) {
183 error("unexpected EOF on `%1'", name);
184 return 0;
186 if (nread < 0) {
187 error("read error on `%1': %2", name, strerror(errno));
188 return 0;
190 bytes_to_read -= nread;
191 ptr += nread;
194 header = *(index_header *)addr;
195 if (header.magic != INDEX_MAGIC) {
196 error("`%1' is not an index file: wrong magic number", name);
197 return 0;
199 if (header.version != INDEX_VERSION) {
200 error("version number in `%1' is wrong: was %2, should be %3",
201 name, header.version, INDEX_VERSION);
202 return 0;
204 int sz = (header.tags_size * sizeof(tag)
205 + header.lists_size * sizeof(int)
206 + header.table_size * sizeof(int)
207 + header.strings_size
208 + sizeof(header));
209 if (sz != size) {
210 error("size of `%1' is wrong: was %2, should be %3",
211 name, size, sz);
212 return 0;
214 tags = (tag *)(addr + sizeof(header));
215 lists = (int *)(tags + header.tags_size);
216 table = (int *)(lists + header.lists_size);
217 pool = (char *)(table + header.table_size);
218 ignore_fields = strchr(strchr(pool, '\0') + 1, '\0') + 1;
219 key_buffer = new char[header.truncate];
220 read_common_words_file();
221 return 1;
224 const char *index_search_item::do_verify()
226 if (tags == 0)
227 return "not loaded";
228 if (lists[header.lists_size - 1] >= 0)
229 return "last list element not negative";
230 int i;
231 for (i = 0; i < header.table_size; i++) {
232 int li = table[i];
233 if (li >= header.lists_size)
234 return "bad list index";
235 if (li >= 0) {
236 for (int *ptr = lists + li; *ptr >= 0; ptr++) {
237 if (*ptr >= header.tags_size)
238 return "bad tag index";
239 if (*ptr >= ptr[1] && ptr[1] >= 0)
240 return "list not ordered";
244 for (i = 0; i < header.tags_size; i++) {
245 if (tags[i].filename_index >= header.strings_size)
246 return "bad index in tags";
247 if (tags[i].length < 0)
248 return "bad length in tags";
249 if (tags[i].start < 0)
250 return "bad start in tags";
252 if (pool[header.strings_size - 1] != '\0')
253 return "last character in pool not nul";
254 return 0;
257 int index_search_item::verify()
259 const char *reason = do_verify();
260 if (!reason)
261 return 1;
262 error("`%1' is bad: %2", name, reason);
263 return 0;
266 int index_search_item::next_filename_id() const
268 return filename_id + header.strings_size + 1;
271 search_item_iterator *index_search_item::make_search_item_iterator(
272 const char *query)
274 return new index_search_item_iterator(this, query);
277 search_item *make_index_search_item(const char *filename, int fid)
279 char *index_filename = new char[strlen(filename) + sizeof(INDEX_SUFFIX)];
280 strcpy(index_filename, filename);
281 strcat(index_filename, INDEX_SUFFIX);
282 int fd = open(index_filename, O_RDONLY | O_BINARY);
283 if (fd < 0)
284 return 0;
285 index_search_item *item = new index_search_item(index_filename, fid);
286 a_delete index_filename;
287 if (!item->load(fd)) {
288 close(fd);
289 delete item;
290 return 0;
292 else if (verify_flag && !item->verify()) {
293 delete item;
294 return 0;
296 else {
297 item->check_files();
298 return item;
302 index_search_item_iterator::index_search_item_iterator(index_search_item *ind,
303 const char *q)
304 : indx(ind), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
305 buf(0), buflen(0),
306 searcher(q, strlen(q), ind->ignore_fields, ind->header.truncate),
307 query(strsave(q))
309 found_list = indx->search(q, strlen(q), &temp_list);
310 if (!found_list) {
311 found_list = &minus_one;
312 warning("all keys would have been discarded in constructing index `%1'",
313 indx->name);
317 index_search_item_iterator::~index_search_item_iterator()
319 a_delete temp_list;
320 a_delete buf;
321 a_delete query;
322 delete out_of_date_files_iter;
325 int index_search_item_iterator::next(const linear_searcher &,
326 const char **pp, int *lenp,
327 reference_id *ridp)
329 if (found_list) {
330 for (;;) {
331 int tagno = *found_list;
332 if (tagno == -1)
333 break;
334 found_list++;
335 if (get_tag(tagno, searcher, pp, lenp, ridp))
336 return 1;
338 found_list = 0;
339 next_out_of_date_file = indx->out_of_date_files;
341 while (next_out_of_date_file) {
342 if (out_of_date_files_iter == 0)
343 out_of_date_files_iter
344 = next_out_of_date_file->make_search_item_iterator(query);
345 if (out_of_date_files_iter->next(searcher, pp, lenp, ridp))
346 return 1;
347 delete out_of_date_files_iter;
348 out_of_date_files_iter = 0;
349 next_out_of_date_file = next_out_of_date_file->next;
351 return 0;
354 int index_search_item_iterator::get_tag(int tagno,
355 const linear_searcher &searchr,
356 const char **pp, int *lenp,
357 reference_id *ridp)
359 if (tagno < 0 || tagno >= indx->header.tags_size) {
360 error("bad tag number");
361 return 0;
363 tag *tp = indx->tags + tagno;
364 const char *filename = indx->munge_filename(indx->pool + tp->filename_index);
365 int fd = open(filename, O_RDONLY | O_BINARY);
366 if (fd < 0) {
367 error("can't open `%1': %2", filename, strerror(errno));
368 return 0;
370 struct stat sb;
371 if (fstat(fd, &sb) < 0) {
372 error("can't fstat: %1", strerror(errno));
373 close(fd);
374 return 0;
376 time_t mtime = sb.st_mtime;
377 if (mtime > indx->mtime) {
378 indx->add_out_of_date_file(fd, filename,
379 indx->filename_id + tp->filename_index);
380 return 0;
382 int res = 0;
383 FILE *fp = fdopen(fd, FOPEN_RB);
384 if (!fp) {
385 error("fdopen failed");
386 close(fd);
387 return 0;
389 if (tp->start != 0 && fseek(fp, long(tp->start), 0) < 0)
390 error("can't seek on `%1': %2", filename, strerror(errno));
391 else {
392 int length = tp->length;
393 int err = 0;
394 if (length == 0) {
395 if (fstat(fileno(fp), &sb) < 0) {
396 error("can't stat `%1': %2", filename, strerror(errno));
397 err = 1;
399 else if (!S_ISREG(sb.st_mode)) {
400 error("`%1' is not a regular file", filename);
401 err = 1;
403 else
404 length = int(sb.st_size);
406 if (!err) {
407 if (length + 2 > buflen) {
408 a_delete buf;
409 buflen = length + 2;
410 buf = new char[buflen];
412 if (fread(buf + 1, 1, length, fp) != (size_t)length)
413 error("fread on `%1' failed: %2", filename, strerror(errno));
414 else {
415 buf[0] = '\n';
416 // Remove the CR characters from CRLF pairs.
417 int sidx = 1, didx = 1;
418 for ( ; sidx < length + 1; sidx++, didx++)
420 if (buf[sidx] == '\r')
422 if (buf[++sidx] != '\n')
423 buf[didx++] = '\r';
424 else
425 length--;
427 if (sidx != didx)
428 buf[didx] = buf[sidx];
430 buf[length + 1] = '\n';
431 res = searchr.search(buf + 1, buf + 2 + length, pp, lenp);
432 if (res && ridp)
433 *ridp = reference_id(indx->filename_id + tp->filename_index,
434 tp->start);
438 fclose(fp);
439 return res;
442 const char *index_search_item::munge_filename(const char *filename)
444 if (IS_ABSOLUTE(filename))
445 return filename;
446 const char *cwd = pool;
447 int need_slash = (cwd[0] != 0
448 && strchr(DIR_SEPS, strchr(cwd, '\0')[-1]) == 0);
449 int len = strlen(cwd) + strlen(filename) + need_slash + 1;
450 if (len > filename_buflen) {
451 a_delete filename_buffer;
452 filename_buflen = len;
453 filename_buffer = new char[len];
455 strcpy(filename_buffer, cwd);
456 if (need_slash)
457 strcat(filename_buffer, "/");
458 strcat(filename_buffer, filename);
459 return filename_buffer;
462 const int *index_search_item::search1(const char **pp, const char *end)
464 while (*pp < end && !csalnum(**pp))
465 *pp += 1;
466 if (*pp >= end)
467 return 0;
468 const char *start = *pp;
469 while (*pp < end && csalnum(**pp))
470 *pp += 1;
471 int len = *pp - start;
472 if (len < header.shortest)
473 return 0;
474 if (len > header.truncate)
475 len = header.truncate;
476 int is_number = 1;
477 for (int i = 0; i < len; i++)
478 if (csdigit(start[i]))
479 key_buffer[i] = start[i];
480 else {
481 key_buffer[i] = cmlower(start[i]);
482 is_number = 0;
484 if (is_number && !(len == 4 && start[0] == '1' && start[1] == '9'))
485 return 0;
486 unsigned hc = hash(key_buffer, len);
487 if (common_words_table) {
488 for (int h = hc % common_words_table_size;
489 common_words_table[h];
490 --h) {
491 if (strlen(common_words_table[h]) == (size_t)len
492 && memcmp(common_words_table[h], key_buffer, len) == 0)
493 return 0;
494 if (h == 0)
495 h = common_words_table_size;
498 int li = table[int(hc % header.table_size)];
499 return li < 0 ? &minus_one : lists + li;
502 static void merge(int *result, const int *s1, const int *s2)
504 for (; *s1 >= 0; s1++) {
505 while (*s2 >= 0 && *s2 < *s1)
506 s2++;
507 if (*s2 == *s1)
508 *result++ = *s2;
510 *result++ = -1;
513 const int *index_search_item::search(const char *ptr, int length,
514 int **temp_listp)
516 const char *end = ptr + length;
517 if (*temp_listp) {
518 a_delete *temp_listp;
519 *temp_listp = 0;
521 const int *first_list = 0;
522 while (ptr < end && (first_list = search1(&ptr, end)) == 0)
524 if (!first_list)
525 return 0;
526 if (*first_list < 0)
527 return first_list;
528 const int *second_list = 0;
529 while (ptr < end && (second_list = search1(&ptr, end)) == 0)
531 if (!second_list)
532 return first_list;
533 if (*second_list < 0)
534 return second_list;
535 const int *p;
536 for (p = first_list; *p >= 0; p++)
538 int len = p - first_list;
539 for (p = second_list; *p >= 0; p++)
541 if (p - second_list < len)
542 len = p - second_list;
543 int *matches = new int[len + 1];
544 merge(matches, first_list, second_list);
545 while (ptr < end) {
546 const int *list = search1(&ptr, end);
547 if (list != 0) {
548 if (*list < 0) {
549 a_delete matches;
550 return list;
552 merge(matches, matches, list);
553 if (*matches < 0) {
554 a_delete matches;
555 return &minus_one;
559 *temp_listp = matches;
560 return matches;
563 void index_search_item::read_common_words_file()
565 if (header.common <= 0)
566 return;
567 const char *common_words_file = munge_filename(strchr(pool, '\0') + 1);
568 errno = 0;
569 FILE *fp = fopen(common_words_file, "r");
570 if (!fp) {
571 error("can't open `%1': %2", common_words_file, strerror(errno));
572 return;
574 common_words_table_size = 2*header.common + 1;
575 while (!is_prime(common_words_table_size))
576 common_words_table_size++;
577 common_words_table = new char *[common_words_table_size];
578 for (int i = 0; i < common_words_table_size; i++)
579 common_words_table[i] = 0;
580 int count = 0;
581 int key_len = 0;
582 for (;;) {
583 int c = getc(fp);
584 while (c != EOF && !csalnum(c))
585 c = getc(fp);
586 if (c == EOF)
587 break;
588 do {
589 if (key_len < header.truncate)
590 key_buffer[key_len++] = cmlower(c);
591 c = getc(fp);
592 } while (c != EOF && csalnum(c));
593 if (key_len >= header.shortest) {
594 int h = hash(key_buffer, key_len) % common_words_table_size;
595 while (common_words_table[h]) {
596 if (h == 0)
597 h = common_words_table_size;
598 --h;
600 common_words_table[h] = new char[key_len + 1];
601 memcpy(common_words_table[h], key_buffer, key_len);
602 common_words_table[h][key_len] = '\0';
604 if (++count >= header.common)
605 break;
606 key_len = 0;
607 if (c == EOF)
608 break;
610 fclose(fp);
613 void index_search_item::add_out_of_date_file(int fd, const char *filename,
614 int fid)
616 search_item **pp;
617 for (pp = &out_of_date_files; *pp; pp = &(*pp)->next)
618 if ((*pp)->is_named(filename))
619 return;
620 *pp = make_linear_search_item(fd, filename, fid);
621 warning("`%1' modified since `%2' created", filename, name);
624 void index_search_item::check_files()
626 const char *pool_end = pool + header.strings_size;
627 for (const char *ptr = strchr(ignore_fields, '\0') + 1;
628 ptr < pool_end;
629 ptr = strchr(ptr, '\0') + 1) {
630 const char *path = munge_filename(ptr);
631 struct stat sb;
632 if (stat(path, &sb) < 0)
633 error("can't stat `%1': %2", path, strerror(errno));
634 else if (sb.st_mtime > mtime) {
635 int fd = open(path, O_RDONLY | O_BINARY);
636 if (fd < 0)
637 error("can't open `%1': %2", path, strerror(errno));
638 else
639 add_out_of_date_file(fd, path, filename_id + (ptr - pool));
644 // s-it2-mode