2 * Copyright (c) 2014 - 2015 Steffen (Daode) Nurpmeso <sdaoden@users.sf.net>.
4 * Copyright (C) 1989 - 1992, 2001, 2004
5 * Free Software Foundation, Inc.
6 * Written by James Clark (jjc@jclark.com)
8 * This is free software; you can redistribute it and/or modify it under
9 * the terms of the GNU General Public License as published by the Free
10 * Software Foundation; either version 2, or (at your option) any later
13 * This is distributed in the hope that it will be useful, but WITHOUT ANY
14 * WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 * You should have received a copy of the GNU General Public License along
19 * with groff; see the file COPYING. If not, write to the Free Software
20 * Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
50 class index_search_item
53 search_item
*out_of_date_files
;
63 char *filename_buffer
;
65 char **common_words_table
;
66 int common_words_table_size
;
67 const char *ignore_fields
;
70 const char *do_verify();
71 const int *search1(const char **pp
, const char *end
);
72 const int *search(const char *ptr
, int length
, int **temp_listp
);
73 const char *munge_filename(const char *);
74 void read_common_words_file();
75 void add_out_of_date_file(int fd
, const char *filename
, int fid
);
78 index_search_item(const char *, int);
81 search_item_iterator
*make_search_item_iterator(const char *);
84 int next_filename_id() const;
85 friend class index_search_item_iterator
;
88 class index_search_item_iterator
89 : public search_item_iterator
91 index_search_item
*indx
;
92 search_item_iterator
*out_of_date_files_iter
;
93 search_item
*next_out_of_date_file
;
94 const int *found_list
;
98 linear_searcher searcher
;
100 int get_tag(int tagno
, const linear_searcher
&, const char **, int *,
104 index_search_item_iterator(index_search_item
*, const char *);
105 ~index_search_item_iterator();
106 int next(const linear_searcher
&, const char **, int *, reference_id
*);
110 index_search_item::index_search_item(const char *filename
, int fid
)
111 : search_item(filename
, fid
), out_of_date_files(0), buffer(0), map_addr(0),
112 map_len(0), key_buffer(0), filename_buffer(0), filename_buflen(0),
113 common_words_table(0)
117 index_search_item::~index_search_item()
122 if (unmap(map_addr
, map_len
) < 0)
123 error("unmap: %1", strerror(errno
));
125 while (out_of_date_files
) {
126 search_item
*tem
= out_of_date_files
;
127 out_of_date_files
= out_of_date_files
->next
;
130 a_delete filename_buffer
;
132 if (common_words_table
) {
133 for (int i
= 0; i
< common_words_table_size
; i
++)
134 a_delete common_words_table
[i
];
135 a_delete common_words_table
;
144 file_closer(int &fd
) : fdp(&fd
) { }
145 ~file_closer() { close(*fdp
); }
148 // Tell the compiler that a variable is intentionally unused.
149 inline void unused(void *) { }
151 int index_search_item::load(int fd
)
153 file_closer
fd_closer(fd
); // close fd on return
156 if (fstat(fd
, &sb
) < 0) {
157 error("can't fstat `%1': %2", name
, strerror(errno
));
160 if (!S_ISREG(sb
.st_mode
)) {
161 error("`%1' is not a regular file", name
);
165 int size
= int(sb
.st_size
);
167 map_addr
= mapread(fd
, size
);
169 addr
= (char *)map_addr
;
173 addr
= buffer
= (char *)malloc(size
);
175 error("can't allocate buffer for `%1'", name
);
179 int bytes_to_read
= size
;
180 while (bytes_to_read
> 0) {
181 int nread
= read(fd
, ptr
, bytes_to_read
);
183 error("unexpected EOF on `%1'", name
);
187 error("read error on `%1': %2", name
, strerror(errno
));
190 bytes_to_read
-= nread
;
194 header
= *(index_header
*)addr
;
195 if (header
.magic
!= INDEX_MAGIC
) {
196 error("`%1' is not an index file: wrong magic number", name
);
199 if (header
.version
!= INDEX_VERSION
) {
200 error("version number in `%1' is wrong: was %2, should be %3",
201 name
, header
.version
, INDEX_VERSION
);
204 int sz
= (header
.tags_size
* sizeof(tag
)
205 + header
.lists_size
* sizeof(int)
206 + header
.table_size
* sizeof(int)
207 + header
.strings_size
210 error("size of `%1' is wrong: was %2, should be %3",
214 tags
= (tag
*)(addr
+ sizeof(header
));
215 lists
= (int *)(tags
+ header
.tags_size
);
216 table
= (int *)(lists
+ header
.lists_size
);
217 pool
= (char *)(table
+ header
.table_size
);
218 ignore_fields
= strchr(strchr(pool
, '\0') + 1, '\0') + 1;
219 key_buffer
= new char[header
.truncate
];
220 read_common_words_file();
224 const char *index_search_item::do_verify()
228 if (lists
[header
.lists_size
- 1] >= 0)
229 return "last list element not negative";
231 for (i
= 0; i
< header
.table_size
; i
++) {
233 if (li
>= header
.lists_size
)
234 return "bad list index";
236 for (int *ptr
= lists
+ li
; *ptr
>= 0; ptr
++) {
237 if (*ptr
>= header
.tags_size
)
238 return "bad tag index";
239 if (*ptr
>= ptr
[1] && ptr
[1] >= 0)
240 return "list not ordered";
244 for (i
= 0; i
< header
.tags_size
; i
++) {
245 if (tags
[i
].filename_index
>= header
.strings_size
)
246 return "bad index in tags";
247 if (tags
[i
].length
< 0)
248 return "bad length in tags";
249 if (tags
[i
].start
< 0)
250 return "bad start in tags";
252 if (pool
[header
.strings_size
- 1] != '\0')
253 return "last character in pool not nul";
257 int index_search_item::verify()
259 const char *reason
= do_verify();
262 error("`%1' is bad: %2", name
, reason
);
266 int index_search_item::next_filename_id() const
268 return filename_id
+ header
.strings_size
+ 1;
271 search_item_iterator
*index_search_item::make_search_item_iterator(
274 return new index_search_item_iterator(this, query
);
277 search_item
*make_index_search_item(const char *filename
, int fid
)
279 char *index_filename
= new char[strlen(filename
) + sizeof(INDEX_SUFFIX
)];
280 strcpy(index_filename
, filename
);
281 strcat(index_filename
, INDEX_SUFFIX
);
282 int fd
= open(index_filename
, O_RDONLY
| O_BINARY
);
285 index_search_item
*item
= new index_search_item(index_filename
, fid
);
286 a_delete index_filename
;
287 if (!item
->load(fd
)) {
292 else if (verify_flag
&& !item
->verify()) {
302 index_search_item_iterator::index_search_item_iterator(index_search_item
*ind
,
304 : indx(ind
), out_of_date_files_iter(0), next_out_of_date_file(0), temp_list(0),
306 searcher(q
, strlen(q
), ind
->ignore_fields
, ind
->header
.truncate
),
309 found_list
= indx
->search(q
, strlen(q
), &temp_list
);
311 found_list
= &minus_one
;
312 warning("all keys would have been discarded in constructing index `%1'",
317 index_search_item_iterator::~index_search_item_iterator()
322 delete out_of_date_files_iter
;
325 int index_search_item_iterator::next(const linear_searcher
&,
326 const char **pp
, int *lenp
,
331 int tagno
= *found_list
;
335 if (get_tag(tagno
, searcher
, pp
, lenp
, ridp
))
339 next_out_of_date_file
= indx
->out_of_date_files
;
341 while (next_out_of_date_file
) {
342 if (out_of_date_files_iter
== 0)
343 out_of_date_files_iter
344 = next_out_of_date_file
->make_search_item_iterator(query
);
345 if (out_of_date_files_iter
->next(searcher
, pp
, lenp
, ridp
))
347 delete out_of_date_files_iter
;
348 out_of_date_files_iter
= 0;
349 next_out_of_date_file
= next_out_of_date_file
->next
;
354 int index_search_item_iterator::get_tag(int tagno
,
355 const linear_searcher
&searchr
,
356 const char **pp
, int *lenp
,
359 if (tagno
< 0 || tagno
>= indx
->header
.tags_size
) {
360 error("bad tag number");
363 tag
*tp
= indx
->tags
+ tagno
;
364 const char *filename
= indx
->munge_filename(indx
->pool
+ tp
->filename_index
);
365 int fd
= open(filename
, O_RDONLY
| O_BINARY
);
367 error("can't open `%1': %2", filename
, strerror(errno
));
371 if (fstat(fd
, &sb
) < 0) {
372 error("can't fstat: %1", strerror(errno
));
376 time_t mtime
= sb
.st_mtime
;
377 if (mtime
> indx
->mtime
) {
378 indx
->add_out_of_date_file(fd
, filename
,
379 indx
->filename_id
+ tp
->filename_index
);
383 FILE *fp
= fdopen(fd
, FOPEN_RB
);
385 error("fdopen failed");
389 if (tp
->start
!= 0 && fseek(fp
, long(tp
->start
), 0) < 0)
390 error("can't seek on `%1': %2", filename
, strerror(errno
));
392 int length
= tp
->length
;
395 if (fstat(fileno(fp
), &sb
) < 0) {
396 error("can't stat `%1': %2", filename
, strerror(errno
));
399 else if (!S_ISREG(sb
.st_mode
)) {
400 error("`%1' is not a regular file", filename
);
404 length
= int(sb
.st_size
);
407 if (length
+ 2 > buflen
) {
410 buf
= new char[buflen
];
412 if (fread(buf
+ 1, 1, length
, fp
) != (size_t)length
)
413 error("fread on `%1' failed: %2", filename
, strerror(errno
));
416 // Remove the CR characters from CRLF pairs.
417 int sidx
= 1, didx
= 1;
418 for ( ; sidx
< length
+ 1; sidx
++, didx
++)
420 if (buf
[sidx
] == '\r')
422 if (buf
[++sidx
] != '\n')
428 buf
[didx
] = buf
[sidx
];
430 buf
[length
+ 1] = '\n';
431 res
= searchr
.search(buf
+ 1, buf
+ 2 + length
, pp
, lenp
);
433 *ridp
= reference_id(indx
->filename_id
+ tp
->filename_index
,
442 const char *index_search_item::munge_filename(const char *filename
)
444 if (IS_ABSOLUTE(filename
))
446 const char *cwd
= pool
;
447 int need_slash
= (cwd
[0] != 0
448 && strchr(DIR_SEPS
, strchr(cwd
, '\0')[-1]) == 0);
449 int len
= strlen(cwd
) + strlen(filename
) + need_slash
+ 1;
450 if (len
> filename_buflen
) {
451 a_delete filename_buffer
;
452 filename_buflen
= len
;
453 filename_buffer
= new char[len
];
455 strcpy(filename_buffer
, cwd
);
457 strcat(filename_buffer
, "/");
458 strcat(filename_buffer
, filename
);
459 return filename_buffer
;
462 const int *index_search_item::search1(const char **pp
, const char *end
)
464 while (*pp
< end
&& !csalnum(**pp
))
468 const char *start
= *pp
;
469 while (*pp
< end
&& csalnum(**pp
))
471 int len
= *pp
- start
;
472 if (len
< header
.shortest
)
474 if (len
> header
.truncate
)
475 len
= header
.truncate
;
477 for (int i
= 0; i
< len
; i
++)
478 if (csdigit(start
[i
]))
479 key_buffer
[i
] = start
[i
];
481 key_buffer
[i
] = cmlower(start
[i
]);
484 if (is_number
&& !(len
== 4 && start
[0] == '1' && start
[1] == '9'))
486 unsigned hc
= hash(key_buffer
, len
);
487 if (common_words_table
) {
488 for (int h
= hc
% common_words_table_size
;
489 common_words_table
[h
];
491 if (strlen(common_words_table
[h
]) == (size_t)len
492 && memcmp(common_words_table
[h
], key_buffer
, len
) == 0)
495 h
= common_words_table_size
;
498 int li
= table
[int(hc
% header
.table_size
)];
499 return li
< 0 ? &minus_one
: lists
+ li
;
502 static void merge(int *result
, const int *s1
, const int *s2
)
504 for (; *s1
>= 0; s1
++) {
505 while (*s2
>= 0 && *s2
< *s1
)
513 const int *index_search_item::search(const char *ptr
, int length
,
516 const char *end
= ptr
+ length
;
518 a_delete
*temp_listp
;
521 const int *first_list
= 0;
522 while (ptr
< end
&& (first_list
= search1(&ptr
, end
)) == 0)
528 const int *second_list
= 0;
529 while (ptr
< end
&& (second_list
= search1(&ptr
, end
)) == 0)
533 if (*second_list
< 0)
536 for (p
= first_list
; *p
>= 0; p
++)
538 int len
= p
- first_list
;
539 for (p
= second_list
; *p
>= 0; p
++)
541 if (p
- second_list
< len
)
542 len
= p
- second_list
;
543 int *matches
= new int[len
+ 1];
544 merge(matches
, first_list
, second_list
);
546 const int *list
= search1(&ptr
, end
);
552 merge(matches
, matches
, list
);
559 *temp_listp
= matches
;
563 void index_search_item::read_common_words_file()
565 if (header
.common
<= 0)
567 const char *common_words_file
= munge_filename(strchr(pool
, '\0') + 1);
569 FILE *fp
= fopen(common_words_file
, "r");
571 error("can't open `%1': %2", common_words_file
, strerror(errno
));
574 common_words_table_size
= 2*header
.common
+ 1;
575 while (!is_prime(common_words_table_size
))
576 common_words_table_size
++;
577 common_words_table
= new char *[common_words_table_size
];
578 for (int i
= 0; i
< common_words_table_size
; i
++)
579 common_words_table
[i
] = 0;
584 while (c
!= EOF
&& !csalnum(c
))
589 if (key_len
< header
.truncate
)
590 key_buffer
[key_len
++] = cmlower(c
);
592 } while (c
!= EOF
&& csalnum(c
));
593 if (key_len
>= header
.shortest
) {
594 int h
= hash(key_buffer
, key_len
) % common_words_table_size
;
595 while (common_words_table
[h
]) {
597 h
= common_words_table_size
;
600 common_words_table
[h
] = new char[key_len
+ 1];
601 memcpy(common_words_table
[h
], key_buffer
, key_len
);
602 common_words_table
[h
][key_len
] = '\0';
604 if (++count
>= header
.common
)
613 void index_search_item::add_out_of_date_file(int fd
, const char *filename
,
617 for (pp
= &out_of_date_files
; *pp
; pp
= &(*pp
)->next
)
618 if ((*pp
)->is_named(filename
))
620 *pp
= make_linear_search_item(fd
, filename
, fid
);
621 warning("`%1' modified since `%2' created", filename
, name
);
624 void index_search_item::check_files()
626 const char *pool_end
= pool
+ header
.strings_size
;
627 for (const char *ptr
= strchr(ignore_fields
, '\0') + 1;
629 ptr
= strchr(ptr
, '\0') + 1) {
630 const char *path
= munge_filename(ptr
);
632 if (stat(path
, &sb
) < 0)
633 error("can't stat `%1': %2", path
, strerror(errno
));
634 else if (sb
.st_mtime
> mtime
) {
635 int fd
= open(path
, O_RDONLY
| O_BINARY
);
637 error("can't open `%1': %2", path
, strerror(errno
));
639 add_out_of_date_file(fd
, path
, filename_id
+ (ptr
- pool
));