Bug 1053: Fix crash when download ends prematurely.
[elinks.git] / src / util / fastfind.c
blob17ac97ed8cb41abe058d6dbe42fef1c82b40aa5f
1 /** Very fast search_keyword_in_list.
2 * @file
5 * It replaces bsearch() + strcasecmp() + callback + ...
7 * Following conditions should be met:
9 * - list keys are C strings.
10 * - keys should not be greater than 255 characters, and optimally < 20
11 * characters. It can work with greater keys but then memory usage will
12 * grow a lot.
13 * - each key must be unique and non empty.
14 * - list do not have to be ordered.
15 * - total number of unique characters used in all keys should be <= 128
16 * - idealy total number of keys should be <= 512 (but see below)
18 * (c) 2003 Laurent MONIN (aka Zas)
19 * Feel free to do whatever you want with that code.
22 * These routines use a tree search. First, a big tree is composed from the
23 * keys on input. Then, when searching we just go through the tree. If we will
24 * end up on an 'ending' node, we've got it.
26 * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it
27 * would look like:
29 * @verbatim
30 * [root]
31 * b h
32 * o e 1
33 * d a
34 * Y D
35 * g r
36 * r o
37 * o c
38 * u K
39 * D
40 * @endverbatim
42 * (the ending nodes are upcased just for this drawing, not in real)
44 * To optimize this for speed, leafs of nodes are organized in per-node arrays
45 * (so-called 'leafsets'), indexed by symbol value of the key's next character.
46 * But to optimize that for memory, we first compose own alphabet consisting
47 * only from the chars we ever use in the key strings. fastfind_info.uniq_chars
48 * holds that alphabet and fastfind_info.idxtab is used to translate between it
49 * and ASCII.
51 * Tree building: O((L+M)*N)
52 * (L: mean key length, M: alphabet size,
53 * N: number of items).
54 * String lookup: O(N) (N: string length). */
56 #ifdef HAVE_CONFIG_H
57 #include "config.h"
58 #endif
60 #include <stdio.h>
61 #include <stdlib.h>
63 #include "elinks.h"
65 #include "util/conv.h"
66 #include "util/error.h"
67 #include "util/fastfind.h"
68 #include "util/memdebug.h"
69 #include "util/memory.h"
71 #ifdef USE_FASTFIND
73 /** Define it to generate performance and memory usage statistics to stderr. */
74 #if 0
75 #define DEBUG_FASTFIND
76 #endif
78 /** Define whether to use 32 or 64 bits per compressed element. */
79 #if 1
80 #define USE_32_BITS
81 #endif
83 #define END_LEAF_BITS 1
84 #define COMPRESSED_BITS 1
86 #ifdef USE_32_BITS
88 /* Use only 32 bits per element, but has very low limits. */
89 /* Adequate for ELinks tags search. */
91 #define POINTER_INDEX_BITS 10 /* 1024 */
92 #define LEAFSET_INDEX_BITS 13 /* 8192 */
93 #define COMP_CHAR_INDEX_BITS 7 /* 128 */
95 #define ff_node ff_node_c /* Both are 32 bits long. */
97 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
98 COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \
99 COMPRESSED_BITS) > 32
100 #error Over 32 bits in struct ff_node !!
101 #endif
103 #else /* !USE_32_BITS */
105 /* Keep this one if there is more than 512 keywords in a list
106 * it eats a bit more memory.
107 * ELinks may need this one if fastfind is used in other
108 * things than tags searching. */
109 /* This will make struct ff_node_c use 64 bits. */
111 #define POINTER_INDEX_BITS 12
112 #define LEAFSET_INDEX_BITS 18
113 #define COMP_CHAR_INDEX_BITS 8
115 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
116 + END_LEAF_BITS + COMPRESSED_BITS) > 32
117 #error Over 32 bits in struct ff_node !!
118 #endif
120 struct ff_node {
121 /** End leaf -> p is significant */
122 unsigned int e:END_LEAF_BITS;
124 /** Compressed */
125 unsigned int c:COMPRESSED_BITS;
127 /** Index in pointers */
128 unsigned int p:POINTER_INDEX_BITS;
130 /** Index in leafsets */
131 unsigned int l:LEAFSET_INDEX_BITS;
134 #endif /* USE_32_BITS */
137 #define FF_MAX_KEYS (1 << POINTER_INDEX_BITS)
138 #define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)
139 #define FF_MAX_CHARS (1 << COMP_CHAR_INDEX_BITS)
140 #define FF_MAX_KEYLEN 255
142 struct ff_node_c {
143 unsigned int e:END_LEAF_BITS;
144 unsigned int c:COMPRESSED_BITS;
145 unsigned int p:POINTER_INDEX_BITS;
146 unsigned int l:LEAFSET_INDEX_BITS;
148 /** Index of char when compressed. */
149 unsigned int ch:COMP_CHAR_INDEX_BITS;
152 struct ff_data {
153 void *pointer;
154 int keylen;
157 struct fastfind_info {
158 struct ff_data *data;
160 struct ff_node **leafsets;
161 struct ff_node *root_leafset;
163 int min_key_len;
164 int max_key_len;
166 int uniq_chars_count;
167 int count;
168 int pointers_count;
169 int leafsets_count;
171 unsigned int case_aware:1;
172 unsigned int compress:1;
174 int idxtab[FF_MAX_CHARS];
175 unsigned char uniq_chars[FF_MAX_CHARS];
177 #ifdef DEBUG_FASTFIND
178 struct {
179 unsigned long searches;
180 unsigned long found;
181 unsigned long itertmp;
182 unsigned long iterdelta;
183 unsigned long itermax;
184 unsigned long iterations;
185 unsigned long tests;
186 unsigned long teststmp;
187 unsigned long testsdelta;
188 unsigned long testsmax;
189 unsigned long memory_usage;
190 unsigned long total_key_len;
191 unsigned int compressed_nodes;
192 unsigned char *comment;
193 } debug;
194 #endif
198 #ifdef DEBUG_FASTFIND
199 /* These are for performance testing. */
200 #define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)
201 #define FF_DBG_test(x) (x)->debug.tests++
202 #define FF_DBG_iter(x) (x)->debug.iterations++
203 #define FF_DBG_cnode(x) (x)->debug.compressed_nodes++
204 #define FF_DBG_found(x) \
205 do { \
206 unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp; \
207 unsigned long tests = (x)->debug.tests - (x)->debug.teststmp; \
209 (x)->debug.iterdelta += iter; \
210 (x)->debug.testsdelta += tests; \
211 if (iter > (x)->debug.itermax) \
212 (x)->debug.itermax = iter; \
213 if (tests > (x)->debug.testsmax) \
214 (x)->debug.testsmax = tests; \
215 (x)->debug.found++; \
216 } while (0)
217 #define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)
219 /** Update search stats. */
220 static void
221 FF_DBG_search_stats(struct fastfind_info *info, int key_len)
223 info->debug.searches++;
224 info->debug.total_key_len += key_len;
225 info->debug.teststmp = info->debug.tests;
226 info->debug.itertmp = info->debug.iterations;
229 /** Dump all stats. */
230 static void
231 FF_DBG_dump_stats(struct fastfind_info *info)
233 fprintf(stderr, "------ FastFind Statistics ------\n");
234 fprintf(stderr, "Comment : %s\n", info->debug.comment);
235 fprintf(stderr, "Case-aware : %s\n", info->case_aware ? "yes" : "no");
236 fprintf(stderr, "Compress : %s\n", info->compress ? "yes" : "no");
237 fprintf(stderr, "Uniq_chars : %s\n", info->uniq_chars);
238 fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS);
239 fprintf(stderr, "Min_key_len : %d\n", info->min_key_len);
240 fprintf(stderr, "Max_key_len : %d\n", info->max_key_len);
241 fprintf(stderr, "Entries : %d/%d max.\n", info->pointers_count, FF_MAX_KEYS);
242 fprintf(stderr, "Leafsets : %d/%d max.\n", info->leafsets_count, FF_MAX_LEAFSETS);
243 if (info->compress)
244 fprintf(stderr, "C. leafsets : %u/%d (%0.2f%%)\n",
245 info->debug.compressed_nodes,
246 info->leafsets_count,
247 100 * (double) info->debug.compressed_nodes / info->leafsets_count);
248 fprintf(stderr, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n",
249 info->debug.memory_usage, (double) info->debug.memory_usage / info->pointers_count);
250 fprintf(stderr, "Struct info : %zu bytes\n", sizeof(*info) - sizeof(info->debug));
251 fprintf(stderr, "Struct node : %zu bytes\n", sizeof(struct ff_node));
252 fprintf(stderr, "Struct cnode: %zu bytes\n", sizeof(struct ff_node_c));
253 fprintf(stderr, "Searches : %lu\n", info->debug.searches);
254 fprintf(stderr, "Found : %lu (%0.2f%%)\n",
255 info->debug.found, 100 * (double) info->debug.found / info->debug.searches);
256 fprintf(stderr, "Iterations : %lu (%0.2f per search, %0.2f before found, %lu max)\n",
257 info->debug.iterations, (double) info->debug.iterations / info->debug.searches,
258 (double) info->debug.iterdelta / info->debug.found,
259 info->debug.itermax);
260 fprintf(stderr, "Tests : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n",
261 info->debug.tests, (double) info->debug.tests / info->debug.searches,
262 (double) info->debug.tests / info->debug.iterations,
263 (double) info->debug.testsdelta / info->debug.found,
264 info->debug.testsmax);
265 fprintf(stderr, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n",
266 info->debug.total_key_len, (double) info->debug.total_key_len / info->debug.searches,
267 (double) info->debug.total_key_len / info->debug.iterations);
268 fprintf(stderr, "\n");
271 #else /* !DEBUG_FASTFIND */
273 #define FF_DBG_mem(x, size)
274 #define FF_DBG_test(x)
275 #define FF_DBG_iter(x)
276 #define FF_DBG_cnode(x)
277 #define FF_DBG_found(x)
278 #define FF_DBG_comment(x, comment)
279 #define FF_DBG_search_stats(info, key_len)
280 #define FF_DBG_dump_stats(info)
282 #endif /* DEBUG_FASTFIND */
285 static struct fastfind_info *
286 init_fastfind(struct fastfind_index *index, enum fastfind_flags flags)
288 struct fastfind_info *info = mem_calloc(1, sizeof(*info));
290 index->handle = info;
291 if (!info) return NULL;
293 info->min_key_len = FF_MAX_KEYLEN;
294 info->case_aware = !!(flags & FF_CASE_AWARE);
295 info->compress = !!(flags & FF_COMPRESS);
297 FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug));
298 FF_DBG_comment(info, index->comment);
300 return info;
303 /** @returns 1 on success, 0 on allocation failure */
304 static int
305 alloc_ff_data(struct fastfind_info *info)
307 struct ff_data *data;
309 assert(info->count < FF_MAX_KEYS);
310 if_assert_failed return 0;
312 /* On error, cleanup is done by fastfind_done(). */
314 data = mem_calloc(info->count, sizeof(*data));
315 if (!data) return 0;
316 info->data = data;
317 FF_DBG_mem(info, info->count * sizeof(*data));
319 return 1;
322 /** Add pointer and its key length to correspondant arrays, incrementing
323 * internal counter. */
324 static void
325 add_to_ff_data(void *p, int key_len, struct fastfind_info *info)
327 struct ff_data *data = &info->data[info->pointers_count++];
329 /* Record new pointer and key len, used in search */
330 data->pointer = p;
331 data->keylen = key_len;
334 /** @returns 1 on success, 0 on allocation failure */
335 static int
336 alloc_leafset(struct fastfind_info *info)
338 struct ff_node **leafsets;
339 struct ff_node *leafset;
341 assert(info->leafsets_count < FF_MAX_LEAFSETS);
342 if_assert_failed return 0;
344 /* info->leafsets[0] is never used since l=0 marks no leaf
345 * in struct ff_node. That's the reason of that + 2. */
346 leafsets = mem_realloc(info->leafsets,
347 sizeof(*leafsets) * (info->leafsets_count + 2));
348 if (!leafsets) return 0;
349 info->leafsets = leafsets;
351 leafset = mem_calloc(info->uniq_chars_count, sizeof(*leafset));
352 if (!leafset) return 0;
354 FF_DBG_mem(info, sizeof(*leafsets));
355 FF_DBG_mem(info, sizeof(*leafset) * info->uniq_chars_count);
357 info->leafsets_count++;
358 info->leafsets[info->leafsets_count] = leafset;
360 return 1;
363 static inline int
364 char2idx(unsigned char c, struct fastfind_info *info)
366 unsigned char *idx = memchr(info->uniq_chars, c, info->uniq_chars_count);
368 if (idx) return (idx - info->uniq_chars);
370 return -1;
373 static inline void
374 init_idxtab(struct fastfind_info *info)
376 int i;
378 for (i = 0; i < FF_MAX_CHARS; i++)
379 info->idxtab[i] = char2idx((unsigned char) i, info);
382 static inline void
383 compress_node(struct ff_node *leafset, struct fastfind_info *info,
384 int i, int pos)
386 struct ff_node_c *new = mem_alloc(sizeof(*new));
388 if (!new) return;
390 new->c = 1;
391 new->e = leafset[pos].e;
392 new->p = leafset[pos].p;
393 new->l = leafset[pos].l;
394 new->ch = pos;
396 mem_free_set(&info->leafsets[i], (struct ff_node *) new);
397 FF_DBG_cnode(info);
398 FF_DBG_mem(info, sizeof(*new));
399 FF_DBG_mem(info, sizeof(*leafset) * -info->uniq_chars_count);
402 static void
403 compress_tree(struct ff_node *leafset, struct fastfind_info *info)
405 int cnt = 0;
406 int pos = 0;
407 int i;
409 assert(info);
410 if_assert_failed return;
412 for (i = 0; i < info->uniq_chars_count; i++) {
413 if (leafset[i].c) continue;
415 if (leafset[i].l) {
416 /* There's a leaf leafset, descend to it and recurse */
417 compress_tree(info->leafsets[leafset[i].l], info);
420 if (leafset[i].l || leafset[i].e) {
421 cnt++;
422 pos = i;
426 if (cnt != 1 || leafset[pos].c) return;
428 /* Compress if possible ;) */
429 for (i = 1; i < info->leafsets_count; i++) {
430 if (info->leafsets[i] == leafset) {
431 compress_node(leafset, info, i, pos);
432 return;
437 #define ifcase(c) (info->case_aware ? (c) : toupper(c))
439 struct fastfind_index *
440 fastfind_index(struct fastfind_index *index, enum fastfind_flags flags)
442 struct fastfind_key_value *p;
443 struct fastfind_info *info;
445 assert(index && index->reset && index->next);
446 if_assert_failed goto return_error;
448 info = init_fastfind(index, flags);
449 if (!info) goto return_error;
451 /* First search min, max, count and uniq_chars. */
452 index->reset();
454 while ((p = index->next())) {
455 int key_len = strlen(p->key);
456 int i;
458 assert(key_len > 0 && key_len <= FF_MAX_KEYLEN);
459 if_assert_failed goto return_error;
461 if (key_len < info->min_key_len)
462 info->min_key_len = key_len;
464 if (key_len > info->max_key_len)
465 info->max_key_len = key_len;
467 for (i = 0; i < key_len; i++) {
468 /* ifcase() test should be moved outside loops but
469 * remember we call this routine only once per list.
470 * So I go for code readability vs performance here.
471 * --Zas */
472 int k = ifcase(p->key[i]);
474 assert(k < FF_MAX_CHARS);
475 if_assert_failed goto return_error;
477 if (char2idx(k, info) == -1) {
478 assert(info->uniq_chars_count < FF_MAX_CHARS);
479 if_assert_failed goto return_error;
481 info->uniq_chars[info->uniq_chars_count++] = k;
485 info->count++;
488 if (!info->count) return NULL;
490 init_idxtab(info);
492 /* Root leafset allocation */
493 if (!alloc_leafset(info)) goto return_error;
495 info->root_leafset = info->leafsets[info->leafsets_count];
497 if (!alloc_ff_data(info)) goto return_error;
499 /* Build the tree */
500 index->reset();
502 while ((p = index->next())) {
503 int key_len = strlen(p->key);
504 struct ff_node *leafset = info->root_leafset;
505 int i;
507 #if 0
508 fprintf(stderr, "K: %s\n", p->key);
509 #endif
510 for (i = 0; i < key_len - 1; i++) {
511 /* Convert char to its index value */
512 int idx = info->idxtab[ifcase(p->key[i])];
514 /* leafset[idx] is the desired leaf node's bucket. */
516 if (leafset[idx].l == 0) {
517 /* There's no leaf yet */
518 if (!alloc_leafset(info)) goto return_error;
519 leafset[idx].l = info->leafsets_count;
522 /* Descend to leaf */
523 leafset = info->leafsets[leafset[idx].l];
526 /* Index final leaf */
527 i = info->idxtab[ifcase(p->key[i])];
529 leafset[i].e = 1;
531 /* Memorize pointer to data */
532 leafset[i].p = info->pointers_count;
533 add_to_ff_data(p->data, key_len, info);
536 if (info->compress)
537 compress_tree(info->root_leafset, info);
539 return index;
541 return_error:
542 fastfind_done(index);
543 return NULL;
546 #undef ifcase
549 /** This macro searchs for the key in indexed list */
550 #define FF_SEARCH(what) do { \
551 int i; \
553 for (i = 0; i < key_len; i++) { \
554 int lidx, k = what; \
556 FF_DBG_iter(info); \
558 FF_DBG_test(info); \
559 if (k >= FF_MAX_CHARS) return NULL; \
560 lidx = info->idxtab[k]; \
562 FF_DBG_test(info); \
563 if (lidx < 0) return NULL; \
565 FF_DBG_test(info); \
566 if (current->c) { \
567 /* It is a compressed leaf. */ \
568 FF_DBG_test(info); \
569 if (((struct ff_node_c *) current)->ch != lidx) \
570 return NULL; \
571 } else { \
572 current = &current[lidx]; \
575 FF_DBG_test(info); \
576 if (current->e) { \
577 struct ff_data *data = &info->data[current->p]; \
579 FF_DBG_test(info); \
580 if (key_len == data->keylen) { \
581 FF_DBG_found(info); \
582 return data->pointer; \
586 FF_DBG_test(info); \
587 if (!current->l) return NULL; \
588 current = (struct ff_node *) info->leafsets[current->l]; \
590 } while (0)
592 void *
593 fastfind_search(struct fastfind_index *index,
594 const unsigned char *key, int key_len)
596 struct ff_node *current;
597 struct fastfind_info *info;
599 assert(index);
600 if_assert_failed return NULL;
602 info = index->handle;
604 assertm(info != NULL, "FastFind index %s not initialized", index->comment);
605 if_assert_failed return NULL;
607 FF_DBG_search_stats(info, key_len);
609 FF_DBG_test(info); if (!key) return NULL;
610 FF_DBG_test(info); if (key_len > info->max_key_len) return NULL;
611 FF_DBG_test(info); if (key_len < info->min_key_len) return NULL;
613 current = info->root_leafset;
615 /* Macro and code redundancy are there to obtain maximum
616 * performance. Do not move it to an inlined function.
617 * Do not even think about it.
618 * If you find a better way (same or better performance) then
619 * propose it and be prepared to defend it. --Zas */
621 FF_DBG_test(info);
622 if (info->case_aware)
623 FF_SEARCH(key[i]);
624 else
625 FF_SEARCH(toupper(key[i]));
627 return NULL;
630 #undef FF_SEARCH
632 void
633 fastfind_done(struct fastfind_index *index)
635 struct fastfind_info *info;
637 assert(index);
638 if_assert_failed return;
640 info = index->handle;
641 if (!info) return;
643 FF_DBG_dump_stats(info);
645 mem_free_if(info->data);
646 while (info->leafsets_count) {
647 mem_free_if(info->leafsets[info->leafsets_count]);
648 info->leafsets_count--;
650 mem_free_if(info->leafsets);
651 mem_free(info);
653 index->handle = NULL;
657 /* EXAMPLE */
659 #if 0
660 struct list {
661 unsigned char *tag;
662 int val;
665 struct list list[] = {
666 {"A", 1},
667 {"ABBR", 2},
668 {"ADDRESS", 3},
669 {"B", 4},
670 {"BASE", 5},
671 {"BASEFONT", 6},
672 {"BLOCKQUOTE", 7},
673 {"BODY", 8},
674 {"BR", 9},
675 {"BUTTON", 10},
676 {"CAPTION", 11},
677 {"CENTER", 12},
678 {"CODE", 13},
679 {"DD", 14},
680 {"DFN", 15},
681 {"DIR", 16},
682 {"DIV", 17},
683 {"DL", 18},
684 {"DT", 19},
685 {"EM", 20},
686 {"FIXED", 21},
687 {"FONT", 22},
688 {"FORM", 23},
689 {"FRAME", 24},
690 {"FRAMESET", 25},
691 {"H1", 26},
692 {"H2", 27},
693 {"H3", 28},
694 {"H4", 29},
695 {"H5", 30},
696 {"H6", 31},
697 /* {"HEAD", html_skip, 0, 0}, */
698 {"HR", 32},
699 {"I", 33},
700 {"IFRAME", 34},
701 {"IMG", 35},
702 {"INPUT", 36},
703 {"LI", 37},
704 {"LINK", 38},
705 {"LISTING", 39},
706 {"MENU", 40},
707 {"NOFRAMES", 41},
708 {"OL", 42},
709 {"OPTION", 43},
710 {"P", 44},
711 {"PRE", 45},
712 {"Q", 46},
713 {"S", 47},
714 {"SCRIPT", 48},
715 {"SELECT", 49},
716 {"SPAN", 50},
717 {"STRIKE", 51},
718 {"STRONG", 52},
719 {"STYLE", 53},
720 {"SUB", 54},
721 {"SUP", 55},
722 {"TABLE", 56},
723 {"TD", 57},
724 {"TEXTAREA", 58},
725 {"TH", 59},
726 {"TITLE", 60},
727 {"TR", 61},
728 {"U", 62},
729 {"UL", 63},
730 {"XMP", 64},
731 {NULL, 0}, /* List terminaison is key = NULL */
735 struct list *internal_pointer;
737 /** Reset internal list pointer */
738 void
739 reset_list(void)
741 internal_pointer = list;
744 /** Returns a pointer to a struct that contains
745 * current key and data pointers and increment
746 * internal pointer.
747 * It returns NULL when key is NULL. */
748 struct fastfind_key_value *
749 next_in_list(void)
751 static struct fastfind_key_value kv;
753 if (!internal_pointer->tag) return NULL;
755 kv.key = internal_pointer->tag;
756 kv.data = internal_pointer;
758 internal_pointer++;
760 return &kv;
763 static struct fastfind_index ff_index
764 = INIT_FASTFIND_INDEX("example", reset_list, next_in_list);
767 main(int argc, char **argv)
769 unsigned char *key = argv[1];
770 struct list *result;
772 if (!key || !*key) {
773 fprintf(stderr, "Usage: fastfind keyword\n");
774 exit(-1);
777 fprintf(stderr, "---------- INDEX PHASE -----------\n");
778 /* Mandatory */
779 fastfind_index(&ff_index, FF_COMPRESS);
781 fprintf(stderr, "---------- SEARCH PHASE ----------\n");
782 /* Without this one ... */
783 result = (struct list *) fastfind_search(&ff_index, key, strlen(key));
785 if (result)
786 fprintf(stderr, " Found: '%s' -> %d\n", result->tag, result->val);
787 else
788 fprintf(stderr, " Not found: '%s'\n", key);
790 fprintf(stderr, "---------- CLEANUP PHASE ----------\n");
791 fastfind_done(&ff_index);
793 exit(0);
796 #endif
798 #endif /* USE_FASTFIND */