Bug 880: Prevent SIGSEGV in init_python when -no-home is used.
[elinks.git] / src / util / fastfind.c
blob004701d64d40175d832073bcf8fc901661ba9525
1 /* Very fast search_keyword_in_list. */
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
7 #include <stdio.h>
8 #include <stdlib.h>
10 #include "elinks.h"
12 #include "util/conv.h"
13 #include "util/error.h"
14 #include "util/fastfind.h"
15 #include "util/memdebug.h"
16 #include "util/memory.h"
18 #ifdef USE_FASTFIND
20 /* It replaces bsearch() + strcasecmp() + callback + ...
22 * Following conditions should be met:
24 * - list keys are C strings.
25 * - keys should not be greater than 255 characters, and optimally < 20
26 * characters. It can work with greater keys but then memory usage will
27 * grow a lot.
28 * - each key must be unique and non empty.
29 * - list do not have to be ordered.
30 * - total number of unique characters used in all keys should be <= 128
31 * - idealy total number of keys should be <= 512 (but see below)
33 * (c) 2003 Laurent MONIN (aka Zas)
34 * Feel free to do whatever you want with that code. */
37 /* These routines use a tree search. First, a big tree is composed from the
38 * keys on input. Then, when searching we just go through the tree. If we will
39 * end up on an 'ending' node, we've got it.
41 * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it
42 * would look like:
44 * [root]
45 * b h
46 * o e 1
47 * d a
48 * Y D
49 * g r
50 * r o
51 * o c
52 * u K
53 * D
55 * (the ending nodes are upcased just for this drawing, not in real)
57 * To optimize this for speed, leafs of nodes are organized in per-node arrays
58 * (so-called 'leafsets'), indexed by symbol value of the key's next character.
59 * But to optimize that for memory, we first compose own alphabet consisting
60 * only from the chars we ever use in the key strings. @uniq_chars holds that
61 * alphabet and @idxtab is used to translate between it and ASCII.
63 * Tree building: O((L+M)*N)
64 * (L: mean key length, M: alphabet size,
65 * N: number of items).
66 * String lookup: O(N) (N: string length). */
69 /* Define it to generate performance and memory usage statistics to stderr. */
70 #if 0
71 #define DEBUG_FASTFIND
72 #endif
74 /* Define whether to use 32 or 64 bits per compressed element. */
75 #if 1
76 #define USE_32_BITS
77 #endif
79 #define END_LEAF_BITS 1
80 #define COMPRESSED_BITS 1
82 #ifdef USE_32_BITS
84 /* Use only 32 bits per element, but has very low limits. */
85 /* Adequate for ELinks tags search. */
87 #define POINTER_INDEX_BITS 10 /* 1024 */
88 #define LEAFSET_INDEX_BITS 13 /* 8192 */
89 #define COMP_CHAR_INDEX_BITS 7 /* 128 */
91 #define ff_node ff_node_c /* Both are 32 bits long. */
93 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
94 COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \
95 COMPRESSED_BITS) > 32
96 #error Over 32 bits in struct ff_node !!
97 #endif
99 #else /* !USE_32_BITS */
101 /* Keep this one if there is more than 512 keywords in a list
102 * it eats a bit more memory.
103 * ELinks may need this one if fastfind is used in other
104 * things than tags searching. */
105 /* This will make struct ff_node_c use 64 bits. */
107 #define POINTER_INDEX_BITS 12
108 #define LEAFSET_INDEX_BITS 18
109 #define COMP_CHAR_INDEX_BITS 8
111 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
112 + END_LEAF_BITS + COMPRESSED_BITS) > 32
113 #error Over 32 bits in struct ff_node !!
114 #endif
116 struct ff_node {
117 /* End leaf -> p is significant */
118 unsigned int e:END_LEAF_BITS;
120 /* Compressed */
121 unsigned int c:COMPRESSED_BITS;
123 /* Index in pointers */
124 unsigned int p:POINTER_INDEX_BITS;
126 /* Index in leafsets */
127 unsigned int l:LEAFSET_INDEX_BITS;
130 #endif /* USE_32_BITS */
133 #define FF_MAX_KEYS (1 << POINTER_INDEX_BITS)
134 #define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)
135 #define FF_MAX_CHARS (1 << COMP_CHAR_INDEX_BITS)
136 #define FF_MAX_KEYLEN 255
138 struct ff_node_c {
139 unsigned int e:END_LEAF_BITS;
140 unsigned int c:COMPRESSED_BITS;
141 unsigned int p:POINTER_INDEX_BITS;
142 unsigned int l:LEAFSET_INDEX_BITS;
144 /* Index of char when compressed. */
145 unsigned int ch:COMP_CHAR_INDEX_BITS;
148 struct ff_data {
149 void *pointer;
150 int keylen;
153 struct fastfind_info {
154 struct ff_data *data;
156 struct ff_node **leafsets;
157 struct ff_node *root_leafset;
159 int min_key_len;
160 int max_key_len;
162 int uniq_chars_count;
163 int count;
164 int pointers_count;
165 int leafsets_count;
167 unsigned int case_aware:1;
168 unsigned int compress:1;
170 int idxtab[FF_MAX_CHARS];
171 unsigned char uniq_chars[FF_MAX_CHARS];
173 #ifdef DEBUG_FASTFIND
174 struct {
175 unsigned long searches;
176 unsigned long found;
177 unsigned long itertmp;
178 unsigned long iterdelta;
179 unsigned long itermax;
180 unsigned long iterations;
181 unsigned long tests;
182 unsigned long teststmp;
183 unsigned long testsdelta;
184 unsigned long testsmax;
185 unsigned long memory_usage;
186 unsigned long total_key_len;
187 unsigned int compressed_nodes;
188 unsigned char *comment;
189 } debug;
190 #endif
194 #ifdef DEBUG_FASTFIND
195 /* These are for performance testing. */
196 #define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)
197 #define FF_DBG_test(x) (x)->debug.tests++
198 #define FF_DBG_iter(x) (x)->debug.iterations++
199 #define FF_DBG_cnode(x) (x)->debug.compressed_nodes++
200 #define FF_DBG_found(x) \
201 do { \
202 unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp; \
203 unsigned long tests = (x)->debug.tests - (x)->debug.teststmp; \
205 (x)->debug.iterdelta += iter; \
206 (x)->debug.testsdelta += tests; \
207 if (iter > (x)->debug.itermax) \
208 (x)->debug.itermax = iter; \
209 if (tests > (x)->debug.testsmax) \
210 (x)->debug.testsmax = tests; \
211 (x)->debug.found++; \
212 } while (0)
213 #define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)
215 /* Update search stats. */
216 static void
217 FF_DBG_search_stats(struct fastfind_info *info, int key_len)
219 info->debug.searches++;
220 info->debug.total_key_len += key_len;
221 info->debug.teststmp = info->debug.tests;
222 info->debug.itertmp = info->debug.iterations;
225 /* Dump all stats. */
226 static void
227 FF_DBG_dump_stats(struct fastfind_info *info)
229 fprintf(stderr, "------ FastFind Statistics ------\n");
230 fprintf(stderr, "Comment : %s\n", info->debug.comment);
231 fprintf(stderr, "Case-aware : %s\n", info->case_aware ? "yes" : "no");
232 fprintf(stderr, "Compress : %s\n", info->compress ? "yes" : "no");
233 fprintf(stderr, "Uniq_chars : %s\n", info->uniq_chars);
234 fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS);
235 fprintf(stderr, "Min_key_len : %d\n", info->min_key_len);
236 fprintf(stderr, "Max_key_len : %d\n", info->max_key_len);
237 fprintf(stderr, "Entries : %d/%d max.\n", info->pointers_count, FF_MAX_KEYS);
238 fprintf(stderr, "Leafsets : %d/%d max.\n", info->leafsets_count, FF_MAX_LEAFSETS);
239 if (info->compress)
240 fprintf(stderr, "C. leafsets : %u/%d (%0.2f%%)\n",
241 info->debug.compressed_nodes,
242 info->leafsets_count,
243 100 * (double) info->debug.compressed_nodes / info->leafsets_count);
244 fprintf(stderr, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n",
245 info->debug.memory_usage, (double) info->debug.memory_usage / info->pointers_count);
246 fprintf(stderr, "Struct info : %zu bytes\n", sizeof(*info) - sizeof(info->debug));
247 fprintf(stderr, "Struct node : %zu bytes\n", sizeof(struct ff_node));
248 fprintf(stderr, "Struct cnode: %zu bytes\n", sizeof(struct ff_node_c));
249 fprintf(stderr, "Searches : %lu\n", info->debug.searches);
250 fprintf(stderr, "Found : %lu (%0.2f%%)\n",
251 info->debug.found, 100 * (double) info->debug.found / info->debug.searches);
252 fprintf(stderr, "Iterations : %lu (%0.2f per search, %0.2f before found, %lu max)\n",
253 info->debug.iterations, (double) info->debug.iterations / info->debug.searches,
254 (double) info->debug.iterdelta / info->debug.found,
255 info->debug.itermax);
256 fprintf(stderr, "Tests : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n",
257 info->debug.tests, (double) info->debug.tests / info->debug.searches,
258 (double) info->debug.tests / info->debug.iterations,
259 (double) info->debug.testsdelta / info->debug.found,
260 info->debug.testsmax);
261 fprintf(stderr, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n",
262 info->debug.total_key_len, (double) info->debug.total_key_len / info->debug.searches,
263 (double) info->debug.total_key_len / info->debug.iterations);
264 fprintf(stderr, "\n");
267 #else /* !DEBUG_FASTFIND */
269 #define FF_DBG_mem(x, size)
270 #define FF_DBG_test(x)
271 #define FF_DBG_iter(x)
272 #define FF_DBG_cnode(x)
273 #define FF_DBG_found(x)
274 #define FF_DBG_comment(x, comment)
275 #define FF_DBG_search_stats(info, key_len)
276 #define FF_DBG_dump_stats(info)
278 #endif /* DEBUG_FASTFIND */
281 static struct fastfind_info *
282 init_fastfind(struct fastfind_index *index, enum fastfind_flags flags)
284 struct fastfind_info *info = mem_calloc(1, sizeof(*info));
286 index->handle = info;
287 if (!info) return NULL;
289 info->min_key_len = FF_MAX_KEYLEN;
290 info->case_aware = !!(flags & FF_CASE_AWARE);
291 info->compress = !!(flags & FF_COMPRESS);
293 FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug));
294 FF_DBG_comment(info, index->comment);
296 return info;
299 /* Return 1 on success, 0 on allocation failure */
300 static int
301 alloc_ff_data(struct fastfind_info *info)
303 struct ff_data *data;
305 assert(info->count < FF_MAX_KEYS);
306 if_assert_failed return 0;
308 /* On error, cleanup is done by fastfind_done(). */
310 data = mem_calloc(info->count, sizeof(*data));
311 if (!data) return 0;
312 info->data = data;
313 FF_DBG_mem(info, info->count * sizeof(*data));
315 return 1;
318 /* Add pointer and its key length to correspondant arrays, incrementing
319 * internal counter. */
320 static void
321 add_to_ff_data(void *p, int key_len, struct fastfind_info *info)
323 struct ff_data *data = &info->data[info->pointers_count++];
325 /* Record new pointer and key len, used in search */
326 data->pointer = p;
327 data->keylen = key_len;
330 /* Return 1 on success, 0 on allocation failure */
331 static int
332 alloc_leafset(struct fastfind_info *info)
334 struct ff_node **leafsets;
335 struct ff_node *leafset;
337 assert(info->leafsets_count < FF_MAX_LEAFSETS);
338 if_assert_failed return 0;
340 /* info->leafsets[0] is never used since l=0 marks no leaf
341 * in struct ff_node. That's the reason of that + 2. */
342 leafsets = mem_realloc(info->leafsets,
343 sizeof(*leafsets) * (info->leafsets_count + 2));
344 if (!leafsets) return 0;
345 info->leafsets = leafsets;
347 leafset = mem_calloc(info->uniq_chars_count, sizeof(*leafset));
348 if (!leafset) return 0;
350 FF_DBG_mem(info, sizeof(*leafsets));
351 FF_DBG_mem(info, sizeof(*leafset) * info->uniq_chars_count);
353 info->leafsets_count++;
354 info->leafsets[info->leafsets_count] = leafset;
356 return 1;
359 static inline int
360 char2idx(unsigned char c, struct fastfind_info *info)
362 unsigned char *idx = memchr(info->uniq_chars, c, info->uniq_chars_count);
364 if (idx) return (idx - info->uniq_chars);
366 return -1;
369 static inline void
370 init_idxtab(struct fastfind_info *info)
372 int i;
374 for (i = 0; i < FF_MAX_CHARS; i++)
375 info->idxtab[i] = char2idx((unsigned char) i, info);
378 static inline void
379 compress_node(struct ff_node *leafset, struct fastfind_info *info,
380 int i, int pos)
382 struct ff_node_c *new = mem_alloc(sizeof(*new));
384 if (!new) return;
386 new->c = 1;
387 new->e = leafset[pos].e;
388 new->p = leafset[pos].p;
389 new->l = leafset[pos].l;
390 new->ch = pos;
392 mem_free_set(&info->leafsets[i], (struct ff_node *) new);
393 FF_DBG_cnode(info);
394 FF_DBG_mem(info, sizeof(*new));
395 FF_DBG_mem(info, sizeof(*leafset) * -info->uniq_chars_count);
398 static void
399 compress_tree(struct ff_node *leafset, struct fastfind_info *info)
401 int cnt = 0;
402 int pos = 0;
403 int i;
405 assert(info);
406 if_assert_failed return;
408 for (i = 0; i < info->uniq_chars_count; i++) {
409 if (leafset[i].c) continue;
411 if (leafset[i].l) {
412 /* There's a leaf leafset, descend to it and recurse */
413 compress_tree(info->leafsets[leafset[i].l], info);
416 if (leafset[i].l || leafset[i].e) {
417 cnt++;
418 pos = i;
422 if (cnt != 1 || leafset[pos].c) return;
424 /* Compress if possible ;) */
425 for (i = 1; i < info->leafsets_count; i++) {
426 if (info->leafsets[i] == leafset) {
427 compress_node(leafset, info, i, pos);
428 return;
433 #define ifcase(c) (info->case_aware ? (c) : toupper(c))
435 struct fastfind_index *
436 fastfind_index(struct fastfind_index *index, enum fastfind_flags flags)
438 struct fastfind_key_value *p;
439 struct fastfind_info *info;
441 assert(index && index->reset && index->next);
442 if_assert_failed goto return_error;
444 info = init_fastfind(index, flags);
445 if (!info) goto return_error;
447 /* First search min, max, count and uniq_chars. */
448 index->reset();
450 while ((p = index->next())) {
451 int key_len = strlen(p->key);
452 int i;
454 assert(key_len > 0 && key_len <= FF_MAX_KEYLEN);
455 if_assert_failed goto return_error;
457 if (key_len < info->min_key_len)
458 info->min_key_len = key_len;
460 if (key_len > info->max_key_len)
461 info->max_key_len = key_len;
463 for (i = 0; i < key_len; i++) {
464 /* ifcase() test should be moved outside loops but
465 * remember we call this routine only once per list.
466 * So I go for code readability vs performance here.
467 * --Zas */
468 int k = ifcase(p->key[i]);
470 assert(k < FF_MAX_CHARS);
471 if_assert_failed goto return_error;
473 if (char2idx(k, info) == -1) {
474 assert(info->uniq_chars_count < FF_MAX_CHARS);
475 if_assert_failed goto return_error;
477 info->uniq_chars[info->uniq_chars_count++] = k;
481 info->count++;
484 if (!info->count) return NULL;
486 init_idxtab(info);
488 /* Root leafset allocation */
489 if (!alloc_leafset(info)) goto return_error;
491 info->root_leafset = info->leafsets[info->leafsets_count];
493 if (!alloc_ff_data(info)) goto return_error;
495 /* Build the tree */
496 index->reset();
498 while ((p = index->next())) {
499 int key_len = strlen(p->key);
500 struct ff_node *leafset = info->root_leafset;
501 int i;
503 #if 0
504 fprintf(stderr, "K: %s\n", p->key);
505 #endif
506 for (i = 0; i < key_len - 1; i++) {
507 /* Convert char to its index value */
508 int idx = info->idxtab[ifcase(p->key[i])];
510 /* leafset[idx] is the desired leaf node's bucket. */
512 if (leafset[idx].l == 0) {
513 /* There's no leaf yet */
514 if (!alloc_leafset(info)) goto return_error;
515 leafset[idx].l = info->leafsets_count;
518 /* Descend to leaf */
519 leafset = info->leafsets[leafset[idx].l];
522 /* Index final leaf */
523 i = info->idxtab[ifcase(p->key[i])];
525 leafset[i].e = 1;
527 /* Memorize pointer to data */
528 leafset[i].p = info->pointers_count;
529 add_to_ff_data(p->data, key_len, info);
532 if (info->compress)
533 compress_tree(info->root_leafset, info);
535 return index;
537 return_error:
538 fastfind_done(index);
539 return NULL;
542 #undef ifcase
545 /* This macro searchs for the key in indexed list */
546 #define FF_SEARCH(what) do { \
547 int i; \
549 for (i = 0; i < key_len; i++) { \
550 int lidx, k = what; \
552 FF_DBG_iter(info); \
554 FF_DBG_test(info); \
555 if (k >= FF_MAX_CHARS) return NULL; \
556 lidx = info->idxtab[k]; \
558 FF_DBG_test(info); \
559 if (lidx < 0) return NULL; \
561 FF_DBG_test(info); \
562 if (current->c) { \
563 /* It is a compressed leaf. */ \
564 FF_DBG_test(info); \
565 if (((struct ff_node_c *) current)->ch != lidx) \
566 return NULL; \
567 } else { \
568 current = &current[lidx]; \
571 FF_DBG_test(info); \
572 if (current->e) { \
573 struct ff_data *data = &info->data[current->p]; \
575 FF_DBG_test(info); \
576 if (key_len == data->keylen) { \
577 FF_DBG_found(info); \
578 return data->pointer; \
582 FF_DBG_test(info); \
583 if (!current->l) return NULL; \
584 current = (struct ff_node *) info->leafsets[current->l]; \
586 } while (0)
588 void *
589 fastfind_search(struct fastfind_index *index, unsigned char *key, int key_len)
591 struct ff_node *current;
592 struct fastfind_info *info;
594 assert(index);
595 if_assert_failed return NULL;
597 info = index->handle;
599 assertm(info, "FastFind index %s not initialized", index->comment);
600 if_assert_failed return NULL;
602 FF_DBG_search_stats(info, key_len);
604 FF_DBG_test(info); if (!key) return NULL;
605 FF_DBG_test(info); if (key_len > info->max_key_len) return NULL;
606 FF_DBG_test(info); if (key_len < info->min_key_len) return NULL;
608 current = info->root_leafset;
610 /* Macro and code redundancy are there to obtain maximum
611 * performance. Do not move it to an inlined function.
612 * Do not even think about it.
613 * If you find a better way (same or better performance) then
614 * propose it and be prepared to defend it. --Zas */
616 FF_DBG_test(info);
617 if (info->case_aware)
618 FF_SEARCH(key[i]);
619 else
620 FF_SEARCH(toupper(key[i]));
622 return NULL;
625 #undef FF_SEARCH
627 void
628 fastfind_done(struct fastfind_index *index)
630 struct fastfind_info *info;
632 assert(index);
633 if_assert_failed return;
635 info = index->handle;
636 if (!info) return;
638 FF_DBG_dump_stats(info);
640 mem_free_if(info->data);
641 while (info->leafsets_count) {
642 mem_free_if(info->leafsets[info->leafsets_count]);
643 info->leafsets_count--;
645 mem_free_if(info->leafsets);
646 mem_free(info);
648 index->handle = NULL;
652 /* EXAMPLE */
654 #if 0
655 struct list {
656 unsigned char *tag;
657 int val;
660 struct list list[] = {
661 {"A", 1},
662 {"ABBR", 2},
663 {"ADDRESS", 3},
664 {"B", 4},
665 {"BASE", 5},
666 {"BASEFONT", 6},
667 {"BLOCKQUOTE", 7},
668 {"BODY", 8},
669 {"BR", 9},
670 {"BUTTON", 10},
671 {"CAPTION", 11},
672 {"CENTER", 12},
673 {"CODE", 13},
674 {"DD", 14},
675 {"DFN", 15},
676 {"DIR", 16},
677 {"DIV", 17},
678 {"DL", 18},
679 {"DT", 19},
680 {"EM", 20},
681 {"FIXED", 21},
682 {"FONT", 22},
683 {"FORM", 23},
684 {"FRAME", 24},
685 {"FRAMESET", 25},
686 {"H1", 26},
687 {"H2", 27},
688 {"H3", 28},
689 {"H4", 29},
690 {"H5", 30},
691 {"H6", 31},
692 /* {"HEAD", html_skip, 0, 0}, */
693 {"HR", 32},
694 {"I", 33},
695 {"IFRAME", 34},
696 {"IMG", 35},
697 {"INPUT", 36},
698 {"LI", 37},
699 {"LINK", 38},
700 {"LISTING", 39},
701 {"MENU", 40},
702 {"NOFRAMES", 41},
703 {"OL", 42},
704 {"OPTION", 43},
705 {"P", 44},
706 {"PRE", 45},
707 {"Q", 46},
708 {"S", 47},
709 {"SCRIPT", 48},
710 {"SELECT", 49},
711 {"SPAN", 50},
712 {"STRIKE", 51},
713 {"STRONG", 52},
714 {"STYLE", 53},
715 {"SUB", 54},
716 {"SUP", 55},
717 {"TABLE", 56},
718 {"TD", 57},
719 {"TEXTAREA", 58},
720 {"TH", 59},
721 {"TITLE", 60},
722 {"TR", 61},
723 {"U", 62},
724 {"UL", 63},
725 {"XMP", 64},
726 {NULL, 0}, /* List terminaison is key = NULL */
730 struct list *internal_pointer;
732 /* Reset internal list pointer */
733 void
734 reset_list(void)
736 internal_pointer = list;
739 /* Returns a pointer to a struct that contains
740 * current key and data pointers and increment
741 * internal pointer.
742 * It returns NULL when key is NULL. */
743 struct fastfind_key_value *
744 next_in_list(void)
746 static struct fastfind_key_value kv;
748 if (!internal_pointer->tag) return NULL;
750 kv.key = internal_pointer->tag;
751 kv.data = internal_pointer;
753 internal_pointer++;
755 return &kv;
758 static struct fastfind_index ff_index
759 = INIT_FASTFIND_INDEX("example", reset_list, next_in_list);
762 main(int argc, char **argv)
764 unsigned char *key = argv[1];
765 struct list *result;
767 if (!key || !*key) {
768 fprintf(stderr, "Usage: fastfind keyword\n");
769 exit(-1);
772 fprintf(stderr, "---------- INDEX PHASE -----------\n");
773 /* Mandatory */
774 fastfind_index(&ff_index, FF_COMPRESS);
776 fprintf(stderr, "---------- SEARCH PHASE ----------\n");
777 /* Without this one ... */
778 result = (struct list *) fastfind_search(&ff_index, key, strlen(key));
780 if (result)
781 fprintf(stderr, " Found: '%s' -> %d\n", result->tag, result->val);
782 else
783 fprintf(stderr, " Not found: '%s'\n", key);
785 fprintf(stderr, "---------- CLEANUP PHASE ----------\n");
786 fastfind_done(&ff_index);
788 exit(0);
791 #endif
793 #endif /* USE_FASTFIND */