1 /** Very fast search_keyword_in_list.
5 * It replaces bsearch() + strcasecmp() + callback + ...
7 * Following conditions should be met:
9 * - list keys are C strings.
10 * - keys should not be greater than 255 characters, and optimally < 20
11 * characters. It can work with greater keys but then memory usage will
13 * - each key must be unique and non empty.
14 * - list do not have to be ordered.
15 * - total number of unique characters used in all keys should be <= 128
16 * - idealy total number of keys should be <= 512 (but see below)
18 * (c) 2003 Laurent MONIN (aka Zas)
19 * Feel free to do whatever you want with that code.
22 * These routines use a tree search. First, a big tree is composed from the
23 * keys on input. Then, when searching we just go through the tree. If we will
24 * end up on an 'ending' node, we've got it.
26 * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it
42 * (the ending nodes are upcased just for this drawing, not in real)
44 * To optimize this for speed, leafs of nodes are organized in per-node arrays
45 * (so-called 'leafsets'), indexed by symbol value of the key's next character.
46 * But to optimize that for memory, we first compose own alphabet consisting
47 * only from the chars we ever use in the key strings. fastfind_info.uniq_chars
48 * holds that alphabet and fastfind_info.idxtab is used to translate between it
51 * Tree building: O((L+M)*N)
52 * (L: mean key length, M: alphabet size,
53 * N: number of items).
54 * String lookup: O(N) (N: string length). */
65 #include "util/conv.h"
66 #include "util/error.h"
67 #include "util/fastfind.h"
68 #include "util/memdebug.h"
69 #include "util/memory.h"
73 /** Define it to generate performance and memory usage statistics to stderr. */
75 #define DEBUG_FASTFIND
78 /** Define whether to use 32 or 64 bits per compressed element. */
83 #define END_LEAF_BITS 1
84 #define COMPRESSED_BITS 1
88 /* Use only 32 bits per element, but has very low limits. */
89 /* Adequate for ELinks tags search. */
91 #define POINTER_INDEX_BITS 10 /* 1024 */
92 #define LEAFSET_INDEX_BITS 13 /* 8192 */
93 #define COMP_CHAR_INDEX_BITS 7 /* 128 */
95 #define ff_node ff_node_c /* Both are 32 bits long. */
97 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
98 COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \
100 #error Over 32 bits in struct ff_node !!
103 #else /* !USE_32_BITS */
105 /* Keep this one if there is more than 512 keywords in a list
106 * it eats a bit more memory.
107 * ELinks may need this one if fastfind is used in other
108 * things than tags searching. */
109 /* This will make struct ff_node_c use 64 bits. */
111 #define POINTER_INDEX_BITS 12
112 #define LEAFSET_INDEX_BITS 18
113 #define COMP_CHAR_INDEX_BITS 8
115 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
116 + END_LEAF_BITS + COMPRESSED_BITS) > 32
117 #error Over 32 bits in struct ff_node !!
121 /** End leaf -> p is significant */
122 unsigned int e
:END_LEAF_BITS
;
125 unsigned int c
:COMPRESSED_BITS
;
127 /** Index in pointers */
128 unsigned int p
:POINTER_INDEX_BITS
;
130 /** Index in leafsets */
131 unsigned int l
:LEAFSET_INDEX_BITS
;
134 #endif /* USE_32_BITS */
137 #define FF_MAX_KEYS (1 << POINTER_INDEX_BITS)
138 #define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)
139 #define FF_MAX_CHARS (1 << COMP_CHAR_INDEX_BITS)
140 #define FF_MAX_KEYLEN 255
143 unsigned int e
:END_LEAF_BITS
;
144 unsigned int c
:COMPRESSED_BITS
;
145 unsigned int p
:POINTER_INDEX_BITS
;
146 unsigned int l
:LEAFSET_INDEX_BITS
;
148 /** Index of char when compressed. */
149 unsigned int ch
:COMP_CHAR_INDEX_BITS
;
157 struct fastfind_info
{
158 struct ff_data
*data
;
160 struct ff_node
**leafsets
;
161 struct ff_node
*root_leafset
;
166 int uniq_chars_count
;
171 unsigned int case_aware
:1;
172 unsigned int compress
:1;
174 int idxtab
[FF_MAX_CHARS
];
175 unsigned char uniq_chars
[FF_MAX_CHARS
];
177 #ifdef DEBUG_FASTFIND
179 unsigned long searches
;
181 unsigned long itertmp
;
182 unsigned long iterdelta
;
183 unsigned long itermax
;
184 unsigned long iterations
;
186 unsigned long teststmp
;
187 unsigned long testsdelta
;
188 unsigned long testsmax
;
189 unsigned long memory_usage
;
190 unsigned long total_key_len
;
191 unsigned int compressed_nodes
;
192 unsigned char *comment
;
198 #ifdef DEBUG_FASTFIND
199 /* These are for performance testing. */
200 #define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)
201 #define FF_DBG_test(x) (x)->debug.tests++
202 #define FF_DBG_iter(x) (x)->debug.iterations++
203 #define FF_DBG_cnode(x) (x)->debug.compressed_nodes++
204 #define FF_DBG_found(x) \
206 unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp; \
207 unsigned long tests = (x)->debug.tests - (x)->debug.teststmp; \
209 (x)->debug.iterdelta += iter; \
210 (x)->debug.testsdelta += tests; \
211 if (iter > (x)->debug.itermax) \
212 (x)->debug.itermax = iter; \
213 if (tests > (x)->debug.testsmax) \
214 (x)->debug.testsmax = tests; \
215 (x)->debug.found++; \
217 #define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)
219 /** Update search stats. */
221 FF_DBG_search_stats(struct fastfind_info
*info
, int key_len
)
223 info
->debug
.searches
++;
224 info
->debug
.total_key_len
+= key_len
;
225 info
->debug
.teststmp
= info
->debug
.tests
;
226 info
->debug
.itertmp
= info
->debug
.iterations
;
229 /** Dump all stats. */
231 FF_DBG_dump_stats(struct fastfind_info
*info
)
233 fprintf(stderr
, "------ FastFind Statistics ------\n");
234 fprintf(stderr
, "Comment : %s\n", info
->debug
.comment
);
235 fprintf(stderr
, "Case-aware : %s\n", info
->case_aware
? "yes" : "no");
236 fprintf(stderr
, "Compress : %s\n", info
->compress
? "yes" : "no");
237 fprintf(stderr
, "Uniq_chars : %s\n", info
->uniq_chars
);
238 fprintf(stderr
, "Uniq_chars #: %d/%d max.\n", info
->uniq_chars_count
, FF_MAX_CHARS
);
239 fprintf(stderr
, "Min_key_len : %d\n", info
->min_key_len
);
240 fprintf(stderr
, "Max_key_len : %d\n", info
->max_key_len
);
241 fprintf(stderr
, "Entries : %d/%d max.\n", info
->pointers_count
, FF_MAX_KEYS
);
242 fprintf(stderr
, "Leafsets : %d/%d max.\n", info
->leafsets_count
, FF_MAX_LEAFSETS
);
244 fprintf(stderr
, "C. leafsets : %u/%d (%0.2f%%)\n",
245 info
->debug
.compressed_nodes
,
246 info
->leafsets_count
,
247 100 * (double) info
->debug
.compressed_nodes
/ info
->leafsets_count
);
248 fprintf(stderr
, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n",
249 info
->debug
.memory_usage
, (double) info
->debug
.memory_usage
/ info
->pointers_count
);
250 fprintf(stderr
, "Struct info : %zu bytes\n", sizeof(*info
) - sizeof(info
->debug
));
251 fprintf(stderr
, "Struct node : %zu bytes\n", sizeof(struct ff_node
));
252 fprintf(stderr
, "Struct cnode: %zu bytes\n", sizeof(struct ff_node_c
));
253 fprintf(stderr
, "Searches : %lu\n", info
->debug
.searches
);
254 fprintf(stderr
, "Found : %lu (%0.2f%%)\n",
255 info
->debug
.found
, 100 * (double) info
->debug
.found
/ info
->debug
.searches
);
256 fprintf(stderr
, "Iterations : %lu (%0.2f per search, %0.2f before found, %lu max)\n",
257 info
->debug
.iterations
, (double) info
->debug
.iterations
/ info
->debug
.searches
,
258 (double) info
->debug
.iterdelta
/ info
->debug
.found
,
259 info
->debug
.itermax
);
260 fprintf(stderr
, "Tests : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n",
261 info
->debug
.tests
, (double) info
->debug
.tests
/ info
->debug
.searches
,
262 (double) info
->debug
.tests
/ info
->debug
.iterations
,
263 (double) info
->debug
.testsdelta
/ info
->debug
.found
,
264 info
->debug
.testsmax
);
265 fprintf(stderr
, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n",
266 info
->debug
.total_key_len
, (double) info
->debug
.total_key_len
/ info
->debug
.searches
,
267 (double) info
->debug
.total_key_len
/ info
->debug
.iterations
);
268 fprintf(stderr
, "\n");
271 #else /* !DEBUG_FASTFIND */
273 #define FF_DBG_mem(x, size)
274 #define FF_DBG_test(x)
275 #define FF_DBG_iter(x)
276 #define FF_DBG_cnode(x)
277 #define FF_DBG_found(x)
278 #define FF_DBG_comment(x, comment)
279 #define FF_DBG_search_stats(info, key_len)
280 #define FF_DBG_dump_stats(info)
282 #endif /* DEBUG_FASTFIND */
285 static struct fastfind_info
*
286 init_fastfind(struct fastfind_index
*index
, enum fastfind_flags flags
)
288 struct fastfind_info
*info
= mem_calloc(1, sizeof(*info
));
290 index
->handle
= info
;
291 if (!info
) return NULL
;
293 info
->min_key_len
= FF_MAX_KEYLEN
;
294 info
->case_aware
= !!(flags
& FF_CASE_AWARE
);
295 info
->compress
= !!(flags
& FF_COMPRESS
);
297 FF_DBG_mem(info
, sizeof(*info
) - sizeof(info
->debug
));
298 FF_DBG_comment(info
, index
->comment
);
303 /** @returns 1 on success, 0 on allocation failure */
305 alloc_ff_data(struct fastfind_info
*info
)
307 struct ff_data
*data
;
309 assert(info
->count
< FF_MAX_KEYS
);
310 if_assert_failed
return 0;
312 /* On error, cleanup is done by fastfind_done(). */
314 data
= mem_calloc(info
->count
, sizeof(*data
));
317 FF_DBG_mem(info
, info
->count
* sizeof(*data
));
322 /** Add pointer and its key length to correspondant arrays, incrementing
323 * internal counter. */
325 add_to_ff_data(void *p
, int key_len
, struct fastfind_info
*info
)
327 struct ff_data
*data
= &info
->data
[info
->pointers_count
++];
329 /* Record new pointer and key len, used in search */
331 data
->keylen
= key_len
;
334 /** @returns 1 on success, 0 on allocation failure */
336 alloc_leafset(struct fastfind_info
*info
)
338 struct ff_node
**leafsets
;
339 struct ff_node
*leafset
;
341 assert(info
->leafsets_count
< FF_MAX_LEAFSETS
);
342 if_assert_failed
return 0;
344 /* info->leafsets[0] is never used since l=0 marks no leaf
345 * in struct ff_node. That's the reason of that + 2. */
346 leafsets
= mem_realloc(info
->leafsets
,
347 sizeof(*leafsets
) * (info
->leafsets_count
+ 2));
348 if (!leafsets
) return 0;
349 info
->leafsets
= leafsets
;
351 leafset
= mem_calloc(info
->uniq_chars_count
, sizeof(*leafset
));
352 if (!leafset
) return 0;
354 FF_DBG_mem(info
, sizeof(*leafsets
));
355 FF_DBG_mem(info
, sizeof(*leafset
) * info
->uniq_chars_count
);
357 info
->leafsets_count
++;
358 info
->leafsets
[info
->leafsets_count
] = leafset
;
364 char2idx(unsigned char c
, struct fastfind_info
*info
)
366 unsigned char *idx
= memchr(info
->uniq_chars
, c
, info
->uniq_chars_count
);
368 if (idx
) return (idx
- info
->uniq_chars
);
374 init_idxtab(struct fastfind_info
*info
)
378 for (i
= 0; i
< FF_MAX_CHARS
; i
++)
379 info
->idxtab
[i
] = char2idx((unsigned char) i
, info
);
383 compress_node(struct ff_node
*leafset
, struct fastfind_info
*info
,
386 struct ff_node_c
*new = mem_alloc(sizeof(*new));
391 new->e
= leafset
[pos
].e
;
392 new->p
= leafset
[pos
].p
;
393 new->l
= leafset
[pos
].l
;
396 mem_free_set(&info
->leafsets
[i
], (struct ff_node
*) new);
398 FF_DBG_mem(info
, sizeof(*new));
399 FF_DBG_mem(info
, sizeof(*leafset
) * -info
->uniq_chars_count
);
403 compress_tree(struct ff_node
*leafset
, struct fastfind_info
*info
)
410 if_assert_failed
return;
412 for (i
= 0; i
< info
->uniq_chars_count
; i
++) {
413 if (leafset
[i
].c
) continue;
416 /* There's a leaf leafset, descend to it and recurse */
417 compress_tree(info
->leafsets
[leafset
[i
].l
], info
);
420 if (leafset
[i
].l
|| leafset
[i
].e
) {
426 if (cnt
!= 1 || leafset
[pos
].c
) return;
428 /* Compress if possible ;) */
429 for (i
= 1; i
< info
->leafsets_count
; i
++) {
430 if (info
->leafsets
[i
] == leafset
) {
431 compress_node(leafset
, info
, i
, pos
);
437 #define ifcase(c) (info->case_aware ? (c) : toupper(c))
439 struct fastfind_index
*
440 fastfind_index(struct fastfind_index
*index
, enum fastfind_flags flags
)
442 struct fastfind_key_value
*p
;
443 struct fastfind_info
*info
;
445 assert(index
&& index
->reset
&& index
->next
);
446 if_assert_failed
goto return_error
;
448 info
= init_fastfind(index
, flags
);
449 if (!info
) goto return_error
;
451 /* First search min, max, count and uniq_chars. */
454 while ((p
= index
->next())) {
455 int key_len
= strlen(p
->key
);
458 assert(key_len
> 0 && key_len
<= FF_MAX_KEYLEN
);
459 if_assert_failed
goto return_error
;
461 if (key_len
< info
->min_key_len
)
462 info
->min_key_len
= key_len
;
464 if (key_len
> info
->max_key_len
)
465 info
->max_key_len
= key_len
;
467 for (i
= 0; i
< key_len
; i
++) {
468 /* ifcase() test should be moved outside loops but
469 * remember we call this routine only once per list.
470 * So I go for code readability vs performance here.
472 int k
= ifcase(p
->key
[i
]);
474 assert(k
< FF_MAX_CHARS
);
475 if_assert_failed
goto return_error
;
477 if (char2idx(k
, info
) == -1) {
478 assert(info
->uniq_chars_count
< FF_MAX_CHARS
);
479 if_assert_failed
goto return_error
;
481 info
->uniq_chars
[info
->uniq_chars_count
++] = k
;
488 if (!info
->count
) return NULL
;
492 /* Root leafset allocation */
493 if (!alloc_leafset(info
)) goto return_error
;
495 info
->root_leafset
= info
->leafsets
[info
->leafsets_count
];
497 if (!alloc_ff_data(info
)) goto return_error
;
502 while ((p
= index
->next())) {
503 int key_len
= strlen(p
->key
);
504 struct ff_node
*leafset
= info
->root_leafset
;
508 fprintf(stderr
, "K: %s\n", p
->key
);
510 for (i
= 0; i
< key_len
- 1; i
++) {
511 /* Convert char to its index value */
512 int idx
= info
->idxtab
[ifcase(p
->key
[i
])];
514 /* leafset[idx] is the desired leaf node's bucket. */
516 if (leafset
[idx
].l
== 0) {
517 /* There's no leaf yet */
518 if (!alloc_leafset(info
)) goto return_error
;
519 leafset
[idx
].l
= info
->leafsets_count
;
522 /* Descend to leaf */
523 leafset
= info
->leafsets
[leafset
[idx
].l
];
526 /* Index final leaf */
527 i
= info
->idxtab
[ifcase(p
->key
[i
])];
531 /* Memorize pointer to data */
532 leafset
[i
].p
= info
->pointers_count
;
533 add_to_ff_data(p
->data
, key_len
, info
);
537 compress_tree(info
->root_leafset
, info
);
542 fastfind_done(index
);
549 /** This macro searchs for the key in indexed list */
550 #define FF_SEARCH(what) do { \
553 for (i = 0; i < key_len; i++) { \
554 int lidx, k = what; \
559 if (k >= FF_MAX_CHARS) return NULL; \
560 lidx = info->idxtab[k]; \
563 if (lidx < 0) return NULL; \
567 /* It is a compressed leaf. */ \
569 if (((struct ff_node_c *) current)->ch != lidx) \
572 current = ¤t[lidx]; \
577 struct ff_data *data = &info->data[current->p]; \
580 if (key_len == data->keylen) { \
581 FF_DBG_found(info); \
582 return data->pointer; \
587 if (!current->l) return NULL; \
588 current = (struct ff_node *) info->leafsets[current->l]; \
593 fastfind_search(struct fastfind_index
*index
,
594 const unsigned char *key
, int key_len
)
596 struct ff_node
*current
;
597 struct fastfind_info
*info
;
600 if_assert_failed
return NULL
;
602 info
= index
->handle
;
604 assertm(info
!= NULL
, "FastFind index %s not initialized", index
->comment
);
605 if_assert_failed
return NULL
;
607 FF_DBG_search_stats(info
, key_len
);
609 FF_DBG_test(info
); if (!key
) return NULL
;
610 FF_DBG_test(info
); if (key_len
> info
->max_key_len
) return NULL
;
611 FF_DBG_test(info
); if (key_len
< info
->min_key_len
) return NULL
;
613 current
= info
->root_leafset
;
615 /* Macro and code redundancy are there to obtain maximum
616 * performance. Do not move it to an inlined function.
617 * Do not even think about it.
618 * If you find a better way (same or better performance) then
619 * propose it and be prepared to defend it. --Zas */
622 if (info
->case_aware
)
625 FF_SEARCH(toupper(key
[i
]));
633 fastfind_done(struct fastfind_index
*index
)
635 struct fastfind_info
*info
;
638 if_assert_failed
return;
640 info
= index
->handle
;
643 FF_DBG_dump_stats(info
);
645 mem_free_if(info
->data
);
646 while (info
->leafsets_count
) {
647 mem_free_if(info
->leafsets
[info
->leafsets_count
]);
648 info
->leafsets_count
--;
650 mem_free_if(info
->leafsets
);
653 index
->handle
= NULL
;
665 struct list list
[] = {
697 /* {"HEAD", html_skip, 0, 0}, */
731 {NULL
, 0}, /* List terminaison is key = NULL */
735 struct list
*internal_pointer
;
737 /** Reset internal list pointer */
741 internal_pointer
= list
;
744 /** Returns a pointer to a struct that contains
745 * current key and data pointers and increment
747 * It returns NULL when key is NULL. */
748 struct fastfind_key_value
*
751 static struct fastfind_key_value kv
;
753 if (!internal_pointer
->tag
) return NULL
;
755 kv
.key
= internal_pointer
->tag
;
756 kv
.data
= internal_pointer
;
763 static struct fastfind_index ff_index
764 = INIT_FASTFIND_INDEX("example", reset_list
, next_in_list
);
767 main(int argc
, char **argv
)
769 unsigned char *key
= argv
[1];
773 fprintf(stderr
, "Usage: fastfind keyword\n");
777 fprintf(stderr
, "---------- INDEX PHASE -----------\n");
779 fastfind_index(&ff_index
, FF_COMPRESS
);
781 fprintf(stderr
, "---------- SEARCH PHASE ----------\n");
782 /* Without this one ... */
783 result
= (struct list
*) fastfind_search(&ff_index
, key
, strlen(key
));
786 fprintf(stderr
, " Found: '%s' -> %d\n", result
->tag
, result
->val
);
788 fprintf(stderr
, " Not found: '%s'\n", key
);
790 fprintf(stderr
, "---------- CLEANUP PHASE ----------\n");
791 fastfind_done(&ff_index
);
798 #endif /* USE_FASTFIND */