1 /* Very fast search_keyword_in_list. */
12 #include "util/conv.h"
13 #include "util/error.h"
14 #include "util/fastfind.h"
15 #include "util/memdebug.h"
16 #include "util/memory.h"
20 /* It replaces bsearch() + strcasecmp() + callback + ...
22 * Following conditions should be met:
24 * - list keys are C strings.
25 * - keys should not be greater than 255 characters, and optimally < 20
26 * characters. It can work with greater keys but then memory usage will
28 * - each key must be unique and non empty.
29 * - list do not have to be ordered.
30 * - total number of unique characters used in all keys should be <= 128
31 * - idealy total number of keys should be <= 512 (but see below)
33 * (c) 2003 Laurent MONIN (aka Zas)
34 * Feel free to do whatever you want with that code. */
37 /* These routines use a tree search. First, a big tree is composed from the
38 * keys on input. Then, when searching we just go through the tree. If we will
39 * end up on an 'ending' node, we've got it.
41 * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it
55 * (the ending nodes are upcased just for this drawing, not in real)
57 * To optimize this for speed, leafs of nodes are organized in per-node arrays
58 * (so-called 'leafsets'), indexed by symbol value of the key's next character.
59 * But to optimize that for memory, we first compose own alphabet consisting
60 * only from the chars we ever use in the key strings. @uniq_chars holds that
61 * alphabet and @idxtab is used to translate between it and ASCII.
63 * Tree building: O((L+M)*N)
64 * (L: mean key length, M: alphabet size,
65 * N: number of items).
66 * String lookup: O(N) (N: string length). */
69 /* Define it to generate performance and memory usage statistics to stderr. */
71 #define DEBUG_FASTFIND
74 /* Define whether to use 32 or 64 bits per compressed element. */
79 #define END_LEAF_BITS 1
80 #define COMPRESSED_BITS 1
84 /* Use only 32 bits per element, but has very low limits. */
85 /* Adequate for ELinks tags search. */
87 #define POINTER_INDEX_BITS 10 /* 1024 */
88 #define LEAFSET_INDEX_BITS 13 /* 8192 */
89 #define COMP_CHAR_INDEX_BITS 7 /* 128 */
91 #define ff_node ff_node_c /* Both are 32 bits long. */
93 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
94 COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \
96 #error Over 32 bits in struct ff_node !!
99 #else /* !USE_32_BITS */
101 /* Keep this one if there is more than 512 keywords in a list
102 * it eats a bit more memory.
103 * ELinks may need this one if fastfind is used in other
104 * things than tags searching. */
105 /* This will make struct ff_node_c use 64 bits. */
107 #define POINTER_INDEX_BITS 12
108 #define LEAFSET_INDEX_BITS 18
109 #define COMP_CHAR_INDEX_BITS 8
111 #if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \
112 + END_LEAF_BITS + COMPRESSED_BITS) > 32
113 #error Over 32 bits in struct ff_node !!
117 /* End leaf -> p is significant */
118 unsigned int e
:END_LEAF_BITS
;
121 unsigned int c
:COMPRESSED_BITS
;
123 /* Index in pointers */
124 unsigned int p
:POINTER_INDEX_BITS
;
126 /* Index in leafsets */
127 unsigned int l
:LEAFSET_INDEX_BITS
;
130 #endif /* USE_32_BITS */
133 #define FF_MAX_KEYS (1 << POINTER_INDEX_BITS)
134 #define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)
135 #define FF_MAX_CHARS (1 << COMP_CHAR_INDEX_BITS)
136 #define FF_MAX_KEYLEN 255
139 unsigned int e
:END_LEAF_BITS
;
140 unsigned int c
:COMPRESSED_BITS
;
141 unsigned int p
:POINTER_INDEX_BITS
;
142 unsigned int l
:LEAFSET_INDEX_BITS
;
144 /* Index of char when compressed. */
145 unsigned int ch
:COMP_CHAR_INDEX_BITS
;
153 struct fastfind_info
{
154 struct ff_data
*data
;
156 struct ff_node
**leafsets
;
157 struct ff_node
*root_leafset
;
162 int uniq_chars_count
;
167 unsigned int case_aware
:1;
168 unsigned int compress
:1;
170 int idxtab
[FF_MAX_CHARS
];
171 unsigned char uniq_chars
[FF_MAX_CHARS
];
173 #ifdef DEBUG_FASTFIND
175 unsigned long searches
;
177 unsigned long itertmp
;
178 unsigned long iterdelta
;
179 unsigned long itermax
;
180 unsigned long iterations
;
182 unsigned long teststmp
;
183 unsigned long testsdelta
;
184 unsigned long testsmax
;
185 unsigned long memory_usage
;
186 unsigned long total_key_len
;
187 unsigned int compressed_nodes
;
188 unsigned char *comment
;
194 #ifdef DEBUG_FASTFIND
195 /* These are for performance testing. */
196 #define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)
197 #define FF_DBG_test(x) (x)->debug.tests++
198 #define FF_DBG_iter(x) (x)->debug.iterations++
199 #define FF_DBG_cnode(x) (x)->debug.compressed_nodes++
200 #define FF_DBG_found(x) \
202 unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp; \
203 unsigned long tests = (x)->debug.tests - (x)->debug.teststmp; \
205 (x)->debug.iterdelta += iter; \
206 (x)->debug.testsdelta += tests; \
207 if (iter > (x)->debug.itermax) \
208 (x)->debug.itermax = iter; \
209 if (tests > (x)->debug.testsmax) \
210 (x)->debug.testsmax = tests; \
211 (x)->debug.found++; \
213 #define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)
215 /* Update search stats. */
217 FF_DBG_search_stats(struct fastfind_info
*info
, int key_len
)
219 info
->debug
.searches
++;
220 info
->debug
.total_key_len
+= key_len
;
221 info
->debug
.teststmp
= info
->debug
.tests
;
222 info
->debug
.itertmp
= info
->debug
.iterations
;
225 /* Dump all stats. */
227 FF_DBG_dump_stats(struct fastfind_info
*info
)
229 fprintf(stderr
, "------ FastFind Statistics ------\n");
230 fprintf(stderr
, "Comment : %s\n", info
->debug
.comment
);
231 fprintf(stderr
, "Case-aware : %s\n", info
->case_aware
? "yes" : "no");
232 fprintf(stderr
, "Compress : %s\n", info
->compress
? "yes" : "no");
233 fprintf(stderr
, "Uniq_chars : %s\n", info
->uniq_chars
);
234 fprintf(stderr
, "Uniq_chars #: %d/%d max.\n", info
->uniq_chars_count
, FF_MAX_CHARS
);
235 fprintf(stderr
, "Min_key_len : %d\n", info
->min_key_len
);
236 fprintf(stderr
, "Max_key_len : %d\n", info
->max_key_len
);
237 fprintf(stderr
, "Entries : %d/%d max.\n", info
->pointers_count
, FF_MAX_KEYS
);
238 fprintf(stderr
, "Leafsets : %d/%d max.\n", info
->leafsets_count
, FF_MAX_LEAFSETS
);
240 fprintf(stderr
, "C. leafsets : %u/%d (%0.2f%%)\n",
241 info
->debug
.compressed_nodes
,
242 info
->leafsets_count
,
243 100 * (double) info
->debug
.compressed_nodes
/ info
->leafsets_count
);
244 fprintf(stderr
, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n",
245 info
->debug
.memory_usage
, (double) info
->debug
.memory_usage
/ info
->pointers_count
);
246 fprintf(stderr
, "Struct info : %zu bytes\n", sizeof(*info
) - sizeof(info
->debug
));
247 fprintf(stderr
, "Struct node : %zu bytes\n", sizeof(struct ff_node
));
248 fprintf(stderr
, "Struct cnode: %zu bytes\n", sizeof(struct ff_node_c
));
249 fprintf(stderr
, "Searches : %lu\n", info
->debug
.searches
);
250 fprintf(stderr
, "Found : %lu (%0.2f%%)\n",
251 info
->debug
.found
, 100 * (double) info
->debug
.found
/ info
->debug
.searches
);
252 fprintf(stderr
, "Iterations : %lu (%0.2f per search, %0.2f before found, %lu max)\n",
253 info
->debug
.iterations
, (double) info
->debug
.iterations
/ info
->debug
.searches
,
254 (double) info
->debug
.iterdelta
/ info
->debug
.found
,
255 info
->debug
.itermax
);
256 fprintf(stderr
, "Tests : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n",
257 info
->debug
.tests
, (double) info
->debug
.tests
/ info
->debug
.searches
,
258 (double) info
->debug
.tests
/ info
->debug
.iterations
,
259 (double) info
->debug
.testsdelta
/ info
->debug
.found
,
260 info
->debug
.testsmax
);
261 fprintf(stderr
, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n",
262 info
->debug
.total_key_len
, (double) info
->debug
.total_key_len
/ info
->debug
.searches
,
263 (double) info
->debug
.total_key_len
/ info
->debug
.iterations
);
264 fprintf(stderr
, "\n");
267 #else /* !DEBUG_FASTFIND */
269 #define FF_DBG_mem(x, size)
270 #define FF_DBG_test(x)
271 #define FF_DBG_iter(x)
272 #define FF_DBG_cnode(x)
273 #define FF_DBG_found(x)
274 #define FF_DBG_comment(x, comment)
275 #define FF_DBG_search_stats(info, key_len)
276 #define FF_DBG_dump_stats(info)
278 #endif /* DEBUG_FASTFIND */
281 static struct fastfind_info
*
282 init_fastfind(struct fastfind_index
*index
, enum fastfind_flags flags
)
284 struct fastfind_info
*info
= mem_calloc(1, sizeof(*info
));
286 index
->handle
= info
;
287 if (!info
) return NULL
;
289 info
->min_key_len
= FF_MAX_KEYLEN
;
290 info
->case_aware
= !!(flags
& FF_CASE_AWARE
);
291 info
->compress
= !!(flags
& FF_COMPRESS
);
293 FF_DBG_mem(info
, sizeof(*info
) - sizeof(info
->debug
));
294 FF_DBG_comment(info
, index
->comment
);
299 /* Return 1 on success, 0 on allocation failure */
301 alloc_ff_data(struct fastfind_info
*info
)
303 struct ff_data
*data
;
305 assert(info
->count
< FF_MAX_KEYS
);
306 if_assert_failed
return 0;
308 /* On error, cleanup is done by fastfind_done(). */
310 data
= mem_calloc(info
->count
, sizeof(*data
));
313 FF_DBG_mem(info
, info
->count
* sizeof(*data
));
318 /* Add pointer and its key length to correspondant arrays, incrementing
319 * internal counter. */
321 add_to_ff_data(void *p
, int key_len
, struct fastfind_info
*info
)
323 struct ff_data
*data
= &info
->data
[info
->pointers_count
++];
325 /* Record new pointer and key len, used in search */
327 data
->keylen
= key_len
;
330 /* Return 1 on success, 0 on allocation failure */
332 alloc_leafset(struct fastfind_info
*info
)
334 struct ff_node
**leafsets
;
335 struct ff_node
*leafset
;
337 assert(info
->leafsets_count
< FF_MAX_LEAFSETS
);
338 if_assert_failed
return 0;
340 /* info->leafsets[0] is never used since l=0 marks no leaf
341 * in struct ff_node. That's the reason of that + 2. */
342 leafsets
= mem_realloc(info
->leafsets
,
343 sizeof(*leafsets
) * (info
->leafsets_count
+ 2));
344 if (!leafsets
) return 0;
345 info
->leafsets
= leafsets
;
347 leafset
= mem_calloc(info
->uniq_chars_count
, sizeof(*leafset
));
348 if (!leafset
) return 0;
350 FF_DBG_mem(info
, sizeof(*leafsets
));
351 FF_DBG_mem(info
, sizeof(*leafset
) * info
->uniq_chars_count
);
353 info
->leafsets_count
++;
354 info
->leafsets
[info
->leafsets_count
] = leafset
;
360 char2idx(unsigned char c
, struct fastfind_info
*info
)
362 unsigned char *idx
= memchr(info
->uniq_chars
, c
, info
->uniq_chars_count
);
364 if (idx
) return (idx
- info
->uniq_chars
);
370 init_idxtab(struct fastfind_info
*info
)
374 for (i
= 0; i
< FF_MAX_CHARS
; i
++)
375 info
->idxtab
[i
] = char2idx((unsigned char) i
, info
);
379 compress_node(struct ff_node
*leafset
, struct fastfind_info
*info
,
382 struct ff_node_c
*new = mem_alloc(sizeof(*new));
387 new->e
= leafset
[pos
].e
;
388 new->p
= leafset
[pos
].p
;
389 new->l
= leafset
[pos
].l
;
392 mem_free_set(&info
->leafsets
[i
], (struct ff_node
*) new);
394 FF_DBG_mem(info
, sizeof(*new));
395 FF_DBG_mem(info
, sizeof(*leafset
) * -info
->uniq_chars_count
);
399 compress_tree(struct ff_node
*leafset
, struct fastfind_info
*info
)
406 if_assert_failed
return;
408 for (i
= 0; i
< info
->uniq_chars_count
; i
++) {
409 if (leafset
[i
].c
) continue;
412 /* There's a leaf leafset, descend to it and recurse */
413 compress_tree(info
->leafsets
[leafset
[i
].l
], info
);
416 if (leafset
[i
].l
|| leafset
[i
].e
) {
422 if (cnt
!= 1 || leafset
[pos
].c
) return;
424 /* Compress if possible ;) */
425 for (i
= 1; i
< info
->leafsets_count
; i
++) {
426 if (info
->leafsets
[i
] == leafset
) {
427 compress_node(leafset
, info
, i
, pos
);
433 #define ifcase(c) (info->case_aware ? (c) : toupper(c))
435 struct fastfind_index
*
436 fastfind_index(struct fastfind_index
*index
, enum fastfind_flags flags
)
438 struct fastfind_key_value
*p
;
439 struct fastfind_info
*info
;
441 assert(index
&& index
->reset
&& index
->next
);
442 if_assert_failed
goto return_error
;
444 info
= init_fastfind(index
, flags
);
445 if (!info
) goto return_error
;
447 /* First search min, max, count and uniq_chars. */
450 while ((p
= index
->next())) {
451 int key_len
= strlen(p
->key
);
454 assert(key_len
> 0 && key_len
<= FF_MAX_KEYLEN
);
455 if_assert_failed
goto return_error
;
457 if (key_len
< info
->min_key_len
)
458 info
->min_key_len
= key_len
;
460 if (key_len
> info
->max_key_len
)
461 info
->max_key_len
= key_len
;
463 for (i
= 0; i
< key_len
; i
++) {
464 /* ifcase() test should be moved outside loops but
465 * remember we call this routine only once per list.
466 * So I go for code readability vs performance here.
468 int k
= ifcase(p
->key
[i
]);
470 assert(k
< FF_MAX_CHARS
);
471 if_assert_failed
goto return_error
;
473 if (char2idx(k
, info
) == -1) {
474 assert(info
->uniq_chars_count
< FF_MAX_CHARS
);
475 if_assert_failed
goto return_error
;
477 info
->uniq_chars
[info
->uniq_chars_count
++] = k
;
484 if (!info
->count
) return NULL
;
488 /* Root leafset allocation */
489 if (!alloc_leafset(info
)) goto return_error
;
491 info
->root_leafset
= info
->leafsets
[info
->leafsets_count
];
493 if (!alloc_ff_data(info
)) goto return_error
;
498 while ((p
= index
->next())) {
499 int key_len
= strlen(p
->key
);
500 struct ff_node
*leafset
= info
->root_leafset
;
504 fprintf(stderr
, "K: %s\n", p
->key
);
506 for (i
= 0; i
< key_len
- 1; i
++) {
507 /* Convert char to its index value */
508 int idx
= info
->idxtab
[ifcase(p
->key
[i
])];
510 /* leafset[idx] is the desired leaf node's bucket. */
512 if (leafset
[idx
].l
== 0) {
513 /* There's no leaf yet */
514 if (!alloc_leafset(info
)) goto return_error
;
515 leafset
[idx
].l
= info
->leafsets_count
;
518 /* Descend to leaf */
519 leafset
= info
->leafsets
[leafset
[idx
].l
];
522 /* Index final leaf */
523 i
= info
->idxtab
[ifcase(p
->key
[i
])];
527 /* Memorize pointer to data */
528 leafset
[i
].p
= info
->pointers_count
;
529 add_to_ff_data(p
->data
, key_len
, info
);
533 compress_tree(info
->root_leafset
, info
);
538 fastfind_done(index
);
545 /* This macro searchs for the key in indexed list */
546 #define FF_SEARCH(what) do { \
549 for (i = 0; i < key_len; i++) { \
550 int lidx, k = what; \
555 if (k >= FF_MAX_CHARS) return NULL; \
556 lidx = info->idxtab[k]; \
559 if (lidx < 0) return NULL; \
563 /* It is a compressed leaf. */ \
565 if (((struct ff_node_c *) current)->ch != lidx) \
568 current = ¤t[lidx]; \
573 struct ff_data *data = &info->data[current->p]; \
576 if (key_len == data->keylen) { \
577 FF_DBG_found(info); \
578 return data->pointer; \
583 if (!current->l) return NULL; \
584 current = (struct ff_node *) info->leafsets[current->l]; \
589 fastfind_search(struct fastfind_index
*index
, unsigned char *key
, int key_len
)
591 struct ff_node
*current
;
592 struct fastfind_info
*info
;
595 if_assert_failed
return NULL
;
597 info
= index
->handle
;
599 assertm(info
, "FastFind index %s not initialized", index
->comment
);
600 if_assert_failed
return NULL
;
602 FF_DBG_search_stats(info
, key_len
);
604 FF_DBG_test(info
); if (!key
) return NULL
;
605 FF_DBG_test(info
); if (key_len
> info
->max_key_len
) return NULL
;
606 FF_DBG_test(info
); if (key_len
< info
->min_key_len
) return NULL
;
608 current
= info
->root_leafset
;
610 /* Macro and code redundancy are there to obtain maximum
611 * performance. Do not move it to an inlined function.
612 * Do not even think about it.
613 * If you find a better way (same or better performance) then
614 * propose it and be prepared to defend it. --Zas */
617 if (info
->case_aware
)
620 FF_SEARCH(toupper(key
[i
]));
628 fastfind_done(struct fastfind_index
*index
)
630 struct fastfind_info
*info
;
633 if_assert_failed
return;
635 info
= index
->handle
;
638 FF_DBG_dump_stats(info
);
640 mem_free_if(info
->data
);
641 while (info
->leafsets_count
) {
642 mem_free_if(info
->leafsets
[info
->leafsets_count
]);
643 info
->leafsets_count
--;
645 mem_free_if(info
->leafsets
);
648 index
->handle
= NULL
;
660 struct list list
[] = {
692 /* {"HEAD", html_skip, 0, 0}, */
726 {NULL
, 0}, /* List terminaison is key = NULL */
730 struct list
*internal_pointer
;
732 /* Reset internal list pointer */
736 internal_pointer
= list
;
739 /* Returns a pointer to a struct that contains
740 * current key and data pointers and increment
742 * It returns NULL when key is NULL. */
743 struct fastfind_key_value
*
746 static struct fastfind_key_value kv
;
748 if (!internal_pointer
->tag
) return NULL
;
750 kv
.key
= internal_pointer
->tag
;
751 kv
.data
= internal_pointer
;
758 static struct fastfind_index ff_index
759 = INIT_FASTFIND_INDEX("example", reset_list
, next_in_list
);
762 main(int argc
, char **argv
)
764 unsigned char *key
= argv
[1];
768 fprintf(stderr
, "Usage: fastfind keyword\n");
772 fprintf(stderr
, "---------- INDEX PHASE -----------\n");
774 fastfind_index(&ff_index
, FF_COMPRESS
);
776 fprintf(stderr
, "---------- SEARCH PHASE ----------\n");
777 /* Without this one ... */
778 result
= (struct list
*) fastfind_search(&ff_index
, key
, strlen(key
));
781 fprintf(stderr
, " Found: '%s' -> %d\n", result
->tag
, result
->val
);
783 fprintf(stderr
, " Not found: '%s'\n", key
);
785 fprintf(stderr
, "---------- CLEANUP PHASE ----------\n");
786 fastfind_done(&ff_index
);
793 #endif /* USE_FASTFIND */