1 /* $Id: apropos_db.c,v 1.32.2.3 2013/10/10 23:43:04 schwarze Exp $ */
3 * Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 #include <sys/param.h>
33 #if defined(__APPLE__)
34 # include <libkern/OSByteOrder.h>
35 #elif defined(__linux__)
38 # include <sys/byteorder.h>
40 # include <sys/endian.h>
43 #if defined(__linux__) || defined(__sun)
50 #include "apropos_db.h"
60 free((_x)->matches); \
61 } while (/*CONSTCOND*/0)
64 int regex
; /* is regex? */
65 int index
; /* index in match array */
66 uint64_t mask
; /* type-mask */
67 int and; /* is rhs of logical AND? */
68 char *v
; /* search value */
69 regex_t re
; /* compiled re, if regex */
70 struct expr
*next
; /* next in sequence */
80 struct res
*node
; /* record array for dir tree */
81 int len
; /* length of record array */
84 static const struct type types
[] = {
124 { UINT64_MAX
, "any" },
128 static DB
*btree_open(void);
129 static int btree_read(const DBT
*, const DBT
*,
130 const struct mchars
*,
131 uint64_t *, recno_t
*, char **);
132 static int expreval(const struct expr
*, int *);
133 static void exprexec(const struct expr
*,
134 const char *, uint64_t, struct res
*);
135 static int exprmark(const struct expr
*,
136 const char *, uint64_t, int *);
137 static struct expr
*exprexpr(int, char *[], int *, int *, size_t *);
138 static struct expr
*exprterm(char *, int);
139 static DB
*index_open(void);
140 static int index_read(const DBT
*, const DBT
*, int,
141 const struct mchars
*, struct res
*);
142 static void norm_string(const char *,
143 const struct mchars
*, char **);
144 static size_t norm_utf8(unsigned int, char[7]);
145 static int single_search(struct rectree
*, const struct opts
*,
146 const struct expr
*, size_t terms
,
147 struct mchars
*, int);
150 * Open the keyword mandoc-db database.
158 memset(&info
, 0, sizeof(BTREEINFO
));
162 db
= dbopen(MANDOC_DB
, O_RDONLY
, 0, DB_BTREE
, &info
);
170 * Read a keyword from the database and normalise it.
171 * Return 0 if the database is insane, else 1.
174 btree_read(const DBT
*k
, const DBT
*v
, const struct mchars
*mc
,
175 uint64_t *mask
, recno_t
*rec
, char **buf
)
179 /* Are our sizes sane? */
180 if (k
->size
< 2 || sizeof(vbuf
) != v
->size
)
183 /* Is our string nil-terminated? */
184 if ('\0' != ((const char *)k
->data
)[(int)k
->size
- 1])
187 norm_string((const char *)k
->data
, mc
, buf
);
188 memcpy(vbuf
, v
->data
, v
->size
);
189 *mask
= betoh64(vbuf
[0]);
190 *rec
= betoh64(vbuf
[1]);
195 * Take a Unicode codepoint and produce its UTF-8 encoding.
196 * This isn't the best way to do this, but it works.
197 * The magic numbers are from the UTF-8 packaging.
198 * They're not as scary as they seem: read the UTF-8 spec for details.
201 norm_utf8(unsigned int cp
, char out
[7])
207 if (cp
<= 0x0000007F) {
210 } else if (cp
<= 0x000007FF) {
212 out
[0] = (cp
>> 6 & 31) | 192;
213 out
[1] = (cp
& 63) | 128;
214 } else if (cp
<= 0x0000FFFF) {
216 out
[0] = (cp
>> 12 & 15) | 224;
217 out
[1] = (cp
>> 6 & 63) | 128;
218 out
[2] = (cp
& 63) | 128;
219 } else if (cp
<= 0x001FFFFF) {
221 out
[0] = (cp
>> 18 & 7) | 240;
222 out
[1] = (cp
>> 12 & 63) | 128;
223 out
[2] = (cp
>> 6 & 63) | 128;
224 out
[3] = (cp
& 63) | 128;
225 } else if (cp
<= 0x03FFFFFF) {
227 out
[0] = (cp
>> 24 & 3) | 248;
228 out
[1] = (cp
>> 18 & 63) | 128;
229 out
[2] = (cp
>> 12 & 63) | 128;
230 out
[3] = (cp
>> 6 & 63) | 128;
231 out
[4] = (cp
& 63) | 128;
232 } else if (cp
<= 0x7FFFFFFF) {
234 out
[0] = (cp
>> 30 & 1) | 252;
235 out
[1] = (cp
>> 24 & 63) | 128;
236 out
[2] = (cp
>> 18 & 63) | 128;
237 out
[3] = (cp
>> 12 & 63) | 128;
238 out
[4] = (cp
>> 6 & 63) | 128;
239 out
[5] = (cp
& 63) | 128;
248 * Normalise strings from the index and database.
249 * These strings are escaped as defined by mandoc_char(7) along with
250 * other goop in mandoc.h (e.g., soft hyphens).
251 * This function normalises these into a nice UTF-8 string.
252 * Returns 0 if the database is fucked.
255 norm_string(const char *val
, const struct mchars
*mc
, char **buf
)
259 const char *seq
, *cpp
;
262 static const char res
[] = { '\\', '\t',
263 ASCII_NBRSP
, ASCII_HYPH
, '\0' };
265 /* Pre-allocate by the length of the input */
267 bsz
= strlen(val
) + 1;
268 *buf
= mandoc_realloc(*buf
, bsz
);
271 while ('\0' != *val
) {
273 * Halt on the first escape sequence.
274 * This also halts on the end of string, in which case
275 * we just copy, fallthrough, and exit the loop.
277 if ((sz
= strcspn(val
, res
)) > 0) {
278 memcpy(&(*buf
)[pos
], val
, sz
);
283 if (ASCII_HYPH
== *val
) {
287 } else if ('\t' == *val
|| ASCII_NBRSP
== *val
) {
291 } else if ('\\' != *val
)
294 /* Read past the slash. */
300 * Parse the escape sequence and see if it's a
301 * predefined character or special character.
304 esc
= mandoc_escape(&val
, &seq
, &len
);
305 if (ESCAPE_ERROR
== esc
)
309 * XXX - this just does UTF-8, but we need to know
310 * beforehand whether we should do text substitution.
314 case (ESCAPE_SPECIAL
):
315 if (0 != (u
= mchars_spec2cp(mc
, seq
, len
)))
323 * If we have a Unicode codepoint, try to convert that
324 * to a UTF-8 byte string.
328 if (0 == (sz
= norm_utf8(u
, utfbuf
)))
331 /* Copy the rendered glyph into the stream. */
336 *buf
= mandoc_realloc(*buf
, bsz
);
338 memcpy(&(*buf
)[pos
], cpp
, sz
);
346 * Open the filename-index mandoc-db database.
347 * Returns NULL if opening failed.
354 db
= dbopen(MANDOC_IDX
, O_RDONLY
, 0, DB_RECNO
, NULL
);
362 * Safely unpack from an index file record into the structure.
363 * Returns 1 if an entry was unpacked, 0 if the database is insane.
366 index_read(const DBT
*key
, const DBT
*val
, int index
,
367 const struct mchars
*mc
, struct res
*rec
)
373 #define INDEX_BREAD(_dst) \
375 if (NULL == (np = memchr(cp, '\0', left))) \
377 norm_string(cp, mc, &(_dst)); \
378 left -= (np - cp) + 1; \
380 } while (/* CONSTCOND */ 0)
382 if (0 == (left
= val
->size
))
386 assert(sizeof(recno_t
) == key
->size
);
387 memcpy(&rec
->rec
, key
->data
, key
->size
);
390 if ('d' == (type
= *cp
++))
391 rec
->type
= RESTYPE_MDOC
;
392 else if ('a' == type
)
393 rec
->type
= RESTYPE_MAN
;
394 else if ('c' == type
)
395 rec
->type
= RESTYPE_CAT
;
400 INDEX_BREAD(rec
->file
);
401 INDEX_BREAD(rec
->cat
);
402 INDEX_BREAD(rec
->title
);
403 INDEX_BREAD(rec
->arch
);
404 INDEX_BREAD(rec
->desc
);
409 * Search mandocdb databases in paths for expression "expr".
410 * Filter out by "opts".
411 * Call "res" with the results, which may be zero.
412 * Return 0 if there was a database error, else return 1.
415 apropos_search(int pathsz
, char **paths
, const struct opts
*opts
,
416 const struct expr
*expr
, size_t terms
, void *arg
,
417 size_t *sz
, struct res
**resp
,
418 void (*res
)(struct res
*, size_t, void *))
424 memset(&tree
, 0, sizeof(struct rectree
));
431 * Main loop. Change into the directory containing manpage
432 * databases. Run our expession over each database in the set.
435 for (i
= 0; i
< pathsz
; i
++) {
436 assert('/' == paths
[i
][0]);
439 if (single_search(&tree
, opts
, expr
, terms
, mc
, i
))
442 resfree(tree
.node
, tree
.len
);
447 (*res
)(tree
.node
, tree
.len
, arg
);
455 single_search(struct rectree
*tree
, const struct opts
*opts
,
456 const struct expr
*expr
, size_t terms
,
457 struct mchars
*mc
, int vol
)
475 memset(&r
, 0, sizeof(struct res
));
477 if (NULL
== (btree
= btree_open()))
480 if (NULL
== (idx
= index_open())) {
481 (*btree
->close
)(btree
);
485 while (0 == (ch
= (*btree
->seq
)(btree
, &key
, &val
, R_NEXT
))) {
486 if ( ! btree_read(&key
, &val
, mc
, &mask
, &rec
, &buf
))
490 * See if this keyword record matches any of the
491 * expressions we have stored.
493 if ( ! exprmark(expr
, buf
, mask
, NULL
))
497 * O(log n) scan for prior records. Since a record
498 * number is unbounded, this has decent performance over
499 * a complex hash function.
502 for (leaf
= root
; leaf
>= 0; )
503 if (rec
> rs
[leaf
].rec
&&
506 else if (rec
< rs
[leaf
].rec
&&
513 * If we find a record, see if it has already evaluated
514 * to true. If it has, great, just keep going. If not,
515 * try to evaluate it now and continue anyway.
518 if (leaf
>= 0 && rs
[leaf
].rec
== rec
) {
519 if (0 == rs
[leaf
].matched
)
520 exprexec(expr
, buf
, mask
, &rs
[leaf
]);
525 * We have a new file to examine.
526 * Extract the manpage's metadata from the index
527 * database, then begin partial evaluation.
531 key
.size
= sizeof(recno_t
);
533 if (0 != (*idx
->get
)(idx
, &key
, &val
, 0))
537 if ( ! index_read(&key
, &val
, vol
, mc
, &r
))
540 /* XXX: this should be elsewhere, I guess? */
542 if (opts
->cat
&& strcasecmp(opts
->cat
, r
.cat
))
545 if (opts
->arch
&& *r
.arch
)
546 if (strcasecmp(opts
->arch
, r
.arch
))
549 tree
->node
= rs
= mandoc_realloc
550 (rs
, (tree
->len
+ 1) * sizeof(struct res
));
552 memcpy(&rs
[tree
->len
], &r
, sizeof(struct res
));
553 memset(&r
, 0, sizeof(struct res
));
554 rs
[tree
->len
].matches
=
555 mandoc_calloc(terms
, sizeof(int));
557 exprexec(expr
, buf
, mask
, &rs
[tree
->len
]);
559 /* Append to our tree. */
562 if (rec
> rs
[leaf
].rec
)
563 rs
[leaf
].rhs
= tree
->len
;
565 rs
[leaf
].lhs
= tree
->len
;
572 (*btree
->close
)(btree
);
581 resfree(struct res
*rec
, size_t sz
)
585 for (i
= 0; i
< sz
; i
++)
591 * Compile a list of straight-up terms.
592 * The arguments are re-written into ~[[:<:]]term[[:>:]], or "term"
593 * surrounded by word boundaries, then pumped through exprterm().
594 * Terms are case-insensitive.
595 * This emulates whatis(1) behaviour.
598 termcomp(int argc
, char *argv
[], size_t *tt
)
602 struct expr
*e
, *next
;
609 for (pos
= argc
- 1; pos
>= 0; pos
--) {
610 sz
= strlen(argv
[pos
]) + 18;
611 buf
= mandoc_realloc(buf
, sz
);
612 strlcpy(buf
, "Nm~[[:<:]]", sz
);
613 strlcat(buf
, argv
[pos
], sz
);
614 strlcat(buf
, "[[:>:]]", sz
);
615 if (NULL
== (next
= exprterm(buf
, 0))) {
630 * Compile a sequence of logical expressions.
631 * See apropos.1 for a grammar of this sequence.
634 exprcomp(int argc
, char *argv
[], size_t *tt
)
642 e
= exprexpr(argc
, argv
, &pos
, &lvl
, tt
);
644 if (0 == lvl
&& pos
>= argc
)
652 * Compile an array of tokens into an expression.
653 * An informal expression grammar is defined in apropos(1).
654 * Return NULL if we fail doing so. All memory will be cleaned up.
655 * Return the root of the expression sequence if alright.
658 exprexpr(int argc
, char *argv
[], int *pos
, int *lvl
, size_t *tt
)
660 struct expr
*e
, *first
, *next
;
665 for ( ; *pos
< argc
; (*pos
)++) {
669 * Close out a subexpression.
672 if (NULL
!= e
&& 0 == strcmp(")", argv
[*pos
])) {
679 * Small note: if we're just starting, don't let "-a"
680 * and "-o" be considered logical operators: they're
681 * just tokens unless pairwise joining, in which case we
682 * record their existence (or assume "OR").
686 if (NULL
!= e
&& 0 == strcmp("-a", argv
[*pos
]))
688 else if (NULL
!= e
&& 0 == strcmp("-o", argv
[*pos
]))
691 if (log
> 0 && ++(*pos
) >= argc
)
695 * Now we parse the term part. This can begin with
696 * "-i", in which case the expression is case
700 if (0 == strcmp("(", argv
[*pos
])) {
703 next
= mandoc_calloc(1, sizeof(struct expr
));
704 next
->subexpr
= exprexpr(argc
, argv
, pos
, lvl
, tt
);
705 if (NULL
== next
->subexpr
) {
709 } else if (0 == strcmp("-i", argv
[*pos
])) {
710 if (++(*pos
) >= argc
)
712 next
= exprterm(argv
[*pos
], 0);
714 next
= exprterm(argv
[*pos
], 1);
719 next
->and = log
== 1;
720 next
->index
= (int)(*tt
)++;
722 /* Append to our chain of expressions. */
740 * Parse a terminal expression with the grammar as defined in
742 * Return NULL if we fail the parse.
745 exprterm(char *buf
, int cs
)
752 memset(&e
, 0, sizeof(struct expr
));
754 /* Choose regex or substring match. */
756 if (NULL
== (e
.v
= strpbrk(buf
, "=~"))) {
760 e
.regex
= '~' == *e
.v
;
764 /* Determine the record types to search for. */
768 while (NULL
!= (key
= strsep(&buf
, ","))) {
770 while (types
[i
].mask
&&
771 strcmp(types
[i
].name
, key
))
773 e
.mask
|= types
[i
].mask
;
777 e
.mask
= TYPE_Nm
| TYPE_Nd
;
780 i
= REG_EXTENDED
| REG_NOSUB
| (cs
? 0 : REG_ICASE
);
781 if (regcomp(&e
.re
, e
.v
, i
))
785 e
.v
= mandoc_strdup(e
.v
);
787 p
= mandoc_calloc(1, sizeof(struct expr
));
788 memcpy(p
, &e
, sizeof(struct expr
));
793 exprfree(struct expr
*p
)
799 exprfree(p
->subexpr
);
810 exprmark(const struct expr
*p
, const char *cp
,
811 uint64_t mask
, int *ms
)
814 for ( ; p
; p
= p
->next
) {
816 if (exprmark(p
->subexpr
, cp
, mask
, ms
))
819 } else if ( ! (mask
& p
->mask
))
823 if (regexec(&p
->re
, cp
, 0, NULL
, 0))
825 } else if (NULL
== strcasestr(cp
, p
->v
))
838 expreval(const struct expr
*p
, int *ms
)
843 * AND has precedence over OR. Analysis is left-right, though
844 * it doesn't matter because there are no side-effects.
845 * Thus, step through pairwise ANDs and accumulate their Boolean
846 * evaluation. If we encounter a single true AND collection or
847 * standalone term, the whole expression is true (by definition
851 for (match
= 0; p
&& ! match
; p
= p
->next
) {
852 /* Evaluate a subexpression, if applicable. */
853 if (p
->subexpr
&& ! ms
[p
->index
])
854 ms
[p
->index
] = expreval(p
->subexpr
, ms
);
856 match
= ms
[p
->index
];
857 for ( ; p
->next
&& p
->next
->and; p
= p
->next
) {
858 /* Evaluate a subexpression, if applicable. */
859 if (p
->next
->subexpr
&& ! ms
[p
->next
->index
])
861 expreval(p
->next
->subexpr
, ms
);
862 match
= match
&& ms
[p
->next
->index
];
870 * First, update the array of terms for which this expression evaluates
872 * Second, logically evaluate all terms over the updated array of truth
874 * If this evaluates to true, mark the expression as satisfied.
877 exprexec(const struct expr
*e
, const char *cp
,
878 uint64_t mask
, struct res
*r
)
881 assert(0 == r
->matched
);
882 exprmark(e
, cp
, mask
, r
->matches
);
883 r
->matched
= expreval(e
, r
->matches
);