Enhance the percentile() extension function to include the median()
[sqlite.git] / test / fts3_common.tcl
blobfcd3ca3e45eb85e290b6881535cff54f5cd5438f
1 # 2009 November 04
3 # The author disclaims copyright to this source code. In place of
4 # a legal notice, here is a blessing:
6 # May you do good and not evil.
7 # May you find forgiveness for yourself and forgive others.
8 # May you share freely, never taking more than you give.
10 #***********************************************************************
12 # This file contains common code used the fts3 tests. At one point
13 # equivalent functionality was implemented in C code. But it is easier
14 # to use Tcl.
17 #-------------------------------------------------------------------------
18 # INSTRUCTIONS
20 # The following commands are available:
22 # fts3_build_db_1 N
23 # Using database handle [db] create an FTS4 table named t1 and populate
24 # it with N rows of data. N must be less than 10,000. Refer to the
25 # header comments above the proc implementation below for details.
27 # fts3_build_db_2 N
28 # Using database handle [db] create an FTS4 table named t2 and populate
29 # it with N rows of data. N must be less than 100,000. Refer to the
30 # header comments above the proc implementation below for details.
32 # fts3_integrity_check TBL
33 # TBL must be an FTS table in the database currently opened by handle
34 # [db]. This proc loads and tokenizes all documents within the table,
35 # then checks that the current contents of the FTS index matches the
36 # results.
38 # fts3_terms TBL WHERE
39 # Todo.
41 # fts3_doclist TBL TERM WHERE
42 # Todo.
47 ifcapable fts3 {
48 sqlite3_fts3_may_be_corrupt 0
51 #-------------------------------------------------------------------------
52 # USAGE: fts3_build_db_1 SWITCHES N
54 # Build a sample FTS table in the database opened by database connection
55 # [db]. The name of the new table is "t1".
57 proc fts3_build_db_1 {args} {
59 set default(-module) fts4
61 set nArg [llength $args]
62 if {($nArg%2)==0} {
63 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
66 set n [lindex $args [expr $nArg-1]]
67 array set opts [array get default]
68 array set opts [lrange $args 0 [expr $nArg-2]]
69 foreach k [array names opts] {
70 if {0==[info exists default($k)]} { error "unknown option: $k" }
73 if {$n > 10000} {error "n must be <= 10000"}
74 db eval "CREATE VIRTUAL TABLE t1 USING $opts(-module) (x, y)"
76 set xwords [list zero one two three four five six seven eight nine ten]
77 set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
79 for {set i 0} {$i < $n} {incr i} {
80 set x ""
81 set y ""
83 set x [list]
84 lappend x [lindex $xwords [expr ($i / 1000) % 10]]
85 lappend x [lindex $xwords [expr ($i / 100) % 10]]
86 lappend x [lindex $xwords [expr ($i / 10) % 10]]
87 lappend x [lindex $xwords [expr ($i / 1) % 10]]
89 set y [list]
90 lappend y [lindex $ywords [expr ($i / 1000) % 10]]
91 lappend y [lindex $ywords [expr ($i / 100) % 10]]
92 lappend y [lindex $ywords [expr ($i / 10) % 10]]
93 lappend y [lindex $ywords [expr ($i / 1) % 10]]
95 db eval { INSERT INTO t1(docid, x, y) VALUES($i, $x, $y) }
99 #-------------------------------------------------------------------------
100 # USAGE: fts3_build_db_2 N ARGS
102 # Build a sample FTS table in the database opened by database connection
103 # [db]. The name of the new table is "t2".
105 proc fts3_build_db_2 {args} {
107 set default(-module) fts4
108 set default(-extra) ""
110 set nArg [llength $args]
111 if {($nArg%2)==0} {
112 error "wrong # args: should be \"fts3_build_db_1 ?switches? n\""
115 set n [lindex $args [expr $nArg-1]]
116 array set opts [array get default]
117 array set opts [lrange $args 0 [expr $nArg-2]]
118 foreach k [array names opts] {
119 if {0==[info exists default($k)]} { error "unknown option: $k" }
122 if {$n > 100000} {error "n must be <= 100000"}
124 set sql "CREATE VIRTUAL TABLE t2 USING $opts(-module) (content"
125 if {$opts(-extra) != ""} {
126 append sql ", " $opts(-extra)
128 append sql ")"
129 db eval $sql
131 set chars [list a b c d e f g h i j k l m n o p q r s t u v w x y z ""]
133 for {set i 0} {$i < $n} {incr i} {
134 set word ""
135 set nChar [llength $chars]
136 append word [lindex $chars [expr {($i / 1) % $nChar}]]
137 append word [lindex $chars [expr {($i / $nChar) % $nChar}]]
138 append word [lindex $chars [expr {($i / ($nChar*$nChar)) % $nChar}]]
140 db eval { INSERT INTO t2(docid, content) VALUES($i, $word) }
144 #-------------------------------------------------------------------------
145 # USAGE: fts3_integrity_check TBL
147 # This proc is used to verify that the full-text index is consistent with
148 # the contents of the fts3 table. In other words, it checks that the
149 # data in the %_contents table matches that in the %_segdir and %_segments
150 # tables.
152 # This is not an efficient procedure. It uses a lot of memory and a lot
153 # of CPU. But it is better than not checking at all.
155 # The procedure is:
157 # 1) Read the entire full-text index from the %_segdir and %_segments
158 # tables into memory. For each entry in the index, the following is
159 # done:
161 # set C($iDocid,$iCol,$iPosition) $zTerm
163 # 2) Iterate through each column of each row of the %_content table.
164 # Tokenize all documents, and check that for each token there is
165 # a corresponding entry in the $C array. After checking a token,
166 # [unset] the $C array entry.
168 # 3) Check that array $C is now empty.
171 proc fts3_integrity_check {tbl} {
173 fts3_read2 $tbl 1 A
175 foreach zTerm [array names A] {
176 #puts $zTerm
177 foreach doclist $A($zTerm) {
178 set docid 0
179 while {[string length $doclist]>0} {
180 set iCol 0
181 set iPos 0
182 set lPos [list]
183 set lCol [list]
185 # First varint of a doclist-entry is the docid. Delta-compressed
186 # with respect to the docid of the previous entry.
188 incr docid [gobble_varint doclist]
189 if {[info exists D($zTerm,$docid)]} {
190 while {[set iDelta [gobble_varint doclist]] != 0} {}
191 continue
193 set D($zTerm,$docid) 1
195 # Gobble varints until the 0x00 that terminates the doclist-entry
196 # is found.
197 while {[set iDelta [gobble_varint doclist]] > 0} {
198 if {$iDelta == 1} {
199 set iCol [gobble_varint doclist]
200 set iPos 0
201 } else {
202 incr iPos $iDelta
203 incr iPos -2
204 set C($docid,$iCol,$iPos) $zTerm
211 foreach key [array names C] {
212 #puts "$key -> $C($key)"
216 db eval "SELECT * FROM ${tbl}_content" E {
217 set iCol 0
218 set iDoc $E(docid)
219 foreach col [lrange $E(*) 1 end] {
220 set c $E($col)
221 set sql {SELECT fts3_tokenizer_test('simple', $c)}
223 foreach {pos term dummy} [db one $sql] {
224 if {![info exists C($iDoc,$iCol,$pos)]} {
225 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index is missing"
226 lappend errors $es
227 } else {
228 if {[string compare $C($iDoc,$iCol,$pos) $term]} {
229 set es "Error at docid=$iDoc col=$iCol pos=$pos. Index "
230 append es "has \"$C($iDoc,$iCol,$pos)\", document has \"$term\""
231 lappend errors $es
233 unset C($iDoc,$iCol,$pos)
236 incr iCol
240 foreach c [array names C] {
241 lappend errors "Bad index entry: $c -> $C($c)"
244 if {[info exists errors]} { return [join $errors "\n"] }
245 return "ok"
248 # USAGE: fts3_terms TBL WHERE
250 # Argument TBL must be the name of an FTS3 table. Argument WHERE is an
251 # SQL expression that will be used as the WHERE clause when scanning
252 # the %_segdir table. As in the following query:
254 # "SELECT * FROM ${TBL}_segdir WHERE ${WHERE}"
256 # This function returns a list of all terms present in the segments
257 # selected by the statement above.
259 proc fts3_terms {tbl where} {
260 fts3_read $tbl $where a
261 return [lsort [array names a]]
265 # USAGE: fts3_doclist TBL TERM WHERE
267 # Argument TBL must be the name of an FTS3 table. TERM is a term that may
268 # or may not be present in the table. Argument WHERE is used to select a
269 # subset of the b-tree segments in the associated full-text index as
270 # described above for [fts3_terms].
272 # This function returns the results of merging the doclists associated
273 # with TERM in the selected segments. Each doclist is an element of the
274 # returned list. Each doclist is formatted as follows:
276 # [$docid ?$col[$off1 $off2...]?...]
278 # The formatting is odd for a Tcl command in order to be compatible with
279 # the original C-language implementation. If argument WHERE is "1", then
280 # any empty doclists are omitted from the returned list.
282 proc fts3_doclist {tbl term where} {
283 fts3_read $tbl $where a
286 foreach doclist $a($term) {
287 set docid 0
289 while {[string length $doclist]>0} {
290 set iCol 0
291 set iPos 0
292 set lPos [list]
293 set lCol [list]
294 incr docid [gobble_varint doclist]
296 while {[set iDelta [gobble_varint doclist]] > 0} {
297 if {$iDelta == 1} {
298 lappend lCol [list $iCol $lPos]
299 set iPos 0
300 set lPos [list]
301 set iCol [gobble_varint doclist]
302 } else {
303 incr iPos $iDelta
304 incr iPos -2
305 lappend lPos $iPos
309 if {[llength $lPos]>0} {
310 lappend lCol [list $iCol $lPos]
313 if {$where != "1" || [llength $lCol]>0} {
314 set ret($docid) $lCol
315 } else {
316 unset -nocomplain ret($docid)
321 set lDoc [list]
322 foreach docid [lsort -integer [array names ret]] {
323 set lCol [list]
324 set cols ""
325 foreach col $ret($docid) {
326 foreach {iCol lPos} $col {}
327 append cols " $iCol\[[join $lPos { }]\]"
329 lappend lDoc "\[${docid}${cols}\]"
332 join $lDoc " "
335 ###########################################################################
337 proc gobble_varint {varname} {
338 upvar $varname blob
339 set n [read_fts3varint $blob ret]
340 set blob [string range $blob $n end]
341 return $ret
343 proc gobble_string {varname nLength} {
344 upvar $varname blob
345 set ret [string range $blob 0 [expr $nLength-1]]
346 set blob [string range $blob $nLength end]
347 return $ret
350 # The argument is a blob of data representing an FTS3 segment leaf.
351 # Return a list consisting of alternating terms (strings) and doclists
352 # (blobs of data).
354 proc fts3_readleaf {blob} {
355 set zPrev ""
356 set terms [list]
358 while {[string length $blob] > 0} {
359 set nPrefix [gobble_varint blob]
360 set nSuffix [gobble_varint blob]
362 set zTerm [string range $zPrev 0 [expr $nPrefix-1]]
363 append zTerm [gobble_string blob $nSuffix]
364 set nDoclist [gobble_varint blob]
365 set doclist [gobble_string blob $nDoclist]
367 lappend terms $zTerm $doclist
368 set zPrev $zTerm
371 return $terms
374 proc fts3_read2 {tbl where varname} {
375 upvar $varname a
376 array unset a
377 db eval " SELECT start_block, leaves_end_block, root
378 FROM ${tbl}_segdir WHERE $where
379 ORDER BY level ASC, idx DESC
381 set c 0
382 binary scan $root c c
383 if {$c==0} {
384 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
385 } else {
386 db eval " SELECT block
387 FROM ${tbl}_segments
388 WHERE blockid>=$start_block AND blockid<=$leaves_end_block
389 ORDER BY blockid
391 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
397 proc fts3_read {tbl where varname} {
398 upvar $varname a
399 array unset a
400 db eval " SELECT start_block, leaves_end_block, root
401 FROM ${tbl}_segdir WHERE $where
402 ORDER BY level DESC, idx ASC
404 if {$start_block == 0} {
405 foreach {t d} [fts3_readleaf $root] { lappend a($t) $d }
406 } else {
407 db eval " SELECT block
408 FROM ${tbl}_segments
409 WHERE blockid>=$start_block AND blockid<$leaves_end_block
410 ORDER BY blockid
412 foreach {t d} [fts3_readleaf $block] { lappend a($t) $d }
419 ##########################################################################