/* fts2 has a design flaw which can lead to database corruption (see
** below).  It is recommended not to use it any longer, instead use
** fts3 (or higher).  If you believe that your use of fts2 is safe,
** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
*/
#if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)) \
        && !defined(SQLITE_ENABLE_BROKEN_FTS2)
#error fts2 has a design flaw and has been deprecated.
#endif
/* The flaw is that fts2 uses the content table's unaliased rowid as
** the unique docid.  fts2 embeds the rowid in the index it builds,
** and expects the rowid to not change.  The SQLite VACUUM operation
** will renumber such rowids, thereby breaking fts2.  If you are using
** fts2 in a system which has disabled VACUUM, then you can continue
** to use it safely.  Note that PRAGMA auto_vacuum does NOT disable
** VACUUM, though systems using auto_vacuum are unlikely to invoke
** VACUUM.
**
** Unlike fts1, which is safe across VACUUM if you never delete
** documents, fts2 has a second exposure to this flaw, in the segments
** table.  So fts2 should be considered unsafe across VACUUM in all
** cases.
*/
/*
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This is an SQLite module implementing full-text search.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS2 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS2 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
/* TODO(shess) Consider exporting this comment to an HTML file or the
** wiki.
*/
/* The full-text index is stored in a series of b+tree (-like)
** structures called segments which map terms to doclists.  The
** structures are like b+trees in layout, but are constructed from the
** bottom up in optimal fashion and are not updatable.  Since trees
** are built from the bottom up, things will be described from the
** bottom up.
**
**
**** Varints ****
** The basic unit of encoding is a variable-length integer called a
** varint.  We encode variable-length integers in little-endian order
** using seven bits per byte as follows:
**
** KEY:
**         A = 0xxxxxxx    7 bits of data and one flag bit
**         B = 1xxxxxxx    7 bits of data and one flag bit
**
**  7 bits - A
** 14 bits - BA
** 21 bits - BBA
** 64 bits - BBBBBBBBBA
**
** This is identical to how sqlite encodes varints (see util.c).
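**
** For example, the value 300 (0x12C) encodes in two bytes: the low
** seven bits 0x2C are written first with the continuation bit set
** (giving 0xAC), then the remaining bits 0x02 with the high bit
** clear (giving 0x02).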
**
**** Document lists ****
** A doclist (document list) holds a docid-sorted list of hits for a
** given term.  Doclists hold docids, and can optionally associate
** token positions and offsets with docids.
**
** A DL_POSITIONS_OFFSETS doclist is stored like this:
**
** array {
**   varint docid;
**   array {                (position list for column 0)
**     varint position;     (delta from previous position plus POS_BASE)
**     varint startOffset;  (delta from previous startOffset)
**     varint endOffset;    (delta from startOffset)
**   }
**   array {
**     varint POS_COLUMN;   (marks start of position list for new column)
**     varint column;       (index of new column)
**     array {
**       varint position;   (delta from previous position plus POS_BASE)
**       varint startOffset;(delta from previous startOffset)
**       varint endOffset;  (delta from startOffset)
**     }
**   }
**   varint POS_END;        (marks end of positions for this document)
** }
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.  A "position" is an index of a token in the token stream
** generated by the tokenizer, while an "offset" is a byte offset,
** both based at 0.  Note that POS_END and POS_COLUMN occur in the
** same logical place as the position element, and act as sentinels
** ending a position list array.
**
** A DL_POSITIONS doclist omits the startOffset and endOffset
** information.  A DL_DOCIDS doclist omits both the position and
** offset information, becoming an array of varint-encoded docids.
**
** On-disk data is stored as type DL_DEFAULT, so we don't serialize
** the type.  Due to how deletion is implemented in the segmentation
** system, on-disk doclists MUST store at least positions.
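**
** For example, with POS_BASE==2 (see the enum below), a DL_POSITIONS
** doclist for a term hitting docid 7 at position 4 of column 0 is
** the three varints 0x07 (docid), 0x06 (POS_BASE+4), 0x00 (POS_END).
** A second hit in docid 10 at position 1 would append 0x03 (docid
** delta), 0x03 (POS_BASE+1), 0x00.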
**
**** Segment leaf nodes ****
** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
** nodes are written using LeafWriter, and read using LeafReader (to
** iterate through a single leaf node's data) and LeavesReader (to
** iterate through a segment's entire leaf layer).  Leaf nodes have
** the format:
**
** varint iHeight;             (height from leaf level, always 0)
** varint nTerm;               (length of first term)
** char pTerm[nTerm];          (content of first term)
** varint nDoclist;            (length of term's associated doclist)
** char pDoclist[nDoclist];    (content of doclist)
** array {
**                             (further terms are delta-encoded)
**   varint nPrefix;           (length of prefix shared with previous term)
**   varint nSuffix;           (length of unshared suffix)
**   char pTermSuffix[nSuffix];(unshared suffix of next term)
**   varint nDoclist;          (length of term's associated doclist)
**   char pDoclist[nDoclist];  (content of doclist)
** }
**
** Here, array { X } means zero or more occurrences of X, adjacent in
** memory.
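**
** For example, if the first term is "linear" and the next term is
** "linearly", the second entry is encoded as nPrefix=6, nSuffix=2,
** pTermSuffix="ly".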
**
** Leaf nodes are broken into blocks which are stored contiguously in
** the %_segments table in sorted order.  This means that when the end
** of a node is reached, the next term is in the node with the next
** greater blockid.
**
** New data is spilled to a new leaf node when the current node
** exceeds LEAF_MAX bytes (default 2048).  New data which itself is
** larger than STANDALONE_MIN (default 1024) is placed in a standalone
** node (a leaf node with a single term and doclist).  The goal of
** these settings is to pack together groups of small doclists while
** making it efficient to directly access large doclists.  The
** assumption is that large doclists represent terms which are more
** likely to be query targets.
**
** TODO(shess) It may be useful for blocking decisions to be more
** dynamic.  For instance, it may make more sense to have a 2.5k leaf
** node rather than splitting into 2k and .5k nodes.  My intuition is
** that this might extend through 2x or 4x the pagesize.
**
**
**** Segment interior nodes ****
** Segment interior nodes store blockids for subtree nodes and terms
** to describe what data is stored by each subtree.  Interior nodes
** are written using InteriorWriter, and read using InteriorReader.
** InteriorWriters are created as needed when SegmentWriter creates
** new leaf nodes, or when an interior node itself grows too big and
** must be split.  The format of interior nodes:
**
** varint iHeight;           (height from leaf level, always >0)
** varint iBlockid;          (block id of node's leftmost subtree)
** optional {
**   varint nTerm;           (length of first term)
**   char pTerm[nTerm];      (content of first term)
**   array {
**                                (further terms are delta-encoded)
**     varint nPrefix;            (length of shared prefix with previous term)
**     varint nSuffix;            (length of unshared suffix)
**     char pTermSuffix[nSuffix]; (unshared suffix of next term)
**   }
** }
**
** Here, optional { X } means an optional element, while array { X }
** means zero or more occurrences of X, adjacent in memory.
**
** An interior node encodes n terms separating n+1 subtrees.  The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded.  The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i.  Interior nodes only store enough term data to
** distinguish adjacent children (if the rightmost term of the left
** child is "something", and the leftmost term of the right child is
** "wicked", only "w" is stored).
**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny.  The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
**
**
**** Segment directory ****
** The segment directory in table %_segdir stores meta-information for
** merging and deleting segments, and also the root node of the
** segment's tree.
**
** The root node is the top node of the segment's tree after encoding
** the entire segment, restricted to ROOT_MAX bytes (default 1024).
** This could be either a leaf node or an interior node.  If the top
** node requires more than ROOT_MAX bytes, it is flushed to %_segments
** and a new root interior node is generated (which should always fit
** within ROOT_MAX because it only needs space for 2 varints, the
** height and the blockid of the previous root).
**
** The meta-information in the segment directory is:
**   level             - segment level (see below)
**   idx               - index within level
**                     - (level,idx uniquely identify a segment)
**   start_block       - first leaf node
**   leaves_end_block  - last leaf node
**   end_block         - last block (including interior nodes)
**   root              - contents of root node
**
** If the root node is a leaf node, then start_block,
** leaves_end_block, and end_block are all 0.
**
**
**** Segment merging ****
** To amortize update costs, segments are grouped into levels and
** merged in batches.  Each increase in level represents exponentially
** more documents.
**
** New documents (actually, document updates) are tokenized and
** written individually (using LeafWriter) to a level 0 segment, with
** incrementing idx.  When idx reaches MERGE_COUNT (default 16), all
** level 0 segments are merged into a single level 1 segment.  Level 1
** is populated like level 0, and eventually MERGE_COUNT level 1
** segments are merged to a single level 2 segment (representing
** MERGE_COUNT^2 updates), and so on.
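**
** To make the arithmetic concrete: with MERGE_COUNT of 16, a level-n
** segment represents 16^n document updates, so levels 0 through 4
** cover 1, 16, 256, 4096, and 65536 updates respectively.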
**
** A segment merge traverses all segments at a given level in
** parallel, performing a straightforward sorted merge.  Since segment
** leaf nodes are written into the %_segments table in order, this
** merge traverses the underlying sqlite disk structures efficiently.
** After the merge, all segment blocks from the merged level are
** deleted.
**
** MERGE_COUNT controls how often we merge segments.  16 seems to be
** somewhat of a sweet spot for insertion performance.  32 and 64 show
** very similar performance numbers to 16 on insertion, though they're
** a tiny bit slower (perhaps due to more overhead in merge-time
** sorting).  8 is about 20% slower than 16, 4 about 50% slower than
** 16, 2 about 66% slower than 16.
**
** At query time, high MERGE_COUNT increases the number of segments
** which need to be scanned and merged.  For instance, with 100k docs
** inserted, the number of live segments to consult at query time
** grows with MERGE_COUNT.
**
** This appears to have only a moderate impact on queries for very
** frequent terms (which are somewhat dominated by segment merge
** costs), and infrequent and non-existent terms still seem to be fast
** even with many segments.
**
** TODO(shess) That said, it would be nice to have a better query-side
** argument for MERGE_COUNT of 16.  Also, it is possible/likely that
** optimizations to things like doclist merging will swing the sweet
** spot around.
**
**
**** Handling of deletions and updates ****
** Since we're using a segmented structure, with no docid-oriented
** index into the term index, we clearly cannot simply update the term
** index when a document is deleted or updated.  For deletions, we
** write an empty doclist (varint(docid) varint(POS_END)), for updates
** we simply write the new doclist.  Segment merges overwrite older
** data for a particular docid with newer data, so deletes or updates
** will eventually overtake the earlier data and knock it out.  The
** query logic likewise merges doclists so that newer data knocks out
** older data.
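**
** For example, deleting docid 42 appends the two bytes 0x2A 0x00
** (varint(42) followed by varint(POS_END)) as that document's entire
** doclist entry.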
**
** TODO(shess) Provide a VACUUM type operation to clear out all
** deletions and duplications.  This would basically be a forced merge
** into a single segment.
*/

#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)

#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts2.h"
#include "fts2_hash.h"
#include "fts2_tokenizer.h"
#include "sqlite3.h"
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1

/* TODO(shess) MAN, this thing needs some refactoring.  At minimum, it
** would be nice to order the file better, perhaps something along the
** lines of:
**
** - utility functions
** - table setup functions
** - table update functions
** - table query functions
**
** Put the query functions last because they're likely to reference
** typedefs or functions from the table update section.
*/

#if 0
# define TRACE(A)  printf A; fflush(stdout)
#else
# define TRACE(A)
#endif

/* It is not safe to call isspace(), tolower(), or isalnum() on
** hi-bit-set characters.  This is the same solution used in the
** tokenizer.
*/
/* TODO(shess) The snippet-generation code should be using the
** tokenizer-generated tokens rather than doing its own local
** tokenization.
*/
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
static int safe_isspace(char c){
  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
}
static int safe_tolower(char c){
  return (c>='A' && c<='Z') ? (c - 'A' + 'a') : c;
}
static int safe_isalnum(char c){
  return (c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z');
}

typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;

/*
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif

enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};

/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16

/* utility functions */

/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
** record to prevent errors of the form:
**
** my_function(SomeType *b){
**   memset(b, '\0', sizeof(b));  // sizeof(b)!=sizeof(*b)
** }
*/
/* TODO(shess) Obvious candidates for a header file. */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))
#ifndef NDEBUG
# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
# define SCRAMBLE(b)
#endif

/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10

/* Write a 64-bit variable-length integer to memory starting at p[0].
 * The length of data written will be between 1 and VARINT_MAX bytes.
 * The number of bytes written is returned. */
static int putVarint(char *p, sqlite_int64 v){
  unsigned char *q = (unsigned char *) p;
  sqlite_uint64 vu = v;
  do{
    *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
    vu >>= 7;
  }while( vu!=0 );
  q[-1] &= 0x7f;  /* turn off high bit in final byte */
  assert( q - (unsigned char *)p <= VARINT_MAX );
  return (int) (q - (unsigned char *)p);
}

/* Read a 64-bit variable-length integer from memory starting at p[0].
 * Return the number of bytes read, or 0 on error.
 * The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}

static int getVarint32(const char *p, int *pi){
  sqlite_int64 i;
  int ret = getVarint(p, &i);
  *pi = (int) i;
  assert( *pi==i );
  return ret;
}
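
/* A quick round-trip sketch of the varint routines above (illustrative
** only; mirrors the worked example in the header comment):
**
**   char buf[VARINT_MAX];
**   sqlite_int64 v;
**   int nWrote = putVarint(buf, 300);   // writes 0xAC 0x02
**   int nRead = getVarint(buf, &v);
**   assert( nWrote==2 && nRead==2 && v==300 );
*/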

/*******************************************************************/
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferSwap - swap contents of two buffers.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;

static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  assert( nCapacity>=0 );
  pBuffer->nData = 0;
  pBuffer->nCapacity = nCapacity;
  pBuffer->pData = nCapacity==0 ? NULL : sqlite3_malloc(nCapacity);
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
  DataBuffer tmp = *pBuffer1;
  *pBuffer1 = *pBuffer2;
  *pBuffer2 = tmp;
}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
static void dataBufferAppend(DataBuffer *pBuffer,
                             const char *pSource, int nSource){
  assert( nSource>0 && pSource!=NULL );
  dataBufferExpand(pBuffer, nSource);
  memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  pBuffer->nData += nSource;
}
static void dataBufferAppend2(DataBuffer *pBuffer,
                              const char *pSource1, int nSource1,
                              const char *pSource2, int nSource2){
  assert( nSource1>0 && pSource1!=NULL );
  assert( nSource2>0 && pSource2!=NULL );
  dataBufferExpand(pBuffer, nSource1+nSource2);
  memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  pBuffer->nData += nSource1+nSource2;
}
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}
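
/* Typical use of the DataBuffer API above (illustrative sketch):
**
**   DataBuffer buf;
**   dataBufferInit(&buf, 0);              // empty, no allocation yet
**   dataBufferAppend(&buf, "hello ", 6);
**   dataBufferAppend(&buf, "world", 5);
**   assert( buf.nData==11 );
**   dataBufferDestroy(&buf);
*/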

/* StringBuffer is a null-terminated version of DataBuffer. */
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;

static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}

static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    /* Overwrite the trailing nul, then append zFrom plus a new nul. */
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}

/* Append a list of strings separated by commas. */
static void appendList(StringBuffer *sb, int nString, char **azString){
  int i;
  for(i=0; i<nString; ++i){
    if( i>0 ) append(sb, ", ");
    append(sb, azString[i]);
  }
}

static int endsInWhiteSpace(StringBuffer *p){
  return stringBufferLength(p)>0 &&
    safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
}

/* If the StringBuffer ends in something other than white space, add a
** single space character to the end.
*/
static void appendWhiteSpace(StringBuffer *p){
  if( stringBufferLength(p)==0 ) return;
  if( !endsInWhiteSpace(p) ) append(p, " ");
}

/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    p->b.pData[--p->b.nData-1] = '\0';
  }
}

/*******************************************************************/
/* DLReader is used to read document elements from a doclist.  The
** current docid is cached, so dlrDocid() is fast.  DLReader does not
** own the doclist buffer.
**
** dlrAtEnd - true if there's no more data to read.
** dlrDocid - docid of current document.
** dlrDocData - doclist data for current document (including docid).
** dlrDocDataBytes - length of same.
** dlrAllDataBytes - length of all remaining data.
** dlrPosData - position data for current document.
** dlrPosDataLen - length of pos data for current document (incl POS_END).
** dlrStep - step to the next document.
** dlrInit - initialize for a doclist of the given type over given data.
** dlrDestroy - clean up.
**
** Expected usage is something like:
**
**   DLReader reader;
**   dlrInit(&reader, DL_DEFAULT, pData, nData);
**   while( !dlrAtEnd(&reader) ){
**     // calls to dlrDocid() and kin.
**     dlrStep(&reader);
**   }
**   dlrDestroy(&reader);
*/
typedef struct DLReader {
  DocListType iType;
  const char *pData;
  int nData;

  sqlite_int64 iDocid;
  int nElement;
} DLReader;

static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
/* TODO(shess) Consider adding a field to track iDocid varint length
** to make these two functions faster.  This might matter (a tiny bit)
** for queries.
*/
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
static void dlrInit(DLReader *pReader, DocListType iType,
                    const char *pData, int nData){
  assert( pData!=NULL && nData!=0 );
  pReader->iType = iType;
  pReader->pData = pData;
  pReader->nData = nData;
  pReader->nElement = 0;
  pReader->iDocid = 0;

  /* Load the first element's data.  There must be a first element. */
  dlrStep(pReader);
}
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}

#ifndef NDEBUG
/* Verify that the doclist can be validly decoded.  Also returns the
** last docid found because it is convenient in other assertions for
** DLWriter.
*/
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
#else
#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
#endif

/*******************************************************************/
/* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
** always appends to the buffer and does not own it.
**
** dlwInit - initialize to write a given type doclist to a buffer.
** dlwDestroy - clear the writer's memory.  Does not free buffer.
** dlwAppend - append raw doclist data to buffer.
** dlwCopy - copy next doclist from reader to writer.
** dlwAdd - construct doclist element and append to buffer.
**    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
*/
typedef struct DLWriter {
  DocListType iType;
  DataBuffer *b;
  sqlite_int64 iPrevDocid;
#ifndef NDEBUG
  int has_iPrevDocid;
#endif
} DLWriter;

static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  pWriter->b = b;
  pWriter->iType = iType;
  pWriter->iPrevDocid = 0;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 0;
#endif
}
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
/* iFirstDocid is the first docid in the doclist in pData.  It is
** needed because pData may point within a larger doclist, in which
** case the first item would be delta-encoded.
**
** iLastDocid is the final docid in the doclist in pData.  It is
** needed to create the new iPrevDocid for future delta-encoding.  The
** code could decode the passed doclist to recreate iLastDocid, but
** the only current user (docListMerge) already has decoded this
** information.
*/
/* TODO(shess) This has become just a helper for docListMerge.
** Consider a refactor to make this cleaner.
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}

/*******************************************************************/
/* PLReader is used to read data from a document's position list.  As
** the caller steps through the list, data is cached so that varints
** only need to be decoded once.
**
** plrInit, plrDestroy - create/destroy a reader.
** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
** plrAtEnd - at end of stream, only call plrDestroy once true.
** plrStep - step to the next element.
*/
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;
  int iColumn;         /* the last column read */
  int iPosition;       /* the last position read */
  int iStartOffset;    /* the last start offset read */
  int iEndOffset;      /* the last end offset read */
} PLReader;

static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}

static void plrInit(PLReader *pReader, DLReader *pDLReader){
  pReader->pData = dlrPosData(pDLReader);
  pReader->nData = dlrPosDataLen(pDLReader);
  pReader->iType = pDLReader->iType;
  pReader->iColumn = 0;
  pReader->iPosition = 0;
  pReader->iStartOffset = 0;
  pReader->iEndOffset = 0;
  plrStep(pReader);
}
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}

/*******************************************************************/
/* PLWriter is used in constructing a document's position list.  As a
** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
** PLWriter writes to the associated DLWriter's buffer.
**
** plwInit - init for writing a document's poslist.
** plwDestroy - clear a writer.
** plwAdd - append position and offset information.
** plwCopy - copy next position's data from reader to writer.
** plwTerminate - add any necessary doclist terminator.
**
** Calling plwAdd() after plwTerminate() may result in a corrupt
** doclist.
*/
/* TODO(shess) Until we've written the second item, we can cache the
** first item's information.  Then we'd have three states:
**
** - initialized with docid, no positions.
** - docid and one position.
** - docid and multiple positions.
**
** Only the last state needs to actually write to dlw->b, which would
** be an improvement in the DLCollector case.
*/
typedef struct PLWriter {
  DLWriter *dlw;

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written */
  int iOffset;    /* the last start offset written */
} PLWriter;

/* TODO(shess) In the case where the parent is reading these values
** from a PLReader, we could optimize to a copy if that PLReader has
** the same type as pWriter.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
/* TODO(shess) Should plwDestroy() also terminate the doclist?  But
** then plwDestroy() would no longer be just a destructor, it would
** also be doing work, which isn't consistent with the overall idiom.
** Another option would be for plwAdd() to always append any necessary
** terminator, so that the output is always correct.  But that would
** add incremental work to the common case with the only benefit being
** API elegance.  Punt for now.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}

/*******************************************************************/
/* DLCollector wraps PLWriter and DLWriter to provide a
** dynamically-allocated doclist area to use during tokenization.
**
** dlcNew - malloc up and initialize a collector.
** dlcDelete - destroy a collector and all contained items.
** dlcAddPos - append position and offset information.
** dlcAddDoclist - add the collected doclist to the given buffer.
** dlcNext - terminate the current document and open another.
*/
typedef struct DLCollector {
  DataBuffer b;
  DLWriter dlw;
  PLWriter plw;
} DLCollector;

/* TODO(shess) This could also be done by calling plwTerminate() and
** dataBufferAppend().  I tried that, expecting nominal performance
** differences, but it seemed to pretty reliably be worth 1% to code
** it this way.  I suspect it is the incremental malloc overhead (some
** percentage of the plwTerminate() calls will cause a realloc), so
** this might be worth revisiting if the DataBuffer implementation
** changes.
*/
static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
  if( pCollector->dlw.iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
  }else{
    dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
  }
}
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}

static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
  DLCollector *pCollector = sqlite3_malloc(sizeof(DLCollector));
  dataBufferInit(&pCollector->b, 0);
  dlwInit(&pCollector->dlw, iType, &pCollector->b);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
  return pCollector;
}
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  sqlite3_free(pCollector);
}

/* Copy the doclist data of iType in pData/nData into *out, trimming
** unnecessary data as we go.  Only columns matching iColumn are
** copied, all columns copied if iColumn is -1.  Elements with no
** matching columns are dropped.  The output is an iOutType doclist.
*/
/* NOTE(shess) This code is only valid after all doclists are merged.
** If this is run before merges, then doclist items which represent
** deletion will be trimmed, and will thus not effect a deletion
** during the merge.
*/
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}

/* Used by docListMerge() to keep doclists in the ascending order by
** docid, then ascending order by age (so the newest comes first).
*/
typedef struct OrderedDLReader {
  DLReader *pReader;

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;
} OrderedDLReader;

/* Order eof to end, then by docid asc, idx desc. */
static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
  if( dlrAtEnd(r1->pReader) ){
    if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
    return 1;                              /* Only r1 atEnd(). */
  }
  if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */

  if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
  if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;

  /* Descending on idx. */
  return r2->idx-r1->idx;
}

/* Bubble p[0] to appropriate place in p[1..n-1].  Assumes that
** p[1..n-1] is already sorted.
*/
/* TODO(shess) Is this frequent enough to warrant a binary search?
** Before implementing that, instrument the code to check.  In most
** current usage, I expect that p[0] will be less than p[1] a very
** high proportion of the time.
*/
static void orderedDLReaderReorder(OrderedDLReader *p, int n){
  while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
    OrderedDLReader tmp = p[0];
    p[0] = p[1];
    p[1] = tmp;
    n--;
    p++;
  }
}

/* Given an array of doclist readers, merge their doclist elements
** into out in sorted order (by docid), dropping elements from older
** readers when there is a duplicate docid.  pReaders is assumed to be
** ordered by age, oldest first.
*/
/* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
** be fixed.
*/
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;
  int nStart = 0;
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;

  assert( nReaders>0 );
  if( nReaders==1 ){
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order. */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order. */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}

/* Helper function for posListUnion().  Compares the current position
** between left and right, returning as standard C idiom of <0 if
** left<right, >0 if left>right, and 0 if left==right.  "End" always
** compares greater.
*/
static int posListCmp(PLReader *pLeft, PLReader *pRight){
  assert( pLeft->iType==pRight->iType );
  if( pLeft->iType==DL_DOCIDS ) return 0;

  if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
  if( plrAtEnd(pRight) ) return -1;

  if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
  if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;

  if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
  if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
  if( pLeft->iType==DL_POSITIONS ) return 0;

  if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
  if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;

  if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
  if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;

  return 0;
}

/* Write the union of position lists in pLeft and pRight to pOut.
** "Union" in this case meaning "All unique position tuples".  Should
** work with any doclist type, though both inputs and the output
** should be the same type.
*/
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}

/* Write the union of doclists in pLeft and pRight to pOut.  For
** docids in common between the inputs, the union of the position
** lists is written.  Inputs and outputs are always type DL_DEFAULT.
*/
static void docListUnion(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
  dlrInit(&right, DL_DEFAULT, pRight, nRight);
  dlwInit(&writer, DL_DEFAULT, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwCopy(&writer, &left);
      dlrStep(&left);
    }else if( dlrDocid(&left)>dlrDocid(&right) ){
      dlwCopy(&writer, &right);
      dlrStep(&right);
    }else{
      posListUnion(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* pLeft and pRight are DLReaders positioned to the same docid.
**
** If there are no instances in pLeft or pRight where the position
** of pLeft is one less than the position of pRight, then this
** routine adds nothing to pOut.
**
** If there are one or more instances where positions from pLeft
** are exactly one less than positions from pRight, then add a new
** document record to pOut.  If pOut wants to hold positions, then
** include the positions from pRight that are one more than a
** position in pLeft.  In other words: pRight.iPos==pLeft.iPos+1.
*/
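/* For example, when merging the phrase "quick brown", if "quick" has
** a hit at position 3 of some document and "brown" a hit at position
** 4 of the same column, that document is emitted with position 4
** (the right-hand term's position).
*/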
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      if( !match ){
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}

/* We have two doclists with positions: pLeft and pRight.
** Write the phrase intersection of these two doclists into pOut.
**
** A phrase intersection means that two documents only match
** if pLeft.iPos+1==pRight.iPos.
**
** iType controls the type of data written to pOut.  If iType is
** DL_POSITIONS, the positions are those from pRight.
*/
static void docListPhraseMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DocListType iType,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 || nRight==0 ) return;

  assert( iType!=DL_POSITIONS_OFFSETS );

  dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
  dlrInit(&right, DL_POSITIONS, pRight, nRight);
  dlwInit(&writer, iType, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      posListPhraseMerge(&left, &right, &writer);
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the intersection of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListAndMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 || nRight==0 ) return;

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
    if( dlrDocid(&left)<dlrDocid(&right) ){
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write the union of these two doclists into pOut as a
** DL_DOCIDS doclist.
*/
static void docListOrMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ){
    if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
    return;
  }
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
    if( dlrAtEnd(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrAtEnd(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else if( dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
    }else if( dlrDocid(&right)<dlrDocid(&left) ){
      dlwAdd(&writer, dlrDocid(&right));
      dlrStep(&right);
    }else{
      dlwAdd(&writer, dlrDocid(&left));
      dlrStep(&left);
      dlrStep(&right);
    }
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

/* We have two DL_DOCIDS doclists: pLeft and pRight.
** Write into pOut as DL_DOCIDS doclist containing all documents that
** occur in pLeft but not in pRight.
*/
static void docListExceptMerge(
  const char *pLeft, int nLeft,
  const char *pRight, int nRight,
  DataBuffer *pOut      /* Write the combined doclist here */
){
  DLReader left, right;
  DLWriter writer;

  if( nLeft==0 ) return;
  if( nRight==0 ){
    dataBufferAppend(pOut, pLeft, nLeft);
    return;
  }

  dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
  dlrInit(&right, DL_DOCIDS, pRight, nRight);
  dlwInit(&writer, DL_DOCIDS, pOut);

  while( !dlrAtEnd(&left) ){
    while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
      dlrStep(&right);
    }
    if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
      dlwAdd(&writer, dlrDocid(&left));
    }
    dlrStep(&left);
  }

  dlrDestroy(&left);
  dlrDestroy(&right);
  dlwDestroy(&writer);
}

static char *string_dup_n(const char *s, int n){
  char *str = sqlite3_malloc(n + 1);
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}

/* Duplicate a string; the caller must free() the returned string.
 * (We don't use strdup() since it is not part of the standard C library and
 * may not be available everywhere.) */
static char *string_dup(const char *s){
  return string_dup_n(s, strlen(s));
}

/* Format a string, replacing each occurrence of the % character with
 * zDb.zName.  This may be more convenient than sqlite_mprintf()
 * when one string is used repeatedly in a format string.
 * The caller must free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;
  char *result;
  char *r;

  /* first compute length needed */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = sqlite3_malloc(len);
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
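
/* For instance (illustrative): string_format("drop table %_content",
** "main", "t") returns "drop table main.t_content"; the caller frees
** the result.
*/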

static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
                    const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  TRACE(("FTS2 sql: %s\n", zCommand));
  rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
  sqlite3_free(zCommand);
  return rc;
}

static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
                       sqlite3_stmt **ppStmt, const char *zFormat){
  char *zCommand = string_format(zFormat, zDb, zName);
  int rc;
  TRACE(("FTS2 prepare: %s\n", zCommand));
  rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
  sqlite3_free(zCommand);
  return rc;
}

/* end utility functions */

/* Forward reference */
typedef struct fulltext_vtab fulltext_vtab;

/* A single term in a query is represented by an instance of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;

/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this.  (Perhaps the
 * feeling is that a parenthesized expression is too complex of
 * an idea for the average user to grasp.)  Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do.  So, for example:
 *
 *  [one two OR three]     ==>    one AND (two OR three)
 *  [one OR two three]     ==>    (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space.  This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *  [one OR -two]          ==>    one OR two
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;

/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;      /* Status flag for use while constructing snippets */
    short int iCol;     /* The column that contains the match */
    short int iTerm;    /* The index in Query.pTerms[] of the matching term */
    short int nByte;    /* Number of bytes in the term */
    int iStart;         /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;

typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i */
} QueryType;
1770 typedef enum fulltext_statement
{
1771 CONTENT_INSERT_STMT
,
1772 CONTENT_SELECT_STMT
,
1773 CONTENT_UPDATE_STMT
,
1774 CONTENT_DELETE_STMT
,
1775 CONTENT_EXISTS_STMT
,
1780 BLOCK_DELETE_ALL_STMT
,
1782 SEGDIR_MAX_INDEX_STMT
,
1784 SEGDIR_SELECT_LEVEL_STMT
,
1787 SEGDIR_SELECT_SEGMENT_STMT
,
1788 SEGDIR_SELECT_ALL_STMT
,
1789 SEGDIR_DELETE_ALL_STMT
,
1792 MAX_STMT
/* Always at end! */
1793 } fulltext_statement
;
/* These must exactly match the enum above. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g. if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
  /* CONTENT_EXISTS */ "select rowid from %_content limit 1",

  /* BLOCK_INSERT */ "insert into %_segments values (?)",
  /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",
  /* BLOCK_DELETE_ALL */ "delete from %_segments",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT_LEVEL */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",

  /* NOTE(shess): The first three results of the following two
  ** statements must match.
  */
  /* SEGDIR_SELECT_SEGMENT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? and idx = ?",
  /* SEGDIR_SELECT_ALL */
  "select start_block, leaves_end_block, root from %_segdir "
  " order by level desc, idx asc",
  /* SEGDIR_DELETE_ALL */ "delete from %_segdir",
  /* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
};
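/* A note on the templates above (illustrative, not from the original
** comments): sql_prepare() expands the leading "%" of each table name to
** the database-qualified virtual table name before preparing.  So for a
** virtual table "mail" in database "main", "delete from %_content where
** rowid = ?" is prepared roughly as
** "delete from main.mail_content where rowid = ?".
*/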
/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names.  malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open.
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];

  /* Precompiled statements used for segment merges.  We run a
  ** separate select across the leaf level of each tree being merged.
  */
  sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
  /* The statement used to prepare pLeafSelectStmts. */
#define LEAF_SELECT \
  "select block from %_segments where rowid between ? and ? order by rowid"

  /* These buffer pending index updates during transactions.
  ** nPendingData estimates the memory size of the pending data.  It
  ** doesn't include the hash-bucket overhead, nor any malloc
  ** overhead.  When nPendingData exceeds kPendingThreshold, the
  ** buffer is flushed even before the transaction closes.
  ** pendingTerms stores the data, and is only valid when nPendingData
  ** is >=0 (nPendingData<0 means pendingTerms has not been
  ** initialized).  iPrevDocid is the last docid written, used to make
  ** certain we're inserting in sorted order.
  */
  int nPendingData;
#define kPendingThreshold (1*1024*1024)
  sqlite_int64 iPrevDocid;
  fts2Hash pendingTerms;
};
/*
** When the core wants to do a query, it creates a cursor using a
** call to xOpen.  This structure is an instance of a cursor.  It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
  QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
  int eof;                         /* True if at End Of Results */
  Query q;                         /* Parsed query string */
  Snippet snippet;                 /* Cached snippet for the current row */
  int iColumn;                     /* Column being searched */
  DataBuffer result;               /* Doclist results from fulltextQuery */
  DLReader reader;                 /* Result reader if result not empty */
} fulltext_cursor;
static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
  return (fulltext_vtab *) c->base.pVtab;
}

static const sqlite3_module fts2Module;   /* forward declaration */
/* Return a dynamically generated statement of the form
 *   insert into %_content (rowid, ...) values (?, ...)
 */
static const char *contentInsertStatement(fulltext_vtab *v){
  StringBuffer sb;
  int i;

  initStringBuffer(&sb);
  append(&sb, "insert into %_content (rowid, ");
  appendList(&sb, v->nColumn, v->azContentColumn);
  append(&sb, ") values (?");
  for(i=0; i<v->nColumn; ++i)
    append(&sb, ", ?");
  append(&sb, ")");
  return stringBufferData(&sb);
}
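/* Illustrative sketch of the generated statement: for a table created
** with columns (subject, body), contentInsertStatement() yields
**
**   insert into %_content (rowid, c0subject, c1body) values (?, ?, ?)
**
** (the cNNAAAA content-column naming scheme is described in parseSpec()
** below).
*/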
/* Return a dynamically generated statement of the form
 *   update %_content set [col_0] = ?, [col_1] = ?, ...
 *   where rowid = ?
 */
static const char *contentUpdateStatement(fulltext_vtab *v){
  StringBuffer sb;
  int i;

  initStringBuffer(&sb);
  append(&sb, "update %_content set ");
  for(i=0; i<v->nColumn; ++i) {
    if( i>0 ){
      append(&sb, ", ");
    }
    append(&sb, v->azContentColumn[i]);
    append(&sb, " = ?");
  }
  append(&sb, " where rowid = ?");
  return stringBufferData(&sb);
}
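/* Illustrative sketch: for the same hypothetical (subject, body) table,
** contentUpdateStatement() yields
**
**   update %_content set c0subject = ?, c1body = ? where rowid = ?
*/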
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                     zStmt);
    if( zStmt != fulltext_zStatement[iStmt]) sqlite3_free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
** SQLITE_ROW to SQLITE_ERROR.  Useful for statements like UPDATE,
** where we expect no results.
*/
static int sql_single_step(sqlite3_stmt *s){
  int rc = sqlite3_step(s);
  return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
}
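/* Typical usage of the two helpers above (an illustrative sketch that
** mirrors the write paths below, e.g. content_delete()):
**
**   sqlite3_stmt *s;
**   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
**   if( rc!=SQLITE_OK ) return rc;
**   rc = sqlite3_bind_int64(s, 1, iRow);
**   if( rc!=SQLITE_OK ) return rc;
**   return sql_single_step(s);
*/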
/* Like sql_get_statement(), but for special replicated LEAF_SELECT
** statements.  idx -1 is a special case for an uncached version of
** the statement (used in the optimize implementation).
*/
/* TODO(shess) Write version for generic statements and then share
** that between the cached-statement functions.
*/
static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
                                  sqlite3_stmt **ppStmt){
  assert( idx>=-1 && idx<MERGE_COUNT );
  if( idx==-1 ){
    return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
  }else if( v->pLeafSelectStmts[idx]==NULL ){
    int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
                         LEAF_SELECT);
    if( rc!=SQLITE_OK ) return rc;
  }else{
    int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pLeafSelectStmts[idx];
  return SQLITE_OK;
}
/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
                          sqlite3_value **pValues){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_value(s, 1, rowid);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  return sql_single_step(s);
}
/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 *                  where rowid = [iRowid] */
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                          sqlite_int64 iRowid){
  sqlite3_stmt *s;
  int i;
  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  for(i=0; i<v->nColumn; ++i){
    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
    if( rc!=SQLITE_OK ) return rc;
  }

  rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
static void freeStringArray(int nString, const char **pString){
  int i;

  for (i=0 ; i < nString ; ++i) {
    if( pString[i]!=NULL ) sqlite3_free((void *) pString[i]);
  }
  sqlite3_free((void *) pString);
}
/* select * from %_content where rowid = [iRow]
 * The caller must delete the returned array and all strings in it.
 * null fields will be NULL in the returned array.
 *
 * TODO: Perhaps we should return pointer/length strings here for consistency
 * with other code which uses pointer/length. */
static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
                          const char ***pValues){
  sqlite3_stmt *s;
  const char **values;
  int i;
  int rc;

  *pValues = NULL;

  rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iRow);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc!=SQLITE_ROW ) return rc;

  values = (const char **) sqlite3_malloc(v->nColumn * sizeof(const char *));
  for(i=0; i<v->nColumn; ++i){
    if( sqlite3_column_type(s, i)==SQLITE_NULL ){
      values[i] = NULL;
    }else{
      values[i] = string_dup((char*)sqlite3_column_text(s, i));
    }
  }

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ){
    *pValues = values;
    return SQLITE_OK;
  }

  freeStringArray(v->nColumn, values);
  return rc;
}
/* delete from %_content where rowid = [iRow] */
static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iRow);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
/* Returns SQLITE_ROW if any rows exist in %_content, SQLITE_DONE if
** no rows exist, and any error in case of failure.
*/
static int content_exists(fulltext_vtab *v){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, CONTENT_EXISTS_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc!=SQLITE_ROW ) return rc;

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ROW;
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  return rc;
}
/* insert into %_segments values ([pData])
** returns assigned rowid in *piBlockid
*/
static int block_insert(fulltext_vtab *v, const char *pData, int nData,
                        sqlite_int64 *piBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  *piBlockid = sqlite3_last_insert_rowid(v->db);
  return SQLITE_OK;
}
/* delete from %_segments
**   where rowid between [iStartBlockid] and [iEndBlockid]
**
** Deletes the range of blocks, inclusive, used to delete the blocks
** which form a segment.
*/
static int block_delete(fulltext_vtab *v,
                        sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 2, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel.  Returns SQLITE_DONE if there are no segments at
** iLevel.  Otherwise returns an error.
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(). */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
/* insert into %_segdir values (
**   [iLevel], [idx],
**   [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
**   [pRootData]
** )
*/
static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
                      sqlite_int64 iStartBlockid,
                      sqlite_int64 iLeavesEndBlockid,
                      sqlite_int64 iEndBlockid,
                      const char *pRootData, int nRootData){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 2, idx);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 3, iStartBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 5, iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
/* Queries %_segdir for the block span of the segments in level
** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
** SQLITE_ROW if there are blocks, else an error.
*/
static int segdir_span(fulltext_vtab *v, int iLevel,
                       sqlite_int64 *piStartBlockid,
                       sqlite_int64 *piEndBlockid){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
  if( rc!=SQLITE_ROW ) return rc;

  /* This happens if all segments at this level are entirely inline. */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    /* We expect only one row.  We must execute another sqlite3_step()
     * to complete the iteration; otherwise the table will remain locked. */
    int rc2 = sqlite3_step(s);
    if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
    return rc2==SQLITE_DONE ? SQLITE_DONE : rc2;
  }

  *piStartBlockid = sqlite3_column_int64(s, 0);
  *piEndBlockid = sqlite3_column_int64(s, 1);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
/* Delete the segment blocks and segment directory records for all
** segments at iLevel.
*/
static int segdir_delete(fulltext_vtab *v, int iLevel){
  sqlite3_stmt *s;
  sqlite_int64 iStartBlockid, iEndBlockid;
  int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
  if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;

  if( rc==SQLITE_ROW ){
    rc = block_delete(v, iStartBlockid, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* Delete the segment directory itself. */
  rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
/* Delete entire fts index, SQLITE_OK on success, relevant error on
** failure.
*/
static int segdir_delete_all(fulltext_vtab *v){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_DELETE_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_single_step(s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
/* Returns SQLITE_OK with *pnSegments set to the number of entries in
** %_segdir and *piMaxLevel set to the highest level which has a
** segment.  Otherwise returns the SQLite error which caused failure.
*/
static int segdir_count(fulltext_vtab *v, int *pnSegments, int *piMaxLevel){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* TODO(shess): This case should not be possible?  Should stronger
  ** measures be taken if it happens?
  */
  if( rc==SQLITE_DONE ){
    *pnSegments = 0;
    *piMaxLevel = 0;
    return SQLITE_OK;
  }
  if( rc!=SQLITE_ROW ) return rc;

  *pnSegments = sqlite3_column_int(s, 0);
  *piMaxLevel = sqlite3_column_int(s, 1);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_OK;
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  return rc;
}
/* TODO(shess) clearPendingTerms() is far down the file because
** writeZeroSegment() is far down the file because LeafWriter is far
** down the file.  Consider refactoring the code to move the non-vtab
** code above the vtab code so that we don't need this forward
** reference.
*/
static int clearPendingTerms(fulltext_vtab *v);
/*
** Free the memory used to contain a fulltext_vtab structure.
*/
static void fulltext_vtab_destroy(fulltext_vtab *v){
  int iStmt, i;

  TRACE(("FTS2 Destroy %p\n", v));
  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
    if( v->pFulltextStatements[iStmt]!=NULL ){
      sqlite3_finalize(v->pFulltextStatements[iStmt]);
      v->pFulltextStatements[iStmt] = NULL;
    }
  }

  for( i=0; i<MERGE_COUNT; i++ ){
    if( v->pLeafSelectStmts[i]!=NULL ){
      sqlite3_finalize(v->pLeafSelectStmts[i]);
      v->pLeafSelectStmts[i] = NULL;
    }
  }

  if( v->pTokenizer!=NULL ){
    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
    v->pTokenizer = NULL;
  }

  clearPendingTerms(v);

  sqlite3_free(v->azColumn);
  for(i = 0; i < v->nColumn; ++i) {
    sqlite3_free(v->azContentColumn[i]);
  }
  sqlite3_free(v->azContentColumn);
  sqlite3_free(v);
}
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  The SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
/*
** Return the length of the token that begins at z[0].
** Store the token type in *tokenType before returning.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;
          }else{
            break;
          }
        }
      }
      *tokenType = TOKEN_STRING;
      return i + (c!=0);
    }
    case '[': {
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;
      }
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  *tokenType = TOKEN_PUNCT;
  return 1;
}
/*
** A token extracted from a string is an instance of the following
** structure.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} Token;
/*
** Given an input string (which is really one of the argv[] parameters
** passed into xConnect or xCreate) split the string up into tokens.
** Return an array of pointers to '\000' terminated strings, one string
** for each non-whitespace token.
**
** The returned array is terminated by a single NULL pointer.
**
** Space to hold the returned array is obtained from a single
** malloc and should be freed by passing the return value to free().
** The individual strings within the token list are all a part of
** the single memory allocation and will all be freed at once.
*/
static char **tokenizeString(const char *z, int *pnToken){
  int nToken = 0;
  Token *aToken = sqlite3_malloc( strlen(z) * sizeof(aToken[0]) );
  int n = 1;
  int e, i;
  int totalSize = 0;
  char **azToken;
  char *zCopy;
  while( n>0 ){
    n = getToken(z, &e);
    if( e!=TOKEN_SPACE ){
      aToken[nToken].z = z;
      aToken[nToken].n = n;
      nToken++;
      totalSize += n+1;
    }
    z += n;
  }
  azToken = (char**)sqlite3_malloc( nToken*sizeof(char*) + totalSize );
  zCopy = (char*)&azToken[nToken];
  nToken--;
  for(i=0; i<nToken; i++){
    int n = aToken[i].n;
    azToken[i] = zCopy;
    memcpy(zCopy, aToken[i].z, n);
    zCopy[n] = 0;
    zCopy += n+1;
  }
  azToken[nToken] = 0;
  sqlite3_free(aToken);
  *pnToken = nToken;
  return azToken;
}
/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters.  The conversion is done in-place.  If the
** input does not begin with a quote character, then this routine
** is a no-op.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
*/
static void dequoteString(char *z){
  int quote;
  int i, j;
  if( z==0 ) return;
  quote = z[0];
  switch( quote ){
    case '\'':  break;
    case '"':   break;
    case '`':   break;                /* For MySQL compatibility */
    case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
    default:    return;
  }
  for(i=1, j=0; z[i]; i++){
    if( z[i]==quote ){
      if( z[i+1]==quote ){
        z[j++] = quote;
        i++;
      }else{
        z[j++] = 0;
        break;
      }
    }else{
      z[j++] = z[i];
    }
  }
}
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** Example:
**
**     input:      tokenize chinese ( 'simplifed' , 'mixed' )
**     output:     chinese simplifed mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    azIn[j] = 0;
  }
}
/*
** Find the first alphanumeric token in the string zIn.  Null-terminate
** this token.  Remove any quotation marks.  And return a pointer to
** the result.
*/
static char *firstToken(char *zIn, char **pzTail){
  int n, ttype;
  while(1){
    n = getToken(zIn, &ttype);
    if( ttype==TOKEN_SPACE ){
      zIn += n;
    }else if( ttype==TOKEN_EOF ){
      *pzTail = zIn;
      return 0;
    }else{
      zIn[n] = 0;
      *pzTail = &zIn[n+1];
      dequoteString(zIn);
      return zIn;
    }
  }
  /*NOTREACHED*/
}
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  s is longer than t
**   *  The first character of s beyond t is not an alphanumeric
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( safe_isspace(*s) ){ s++; }
  while( *t ){
    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  }
  return *s!='_' && !safe_isalnum(*s);
}
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
/*
** Reclaim all of the memory used by a TableSpec
*/
static void clearTableSpec(TableSpec *p) {
  sqlite3_free(p->azColumn);
  sqlite3_free(p->azContentColumn);
  sqlite3_free(p->azTokenizer);
}
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 *
 * CREATE VIRTUAL TABLE email
 *        USING fts2(subject, body, tokenize mytokenizer(myarg))
 *
 * We return parsed information in a TableSpec structure.
 */
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                     char**pzErr){
  int i, n;
  char *z, *zDummy;
  char **azArg;
  const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */

  assert( argc>=3 );
  /* Current interface:
  ** argv[0] - module name
  ** argv[1] - database name
  ** argv[2] - table name
  ** argv[3..] - columns, optionally followed by tokenizer specification
  **             and snippet delimiters specification.
  */

  /* Make a copy of the complete argv[][] array in a single allocation.
  ** The argv[][] array is read-only and transient.  We can write to the
  ** copy in order to modify things and the copy is persistent.
  */
  CLEAR(pSpec);
  for(i=n=0; i<argc; i++){
    n += strlen(argv[i]) + 1;
  }
  azArg = sqlite3_malloc( sizeof(char*)*argc + n );
  if( azArg==0 ){
    return SQLITE_NOMEM;
  }
  z = (char*)&azArg[argc];
  for(i=0; i<argc; i++){
    azArg[i] = z;
    strcpy(z, argv[i]);
    z += strlen(z)+1;
  }

  /* Identify the column names and the tokenizer and delimiter arguments
  ** in the argv[][] array.
  */
  pSpec->zDb = azArg[1];
  pSpec->zName = azArg[2];
  pSpec->nColumn = 0;
  pSpec->azColumn = azArg;
  zTokenizer = "tokenize simple";
  for(i=3; i<argc; ++i){
    if( startsWith(azArg[i],"tokenize") ){
      zTokenizer = azArg[i];
    }else{
      z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
      pSpec->nColumn++;
    }
  }
  if( pSpec->nColumn==0 ){
    azArg[0] = "content";
    pSpec->nColumn = 1;
  }

  /*
  ** Construct the list of content column names.
  **
  ** Each content column name will be of the form cNNAAAA
  ** where NN is the column number and AAAA is the sanitized
  ** column name.  "sanitized" means that special characters are
  ** converted to "_".  The cNN prefix guarantees that all column
  ** names are unique.
  **
  ** The AAAA suffix is not strictly necessary.  It is included
  ** for the convenience of people who might examine the generated
  ** %_content table and wonder what the columns are used for.
  */
  pSpec->azContentColumn = sqlite3_malloc( pSpec->nColumn * sizeof(char *) );
  if( pSpec->azContentColumn==0 ){
    clearTableSpec(pSpec);
    return SQLITE_NOMEM;
  }
  for(i=0; i<pSpec->nColumn; i++){
    char *p;
    pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
    for (p = pSpec->azContentColumn[i]; *p ; ++p) {
      if( !safe_isalnum(*p) ) *p = '_';
    }
  }

  /*
  ** Parse the tokenizer specification string.
  */
  pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
  tokenListToIdList(pSpec->azTokenizer);

  return SQLITE_OK;
}
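/* Example of the content-column naming above (hypothetical declaration):
**
**   CREATE VIRTUAL TABLE email USING fts2(subject, "primary body");
**
** yields azContentColumn = { "c0subject", "c1primary_body" }: the cNN
** prefix keeps names unique, and the space, being non-alphanumeric, is
** sanitized to "_".
*/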
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int i;
  char *zSchema, *zNext;
  const char *zSep = "(";
  zSchema = sqlite3_mprintf("CREATE TABLE x");
  for(i=0; i<nColumn; i++){
    zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
    sqlite3_free(zSchema);
    zSchema = zNext;
    zSep = ",";
  }
  zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
  sqlite3_free(zSchema);
  return zNext;
}
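/* Illustrative output: for nColumn==2, azColumn=={"subject","body"} and
** zTableName=="mail", the generated schema string is
**
**   CREATE TABLE x('subject','body','mail')
**
** The extra trailing column, named after the table itself, is the column
** handled specially by fulltextColumn() below.
*/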
/*
** Build a new sqlite3_vtab structure that will describe the
** fulltext index defined by spec.
*/
static int constructVtab(
  sqlite3 *db,              /* The SQLite database connection */
  fts2Hash *pHash,          /* Hash table containing tokenizers */
  TableSpec *spec,          /* Parsed spec information from parseSpec() */
  sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
  char **pzErr              /* Write any error message here */
){
  int rc;
  int n;
  fulltext_vtab *v = 0;
  const sqlite3_tokenizer_module *m = NULL;
  char *schema;

  char const *zTok;         /* Name of tokenizer to use for this fts table */
  int nTok;                 /* Length of zTok, including nul terminator */

  v = (fulltext_vtab *) sqlite3_malloc(sizeof(fulltext_vtab));
  if( v==0 ) return SQLITE_NOMEM;
  CLEAR(v);
  /* sqlite will initialize v->base */
  v->db = db;
  v->zDb = spec->zDb;       /* Freed when azColumn is freed */
  v->zName = spec->zName;   /* Freed when azColumn is freed */
  v->nColumn = spec->nColumn;
  v->azContentColumn = spec->azContentColumn;
  spec->azContentColumn = 0;
  v->azColumn = spec->azColumn;
  spec->azColumn = 0;

  if( spec->azTokenizer==0 ){
    return SQLITE_NOMEM;
  }

  zTok = spec->azTokenizer[0];
  if( !zTok ){
    zTok = "simple";
  }
  nTok = strlen(zTok)+1;

  m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
  if( !m ){
    *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
    rc = SQLITE_ERROR;
    goto err;
  }

  for(n=0; spec->azTokenizer[n]; n++){}
  if( n ){
    rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
                    &v->pTokenizer);
  }else{
    rc = m->xCreate(0, 0, &v->pTokenizer);
  }
  if( rc!=SQLITE_OK ) goto err;
  v->pTokenizer->pModule = m;

  /* TODO: verify the existence of backing tables foo_content, foo_term */

  schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
                          v->zName);
  rc = sqlite3_declare_vtab(db, schema);
  sqlite3_free(schema);
  if( rc!=SQLITE_OK ) goto err;

  memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));

  /* Indicate that the buffer is not live. */
  v->nPendingData = -1;

  *ppVTab = &v->base;
  TRACE(("FTS2 Connect %p\n", v));

  return rc;

err:
  fulltext_vtab_destroy(v);
  return rc;
}
static int fulltextConnect(
  sqlite3 *db,
  void *pAux,
  int argc, const char *const*argv,
  sqlite3_vtab **ppVTab,
  char **pzErr
){
  TableSpec spec;
  int rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
  clearTableSpec(&spec);
  return rc;
}
/* The %_content table holds the text of each document, with
** the rowid used as the docid.
*/
/* TODO(shess) This comment needs elaboration to match the updated
** code.  Work it into the top-of-file comment at that time.
*/
static int fulltextCreate(sqlite3 *db, void *pAux,
                          int argc, const char * const *argv,
                          sqlite3_vtab **ppVTab, char **pzErr){
  int rc;
  TableSpec spec;
  StringBuffer schema;
  TRACE(("FTS2 Create\n"));

  rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  initStringBuffer(&schema);
  append(&schema, "CREATE TABLE %_content(");
  appendList(&schema, spec.nColumn, spec.azContentColumn);
  append(&schema, ")");
  rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
  stringBufferDestroy(&schema);
  if( rc!=SQLITE_OK ) goto out;

  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segments(block blob);");
  if( rc!=SQLITE_OK ) goto out;

  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segdir("
                "  level integer,"
                "  idx integer,"
                "  start_block integer,"
                "  leaves_end_block integer,"
                "  end_block integer,"
                "  root blob,"
                "  primary key(level, idx)"
                ");");
  if( rc!=SQLITE_OK ) goto out;

  rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);

out:
  clearTableSpec(&spec);
  return rc;
}
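/* Illustrative result (hypothetical): "CREATE VIRTUAL TABLE mail USING
** fts2(subject, body)" creates the backing tables mail_content (rowid,
** c0subject, c1body), mail_segments, and mail_segdir, then constructs
** the vtab over them via constructVtab().
*/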
/* Decide how to handle an SQL query. */
static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
  int i;
  TRACE(("FTS2 BestIndex\n"));

  for(i=0; i<pInfo->nConstraint; ++i){
    const struct sqlite3_index_constraint *pConstraint;
    pConstraint = &pInfo->aConstraint[i];
    if( pConstraint->usable ) {
      if( pConstraint->iColumn==-1 &&
          pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
        pInfo->idxNum = QUERY_ROWID;      /* lookup by rowid */
        TRACE(("FTS2 QUERY_ROWID\n"));
      } else if( pConstraint->iColumn>=0 &&
                 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
        /* full-text search */
        pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
        TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
      } else continue;

      pInfo->aConstraintUsage[i].argvIndex = 1;
      pInfo->aConstraintUsage[i].omit = 1;

      /* An arbitrary value for now.
       * TODO: Perhaps rowid matches should be considered cheaper than
       * full-text searches. */
      pInfo->estimatedCost = 1.0;

      return SQLITE_OK;
    }
  }
  pInfo->idxNum = QUERY_GENERIC;
  return SQLITE_OK;
}
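/* Illustrative idxNum choices for a hypothetical table "mail" with
** columns (subject, body):
**
**   ... WHERE rowid = 17       ->  idxNum = QUERY_ROWID
**   ... WHERE body MATCH 'x'   ->  idxNum = QUERY_FULLTEXT + 1
**   SELECT * FROM mail         ->  idxNum = QUERY_GENERIC
*/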
static int fulltextDisconnect(sqlite3_vtab *pVTab){
  TRACE(("FTS2 Disconnect %p\n", pVTab));
  fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  return SQLITE_OK;
}
static int fulltextDestroy(sqlite3_vtab *pVTab){
  fulltext_vtab *v = (fulltext_vtab *)pVTab;
  int rc;

  TRACE(("FTS2 Destroy %p\n", pVTab));
  rc = sql_exec(v->db, v->zDb, v->zName,
                "drop table if exists %_content;"
                "drop table if exists %_segments;"
                "drop table if exists %_segdir;"
                );
  if( rc!=SQLITE_OK ) return rc;

  fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  return SQLITE_OK;
}
static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
  fulltext_cursor *c;

  c = (fulltext_cursor *) sqlite3_malloc(sizeof(fulltext_cursor));
  if( c ){
    memset(c, 0, sizeof(fulltext_cursor));
    /* sqlite will initialize c->base */
    *ppCursor = &c->base;
    TRACE(("FTS2 Open %p: %p\n", pVTab, c));
    return SQLITE_OK;
  }else{
    return SQLITE_NOMEM;
  }
}
/* Free all of the dynamically allocated memory held by *q
*/
static void queryClear(Query *q){
  int i;
  for(i = 0; i < q->nTerms; ++i){
    sqlite3_free(q->pTerms[i].pTerm);
  }
  sqlite3_free(q->pTerms);
  CLEAR(q);
}
/* Free all of the dynamically allocated memory held by the
** Snippet
*/
static void snippetClear(Snippet *p){
  sqlite3_free(p->aMatch);
  sqlite3_free(p->zOffset);
  sqlite3_free(p->zSnippet);
  CLEAR(p);
}
/*
** Append a single entry to the p->aMatch[] log.
*/
static void snippetAppendMatch(
  Snippet *p,               /* Append the entry to this snippet */
  int iCol, int iTerm,      /* The column and query term */
  int iStart, int nByte     /* Offset and size of the match */
){
  int i;
  struct snippetMatch *pMatch;
  if( p->nMatch+1>=p->nAlloc ){
    p->nAlloc = p->nAlloc*2 + 10;
    p->aMatch = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
    if( p->aMatch==0 ){
      p->nMatch = 0;
      p->nAlloc = 0;
      return;
    }
  }
  i = p->nMatch++;
  pMatch = &p->aMatch[i];
  pMatch->iCol = iCol;
  pMatch->iTerm = iTerm;
  pMatch->iStart = iStart;
  pMatch->nByte = nByte;
}
/*
** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
*/
#define FTS2_ROTOR_SZ   (32)
#define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
/*
** Add entries to pSnippet->aMatch[] for every match that occurs against
** document zDoc[0..nDoc-1] which is stored in column iColumn.
*/
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS2_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS2_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  if( nTerm>=FTS2_ROTOR_SZ ){
    nTerm = FTS2_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS2_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);
}
/*
** Compute all offsets for the current row of the query.
** If the offsets have already been computed, this routine is a no-op.
*/
static void snippetAllOffsets(fulltext_cursor *p){
  int nColumn;
  int iColumn, i;
  int iFirst, iLast;
  fulltext_vtab *pFts;

  if( p->snippet.nMatch ) return;
  if( p->q.nTerms==0 ) return;
  pFts = p->q.pFts;
  nColumn = pFts->nColumn;
  iColumn = (p->iCursorType - QUERY_FULLTEXT);
  if( iColumn<0 || iColumn>=nColumn ){
    iFirst = 0;
    iLast = nColumn-1;
  }else{
    iFirst = iColumn;
    iLast = iColumn;
  }
  for(i=iFirst; i<=iLast; i++){
    const char *zDoc;
    int nDoc;
    zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
    nDoc = sqlite3_column_bytes(p->pStmt, i+1);
    snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
  }
}
/*
** Convert the information in the aMatch[] array of the snippet
** into the string zOffset[0..nOffset-1].
*/
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;
  StringBuffer sb;
  char zBuf[200];
  if( p->zOffset ) return;
  initStringBuffer(&sb);
  for(i=0; i<p->nMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    zBuf[0] = ' ';
    sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
        pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
    append(&sb, zBuf);
    cnt++;
  }
  p->zOffset = stringBufferData(&sb);
  p->nOffset = stringBufferLength(&sb);
}
/*
** zDoc[0..nDoc-1] is a phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}
/*
** Allowed values for Snippet.aMatch[].snStatus
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
/*
** Generate the text of a snippet.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol;
  int tailOffset;
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int tailEllipsis = 0;
  int iMatch;

  sqlite3_free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
             && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
/*
** Close the cursor.  For additional information see the documentation
** on the xClose method of the virtual table interface.
*/
static int fulltextClose(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  TRACE(("FTS2 Close %p\n", c));
  sqlite3_finalize(c->pStmt);
  queryClear(&c->q);
  snippetClear(&c->snippet);
  if( c->result.nData!=0 ) dlrDestroy(&c->reader);
  dataBufferDestroy(&c->result);
  sqlite3_free(c);
  return SQLITE_OK;
}
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  int rc;

  TRACE(("FTS2 Next %p\n", pCursor));
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
      c->eof = 1;
      return SQLITE_OK;
    }

    rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
    dlrStep(&c->reader);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){   /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* an error occurred; abort */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
/* TODO(shess) If we pushed LeafReader to the top of the file, or to
** another file, term_select() could be pushed above
** docListOfTerm().
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out);
/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.
*/
static int docListOfTerm(
  fulltext_vtab *v,    /* The full text index */
  int iColumn,         /* column to restrict to.  No restriction if >=nColumn */
  QueryTerm *pQTerm,   /* Term we are looking for, or 1st term of a phrase */
  DataBuffer *pResult  /* Write the result here */
){
  DataBuffer left, right, new;
  int i, rc;

  /* No phrase search if no position info. */
  assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&left, 0);
  rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                  0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
  if( rc ) return rc;
  for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
    dataBufferInit(&right, 0);
    rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                    pQTerm[i].isPrefix, DL_POSITIONS, &right);
    if( rc ){
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                       i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
    dataBufferDestroy(&left);
    dataBufferDestroy(&right);
    left = new;
  }
  *pResult = left;
  return SQLITE_OK;
}
/* Add a new term pTerm[0..nTerm-1] to the query *q.
*/
static void queryAdd(Query *q, const char *pTerm, int nTerm){
  QueryTerm *t;
  ++q->nTerms;
  q->pTerms = sqlite3_realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
  if( q->pTerms==0 ){
    q->nTerms = 0;
    return;
  }
  t = &q->pTerms[q->nTerms - 1];
  CLEAR(t);
  t->pTerm = sqlite3_malloc(nTerm+1);
  memcpy(t->pTerm, pTerm, nTerm);
  t->pTerm[nTerm] = 0;
  t->nTerm = nTerm;
  t->isOr = q->nextIsOr;
  t->isPrefix = 0;
  q->nextIsOr = 0;
  t->iColumn = q->nextColumn;
  q->nextColumn = q->dfltColumn;
}
/*
** Check to see if the string zToken[0...nToken-1] matches any
** column name in the virtual table.   If it does,
** return the zero-indexed column number.  If not, return -1.
*/
static int checkColumnSpecifier(
  fulltext_vtab *pVtab,    /* The virtual table */
  const char *zToken,      /* Text of the token */
  int nToken               /* Number of characters in the token */
){
  int i;
  for(i=0; i<pVtab->nColumn; i++){
    if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
        && pVtab->azColumn[i][nToken]==0 ){
      return i;
    }
  }
  return -1;
}
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if pSegment[0..nSegment-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;
  int iCol;
  int nTerm = 1;

  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
        (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
        && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    if( iEnd<nSegment && pSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
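/* Illustrative parse (a sketch): for the segment [apple OR banana cher*],
** tokenizeSegment() adds "apple"; then "banana" with isOr set (the OR
** token set nextIsOr); then "cher" with isPrefix set because of the
** trailing "*".  A "-" immediately before a term would set isNot instead.
*/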
/* Parse a query string, yielding a Query object pQuery.
**
** The calling function will need to queryClear() to clean up
** the dynamically allocated memory held by pQuery.
*/
static int parseQuery(
  fulltext_vtab *v,        /* The fulltext index */
  const char *zInput,      /* Input text of the query string */
  int nInput,              /* Size of the input text */
  int dfltColumn,          /* Default column of the index to match against */
  Query *pQuery            /* Write the parse results here. */
){
  int iInput, inPhrase = 0;

  if( zInput==0 ) nInput = 0;
  if( nInput<0 ) nInput = strlen(zInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->nextColumn = dfltColumn;
  pQuery->dfltColumn = dfltColumn;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                       pQuery);
    }
    iInput = i;
    if( i<nInput ){
      assert( zInput[i]=='"' );
      inPhrase = !inPhrase;
    }
  }

  if( inPhrase ){
    /* unmatched quote */
    queryClear(pQuery);
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
/* TODO(shess) Refactor the code to remove this forward decl. */
static int flushPendingTerms(fulltext_vtab *v);
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
*/
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;
  int nNot = 0;
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */
  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<pQuery->nTerms;  i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  *pResult = left;
  return SQLITE_OK;
}
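/* Evaluation sketch for [sqlite OR fts search -broken]: the AND pass
** merges the doclists of "sqlite" OR "fts" (docListOrMerge) with that of
** "search" (docListAndMerge); the EXCEPT pass then removes docids
** matching "broken" (docListExceptMerge).
*/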
/*
** This is the xFilter interface for the virtual table.  See
** the virtual table xFilter method documentation for additional
** information.
**
** If idxNum==QUERY_GENERIC then do a full table scan against
** the %_content table.
**
** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
** in the %_content table.
**
** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
** column on the left-hand side of the MATCH operator is column
** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
** side of the MATCH operator.
*/
/* TODO(shess) Upgrade the cursor initialization and destruction to
** account for fulltextFilter() being called multiple times on the
** same cursor.  The current solution is very fragile.  Apply fix to
** fts2 as appropriate.
*/
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;

  TRACE(("FTS2 Filter %p\n",pCursor));

  /* If the cursor has a statement that was not prepared according to
  ** idxNum, clear it.  I believe all calls to fulltextFilter with a
  ** given cursor will have the same idxNum, but in this case it's
  ** easy to be safe.
  */
  if( c->pStmt && c->iCursorType!=idxNum ){
    sqlite3_finalize(c->pStmt);
    c->pStmt = NULL;
  }

  /* Get a fresh statement appropriate to idxNum. */
  /* TODO(shess): Add a prepared-statement cache in the vt structure.
  ** The cache must handle multiple open cursors.  Easier to cache the
  ** statement variants at the vt to reduce malloc/realloc/free here.
  ** Or we could have a StringBuffer variant which allowed stack
  ** construction for small values.
  */
  if( !c->pStmt ){
    char *zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
                                 idxNum==QUERY_GENERIC ? "" : "where rowid=?");
    rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
    sqlite3_free(zSql);
    if( rc!=SQLITE_OK ) return rc;
    c->iCursorType = idxNum;
  }else{
    sqlite3_reset(c->pStmt);
    assert( c->iCursorType==idxNum );
  }

  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_ROWID:
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn );
      assert( argc==1 );
      queryClear(&c->q);
      if( c->result.nData!=0 ){
        /* This case happens if the same cursor is used repeatedly. */
        dlrDestroy(&c->reader);
        dataBufferReset(&c->result);
      }else{
        dataBufferInit(&c->result, 0);
      }
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.nData!=0 ){
        dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
      }
      break;
    }
  }

  return fulltextNext(pCursor);
}
/* This is the xEof method of the virtual table.  The SQLite core
** calls this routine to find out if it has reached the end of
** a query's results set.
*/
static int fulltextEof(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  return c->eof;
}
/* This is the xColumn method of the virtual table.  The SQLite
** core calls this method during a query when it needs the value
** of a column from the virtual table.  This method needs to use
** one of the sqlite3_result_*() routines to store the requested
** value back in the pContext.
*/
static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
                          sqlite3_context *pContext, int idxCol){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);

  if( idxCol<v->nColumn ){
    sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
    sqlite3_result_value(pContext, pVal);
  }else if( idxCol==v->nColumn ){
    /* The extra column whose name is the same as the table.
    ** Return a blob which is a pointer to the cursor
    */
    sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
  }
  return SQLITE_OK;
}
/* This is the xRowid method.  The SQLite core calls this routine to
** retrieve the rowid for the current row of the result set.  The
** rowid should be written to *pRowid.
*/
static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;

  *pRowid = sqlite3_column_int64(c->pStmt, 0);
  return SQLITE_OK;
}
/* Add all terms in [zText] to pendingTerms table.  If [iColumn] > 0,
** we also store positions and offsets in the hash table using that
** column number.
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==(rc=pTokenizer->pModule->xNext(pCursor,
                                                   &pToken, &nTokenBytes,
                                                   &iStartOffset, &iEndOffset,
                                                   &iPosition)) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator
     * internally.  Token can't be NULL or empty. */
    if( iPosition<0 || pToken==NULL || nTokenBytes==0 ){
      rc = SQLITE_ERROR;
      break;
    }

    p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  if( SQLITE_DONE==rc ) return SQLITE_OK;
  return rc;
}
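/* Illustrative example (added commentary, not from the original
** sources; values are hypothetical): indexing docid 7 with column 0
** text "to be or not to be" drives the loop above roughly like so:
**
**   token "to",  position 0  -> dlcNew() creates a pendingTerms entry
**   token "be",  position 1  -> dlcNew() creates a second entry
**   token "or",  position 2  -> dlcNew()
**   token "not", position 3  -> dlcNew()
**   token "to",  position 4  -> entry exists; same docid, so only
**                               dlcAddPos() appends position 4
**   token "be",  position 5  -> likewise appends position 5
**
** Each hash entry accumulates a DL_DEFAULT doclist for one term, and
** nPendingData tracks the total bytes buffered so initPendingTerms()
** can force a flush once kPendingThreshold is crossed.
*/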
/* Add doclists for all terms in [pValues] to pendingTerms table. */
static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
                       sqlite3_value **pValues){
  int i;
  for(i = 0; i < v->nColumn ; ++i){
    char *zText = (char*)sqlite3_value_text(pValues[i]);
    int rc = buildTerms(v, iRowid, zText, i);
    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}
/* Add empty doclists for all terms in the given row's content to
** pendingTerms.
*/
static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
  const char **pValues;
  int i, rc;

  /* TODO(shess) Should we allow such tables at all? */
  if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;

  rc = content_select(v, iRowid, &pValues);
  if( rc!=SQLITE_OK ) return rc;

  for(i = 0 ; i < v->nColumn; ++i) {
    rc = buildTerms(v, iRowid, pValues[i], -1);
    if( rc!=SQLITE_OK ) break;
  }

  freeStringArray(v->nColumn, pValues);
  return rc;
}
/* TODO(shess) Refactor the code to remove this forward decl. */
static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
/* Insert a row into the %_content table; set *piRowid to be the ID of the
** new row.  Add doclists for terms to pendingTerms.
*/
static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
                        sqlite3_value **pValues, sqlite_int64 *piRowid){
  int rc;

  rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
  if( rc!=SQLITE_OK ) return rc;

  *piRowid = sqlite3_last_insert_rowid(v->db);
  rc = initPendingTerms(v, *piRowid);
  if( rc!=SQLITE_OK ) return rc;

  return insertTerms(v, *piRowid, pValues);
}
/* Delete a row from the %_content table; add empty doclists for terms
** to pendingTerms.
*/
static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
  int rc = initPendingTerms(v, iRow);
  if( rc!=SQLITE_OK ) return rc;

  rc = deleteTerms(v, iRow);
  if( rc!=SQLITE_OK ) return rc;

  return content_delete(v, iRow);  /* execute an SQL DELETE */
}
/* Update a row in the %_content table; add delete doclists to
** pendingTerms for old terms not in the new data, add insert doclists
** to pendingTerms for terms in the new data.
*/
static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
                        sqlite3_value **pValues){
  int rc = initPendingTerms(v, iRow);
  if( rc!=SQLITE_OK ) return rc;

  /* Generate an empty doclist for each term that previously appeared in this
  ** row.
  */
  rc = deleteTerms(v, iRow);
  if( rc!=SQLITE_OK ) return rc;

  rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
  if( rc!=SQLITE_OK ) return rc;

  /* Now add positions for terms which appear in the updated row. */
  return insertTerms(v, iRow, pValues);
}
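/* Illustrative example (added commentary, not from the original
** sources): updating a row whose column text changes from "red fish"
** to "blue fish" buffers, for that docid, roughly:
**
**   deleteTerms():  "red"  -> empty doclist   "fish" -> empty doclist
**   insertTerms():  "blue" -> positions       "fish" -> positions
**
** Once flushed, the newer segment's entries override older segments
** during query-time merging, so "red" no longer matches this docid
** while "fish" and "blue" do.
*/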
/*******************************************************************/
/* InteriorWriter is used to collect terms and block references into
** interior nodes in %_segments.  See commentary at top of file for
** format.
*/

/* How large interior nodes can grow. */
#define INTERIOR_MAX 2048

/* Minimum number of terms per interior node (except the root). This
** prevents large terms from making the tree too skinny - must be >0
** so that the tree always makes progress.  Note that the min tree
** fanout will be INTERIOR_MIN_TERMS+1.
*/
#define INTERIOR_MIN_TERMS 7
#if INTERIOR_MIN_TERMS<1
# error INTERIOR_MIN_TERMS must be greater than 0.
#endif

/* ROOT_MAX controls how much data is stored inline in the segment
** directory.
*/
/* TODO(shess) Push ROOT_MAX down to whoever is writing things.  It's
** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
** can both see it, but if the caller passed it in, we wouldn't even
** need a define.
*/
#define ROOT_MAX 1024
#if ROOT_MAX<VARINT_MAX*2
# error ROOT_MAX must have enough space for a header.
#endif
/* InteriorBlock stores a linked-list of interior blocks while a lower
** layer is being constructed.
*/
typedef struct InteriorBlock {
  DataBuffer term;           /* Leftmost term in block's subtree. */
  DataBuffer data;           /* Accumulated data for the block. */
  struct InteriorBlock *next;
} InteriorBlock;

static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
                                       const char *pTerm, int nTerm){
  InteriorBlock *block = sqlite3_malloc(sizeof(InteriorBlock));
  char c[VARINT_MAX+VARINT_MAX];
  int n;

  if( block ){
    memset(block, 0, sizeof(*block));
    dataBufferInit(&block->term, 0);
    dataBufferReplace(&block->term, pTerm, nTerm);

    n = putVarint(c, iHeight);
    n += putVarint(c+n, iChildBlock);
    dataBufferInit(&block->data, INTERIOR_MAX);
    dataBufferReplace(&block->data, c, n);
  }
  return block;
}
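/* Illustrative example (added commentary, not from the original
** sources): a new interior node at height 1 whose leftmost child is
** blockid 200 starts with two varints:
**
**   putVarint(c, 1)    -> 0x01
**   putVarint(c, 200)  -> 0xC8 0x01   (little-endian, 7 bits per byte)
**
** so block->data begins 0x01 0xC8 0x01; the term/childblock pairs
** appended by interiorWriterAppend() follow this header.
*/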
#ifndef NDEBUG
/* Verify that the data is readable as an interior node. */
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid. */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
#else
#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
#endif
typedef struct InteriorWriter {
  int iHeight;                     /* from 0 at leaves. */
  InteriorBlock *first, *last;
  struct InteriorWriter *parentWriter;

  DataBuffer term;                 /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;    /* for consistency checks. */
#endif
} InteriorWriter;
/* Initialize an interior node where pTerm[nTerm] marks the leftmost
** term in the tree.  iChildBlock is the leftmost child block at the
** next level down the tree.
*/
static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
                               sqlite_int64 iChildBlock,
                               InteriorWriter *pWriter){
  InteriorBlock *block;
  assert( iHeight>0 );
  CLEAR(pWriter);

  pWriter->iHeight = iHeight;
  pWriter->iOpeningChildBlock = iChildBlock;
#ifndef NDEBUG
  pWriter->iLastChildBlock = iChildBlock;
#endif
  block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
  pWriter->last = pWriter->first = block;
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
  dataBufferInit(&pWriter->term, 0);
}
/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    n = putVarint(c, nTerm);
  }else{
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
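/* Illustrative example (added commentary, not from the original
** sources): appending child terms "apple" then "applesauce" to the
** same node encodes:
**
**   first term:   varint(5) "apple"             (no prefix encoding)
**   second term:  varint(5) varint(5) "sauce"   (nPrefix=5, nSuffix=5)
**
** Readers rebuild "applesauce" by keeping the first 5 bytes of the
** previous term and appending the 5-byte suffix.
*/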
/* Free the space used by pWriter, including the linked-list of
** InteriorBlocks, and parentWriter, if present.
*/
static int interiorWriterDestroy(InteriorWriter *pWriter){
  InteriorBlock *block = pWriter->first;

  while( block!=NULL ){
    InteriorBlock *b = block;
    block = block->next;
    dataBufferDestroy(&b->term);
    dataBufferDestroy(&b->data);
    sqlite3_free(b);
  }
  if( pWriter->parentWriter!=NULL ){
    interiorWriterDestroy(pWriter->parentWriter);
    sqlite3_free(pWriter->parentWriter);
  }
  dataBufferDestroy(&pWriter->term);
  SCRAMBLE(pWriter);
  return SQLITE_OK;
}
/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
** pWriter to %_segments, building a new layer of interior nodes, and
** recursively ask for their root info.
*/
static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
                                  char **ppRootInfo, int *pnRootInfo,
                                  sqlite_int64 *piEndBlockid){
  InteriorBlock *block = pWriter->first;
  sqlite_int64 iBlockid = 0;
  int rc;

  /* If we can fit the segment inline */
  if( block==pWriter->last && block->data.nData<ROOT_MAX ){
    *ppRootInfo = block->data.pData;
    *pnRootInfo = block->data.nData;
    return SQLITE_OK;
  }

  /* Flush the first block to %_segments, and create a new level of
  ** interior node.
  */
  ASSERT_VALID_INTERIOR_BLOCK(block);
  rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  *piEndBlockid = iBlockid;

  pWriter->parentWriter = sqlite3_malloc(sizeof(*pWriter->parentWriter));
  interiorWriterInit(pWriter->iHeight+1,
                     block->term.pData, block->term.nData,
                     iBlockid, pWriter->parentWriter);

  /* Flush additional blocks and append to the higher interior
  ** node.
  */
  for(block=block->next; block!=NULL; block=block->next){
    ASSERT_VALID_INTERIOR_BLOCK(block);
    rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
    if( rc!=SQLITE_OK ) return rc;
    *piEndBlockid = iBlockid;

    interiorWriterAppend(pWriter->parentWriter,
                         block->term.pData, block->term.nData, iBlockid);
  }

  /* Parent node gets the chance to be the root. */
  return interiorWriterRootInfo(v, pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
/****************************************************************/
/* InteriorReader is used to read off the data from an interior node
** (see comment at top of file for the format).
*/
typedef struct InteriorReader {
  const char *pData;
  int nData;

  DataBuffer term;          /* previous term, for decoding term delta. */

  sqlite_int64 iBlockid;
} InteriorReader;

static void interiorReaderDestroy(InteriorReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}
/* TODO(shess) The assertions are great, but what if we're in NDEBUG
** and the blob is empty or otherwise contains suspect data?
*/
static void interiorReaderInit(const char *pData, int nData,
                               InteriorReader *pReader){
  int n, nTerm;

  /* Require at least the leading flag byte */
  assert( nData>0 );
  assert( pData[0]!='\0' );

  CLEAR(pReader);

  /* Decode the base blockid, and set the cursor to the first term. */
  n = getVarint(pData+1, &pReader->iBlockid);
  assert( 1+n<=nData );
  pReader->pData = pData+1+n;
  pReader->nData = nData-(1+n);

  /* A single-child interior node (such as when a leaf node was too
  ** large for the segment directory) won't have any terms.
  ** Otherwise, decode the first term.
  */
  if( pReader->nData==0 ){
    dataBufferInit(&pReader->term, 0);
  }else{
    n = getVarint32(pReader->pData, &nTerm);
    dataBufferInit(&pReader->term, nTerm);
    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
    assert( n+nTerm<=pReader->nData );
    pReader->pData += n+nTerm;
    pReader->nData -= n+nTerm;
  }
}
static int interiorReaderAtEnd(InteriorReader *pReader){
  return pReader->term.nData==0;
}

static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
  return pReader->iBlockid;
}

static int interiorReaderTermBytes(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.nData;
}
static const char *interiorReaderTerm(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.pData;
}
/* Step forward to the next term in the node. */
static void interiorReaderStep(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );

  /* If the last term has been read, signal eof, else construct the
  ** next term.
  */
  if( pReader->nData==0 ){
    dataBufferReset(&pReader->term);
  }else{
    int n, nPrefix, nSuffix;

    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);

    /* Truncate the current term and append suffix data. */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    assert( n+nSuffix<=pReader->nData );
    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }

  pReader->iBlockid++;
}
/* Compare the current term to pTerm[nTerm], returning strcmp-style
** results.  If isPrefix, equality means equal through nTerm bytes.
*/
static int interiorReaderTermCmp(InteriorReader *pReader,
                                 const char *pTerm, int nTerm, int isPrefix){
  const char *pReaderTerm = interiorReaderTerm(pReader);
  int nReaderTerm = interiorReaderTermBytes(pReader);
  int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;

  if( n==0 ){
    if( nReaderTerm>0 ) return -1;
    if( nTerm>0 ) return 1;
    return 0;
  }

  c = memcmp(pReaderTerm, pTerm, n);
  if( c!=0 ) return c;
  if( isPrefix && n==nTerm ) return 0;
  return nReaderTerm - nTerm;
}
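/* Illustrative example (added commentary, not from the original
** sources) of the isPrefix convention, shared with leafReaderTermCmp():
**
**   current term "applesauce" vs pTerm "apple", isPrefix=1  ->  0
**     (equal through nTerm bytes, so "apple*" matches)
**   current term "applesauce" vs pTerm "apple", isPrefix=0  -> >0
**     ("applesauce" sorts after "apple")
**   current term "apple"      vs pTerm "apricot"            -> <0
**     (memcmp decides at the third byte)
*/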
/****************************************************************/
/* LeafWriter is used to collect terms and associated doclist data
** into leaf blocks in %_segments (see top of file for format info).
** Expected usage is:
**
**   LeafWriter writer;
**   leafWriterInit(0, 0, &writer);
**   while( sorted_terms_left_to_process ){
**     // data is doclist data for that term.
**     rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
**     if( rc!=SQLITE_OK ) goto err;
**   }
**   rc = leafWriterFinalize(v, &writer);
**err:
**   leafWriterDestroy(&writer);
**   return rc;
**
** leafWriterStep() may write a collected leaf out to %_segments.
** leafWriterFinalize() finishes writing any buffered data and stores
** a root node in %_segdir.  leafWriterDestroy() frees all buffers and
** InteriorWriters allocated as part of writing this segment.
**
** TODO(shess) Document leafWriterStepMerge().
*/

/* Put terms with data this big in their own block. */
#define STANDALONE_MIN 1024

/* Keep leaf blocks below this size. */
#define LEAF_MAX 2048
typedef struct LeafWriter {
  int iLevel;
  int idx;
  sqlite_int64 iStartBlockid;     /* needed to create the root info */
  sqlite_int64 iEndBlockid;       /* when we're done writing. */

  DataBuffer term;                /* previous encoded term */
  DataBuffer data;                /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter;    /* if we overflow */
  int has_parent;
} LeafWriter;

static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
  CLEAR(pWriter);
  pWriter->iLevel = iLevel;
  pWriter->idx = idx;

  dataBufferInit(&pWriter->term, 32);

  /* Start out with a reasonably sized block, though it can grow. */
  dataBufferInit(&pWriter->data, LEAF_MAX);
}
#ifndef NDEBUG
/* Verify that the data is readable as a leaf node. */
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  pData += n;
  nData -= n;
  assert( nData>0 );

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;

    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
#else
#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
#endif
/* Flush the current leaf node to %_segments, and add the resulting
** blockid and the starting term to the interior node which will
** contain it.
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes. */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
  int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
  if( rc!=SQLITE_OK ) return rc;

  /* Re-initialize the output buffer. */
  dataBufferReset(&pWriter->data);

  return SQLITE_OK;
}
/* Fetch the root info for the segment.  If the entire leaf fits
** within ROOT_MAX, then it will be returned directly, otherwise it
** will be flushed and the root info will be returned from the
** interior node.  *piEndBlockid is set to the blockid of the last
** interior or leaf node written to disk (0 if none are written at
** all).
*/
static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                              char **ppRootInfo, int *pnRootInfo,
                              sqlite_int64 *piEndBlockid){
  /* we can fit the segment entirely inline */
  if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
    *ppRootInfo = pWriter->data.pData;
    *pnRootInfo = pWriter->data.nData;
    return SQLITE_OK;
  }

  /* Flush remaining leaf data. */
  if( pWriter->data.nData>0 ){
    int rc = leafWriterFlush(v, pWriter);
    if( rc!=SQLITE_OK ) return rc;
  }

  /* We must have flushed a leaf at some point. */
  assert( pWriter->has_parent );

  /* Tentatively set the end leaf blockid as the end blockid.  If the
  ** interior node can be returned inline, this will be the final
  ** blockid, otherwise it will be overwritten by
  ** interiorWriterRootInfo().
  */
  *piEndBlockid = pWriter->iEndBlockid;

  return interiorWriterRootInfo(v, &pWriter->parentWriter,
                                ppRootInfo, pnRootInfo, piEndBlockid);
}
/* Collect the rootInfo data and store it into the segment directory.
** This has the effect of flushing the segment's leaf data to
** %_segments, and also flushing any interior nodes to %_segments.
*/
static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
  sqlite_int64 iEndBlockid;
  char *pRootInfo;
  int rc, nRootInfo;

  rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
  if( rc!=SQLITE_OK ) return rc;

  /* Don't bother storing an entirely empty segment. */
  if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;

  return segdir_set(v, pWriter->iLevel, pWriter->idx,
                    pWriter->iStartBlockid, pWriter->iEndBlockid,
                    iEndBlockid, pRootInfo, nRootInfo);
}
static void leafWriterDestroy(LeafWriter *pWriter){
  if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
  dataBufferDestroy(&pWriter->term);
  dataBufferDestroy(&pWriter->data);
}
/* Encode a term into the leafWriter, delta-encoding as appropriate.
** Returns the length of the new term which distinguishes it from the
** previous term, which can be used to set nTermDistinct when a node
** boundary is crossed.
*/
static int leafWriterEncodeTerm(LeafWriter *pWriter,
                                const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  while( nPrefix<pWriter->term.nData &&
         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
    nPrefix++;
    /* Failing this implies that the terms weren't in order. */
    assert( nPrefix<nTerm );
  }

  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
    n = putVarint(c, '\0');
    n += putVarint(c+n, nTerm);
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */
    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  dataBufferReplace(&pWriter->term, pTerm, nTerm);

  return nPrefix+1;
}
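/* Illustrative example (added commentary, not from the original
** sources): a leaf node holding the sorted terms "cap" then "cat" is
** laid out as:
**
**   varint(0)                      leaf-node flag
**   varint(3) "cap"                leading term, fully encoded
**   varint(nDoclist) <doclist>     doclist for "cap"
**   varint(2) varint(1) "t"        "cat" = 2-byte shared prefix + "t"
**   varint(nDoclist) <doclist>     doclist for "cat"
**
** The doclist length varints are written by leafWriterStepMerge(),
** not here; this function only encodes the term itself.
*/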
/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 int iDoclistData){
  char c[VARINT_MAX+VARINT_MAX];
  int iData, n = putVarint(c, 0);
  n += putVarint(c+n, nTerm);

  /* There should always be room for the header.  Even if pTerm shared
  ** a substantial prefix with the previous term, the entire prefix
  ** could be constructed from earlier data in the doclist, so there
  ** should be room.
  */
  assert( iDoclistData>=n+nTerm );

  iData = iDoclistData-(n+nTerm);
  memcpy(pWriter->data.pData+iData, c, n);
  memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);

  return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
}
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** for it.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.
*/
/* TODO(shess) Revise writeZeroSegment() so that doclists are
** constructed directly in pWriter->data.
*/
static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
                          const char *pTerm, int nTerm,
                          const char *pData, int nData){
  int rc;
  DLReader reader;

  dlrInit(&reader, DL_DEFAULT, pData, nData);
  rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
  dlrDestroy(&reader);

  return rc;
}
/****************************************************************/
/* LeafReader is used to iterate over an individual leaf node. */
typedef struct LeafReader {
  DataBuffer term;          /* copy of current term. */

  const char *pData;        /* data for current term. */
  int nData;
} LeafReader;

static void leafReaderDestroy(LeafReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}

static int leafReaderAtEnd(LeafReader *pReader){
  return pReader->nData<=0;
}

/* Access the current term. */
static int leafReaderTermBytes(LeafReader *pReader){
  return pReader->term.nData;
}
static const char *leafReaderTerm(LeafReader *pReader){
  assert( pReader->term.nData>0 );
  return pReader->term.pData;
}

/* Access the doclist data for the current term. */
static int leafReaderDataBytes(LeafReader *pReader){
  int nData;
  assert( pReader->term.nData>0 );
  getVarint32(pReader->pData, &nData);
  return nData;
}
static const char *leafReaderData(LeafReader *pReader){
  int n, nData;
  assert( pReader->term.nData>0 );
  n = getVarint32(pReader->pData, &nData);
  return pReader->pData+n;
}
static void leafReaderInit(const char *pData, int nData,
                           LeafReader *pReader){
  int nTerm, n;

  assert( nData>0 );
  assert( pData[0]=='\0' );

  CLEAR(pReader);

  /* Read the first term, skipping the header byte. */
  n = getVarint32(pData+1, &nTerm);
  dataBufferInit(&pReader->term, nTerm);
  dataBufferReplace(&pReader->term, pData+1+n, nTerm);

  /* Position after the first term. */
  assert( 1+n+nTerm<nData );
  pReader->pData = pData+1+n+nTerm;
  pReader->nData = nData-1-n-nTerm;
}
/* Step the reader forward to the next term. */
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block. */
  n = getVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}
/* strcmp-style comparison of pReader's current term against pTerm.
** If isPrefix, equality means equal through nTerm bytes.
*/
static int leafReaderTermCmp(LeafReader *pReader,
                             const char *pTerm, int nTerm, int isPrefix){
  int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
  if( n==0 ){
    if( pReader->term.nData>0 ) return -1;
    if( nTerm>0 ) return 1;
    return 0;
  }

  c = memcmp(pReader->term.pData, pTerm, n);
  if( c!=0 ) return c;
  if( isPrefix && n==nTerm ) return 0;
  return pReader->term.nData - nTerm;
}
/****************************************************************/
/* LeavesReader wraps LeafReader to allow iterating over the entire
** leaf layer of the tree.
*/
typedef struct LeavesReader {
  int idx;                  /* Index within the segment. */

  sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
  int eof;                  /* we've seen SQLITE_DONE from pStmt. */

  LeafReader leafReader;    /* reader for the current leaf. */
  DataBuffer rootData;      /* root data for inline. */
} LeavesReader;

/* Access the current term. */
static int leavesReaderTermBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTermBytes(&pReader->leafReader);
}
static const char *leavesReaderTerm(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTerm(&pReader->leafReader);
}

/* Access the doclist data for the current term. */
static int leavesReaderDataBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderDataBytes(&pReader->leafReader);
}
static const char *leavesReaderData(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderData(&pReader->leafReader);
}

static int leavesReaderAtEnd(LeavesReader *pReader){
  return pReader->eof;
}
/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be a check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note the current system assumes that segment merges will run to
** completion, which is why this particular problem hasn't arisen in
** this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}

static void leavesReaderDestroy(LeavesReader *pReader){
  /* If idx is -1, that means we're using a non-cached statement
  ** handle in the optimize() case, so we need to release it.
  */
  if( pReader->pStmt!=NULL && pReader->idx==-1 ){
    sqlite3_finalize(pReader->pStmt);
  }
  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    if( pReader->rootData.pData ){
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
/* Order LeavesReaders by their term, ignoring idx.  Readers at eof
** always sort to the end.
*/
static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
  if( leavesReaderAtEnd(lr1) ){
    if( leavesReaderAtEnd(lr2) ) return 0;
    return 1;
  }
  if( leavesReaderAtEnd(lr2) ) return -1;

  return leafReaderTermCmp(&lr1->leafReader,
                           leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
                           0);
}

/* Similar to leavesReaderTermCmp(), with additional ordering by idx
** so that older segments sort before newer segments.
*/
static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
  int c = leavesReaderTermCmp(lr1, lr2);
  if( c!=0 ) return c;
  return lr1->idx-lr2->idx;
}
/* Assume that pLr[1]..pLr[nLr] are sorted.  Bubble pLr[0] into its
** sorted position.
*/
static void leavesReaderReorder(LeavesReader *pLr, int nLr){
  while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
    LeavesReader tmp = pLr[0];
    pLr[0] = pLr[1];
    pLr[1] = tmp;
    nLr--;
    pLr++;
  }
}
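/* Illustrative example (added commentary, not from the original
** sources): if pLr[0] just stepped to term "fox" and the rest of the
** array holds readers at "bat", "cat", "zebra", the loop swaps pLr[0]
** rightward twice:
**
**   [fox, bat, cat, zebra] -> [bat, fox, cat, zebra]
**                          -> [bat, cat, fox, zebra]
**
** Only the first element is ever out of place, so a full sort is never
** needed after the initial leavesReadersInit().
*/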
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_LEVEL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  if( rc!=SQLITE_DONE ){
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age. */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
/* Merge doclists from pReaders[nReaders] into a single doclist, which
** is written to pWriter.  Assumes pReaders is ordered oldest to
** newest.
*/
/* TODO(shess) Consider putting this inline in segmentMerge(). */
static int leavesReadersMerge(fulltext_vtab *v,
                              LeavesReader *pReaders, int nReaders,
                              LeafWriter *pWriter){
  DLReader dlReaders[MERGE_COUNT];
  const char *pTerm = leavesReaderTerm(pReaders);
  int i, nTerm = leavesReaderTermBytes(pReaders);

  assert( nReaders<=MERGE_COUNT );

  for(i=0; i<nReaders; i++){
    dlrInit(&dlReaders[i], DL_DEFAULT,
            leavesReaderData(pReaders+i),
            leavesReaderDataBytes(pReaders+i));
  }

  return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
}
/* Forward ref due to mutual recursion with segdirNextIndex(). */
static int segmentMerge(fulltext_vtab *v, int iLevel);

/* Put the next available index at iLevel into *pidx.  If iLevel
** already has MERGE_COUNT segments, they are merged to a higher
** level to make room.
*/
static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
  int rc = segdir_max_index(v, iLevel, pidx);
  if( rc==SQLITE_DONE ){              /* No segments at iLevel. */
    *pidx = 0;
  }else if( rc==SQLITE_ROW ){
    if( *pidx==(MERGE_COUNT-1) ){
      rc = segmentMerge(v, iLevel);
      if( rc!=SQLITE_OK ) return rc;
      *pidx = 0;
    }else{
      (*pidx)++;
    }
  }else{
    return rc;
  }
  return SQLITE_OK;
}
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room.
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term. */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
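/* Illustrative note (added commentary, not from the original sources;
** assumes MERGE_COUNT==16, its value in this module at the time of
** writing): the segment levels behave like digits of a
** base-MERGE_COUNT counter.  Once level 0 holds 16 segments (idx
** 0..15), the next call to segdirNextIndex(v, 0, ...) invokes
** segmentMerge(v, 0), collapsing them into a single level-1 segment
** and emptying level 0.  Sixteen such level-1 segments later merge
** into one level-2 segment, so a segment at level N covers roughly
** 16^N level-0 flushes.
*/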
/* Accumulate the union of *acc and *pData into *acc. */
static void docListAccumulateUnion(DataBuffer *acc,
                                   const char *pData, int nData) {
  DataBuffer tmp = *acc;
  dataBufferInit(acc, tmp.nData+nData);
  docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
  dataBufferDestroy(&tmp);
}
/* TODO(shess) It might be interesting to explore different merge
** strategies, here.  For instance, since this is a sorted merge, we
** could easily merge many doclists in parallel.  With some
** comprehension of the storage format, we could merge all of the
** doclists within a leaf node directly from the leaf node's storage.
** It may be worthwhile to merge smaller doclists before larger
** doclists, since they can be traversed more quickly - but the
** results may have less overlap, making them more expensive in a
** different fashion.
*/

/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  /* doclist data is accumulated into pBuffers similar to how one does
  ** increment in binary arithmetic.  If index 0 is empty, the data is
  ** stored there.  If there is data there, it is merged and the
  ** results carried into position 1, with further merge-and-carry
  ** until an empty position is found.
  */
  DataBuffer *pBuffers = NULL;
  int nBuffers = 0, nMaxBuffers = 0, rc;

  assert( nTerm>0 );

  for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
      rc=leavesReaderStep(v, pReader)){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c>0 ) break;      /* Past any possible matches. */
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int iBuffer, nData = leavesReaderDataBytes(pReader);

      /* Find the first empty buffer. */
      for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
        if( 0==pBuffers[iBuffer].nData ) break;
      }

      /* Out of buffers, add an empty one. */
      if( iBuffer==nBuffers ){
        if( nBuffers==nMaxBuffers ){
          DataBuffer *p;
          nMaxBuffers += 20;

          /* Manual realloc so we can handle NULL appropriately. */
          p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
          if( p==NULL ){
            rc = SQLITE_NOMEM;
            break;
          }

          if( nBuffers>0 ){
            assert(pBuffers!=NULL);
            memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
            sqlite3_free(pBuffers);
          }
          pBuffers = p;
        }
        dataBufferInit(&(pBuffers[nBuffers]), 0);
        nBuffers++;
      }

      /* At this point, must have an empty at iBuffer. */
      assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);

      /* If empty was first buffer, no need for merge logic. */
      if( iBuffer==0 ){
        dataBufferReplace(&(pBuffers[0]), pData, nData);
      }else{
        /* pAcc is the empty buffer the merged data will end up in. */
        DataBuffer *pAcc = &(pBuffers[iBuffer]);
        DataBuffer *p = &(pBuffers[0]);

        /* Handle position 0 specially to avoid need to prime pAcc
        ** with pData/nData.
        */
        dataBufferSwap(p, pAcc);
        docListAccumulateUnion(pAcc, pData, nData);

        /* Accumulate remaining doclists into pAcc. */
        for(++p; p<pAcc; ++p){
          docListAccumulateUnion(pAcc, p->pData, p->nData);

          /* dataBufferReset() could allow a large doclist to blow up
          ** our memory requirements.
          */
          if( p->nCapacity<1024 ){
            dataBufferReset(p);
          }else{
            dataBufferDestroy(p);
            dataBufferInit(p, 0);
          }
        }
      }
    }
  }

  /* Union all the doclists together into *out. */
  /* TODO(shess) What if *out is big?  Sigh. */
  if( rc==SQLITE_OK && nBuffers>0 ){
    int iBuffer;
    for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
      if( pBuffers[iBuffer].nData>0 ){
        if( out->nData==0 ){
          dataBufferSwap(out, &(pBuffers[iBuffer]));
        }else{
          docListAccumulateUnion(out, pBuffers[iBuffer].pData,
                                 pBuffers[iBuffer].nData);
        }
      }
    }
  }

  while( nBuffers-- ){
    dataBufferDestroy(&(pBuffers[nBuffers]));
  }
  if( pBuffers!=NULL ) sqlite3_free(pBuffers);

  return rc;
}
5481 static int loadSegmentLeaf(fulltext_vtab
*v
, const char *pData
, int nData
,
5482 const char *pTerm
, int nTerm
, int isPrefix
,
5484 LeavesReader reader
;
5488 assert( *pData
=='\0' );
5489 rc
= leavesReaderInit(v
, 0, 0, 0, pData
, nData
, &reader
);
5490 if( rc
!=SQLITE_OK
) return rc
;
5492 rc
= loadSegmentLeavesInt(v
, &reader
, pTerm
, nTerm
, isPrefix
, out
);
5493 leavesReaderReset(&reader
);
5494 leavesReaderDestroy(&reader
);
/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
** iEndLeaf (inclusive) as input, and merge the resulting doclist into
** *out.
*/
static int loadSegmentLeaves(fulltext_vtab *v,
                             sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
                             const char *pTerm, int nTerm, int isPrefix,
                             DataBuffer *out){
  int rc;
  LeavesReader reader;

  assert( iStartLeaf<=iEndLeaf );
  rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
  if( rc!=SQLITE_OK ) return rc;

  rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
  leavesReaderReset(&reader);
  leavesReaderDestroy(&reader);

  return rc;
}
/* Taking pData/nData as an interior node, find the sequence of child
** nodes which could include pTerm/nTerm/isPrefix.  Note that the
** interior node terms logically come between the blocks, so there is
** one more blockid than there are terms (that block contains terms >=
** the last interior-node term).
*/
/* TODO(shess) The calling code may already know that the end child is
** not worth calculating, because the end may be in a later sibling
** node.  Consider whether breaking symmetry is worthwhile.  I suspect
** it is not worthwhile.
*/
static void getChildrenContaining(const char *pData, int nData,
                                  const char *pTerm, int nTerm, int isPrefix,
                                  sqlite_int64 *piStartChild,
                                  sqlite_int64 *piEndChild){
  InteriorReader reader;

  assert( nData>1 );
  assert( *pData!='\0' );
  interiorReaderInit(pData, nData, &reader);

  /* Scan for the first child which could contain pTerm/nTerm. */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piStartChild = interiorReaderCurrentBlockid(&reader);

  /* Keep scanning to find a term greater than our term, using prefix
  ** comparison if indicated.  If isPrefix is false, this will be the
  ** same blockid as the starting block.
  */
  while( !interiorReaderAtEnd(&reader) ){
    if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
    interiorReaderStep(&reader);
  }
  *piEndChild = interiorReaderCurrentBlockid(&reader);

  interiorReaderDestroy(&reader);

  /* Children must ascend, and if !prefix, both must be the same. */
  assert( *piEndChild>=*piStartChild );
  assert( isPrefix || *piStartChild==*piEndChild );
}
/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
  ** to complete the iteration; otherwise the table will remain
  ** locked.
  */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf. */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too. */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
** merge its doclist over *out (any duplicate doclists read from the
** segment rooted at pData will overwrite those in *out).
*/
/* TODO(shess) Consider changing this to determine the depth of the
** leaves using either the first characters of interior nodes (when
** ==1, we're one level above the leaves), or the first character of
** the root (which will describe the height of the tree directly).
** Either feels somewhat tricky to me.
*/
/* TODO(shess) The current merge is likely to be slow for large
** doclists (though it should process from newest/smallest to
** oldest/largest, so it may not be that bad).  It might be useful to
** modify things to allow for N-way merging.  This could either be
** within a segment, with pairwise merges across segments, or across
** all segments at once.
*/
static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                       sqlite_int64 iLeavesEnd,
                       const char *pTerm, int nTerm, int isPrefix,
                       DataBuffer *out){
  DataBuffer result;
  int rc;

  assert( nData>1 );

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&result, 0);
  rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                      pTerm, nTerm, isPrefix, &result);
  if( rc==SQLITE_OK && result.nData>0 ){
    if( out->nData==0 ){
      DataBuffer tmp = *out;
      *out = result;
      result = tmp;
    }else{
      DataBuffer merged;
      DLReader readers[2];

      dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
      dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
      dataBufferInit(&merged, out->nData+result.nData);
      docListMerge(&merged, readers, 2);
      dataBufferDestroy(out);
      *out = merged;
      dlrDestroy(&readers[0]);
      dlrDestroy(&readers[1]);
    }
  }
  dataBufferDestroy(&result);
  return rc;
}
/* Scan the database and merge together the posting lists for the term
** into *out.
*/
static int termSelect(fulltext_vtab *v, int iColumn,
                      const char *pTerm, int nTerm, int isPrefix,
                      DocListType iType, DataBuffer *out){
  DataBuffer doclist;
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  /* This code should never be called with buffered updates. */
  assert( v->nPendingData<0 );

  dataBufferInit(&doclist, 0);

  /* Traverse the segments from oldest to newest so that newer doclist
  ** elements for given docids overwrite older elements.
  */
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    const char *pData = sqlite3_column_blob(s, 2);
    const int nData = sqlite3_column_bytes(s, 2);
    const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
    rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                     &doclist);
    if( rc!=SQLITE_OK ) goto err;
  }
  if( rc==SQLITE_DONE ){
    if( doclist.nData!=0 ){
      /* TODO(shess) The old term_select_all() code applied the column
      ** restrict as we merged segments, leading to smaller buffers.
      ** This is probably worthwhile to bring back, once the new storage
      ** system is checked in.
      */
      if( iColumn==v->nColumn ) iColumn = -1;
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  iColumn, iType, out);
    }
    rc = SQLITE_OK;
  }

 err:
  dataBufferDestroy(&doclist);
  return rc;
}
/****************************************************************/
/* Used to hold hashtable data for sorting. */
typedef struct TermData {
  const char *pTerm;
  int nTerm;
  DLCollector *pCollector;
} TermData;

/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
** for equal, >0 for greater-than).
*/
static int termDataCmp(const void *av, const void *bv){
  const TermData *a = (const TermData *)av;
  const TermData *b = (const TermData *)bv;
  int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
  int c = memcmp(a->pTerm, b->pTerm, n);
  if( c!=0 ) return c;

  return a->nTerm-b->nTerm;
}
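/* For example, termDataCmp() compares the shared prefix with memcmp()
** and breaks ties by length, so "a" < "ab" < "b".  This is raw byte
** order, matching the term order in the segment b-trees (hence the
** collation TODO in writeZeroSegment() below).
*/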
/* Order pTerms data by term, then write a new level 0 segment using
** LeafWriter.
*/
static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
  fts2HashElem *e;
  int idx, rc, i, n;
  TermData *pData;
  LeafWriter writer;
  DataBuffer dl;

  /* Determine the next index at level 0, merging as necessary. */
  rc = segdirNextIndex(v, 0, &idx);
  if( rc!=SQLITE_OK ) return rc;

  n = fts2HashCount(pTerms);
  pData = sqlite3_malloc(n*sizeof(TermData));

  for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
    assert( i<n );
    pData[i].pTerm = fts2HashKey(e);
    pData[i].nTerm = fts2HashKeysize(e);
    pData[i].pCollector = fts2HashData(e);
  }
  assert( i==n );

  /* TODO(shess) Should we allow user-defined collation sequences,
  ** here?  I think we only need that once we support prefix searches.
  */
  if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);

  /* TODO(shess) Refactor so that we can write directly to the segment
  ** DataBuffer, as happens for segment merges.
  */
  leafWriterInit(0, idx, &writer);
  dataBufferInit(&dl, 0);
  for(i=0; i<n; i++){
    dataBufferReset(&dl);
    dlcAddDoclist(pData[i].pCollector, &dl);
    rc = leafWriterStep(v, &writer,
                        pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
    if( rc!=SQLITE_OK ) goto err;
  }
  rc = leafWriterFinalize(v, &writer);

 err:
  dataBufferDestroy(&dl);
  sqlite3_free(pData);
  leafWriterDestroy(&writer);
  return rc;
}
/* If pendingTerms has data, free it. */
static int clearPendingTerms(fulltext_vtab *v){
  if( v->nPendingData>=0 ){
    fts2HashElem *e;
    for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
      dlcDelete(fts2HashData(e));
    }
    fts2HashClear(&v->pendingTerms);
    v->nPendingData = -1;
  }
  return SQLITE_OK;
}
/* If pendingTerms has data, flush it to a level-zero segment, and
** free it.
*/
static int flushPendingTerms(fulltext_vtab *v){
  if( v->nPendingData>=0 ){
    int rc = writeZeroSegment(v, &v->pendingTerms);
    if( rc==SQLITE_OK ) clearPendingTerms(v);
    return rc;
  }
  return SQLITE_OK;
}
/* If pendingTerms is "too big", or docid is out of order, flush it.
** Regardless, be certain that pendingTerms is initialized for use.
*/
static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
  /* TODO(shess) Explore whether partially flushing the buffer on
  ** forced-flush would provide better performance.  I suspect that if
  ** we ordered the doclists by size and flushed the largest until the
  ** buffer was half empty, that would let the less frequent terms
  ** generate longer doclists.
  */
  if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
    int rc = flushPendingTerms(v);
    if( rc!=SQLITE_OK ) return rc;
  }
  if( v->nPendingData<0 ){
    fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
    v->nPendingData = 0;
  }
  v->iPrevDocid = iDocid;
  return SQLITE_OK;
}
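/* For example (illustrative): appending docids 5, 9, 12 simply
** accumulates hits in pendingTerms, but a subsequent docid 7 (not
** greater than iPrevDocid) or an insert that pushes nPendingData past
** kPendingThreshold forces flushPendingTerms() before buffering
** resumes.
*/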
/* This function implements the xUpdate callback; it is the top-level entry
 * point for inserting, deleting or updating a row in a full-text table. */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                          sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  int rc;

  TRACE(("FTS2 Update %p\n", pVtab));

  if( nArg<2 ){
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
    if( rc==SQLITE_OK ){
      /* If we just deleted the last row in the table, clear out the
      ** index data.
      */
      rc = content_exists(v);
      if( rc==SQLITE_ROW ){
        rc = SQLITE_OK;
      }else if( rc==SQLITE_DONE ){
        /* Clear the pending terms so we don't flush a useless level-0
        ** segment when the transaction closes.
        */
        rc = clearPendingTerms(v);
        if( rc==SQLITE_OK ){
          rc = segdir_delete_all(v);
        }
      }
    }
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
        sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    } else {
      assert( nArg==2+v->nColumn+1);
      rc = index_update(v, rowid, &ppArg[2]);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    assert( nArg==2+v->nColumn+1);
    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
  }

  return rc;
}
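/* For illustration, the three branches above map to SQL roughly as
** (sketch; rowids chosen arbitrarily):
**
**   DELETE FROM t WHERE rowid=5;    -- nArg==1, ppArg[0]==5
**   INSERT INTO t (content) ...;    -- ppArg[0] is NULL
**   UPDATE t SET content=...;       -- ppArg[0]==ppArg[1]==rowid
**
** Rowid changes are rejected because the docids embedded in the index
** would no longer match the content table.
*/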
static int fulltextSync(sqlite3_vtab *pVtab){
  TRACE(("FTS2 xSync()\n"));
  return flushPendingTerms((fulltext_vtab *)pVtab);
}

static int fulltextBegin(sqlite3_vtab *pVtab){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  TRACE(("FTS2 xBegin()\n"));

  /* Any buffered updates should have been cleared by the previous
  ** transaction.
  */
  assert( v->nPendingData<0 );
  return clearPendingTerms(v);
}

static int fulltextCommit(sqlite3_vtab *pVtab){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  TRACE(("FTS2 xCommit()\n"));

  /* Buffered updates should have been cleared by fulltextSync(). */
  assert( v->nPendingData<0 );
  return clearPendingTerms(v);
}

static int fulltextRollback(sqlite3_vtab *pVtab){
  TRACE(("FTS2 xRollback()\n"));
  return clearPendingTerms((fulltext_vtab *)pVtab);
}
/*
** Implementation of the snippet() function for FTS2
*/
static void snippetFunc(
  sqlite3_context *pContext,
  int argc,
  sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
  }else{
    const char *zStart = "<b>";
    const char *zEnd = "</b>";
    const char *zEllipsis = "<b>...</b>";
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    if( argc>=2 ){
      zStart = (const char*)sqlite3_value_text(argv[1]);
      if( argc>=3 ){
        zEnd = (const char*)sqlite3_value_text(argv[2]);
        if( argc>=4 ){
          zEllipsis = (const char*)sqlite3_value_text(argv[3]);
        }
      }
    }
    snippetAllOffsets(pCursor);
    snippetText(pCursor, zStart, zEnd, zEllipsis);
    sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
                        pCursor->snippet.nSnippet, SQLITE_STATIC);
  }
}
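/* Usage sketch: the defaults mark matches with <b>/</b> and join
** discontiguous regions with <b>...</b>, and the optional arguments
** override them, e.g.:
**
**   SELECT snippet(t) FROM t WHERE t MATCH 'sqlite';
**   SELECT snippet(t, '[', ']', '...') FROM t WHERE t MATCH 'sqlite';
*/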
/*
** Implementation of the offsets() function for FTS2
*/
static void snippetOffsetsFunc(
  sqlite3_context *pContext,
  int argc,
  sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
  }else{
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    snippetAllOffsets(pCursor);
    snippetOffsetText(&pCursor->snippet);
    sqlite3_result_text(pContext,
                        pCursor->snippet.zOffset, pCursor->snippet.nOffset,
                        SQLITE_STATIC);
  }
}
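/* Usage sketch (illustrative):
**
**   SELECT offsets(t) FROM t WHERE t MATCH 'sqlite';
**
** returns the space-separated integer list built by
** snippetOffsetText(), locating each match within the column text.
*/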
/* OptLeavesReader is nearly identical to LeavesReader, except that
** where LeavesReader is geared towards the merging of complete
** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
** is geared towards implementation of the optimize() function, and
** can merge all segments simultaneously.  This version may be
** somewhat less efficient than LeavesReader because it merges into an
** accumulator rather than doing an N-way merge, but since segment
** size grows exponentially (so segment count logarithmically) this is
** probably not an immediate problem.
*/
/* TODO(shess): Prove that assertion, or extend the merge code to
** merge tree fashion (like the prefix-searching code does).
*/
/* TODO(shess): OptLeavesReader and LeavesReader could probably be
** merged with little or no loss of performance for LeavesReader.  The
** merged code would need to handle >MERGE_COUNT segments, and would
** also need to be able to optionally optimize away deletes.
*/
typedef struct OptLeavesReader {
  /* Segment number, to order readers by age. */
  int segment;
  LeavesReader reader;
} OptLeavesReader;

static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
  return leavesReaderAtEnd(&pReader->reader);
}
static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
  return leavesReaderTermBytes(&pReader->reader);
}
static const char *optLeavesReaderData(OptLeavesReader *pReader){
  return leavesReaderData(&pReader->reader);
}
static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
  return leavesReaderDataBytes(&pReader->reader);
}
static const char *optLeavesReaderTerm(OptLeavesReader *pReader){
  return leavesReaderTerm(&pReader->reader);
}
static int optLeavesReaderStep(fulltext_vtab *v, OptLeavesReader *pReader){
  return leavesReaderStep(v, &pReader->reader);
}
static int optLeavesReaderTermCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
}
/* Order by term ascending, segment ascending (oldest to newest), with
** exhausted readers to the end.
*/
static int optLeavesReaderCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  int c = optLeavesReaderTermCmp(lr1, lr2);
  if( c!=0 ) return c;

  return lr1->segment-lr2->segment;
}

/* Bubble pLr[0] to appropriate place in pLr[1..nLr-1].  Assumes that
** pLr[1..nLr-1] is already sorted.
*/
static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
  while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
    OptLeavesReader tmp = pLr[0];
    pLr[0] = pLr[1];
    pLr[1] = tmp;
    nLr--;
    pLr++;
  }
}
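/* For example, with readers at terms {"m", "b", "q", <eof>} and
** pLr[1..3] already sorted, one call bubbles "m" past "b" and stops
** at "q", leaving {"b", "m", "q", <eof>} -- a single insertion pass,
** O(nLr) in the worst case.
*/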
/* optimize() helper function.  Put the readers in order and iterate
** through them, merging doclists for matching terms into pWriter.
** Returns SQLITE_OK on success, or the SQLite error code which
** prevented success.
*/
static int optimizeInternal(fulltext_vtab *v,
                            OptLeavesReader *readers, int nReaders,
                            LeafWriter *pWriter){
  int i, rc = SQLITE_OK;
  DataBuffer doclist, merged, tmp;

  /* Order the readers. */
  i = nReaders;
  while( i-- > 0 ){
    optLeavesReaderReorder(&readers[i], nReaders-i);
  }

  dataBufferInit(&doclist, LEAF_MAX);
  dataBufferInit(&merged, LEAF_MAX);

  /* Exhausted readers bubble to the end, so when the first reader is
  ** at eof, all are at eof.
  */
  while( !optLeavesReaderAtEnd(&readers[0]) ){

    /* Figure out how many readers share the next term. */
    for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
      if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
    }

    /* Special-case for no merge. */
    if( i==1 ){
      /* Trim deletions from the doclist. */
      dataBufferReset(&merged);
      docListTrim(DL_DEFAULT,
                  optLeavesReaderData(&readers[0]),
                  optLeavesReaderDataBytes(&readers[0]),
                  -1, DL_DEFAULT, &merged);
    }else{
      DLReader dlReaders[MERGE_COUNT];
      int iReader, nReaders;

      /* Prime the pipeline with the first reader's doclist.  After
      ** one pass index 0 will reference the accumulated doclist.
      */
      dlrInit(&dlReaders[0], DL_DEFAULT,
              optLeavesReaderData(&readers[0]),
              optLeavesReaderDataBytes(&readers[0]));
      iReader = 1;

      assert( iReader<i );  /* Must execute the loop at least once. */
      while( iReader<i ){
        /* Merge 16 inputs per pass. */
        for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
             iReader++, nReaders++ ){
          dlrInit(&dlReaders[nReaders], DL_DEFAULT,
                  optLeavesReaderData(&readers[iReader]),
                  optLeavesReaderDataBytes(&readers[iReader]));
        }

        /* Merge doclists and swap result into accumulator. */
        dataBufferReset(&merged);
        docListMerge(&merged, dlReaders, nReaders);
        tmp = merged;
        merged = doclist;
        doclist = tmp;

        while( nReaders-- > 0 ){
          dlrDestroy(&dlReaders[nReaders]);
        }

        /* Accumulated doclist to reader 0 for next pass. */
        dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
      }

      /* Destroy reader that was left in the pipeline. */
      dlrDestroy(&dlReaders[0]);

      /* Trim deletions from the doclist. */
      dataBufferReset(&merged);
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  -1, DL_DEFAULT, &merged);
    }

    /* Only pass doclists with hits (skip if all hits deleted). */
    if( merged.nData>0 ){
      rc = leafWriterStep(v, pWriter,
                          optLeavesReaderTerm(&readers[0]),
                          optLeavesReaderTermBytes(&readers[0]),
                          merged.pData, merged.nData);
      if( rc!=SQLITE_OK ) goto err;
    }

    /* Step merged readers to next term and reorder. */
    while( i-- > 0 ){
      rc = optLeavesReaderStep(v, &readers[i]);
      if( rc!=SQLITE_OK ) goto err;

      optLeavesReaderReorder(&readers[i], nReaders-i);
    }
  }

 err:
  dataBufferDestroy(&doclist);
  dataBufferDestroy(&merged);
  return rc;
}
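/* Worked example (illustrative): with 40 readers sharing a term and
** MERGE_COUNT of 16, the pipeline above runs three passes -- readers
** 1..15, then 16..30, then 31..39 -- each time feeding the
** accumulated doclist back in as dlReaders[0], before docListTrim()
** strips deletions from the final result.
*/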
/* Implement optimize() function for FTS2.  optimize(t) merges all
** segments in the fts index into a single segment.  't' is the magic
** table-named column.
*/
static void optimizeFunc(sqlite3_context *pContext,
                         int argc, sqlite3_value **argv){
  fulltext_cursor *pCursor;
  if( argc>1 ){
    sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
  }else{
    fulltext_vtab *v;
    int i, rc, iMaxLevel;
    OptLeavesReader *readers;
    int nReaders;
    LeafWriter writer;
    sqlite3_stmt *s;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    /* Flush any buffered updates before optimizing. */
    rc = flushPendingTerms(v);
    if( rc!=SQLITE_OK ) goto err;

    rc = segdir_count(v, &nReaders, &iMaxLevel);
    if( rc!=SQLITE_OK ) goto err;
    if( nReaders==0 || nReaders==1 ){
      sqlite3_result_text(pContext, "Index already optimal", -1,
                          SQLITE_STATIC);
      return;
    }

    rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
    if( rc!=SQLITE_OK ) goto err;

    readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
    if( readers==NULL ) goto err;

    /* Note that there will already be a segment at this position
    ** until we call segdir_delete() on iMaxLevel.
    */
    leafWriterInit(iMaxLevel, 0, &writer);

    i = 0;
    while( (rc = sqlite3_step(s))==SQLITE_ROW ){
      sqlite_int64 iStart = sqlite3_column_int64(s, 0);
      sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
      const char *pRootData = sqlite3_column_blob(s, 2);
      int nRootData = sqlite3_column_bytes(s, 2);

      assert( i<nReaders );
      rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
                            &readers[i].reader);
      if( rc!=SQLITE_OK ) break;

      readers[i].segment = i;
      i++;
    }

    /* If we managed to successfully read them all, optimize them. */
    if( rc==SQLITE_DONE ){
      assert( i==nReaders );
      rc = optimizeInternal(v, readers, nReaders, &writer);
    }

    while( i-- > 0 ){
      leavesReaderDestroy(&readers[i].reader);
    }
    sqlite3_free(readers);

    /* If we've successfully gotten to here, delete the old segments
    ** and flush the interior structure of the new segment.
    */
    if( rc==SQLITE_OK ){
      for( i=0; i<=iMaxLevel; i++ ){
        rc = segdir_delete(v, i);
        if( rc!=SQLITE_OK ) break;
      }

      if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
    }

    leafWriterDestroy(&writer);

    if( rc!=SQLITE_OK ) goto err;

    sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
    return;

    /* TODO(shess): Error-handling needs to be improved along the
    ** lines of the dump_ functions.
    */
 err:
    {
      char buf[512];
      sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
                       sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
      sqlite3_result_error(pContext, buf, -1);
    }
  }
}
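/* Usage sketch: optimize() is reached through the magic table-named
** column, e.g.:
**
**   SELECT optimize(t) FROM t LIMIT 1;
**
** which on success collapses every segment level into one segment at
** iMaxLevel and returns 'Index optimized'.
*/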
/* Generate an error of the form "<prefix>: <msg>".  If msg is NULL,
** pull the error from the context's db handle.
*/
static void generateError(sqlite3_context *pContext,
                          const char *prefix, const char *msg){
  char buf[512];
  if( msg==NULL ) msg = sqlite3_errmsg(sqlite3_context_db_handle(pContext));
  sqlite3_snprintf(sizeof(buf), buf, "%s: %s", prefix, msg);
  sqlite3_result_error(pContext, buf, -1);
}
/* Helper function to collect the set of terms in the segment into
** pTerms.  The segment is defined by the leaf nodes between
** iStartBlockid and iEndBlockid, inclusive, or by the contents of
** pRootData if iStartBlockid is 0 (in which case the entire segment
** fits entirely in pRootData).
*/
static int collectSegmentTerms(fulltext_vtab *v, sqlite3_stmt *s,
                               fts2Hash *pTerms){
  const sqlite_int64 iStartBlockid = sqlite3_column_int64(s, 0);
  const sqlite_int64 iEndBlockid = sqlite3_column_int64(s, 1);
  const char *pRootData = sqlite3_column_blob(s, 2);
  const int nRootData = sqlite3_column_bytes(s, 2);
  LeavesReader reader;
  int rc = leavesReaderInit(v, 0, iStartBlockid, iEndBlockid,
                            pRootData, nRootData, &reader);
  if( rc!=SQLITE_OK ) return rc;

  while( rc==SQLITE_OK && !leavesReaderAtEnd(&reader) ){
    const char *pTerm = leavesReaderTerm(&reader);
    const int nTerm = leavesReaderTermBytes(&reader);
    void *oldValue = sqlite3Fts2HashFind(pTerms, pTerm, nTerm);
    void *newValue = (void *)((char *)oldValue+1);

    /* From the comment before sqlite3Fts2HashInsert in fts2_hash.c,
    ** the data value passed is returned in case of malloc failure.
    */
    if( newValue==sqlite3Fts2HashInsert(pTerms, pTerm, nTerm, newValue) ){
      rc = SQLITE_NOMEM;
    }else{
      rc = leavesReaderStep(v, &reader);
    }
  }

  leavesReaderDestroy(&reader);
  return rc;
}
/* Helper function to build the result string for dump_terms(). */
static int generateTermsResult(sqlite3_context *pContext, fts2Hash *pTerms){
  int iTerm, nTerms, nResultBytes, iByte;
  char *result;
  TermData *pData;
  fts2HashElem *e;

  /* Iterate pTerms to generate an array of terms in pData for
  ** sorting.
  */
  nTerms = fts2HashCount(pTerms);
  assert( nTerms>0 );
  pData = sqlite3_malloc(nTerms*sizeof(TermData));
  if( pData==NULL ) return SQLITE_NOMEM;

  nResultBytes = 0;
  for(iTerm = 0, e = fts2HashFirst(pTerms); e; iTerm++, e = fts2HashNext(e)){
    nResultBytes += fts2HashKeysize(e)+1;   /* Term plus trailing space */
    assert( iTerm<nTerms );
    pData[iTerm].pTerm = fts2HashKey(e);
    pData[iTerm].nTerm = fts2HashKeysize(e);
    pData[iTerm].pCollector = fts2HashData(e);  /* unused */
  }
  assert( iTerm==nTerms );

  assert( nResultBytes>0 );   /* nTerms>0, so nResultBytes must be, too. */
  result = sqlite3_malloc(nResultBytes);
  if( result==NULL ){
    sqlite3_free(pData);
    return SQLITE_NOMEM;
  }

  if( nTerms>1 ) qsort(pData, nTerms, sizeof(*pData), termDataCmp);

  /* Read the terms in order to build the result. */
  iByte = 0;
  for(iTerm=0; iTerm<nTerms; ++iTerm){
    memcpy(result+iByte, pData[iTerm].pTerm, pData[iTerm].nTerm);
    iByte += pData[iTerm].nTerm;
    result[iByte++] = ' ';
  }
  assert( iByte==nResultBytes );
  assert( result[nResultBytes-1]==' ' );
  result[nResultBytes-1] = '\0';

  /* Passes ownership of result to pContext. */
  sqlite3_result_text(pContext, result, nResultBytes-1, sqlite3_free);
  sqlite3_free(pData);
  return SQLITE_OK;
}
/* Implements dump_terms() for use in inspecting the fts2 index from
** tests.  TEXT result containing the ordered list of terms joined by
** spaces.  dump_terms(t, level, idx) dumps the terms for the segment
** specified by level, idx (in %_segdir), while dump_terms(t) dumps
** all terms in the index.  In both cases t is the fts table's magic
** table-named column.
*/
static void dumpTermsFunc(
  sqlite3_context *pContext,
  int argc, sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc!=3 && argc!=1 ){
    generateError(pContext, "dump_terms", "incorrect arguments");
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    generateError(pContext, "dump_terms", "illegal first argument");
  }else{
    fulltext_vtab *v;
    fts2Hash terms;
    sqlite3_stmt *s = NULL;
    int rc;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    /* If passed only the cursor column, get all segments.  Otherwise
    ** get the segment described by the following two arguments.
    */
    if( argc==1 ){
      rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
    }else{
      rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
      if( rc==SQLITE_OK ){
        rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[1]));
        if( rc==SQLITE_OK ){
          rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[2]));
        }
      }
    }

    if( rc!=SQLITE_OK ){
      generateError(pContext, "dump_terms", NULL);
      return;
    }

    /* Collect the terms for each segment. */
    sqlite3Fts2HashInit(&terms, FTS2_HASH_STRING, 1);
    while( (rc = sqlite3_step(s))==SQLITE_ROW ){
      rc = collectSegmentTerms(v, s, &terms);
      if( rc!=SQLITE_OK ) break;
    }

    if( rc!=SQLITE_DONE ){
      sqlite3_reset(s);
      generateError(pContext, "dump_terms", NULL);
    }else{
      const int nTerms = fts2HashCount(&terms);
      if( nTerms>0 ){
        rc = generateTermsResult(pContext, &terms);
        if( rc==SQLITE_NOMEM ){
          generateError(pContext, "dump_terms", "out of memory");
        }else{
          assert( rc==SQLITE_OK );
        }
      }else if( argc==3 ){
        /* The specific segment asked for could not be found. */
        generateError(pContext, "dump_terms", "segment not found");
      }else{
        /* No segments found. */
        /* TODO(shess): It should be impossible to reach this.  This
        ** case can only happen for an empty table, in which case
        ** SQLite has no rows to call this function on.
        */
        sqlite3_result_null(pContext);
      }
    }
    sqlite3Fts2HashClear(&terms);
  }
}
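/* Usage sketch (test-only function):
**
**   SELECT dump_terms(t) FROM t LIMIT 1;        -- all terms in the index
**   SELECT dump_terms(t, 0, 0) FROM t LIMIT 1;  -- terms in segment 0/0
**
** Both forms return the sorted, space-joined term list built by
** generateTermsResult().
*/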
/* Expand the DL_DEFAULT doclist in pData into a text result in
** pContext.
*/
static void createDoclistResult(sqlite3_context *pContext,
                                const char *pData, int nData){
  DataBuffer dump;
  DLReader dlReader;

  assert( pData!=NULL && nData>0 );

  dataBufferInit(&dump, 0);
  dlrInit(&dlReader, DL_DEFAULT, pData, nData);
  for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
    char buf[256];
    PLReader plReader;

    plrInit(&plReader, &dlReader);
    if( DL_DEFAULT==DL_DOCIDS || plrAtEnd(&plReader) ){
      sqlite3_snprintf(sizeof(buf), buf, "[%lld] ", dlrDocid(&dlReader));
      dataBufferAppend(&dump, buf, strlen(buf));
    }else{
      int iColumn = plrColumn(&plReader);

      sqlite3_snprintf(sizeof(buf), buf, "[%lld %d[",
                       dlrDocid(&dlReader), iColumn);
      dataBufferAppend(&dump, buf, strlen(buf));

      for( ; !plrAtEnd(&plReader); plrStep(&plReader) ){
        if( plrColumn(&plReader)!=iColumn ){
          iColumn = plrColumn(&plReader);
          sqlite3_snprintf(sizeof(buf), buf, "] %d[", iColumn);
          assert( dump.nData>0 );
          dump.nData--;                     /* Overwrite trailing space. */
          assert( dump.pData[dump.nData]==' ');
          dataBufferAppend(&dump, buf, strlen(buf));
        }
        if( DL_DEFAULT==DL_POSITIONS_OFFSETS ){
          sqlite3_snprintf(sizeof(buf), buf, "%d,%d,%d ",
                           plrPosition(&plReader),
                           plrStartOffset(&plReader), plrEndOffset(&plReader));
        }else if( DL_DEFAULT==DL_POSITIONS ){
          sqlite3_snprintf(sizeof(buf), buf, "%d ", plrPosition(&plReader));
        }else{
          assert( NULL=="Unhandled DL_DEFAULT value");
        }
        dataBufferAppend(&dump, buf, strlen(buf));
      }
      plrDestroy(&plReader);

      assert( dump.nData>0 );
      dump.nData--;                     /* Overwrite trailing space. */
      assert( dump.pData[dump.nData]==' ');
      dataBufferAppend(&dump, "]] ", 3);
    }
  }
  dlrDestroy(&dlReader);

  assert( dump.nData>0 );
  dump.nData--;                     /* Overwrite trailing space. */
  assert( dump.pData[dump.nData]==' ');
  dump.pData[dump.nData] = '\0';
  assert( dump.nData>0 );

  /* Passes ownership of dump's buffer to pContext. */
  sqlite3_result_text(pContext, dump.pData, dump.nData, sqlite3_free);
  dump.pData = NULL;
  dump.nData = dump.nCapacity = 0;
}
/* Implements dump_doclist() for use in inspecting the fts2 index from
** tests.  TEXT result containing a string representation of the
** doclist for the indicated term.  dump_doclist(t, term, level, idx)
** dumps the doclist for term from the segment specified by level, idx
** (in %_segdir), while dump_doclist(t, term) dumps the logical
** doclist for the term across all segments.  The per-segment doclist
** can contain deletions, while the full-index doclist will not
** (deletions are omitted).
**
** Result formats differ with the setting of DL_DEFAULT.  Examples:
**
** DL_DOCIDS: [1] [3] [7]
** DL_POSITIONS: [1 0[0 4] 1[17]] [3 1[5]]
** DL_POSITIONS_OFFSETS: [1 0[0,0,3 4,23,26] 1[17,102,105]] [3 1[5,20,23]]
**
** In each case the number after the outer '[' is the docid.  In the
** latter two cases, the number before the inner '[' is the column
** associated with the values within.  For DL_POSITIONS the numbers
** within are the positions, for DL_POSITIONS_OFFSETS they are the
** position, the start offset, and the end offset.
*/
static void dumpDoclistFunc(
  sqlite3_context *pContext,
  int argc, sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  if( argc!=2 && argc!=4 ){
    generateError(pContext, "dump_doclist", "incorrect arguments");
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    generateError(pContext, "dump_doclist", "illegal first argument");
  }else if( sqlite3_value_text(argv[1])==NULL ||
            sqlite3_value_text(argv[1])[0]=='\0' ){
    generateError(pContext, "dump_doclist", "empty second argument");
  }else{
    const char *pTerm = (const char *)sqlite3_value_text(argv[1]);
    const int nTerm = strlen(pTerm);
    fulltext_vtab *v;
    int rc;
    DataBuffer doclist;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    dataBufferInit(&doclist, 0);

    /* termSelect() yields the same logical doclist that queries are
    ** run against.
    */
    if( argc==2 ){
      rc = termSelect(v, v->nColumn, pTerm, nTerm, 0, DL_DEFAULT, &doclist);
    }else{
      sqlite3_stmt *s = NULL;

      /* Get our specific segment's information. */
      rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
      if( rc==SQLITE_OK ){
        rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[2]));
        if( rc==SQLITE_OK ){
          rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[3]));
        }
      }

      if( rc==SQLITE_OK ){
        rc = sqlite3_step(s);

        if( rc==SQLITE_DONE ){
          dataBufferDestroy(&doclist);
          generateError(pContext, "dump_doclist", "segment not found");
          return;
        }

        /* Found a segment, load it into doclist. */
        if( rc==SQLITE_ROW ){
          const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
          const char *pData = sqlite3_column_blob(s, 2);
          const int nData = sqlite3_column_bytes(s, 2);

          /* loadSegment() is used by termSelect() to load each
          ** segment's data.
          */
          rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, 0,
                           &doclist);
          if( rc==SQLITE_OK ){
            rc = sqlite3_step(s);

            /* Should not have more than one matching segment. */
            if( rc!=SQLITE_DONE ){
              sqlite3_reset(s);
              dataBufferDestroy(&doclist);
              generateError(pContext, "dump_doclist", "invalid segdir");
              return;
            }
            rc = SQLITE_OK;
          }
        }
      }

      sqlite3_reset(s);
    }

    if( rc==SQLITE_OK ){
      if( doclist.nData>0 ){
        createDoclistResult(pContext, doclist.pData, doclist.nData);
      }else{
        /* TODO(shess): This can happen if the term is not present, or
        ** if all instances of the term have been deleted and this is
        ** an all-index dump.  It may be interesting to distinguish
        ** these cases.
        */
        sqlite3_result_text(pContext, "", 0, SQLITE_STATIC);
      }
    }else if( rc==SQLITE_NOMEM ){
      /* Handle out-of-memory cases specially because if they are
      ** generated in fts2 code they may not be reflected in the db
      ** handle.
      */
      /* TODO(shess): Handle this more comprehensively.
      ** sqlite3ErrStr() has what I need, but is internal.
      */
      generateError(pContext, "dump_doclist", "out of memory");
    }else{
      generateError(pContext, "dump_doclist", NULL);
    }

    dataBufferDestroy(&doclist);
  }
}
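/* Usage sketch (test-only function):
**
**   SELECT dump_doclist(t, 'cat') FROM t LIMIT 1;        -- logical doclist
**   SELECT dump_doclist(t, 'cat', 0, 0) FROM t LIMIT 1;  -- one segment
**
** The first form matches what queries see (deletions trimmed); the
** second may still show deletion markers awaiting a merge.
*/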
/*
** This routine implements the xFindFunction method for the FTS2
** virtual table.
*/
static int fulltextFindFunction(
  sqlite3_vtab *pVtab,
  int nArg,
  const char *zName,
  void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
  void **ppArg
){
  if( strcmp(zName,"snippet")==0 ){
    *pxFunc = snippetFunc;
    return 1;
  }else if( strcmp(zName,"offsets")==0 ){
    *pxFunc = snippetOffsetsFunc;
    return 1;
  }else if( strcmp(zName,"optimize")==0 ){
    *pxFunc = optimizeFunc;
    return 1;
#ifdef SQLITE_TEST
    /* NOTE(shess): These functions are present only for testing
    ** purposes.  No particular effort is made to optimize their
    ** execution or how they build their results.
    */
  }else if( strcmp(zName,"dump_terms")==0 ){
    /* fprintf(stderr, "Found dump_terms\n"); */
    *pxFunc = dumpTermsFunc;
    return 1;
  }else if( strcmp(zName,"dump_doclist")==0 ){
    /* fprintf(stderr, "Found dump_doclist\n"); */
    *pxFunc = dumpDoclistFunc;
    return 1;
#endif
  }
  return 0;
}
/*
** Rename an fts2 table.
*/
static int fulltextRename(
  sqlite3_vtab *pVtab,
  const char *zName
){
  fulltext_vtab *p = (fulltext_vtab *)pVtab;
  int rc = SQLITE_NOMEM;
  char *zSql = sqlite3_mprintf(
    "ALTER TABLE %Q.'%q_content'  RENAME TO '%q_content';"
    "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
    "ALTER TABLE %Q.'%q_segdir'   RENAME TO '%q_segdir';"
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
    , p->zDb, p->zName, zName
  );
  if( zSql ){
    rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
    sqlite3_free(zSql);
  }
  return rc;
}
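/* For illustration: a user-level rename such as
**
**   ALTER TABLE ft RENAME TO ft2;
**
** reaches this xRename method, and the batch statement above moves
** ft_content, ft_segments, and ft_segdir to ft2_content,
** ft2_segments, and ft2_segdir in one sqlite3_exec() call.
*/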
static const sqlite3_module fts2Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename       */ fulltextRename,
};
static void hashDestroy(void *p){
  fts2Hash *pHash = (fts2Hash *)p;
  sqlite3Fts2HashClear(pHash);
  sqlite3_free(pHash);
}
/*
** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
** in files fts2_tokenizer1.c and fts2_porter.c respectively.  The following
** forward declarations are for functions declared in these files
** used to retrieve the respective implementations.
**
** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point to the "simple" tokenizer implementation.
** Function ...PorterTokenizerModule() sets *pModule to point to the
** porter tokenizer/stemmer implementation.
*/
void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);

int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
/*
** Initialize the fts2 extension.  If this extension is built as part
** of the sqlite library, then this function is called directly by
** SQLite.  If fts2 is built as a dynamically loadable extension, this
** function is called by the sqlite3_extension_init() entry point.
*/
int sqlite3Fts2Init(sqlite3 *db){
  int rc = SQLITE_OK;
  fts2Hash *pHash = 0;
  const sqlite3_tokenizer_module *pSimple = 0;
  const sqlite3_tokenizer_module *pPorter = 0;
  const sqlite3_tokenizer_module *pIcu = 0;

  sqlite3Fts2SimpleTokenizerModule(&pSimple);
  sqlite3Fts2PorterTokenizerModule(&pPorter);
#ifdef SQLITE_ENABLE_ICU
  sqlite3Fts2IcuTokenizerModule(&pIcu);
#endif

  /* Allocate and initialize the hash-table used to store tokenizers. */
  pHash = sqlite3_malloc(sizeof(fts2Hash));
  if( !pHash ){
    rc = SQLITE_NOMEM;
  }else{
    sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
  }

  /* Load the built-in tokenizers into the hash table */
  if( rc==SQLITE_OK ){
    if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
     || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter)
     || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
    ){
      rc = SQLITE_NOMEM;
    }
  }

  /* Create the virtual table wrapper around the hash-table and overload
  ** the scalar functions.  If this is successful, register the
  ** module with sqlite.
  */
  if( SQLITE_OK==rc
   && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
#ifdef SQLITE_TEST
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
#endif
  ){
    return sqlite3_create_module_v2(
        db, "fts2", &fts2Module, (void *)pHash, hashDestroy
    );
  }

  /* An error has occurred.  Delete the hash table and return the error code. */
  assert( rc!=SQLITE_OK );
  if( pHash ){
    sqlite3Fts2HashClear(pHash);
    sqlite3_free(pHash);
  }
  return rc;
}
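/* Loading sketch (illustrative; exact filenames vary by platform and
** build): an application using fts2 as a loadable extension would do
** something like
**
**   sqlite3_enable_load_extension(db, 1);
**   sqlite3_load_extension(db, "fts2", 0, 0);
**
** and then CREATE VIRTUAL TABLE t USING fts2(content);
*/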
#if !SQLITE_CORE
#ifdef _WIN32
__declspec(dllexport)
#endif
int sqlite3_fts2_init(
  sqlite3 *db,
  char **pzErrMsg,
  const sqlite3_api_routines *pApi
){
  SQLITE_EXTENSION_INIT2(pApi)
  return sqlite3Fts2Init(db);
}
#endif

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */