4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
16 #include <math.h> /* amalgamator: keep */
19 ** Object used to iterate through all "coalesced phrase instances" in
20 ** a single column of the current row. If the phrase instances in the
21 ** column being considered do not overlap, this object simply iterates
22 ** through them. Or, if they do overlap (share one or more tokens in
23 ** common), each set of overlapping instances is treated as a single
24 ** match. See documentation for the highlight() auxiliary function for
29 ** for(rc = fts5CInstIterInit(pApi, pFts, iCol, &iter);
30 ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter));
31 ** rc = fts5CInstIterNext(&iter)
33 ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
37 typedef struct CInstIter CInstIter
;
39 const Fts5ExtensionApi
*pApi
; /* API offered by current FTS version */
40 Fts5Context
*pFts
; /* First arg to pass to pApi functions */
41 int iCol
; /* Column to search */
42 int iInst
; /* Next phrase instance index */
43 int nInst
; /* Total number of phrase instances */
45 /* Output variables */
46 int iStart
; /* First token in coalesced phrase instance */
47 int iEnd
; /* Last token in coalesced phrase instance */
51 ** Advance the iterator to the next coalesced phrase instance. Return
52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
54 static int fts5CInstIterNext(CInstIter
*pIter
){
59 while( rc
==SQLITE_OK
&& pIter
->iInst
<pIter
->nInst
){
60 int ip
; int ic
; int io
;
61 rc
= pIter
->pApi
->xInst(pIter
->pFts
, pIter
->iInst
, &ip
, &ic
, &io
);
63 if( ic
==pIter
->iCol
){
64 int iEnd
= io
- 1 + pIter
->pApi
->xPhraseSize(pIter
->pFts
, ip
);
65 if( pIter
->iStart
<0 ){
68 }else if( io
<=pIter
->iEnd
){
69 if( iEnd
>pIter
->iEnd
) pIter
->iEnd
= iEnd
;
82 ** Initialize the iterator object indicated by the final parameter to
83 ** iterate through coalesced phrase instances in column iCol.
85 static int fts5CInstIterInit(
86 const Fts5ExtensionApi
*pApi
,
93 memset(pIter
, 0, sizeof(CInstIter
));
97 rc
= pApi
->xInstCount(pFts
, &pIter
->nInst
);
100 rc
= fts5CInstIterNext(pIter
);
108 /*************************************************************************
109 ** Start of highlight() implementation.
111 typedef struct HighlightContext HighlightContext
;
112 struct HighlightContext
{
113 CInstIter iter
; /* Coalesced Instance Iterator */
114 int iPos
; /* Current token offset in zIn[] */
115 int iRangeStart
; /* First token to include */
116 int iRangeEnd
; /* If non-zero, last token to include */
117 const char *zOpen
; /* Opening highlight */
118 const char *zClose
; /* Closing highlight */
119 const char *zIn
; /* Input text */
120 int nIn
; /* Size of input text in bytes */
121 int iOff
; /* Current offset within zIn[] */
122 char *zOut
; /* Output value */
126 ** Append text to the HighlightContext output string - p->zOut. Argument
127 ** z points to a buffer containing n bytes of text to append. If n is
128 ** negative, everything up until the first '\0' is appended to the output.
130 ** If *pRc is set to any value other than SQLITE_OK when this function is
131 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
132 ** *pRc is set to an error code before returning.
134 static void fts5HighlightAppend(
139 if( *pRc
==SQLITE_OK
){
140 if( n
<0 ) n
= (int)strlen(z
);
141 p
->zOut
= sqlite3_mprintf("%z%.*s", p
->zOut
, n
, z
);
142 if( p
->zOut
==0 ) *pRc
= SQLITE_NOMEM
;
147 ** Tokenizer callback used by implementation of highlight() function.
149 static int fts5HighlightCb(
150 void *pContext
, /* Pointer to HighlightContext object */
151 int tflags
, /* Mask of FTS5_TOKEN_* flags */
152 const char *pToken
, /* Buffer containing token */
153 int nToken
, /* Size of token in bytes */
154 int iStartOff
, /* Start offset of token */
155 int iEndOff
/* End offset of token */
157 HighlightContext
*p
= (HighlightContext
*)pContext
;
161 UNUSED_PARAM2(pToken
, nToken
);
163 if( tflags
& FTS5_TOKEN_COLOCATED
) return SQLITE_OK
;
166 if( p
->iRangeEnd
>0 ){
167 if( iPos
<p
->iRangeStart
|| iPos
>p
->iRangeEnd
) return SQLITE_OK
;
168 if( p
->iRangeStart
&& iPos
==p
->iRangeStart
) p
->iOff
= iStartOff
;
171 if( iPos
==p
->iter
.iStart
){
172 fts5HighlightAppend(&rc
, p
, &p
->zIn
[p
->iOff
], iStartOff
- p
->iOff
);
173 fts5HighlightAppend(&rc
, p
, p
->zOpen
, -1);
177 if( iPos
==p
->iter
.iEnd
){
178 if( p
->iRangeEnd
&& p
->iter
.iStart
<p
->iRangeStart
){
179 fts5HighlightAppend(&rc
, p
, p
->zOpen
, -1);
181 fts5HighlightAppend(&rc
, p
, &p
->zIn
[p
->iOff
], iEndOff
- p
->iOff
);
182 fts5HighlightAppend(&rc
, p
, p
->zClose
, -1);
185 rc
= fts5CInstIterNext(&p
->iter
);
189 if( p
->iRangeEnd
>0 && iPos
==p
->iRangeEnd
){
190 fts5HighlightAppend(&rc
, p
, &p
->zIn
[p
->iOff
], iEndOff
- p
->iOff
);
192 if( iPos
>=p
->iter
.iStart
&& iPos
<p
->iter
.iEnd
){
193 fts5HighlightAppend(&rc
, p
, p
->zClose
, -1);
201 ** Implementation of highlight() function.
203 static void fts5HighlightFunction(
204 const Fts5ExtensionApi
*pApi
, /* API offered by current FTS version */
205 Fts5Context
*pFts
, /* First arg to pass to pApi functions */
206 sqlite3_context
*pCtx
, /* Context for returning result/error */
207 int nVal
, /* Number of values in apVal[] array */
208 sqlite3_value
**apVal
/* Array of trailing arguments */
210 HighlightContext ctx
;
215 const char *zErr
= "wrong number of arguments to function highlight()";
216 sqlite3_result_error(pCtx
, zErr
, -1);
220 iCol
= sqlite3_value_int(apVal
[0]);
221 memset(&ctx
, 0, sizeof(HighlightContext
));
222 ctx
.zOpen
= (const char*)sqlite3_value_text(apVal
[1]);
223 ctx
.zClose
= (const char*)sqlite3_value_text(apVal
[2]);
224 rc
= pApi
->xColumnText(pFts
, iCol
, &ctx
.zIn
, &ctx
.nIn
);
228 rc
= fts5CInstIterInit(pApi
, pFts
, iCol
, &ctx
.iter
);
232 rc
= pApi
->xTokenize(pFts
, ctx
.zIn
, ctx
.nIn
, (void*)&ctx
,fts5HighlightCb
);
234 fts5HighlightAppend(&rc
, &ctx
, &ctx
.zIn
[ctx
.iOff
], ctx
.nIn
- ctx
.iOff
);
237 sqlite3_result_text(pCtx
, (const char*)ctx
.zOut
, -1, SQLITE_TRANSIENT
);
239 sqlite3_free(ctx
.zOut
);
242 sqlite3_result_error_code(pCtx
, rc
);
246 ** End of highlight() implementation.
247 **************************************************************************/
250 ** Context object passed to the fts5SentenceFinderCb() function.
typedef struct Fts5SFinder Fts5SFinder;
struct Fts5SFinder {
  int iPos;                       /* Current token position */
  int nFirstAlloc;                /* Allocated size of aFirst[] */
  int nFirst;                     /* Number of entries in aFirst[] */
  int *aFirst;                    /* Array of first token in each sentence;
                                  ** grown by fts5SentenceFinderAdd() */
  const char *zDoc;               /* Document being tokenized */
};
262 ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
263 ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
266 static int fts5SentenceFinderAdd(Fts5SFinder
*p
, int iAdd
){
267 if( p
->nFirstAlloc
==p
->nFirst
){
268 int nNew
= p
->nFirstAlloc
? p
->nFirstAlloc
*2 : 64;
271 aNew
= (int*)sqlite3_realloc(p
->aFirst
, nNew
*sizeof(int));
272 if( aNew
==0 ) return SQLITE_NOMEM
;
274 p
->nFirstAlloc
= nNew
;
276 p
->aFirst
[p
->nFirst
++] = iAdd
;
281 ** This function is an xTokenize() callback used by the auxiliary snippet()
282 ** function. Its job is to identify tokens that are the first in a sentence.
283 ** For each such token, an entry is added to the SFinder.aFirst[] array.
285 static int fts5SentenceFinderCb(
286 void *pContext
, /* Pointer to Fts5SFinder object */
287 int tflags
, /* Mask of FTS5_TOKEN_* flags */
288 const char *pToken
, /* Buffer containing token */
289 int nToken
, /* Size of token in bytes */
290 int iStartOff
, /* Start offset of token */
291 int iEndOff
/* End offset of token */
295 UNUSED_PARAM2(pToken
, nToken
);
296 UNUSED_PARAM(iEndOff
);
298 if( (tflags
& FTS5_TOKEN_COLOCATED
)==0 ){
299 Fts5SFinder
*p
= (Fts5SFinder
*)pContext
;
303 for(i
=iStartOff
-1; i
>=0; i
--){
305 if( c
!=' ' && c
!='\t' && c
!='\n' && c
!='\r' ) break;
307 if( i
!=iStartOff
-1 && (c
=='.' || c
==':') ){
308 rc
= fts5SentenceFinderAdd(p
, p
->iPos
);
311 rc
= fts5SentenceFinderAdd(p
, 0);
318 static int fts5SnippetScore(
319 const Fts5ExtensionApi
*pApi
, /* API offered by current FTS version */
320 Fts5Context
*pFts
, /* First arg to pass to pApi functions */
321 int nDocsize
, /* Size of column in tokens */
322 unsigned char *aSeen
, /* Array with one element per query phrase */
323 int iCol
, /* Column to score */
324 int iPos
, /* Starting offset to score */
325 int nToken
, /* Max tokens per snippet */
326 int *pnScore
, /* OUT: Score */
327 int *piPos
/* OUT: Adjusted offset */
339 rc
= pApi
->xInstCount(pFts
, &nInst
);
340 for(i
=0; i
<nInst
&& rc
==SQLITE_OK
; i
++){
341 rc
= pApi
->xInst(pFts
, i
, &ip
, &ic
, &iOff
);
342 if( rc
==SQLITE_OK
&& ic
==iCol
&& iOff
>=iPos
&& iOff
<(iPos
+nToken
) ){
343 nScore
+= (aSeen
[ip
] ? 1 : 1000);
345 if( iFirst
<0 ) iFirst
= iOff
;
346 iLast
= iOff
+ pApi
->xPhraseSize(pFts
, ip
);
352 int iAdj
= iFirst
- (nToken
- (iLast
-iFirst
)) / 2;
353 if( (iAdj
+nToken
)>nDocsize
) iAdj
= nDocsize
- nToken
;
354 if( iAdj
<0 ) iAdj
= 0;
362 ** Implementation of snippet() function.
364 static void fts5SnippetFunction(
365 const Fts5ExtensionApi
*pApi
, /* API offered by current FTS version */
366 Fts5Context
*pFts
, /* First arg to pass to pApi functions */
367 sqlite3_context
*pCtx
, /* Context for returning result/error */
368 int nVal
, /* Number of values in apVal[] array */
369 sqlite3_value
**apVal
/* Array of trailing arguments */
371 HighlightContext ctx
;
372 int rc
= SQLITE_OK
; /* Return code */
373 int iCol
; /* 1st argument to snippet() */
374 const char *zEllips
; /* 4th argument to snippet() */
375 int nToken
; /* 5th argument to snippet() */
376 int nInst
= 0; /* Number of instance matches this row */
377 int i
; /* Used to iterate through instances */
378 int nPhrase
; /* Number of phrases in query */
379 unsigned char *aSeen
; /* Array of "seen instance" flags */
380 int iBestCol
; /* Column containing best snippet */
381 int iBestStart
= 0; /* First token of best snippet */
382 int nBestScore
= 0; /* Score of best snippet */
383 int nColSize
= 0; /* Total size of iBestCol in tokens */
384 Fts5SFinder sFinder
; /* Used to find the beginnings of sentences */
388 const char *zErr
= "wrong number of arguments to function snippet()";
389 sqlite3_result_error(pCtx
, zErr
, -1);
393 nCol
= pApi
->xColumnCount(pFts
);
394 memset(&ctx
, 0, sizeof(HighlightContext
));
395 iCol
= sqlite3_value_int(apVal
[0]);
396 ctx
.zOpen
= (const char*)sqlite3_value_text(apVal
[1]);
397 ctx
.zClose
= (const char*)sqlite3_value_text(apVal
[2]);
398 zEllips
= (const char*)sqlite3_value_text(apVal
[3]);
399 nToken
= sqlite3_value_int(apVal
[4]);
401 iBestCol
= (iCol
>=0 ? iCol
: 0);
402 nPhrase
= pApi
->xPhraseCount(pFts
);
403 aSeen
= sqlite3_malloc(nPhrase
);
408 rc
= pApi
->xInstCount(pFts
, &nInst
);
411 memset(&sFinder
, 0, sizeof(Fts5SFinder
));
412 for(i
=0; i
<nCol
; i
++){
413 if( iCol
<0 || iCol
==i
){
419 rc
= pApi
->xColumnText(pFts
, i
, &sFinder
.zDoc
, &nDoc
);
420 if( rc
!=SQLITE_OK
) break;
421 rc
= pApi
->xTokenize(pFts
,
422 sFinder
.zDoc
, nDoc
, (void*)&sFinder
,fts5SentenceFinderCb
424 if( rc
!=SQLITE_OK
) break;
425 rc
= pApi
->xColumnSize(pFts
, i
, &nDocsize
);
426 if( rc
!=SQLITE_OK
) break;
428 for(ii
=0; rc
==SQLITE_OK
&& ii
<nInst
; ii
++){
434 rc
= pApi
->xInst(pFts
, ii
, &ip
, &ic
, &io
);
435 if( ic
!=i
|| rc
!=SQLITE_OK
) continue;
436 memset(aSeen
, 0, nPhrase
);
437 rc
= fts5SnippetScore(pApi
, pFts
, nDocsize
, aSeen
, i
,
438 io
, nToken
, &nScore
, &iAdj
440 if( rc
==SQLITE_OK
&& nScore
>nBestScore
){
447 if( rc
==SQLITE_OK
&& sFinder
.nFirst
&& nDocsize
>nToken
){
448 for(jj
=0; jj
<(sFinder
.nFirst
-1); jj
++){
449 if( sFinder
.aFirst
[jj
+1]>io
) break;
452 if( sFinder
.aFirst
[jj
]<io
){
453 memset(aSeen
, 0, nPhrase
);
454 rc
= fts5SnippetScore(pApi
, pFts
, nDocsize
, aSeen
, i
,
455 sFinder
.aFirst
[jj
], nToken
, &nScore
, 0
458 nScore
+= (sFinder
.aFirst
[jj
]==0 ? 120 : 100);
459 if( rc
==SQLITE_OK
&& nScore
>nBestScore
){
462 iBestStart
= sFinder
.aFirst
[jj
];
472 rc
= pApi
->xColumnText(pFts
, iBestCol
, &ctx
.zIn
, &ctx
.nIn
);
474 if( rc
==SQLITE_OK
&& nColSize
==0 ){
475 rc
= pApi
->xColumnSize(pFts
, iBestCol
, &nColSize
);
479 rc
= fts5CInstIterInit(pApi
, pFts
, iBestCol
, &ctx
.iter
);
482 ctx
.iRangeStart
= iBestStart
;
483 ctx
.iRangeEnd
= iBestStart
+ nToken
- 1;
486 fts5HighlightAppend(&rc
, &ctx
, zEllips
, -1);
489 /* Advance iterator ctx.iter so that it points to the first coalesced
490 ** phrase instance at or following position iBestStart. */
491 while( ctx
.iter
.iStart
>=0 && ctx
.iter
.iStart
<iBestStart
&& rc
==SQLITE_OK
){
492 rc
= fts5CInstIterNext(&ctx
.iter
);
496 rc
= pApi
->xTokenize(pFts
, ctx
.zIn
, ctx
.nIn
, (void*)&ctx
,fts5HighlightCb
);
498 if( ctx
.iRangeEnd
>=(nColSize
-1) ){
499 fts5HighlightAppend(&rc
, &ctx
, &ctx
.zIn
[ctx
.iOff
], ctx
.nIn
- ctx
.iOff
);
501 fts5HighlightAppend(&rc
, &ctx
, zEllips
, -1);
505 sqlite3_result_text(pCtx
, (const char*)ctx
.zOut
, -1, SQLITE_TRANSIENT
);
507 sqlite3_result_error_code(pCtx
, rc
);
509 sqlite3_free(ctx
.zOut
);
511 sqlite3_free(sFinder
.aFirst
);
514 /************************************************************************/
517 ** The first time the bm25() function is called for a query, an instance
518 ** of the following structure is allocated and populated.
typedef struct Fts5Bm25Data Fts5Bm25Data;
struct Fts5Bm25Data {
  int nPhrase;                    /* Number of phrases in query */
  double avgdl;                   /* Average number of tokens in each row */
  double *aIDF;                   /* IDF for each phrase; fts5Bm25GetData()
                                  ** points this at the space immediately
                                  ** following the structure */
  double *aFreq;                  /* Array used to calculate phrase freq.;
                                  ** set to &aIDF[nPhrase] */
};
529 ** Callback used by fts5Bm25GetData() to count the number of rows in the
530 ** table matched by each individual phrase within the query.
532 static int fts5CountCb(
533 const Fts5ExtensionApi
*pApi
,
535 void *pUserData
/* Pointer to sqlite3_int64 variable */
537 sqlite3_int64
*pn
= (sqlite3_int64
*)pUserData
;
538 UNUSED_PARAM2(pApi
, pFts
);
544 ** Set *ppData to point to the Fts5Bm25Data object for the current query.
545 ** If the object has not already been allocated, allocate and populate it
548 static int fts5Bm25GetData(
549 const Fts5ExtensionApi
*pApi
,
551 Fts5Bm25Data
**ppData
/* OUT: bm25-data object for this query */
553 int rc
= SQLITE_OK
; /* Return code */
554 Fts5Bm25Data
*p
; /* Object to return */
556 p
= pApi
->xGetAuxdata(pFts
, 0);
558 int nPhrase
; /* Number of phrases in query */
559 sqlite3_int64 nRow
= 0; /* Number of rows in table */
560 sqlite3_int64 nToken
= 0; /* Number of tokens in table */
561 int nByte
; /* Bytes of space to allocate */
564 /* Allocate the Fts5Bm25Data object */
565 nPhrase
= pApi
->xPhraseCount(pFts
);
566 nByte
= sizeof(Fts5Bm25Data
) + nPhrase
*2*sizeof(double);
567 p
= (Fts5Bm25Data
*)sqlite3_malloc(nByte
);
572 p
->nPhrase
= nPhrase
;
573 p
->aIDF
= (double*)&p
[1];
574 p
->aFreq
= &p
->aIDF
[nPhrase
];
577 /* Calculate the average document length for this FTS5 table */
578 if( rc
==SQLITE_OK
) rc
= pApi
->xRowCount(pFts
, &nRow
);
579 if( rc
==SQLITE_OK
) rc
= pApi
->xColumnTotalSize(pFts
, -1, &nToken
);
580 if( rc
==SQLITE_OK
) p
->avgdl
= (double)nToken
/ (double)nRow
;
582 /* Calculate an IDF for each phrase in the query */
583 for(i
=0; rc
==SQLITE_OK
&& i
<nPhrase
; i
++){
584 sqlite3_int64 nHit
= 0;
585 rc
= pApi
->xQueryPhrase(pFts
, i
, (void*)&nHit
, fts5CountCb
);
587 /* Calculate the IDF (Inverse Document Frequency) for phrase i.
588 ** This is done using the standard BM25 formula as found on wikipedia:
590 ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
592 ** where "N" is the total number of documents in the set and nHit
593 ** is the number that contain at least one instance of the phrase
594 ** under consideration.
596 ** The problem with this is that if (N < 2*nHit), the IDF is
597 ** negative, which is undesirable. So the minimum allowable IDF is
598 ** (1e-6) - roughly the same as a term that appears in just over
599 ** half of a set of 5,000,000 documents. */
600 double idf
= log( (nRow
- nHit
+ 0.5) / (nHit
+ 0.5) );
601 if( idf
<=0.0 ) idf
= 1e-6;
609 rc
= pApi
->xSetAuxdata(pFts
, p
, sqlite3_free
);
611 if( rc
!=SQLITE_OK
) p
= 0;
618 ** Implementation of bm25() function.
620 static void fts5Bm25Function(
621 const Fts5ExtensionApi
*pApi
, /* API offered by current FTS version */
622 Fts5Context
*pFts
, /* First arg to pass to pApi functions */
623 sqlite3_context
*pCtx
, /* Context for returning result/error */
624 int nVal
, /* Number of values in apVal[] array */
625 sqlite3_value
**apVal
/* Array of trailing arguments */
627 const double k1
= 1.2; /* Constant "k1" from BM25 formula */
628 const double b
= 0.75; /* Constant "b" from BM25 formula */
629 int rc
= SQLITE_OK
; /* Error code */
630 double score
= 0.0; /* SQL function return value */
631 Fts5Bm25Data
*pData
; /* Values allocated/calculated once only */
632 int i
; /* Iterator variable */
633 int nInst
= 0; /* Value returned by xInstCount() */
634 double D
= 0.0; /* Total number of tokens in row */
635 double *aFreq
= 0; /* Array of phrase freq. for current row */
637 /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
638 ** for each phrase in the query for the current row. */
639 rc
= fts5Bm25GetData(pApi
, pFts
, &pData
);
641 aFreq
= pData
->aFreq
;
642 memset(aFreq
, 0, sizeof(double) * pData
->nPhrase
);
643 rc
= pApi
->xInstCount(pFts
, &nInst
);
645 for(i
=0; rc
==SQLITE_OK
&& i
<nInst
; i
++){
646 int ip
; int ic
; int io
;
647 rc
= pApi
->xInst(pFts
, i
, &ip
, &ic
, &io
);
649 double w
= (nVal
> ic
) ? sqlite3_value_double(apVal
[ic
]) : 1.0;
654 /* Figure out the total size of the current row in tokens. */
657 rc
= pApi
->xColumnSize(pFts
, -1, &nTok
);
661 /* Determine the BM25 score for the current row. */
662 for(i
=0; rc
==SQLITE_OK
&& i
<pData
->nPhrase
; i
++){
663 score
+= pData
->aIDF
[i
] * (
664 ( aFreq
[i
] * (k1
+ 1.0) ) /
665 ( aFreq
[i
] + k1
* (1 - b
+ b
* D
/ pData
->avgdl
) )
669 /* If no error has occurred, return the calculated score. Otherwise,
670 ** throw an SQL exception. */
672 sqlite3_result_double(pCtx
, -1.0 * score
);
674 sqlite3_result_error_code(pCtx
, rc
);
678 int sqlite3Fts5AuxInit(fts5_api
*pApi
){
680 const char *zFunc
; /* Function name (nul-terminated) */
681 void *pUserData
; /* User-data pointer */
682 fts5_extension_function xFunc
;/* Callback function */
683 void (*xDestroy
)(void*); /* Destructor function */
685 { "snippet", 0, fts5SnippetFunction
, 0 },
686 { "highlight", 0, fts5HighlightFunction
, 0 },
687 { "bm25", 0, fts5Bm25Function
, 0 },
689 int rc
= SQLITE_OK
; /* Return code */
690 int i
; /* To iterate through builtin functions */
692 for(i
=0; rc
==SQLITE_OK
&& i
<ArraySize(aBuiltin
); i
++){
693 rc
= pApi
->xCreateFunction(pApi
,
695 aBuiltin
[i
].pUserData
,