Slight doc touchup for [af41a1e6fc8b36e9bf65] based on feedback. No code changes.
[sqlite.git] / ext / fts5 / fts5_aux.c
blob30101fbe25a597164243cbb4c171f73a8a89ec1c
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
15 #include "fts5Int.h"
16 #include <math.h> /* amalgamator: keep */
19 ** Object used to iterate through all "coalesced phrase instances" in
20 ** a single column of the current row. If the phrase instances in the
21 ** column being considered do not overlap, this object simply iterates
22 ** through them. Or, if they do overlap (share one or more tokens in
23 ** common), each set of overlapping instances is treated as a single
24 ** match. See documentation for the highlight() auxiliary function for
25 ** details.
27 ** Usage is:
29 ** for(rc = fts5CInstIterNext(pApi, pFts, iCol, &iter);
30 ** (rc==SQLITE_OK && 0==fts5CInstIterEof(&iter);
31 ** rc = fts5CInstIterNext(&iter)
32 ** ){
33 ** printf("instance starts at %d, ends at %d\n", iter.iStart, iter.iEnd);
34 ** }
37 typedef struct CInstIter CInstIter;
38 struct CInstIter {
39 const Fts5ExtensionApi *pApi; /* API offered by current FTS version */
40 Fts5Context *pFts; /* First arg to pass to pApi functions */
41 int iCol; /* Column to search */
42 int iInst; /* Next phrase instance index */
43 int nInst; /* Total number of phrase instances */
45 /* Output variables */
46 int iStart; /* First token in coalesced phrase instance */
47 int iEnd; /* Last token in coalesced phrase instance */
51 ** Advance the iterator to the next coalesced phrase instance. Return
52 ** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
54 static int fts5CInstIterNext(CInstIter *pIter){
55 int rc = SQLITE_OK;
56 pIter->iStart = -1;
57 pIter->iEnd = -1;
59 while( rc==SQLITE_OK && pIter->iInst<pIter->nInst ){
60 int ip; int ic; int io;
61 rc = pIter->pApi->xInst(pIter->pFts, pIter->iInst, &ip, &ic, &io);
62 if( rc==SQLITE_OK ){
63 if( ic==pIter->iCol ){
64 int iEnd = io - 1 + pIter->pApi->xPhraseSize(pIter->pFts, ip);
65 if( pIter->iStart<0 ){
66 pIter->iStart = io;
67 pIter->iEnd = iEnd;
68 }else if( io<=pIter->iEnd ){
69 if( iEnd>pIter->iEnd ) pIter->iEnd = iEnd;
70 }else{
71 break;
74 pIter->iInst++;
78 return rc;
82 ** Initialize the iterator object indicated by the final parameter to
83 ** iterate through coalesced phrase instances in column iCol.
85 static int fts5CInstIterInit(
86 const Fts5ExtensionApi *pApi,
87 Fts5Context *pFts,
88 int iCol,
89 CInstIter *pIter
91 int rc;
93 memset(pIter, 0, sizeof(CInstIter));
94 pIter->pApi = pApi;
95 pIter->pFts = pFts;
96 pIter->iCol = iCol;
97 rc = pApi->xInstCount(pFts, &pIter->nInst);
99 if( rc==SQLITE_OK ){
100 rc = fts5CInstIterNext(pIter);
103 return rc;
108 /*************************************************************************
109 ** Start of highlight() implementation.
111 typedef struct HighlightContext HighlightContext;
112 struct HighlightContext {
113 /* Constant parameters to fts5HighlightCb() */
114 int iRangeStart; /* First token to include */
115 int iRangeEnd; /* If non-zero, last token to include */
116 const char *zOpen; /* Opening highlight */
117 const char *zClose; /* Closing highlight */
118 const char *zIn; /* Input text */
119 int nIn; /* Size of input text in bytes */
121 /* Variables modified by fts5HighlightCb() */
122 CInstIter iter; /* Coalesced Instance Iterator */
123 int iPos; /* Current token offset in zIn[] */
124 int iOff; /* Have copied up to this offset in zIn[] */
125 int bOpen; /* True if highlight is open */
126 char *zOut; /* Output value */
130 ** Append text to the HighlightContext output string - p->zOut. Argument
131 ** z points to a buffer containing n bytes of text to append. If n is
132 ** negative, everything up until the first '\0' is appended to the output.
134 ** If *pRc is set to any value other than SQLITE_OK when this function is
135 ** called, it is a no-op. If an error (i.e. an OOM condition) is encountered,
136 ** *pRc is set to an error code before returning.
138 static void fts5HighlightAppend(
139 int *pRc,
140 HighlightContext *p,
141 const char *z, int n
143 if( *pRc==SQLITE_OK && z ){
144 if( n<0 ) n = (int)strlen(z);
145 p->zOut = sqlite3_mprintf("%z%.*s", p->zOut, n, z);
146 if( p->zOut==0 ) *pRc = SQLITE_NOMEM;
151 ** Tokenizer callback used by implementation of highlight() function.
153 static int fts5HighlightCb(
154 void *pContext, /* Pointer to HighlightContext object */
155 int tflags, /* Mask of FTS5_TOKEN_* flags */
156 const char *pToken, /* Buffer containing token */
157 int nToken, /* Size of token in bytes */
158 int iStartOff, /* Start byte offset of token */
159 int iEndOff /* End byte offset of token */
161 HighlightContext *p = (HighlightContext*)pContext;
162 int rc = SQLITE_OK;
163 int iPos;
165 UNUSED_PARAM2(pToken, nToken);
167 if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
168 iPos = p->iPos++;
170 if( p->iRangeEnd>=0 ){
171 if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
172 if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
175 /* If the parenthesis is open, and this token is not part of the current
176 ** phrase, and the starting byte offset of this token is past the point
177 ** that has currently been copied into the output buffer, close the
178 ** parenthesis. */
179 if( p->bOpen
180 && (iPos<=p->iter.iStart || p->iter.iStart<0)
181 && iStartOff>p->iOff
183 fts5HighlightAppend(&rc, p, p->zClose, -1);
184 p->bOpen = 0;
187 /* If this is the start of a new phrase, and the highlight is not open:
189 ** * copy text from the input up to the start of the phrase, and
190 ** * open the highlight.
192 if( iPos==p->iter.iStart && p->bOpen==0 ){
193 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iStartOff - p->iOff);
194 fts5HighlightAppend(&rc, p, p->zOpen, -1);
195 p->iOff = iStartOff;
196 p->bOpen = 1;
199 if( iPos==p->iter.iEnd ){
200 if( p->bOpen==0 ){
201 assert( p->iRangeEnd>=0 );
202 fts5HighlightAppend(&rc, p, p->zOpen, -1);
203 p->bOpen = 1;
205 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
206 p->iOff = iEndOff;
208 if( rc==SQLITE_OK ){
209 rc = fts5CInstIterNext(&p->iter);
213 if( iPos==p->iRangeEnd ){
214 if( p->bOpen ){
215 if( p->iter.iStart>=0 && iPos>=p->iter.iStart ){
216 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
217 p->iOff = iEndOff;
219 fts5HighlightAppend(&rc, p, p->zClose, -1);
220 p->bOpen = 0;
222 fts5HighlightAppend(&rc, p, &p->zIn[p->iOff], iEndOff - p->iOff);
223 p->iOff = iEndOff;
226 return rc;
230 ** Implementation of highlight() function.
232 static void fts5HighlightFunction(
233 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
234 Fts5Context *pFts, /* First arg to pass to pApi functions */
235 sqlite3_context *pCtx, /* Context for returning result/error */
236 int nVal, /* Number of values in apVal[] array */
237 sqlite3_value **apVal /* Array of trailing arguments */
239 HighlightContext ctx;
240 int rc;
241 int iCol;
243 if( nVal!=3 ){
244 const char *zErr = "wrong number of arguments to function highlight()";
245 sqlite3_result_error(pCtx, zErr, -1);
246 return;
249 iCol = sqlite3_value_int(apVal[0]);
250 memset(&ctx, 0, sizeof(HighlightContext));
251 ctx.zOpen = (const char*)sqlite3_value_text(apVal[1]);
252 ctx.zClose = (const char*)sqlite3_value_text(apVal[2]);
253 ctx.iRangeEnd = -1;
254 rc = pApi->xColumnText(pFts, iCol, &ctx.zIn, &ctx.nIn);
255 if( rc==SQLITE_RANGE ){
256 sqlite3_result_text(pCtx, "", -1, SQLITE_STATIC);
257 rc = SQLITE_OK;
258 }else if( ctx.zIn ){
259 if( rc==SQLITE_OK ){
260 rc = fts5CInstIterInit(pApi, pFts, iCol, &ctx.iter);
263 if( rc==SQLITE_OK ){
264 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
266 if( ctx.bOpen ){
267 fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
269 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
271 if( rc==SQLITE_OK ){
272 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
274 sqlite3_free(ctx.zOut);
276 if( rc!=SQLITE_OK ){
277 sqlite3_result_error_code(pCtx, rc);
281 ** End of highlight() implementation.
282 **************************************************************************/
285 ** Context object passed to the fts5SentenceFinderCb() function.
287 typedef struct Fts5SFinder Fts5SFinder;
288 struct Fts5SFinder {
289 int iPos; /* Current token position */
290 int nFirstAlloc; /* Allocated size of aFirst[] */
291 int nFirst; /* Number of entries in aFirst[] */
292 int *aFirst; /* Array of first token in each sentence */
293 const char *zDoc; /* Document being tokenized */
297 ** Add an entry to the Fts5SFinder.aFirst[] array. Grow the array if
298 ** necessary. Return SQLITE_OK if successful, or SQLITE_NOMEM if an
299 ** error occurs.
301 static int fts5SentenceFinderAdd(Fts5SFinder *p, int iAdd){
302 if( p->nFirstAlloc==p->nFirst ){
303 int nNew = p->nFirstAlloc ? p->nFirstAlloc*2 : 64;
304 int *aNew;
306 aNew = (int*)sqlite3_realloc64(p->aFirst, nNew*sizeof(int));
307 if( aNew==0 ) return SQLITE_NOMEM;
308 p->aFirst = aNew;
309 p->nFirstAlloc = nNew;
311 p->aFirst[p->nFirst++] = iAdd;
312 return SQLITE_OK;
316 ** This function is an xTokenize() callback used by the auxiliary snippet()
317 ** function. Its job is to identify tokens that are the first in a sentence.
318 ** For each such token, an entry is added to the SFinder.aFirst[] array.
320 static int fts5SentenceFinderCb(
321 void *pContext, /* Pointer to HighlightContext object */
322 int tflags, /* Mask of FTS5_TOKEN_* flags */
323 const char *pToken, /* Buffer containing token */
324 int nToken, /* Size of token in bytes */
325 int iStartOff, /* Start offset of token */
326 int iEndOff /* End offset of token */
328 int rc = SQLITE_OK;
330 UNUSED_PARAM2(pToken, nToken);
331 UNUSED_PARAM(iEndOff);
333 if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
334 Fts5SFinder *p = (Fts5SFinder*)pContext;
335 if( p->iPos>0 ){
336 int i;
337 char c = 0;
338 for(i=iStartOff-1; i>=0; i--){
339 c = p->zDoc[i];
340 if( c!=' ' && c!='\t' && c!='\n' && c!='\r' ) break;
342 if( i!=iStartOff-1 && (c=='.' || c==':') ){
343 rc = fts5SentenceFinderAdd(p, p->iPos);
345 }else{
346 rc = fts5SentenceFinderAdd(p, 0);
348 p->iPos++;
350 return rc;
353 static int fts5SnippetScore(
354 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
355 Fts5Context *pFts, /* First arg to pass to pApi functions */
356 int nDocsize, /* Size of column in tokens */
357 unsigned char *aSeen, /* Array with one element per query phrase */
358 int iCol, /* Column to score */
359 int iPos, /* Starting offset to score */
360 int nToken, /* Max tokens per snippet */
361 int *pnScore, /* OUT: Score */
362 int *piPos /* OUT: Adjusted offset */
364 int rc;
365 int i;
366 int ip = 0;
367 int ic = 0;
368 int iOff = 0;
369 int iFirst = -1;
370 int nInst;
371 int nScore = 0;
372 int iLast = 0;
373 sqlite3_int64 iEnd = (sqlite3_int64)iPos + nToken;
375 rc = pApi->xInstCount(pFts, &nInst);
376 for(i=0; i<nInst && rc==SQLITE_OK; i++){
377 rc = pApi->xInst(pFts, i, &ip, &ic, &iOff);
378 if( rc==SQLITE_OK && ic==iCol && iOff>=iPos && iOff<iEnd ){
379 nScore += (aSeen[ip] ? 1 : 1000);
380 aSeen[ip] = 1;
381 if( iFirst<0 ) iFirst = iOff;
382 iLast = iOff + pApi->xPhraseSize(pFts, ip);
386 *pnScore = nScore;
387 if( piPos ){
388 sqlite3_int64 iAdj = iFirst - (nToken - (iLast-iFirst)) / 2;
389 if( (iAdj+nToken)>nDocsize ) iAdj = nDocsize - nToken;
390 if( iAdj<0 ) iAdj = 0;
391 *piPos = (int)iAdj;
394 return rc;
398 ** Return the value in pVal interpreted as utf-8 text. Except, if pVal
399 ** contains a NULL value, return a pointer to a static string zero
400 ** bytes in length instead of a NULL pointer.
402 static const char *fts5ValueToText(sqlite3_value *pVal){
403 const char *zRet = (const char*)sqlite3_value_text(pVal);
404 return zRet ? zRet : "";
408 ** Implementation of snippet() function.
410 static void fts5SnippetFunction(
411 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
412 Fts5Context *pFts, /* First arg to pass to pApi functions */
413 sqlite3_context *pCtx, /* Context for returning result/error */
414 int nVal, /* Number of values in apVal[] array */
415 sqlite3_value **apVal /* Array of trailing arguments */
417 HighlightContext ctx;
418 int rc = SQLITE_OK; /* Return code */
419 int iCol; /* 1st argument to snippet() */
420 const char *zEllips; /* 4th argument to snippet() */
421 int nToken; /* 5th argument to snippet() */
422 int nInst = 0; /* Number of instance matches this row */
423 int i; /* Used to iterate through instances */
424 int nPhrase; /* Number of phrases in query */
425 unsigned char *aSeen; /* Array of "seen instance" flags */
426 int iBestCol; /* Column containing best snippet */
427 int iBestStart = 0; /* First token of best snippet */
428 int nBestScore = 0; /* Score of best snippet */
429 int nColSize = 0; /* Total size of iBestCol in tokens */
430 Fts5SFinder sFinder; /* Used to find the beginnings of sentences */
431 int nCol;
433 if( nVal!=5 ){
434 const char *zErr = "wrong number of arguments to function snippet()";
435 sqlite3_result_error(pCtx, zErr, -1);
436 return;
439 nCol = pApi->xColumnCount(pFts);
440 memset(&ctx, 0, sizeof(HighlightContext));
441 iCol = sqlite3_value_int(apVal[0]);
442 ctx.zOpen = fts5ValueToText(apVal[1]);
443 ctx.zClose = fts5ValueToText(apVal[2]);
444 ctx.iRangeEnd = -1;
445 zEllips = fts5ValueToText(apVal[3]);
446 nToken = sqlite3_value_int(apVal[4]);
448 iBestCol = (iCol>=0 ? iCol : 0);
449 nPhrase = pApi->xPhraseCount(pFts);
450 aSeen = sqlite3_malloc(nPhrase);
451 if( aSeen==0 ){
452 rc = SQLITE_NOMEM;
454 if( rc==SQLITE_OK ){
455 rc = pApi->xInstCount(pFts, &nInst);
458 memset(&sFinder, 0, sizeof(Fts5SFinder));
459 for(i=0; i<nCol; i++){
460 if( iCol<0 || iCol==i ){
461 int nDoc;
462 int nDocsize;
463 int ii;
464 sFinder.iPos = 0;
465 sFinder.nFirst = 0;
466 rc = pApi->xColumnText(pFts, i, &sFinder.zDoc, &nDoc);
467 if( rc!=SQLITE_OK ) break;
468 rc = pApi->xTokenize(pFts,
469 sFinder.zDoc, nDoc, (void*)&sFinder,fts5SentenceFinderCb
471 if( rc!=SQLITE_OK ) break;
472 rc = pApi->xColumnSize(pFts, i, &nDocsize);
473 if( rc!=SQLITE_OK ) break;
475 for(ii=0; rc==SQLITE_OK && ii<nInst; ii++){
476 int ip, ic, io;
477 int iAdj;
478 int nScore;
479 int jj;
481 rc = pApi->xInst(pFts, ii, &ip, &ic, &io);
482 if( ic!=i ) continue;
483 if( io>nDocsize ) rc = FTS5_CORRUPT;
484 if( rc!=SQLITE_OK ) continue;
485 memset(aSeen, 0, nPhrase);
486 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
487 io, nToken, &nScore, &iAdj
489 if( rc==SQLITE_OK && nScore>nBestScore ){
490 nBestScore = nScore;
491 iBestCol = i;
492 iBestStart = iAdj;
493 nColSize = nDocsize;
496 if( rc==SQLITE_OK && sFinder.nFirst && nDocsize>nToken ){
497 for(jj=0; jj<(sFinder.nFirst-1); jj++){
498 if( sFinder.aFirst[jj+1]>io ) break;
501 if( sFinder.aFirst[jj]<io ){
502 memset(aSeen, 0, nPhrase);
503 rc = fts5SnippetScore(pApi, pFts, nDocsize, aSeen, i,
504 sFinder.aFirst[jj], nToken, &nScore, 0
507 nScore += (sFinder.aFirst[jj]==0 ? 120 : 100);
508 if( rc==SQLITE_OK && nScore>nBestScore ){
509 nBestScore = nScore;
510 iBestCol = i;
511 iBestStart = sFinder.aFirst[jj];
512 nColSize = nDocsize;
520 if( rc==SQLITE_OK ){
521 rc = pApi->xColumnText(pFts, iBestCol, &ctx.zIn, &ctx.nIn);
523 if( rc==SQLITE_OK && nColSize==0 ){
524 rc = pApi->xColumnSize(pFts, iBestCol, &nColSize);
526 if( ctx.zIn ){
527 if( rc==SQLITE_OK ){
528 rc = fts5CInstIterInit(pApi, pFts, iBestCol, &ctx.iter);
531 ctx.iRangeStart = iBestStart;
532 ctx.iRangeEnd = iBestStart + nToken - 1;
534 if( iBestStart>0 ){
535 fts5HighlightAppend(&rc, &ctx, zEllips, -1);
538 /* Advance iterator ctx.iter so that it points to the first coalesced
539 ** phrase instance at or following position iBestStart. */
540 while( ctx.iter.iStart>=0 && ctx.iter.iStart<iBestStart && rc==SQLITE_OK ){
541 rc = fts5CInstIterNext(&ctx.iter);
544 if( rc==SQLITE_OK ){
545 rc = pApi->xTokenize(pFts, ctx.zIn, ctx.nIn, (void*)&ctx,fts5HighlightCb);
547 if( ctx.bOpen ){
548 fts5HighlightAppend(&rc, &ctx, ctx.zClose, -1);
550 if( ctx.iRangeEnd>=(nColSize-1) ){
551 fts5HighlightAppend(&rc, &ctx, &ctx.zIn[ctx.iOff], ctx.nIn - ctx.iOff);
552 }else{
553 fts5HighlightAppend(&rc, &ctx, zEllips, -1);
556 if( rc==SQLITE_OK ){
557 sqlite3_result_text(pCtx, (const char*)ctx.zOut, -1, SQLITE_TRANSIENT);
558 }else{
559 sqlite3_result_error_code(pCtx, rc);
561 sqlite3_free(ctx.zOut);
562 sqlite3_free(aSeen);
563 sqlite3_free(sFinder.aFirst);
566 /************************************************************************/
569 ** The first time the bm25() function is called for a query, an instance
570 ** of the following structure is allocated and populated.
572 typedef struct Fts5Bm25Data Fts5Bm25Data;
573 struct Fts5Bm25Data {
574 int nPhrase; /* Number of phrases in query */
575 double avgdl; /* Average number of tokens in each row */
576 double *aIDF; /* IDF for each phrase */
577 double *aFreq; /* Array used to calculate phrase freq. */
581 ** Callback used by fts5Bm25GetData() to count the number of rows in the
582 ** table matched by each individual phrase within the query.
584 static int fts5CountCb(
585 const Fts5ExtensionApi *pApi,
586 Fts5Context *pFts,
587 void *pUserData /* Pointer to sqlite3_int64 variable */
589 sqlite3_int64 *pn = (sqlite3_int64*)pUserData;
590 UNUSED_PARAM2(pApi, pFts);
591 (*pn)++;
592 return SQLITE_OK;
596 ** Set *ppData to point to the Fts5Bm25Data object for the current query.
597 ** If the object has not already been allocated, allocate and populate it
598 ** now.
600 static int fts5Bm25GetData(
601 const Fts5ExtensionApi *pApi,
602 Fts5Context *pFts,
603 Fts5Bm25Data **ppData /* OUT: bm25-data object for this query */
605 int rc = SQLITE_OK; /* Return code */
606 Fts5Bm25Data *p; /* Object to return */
608 p = (Fts5Bm25Data*)pApi->xGetAuxdata(pFts, 0);
609 if( p==0 ){
610 int nPhrase; /* Number of phrases in query */
611 sqlite3_int64 nRow = 0; /* Number of rows in table */
612 sqlite3_int64 nToken = 0; /* Number of tokens in table */
613 sqlite3_int64 nByte; /* Bytes of space to allocate */
614 int i;
616 /* Allocate the Fts5Bm25Data object */
617 nPhrase = pApi->xPhraseCount(pFts);
618 nByte = sizeof(Fts5Bm25Data) + nPhrase*2*sizeof(double);
619 p = (Fts5Bm25Data*)sqlite3_malloc64(nByte);
620 if( p==0 ){
621 rc = SQLITE_NOMEM;
622 }else{
623 memset(p, 0, (size_t)nByte);
624 p->nPhrase = nPhrase;
625 p->aIDF = (double*)&p[1];
626 p->aFreq = &p->aIDF[nPhrase];
629 /* Calculate the average document length for this FTS5 table */
630 if( rc==SQLITE_OK ) rc = pApi->xRowCount(pFts, &nRow);
631 assert( rc!=SQLITE_OK || nRow>0 );
632 if( rc==SQLITE_OK ) rc = pApi->xColumnTotalSize(pFts, -1, &nToken);
633 if( rc==SQLITE_OK ) p->avgdl = (double)nToken / (double)nRow;
635 /* Calculate an IDF for each phrase in the query */
636 for(i=0; rc==SQLITE_OK && i<nPhrase; i++){
637 sqlite3_int64 nHit = 0;
638 rc = pApi->xQueryPhrase(pFts, i, (void*)&nHit, fts5CountCb);
639 if( rc==SQLITE_OK ){
640 /* Calculate the IDF (Inverse Document Frequency) for phrase i.
641 ** This is done using the standard BM25 formula as found on wikipedia:
643 ** IDF = log( (N - nHit + 0.5) / (nHit + 0.5) )
645 ** where "N" is the total number of documents in the set and nHit
646 ** is the number that contain at least one instance of the phrase
647 ** under consideration.
649 ** The problem with this is that if (N < 2*nHit), the IDF is
650 ** negative. Which is undesirable. So the mimimum allowable IDF is
651 ** (1e-6) - roughly the same as a term that appears in just over
652 ** half of set of 5,000,000 documents. */
653 double idf = log( (nRow - nHit + 0.5) / (nHit + 0.5) );
654 if( idf<=0.0 ) idf = 1e-6;
655 p->aIDF[i] = idf;
659 if( rc!=SQLITE_OK ){
660 sqlite3_free(p);
661 }else{
662 rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
664 if( rc!=SQLITE_OK ) p = 0;
666 *ppData = p;
667 return rc;
671 ** Implementation of bm25() function.
673 static void fts5Bm25Function(
674 const Fts5ExtensionApi *pApi, /* API offered by current FTS version */
675 Fts5Context *pFts, /* First arg to pass to pApi functions */
676 sqlite3_context *pCtx, /* Context for returning result/error */
677 int nVal, /* Number of values in apVal[] array */
678 sqlite3_value **apVal /* Array of trailing arguments */
680 const double k1 = 1.2; /* Constant "k1" from BM25 formula */
681 const double b = 0.75; /* Constant "b" from BM25 formula */
682 int rc; /* Error code */
683 double score = 0.0; /* SQL function return value */
684 Fts5Bm25Data *pData; /* Values allocated/calculated once only */
685 int i; /* Iterator variable */
686 int nInst = 0; /* Value returned by xInstCount() */
687 double D = 0.0; /* Total number of tokens in row */
688 double *aFreq = 0; /* Array of phrase freq. for current row */
690 /* Calculate the phrase frequency (symbol "f(qi,D)" in the documentation)
691 ** for each phrase in the query for the current row. */
692 rc = fts5Bm25GetData(pApi, pFts, &pData);
693 if( rc==SQLITE_OK ){
694 aFreq = pData->aFreq;
695 memset(aFreq, 0, sizeof(double) * pData->nPhrase);
696 rc = pApi->xInstCount(pFts, &nInst);
698 for(i=0; rc==SQLITE_OK && i<nInst; i++){
699 int ip; int ic; int io;
700 rc = pApi->xInst(pFts, i, &ip, &ic, &io);
701 if( rc==SQLITE_OK ){
702 double w = (nVal > ic) ? sqlite3_value_double(apVal[ic]) : 1.0;
703 aFreq[ip] += w;
707 /* Figure out the total size of the current row in tokens. */
708 if( rc==SQLITE_OK ){
709 int nTok;
710 rc = pApi->xColumnSize(pFts, -1, &nTok);
711 D = (double)nTok;
714 /* Determine and return the BM25 score for the current row. Or, if an
715 ** error has occurred, throw an exception. */
716 if( rc==SQLITE_OK ){
717 for(i=0; i<pData->nPhrase; i++){
718 score += pData->aIDF[i] * (
719 ( aFreq[i] * (k1 + 1.0) ) /
720 ( aFreq[i] + k1 * (1 - b + b * D / pData->avgdl) )
723 sqlite3_result_double(pCtx, -1.0 * score);
724 }else{
725 sqlite3_result_error_code(pCtx, rc);
729 int sqlite3Fts5AuxInit(fts5_api *pApi){
730 struct Builtin {
731 const char *zFunc; /* Function name (nul-terminated) */
732 void *pUserData; /* User-data pointer */
733 fts5_extension_function xFunc;/* Callback function */
734 void (*xDestroy)(void*); /* Destructor function */
735 } aBuiltin [] = {
736 { "snippet", 0, fts5SnippetFunction, 0 },
737 { "highlight", 0, fts5HighlightFunction, 0 },
738 { "bm25", 0, fts5Bm25Function, 0 },
740 int rc = SQLITE_OK; /* Return code */
741 int i; /* To iterate through builtin functions */
743 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
744 rc = pApi->xCreateFunction(pApi,
745 aBuiltin[i].zFunc,
746 aBuiltin[i].pUserData,
747 aBuiltin[i].xFunc,
748 aBuiltin[i].xDestroy
752 return rc;