ext/fts3/fts3_test.c

   1 /*
   2 ** 2011 Jun 13
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 **
  13 ** This file is not part of the production FTS code. It is only used for
  14 ** testing. It contains a Tcl command that can be used to test if a document
  15 ** matches an FTS NEAR expression.
  16 **
  17 ** As of March 2012, it also contains a version 1 tokenizer used for testing
  18 ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
  19 */
  20
  21 #if defined(INCLUDE_SQLITE_TCL_H)
  22 #  include "sqlite_tcl.h"
  23 #else
  24 #  include "tcl.h"
  25 #  ifndef SQLITE_TCLAPI
  26 #    define SQLITE_TCLAPI
  27 #  endif
  28 #endif
  29 #include <string.h>
  30 #include <assert.h>
  31
  32 #if defined(SQLITE_TEST)
  33 #if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)
  34
  35 /* Required so that the "ifdef SQLITE_ENABLE_FTS3" below works */
  36 #include "fts3Int.h"
  37
  38 #define NM_MAX_TOKEN 12
  39
  40 typedef struct NearPhrase NearPhrase;
  41 typedef struct NearDocument NearDocument;
  42 typedef struct NearToken NearToken;
  43
  44 struct NearDocument {
  45   int nToken;                     /* Length of token in bytes */
  46   NearToken *aToken;              /* Token array */
  47 };
  48
  49 struct NearToken {
  50   int n;                          /* Length of token in bytes */
  51   const char *z;                  /* Pointer to token string */
  52 };
  53
  54 struct NearPhrase {
  55   int nNear;                      /* Preceding NEAR value */
  56   int nToken;                     /* Number of tokens in this phrase */
  57   NearToken aToken[NM_MAX_TOKEN]; /* Array of tokens in this phrase */
  58 };
  59
  60 static int nm_phrase_match(
  61   NearPhrase *p,
  62   NearToken *aToken
  63 ){
  64   int ii;
  65
  66   for(ii=0; ii<p->nToken; ii++){
  67     NearToken *pToken = &p->aToken[ii];
  68     if( pToken->n>0 && pToken->z[pToken->n-1]=='*' ){
  69       if( aToken[ii].n<(pToken->n-1) ) return 0;
  70       if( memcmp(aToken[ii].z, pToken->z, pToken->n-1) ) return 0;
  71     }else{
  72       if( aToken[ii].n!=pToken->n ) return 0;
  73       if( memcmp(aToken[ii].z, pToken->z, pToken->n) ) return 0;
  74     }
  75   }
  76
  77   return 1;
  78 }
  79
  80 static int nm_near_chain(
  81   int iDir,                       /* Direction to iterate through aPhrase[] */
  82   NearDocument *pDoc,             /* Document to match against */
  83   int iPos,                       /* Position at which iPhrase was found */
  84   int nPhrase,                    /* Size of phrase array */
  85   NearPhrase *aPhrase,            /* Phrase array */
  86   int iPhrase                     /* Index of phrase found */
  87 ){
  88   int iStart;
  89   int iStop;
  90   int ii;
  91   int nNear;
  92   int iPhrase2;
  93   NearPhrase *p;
  94   NearPhrase *pPrev;
  95
  96   assert( iDir==1 || iDir==-1 );
  97
  98   if( iDir==1 ){
  99     if( (iPhrase+1)==nPhrase ) return 1;
 100     nNear = aPhrase[iPhrase+1].nNear;
 101   }else{
 102     if( iPhrase==0 ) return 1;
 103     nNear = aPhrase[iPhrase].nNear;
 104   }
 105   pPrev = &aPhrase[iPhrase];
 106   iPhrase2 = iPhrase+iDir;
 107   p = &aPhrase[iPhrase2];
 108
 109   iStart = iPos - nNear - p->nToken;
 110   iStop = iPos + nNear + pPrev->nToken;
 111
 112   if( iStart<0 ) iStart = 0;
 113   if( iStop > pDoc->nToken - p->nToken ) iStop = pDoc->nToken - p->nToken;
 114
 115   for(ii=iStart; ii<=iStop; ii++){
 116     if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
 117       if( nm_near_chain(iDir, pDoc, ii, nPhrase, aPhrase, iPhrase2) ) return 1;
 118     }
 119   }
 120
 121   return 0;
 122 }
 123
 124 static int nm_match_count(
 125   NearDocument *pDoc,             /* Document to match against */
 126   int nPhrase,                    /* Size of phrase array */
 127   NearPhrase *aPhrase,            /* Phrase array */
 128   int iPhrase                     /* Index of phrase to count matches for */
 129 ){
 130   int nOcc = 0;
 131   int ii;
 132   NearPhrase *p = &aPhrase[iPhrase];
 133
 134   for(ii=0; ii<(pDoc->nToken + 1 - p->nToken); ii++){
 135     if( nm_phrase_match(p, &pDoc->aToken[ii]) ){
 136       /* Test forward NEAR chain (i>iPhrase) */
 137       if( 0==nm_near_chain(1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
 138
 139       /* Test reverse NEAR chain (i<iPhrase) */
 140       if( 0==nm_near_chain(-1, pDoc, ii, nPhrase, aPhrase, iPhrase) ) continue;
 141
 142       /* This is a real match. Increment the counter. */
 143       nOcc++;
 144     }
 145   }
 146
 147   return nOcc;
 148 }
 149
 150 /*
 151 ** Tclcmd: fts3_near_match DOCUMENT EXPR ?OPTIONS?
 152 */
 153 static int SQLITE_TCLAPI fts3_near_match_cmd(
 154   ClientData clientData,
 155   Tcl_Interp *interp,
 156   int objc,
 157   Tcl_Obj *CONST objv[]
 158 ){
 159   int nTotal = 0;
 160   int rc;
 161   int ii;
 162   int nPhrase;
 163   NearPhrase *aPhrase = 0;
 164   NearDocument doc = {0, 0};
 165   Tcl_Obj **apDocToken;
 166   Tcl_Obj *pRet;
 167   Tcl_Obj *pPhrasecount = 0;
 168
 169   Tcl_Obj **apExprToken;
 170   int nExprToken;
 171
 172   UNUSED_PARAMETER(clientData);
 173
 174   /* Must have 3 or more arguments. */
 175   if( objc<3 || (objc%2)==0 ){
 176     Tcl_WrongNumArgs(interp, 1, objv, "DOCUMENT EXPR ?OPTION VALUE?...");
 177     rc = TCL_ERROR;
 178     goto near_match_out;
 179   }
 180
 181   for(ii=3; ii<objc; ii+=2){
 182     enum NM_enum { NM_PHRASECOUNTS };
 183     struct TestnmSubcmd {
 184       char *zName;
 185       enum NM_enum eOpt;
 186     } aOpt[] = {
 187       { "-phrasecountvar", NM_PHRASECOUNTS },
 188       { 0, 0 }
 189     };
 190     int iOpt;
 191     if( Tcl_GetIndexFromObjStruct(
 192         interp, objv[ii], aOpt, sizeof(aOpt[0]), "option", 0, &iOpt)
 193     ){
 194       return TCL_ERROR;
 195     }
 196
 197     switch( aOpt[iOpt].eOpt ){
 198       case NM_PHRASECOUNTS:
 199         pPhrasecount = objv[ii+1];
 200         break;
 201     }
 202   }
 203
 204   rc = Tcl_ListObjGetElements(interp, objv[1], &doc.nToken, &apDocToken);
 205   if( rc!=TCL_OK ) goto near_match_out;
 206   doc.aToken = (NearToken *)ckalloc(doc.nToken*sizeof(NearToken));
 207   for(ii=0; ii<doc.nToken; ii++){
 208     doc.aToken[ii].z = Tcl_GetStringFromObj(apDocToken[ii], &doc.aToken[ii].n);
 209   }
 210
 211   rc = Tcl_ListObjGetElements(interp, objv[2], &nExprToken, &apExprToken);
 212   if( rc!=TCL_OK ) goto near_match_out;
 213
 214   nPhrase = (nExprToken + 1) / 2;
 215   aPhrase = (NearPhrase *)ckalloc(nPhrase * sizeof(NearPhrase));
 216   memset(aPhrase, 0, nPhrase * sizeof(NearPhrase));
 217   for(ii=0; ii<nPhrase; ii++){
 218     Tcl_Obj *pPhrase = apExprToken[ii*2];
 219     Tcl_Obj **apToken;
 220     int nToken;
 221     int jj;
 222
 223     rc = Tcl_ListObjGetElements(interp, pPhrase, &nToken, &apToken);
 224     if( rc!=TCL_OK ) goto near_match_out;
 225     if( nToken>NM_MAX_TOKEN ){
 226       Tcl_AppendResult(interp, "Too many tokens in phrase", 0);
 227       rc = TCL_ERROR;
 228       goto near_match_out;
 229     }
 230     for(jj=0; jj<nToken; jj++){
 231       NearToken *pT = &aPhrase[ii].aToken[jj];
 232       pT->z = Tcl_GetStringFromObj(apToken[jj], &pT->n);
 233     }
 234     aPhrase[ii].nToken = nToken;
 235   }
 236   for(ii=1; ii<nPhrase; ii++){
 237     Tcl_Obj *pNear = apExprToken[2*ii-1];
 238     int nNear;
 239     rc = Tcl_GetIntFromObj(interp, pNear, &nNear);
 240     if( rc!=TCL_OK ) goto near_match_out;
 241     aPhrase[ii].nNear = nNear;
 242   }
 243
 244   pRet = Tcl_NewObj();
 245   Tcl_IncrRefCount(pRet);
 246   for(ii=0; ii<nPhrase; ii++){
 247     int nOcc = nm_match_count(&doc, nPhrase, aPhrase, ii);
 248     Tcl_ListObjAppendElement(interp, pRet, Tcl_NewIntObj(nOcc));
 249     nTotal += nOcc;
 250   }
 251   if( pPhrasecount ){
 252     Tcl_ObjSetVar2(interp, pPhrasecount, 0, pRet, 0);
 253   }
 254   Tcl_DecrRefCount(pRet);
 255   Tcl_SetObjResult(interp, Tcl_NewBooleanObj(nTotal>0));
 256
 257  near_match_out:
 258   ckfree((char *)aPhrase);
 259   ckfree((char *)doc.aToken);
 260   return rc;
 261 }
 262
 263 /*
 264 **   Tclcmd: fts3_configure_incr_load ?CHUNKSIZE THRESHOLD?
 265 **
 266 ** Normally, FTS uses hard-coded values to determine the minimum doclist
 267 ** size eligible for incremental loading, and the size of the chunks loaded
 268 ** when a doclist is incrementally loaded. This command allows the built-in
 269 ** values to be overridden for testing purposes.
 270 **
 271 ** If present, the first argument is the chunksize in bytes to load doclists
 272 ** in. The second argument is the minimum doclist size in bytes to use
 273 ** incremental loading with.
 274 **
 275 ** Whether or not the arguments are present, this command returns a list of
 276 ** two integers - the initial chunksize and threshold when the command is
 277 ** invoked. This can be used to restore the default behavior after running
 278 ** tests. For example:
 279 **
 280 **    # Override incr-load settings for testing:
 281 **    set cfg [fts3_configure_incr_load $new_chunksize $new_threshold]
 282 **
 283 **    .... run tests ....
 284 **
 285 **    # Restore initial incr-load settings:
 286 **    eval fts3_configure_incr_load $cfg
 287 */
 288 static int SQLITE_TCLAPI fts3_configure_incr_load_cmd(
 289   ClientData clientData,
 290   Tcl_Interp *interp,
 291   int objc,
 292   Tcl_Obj *CONST objv[]
 293 ){
 294 #ifdef SQLITE_ENABLE_FTS3
 295   extern int test_fts3_node_chunksize;
 296   extern int test_fts3_node_chunk_threshold;
 297   Tcl_Obj *pRet;
 298
 299   if( objc!=1 && objc!=3 ){
 300     Tcl_WrongNumArgs(interp, 1, objv, "?CHUNKSIZE THRESHOLD?");
 301     return TCL_ERROR;
 302   }
 303
 304   pRet = Tcl_NewObj();
 305   Tcl_IncrRefCount(pRet);
 306   Tcl_ListObjAppendElement(
 307       interp, pRet, Tcl_NewIntObj(test_fts3_node_chunksize));
 308   Tcl_ListObjAppendElement(
 309       interp, pRet, Tcl_NewIntObj(test_fts3_node_chunk_threshold));
 310
 311   if( objc==3 ){
 312     int iArg1;
 313     int iArg2;
 314     if( Tcl_GetIntFromObj(interp, objv[1], &iArg1)
 315      || Tcl_GetIntFromObj(interp, objv[2], &iArg2)
 316     ){
 317       Tcl_DecrRefCount(pRet);
 318       return TCL_ERROR;
 319     }
 320     test_fts3_node_chunksize = iArg1;
 321     test_fts3_node_chunk_threshold = iArg2;
 322   }
 323
 324   Tcl_SetObjResult(interp, pRet);
 325   Tcl_DecrRefCount(pRet);
 326 #endif
 327   UNUSED_PARAMETER(clientData);
 328   return TCL_OK;
 329 }
 330
 331 #ifdef SQLITE_ENABLE_FTS3
 332 /**************************************************************************
 333 ** Beginning of test tokenizer code.
 334 **
 335 ** For language 0, this tokenizer is similar to the default 'simple'
 336 ** tokenizer. For other languages L, the following:
 337 **
 338 **   * Odd numbered languages are case-sensitive. Even numbered
 339 **     languages are not.
 340 **
 341 **   * Language ids 100 or greater are considered an error.
 342 **
 343 ** The implementation assumes that the input contains only ASCII characters
 344 ** (i.e. those that may be encoded in UTF-8 using a single byte).
 345 */
 346 typedef struct test_tokenizer {
 347   sqlite3_tokenizer base;
 348 } test_tokenizer;
 349
 350 typedef struct test_tokenizer_cursor {
 351   sqlite3_tokenizer_cursor base;
 352   const char *aInput;          /* Input being tokenized */
 353   int nInput;                  /* Size of the input in bytes */
 354   int iInput;                  /* Current offset in aInput */
 355   int iToken;                  /* Index of next token to be returned */
 356   char *aBuffer;               /* Buffer containing current token */
 357   int nBuffer;                 /* Number of bytes allocated at pToken */
 358   int iLangid;                 /* Configured language id */
 359 } test_tokenizer_cursor;
 360
 361 static int testTokenizerCreate(
 362   int argc, const char * const *argv,
 363   sqlite3_tokenizer **ppTokenizer
 364 ){
 365   test_tokenizer *pNew;
 366   UNUSED_PARAMETER(argc);
 367   UNUSED_PARAMETER(argv);
 368
 369   pNew = sqlite3_malloc(sizeof(test_tokenizer));
 370   if( !pNew ) return SQLITE_NOMEM;
 371   memset(pNew, 0, sizeof(test_tokenizer));
 372
 373   *ppTokenizer = (sqlite3_tokenizer *)pNew;
 374   return SQLITE_OK;
 375 }
 376
 377 static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
 378   test_tokenizer *p = (test_tokenizer *)pTokenizer;
 379   sqlite3_free(p);
 380   return SQLITE_OK;
 381 }
 382
 383 static int testTokenizerOpen(
 384   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
 385   const char *pInput, int nBytes,        /* String to be tokenized */
 386   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
 387 ){
 388   int rc = SQLITE_OK;                    /* Return code */
 389   test_tokenizer_cursor *pCsr;           /* New cursor object */
 390
 391   UNUSED_PARAMETER(pTokenizer);
 392
 393   pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
 394   if( pCsr==0 ){
 395     rc = SQLITE_NOMEM;
 396   }else{
 397     memset(pCsr, 0, sizeof(test_tokenizer_cursor));
 398     pCsr->aInput = pInput;
 399     if( nBytes<0 ){
 400       pCsr->nInput = (int)strlen(pInput);
 401     }else{
 402       pCsr->nInput = nBytes;
 403     }
 404   }
 405
 406   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
 407   return rc;
 408 }
 409
 410 static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
 411   test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
 412   sqlite3_free(pCsr->aBuffer);
 413   sqlite3_free(pCsr);
 414   return SQLITE_OK;
 415 }
 416
 417 static int testIsTokenChar(char c){
 418   return (c>='a' && c<='z') || (c>='A' && c<='Z');
 419 }
 420 static int testTolower(char c){
 421   char ret = c;
 422   if( ret>='A' && ret<='Z') ret = ret - ('A'-'a');
 423   return ret;
 424 }
 425
 426 static int testTokenizerNext(
 427   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
 428   const char **ppToken,               /* OUT: *ppToken is the token text */
 429   int *pnBytes,                       /* OUT: Number of bytes in token */
 430   int *piStartOffset,                 /* OUT: Starting offset of token */
 431   int *piEndOffset,                   /* OUT: Ending offset of token */
 432   int *piPosition                     /* OUT: Position integer of token */
 433 ){
 434   test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
 435   int rc = SQLITE_OK;
 436   const char *p;
 437   const char *pEnd;
 438
 439   p = &pCsr->aInput[pCsr->iInput];
 440   pEnd = &pCsr->aInput[pCsr->nInput];
 441
 442   /* Skip past any white-space */
 443   assert( p<=pEnd );
 444   while( p<pEnd && testIsTokenChar(*p)==0 ) p++;
 445
 446   if( p==pEnd ){
 447     rc = SQLITE_DONE;
 448   }else{
 449     /* Advance to the end of the token */
 450     const char *pToken = p;
 451     int nToken;
 452     while( p<pEnd && testIsTokenChar(*p) ) p++;
 453     nToken = (int)(p-pToken);
 454
 455     /* Copy the token into the buffer */
 456     if( nToken>pCsr->nBuffer ){
 457       sqlite3_free(pCsr->aBuffer);
 458       pCsr->aBuffer = sqlite3_malloc(nToken);
 459     }
 460     if( pCsr->aBuffer==0 ){
 461       rc = SQLITE_NOMEM;
 462     }else{
 463       int i;
 464
 465       if( pCsr->iLangid & 0x00000001 ){
 466         for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
 467       }else{
 468         for(i=0; i<nToken; i++) pCsr->aBuffer[i] = (char)testTolower(pToken[i]);
 469       }
 470       pCsr->iToken++;
 471       pCsr->iInput = (int)(p - pCsr->aInput);
 472
 473       *ppToken = pCsr->aBuffer;
 474       *pnBytes = nToken;
 475       *piStartOffset = (int)(pToken - pCsr->aInput);
 476       *piEndOffset = (int)(p - pCsr->aInput);
 477       *piPosition = pCsr->iToken;
 478     }
 479   }
 480
 481   return rc;
 482 }
 483
 484 static int testTokenizerLanguage(
 485   sqlite3_tokenizer_cursor *pCursor,
 486   int iLangid
 487 ){
 488   int rc = SQLITE_OK;
 489   test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
 490   pCsr->iLangid = iLangid;
 491   if( pCsr->iLangid>=100 ){
 492     rc = SQLITE_ERROR;
 493   }
 494   return rc;
 495 }
 496 #endif
 497
 498 static int SQLITE_TCLAPI fts3_test_tokenizer_cmd(
 499   ClientData clientData,
 500   Tcl_Interp *interp,
 501   int objc,
 502   Tcl_Obj *CONST objv[]
 503 ){
 504 #ifdef SQLITE_ENABLE_FTS3
 505   static const sqlite3_tokenizer_module testTokenizerModule = {
 506     1,
 507     testTokenizerCreate,
 508     testTokenizerDestroy,
 509     testTokenizerOpen,
 510     testTokenizerClose,
 511     testTokenizerNext,
 512     testTokenizerLanguage
 513   };
 514   const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
 515   if( objc!=1 ){
 516     Tcl_WrongNumArgs(interp, 1, objv, "");
 517     return TCL_ERROR;
 518   }
 519   Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
 520     (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
 521   ));
 522 #endif
 523   UNUSED_PARAMETER(clientData);
 524   return TCL_OK;
 525 }
 526
 527 static int SQLITE_TCLAPI fts3_test_varint_cmd(
 528   ClientData clientData,
 529   Tcl_Interp *interp,
 530   int objc,
 531   Tcl_Obj *CONST objv[]
 532 ){
 533 #ifdef SQLITE_ENABLE_FTS3
 534   char aBuf[24];
 535   int rc;
 536   Tcl_WideInt w;
 537   sqlite3_int64 w2;
 538   int nByte, nByte2;
 539
 540   if( objc!=2 ){
 541     Tcl_WrongNumArgs(interp, 1, objv, "INTEGER");
 542     return TCL_ERROR;
 543   }
 544
 545   rc = Tcl_GetWideIntFromObj(interp, objv[1], &w);
 546   if( rc!=TCL_OK ) return rc;
 547
 548   nByte = sqlite3Fts3PutVarint(aBuf, w);
 549   nByte2 = sqlite3Fts3GetVarint(aBuf, &w2);
 550   if( w!=w2 || nByte!=nByte2 ){
 551     char *zErr = sqlite3_mprintf("error testing %lld", w);
 552     Tcl_ResetResult(interp);
 553     Tcl_AppendResult(interp, zErr, 0);
 554     return TCL_ERROR;
 555   }
 556
 557   if( w<=2147483647 && w>=0 ){
 558     int i;
 559     nByte2 = fts3GetVarint32(aBuf, &i);
 560     if( (int)w!=i || nByte!=nByte2 ){
 561       char *zErr = sqlite3_mprintf("error testing %lld (32-bit)", w);
 562       Tcl_ResetResult(interp);
 563       Tcl_AppendResult(interp, zErr, 0);
 564       return TCL_ERROR;
 565     }
 566   }
 567
 568 #endif
 569   UNUSED_PARAMETER(clientData);
 570   return TCL_OK;
 571 }
 572
 573 /*
 574 ** End of tokenizer code.
 575 **************************************************************************/
 576
 577 int Sqlitetestfts3_Init(Tcl_Interp *interp){
 578   Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
 579   Tcl_CreateObjCommand(interp,
 580       "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
 581   );
 582   Tcl_CreateObjCommand(
 583       interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
 584   );
 585
 586   Tcl_CreateObjCommand(
 587       interp, "fts3_test_varint", fts3_test_varint_cmd, 0, 0
 588   );
 589   return TCL_OK;
 590 }
 591 #endif                  /* SQLITE_ENABLE_FTS3 || SQLITE_ENABLE_FTS4 */
 592 #endif                  /* ifdef SQLITE_TEST */