Another tweak to the MSVC clean target.
[sqlite.git] / ext / fts3 / fts3_expr.c
blob788e5021ec206ee3c942e5dccf18335d6fc55da4
1 /*
2 ** 2008 Nov 28
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
13 ** This module contains code that implements a parser for fts3 query strings
14 ** (the right-hand argument to the MATCH operator). Because the supported
15 ** syntax is relatively simple, the whole tokenizer/parser system is
16 ** hand-coded.
18 #include "fts3Int.h"
19 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
22 ** By default, this module parses the legacy syntax that has been
23 ** traditionally used by fts3. Or, if SQLITE_ENABLE_FTS3_PARENTHESIS
24 ** is defined, then it uses the new syntax. The differences between
25 ** the new and the old syntaxes are:
27 ** a) The new syntax supports parenthesis. The old does not.
29 ** b) The new syntax supports the AND and NOT operators. The old does not.
31 ** c) The old syntax supports the "-" token qualifier. This is not
32 ** supported by the new syntax (it is replaced by the NOT operator).
34 ** d) When using the old syntax, the OR operator has a greater precedence
35 ** than an implicit AND. When using the new, both implicit and explicit
36 ** AND operators have a higher precedence than OR.
38 ** If compiled with SQLITE_TEST defined, then this module exports the
39 ** symbol "int sqlite3_fts3_enable_parentheses". Setting this variable
40 ** to zero causes the module to use the old syntax. If it is set to
41 ** non-zero the new syntax is activated. This is so both syntaxes can
42 ** be tested using a single build of testfixture.
44 ** The following describes the syntax supported by the fts3 MATCH
45 ** operator in a similar format to that used by the lemon parser
46 ** generator. This module does not actually use lemon; it uses a
47 ** custom parser.
49 ** query ::= andexpr (OR andexpr)*.
51 ** andexpr ::= notexpr (AND? notexpr)*.
53 ** notexpr ::= nearexpr (NOT nearexpr|-TOKEN)*.
54 ** notexpr ::= LP query RP.
56 ** nearexpr ::= phrase (NEAR distance_opt nearexpr)*.
58 ** distance_opt ::= .
59 ** distance_opt ::= / INTEGER.
61 ** phrase ::= TOKEN.
62 ** phrase ::= COLUMN:TOKEN.
63 ** phrase ::= "TOKEN TOKEN TOKEN...".
/*
** Select the query syntax. In SQLITE_TEST builds this is a run-time
** variable (initially 0, i.e. legacy syntax). Otherwise it is a
** compile-time constant: 1 if SQLITE_ENABLE_FTS3_PARENTHESIS is defined,
** 0 if it is not.
*/
#ifdef SQLITE_TEST
int sqlite3_fts3_enable_parentheses = 0;
#else
# ifdef SQLITE_ENABLE_FTS3_PARENTHESIS
#  define sqlite3_fts3_enable_parentheses 1
# else
#  define sqlite3_fts3_enable_parentheses 0
# endif
#endif
/*
** Default span for NEAR operators. Used when a NEAR keyword is not
** followed by an explicit "/<integer>" distance.
*/
#define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10
81 #include <string.h>
82 #include <assert.h>
85 ** isNot:
86 ** This variable is used by function getNextNode(). When getNextNode() is
87 ** called, it sets ParseContext.isNot to true if the 'next node' is a
88 ** FTSQUERY_PHRASE with a unary "-" attached to it. i.e. "mysql" in the
89 ** FTS3 query "sqlite -mysql". Otherwise, ParseContext.isNot is set to
90 ** zero.
92 typedef struct ParseContext ParseContext;
93 struct ParseContext {
94 sqlite3_tokenizer *pTokenizer; /* Tokenizer module */
95 int iLangid; /* Language id used with tokenizer */
96 const char **azCol; /* Array of column names for fts3 table */
97 int bFts4; /* True to allow FTS4-only syntax */
98 int nCol; /* Number of entries in azCol[] */
99 int iDefaultCol; /* Default column to query */
100 int isNot; /* True if getNextNode() sees a unary - */
101 sqlite3_context *pCtx; /* Write error message here */
102 int nNest; /* Number of nested brackets */
/*
** This function is equivalent to the standard isspace() function.
**
** The standard isspace() can be awkward to use safely, because although it
** is defined to accept an argument of type int, its behavior when passed
** an integer that falls outside of the range of the unsigned char type
** is undefined (and sometimes, "undefined" means segfault). This wrapper
** is defined to accept an argument of type char, and always returns 0 for
** any values that fall outside of the range of the unsigned char type (i.e.
** negative values).
**
** Returns non-zero for ' ', '\t', '\n', '\r', '\v' and '\f'; zero for
** every other value, including the nul byte.
*/
static int fts3isspace(char c){
  return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
}
/*
** Allocate nByte bytes of memory using sqlite3_malloc(). If successful,
** zero the memory before returning a pointer to it. If unsuccessful,
** return NULL.
*/
static void *fts3MallocZero(int nByte){
  void *pRet = sqlite3_malloc(nByte);
  if( pRet ) memset(pRet, 0, nByte);
  return pRet;
}
131 int sqlite3Fts3OpenTokenizer(
132 sqlite3_tokenizer *pTokenizer,
133 int iLangid,
134 const char *z,
135 int n,
136 sqlite3_tokenizer_cursor **ppCsr
138 sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
139 sqlite3_tokenizer_cursor *pCsr = 0;
140 int rc;
142 rc = pModule->xOpen(pTokenizer, z, n, &pCsr);
143 assert( rc==SQLITE_OK || pCsr==0 );
144 if( rc==SQLITE_OK ){
145 pCsr->pTokenizer = pTokenizer;
146 if( pModule->iVersion>=1 ){
147 rc = pModule->xLanguageid(pCsr, iLangid);
148 if( rc!=SQLITE_OK ){
149 pModule->xClose(pCsr);
150 pCsr = 0;
154 *ppCsr = pCsr;
155 return rc;
159 ** Function getNextNode(), which is called by fts3ExprParse(), may itself
160 ** call fts3ExprParse(). So this forward declaration is required.
162 static int fts3ExprParse(ParseContext *, const char *, int, Fts3Expr **, int *);
165 ** Extract the next token from buffer z (length n) using the tokenizer
166 ** and other information (column names etc.) in pParse. Create an Fts3Expr
167 ** structure of type FTSQUERY_PHRASE containing a phrase consisting of this
168 ** single token and set *ppExpr to point to it. If the end of the buffer is
169 ** reached before a token is found, set *ppExpr to zero. It is the
170 ** responsibility of the caller to eventually deallocate the allocated
171 ** Fts3Expr structure (if any) by passing it to sqlite3_free().
173 ** Return SQLITE_OK if successful, or SQLITE_NOMEM if a memory allocation
174 ** fails.
176 static int getNextToken(
177 ParseContext *pParse, /* fts3 query parse context */
178 int iCol, /* Value for Fts3Phrase.iColumn */
179 const char *z, int n, /* Input string */
180 Fts3Expr **ppExpr, /* OUT: expression */
181 int *pnConsumed /* OUT: Number of bytes consumed */
183 sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
184 sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
185 int rc;
186 sqlite3_tokenizer_cursor *pCursor;
187 Fts3Expr *pRet = 0;
188 int i = 0;
190 /* Set variable i to the maximum number of bytes of input to tokenize. */
191 for(i=0; i<n; i++){
192 if( sqlite3_fts3_enable_parentheses && (z[i]=='(' || z[i]==')') ) break;
193 if( z[i]=='"' ) break;
196 *pnConsumed = i;
197 rc = sqlite3Fts3OpenTokenizer(pTokenizer, pParse->iLangid, z, i, &pCursor);
198 if( rc==SQLITE_OK ){
199 const char *zToken;
200 int nToken = 0, iStart = 0, iEnd = 0, iPosition = 0;
201 int nByte; /* total space to allocate */
203 rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition);
204 if( rc==SQLITE_OK ){
205 nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken;
206 pRet = (Fts3Expr *)fts3MallocZero(nByte);
207 if( !pRet ){
208 rc = SQLITE_NOMEM;
209 }else{
210 pRet->eType = FTSQUERY_PHRASE;
211 pRet->pPhrase = (Fts3Phrase *)&pRet[1];
212 pRet->pPhrase->nToken = 1;
213 pRet->pPhrase->iColumn = iCol;
214 pRet->pPhrase->aToken[0].n = nToken;
215 pRet->pPhrase->aToken[0].z = (char *)&pRet->pPhrase[1];
216 memcpy(pRet->pPhrase->aToken[0].z, zToken, nToken);
218 if( iEnd<n && z[iEnd]=='*' ){
219 pRet->pPhrase->aToken[0].isPrefix = 1;
220 iEnd++;
223 while( 1 ){
224 if( !sqlite3_fts3_enable_parentheses
225 && iStart>0 && z[iStart-1]=='-'
227 pParse->isNot = 1;
228 iStart--;
229 }else if( pParse->bFts4 && iStart>0 && z[iStart-1]=='^' ){
230 pRet->pPhrase->aToken[0].bFirst = 1;
231 iStart--;
232 }else{
233 break;
238 *pnConsumed = iEnd;
239 }else if( i && rc==SQLITE_DONE ){
240 rc = SQLITE_OK;
243 pModule->xClose(pCursor);
246 *ppExpr = pRet;
247 return rc;
/*
** Enlarge a memory allocation. If an out-of-memory allocation occurs,
** then free the old allocation.
**
** Returns the resized allocation, or NULL if sqlite3_realloc() returned
** NULL (in which case pOrig has been released and must not be used again).
*/
static void *fts3ReallocOrFree(void *pOrig, int nNew){
  void *pRet = sqlite3_realloc(pOrig, nNew);
  if( !pRet ){
    sqlite3_free(pOrig);
  }
  return pRet;
}
264 ** Buffer zInput, length nInput, contains the contents of a quoted string
265 ** that appeared as part of an fts3 query expression. Neither quote character
266 ** is included in the buffer. This function attempts to tokenize the entire
267 ** input buffer and create an Fts3Expr structure of type FTSQUERY_PHRASE
268 ** containing the results.
270 ** If successful, SQLITE_OK is returned and *ppExpr set to point at the
271 ** allocated Fts3Expr structure. Otherwise, either SQLITE_NOMEM (out of memory
272 ** error) or SQLITE_ERROR (tokenization error) is returned and *ppExpr set
273 ** to 0.
275 static int getNextString(
276 ParseContext *pParse, /* fts3 query parse context */
277 const char *zInput, int nInput, /* Input string */
278 Fts3Expr **ppExpr /* OUT: expression */
280 sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
281 sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
282 int rc;
283 Fts3Expr *p = 0;
284 sqlite3_tokenizer_cursor *pCursor = 0;
285 char *zTemp = 0;
286 int nTemp = 0;
288 const int nSpace = sizeof(Fts3Expr) + sizeof(Fts3Phrase);
289 int nToken = 0;
291 /* The final Fts3Expr data structure, including the Fts3Phrase,
292 ** Fts3PhraseToken structures token buffers are all stored as a single
293 ** allocation so that the expression can be freed with a single call to
294 ** sqlite3_free(). Setting this up requires a two pass approach.
296 ** The first pass, in the block below, uses a tokenizer cursor to iterate
297 ** through the tokens in the expression. This pass uses fts3ReallocOrFree()
298 ** to assemble data in two dynamic buffers:
300 ** Buffer p: Points to the Fts3Expr structure, followed by the Fts3Phrase
301 ** structure, followed by the array of Fts3PhraseToken
302 ** structures. This pass only populates the Fts3PhraseToken array.
304 ** Buffer zTemp: Contains copies of all tokens.
306 ** The second pass, in the block that begins "if( rc==SQLITE_DONE )" below,
307 ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase
308 ** structures.
310 rc = sqlite3Fts3OpenTokenizer(
311 pTokenizer, pParse->iLangid, zInput, nInput, &pCursor);
312 if( rc==SQLITE_OK ){
313 int ii;
314 for(ii=0; rc==SQLITE_OK; ii++){
315 const char *zByte;
316 int nByte = 0, iBegin = 0, iEnd = 0, iPos = 0;
317 rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos);
318 if( rc==SQLITE_OK ){
319 Fts3PhraseToken *pToken;
321 p = fts3ReallocOrFree(p, nSpace + ii*sizeof(Fts3PhraseToken));
322 if( !p ) goto no_mem;
324 zTemp = fts3ReallocOrFree(zTemp, nTemp + nByte);
325 if( !zTemp ) goto no_mem;
327 assert( nToken==ii );
328 pToken = &((Fts3Phrase *)(&p[1]))->aToken[ii];
329 memset(pToken, 0, sizeof(Fts3PhraseToken));
331 memcpy(&zTemp[nTemp], zByte, nByte);
332 nTemp += nByte;
334 pToken->n = nByte;
335 pToken->isPrefix = (iEnd<nInput && zInput[iEnd]=='*');
336 pToken->bFirst = (iBegin>0 && zInput[iBegin-1]=='^');
337 nToken = ii+1;
341 pModule->xClose(pCursor);
342 pCursor = 0;
345 if( rc==SQLITE_DONE ){
346 int jj;
347 char *zBuf = 0;
349 p = fts3ReallocOrFree(p, nSpace + nToken*sizeof(Fts3PhraseToken) + nTemp);
350 if( !p ) goto no_mem;
351 memset(p, 0, (char *)&(((Fts3Phrase *)&p[1])->aToken[0])-(char *)p);
352 p->eType = FTSQUERY_PHRASE;
353 p->pPhrase = (Fts3Phrase *)&p[1];
354 p->pPhrase->iColumn = pParse->iDefaultCol;
355 p->pPhrase->nToken = nToken;
357 zBuf = (char *)&p->pPhrase->aToken[nToken];
358 if( zTemp ){
359 memcpy(zBuf, zTemp, nTemp);
360 sqlite3_free(zTemp);
361 }else{
362 assert( nTemp==0 );
365 for(jj=0; jj<p->pPhrase->nToken; jj++){
366 p->pPhrase->aToken[jj].z = zBuf;
367 zBuf += p->pPhrase->aToken[jj].n;
369 rc = SQLITE_OK;
372 *ppExpr = p;
373 return rc;
374 no_mem:
376 if( pCursor ){
377 pModule->xClose(pCursor);
379 sqlite3_free(zTemp);
380 sqlite3_free(p);
381 *ppExpr = 0;
382 return SQLITE_NOMEM;
386 ** The output variable *ppExpr is populated with an allocated Fts3Expr
387 ** structure, or set to 0 if the end of the input buffer is reached.
389 ** Returns an SQLite error code. SQLITE_OK if everything works, SQLITE_NOMEM
390 ** if a malloc failure occurs, or SQLITE_ERROR if a parse error is encountered.
391 ** If SQLITE_ERROR is returned, pContext is populated with an error message.
393 static int getNextNode(
394 ParseContext *pParse, /* fts3 query parse context */
395 const char *z, int n, /* Input string */
396 Fts3Expr **ppExpr, /* OUT: expression */
397 int *pnConsumed /* OUT: Number of bytes consumed */
399 static const struct Fts3Keyword {
400 char *z; /* Keyword text */
401 unsigned char n; /* Length of the keyword */
402 unsigned char parenOnly; /* Only valid in paren mode */
403 unsigned char eType; /* Keyword code */
404 } aKeyword[] = {
405 { "OR" , 2, 0, FTSQUERY_OR },
406 { "AND", 3, 1, FTSQUERY_AND },
407 { "NOT", 3, 1, FTSQUERY_NOT },
408 { "NEAR", 4, 0, FTSQUERY_NEAR }
410 int ii;
411 int iCol;
412 int iColLen;
413 int rc;
414 Fts3Expr *pRet = 0;
416 const char *zInput = z;
417 int nInput = n;
419 pParse->isNot = 0;
421 /* Skip over any whitespace before checking for a keyword, an open or
422 ** close bracket, or a quoted string.
424 while( nInput>0 && fts3isspace(*zInput) ){
425 nInput--;
426 zInput++;
428 if( nInput==0 ){
429 return SQLITE_DONE;
432 /* See if we are dealing with a keyword. */
433 for(ii=0; ii<(int)(sizeof(aKeyword)/sizeof(struct Fts3Keyword)); ii++){
434 const struct Fts3Keyword *pKey = &aKeyword[ii];
436 if( (pKey->parenOnly & ~sqlite3_fts3_enable_parentheses)!=0 ){
437 continue;
440 if( nInput>=pKey->n && 0==memcmp(zInput, pKey->z, pKey->n) ){
441 int nNear = SQLITE_FTS3_DEFAULT_NEAR_PARAM;
442 int nKey = pKey->n;
443 char cNext;
445 /* If this is a "NEAR" keyword, check for an explicit nearness. */
446 if( pKey->eType==FTSQUERY_NEAR ){
447 assert( nKey==4 );
448 if( zInput[4]=='/' && zInput[5]>='0' && zInput[5]<='9' ){
449 nNear = 0;
450 for(nKey=5; zInput[nKey]>='0' && zInput[nKey]<='9'; nKey++){
451 nNear = nNear * 10 + (zInput[nKey] - '0');
456 /* At this point this is probably a keyword. But for that to be true,
457 ** the next byte must contain either whitespace, an open or close
458 ** parenthesis, a quote character, or EOF.
460 cNext = zInput[nKey];
461 if( fts3isspace(cNext)
462 || cNext=='"' || cNext=='(' || cNext==')' || cNext==0
464 pRet = (Fts3Expr *)fts3MallocZero(sizeof(Fts3Expr));
465 if( !pRet ){
466 return SQLITE_NOMEM;
468 pRet->eType = pKey->eType;
469 pRet->nNear = nNear;
470 *ppExpr = pRet;
471 *pnConsumed = (int)((zInput - z) + nKey);
472 return SQLITE_OK;
475 /* Turns out that wasn't a keyword after all. This happens if the
476 ** user has supplied a token such as "ORacle". Continue.
481 /* See if we are dealing with a quoted phrase. If this is the case, then
482 ** search for the closing quote and pass the whole string to getNextString()
483 ** for processing. This is easy to do, as fts3 has no syntax for escaping
484 ** a quote character embedded in a string.
486 if( *zInput=='"' ){
487 for(ii=1; ii<nInput && zInput[ii]!='"'; ii++);
488 *pnConsumed = (int)((zInput - z) + ii + 1);
489 if( ii==nInput ){
490 return SQLITE_ERROR;
492 return getNextString(pParse, &zInput[1], ii-1, ppExpr);
495 if( sqlite3_fts3_enable_parentheses ){
496 if( *zInput=='(' ){
497 int nConsumed = 0;
498 pParse->nNest++;
499 rc = fts3ExprParse(pParse, zInput+1, nInput-1, ppExpr, &nConsumed);
500 if( rc==SQLITE_OK && !*ppExpr ){ rc = SQLITE_DONE; }
501 *pnConsumed = (int)(zInput - z) + 1 + nConsumed;
502 return rc;
503 }else if( *zInput==')' ){
504 pParse->nNest--;
505 *pnConsumed = (int)((zInput - z) + 1);
506 *ppExpr = 0;
507 return SQLITE_DONE;
511 /* If control flows to this point, this must be a regular token, or
512 ** the end of the input. Read a regular token using the sqlite3_tokenizer
513 ** interface. Before doing so, figure out if there is an explicit
514 ** column specifier for the token.
516 ** TODO: Strangely, it is not possible to associate a column specifier
517 ** with a quoted phrase, only with a single token. Not sure if this was
518 ** an implementation artifact or an intentional decision when fts3 was
519 ** first implemented. Whichever it was, this module duplicates the
520 ** limitation.
522 iCol = pParse->iDefaultCol;
523 iColLen = 0;
524 for(ii=0; ii<pParse->nCol; ii++){
525 const char *zStr = pParse->azCol[ii];
526 int nStr = (int)strlen(zStr);
527 if( nInput>nStr && zInput[nStr]==':'
528 && sqlite3_strnicmp(zStr, zInput, nStr)==0
530 iCol = ii;
531 iColLen = (int)((zInput - z) + nStr + 1);
532 break;
535 rc = getNextToken(pParse, iCol, &z[iColLen], n-iColLen, ppExpr, pnConsumed);
536 *pnConsumed += iColLen;
537 return rc;
541 ** The argument is an Fts3Expr structure for a binary operator (any type
542 ** except an FTSQUERY_PHRASE). Return an integer value representing the
543 ** precedence of the operator. Lower values have a higher precedence (i.e.
544 ** group more tightly). For example, in the C language, the == operator
545 ** groups more tightly than ||, and would therefore have a higher precedence.
547 ** When using the new fts3 query syntax (when SQLITE_ENABLE_FTS3_PARENTHESIS
548 ** is defined), the order of the operators in precedence from highest to
549 ** lowest is:
551 ** NEAR
552 ** NOT
553 ** AND (including implicit ANDs)
554 ** OR
556 ** Note that when using the old query syntax, the OR operator has a higher
557 ** precedence than the AND operator.
559 static int opPrecedence(Fts3Expr *p){
560 assert( p->eType!=FTSQUERY_PHRASE );
561 if( sqlite3_fts3_enable_parentheses ){
562 return p->eType;
563 }else if( p->eType==FTSQUERY_NEAR ){
564 return 1;
565 }else if( p->eType==FTSQUERY_OR ){
566 return 2;
568 assert( p->eType==FTSQUERY_AND );
569 return 3;
573 ** Argument ppHead contains a pointer to the current head of a query
574 ** expression tree being parsed. pPrev is the expression node most recently
575 ** inserted into the tree. This function adds pNew, which is always a binary
576 ** operator node, into the expression tree based on the relative precedence
577 ** of pNew and the existing nodes of the tree. This may result in the head
578 ** of the tree changing, in which case *ppHead is set to the new root node.
580 static void insertBinaryOperator(
581 Fts3Expr **ppHead, /* Pointer to the root node of a tree */
582 Fts3Expr *pPrev, /* Node most recently inserted into the tree */
583 Fts3Expr *pNew /* New binary node to insert into expression tree */
585 Fts3Expr *pSplit = pPrev;
586 while( pSplit->pParent && opPrecedence(pSplit->pParent)<=opPrecedence(pNew) ){
587 pSplit = pSplit->pParent;
590 if( pSplit->pParent ){
591 assert( pSplit->pParent->pRight==pSplit );
592 pSplit->pParent->pRight = pNew;
593 pNew->pParent = pSplit->pParent;
594 }else{
595 *ppHead = pNew;
597 pNew->pLeft = pSplit;
598 pSplit->pParent = pNew;
602 ** Parse the fts3 query expression found in buffer z, length n. This function
603 ** returns either when the end of the buffer is reached or an unmatched
604 ** closing bracket - ')' - is encountered.
606 ** If successful, SQLITE_OK is returned, *ppExpr is set to point to the
607 ** parsed form of the expression and *pnConsumed is set to the number of
608 ** bytes read from buffer z. Otherwise, *ppExpr is set to 0 and SQLITE_NOMEM
609 ** (out of memory error) or SQLITE_ERROR (parse error) is returned.
611 static int fts3ExprParse(
612 ParseContext *pParse, /* fts3 query parse context */
613 const char *z, int n, /* Text of MATCH query */
614 Fts3Expr **ppExpr, /* OUT: Parsed query structure */
615 int *pnConsumed /* OUT: Number of bytes consumed */
617 Fts3Expr *pRet = 0;
618 Fts3Expr *pPrev = 0;
619 Fts3Expr *pNotBranch = 0; /* Only used in legacy parse mode */
620 int nIn = n;
621 const char *zIn = z;
622 int rc = SQLITE_OK;
623 int isRequirePhrase = 1;
625 while( rc==SQLITE_OK ){
626 Fts3Expr *p = 0;
627 int nByte = 0;
629 rc = getNextNode(pParse, zIn, nIn, &p, &nByte);
630 assert( nByte>0 || (rc!=SQLITE_OK && p==0) );
631 if( rc==SQLITE_OK ){
632 if( p ){
633 int isPhrase;
635 if( !sqlite3_fts3_enable_parentheses
636 && p->eType==FTSQUERY_PHRASE && pParse->isNot
638 /* Create an implicit NOT operator. */
639 Fts3Expr *pNot = fts3MallocZero(sizeof(Fts3Expr));
640 if( !pNot ){
641 sqlite3Fts3ExprFree(p);
642 rc = SQLITE_NOMEM;
643 goto exprparse_out;
645 pNot->eType = FTSQUERY_NOT;
646 pNot->pRight = p;
647 p->pParent = pNot;
648 if( pNotBranch ){
649 pNot->pLeft = pNotBranch;
650 pNotBranch->pParent = pNot;
652 pNotBranch = pNot;
653 p = pPrev;
654 }else{
655 int eType = p->eType;
656 isPhrase = (eType==FTSQUERY_PHRASE || p->pLeft);
658 /* The isRequirePhrase variable is set to true if a phrase or
659 ** an expression contained in parenthesis is required. If a
660 ** binary operator (AND, OR, NOT or NEAR) is encounted when
661 ** isRequirePhrase is set, this is a syntax error.
663 if( !isPhrase && isRequirePhrase ){
664 sqlite3Fts3ExprFree(p);
665 rc = SQLITE_ERROR;
666 goto exprparse_out;
669 if( isPhrase && !isRequirePhrase ){
670 /* Insert an implicit AND operator. */
671 Fts3Expr *pAnd;
672 assert( pRet && pPrev );
673 pAnd = fts3MallocZero(sizeof(Fts3Expr));
674 if( !pAnd ){
675 sqlite3Fts3ExprFree(p);
676 rc = SQLITE_NOMEM;
677 goto exprparse_out;
679 pAnd->eType = FTSQUERY_AND;
680 insertBinaryOperator(&pRet, pPrev, pAnd);
681 pPrev = pAnd;
684 /* This test catches attempts to make either operand of a NEAR
685 ** operator something other than a phrase. For example, either of
686 ** the following:
688 ** (bracketed expression) NEAR phrase
689 ** phrase NEAR (bracketed expression)
691 ** Return an error in either case.
693 if( pPrev && (
694 (eType==FTSQUERY_NEAR && !isPhrase && pPrev->eType!=FTSQUERY_PHRASE)
695 || (eType!=FTSQUERY_PHRASE && isPhrase && pPrev->eType==FTSQUERY_NEAR)
697 sqlite3Fts3ExprFree(p);
698 rc = SQLITE_ERROR;
699 goto exprparse_out;
702 if( isPhrase ){
703 if( pRet ){
704 assert( pPrev && pPrev->pLeft && pPrev->pRight==0 );
705 pPrev->pRight = p;
706 p->pParent = pPrev;
707 }else{
708 pRet = p;
710 }else{
711 insertBinaryOperator(&pRet, pPrev, p);
713 isRequirePhrase = !isPhrase;
715 pPrev = p;
717 assert( nByte>0 );
719 assert( rc!=SQLITE_OK || (nByte>0 && nByte<=nIn) );
720 nIn -= nByte;
721 zIn += nByte;
724 if( rc==SQLITE_DONE && pRet && isRequirePhrase ){
725 rc = SQLITE_ERROR;
728 if( rc==SQLITE_DONE ){
729 rc = SQLITE_OK;
730 if( !sqlite3_fts3_enable_parentheses && pNotBranch ){
731 if( !pRet ){
732 rc = SQLITE_ERROR;
733 }else{
734 Fts3Expr *pIter = pNotBranch;
735 while( pIter->pLeft ){
736 pIter = pIter->pLeft;
738 pIter->pLeft = pRet;
739 pRet->pParent = pIter;
740 pRet = pNotBranch;
744 *pnConsumed = n - nIn;
746 exprparse_out:
747 if( rc!=SQLITE_OK ){
748 sqlite3Fts3ExprFree(pRet);
749 sqlite3Fts3ExprFree(pNotBranch);
750 pRet = 0;
752 *ppExpr = pRet;
753 return rc;
757 ** Return SQLITE_ERROR if the maximum depth of the expression tree passed
758 ** as the only argument is more than nMaxDepth.
760 static int fts3ExprCheckDepth(Fts3Expr *p, int nMaxDepth){
761 int rc = SQLITE_OK;
762 if( p ){
763 if( nMaxDepth<0 ){
764 rc = SQLITE_TOOBIG;
765 }else{
766 rc = fts3ExprCheckDepth(p->pLeft, nMaxDepth-1);
767 if( rc==SQLITE_OK ){
768 rc = fts3ExprCheckDepth(p->pRight, nMaxDepth-1);
772 return rc;
776 ** This function attempts to transform the expression tree at (*pp) to
777 ** an equivalent but more balanced form. The tree is modified in place.
778 ** If successful, SQLITE_OK is returned and (*pp) set to point to the
779 ** new root expression node.
781 ** nMaxDepth is the maximum allowable depth of the balanced sub-tree.
783 ** Otherwise, if an error occurs, an SQLite error code is returned and
784 ** expression (*pp) freed.
786 static int fts3ExprBalance(Fts3Expr **pp, int nMaxDepth){
787 int rc = SQLITE_OK; /* Return code */
788 Fts3Expr *pRoot = *pp; /* Initial root node */
789 Fts3Expr *pFree = 0; /* List of free nodes. Linked by pParent. */
790 int eType = pRoot->eType; /* Type of node in this tree */
792 if( nMaxDepth==0 ){
793 rc = SQLITE_ERROR;
796 if( rc==SQLITE_OK ){
797 if( (eType==FTSQUERY_AND || eType==FTSQUERY_OR) ){
798 Fts3Expr **apLeaf;
799 apLeaf = (Fts3Expr **)sqlite3_malloc(sizeof(Fts3Expr *) * nMaxDepth);
800 if( 0==apLeaf ){
801 rc = SQLITE_NOMEM;
802 }else{
803 memset(apLeaf, 0, sizeof(Fts3Expr *) * nMaxDepth);
806 if( rc==SQLITE_OK ){
807 int i;
808 Fts3Expr *p;
810 /* Set $p to point to the left-most leaf in the tree of eType nodes. */
811 for(p=pRoot; p->eType==eType; p=p->pLeft){
812 assert( p->pParent==0 || p->pParent->pLeft==p );
813 assert( p->pLeft && p->pRight );
816 /* This loop runs once for each leaf in the tree of eType nodes. */
817 while( 1 ){
818 int iLvl;
819 Fts3Expr *pParent = p->pParent; /* Current parent of p */
821 assert( pParent==0 || pParent->pLeft==p );
822 p->pParent = 0;
823 if( pParent ){
824 pParent->pLeft = 0;
825 }else{
826 pRoot = 0;
828 rc = fts3ExprBalance(&p, nMaxDepth-1);
829 if( rc!=SQLITE_OK ) break;
831 for(iLvl=0; p && iLvl<nMaxDepth; iLvl++){
832 if( apLeaf[iLvl]==0 ){
833 apLeaf[iLvl] = p;
834 p = 0;
835 }else{
836 assert( pFree );
837 pFree->pLeft = apLeaf[iLvl];
838 pFree->pRight = p;
839 pFree->pLeft->pParent = pFree;
840 pFree->pRight->pParent = pFree;
842 p = pFree;
843 pFree = pFree->pParent;
844 p->pParent = 0;
845 apLeaf[iLvl] = 0;
848 if( p ){
849 sqlite3Fts3ExprFree(p);
850 rc = SQLITE_TOOBIG;
851 break;
854 /* If that was the last leaf node, break out of the loop */
855 if( pParent==0 ) break;
857 /* Set $p to point to the next leaf in the tree of eType nodes */
858 for(p=pParent->pRight; p->eType==eType; p=p->pLeft);
860 /* Remove pParent from the original tree. */
861 assert( pParent->pParent==0 || pParent->pParent->pLeft==pParent );
862 pParent->pRight->pParent = pParent->pParent;
863 if( pParent->pParent ){
864 pParent->pParent->pLeft = pParent->pRight;
865 }else{
866 assert( pParent==pRoot );
867 pRoot = pParent->pRight;
870 /* Link pParent into the free node list. It will be used as an
871 ** internal node of the new tree. */
872 pParent->pParent = pFree;
873 pFree = pParent;
876 if( rc==SQLITE_OK ){
877 p = 0;
878 for(i=0; i<nMaxDepth; i++){
879 if( apLeaf[i] ){
880 if( p==0 ){
881 p = apLeaf[i];
882 p->pParent = 0;
883 }else{
884 assert( pFree!=0 );
885 pFree->pRight = p;
886 pFree->pLeft = apLeaf[i];
887 pFree->pLeft->pParent = pFree;
888 pFree->pRight->pParent = pFree;
890 p = pFree;
891 pFree = pFree->pParent;
892 p->pParent = 0;
896 pRoot = p;
897 }else{
898 /* An error occurred. Delete the contents of the apLeaf[] array
899 ** and pFree list. Everything else is cleaned up by the call to
900 ** sqlite3Fts3ExprFree(pRoot) below. */
901 Fts3Expr *pDel;
902 for(i=0; i<nMaxDepth; i++){
903 sqlite3Fts3ExprFree(apLeaf[i]);
905 while( (pDel=pFree)!=0 ){
906 pFree = pDel->pParent;
907 sqlite3_free(pDel);
911 assert( pFree==0 );
912 sqlite3_free( apLeaf );
914 }else if( eType==FTSQUERY_NOT ){
915 Fts3Expr *pLeft = pRoot->pLeft;
916 Fts3Expr *pRight = pRoot->pRight;
918 pRoot->pLeft = 0;
919 pRoot->pRight = 0;
920 pLeft->pParent = 0;
921 pRight->pParent = 0;
923 rc = fts3ExprBalance(&pLeft, nMaxDepth-1);
924 if( rc==SQLITE_OK ){
925 rc = fts3ExprBalance(&pRight, nMaxDepth-1);
928 if( rc!=SQLITE_OK ){
929 sqlite3Fts3ExprFree(pRight);
930 sqlite3Fts3ExprFree(pLeft);
931 }else{
932 assert( pLeft && pRight );
933 pRoot->pLeft = pLeft;
934 pLeft->pParent = pRoot;
935 pRoot->pRight = pRight;
936 pRight->pParent = pRoot;
941 if( rc!=SQLITE_OK ){
942 sqlite3Fts3ExprFree(pRoot);
943 pRoot = 0;
945 *pp = pRoot;
946 return rc;
950 ** This function is similar to sqlite3Fts3ExprParse(), with the following
951 ** differences:
953 ** 1. It does not do expression rebalancing.
954 ** 2. It does not check that the expression does not exceed the
955 ** maximum allowable depth.
956 ** 3. Even if it fails, *ppExpr may still be set to point to an
957 ** expression tree. It should be deleted using sqlite3Fts3ExprFree()
958 ** in this case.
960 static int fts3ExprParseUnbalanced(
961 sqlite3_tokenizer *pTokenizer, /* Tokenizer module */
962 int iLangid, /* Language id for tokenizer */
963 char **azCol, /* Array of column names for fts3 table */
964 int bFts4, /* True to allow FTS4-only syntax */
965 int nCol, /* Number of entries in azCol[] */
966 int iDefaultCol, /* Default column to query */
967 const char *z, int n, /* Text of MATCH query */
968 Fts3Expr **ppExpr /* OUT: Parsed query structure */
970 int nParsed;
971 int rc;
972 ParseContext sParse;
974 memset(&sParse, 0, sizeof(ParseContext));
975 sParse.pTokenizer = pTokenizer;
976 sParse.iLangid = iLangid;
977 sParse.azCol = (const char **)azCol;
978 sParse.nCol = nCol;
979 sParse.iDefaultCol = iDefaultCol;
980 sParse.bFts4 = bFts4;
981 if( z==0 ){
982 *ppExpr = 0;
983 return SQLITE_OK;
985 if( n<0 ){
986 n = (int)strlen(z);
988 rc = fts3ExprParse(&sParse, z, n, ppExpr, &nParsed);
989 assert( rc==SQLITE_OK || *ppExpr==0 );
991 /* Check for mismatched parenthesis */
992 if( rc==SQLITE_OK && sParse.nNest ){
993 rc = SQLITE_ERROR;
996 return rc;
1000 ** Parameters z and n contain a pointer to and length of a buffer containing
1001 ** an fts3 query expression, respectively. This function attempts to parse the
1002 ** query expression and create a tree of Fts3Expr structures representing the
1003 ** parsed expression. If successful, *ppExpr is set to point to the head
1004 ** of the parsed expression tree and SQLITE_OK is returned. If an error
1005 ** occurs, either SQLITE_NOMEM (out-of-memory error) or SQLITE_ERROR (parse
1006 ** error) is returned and *ppExpr is set to 0.
1008 ** If parameter n is a negative number, then z is assumed to point to a
1009 ** nul-terminated string and the length is determined using strlen().
1011 ** The first parameter, pTokenizer, is passed the fts3 tokenizer module to
1012 ** use to normalize query tokens while parsing the expression. The azCol[]
1013 ** array, which is assumed to contain nCol entries, should contain the names
1014 ** of each column in the target fts3 table, in order from left to right.
1015 ** Column names must be nul-terminated strings.
1017 ** The iDefaultCol parameter should be passed the index of the table column
1018 ** that appears on the left-hand-side of the MATCH operator (the default
1019 ** column to match against for tokens for which a column name is not explicitly
1020 ** specified as part of the query string), or -1 if tokens may by default
1021 ** match any table column.
1023 int sqlite3Fts3ExprParse(
1024 sqlite3_tokenizer *pTokenizer, /* Tokenizer module */
1025 int iLangid, /* Language id for tokenizer */
1026 char **azCol, /* Array of column names for fts3 table */
1027 int bFts4, /* True to allow FTS4-only syntax */
1028 int nCol, /* Number of entries in azCol[] */
1029 int iDefaultCol, /* Default column to query */
1030 const char *z, int n, /* Text of MATCH query */
1031 Fts3Expr **ppExpr, /* OUT: Parsed query structure */
1032 char **pzErr /* OUT: Error message (sqlite3_malloc) */
1034 int rc = fts3ExprParseUnbalanced(
1035 pTokenizer, iLangid, azCol, bFts4, nCol, iDefaultCol, z, n, ppExpr
1038 /* Rebalance the expression. And check that its depth does not exceed
1039 ** SQLITE_FTS3_MAX_EXPR_DEPTH. */
1040 if( rc==SQLITE_OK && *ppExpr ){
1041 rc = fts3ExprBalance(ppExpr, SQLITE_FTS3_MAX_EXPR_DEPTH);
1042 if( rc==SQLITE_OK ){
1043 rc = fts3ExprCheckDepth(*ppExpr, SQLITE_FTS3_MAX_EXPR_DEPTH);
1047 if( rc!=SQLITE_OK ){
1048 sqlite3Fts3ExprFree(*ppExpr);
1049 *ppExpr = 0;
1050 if( rc==SQLITE_TOOBIG ){
1051 sqlite3Fts3ErrMsg(pzErr,
1052 "FTS expression tree is too large (maximum depth %d)",
1053 SQLITE_FTS3_MAX_EXPR_DEPTH
1055 rc = SQLITE_ERROR;
1056 }else if( rc==SQLITE_ERROR ){
1057 sqlite3Fts3ErrMsg(pzErr, "malformed MATCH expression: [%s]", z);
1061 return rc;
1065 ** Free a single node of an expression tree.
1067 static void fts3FreeExprNode(Fts3Expr *p){
1068 assert( p->eType==FTSQUERY_PHRASE || p->pPhrase==0 );
1069 sqlite3Fts3EvalPhraseCleanup(p->pPhrase);
1070 sqlite3_free(p->aMI);
1071 sqlite3_free(p);
1075 ** Free a parsed fts3 query expression allocated by sqlite3Fts3ExprParse().
1077 ** This function would be simpler if it recursively called itself. But
1078 ** that would mean passing a sufficiently large expression to ExprParse()
1079 ** could cause a stack overflow.
1081 void sqlite3Fts3ExprFree(Fts3Expr *pDel){
1082 Fts3Expr *p;
1083 assert( pDel==0 || pDel->pParent==0 );
1084 for(p=pDel; p && (p->pLeft||p->pRight); p=(p->pLeft ? p->pLeft : p->pRight)){
1085 assert( p->pParent==0 || p==p->pParent->pRight || p==p->pParent->pLeft );
1087 while( p ){
1088 Fts3Expr *pParent = p->pParent;
1089 fts3FreeExprNode(p);
1090 if( pParent && p==pParent->pLeft && pParent->pRight ){
1091 p = pParent->pRight;
1092 while( p && (p->pLeft || p->pRight) ){
1093 assert( p==p->pParent->pRight || p==p->pParent->pLeft );
1094 p = (p->pLeft ? p->pLeft : p->pRight);
1096 }else{
1097 p = pParent;
1102 /****************************************************************************
1103 *****************************************************************************
1104 ** Everything after this point is just test code.
1107 #ifdef SQLITE_TEST
1109 #include <stdio.h>
1112 ** Function to query the hash-table of tokenizers (see README.tokenizers).
1114 static int queryTestTokenizer(
1115 sqlite3 *db,
1116 const char *zName,
1117 const sqlite3_tokenizer_module **pp
1119 int rc;
1120 sqlite3_stmt *pStmt;
1121 const char zSql[] = "SELECT fts3_tokenizer(?)";
1123 *pp = 0;
1124 rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
1125 if( rc!=SQLITE_OK ){
1126 return rc;
1129 sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
1130 if( SQLITE_ROW==sqlite3_step(pStmt) ){
1131 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
1132 memcpy((void *)pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
1136 return sqlite3_finalize(pStmt);
1140 ** Return a pointer to a buffer containing a text representation of the
1141 ** expression passed as the first argument. The buffer is obtained from
1142 ** sqlite3_malloc(). It is the responsibility of the caller to use
1143 ** sqlite3_free() to release the memory. If an OOM condition is encountered,
1144 ** NULL is returned.
1146 ** If the second argument is not NULL, then its contents are prepended to
1147 ** the returned expression text and then freed using sqlite3_free().
1149 static char *exprToString(Fts3Expr *pExpr, char *zBuf){
1150 if( pExpr==0 ){
1151 return sqlite3_mprintf("");
1153 switch( pExpr->eType ){
1154 case FTSQUERY_PHRASE: {
1155 Fts3Phrase *pPhrase = pExpr->pPhrase;
1156 int i;
1157 zBuf = sqlite3_mprintf(
1158 "%zPHRASE %d 0", zBuf, pPhrase->iColumn);
1159 for(i=0; zBuf && i<pPhrase->nToken; i++){
1160 zBuf = sqlite3_mprintf("%z %.*s%s", zBuf,
1161 pPhrase->aToken[i].n, pPhrase->aToken[i].z,
1162 (pPhrase->aToken[i].isPrefix?"+":"")
1165 return zBuf;
1168 case FTSQUERY_NEAR:
1169 zBuf = sqlite3_mprintf("%zNEAR/%d ", zBuf, pExpr->nNear);
1170 break;
1171 case FTSQUERY_NOT:
1172 zBuf = sqlite3_mprintf("%zNOT ", zBuf);
1173 break;
1174 case FTSQUERY_AND:
1175 zBuf = sqlite3_mprintf("%zAND ", zBuf);
1176 break;
1177 case FTSQUERY_OR:
1178 zBuf = sqlite3_mprintf("%zOR ", zBuf);
1179 break;
1182 if( zBuf ) zBuf = sqlite3_mprintf("%z{", zBuf);
1183 if( zBuf ) zBuf = exprToString(pExpr->pLeft, zBuf);
1184 if( zBuf ) zBuf = sqlite3_mprintf("%z} {", zBuf);
1186 if( zBuf ) zBuf = exprToString(pExpr->pRight, zBuf);
1187 if( zBuf ) zBuf = sqlite3_mprintf("%z}", zBuf);
1189 return zBuf;
1193 ** This is the implementation of a scalar SQL function used to test the
1194 ** expression parser. It should be called as follows:
1196 ** fts3_exprtest(<tokenizer>, <expr>, <column 1>, ...);
1198 ** The first argument, <tokenizer>, is the name of the fts3 tokenizer used
1199 ** to parse the query expression (see README.tokenizers). The second argument
1200 ** is the query expression to parse. Each subsequent argument is the name
1201 ** of a column of the fts3 table that the query expression may refer to.
1202 ** For example:
1204 ** SELECT fts3_exprtest('simple', 'Bill col2:Bloggs', 'col1', 'col2');
1206 static void fts3ExprTest(
1207 sqlite3_context *context,
1208 int argc,
1209 sqlite3_value **argv
1211 sqlite3_tokenizer_module const *pModule = 0;
1212 sqlite3_tokenizer *pTokenizer = 0;
1213 int rc;
1214 char **azCol = 0;
1215 const char *zExpr;
1216 int nExpr;
1217 int nCol;
1218 int ii;
1219 Fts3Expr *pExpr;
1220 char *zBuf = 0;
1221 sqlite3 *db = sqlite3_context_db_handle(context);
1223 if( argc<3 ){
1224 sqlite3_result_error(context,
1225 "Usage: fts3_exprtest(tokenizer, expr, col1, ...", -1
1227 return;
1230 rc = queryTestTokenizer(db,
1231 (const char *)sqlite3_value_text(argv[0]), &pModule);
1232 if( rc==SQLITE_NOMEM ){
1233 sqlite3_result_error_nomem(context);
1234 goto exprtest_out;
1235 }else if( !pModule ){
1236 sqlite3_result_error(context, "No such tokenizer module", -1);
1237 goto exprtest_out;
1240 rc = pModule->xCreate(0, 0, &pTokenizer);
1241 assert( rc==SQLITE_NOMEM || rc==SQLITE_OK );
1242 if( rc==SQLITE_NOMEM ){
1243 sqlite3_result_error_nomem(context);
1244 goto exprtest_out;
1246 pTokenizer->pModule = pModule;
1248 zExpr = (const char *)sqlite3_value_text(argv[1]);
1249 nExpr = sqlite3_value_bytes(argv[1]);
1250 nCol = argc-2;
1251 azCol = (char **)sqlite3_malloc(nCol*sizeof(char *));
1252 if( !azCol ){
1253 sqlite3_result_error_nomem(context);
1254 goto exprtest_out;
1256 for(ii=0; ii<nCol; ii++){
1257 azCol[ii] = (char *)sqlite3_value_text(argv[ii+2]);
1260 if( sqlite3_user_data(context) ){
1261 char *zDummy = 0;
1262 rc = sqlite3Fts3ExprParse(
1263 pTokenizer, 0, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr, &zDummy
1265 assert( rc==SQLITE_OK || pExpr==0 );
1266 sqlite3_free(zDummy);
1267 }else{
1268 rc = fts3ExprParseUnbalanced(
1269 pTokenizer, 0, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr
1273 if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){
1274 sqlite3Fts3ExprFree(pExpr);
1275 sqlite3_result_error(context, "Error parsing expression", -1);
1276 }else if( rc==SQLITE_NOMEM || !(zBuf = exprToString(pExpr, 0)) ){
1277 sqlite3_result_error_nomem(context);
1278 }else{
1279 sqlite3_result_text(context, zBuf, -1, SQLITE_TRANSIENT);
1280 sqlite3_free(zBuf);
1283 sqlite3Fts3ExprFree(pExpr);
1285 exprtest_out:
1286 if( pModule && pTokenizer ){
1287 rc = pModule->xDestroy(pTokenizer);
1289 sqlite3_free(azCol);
1293 ** Register the query expression parser test function fts3_exprtest()
1294 ** with database connection db.
1296 int sqlite3Fts3ExprInitTestInterface(sqlite3* db){
1297 int rc = sqlite3_create_function(
1298 db, "fts3_exprtest", -1, SQLITE_UTF8, 0, fts3ExprTest, 0, 0
1300 if( rc==SQLITE_OK ){
1301 rc = sqlite3_create_function(db, "fts3_exprtest_rebalance",
1302 -1, SQLITE_UTF8, (void *)1, fts3ExprTest, 0, 0
1305 return rc;
1308 #endif
1309 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */