Add the "sorter-reference" optimization, allowing SQLite to be configured so
[sqlite.git] / ext / fts5 / fts5_tokenize.c
blob b72a0c24ab9f4c227c9197443f137d2cf67130cd
1 /*
2 ** 2014 May 31
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
15 #include "fts5Int.h"
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
/*
** Default classification of the 128 ASCII byte values. Entry c is 1 if
** byte c is a token character and 0 if it is a separator. By default only
** the digits '0'..'9' and the letters 'A'..'Z' and 'a'..'z' are token
** characters.
*/
static unsigned char aAsciiTokenChar[128] = {
  /* 0x00..0x0F */  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x10..0x1F */  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x20..0x2F */  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x30..0x3F */  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,
  /* 0x40..0x4F */  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50..0x5F */  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,
  /* 0x60..0x6F */  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70..0x7F */  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,
};
/*
** State of one "ascii" tokenizer instance.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  unsigned char aTokenChar[128];  /* Non-zero for ASCII token characters */
};
41 static void fts5AsciiAddExceptions(
42 AsciiTokenizer *p,
43 const char *zArg,
44 int bTokenChars
46 int i;
47 for(i=0; zArg[i]; i++){
48 if( (zArg[i] & 0x80)==0 ){
49 p->aTokenChar[(int)zArg[i]] = (unsigned char)bTokenChars;
55 ** Delete a "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer *p){
58 sqlite3_free(p);
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
65 void *pUnused,
66 const char **azArg, int nArg,
67 Fts5Tokenizer **ppOut
69 int rc = SQLITE_OK;
70 AsciiTokenizer *p = 0;
71 UNUSED_PARAM(pUnused);
72 if( nArg%2 ){
73 rc = SQLITE_ERROR;
74 }else{
75 p = sqlite3_malloc(sizeof(AsciiTokenizer));
76 if( p==0 ){
77 rc = SQLITE_NOMEM;
78 }else{
79 int i;
80 memset(p, 0, sizeof(AsciiTokenizer));
81 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
82 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
83 const char *zArg = azArg[i+1];
84 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
85 fts5AsciiAddExceptions(p, zArg, 1);
86 }else
87 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
88 fts5AsciiAddExceptions(p, zArg, 0);
89 }else{
90 rc = SQLITE_ERROR;
93 if( rc!=SQLITE_OK ){
94 fts5AsciiDelete((Fts5Tokenizer*)p);
95 p = 0;
100 *ppOut = (Fts5Tokenizer*)p;
101 return rc;
/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper-case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      ch += 32;                 /* 'a'-'A' == 32 */
    }
    aOut[iByte++] = ch;
  }
}
115 ** Tokenize some text using the ascii tokenizer.
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer *pTokenizer,
119 void *pCtx,
120 int iUnused,
121 const char *pText, int nText,
122 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
124 AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
125 int rc = SQLITE_OK;
126 int ie;
127 int is = 0;
129 char aFold[64];
130 int nFold = sizeof(aFold);
131 char *pFold = aFold;
132 unsigned char *a = p->aTokenChar;
134 UNUSED_PARAM(iUnused);
136 while( is<nText && rc==SQLITE_OK ){
137 int nByte;
139 /* Skip any leading divider characters. */
140 while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
141 is++;
143 if( is==nText ) break;
145 /* Count the token characters */
146 ie = is+1;
147 while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
148 ie++;
151 /* Fold to lower case */
152 nByte = ie-is;
153 if( nByte>nFold ){
154 if( pFold!=aFold ) sqlite3_free(pFold);
155 pFold = sqlite3_malloc(nByte*2);
156 if( pFold==0 ){
157 rc = SQLITE_NOMEM;
158 break;
160 nFold = nByte*2;
162 asciiFold(pFold, &pText[is], nByte);
164 /* Invoke the token callback */
165 rc = xToken(pCtx, 0, pFold, nByte, is, ie);
166 is = ie+1;
169 if( pFold!=aFold ) sqlite3_free(pFold);
170 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
171 return rc;
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
184 #ifndef SQLITE_AMALGAMATION
/*
** Lookup table used by READ_UTF8. It is indexed by (first byte - 0xC0) for
** first bytes in the range 0xC0..0xFF and gives the initial value of the
** codepoint accumulator for that lead byte.
*/
static const unsigned char sqlite3Utf8Trans1[] = {
  /* 0xC0..0xCF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xD0..0xDF */
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xE0..0xEF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xF0..0xFF */
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
/*
** Read one UTF-8 encoded character starting at zIn and store its codepoint
** in c. zIn is advanced past the bytes consumed; continuation bytes are
** never read at or beyond zTerm. Results below 0x80 produced by a
** multi-byte sequence, surrogates (0xD800..0xDFFF) and values whose low 16
** bits are 0xFFFE or 0xFFFF with no higher bits set are replaced by 0xFFFD.
**
** zIn and c are evaluated more than once and must be plain lvalues.
*/
#define READ_UTF8(zIn, zTerm, c)                                        \
  c = *(zIn++);                                                         \
  if( c>=0xc0 ){                                                        \
    c = sqlite3Utf8Trans1[c-0xc0];                                      \
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){                         \
      c = (c<<6) + (0x3f & *(zIn++));                                   \
    }                                                                   \
    if( c<0x80 || (c&0xFFFFF800)==0xD800 || (c&0xFFFFFFFE)==0xFFFE ){   \
      c = 0xFFFD;                                                       \
    }                                                                   \
  }
/*
** Append the UTF-8 encoding of codepoint c to the buffer at zOut and
** advance zOut past the bytes written. Between one and four bytes are
** written: one for c<0x80, two for c<0x800, three for c<0x10000 and four
** otherwise.
**
** zOut and c are evaluated more than once and must be plain lvalues.
*/
#define WRITE_UTF8(zOut, c) {                                \
  if( c<0x00080 ){                                           \
    *zOut++ = (unsigned char)(c&0xFF);                       \
  }else if( c<0x00800 ){                                     \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);           \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);              \
  }else if( c<0x10000 ){                                     \
    *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F);          \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);         \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);              \
  }else{                                                     \
    *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07);        \
    *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F);        \
    *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F);         \
    *zOut++ = 0x80 + (unsigned char)(c & 0x3F);              \
  }                                                          \
}
230 #endif /* ifndef SQLITE_AMALGAMATION */
/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  int *aiException;               /* Sorted non-ASCII exception codepoints */
};
242 static int fts5UnicodeAddExceptions(
243 Unicode61Tokenizer *p, /* Tokenizer object */
244 const char *z, /* Characters to treat as exceptions */
245 int bTokenChars /* 1 for 'tokenchars', 0 for 'separators' */
247 int rc = SQLITE_OK;
248 int n = (int)strlen(z);
249 int *aNew;
251 if( n>0 ){
252 aNew = (int*)sqlite3_realloc(p->aiException, (n+p->nException)*sizeof(int));
253 if( aNew ){
254 int nNew = p->nException;
255 const unsigned char *zCsr = (const unsigned char*)z;
256 const unsigned char *zTerm = (const unsigned char*)&z[n];
257 while( zCsr<zTerm ){
258 int iCode;
259 int bToken;
260 READ_UTF8(zCsr, zTerm, iCode);
261 if( iCode<128 ){
262 p->aTokenChar[iCode] = (unsigned char)bTokenChars;
263 }else{
264 bToken = sqlite3Fts5UnicodeIsalnum(iCode);
265 assert( (bToken==0 || bToken==1) );
266 assert( (bTokenChars==0 || bTokenChars==1) );
267 if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
268 int i;
269 for(i=0; i<nNew; i++){
270 if( aNew[i]>iCode ) break;
272 memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
273 aNew[i] = iCode;
274 nNew++;
278 p->aiException = aNew;
279 p->nException = nNew;
280 }else{
281 rc = SQLITE_NOMEM;
285 return rc;
289 ** Return true if the p->aiException[] array contains the value iCode.
291 static int fts5UnicodeIsException(Unicode61Tokenizer *p, int iCode){
292 if( p->nException>0 ){
293 int *a = p->aiException;
294 int iLo = 0;
295 int iHi = p->nException-1;
297 while( iHi>=iLo ){
298 int iTest = (iHi + iLo) / 2;
299 if( iCode==a[iTest] ){
300 return 1;
301 }else if( iCode>a[iTest] ){
302 iLo = iTest+1;
303 }else{
304 iHi = iTest-1;
309 return 0;
313 ** Delete a "unicode61" tokenizer.
315 static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
316 if( pTok ){
317 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
318 sqlite3_free(p->aiException);
319 sqlite3_free(p->aFold);
320 sqlite3_free(p);
322 return;
326 ** Create a "unicode61" tokenizer.
328 static int fts5UnicodeCreate(
329 void *pUnused,
330 const char **azArg, int nArg,
331 Fts5Tokenizer **ppOut
333 int rc = SQLITE_OK; /* Return code */
334 Unicode61Tokenizer *p = 0; /* New tokenizer object */
336 UNUSED_PARAM(pUnused);
338 if( nArg%2 ){
339 rc = SQLITE_ERROR;
340 }else{
341 p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
342 if( p ){
343 int i;
344 memset(p, 0, sizeof(Unicode61Tokenizer));
345 memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
346 p->bRemoveDiacritic = 1;
347 p->nFold = 64;
348 p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
349 if( p->aFold==0 ){
350 rc = SQLITE_NOMEM;
352 for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
353 const char *zArg = azArg[i+1];
354 if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
355 if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
356 rc = SQLITE_ERROR;
358 p->bRemoveDiacritic = (zArg[0]=='1');
359 }else
360 if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
361 rc = fts5UnicodeAddExceptions(p, zArg, 1);
362 }else
363 if( 0==sqlite3_stricmp(azArg[i], "separators") ){
364 rc = fts5UnicodeAddExceptions(p, zArg, 0);
365 }else{
366 rc = SQLITE_ERROR;
369 }else{
370 rc = SQLITE_NOMEM;
372 if( rc!=SQLITE_OK ){
373 fts5UnicodeDelete((Fts5Tokenizer*)p);
374 p = 0;
376 *ppOut = (Fts5Tokenizer*)p;
378 return rc;
382 ** Return true if, for the purposes of tokenizing with the tokenizer
383 ** passed as the first argument, codepoint iCode is considered a token
384 ** character (not a separator).
386 static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
387 assert( (sqlite3Fts5UnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
388 return sqlite3Fts5UnicodeIsalnum(iCode) ^ fts5UnicodeIsException(p, iCode);
391 static int fts5UnicodeTokenize(
392 Fts5Tokenizer *pTokenizer,
393 void *pCtx,
394 int iUnused,
395 const char *pText, int nText,
396 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
398 Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
399 int rc = SQLITE_OK;
400 unsigned char *a = p->aTokenChar;
402 unsigned char *zTerm = (unsigned char*)&pText[nText];
403 unsigned char *zCsr = (unsigned char *)pText;
405 /* Output buffer */
406 char *aFold = p->aFold;
407 int nFold = p->nFold;
408 const char *pEnd = &aFold[nFold-6];
410 UNUSED_PARAM(iUnused);
412 /* Each iteration of this loop gobbles up a contiguous run of separators,
413 ** then the next token. */
414 while( rc==SQLITE_OK ){
415 int iCode; /* non-ASCII codepoint read from input */
416 char *zOut = aFold;
417 int is;
418 int ie;
420 /* Skip any separator characters. */
421 while( 1 ){
422 if( zCsr>=zTerm ) goto tokenize_done;
423 if( *zCsr & 0x80 ) {
424 /* A character outside of the ascii range. Skip past it if it is
425 ** a separator character. Or break out of the loop if it is not. */
426 is = zCsr - (unsigned char*)pText;
427 READ_UTF8(zCsr, zTerm, iCode);
428 if( fts5UnicodeIsAlnum(p, iCode) ){
429 goto non_ascii_tokenchar;
431 }else{
432 if( a[*zCsr] ){
433 is = zCsr - (unsigned char*)pText;
434 goto ascii_tokenchar;
436 zCsr++;
440 /* Run through the tokenchars. Fold them into the output buffer along
441 ** the way. */
442 while( zCsr<zTerm ){
444 /* Grow the output buffer so that there is sufficient space to fit the
445 ** largest possible utf-8 character. */
446 if( zOut>pEnd ){
447 aFold = sqlite3_malloc(nFold*2);
448 if( aFold==0 ){
449 rc = SQLITE_NOMEM;
450 goto tokenize_done;
452 zOut = &aFold[zOut - p->aFold];
453 memcpy(aFold, p->aFold, nFold);
454 sqlite3_free(p->aFold);
455 p->aFold = aFold;
456 p->nFold = nFold = nFold*2;
457 pEnd = &aFold[nFold-6];
460 if( *zCsr & 0x80 ){
461 /* An non-ascii-range character. Fold it into the output buffer if
462 ** it is a token character, or break out of the loop if it is not. */
463 READ_UTF8(zCsr, zTerm, iCode);
464 if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
465 non_ascii_tokenchar:
466 iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
467 if( iCode ) WRITE_UTF8(zOut, iCode);
468 }else{
469 break;
471 }else if( a[*zCsr]==0 ){
472 /* An ascii-range separator character. End of token. */
473 break;
474 }else{
475 ascii_tokenchar:
476 if( *zCsr>='A' && *zCsr<='Z' ){
477 *zOut++ = *zCsr + 32;
478 }else{
479 *zOut++ = *zCsr;
481 zCsr++;
483 ie = zCsr - (unsigned char*)pText;
486 /* Invoke the token callback */
487 rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
490 tokenize_done:
491 if( rc==SQLITE_DONE ) rc = SQLITE_OK;
492 return rc;
495 /**************************************************************************
496 ** Start of porter stemmer implementation.
/*
** Tokens longer than this many bytes are not stemmed. They are passed
** through to the output unchanged.
*/
#define FTS5_PORTER_MAX_TOKEN 64
503 typedef struct PorterTokenizer PorterTokenizer;
504 struct PorterTokenizer {
505 fts5_tokenizer tokenizer; /* Parent tokenizer module */
506 Fts5Tokenizer *pTokenizer; /* Parent tokenizer instance */
507 char aBuf[FTS5_PORTER_MAX_TOKEN + 64];
511 ** Delete a "porter" tokenizer.
513 static void fts5PorterDelete(Fts5Tokenizer *pTok){
514 if( pTok ){
515 PorterTokenizer *p = (PorterTokenizer*)pTok;
516 if( p->pTokenizer ){
517 p->tokenizer.xDelete(p->pTokenizer);
519 sqlite3_free(p);
524 ** Create a "porter" tokenizer.
526 static int fts5PorterCreate(
527 void *pCtx,
528 const char **azArg, int nArg,
529 Fts5Tokenizer **ppOut
531 fts5_api *pApi = (fts5_api*)pCtx;
532 int rc = SQLITE_OK;
533 PorterTokenizer *pRet;
534 void *pUserdata = 0;
535 const char *zBase = "unicode61";
537 if( nArg>0 ){
538 zBase = azArg[0];
541 pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
542 if( pRet ){
543 memset(pRet, 0, sizeof(PorterTokenizer));
544 rc = pApi->xFindTokenizer(pApi, zBase, &pUserdata, &pRet->tokenizer);
545 }else{
546 rc = SQLITE_NOMEM;
548 if( rc==SQLITE_OK ){
549 int nArg2 = (nArg>0 ? nArg-1 : 0);
550 const char **azArg2 = (nArg2 ? &azArg[1] : 0);
551 rc = pRet->tokenizer.xCreate(pUserdata, azArg2, nArg2, &pRet->pTokenizer);
554 if( rc!=SQLITE_OK ){
555 fts5PorterDelete((Fts5Tokenizer*)pRet);
556 pRet = 0;
558 *ppOut = (Fts5Tokenizer*)pRet;
559 return rc;
/*
** Context passed to fts5PorterCb() while the parent tokenizer is running.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;                     /* Context to pass on to xToken() */
  int (*xToken)(void*, int, const char*, int, int, int);  /* Real callback */
  char *aBuf;                     /* Scratch buffer for the stemmed token */
};
/*
** A single suffix-rewriting rule, as consumed by fts5PorterApply().
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;            /* Suffix to match */
  int nSuffix;                    /* Length of zSuffix in bytes */
  int (*xCond)(char *zStem, int nStem);   /* Condition on the stem, or NULL */
  const char *zOutput;            /* Text that replaces the suffix */
  int nOutput;                    /* Length of zOutput in bytes */
};
#if 0
/*
** Disabled code. Apply the first rule of aRule[] whose suffix matches the
** end of the *pnBuf byte buffer aBuf. aRule[] is terminated by an entry
** with zSuffix==0.
**
** If the matching rule has no condition, or its condition holds for the
** stem, the suffix is replaced by the rule's output, *pnBuf is updated and
** the index of the rule is returned. In all other cases -1 is returned and
** the buffer is left unchanged.
*/
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int nBuf = *pnBuf;
  PorterRule *pRule;

  for(pRule=aRule; pRule->zSuffix; pRule++){
    assert( strlen(pRule->zSuffix)==pRule->nSuffix );
    assert( strlen(pRule->zOutput)==pRule->nOutput );
    if( nBuf>=pRule->nSuffix
     && 0==memcmp(&aBuf[nBuf - pRule->nSuffix], pRule->zSuffix, pRule->nSuffix)
    ){
      /* Only the first matching rule is ever considered. */
      int nStem = nBuf - pRule->nSuffix;
      if( pRule->xCond==0 || pRule->xCond(aBuf, nStem) ){
        memcpy(&aBuf[nStem], pRule->zOutput, pRule->nOutput);
        *pnBuf = nStem + pRule->nOutput;
        return pRule - aRule;
      }
      return -1;
    }
  }

  return -1;
}
#endif
/*
** Return 1 if character c is a vowel, or 0 otherwise. The letters a, e, i,
** o and u are always vowels. The letter y is a vowel only if bYIsVowel is
** non-zero.
*/
static int fts5PorterIsVowel(char c, int bYIsVowel){
  switch( c ){
    case 'a': case 'e': case 'i': case 'o': case 'u':
      return 1;
    case 'y':
      return bYIsVowel!=0;
    default:
      return 0;
  }
}
/*
** Scan the nStem byte string zStem for the first vowel and then for the
** first consonant that follows it. If both are found, return the offset
** of the byte immediately after that consonant. Otherwise return 0.
**
** bPrevCons should be true if the character preceding zStem[0] is a
** consonant. It decides whether a leading 'y' counts as a vowel.
*/
static int fts5PorterGobbleVC(char *zStem, int nStem, int bPrevCons){
  int i = 0;
  int bCons = bPrevCons;

  /* Scan for a vowel */
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons==0 ) break;
    i++;
  }

  /* Scan for a consonant, starting just after the vowel */
  i++;
  while( i<nStem ){
    bCons = !fts5PorterIsVowel(zStem[i], bCons);
    if( bCons ) return i+1;
    i++;
  }
  return 0;
}
/*
** porter rule condition: (m > 0)
**
** True if the stem contains at least one vowel followed later by a
** consonant.
*/
static int fts5Porter_MGt0(char *zStem, int nStem){
  return fts5PorterGobbleVC(zStem, nStem, 0)!=0;
}
/*
** porter rule condition: (m > 1)
**
** True if two successive vowel..consonant sequences can be found in the
** stem.
*/
static int fts5Porter_MGt1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)!=0;
}
/*
** porter rule condition: (m = 1)
**
** True if exactly one vowel..consonant sequence can be found in the stem:
** a first one exists, and no second one follows it.
*/
static int fts5Porter_MEq1(char *zStem, int nStem){
  int nFirst = fts5PorterGobbleVC(zStem, nStem, 0);
  if( nFirst==0 ) return 0;
  return fts5PorterGobbleVC(&zStem[nFirst], nStem-nFirst, 1)==0;
}
/*
** porter rule condition: (*o)
**
** True if the stem ends with consonant-vowel-consonant and the final
** consonant is not 'w', 'x' or 'y'. Returns 0 for an empty stem.
**
** Only the classification of the last three characters is tested, so the
** running mask is trimmed to three bits on every step. This keeps the
** shift well-defined however long the stem is; an untrimmed signed int
** would overflow for stems of more than about 31 characters.
*/
static int fts5Porter_Ostar(char *zStem, int nStem){
  if( nStem<1 ) return 0;         /* Never read zStem[-1] */
  if( zStem[nStem-1]=='w' || zStem[nStem-1]=='x' || zStem[nStem-1]=='y' ){
    return 0;
  }else{
    int i;
    unsigned int mask = 0;        /* Bit set for each consonant, newest in bit 0 */
    int bCons = 0;
    for(i=0; i<nStem; i++){
      bCons = !fts5PorterIsVowel(zStem[i], bCons);
      assert( bCons==0 || bCons==1 );
      mask = ((mask << 1) + (unsigned int)bCons) & 0x0007;
    }
    return (mask==0x0005);
  }
}
/*
** porter rule condition: (m > 1 and (*S or *T))
**
** True if the stem ends in 's' or 't' and also satisfies (m > 1). The
** stem must not be empty.
*/
static int fts5Porter_MGt1_and_S_or_T(char *zStem, int nStem){
  char cLast;
  assert( nStem>0 );
  cLast = zStem[nStem-1];
  if( cLast!='s' && cLast!='t' ) return 0;
  return fts5Porter_MGt1(zStem, nStem);
}
/*
** porter rule condition: (*v*)
**
** True if the stem contains a vowel. A 'y' counts as a vowel anywhere
** except at the very start of the stem.
*/
static int fts5Porter_Vowel(char *zStem, int nStem){
  int i = 0;
  while( i<nStem ){
    if( fts5PorterIsVowel(zStem[i], i>0) ) return 1;
    i++;
  }
  return 0;
}
687 /**************************************************************************
688 ***************************************************************************
689 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
/*
** Porter stemmer step 4. If the *pnBuf byte word in aBuf ends with one of
** the suffixes handled below and the remaining stem satisfies (m > 1),
** remove the suffix by reducing *pnBuf. For the suffix "ion" the stem must
** additionally end in 's' or 't'. Always returns 0.
**
** The rules are selected by the second-to-last character of the word.
** Words shorter than two bytes are returned untouched so that no byte
** before the start of the buffer is ever read.
*/
static int fts5PorterStep4(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("al", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("ance", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ence", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'e':
      if( nBuf>2 && 0==memcmp("er", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("ic", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 'l':
      if( nBuf>4 && 0==memcmp("able", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>4 && 0==memcmp("ible", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ant", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>5 && 0==memcmp("ement", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt1(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }else if( nBuf>4 && 0==memcmp("ment", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt1(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }else if( nBuf>3 && 0==memcmp("ent", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'o':
      if( nBuf>3 && 0==memcmp("ion", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1_and_S_or_T(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>2 && 0==memcmp("ou", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_MGt1(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
        }
      }
      break;

    case 's':
      if( nBuf>3 && 0==memcmp("ism", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 't':
      if( nBuf>3 && 0==memcmp("ate", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }else if( nBuf>3 && 0==memcmp("iti", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ous", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>3 && 0==memcmp("ive", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'z':
      if( nBuf>3 && 0==memcmp("ize", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt1(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Second half of Porter stemmer step 1b. If the *pnBuf byte word in aBuf
** is longer than two bytes and ends in "at", "bl" or "iz", append an 'e'
** (giving "ate", "ble" or "ize"), increment *pnBuf and return 1. Otherwise
** leave the word unchanged and return 0.
**
** The caller must ensure there is room in aBuf for the extra byte.
**
** Words shorter than two bytes are returned untouched so that no byte
** before the start of the buffer is ever read. Step 1b can reduce a word
** to a single byte (for example "aing" becomes "a").
*/
static int fts5PorterStep1B2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>2 && 0==memcmp("at", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ate", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'b':
      if( nBuf>2 && 0==memcmp("bl", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ble", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

    case 'i':
      if( nBuf>2 && 0==memcmp("iz", &aBuf[nBuf-2], 2) ){
        memcpy(&aBuf[nBuf-2], "ize", 3);
        *pnBuf = nBuf - 2 + 3;
        ret = 1;
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 2. If the *pnBuf byte word in aBuf ends with one of
** the suffixes handled below and the remaining stem satisfies (m > 0),
** replace the suffix with its shorter form and update *pnBuf. Within each
** case the first suffix that matches is the only one considered, even if
** its condition then fails. Always returns 0.
**
** The rules are selected by the second-to-last character of the word.
** Words shorter than two bytes are returned untouched so that no byte
** before the start of the buffer is ever read.
*/
static int fts5PorterStep2(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>7 && 0==memcmp("ational", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ate", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("tional", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "tion", 4);
          *pnBuf = nBuf - 6 + 4;
        }
      }
      break;

    case 'c':
      if( nBuf>4 && 0==memcmp("enci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ence", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }else if( nBuf>4 && 0==memcmp("anci", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ance", 4);
          *pnBuf = nBuf - 4 + 4;
        }
      }
      break;

    case 'e':
      if( nBuf>4 && 0==memcmp("izer", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ize", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'g':
      if( nBuf>4 && 0==memcmp("logi", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "log", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 'l':
      if( nBuf>3 && 0==memcmp("bli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ble", 3);
          *pnBuf = nBuf - 3 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("alli", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "al", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("entli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ent", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>3 && 0==memcmp("eli", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "e", 1);
          *pnBuf = nBuf - 3 + 1;
        }
      }else if( nBuf>5 && 0==memcmp("ousli", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ous", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }
      break;

    case 'o':
      if( nBuf>7 && 0==memcmp("ization", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ize", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>5 && 0==memcmp("ation", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ate", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>4 && 0==memcmp("ator", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ate", 3);
          *pnBuf = nBuf - 4 + 3;
        }
      }
      break;

    case 's':
      if( nBuf>5 && 0==memcmp("alism", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>7 && 0==memcmp("iveness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ive", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("fulness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ful", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }else if( nBuf>7 && 0==memcmp("ousness", &aBuf[nBuf-7], 7) ){
        if( fts5Porter_MGt0(aBuf, nBuf-7) ){
          memcpy(&aBuf[nBuf-7], "ous", 3);
          *pnBuf = nBuf - 7 + 3;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("aliti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iviti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ive", 3);
          *pnBuf = nBuf - 5 + 3;
        }
      }else if( nBuf>6 && 0==memcmp("biliti", &aBuf[nBuf-6], 6) ){
        if( fts5Porter_MGt0(aBuf, nBuf-6) ){
          memcpy(&aBuf[nBuf-6], "ble", 3);
          *pnBuf = nBuf - 6 + 3;
        }
      }
      break;

  }
  return ret;
}
/*
** Porter stemmer step 3. If the *pnBuf byte word in aBuf ends with one of
** the suffixes handled below and the remaining stem satisfies (m > 0),
** replace or remove the suffix and update *pnBuf. Always returns 0.
**
** The rules are selected by the second-to-last character of the word.
** Words shorter than two bytes are returned untouched so that no byte
** before the start of the buffer is ever read.
*/
static int fts5PorterStep3(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'a':
      if( nBuf>4 && 0==memcmp("ical", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          memcpy(&aBuf[nBuf-4], "ic", 2);
          *pnBuf = nBuf - 4 + 2;
        }
      }
      break;

    case 's':
      if( nBuf>4 && 0==memcmp("ness", &aBuf[nBuf-4], 4) ){
        if( fts5Porter_MGt0(aBuf, nBuf-4) ){
          *pnBuf = nBuf - 4;
        }
      }
      break;

    case 't':
      if( nBuf>5 && 0==memcmp("icate", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }else if( nBuf>5 && 0==memcmp("iciti", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "ic", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

    case 'u':
      if( nBuf>3 && 0==memcmp("ful", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
        }
      }
      break;

    case 'v':
      if( nBuf>5 && 0==memcmp("ative", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          *pnBuf = nBuf - 5;
        }
      }
      break;

    case 'z':
      if( nBuf>5 && 0==memcmp("alize", &aBuf[nBuf-5], 5) ){
        if( fts5Porter_MGt0(aBuf, nBuf-5) ){
          memcpy(&aBuf[nBuf-5], "al", 2);
          *pnBuf = nBuf - 5 + 2;
        }
      }
      break;

  }
  return ret;
}
/*
** First half of Porter stemmer step 1b, applied to the *pnBuf byte word
** in aBuf:
**
**   "eed" -> "ee"   if the stem satisfies (m > 0). Returns 0.
**   "ed"  -> ""     if the stem contains a vowel. Returns 1.
**   "ing" -> ""     if the stem contains a vowel. Returns 1.
**
** A return value of 1 tells the caller to run the step 1b follow-up
** rules. In all other cases 0 is returned. A word ending in "eed" is never
** tested against the "ed" rule.
**
** Words shorter than two bytes are returned untouched so that no byte
** before the start of the buffer is ever read.
*/
static int fts5PorterStep1B(char *aBuf, int *pnBuf){
  int ret = 0;
  int nBuf = *pnBuf;
  if( nBuf<2 ) return ret;
  switch( aBuf[nBuf-2] ){

    case 'e':
      if( nBuf>3 && 0==memcmp("eed", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_MGt0(aBuf, nBuf-3) ){
          memcpy(&aBuf[nBuf-3], "ee", 2);
          *pnBuf = nBuf - 3 + 2;
        }
      }else if( nBuf>2 && 0==memcmp("ed", &aBuf[nBuf-2], 2) ){
        if( fts5Porter_Vowel(aBuf, nBuf-2) ){
          *pnBuf = nBuf - 2;
          ret = 1;
        }
      }
      break;

    case 'n':
      if( nBuf>3 && 0==memcmp("ing", &aBuf[nBuf-3], 3) ){
        if( fts5Porter_Vowel(aBuf, nBuf-3) ){
          *pnBuf = nBuf - 3;
          ret = 1;
        }
      }
      break;

  }
  return ret;
}
1106 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1107 ***************************************************************************
1108 **************************************************************************/
/*
** Porter stemmer step 1a: strip plural endings from the *pnBuf byte word
** in aBuf by reducing *pnBuf. The buffer contents are not modified.
**
**   "sses" -> "ss"     (words longer than 4 bytes)
**   "ies"  -> "i"      (words longer than 3 bytes)
**   "ss"   -> "ss"     (unchanged)
**   "s"    -> ""       (any other word ending in 's')
**
** The word must be at least two bytes long.
*/
static void fts5PorterStep1A(char *aBuf, int *pnBuf){
  int nBuf = *pnBuf;
  char cPrev;

  if( aBuf[nBuf-1]!='s' ) return;

  cPrev = aBuf[nBuf-2];
  if( cPrev=='e' ){
    int bSsesOrIes = (nBuf>4 && aBuf[nBuf-4]=='s' && aBuf[nBuf-3]=='s')
                  || (nBuf>3 && aBuf[nBuf-3]=='i');
    *pnBuf = bSsesOrIes ? nBuf-2 : nBuf-1;
  }else if( cPrev!='s' ){
    *pnBuf = nBuf-1;
  }
}
1128 static int fts5PorterCb(
1129 void *pCtx,
1130 int tflags,
1131 const char *pToken,
1132 int nToken,
1133 int iStart,
1134 int iEnd
1136 PorterContext *p = (PorterContext*)pCtx;
1138 char *aBuf;
1139 int nBuf;
1141 if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1142 aBuf = p->aBuf;
1143 nBuf = nToken;
1144 memcpy(aBuf, pToken, nBuf);
1146 /* Step 1. */
1147 fts5PorterStep1A(aBuf, &nBuf);
1148 if( fts5PorterStep1B(aBuf, &nBuf) ){
1149 if( fts5PorterStep1B2(aBuf, &nBuf)==0 ){
1150 char c = aBuf[nBuf-1];
1151 if( fts5PorterIsVowel(c, 0)==0
1152 && c!='l' && c!='s' && c!='z' && c==aBuf[nBuf-2]
1154 nBuf--;
1155 }else if( fts5Porter_MEq1(aBuf, nBuf) && fts5Porter_Ostar(aBuf, nBuf) ){
1156 aBuf[nBuf++] = 'e';
1161 /* Step 1C. */
1162 if( aBuf[nBuf-1]=='y' && fts5Porter_Vowel(aBuf, nBuf-1) ){
1163 aBuf[nBuf-1] = 'i';
1166 /* Steps 2 through 4. */
1167 fts5PorterStep2(aBuf, &nBuf);
1168 fts5PorterStep3(aBuf, &nBuf);
1169 fts5PorterStep4(aBuf, &nBuf);
1171 /* Step 5a. */
1172 assert( nBuf>0 );
1173 if( aBuf[nBuf-1]=='e' ){
1174 if( fts5Porter_MGt1(aBuf, nBuf-1)
1175 || (fts5Porter_MEq1(aBuf, nBuf-1) && !fts5Porter_Ostar(aBuf, nBuf-1))
1177 nBuf--;
1181 /* Step 5b. */
1182 if( nBuf>1 && aBuf[nBuf-1]=='l'
1183 && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
1185 nBuf--;
1188 return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
1190 pass_through:
1191 return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
1195 ** Tokenize using the porter tokenizer.
1197 static int fts5PorterTokenize(
1198 Fts5Tokenizer *pTokenizer,
1199 void *pCtx,
1200 int flags,
1201 const char *pText, int nText,
1202 int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
1204 PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
1205 PorterContext sCtx;
1206 sCtx.xToken = xToken;
1207 sCtx.pCtx = pCtx;
1208 sCtx.aBuf = p->aBuf;
1209 return p->tokenizer.xTokenize(
1210 p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
1215 ** Register all built-in tokenizers with FTS5.
1217 int sqlite3Fts5TokenizerInit(fts5_api *pApi){
1218 struct BuiltinTokenizer {
1219 const char *zName;
1220 fts5_tokenizer x;
1221 } aBuiltin[] = {
1222 { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
1223 { "ascii", {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
1224 { "porter", {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
1227 int rc = SQLITE_OK; /* Return code */
1228 int i; /* To iterate through builtin functions */
1230 for(i=0; rc==SQLITE_OK && i<ArraySize(aBuiltin); i++){
1231 rc = pApi->xCreateTokenizer(pApi,
1232 aBuiltin[i].zName,
1233 (void*)pApi,
1234 &aBuiltin[i].x,
1239 return rc;