4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 ******************************************************************************
17 /**************************************************************************
18 ** Start of ascii tokenizer implementation.
22 ** For tokenizers with no "unicode" modifier, the set of token characters
23 ** is the same as the set of ASCII range alphanumeric characters.
25 static unsigned char aAsciiTokenChar
[128] = {
26 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x00..0x0F */
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10..0x1F */
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20..0x2F */
29 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 0x30..0x3F */
30 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x40..0x4F */
31 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x50..0x5F */
32 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x60..0x6F */
33 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 0x70..0x7F */
36 typedef struct AsciiTokenizer AsciiTokenizer
;
37 struct AsciiTokenizer
{
38 unsigned char aTokenChar
[128];
41 static void fts5AsciiAddExceptions(
47 for(i
=0; zArg
[i
]; i
++){
48 if( (zArg
[i
] & 0x80)==0 ){
49 p
->aTokenChar
[(int)zArg
[i
]] = (unsigned char)bTokenChars
;
55 ** Delete an "ascii" tokenizer.
57 static void fts5AsciiDelete(Fts5Tokenizer
*p
){
62 ** Create an "ascii" tokenizer.
64 static int fts5AsciiCreate(
66 const char **azArg
, int nArg
,
70 AsciiTokenizer
*p
= 0;
71 UNUSED_PARAM(pUnused
);
75 p
= sqlite3_malloc(sizeof(AsciiTokenizer
));
80 memset(p
, 0, sizeof(AsciiTokenizer
));
81 memcpy(p
->aTokenChar
, aAsciiTokenChar
, sizeof(aAsciiTokenChar
));
82 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
83 const char *zArg
= azArg
[i
+1];
84 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
85 fts5AsciiAddExceptions(p
, zArg
, 1);
87 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
88 fts5AsciiAddExceptions(p
, zArg
, 0);
94 fts5AsciiDelete((Fts5Tokenizer
*)p
);
100 *ppOut
= (Fts5Tokenizer
*)p
;
105 static void asciiFold(char *aOut
, const char *aIn
, int nByte
){
107 for(i
=0; i
<nByte
; i
++){
109 if( c
>='A' && c
<='Z' ) c
+= 32;
115 ** Tokenize some text using the ascii tokenizer.
117 static int fts5AsciiTokenize(
118 Fts5Tokenizer
*pTokenizer
,
121 const char *pText
, int nText
,
122 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
124 AsciiTokenizer
*p
= (AsciiTokenizer
*)pTokenizer
;
130 int nFold
= sizeof(aFold
);
132 unsigned char *a
= p
->aTokenChar
;
134 UNUSED_PARAM(iUnused
);
136 while( is
<nText
&& rc
==SQLITE_OK
){
139 /* Skip any leading divider characters. */
140 while( is
<nText
&& ((pText
[is
]&0x80)==0 && a
[(int)pText
[is
]]==0) ){
143 if( is
==nText
) break;
145 /* Count the token characters */
147 while( ie
<nText
&& ((pText
[ie
]&0x80) || a
[(int)pText
[ie
]] ) ){
151 /* Fold to lower case */
154 if( pFold
!=aFold
) sqlite3_free(pFold
);
155 pFold
= sqlite3_malloc(nByte
*2);
162 asciiFold(pFold
, &pText
[is
], nByte
);
164 /* Invoke the token callback */
165 rc
= xToken(pCtx
, 0, pFold
, nByte
, is
, ie
);
169 if( pFold
!=aFold
) sqlite3_free(pFold
);
170 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
174 /**************************************************************************
175 ** Start of unicode61 tokenizer implementation.
180 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
181 ** from the sqlite3 source file utf.c. If this file is compiled as part
182 ** of the amalgamation, they are not required.
184 #ifndef SQLITE_AMALGAMATION
186 static const unsigned char sqlite3Utf8Trans1
[] = {
187 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
188 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
189 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
190 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
191 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
192 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
193 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
194 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
197 #define READ_UTF8(zIn, zTerm, c) \
200 c = sqlite3Utf8Trans1[c-0xc0]; \
201 while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){ \
202 c = (c<<6) + (0x3f & *(zIn++)); \
205 || (c&0xFFFFF800)==0xD800 \
206 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \
210 #define WRITE_UTF8(zOut, c) { \
212 *zOut++ = (unsigned char)(c&0xFF); \
214 else if( c<0x00800 ){ \
215 *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F); \
216 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
218 else if( c<0x10000 ){ \
219 *zOut++ = 0xE0 + (unsigned char)((c>>12)&0x0F); \
220 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \
221 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
223 *zOut++ = 0xF0 + (unsigned char)((c>>18) & 0x07); \
224 *zOut++ = 0x80 + (unsigned char)((c>>12) & 0x3F); \
225 *zOut++ = 0x80 + (unsigned char)((c>>6) & 0x3F); \
226 *zOut++ = 0x80 + (unsigned char)(c & 0x3F); \
230 #endif /* ifndef SQLITE_AMALGAMATION */
232 typedef struct Unicode61Tokenizer Unicode61Tokenizer
;
233 struct Unicode61Tokenizer
{
234 unsigned char aTokenChar
[128]; /* ASCII range token characters */
235 char *aFold
; /* Buffer to fold text into */
236 int nFold
; /* Size of aFold[] in bytes */
237 int bRemoveDiacritic
; /* True if remove_diacritics=1 is set */
242 static int fts5UnicodeAddExceptions(
243 Unicode61Tokenizer
*p
, /* Tokenizer object */
244 const char *z
, /* Characters to treat as exceptions */
245 int bTokenChars
/* 1 for 'tokenchars', 0 for 'separators' */
248 int n
= (int)strlen(z
);
252 aNew
= (int*)sqlite3_realloc(p
->aiException
, (n
+p
->nException
)*sizeof(int));
254 int nNew
= p
->nException
;
255 const unsigned char *zCsr
= (const unsigned char*)z
;
256 const unsigned char *zTerm
= (const unsigned char*)&z
[n
];
260 READ_UTF8(zCsr
, zTerm
, iCode
);
262 p
->aTokenChar
[iCode
] = (unsigned char)bTokenChars
;
264 bToken
= sqlite3Fts5UnicodeIsalnum(iCode
);
265 assert( (bToken
==0 || bToken
==1) );
266 assert( (bTokenChars
==0 || bTokenChars
==1) );
267 if( bToken
!=bTokenChars
&& sqlite3Fts5UnicodeIsdiacritic(iCode
)==0 ){
269 for(i
=0; i
<nNew
; i
++){
270 if( aNew
[i
]>iCode
) break;
272 memmove(&aNew
[i
+1], &aNew
[i
], (nNew
-i
)*sizeof(int));
278 p
->aiException
= aNew
;
279 p
->nException
= nNew
;
289 ** Return true if the p->aiException[] array contains the value iCode.
291 static int fts5UnicodeIsException(Unicode61Tokenizer
*p
, int iCode
){
292 if( p
->nException
>0 ){
293 int *a
= p
->aiException
;
295 int iHi
= p
->nException
-1;
298 int iTest
= (iHi
+ iLo
) / 2;
299 if( iCode
==a
[iTest
] ){
301 }else if( iCode
>a
[iTest
] ){
313 ** Delete a "unicode61" tokenizer.
315 static void fts5UnicodeDelete(Fts5Tokenizer
*pTok
){
317 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTok
;
318 sqlite3_free(p
->aiException
);
319 sqlite3_free(p
->aFold
);
326 ** Create a "unicode61" tokenizer.
328 static int fts5UnicodeCreate(
330 const char **azArg
, int nArg
,
331 Fts5Tokenizer
**ppOut
333 int rc
= SQLITE_OK
; /* Return code */
334 Unicode61Tokenizer
*p
= 0; /* New tokenizer object */
336 UNUSED_PARAM(pUnused
);
341 p
= (Unicode61Tokenizer
*)sqlite3_malloc(sizeof(Unicode61Tokenizer
));
344 memset(p
, 0, sizeof(Unicode61Tokenizer
));
345 memcpy(p
->aTokenChar
, aAsciiTokenChar
, sizeof(aAsciiTokenChar
));
346 p
->bRemoveDiacritic
= 1;
348 p
->aFold
= sqlite3_malloc(p
->nFold
* sizeof(char));
352 for(i
=0; rc
==SQLITE_OK
&& i
<nArg
; i
+=2){
353 const char *zArg
= azArg
[i
+1];
354 if( 0==sqlite3_stricmp(azArg
[i
], "remove_diacritics") ){
355 if( (zArg
[0]!='0' && zArg
[0]!='1') || zArg
[1] ){
358 p
->bRemoveDiacritic
= (zArg
[0]=='1');
360 if( 0==sqlite3_stricmp(azArg
[i
], "tokenchars") ){
361 rc
= fts5UnicodeAddExceptions(p
, zArg
, 1);
363 if( 0==sqlite3_stricmp(azArg
[i
], "separators") ){
364 rc
= fts5UnicodeAddExceptions(p
, zArg
, 0);
373 fts5UnicodeDelete((Fts5Tokenizer
*)p
);
376 *ppOut
= (Fts5Tokenizer
*)p
;
382 ** Return true if, for the purposes of tokenizing with the tokenizer
383 ** passed as the first argument, codepoint iCode is considered a token
384 ** character (not a separator).
386 static int fts5UnicodeIsAlnum(Unicode61Tokenizer
*p
, int iCode
){
387 assert( (sqlite3Fts5UnicodeIsalnum(iCode
) & 0xFFFFFFFE)==0 );
388 return sqlite3Fts5UnicodeIsalnum(iCode
) ^ fts5UnicodeIsException(p
, iCode
);
391 static int fts5UnicodeTokenize(
392 Fts5Tokenizer
*pTokenizer
,
395 const char *pText
, int nText
,
396 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
398 Unicode61Tokenizer
*p
= (Unicode61Tokenizer
*)pTokenizer
;
400 unsigned char *a
= p
->aTokenChar
;
402 unsigned char *zTerm
= (unsigned char*)&pText
[nText
];
403 unsigned char *zCsr
= (unsigned char *)pText
;
406 char *aFold
= p
->aFold
;
407 int nFold
= p
->nFold
;
408 const char *pEnd
= &aFold
[nFold
-6];
410 UNUSED_PARAM(iUnused
);
412 /* Each iteration of this loop gobbles up a contiguous run of separators,
413 ** then the next token. */
414 while( rc
==SQLITE_OK
){
415 int iCode
; /* non-ASCII codepoint read from input */
420 /* Skip any separator characters. */
422 if( zCsr
>=zTerm
) goto tokenize_done
;
424 /* A character outside of the ascii range. Skip past it if it is
425 ** a separator character. Or break out of the loop if it is not. */
426 is
= zCsr
- (unsigned char*)pText
;
427 READ_UTF8(zCsr
, zTerm
, iCode
);
428 if( fts5UnicodeIsAlnum(p
, iCode
) ){
429 goto non_ascii_tokenchar
;
433 is
= zCsr
- (unsigned char*)pText
;
434 goto ascii_tokenchar
;
440 /* Run through the tokenchars. Fold them into the output buffer along
444 /* Grow the output buffer so that there is sufficient space to fit the
445 ** largest possible utf-8 character. */
447 aFold
= sqlite3_malloc(nFold
*2);
452 zOut
= &aFold
[zOut
- p
->aFold
];
453 memcpy(aFold
, p
->aFold
, nFold
);
454 sqlite3_free(p
->aFold
);
456 p
->nFold
= nFold
= nFold
*2;
457 pEnd
= &aFold
[nFold
-6];
461 /* A non-ascii-range character. Fold it into the output buffer if
462 ** it is a token character, or break out of the loop if it is not. */
463 READ_UTF8(zCsr
, zTerm
, iCode
);
464 if( fts5UnicodeIsAlnum(p
,iCode
)||sqlite3Fts5UnicodeIsdiacritic(iCode
) ){
466 iCode
= sqlite3Fts5UnicodeFold(iCode
, p
->bRemoveDiacritic
);
467 if( iCode
) WRITE_UTF8(zOut
, iCode
);
471 }else if( a
[*zCsr
]==0 ){
472 /* An ascii-range separator character. End of token. */
476 if( *zCsr
>='A' && *zCsr
<='Z' ){
477 *zOut
++ = *zCsr
+ 32;
483 ie
= zCsr
- (unsigned char*)pText
;
486 /* Invoke the token callback */
487 rc
= xToken(pCtx
, 0, aFold
, zOut
-aFold
, is
, ie
);
491 if( rc
==SQLITE_DONE
) rc
= SQLITE_OK
;
495 /**************************************************************************
496 ** Start of porter stemmer implementation.
499 /* Any tokens larger than this (in bytes) are passed through without
501 #define FTS5_PORTER_MAX_TOKEN 64
503 typedef struct PorterTokenizer PorterTokenizer
;
504 struct PorterTokenizer
{
505 fts5_tokenizer tokenizer
; /* Parent tokenizer module */
506 Fts5Tokenizer
*pTokenizer
; /* Parent tokenizer instance */
507 char aBuf
[FTS5_PORTER_MAX_TOKEN
+ 64];
511 ** Delete a "porter" tokenizer.
513 static void fts5PorterDelete(Fts5Tokenizer
*pTok
){
515 PorterTokenizer
*p
= (PorterTokenizer
*)pTok
;
517 p
->tokenizer
.xDelete(p
->pTokenizer
);
524 ** Create a "porter" tokenizer.
526 static int fts5PorterCreate(
528 const char **azArg
, int nArg
,
529 Fts5Tokenizer
**ppOut
531 fts5_api
*pApi
= (fts5_api
*)pCtx
;
533 PorterTokenizer
*pRet
;
535 const char *zBase
= "unicode61";
541 pRet
= (PorterTokenizer
*)sqlite3_malloc(sizeof(PorterTokenizer
));
543 memset(pRet
, 0, sizeof(PorterTokenizer
));
544 rc
= pApi
->xFindTokenizer(pApi
, zBase
, &pUserdata
, &pRet
->tokenizer
);
549 int nArg2
= (nArg
>0 ? nArg
-1 : 0);
550 const char **azArg2
= (nArg2
? &azArg
[1] : 0);
551 rc
= pRet
->tokenizer
.xCreate(pUserdata
, azArg2
, nArg2
, &pRet
->pTokenizer
);
555 fts5PorterDelete((Fts5Tokenizer
*)pRet
);
558 *ppOut
= (Fts5Tokenizer
*)pRet
;
562 typedef struct PorterContext PorterContext
;
563 struct PorterContext
{
565 int (*xToken
)(void*, int, const char*, int, int, int);
569 typedef struct PorterRule PorterRule
;
573 int (*xCond
)(char *zStem
, int nStem
);
579 static int fts5PorterApply(char *aBuf
, int *pnBuf
, PorterRule
*aRule
){
584 for(p
=aRule
; p
->zSuffix
; p
++){
585 assert( strlen(p
->zSuffix
)==p
->nSuffix
);
586 assert( strlen(p
->zOutput
)==p
->nOutput
);
587 if( nBuf
<p
->nSuffix
) continue;
588 if( 0==memcmp(&aBuf
[nBuf
- p
->nSuffix
], p
->zSuffix
, p
->nSuffix
) ) break;
592 int nStem
= nBuf
- p
->nSuffix
;
593 if( p
->xCond
==0 || p
->xCond(aBuf
, nStem
) ){
594 memcpy(&aBuf
[nStem
], p
->zOutput
, p
->nOutput
);
595 *pnBuf
= nStem
+ p
->nOutput
;
604 static int fts5PorterIsVowel(char c
, int bYIsVowel
){
606 c
=='a' || c
=='e' || c
=='i' || c
=='o' || c
=='u' || (bYIsVowel
&& c
=='y')
610 static int fts5PorterGobbleVC(char *zStem
, int nStem
, int bPrevCons
){
612 int bCons
= bPrevCons
;
614 /* Scan for a vowel */
615 for(i
=0; i
<nStem
; i
++){
616 if( 0==(bCons
= !fts5PorterIsVowel(zStem
[i
], bCons
)) ) break;
619 /* Scan for a consonant */
620 for(i
++; i
<nStem
; i
++){
621 if( (bCons
= !fts5PorterIsVowel(zStem
[i
], bCons
)) ) return i
+1;
626 /* porter rule condition: (m > 0) */
627 static int fts5Porter_MGt0(char *zStem
, int nStem
){
628 return !!fts5PorterGobbleVC(zStem
, nStem
, 0);
631 /* porter rule condition: (m > 1) */
632 static int fts5Porter_MGt1(char *zStem
, int nStem
){
634 n
= fts5PorterGobbleVC(zStem
, nStem
, 0);
635 if( n
&& fts5PorterGobbleVC(&zStem
[n
], nStem
-n
, 1) ){
641 /* porter rule condition: (m = 1) */
642 static int fts5Porter_MEq1(char *zStem
, int nStem
){
644 n
= fts5PorterGobbleVC(zStem
, nStem
, 0);
645 if( n
&& 0==fts5PorterGobbleVC(&zStem
[n
], nStem
-n
, 1) ){
651 /* porter rule condition: (*o) */
652 static int fts5Porter_Ostar(char *zStem
, int nStem
){
653 if( zStem
[nStem
-1]=='w' || zStem
[nStem
-1]=='x' || zStem
[nStem
-1]=='y' ){
659 for(i
=0; i
<nStem
; i
++){
660 bCons
= !fts5PorterIsVowel(zStem
[i
], bCons
);
661 assert( bCons
==0 || bCons
==1 );
662 mask
= (mask
<< 1) + bCons
;
664 return ((mask
& 0x0007)==0x0005);
668 /* porter rule condition: (m > 1 and (*S or *T)) */
669 static int fts5Porter_MGt1_and_S_or_T(char *zStem
, int nStem
){
671 return (zStem
[nStem
-1]=='s' || zStem
[nStem
-1]=='t')
672 && fts5Porter_MGt1(zStem
, nStem
);
675 /* porter rule condition: (*v*) */
676 static int fts5Porter_Vowel(char *zStem
, int nStem
){
678 for(i
=0; i
<nStem
; i
++){
679 if( fts5PorterIsVowel(zStem
[i
], i
>0) ){
687 /**************************************************************************
688 ***************************************************************************
689 ** GENERATED CODE STARTS HERE (mkportersteps.tcl)
692 static int fts5PorterStep4(char *aBuf
, int *pnBuf
){
695 switch( aBuf
[nBuf
-2] ){
698 if( nBuf
>2 && 0==memcmp("al", &aBuf
[nBuf
-2], 2) ){
699 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
706 if( nBuf
>4 && 0==memcmp("ance", &aBuf
[nBuf
-4], 4) ){
707 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
710 }else if( nBuf
>4 && 0==memcmp("ence", &aBuf
[nBuf
-4], 4) ){
711 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
718 if( nBuf
>2 && 0==memcmp("er", &aBuf
[nBuf
-2], 2) ){
719 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
726 if( nBuf
>2 && 0==memcmp("ic", &aBuf
[nBuf
-2], 2) ){
727 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
734 if( nBuf
>4 && 0==memcmp("able", &aBuf
[nBuf
-4], 4) ){
735 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
738 }else if( nBuf
>4 && 0==memcmp("ible", &aBuf
[nBuf
-4], 4) ){
739 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
746 if( nBuf
>3 && 0==memcmp("ant", &aBuf
[nBuf
-3], 3) ){
747 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
750 }else if( nBuf
>5 && 0==memcmp("ement", &aBuf
[nBuf
-5], 5) ){
751 if( fts5Porter_MGt1(aBuf
, nBuf
-5) ){
754 }else if( nBuf
>4 && 0==memcmp("ment", &aBuf
[nBuf
-4], 4) ){
755 if( fts5Porter_MGt1(aBuf
, nBuf
-4) ){
758 }else if( nBuf
>3 && 0==memcmp("ent", &aBuf
[nBuf
-3], 3) ){
759 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
766 if( nBuf
>3 && 0==memcmp("ion", &aBuf
[nBuf
-3], 3) ){
767 if( fts5Porter_MGt1_and_S_or_T(aBuf
, nBuf
-3) ){
770 }else if( nBuf
>2 && 0==memcmp("ou", &aBuf
[nBuf
-2], 2) ){
771 if( fts5Porter_MGt1(aBuf
, nBuf
-2) ){
778 if( nBuf
>3 && 0==memcmp("ism", &aBuf
[nBuf
-3], 3) ){
779 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
786 if( nBuf
>3 && 0==memcmp("ate", &aBuf
[nBuf
-3], 3) ){
787 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
790 }else if( nBuf
>3 && 0==memcmp("iti", &aBuf
[nBuf
-3], 3) ){
791 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
798 if( nBuf
>3 && 0==memcmp("ous", &aBuf
[nBuf
-3], 3) ){
799 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
806 if( nBuf
>3 && 0==memcmp("ive", &aBuf
[nBuf
-3], 3) ){
807 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
814 if( nBuf
>3 && 0==memcmp("ize", &aBuf
[nBuf
-3], 3) ){
815 if( fts5Porter_MGt1(aBuf
, nBuf
-3) ){
826 static int fts5PorterStep1B2(char *aBuf
, int *pnBuf
){
829 switch( aBuf
[nBuf
-2] ){
832 if( nBuf
>2 && 0==memcmp("at", &aBuf
[nBuf
-2], 2) ){
833 memcpy(&aBuf
[nBuf
-2], "ate", 3);
834 *pnBuf
= nBuf
- 2 + 3;
840 if( nBuf
>2 && 0==memcmp("bl", &aBuf
[nBuf
-2], 2) ){
841 memcpy(&aBuf
[nBuf
-2], "ble", 3);
842 *pnBuf
= nBuf
- 2 + 3;
848 if( nBuf
>2 && 0==memcmp("iz", &aBuf
[nBuf
-2], 2) ){
849 memcpy(&aBuf
[nBuf
-2], "ize", 3);
850 *pnBuf
= nBuf
- 2 + 3;
860 static int fts5PorterStep2(char *aBuf
, int *pnBuf
){
863 switch( aBuf
[nBuf
-2] ){
866 if( nBuf
>7 && 0==memcmp("ational", &aBuf
[nBuf
-7], 7) ){
867 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
868 memcpy(&aBuf
[nBuf
-7], "ate", 3);
869 *pnBuf
= nBuf
- 7 + 3;
871 }else if( nBuf
>6 && 0==memcmp("tional", &aBuf
[nBuf
-6], 6) ){
872 if( fts5Porter_MGt0(aBuf
, nBuf
-6) ){
873 memcpy(&aBuf
[nBuf
-6], "tion", 4);
874 *pnBuf
= nBuf
- 6 + 4;
880 if( nBuf
>4 && 0==memcmp("enci", &aBuf
[nBuf
-4], 4) ){
881 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
882 memcpy(&aBuf
[nBuf
-4], "ence", 4);
883 *pnBuf
= nBuf
- 4 + 4;
885 }else if( nBuf
>4 && 0==memcmp("anci", &aBuf
[nBuf
-4], 4) ){
886 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
887 memcpy(&aBuf
[nBuf
-4], "ance", 4);
888 *pnBuf
= nBuf
- 4 + 4;
894 if( nBuf
>4 && 0==memcmp("izer", &aBuf
[nBuf
-4], 4) ){
895 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
896 memcpy(&aBuf
[nBuf
-4], "ize", 3);
897 *pnBuf
= nBuf
- 4 + 3;
903 if( nBuf
>4 && 0==memcmp("logi", &aBuf
[nBuf
-4], 4) ){
904 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
905 memcpy(&aBuf
[nBuf
-4], "log", 3);
906 *pnBuf
= nBuf
- 4 + 3;
912 if( nBuf
>3 && 0==memcmp("bli", &aBuf
[nBuf
-3], 3) ){
913 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
914 memcpy(&aBuf
[nBuf
-3], "ble", 3);
915 *pnBuf
= nBuf
- 3 + 3;
917 }else if( nBuf
>4 && 0==memcmp("alli", &aBuf
[nBuf
-4], 4) ){
918 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
919 memcpy(&aBuf
[nBuf
-4], "al", 2);
920 *pnBuf
= nBuf
- 4 + 2;
922 }else if( nBuf
>5 && 0==memcmp("entli", &aBuf
[nBuf
-5], 5) ){
923 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
924 memcpy(&aBuf
[nBuf
-5], "ent", 3);
925 *pnBuf
= nBuf
- 5 + 3;
927 }else if( nBuf
>3 && 0==memcmp("eli", &aBuf
[nBuf
-3], 3) ){
928 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
929 memcpy(&aBuf
[nBuf
-3], "e", 1);
930 *pnBuf
= nBuf
- 3 + 1;
932 }else if( nBuf
>5 && 0==memcmp("ousli", &aBuf
[nBuf
-5], 5) ){
933 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
934 memcpy(&aBuf
[nBuf
-5], "ous", 3);
935 *pnBuf
= nBuf
- 5 + 3;
941 if( nBuf
>7 && 0==memcmp("ization", &aBuf
[nBuf
-7], 7) ){
942 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
943 memcpy(&aBuf
[nBuf
-7], "ize", 3);
944 *pnBuf
= nBuf
- 7 + 3;
946 }else if( nBuf
>5 && 0==memcmp("ation", &aBuf
[nBuf
-5], 5) ){
947 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
948 memcpy(&aBuf
[nBuf
-5], "ate", 3);
949 *pnBuf
= nBuf
- 5 + 3;
951 }else if( nBuf
>4 && 0==memcmp("ator", &aBuf
[nBuf
-4], 4) ){
952 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
953 memcpy(&aBuf
[nBuf
-4], "ate", 3);
954 *pnBuf
= nBuf
- 4 + 3;
960 if( nBuf
>5 && 0==memcmp("alism", &aBuf
[nBuf
-5], 5) ){
961 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
962 memcpy(&aBuf
[nBuf
-5], "al", 2);
963 *pnBuf
= nBuf
- 5 + 2;
965 }else if( nBuf
>7 && 0==memcmp("iveness", &aBuf
[nBuf
-7], 7) ){
966 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
967 memcpy(&aBuf
[nBuf
-7], "ive", 3);
968 *pnBuf
= nBuf
- 7 + 3;
970 }else if( nBuf
>7 && 0==memcmp("fulness", &aBuf
[nBuf
-7], 7) ){
971 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
972 memcpy(&aBuf
[nBuf
-7], "ful", 3);
973 *pnBuf
= nBuf
- 7 + 3;
975 }else if( nBuf
>7 && 0==memcmp("ousness", &aBuf
[nBuf
-7], 7) ){
976 if( fts5Porter_MGt0(aBuf
, nBuf
-7) ){
977 memcpy(&aBuf
[nBuf
-7], "ous", 3);
978 *pnBuf
= nBuf
- 7 + 3;
984 if( nBuf
>5 && 0==memcmp("aliti", &aBuf
[nBuf
-5], 5) ){
985 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
986 memcpy(&aBuf
[nBuf
-5], "al", 2);
987 *pnBuf
= nBuf
- 5 + 2;
989 }else if( nBuf
>5 && 0==memcmp("iviti", &aBuf
[nBuf
-5], 5) ){
990 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
991 memcpy(&aBuf
[nBuf
-5], "ive", 3);
992 *pnBuf
= nBuf
- 5 + 3;
994 }else if( nBuf
>6 && 0==memcmp("biliti", &aBuf
[nBuf
-6], 6) ){
995 if( fts5Porter_MGt0(aBuf
, nBuf
-6) ){
996 memcpy(&aBuf
[nBuf
-6], "ble", 3);
997 *pnBuf
= nBuf
- 6 + 3;
1007 static int fts5PorterStep3(char *aBuf
, int *pnBuf
){
1010 switch( aBuf
[nBuf
-2] ){
1013 if( nBuf
>4 && 0==memcmp("ical", &aBuf
[nBuf
-4], 4) ){
1014 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1015 memcpy(&aBuf
[nBuf
-4], "ic", 2);
1016 *pnBuf
= nBuf
- 4 + 2;
1022 if( nBuf
>4 && 0==memcmp("ness", &aBuf
[nBuf
-4], 4) ){
1023 if( fts5Porter_MGt0(aBuf
, nBuf
-4) ){
1030 if( nBuf
>5 && 0==memcmp("icate", &aBuf
[nBuf
-5], 5) ){
1031 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1032 memcpy(&aBuf
[nBuf
-5], "ic", 2);
1033 *pnBuf
= nBuf
- 5 + 2;
1035 }else if( nBuf
>5 && 0==memcmp("iciti", &aBuf
[nBuf
-5], 5) ){
1036 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1037 memcpy(&aBuf
[nBuf
-5], "ic", 2);
1038 *pnBuf
= nBuf
- 5 + 2;
1044 if( nBuf
>3 && 0==memcmp("ful", &aBuf
[nBuf
-3], 3) ){
1045 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
1052 if( nBuf
>5 && 0==memcmp("ative", &aBuf
[nBuf
-5], 5) ){
1053 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1060 if( nBuf
>5 && 0==memcmp("alize", &aBuf
[nBuf
-5], 5) ){
1061 if( fts5Porter_MGt0(aBuf
, nBuf
-5) ){
1062 memcpy(&aBuf
[nBuf
-5], "al", 2);
1063 *pnBuf
= nBuf
- 5 + 2;
1073 static int fts5PorterStep1B(char *aBuf
, int *pnBuf
){
1076 switch( aBuf
[nBuf
-2] ){
1079 if( nBuf
>3 && 0==memcmp("eed", &aBuf
[nBuf
-3], 3) ){
1080 if( fts5Porter_MGt0(aBuf
, nBuf
-3) ){
1081 memcpy(&aBuf
[nBuf
-3], "ee", 2);
1082 *pnBuf
= nBuf
- 3 + 2;
1084 }else if( nBuf
>2 && 0==memcmp("ed", &aBuf
[nBuf
-2], 2) ){
1085 if( fts5Porter_Vowel(aBuf
, nBuf
-2) ){
1093 if( nBuf
>3 && 0==memcmp("ing", &aBuf
[nBuf
-3], 3) ){
1094 if( fts5Porter_Vowel(aBuf
, nBuf
-3) ){
1106 ** GENERATED CODE ENDS HERE (mkportersteps.tcl)
1107 ***************************************************************************
1108 **************************************************************************/
1110 static void fts5PorterStep1A(char *aBuf
, int *pnBuf
){
1112 if( aBuf
[nBuf
-1]=='s' ){
1113 if( aBuf
[nBuf
-2]=='e' ){
1114 if( (nBuf
>4 && aBuf
[nBuf
-4]=='s' && aBuf
[nBuf
-3]=='s')
1115 || (nBuf
>3 && aBuf
[nBuf
-3]=='i' )
1122 else if( aBuf
[nBuf
-2]!='s' ){
1128 static int fts5PorterCb(
1136 PorterContext
*p
= (PorterContext
*)pCtx
;
1141 if( nToken
>FTS5_PORTER_MAX_TOKEN
|| nToken
<3 ) goto pass_through
;
1144 memcpy(aBuf
, pToken
, nBuf
);
1147 fts5PorterStep1A(aBuf
, &nBuf
);
1148 if( fts5PorterStep1B(aBuf
, &nBuf
) ){
1149 if( fts5PorterStep1B2(aBuf
, &nBuf
)==0 ){
1150 char c
= aBuf
[nBuf
-1];
1151 if( fts5PorterIsVowel(c
, 0)==0
1152 && c
!='l' && c
!='s' && c
!='z' && c
==aBuf
[nBuf
-2]
1155 }else if( fts5Porter_MEq1(aBuf
, nBuf
) && fts5Porter_Ostar(aBuf
, nBuf
) ){
1162 if( aBuf
[nBuf
-1]=='y' && fts5Porter_Vowel(aBuf
, nBuf
-1) ){
1166 /* Steps 2 through 4. */
1167 fts5PorterStep2(aBuf
, &nBuf
);
1168 fts5PorterStep3(aBuf
, &nBuf
);
1169 fts5PorterStep4(aBuf
, &nBuf
);
1173 if( aBuf
[nBuf
-1]=='e' ){
1174 if( fts5Porter_MGt1(aBuf
, nBuf
-1)
1175 || (fts5Porter_MEq1(aBuf
, nBuf
-1) && !fts5Porter_Ostar(aBuf
, nBuf
-1))
1182 if( nBuf
>1 && aBuf
[nBuf
-1]=='l'
1183 && aBuf
[nBuf
-2]=='l' && fts5Porter_MGt1(aBuf
, nBuf
-1)
1188 return p
->xToken(p
->pCtx
, tflags
, aBuf
, nBuf
, iStart
, iEnd
);
1191 return p
->xToken(p
->pCtx
, tflags
, pToken
, nToken
, iStart
, iEnd
);
1195 ** Tokenize using the porter tokenizer.
1197 static int fts5PorterTokenize(
1198 Fts5Tokenizer
*pTokenizer
,
1201 const char *pText
, int nText
,
1202 int (*xToken
)(void*, int, const char*, int nToken
, int iStart
, int iEnd
)
1204 PorterTokenizer
*p
= (PorterTokenizer
*)pTokenizer
;
1206 sCtx
.xToken
= xToken
;
1208 sCtx
.aBuf
= p
->aBuf
;
1209 return p
->tokenizer
.xTokenize(
1210 p
->pTokenizer
, (void*)&sCtx
, flags
, pText
, nText
, fts5PorterCb
1215 ** Register all built-in tokenizers with FTS5.
1217 int sqlite3Fts5TokenizerInit(fts5_api
*pApi
){
1218 struct BuiltinTokenizer
{
1222 { "unicode61", {fts5UnicodeCreate
, fts5UnicodeDelete
, fts5UnicodeTokenize
}},
1223 { "ascii", {fts5AsciiCreate
, fts5AsciiDelete
, fts5AsciiTokenize
}},
1224 { "porter", {fts5PorterCreate
, fts5PorterDelete
, fts5PorterTokenize
}},
1227 int rc
= SQLITE_OK
; /* Return code */
1228 int i
; /* To iterate through builtin functions */
1230 for(i
=0; rc
==SQLITE_OK
&& i
<ArraySize(aBuiltin
); i
++){
1231 rc
= pApi
->xCreateTokenizer(pApi
,