ext/fts2/fts2_porter.c

   1 /*
   2 ** 2006 September 30
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 *************************************************************************
  12 ** Implementation of the full-text-search tokenizer that implements
  13 ** a Porter stemmer.
  14 */
  15
  16 /*
  17 ** The code in this file is only compiled if:
  18 **
  19 **     * The FTS2 module is being built as an extension
  20 **       (in which case SQLITE_CORE is not defined), or
  21 **
  22 **     * The FTS2 module is being built into the core of
  23 **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
  24 */
  25 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
  26
  27
  28 #include <assert.h>
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32
  33 #include "sqlite3.h"
  34 #include "sqlite3ext.h"
  35 SQLITE_EXTENSION_INIT3
  36 #include "fts2_tokenizer.h"
  37
  38 /*
  39 ** Class derived from sqlite3_tokenizer
  40 */
  41 typedef struct porter_tokenizer {
  42   sqlite3_tokenizer base;      /* Base class */
  43 } porter_tokenizer;
  44
  45 /*
  46 ** Class derived from sqlit3_tokenizer_cursor
  47 */
  48 typedef struct porter_tokenizer_cursor {
  49   sqlite3_tokenizer_cursor base;
  50   const char *zInput;          /* input we are tokenizing */
  51   int nInput;                  /* size of the input */
  52   int iOffset;                 /* current position in zInput */
  53   int iToken;                  /* index of next token to be returned */
  54   char *zToken;                /* storage for current token */
  55   int nAllocated;              /* space allocated to zToken buffer */
  56 } porter_tokenizer_cursor;
  57
  58
  59 /* Forward declaration */
  60 static const sqlite3_tokenizer_module porterTokenizerModule;
  61
  62
  63 /*
  64 ** Create a new tokenizer instance.
  65 */
  66 static int porterCreate(
  67   int argc, const char * const *argv,
  68   sqlite3_tokenizer **ppTokenizer
  69 ){
  70   porter_tokenizer *t;
  71   t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
  72   if( t==NULL ) return SQLITE_NOMEM;
  73   memset(t, 0, sizeof(*t));
  74   *ppTokenizer = &t->base;
  75   return SQLITE_OK;
  76 }
  77
  78 /*
  79 ** Destroy a tokenizer
  80 */
  81 static int porterDestroy(sqlite3_tokenizer *pTokenizer){
  82   sqlite3_free(pTokenizer);
  83   return SQLITE_OK;
  84 }
  85
  86 /*
  87 ** Prepare to begin tokenizing a particular string.  The input
  88 ** string to be tokenized is zInput[0..nInput-1].  A cursor
  89 ** used to incrementally tokenize this string is returned in
  90 ** *ppCursor.
  91 */
  92 static int porterOpen(
  93   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  94   const char *zInput, int nInput,        /* String to be tokenized */
  95   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
  96 ){
  97   porter_tokenizer_cursor *c;
  98
  99   c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
 100   if( c==NULL ) return SQLITE_NOMEM;
 101
 102   c->zInput = zInput;
 103   if( zInput==0 ){
 104     c->nInput = 0;
 105   }else if( nInput<0 ){
 106     c->nInput = (int)strlen(zInput);
 107   }else{
 108     c->nInput = nInput;
 109   }
 110   c->iOffset = 0;                 /* start tokenizing at the beginning */
 111   c->iToken = 0;
 112   c->zToken = NULL;               /* no space allocated, yet. */
 113   c->nAllocated = 0;
 114
 115   *ppCursor = &c->base;
 116   return SQLITE_OK;
 117 }
 118
 119 /*
 120 ** Close a tokenization cursor previously opened by a call to
 121 ** porterOpen() above.
 122 */
 123 static int porterClose(sqlite3_tokenizer_cursor *pCursor){
 124   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
 125   sqlite3_free(c->zToken);
 126   sqlite3_free(c);
 127   return SQLITE_OK;
 128 }
 129 /*
 130 ** Vowel or consonant
 131 */
 132 static const char cType[] = {
 133    0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
 134    1, 1, 1, 2, 1
 135 };
 136
 137 /*
 138 ** isConsonant() and isVowel() determine if their first character in
 139 ** the string they point to is a consonant or a vowel, according
 140 ** to Porter ruls.
 141 **
 142 ** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
 143 ** 'Y' is a consonant unless it follows another consonant,
 144 ** in which case it is a vowel.
 145 **
 146 ** In these routine, the letters are in reverse order.  So the 'y' rule
 147 ** is that 'y' is a consonant unless it is followed by another
 148 ** consonent.
 149 */
 150 static int isVowel(const char*);
 151 static int isConsonant(const char *z){
 152   int j;
 153   char x = *z;
 154   if( x==0 ) return 0;
 155   assert( x>='a' && x<='z' );
 156   j = cType[x-'a'];
 157   if( j<2 ) return j;
 158   return z[1]==0 || isVowel(z + 1);
 159 }
 160 static int isVowel(const char *z){
 161   int j;
 162   char x = *z;
 163   if( x==0 ) return 0;
 164   assert( x>='a' && x<='z' );
 165   j = cType[x-'a'];
 166   if( j<2 ) return 1-j;
 167   return isConsonant(z + 1);
 168 }
 169
 170 /*
 171 ** Let any sequence of one or more vowels be represented by V and let
 172 ** C be sequence of one or more consonants.  Then every word can be
 173 ** represented as:
 174 **
 175 **           [C] (VC){m} [V]
 176 **
 177 ** In prose:  A word is an optional consonant followed by zero or
 178 ** vowel-consonant pairs followed by an optional vowel.  "m" is the
 179 ** number of vowel consonant pairs.  This routine computes the value
 180 ** of m for the first i bytes of a word.
 181 **
 182 ** Return true if the m-value for z is 1 or more.  In other words,
 183 ** return true if z contains at least one vowel that is followed
 184 ** by a consonant.
 185 **
 186 ** In this routine z[] is in reverse order.  So we are really looking
 187 ** for an instance of of a consonant followed by a vowel.
 188 */
 189 static int m_gt_0(const char *z){
 190   while( isVowel(z) ){ z++; }
 191   if( *z==0 ) return 0;
 192   while( isConsonant(z) ){ z++; }
 193   return *z!=0;
 194 }
 195
 196 /* Like mgt0 above except we are looking for a value of m which is
 197 ** exactly 1
 198 */
 199 static int m_eq_1(const char *z){
 200   while( isVowel(z) ){ z++; }
 201   if( *z==0 ) return 0;
 202   while( isConsonant(z) ){ z++; }
 203   if( *z==0 ) return 0;
 204   while( isVowel(z) ){ z++; }
 205   if( *z==0 ) return 1;
 206   while( isConsonant(z) ){ z++; }
 207   return *z==0;
 208 }
 209
 210 /* Like mgt0 above except we are looking for a value of m>1 instead
 211 ** or m>0
 212 */
 213 static int m_gt_1(const char *z){
 214   while( isVowel(z) ){ z++; }
 215   if( *z==0 ) return 0;
 216   while( isConsonant(z) ){ z++; }
 217   if( *z==0 ) return 0;
 218   while( isVowel(z) ){ z++; }
 219   if( *z==0 ) return 0;
 220   while( isConsonant(z) ){ z++; }
 221   return *z!=0;
 222 }
 223
 224 /*
 225 ** Return TRUE if there is a vowel anywhere within z[0..n-1]
 226 */
 227 static int hasVowel(const char *z){
 228   while( isConsonant(z) ){ z++; }
 229   return *z!=0;
 230 }
 231
 232 /*
 233 ** Return TRUE if the word ends in a double consonant.
 234 **
 235 ** The text is reversed here. So we are really looking at
 236 ** the first two characters of z[].
 237 */
 238 static int doubleConsonant(const char *z){
 239   return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
 240 }
 241
 242 /*
 243 ** Return TRUE if the word ends with three letters which
 244 ** are consonant-vowel-consonent and where the final consonant
 245 ** is not 'w', 'x', or 'y'.
 246 **
 247 ** The word is reversed here.  So we are really checking the
 248 ** first three letters and the first one cannot be in [wxy].
 249 */
 250 static int star_oh(const char *z){
 251   return
 252     z[0]!=0 && isConsonant(z) &&
 253     z[0]!='w' && z[0]!='x' && z[0]!='y' &&
 254     z[1]!=0 && isVowel(z+1) &&
 255     z[2]!=0 && isConsonant(z+2);
 256 }
 257
 258 /*
 259 ** If the word ends with zFrom and xCond() is true for the stem
 260 ** of the word that preceeds the zFrom ending, then change the
 261 ** ending to zTo.
 262 **
 263 ** The input word *pz and zFrom are both in reverse order.  zTo
 264 ** is in normal order.
 265 **
 266 ** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
 267 ** match.  Not that TRUE is returned even if xCond() fails and
 268 ** no substitution occurs.
 269 */
 270 static int stem(
 271   char **pz,             /* The word being stemmed (Reversed) */
 272   const char *zFrom,     /* If the ending matches this... (Reversed) */
 273   const char *zTo,       /* ... change the ending to this (not reversed) */
 274   int (*xCond)(const char*)   /* Condition that must be true */
 275 ){
 276   char *z = *pz;
 277   while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
 278   if( *zFrom!=0 ) return 0;
 279   if( xCond && !xCond(z) ) return 1;
 280   while( *zTo ){
 281     *(--z) = *(zTo++);
 282   }
 283   *pz = z;
 284   return 1;
 285 }
 286
 287 /*
 288 ** This is the fallback stemmer used when the porter stemmer is
 289 ** inappropriate.  The input word is copied into the output with
 290 ** US-ASCII case folding.  If the input word is too long (more
 291 ** than 20 bytes if it contains no digits or more than 6 bytes if
 292 ** it contains digits) then word is truncated to 20 or 6 bytes
 293 ** by taking 10 or 3 bytes from the beginning and end.
 294 */
 295 static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
 296   int i, mx, j;
 297   int hasDigit = 0;
 298   for(i=0; i<nIn; i++){
 299     int c = zIn[i];
 300     if( c>='A' && c<='Z' ){
 301       zOut[i] = c - 'A' + 'a';
 302     }else{
 303       if( c>='0' && c<='9' ) hasDigit = 1;
 304       zOut[i] = c;
 305     }
 306   }
 307   mx = hasDigit ? 3 : 10;
 308   if( nIn>mx*2 ){
 309     for(j=mx, i=nIn-mx; i<nIn; i++, j++){
 310       zOut[j] = zOut[i];
 311     }
 312     i = j;
 313   }
 314   zOut[i] = 0;
 315   *pnOut = i;
 316 }
 317
 318
 319 /*
 320 ** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
 321 ** zOut is at least big enough to hold nIn bytes.  Write the actual
 322 ** size of the output word (exclusive of the '\0' terminator) into *pnOut.
 323 **
 324 ** Any upper-case characters in the US-ASCII character set ([A-Z])
 325 ** are converted to lower case.  Upper-case UTF characters are
 326 ** unchanged.
 327 **
 328 ** Words that are longer than about 20 bytes are stemmed by retaining
 329 ** a few bytes from the beginning and the end of the word.  If the
 330 ** word contains digits, 3 bytes are taken from the beginning and
 331 ** 3 bytes from the end.  For long words without digits, 10 bytes
 332 ** are taken from each end.  US-ASCII case folding still applies.
 333 **
 334 ** If the input word contains not digits but does characters not
 335 ** in [a-zA-Z] then no stemming is attempted and this routine just
 336 ** copies the input into the input into the output with US-ASCII
 337 ** case folding.
 338 **
 339 ** Stemming never increases the length of the word.  So there is
 340 ** no chance of overflowing the zOut buffer.
 341 */
 342 static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
 343   int i, j, c;
 344   char zReverse[28];
 345   char *z, *z2;
 346   if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
 347     /* The word is too big or too small for the porter stemmer.
 348     ** Fallback to the copy stemmer */
 349     copy_stemmer(zIn, nIn, zOut, pnOut);
 350     return;
 351   }
 352   for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
 353     c = zIn[i];
 354     if( c>='A' && c<='Z' ){
 355       zReverse[j] = c + 'a' - 'A';
 356     }else if( c>='a' && c<='z' ){
 357       zReverse[j] = c;
 358     }else{
 359       /* The use of a character not in [a-zA-Z] means that we fallback
 360       ** to the copy stemmer */
 361       copy_stemmer(zIn, nIn, zOut, pnOut);
 362       return;
 363     }
 364   }
 365   memset(&zReverse[sizeof(zReverse)-5], 0, 5);
 366   z = &zReverse[j+1];
 367
 368
 369   /* Step 1a */
 370   if( z[0]=='s' ){
 371     if(
 372      !stem(&z, "sess", "ss", 0) &&
 373      !stem(&z, "sei", "i", 0)  &&
 374      !stem(&z, "ss", "ss", 0)
 375     ){
 376       z++;
 377     }
 378   }
 379
 380   /* Step 1b */
 381   z2 = z;
 382   if( stem(&z, "dee", "ee", m_gt_0) ){
 383     /* Do nothing.  The work was all in the test */
 384   }else if(
 385      (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
 386       && z!=z2
 387   ){
 388      if( stem(&z, "ta", "ate", 0) ||
 389          stem(&z, "lb", "ble", 0) ||
 390          stem(&z, "zi", "ize", 0) ){
 391        /* Do nothing.  The work was all in the test */
 392      }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
 393        z++;
 394      }else if( m_eq_1(z) && star_oh(z) ){
 395        *(--z) = 'e';
 396      }
 397   }
 398
 399   /* Step 1c */
 400   if( z[0]=='y' && hasVowel(z+1) ){
 401     z[0] = 'i';
 402   }
 403
 404   /* Step 2 */
 405   switch( z[1] ){
 406    case 'a':
 407      stem(&z, "lanoita", "ate", m_gt_0) ||
 408      stem(&z, "lanoit", "tion", m_gt_0);
 409      break;
 410    case 'c':
 411      stem(&z, "icne", "ence", m_gt_0) ||
 412      stem(&z, "icna", "ance", m_gt_0);
 413      break;
 414    case 'e':
 415      stem(&z, "rezi", "ize", m_gt_0);
 416      break;
 417    case 'g':
 418      stem(&z, "igol", "log", m_gt_0);
 419      break;
 420    case 'l':
 421      stem(&z, "ilb", "ble", m_gt_0) ||
 422      stem(&z, "illa", "al", m_gt_0) ||
 423      stem(&z, "iltne", "ent", m_gt_0) ||
 424      stem(&z, "ile", "e", m_gt_0) ||
 425      stem(&z, "ilsuo", "ous", m_gt_0);
 426      break;
 427    case 'o':
 428      stem(&z, "noitazi", "ize", m_gt_0) ||
 429      stem(&z, "noita", "ate", m_gt_0) ||
 430      stem(&z, "rota", "ate", m_gt_0);
 431      break;
 432    case 's':
 433      stem(&z, "msila", "al", m_gt_0) ||
 434      stem(&z, "ssenevi", "ive", m_gt_0) ||
 435      stem(&z, "ssenluf", "ful", m_gt_0) ||
 436      stem(&z, "ssensuo", "ous", m_gt_0);
 437      break;
 438    case 't':
 439      stem(&z, "itila", "al", m_gt_0) ||
 440      stem(&z, "itivi", "ive", m_gt_0) ||
 441      stem(&z, "itilib", "ble", m_gt_0);
 442      break;
 443   }
 444
 445   /* Step 3 */
 446   switch( z[0] ){
 447    case 'e':
 448      stem(&z, "etaci", "ic", m_gt_0) ||
 449      stem(&z, "evita", "", m_gt_0)   ||
 450      stem(&z, "ezila", "al", m_gt_0);
 451      break;
 452    case 'i':
 453      stem(&z, "itici", "ic", m_gt_0);
 454      break;
 455    case 'l':
 456      stem(&z, "laci", "ic", m_gt_0) ||
 457      stem(&z, "luf", "", m_gt_0);
 458      break;
 459    case 's':
 460      stem(&z, "ssen", "", m_gt_0);
 461      break;
 462   }
 463
 464   /* Step 4 */
 465   switch( z[1] ){
 466    case 'a':
 467      if( z[0]=='l' && m_gt_1(z+2) ){
 468        z += 2;
 469      }
 470      break;
 471    case 'c':
 472      if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
 473        z += 4;
 474      }
 475      break;
 476    case 'e':
 477      if( z[0]=='r' && m_gt_1(z+2) ){
 478        z += 2;
 479      }
 480      break;
 481    case 'i':
 482      if( z[0]=='c' && m_gt_1(z+2) ){
 483        z += 2;
 484      }
 485      break;
 486    case 'l':
 487      if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
 488        z += 4;
 489      }
 490      break;
 491    case 'n':
 492      if( z[0]=='t' ){
 493        if( z[2]=='a' ){
 494          if( m_gt_1(z+3) ){
 495            z += 3;
 496          }
 497        }else if( z[2]=='e' ){
 498          stem(&z, "tneme", "", m_gt_1) ||
 499          stem(&z, "tnem", "", m_gt_1) ||
 500          stem(&z, "tne", "", m_gt_1);
 501        }
 502      }
 503      break;
 504    case 'o':
 505      if( z[0]=='u' ){
 506        if( m_gt_1(z+2) ){
 507          z += 2;
 508        }
 509      }else if( z[3]=='s' || z[3]=='t' ){
 510        stem(&z, "noi", "", m_gt_1);
 511      }
 512      break;
 513    case 's':
 514      if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
 515        z += 3;
 516      }
 517      break;
 518    case 't':
 519      stem(&z, "eta", "", m_gt_1) ||
 520      stem(&z, "iti", "", m_gt_1);
 521      break;
 522    case 'u':
 523      if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
 524        z += 3;
 525      }
 526      break;
 527    case 'v':
 528    case 'z':
 529      if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
 530        z += 3;
 531      }
 532      break;
 533   }
 534
 535   /* Step 5a */
 536   if( z[0]=='e' ){
 537     if( m_gt_1(z+1) ){
 538       z++;
 539     }else if( m_eq_1(z+1) && !star_oh(z+1) ){
 540       z++;
 541     }
 542   }
 543
 544   /* Step 5b */
 545   if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
 546     z++;
 547   }
 548
 549   /* z[] is now the stemmed word in reverse order.  Flip it back
 550   ** around into forward order and return.
 551   */
 552   *pnOut = i = strlen(z);
 553   zOut[i] = 0;
 554   while( *z ){
 555     zOut[--i] = *(z++);
 556   }
 557 }
 558
 559 /*
 560 ** Characters that can be part of a token.  We assume any character
 561 ** whose value is greater than 0x80 (any UTF character) can be
 562 ** part of a token.  In other words, delimiters all must have
 563 ** values of 0x7f or lower.
 564 */
 565 static const char porterIdChar[] = {
 566 /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
 567     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
 568     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
 569     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
 570     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
 571     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
 572 };
 573 #define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
 574
 575 /*
 576 ** Extract the next token from a tokenization cursor.  The cursor must
 577 ** have been opened by a prior call to porterOpen().
 578 */
 579 static int porterNext(
 580   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
 581   const char **pzToken,               /* OUT: *pzToken is the token text */
 582   int *pnBytes,                       /* OUT: Number of bytes in token */
 583   int *piStartOffset,                 /* OUT: Starting offset of token */
 584   int *piEndOffset,                   /* OUT: Ending offset of token */
 585   int *piPosition                     /* OUT: Position integer of token */
 586 ){
 587   porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
 588   const char *z = c->zInput;
 589
 590   while( c->iOffset<c->nInput ){
 591     int iStartOffset, ch;
 592
 593     /* Scan past delimiter characters */
 594     while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
 595       c->iOffset++;
 596     }
 597
 598     /* Count non-delimiter characters. */
 599     iStartOffset = c->iOffset;
 600     while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
 601       c->iOffset++;
 602     }
 603
 604     if( c->iOffset>iStartOffset ){
 605       int n = c->iOffset-iStartOffset;
 606       if( n>c->nAllocated ){
 607         c->nAllocated = n+20;
 608         c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
 609         if( c->zToken==NULL ) return SQLITE_NOMEM;
 610       }
 611       porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
 612       *pzToken = c->zToken;
 613       *piStartOffset = iStartOffset;
 614       *piEndOffset = c->iOffset;
 615       *piPosition = c->iToken++;
 616       return SQLITE_OK;
 617     }
 618   }
 619   return SQLITE_DONE;
 620 }
 621
 622 /*
 623 ** The set of routines that implement the porter-stemmer tokenizer
 624 */
 625 static const sqlite3_tokenizer_module porterTokenizerModule = {
 626   0,
 627   porterCreate,
 628   porterDestroy,
 629   porterOpen,
 630   porterClose,
 631   porterNext,
 632 };
 633
 634 /*
 635 ** Allocate a new porter tokenizer.  Return a pointer to the new
 636 ** tokenizer in *ppModule
 637 */
 638 void sqlite3Fts2PorterTokenizerModule(
 639   sqlite3_tokenizer_module const**ppModule
 640 ){
 641   *ppModule = &porterTokenizerModule;
 642 }
 643
 644 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */