ext/fts3/fts3_unicode.c

   1 /*
   2 ** 2012 May 24
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 **
  13 ** Implementation of the "unicode" full-text-search tokenizer.
  14 */
  15
  16 #ifndef SQLITE_DISABLE_FTS3_UNICODE
  17
  18 #include "fts3Int.h"
  19 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
  20
  21 #include <assert.h>
  22 #include <stdlib.h>
  23 #include <stdio.h>
  24 #include <string.h>
  25
  26 #include "fts3_tokenizer.h"
  27
  28 /*
  29 ** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
  30 ** from the sqlite3 source file utf.c. If this file is compiled as part
  31 ** of the amalgamation, they are not required.
  32 */
  33 #ifndef SQLITE_AMALGAMATION
  34
  35 static const unsigned char sqlite3Utf8Trans1[] = {
  36   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  37   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  38   0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  39   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  40   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  41   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  42   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  43   0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
  44 };
  45
  46 #define READ_UTF8(zIn, zTerm, c)                           \
  47   c = *(zIn++);                                            \
  48   if( c>=0xc0 ){                                           \
  49     c = sqlite3Utf8Trans1[c-0xc0];                         \
  50     while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
  51       c = (c<<6) + (0x3f & *(zIn++));                      \
  52     }                                                      \
  53     if( c<0x80                                             \
  54         || (c&0xFFFFF800)==0xD800                          \
  55         || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  56   }
  57
  58 #define WRITE_UTF8(zOut, c) {                          \
  59   if( c<0x00080 ){                                     \
  60     *zOut++ = (u8)(c&0xFF);                            \
  61   }                                                    \
  62   else if( c<0x00800 ){                                \
  63     *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
  64     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  65   }                                                    \
  66   else if( c<0x10000 ){                                \
  67     *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
  68     *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
  69     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  70   }else{                                               \
  71     *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
  72     *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
  73     *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
  74     *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
  75   }                                                    \
  76 }
  77
  78 #endif /* ifndef SQLITE_AMALGAMATION */
  79
  80 typedef struct unicode_tokenizer unicode_tokenizer;
  81 typedef struct unicode_cursor unicode_cursor;
  82
  83 struct unicode_tokenizer {
  84   sqlite3_tokenizer base;
  85   int bRemoveDiacritic;
  86   int nException;
  87   int *aiException;
  88 };
  89
  90 struct unicode_cursor {
  91   sqlite3_tokenizer_cursor base;
  92   const unsigned char *aInput;    /* Input text being tokenized */
  93   int nInput;                     /* Size of aInput[] in bytes */
  94   int iOff;                       /* Current offset within aInput[] */
  95   int iToken;                     /* Index of next token to be returned */
  96   char *zToken;                   /* storage for current token */
  97   int nAlloc;                     /* space allocated at zToken */
  98 };
  99
 100
 101 /*
 102 ** Destroy a tokenizer allocated by unicodeCreate().
 103 */
 104 static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
 105   if( pTokenizer ){
 106     unicode_tokenizer *p = (unicode_tokenizer *)pTokenizer;
 107     sqlite3_free(p->aiException);
 108     sqlite3_free(p);
 109   }
 110   return SQLITE_OK;
 111 }
 112
 113 /*
 114 ** As part of a tokenchars= or separators= option, the CREATE VIRTUAL TABLE
 115 ** statement has specified that the tokenizer for this table shall consider
 116 ** all characters in string zIn/nIn to be separators (if bAlnum==0) or
 117 ** token characters (if bAlnum==1).
 118 **
 119 ** For each codepoint in the zIn/nIn string, this function checks if the
 120 ** sqlite3FtsUnicodeIsalnum() function already returns the desired result.
 121 ** If so, no action is taken. Otherwise, the codepoint is added to the
 122 ** unicode_tokenizer.aiException[] array. For the purposes of tokenization,
 123 ** the return value of sqlite3FtsUnicodeIsalnum() is inverted for all
 124 ** codepoints in the aiException[] array.
 125 **
 126 ** If a standalone diacritic mark (one that sqlite3FtsUnicodeIsdiacritic()
 127 ** identifies as a diacritic) occurs in the zIn/nIn string it is ignored.
 128 ** It is not possible to change the behavior of the tokenizer with respect
 129 ** to these codepoints.
 130 */
 131 static int unicodeAddExceptions(
 132   unicode_tokenizer *p,           /* Tokenizer to add exceptions to */
 133   int bAlnum,                     /* Replace Isalnum() return value with this */
 134   const char *zIn,                /* Array of characters to make exceptions */
 135   int nIn                         /* Length of z in bytes */
 136 ){
 137   const unsigned char *z = (const unsigned char *)zIn;
 138   const unsigned char *zTerm = &z[nIn];
 139   unsigned int iCode;
 140   int nEntry = 0;
 141
 142   assert( bAlnum==0 || bAlnum==1 );
 143
 144   while( z<zTerm ){
 145     READ_UTF8(z, zTerm, iCode);
 146     assert( (sqlite3FtsUnicodeIsalnum((int)iCode) & 0xFFFFFFFE)==0 );
 147     if( sqlite3FtsUnicodeIsalnum((int)iCode)!=bAlnum
 148      && sqlite3FtsUnicodeIsdiacritic((int)iCode)==0
 149     ){
 150       nEntry++;
 151     }
 152   }
 153
 154   if( nEntry ){
 155     int *aNew;                    /* New aiException[] array */
 156     int nNew;                     /* Number of valid entries in array aNew[] */
 157
 158     aNew = sqlite3_realloc(p->aiException, (p->nException+nEntry)*sizeof(int));
 159     if( aNew==0 ) return SQLITE_NOMEM;
 160     nNew = p->nException;
 161
 162     z = (const unsigned char *)zIn;
 163     while( z<zTerm ){
 164       READ_UTF8(z, zTerm, iCode);
 165       if( sqlite3FtsUnicodeIsalnum((int)iCode)!=bAlnum
 166        && sqlite3FtsUnicodeIsdiacritic((int)iCode)==0
 167       ){
 168         int i, j;
 169         for(i=0; i<nNew && aNew[i]<(int)iCode; i++);
 170         for(j=nNew; j>i; j--) aNew[j] = aNew[j-1];
 171         aNew[i] = (int)iCode;
 172         nNew++;
 173       }
 174     }
 175     p->aiException = aNew;
 176     p->nException = nNew;
 177   }
 178
 179   return SQLITE_OK;
 180 }
 181
 182 /*
 183 ** Return true if the p->aiException[] array contains the value iCode.
 184 */
 185 static int unicodeIsException(unicode_tokenizer *p, int iCode){
 186   if( p->nException>0 ){
 187     int *a = p->aiException;
 188     int iLo = 0;
 189     int iHi = p->nException-1;
 190
 191     while( iHi>=iLo ){
 192       int iTest = (iHi + iLo) / 2;
 193       if( iCode==a[iTest] ){
 194         return 1;
 195       }else if( iCode>a[iTest] ){
 196         iLo = iTest+1;
 197       }else{
 198         iHi = iTest-1;
 199       }
 200     }
 201   }
 202
 203   return 0;
 204 }
 205
 206 /*
 207 ** Return true if, for the purposes of tokenization, codepoint iCode is
 208 ** considered a token character (not a separator).
 209 */
 210 static int unicodeIsAlnum(unicode_tokenizer *p, int iCode){
 211   assert( (sqlite3FtsUnicodeIsalnum(iCode) & 0xFFFFFFFE)==0 );
 212   return sqlite3FtsUnicodeIsalnum(iCode) ^ unicodeIsException(p, iCode);
 213 }
 214
 215 /*
 216 ** Create a new tokenizer instance.
 217 */
 218 static int unicodeCreate(
 219   int nArg,                       /* Size of array argv[] */
 220   const char * const *azArg,      /* Tokenizer creation arguments */
 221   sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
 222 ){
 223   unicode_tokenizer *pNew;        /* New tokenizer object */
 224   int i;
 225   int rc = SQLITE_OK;
 226
 227   pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
 228   if( pNew==NULL ) return SQLITE_NOMEM;
 229   memset(pNew, 0, sizeof(unicode_tokenizer));
 230   pNew->bRemoveDiacritic = 1;
 231
 232   for(i=0; rc==SQLITE_OK && i<nArg; i++){
 233     const char *z = azArg[i];
 234     int n = (int)strlen(z);
 235
 236     if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
 237       pNew->bRemoveDiacritic = 1;
 238     }
 239     else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
 240       pNew->bRemoveDiacritic = 0;
 241     }
 242     else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
 243       rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
 244     }
 245     else if( n>=11 && memcmp("separators=", z, 11)==0 ){
 246       rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
 247     }
 248     else{
 249       /* Unrecognized argument */
 250       rc  = SQLITE_ERROR;
 251     }
 252   }
 253
 254   if( rc!=SQLITE_OK ){
 255     unicodeDestroy((sqlite3_tokenizer *)pNew);
 256     pNew = 0;
 257   }
 258   *pp = (sqlite3_tokenizer *)pNew;
 259   return rc;
 260 }
 261
 262 /*
 263 ** Prepare to begin tokenizing a particular string.  The input
 264 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
 265 ** used to incrementally tokenize this string is returned in
 266 ** *ppCursor.
 267 */
 268 static int unicodeOpen(
 269   sqlite3_tokenizer *p,           /* The tokenizer */
 270   const char *aInput,             /* Input string */
 271   int nInput,                     /* Size of string aInput in bytes */
 272   sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
 273 ){
 274   unicode_cursor *pCsr;
 275
 276   pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
 277   if( pCsr==0 ){
 278     return SQLITE_NOMEM;
 279   }
 280   memset(pCsr, 0, sizeof(unicode_cursor));
 281
 282   pCsr->aInput = (const unsigned char *)aInput;
 283   if( aInput==0 ){
 284     pCsr->nInput = 0;
 285   }else if( nInput<0 ){
 286     pCsr->nInput = (int)strlen(aInput);
 287   }else{
 288     pCsr->nInput = nInput;
 289   }
 290
 291   *pp = &pCsr->base;
 292   UNUSED_PARAMETER(p);
 293   return SQLITE_OK;
 294 }
 295
 296 /*
 297 ** Close a tokenization cursor previously opened by a call to
 298 ** simpleOpen() above.
 299 */
 300 static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
 301   unicode_cursor *pCsr = (unicode_cursor *) pCursor;
 302   sqlite3_free(pCsr->zToken);
 303   sqlite3_free(pCsr);
 304   return SQLITE_OK;
 305 }
 306
 307 /*
 308 ** Extract the next token from a tokenization cursor.  The cursor must
 309 ** have been opened by a prior call to simpleOpen().
 310 */
 311 static int unicodeNext(
 312   sqlite3_tokenizer_cursor *pC,   /* Cursor returned by simpleOpen */
 313   const char **paToken,           /* OUT: Token text */
 314   int *pnToken,                   /* OUT: Number of bytes at *paToken */
 315   int *piStart,                   /* OUT: Starting offset of token */
 316   int *piEnd,                     /* OUT: Ending offset of token */
 317   int *piPos                      /* OUT: Position integer of token */
 318 ){
 319   unicode_cursor *pCsr = (unicode_cursor *)pC;
 320   unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
 321   unsigned int iCode = 0;
 322   char *zOut;
 323   const unsigned char *z = &pCsr->aInput[pCsr->iOff];
 324   const unsigned char *zStart = z;
 325   const unsigned char *zEnd;
 326   const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
 327
 328   /* Scan past any delimiter characters before the start of the next token.
 329   ** Return SQLITE_DONE early if this takes us all the way to the end of
 330   ** the input.  */
 331   while( z<zTerm ){
 332     READ_UTF8(z, zTerm, iCode);
 333     if( unicodeIsAlnum(p, (int)iCode) ) break;
 334     zStart = z;
 335   }
 336   if( zStart>=zTerm ) return SQLITE_DONE;
 337
 338   zOut = pCsr->zToken;
 339   do {
 340     int iOut;
 341
 342     /* Grow the output buffer if required. */
 343     if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
 344       char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
 345       if( !zNew ) return SQLITE_NOMEM;
 346       zOut = &zNew[zOut - pCsr->zToken];
 347       pCsr->zToken = zNew;
 348       pCsr->nAlloc += 64;
 349     }
 350
 351     /* Write the folded case of the last character read to the output */
 352     zEnd = z;
 353     iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
 354     if( iOut ){
 355       WRITE_UTF8(zOut, iOut);
 356     }
 357
 358     /* If the cursor is not at EOF, read the next character */
 359     if( z>=zTerm ) break;
 360     READ_UTF8(z, zTerm, iCode);
 361   }while( unicodeIsAlnum(p, (int)iCode)
 362        || sqlite3FtsUnicodeIsdiacritic((int)iCode)
 363   );
 364
 365   /* Set the output variables and return. */
 366   pCsr->iOff = (int)(z - pCsr->aInput);
 367   *paToken = pCsr->zToken;
 368   *pnToken = (int)(zOut - pCsr->zToken);
 369   *piStart = (int)(zStart - pCsr->aInput);
 370   *piEnd = (int)(zEnd - pCsr->aInput);
 371   *piPos = pCsr->iToken++;
 372   return SQLITE_OK;
 373 }
 374
 375 /*
 376 ** Set *ppModule to a pointer to the sqlite3_tokenizer_module
 377 ** structure for the unicode tokenizer.
 378 */
 379 void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
 380   static const sqlite3_tokenizer_module module = {
 381     0,
 382     unicodeCreate,
 383     unicodeDestroy,
 384     unicodeOpen,
 385     unicodeClose,
 386     unicodeNext,
 387     0,
 388   };
 389   *ppModule = &module;
 390 }
 391
 392 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
 393 #endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */