Another minor optimization to OP_Transaction.
[sqlite.git] / ext / misc / spellfix.c
blobfaceea189cebdca777104c9e41f6deb90544b5ad
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches. See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
20 #ifndef SQLITE_AMALGAMATION
21 # if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
22 # define NDEBUG 1
23 # endif
24 # if defined(NDEBUG) && defined(SQLITE_DEBUG)
25 # undef NDEBUG
26 # endif
27 # include <string.h>
28 # include <stdio.h>
29 # include <stdlib.h>
30 # include <assert.h>
31 # define ALWAYS(X) 1
32 # define NEVER(X) 0
33 typedef unsigned char u8;
34 typedef unsigned short u16;
35 #endif
36 #include <ctype.h>
38 #ifndef SQLITE_OMIT_VIRTUALTABLE
41 ** Character classes for ASCII characters:
43 ** 0 '' Silent letters: H W
44 ** 1 'A' Any vowel: A E I O U (Y)
45 ** 2 'B' A bilabeal stop or fricative: B F P V W
46 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z
47 ** 4 'D' Alveolar stops: D T
48 ** 5 'H' Letter H at the beginning of a word
49 ** 6 'L' Glide: L
50 ** 7 'R' Semivowel: R
51 ** 8 'M' Nasals: M N
52 ** 9 'Y' Letter Y at the beginning of a word.
53 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9
54 ** 11 ' ' White space
55 ** 12 '?' Other.
57 #define CCLASS_SILENT 0
58 #define CCLASS_VOWEL 1
59 #define CCLASS_B 2
60 #define CCLASS_C 3
61 #define CCLASS_D 4
62 #define CCLASS_H 5
63 #define CCLASS_L 6
64 #define CCLASS_R 7
65 #define CCLASS_M 8
66 #define CCLASS_Y 9
67 #define CCLASS_DIGIT 10
68 #define CCLASS_SPACE 11
69 #define CCLASS_OTHER 12
72 ** The following table gives the character class for non-initial ASCII
73 ** characters.
75 static const unsigned char midClass[] = {
76 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
77 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
78 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
79 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
80 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
81 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
82 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
83 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
84 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
85 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
86 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
87 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
88 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
89 /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
90 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
91 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
92 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
93 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
94 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
95 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
96 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
97 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
98 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
99 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
100 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
101 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
102 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
103 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
104 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
105 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_VOWEL,
106 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
107 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
108 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
109 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
110 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
111 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
112 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
113 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
114 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
115 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
116 /* x */ CCLASS_C, /* y */ CCLASS_VOWEL, /* z */ CCLASS_C,
117 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
118 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
121 ** This tables gives the character class for ASCII characters that form the
122 ** initial character of a word. The only difference from midClass is with
123 ** the letters H, W, and Y.
125 static const unsigned char initClass[] = {
126 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
127 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
128 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
129 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
130 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
131 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
132 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
133 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
134 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
135 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
136 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
137 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
138 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
139 /* ' */ CCLASS_OTHER, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
140 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
141 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
142 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
143 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
144 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
145 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
146 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
147 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
148 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
149 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
150 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
151 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
152 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
153 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
154 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
155 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_Y,
156 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
157 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
158 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
159 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
160 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
161 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
162 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
163 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
164 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
165 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
166 /* x */ CCLASS_C, /* y */ CCLASS_Y, /* z */ CCLASS_C,
167 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
168 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
172 ** Mapping from the character class number (0-13) to a symbol for each
173 ** character class. Note that initClass[] can be used to map the class
174 ** symbol back into the class number.
176 static const unsigned char className[] = ".ABCDHLRMY9 ?";
179 ** Generate a "phonetic hash" from a string of ASCII characters
180 ** in zIn[0..nIn-1].
182 ** * Map characters by character class as defined above.
183 ** * Omit double-letters
184 ** * Omit vowels beside R and L
185 ** * Omit T when followed by CH
186 ** * Omit W when followed by R
187 ** * Omit D when followed by J or G
188 ** * Omit K in KN or G in GN at the beginning of a word
190 ** Space to hold the result is obtained from sqlite3_malloc()
192 ** Return NULL if memory allocation fails.
194 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
195 unsigned char *zOut = sqlite3_malloc64( nIn + 1 );
196 int i;
197 int nOut = 0;
198 char cPrev = 0x77;
199 char cPrevX = 0x77;
200 const unsigned char *aClass = initClass;
202 if( zOut==0 ) return 0;
203 if( nIn>2 ){
204 switch( zIn[0] ){
205 case 'g':
206 case 'k': {
207 if( zIn[1]=='n' ){ zIn++; nIn--; }
208 break;
212 for(i=0; i<nIn; i++){
213 unsigned char c = zIn[i];
214 if( i+1<nIn ){
215 if( c=='w' && zIn[i+1]=='r' ) continue;
216 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
217 if( i+2<nIn ){
218 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
221 c = aClass[c&0x7f];
222 if( c==CCLASS_SPACE ) continue;
223 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
224 aClass = midClass;
225 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
226 continue; /* No vowels beside L or R */
228 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
229 nOut--; /* No vowels beside L or R */
231 cPrev = c;
232 if( c==CCLASS_SILENT ) continue;
233 cPrevX = c;
234 c = className[c];
235 assert( nOut>=0 );
236 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
238 zOut[nOut] = 0;
239 return zOut;
243 ** This is an SQL function wrapper around phoneticHash(). See
244 ** the description of phoneticHash() for additional information.
246 static void phoneticHashSqlFunc(
247 sqlite3_context *context,
248 int argc,
249 sqlite3_value **argv
251 const unsigned char *zIn;
252 unsigned char *zOut;
254 zIn = sqlite3_value_text(argv[0]);
255 if( zIn==0 ) return;
256 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
257 if( zOut==0 ){
258 sqlite3_result_error_nomem(context);
259 }else{
260 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
265 ** Return the character class number for a character given its
266 ** context.
268 static char characterClass(char cPrev, char c){
269 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
273 ** Return the cost of inserting or deleting character c immediately
274 ** following character cPrev. If cPrev==0, that means c is the first
275 ** character of the word.
277 static int insertOrDeleteCost(char cPrev, char c, char cNext){
278 char classC = characterClass(cPrev, c);
279 char classCprev;
281 if( classC==CCLASS_SILENT ){
282 /* Insert or delete "silent" characters such as H or W */
283 return 1;
285 if( cPrev==c ){
286 /* Repeated characters, or miss a repeat */
287 return 10;
289 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
290 return 20; /* Insert a vowel before or after 'r' */
292 classCprev = characterClass(cPrev, cPrev);
293 if( classC==classCprev ){
294 if( classC==CCLASS_VOWEL ){
295 /* Remove or add a new vowel to a vowel cluster */
296 return 15;
297 }else{
298 /* Remove or add a consonant not in the same class */
299 return 50;
303 /* any other character insertion or deletion */
304 return 100;
308 ** Divide the insertion cost by this factor when appending to the
309 ** end of the word.
311 #define FINAL_INS_COST_DIV 4
314 ** Return the cost of substituting cTo in place of cFrom assuming
315 ** the previous character is cPrev. If cPrev==0 then cTo is the first
316 ** character of the word.
318 static int substituteCost(char cPrev, char cFrom, char cTo){
319 char classFrom, classTo;
320 if( cFrom==cTo ){
321 /* Exact match */
322 return 0;
324 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
325 /* differ only in case */
326 return 0;
328 classFrom = characterClass(cPrev, cFrom);
329 classTo = characterClass(cPrev, cTo);
330 if( classFrom==classTo ){
331 /* Same character class */
332 return 40;
334 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
335 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
336 /* Convert from one consonant to another, but in a different class */
337 return 75;
339 /* Any other subsitution */
340 return 100;
344 ** Given two strings zA and zB which are pure ASCII, return the cost
345 ** of transforming zA into zB. If zA ends with '*' assume that it is
346 ** a prefix of zB and give only minimal penalty for extra characters
347 ** on the end of zB.
349 ** Smaller numbers mean a closer match.
351 ** Negative values indicate an error:
352 ** -1 One of the inputs is NULL
353 ** -2 Non-ASCII characters on input
354 ** -3 Unable to allocate memory
356 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
357 ** of zB that matched the pattern in zA. If zA does not end with a '*',
358 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
359 ** If zA does end in a '*', then it is the number of bytes in the prefix
360 ** of zB that was deemed to match zA.
362 static int editdist1(const char *zA, const char *zB, int *pnMatch){
363 int nA, nB; /* Number of characters in zA[] and zB[] */
364 int xA, xB; /* Loop counters for zA[] and zB[] */
365 char cA = 0, cB; /* Current character of zA and zB */
366 char cAprev, cBprev; /* Previous character of zA and zB */
367 char cAnext, cBnext; /* Next character in zA and zB */
368 int d; /* North-west cost value */
369 int dc = 0; /* North-west character value */
370 int res; /* Final result */
371 int *m; /* The cost matrix */
372 char *cx; /* Corresponding character values */
373 int *toFree = 0; /* Malloced space */
374 int nMatch = 0;
375 int mStack[60+15]; /* Stack space to use if not too much is needed */
377 /* Early out if either input is NULL */
378 if( zA==0 || zB==0 ) return -1;
380 /* Skip any common prefix */
381 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
382 if( pnMatch ) *pnMatch = nMatch;
383 if( zA[0]==0 && zB[0]==0 ) return 0;
385 #if 0
386 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
387 #endif
389 /* Verify input strings and measure their lengths */
390 for(nA=0; zA[nA]; nA++){
391 if( zA[nA]&0x80 ) return -2;
393 for(nB=0; zB[nB]; nB++){
394 if( zB[nB]&0x80 ) return -2;
397 /* Special processing if either string is empty */
398 if( nA==0 ){
399 cBprev = (char)dc;
400 for(xB=res=0; (cB = zB[xB])!=0; xB++){
401 res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
402 cBprev = cB;
404 return res;
406 if( nB==0 ){
407 cAprev = (char)dc;
408 for(xA=res=0; (cA = zA[xA])!=0; xA++){
409 res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
410 cAprev = cA;
412 return res;
415 /* A is a prefix of B */
416 if( zA[0]=='*' && zA[1]==0 ) return 0;
418 /* Allocate and initialize the Wagner matrix */
419 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
420 m = mStack;
421 }else{
422 m = toFree = sqlite3_malloc64( (nB+1)*5*sizeof(m[0])/4 );
423 if( m==0 ) return -3;
425 cx = (char*)&m[nB+1];
427 /* Compute the Wagner edit distance */
428 m[0] = 0;
429 cx[0] = (char)dc;
430 cBprev = (char)dc;
431 for(xB=1; xB<=nB; xB++){
432 cBnext = zB[xB];
433 cB = zB[xB-1];
434 cx[xB] = cB;
435 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
436 cBprev = cB;
438 cAprev = (char)dc;
439 for(xA=1; xA<=nA; xA++){
440 int lastA = (xA==nA);
441 cA = zA[xA-1];
442 cAnext = zA[xA];
443 if( cA=='*' && lastA ) break;
444 d = m[0];
445 dc = cx[0];
446 m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
447 cBprev = 0;
448 for(xB=1; xB<=nB; xB++){
449 int totalCost, insCost, delCost, subCost, ncx;
450 cB = zB[xB-1];
451 cBnext = zB[xB];
453 /* Cost to insert cB */
454 insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
455 if( lastA ) insCost /= FINAL_INS_COST_DIV;
457 /* Cost to delete cA */
458 delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
460 /* Cost to substitute cA->cB */
461 subCost = substituteCost(cx[xB-1], cA, cB);
463 /* Best cost */
464 totalCost = insCost + m[xB-1];
465 ncx = cB;
466 if( (delCost + m[xB])<totalCost ){
467 totalCost = delCost + m[xB];
468 ncx = cA;
470 if( (subCost + d)<totalCost ){
471 totalCost = subCost + d;
474 #if 0
475 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
476 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
477 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
478 insCost, delCost, subCost, totalCost, ncx?ncx:' ');
479 #endif
481 /* Update the matrix */
482 d = m[xB];
483 dc = cx[xB];
484 m[xB] = totalCost;
485 cx[xB] = (char)ncx;
486 cBprev = cB;
488 cAprev = cA;
491 /* Free the wagner matrix and return the result */
492 if( cA=='*' ){
493 res = m[1];
494 for(xB=1; xB<=nB; xB++){
495 if( m[xB]<res ){
496 res = m[xB];
497 if( pnMatch ) *pnMatch = xB+nMatch;
500 }else{
501 res = m[nB];
502 /* In the current implementation, pnMatch is always NULL if zA does
503 ** not end in "*" */
504 assert( pnMatch==0 );
506 sqlite3_free(toFree);
507 return res;
511 ** Function: editdist(A,B)
513 ** Return the cost of transforming string A into string B. Both strings
514 ** must be pure ASCII text. If A ends with '*' then it is assumed to be
515 ** a prefix of B and extra characters on the end of B have minimal additional
516 ** cost.
518 static void editdistSqlFunc(
519 sqlite3_context *context,
520 int argc,
521 sqlite3_value **argv
523 int res = editdist1(
524 (const char*)sqlite3_value_text(argv[0]),
525 (const char*)sqlite3_value_text(argv[1]),
527 if( res<0 ){
528 if( res==(-3) ){
529 sqlite3_result_error_nomem(context);
530 }else if( res==(-2) ){
531 sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
532 }else{
533 sqlite3_result_error(context, "NULL input to editdist()", -1);
535 }else{
536 sqlite3_result_int(context, res);
540 /* End of the fixed-cost edit distance implementation
541 ******************************************************************************
542 *****************************************************************************
543 ** Begin: Configurable cost unicode edit distance routines
545 /* Forward declaration of structures */
546 typedef struct EditDist3Cost EditDist3Cost;
547 typedef struct EditDist3Config EditDist3Config;
548 typedef struct EditDist3Point EditDist3Point;
549 typedef struct EditDist3From EditDist3From;
550 typedef struct EditDist3FromString EditDist3FromString;
551 typedef struct EditDist3To EditDist3To;
552 typedef struct EditDist3ToString EditDist3ToString;
553 typedef struct EditDist3Lang EditDist3Lang;
557 ** An entry in the edit cost table
559 struct EditDist3Cost {
560 EditDist3Cost *pNext; /* Next cost element */
561 u8 nFrom; /* Number of bytes in aFrom */
562 u8 nTo; /* Number of bytes in aTo */
563 u16 iCost; /* Cost of this transformation */
564 char a[4] ; /* FROM string followed by TO string */
565 /* Additional TO and FROM string bytes appended as necessary */
569 ** Edit costs for a particular language ID
571 struct EditDist3Lang {
572 int iLang; /* Language ID */
573 int iInsCost; /* Default insertion cost */
574 int iDelCost; /* Default deletion cost */
575 int iSubCost; /* Default substitution cost */
576 EditDist3Cost *pCost; /* Costs */
581 ** The default EditDist3Lang object, with default costs.
583 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
586 ** Complete configuration
588 struct EditDist3Config {
589 int nLang; /* Number of language IDs. Size of a[] */
590 EditDist3Lang *a; /* One for each distinct language ID */
594 ** Extra information about each character in the FROM string.
596 struct EditDist3From {
597 int nSubst; /* Number of substitution cost entries */
598 int nDel; /* Number of deletion cost entries */
599 int nByte; /* Number of bytes in this character */
600 EditDist3Cost **apSubst; /* Array of substitution costs for this element */
601 EditDist3Cost **apDel; /* Array of deletion cost entries */
605 ** A precompiled FROM string.
607 ** In the common case we expect the FROM string to be reused multiple times.
608 ** In other words, the common case will be to measure the edit distance
609 ** from a single origin string to multiple target strings.
611 struct EditDist3FromString {
612 char *z; /* The complete text of the FROM string */
613 int n; /* Number of characters in the FROM string */
614 int isPrefix; /* True if ends with '*' character */
615 EditDist3From *a; /* Extra info about each char of the FROM string */
619 ** Extra information about each character in the TO string.
621 struct EditDist3To {
622 int nIns; /* Number of insertion cost entries */
623 int nByte; /* Number of bytes in this character */
624 EditDist3Cost **apIns; /* Array of deletion cost entries */
628 ** A precompiled FROM string
630 struct EditDist3ToString {
631 char *z; /* The complete text of the TO string */
632 int n; /* Number of characters in the TO string */
633 EditDist3To *a; /* Extra info about each char of the TO string */
637 ** Clear or delete an instance of the object that records all edit-distance
638 ** weights.
640 static void editDist3ConfigClear(EditDist3Config *p){
641 int i;
642 if( p==0 ) return;
643 for(i=0; i<p->nLang; i++){
644 EditDist3Cost *pCost, *pNext;
645 pCost = p->a[i].pCost;
646 while( pCost ){
647 pNext = pCost->pNext;
648 sqlite3_free(pCost);
649 pCost = pNext;
652 sqlite3_free(p->a);
653 memset(p, 0, sizeof(*p));
655 static void editDist3ConfigDelete(void *pIn){
656 EditDist3Config *p = (EditDist3Config*)pIn;
657 editDist3ConfigClear(p);
658 sqlite3_free(p);
661 /* Compare the FROM values of two EditDist3Cost objects, for sorting.
662 ** Return negative, zero, or positive if the A is less than, equal to,
663 ** or greater than B.
665 static int editDist3CostCompare(EditDist3Cost *pA, EditDist3Cost *pB){
666 int n = pA->nFrom;
667 int rc;
668 if( n>pB->nFrom ) n = pB->nFrom;
669 rc = strncmp(pA->a, pB->a, n);
670 if( rc==0 ) rc = pA->nFrom - pB->nFrom;
671 return rc;
675 ** Merge together two sorted lists of EditDist3Cost objects, in order
676 ** of increasing FROM.
678 static EditDist3Cost *editDist3CostMerge(
679 EditDist3Cost *pA,
680 EditDist3Cost *pB
682 EditDist3Cost *pHead = 0;
683 EditDist3Cost **ppTail = &pHead;
684 EditDist3Cost *p;
685 while( pA && pB ){
686 if( editDist3CostCompare(pA,pB)<=0 ){
687 p = pA;
688 pA = pA->pNext;
689 }else{
690 p = pB;
691 pB = pB->pNext;
693 *ppTail = p;
694 ppTail = &p->pNext;
696 if( pA ){
697 *ppTail = pA;
698 }else{
699 *ppTail = pB;
701 return pHead;
705 ** Sort a list of EditDist3Cost objects into order of increasing FROM
707 static EditDist3Cost *editDist3CostSort(EditDist3Cost *pList){
708 EditDist3Cost *ap[60], *p;
709 int i;
710 int mx = 0;
711 ap[0] = 0;
712 ap[1] = 0;
713 while( pList ){
714 p = pList;
715 pList = p->pNext;
716 p->pNext = 0;
717 for(i=0; ap[i]; i++){
718 p = editDist3CostMerge(ap[i],p);
719 ap[i] = 0;
721 ap[i] = p;
722 if( i>mx ){
723 mx = i;
724 ap[i+1] = 0;
727 p = 0;
728 for(i=0; i<=mx; i++){
729 if( ap[i] ) p = editDist3CostMerge(p,ap[i]);
731 return p;
735 ** Load all edit-distance weights from a table.
737 static int editDist3ConfigLoad(
738 EditDist3Config *p, /* The edit distance configuration to load */
739 sqlite3 *db, /* Load from this database */
740 const char *zTable /* Name of the table from which to load */
742 sqlite3_stmt *pStmt;
743 int rc, rc2;
744 char *zSql;
745 int iLangPrev = -9999;
746 EditDist3Lang *pLang = 0;
748 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
749 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
750 if( zSql==0 ) return SQLITE_NOMEM;
751 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
752 sqlite3_free(zSql);
753 if( rc ) return rc;
754 editDist3ConfigClear(p);
755 while( sqlite3_step(pStmt)==SQLITE_ROW ){
756 int iLang = sqlite3_column_int(pStmt, 0);
757 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
758 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
759 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
760 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
761 int iCost = sqlite3_column_int(pStmt, 3);
763 assert( zFrom!=0 || nFrom==0 );
764 assert( zTo!=0 || nTo==0 );
765 if( nFrom>100 || nTo>100 ) continue;
766 if( iCost<0 ) continue;
767 if( iCost>=10000 ) continue; /* Costs above 10K are considered infinite */
768 if( pLang==0 || iLang!=iLangPrev ){
769 EditDist3Lang *pNew;
770 pNew = sqlite3_realloc64(p->a, (p->nLang+1)*sizeof(p->a[0]));
771 if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
772 p->a = pNew;
773 pLang = &p->a[p->nLang];
774 p->nLang++;
775 pLang->iLang = iLang;
776 pLang->iInsCost = 100;
777 pLang->iDelCost = 100;
778 pLang->iSubCost = 150;
779 pLang->pCost = 0;
780 iLangPrev = iLang;
782 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
783 pLang->iDelCost = iCost;
784 }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
785 pLang->iInsCost = iCost;
786 }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
787 pLang->iSubCost = iCost;
788 }else{
789 EditDist3Cost *pCost;
790 int nExtra = nFrom + nTo - 4;
791 if( nExtra<0 ) nExtra = 0;
792 pCost = sqlite3_malloc64( sizeof(*pCost) + nExtra );
793 if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
794 pCost->nFrom = (u8)nFrom;
795 pCost->nTo = (u8)nTo;
796 pCost->iCost = (u16)iCost;
797 memcpy(pCost->a, zFrom, nFrom);
798 memcpy(pCost->a + nFrom, zTo, nTo);
799 pCost->pNext = pLang->pCost;
800 pLang->pCost = pCost;
803 rc2 = sqlite3_finalize(pStmt);
804 if( rc==SQLITE_OK ) rc = rc2;
805 if( rc==SQLITE_OK ){
806 int iLang;
807 for(iLang=0; iLang<p->nLang; iLang++){
808 p->a[iLang].pCost = editDist3CostSort(p->a[iLang].pCost);
811 return rc;
815 ** Return the length (in bytes) of a utf-8 character. Or return a maximum
816 ** of N.
818 static int utf8Len(unsigned char c, int N){
819 int len = 1;
820 if( c>0x7f ){
821 if( (c&0xe0)==0xc0 ){
822 len = 2;
823 }else if( (c&0xf0)==0xe0 ){
824 len = 3;
825 }else{
826 len = 4;
829 if( len>N ) len = N;
830 return len;
834 ** Return TRUE (non-zero) if the To side of the given cost matches
835 ** the given string.
837 static int matchTo(EditDist3Cost *p, const char *z, int n){
838 assert( n>0 );
839 if( p->a[p->nFrom]!=z[0] ) return 0;
840 if( p->nTo>n ) return 0;
841 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
842 return 1;
846 ** Return TRUE (non-zero) if the From side of the given cost matches
847 ** the given string.
849 static int matchFrom(EditDist3Cost *p, const char *z, int n){
850 assert( p->nFrom<=n );
851 if( p->nFrom ){
852 if( p->a[0]!=z[0] ) return 0;
853 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
855 return 1;
859 ** Return TRUE (non-zero) of the next FROM character and the next TO
860 ** character are the same.
862 static int matchFromTo(
863 EditDist3FromString *pStr, /* Left hand string */
864 int n1, /* Index of comparison character on the left */
865 const char *z2, /* Right-handl comparison character */
866 int n2 /* Bytes remaining in z2[] */
868 int b1 = pStr->a[n1].nByte;
869 if( b1>n2 ) return 0;
870 assert( b1>0 );
871 if( pStr->z[n1]!=z2[0] ) return 0;
872 if( strncmp(pStr->z+n1, z2, b1)!=0 ) return 0;
873 return 1;
877 ** Delete an EditDist3FromString objecct
879 static void editDist3FromStringDelete(EditDist3FromString *p){
880 int i;
881 if( p ){
882 for(i=0; i<p->n; i++){
883 sqlite3_free(p->a[i].apDel);
884 sqlite3_free(p->a[i].apSubst);
886 sqlite3_free(p);
891 ** Create a EditDist3FromString object.
893 static EditDist3FromString *editDist3FromStringNew(
894 const EditDist3Lang *pLang,
895 const char *z,
896 int n
898 EditDist3FromString *pStr;
899 EditDist3Cost *p;
900 int i;
902 if( z==0 ) return 0;
903 if( n<0 ) n = (int)strlen(z);
904 pStr = sqlite3_malloc64( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
905 if( pStr==0 ) return 0;
906 pStr->a = (EditDist3From*)&pStr[1];
907 memset(pStr->a, 0, sizeof(pStr->a[0])*n);
908 pStr->n = n;
909 pStr->z = (char*)&pStr->a[n];
910 memcpy(pStr->z, z, n+1);
911 if( n && z[n-1]=='*' ){
912 pStr->isPrefix = 1;
913 n--;
914 pStr->n--;
915 pStr->z[n] = 0;
916 }else{
917 pStr->isPrefix = 0;
920 for(i=0; i<n; i++){
921 EditDist3From *pFrom = &pStr->a[i];
922 memset(pFrom, 0, sizeof(*pFrom));
923 pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
924 for(p=pLang->pCost; p; p=p->pNext){
925 EditDist3Cost **apNew;
926 if( i+p->nFrom>n ) continue;
927 if( matchFrom(p, z+i, n-i)==0 ) continue;
928 if( p->nTo==0 ){
929 apNew = sqlite3_realloc64(pFrom->apDel,
930 sizeof(*apNew)*(pFrom->nDel+1));
931 if( apNew==0 ) break;
932 pFrom->apDel = apNew;
933 apNew[pFrom->nDel++] = p;
934 }else{
935 apNew = sqlite3_realloc64(pFrom->apSubst,
936 sizeof(*apNew)*(pFrom->nSubst+1));
937 if( apNew==0 ) break;
938 pFrom->apSubst = apNew;
939 apNew[pFrom->nSubst++] = p;
942 if( p ){
943 editDist3FromStringDelete(pStr);
944 pStr = 0;
945 break;
948 return pStr;
952 ** Update entry m[i] such that it is the minimum of its current value
953 ** and m[j]+iCost.
955 static void updateCost(
956 unsigned int *m,
957 int i,
958 int j,
959 int iCost
961 unsigned int b;
962 assert( iCost>=0 );
963 assert( iCost<10000 );
964 b = m[j] + iCost;
965 if( b<m[i] ) m[i] = b;
969 ** How much stack space (int bytes) to use for Wagner matrix in
970 ** editDist3Core(). If more space than this is required, the entire
971 ** matrix is taken from the heap. To reduce the load on the memory
972 ** allocator, make this value as large as practical for the
973 ** architecture in use.
975 #ifndef SQLITE_SPELLFIX_STACKALLOC_SZ
976 # define SQLITE_SPELLFIX_STACKALLOC_SZ (1024)
977 #endif
979 /* Compute the edit distance between two strings.
981 ** If an error occurs, return a negative number which is the error code.
983 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
984 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
985 ** not contain the pattern for a prefix-search, then this is always the number
986 ** of characters in z2. If pFrom does contain a prefix search pattern, then
987 ** it is the number of characters in the prefix of z2 that was deemed to
988 ** match pFrom.
990 static int editDist3Core(
991 EditDist3FromString *pFrom, /* The FROM string */
992 const char *z2, /* The TO string */
993 int n2, /* Length of the TO string */
994 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
995 int *pnMatch /* OUT: Characters in matched prefix */
997 int k, n;
998 int i1, b1;
999 int i2, b2;
1000 EditDist3FromString f = *pFrom;
1001 EditDist3To *a2;
1002 unsigned int *m;
1003 unsigned int *pToFree;
1004 int szRow;
1005 EditDist3Cost *p;
1006 int res;
1007 sqlite3_uint64 nByte;
1008 unsigned int stackSpace[SQLITE_SPELLFIX_STACKALLOC_SZ/sizeof(unsigned int)];
1010 /* allocate the Wagner matrix and the aTo[] array for the TO string */
1011 n = (f.n+1)*(n2+1);
1012 n = (n+1)&~1;
1013 nByte = n*sizeof(m[0]) + sizeof(a2[0])*n2;
1014 if( nByte<=sizeof(stackSpace) ){
1015 m = stackSpace;
1016 pToFree = 0;
1017 }else{
1018 m = pToFree = sqlite3_malloc64( nByte );
1019 if( m==0 ) return -1; /* Out of memory */
1021 a2 = (EditDist3To*)&m[n];
1022 memset(a2, 0, sizeof(a2[0])*n2);
1024 /* Fill in the a1[] matrix for all characters of the TO string */
1025 for(i2=0; i2<n2; i2++){
1026 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
1027 for(p=pLang->pCost; p; p=p->pNext){
1028 EditDist3Cost **apNew;
1029 if( p->nFrom>0 ) break;
1030 if( i2+p->nTo>n2 ) continue;
1031 if( p->a[0]>z2[i2] ) break;
1032 if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
1033 a2[i2].nIns++;
1034 apNew = sqlite3_realloc64(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
1035 if( apNew==0 ){
1036 res = -1; /* Out of memory */
1037 goto editDist3Abort;
1039 a2[i2].apIns = apNew;
1040 a2[i2].apIns[a2[i2].nIns-1] = p;
1044 /* Prepare to compute the minimum edit distance */
1045 szRow = f.n+1;
1046 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
1047 m[0] = 0;
1049 /* First fill in the top-row of the matrix with FROM deletion costs */
1050 for(i1=0; i1<f.n; i1 += b1){
1051 b1 = f.a[i1].nByte;
1052 updateCost(m, i1+b1, i1, pLang->iDelCost);
1053 for(k=0; k<f.a[i1].nDel; k++){
1054 p = f.a[i1].apDel[k];
1055 updateCost(m, i1+p->nFrom, i1, p->iCost);
1059 /* Fill in all subsequent rows, top-to-bottom, left-to-right */
1060 for(i2=0; i2<n2; i2 += b2){
1061 int rx; /* Starting index for current row */
1062 int rxp; /* Starting index for previous row */
1063 b2 = a2[i2].nByte;
1064 rx = szRow*(i2+b2);
1065 rxp = szRow*i2;
1066 updateCost(m, rx, rxp, pLang->iInsCost);
1067 for(k=0; k<a2[i2].nIns; k++){
1068 p = a2[i2].apIns[k];
1069 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
1071 for(i1=0; i1<f.n; i1+=b1){
1072 int cx; /* Index of current cell */
1073 int cxp; /* Index of cell immediately to the left */
1074 int cxd; /* Index of cell to the left and one row above */
1075 int cxu; /* Index of cell immediately above */
1076 b1 = f.a[i1].nByte;
1077 cxp = rx + i1;
1078 cx = cxp + b1;
1079 cxd = rxp + i1;
1080 cxu = cxd + b1;
1081 updateCost(m, cx, cxp, pLang->iDelCost);
1082 for(k=0; k<f.a[i1].nDel; k++){
1083 p = f.a[i1].apDel[k];
1084 updateCost(m, cxp+p->nFrom, cxp, p->iCost);
1086 updateCost(m, cx, cxu, pLang->iInsCost);
1087 if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
1088 updateCost(m, cx, cxd, 0);
1090 updateCost(m, cx, cxd, pLang->iSubCost);
1091 for(k=0; k<f.a[i1].nSubst; k++){
1092 p = f.a[i1].apSubst[k];
1093 if( matchTo(p, z2+i2, n2-i2) ){
1094 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
1100 #if 0 /* Enable for debugging */
1101 printf(" ^");
1102 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
1103 printf("\n ^:");
1104 for(i1=0; i1<szRow; i1++){
1105 int v = m[i1];
1106 if( v>9999 ) printf(" ****");
1107 else printf(" %4d", v);
1109 printf("\n");
1110 for(i2=0; i2<n2; i2++){
1111 printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1112 for(i1=0; i1<szRow; i1++){
1113 int v = m[(i2+1)*szRow+i1];
1114 if( v>9999 ) printf(" ****");
1115 else printf(" %4d", v);
1117 printf("\n");
1119 #endif
1121 /* Free memory allocations and return the result */
1122 res = (int)m[szRow*(n2+1)-1];
1123 n = n2;
1124 if( f.isPrefix ){
1125 for(i2=1; i2<=n2; i2++){
1126 int b = m[szRow*i2-1];
1127 if( b<=res ){
1128 res = b;
1129 n = i2 - 1;
1133 if( pnMatch ){
1134 int nExtra = 0;
1135 for(k=0; k<n; k++){
1136 if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1138 *pnMatch = n - nExtra;
1141 editDist3Abort:
1142 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1143 sqlite3_free(pToFree);
1144 return res;
1148 ** Get an appropriate EditDist3Lang object.
1150 static const EditDist3Lang *editDist3FindLang(
1151 EditDist3Config *pConfig,
1152 int iLang
1154 int i;
1155 for(i=0; i<pConfig->nLang; i++){
1156 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1158 return &editDist3Lang;
1162 ** Function: editdist3(A,B,iLang)
1163 ** editdist3(tablename)
1165 ** Return the cost of transforming string A into string B using edit
1166 ** weights for iLang.
1168 ** The second form loads edit weights into memory from a table.
1170 static void editDist3SqlFunc(
1171 sqlite3_context *context,
1172 int argc,
1173 sqlite3_value **argv
1175 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1176 sqlite3 *db = sqlite3_context_db_handle(context);
1177 int rc;
1178 if( argc==1 ){
1179 const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1180 rc = editDist3ConfigLoad(pConfig, db, zTable);
1181 if( rc ) sqlite3_result_error_code(context, rc);
1182 }else{
1183 const char *zA = (const char*)sqlite3_value_text(argv[0]);
1184 const char *zB = (const char*)sqlite3_value_text(argv[1]);
1185 int nA = sqlite3_value_bytes(argv[0]);
1186 int nB = sqlite3_value_bytes(argv[1]);
1187 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1188 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1189 EditDist3FromString *pFrom;
1190 int dist;
1192 pFrom = editDist3FromStringNew(pLang, zA, nA);
1193 if( pFrom==0 ){
1194 sqlite3_result_error_nomem(context);
1195 return;
1197 dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1198 editDist3FromStringDelete(pFrom);
1199 if( dist==(-1) ){
1200 sqlite3_result_error_nomem(context);
1201 }else{
1202 sqlite3_result_int(context, dist);
1208 ** Register the editDist3 function with SQLite
1210 static int editDist3Install(sqlite3 *db){
1211 int rc;
1212 EditDist3Config *pConfig = sqlite3_malloc64( sizeof(*pConfig) );
1213 if( pConfig==0 ) return SQLITE_NOMEM;
1214 memset(pConfig, 0, sizeof(*pConfig));
1215 rc = sqlite3_create_function_v2(db, "editdist3",
1216 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1217 editDist3SqlFunc, 0, 0, 0);
1218 if( rc==SQLITE_OK ){
1219 rc = sqlite3_create_function_v2(db, "editdist3",
1220 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1221 editDist3SqlFunc, 0, 0, 0);
1223 if( rc==SQLITE_OK ){
1224 rc = sqlite3_create_function_v2(db, "editdist3",
1225 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1226 editDist3SqlFunc, 0, 0, editDist3ConfigDelete);
1227 }else{
1228 sqlite3_free(pConfig);
1230 return rc;
1232 /* End configurable cost unicode edit distance routines
1233 ******************************************************************************
1234 ******************************************************************************
1235 ** Begin transliterate unicode-to-ascii implementation
1238 #if !SQLITE_AMALGAMATION
1240 ** This lookup table is used to help decode the first byte of
1241 ** a multi-byte UTF8 character.
1243 static const unsigned char sqlite3Utf8Trans1[] = {
1244 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1245 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1246 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1247 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1248 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1249 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1250 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1251 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1253 #endif
1256 ** Return the value of the first UTF-8 character in the string.
1258 static int utf8Read(const unsigned char *z, int n, int *pSize){
1259 int c, i;
1261 /* All callers to this routine (in the current implementation)
1262 ** always have n>0. */
1263 if( NEVER(n==0) ){
1264 c = i = 0;
1265 }else{
1266 c = z[0];
1267 i = 1;
1268 if( c>=0xc0 ){
1269 c = sqlite3Utf8Trans1[c-0xc0];
1270 while( i<n && (z[i] & 0xc0)==0x80 ){
1271 c = (c<<6) + (0x3f & z[i++]);
1275 *pSize = i;
1276 return c;
1280 ** Return the number of characters in the utf-8 string in the nIn byte
1281 ** buffer pointed to by zIn.
1283 static int utf8Charlen(const char *zIn, int nIn){
1284 int i;
1285 int nChar = 0;
1286 for(i=0; i<nIn; nChar++){
1287 int sz;
1288 utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1289 i += sz;
1291 return nChar;
1294 typedef struct Transliteration Transliteration;
1295 struct Transliteration {
1296 unsigned short int cFrom;
1297 unsigned char cTo0, cTo1, cTo2, cTo3;
1301 ** Table of translations from unicode characters into ASCII.
1303 static const Transliteration translit[] = {
1304 { 0x00A0, 0x20, 0x00, 0x00, 0x00 }, /*   to */
1305 { 0x00B5, 0x75, 0x00, 0x00, 0x00 }, /* µ to u */
1306 { 0x00C0, 0x41, 0x00, 0x00, 0x00 }, /* À to A */
1307 { 0x00C1, 0x41, 0x00, 0x00, 0x00 }, /* Á to A */
1308 { 0x00C2, 0x41, 0x00, 0x00, 0x00 }, /* Â to A */
1309 { 0x00C3, 0x41, 0x00, 0x00, 0x00 }, /* Ã to A */
1310 { 0x00C4, 0x41, 0x65, 0x00, 0x00 }, /* Ä to Ae */
1311 { 0x00C5, 0x41, 0x61, 0x00, 0x00 }, /* Å to Aa */
1312 { 0x00C6, 0x41, 0x45, 0x00, 0x00 }, /* Æ to AE */
1313 { 0x00C7, 0x43, 0x00, 0x00, 0x00 }, /* Ç to C */
1314 { 0x00C8, 0x45, 0x00, 0x00, 0x00 }, /* È to E */
1315 { 0x00C9, 0x45, 0x00, 0x00, 0x00 }, /* É to E */
1316 { 0x00CA, 0x45, 0x00, 0x00, 0x00 }, /* Ê to E */
1317 { 0x00CB, 0x45, 0x00, 0x00, 0x00 }, /* Ë to E */
1318 { 0x00CC, 0x49, 0x00, 0x00, 0x00 }, /* Ì to I */
1319 { 0x00CD, 0x49, 0x00, 0x00, 0x00 }, /* Í to I */
1320 { 0x00CE, 0x49, 0x00, 0x00, 0x00 }, /* Î to I */
1321 { 0x00CF, 0x49, 0x00, 0x00, 0x00 }, /* Ï to I */
1322 { 0x00D0, 0x44, 0x00, 0x00, 0x00 }, /* Ð to D */
1323 { 0x00D1, 0x4E, 0x00, 0x00, 0x00 }, /* Ñ to N */
1324 { 0x00D2, 0x4F, 0x00, 0x00, 0x00 }, /* Ò to O */
1325 { 0x00D3, 0x4F, 0x00, 0x00, 0x00 }, /* Ó to O */
1326 { 0x00D4, 0x4F, 0x00, 0x00, 0x00 }, /* Ô to O */
1327 { 0x00D5, 0x4F, 0x00, 0x00, 0x00 }, /* Õ to O */
1328 { 0x00D6, 0x4F, 0x65, 0x00, 0x00 }, /* Ö to Oe */
1329 { 0x00D7, 0x78, 0x00, 0x00, 0x00 }, /* × to x */
1330 { 0x00D8, 0x4F, 0x00, 0x00, 0x00 }, /* Ø to O */
1331 { 0x00D9, 0x55, 0x00, 0x00, 0x00 }, /* Ù to U */
1332 { 0x00DA, 0x55, 0x00, 0x00, 0x00 }, /* Ú to U */
1333 { 0x00DB, 0x55, 0x00, 0x00, 0x00 }, /* Û to U */
1334 { 0x00DC, 0x55, 0x65, 0x00, 0x00 }, /* Ü to Ue */
1335 { 0x00DD, 0x59, 0x00, 0x00, 0x00 }, /* Ý to Y */
1336 { 0x00DE, 0x54, 0x68, 0x00, 0x00 }, /* Þ to Th */
1337 { 0x00DF, 0x73, 0x73, 0x00, 0x00 }, /* ß to ss */
1338 { 0x00E0, 0x61, 0x00, 0x00, 0x00 }, /* à to a */
1339 { 0x00E1, 0x61, 0x00, 0x00, 0x00 }, /* á to a */
1340 { 0x00E2, 0x61, 0x00, 0x00, 0x00 }, /* â to a */
1341 { 0x00E3, 0x61, 0x00, 0x00, 0x00 }, /* ã to a */
1342 { 0x00E4, 0x61, 0x65, 0x00, 0x00 }, /* ä to ae */
1343 { 0x00E5, 0x61, 0x61, 0x00, 0x00 }, /* å to aa */
1344 { 0x00E6, 0x61, 0x65, 0x00, 0x00 }, /* æ to ae */
1345 { 0x00E7, 0x63, 0x00, 0x00, 0x00 }, /* ç to c */
1346 { 0x00E8, 0x65, 0x00, 0x00, 0x00 }, /* è to e */
1347 { 0x00E9, 0x65, 0x00, 0x00, 0x00 }, /* é to e */
1348 { 0x00EA, 0x65, 0x00, 0x00, 0x00 }, /* ê to e */
1349 { 0x00EB, 0x65, 0x00, 0x00, 0x00 }, /* ë to e */
1350 { 0x00EC, 0x69, 0x00, 0x00, 0x00 }, /* ì to i */
1351 { 0x00ED, 0x69, 0x00, 0x00, 0x00 }, /* í to i */
1352 { 0x00EE, 0x69, 0x00, 0x00, 0x00 }, /* î to i */
1353 { 0x00EF, 0x69, 0x00, 0x00, 0x00 }, /* ï to i */
1354 { 0x00F0, 0x64, 0x00, 0x00, 0x00 }, /* ð to d */
1355 { 0x00F1, 0x6E, 0x00, 0x00, 0x00 }, /* ñ to n */
1356 { 0x00F2, 0x6F, 0x00, 0x00, 0x00 }, /* ò to o */
1357 { 0x00F3, 0x6F, 0x00, 0x00, 0x00 }, /* ó to o */
1358 { 0x00F4, 0x6F, 0x00, 0x00, 0x00 }, /* ô to o */
1359 { 0x00F5, 0x6F, 0x00, 0x00, 0x00 }, /* õ to o */
1360 { 0x00F6, 0x6F, 0x65, 0x00, 0x00 }, /* ö to oe */
1361 { 0x00F7, 0x3A, 0x00, 0x00, 0x00 }, /* ÷ to : */
1362 { 0x00F8, 0x6F, 0x00, 0x00, 0x00 }, /* ø to o */
1363 { 0x00F9, 0x75, 0x00, 0x00, 0x00 }, /* ù to u */
1364 { 0x00FA, 0x75, 0x00, 0x00, 0x00 }, /* ú to u */
1365 { 0x00FB, 0x75, 0x00, 0x00, 0x00 }, /* û to u */
1366 { 0x00FC, 0x75, 0x65, 0x00, 0x00 }, /* ü to ue */
1367 { 0x00FD, 0x79, 0x00, 0x00, 0x00 }, /* ý to y */
1368 { 0x00FE, 0x74, 0x68, 0x00, 0x00 }, /* þ to th */
1369 { 0x00FF, 0x79, 0x00, 0x00, 0x00 }, /* ÿ to y */
1370 { 0x0100, 0x41, 0x00, 0x00, 0x00 }, /* Ā to A */
1371 { 0x0101, 0x61, 0x00, 0x00, 0x00 }, /* ā to a */
1372 { 0x0102, 0x41, 0x00, 0x00, 0x00 }, /* Ă to A */
1373 { 0x0103, 0x61, 0x00, 0x00, 0x00 }, /* ă to a */
1374 { 0x0104, 0x41, 0x00, 0x00, 0x00 }, /* Ą to A */
1375 { 0x0105, 0x61, 0x00, 0x00, 0x00 }, /* ą to a */
1376 { 0x0106, 0x43, 0x00, 0x00, 0x00 }, /* Ć to C */
1377 { 0x0107, 0x63, 0x00, 0x00, 0x00 }, /* ć to c */
1378 { 0x0108, 0x43, 0x68, 0x00, 0x00 }, /* Ĉ to Ch */
1379 { 0x0109, 0x63, 0x68, 0x00, 0x00 }, /* ĉ to ch */
1380 { 0x010A, 0x43, 0x00, 0x00, 0x00 }, /* Ċ to C */
1381 { 0x010B, 0x63, 0x00, 0x00, 0x00 }, /* ċ to c */
1382 { 0x010C, 0x43, 0x00, 0x00, 0x00 }, /* Č to C */
1383 { 0x010D, 0x63, 0x00, 0x00, 0x00 }, /* č to c */
1384 { 0x010E, 0x44, 0x00, 0x00, 0x00 }, /* Ď to D */
1385 { 0x010F, 0x64, 0x00, 0x00, 0x00 }, /* ď to d */
1386 { 0x0110, 0x44, 0x00, 0x00, 0x00 }, /* Đ to D */
1387 { 0x0111, 0x64, 0x00, 0x00, 0x00 }, /* đ to d */
1388 { 0x0112, 0x45, 0x00, 0x00, 0x00 }, /* Ē to E */
1389 { 0x0113, 0x65, 0x00, 0x00, 0x00 }, /* ē to e */
1390 { 0x0114, 0x45, 0x00, 0x00, 0x00 }, /* Ĕ to E */
1391 { 0x0115, 0x65, 0x00, 0x00, 0x00 }, /* ĕ to e */
1392 { 0x0116, 0x45, 0x00, 0x00, 0x00 }, /* Ė to E */
1393 { 0x0117, 0x65, 0x00, 0x00, 0x00 }, /* ė to e */
1394 { 0x0118, 0x45, 0x00, 0x00, 0x00 }, /* Ę to E */
1395 { 0x0119, 0x65, 0x00, 0x00, 0x00 }, /* ę to e */
1396 { 0x011A, 0x45, 0x00, 0x00, 0x00 }, /* Ě to E */
1397 { 0x011B, 0x65, 0x00, 0x00, 0x00 }, /* ě to e */
1398 { 0x011C, 0x47, 0x68, 0x00, 0x00 }, /* Ĝ to Gh */
1399 { 0x011D, 0x67, 0x68, 0x00, 0x00 }, /* ĝ to gh */
1400 { 0x011E, 0x47, 0x00, 0x00, 0x00 }, /* Ğ to G */
1401 { 0x011F, 0x67, 0x00, 0x00, 0x00 }, /* ğ to g */
1402 { 0x0120, 0x47, 0x00, 0x00, 0x00 }, /* Ġ to G */
1403 { 0x0121, 0x67, 0x00, 0x00, 0x00 }, /* ġ to g */
1404 { 0x0122, 0x47, 0x00, 0x00, 0x00 }, /* Ģ to G */
1405 { 0x0123, 0x67, 0x00, 0x00, 0x00 }, /* ģ to g */
1406 { 0x0124, 0x48, 0x68, 0x00, 0x00 }, /* Ĥ to Hh */
1407 { 0x0125, 0x68, 0x68, 0x00, 0x00 }, /* ĥ to hh */
1408 { 0x0126, 0x48, 0x00, 0x00, 0x00 }, /* Ħ to H */
1409 { 0x0127, 0x68, 0x00, 0x00, 0x00 }, /* ħ to h */
1410 { 0x0128, 0x49, 0x00, 0x00, 0x00 }, /* Ĩ to I */
1411 { 0x0129, 0x69, 0x00, 0x00, 0x00 }, /* ĩ to i */
1412 { 0x012A, 0x49, 0x00, 0x00, 0x00 }, /* Ī to I */
1413 { 0x012B, 0x69, 0x00, 0x00, 0x00 }, /* ī to i */
1414 { 0x012C, 0x49, 0x00, 0x00, 0x00 }, /* Ĭ to I */
1415 { 0x012D, 0x69, 0x00, 0x00, 0x00 }, /* ĭ to i */
1416 { 0x012E, 0x49, 0x00, 0x00, 0x00 }, /* Į to I */
1417 { 0x012F, 0x69, 0x00, 0x00, 0x00 }, /* į to i */
1418 { 0x0130, 0x49, 0x00, 0x00, 0x00 }, /* İ to I */
1419 { 0x0131, 0x69, 0x00, 0x00, 0x00 }, /* ı to i */
1420 { 0x0132, 0x49, 0x4A, 0x00, 0x00 }, /* IJ to IJ */
1421 { 0x0133, 0x69, 0x6A, 0x00, 0x00 }, /* ij to ij */
1422 { 0x0134, 0x4A, 0x68, 0x00, 0x00 }, /* Ĵ to Jh */
1423 { 0x0135, 0x6A, 0x68, 0x00, 0x00 }, /* ĵ to jh */
1424 { 0x0136, 0x4B, 0x00, 0x00, 0x00 }, /* Ķ to K */
1425 { 0x0137, 0x6B, 0x00, 0x00, 0x00 }, /* ķ to k */
1426 { 0x0138, 0x6B, 0x00, 0x00, 0x00 }, /* ĸ to k */
1427 { 0x0139, 0x4C, 0x00, 0x00, 0x00 }, /* Ĺ to L */
1428 { 0x013A, 0x6C, 0x00, 0x00, 0x00 }, /* ĺ to l */
1429 { 0x013B, 0x4C, 0x00, 0x00, 0x00 }, /* Ļ to L */
1430 { 0x013C, 0x6C, 0x00, 0x00, 0x00 }, /* ļ to l */
1431 { 0x013D, 0x4C, 0x00, 0x00, 0x00 }, /* Ľ to L */
1432 { 0x013E, 0x6C, 0x00, 0x00, 0x00 }, /* ľ to l */
1433 { 0x013F, 0x4C, 0x2E, 0x00, 0x00 }, /* Ŀ to L. */
1434 { 0x0140, 0x6C, 0x2E, 0x00, 0x00 }, /* ŀ to l. */
1435 { 0x0141, 0x4C, 0x00, 0x00, 0x00 }, /* Ł to L */
1436 { 0x0142, 0x6C, 0x00, 0x00, 0x00 }, /* ł to l */
1437 { 0x0143, 0x4E, 0x00, 0x00, 0x00 }, /* Ń to N */
1438 { 0x0144, 0x6E, 0x00, 0x00, 0x00 }, /* ń to n */
1439 { 0x0145, 0x4E, 0x00, 0x00, 0x00 }, /* Ņ to N */
1440 { 0x0146, 0x6E, 0x00, 0x00, 0x00 }, /* ņ to n */
1441 { 0x0147, 0x4E, 0x00, 0x00, 0x00 }, /* Ň to N */
1442 { 0x0148, 0x6E, 0x00, 0x00, 0x00 }, /* ň to n */
1443 { 0x0149, 0x27, 0x6E, 0x00, 0x00 }, /* ʼn to 'n */
1444 { 0x014A, 0x4E, 0x47, 0x00, 0x00 }, /* Ŋ to NG */
1445 { 0x014B, 0x6E, 0x67, 0x00, 0x00 }, /* ŋ to ng */
1446 { 0x014C, 0x4F, 0x00, 0x00, 0x00 }, /* Ō to O */
1447 { 0x014D, 0x6F, 0x00, 0x00, 0x00 }, /* ō to o */
1448 { 0x014E, 0x4F, 0x00, 0x00, 0x00 }, /* Ŏ to O */
1449 { 0x014F, 0x6F, 0x00, 0x00, 0x00 }, /* ŏ to o */
1450 { 0x0150, 0x4F, 0x00, 0x00, 0x00 }, /* Ő to O */
1451 { 0x0151, 0x6F, 0x00, 0x00, 0x00 }, /* ő to o */
1452 { 0x0152, 0x4F, 0x45, 0x00, 0x00 }, /* Πto OE */
1453 { 0x0153, 0x6F, 0x65, 0x00, 0x00 }, /* œ to oe */
1454 { 0x0154, 0x52, 0x00, 0x00, 0x00 }, /* Ŕ to R */
1455 { 0x0155, 0x72, 0x00, 0x00, 0x00 }, /* ŕ to r */
1456 { 0x0156, 0x52, 0x00, 0x00, 0x00 }, /* Ŗ to R */
1457 { 0x0157, 0x72, 0x00, 0x00, 0x00 }, /* ŗ to r */
1458 { 0x0158, 0x52, 0x00, 0x00, 0x00 }, /* Ř to R */
1459 { 0x0159, 0x72, 0x00, 0x00, 0x00 }, /* ř to r */
1460 { 0x015A, 0x53, 0x00, 0x00, 0x00 }, /* Ś to S */
1461 { 0x015B, 0x73, 0x00, 0x00, 0x00 }, /* ś to s */
1462 { 0x015C, 0x53, 0x68, 0x00, 0x00 }, /* Ŝ to Sh */
1463 { 0x015D, 0x73, 0x68, 0x00, 0x00 }, /* ŝ to sh */
1464 { 0x015E, 0x53, 0x00, 0x00, 0x00 }, /* Ş to S */
1465 { 0x015F, 0x73, 0x00, 0x00, 0x00 }, /* ş to s */
1466 { 0x0160, 0x53, 0x00, 0x00, 0x00 }, /* Š to S */
1467 { 0x0161, 0x73, 0x00, 0x00, 0x00 }, /* š to s */
1468 { 0x0162, 0x54, 0x00, 0x00, 0x00 }, /* Ţ to T */
1469 { 0x0163, 0x74, 0x00, 0x00, 0x00 }, /* ţ to t */
1470 { 0x0164, 0x54, 0x00, 0x00, 0x00 }, /* Ť to T */
1471 { 0x0165, 0x74, 0x00, 0x00, 0x00 }, /* ť to t */
1472 { 0x0166, 0x54, 0x00, 0x00, 0x00 }, /* Ŧ to T */
1473 { 0x0167, 0x74, 0x00, 0x00, 0x00 }, /* ŧ to t */
1474 { 0x0168, 0x55, 0x00, 0x00, 0x00 }, /* Ũ to U */
1475 { 0x0169, 0x75, 0x00, 0x00, 0x00 }, /* ũ to u */
1476 { 0x016A, 0x55, 0x00, 0x00, 0x00 }, /* Ū to U */
1477 { 0x016B, 0x75, 0x00, 0x00, 0x00 }, /* ū to u */
1478 { 0x016C, 0x55, 0x00, 0x00, 0x00 }, /* Ŭ to U */
1479 { 0x016D, 0x75, 0x00, 0x00, 0x00 }, /* ŭ to u */
1480 { 0x016E, 0x55, 0x00, 0x00, 0x00 }, /* Ů to U */
1481 { 0x016F, 0x75, 0x00, 0x00, 0x00 }, /* ů to u */
1482 { 0x0170, 0x55, 0x00, 0x00, 0x00 }, /* Ű to U */
1483 { 0x0171, 0x75, 0x00, 0x00, 0x00 }, /* ű to u */
1484 { 0x0172, 0x55, 0x00, 0x00, 0x00 }, /* Ų to U */
1485 { 0x0173, 0x75, 0x00, 0x00, 0x00 }, /* ų to u */
1486 { 0x0174, 0x57, 0x00, 0x00, 0x00 }, /* Ŵ to W */
1487 { 0x0175, 0x77, 0x00, 0x00, 0x00 }, /* ŵ to w */
1488 { 0x0176, 0x59, 0x00, 0x00, 0x00 }, /* Ŷ to Y */
1489 { 0x0177, 0x79, 0x00, 0x00, 0x00 }, /* ŷ to y */
1490 { 0x0178, 0x59, 0x00, 0x00, 0x00 }, /* Ÿ to Y */
1491 { 0x0179, 0x5A, 0x00, 0x00, 0x00 }, /* Ź to Z */
1492 { 0x017A, 0x7A, 0x00, 0x00, 0x00 }, /* ź to z */
1493 { 0x017B, 0x5A, 0x00, 0x00, 0x00 }, /* Ż to Z */
1494 { 0x017C, 0x7A, 0x00, 0x00, 0x00 }, /* ż to z */
1495 { 0x017D, 0x5A, 0x00, 0x00, 0x00 }, /* Ž to Z */
1496 { 0x017E, 0x7A, 0x00, 0x00, 0x00 }, /* ž to z */
1497 { 0x017F, 0x73, 0x00, 0x00, 0x00 }, /* ſ to s */
1498 { 0x0192, 0x66, 0x00, 0x00, 0x00 }, /* ƒ to f */
1499 { 0x0218, 0x53, 0x00, 0x00, 0x00 }, /* Ș to S */
1500 { 0x0219, 0x73, 0x00, 0x00, 0x00 }, /* ș to s */
1501 { 0x021A, 0x54, 0x00, 0x00, 0x00 }, /* Ț to T */
1502 { 0x021B, 0x74, 0x00, 0x00, 0x00 }, /* ț to t */
1503 { 0x0386, 0x41, 0x00, 0x00, 0x00 }, /* Ά to A */
1504 { 0x0388, 0x45, 0x00, 0x00, 0x00 }, /* Έ to E */
1505 { 0x0389, 0x49, 0x00, 0x00, 0x00 }, /* Ή to I */
1506 { 0x038A, 0x49, 0x00, 0x00, 0x00 }, /* Ί to I */
1507 { 0x038C, 0x4f, 0x00, 0x00, 0x00 }, /* Ό to O */
1508 { 0x038E, 0x59, 0x00, 0x00, 0x00 }, /* Ύ to Y */
1509 { 0x038F, 0x4f, 0x00, 0x00, 0x00 }, /* Ώ to O */
1510 { 0x0390, 0x69, 0x00, 0x00, 0x00 }, /* ΐ to i */
1511 { 0x0391, 0x41, 0x00, 0x00, 0x00 }, /* Α to A */
1512 { 0x0392, 0x42, 0x00, 0x00, 0x00 }, /* Β to B */
1513 { 0x0393, 0x47, 0x00, 0x00, 0x00 }, /* Γ to G */
1514 { 0x0394, 0x44, 0x00, 0x00, 0x00 }, /* Δ to D */
1515 { 0x0395, 0x45, 0x00, 0x00, 0x00 }, /* Ε to E */
1516 { 0x0396, 0x5a, 0x00, 0x00, 0x00 }, /* Ζ to Z */
1517 { 0x0397, 0x49, 0x00, 0x00, 0x00 }, /* Η to I */
1518 { 0x0398, 0x54, 0x68, 0x00, 0x00 }, /* Θ to Th */
1519 { 0x0399, 0x49, 0x00, 0x00, 0x00 }, /* Ι to I */
1520 { 0x039A, 0x4b, 0x00, 0x00, 0x00 }, /* Κ to K */
1521 { 0x039B, 0x4c, 0x00, 0x00, 0x00 }, /* Λ to L */
1522 { 0x039C, 0x4d, 0x00, 0x00, 0x00 }, /* Μ to M */
1523 { 0x039D, 0x4e, 0x00, 0x00, 0x00 }, /* Ν to N */
1524 { 0x039E, 0x58, 0x00, 0x00, 0x00 }, /* Ξ to X */
1525 { 0x039F, 0x4f, 0x00, 0x00, 0x00 }, /* Ο to O */
1526 { 0x03A0, 0x50, 0x00, 0x00, 0x00 }, /* Π to P */
1527 { 0x03A1, 0x52, 0x00, 0x00, 0x00 }, /* Ρ to R */
1528 { 0x03A3, 0x53, 0x00, 0x00, 0x00 }, /* Σ to S */
1529 { 0x03A4, 0x54, 0x00, 0x00, 0x00 }, /* Τ to T */
1530 { 0x03A5, 0x59, 0x00, 0x00, 0x00 }, /* Υ to Y */
1531 { 0x03A6, 0x46, 0x00, 0x00, 0x00 }, /* Φ to F */
1532 { 0x03A7, 0x43, 0x68, 0x00, 0x00 }, /* Χ to Ch */
1533 { 0x03A8, 0x50, 0x73, 0x00, 0x00 }, /* Ψ to Ps */
1534 { 0x03A9, 0x4f, 0x00, 0x00, 0x00 }, /* Ω to O */
1535 { 0x03AA, 0x49, 0x00, 0x00, 0x00 }, /* Ϊ to I */
1536 { 0x03AB, 0x59, 0x00, 0x00, 0x00 }, /* Ϋ to Y */
1537 { 0x03AC, 0x61, 0x00, 0x00, 0x00 }, /* ά to a */
1538 { 0x03AD, 0x65, 0x00, 0x00, 0x00 }, /* έ to e */
1539 { 0x03AE, 0x69, 0x00, 0x00, 0x00 }, /* ή to i */
1540 { 0x03AF, 0x69, 0x00, 0x00, 0x00 }, /* ί to i */
1541 { 0x03B1, 0x61, 0x00, 0x00, 0x00 }, /* α to a */
1542 { 0x03B2, 0x62, 0x00, 0x00, 0x00 }, /* β to b */
1543 { 0x03B3, 0x67, 0x00, 0x00, 0x00 }, /* γ to g */
1544 { 0x03B4, 0x64, 0x00, 0x00, 0x00 }, /* δ to d */
1545 { 0x03B5, 0x65, 0x00, 0x00, 0x00 }, /* ε to e */
1546 { 0x03B6, 0x7a, 0x00, 0x00, 0x00 }, /* ζ to z */
1547 { 0x03B7, 0x69, 0x00, 0x00, 0x00 }, /* η to i */
1548 { 0x03B8, 0x74, 0x68, 0x00, 0x00 }, /* θ to th */
1549 { 0x03B9, 0x69, 0x00, 0x00, 0x00 }, /* ι to i */
1550 { 0x03BA, 0x6b, 0x00, 0x00, 0x00 }, /* κ to k */
1551 { 0x03BB, 0x6c, 0x00, 0x00, 0x00 }, /* λ to l */
1552 { 0x03BC, 0x6d, 0x00, 0x00, 0x00 }, /* μ to m */
1553 { 0x03BD, 0x6e, 0x00, 0x00, 0x00 }, /* ν to n */
1554 { 0x03BE, 0x78, 0x00, 0x00, 0x00 }, /* ξ to x */
1555 { 0x03BF, 0x6f, 0x00, 0x00, 0x00 }, /* ο to o */
1556 { 0x03C0, 0x70, 0x00, 0x00, 0x00 }, /* π to p */
1557 { 0x03C1, 0x72, 0x00, 0x00, 0x00 }, /* ρ to r */
1558 { 0x03C3, 0x73, 0x00, 0x00, 0x00 }, /* σ to s */
1559 { 0x03C4, 0x74, 0x00, 0x00, 0x00 }, /* τ to t */
1560 { 0x03C5, 0x79, 0x00, 0x00, 0x00 }, /* υ to y */
1561 { 0x03C6, 0x66, 0x00, 0x00, 0x00 }, /* φ to f */
1562 { 0x03C7, 0x63, 0x68, 0x00, 0x00 }, /* χ to ch */
1563 { 0x03C8, 0x70, 0x73, 0x00, 0x00 }, /* ψ to ps */
1564 { 0x03C9, 0x6f, 0x00, 0x00, 0x00 }, /* ω to o */
1565 { 0x03CA, 0x69, 0x00, 0x00, 0x00 }, /* ϊ to i */
1566 { 0x03CB, 0x79, 0x00, 0x00, 0x00 }, /* ϋ to y */
1567 { 0x03CC, 0x6f, 0x00, 0x00, 0x00 }, /* ό to o */
1568 { 0x03CD, 0x79, 0x00, 0x00, 0x00 }, /* ύ to y */
1569 { 0x03CE, 0x69, 0x00, 0x00, 0x00 }, /* ώ to i */
1570 { 0x0400, 0x45, 0x00, 0x00, 0x00 }, /* Ѐ to E */
1571 { 0x0401, 0x45, 0x00, 0x00, 0x00 }, /* Ё to E */
1572 { 0x0402, 0x44, 0x00, 0x00, 0x00 }, /* Ђ to D */
1573 { 0x0403, 0x47, 0x00, 0x00, 0x00 }, /* Ѓ to G */
1574 { 0x0404, 0x45, 0x00, 0x00, 0x00 }, /* Є to E */
1575 { 0x0405, 0x5a, 0x00, 0x00, 0x00 }, /* Ѕ to Z */
1576 { 0x0406, 0x49, 0x00, 0x00, 0x00 }, /* І to I */
1577 { 0x0407, 0x49, 0x00, 0x00, 0x00 }, /* Ї to I */
1578 { 0x0408, 0x4a, 0x00, 0x00, 0x00 }, /* Ј to J */
1579 { 0x0409, 0x49, 0x00, 0x00, 0x00 }, /* Љ to I */
1580 { 0x040A, 0x4e, 0x00, 0x00, 0x00 }, /* Њ to N */
1581 { 0x040B, 0x44, 0x00, 0x00, 0x00 }, /* Ћ to D */
1582 { 0x040C, 0x4b, 0x00, 0x00, 0x00 }, /* Ќ to K */
1583 { 0x040D, 0x49, 0x00, 0x00, 0x00 }, /* Ѝ to I */
1584 { 0x040E, 0x55, 0x00, 0x00, 0x00 }, /* Ў to U */
1585 { 0x040F, 0x44, 0x00, 0x00, 0x00 }, /* Џ to D */
1586 { 0x0410, 0x41, 0x00, 0x00, 0x00 }, /* А to A */
1587 { 0x0411, 0x42, 0x00, 0x00, 0x00 }, /* Б to B */
1588 { 0x0412, 0x56, 0x00, 0x00, 0x00 }, /* В to V */
1589 { 0x0413, 0x47, 0x00, 0x00, 0x00 }, /* Г to G */
1590 { 0x0414, 0x44, 0x00, 0x00, 0x00 }, /* Д to D */
1591 { 0x0415, 0x45, 0x00, 0x00, 0x00 }, /* Е to E */
1592 { 0x0416, 0x5a, 0x68, 0x00, 0x00 }, /* Ж to Zh */
1593 { 0x0417, 0x5a, 0x00, 0x00, 0x00 }, /* З to Z */
1594 { 0x0418, 0x49, 0x00, 0x00, 0x00 }, /* И to I */
1595 { 0x0419, 0x49, 0x00, 0x00, 0x00 }, /* Й to I */
1596 { 0x041A, 0x4b, 0x00, 0x00, 0x00 }, /* К to K */
1597 { 0x041B, 0x4c, 0x00, 0x00, 0x00 }, /* Л to L */
1598 { 0x041C, 0x4d, 0x00, 0x00, 0x00 }, /* М to M */
1599 { 0x041D, 0x4e, 0x00, 0x00, 0x00 }, /* Н to N */
1600 { 0x041E, 0x4f, 0x00, 0x00, 0x00 }, /* О to O */
1601 { 0x041F, 0x50, 0x00, 0x00, 0x00 }, /* П to P */
1602 { 0x0420, 0x52, 0x00, 0x00, 0x00 }, /* Р to R */
1603 { 0x0421, 0x53, 0x00, 0x00, 0x00 }, /* С to S */
1604 { 0x0422, 0x54, 0x00, 0x00, 0x00 }, /* Т to T */
1605 { 0x0423, 0x55, 0x00, 0x00, 0x00 }, /* У to U */
1606 { 0x0424, 0x46, 0x00, 0x00, 0x00 }, /* Ф to F */
1607 { 0x0425, 0x4b, 0x68, 0x00, 0x00 }, /* Х to Kh */
1608 { 0x0426, 0x54, 0x63, 0x00, 0x00 }, /* Ц to Tc */
1609 { 0x0427, 0x43, 0x68, 0x00, 0x00 }, /* Ч to Ch */
1610 { 0x0428, 0x53, 0x68, 0x00, 0x00 }, /* Ш to Sh */
1611 { 0x0429, 0x53, 0x68, 0x63, 0x68 }, /* Щ to Shch */
1612 { 0x042A, 0x61, 0x00, 0x00, 0x00 }, /* to A */
1613 { 0x042B, 0x59, 0x00, 0x00, 0x00 }, /* Ы to Y */
1614 { 0x042C, 0x59, 0x00, 0x00, 0x00 }, /* to Y */
1615 { 0x042D, 0x45, 0x00, 0x00, 0x00 }, /* Э to E */
1616 { 0x042E, 0x49, 0x75, 0x00, 0x00 }, /* Ю to Iu */
1617 { 0x042F, 0x49, 0x61, 0x00, 0x00 }, /* Я to Ia */
1618 { 0x0430, 0x61, 0x00, 0x00, 0x00 }, /* а to a */
1619 { 0x0431, 0x62, 0x00, 0x00, 0x00 }, /* б to b */
1620 { 0x0432, 0x76, 0x00, 0x00, 0x00 }, /* в to v */
1621 { 0x0433, 0x67, 0x00, 0x00, 0x00 }, /* г to g */
1622 { 0x0434, 0x64, 0x00, 0x00, 0x00 }, /* д to d */
1623 { 0x0435, 0x65, 0x00, 0x00, 0x00 }, /* е to e */
1624 { 0x0436, 0x7a, 0x68, 0x00, 0x00 }, /* ж to zh */
1625 { 0x0437, 0x7a, 0x00, 0x00, 0x00 }, /* з to z */
1626 { 0x0438, 0x69, 0x00, 0x00, 0x00 }, /* и to i */
1627 { 0x0439, 0x69, 0x00, 0x00, 0x00 }, /* й to i */
1628 { 0x043A, 0x6b, 0x00, 0x00, 0x00 }, /* к to k */
1629 { 0x043B, 0x6c, 0x00, 0x00, 0x00 }, /* л to l */
1630 { 0x043C, 0x6d, 0x00, 0x00, 0x00 }, /* м to m */
1631 { 0x043D, 0x6e, 0x00, 0x00, 0x00 }, /* н to n */
1632 { 0x043E, 0x6f, 0x00, 0x00, 0x00 }, /* о to o */
1633 { 0x043F, 0x70, 0x00, 0x00, 0x00 }, /* п to p */
1634 { 0x0440, 0x72, 0x00, 0x00, 0x00 }, /* р to r */
1635 { 0x0441, 0x73, 0x00, 0x00, 0x00 }, /* с to s */
1636 { 0x0442, 0x74, 0x00, 0x00, 0x00 }, /* т to t */
1637 { 0x0443, 0x75, 0x00, 0x00, 0x00 }, /* у to u */
1638 { 0x0444, 0x66, 0x00, 0x00, 0x00 }, /* ф to f */
1639 { 0x0445, 0x6b, 0x68, 0x00, 0x00 }, /* х to kh */
1640 { 0x0446, 0x74, 0x63, 0x00, 0x00 }, /* ц to tc */
1641 { 0x0447, 0x63, 0x68, 0x00, 0x00 }, /* ч to ch */
1642 { 0x0448, 0x73, 0x68, 0x00, 0x00 }, /* ш to sh */
1643 { 0x0449, 0x73, 0x68, 0x63, 0x68 }, /* щ to shch */
1644 { 0x044A, 0x61, 0x00, 0x00, 0x00 }, /* to a */
1645 { 0x044B, 0x79, 0x00, 0x00, 0x00 }, /* ы to y */
1646 { 0x044C, 0x79, 0x00, 0x00, 0x00 }, /* to y */
1647 { 0x044D, 0x65, 0x00, 0x00, 0x00 }, /* э to e */
1648 { 0x044E, 0x69, 0x75, 0x00, 0x00 }, /* ю to iu */
1649 { 0x044F, 0x69, 0x61, 0x00, 0x00 }, /* я to ia */
1650 { 0x0450, 0x65, 0x00, 0x00, 0x00 }, /* ѐ to e */
1651 { 0x0451, 0x65, 0x00, 0x00, 0x00 }, /* ё to e */
1652 { 0x0452, 0x64, 0x00, 0x00, 0x00 }, /* ђ to d */
1653 { 0x0453, 0x67, 0x00, 0x00, 0x00 }, /* ѓ to g */
1654 { 0x0454, 0x65, 0x00, 0x00, 0x00 }, /* є to e */
1655 { 0x0455, 0x7a, 0x00, 0x00, 0x00 }, /* ѕ to z */
1656 { 0x0456, 0x69, 0x00, 0x00, 0x00 }, /* і to i */
1657 { 0x0457, 0x69, 0x00, 0x00, 0x00 }, /* ї to i */
1658 { 0x0458, 0x6a, 0x00, 0x00, 0x00 }, /* ј to j */
1659 { 0x0459, 0x69, 0x00, 0x00, 0x00 }, /* љ to i */
1660 { 0x045A, 0x6e, 0x00, 0x00, 0x00 }, /* њ to n */
1661 { 0x045B, 0x64, 0x00, 0x00, 0x00 }, /* ћ to d */
1662 { 0x045C, 0x6b, 0x00, 0x00, 0x00 }, /* ќ to k */
1663 { 0x045D, 0x69, 0x00, 0x00, 0x00 }, /* ѝ to i */
1664 { 0x045E, 0x75, 0x00, 0x00, 0x00 }, /* ў to u */
1665 { 0x045F, 0x64, 0x00, 0x00, 0x00 }, /* џ to d */
1666 { 0x1E02, 0x42, 0x00, 0x00, 0x00 }, /* Ḃ to B */
1667 { 0x1E03, 0x62, 0x00, 0x00, 0x00 }, /* ḃ to b */
1668 { 0x1E0A, 0x44, 0x00, 0x00, 0x00 }, /* Ḋ to D */
1669 { 0x1E0B, 0x64, 0x00, 0x00, 0x00 }, /* ḋ to d */
1670 { 0x1E1E, 0x46, 0x00, 0x00, 0x00 }, /* Ḟ to F */
1671 { 0x1E1F, 0x66, 0x00, 0x00, 0x00 }, /* ḟ to f */
1672 { 0x1E40, 0x4D, 0x00, 0x00, 0x00 }, /* Ṁ to M */
1673 { 0x1E41, 0x6D, 0x00, 0x00, 0x00 }, /* ṁ to m */
1674 { 0x1E56, 0x50, 0x00, 0x00, 0x00 }, /* Ṗ to P */
1675 { 0x1E57, 0x70, 0x00, 0x00, 0x00 }, /* ṗ to p */
1676 { 0x1E60, 0x53, 0x00, 0x00, 0x00 }, /* Ṡ to S */
1677 { 0x1E61, 0x73, 0x00, 0x00, 0x00 }, /* ṡ to s */
1678 { 0x1E6A, 0x54, 0x00, 0x00, 0x00 }, /* Ṫ to T */
1679 { 0x1E6B, 0x74, 0x00, 0x00, 0x00 }, /* ṫ to t */
1680 { 0x1E80, 0x57, 0x00, 0x00, 0x00 }, /* Ẁ to W */
1681 { 0x1E81, 0x77, 0x00, 0x00, 0x00 }, /* ẁ to w */
1682 { 0x1E82, 0x57, 0x00, 0x00, 0x00 }, /* Ẃ to W */
1683 { 0x1E83, 0x77, 0x00, 0x00, 0x00 }, /* ẃ to w */
1684 { 0x1E84, 0x57, 0x00, 0x00, 0x00 }, /* Ẅ to W */
1685 { 0x1E85, 0x77, 0x00, 0x00, 0x00 }, /* ẅ to w */
1686 { 0x1EF2, 0x59, 0x00, 0x00, 0x00 }, /* Ỳ to Y */
1687 { 0x1EF3, 0x79, 0x00, 0x00, 0x00 }, /* ỳ to y */
1688 { 0xFB00, 0x66, 0x66, 0x00, 0x00 }, /* ff to ff */
1689 { 0xFB01, 0x66, 0x69, 0x00, 0x00 }, /* fi to fi */
1690 { 0xFB02, 0x66, 0x6C, 0x00, 0x00 }, /* fl to fl */
1691 { 0xFB05, 0x73, 0x74, 0x00, 0x00 }, /* ſt to st */
1692 { 0xFB06, 0x73, 0x74, 0x00, 0x00 }, /* st to st */
1695 static const Transliteration *spellfixFindTranslit(int c, int *pxTop){
1696 *pxTop = (sizeof(translit)/sizeof(translit[0])) - 1;
1697 return translit;
1701 ** Convert the input string from UTF-8 into pure ASCII by converting
1702 ** all non-ASCII characters to some combination of characters in the
1703 ** ASCII subset.
1705 ** The returned string might contain more characters than the input.
1707 ** Space to hold the returned string comes from sqlite3_malloc() and
1708 ** should be freed by the caller.
1710 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1711 unsigned char *zOut = sqlite3_malloc64( nIn*4 + 1 );
1712 int c, sz, nOut;
1713 if( zOut==0 ) return 0;
1714 nOut = 0;
1715 while( nIn>0 ){
1716 c = utf8Read(zIn, nIn, &sz);
1717 zIn += sz;
1718 nIn -= sz;
1719 if( c<=127 ){
1720 zOut[nOut++] = (unsigned char)c;
1721 }else{
1722 int xTop, xBtm, x;
1723 const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1724 xBtm = 0;
1725 while( xTop>=xBtm ){
1726 x = (xTop + xBtm)/2;
1727 if( tbl[x].cFrom==c ){
1728 zOut[nOut++] = tbl[x].cTo0;
1729 if( tbl[x].cTo1 ){
1730 zOut[nOut++] = tbl[x].cTo1;
1731 if( tbl[x].cTo2 ){
1732 zOut[nOut++] = tbl[x].cTo2;
1733 if( tbl[x].cTo3 ){
1734 zOut[nOut++] = tbl[x].cTo3;
1738 c = 0;
1739 break;
1740 }else if( tbl[x].cFrom>c ){
1741 xTop = x-1;
1742 }else{
1743 xBtm = x+1;
1746 if( c ) zOut[nOut++] = '?';
1749 zOut[nOut] = 0;
1750 return zOut;
1754 ** Return the number of characters in the shortest prefix of the input
1755 ** string that transliterates to an ASCII string nTrans bytes or longer.
1756 ** Or, if the transliteration of the input string is less than nTrans
1757 ** bytes in size, return the number of characters in the input string.
1759 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1760 int i, c, sz, nOut;
1761 int nChar;
1763 i = nOut = 0;
1764 for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1765 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1766 i += sz;
1768 nOut++;
1769 if( c>=128 ){
1770 int xTop, xBtm, x;
1771 const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1772 xBtm = 0;
1773 while( xTop>=xBtm ){
1774 x = (xTop + xBtm)/2;
1775 if( tbl[x].cFrom==c ){
1776 if( tbl[x].cTo1 ){
1777 nOut++;
1778 if( tbl[x].cTo2 ){
1779 nOut++;
1780 if( tbl[x].cTo3 ){
1781 nOut++;
1785 break;
1786 }else if( tbl[x].cFrom>c ){
1787 xTop = x-1;
1788 }else{
1789 xBtm = x+1;
1795 return nChar;
1800 ** spellfix1_translit(X)
1802 ** Convert a string that contains non-ASCII Roman characters into
1803 ** pure ASCII.
1805 static void transliterateSqlFunc(
1806 sqlite3_context *context,
1807 int argc,
1808 sqlite3_value **argv
1810 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1811 int nIn = sqlite3_value_bytes(argv[0]);
1812 unsigned char *zOut = transliterate(zIn, nIn);
1813 if( zOut==0 ){
1814 sqlite3_result_error_nomem(context);
1815 }else{
1816 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1821 ** spellfix1_scriptcode(X)
1823 ** Try to determine the dominant script used by the word X and return
1824 ** its ISO 15924 numeric code.
1826 ** The current implementation only understands the following scripts:
1828 ** 215 (Latin)
1829 ** 220 (Cyrillic)
1830 ** 200 (Greek)
1832 ** This routine will return 998 if the input X contains characters from
1833 ** two or more of the above scripts or 999 if X contains no characters
1834 ** from any of the above scripts.
1836 static void scriptCodeSqlFunc(
1837 sqlite3_context *context,
1838 int argc,
1839 sqlite3_value **argv
1841 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1842 int nIn = sqlite3_value_bytes(argv[0]);
1843 int c, sz;
1844 int scriptMask = 0;
1845 int res;
1846 int seenDigit = 0;
1847 # define SCRIPT_LATIN 0x0001
1848 # define SCRIPT_CYRILLIC 0x0002
1849 # define SCRIPT_GREEK 0x0004
1850 # define SCRIPT_HEBREW 0x0008
1851 # define SCRIPT_ARABIC 0x0010
1853 while( nIn>0 ){
1854 c = utf8Read(zIn, nIn, &sz);
1855 zIn += sz;
1856 nIn -= sz;
1857 if( c<0x02af ){
1858 if( c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT ){
1859 scriptMask |= SCRIPT_LATIN;
1860 }else if( c>='0' && c<='9' ){
1861 seenDigit = 1;
1863 }else if( c>=0x0400 && c<=0x04ff ){
1864 scriptMask |= SCRIPT_CYRILLIC;
1865 }else if( c>=0x0386 && c<=0x03ce ){
1866 scriptMask |= SCRIPT_GREEK;
1867 }else if( c>=0x0590 && c<=0x05ff ){
1868 scriptMask |= SCRIPT_HEBREW;
1869 }else if( c>=0x0600 && c<=0x06ff ){
1870 scriptMask |= SCRIPT_ARABIC;
1873 if( scriptMask==0 && seenDigit ) scriptMask = SCRIPT_LATIN;
1874 switch( scriptMask ){
1875 case 0: res = 999; break;
1876 case SCRIPT_LATIN: res = 215; break;
1877 case SCRIPT_CYRILLIC: res = 220; break;
1878 case SCRIPT_GREEK: res = 200; break;
1879 case SCRIPT_HEBREW: res = 125; break;
1880 case SCRIPT_ARABIC: res = 160; break;
1881 default: res = 998; break;
1883 sqlite3_result_int(context, res);
1886 /* End transliterate
1887 ******************************************************************************
1888 ******************************************************************************
1889 ** Begin spellfix1 virtual table.
1892 /* Maximum length of a phonehash used for querying the shadow table */
1893 #define SPELLFIX_MX_HASH 32
1895 /* Maximum number of hash strings to examine per query */
1896 #define SPELLFIX_MX_RUN 1
1898 typedef struct spellfix1_vtab spellfix1_vtab;
1899 typedef struct spellfix1_cursor spellfix1_cursor;
1901 /* Fuzzy-search virtual table object */
1902 struct spellfix1_vtab {
1903 sqlite3_vtab base; /* Base class - must be first */
1904 sqlite3 *db; /* Database connection */
1905 char *zDbName; /* Name of database holding this table */
1906 char *zTableName; /* Name of the virtual table */
1907 char *zCostTable; /* Table holding edit-distance cost numbers */
1908 EditDist3Config *pConfig3; /* Parsed edit distance costs */
1911 /* Fuzzy-search cursor object */
1912 struct spellfix1_cursor {
1913 sqlite3_vtab_cursor base; /* Base class - must be first */
1914 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
1915 char *zPattern; /* rhs of MATCH clause */
1916 int idxNum; /* idxNum value passed to xFilter() */
1917 int nRow; /* Number of rows of content */
1918 int nAlloc; /* Number of allocated rows */
1919 int iRow; /* Current row of content */
1920 int iLang; /* Value of the langid= constraint */
1921 int iTop; /* Value of the top= constraint */
1922 int iScope; /* Value of the scope= constraint */
1923 int nSearch; /* Number of vocabulary items checked */
1924 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */
1925 struct spellfix1_row { /* For each row of content */
1926 sqlite3_int64 iRowid; /* Rowid for this row */
1927 char *zWord; /* Text for this row */
1928 int iRank; /* Rank for this row */
1929 int iDistance; /* Distance from pattern for this row */
1930 int iScore; /* Score for sorting */
1931 int iMatchlen; /* Value of matchlen column (or -1) */
1932 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1933 } *a;
1937 ** Construct one or more SQL statements from the format string given
1938 ** and then evaluate those statements. The success code is written
1939 ** into *pRc.
1941 ** If *pRc is initially non-zero then this routine is a no-op.
1943 static void spellfix1DbExec(
1944 int *pRc, /* Success code */
1945 sqlite3 *db, /* Database in which to run SQL */
1946 const char *zFormat, /* Format string for SQL */
1947 ... /* Arguments to the format string */
1949 va_list ap;
1950 char *zSql;
1951 if( *pRc ) return;
1952 va_start(ap, zFormat);
1953 zSql = sqlite3_vmprintf(zFormat, ap);
1954 va_end(ap);
1955 if( zSql==0 ){
1956 *pRc = SQLITE_NOMEM;
1957 }else{
1958 *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1959 sqlite3_free(zSql);
1964 ** xDisconnect/xDestroy method for the fuzzy-search module.
1966 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1967 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1968 int rc = SQLITE_OK;
1969 if( isDestroy ){
1970 sqlite3 *db = p->db;
1971 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1972 p->zDbName, p->zTableName);
1974 if( rc==SQLITE_OK ){
1975 sqlite3_free(p->zTableName);
1976 editDist3ConfigDelete(p->pConfig3);
1977 sqlite3_free(p->zCostTable);
1978 sqlite3_free(p);
1980 return rc;
1982 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1983 return spellfix1Uninit(0, pVTab);
1985 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1986 return spellfix1Uninit(1, pVTab);
1990 ** Make a copy of a string. Remove leading and trailing whitespace
1991 ** and dequote it.
1993 static char *spellfix1Dequote(const char *zIn){
1994 char *zOut;
1995 int i, j;
1996 char c;
1997 while( isspace((unsigned char)zIn[0]) ) zIn++;
1998 zOut = sqlite3_mprintf("%s", zIn);
1999 if( zOut==0 ) return 0;
2000 i = (int)strlen(zOut);
2001 #if 0 /* The parser will never leave spaces at the end */
2002 while( i>0 && isspace(zOut[i-1]) ){ i--; }
2003 #endif
2004 zOut[i] = 0;
2005 c = zOut[0];
2006 if( c=='\'' || c=='"' ){
2007 for(i=1, j=0; ALWAYS(zOut[i]); i++){
2008 zOut[j++] = zOut[i];
2009 if( zOut[i]==c ){
2010 if( zOut[i+1]==c ){
2011 i++;
2012 }else{
2013 zOut[j-1] = 0;
2014 break;
2019 return zOut;
2024 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
2026 ** argv[0] -> module name ("spellfix1")
2027 ** argv[1] -> database name
2028 ** argv[2] -> table name
2029 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
2031 static int spellfix1Init(
2032 int isCreate,
2033 sqlite3 *db,
2034 void *pAux,
2035 int argc, const char *const*argv,
2036 sqlite3_vtab **ppVTab,
2037 char **pzErr
2039 spellfix1_vtab *pNew = 0;
2040 /* const char *zModule = argv[0]; // not used */
2041 const char *zDbName = argv[1];
2042 const char *zTableName = argv[2];
2043 int nDbName;
2044 int rc = SQLITE_OK;
2045 int i;
2047 nDbName = (int)strlen(zDbName);
2048 pNew = sqlite3_malloc64( sizeof(*pNew) + nDbName + 1);
2049 if( pNew==0 ){
2050 rc = SQLITE_NOMEM;
2051 }else{
2052 memset(pNew, 0, sizeof(*pNew));
2053 pNew->zDbName = (char*)&pNew[1];
2054 memcpy(pNew->zDbName, zDbName, nDbName+1);
2055 pNew->zTableName = sqlite3_mprintf("%s", zTableName);
2056 pNew->db = db;
2057 if( pNew->zTableName==0 ){
2058 rc = SQLITE_NOMEM;
2059 }else{
2060 rc = sqlite3_declare_vtab(db,
2061 "CREATE TABLE x(word,rank,distance,langid, "
2062 "score, matchlen, phonehash HIDDEN, "
2063 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
2064 "soundslike HIDDEN, command HIDDEN)"
2066 #define SPELLFIX_COL_WORD 0
2067 #define SPELLFIX_COL_RANK 1
2068 #define SPELLFIX_COL_DISTANCE 2
2069 #define SPELLFIX_COL_LANGID 3
2070 #define SPELLFIX_COL_SCORE 4
2071 #define SPELLFIX_COL_MATCHLEN 5
2072 #define SPELLFIX_COL_PHONEHASH 6
2073 #define SPELLFIX_COL_TOP 7
2074 #define SPELLFIX_COL_SCOPE 8
2075 #define SPELLFIX_COL_SRCHCNT 9
2076 #define SPELLFIX_COL_SOUNDSLIKE 10
2077 #define SPELLFIX_COL_COMMAND 11
2079 if( rc==SQLITE_OK && isCreate ){
2080 spellfix1DbExec(&rc, db,
2081 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
2082 " id INTEGER PRIMARY KEY,\n"
2083 " rank INT,\n"
2084 " langid INT,\n"
2085 " word TEXT,\n"
2086 " k1 TEXT,\n"
2087 " k2 TEXT\n"
2088 ");\n",
2089 zDbName, zTableName
2091 spellfix1DbExec(&rc, db,
2092 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
2093 "ON \"%w_vocab\"(langid,k2);",
2094 zDbName, zTableName, zTableName
2097 for(i=3; rc==SQLITE_OK && i<argc; i++){
2098 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
2099 pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
2100 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
2101 continue;
2103 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
2104 rc = SQLITE_ERROR;
2108 if( rc && pNew ){
2109 *ppVTab = 0;
2110 spellfix1Uninit(0, &pNew->base);
2111 }else{
2112 *ppVTab = (sqlite3_vtab *)pNew;
2114 return rc;
2118 ** The xConnect and xCreate methods
2120 static int spellfix1Connect(
2121 sqlite3 *db,
2122 void *pAux,
2123 int argc, const char *const*argv,
2124 sqlite3_vtab **ppVTab,
2125 char **pzErr
2127 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
2129 static int spellfix1Create(
2130 sqlite3 *db,
2131 void *pAux,
2132 int argc, const char *const*argv,
2133 sqlite3_vtab **ppVTab,
2134 char **pzErr
2136 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
2140 ** Clear all of the content from a cursor.
2142 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2143 int i;
2144 for(i=0; i<pCur->nRow; i++){
2145 sqlite3_free(pCur->a[i].zWord);
2147 pCur->nRow = 0;
2148 pCur->iRow = 0;
2149 pCur->nSearch = 0;
2150 if( pCur->pFullScan ){
2151 sqlite3_finalize(pCur->pFullScan);
2152 pCur->pFullScan = 0;
2157 ** Resize the cursor to hold up to N rows of content
2159 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2160 struct spellfix1_row *aNew;
2161 assert( N>=pCur->nRow );
2162 aNew = sqlite3_realloc64(pCur->a, sizeof(pCur->a[0])*N);
2163 if( aNew==0 && N>0 ){
2164 spellfix1ResetCursor(pCur);
2165 sqlite3_free(pCur->a);
2166 pCur->nAlloc = 0;
2167 pCur->a = 0;
2168 }else{
2169 pCur->nAlloc = N;
2170 pCur->a = aNew;
2176 ** Close a fuzzy-search cursor.
2178 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2179 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2180 spellfix1ResetCursor(pCur);
2181 spellfix1ResizeCursor(pCur, 0);
2182 sqlite3_free(pCur->zPattern);
2183 sqlite3_free(pCur);
2184 return SQLITE_OK;
2187 #define SPELLFIX_IDXNUM_MATCH 0x01 /* word MATCH $str */
2188 #define SPELLFIX_IDXNUM_LANGID 0x02 /* langid == $langid */
2189 #define SPELLFIX_IDXNUM_TOP 0x04 /* top = $top */
2190 #define SPELLFIX_IDXNUM_SCOPE 0x08 /* scope = $scope */
2191 #define SPELLFIX_IDXNUM_DISTLT 0x10 /* distance < $distance */
2192 #define SPELLFIX_IDXNUM_DISTLE 0x20 /* distance <= $distance */
2193 #define SPELLFIX_IDXNUM_ROWID 0x40 /* rowid = $rowid */
2194 #define SPELLFIX_IDXNUM_DIST (0x10|0x20) /* DISTLT and DISTLE */
2198 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2199 ** above.
2201 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2202 ** if specified and in that order.
2204 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2205 int iPlan = 0;
2206 int iLangTerm = -1;
2207 int iTopTerm = -1;
2208 int iScopeTerm = -1;
2209 int iDistTerm = -1;
2210 int iRowidTerm = -1;
2211 int i;
2212 const struct sqlite3_index_constraint *pConstraint;
2213 pConstraint = pIdxInfo->aConstraint;
2214 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2215 if( pConstraint->usable==0 ) continue;
2217 /* Terms of the form: word MATCH $str */
2218 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2219 && pConstraint->iColumn==SPELLFIX_COL_WORD
2220 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2222 iPlan |= SPELLFIX_IDXNUM_MATCH;
2223 pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2224 pIdxInfo->aConstraintUsage[i].omit = 1;
2227 /* Terms of the form: langid = $langid */
2228 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2229 && pConstraint->iColumn==SPELLFIX_COL_LANGID
2230 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2232 iPlan |= SPELLFIX_IDXNUM_LANGID;
2233 iLangTerm = i;
2236 /* Terms of the form: top = $top */
2237 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2238 && pConstraint->iColumn==SPELLFIX_COL_TOP
2239 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2241 iPlan |= SPELLFIX_IDXNUM_TOP;
2242 iTopTerm = i;
2245 /* Terms of the form: scope = $scope */
2246 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2247 && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2248 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2250 iPlan |= SPELLFIX_IDXNUM_SCOPE;
2251 iScopeTerm = i;
2254 /* Terms of the form: distance < $dist or distance <= $dist */
2255 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2256 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2257 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2258 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2260 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){
2261 iPlan |= SPELLFIX_IDXNUM_DISTLT;
2262 }else{
2263 iPlan |= SPELLFIX_IDXNUM_DISTLE;
2265 iDistTerm = i;
2268 /* Terms of the form: distance < $dist or distance <= $dist */
2269 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2270 && pConstraint->iColumn<0
2271 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2273 iPlan |= SPELLFIX_IDXNUM_ROWID;
2274 iRowidTerm = i;
2277 if( iPlan&SPELLFIX_IDXNUM_MATCH ){
2278 int idx = 2;
2279 pIdxInfo->idxNum = iPlan;
2280 if( pIdxInfo->nOrderBy==1
2281 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2282 && pIdxInfo->aOrderBy[0].desc==0
2284 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
2286 if( iPlan&SPELLFIX_IDXNUM_LANGID ){
2287 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2288 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2290 if( iPlan&SPELLFIX_IDXNUM_TOP ){
2291 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2292 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2294 if( iPlan&SPELLFIX_IDXNUM_SCOPE ){
2295 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2296 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2298 if( iPlan&SPELLFIX_IDXNUM_DIST ){
2299 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2300 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2302 pIdxInfo->estimatedCost = 1e5;
2303 }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){
2304 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2305 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2306 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2307 pIdxInfo->estimatedCost = 5;
2308 }else{
2309 pIdxInfo->idxNum = 0;
2310 pIdxInfo->estimatedCost = 1e50;
2312 return SQLITE_OK;
2316 ** Open a new fuzzy-search cursor.
2318 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2319 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2320 spellfix1_cursor *pCur;
2321 pCur = sqlite3_malloc64( sizeof(*pCur) );
2322 if( pCur==0 ) return SQLITE_NOMEM;
2323 memset(pCur, 0, sizeof(*pCur));
2324 pCur->pVTab = p;
2325 *ppCursor = &pCur->base;
2326 return SQLITE_OK;
2330 ** Adjust a distance measurement by the words rank in order to show
2331 ** preference to common words.
2333 static int spellfix1Score(int iDistance, int iRank){
2334 int iLog2;
2335 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2336 return iDistance + 32 - iLog2;
2340 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2341 ** that they sort in order of increasing distance.
2343 static int SQLITE_CDECL spellfix1RowCompare(const void *A, const void *B){
2344 const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2345 const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2346 return a->iScore - b->iScore;
2350 ** A structure used to pass information from spellfix1FilterForMatch()
2351 ** into spellfix1RunQuery().
2353 typedef struct MatchQuery {
2354 spellfix1_cursor *pCur; /* The cursor being queried */
2355 sqlite3_stmt *pStmt; /* shadow table query statment */
2356 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */
2357 const char *zPattern; /* Transliterated input string */
2358 int nPattern; /* Length of zPattern */
2359 EditDist3FromString *pMatchStr3; /* Original unicode string */
2360 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */
2361 const EditDist3Lang *pLang; /* The selected language coefficients */
2362 int iLang; /* The language id */
2363 int iScope; /* Default scope */
2364 int iMaxDist; /* Maximum allowed edit distance, or -1 */
2365 int rc; /* Error code */
2366 int nRun; /* Number of prior runs for the same zPattern */
2367 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */
2368 } MatchQuery;
2371 ** Run a query looking for the best matches against zPattern using
2372 ** zHash as the character class seed hash.
2374 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2375 const char *zK1;
2376 const char *zWord;
2377 int iDist;
2378 int iRank;
2379 int iScore;
2380 int iWorst = 0;
2381 int idx;
2382 int idxWorst = -1;
2383 int i;
2384 int iScope = p->iScope;
2385 spellfix1_cursor *pCur = p->pCur;
2386 sqlite3_stmt *pStmt = p->pStmt;
2387 char zHash1[SPELLFIX_MX_HASH];
2388 char zHash2[SPELLFIX_MX_HASH];
2389 char *zClass;
2390 int nClass;
2391 int rc;
2393 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */
2394 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2395 if( zClass==0 ){
2396 p->rc = SQLITE_NOMEM;
2397 return;
2399 nClass = (int)strlen(zClass);
2400 if( nClass>SPELLFIX_MX_HASH-2 ){
2401 nClass = SPELLFIX_MX_HASH-2;
2402 zClass[nClass] = 0;
2404 if( nClass<=iScope ){
2405 if( nClass>2 ){
2406 iScope = nClass-1;
2407 }else{
2408 iScope = nClass;
2411 memcpy(zHash1, zClass, iScope);
2412 sqlite3_free(zClass);
2413 zHash1[iScope] = 0;
2414 memcpy(zHash2, zHash1, iScope);
2415 zHash2[iScope] = 'Z';
2416 zHash2[iScope+1] = 0;
2417 #if SPELLFIX_MX_RUN>1
2418 for(i=0; i<p->nRun; i++){
2419 if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2421 #endif
2422 assert( p->nRun<SPELLFIX_MX_RUN );
2423 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2424 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2425 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2427 p->rc = SQLITE_NOMEM;
2428 return;
2430 #if SPELLFIX_MX_RUN>1
2431 for(i=0; i<pCur->nRow; i++){
2432 if( pCur->a[i].iScore>iWorst ){
2433 iWorst = pCur->a[i].iScore;
2434 idxWorst = i;
2437 #endif
2438 while( sqlite3_step(pStmt)==SQLITE_ROW ){
2439 int iMatchlen = -1;
2440 iRank = sqlite3_column_int(pStmt, 2);
2441 if( p->pMatchStr3 ){
2442 int nWord = sqlite3_column_bytes(pStmt, 1);
2443 zWord = (const char*)sqlite3_column_text(pStmt, 1);
2444 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2445 }else{
2446 zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2447 if( zK1==0 ) continue;
2448 iDist = editdist1(p->zPattern, zK1, 0);
2450 if( iDist<0 ){
2451 p->rc = SQLITE_NOMEM;
2452 break;
2454 pCur->nSearch++;
2456 /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2457 ** check if this row meets it. If not, jump back up to the top of the
2458 ** loop to process the next row. Otherwise, if the row does match the
2459 ** distance constraint, check if the pCur->a[] array is already full.
2460 ** If it is and no explicit "top = ?" constraint was present in the
2461 ** query, grow the array to ensure there is room for the new entry. */
2462 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2463 if( p->iMaxDist>=0 ){
2464 if( iDist>p->iMaxDist ) continue;
2465 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){
2466 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2467 if( pCur->a==0 ) break;
2471 iScore = spellfix1Score(iDist,iRank);
2472 if( pCur->nRow<pCur->nAlloc ){
2473 idx = pCur->nRow;
2474 }else if( iScore<iWorst ){
2475 idx = idxWorst;
2476 sqlite3_free(pCur->a[idx].zWord);
2477 }else{
2478 continue;
2481 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2482 if( pCur->a[idx].zWord==0 ){
2483 p->rc = SQLITE_NOMEM;
2484 break;
2486 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2487 pCur->a[idx].iRank = iRank;
2488 pCur->a[idx].iDistance = iDist;
2489 pCur->a[idx].iScore = iScore;
2490 pCur->a[idx].iMatchlen = iMatchlen;
2491 memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2492 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2493 if( pCur->nRow==pCur->nAlloc ){
2494 iWorst = pCur->a[0].iScore;
2495 idxWorst = 0;
2496 for(i=1; i<pCur->nRow; i++){
2497 iScore = pCur->a[i].iScore;
2498 if( iWorst<iScore ){
2499 iWorst = iScore;
2500 idxWorst = i;
2505 rc = sqlite3_reset(pStmt);
2506 if( rc ) p->rc = rc;
2510 ** This version of the xFilter method work if the MATCH term is present
2511 ** and we are doing a scan.
2513 static int spellfix1FilterForMatch(
2514 spellfix1_cursor *pCur,
2515 int argc,
2516 sqlite3_value **argv
2518 int idxNum = pCur->idxNum;
2519 const unsigned char *zMatchThis; /* RHS of the MATCH operator */
2520 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2521 char *zPattern; /* Transliteration of zMatchThis */
2522 int nPattern; /* Length of zPattern */
2523 int iLimit = 20; /* Max number of rows of output */
2524 int iScope = 3; /* Use this many characters of zClass */
2525 int iLang = 0; /* Language code */
2526 char *zSql; /* SQL of shadow table query */
2527 sqlite3_stmt *pStmt = 0; /* Shadow table query */
2528 int rc; /* Result code */
2529 int idx = 1; /* Next available filter parameter */
2530 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */
2531 MatchQuery x; /* For passing info to RunQuery() */
2533 /* Load the cost table if we have not already done so */
2534 if( p->zCostTable!=0 && p->pConfig3==0 ){
2535 p->pConfig3 = sqlite3_malloc64( sizeof(p->pConfig3[0]) );
2536 if( p->pConfig3==0 ) return SQLITE_NOMEM;
2537 memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2538 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2539 if( rc ) return rc;
2541 memset(&x, 0, sizeof(x));
2542 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */
2543 x.iMaxDist = -1; /* Maximum allowed edit distance */
2545 if( idxNum&2 ){
2546 iLang = sqlite3_value_int(argv[idx++]);
2548 if( idxNum&4 ){
2549 iLimit = sqlite3_value_int(argv[idx++]);
2550 if( iLimit<1 ) iLimit = 1;
2552 if( idxNum&8 ){
2553 x.iScope = sqlite3_value_int(argv[idx++]);
2554 if( x.iScope<1 ) x.iScope = 1;
2555 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2557 if( idxNum&(16|32) ){
2558 x.iMaxDist = sqlite3_value_int(argv[idx++]);
2559 if( idxNum&16 ) x.iMaxDist--;
2560 if( x.iMaxDist<0 ) x.iMaxDist = 0;
2562 spellfix1ResetCursor(pCur);
2563 spellfix1ResizeCursor(pCur, iLimit);
2564 zMatchThis = sqlite3_value_text(argv[0]);
2565 if( zMatchThis==0 ) return SQLITE_OK;
2566 if( p->pConfig3 ){
2567 x.pLang = editDist3FindLang(p->pConfig3, iLang);
2568 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2569 if( pMatchStr3==0 ){
2570 x.rc = SQLITE_NOMEM;
2571 goto filter_exit;
2573 }else{
2574 x.pLang = 0;
2576 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2577 sqlite3_free(pCur->zPattern);
2578 pCur->zPattern = zPattern;
2579 if( zPattern==0 ){
2580 x.rc = SQLITE_NOMEM;
2581 goto filter_exit;
2583 nPattern = (int)strlen(zPattern);
2584 if( zPattern[nPattern-1]=='*' ) nPattern--;
2585 zSql = sqlite3_mprintf(
2586 "SELECT id, word, rank, coalesce(k1,word)"
2587 " FROM \"%w\".\"%w_vocab\""
2588 " WHERE langid=%d AND k2>=?1 AND k2<?2",
2589 p->zDbName, p->zTableName, iLang
2591 if( zSql==0 ){
2592 x.rc = SQLITE_NOMEM;
2593 pStmt = 0;
2594 goto filter_exit;
2596 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2597 sqlite3_free(zSql);
2598 pCur->iLang = iLang;
2599 x.pCur = pCur;
2600 x.pStmt = pStmt;
2601 x.zPattern = zPattern;
2602 x.nPattern = nPattern;
2603 x.pMatchStr3 = pMatchStr3;
2604 x.iLang = iLang;
2605 x.rc = rc;
2606 x.pConfig3 = p->pConfig3;
2607 if( x.rc==SQLITE_OK ){
2608 spellfix1RunQuery(&x, zPattern, nPattern);
2611 if( pCur->a ){
2612 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2613 pCur->iTop = iLimit;
2614 pCur->iScope = iScope;
2615 }else{
2616 x.rc = SQLITE_NOMEM;
2619 filter_exit:
2620 sqlite3_finalize(pStmt);
2621 editDist3FromStringDelete(pMatchStr3);
2622 return x.rc;
2626 ** This version of xFilter handles a full-table scan case
2628 static int spellfix1FilterForFullScan(
2629 spellfix1_cursor *pCur,
2630 int argc,
2631 sqlite3_value **argv
2633 int rc = SQLITE_OK;
2634 int idxNum = pCur->idxNum;
2635 char *zSql;
2636 spellfix1_vtab *pVTab = pCur->pVTab;
2637 spellfix1ResetCursor(pCur);
2638 assert( idxNum==0 || idxNum==64 );
2639 zSql = sqlite3_mprintf(
2640 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2641 pVTab->zDbName, pVTab->zTableName,
2642 ((idxNum & 64) ? " WHERE rowid=?" : "")
2644 if( zSql==0 ) return SQLITE_NOMEM;
2645 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2646 sqlite3_free(zSql);
2647 if( rc==SQLITE_OK && (idxNum & 64) ){
2648 assert( argc==1 );
2649 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2651 pCur->nRow = pCur->iRow = 0;
2652 if( rc==SQLITE_OK ){
2653 rc = sqlite3_step(pCur->pFullScan);
2654 if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2655 if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2656 }else{
2657 pCur->iRow = 0;
2659 return rc;
2664 ** Called to "rewind" a cursor back to the beginning so that
2665 ** it starts its output over again. Always called at least once
2666 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2668 static int spellfix1Filter(
2669 sqlite3_vtab_cursor *cur,
2670 int idxNum, const char *idxStr,
2671 int argc, sqlite3_value **argv
2673 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2674 int rc;
2675 pCur->idxNum = idxNum;
2676 if( idxNum & 1 ){
2677 rc = spellfix1FilterForMatch(pCur, argc, argv);
2678 }else{
2679 rc = spellfix1FilterForFullScan(pCur, argc, argv);
2681 return rc;
2686 ** Advance a cursor to its next row of output
2688 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2689 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2690 int rc = SQLITE_OK;
2691 if( pCur->iRow < pCur->nRow ){
2692 if( pCur->pFullScan ){
2693 rc = sqlite3_step(pCur->pFullScan);
2694 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2695 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2696 }else{
2697 pCur->iRow++;
2700 return rc;
2704 ** Return TRUE if we are at the end-of-file
2706 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2707 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2708 return pCur->iRow>=pCur->nRow;
2712 ** Return columns from the current row.
2714 static int spellfix1Column(
2715 sqlite3_vtab_cursor *cur,
2716 sqlite3_context *ctx,
2717 int i
2719 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2720 if( pCur->pFullScan ){
2721 if( i<=SPELLFIX_COL_LANGID ){
2722 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2723 }else{
2724 sqlite3_result_null(ctx);
2726 return SQLITE_OK;
2728 switch( i ){
2729 case SPELLFIX_COL_WORD: {
2730 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2731 break;
2733 case SPELLFIX_COL_RANK: {
2734 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2735 break;
2737 case SPELLFIX_COL_DISTANCE: {
2738 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2739 break;
2741 case SPELLFIX_COL_LANGID: {
2742 sqlite3_result_int(ctx, pCur->iLang);
2743 break;
2745 case SPELLFIX_COL_SCORE: {
2746 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2747 break;
2749 case SPELLFIX_COL_MATCHLEN: {
2750 int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2751 if( iMatchlen<0 ){
2752 int nPattern = (int)strlen(pCur->zPattern);
2753 char *zWord = pCur->a[pCur->iRow].zWord;
2754 int nWord = (int)strlen(zWord);
2756 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2757 char *zTranslit;
2758 int res;
2759 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2760 if( !zTranslit ) return SQLITE_NOMEM;
2761 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2762 sqlite3_free(zTranslit);
2763 if( res<0 ) return SQLITE_NOMEM;
2764 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2765 }else{
2766 iMatchlen = utf8Charlen(zWord, nWord);
2770 sqlite3_result_int(ctx, iMatchlen);
2771 break;
2773 case SPELLFIX_COL_PHONEHASH: {
2774 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2775 break;
2777 case SPELLFIX_COL_TOP: {
2778 sqlite3_result_int(ctx, pCur->iTop);
2779 break;
2781 case SPELLFIX_COL_SCOPE: {
2782 sqlite3_result_int(ctx, pCur->iScope);
2783 break;
2785 case SPELLFIX_COL_SRCHCNT: {
2786 sqlite3_result_int(ctx, pCur->nSearch);
2787 break;
2789 default: {
2790 sqlite3_result_null(ctx);
2791 break;
2794 return SQLITE_OK;
2798 ** The rowid.
2800 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2801 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2802 if( pCur->pFullScan ){
2803 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2804 }else{
2805 *pRowid = pCur->a[pCur->iRow].iRowid;
2807 return SQLITE_OK;
2811 ** This function is called by the xUpdate() method. It returns a string
2812 ** containing the conflict mode that xUpdate() should use for the current
2813 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2815 static const char *spellfix1GetConflict(sqlite3 *db){
2816 static const char *azConflict[] = {
2817 /* Note: Instead of "FAIL" - "ABORT". */
2818 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2820 int eConflict = sqlite3_vtab_on_conflict(db);
2822 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2823 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2824 || eConflict==SQLITE_REPLACE
2826 assert( SQLITE_ROLLBACK==1 );
2827 assert( SQLITE_IGNORE==2 );
2828 assert( SQLITE_FAIL==3 );
2829 assert( SQLITE_ABORT==4 );
2830 assert( SQLITE_REPLACE==5 );
2832 return azConflict[eConflict-1];
2836 ** The xUpdate() method.
2838 static int spellfix1Update(
2839 sqlite3_vtab *pVTab,
2840 int argc,
2841 sqlite3_value **argv,
2842 sqlite_int64 *pRowid
2844 int rc = SQLITE_OK;
2845 sqlite3_int64 rowid, newRowid;
2846 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2847 sqlite3 *db = p->db;
2849 if( argc==1 ){
2850 /* A delete operation on the rowid given by argv[0] */
2851 rowid = *pRowid = sqlite3_value_int64(argv[0]);
2852 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2853 " WHERE id=%lld",
2854 p->zDbName, p->zTableName, rowid);
2855 }else{
2856 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2857 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2858 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2859 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2860 const unsigned char *zSoundslike =
2861 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2862 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2863 char *zK1, *zK2;
2864 int i;
2865 char c;
2866 const char *zConflict = spellfix1GetConflict(db);
2868 if( zWord==0 ){
2869 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy');
2870 ** cause zWord to be NULL, so we look at the "command" column to see
2871 ** what special actions to take */
2872 const char *zCmd =
2873 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2874 if( zCmd==0 ){
2875 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2876 p->zTableName);
2877 return SQLITE_CONSTRAINT_NOTNULL;
2879 if( strcmp(zCmd,"reset")==0 ){
2880 /* Reset the edit cost table (if there is one). */
2881 editDist3ConfigDelete(p->pConfig3);
2882 p->pConfig3 = 0;
2883 return SQLITE_OK;
2885 if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2886 editDist3ConfigDelete(p->pConfig3);
2887 p->pConfig3 = 0;
2888 sqlite3_free(p->zCostTable);
2889 p->zCostTable = spellfix1Dequote(zCmd+16);
2890 if( p->zCostTable==0 ) return SQLITE_NOMEM;
2891 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2892 sqlite3_free(p->zCostTable);
2893 p->zCostTable = 0;
2895 return SQLITE_OK;
2897 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2898 p->zTableName, zCmd);
2899 return SQLITE_ERROR;
2901 if( iRank<1 ) iRank = 1;
2902 if( zSoundslike ){
2903 zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2904 }else{
2905 zK1 = (char*)transliterate(zWord, nWord);
2907 if( zK1==0 ) return SQLITE_NOMEM;
2908 for(i=0; (c = zK1[i])!=0; i++){
2909 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2911 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2912 if( zK2==0 ){
2913 sqlite3_free(zK1);
2914 return SQLITE_NOMEM;
2916 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2917 if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2918 spellfix1DbExec(&rc, db,
2919 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2920 "VALUES(%d,%d,%Q,nullif(%Q,%Q),%Q)",
2921 p->zDbName, p->zTableName,
2922 iRank, iLang, zWord, zK1, zWord, zK2
2924 }else{
2925 newRowid = sqlite3_value_int64(argv[1]);
2926 spellfix1DbExec(&rc, db,
2927 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2928 "VALUES(%lld,%d,%d,%Q,nullif(%Q,%Q),%Q)",
2929 zConflict, p->zDbName, p->zTableName,
2930 newRowid, iRank, iLang, zWord, zK1, zWord, zK2
2933 *pRowid = sqlite3_last_insert_rowid(db);
2934 }else{
2935 rowid = sqlite3_value_int64(argv[0]);
2936 newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2937 spellfix1DbExec(&rc, db,
2938 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2939 " word=%Q, k1=nullif(%Q,%Q), k2=%Q WHERE id=%lld",
2940 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2941 zWord, zK1, zWord, zK2, rowid
2944 sqlite3_free(zK1);
2945 sqlite3_free(zK2);
2947 return rc;
2951 ** Rename the spellfix1 table.
2953 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2954 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2955 sqlite3 *db = p->db;
2956 int rc = SQLITE_OK;
2957 char *zNewName = sqlite3_mprintf("%s", zNew);
2958 if( zNewName==0 ){
2959 return SQLITE_NOMEM;
2961 spellfix1DbExec(&rc, db,
2962 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2963 p->zDbName, p->zTableName, zNewName
2965 if( rc==SQLITE_OK ){
2966 sqlite3_free(p->zTableName);
2967 p->zTableName = zNewName;
2968 }else{
2969 sqlite3_free(zNewName);
2971 return rc;
2976 ** A virtual table module that provides fuzzy search.
2978 static sqlite3_module spellfix1Module = {
2979 0, /* iVersion */
2980 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */
2981 spellfix1Connect, /* xConnect - reconnected to an existing table */
2982 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */
2983 spellfix1Disconnect, /* xDisconnect - close a connection */
2984 spellfix1Destroy, /* xDestroy - handle DROP TABLE */
2985 spellfix1Open, /* xOpen - open a cursor */
2986 spellfix1Close, /* xClose - close a cursor */
2987 spellfix1Filter, /* xFilter - configure scan constraints */
2988 spellfix1Next, /* xNext - advance a cursor */
2989 spellfix1Eof, /* xEof - check for end of scan */
2990 spellfix1Column, /* xColumn - read data */
2991 spellfix1Rowid, /* xRowid - read data */
2992 spellfix1Update, /* xUpdate */
2993 0, /* xBegin */
2994 0, /* xSync */
2995 0, /* xCommit */
2996 0, /* xRollback */
2997 0, /* xFindMethod */
2998 spellfix1Rename, /* xRename */
3002 ** Register the various functions and the virtual table.
3004 static int spellfix1Register(sqlite3 *db){
3005 int rc = SQLITE_OK;
3006 int i;
3007 rc = sqlite3_create_function(db, "spellfix1_translit", 1,
3008 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3009 transliterateSqlFunc, 0, 0);
3010 if( rc==SQLITE_OK ){
3011 rc = sqlite3_create_function(db, "spellfix1_editdist", 2,
3012 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3013 editdistSqlFunc, 0, 0);
3015 if( rc==SQLITE_OK ){
3016 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1,
3017 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3018 phoneticHashSqlFunc, 0, 0);
3020 if( rc==SQLITE_OK ){
3021 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1,
3022 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3023 scriptCodeSqlFunc, 0, 0);
3025 if( rc==SQLITE_OK ){
3026 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
3028 if( rc==SQLITE_OK ){
3029 rc = editDist3Install(db);
3032 /* Verify sanity of the translit[] table */
3033 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
3034 assert( translit[i].cFrom<translit[i+1].cFrom );
3037 return rc;
3040 #endif /* SQLITE_OMIT_VIRTUALTABLE */
3043 ** Extension load function.
3045 #ifdef _WIN32
3046 __declspec(dllexport)
3047 #endif
3048 int sqlite3_spellfix_init(
3049 sqlite3 *db,
3050 char **pzErrMsg,
3051 const sqlite3_api_routines *pApi
3053 SQLITE_EXTENSION_INIT2(pApi);
3054 #ifndef SQLITE_OMIT_VIRTUALTABLE
3055 return spellfix1Register(db);
3056 #endif
3057 return SQLITE_OK;