Enhance the command-line completion extension to return the names of
[sqlite.git] / ext / misc / spellfix.c
blob8651bb7a9d3ac5c2baf5e008e0b9b5dd94b15805
1 /*
2 ** 2012 April 10
3 **
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
6 **
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
13 ** This module implements the spellfix1 VIRTUAL TABLE that can be used
14 ** to search a large vocabulary for close matches. See separate
15 ** documentation (http://www.sqlite.org/spellfix1.html) for details.
17 #include "sqlite3ext.h"
18 SQLITE_EXTENSION_INIT1
20 #ifndef SQLITE_AMALGAMATION
21 # if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
22 # define NDEBUG 1
23 # endif
24 # if defined(NDEBUG) && defined(SQLITE_DEBUG)
25 # undef NDEBUG
26 # endif
27 # include <string.h>
28 # include <stdio.h>
29 # include <stdlib.h>
30 # include <assert.h>
31 # define ALWAYS(X) 1
32 # define NEVER(X) 0
33 typedef unsigned char u8;
34 typedef unsigned short u16;
35 #endif
36 #include <ctype.h>
38 #ifndef SQLITE_OMIT_VIRTUALTABLE
41 ** Character classes for ASCII characters:
43 ** 0 '' Silent letters: H W
44 ** 1 'A' Any vowel: A E I O U (Y)
45 ** 2 'B' A bilabeal stop or fricative: B F P V W
46 ** 3 'C' Other fricatives or back stops: C G J K Q S X Z
47 ** 4 'D' Alveolar stops: D T
48 ** 5 'H' Letter H at the beginning of a word
49 ** 6 'L' Glide: L
50 ** 7 'R' Semivowel: R
51 ** 8 'M' Nasals: M N
52 ** 9 'Y' Letter Y at the beginning of a word.
53 ** 10 '9' Digits: 0 1 2 3 4 5 6 7 8 9
54 ** 11 ' ' White space
55 ** 12 '?' Other.
57 #define CCLASS_SILENT 0
58 #define CCLASS_VOWEL 1
59 #define CCLASS_B 2
60 #define CCLASS_C 3
61 #define CCLASS_D 4
62 #define CCLASS_H 5
63 #define CCLASS_L 6
64 #define CCLASS_R 7
65 #define CCLASS_M 8
66 #define CCLASS_Y 9
67 #define CCLASS_DIGIT 10
68 #define CCLASS_SPACE 11
69 #define CCLASS_OTHER 12
72 ** The following table gives the character class for non-initial ASCII
73 ** characters.
75 static const unsigned char midClass[] = {
76 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
77 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
78 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
79 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
80 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
81 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
82 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
83 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
84 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
85 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
86 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
87 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
88 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
89 /* ' */ CCLASS_SILENT, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
90 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
91 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
92 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
93 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
94 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
95 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
96 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
97 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
98 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
99 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
100 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
101 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
102 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
103 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
104 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
105 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_VOWEL,
106 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
107 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
108 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
109 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
110 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
111 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
112 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
113 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
114 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
115 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
116 /* x */ CCLASS_C, /* y */ CCLASS_VOWEL, /* z */ CCLASS_C,
117 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
118 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
121 ** This tables gives the character class for ASCII characters that form the
122 ** initial character of a word. The only difference from midClass is with
123 ** the letters H, W, and Y.
125 static const unsigned char initClass[] = {
126 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
127 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
128 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
129 /* */ CCLASS_SPACE, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
130 /* */ CCLASS_SPACE, /* */ CCLASS_SPACE, /* */ CCLASS_OTHER,
131 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
132 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
133 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
134 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
135 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_OTHER,
136 /* */ CCLASS_OTHER, /* */ CCLASS_OTHER, /* */ CCLASS_SPACE,
137 /* ! */ CCLASS_OTHER, /* " */ CCLASS_OTHER, /* # */ CCLASS_OTHER,
138 /* $ */ CCLASS_OTHER, /* % */ CCLASS_OTHER, /* & */ CCLASS_OTHER,
139 /* ' */ CCLASS_OTHER, /* ( */ CCLASS_OTHER, /* ) */ CCLASS_OTHER,
140 /* * */ CCLASS_OTHER, /* + */ CCLASS_OTHER, /* , */ CCLASS_OTHER,
141 /* - */ CCLASS_OTHER, /* . */ CCLASS_OTHER, /* / */ CCLASS_OTHER,
142 /* 0 */ CCLASS_DIGIT, /* 1 */ CCLASS_DIGIT, /* 2 */ CCLASS_DIGIT,
143 /* 3 */ CCLASS_DIGIT, /* 4 */ CCLASS_DIGIT, /* 5 */ CCLASS_DIGIT,
144 /* 6 */ CCLASS_DIGIT, /* 7 */ CCLASS_DIGIT, /* 8 */ CCLASS_DIGIT,
145 /* 9 */ CCLASS_DIGIT, /* : */ CCLASS_OTHER, /* ; */ CCLASS_OTHER,
146 /* < */ CCLASS_OTHER, /* = */ CCLASS_OTHER, /* > */ CCLASS_OTHER,
147 /* ? */ CCLASS_OTHER, /* @ */ CCLASS_OTHER, /* A */ CCLASS_VOWEL,
148 /* B */ CCLASS_B, /* C */ CCLASS_C, /* D */ CCLASS_D,
149 /* E */ CCLASS_VOWEL, /* F */ CCLASS_B, /* G */ CCLASS_C,
150 /* H */ CCLASS_SILENT, /* I */ CCLASS_VOWEL, /* J */ CCLASS_C,
151 /* K */ CCLASS_C, /* L */ CCLASS_L, /* M */ CCLASS_M,
152 /* N */ CCLASS_M, /* O */ CCLASS_VOWEL, /* P */ CCLASS_B,
153 /* Q */ CCLASS_C, /* R */ CCLASS_R, /* S */ CCLASS_C,
154 /* T */ CCLASS_D, /* U */ CCLASS_VOWEL, /* V */ CCLASS_B,
155 /* W */ CCLASS_B, /* X */ CCLASS_C, /* Y */ CCLASS_Y,
156 /* Z */ CCLASS_C, /* [ */ CCLASS_OTHER, /* \ */ CCLASS_OTHER,
157 /* ] */ CCLASS_OTHER, /* ^ */ CCLASS_OTHER, /* _ */ CCLASS_OTHER,
158 /* ` */ CCLASS_OTHER, /* a */ CCLASS_VOWEL, /* b */ CCLASS_B,
159 /* c */ CCLASS_C, /* d */ CCLASS_D, /* e */ CCLASS_VOWEL,
160 /* f */ CCLASS_B, /* g */ CCLASS_C, /* h */ CCLASS_SILENT,
161 /* i */ CCLASS_VOWEL, /* j */ CCLASS_C, /* k */ CCLASS_C,
162 /* l */ CCLASS_L, /* m */ CCLASS_M, /* n */ CCLASS_M,
163 /* o */ CCLASS_VOWEL, /* p */ CCLASS_B, /* q */ CCLASS_C,
164 /* r */ CCLASS_R, /* s */ CCLASS_C, /* t */ CCLASS_D,
165 /* u */ CCLASS_VOWEL, /* v */ CCLASS_B, /* w */ CCLASS_B,
166 /* x */ CCLASS_C, /* y */ CCLASS_Y, /* z */ CCLASS_C,
167 /* { */ CCLASS_OTHER, /* | */ CCLASS_OTHER, /* } */ CCLASS_OTHER,
168 /* ~ */ CCLASS_OTHER, /* */ CCLASS_OTHER,
172 ** Mapping from the character class number (0-13) to a symbol for each
173 ** character class. Note that initClass[] can be used to map the class
174 ** symbol back into the class number.
176 static const unsigned char className[] = ".ABCDHLRMY9 ?";
179 ** Generate a "phonetic hash" from a string of ASCII characters
180 ** in zIn[0..nIn-1].
182 ** * Map characters by character class as defined above.
183 ** * Omit double-letters
184 ** * Omit vowels beside R and L
185 ** * Omit T when followed by CH
186 ** * Omit W when followed by R
187 ** * Omit D when followed by J or G
188 ** * Omit K in KN or G in GN at the beginning of a word
190 ** Space to hold the result is obtained from sqlite3_malloc()
192 ** Return NULL if memory allocation fails.
194 static unsigned char *phoneticHash(const unsigned char *zIn, int nIn){
195 unsigned char *zOut = sqlite3_malloc64( nIn + 1 );
196 int i;
197 int nOut = 0;
198 char cPrev = 0x77;
199 char cPrevX = 0x77;
200 const unsigned char *aClass = initClass;
202 if( zOut==0 ) return 0;
203 if( nIn>2 ){
204 switch( zIn[0] ){
205 case 'g':
206 case 'k': {
207 if( zIn[1]=='n' ){ zIn++; nIn--; }
208 break;
212 for(i=0; i<nIn; i++){
213 unsigned char c = zIn[i];
214 if( i+1<nIn ){
215 if( c=='w' && zIn[i+1]=='r' ) continue;
216 if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
217 if( i+2<nIn ){
218 if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
221 c = aClass[c&0x7f];
222 if( c==CCLASS_SPACE ) continue;
223 if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue;
224 aClass = midClass;
225 if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){
226 continue; /* No vowels beside L or R */
228 if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){
229 nOut--; /* No vowels beside L or R */
231 cPrev = c;
232 if( c==CCLASS_SILENT ) continue;
233 cPrevX = c;
234 c = className[c];
235 assert( nOut>=0 );
236 if( nOut==0 || c!=zOut[nOut-1] ) zOut[nOut++] = c;
238 zOut[nOut] = 0;
239 return zOut;
243 ** This is an SQL function wrapper around phoneticHash(). See
244 ** the description of phoneticHash() for additional information.
246 static void phoneticHashSqlFunc(
247 sqlite3_context *context,
248 int argc,
249 sqlite3_value **argv
251 const unsigned char *zIn;
252 unsigned char *zOut;
254 zIn = sqlite3_value_text(argv[0]);
255 if( zIn==0 ) return;
256 zOut = phoneticHash(zIn, sqlite3_value_bytes(argv[0]));
257 if( zOut==0 ){
258 sqlite3_result_error_nomem(context);
259 }else{
260 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
265 ** Return the character class number for a character given its
266 ** context.
268 static char characterClass(char cPrev, char c){
269 return cPrev==0 ? initClass[c&0x7f] : midClass[c&0x7f];
273 ** Return the cost of inserting or deleting character c immediately
274 ** following character cPrev. If cPrev==0, that means c is the first
275 ** character of the word.
277 static int insertOrDeleteCost(char cPrev, char c, char cNext){
278 char classC = characterClass(cPrev, c);
279 char classCprev;
281 if( classC==CCLASS_SILENT ){
282 /* Insert or delete "silent" characters such as H or W */
283 return 1;
285 if( cPrev==c ){
286 /* Repeated characters, or miss a repeat */
287 return 10;
289 if( classC==CCLASS_VOWEL && (cPrev=='r' || cNext=='r') ){
290 return 20; /* Insert a vowel before or after 'r' */
292 classCprev = characterClass(cPrev, cPrev);
293 if( classC==classCprev ){
294 if( classC==CCLASS_VOWEL ){
295 /* Remove or add a new vowel to a vowel cluster */
296 return 15;
297 }else{
298 /* Remove or add a consonant not in the same class */
299 return 50;
303 /* any other character insertion or deletion */
304 return 100;
308 ** Divide the insertion cost by this factor when appending to the
309 ** end of the word.
311 #define FINAL_INS_COST_DIV 4
314 ** Return the cost of substituting cTo in place of cFrom assuming
315 ** the previous character is cPrev. If cPrev==0 then cTo is the first
316 ** character of the word.
318 static int substituteCost(char cPrev, char cFrom, char cTo){
319 char classFrom, classTo;
320 if( cFrom==cTo ){
321 /* Exact match */
322 return 0;
324 if( cFrom==(cTo^0x20) && ((cTo>='A' && cTo<='Z') || (cTo>='a' && cTo<='z')) ){
325 /* differ only in case */
326 return 0;
328 classFrom = characterClass(cPrev, cFrom);
329 classTo = characterClass(cPrev, cTo);
330 if( classFrom==classTo ){
331 /* Same character class */
332 return 40;
334 if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
335 && classTo>=CCLASS_B && classTo<=CCLASS_Y ){
336 /* Convert from one consonant to another, but in a different class */
337 return 75;
339 /* Any other subsitution */
340 return 100;
344 ** Given two strings zA and zB which are pure ASCII, return the cost
345 ** of transforming zA into zB. If zA ends with '*' assume that it is
346 ** a prefix of zB and give only minimal penalty for extra characters
347 ** on the end of zB.
349 ** Smaller numbers mean a closer match.
351 ** Negative values indicate an error:
352 ** -1 One of the inputs is NULL
353 ** -2 Non-ASCII characters on input
354 ** -3 Unable to allocate memory
356 ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
357 ** of zB that matched the pattern in zA. If zA does not end with a '*',
358 ** then this value is always the number of bytes in zB (i.e. strlen(zB)).
359 ** If zA does end in a '*', then it is the number of bytes in the prefix
360 ** of zB that was deemed to match zA.
362 static int editdist1(const char *zA, const char *zB, int *pnMatch){
363 int nA, nB; /* Number of characters in zA[] and zB[] */
364 int xA, xB; /* Loop counters for zA[] and zB[] */
365 char cA = 0, cB; /* Current character of zA and zB */
366 char cAprev, cBprev; /* Previous character of zA and zB */
367 char cAnext, cBnext; /* Next character in zA and zB */
368 int d; /* North-west cost value */
369 int dc = 0; /* North-west character value */
370 int res; /* Final result */
371 int *m; /* The cost matrix */
372 char *cx; /* Corresponding character values */
373 int *toFree = 0; /* Malloced space */
374 int nMatch = 0;
375 int mStack[60+15]; /* Stack space to use if not too much is needed */
377 /* Early out if either input is NULL */
378 if( zA==0 || zB==0 ) return -1;
380 /* Skip any common prefix */
381 while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
382 if( pnMatch ) *pnMatch = nMatch;
383 if( zA[0]==0 && zB[0]==0 ) return 0;
385 #if 0
386 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
387 #endif
389 /* Verify input strings and measure their lengths */
390 for(nA=0; zA[nA]; nA++){
391 if( zA[nA]&0x80 ) return -2;
393 for(nB=0; zB[nB]; nB++){
394 if( zB[nB]&0x80 ) return -2;
397 /* Special processing if either string is empty */
398 if( nA==0 ){
399 cBprev = (char)dc;
400 for(xB=res=0; (cB = zB[xB])!=0; xB++){
401 res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
402 cBprev = cB;
404 return res;
406 if( nB==0 ){
407 cAprev = (char)dc;
408 for(xA=res=0; (cA = zA[xA])!=0; xA++){
409 res += insertOrDeleteCost(cAprev, cA, zA[xA+1]);
410 cAprev = cA;
412 return res;
415 /* A is a prefix of B */
416 if( zA[0]=='*' && zA[1]==0 ) return 0;
418 /* Allocate and initialize the Wagner matrix */
419 if( nB<(sizeof(mStack)*4)/(sizeof(mStack[0])*5) ){
420 m = mStack;
421 }else{
422 m = toFree = sqlite3_malloc64( (nB+1)*5*sizeof(m[0])/4 );
423 if( m==0 ) return -3;
425 cx = (char*)&m[nB+1];
427 /* Compute the Wagner edit distance */
428 m[0] = 0;
429 cx[0] = (char)dc;
430 cBprev = (char)dc;
431 for(xB=1; xB<=nB; xB++){
432 cBnext = zB[xB];
433 cB = zB[xB-1];
434 cx[xB] = cB;
435 m[xB] = m[xB-1] + insertOrDeleteCost(cBprev, cB, cBnext);
436 cBprev = cB;
438 cAprev = (char)dc;
439 for(xA=1; xA<=nA; xA++){
440 int lastA = (xA==nA);
441 cA = zA[xA-1];
442 cAnext = zA[xA];
443 if( cA=='*' && lastA ) break;
444 d = m[0];
445 dc = cx[0];
446 m[0] = d + insertOrDeleteCost(cAprev, cA, cAnext);
447 cBprev = 0;
448 for(xB=1; xB<=nB; xB++){
449 int totalCost, insCost, delCost, subCost, ncx;
450 cB = zB[xB-1];
451 cBnext = zB[xB];
453 /* Cost to insert cB */
454 insCost = insertOrDeleteCost(cx[xB-1], cB, cBnext);
455 if( lastA ) insCost /= FINAL_INS_COST_DIV;
457 /* Cost to delete cA */
458 delCost = insertOrDeleteCost(cx[xB], cA, cBnext);
460 /* Cost to substitute cA->cB */
461 subCost = substituteCost(cx[xB-1], cA, cB);
463 /* Best cost */
464 totalCost = insCost + m[xB-1];
465 ncx = cB;
466 if( (delCost + m[xB])<totalCost ){
467 totalCost = delCost + m[xB];
468 ncx = cA;
470 if( (subCost + d)<totalCost ){
471 totalCost = subCost + d;
474 #if 0
475 printf("%d,%d d=%4d u=%4d r=%4d dc=%c cA=%c cB=%c"
476 " ins=%4d del=%4d sub=%4d t=%4d ncx=%c\n",
477 xA, xB, d, m[xB], m[xB-1], dc?dc:' ', cA, cB,
478 insCost, delCost, subCost, totalCost, ncx?ncx:' ');
479 #endif
481 /* Update the matrix */
482 d = m[xB];
483 dc = cx[xB];
484 m[xB] = totalCost;
485 cx[xB] = (char)ncx;
486 cBprev = cB;
488 cAprev = cA;
491 /* Free the wagner matrix and return the result */
492 if( cA=='*' ){
493 res = m[1];
494 for(xB=1; xB<=nB; xB++){
495 if( m[xB]<res ){
496 res = m[xB];
497 if( pnMatch ) *pnMatch = xB+nMatch;
500 }else{
501 res = m[nB];
502 /* In the current implementation, pnMatch is always NULL if zA does
503 ** not end in "*" */
504 assert( pnMatch==0 );
506 sqlite3_free(toFree);
507 return res;
511 ** Function: editdist(A,B)
513 ** Return the cost of transforming string A into string B. Both strings
514 ** must be pure ASCII text. If A ends with '*' then it is assumed to be
515 ** a prefix of B and extra characters on the end of B have minimal additional
516 ** cost.
518 static void editdistSqlFunc(
519 sqlite3_context *context,
520 int argc,
521 sqlite3_value **argv
523 int res = editdist1(
524 (const char*)sqlite3_value_text(argv[0]),
525 (const char*)sqlite3_value_text(argv[1]),
527 if( res<0 ){
528 if( res==(-3) ){
529 sqlite3_result_error_nomem(context);
530 }else if( res==(-2) ){
531 sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
532 }else{
533 sqlite3_result_error(context, "NULL input to editdist()", -1);
535 }else{
536 sqlite3_result_int(context, res);
540 /* End of the fixed-cost edit distance implementation
541 ******************************************************************************
542 *****************************************************************************
543 ** Begin: Configurable cost unicode edit distance routines
545 /* Forward declaration of structures */
546 typedef struct EditDist3Cost EditDist3Cost;
547 typedef struct EditDist3Config EditDist3Config;
548 typedef struct EditDist3Point EditDist3Point;
549 typedef struct EditDist3From EditDist3From;
550 typedef struct EditDist3FromString EditDist3FromString;
551 typedef struct EditDist3To EditDist3To;
552 typedef struct EditDist3ToString EditDist3ToString;
553 typedef struct EditDist3Lang EditDist3Lang;
557 ** An entry in the edit cost table
559 struct EditDist3Cost {
560 EditDist3Cost *pNext; /* Next cost element */
561 u8 nFrom; /* Number of bytes in aFrom */
562 u8 nTo; /* Number of bytes in aTo */
563 u16 iCost; /* Cost of this transformation */
564 char a[4] ; /* FROM string followed by TO string */
565 /* Additional TO and FROM string bytes appended as necessary */
569 ** Edit costs for a particular language ID
571 struct EditDist3Lang {
572 int iLang; /* Language ID */
573 int iInsCost; /* Default insertion cost */
574 int iDelCost; /* Default deletion cost */
575 int iSubCost; /* Default substitution cost */
576 EditDist3Cost *pCost; /* Costs */
581 ** The default EditDist3Lang object, with default costs.
583 static const EditDist3Lang editDist3Lang = { 0, 100, 100, 150, 0 };
586 ** Complete configuration
588 struct EditDist3Config {
589 int nLang; /* Number of language IDs. Size of a[] */
590 EditDist3Lang *a; /* One for each distinct language ID */
594 ** Extra information about each character in the FROM string.
596 struct EditDist3From {
597 int nSubst; /* Number of substitution cost entries */
598 int nDel; /* Number of deletion cost entries */
599 int nByte; /* Number of bytes in this character */
600 EditDist3Cost **apSubst; /* Array of substitution costs for this element */
601 EditDist3Cost **apDel; /* Array of deletion cost entries */
605 ** A precompiled FROM string.
607 ** In the common case we expect the FROM string to be reused multiple times.
608 ** In other words, the common case will be to measure the edit distance
609 ** from a single origin string to multiple target strings.
611 struct EditDist3FromString {
612 char *z; /* The complete text of the FROM string */
613 int n; /* Number of characters in the FROM string */
614 int isPrefix; /* True if ends with '*' character */
615 EditDist3From *a; /* Extra info about each char of the FROM string */
619 ** Extra information about each character in the TO string.
621 struct EditDist3To {
622 int nIns; /* Number of insertion cost entries */
623 int nByte; /* Number of bytes in this character */
624 EditDist3Cost **apIns; /* Array of deletion cost entries */
628 ** A precompiled FROM string
630 struct EditDist3ToString {
631 char *z; /* The complete text of the TO string */
632 int n; /* Number of characters in the TO string */
633 EditDist3To *a; /* Extra info about each char of the TO string */
637 ** Clear or delete an instance of the object that records all edit-distance
638 ** weights.
640 static void editDist3ConfigClear(EditDist3Config *p){
641 int i;
642 if( p==0 ) return;
643 for(i=0; i<p->nLang; i++){
644 EditDist3Cost *pCost, *pNext;
645 pCost = p->a[i].pCost;
646 while( pCost ){
647 pNext = pCost->pNext;
648 sqlite3_free(pCost);
649 pCost = pNext;
652 sqlite3_free(p->a);
653 memset(p, 0, sizeof(*p));
655 static void editDist3ConfigDelete(void *pIn){
656 EditDist3Config *p = (EditDist3Config*)pIn;
657 editDist3ConfigClear(p);
658 sqlite3_free(p);
661 /* Compare the FROM values of two EditDist3Cost objects, for sorting.
662 ** Return negative, zero, or positive if the A is less than, equal to,
663 ** or greater than B.
665 static int editDist3CostCompare(EditDist3Cost *pA, EditDist3Cost *pB){
666 int n = pA->nFrom;
667 int rc;
668 if( n>pB->nFrom ) n = pB->nFrom;
669 rc = strncmp(pA->a, pB->a, n);
670 if( rc==0 ) rc = pA->nFrom - pB->nFrom;
671 return rc;
675 ** Merge together two sorted lists of EditDist3Cost objects, in order
676 ** of increasing FROM.
678 static EditDist3Cost *editDist3CostMerge(
679 EditDist3Cost *pA,
680 EditDist3Cost *pB
682 EditDist3Cost *pHead = 0;
683 EditDist3Cost **ppTail = &pHead;
684 EditDist3Cost *p;
685 while( pA && pB ){
686 if( editDist3CostCompare(pA,pB)<=0 ){
687 p = pA;
688 pA = pA->pNext;
689 }else{
690 p = pB;
691 pB = pB->pNext;
693 *ppTail = p;
694 ppTail = &p->pNext;
696 if( pA ){
697 *ppTail = pA;
698 }else{
699 *ppTail = pB;
701 return pHead;
705 ** Sort a list of EditDist3Cost objects into order of increasing FROM
707 static EditDist3Cost *editDist3CostSort(EditDist3Cost *pList){
708 EditDist3Cost *ap[60], *p;
709 int i;
710 int mx = 0;
711 ap[0] = 0;
712 ap[1] = 0;
713 while( pList ){
714 p = pList;
715 pList = p->pNext;
716 p->pNext = 0;
717 for(i=0; ap[i]; i++){
718 p = editDist3CostMerge(ap[i],p);
719 ap[i] = 0;
721 ap[i] = p;
722 if( i>mx ){
723 mx = i;
724 ap[i+1] = 0;
727 p = 0;
728 for(i=0; i<=mx; i++){
729 if( ap[i] ) p = editDist3CostMerge(p,ap[i]);
731 return p;
735 ** Load all edit-distance weights from a table.
737 static int editDist3ConfigLoad(
738 EditDist3Config *p, /* The edit distance configuration to load */
739 sqlite3 *db, /* Load from this database */
740 const char *zTable /* Name of the table from which to load */
742 sqlite3_stmt *pStmt;
743 int rc, rc2;
744 char *zSql;
745 int iLangPrev = -9999;
746 EditDist3Lang *pLang = 0;
748 zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
749 " FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
750 if( zSql==0 ) return SQLITE_NOMEM;
751 rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
752 sqlite3_free(zSql);
753 if( rc ) return rc;
754 editDist3ConfigClear(p);
755 while( sqlite3_step(pStmt)==SQLITE_ROW ){
756 int iLang = sqlite3_column_int(pStmt, 0);
757 const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
758 int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
759 const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
760 int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
761 int iCost = sqlite3_column_int(pStmt, 3);
763 assert( zFrom!=0 || nFrom==0 );
764 assert( zTo!=0 || nTo==0 );
765 if( nFrom>100 || nTo>100 ) continue;
766 if( iCost<0 ) continue;
767 if( iCost>10000 ) continue; /* Costs above 10K are considered infinite */
768 if( pLang==0 || iLang!=iLangPrev ){
769 EditDist3Lang *pNew;
770 pNew = sqlite3_realloc64(p->a, (p->nLang+1)*sizeof(p->a[0]));
771 if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
772 p->a = pNew;
773 pLang = &p->a[p->nLang];
774 p->nLang++;
775 pLang->iLang = iLang;
776 pLang->iInsCost = 100;
777 pLang->iDelCost = 100;
778 pLang->iSubCost = 150;
779 pLang->pCost = 0;
780 iLangPrev = iLang;
782 if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
783 pLang->iDelCost = iCost;
784 }else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
785 pLang->iInsCost = iCost;
786 }else if( nFrom==1 && nTo==1 && zFrom[0]=='?' && zTo[0]=='?' ){
787 pLang->iSubCost = iCost;
788 }else{
789 EditDist3Cost *pCost;
790 int nExtra = nFrom + nTo - 4;
791 if( nExtra<0 ) nExtra = 0;
792 pCost = sqlite3_malloc64( sizeof(*pCost) + nExtra );
793 if( pCost==0 ){ rc = SQLITE_NOMEM; break; }
794 pCost->nFrom = (u8)nFrom;
795 pCost->nTo = (u8)nTo;
796 pCost->iCost = (u16)iCost;
797 memcpy(pCost->a, zFrom, nFrom);
798 memcpy(pCost->a + nFrom, zTo, nTo);
799 pCost->pNext = pLang->pCost;
800 pLang->pCost = pCost;
803 rc2 = sqlite3_finalize(pStmt);
804 if( rc==SQLITE_OK ) rc = rc2;
805 if( rc==SQLITE_OK ){
806 int iLang;
807 for(iLang=0; iLang<p->nLang; iLang++){
808 p->a[iLang].pCost = editDist3CostSort(p->a[iLang].pCost);
811 return rc;
815 ** Return the length (in bytes) of a utf-8 character. Or return a maximum
816 ** of N.
818 static int utf8Len(unsigned char c, int N){
819 int len = 1;
820 if( c>0x7f ){
821 if( (c&0xe0)==0xc0 ){
822 len = 2;
823 }else if( (c&0xf0)==0xe0 ){
824 len = 3;
825 }else{
826 len = 4;
829 if( len>N ) len = N;
830 return len;
834 ** Return TRUE (non-zero) if the To side of the given cost matches
835 ** the given string.
837 static int matchTo(EditDist3Cost *p, const char *z, int n){
838 if( p->a[p->nFrom]!=z[0] ) return 0;
839 if( p->nTo>n ) return 0;
840 if( strncmp(p->a+p->nFrom, z, p->nTo)!=0 ) return 0;
841 return 1;
845 ** Return TRUE (non-zero) if the From side of the given cost matches
846 ** the given string.
848 static int matchFrom(EditDist3Cost *p, const char *z, int n){
849 assert( p->nFrom<=n );
850 if( p->a[0]!=z[0] ) return 0;
851 if( strncmp(p->a, z, p->nFrom)!=0 ) return 0;
852 return 1;
856 ** Return TRUE (non-zero) of the next FROM character and the next TO
857 ** character are the same.
859 static int matchFromTo(
860 EditDist3FromString *pStr, /* Left hand string */
861 int n1, /* Index of comparison character on the left */
862 const char *z2, /* Right-handl comparison character */
863 int n2 /* Bytes remaining in z2[] */
865 int b1 = pStr->a[n1].nByte;
866 if( b1>n2 ) return 0;
867 if( pStr->z[n1]!=z2[0] ) return 0;
868 if( strncmp(pStr->z+n1, z2, b1)!=0 ) return 0;
869 return 1;
873 ** Delete an EditDist3FromString objecct
875 static void editDist3FromStringDelete(EditDist3FromString *p){
876 int i;
877 if( p ){
878 for(i=0; i<p->n; i++){
879 sqlite3_free(p->a[i].apDel);
880 sqlite3_free(p->a[i].apSubst);
882 sqlite3_free(p);
887 ** Create a EditDist3FromString object.
889 static EditDist3FromString *editDist3FromStringNew(
890 const EditDist3Lang *pLang,
891 const char *z,
892 int n
894 EditDist3FromString *pStr;
895 EditDist3Cost *p;
896 int i;
898 if( z==0 ) return 0;
899 if( n<0 ) n = (int)strlen(z);
900 pStr = sqlite3_malloc64( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
901 if( pStr==0 ) return 0;
902 pStr->a = (EditDist3From*)&pStr[1];
903 memset(pStr->a, 0, sizeof(pStr->a[0])*n);
904 pStr->n = n;
905 pStr->z = (char*)&pStr->a[n];
906 memcpy(pStr->z, z, n+1);
907 if( n && z[n-1]=='*' ){
908 pStr->isPrefix = 1;
909 n--;
910 pStr->n--;
911 pStr->z[n] = 0;
912 }else{
913 pStr->isPrefix = 0;
916 for(i=0; i<n; i++){
917 EditDist3From *pFrom = &pStr->a[i];
918 memset(pFrom, 0, sizeof(*pFrom));
919 pFrom->nByte = utf8Len((unsigned char)z[i], n-i);
920 for(p=pLang->pCost; p; p=p->pNext){
921 EditDist3Cost **apNew;
922 if( i+p->nFrom>n ) continue;
923 if( matchFrom(p, z+i, n-i)==0 ) continue;
924 if( p->nTo==0 ){
925 apNew = sqlite3_realloc64(pFrom->apDel,
926 sizeof(*apNew)*(pFrom->nDel+1));
927 if( apNew==0 ) break;
928 pFrom->apDel = apNew;
929 apNew[pFrom->nDel++] = p;
930 }else{
931 apNew = sqlite3_realloc64(pFrom->apSubst,
932 sizeof(*apNew)*(pFrom->nSubst+1));
933 if( apNew==0 ) break;
934 pFrom->apSubst = apNew;
935 apNew[pFrom->nSubst++] = p;
938 if( p ){
939 editDist3FromStringDelete(pStr);
940 pStr = 0;
941 break;
944 return pStr;
948 ** Update entry m[i] such that it is the minimum of its current value
949 ** and m[j]+iCost.
951 static void updateCost(
952 unsigned int *m,
953 int i,
954 int j,
955 int iCost
957 unsigned int b;
958 assert( iCost>=0 );
959 assert( iCost<10000 );
960 b = m[j] + iCost;
961 if( b<m[i] ) m[i] = b;
965 ** How much stack space (int bytes) to use for Wagner matrix in
966 ** editDist3Core(). If more space than this is required, the entire
967 ** matrix is taken from the heap. To reduce the load on the memory
968 ** allocator, make this value as large as practical for the
969 ** architecture in use.
971 #ifndef SQLITE_SPELLFIX_STACKALLOC_SZ
972 # define SQLITE_SPELLFIX_STACKALLOC_SZ (1024)
973 #endif
975 /* Compute the edit distance between two strings.
977 ** If an error occurs, return a negative number which is the error code.
979 ** If pnMatch is not NULL, then *pnMatch is set to the number of characters
980 ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
981 ** not contain the pattern for a prefix-search, then this is always the number
982 ** of characters in z2. If pFrom does contain a prefix search pattern, then
983 ** it is the number of characters in the prefix of z2 that was deemed to
984 ** match pFrom.
986 static int editDist3Core(
987 EditDist3FromString *pFrom, /* The FROM string */
988 const char *z2, /* The TO string */
989 int n2, /* Length of the TO string */
990 const EditDist3Lang *pLang, /* Edit weights for a particular language ID */
991 int *pnMatch /* OUT: Characters in matched prefix */
993 int k, n;
994 int i1, b1;
995 int i2, b2;
996 EditDist3FromString f = *pFrom;
997 EditDist3To *a2;
998 unsigned int *m;
999 unsigned int *pToFree;
1000 int szRow;
1001 EditDist3Cost *p;
1002 int res;
1003 sqlite3_uint64 nByte;
1004 unsigned int stackSpace[SQLITE_SPELLFIX_STACKALLOC_SZ/sizeof(unsigned int)];
1006 /* allocate the Wagner matrix and the aTo[] array for the TO string */
1007 n = (f.n+1)*(n2+1);
1008 n = (n+1)&~1;
1009 nByte = n*sizeof(m[0]) + sizeof(a2[0])*n2;
1010 if( nByte<=sizeof(stackSpace) ){
1011 m = stackSpace;
1012 pToFree = 0;
1013 }else{
1014 m = pToFree = sqlite3_malloc64( nByte );
1015 if( m==0 ) return -1; /* Out of memory */
1017 a2 = (EditDist3To*)&m[n];
1018 memset(a2, 0, sizeof(a2[0])*n2);
1020 /* Fill in the a1[] matrix for all characters of the TO string */
1021 for(i2=0; i2<n2; i2++){
1022 a2[i2].nByte = utf8Len((unsigned char)z2[i2], n2-i2);
1023 for(p=pLang->pCost; p; p=p->pNext){
1024 EditDist3Cost **apNew;
1025 if( p->nFrom>0 ) break;
1026 if( i2+p->nTo>n2 ) continue;
1027 if( p->a[0]>z2[i2] ) break;
1028 if( matchTo(p, z2+i2, n2-i2)==0 ) continue;
1029 a2[i2].nIns++;
1030 apNew = sqlite3_realloc64(a2[i2].apIns, sizeof(*apNew)*a2[i2].nIns);
1031 if( apNew==0 ){
1032 res = -1; /* Out of memory */
1033 goto editDist3Abort;
1035 a2[i2].apIns = apNew;
1036 a2[i2].apIns[a2[i2].nIns-1] = p;
1040 /* Prepare to compute the minimum edit distance */
1041 szRow = f.n+1;
1042 memset(m, 0x01, (n2+1)*szRow*sizeof(m[0]));
1043 m[0] = 0;
1045 /* First fill in the top-row of the matrix with FROM deletion costs */
1046 for(i1=0; i1<f.n; i1 += b1){
1047 b1 = f.a[i1].nByte;
1048 updateCost(m, i1+b1, i1, pLang->iDelCost);
1049 for(k=0; k<f.a[i1].nDel; k++){
1050 p = f.a[i1].apDel[k];
1051 updateCost(m, i1+p->nFrom, i1, p->iCost);
1055 /* Fill in all subsequent rows, top-to-bottom, left-to-right */
1056 for(i2=0; i2<n2; i2 += b2){
1057 int rx; /* Starting index for current row */
1058 int rxp; /* Starting index for previous row */
1059 b2 = a2[i2].nByte;
1060 rx = szRow*(i2+b2);
1061 rxp = szRow*i2;
1062 updateCost(m, rx, rxp, pLang->iInsCost);
1063 for(k=0; k<a2[i2].nIns; k++){
1064 p = a2[i2].apIns[k];
1065 updateCost(m, szRow*(i2+p->nTo), rxp, p->iCost);
1067 for(i1=0; i1<f.n; i1+=b1){
1068 int cx; /* Index of current cell */
1069 int cxp; /* Index of cell immediately to the left */
1070 int cxd; /* Index of cell to the left and one row above */
1071 int cxu; /* Index of cell immediately above */
1072 b1 = f.a[i1].nByte;
1073 cxp = rx + i1;
1074 cx = cxp + b1;
1075 cxd = rxp + i1;
1076 cxu = cxd + b1;
1077 updateCost(m, cx, cxp, pLang->iDelCost);
1078 for(k=0; k<f.a[i1].nDel; k++){
1079 p = f.a[i1].apDel[k];
1080 updateCost(m, cxp+p->nFrom, cxp, p->iCost);
1082 updateCost(m, cx, cxu, pLang->iInsCost);
1083 if( matchFromTo(&f, i1, z2+i2, n2-i2) ){
1084 updateCost(m, cx, cxd, 0);
1086 updateCost(m, cx, cxd, pLang->iSubCost);
1087 for(k=0; k<f.a[i1].nSubst; k++){
1088 p = f.a[i1].apSubst[k];
1089 if( matchTo(p, z2+i2, n2-i2) ){
1090 updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
1096 #if 0 /* Enable for debugging */
1097 printf(" ^");
1098 for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
1099 printf("\n ^:");
1100 for(i1=0; i1<szRow; i1++){
1101 int v = m[i1];
1102 if( v>9999 ) printf(" ****");
1103 else printf(" %4d", v);
1105 printf("\n");
1106 for(i2=0; i2<n2; i2++){
1107 printf("%c-%02x:", z2[i2], z2[i2]&0xff);
1108 for(i1=0; i1<szRow; i1++){
1109 int v = m[(i2+1)*szRow+i1];
1110 if( v>9999 ) printf(" ****");
1111 else printf(" %4d", v);
1113 printf("\n");
1115 #endif
1117 /* Free memory allocations and return the result */
1118 res = (int)m[szRow*(n2+1)-1];
1119 n = n2;
1120 if( f.isPrefix ){
1121 for(i2=1; i2<=n2; i2++){
1122 int b = m[szRow*i2-1];
1123 if( b<=res ){
1124 res = b;
1125 n = i2 - 1;
1129 if( pnMatch ){
1130 int nExtra = 0;
1131 for(k=0; k<n; k++){
1132 if( (z2[k] & 0xc0)==0x80 ) nExtra++;
1134 *pnMatch = n - nExtra;
1137 editDist3Abort:
1138 for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
1139 sqlite3_free(pToFree);
1140 return res;
1144 ** Get an appropriate EditDist3Lang object.
1146 static const EditDist3Lang *editDist3FindLang(
1147 EditDist3Config *pConfig,
1148 int iLang
1150 int i;
1151 for(i=0; i<pConfig->nLang; i++){
1152 if( pConfig->a[i].iLang==iLang ) return &pConfig->a[i];
1154 return &editDist3Lang;
1158 ** Function: editdist3(A,B,iLang)
1159 ** editdist3(tablename)
1161 ** Return the cost of transforming string A into string B using edit
1162 ** weights for iLang.
1164 ** The second form loads edit weights into memory from a table.
1166 static void editDist3SqlFunc(
1167 sqlite3_context *context,
1168 int argc,
1169 sqlite3_value **argv
1171 EditDist3Config *pConfig = (EditDist3Config*)sqlite3_user_data(context);
1172 sqlite3 *db = sqlite3_context_db_handle(context);
1173 int rc;
1174 if( argc==1 ){
1175 const char *zTable = (const char*)sqlite3_value_text(argv[0]);
1176 rc = editDist3ConfigLoad(pConfig, db, zTable);
1177 if( rc ) sqlite3_result_error_code(context, rc);
1178 }else{
1179 const char *zA = (const char*)sqlite3_value_text(argv[0]);
1180 const char *zB = (const char*)sqlite3_value_text(argv[1]);
1181 int nA = sqlite3_value_bytes(argv[0]);
1182 int nB = sqlite3_value_bytes(argv[1]);
1183 int iLang = argc==3 ? sqlite3_value_int(argv[2]) : 0;
1184 const EditDist3Lang *pLang = editDist3FindLang(pConfig, iLang);
1185 EditDist3FromString *pFrom;
1186 int dist;
1188 pFrom = editDist3FromStringNew(pLang, zA, nA);
1189 if( pFrom==0 ){
1190 sqlite3_result_error_nomem(context);
1191 return;
1193 dist = editDist3Core(pFrom, zB, nB, pLang, 0);
1194 editDist3FromStringDelete(pFrom);
1195 if( dist==(-1) ){
1196 sqlite3_result_error_nomem(context);
1197 }else{
1198 sqlite3_result_int(context, dist);
1204 ** Register the editDist3 function with SQLite
1206 static int editDist3Install(sqlite3 *db){
1207 int rc;
1208 EditDist3Config *pConfig = sqlite3_malloc64( sizeof(*pConfig) );
1209 if( pConfig==0 ) return SQLITE_NOMEM;
1210 memset(pConfig, 0, sizeof(*pConfig));
1211 rc = sqlite3_create_function_v2(db, "editdist3",
1212 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1213 editDist3SqlFunc, 0, 0, 0);
1214 if( rc==SQLITE_OK ){
1215 rc = sqlite3_create_function_v2(db, "editdist3",
1216 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1217 editDist3SqlFunc, 0, 0, 0);
1219 if( rc==SQLITE_OK ){
1220 rc = sqlite3_create_function_v2(db, "editdist3",
1221 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, pConfig,
1222 editDist3SqlFunc, 0, 0, editDist3ConfigDelete);
1223 }else{
1224 sqlite3_free(pConfig);
1226 return rc;
1228 /* End configurable cost unicode edit distance routines
1229 ******************************************************************************
1230 ******************************************************************************
1231 ** Begin transliterate unicode-to-ascii implementation
1234 #if !SQLITE_AMALGAMATION
1236 ** This lookup table is used to help decode the first byte of
1237 ** a multi-byte UTF8 character.
1239 static const unsigned char sqlite3Utf8Trans1[] = {
1240 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1241 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1242 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1243 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
1244 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1245 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1246 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1247 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
1249 #endif
1252 ** Return the value of the first UTF-8 character in the string.
1254 static int utf8Read(const unsigned char *z, int n, int *pSize){
1255 int c, i;
1257 /* All callers to this routine (in the current implementation)
1258 ** always have n>0. */
1259 if( NEVER(n==0) ){
1260 c = i = 0;
1261 }else{
1262 c = z[0];
1263 i = 1;
1264 if( c>=0xc0 ){
1265 c = sqlite3Utf8Trans1[c-0xc0];
1266 while( i<n && (z[i] & 0xc0)==0x80 ){
1267 c = (c<<6) + (0x3f & z[i++]);
1271 *pSize = i;
1272 return c;
1276 ** Return the number of characters in the utf-8 string in the nIn byte
1277 ** buffer pointed to by zIn.
1279 static int utf8Charlen(const char *zIn, int nIn){
1280 int i;
1281 int nChar = 0;
1282 for(i=0; i<nIn; nChar++){
1283 int sz;
1284 utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1285 i += sz;
1287 return nChar;
1290 typedef struct Transliteration Transliteration;
1291 struct Transliteration {
1292 unsigned short int cFrom;
1293 unsigned char cTo0, cTo1, cTo2, cTo3;
1297 ** Table of translations from unicode characters into ASCII.
1299 static const Transliteration translit[] = {
1300 { 0x00A0, 0x20, 0x00, 0x00, 0x00 }, /*   to */
1301 { 0x00B5, 0x75, 0x00, 0x00, 0x00 }, /* µ to u */
1302 { 0x00C0, 0x41, 0x00, 0x00, 0x00 }, /* À to A */
1303 { 0x00C1, 0x41, 0x00, 0x00, 0x00 }, /* Á to A */
1304 { 0x00C2, 0x41, 0x00, 0x00, 0x00 }, /* Â to A */
1305 { 0x00C3, 0x41, 0x00, 0x00, 0x00 }, /* Ã to A */
1306 { 0x00C4, 0x41, 0x65, 0x00, 0x00 }, /* Ä to Ae */
1307 { 0x00C5, 0x41, 0x61, 0x00, 0x00 }, /* Å to Aa */
1308 { 0x00C6, 0x41, 0x45, 0x00, 0x00 }, /* Æ to AE */
1309 { 0x00C7, 0x43, 0x00, 0x00, 0x00 }, /* Ç to C */
1310 { 0x00C8, 0x45, 0x00, 0x00, 0x00 }, /* È to E */
1311 { 0x00C9, 0x45, 0x00, 0x00, 0x00 }, /* É to E */
1312 { 0x00CA, 0x45, 0x00, 0x00, 0x00 }, /* Ê to E */
1313 { 0x00CB, 0x45, 0x00, 0x00, 0x00 }, /* Ë to E */
1314 { 0x00CC, 0x49, 0x00, 0x00, 0x00 }, /* Ì to I */
1315 { 0x00CD, 0x49, 0x00, 0x00, 0x00 }, /* Í to I */
1316 { 0x00CE, 0x49, 0x00, 0x00, 0x00 }, /* Î to I */
1317 { 0x00CF, 0x49, 0x00, 0x00, 0x00 }, /* Ï to I */
1318 { 0x00D0, 0x44, 0x00, 0x00, 0x00 }, /* Ð to D */
1319 { 0x00D1, 0x4E, 0x00, 0x00, 0x00 }, /* Ñ to N */
1320 { 0x00D2, 0x4F, 0x00, 0x00, 0x00 }, /* Ò to O */
1321 { 0x00D3, 0x4F, 0x00, 0x00, 0x00 }, /* Ó to O */
1322 { 0x00D4, 0x4F, 0x00, 0x00, 0x00 }, /* Ô to O */
1323 { 0x00D5, 0x4F, 0x00, 0x00, 0x00 }, /* Õ to O */
1324 { 0x00D6, 0x4F, 0x65, 0x00, 0x00 }, /* Ö to Oe */
1325 { 0x00D7, 0x78, 0x00, 0x00, 0x00 }, /* × to x */
1326 { 0x00D8, 0x4F, 0x00, 0x00, 0x00 }, /* Ø to O */
1327 { 0x00D9, 0x55, 0x00, 0x00, 0x00 }, /* Ù to U */
1328 { 0x00DA, 0x55, 0x00, 0x00, 0x00 }, /* Ú to U */
1329 { 0x00DB, 0x55, 0x00, 0x00, 0x00 }, /* Û to U */
1330 { 0x00DC, 0x55, 0x65, 0x00, 0x00 }, /* Ü to Ue */
1331 { 0x00DD, 0x59, 0x00, 0x00, 0x00 }, /* Ý to Y */
1332 { 0x00DE, 0x54, 0x68, 0x00, 0x00 }, /* Þ to Th */
1333 { 0x00DF, 0x73, 0x73, 0x00, 0x00 }, /* ß to ss */
1334 { 0x00E0, 0x61, 0x00, 0x00, 0x00 }, /* à to a */
1335 { 0x00E1, 0x61, 0x00, 0x00, 0x00 }, /* á to a */
1336 { 0x00E2, 0x61, 0x00, 0x00, 0x00 }, /* â to a */
1337 { 0x00E3, 0x61, 0x00, 0x00, 0x00 }, /* ã to a */
1338 { 0x00E4, 0x61, 0x65, 0x00, 0x00 }, /* ä to ae */
1339 { 0x00E5, 0x61, 0x61, 0x00, 0x00 }, /* å to aa */
1340 { 0x00E6, 0x61, 0x65, 0x00, 0x00 }, /* æ to ae */
1341 { 0x00E7, 0x63, 0x00, 0x00, 0x00 }, /* ç to c */
1342 { 0x00E8, 0x65, 0x00, 0x00, 0x00 }, /* è to e */
1343 { 0x00E9, 0x65, 0x00, 0x00, 0x00 }, /* é to e */
1344 { 0x00EA, 0x65, 0x00, 0x00, 0x00 }, /* ê to e */
1345 { 0x00EB, 0x65, 0x00, 0x00, 0x00 }, /* ë to e */
1346 { 0x00EC, 0x69, 0x00, 0x00, 0x00 }, /* ì to i */
1347 { 0x00ED, 0x69, 0x00, 0x00, 0x00 }, /* í to i */
1348 { 0x00EE, 0x69, 0x00, 0x00, 0x00 }, /* î to i */
1349 { 0x00EF, 0x69, 0x00, 0x00, 0x00 }, /* ï to i */
1350 { 0x00F0, 0x64, 0x00, 0x00, 0x00 }, /* ð to d */
1351 { 0x00F1, 0x6E, 0x00, 0x00, 0x00 }, /* ñ to n */
1352 { 0x00F2, 0x6F, 0x00, 0x00, 0x00 }, /* ò to o */
1353 { 0x00F3, 0x6F, 0x00, 0x00, 0x00 }, /* ó to o */
1354 { 0x00F4, 0x6F, 0x00, 0x00, 0x00 }, /* ô to o */
1355 { 0x00F5, 0x6F, 0x00, 0x00, 0x00 }, /* õ to o */
1356 { 0x00F6, 0x6F, 0x65, 0x00, 0x00 }, /* ö to oe */
1357 { 0x00F7, 0x3A, 0x00, 0x00, 0x00 }, /* ÷ to : */
1358 { 0x00F8, 0x6F, 0x00, 0x00, 0x00 }, /* ø to o */
1359 { 0x00F9, 0x75, 0x00, 0x00, 0x00 }, /* ù to u */
1360 { 0x00FA, 0x75, 0x00, 0x00, 0x00 }, /* ú to u */
1361 { 0x00FB, 0x75, 0x00, 0x00, 0x00 }, /* û to u */
1362 { 0x00FC, 0x75, 0x65, 0x00, 0x00 }, /* ü to ue */
1363 { 0x00FD, 0x79, 0x00, 0x00, 0x00 }, /* ý to y */
1364 { 0x00FE, 0x74, 0x68, 0x00, 0x00 }, /* þ to th */
1365 { 0x00FF, 0x79, 0x00, 0x00, 0x00 }, /* ÿ to y */
1366 { 0x0100, 0x41, 0x00, 0x00, 0x00 }, /* Ā to A */
1367 { 0x0101, 0x61, 0x00, 0x00, 0x00 }, /* ā to a */
1368 { 0x0102, 0x41, 0x00, 0x00, 0x00 }, /* Ă to A */
1369 { 0x0103, 0x61, 0x00, 0x00, 0x00 }, /* ă to a */
1370 { 0x0104, 0x41, 0x00, 0x00, 0x00 }, /* Ą to A */
1371 { 0x0105, 0x61, 0x00, 0x00, 0x00 }, /* ą to a */
1372 { 0x0106, 0x43, 0x00, 0x00, 0x00 }, /* Ć to C */
1373 { 0x0107, 0x63, 0x00, 0x00, 0x00 }, /* ć to c */
1374 { 0x0108, 0x43, 0x68, 0x00, 0x00 }, /* Ĉ to Ch */
1375 { 0x0109, 0x63, 0x68, 0x00, 0x00 }, /* ĉ to ch */
1376 { 0x010A, 0x43, 0x00, 0x00, 0x00 }, /* Ċ to C */
1377 { 0x010B, 0x63, 0x00, 0x00, 0x00 }, /* ċ to c */
1378 { 0x010C, 0x43, 0x00, 0x00, 0x00 }, /* Č to C */
1379 { 0x010D, 0x63, 0x00, 0x00, 0x00 }, /* č to c */
1380 { 0x010E, 0x44, 0x00, 0x00, 0x00 }, /* Ď to D */
1381 { 0x010F, 0x64, 0x00, 0x00, 0x00 }, /* ď to d */
1382 { 0x0110, 0x44, 0x00, 0x00, 0x00 }, /* Đ to D */
1383 { 0x0111, 0x64, 0x00, 0x00, 0x00 }, /* đ to d */
1384 { 0x0112, 0x45, 0x00, 0x00, 0x00 }, /* Ē to E */
1385 { 0x0113, 0x65, 0x00, 0x00, 0x00 }, /* ē to e */
1386 { 0x0114, 0x45, 0x00, 0x00, 0x00 }, /* Ĕ to E */
1387 { 0x0115, 0x65, 0x00, 0x00, 0x00 }, /* ĕ to e */
1388 { 0x0116, 0x45, 0x00, 0x00, 0x00 }, /* Ė to E */
1389 { 0x0117, 0x65, 0x00, 0x00, 0x00 }, /* ė to e */
1390 { 0x0118, 0x45, 0x00, 0x00, 0x00 }, /* Ę to E */
1391 { 0x0119, 0x65, 0x00, 0x00, 0x00 }, /* ę to e */
1392 { 0x011A, 0x45, 0x00, 0x00, 0x00 }, /* Ě to E */
1393 { 0x011B, 0x65, 0x00, 0x00, 0x00 }, /* ě to e */
1394 { 0x011C, 0x47, 0x68, 0x00, 0x00 }, /* Ĝ to Gh */
1395 { 0x011D, 0x67, 0x68, 0x00, 0x00 }, /* ĝ to gh */
1396 { 0x011E, 0x47, 0x00, 0x00, 0x00 }, /* Ğ to G */
1397 { 0x011F, 0x67, 0x00, 0x00, 0x00 }, /* ğ to g */
1398 { 0x0120, 0x47, 0x00, 0x00, 0x00 }, /* Ġ to G */
1399 { 0x0121, 0x67, 0x00, 0x00, 0x00 }, /* ġ to g */
1400 { 0x0122, 0x47, 0x00, 0x00, 0x00 }, /* Ģ to G */
1401 { 0x0123, 0x67, 0x00, 0x00, 0x00 }, /* ģ to g */
1402 { 0x0124, 0x48, 0x68, 0x00, 0x00 }, /* Ĥ to Hh */
1403 { 0x0125, 0x68, 0x68, 0x00, 0x00 }, /* ĥ to hh */
1404 { 0x0126, 0x48, 0x00, 0x00, 0x00 }, /* Ħ to H */
1405 { 0x0127, 0x68, 0x00, 0x00, 0x00 }, /* ħ to h */
1406 { 0x0128, 0x49, 0x00, 0x00, 0x00 }, /* Ĩ to I */
1407 { 0x0129, 0x69, 0x00, 0x00, 0x00 }, /* ĩ to i */
1408 { 0x012A, 0x49, 0x00, 0x00, 0x00 }, /* Ī to I */
1409 { 0x012B, 0x69, 0x00, 0x00, 0x00 }, /* ī to i */
1410 { 0x012C, 0x49, 0x00, 0x00, 0x00 }, /* Ĭ to I */
1411 { 0x012D, 0x69, 0x00, 0x00, 0x00 }, /* ĭ to i */
1412 { 0x012E, 0x49, 0x00, 0x00, 0x00 }, /* Į to I */
1413 { 0x012F, 0x69, 0x00, 0x00, 0x00 }, /* į to i */
1414 { 0x0130, 0x49, 0x00, 0x00, 0x00 }, /* İ to I */
1415 { 0x0131, 0x69, 0x00, 0x00, 0x00 }, /* ı to i */
1416 { 0x0132, 0x49, 0x4A, 0x00, 0x00 }, /* IJ to IJ */
1417 { 0x0133, 0x69, 0x6A, 0x00, 0x00 }, /* ij to ij */
1418 { 0x0134, 0x4A, 0x68, 0x00, 0x00 }, /* Ĵ to Jh */
1419 { 0x0135, 0x6A, 0x68, 0x00, 0x00 }, /* ĵ to jh */
1420 { 0x0136, 0x4B, 0x00, 0x00, 0x00 }, /* Ķ to K */
1421 { 0x0137, 0x6B, 0x00, 0x00, 0x00 }, /* ķ to k */
1422 { 0x0138, 0x6B, 0x00, 0x00, 0x00 }, /* ĸ to k */
1423 { 0x0139, 0x4C, 0x00, 0x00, 0x00 }, /* Ĺ to L */
1424 { 0x013A, 0x6C, 0x00, 0x00, 0x00 }, /* ĺ to l */
1425 { 0x013B, 0x4C, 0x00, 0x00, 0x00 }, /* Ļ to L */
1426 { 0x013C, 0x6C, 0x00, 0x00, 0x00 }, /* ļ to l */
1427 { 0x013D, 0x4C, 0x00, 0x00, 0x00 }, /* Ľ to L */
1428 { 0x013E, 0x6C, 0x00, 0x00, 0x00 }, /* ľ to l */
1429 { 0x013F, 0x4C, 0x2E, 0x00, 0x00 }, /* Ŀ to L. */
1430 { 0x0140, 0x6C, 0x2E, 0x00, 0x00 }, /* ŀ to l. */
1431 { 0x0141, 0x4C, 0x00, 0x00, 0x00 }, /* Ł to L */
1432 { 0x0142, 0x6C, 0x00, 0x00, 0x00 }, /* ł to l */
1433 { 0x0143, 0x4E, 0x00, 0x00, 0x00 }, /* Ń to N */
1434 { 0x0144, 0x6E, 0x00, 0x00, 0x00 }, /* ń to n */
1435 { 0x0145, 0x4E, 0x00, 0x00, 0x00 }, /* Ņ to N */
1436 { 0x0146, 0x6E, 0x00, 0x00, 0x00 }, /* ņ to n */
1437 { 0x0147, 0x4E, 0x00, 0x00, 0x00 }, /* Ň to N */
1438 { 0x0148, 0x6E, 0x00, 0x00, 0x00 }, /* ň to n */
1439 { 0x0149, 0x27, 0x6E, 0x00, 0x00 }, /* ʼn to 'n */
1440 { 0x014A, 0x4E, 0x47, 0x00, 0x00 }, /* Ŋ to NG */
1441 { 0x014B, 0x6E, 0x67, 0x00, 0x00 }, /* ŋ to ng */
1442 { 0x014C, 0x4F, 0x00, 0x00, 0x00 }, /* Ō to O */
1443 { 0x014D, 0x6F, 0x00, 0x00, 0x00 }, /* ō to o */
1444 { 0x014E, 0x4F, 0x00, 0x00, 0x00 }, /* Ŏ to O */
1445 { 0x014F, 0x6F, 0x00, 0x00, 0x00 }, /* ŏ to o */
1446 { 0x0150, 0x4F, 0x00, 0x00, 0x00 }, /* Ő to O */
1447 { 0x0151, 0x6F, 0x00, 0x00, 0x00 }, /* ő to o */
1448 { 0x0152, 0x4F, 0x45, 0x00, 0x00 }, /* Πto OE */
1449 { 0x0153, 0x6F, 0x65, 0x00, 0x00 }, /* œ to oe */
1450 { 0x0154, 0x52, 0x00, 0x00, 0x00 }, /* Ŕ to R */
1451 { 0x0155, 0x72, 0x00, 0x00, 0x00 }, /* ŕ to r */
1452 { 0x0156, 0x52, 0x00, 0x00, 0x00 }, /* Ŗ to R */
1453 { 0x0157, 0x72, 0x00, 0x00, 0x00 }, /* ŗ to r */
1454 { 0x0158, 0x52, 0x00, 0x00, 0x00 }, /* Ř to R */
1455 { 0x0159, 0x72, 0x00, 0x00, 0x00 }, /* ř to r */
1456 { 0x015A, 0x53, 0x00, 0x00, 0x00 }, /* Ś to S */
1457 { 0x015B, 0x73, 0x00, 0x00, 0x00 }, /* ś to s */
1458 { 0x015C, 0x53, 0x68, 0x00, 0x00 }, /* Ŝ to Sh */
1459 { 0x015D, 0x73, 0x68, 0x00, 0x00 }, /* ŝ to sh */
1460 { 0x015E, 0x53, 0x00, 0x00, 0x00 }, /* Ş to S */
1461 { 0x015F, 0x73, 0x00, 0x00, 0x00 }, /* ş to s */
1462 { 0x0160, 0x53, 0x00, 0x00, 0x00 }, /* Š to S */
1463 { 0x0161, 0x73, 0x00, 0x00, 0x00 }, /* š to s */
1464 { 0x0162, 0x54, 0x00, 0x00, 0x00 }, /* Ţ to T */
1465 { 0x0163, 0x74, 0x00, 0x00, 0x00 }, /* ţ to t */
1466 { 0x0164, 0x54, 0x00, 0x00, 0x00 }, /* Ť to T */
1467 { 0x0165, 0x74, 0x00, 0x00, 0x00 }, /* ť to t */
1468 { 0x0166, 0x54, 0x00, 0x00, 0x00 }, /* Ŧ to T */
1469 { 0x0167, 0x74, 0x00, 0x00, 0x00 }, /* ŧ to t */
1470 { 0x0168, 0x55, 0x00, 0x00, 0x00 }, /* Ũ to U */
1471 { 0x0169, 0x75, 0x00, 0x00, 0x00 }, /* ũ to u */
1472 { 0x016A, 0x55, 0x00, 0x00, 0x00 }, /* Ū to U */
1473 { 0x016B, 0x75, 0x00, 0x00, 0x00 }, /* ū to u */
1474 { 0x016C, 0x55, 0x00, 0x00, 0x00 }, /* Ŭ to U */
1475 { 0x016D, 0x75, 0x00, 0x00, 0x00 }, /* ŭ to u */
1476 { 0x016E, 0x55, 0x00, 0x00, 0x00 }, /* Ů to U */
1477 { 0x016F, 0x75, 0x00, 0x00, 0x00 }, /* ů to u */
1478 { 0x0170, 0x55, 0x00, 0x00, 0x00 }, /* Ű to U */
1479 { 0x0171, 0x75, 0x00, 0x00, 0x00 }, /* ű to u */
1480 { 0x0172, 0x55, 0x00, 0x00, 0x00 }, /* Ų to U */
1481 { 0x0173, 0x75, 0x00, 0x00, 0x00 }, /* ų to u */
1482 { 0x0174, 0x57, 0x00, 0x00, 0x00 }, /* Ŵ to W */
1483 { 0x0175, 0x77, 0x00, 0x00, 0x00 }, /* ŵ to w */
1484 { 0x0176, 0x59, 0x00, 0x00, 0x00 }, /* Ŷ to Y */
1485 { 0x0177, 0x79, 0x00, 0x00, 0x00 }, /* ŷ to y */
1486 { 0x0178, 0x59, 0x00, 0x00, 0x00 }, /* Ÿ to Y */
1487 { 0x0179, 0x5A, 0x00, 0x00, 0x00 }, /* Ź to Z */
1488 { 0x017A, 0x7A, 0x00, 0x00, 0x00 }, /* ź to z */
1489 { 0x017B, 0x5A, 0x00, 0x00, 0x00 }, /* Ż to Z */
1490 { 0x017C, 0x7A, 0x00, 0x00, 0x00 }, /* ż to z */
1491 { 0x017D, 0x5A, 0x00, 0x00, 0x00 }, /* Ž to Z */
1492 { 0x017E, 0x7A, 0x00, 0x00, 0x00 }, /* ž to z */
1493 { 0x017F, 0x73, 0x00, 0x00, 0x00 }, /* ſ to s */
1494 { 0x0192, 0x66, 0x00, 0x00, 0x00 }, /* ƒ to f */
1495 { 0x0218, 0x53, 0x00, 0x00, 0x00 }, /* Ș to S */
1496 { 0x0219, 0x73, 0x00, 0x00, 0x00 }, /* ș to s */
1497 { 0x021A, 0x54, 0x00, 0x00, 0x00 }, /* Ț to T */
1498 { 0x021B, 0x74, 0x00, 0x00, 0x00 }, /* ț to t */
1499 { 0x0386, 0x41, 0x00, 0x00, 0x00 }, /* Ά to A */
1500 { 0x0388, 0x45, 0x00, 0x00, 0x00 }, /* Έ to E */
1501 { 0x0389, 0x49, 0x00, 0x00, 0x00 }, /* Ή to I */
1502 { 0x038A, 0x49, 0x00, 0x00, 0x00 }, /* Ί to I */
1503 { 0x038C, 0x4f, 0x00, 0x00, 0x00 }, /* Ό to O */
1504 { 0x038E, 0x59, 0x00, 0x00, 0x00 }, /* Ύ to Y */
1505 { 0x038F, 0x4f, 0x00, 0x00, 0x00 }, /* Ώ to O */
1506 { 0x0390, 0x69, 0x00, 0x00, 0x00 }, /* ΐ to i */
1507 { 0x0391, 0x41, 0x00, 0x00, 0x00 }, /* Α to A */
1508 { 0x0392, 0x42, 0x00, 0x00, 0x00 }, /* Β to B */
1509 { 0x0393, 0x47, 0x00, 0x00, 0x00 }, /* Γ to G */
1510 { 0x0394, 0x44, 0x00, 0x00, 0x00 }, /* Δ to D */
1511 { 0x0395, 0x45, 0x00, 0x00, 0x00 }, /* Ε to E */
1512 { 0x0396, 0x5a, 0x00, 0x00, 0x00 }, /* Ζ to Z */
1513 { 0x0397, 0x49, 0x00, 0x00, 0x00 }, /* Η to I */
1514 { 0x0398, 0x54, 0x68, 0x00, 0x00 }, /* Θ to Th */
1515 { 0x0399, 0x49, 0x00, 0x00, 0x00 }, /* Ι to I */
1516 { 0x039A, 0x4b, 0x00, 0x00, 0x00 }, /* Κ to K */
1517 { 0x039B, 0x4c, 0x00, 0x00, 0x00 }, /* Λ to L */
1518 { 0x039C, 0x4d, 0x00, 0x00, 0x00 }, /* Μ to M */
1519 { 0x039D, 0x4e, 0x00, 0x00, 0x00 }, /* Ν to N */
1520 { 0x039E, 0x58, 0x00, 0x00, 0x00 }, /* Ξ to X */
1521 { 0x039F, 0x4f, 0x00, 0x00, 0x00 }, /* Ο to O */
1522 { 0x03A0, 0x50, 0x00, 0x00, 0x00 }, /* Π to P */
1523 { 0x03A1, 0x52, 0x00, 0x00, 0x00 }, /* Ρ to R */
1524 { 0x03A3, 0x53, 0x00, 0x00, 0x00 }, /* Σ to S */
1525 { 0x03A4, 0x54, 0x00, 0x00, 0x00 }, /* Τ to T */
1526 { 0x03A5, 0x59, 0x00, 0x00, 0x00 }, /* Υ to Y */
1527 { 0x03A6, 0x46, 0x00, 0x00, 0x00 }, /* Φ to F */
1528 { 0x03A7, 0x43, 0x68, 0x00, 0x00 }, /* Χ to Ch */
1529 { 0x03A8, 0x50, 0x73, 0x00, 0x00 }, /* Ψ to Ps */
1530 { 0x03A9, 0x4f, 0x00, 0x00, 0x00 }, /* Ω to O */
1531 { 0x03AA, 0x49, 0x00, 0x00, 0x00 }, /* Ϊ to I */
1532 { 0x03AB, 0x59, 0x00, 0x00, 0x00 }, /* Ϋ to Y */
1533 { 0x03AC, 0x61, 0x00, 0x00, 0x00 }, /* ά to a */
1534 { 0x03AD, 0x65, 0x00, 0x00, 0x00 }, /* έ to e */
1535 { 0x03AE, 0x69, 0x00, 0x00, 0x00 }, /* ή to i */
1536 { 0x03AF, 0x69, 0x00, 0x00, 0x00 }, /* ί to i */
1537 { 0x03B1, 0x61, 0x00, 0x00, 0x00 }, /* α to a */
1538 { 0x03B2, 0x62, 0x00, 0x00, 0x00 }, /* β to b */
1539 { 0x03B3, 0x67, 0x00, 0x00, 0x00 }, /* γ to g */
1540 { 0x03B4, 0x64, 0x00, 0x00, 0x00 }, /* δ to d */
1541 { 0x03B5, 0x65, 0x00, 0x00, 0x00 }, /* ε to e */
1542 { 0x03B6, 0x7a, 0x00, 0x00, 0x00 }, /* ζ to z */
1543 { 0x03B7, 0x69, 0x00, 0x00, 0x00 }, /* η to i */
1544 { 0x03B8, 0x74, 0x68, 0x00, 0x00 }, /* θ to th */
1545 { 0x03B9, 0x69, 0x00, 0x00, 0x00 }, /* ι to i */
1546 { 0x03BA, 0x6b, 0x00, 0x00, 0x00 }, /* κ to k */
1547 { 0x03BB, 0x6c, 0x00, 0x00, 0x00 }, /* λ to l */
1548 { 0x03BC, 0x6d, 0x00, 0x00, 0x00 }, /* μ to m */
1549 { 0x03BD, 0x6e, 0x00, 0x00, 0x00 }, /* ν to n */
1550 { 0x03BE, 0x78, 0x00, 0x00, 0x00 }, /* ξ to x */
1551 { 0x03BF, 0x6f, 0x00, 0x00, 0x00 }, /* ο to o */
1552 { 0x03C0, 0x70, 0x00, 0x00, 0x00 }, /* π to p */
1553 { 0x03C1, 0x72, 0x00, 0x00, 0x00 }, /* ρ to r */
1554 { 0x03C3, 0x73, 0x00, 0x00, 0x00 }, /* σ to s */
1555 { 0x03C4, 0x74, 0x00, 0x00, 0x00 }, /* τ to t */
1556 { 0x03C5, 0x79, 0x00, 0x00, 0x00 }, /* υ to y */
1557 { 0x03C6, 0x66, 0x00, 0x00, 0x00 }, /* φ to f */
1558 { 0x03C7, 0x63, 0x68, 0x00, 0x00 }, /* χ to ch */
1559 { 0x03C8, 0x70, 0x73, 0x00, 0x00 }, /* ψ to ps */
1560 { 0x03C9, 0x6f, 0x00, 0x00, 0x00 }, /* ω to o */
1561 { 0x03CA, 0x69, 0x00, 0x00, 0x00 }, /* ϊ to i */
1562 { 0x03CB, 0x79, 0x00, 0x00, 0x00 }, /* ϋ to y */
1563 { 0x03CC, 0x6f, 0x00, 0x00, 0x00 }, /* ό to o */
1564 { 0x03CD, 0x79, 0x00, 0x00, 0x00 }, /* ύ to y */
1565 { 0x03CE, 0x69, 0x00, 0x00, 0x00 }, /* ώ to i */
1566 { 0x0400, 0x45, 0x00, 0x00, 0x00 }, /* Ѐ to E */
1567 { 0x0401, 0x45, 0x00, 0x00, 0x00 }, /* Ё to E */
1568 { 0x0402, 0x44, 0x00, 0x00, 0x00 }, /* Ђ to D */
1569 { 0x0403, 0x47, 0x00, 0x00, 0x00 }, /* Ѓ to G */
1570 { 0x0404, 0x45, 0x00, 0x00, 0x00 }, /* Є to E */
1571 { 0x0405, 0x5a, 0x00, 0x00, 0x00 }, /* Ѕ to Z */
1572 { 0x0406, 0x49, 0x00, 0x00, 0x00 }, /* І to I */
1573 { 0x0407, 0x49, 0x00, 0x00, 0x00 }, /* Ї to I */
1574 { 0x0408, 0x4a, 0x00, 0x00, 0x00 }, /* Ј to J */
1575 { 0x0409, 0x49, 0x00, 0x00, 0x00 }, /* Љ to I */
1576 { 0x040A, 0x4e, 0x00, 0x00, 0x00 }, /* Њ to N */
1577 { 0x040B, 0x44, 0x00, 0x00, 0x00 }, /* Ћ to D */
1578 { 0x040C, 0x4b, 0x00, 0x00, 0x00 }, /* Ќ to K */
1579 { 0x040D, 0x49, 0x00, 0x00, 0x00 }, /* Ѝ to I */
1580 { 0x040E, 0x55, 0x00, 0x00, 0x00 }, /* Ў to U */
1581 { 0x040F, 0x44, 0x00, 0x00, 0x00 }, /* Џ to D */
1582 { 0x0410, 0x41, 0x00, 0x00, 0x00 }, /* А to A */
1583 { 0x0411, 0x42, 0x00, 0x00, 0x00 }, /* Б to B */
1584 { 0x0412, 0x56, 0x00, 0x00, 0x00 }, /* В to V */
1585 { 0x0413, 0x47, 0x00, 0x00, 0x00 }, /* Г to G */
1586 { 0x0414, 0x44, 0x00, 0x00, 0x00 }, /* Д to D */
1587 { 0x0415, 0x45, 0x00, 0x00, 0x00 }, /* Е to E */
1588 { 0x0416, 0x5a, 0x68, 0x00, 0x00 }, /* Ж to Zh */
1589 { 0x0417, 0x5a, 0x00, 0x00, 0x00 }, /* З to Z */
1590 { 0x0418, 0x49, 0x00, 0x00, 0x00 }, /* И to I */
1591 { 0x0419, 0x49, 0x00, 0x00, 0x00 }, /* Й to I */
1592 { 0x041A, 0x4b, 0x00, 0x00, 0x00 }, /* К to K */
1593 { 0x041B, 0x4c, 0x00, 0x00, 0x00 }, /* Л to L */
1594 { 0x041C, 0x4d, 0x00, 0x00, 0x00 }, /* М to M */
1595 { 0x041D, 0x4e, 0x00, 0x00, 0x00 }, /* Н to N */
1596 { 0x041E, 0x4f, 0x00, 0x00, 0x00 }, /* О to O */
1597 { 0x041F, 0x50, 0x00, 0x00, 0x00 }, /* П to P */
1598 { 0x0420, 0x52, 0x00, 0x00, 0x00 }, /* Р to R */
1599 { 0x0421, 0x53, 0x00, 0x00, 0x00 }, /* С to S */
1600 { 0x0422, 0x54, 0x00, 0x00, 0x00 }, /* Т to T */
1601 { 0x0423, 0x55, 0x00, 0x00, 0x00 }, /* У to U */
1602 { 0x0424, 0x46, 0x00, 0x00, 0x00 }, /* Ф to F */
1603 { 0x0425, 0x4b, 0x68, 0x00, 0x00 }, /* Х to Kh */
1604 { 0x0426, 0x54, 0x63, 0x00, 0x00 }, /* Ц to Tc */
1605 { 0x0427, 0x43, 0x68, 0x00, 0x00 }, /* Ч to Ch */
1606 { 0x0428, 0x53, 0x68, 0x00, 0x00 }, /* Ш to Sh */
1607 { 0x0429, 0x53, 0x68, 0x63, 0x68 }, /* Щ to Shch */
1608 { 0x042A, 0x61, 0x00, 0x00, 0x00 }, /* to A */
1609 { 0x042B, 0x59, 0x00, 0x00, 0x00 }, /* Ы to Y */
1610 { 0x042C, 0x59, 0x00, 0x00, 0x00 }, /* to Y */
1611 { 0x042D, 0x45, 0x00, 0x00, 0x00 }, /* Э to E */
1612 { 0x042E, 0x49, 0x75, 0x00, 0x00 }, /* Ю to Iu */
1613 { 0x042F, 0x49, 0x61, 0x00, 0x00 }, /* Я to Ia */
1614 { 0x0430, 0x61, 0x00, 0x00, 0x00 }, /* а to a */
1615 { 0x0431, 0x62, 0x00, 0x00, 0x00 }, /* б to b */
1616 { 0x0432, 0x76, 0x00, 0x00, 0x00 }, /* в to v */
1617 { 0x0433, 0x67, 0x00, 0x00, 0x00 }, /* г to g */
1618 { 0x0434, 0x64, 0x00, 0x00, 0x00 }, /* д to d */
1619 { 0x0435, 0x65, 0x00, 0x00, 0x00 }, /* е to e */
1620 { 0x0436, 0x7a, 0x68, 0x00, 0x00 }, /* ж to zh */
1621 { 0x0437, 0x7a, 0x00, 0x00, 0x00 }, /* з to z */
1622 { 0x0438, 0x69, 0x00, 0x00, 0x00 }, /* и to i */
1623 { 0x0439, 0x69, 0x00, 0x00, 0x00 }, /* й to i */
1624 { 0x043A, 0x6b, 0x00, 0x00, 0x00 }, /* к to k */
1625 { 0x043B, 0x6c, 0x00, 0x00, 0x00 }, /* л to l */
1626 { 0x043C, 0x6d, 0x00, 0x00, 0x00 }, /* м to m */
1627 { 0x043D, 0x6e, 0x00, 0x00, 0x00 }, /* н to n */
1628 { 0x043E, 0x6f, 0x00, 0x00, 0x00 }, /* о to o */
1629 { 0x043F, 0x70, 0x00, 0x00, 0x00 }, /* п to p */
1630 { 0x0440, 0x72, 0x00, 0x00, 0x00 }, /* р to r */
1631 { 0x0441, 0x73, 0x00, 0x00, 0x00 }, /* с to s */
1632 { 0x0442, 0x74, 0x00, 0x00, 0x00 }, /* т to t */
1633 { 0x0443, 0x75, 0x00, 0x00, 0x00 }, /* у to u */
1634 { 0x0444, 0x66, 0x00, 0x00, 0x00 }, /* ф to f */
1635 { 0x0445, 0x6b, 0x68, 0x00, 0x00 }, /* х to kh */
1636 { 0x0446, 0x74, 0x63, 0x00, 0x00 }, /* ц to tc */
1637 { 0x0447, 0x63, 0x68, 0x00, 0x00 }, /* ч to ch */
1638 { 0x0448, 0x73, 0x68, 0x00, 0x00 }, /* ш to sh */
1639 { 0x0449, 0x73, 0x68, 0x63, 0x68 }, /* щ to shch */
1640 { 0x044A, 0x61, 0x00, 0x00, 0x00 }, /* to a */
1641 { 0x044B, 0x79, 0x00, 0x00, 0x00 }, /* ы to y */
1642 { 0x044C, 0x79, 0x00, 0x00, 0x00 }, /* to y */
1643 { 0x044D, 0x65, 0x00, 0x00, 0x00 }, /* э to e */
1644 { 0x044E, 0x69, 0x75, 0x00, 0x00 }, /* ю to iu */
1645 { 0x044F, 0x69, 0x61, 0x00, 0x00 }, /* я to ia */
1646 { 0x0450, 0x65, 0x00, 0x00, 0x00 }, /* ѐ to e */
1647 { 0x0451, 0x65, 0x00, 0x00, 0x00 }, /* ё to e */
1648 { 0x0452, 0x64, 0x00, 0x00, 0x00 }, /* ђ to d */
1649 { 0x0453, 0x67, 0x00, 0x00, 0x00 }, /* ѓ to g */
1650 { 0x0454, 0x65, 0x00, 0x00, 0x00 }, /* є to e */
1651 { 0x0455, 0x7a, 0x00, 0x00, 0x00 }, /* ѕ to z */
1652 { 0x0456, 0x69, 0x00, 0x00, 0x00 }, /* і to i */
1653 { 0x0457, 0x69, 0x00, 0x00, 0x00 }, /* ї to i */
1654 { 0x0458, 0x6a, 0x00, 0x00, 0x00 }, /* ј to j */
1655 { 0x0459, 0x69, 0x00, 0x00, 0x00 }, /* љ to i */
1656 { 0x045A, 0x6e, 0x00, 0x00, 0x00 }, /* њ to n */
1657 { 0x045B, 0x64, 0x00, 0x00, 0x00 }, /* ћ to d */
1658 { 0x045C, 0x6b, 0x00, 0x00, 0x00 }, /* ќ to k */
1659 { 0x045D, 0x69, 0x00, 0x00, 0x00 }, /* ѝ to i */
1660 { 0x045E, 0x75, 0x00, 0x00, 0x00 }, /* ў to u */
1661 { 0x045F, 0x64, 0x00, 0x00, 0x00 }, /* џ to d */
1662 { 0x1E02, 0x42, 0x00, 0x00, 0x00 }, /* Ḃ to B */
1663 { 0x1E03, 0x62, 0x00, 0x00, 0x00 }, /* ḃ to b */
1664 { 0x1E0A, 0x44, 0x00, 0x00, 0x00 }, /* Ḋ to D */
1665 { 0x1E0B, 0x64, 0x00, 0x00, 0x00 }, /* ḋ to d */
1666 { 0x1E1E, 0x46, 0x00, 0x00, 0x00 }, /* Ḟ to F */
1667 { 0x1E1F, 0x66, 0x00, 0x00, 0x00 }, /* ḟ to f */
1668 { 0x1E40, 0x4D, 0x00, 0x00, 0x00 }, /* Ṁ to M */
1669 { 0x1E41, 0x6D, 0x00, 0x00, 0x00 }, /* ṁ to m */
1670 { 0x1E56, 0x50, 0x00, 0x00, 0x00 }, /* Ṗ to P */
1671 { 0x1E57, 0x70, 0x00, 0x00, 0x00 }, /* ṗ to p */
1672 { 0x1E60, 0x53, 0x00, 0x00, 0x00 }, /* Ṡ to S */
1673 { 0x1E61, 0x73, 0x00, 0x00, 0x00 }, /* ṡ to s */
1674 { 0x1E6A, 0x54, 0x00, 0x00, 0x00 }, /* Ṫ to T */
1675 { 0x1E6B, 0x74, 0x00, 0x00, 0x00 }, /* ṫ to t */
1676 { 0x1E80, 0x57, 0x00, 0x00, 0x00 }, /* Ẁ to W */
1677 { 0x1E81, 0x77, 0x00, 0x00, 0x00 }, /* ẁ to w */
1678 { 0x1E82, 0x57, 0x00, 0x00, 0x00 }, /* Ẃ to W */
1679 { 0x1E83, 0x77, 0x00, 0x00, 0x00 }, /* ẃ to w */
1680 { 0x1E84, 0x57, 0x00, 0x00, 0x00 }, /* Ẅ to W */
1681 { 0x1E85, 0x77, 0x00, 0x00, 0x00 }, /* ẅ to w */
1682 { 0x1EF2, 0x59, 0x00, 0x00, 0x00 }, /* Ỳ to Y */
1683 { 0x1EF3, 0x79, 0x00, 0x00, 0x00 }, /* ỳ to y */
1684 { 0xFB00, 0x66, 0x66, 0x00, 0x00 }, /* ff to ff */
1685 { 0xFB01, 0x66, 0x69, 0x00, 0x00 }, /* fi to fi */
1686 { 0xFB02, 0x66, 0x6C, 0x00, 0x00 }, /* fl to fl */
1687 { 0xFB05, 0x73, 0x74, 0x00, 0x00 }, /* ſt to st */
1688 { 0xFB06, 0x73, 0x74, 0x00, 0x00 }, /* st to st */
1691 static const Transliteration *spellfixFindTranslit(int c, int *pxTop){
1692 *pxTop = (sizeof(translit)/sizeof(translit[0])) - 1;
1693 return translit;
1697 ** Convert the input string from UTF-8 into pure ASCII by converting
1698 ** all non-ASCII characters to some combination of characters in the
1699 ** ASCII subset.
1701 ** The returned string might contain more characters than the input.
1703 ** Space to hold the returned string comes from sqlite3_malloc() and
1704 ** should be freed by the caller.
1706 static unsigned char *transliterate(const unsigned char *zIn, int nIn){
1707 unsigned char *zOut = sqlite3_malloc64( nIn*4 + 1 );
1708 int c, sz, nOut;
1709 if( zOut==0 ) return 0;
1710 nOut = 0;
1711 while( nIn>0 ){
1712 c = utf8Read(zIn, nIn, &sz);
1713 zIn += sz;
1714 nIn -= sz;
1715 if( c<=127 ){
1716 zOut[nOut++] = (unsigned char)c;
1717 }else{
1718 int xTop, xBtm, x;
1719 const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1720 xBtm = 0;
1721 while( xTop>=xBtm ){
1722 x = (xTop + xBtm)/2;
1723 if( tbl[x].cFrom==c ){
1724 zOut[nOut++] = tbl[x].cTo0;
1725 if( tbl[x].cTo1 ){
1726 zOut[nOut++] = tbl[x].cTo1;
1727 if( tbl[x].cTo2 ){
1728 zOut[nOut++] = tbl[x].cTo2;
1729 if( tbl[x].cTo3 ){
1730 zOut[nOut++] = tbl[x].cTo3;
1734 c = 0;
1735 break;
1736 }else if( tbl[x].cFrom>c ){
1737 xTop = x-1;
1738 }else{
1739 xBtm = x+1;
1742 if( c ) zOut[nOut++] = '?';
1745 zOut[nOut] = 0;
1746 return zOut;
1750 ** Return the number of characters in the shortest prefix of the input
1751 ** string that transliterates to an ASCII string nTrans bytes or longer.
1752 ** Or, if the transliteration of the input string is less than nTrans
1753 ** bytes in size, return the number of characters in the input string.
1755 static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
1756 int i, c, sz, nOut;
1757 int nChar;
1759 i = nOut = 0;
1760 for(nChar=0; i<nIn && nOut<nTrans; nChar++){
1761 c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
1762 i += sz;
1764 nOut++;
1765 if( c>=128 ){
1766 int xTop, xBtm, x;
1767 const Transliteration *tbl = spellfixFindTranslit(c, &xTop);
1768 xBtm = 0;
1769 while( xTop>=xBtm ){
1770 x = (xTop + xBtm)/2;
1771 if( tbl[x].cFrom==c ){
1772 if( tbl[x].cTo1 ){
1773 nOut++;
1774 if( tbl[x].cTo2 ){
1775 nOut++;
1776 if( tbl[x].cTo3 ){
1777 nOut++;
1781 break;
1782 }else if( tbl[x].cFrom>c ){
1783 xTop = x-1;
1784 }else{
1785 xBtm = x+1;
1791 return nChar;
1796 ** spellfix1_translit(X)
1798 ** Convert a string that contains non-ASCII Roman characters into
1799 ** pure ASCII.
1801 static void transliterateSqlFunc(
1802 sqlite3_context *context,
1803 int argc,
1804 sqlite3_value **argv
1806 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1807 int nIn = sqlite3_value_bytes(argv[0]);
1808 unsigned char *zOut = transliterate(zIn, nIn);
1809 if( zOut==0 ){
1810 sqlite3_result_error_nomem(context);
1811 }else{
1812 sqlite3_result_text(context, (char*)zOut, -1, sqlite3_free);
1817 ** spellfix1_scriptcode(X)
1819 ** Try to determine the dominant script used by the word X and return
1820 ** its ISO 15924 numeric code.
1822 ** The current implementation only understands the following scripts:
1824 ** 215 (Latin)
1825 ** 220 (Cyrillic)
1826 ** 200 (Greek)
1828 ** This routine will return 998 if the input X contains characters from
1829 ** two or more of the above scripts or 999 if X contains no characters
1830 ** from any of the above scripts.
1832 static void scriptCodeSqlFunc(
1833 sqlite3_context *context,
1834 int argc,
1835 sqlite3_value **argv
1837 const unsigned char *zIn = sqlite3_value_text(argv[0]);
1838 int nIn = sqlite3_value_bytes(argv[0]);
1839 int c, sz;
1840 int scriptMask = 0;
1841 int res;
1842 int seenDigit = 0;
1843 # define SCRIPT_LATIN 0x0001
1844 # define SCRIPT_CYRILLIC 0x0002
1845 # define SCRIPT_GREEK 0x0004
1846 # define SCRIPT_HEBREW 0x0008
1847 # define SCRIPT_ARABIC 0x0010
1849 while( nIn>0 ){
1850 c = utf8Read(zIn, nIn, &sz);
1851 zIn += sz;
1852 nIn -= sz;
1853 if( c<0x02af ){
1854 if( c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT ){
1855 scriptMask |= SCRIPT_LATIN;
1856 }else if( c>='0' && c<='9' ){
1857 seenDigit = 1;
1859 }else if( c>=0x0400 && c<=0x04ff ){
1860 scriptMask |= SCRIPT_CYRILLIC;
1861 }else if( c>=0x0386 && c<=0x03ce ){
1862 scriptMask |= SCRIPT_GREEK;
1863 }else if( c>=0x0590 && c<=0x05ff ){
1864 scriptMask |= SCRIPT_HEBREW;
1865 }else if( c>=0x0600 && c<=0x06ff ){
1866 scriptMask |= SCRIPT_ARABIC;
1869 if( scriptMask==0 && seenDigit ) scriptMask = SCRIPT_LATIN;
1870 switch( scriptMask ){
1871 case 0: res = 999; break;
1872 case SCRIPT_LATIN: res = 215; break;
1873 case SCRIPT_CYRILLIC: res = 220; break;
1874 case SCRIPT_GREEK: res = 200; break;
1875 case SCRIPT_HEBREW: res = 125; break;
1876 case SCRIPT_ARABIC: res = 160; break;
1877 default: res = 998; break;
1879 sqlite3_result_int(context, res);
1882 /* End transliterate
1883 ******************************************************************************
1884 ******************************************************************************
1885 ** Begin spellfix1 virtual table.
1888 /* Maximum length of a phonehash used for querying the shadow table */
1889 #define SPELLFIX_MX_HASH 32
1891 /* Maximum number of hash strings to examine per query */
1892 #define SPELLFIX_MX_RUN 1
1894 typedef struct spellfix1_vtab spellfix1_vtab;
1895 typedef struct spellfix1_cursor spellfix1_cursor;
1897 /* Fuzzy-search virtual table object */
1898 struct spellfix1_vtab {
1899 sqlite3_vtab base; /* Base class - must be first */
1900 sqlite3 *db; /* Database connection */
1901 char *zDbName; /* Name of database holding this table */
1902 char *zTableName; /* Name of the virtual table */
1903 char *zCostTable; /* Table holding edit-distance cost numbers */
1904 EditDist3Config *pConfig3; /* Parsed edit distance costs */
1907 /* Fuzzy-search cursor object */
1908 struct spellfix1_cursor {
1909 sqlite3_vtab_cursor base; /* Base class - must be first */
1910 spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
1911 char *zPattern; /* rhs of MATCH clause */
1912 int idxNum; /* idxNum value passed to xFilter() */
1913 int nRow; /* Number of rows of content */
1914 int nAlloc; /* Number of allocated rows */
1915 int iRow; /* Current row of content */
1916 int iLang; /* Value of the langid= constraint */
1917 int iTop; /* Value of the top= constraint */
1918 int iScope; /* Value of the scope= constraint */
1919 int nSearch; /* Number of vocabulary items checked */
1920 sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */
1921 struct spellfix1_row { /* For each row of content */
1922 sqlite3_int64 iRowid; /* Rowid for this row */
1923 char *zWord; /* Text for this row */
1924 int iRank; /* Rank for this row */
1925 int iDistance; /* Distance from pattern for this row */
1926 int iScore; /* Score for sorting */
1927 int iMatchlen; /* Value of matchlen column (or -1) */
1928 char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
1929 } *a;
1933 ** Construct one or more SQL statements from the format string given
1934 ** and then evaluate those statements. The success code is written
1935 ** into *pRc.
1937 ** If *pRc is initially non-zero then this routine is a no-op.
1939 static void spellfix1DbExec(
1940 int *pRc, /* Success code */
1941 sqlite3 *db, /* Database in which to run SQL */
1942 const char *zFormat, /* Format string for SQL */
1943 ... /* Arguments to the format string */
1945 va_list ap;
1946 char *zSql;
1947 if( *pRc ) return;
1948 va_start(ap, zFormat);
1949 zSql = sqlite3_vmprintf(zFormat, ap);
1950 va_end(ap);
1951 if( zSql==0 ){
1952 *pRc = SQLITE_NOMEM;
1953 }else{
1954 *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
1955 sqlite3_free(zSql);
1960 ** xDisconnect/xDestroy method for the fuzzy-search module.
1962 static int spellfix1Uninit(int isDestroy, sqlite3_vtab *pVTab){
1963 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
1964 int rc = SQLITE_OK;
1965 if( isDestroy ){
1966 sqlite3 *db = p->db;
1967 spellfix1DbExec(&rc, db, "DROP TABLE IF EXISTS \"%w\".\"%w_vocab\"",
1968 p->zDbName, p->zTableName);
1970 if( rc==SQLITE_OK ){
1971 sqlite3_free(p->zTableName);
1972 editDist3ConfigDelete(p->pConfig3);
1973 sqlite3_free(p->zCostTable);
1974 sqlite3_free(p);
1976 return rc;
1978 static int spellfix1Disconnect(sqlite3_vtab *pVTab){
1979 return spellfix1Uninit(0, pVTab);
1981 static int spellfix1Destroy(sqlite3_vtab *pVTab){
1982 return spellfix1Uninit(1, pVTab);
1986 ** Make a copy of a string. Remove leading and trailing whitespace
1987 ** and dequote it.
1989 static char *spellfix1Dequote(const char *zIn){
1990 char *zOut;
1991 int i, j;
1992 char c;
1993 while( isspace((unsigned char)zIn[0]) ) zIn++;
1994 zOut = sqlite3_mprintf("%s", zIn);
1995 if( zOut==0 ) return 0;
1996 i = (int)strlen(zOut);
1997 #if 0 /* The parser will never leave spaces at the end */
1998 while( i>0 && isspace(zOut[i-1]) ){ i--; }
1999 #endif
2000 zOut[i] = 0;
2001 c = zOut[0];
2002 if( c=='\'' || c=='"' ){
2003 for(i=1, j=0; ALWAYS(zOut[i]); i++){
2004 zOut[j++] = zOut[i];
2005 if( zOut[i]==c ){
2006 if( zOut[i+1]==c ){
2007 i++;
2008 }else{
2009 zOut[j-1] = 0;
2010 break;
2015 return zOut;
2020 ** xConnect/xCreate method for the spellfix1 module. Arguments are:
2022 ** argv[0] -> module name ("spellfix1")
2023 ** argv[1] -> database name
2024 ** argv[2] -> table name
2025 ** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
2027 static int spellfix1Init(
2028 int isCreate,
2029 sqlite3 *db,
2030 void *pAux,
2031 int argc, const char *const*argv,
2032 sqlite3_vtab **ppVTab,
2033 char **pzErr
2035 spellfix1_vtab *pNew = 0;
2036 /* const char *zModule = argv[0]; // not used */
2037 const char *zDbName = argv[1];
2038 const char *zTableName = argv[2];
2039 int nDbName;
2040 int rc = SQLITE_OK;
2041 int i;
2043 nDbName = (int)strlen(zDbName);
2044 pNew = sqlite3_malloc64( sizeof(*pNew) + nDbName + 1);
2045 if( pNew==0 ){
2046 rc = SQLITE_NOMEM;
2047 }else{
2048 memset(pNew, 0, sizeof(*pNew));
2049 pNew->zDbName = (char*)&pNew[1];
2050 memcpy(pNew->zDbName, zDbName, nDbName+1);
2051 pNew->zTableName = sqlite3_mprintf("%s", zTableName);
2052 pNew->db = db;
2053 if( pNew->zTableName==0 ){
2054 rc = SQLITE_NOMEM;
2055 }else{
2056 rc = sqlite3_declare_vtab(db,
2057 "CREATE TABLE x(word,rank,distance,langid, "
2058 "score, matchlen, phonehash HIDDEN, "
2059 "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
2060 "soundslike HIDDEN, command HIDDEN)"
2062 #define SPELLFIX_COL_WORD 0
2063 #define SPELLFIX_COL_RANK 1
2064 #define SPELLFIX_COL_DISTANCE 2
2065 #define SPELLFIX_COL_LANGID 3
2066 #define SPELLFIX_COL_SCORE 4
2067 #define SPELLFIX_COL_MATCHLEN 5
2068 #define SPELLFIX_COL_PHONEHASH 6
2069 #define SPELLFIX_COL_TOP 7
2070 #define SPELLFIX_COL_SCOPE 8
2071 #define SPELLFIX_COL_SRCHCNT 9
2072 #define SPELLFIX_COL_SOUNDSLIKE 10
2073 #define SPELLFIX_COL_COMMAND 11
2075 if( rc==SQLITE_OK && isCreate ){
2076 spellfix1DbExec(&rc, db,
2077 "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
2078 " id INTEGER PRIMARY KEY,\n"
2079 " rank INT,\n"
2080 " langid INT,\n"
2081 " word TEXT,\n"
2082 " k1 TEXT,\n"
2083 " k2 TEXT\n"
2084 ");\n",
2085 zDbName, zTableName
2087 spellfix1DbExec(&rc, db,
2088 "CREATE INDEX IF NOT EXISTS \"%w\".\"%w_vocab_index_langid_k2\" "
2089 "ON \"%w_vocab\"(langid,k2);",
2090 zDbName, zTableName, zTableName
2093 for(i=3; rc==SQLITE_OK && i<argc; i++){
2094 if( strncmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
2095 pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
2096 if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
2097 continue;
2099 *pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
2100 rc = SQLITE_ERROR;
2104 if( rc && pNew ){
2105 *ppVTab = 0;
2106 spellfix1Uninit(0, &pNew->base);
2107 }else{
2108 *ppVTab = (sqlite3_vtab *)pNew;
2110 return rc;
2114 ** The xConnect and xCreate methods
2116 static int spellfix1Connect(
2117 sqlite3 *db,
2118 void *pAux,
2119 int argc, const char *const*argv,
2120 sqlite3_vtab **ppVTab,
2121 char **pzErr
2123 return spellfix1Init(0, db, pAux, argc, argv, ppVTab, pzErr);
2125 static int spellfix1Create(
2126 sqlite3 *db,
2127 void *pAux,
2128 int argc, const char *const*argv,
2129 sqlite3_vtab **ppVTab,
2130 char **pzErr
2132 return spellfix1Init(1, db, pAux, argc, argv, ppVTab, pzErr);
2136 ** Clear all of the content from a cursor.
2138 static void spellfix1ResetCursor(spellfix1_cursor *pCur){
2139 int i;
2140 for(i=0; i<pCur->nRow; i++){
2141 sqlite3_free(pCur->a[i].zWord);
2143 pCur->nRow = 0;
2144 pCur->iRow = 0;
2145 pCur->nSearch = 0;
2146 if( pCur->pFullScan ){
2147 sqlite3_finalize(pCur->pFullScan);
2148 pCur->pFullScan = 0;
2153 ** Resize the cursor to hold up to N rows of content
2155 static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
2156 struct spellfix1_row *aNew;
2157 assert( N>=pCur->nRow );
2158 aNew = sqlite3_realloc64(pCur->a, sizeof(pCur->a[0])*N);
2159 if( aNew==0 && N>0 ){
2160 spellfix1ResetCursor(pCur);
2161 sqlite3_free(pCur->a);
2162 pCur->nAlloc = 0;
2163 pCur->a = 0;
2164 }else{
2165 pCur->nAlloc = N;
2166 pCur->a = aNew;
2172 ** Close a fuzzy-search cursor.
2174 static int spellfix1Close(sqlite3_vtab_cursor *cur){
2175 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2176 spellfix1ResetCursor(pCur);
2177 spellfix1ResizeCursor(pCur, 0);
2178 sqlite3_free(pCur->zPattern);
2179 sqlite3_free(pCur);
2180 return SQLITE_OK;
2183 #define SPELLFIX_IDXNUM_MATCH 0x01 /* word MATCH $str */
2184 #define SPELLFIX_IDXNUM_LANGID 0x02 /* langid == $langid */
2185 #define SPELLFIX_IDXNUM_TOP 0x04 /* top = $top */
2186 #define SPELLFIX_IDXNUM_SCOPE 0x08 /* scope = $scope */
2187 #define SPELLFIX_IDXNUM_DISTLT 0x10 /* distance < $distance */
2188 #define SPELLFIX_IDXNUM_DISTLE 0x20 /* distance <= $distance */
2189 #define SPELLFIX_IDXNUM_ROWID 0x40 /* rowid = $rowid */
2190 #define SPELLFIX_IDXNUM_DIST (0x10|0x20) /* DISTLT and DISTLE */
2194 ** The plan number is a bitmask of the SPELLFIX_IDXNUM_* values defined
2195 ** above.
2197 ** filter.argv[*] values contains $str, $langid, $top, $scope and $rowid
2198 ** if specified and in that order.
2200 static int spellfix1BestIndex(sqlite3_vtab *tab, sqlite3_index_info *pIdxInfo){
2201 int iPlan = 0;
2202 int iLangTerm = -1;
2203 int iTopTerm = -1;
2204 int iScopeTerm = -1;
2205 int iDistTerm = -1;
2206 int iRowidTerm = -1;
2207 int i;
2208 const struct sqlite3_index_constraint *pConstraint;
2209 pConstraint = pIdxInfo->aConstraint;
2210 for(i=0; i<pIdxInfo->nConstraint; i++, pConstraint++){
2211 if( pConstraint->usable==0 ) continue;
2213 /* Terms of the form: word MATCH $str */
2214 if( (iPlan & SPELLFIX_IDXNUM_MATCH)==0
2215 && pConstraint->iColumn==SPELLFIX_COL_WORD
2216 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH
2218 iPlan |= SPELLFIX_IDXNUM_MATCH;
2219 pIdxInfo->aConstraintUsage[i].argvIndex = 1;
2220 pIdxInfo->aConstraintUsage[i].omit = 1;
2223 /* Terms of the form: langid = $langid */
2224 if( (iPlan & SPELLFIX_IDXNUM_LANGID)==0
2225 && pConstraint->iColumn==SPELLFIX_COL_LANGID
2226 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2228 iPlan |= SPELLFIX_IDXNUM_LANGID;
2229 iLangTerm = i;
2232 /* Terms of the form: top = $top */
2233 if( (iPlan & SPELLFIX_IDXNUM_TOP)==0
2234 && pConstraint->iColumn==SPELLFIX_COL_TOP
2235 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2237 iPlan |= SPELLFIX_IDXNUM_TOP;
2238 iTopTerm = i;
2241 /* Terms of the form: scope = $scope */
2242 if( (iPlan & SPELLFIX_IDXNUM_SCOPE)==0
2243 && pConstraint->iColumn==SPELLFIX_COL_SCOPE
2244 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2246 iPlan |= SPELLFIX_IDXNUM_SCOPE;
2247 iScopeTerm = i;
2250 /* Terms of the form: distance < $dist or distance <= $dist */
2251 if( (iPlan & SPELLFIX_IDXNUM_DIST)==0
2252 && pConstraint->iColumn==SPELLFIX_COL_DISTANCE
2253 && (pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT
2254 || pConstraint->op==SQLITE_INDEX_CONSTRAINT_LE)
2256 if( pConstraint->op==SQLITE_INDEX_CONSTRAINT_LT ){
2257 iPlan |= SPELLFIX_IDXNUM_DISTLT;
2258 }else{
2259 iPlan |= SPELLFIX_IDXNUM_DISTLE;
2261 iDistTerm = i;
2264 /* Terms of the form: distance < $dist or distance <= $dist */
2265 if( (iPlan & SPELLFIX_IDXNUM_ROWID)==0
2266 && pConstraint->iColumn<0
2267 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ
2269 iPlan |= SPELLFIX_IDXNUM_ROWID;
2270 iRowidTerm = i;
2273 if( iPlan&SPELLFIX_IDXNUM_MATCH ){
2274 int idx = 2;
2275 pIdxInfo->idxNum = iPlan;
2276 if( pIdxInfo->nOrderBy==1
2277 && pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
2278 && pIdxInfo->aOrderBy[0].desc==0
2280 pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
2282 if( iPlan&SPELLFIX_IDXNUM_LANGID ){
2283 pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
2284 pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
2286 if( iPlan&SPELLFIX_IDXNUM_TOP ){
2287 pIdxInfo->aConstraintUsage[iTopTerm].argvIndex = idx++;
2288 pIdxInfo->aConstraintUsage[iTopTerm].omit = 1;
2290 if( iPlan&SPELLFIX_IDXNUM_SCOPE ){
2291 pIdxInfo->aConstraintUsage[iScopeTerm].argvIndex = idx++;
2292 pIdxInfo->aConstraintUsage[iScopeTerm].omit = 1;
2294 if( iPlan&SPELLFIX_IDXNUM_DIST ){
2295 pIdxInfo->aConstraintUsage[iDistTerm].argvIndex = idx++;
2296 pIdxInfo->aConstraintUsage[iDistTerm].omit = 1;
2298 pIdxInfo->estimatedCost = 1e5;
2299 }else if( (iPlan & SPELLFIX_IDXNUM_ROWID) ){
2300 pIdxInfo->idxNum = SPELLFIX_IDXNUM_ROWID;
2301 pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
2302 pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
2303 pIdxInfo->estimatedCost = 5;
2304 }else{
2305 pIdxInfo->idxNum = 0;
2306 pIdxInfo->estimatedCost = 1e50;
2308 return SQLITE_OK;
2312 ** Open a new fuzzy-search cursor.
2314 static int spellfix1Open(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
2315 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2316 spellfix1_cursor *pCur;
2317 pCur = sqlite3_malloc64( sizeof(*pCur) );
2318 if( pCur==0 ) return SQLITE_NOMEM;
2319 memset(pCur, 0, sizeof(*pCur));
2320 pCur->pVTab = p;
2321 *ppCursor = &pCur->base;
2322 return SQLITE_OK;
2326 ** Adjust a distance measurement by the words rank in order to show
2327 ** preference to common words.
2329 static int spellfix1Score(int iDistance, int iRank){
2330 int iLog2;
2331 for(iLog2=0; iRank>0; iLog2++, iRank>>=1){}
2332 return iDistance + 32 - iLog2;
2336 ** Compare two spellfix1_row objects for sorting purposes in qsort() such
2337 ** that they sort in order of increasing distance.
2339 static int SQLITE_CDECL spellfix1RowCompare(const void *A, const void *B){
2340 const struct spellfix1_row *a = (const struct spellfix1_row*)A;
2341 const struct spellfix1_row *b = (const struct spellfix1_row*)B;
2342 return a->iScore - b->iScore;
2346 ** A structure used to pass information from spellfix1FilterForMatch()
2347 ** into spellfix1RunQuery().
2349 typedef struct MatchQuery {
2350 spellfix1_cursor *pCur; /* The cursor being queried */
2351 sqlite3_stmt *pStmt; /* shadow table query statment */
2352 char zHash[SPELLFIX_MX_HASH]; /* The current phonehash for zPattern */
2353 const char *zPattern; /* Transliterated input string */
2354 int nPattern; /* Length of zPattern */
2355 EditDist3FromString *pMatchStr3; /* Original unicode string */
2356 EditDist3Config *pConfig3; /* Edit-distance cost coefficients */
2357 const EditDist3Lang *pLang; /* The selected language coefficients */
2358 int iLang; /* The language id */
2359 int iScope; /* Default scope */
2360 int iMaxDist; /* Maximum allowed edit distance, or -1 */
2361 int rc; /* Error code */
2362 int nRun; /* Number of prior runs for the same zPattern */
2363 char azPrior[SPELLFIX_MX_RUN][SPELLFIX_MX_HASH]; /* Prior hashes */
2364 } MatchQuery;
2367 ** Run a query looking for the best matches against zPattern using
2368 ** zHash as the character class seed hash.
2370 static void spellfix1RunQuery(MatchQuery *p, const char *zQuery, int nQuery){
2371 const char *zK1;
2372 const char *zWord;
2373 int iDist;
2374 int iRank;
2375 int iScore;
2376 int iWorst = 0;
2377 int idx;
2378 int idxWorst = -1;
2379 int i;
2380 int iScope = p->iScope;
2381 spellfix1_cursor *pCur = p->pCur;
2382 sqlite3_stmt *pStmt = p->pStmt;
2383 char zHash1[SPELLFIX_MX_HASH];
2384 char zHash2[SPELLFIX_MX_HASH];
2385 char *zClass;
2386 int nClass;
2387 int rc;
2389 if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */
2390 zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
2391 if( zClass==0 ){
2392 p->rc = SQLITE_NOMEM;
2393 return;
2395 nClass = (int)strlen(zClass);
2396 if( nClass>SPELLFIX_MX_HASH-2 ){
2397 nClass = SPELLFIX_MX_HASH-2;
2398 zClass[nClass] = 0;
2400 if( nClass<=iScope ){
2401 if( nClass>2 ){
2402 iScope = nClass-1;
2403 }else{
2404 iScope = nClass;
2407 memcpy(zHash1, zClass, iScope);
2408 sqlite3_free(zClass);
2409 zHash1[iScope] = 0;
2410 memcpy(zHash2, zHash1, iScope);
2411 zHash2[iScope] = 'Z';
2412 zHash2[iScope+1] = 0;
2413 #if SPELLFIX_MX_RUN>1
2414 for(i=0; i<p->nRun; i++){
2415 if( strcmp(p->azPrior[i], zHash1)==0 ) return;
2417 #endif
2418 assert( p->nRun<SPELLFIX_MX_RUN );
2419 memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
2420 if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
2421 || sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
2423 p->rc = SQLITE_NOMEM;
2424 return;
2426 #if SPELLFIX_MX_RUN>1
2427 for(i=0; i<pCur->nRow; i++){
2428 if( pCur->a[i].iScore>iWorst ){
2429 iWorst = pCur->a[i].iScore;
2430 idxWorst = i;
2433 #endif
2434 while( sqlite3_step(pStmt)==SQLITE_ROW ){
2435 int iMatchlen = -1;
2436 iRank = sqlite3_column_int(pStmt, 2);
2437 if( p->pMatchStr3 ){
2438 int nWord = sqlite3_column_bytes(pStmt, 1);
2439 zWord = (const char*)sqlite3_column_text(pStmt, 1);
2440 iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
2441 }else{
2442 zK1 = (const char*)sqlite3_column_text(pStmt, 3);
2443 if( zK1==0 ) continue;
2444 iDist = editdist1(p->zPattern, zK1, 0);
2446 if( iDist<0 ){
2447 p->rc = SQLITE_NOMEM;
2448 break;
2450 pCur->nSearch++;
2452 /* If there is a "distance < $dist" or "distance <= $dist" constraint,
2453 ** check if this row meets it. If not, jump back up to the top of the
2454 ** loop to process the next row. Otherwise, if the row does match the
2455 ** distance constraint, check if the pCur->a[] array is already full.
2456 ** If it is and no explicit "top = ?" constraint was present in the
2457 ** query, grow the array to ensure there is room for the new entry. */
2458 assert( (p->iMaxDist>=0)==((pCur->idxNum & SPELLFIX_IDXNUM_DIST) ? 1 : 0) );
2459 if( p->iMaxDist>=0 ){
2460 if( iDist>p->iMaxDist ) continue;
2461 if( pCur->nRow>=pCur->nAlloc && (pCur->idxNum & SPELLFIX_IDXNUM_TOP)==0 ){
2462 spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
2463 if( pCur->a==0 ) break;
2467 iScore = spellfix1Score(iDist,iRank);
2468 if( pCur->nRow<pCur->nAlloc ){
2469 idx = pCur->nRow;
2470 }else if( iScore<iWorst ){
2471 idx = idxWorst;
2472 sqlite3_free(pCur->a[idx].zWord);
2473 }else{
2474 continue;
2477 pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
2478 if( pCur->a[idx].zWord==0 ){
2479 p->rc = SQLITE_NOMEM;
2480 break;
2482 pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
2483 pCur->a[idx].iRank = iRank;
2484 pCur->a[idx].iDistance = iDist;
2485 pCur->a[idx].iScore = iScore;
2486 pCur->a[idx].iMatchlen = iMatchlen;
2487 memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
2488 if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
2489 if( pCur->nRow==pCur->nAlloc ){
2490 iWorst = pCur->a[0].iScore;
2491 idxWorst = 0;
2492 for(i=1; i<pCur->nRow; i++){
2493 iScore = pCur->a[i].iScore;
2494 if( iWorst<iScore ){
2495 iWorst = iScore;
2496 idxWorst = i;
2501 rc = sqlite3_reset(pStmt);
2502 if( rc ) p->rc = rc;
2506 ** This version of the xFilter method work if the MATCH term is present
2507 ** and we are doing a scan.
2509 static int spellfix1FilterForMatch(
2510 spellfix1_cursor *pCur,
2511 int argc,
2512 sqlite3_value **argv
2514 int idxNum = pCur->idxNum;
2515 const unsigned char *zMatchThis; /* RHS of the MATCH operator */
2516 EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
2517 char *zPattern; /* Transliteration of zMatchThis */
2518 int nPattern; /* Length of zPattern */
2519 int iLimit = 20; /* Max number of rows of output */
2520 int iScope = 3; /* Use this many characters of zClass */
2521 int iLang = 0; /* Language code */
2522 char *zSql; /* SQL of shadow table query */
2523 sqlite3_stmt *pStmt = 0; /* Shadow table query */
2524 int rc; /* Result code */
2525 int idx = 1; /* Next available filter parameter */
2526 spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */
2527 MatchQuery x; /* For passing info to RunQuery() */
2529 /* Load the cost table if we have not already done so */
2530 if( p->zCostTable!=0 && p->pConfig3==0 ){
2531 p->pConfig3 = sqlite3_malloc64( sizeof(p->pConfig3[0]) );
2532 if( p->pConfig3==0 ) return SQLITE_NOMEM;
2533 memset(p->pConfig3, 0, sizeof(p->pConfig3[0]));
2534 rc = editDist3ConfigLoad(p->pConfig3, p->db, p->zCostTable);
2535 if( rc ) return rc;
2537 memset(&x, 0, sizeof(x));
2538 x.iScope = 3; /* Default scope if none specified by "WHERE scope=N" */
2539 x.iMaxDist = -1; /* Maximum allowed edit distance */
2541 if( idxNum&2 ){
2542 iLang = sqlite3_value_int(argv[idx++]);
2544 if( idxNum&4 ){
2545 iLimit = sqlite3_value_int(argv[idx++]);
2546 if( iLimit<1 ) iLimit = 1;
2548 if( idxNum&8 ){
2549 x.iScope = sqlite3_value_int(argv[idx++]);
2550 if( x.iScope<1 ) x.iScope = 1;
2551 if( x.iScope>SPELLFIX_MX_HASH-2 ) x.iScope = SPELLFIX_MX_HASH-2;
2553 if( idxNum&(16|32) ){
2554 x.iMaxDist = sqlite3_value_int(argv[idx++]);
2555 if( idxNum&16 ) x.iMaxDist--;
2556 if( x.iMaxDist<0 ) x.iMaxDist = 0;
2558 spellfix1ResetCursor(pCur);
2559 spellfix1ResizeCursor(pCur, iLimit);
2560 zMatchThis = sqlite3_value_text(argv[0]);
2561 if( zMatchThis==0 ) return SQLITE_OK;
2562 if( p->pConfig3 ){
2563 x.pLang = editDist3FindLang(p->pConfig3, iLang);
2564 pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
2565 if( pMatchStr3==0 ){
2566 x.rc = SQLITE_NOMEM;
2567 goto filter_exit;
2569 }else{
2570 x.pLang = 0;
2572 zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
2573 sqlite3_free(pCur->zPattern);
2574 pCur->zPattern = zPattern;
2575 if( zPattern==0 ){
2576 x.rc = SQLITE_NOMEM;
2577 goto filter_exit;
2579 nPattern = (int)strlen(zPattern);
2580 if( zPattern[nPattern-1]=='*' ) nPattern--;
2581 zSql = sqlite3_mprintf(
2582 "SELECT id, word, rank, coalesce(k1,word)"
2583 " FROM \"%w\".\"%w_vocab\""
2584 " WHERE langid=%d AND k2>=?1 AND k2<?2",
2585 p->zDbName, p->zTableName, iLang
2587 if( zSql==0 ){
2588 x.rc = SQLITE_NOMEM;
2589 pStmt = 0;
2590 goto filter_exit;
2592 rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
2593 sqlite3_free(zSql);
2594 pCur->iLang = iLang;
2595 x.pCur = pCur;
2596 x.pStmt = pStmt;
2597 x.zPattern = zPattern;
2598 x.nPattern = nPattern;
2599 x.pMatchStr3 = pMatchStr3;
2600 x.iLang = iLang;
2601 x.rc = rc;
2602 x.pConfig3 = p->pConfig3;
2603 if( x.rc==SQLITE_OK ){
2604 spellfix1RunQuery(&x, zPattern, nPattern);
2607 if( pCur->a ){
2608 qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
2609 pCur->iTop = iLimit;
2610 pCur->iScope = iScope;
2611 }else{
2612 x.rc = SQLITE_NOMEM;
2615 filter_exit:
2616 sqlite3_finalize(pStmt);
2617 editDist3FromStringDelete(pMatchStr3);
2618 return x.rc;
2622 ** This version of xFilter handles a full-table scan case
2624 static int spellfix1FilterForFullScan(
2625 spellfix1_cursor *pCur,
2626 int argc,
2627 sqlite3_value **argv
2629 int rc = SQLITE_OK;
2630 int idxNum = pCur->idxNum;
2631 char *zSql;
2632 spellfix1_vtab *pVTab = pCur->pVTab;
2633 spellfix1ResetCursor(pCur);
2634 assert( idxNum==0 || idxNum==64 );
2635 zSql = sqlite3_mprintf(
2636 "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"%s",
2637 pVTab->zDbName, pVTab->zTableName,
2638 ((idxNum & 64) ? " WHERE rowid=?" : "")
2640 if( zSql==0 ) return SQLITE_NOMEM;
2641 rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
2642 sqlite3_free(zSql);
2643 if( rc==SQLITE_OK && (idxNum & 64) ){
2644 assert( argc==1 );
2645 rc = sqlite3_bind_value(pCur->pFullScan, 1, argv[0]);
2647 pCur->nRow = pCur->iRow = 0;
2648 if( rc==SQLITE_OK ){
2649 rc = sqlite3_step(pCur->pFullScan);
2650 if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
2651 if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
2652 }else{
2653 pCur->iRow = 0;
2655 return rc;
2660 ** Called to "rewind" a cursor back to the beginning so that
2661 ** it starts its output over again. Always called at least once
2662 ** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
2664 static int spellfix1Filter(
2665 sqlite3_vtab_cursor *cur,
2666 int idxNum, const char *idxStr,
2667 int argc, sqlite3_value **argv
2669 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2670 int rc;
2671 pCur->idxNum = idxNum;
2672 if( idxNum & 1 ){
2673 rc = spellfix1FilterForMatch(pCur, argc, argv);
2674 }else{
2675 rc = spellfix1FilterForFullScan(pCur, argc, argv);
2677 return rc;
2682 ** Advance a cursor to its next row of output
2684 static int spellfix1Next(sqlite3_vtab_cursor *cur){
2685 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2686 int rc = SQLITE_OK;
2687 if( pCur->iRow < pCur->nRow ){
2688 if( pCur->pFullScan ){
2689 rc = sqlite3_step(pCur->pFullScan);
2690 if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
2691 if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
2692 }else{
2693 pCur->iRow++;
2696 return rc;
2700 ** Return TRUE if we are at the end-of-file
2702 static int spellfix1Eof(sqlite3_vtab_cursor *cur){
2703 spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
2704 return pCur->iRow>=pCur->nRow;
2708 ** Return columns from the current row.
2710 static int spellfix1Column(
2711 sqlite3_vtab_cursor *cur,
2712 sqlite3_context *ctx,
2713 int i
2715 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2716 if( pCur->pFullScan ){
2717 if( i<=SPELLFIX_COL_LANGID ){
2718 sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
2719 }else{
2720 sqlite3_result_null(ctx);
2722 return SQLITE_OK;
2724 switch( i ){
2725 case SPELLFIX_COL_WORD: {
2726 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
2727 break;
2729 case SPELLFIX_COL_RANK: {
2730 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
2731 break;
2733 case SPELLFIX_COL_DISTANCE: {
2734 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iDistance);
2735 break;
2737 case SPELLFIX_COL_LANGID: {
2738 sqlite3_result_int(ctx, pCur->iLang);
2739 break;
2741 case SPELLFIX_COL_SCORE: {
2742 sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
2743 break;
2745 case SPELLFIX_COL_MATCHLEN: {
2746 int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
2747 if( iMatchlen<0 ){
2748 int nPattern = (int)strlen(pCur->zPattern);
2749 char *zWord = pCur->a[pCur->iRow].zWord;
2750 int nWord = (int)strlen(zWord);
2752 if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
2753 char *zTranslit;
2754 int res;
2755 zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
2756 if( !zTranslit ) return SQLITE_NOMEM;
2757 res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
2758 sqlite3_free(zTranslit);
2759 if( res<0 ) return SQLITE_NOMEM;
2760 iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
2761 }else{
2762 iMatchlen = utf8Charlen(zWord, nWord);
2766 sqlite3_result_int(ctx, iMatchlen);
2767 break;
2769 case SPELLFIX_COL_PHONEHASH: {
2770 sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
2771 break;
2773 case SPELLFIX_COL_TOP: {
2774 sqlite3_result_int(ctx, pCur->iTop);
2775 break;
2777 case SPELLFIX_COL_SCOPE: {
2778 sqlite3_result_int(ctx, pCur->iScope);
2779 break;
2781 case SPELLFIX_COL_SRCHCNT: {
2782 sqlite3_result_int(ctx, pCur->nSearch);
2783 break;
2785 default: {
2786 sqlite3_result_null(ctx);
2787 break;
2790 return SQLITE_OK;
2794 ** The rowid.
2796 static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
2797 spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
2798 if( pCur->pFullScan ){
2799 *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
2800 }else{
2801 *pRowid = pCur->a[pCur->iRow].iRowid;
2803 return SQLITE_OK;
2807 ** This function is called by the xUpdate() method. It returns a string
2808 ** containing the conflict mode that xUpdate() should use for the current
2809 ** operation. One of: "ROLLBACK", "IGNORE", "ABORT" or "REPLACE".
2811 static const char *spellfix1GetConflict(sqlite3 *db){
2812 static const char *azConflict[] = {
2813 /* Note: Instead of "FAIL" - "ABORT". */
2814 "ROLLBACK", "IGNORE", "ABORT", "ABORT", "REPLACE"
2816 int eConflict = sqlite3_vtab_on_conflict(db);
2818 assert( eConflict==SQLITE_ROLLBACK || eConflict==SQLITE_IGNORE
2819 || eConflict==SQLITE_FAIL || eConflict==SQLITE_ABORT
2820 || eConflict==SQLITE_REPLACE
2822 assert( SQLITE_ROLLBACK==1 );
2823 assert( SQLITE_IGNORE==2 );
2824 assert( SQLITE_FAIL==3 );
2825 assert( SQLITE_ABORT==4 );
2826 assert( SQLITE_REPLACE==5 );
2828 return azConflict[eConflict-1];
2832 ** The xUpdate() method.
2834 static int spellfix1Update(
2835 sqlite3_vtab *pVTab,
2836 int argc,
2837 sqlite3_value **argv,
2838 sqlite_int64 *pRowid
2840 int rc = SQLITE_OK;
2841 sqlite3_int64 rowid, newRowid;
2842 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2843 sqlite3 *db = p->db;
2845 if( argc==1 ){
2846 /* A delete operation on the rowid given by argv[0] */
2847 rowid = *pRowid = sqlite3_value_int64(argv[0]);
2848 spellfix1DbExec(&rc, db, "DELETE FROM \"%w\".\"%w_vocab\" "
2849 " WHERE id=%lld",
2850 p->zDbName, p->zTableName, rowid);
2851 }else{
2852 const unsigned char *zWord = sqlite3_value_text(argv[SPELLFIX_COL_WORD+2]);
2853 int nWord = sqlite3_value_bytes(argv[SPELLFIX_COL_WORD+2]);
2854 int iLang = sqlite3_value_int(argv[SPELLFIX_COL_LANGID+2]);
2855 int iRank = sqlite3_value_int(argv[SPELLFIX_COL_RANK+2]);
2856 const unsigned char *zSoundslike =
2857 sqlite3_value_text(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2858 int nSoundslike = sqlite3_value_bytes(argv[SPELLFIX_COL_SOUNDSLIKE+2]);
2859 char *zK1, *zK2;
2860 int i;
2861 char c;
2862 const char *zConflict = spellfix1GetConflict(db);
2864 if( zWord==0 ){
2865 /* Inserts of the form: INSERT INTO table(command) VALUES('xyzzy');
2866 ** cause zWord to be NULL, so we look at the "command" column to see
2867 ** what special actions to take */
2868 const char *zCmd =
2869 (const char*)sqlite3_value_text(argv[SPELLFIX_COL_COMMAND+2]);
2870 if( zCmd==0 ){
2871 pVTab->zErrMsg = sqlite3_mprintf("NOT NULL constraint failed: %s.word",
2872 p->zTableName);
2873 return SQLITE_CONSTRAINT_NOTNULL;
2875 if( strcmp(zCmd,"reset")==0 ){
2876 /* Reset the edit cost table (if there is one). */
2877 editDist3ConfigDelete(p->pConfig3);
2878 p->pConfig3 = 0;
2879 return SQLITE_OK;
2881 if( strncmp(zCmd,"edit_cost_table=",16)==0 ){
2882 editDist3ConfigDelete(p->pConfig3);
2883 p->pConfig3 = 0;
2884 sqlite3_free(p->zCostTable);
2885 p->zCostTable = spellfix1Dequote(zCmd+16);
2886 if( p->zCostTable==0 ) return SQLITE_NOMEM;
2887 if( p->zCostTable[0]==0 || sqlite3_stricmp(p->zCostTable,"null")==0 ){
2888 sqlite3_free(p->zCostTable);
2889 p->zCostTable = 0;
2891 return SQLITE_OK;
2893 pVTab->zErrMsg = sqlite3_mprintf("unknown value for %s.command: \"%w\"",
2894 p->zTableName, zCmd);
2895 return SQLITE_ERROR;
2897 if( iRank<1 ) iRank = 1;
2898 if( zSoundslike ){
2899 zK1 = (char*)transliterate(zSoundslike, nSoundslike);
2900 }else{
2901 zK1 = (char*)transliterate(zWord, nWord);
2903 if( zK1==0 ) return SQLITE_NOMEM;
2904 for(i=0; (c = zK1[i])!=0; i++){
2905 if( c>='A' && c<='Z' ) zK1[i] += 'a' - 'A';
2907 zK2 = (char*)phoneticHash((const unsigned char*)zK1, i);
2908 if( zK2==0 ){
2909 sqlite3_free(zK1);
2910 return SQLITE_NOMEM;
2912 if( sqlite3_value_type(argv[0])==SQLITE_NULL ){
2913 if( sqlite3_value_type(argv[1])==SQLITE_NULL ){
2914 spellfix1DbExec(&rc, db,
2915 "INSERT INTO \"%w\".\"%w_vocab\"(rank,langid,word,k1,k2) "
2916 "VALUES(%d,%d,%Q,nullif(%Q,%Q),%Q)",
2917 p->zDbName, p->zTableName,
2918 iRank, iLang, zWord, zK1, zWord, zK2
2920 }else{
2921 newRowid = sqlite3_value_int64(argv[1]);
2922 spellfix1DbExec(&rc, db,
2923 "INSERT OR %s INTO \"%w\".\"%w_vocab\"(id,rank,langid,word,k1,k2) "
2924 "VALUES(%lld,%d,%d,%Q,nullif(%Q,%Q),%Q)",
2925 zConflict, p->zDbName, p->zTableName,
2926 newRowid, iRank, iLang, zWord, zK1, zWord, zK2
2929 *pRowid = sqlite3_last_insert_rowid(db);
2930 }else{
2931 rowid = sqlite3_value_int64(argv[0]);
2932 newRowid = *pRowid = sqlite3_value_int64(argv[1]);
2933 spellfix1DbExec(&rc, db,
2934 "UPDATE OR %s \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
2935 " word=%Q, k1=nullif(%Q,%Q), k2=%Q WHERE id=%lld",
2936 zConflict, p->zDbName, p->zTableName, newRowid, iRank, iLang,
2937 zWord, zK1, zWord, zK2, rowid
2940 sqlite3_free(zK1);
2941 sqlite3_free(zK2);
2943 return rc;
2947 ** Rename the spellfix1 table.
2949 static int spellfix1Rename(sqlite3_vtab *pVTab, const char *zNew){
2950 spellfix1_vtab *p = (spellfix1_vtab*)pVTab;
2951 sqlite3 *db = p->db;
2952 int rc = SQLITE_OK;
2953 char *zNewName = sqlite3_mprintf("%s", zNew);
2954 if( zNewName==0 ){
2955 return SQLITE_NOMEM;
2957 spellfix1DbExec(&rc, db,
2958 "ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
2959 p->zDbName, p->zTableName, zNewName
2961 if( rc==SQLITE_OK ){
2962 sqlite3_free(p->zTableName);
2963 p->zTableName = zNewName;
2964 }else{
2965 sqlite3_free(zNewName);
2967 return rc;
2972 ** A virtual table module that provides fuzzy search.
2974 static sqlite3_module spellfix1Module = {
2975 0, /* iVersion */
2976 spellfix1Create, /* xCreate - handle CREATE VIRTUAL TABLE */
2977 spellfix1Connect, /* xConnect - reconnected to an existing table */
2978 spellfix1BestIndex, /* xBestIndex - figure out how to do a query */
2979 spellfix1Disconnect, /* xDisconnect - close a connection */
2980 spellfix1Destroy, /* xDestroy - handle DROP TABLE */
2981 spellfix1Open, /* xOpen - open a cursor */
2982 spellfix1Close, /* xClose - close a cursor */
2983 spellfix1Filter, /* xFilter - configure scan constraints */
2984 spellfix1Next, /* xNext - advance a cursor */
2985 spellfix1Eof, /* xEof - check for end of scan */
2986 spellfix1Column, /* xColumn - read data */
2987 spellfix1Rowid, /* xRowid - read data */
2988 spellfix1Update, /* xUpdate */
2989 0, /* xBegin */
2990 0, /* xSync */
2991 0, /* xCommit */
2992 0, /* xRollback */
2993 0, /* xFindMethod */
2994 spellfix1Rename, /* xRename */
2998 ** Register the various functions and the virtual table.
3000 static int spellfix1Register(sqlite3 *db){
3001 int rc = SQLITE_OK;
3002 int i;
3003 rc = sqlite3_create_function(db, "spellfix1_translit", 1,
3004 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3005 transliterateSqlFunc, 0, 0);
3006 if( rc==SQLITE_OK ){
3007 rc = sqlite3_create_function(db, "spellfix1_editdist", 2,
3008 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3009 editdistSqlFunc, 0, 0);
3011 if( rc==SQLITE_OK ){
3012 rc = sqlite3_create_function(db, "spellfix1_phonehash", 1,
3013 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3014 phoneticHashSqlFunc, 0, 0);
3016 if( rc==SQLITE_OK ){
3017 rc = sqlite3_create_function(db, "spellfix1_scriptcode", 1,
3018 SQLITE_UTF8|SQLITE_DETERMINISTIC, 0,
3019 scriptCodeSqlFunc, 0, 0);
3021 if( rc==SQLITE_OK ){
3022 rc = sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
3024 if( rc==SQLITE_OK ){
3025 rc = editDist3Install(db);
3028 /* Verify sanity of the translit[] table */
3029 for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
3030 assert( translit[i].cFrom<translit[i+1].cFrom );
3033 return rc;
3036 #endif /* SQLITE_OMIT_VIRTUALTABLE */
3039 ** Extension load function.
3041 #ifdef _WIN32
3042 __declspec(dllexport)
3043 #endif
3044 int sqlite3_spellfix_init(
3045 sqlite3 *db,
3046 char **pzErrMsg,
3047 const sqlite3_api_routines *pApi
3049 SQLITE_EXTENSION_INIT2(pApi);
3050 #ifndef SQLITE_OMIT_VIRTUALTABLE
3051 return spellfix1Register(db);
3052 #endif
3053 return SQLITE_OK;