ext/misc/regexp.c

   1 /*
   2 ** 2012-11-13
   3 **
   4 ** The author disclaims copyright to this source code.  In place of
   5 ** a legal notice, here is a blessing:
   6 **
   7 **    May you do good and not evil.
   8 **    May you find forgiveness for yourself and forgive others.
   9 **    May you share freely, never taking more than you give.
  10 **
  11 ******************************************************************************
  12 **
  13 ** The code in this file implements a compact but reasonably
  14 ** efficient regular-expression matcher for posix extended regular
  15 ** expressions against UTF8 text.
  16 **
  17 ** This file is an SQLite extension.  It registers a single function
  18 ** named "regexp(A,B)" where A is the regular expression and B is the
  19 ** string to be matched.  By registering this function, SQLite will also
  20 ** then implement the "B regexp A" operator.  Note that with the function
  21 ** the regular expression comes first, but with the operator it comes
  22 ** second.
  23 **
  24 **  The following regular expression syntax is supported:
  25 **
  26 **     X*      zero or more occurrences of X
  27 **     X+      one or more occurrences of X
  28 **     X?      zero or one occurrences of X
  29 **     X{p,q}  between p and q occurrences of X
  30 **     (X)     match X
  31 **     X|Y     X or Y
  32 **     ^X      X occurring at the beginning of the string
  33 **     X$      X occurring at the end of the string
  34 **     .       Match any single character
  35 **     \c      Character c where c is one of \{}()[]|*+?.
  36 **     \c      C-language escapes for c in afnrtv.  ex: \t or \n
  37 **     \uXXXX  Where XXXX is exactly 4 hex digits, unicode value XXXX
  38 **     \xXX    Where XX is exactly 2 hex digits, unicode value XX
  39 **     [abc]   Any single character from the set abc
  40 **     [^abc]  Any single character not in the set abc
  41 **     [a-z]   Any single character in the range a-z
  42 **     [^a-z]  Any single character not in the range a-z
  43 **     \b      Word boundary
  44 **     \w      Word character.  [A-Za-z0-9_]
  45 **     \W      Non-word character
  46 **     \d      Digit
  47 **     \D      Non-digit
  48 **     \s      Whitespace character
  49 **     \S      Non-whitespace character
  50 **
  51 ** A nondeterministic finite automaton (NFA) is used for matching, so the
  52 ** performance is bounded by O(N*M) where N is the size of the regular
  53 ** expression and M is the size of the input string.  The matcher never
  54 ** exhibits exponential behavior.  Note that the X{p,q} operator expands
  55 ** to p copies of X following by q-p copies of X? and that the size of the
  56 ** regular expression in the O(N*M) performance bound is computed after
  57 ** this expansion.
  58 */
  59 #include <string.h>
  60 #include <stdlib.h>
  61 #include "sqlite3ext.h"
  62 SQLITE_EXTENSION_INIT1
  63
  64 /*
  65 ** The following #defines change the names of some functions implemented in
  66 ** this file to prevent name collisions with C-library functions of the
  67 ** same name.
  68 */
  69 #define re_match   sqlite3re_match
  70 #define re_compile sqlite3re_compile
  71 #define re_free    sqlite3re_free
  72
  73 /* The end-of-input character */
  74 #define RE_EOF            0    /* End of input */
  75 #define RE_START  0xfffffff    /* Start of input - larger than an UTF-8 */
  76
  77 /* The NFA is implemented as sequence of opcodes taken from the following
  78 ** set.  Each opcode has a single integer argument.
  79 */
  80 #define RE_OP_MATCH       1    /* Match the one character in the argument */
  81 #define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
  82 #define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
  83 #define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
  84 #define RE_OP_GOTO        5    /* Jump to opcode at iArg */
  85 #define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
  86 #define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
  87 #define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
  88 #define RE_OP_CC_VALUE    9    /* Single value in a character class */
  89 #define RE_OP_CC_RANGE   10    /* Range of values in a character class */
  90 #define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
  91 #define RE_OP_NOTWORD    12    /* Not a perl word character */
  92 #define RE_OP_DIGIT      13    /* digit:  [0-9] */
  93 #define RE_OP_NOTDIGIT   14    /* Not a digit */
  94 #define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
  95 #define RE_OP_NOTSPACE   16    /* Not a digit */
  96 #define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
  97 #define RE_OP_ATSTART    18    /* Currently at the start of the string */
  98
  99 #if defined(SQLITE_DEBUG)
 100 /* Opcode names used for symbolic debugging */
 101 static const char *ReOpName[] = {
 102   "EOF",
 103   "MATCH",
 104   "ANY",
 105   "ANYSTAR",
 106   "FORK",
 107   "GOTO",
 108   "ACCEPT",
 109   "CC_INC",
 110   "CC_EXC",
 111   "CC_VALUE",
 112   "CC_RANGE",
 113   "WORD",
 114   "NOTWORD",
 115   "DIGIT",
 116   "NOTDIGIT",
 117   "SPACE",
 118   "NOTSPACE",
 119   "BOUNDARY",
 120   "ATSTART",
 121 };
 122 #endif /* SQLITE_DEBUG */
 123
 124
 125 /* Each opcode is a "state" in the NFA */
 126 typedef unsigned short ReStateNumber;
 127
 128 /* Because this is an NFA and not a DFA, multiple states can be active at
 129 ** once.  An instance of the following object records all active states in
 130 ** the NFA.  The implementation is optimized for the common case where the
 131 ** number of actives states is small.
 132 */
 133 typedef struct ReStateSet {
 134   unsigned nState;            /* Number of current states */
 135   ReStateNumber *aState;      /* Current states */
 136 } ReStateSet;
 137
 138 /* An input string read one character at a time.
 139 */
 140 typedef struct ReInput ReInput;
 141 struct ReInput {
 142   const unsigned char *z;  /* All text */
 143   int i;                   /* Next byte to read */
 144   int mx;                  /* EOF when i>=mx */
 145 };
 146
 147 /* A compiled NFA (or an NFA that is in the process of being compiled) is
 148 ** an instance of the following object.
 149 */
 150 typedef struct ReCompiled ReCompiled;
 151 struct ReCompiled {
 152   ReInput sIn;                /* Regular expression text */
 153   const char *zErr;           /* Error message to return */
 154   char *aOp;                  /* Operators for the virtual machine */
 155   int *aArg;                  /* Arguments to each operator */
 156   unsigned (*xNextChar)(ReInput*);  /* Next character function */
 157   unsigned char zInit[12];    /* Initial text to match */
 158   int nInit;                  /* Number of bytes in zInit */
 159   unsigned nState;            /* Number of entries in aOp[] and aArg[] */
 160   unsigned nAlloc;            /* Slots allocated for aOp[] and aArg[] */
 161 };
 162
 163 /* Add a state to the given state set if it is not already there */
 164 static void re_add_state(ReStateSet *pSet, int newState){
 165   unsigned i;
 166   for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
 167   pSet->aState[pSet->nState++] = (ReStateNumber)newState;
 168 }
 169
 170 /* Extract the next unicode character from *pzIn and return it.  Advance
 171 ** *pzIn to the first byte past the end of the character returned.  To
 172 ** be clear:  this routine converts utf8 to unicode.  This routine is
 173 ** optimized for the common case where the next character is a single byte.
 174 */
 175 static unsigned re_next_char(ReInput *p){
 176   unsigned c;
 177   if( p->i>=p->mx ) return 0;
 178   c = p->z[p->i++];
 179   if( c>=0x80 ){
 180     if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
 181       c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
 182       if( c<0x80 ) c = 0xfffd;
 183     }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
 184            && (p->z[p->i+1]&0xc0)==0x80 ){
 185       c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
 186       p->i += 2;
 187       if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
 188     }else if( (c&0xf8)==0xf0 && p->i+2<p->mx && (p->z[p->i]&0xc0)==0x80
 189            && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
 190       c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
 191                        | (p->z[p->i+2]&0x3f);
 192       p->i += 3;
 193       if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
 194     }else{
 195       c = 0xfffd;
 196     }
 197   }
 198   return c;
 199 }
 200 static unsigned re_next_char_nocase(ReInput *p){
 201   unsigned c = re_next_char(p);
 202   if( c>='A' && c<='Z' ) c += 'a' - 'A';
 203   return c;
 204 }
 205
 206 /* Return true if c is a perl "word" character:  [A-Za-z0-9_] */
 207 static int re_word_char(int c){
 208   return (c>='0' && c<='9') || (c>='a' && c<='z')
 209       || (c>='A' && c<='Z') || c=='_';
 210 }
 211
 212 /* Return true if c is a "digit" character:  [0-9] */
 213 static int re_digit_char(int c){
 214   return (c>='0' && c<='9');
 215 }
 216
 217 /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
 218 static int re_space_char(int c){
 219   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
 220 }
 221
 222 /* Run a compiled regular expression on the zero-terminated input
 223 ** string zIn[].  Return true on a match and false if there is no match.
 224 */
 225 static int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
 226   ReStateSet aStateSet[2], *pThis, *pNext;
 227   ReStateNumber aSpace[100];
 228   ReStateNumber *pToFree;
 229   unsigned int i = 0;
 230   unsigned int iSwap = 0;
 231   int c = RE_START;
 232   int cPrev = 0;
 233   int rc = 0;
 234   ReInput in;
 235
 236   in.z = zIn;
 237   in.i = 0;
 238   in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
 239
 240   /* Look for the initial prefix match, if there is one. */
 241   if( pRe->nInit ){
 242     unsigned char x = pRe->zInit[0];
 243     while( in.i+pRe->nInit<=in.mx
 244      && (zIn[in.i]!=x ||
 245          strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
 246     ){
 247       in.i++;
 248     }
 249     if( in.i+pRe->nInit>in.mx ) return 0;
 250     c = RE_START-1;
 251   }
 252
 253   if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
 254     pToFree = 0;
 255     aStateSet[0].aState = aSpace;
 256   }else{
 257     pToFree = sqlite3_malloc64( sizeof(ReStateNumber)*2*pRe->nState );
 258     if( pToFree==0 ) return -1;
 259     aStateSet[0].aState = pToFree;
 260   }
 261   aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
 262   pNext = &aStateSet[1];
 263   pNext->nState = 0;
 264   re_add_state(pNext, 0);
 265   while( c!=RE_EOF && pNext->nState>0 ){
 266     cPrev = c;
 267     c = pRe->xNextChar(&in);
 268     pThis = pNext;
 269     pNext = &aStateSet[iSwap];
 270     iSwap = 1 - iSwap;
 271     pNext->nState = 0;
 272     for(i=0; i<pThis->nState; i++){
 273       int x = pThis->aState[i];
 274       switch( pRe->aOp[x] ){
 275         case RE_OP_MATCH: {
 276           if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
 277           break;
 278         }
 279         case RE_OP_ATSTART: {
 280           if( cPrev==RE_START ) re_add_state(pThis, x+1);
 281           break;
 282         }
 283         case RE_OP_ANY: {
 284           if( c!=0 ) re_add_state(pNext, x+1);
 285           break;
 286         }
 287         case RE_OP_WORD: {
 288           if( re_word_char(c) ) re_add_state(pNext, x+1);
 289           break;
 290         }
 291         case RE_OP_NOTWORD: {
 292           if( !re_word_char(c) && c!=0 ) re_add_state(pNext, x+1);
 293           break;
 294         }
 295         case RE_OP_DIGIT: {
 296           if( re_digit_char(c) ) re_add_state(pNext, x+1);
 297           break;
 298         }
 299         case RE_OP_NOTDIGIT: {
 300           if( !re_digit_char(c) && c!=0 ) re_add_state(pNext, x+1);
 301           break;
 302         }
 303         case RE_OP_SPACE: {
 304           if( re_space_char(c) ) re_add_state(pNext, x+1);
 305           break;
 306         }
 307         case RE_OP_NOTSPACE: {
 308           if( !re_space_char(c) && c!=0 ) re_add_state(pNext, x+1);
 309           break;
 310         }
 311         case RE_OP_BOUNDARY: {
 312           if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
 313           break;
 314         }
 315         case RE_OP_ANYSTAR: {
 316           re_add_state(pNext, x);
 317           re_add_state(pThis, x+1);
 318           break;
 319         }
 320         case RE_OP_FORK: {
 321           re_add_state(pThis, x+pRe->aArg[x]);
 322           re_add_state(pThis, x+1);
 323           break;
 324         }
 325         case RE_OP_GOTO: {
 326           re_add_state(pThis, x+pRe->aArg[x]);
 327           break;
 328         }
 329         case RE_OP_ACCEPT: {
 330           rc = 1;
 331           goto re_match_end;
 332         }
 333         case RE_OP_CC_EXC: {
 334           if( c==0 ) break;
 335           /* fall-through */ goto re_op_cc_inc;
 336         }
 337         case RE_OP_CC_INC: re_op_cc_inc: {
 338           int j = 1;
 339           int n = pRe->aArg[x];
 340           int hit = 0;
 341           for(j=1; j>0 && j<n; j++){
 342             if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
 343               if( pRe->aArg[x+j]==c ){
 344                 hit = 1;
 345                 j = -1;
 346               }
 347             }else{
 348               if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
 349                 hit = 1;
 350                 j = -1;
 351               }else{
 352                 j++;
 353               }
 354             }
 355           }
 356           if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
 357           if( hit ) re_add_state(pNext, x+n);
 358           break;
 359         }
 360       }
 361     }
 362   }
 363   for(i=0; i<pNext->nState; i++){
 364     int x = pNext->aState[i];
 365     while( pRe->aOp[x]==RE_OP_GOTO ) x += pRe->aArg[x];
 366     if( pRe->aOp[x]==RE_OP_ACCEPT ){ rc = 1; break; }
 367   }
 368 re_match_end:
 369   sqlite3_free(pToFree);
 370   return rc;
 371 }
 372
 373 /* Resize the opcode and argument arrays for an RE under construction.
 374 */
 375 static int re_resize(ReCompiled *p, int N){
 376   char *aOp;
 377   int *aArg;
 378   aOp = sqlite3_realloc64(p->aOp, N*sizeof(p->aOp[0]));
 379   if( aOp==0 ) return 1;
 380   p->aOp = aOp;
 381   aArg = sqlite3_realloc64(p->aArg, N*sizeof(p->aArg[0]));
 382   if( aArg==0 ) return 1;
 383   p->aArg = aArg;
 384   p->nAlloc = N;
 385   return 0;
 386 }
 387
 388 /* Insert a new opcode and argument into an RE under construction.  The
 389 ** insertion point is just prior to existing opcode iBefore.
 390 */
 391 static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
 392   int i;
 393   if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
 394   for(i=p->nState; i>iBefore; i--){
 395     p->aOp[i] = p->aOp[i-1];
 396     p->aArg[i] = p->aArg[i-1];
 397   }
 398   p->nState++;
 399   p->aOp[iBefore] = (char)op;
 400   p->aArg[iBefore] = arg;
 401   return iBefore;
 402 }
 403
 404 /* Append a new opcode and argument to the end of the RE under construction.
 405 */
 406 static int re_append(ReCompiled *p, int op, int arg){
 407   return re_insert(p, p->nState, op, arg);
 408 }
 409
 410 /* Make a copy of N opcodes starting at iStart onto the end of the RE
 411 ** under construction.
 412 */
 413 static void re_copy(ReCompiled *p, int iStart, int N){
 414   if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
 415   memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
 416   memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
 417   p->nState += N;
 418 }
 419
 420 /* Return true if c is a hexadecimal digit character:  [0-9a-fA-F]
 421 ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c).  If
 422 ** c is not a hex digit *pV is unchanged.
 423 */
 424 static int re_hex(int c, int *pV){
 425   if( c>='0' && c<='9' ){
 426     c -= '0';
 427   }else if( c>='a' && c<='f' ){
 428     c -= 'a' - 10;
 429   }else if( c>='A' && c<='F' ){
 430     c -= 'A' - 10;
 431   }else{
 432     return 0;
 433   }
 434   *pV = (*pV)*16 + (c & 0xff);
 435   return 1;
 436 }
 437
 438 /* A backslash character has been seen, read the next character and
 439 ** return its interpretation.
 440 */
 441 static unsigned re_esc_char(ReCompiled *p){
 442   static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
 443   static const char zTrans[] = "\a\f\n\r\t\v";
 444   int i, v = 0;
 445   char c;
 446   if( p->sIn.i>=p->sIn.mx ) return 0;
 447   c = p->sIn.z[p->sIn.i];
 448   if( c=='u' && p->sIn.i+4<p->sIn.mx ){
 449     const unsigned char *zIn = p->sIn.z + p->sIn.i;
 450     if( re_hex(zIn[1],&v)
 451      && re_hex(zIn[2],&v)
 452      && re_hex(zIn[3],&v)
 453      && re_hex(zIn[4],&v)
 454     ){
 455       p->sIn.i += 5;
 456       return v;
 457     }
 458   }
 459   if( c=='x' && p->sIn.i+2<p->sIn.mx ){
 460     const unsigned char *zIn = p->sIn.z + p->sIn.i;
 461     if( re_hex(zIn[1],&v)
 462      && re_hex(zIn[2],&v)
 463     ){
 464       p->sIn.i += 3;
 465       return v;
 466     }
 467   }
 468   for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
 469   if( zEsc[i] ){
 470     if( i<6 ) c = zTrans[i];
 471     p->sIn.i++;
 472   }else{
 473     p->zErr = "unknown \\ escape";
 474   }
 475   return c;
 476 }
 477
 478 /* Forward declaration */
 479 static const char *re_subcompile_string(ReCompiled*);
 480
 481 /* Peek at the next byte of input */
 482 static unsigned char rePeek(ReCompiled *p){
 483   return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
 484 }
 485
 486 /* Compile RE text into a sequence of opcodes.  Continue up to the
 487 ** first unmatched ")" character, then return.  If an error is found,
 488 ** return a pointer to the error message string.
 489 */
 490 static const char *re_subcompile_re(ReCompiled *p){
 491   const char *zErr;
 492   int iStart, iEnd, iGoto;
 493   iStart = p->nState;
 494   zErr = re_subcompile_string(p);
 495   if( zErr ) return zErr;
 496   while( rePeek(p)=='|' ){
 497     iEnd = p->nState;
 498     re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
 499     iGoto = re_append(p, RE_OP_GOTO, 0);
 500     p->sIn.i++;
 501     zErr = re_subcompile_string(p);
 502     if( zErr ) return zErr;
 503     p->aArg[iGoto] = p->nState - iGoto;
 504   }
 505   return 0;
 506 }
 507
 508 /* Compile an element of regular expression text (anything that can be
 509 ** an operand to the "|" operator).  Return NULL on success or a pointer
 510 ** to the error message if there is a problem.
 511 */
 512 static const char *re_subcompile_string(ReCompiled *p){
 513   int iPrev = -1;
 514   int iStart;
 515   unsigned c;
 516   const char *zErr;
 517   while( (c = p->xNextChar(&p->sIn))!=0 ){
 518     iStart = p->nState;
 519     switch( c ){
 520       case '|':
 521       case ')': {
 522         p->sIn.i--;
 523         return 0;
 524       }
 525       case '(': {
 526         zErr = re_subcompile_re(p);
 527         if( zErr ) return zErr;
 528         if( rePeek(p)!=')' ) return "unmatched '('";
 529         p->sIn.i++;
 530         break;
 531       }
 532       case '.': {
 533         if( rePeek(p)=='*' ){
 534           re_append(p, RE_OP_ANYSTAR, 0);
 535           p->sIn.i++;
 536         }else{
 537           re_append(p, RE_OP_ANY, 0);
 538         }
 539         break;
 540       }
 541       case '*': {
 542         if( iPrev<0 ) return "'*' without operand";
 543         re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
 544         re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
 545         break;
 546       }
 547       case '+': {
 548         if( iPrev<0 ) return "'+' without operand";
 549         re_append(p, RE_OP_FORK, iPrev - p->nState);
 550         break;
 551       }
 552       case '?': {
 553         if( iPrev<0 ) return "'?' without operand";
 554         re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
 555         break;
 556       }
 557       case '$': {
 558         re_append(p, RE_OP_MATCH, RE_EOF);
 559         break;
 560       }
 561       case '^': {
 562         re_append(p, RE_OP_ATSTART, 0);
 563         break;
 564       }
 565       case '{': {
 566         int m = 0, n = 0;
 567         int sz, j;
 568         if( iPrev<0 ) return "'{m,n}' without operand";
 569         while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
 570         n = m;
 571         if( c==',' ){
 572           p->sIn.i++;
 573           n = 0;
 574           while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
 575         }
 576         if( c!='}' ) return "unmatched '{'";
 577         if( n>0 && n<m ) return "n less than m in '{m,n}'";
 578         p->sIn.i++;
 579         sz = p->nState - iPrev;
 580         if( m==0 ){
 581           if( n==0 ) return "both m and n are zero in '{m,n}'";
 582           re_insert(p, iPrev, RE_OP_FORK, sz+1);
 583           iPrev++;
 584           n--;
 585         }else{
 586           for(j=1; j<m; j++) re_copy(p, iPrev, sz);
 587         }
 588         for(j=m; j<n; j++){
 589           re_append(p, RE_OP_FORK, sz+1);
 590           re_copy(p, iPrev, sz);
 591         }
 592         if( n==0 && m>0 ){
 593           re_append(p, RE_OP_FORK, -sz);
 594         }
 595         break;
 596       }
 597       case '[': {
 598         unsigned int iFirst = p->nState;
 599         if( rePeek(p)=='^' ){
 600           re_append(p, RE_OP_CC_EXC, 0);
 601           p->sIn.i++;
 602         }else{
 603           re_append(p, RE_OP_CC_INC, 0);
 604         }
 605         while( (c = p->xNextChar(&p->sIn))!=0 ){
 606           if( c=='[' && rePeek(p)==':' ){
 607             return "POSIX character classes not supported";
 608           }
 609           if( c=='\\' ) c = re_esc_char(p);
 610           if( rePeek(p)=='-' ){
 611             re_append(p, RE_OP_CC_RANGE, c);
 612             p->sIn.i++;
 613             c = p->xNextChar(&p->sIn);
 614             if( c=='\\' ) c = re_esc_char(p);
 615             re_append(p, RE_OP_CC_RANGE, c);
 616           }else{
 617             re_append(p, RE_OP_CC_VALUE, c);
 618           }
 619           if( rePeek(p)==']' ){ p->sIn.i++; break; }
 620         }
 621         if( c==0 ) return "unclosed '['";
 622         if( p->nState>iFirst ) p->aArg[iFirst] = p->nState - iFirst;
 623         break;
 624       }
 625       case '\\': {
 626         int specialOp = 0;
 627         switch( rePeek(p) ){
 628           case 'b': specialOp = RE_OP_BOUNDARY;   break;
 629           case 'd': specialOp = RE_OP_DIGIT;      break;
 630           case 'D': specialOp = RE_OP_NOTDIGIT;   break;
 631           case 's': specialOp = RE_OP_SPACE;      break;
 632           case 'S': specialOp = RE_OP_NOTSPACE;   break;
 633           case 'w': specialOp = RE_OP_WORD;       break;
 634           case 'W': specialOp = RE_OP_NOTWORD;    break;
 635         }
 636         if( specialOp ){
 637           p->sIn.i++;
 638           re_append(p, specialOp, 0);
 639         }else{
 640           c = re_esc_char(p);
 641           re_append(p, RE_OP_MATCH, c);
 642         }
 643         break;
 644       }
 645       default: {
 646         re_append(p, RE_OP_MATCH, c);
 647         break;
 648       }
 649     }
 650     iPrev = iStart;
 651   }
 652   return 0;
 653 }
 654
 655 /* Free and reclaim all the memory used by a previously compiled
 656 ** regular expression.  Applications should invoke this routine once
 657 ** for every call to re_compile() to avoid memory leaks.
 658 */
 659 static void re_free(ReCompiled *pRe){
 660   if( pRe ){
 661     sqlite3_free(pRe->aOp);
 662     sqlite3_free(pRe->aArg);
 663     sqlite3_free(pRe);
 664   }
 665 }
 666
 667 /*
 668 ** Compile a textual regular expression in zIn[] into a compiled regular
 669 ** expression suitable for us by re_match() and return a pointer to the
 670 ** compiled regular expression in *ppRe.  Return NULL on success or an
 671 ** error message if something goes wrong.
 672 */
 673 static const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
 674   ReCompiled *pRe;
 675   const char *zErr;
 676   int i, j;
 677
 678   *ppRe = 0;
 679   pRe = sqlite3_malloc( sizeof(*pRe) );
 680   if( pRe==0 ){
 681     return "out of memory";
 682   }
 683   memset(pRe, 0, sizeof(*pRe));
 684   pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
 685   if( re_resize(pRe, 30) ){
 686     re_free(pRe);
 687     return "out of memory";
 688   }
 689   if( zIn[0]=='^' ){
 690     zIn++;
 691   }else{
 692     re_append(pRe, RE_OP_ANYSTAR, 0);
 693   }
 694   pRe->sIn.z = (unsigned char*)zIn;
 695   pRe->sIn.i = 0;
 696   pRe->sIn.mx = (int)strlen(zIn);
 697   zErr = re_subcompile_re(pRe);
 698   if( zErr ){
 699     re_free(pRe);
 700     return zErr;
 701   }
 702   if( pRe->sIn.i>=pRe->sIn.mx ){
 703     re_append(pRe, RE_OP_ACCEPT, 0);
 704     *ppRe = pRe;
 705   }else{
 706     re_free(pRe);
 707     return "unrecognized character";
 708   }
 709
 710   /* The following is a performance optimization.  If the regex begins with
 711   ** ".*" (if the input regex lacks an initial "^") and afterwards there are
 712   ** one or more matching characters, enter those matching characters into
 713   ** zInit[].  The re_match() routine can then search ahead in the input
 714   ** string looking for the initial match without having to run the whole
 715   ** regex engine over the string.  Do not worry about trying to match
 716   ** unicode characters beyond plane 0 - those are very rare and this is
 717   ** just an optimization. */
 718   if( pRe->aOp[0]==RE_OP_ANYSTAR && !noCase ){
 719     for(j=0, i=1; j<(int)sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
 720       unsigned x = pRe->aArg[i];
 721       if( x<=0x7f ){
 722         pRe->zInit[j++] = (unsigned char)x;
 723       }else if( x<=0x7ff ){
 724         pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
 725         pRe->zInit[j++] = 0x80 | (x&0x3f);
 726       }else if( x<=0xffff ){
 727         pRe->zInit[j++] = (unsigned char)(0xe0 | (x>>12));
 728         pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
 729         pRe->zInit[j++] = 0x80 | (x&0x3f);
 730       }else{
 731         break;
 732       }
 733     }
 734     if( j>0 && pRe->zInit[j-1]==0 ) j--;
 735     pRe->nInit = j;
 736   }
 737   return pRe->zErr;
 738 }
 739
 740 /*
 741 ** Implementation of the regexp() SQL function.  This function implements
 742 ** the build-in REGEXP operator.  The first argument to the function is the
 743 ** pattern and the second argument is the string.  So, the SQL statements:
 744 **
 745 **       A REGEXP B
 746 **
 747 ** is implemented as regexp(B,A).
 748 */
 749 static void re_sql_func(
 750   sqlite3_context *context,
 751   int argc,
 752   sqlite3_value **argv
 753 ){
 754   ReCompiled *pRe;          /* Compiled regular expression */
 755   const char *zPattern;     /* The regular expression */
 756   const unsigned char *zStr;/* String being searched */
 757   const char *zErr;         /* Compile error message */
 758   int setAux = 0;           /* True to invoke sqlite3_set_auxdata() */
 759
 760   (void)argc;  /* Unused */
 761   pRe = sqlite3_get_auxdata(context, 0);
 762   if( pRe==0 ){
 763     zPattern = (const char*)sqlite3_value_text(argv[0]);
 764     if( zPattern==0 ) return;
 765     zErr = re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0);
 766     if( zErr ){
 767       re_free(pRe);
 768       sqlite3_result_error(context, zErr, -1);
 769       return;
 770     }
 771     if( pRe==0 ){
 772       sqlite3_result_error_nomem(context);
 773       return;
 774     }
 775     setAux = 1;
 776   }
 777   zStr = (const unsigned char*)sqlite3_value_text(argv[1]);
 778   if( zStr!=0 ){
 779     sqlite3_result_int(context, re_match(pRe, zStr, -1));
 780   }
 781   if( setAux ){
 782     sqlite3_set_auxdata(context, 0, pRe, (void(*)(void*))re_free);
 783   }
 784 }
 785
 786 #if defined(SQLITE_DEBUG)
 787 /*
 788 ** This function is used for testing and debugging only.  It is only available
 789 ** if the SQLITE_DEBUG compile-time option is used.
 790 **
 791 ** Compile a regular expression and then convert the compiled expression into
 792 ** text and return that text.
 793 */
 794 static void re_bytecode_func(
 795   sqlite3_context *context,
 796   int argc,
 797   sqlite3_value **argv
 798 ){
 799   const char *zPattern;
 800   const char *zErr;
 801   ReCompiled *pRe;
 802   sqlite3_str *pStr;
 803   int i;
 804   int n;
 805   char *z;
 806   (void)argc;
 807
 808   zPattern = (const char*)sqlite3_value_text(argv[0]);
 809   if( zPattern==0 ) return;
 810   zErr = re_compile(&pRe, zPattern, sqlite3_user_data(context)!=0);
 811   if( zErr ){
 812     re_free(pRe);
 813     sqlite3_result_error(context, zErr, -1);
 814     return;
 815   }
 816   if( pRe==0 ){
 817     sqlite3_result_error_nomem(context);
 818     return;
 819   }
 820   pStr = sqlite3_str_new(0);
 821   if( pStr==0 ) goto re_bytecode_func_err;
 822   if( pRe->nInit>0 ){
 823     sqlite3_str_appendf(pStr, "INIT     ");
 824     for(i=0; i<pRe->nInit; i++){
 825       sqlite3_str_appendf(pStr, "%02x", pRe->zInit[i]);
 826     }
 827     sqlite3_str_appendf(pStr, "\n");
 828   }
 829   for(i=0; (unsigned)i<pRe->nState; i++){
 830     sqlite3_str_appendf(pStr, "%-8s %4d\n",
 831          ReOpName[(unsigned char)pRe->aOp[i]], pRe->aArg[i]);
 832   }
 833   n = sqlite3_str_length(pStr);
 834   z = sqlite3_str_finish(pStr);
 835   if( n==0 ){
 836     sqlite3_free(z);
 837   }else{
 838     sqlite3_result_text(context, z, n-1, sqlite3_free);
 839   }
 840
 841 re_bytecode_func_err:
 842   re_free(pRe);
 843 }
 844
 845 #endif /* SQLITE_DEBUG */
 846
 847
 848 /*
 849 ** Invoke this routine to register the regexp() function with the
 850 ** SQLite database connection.
 851 */
 852 #ifdef _WIN32
 853 __declspec(dllexport)
 854 #endif
 855 int sqlite3_regexp_init(
 856   sqlite3 *db,
 857   char **pzErrMsg,
 858   const sqlite3_api_routines *pApi
 859 ){
 860   int rc = SQLITE_OK;
 861   SQLITE_EXTENSION_INIT2(pApi);
 862   (void)pzErrMsg;  /* Unused */
 863   rc = sqlite3_create_function(db, "regexp", 2,
 864                             SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
 865                             0, re_sql_func, 0, 0);
 866   if( rc==SQLITE_OK ){
 867     /* The regexpi(PATTERN,STRING) function is a case-insensitive version
 868     ** of regexp(PATTERN,STRING). */
 869     rc = sqlite3_create_function(db, "regexpi", 2,
 870                             SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
 871                             (void*)db, re_sql_func, 0, 0);
 872 #if defined(SQLITE_DEBUG)
 873     if( rc==SQLITE_OK ){
 874       rc = sqlite3_create_function(db, "regexp_bytecode", 1,
 875                             SQLITE_UTF8|SQLITE_INNOCUOUS|SQLITE_DETERMINISTIC,
 876                             0, re_bytecode_func, 0, 0);
 877     }
 878 #endif /* SQLITE_DEBUG */
 879   }
 880   return rc;
 881 }