4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
12 ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
14 ** This file implements an integration between the ICU library
15 ** ("International Components for Unicode", an open-source library
16 ** for handling unicode data) and SQLite. The integration uses
17 ** ICU to provide the following to SQLite:
19 ** * An implementation of the SQL regexp() function (and hence REGEXP
20 ** operator) using the ICU uregex_XX() APIs.
22 ** * Implementations of the SQL scalar upper() and lower() functions for case mapping.
25 ** * Integration of ICU and SQLite collation sequences.
27 ** * An implementation of the LIKE operator that uses ICU to
28 ** provide case-independent matching.
31 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
33 /* Include ICU headers */
34 #include <unicode/utypes.h>
35 #include <unicode/uregex.h>
36 #include <unicode/ustring.h>
37 #include <unicode/ucol.h>
42 #include "sqlite3ext.h"
43 SQLITE_EXTENSION_INIT1
49 ** Maximum length (in bytes) of the pattern in a LIKE or GLOB operator.
52 #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
53 # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
57 ** Version of sqlite3_free() that is always a function, never a macro.
59 static void xFree(void *p
){
64 ** This lookup table is used to help decode the first byte of
65 ** a multi-byte UTF-8 character. It is copied here from the SQLite source code.
68 static const unsigned char icuUtf8Trans1
[] = {
69 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
70 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
71 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
72 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
73 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
74 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
75 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
76 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
79 #define SQLITE_ICU_READ_UTF8(zIn, c) \
82 c = icuUtf8Trans1[c-0xc0]; \
83 while( (*zIn & 0xc0)==0x80 ){ \
84 c = (c<<6) + (0x3f & *(zIn++)); \
88 #define SQLITE_ICU_SKIP_UTF8(zIn) \
90 if( *(zIn++)>=0xc0 ){ \
91 while( (*zIn & 0xc0)==0x80 ){zIn++;} \
96 ** Compare two UTF-8 strings for equality where the first string is
97 ** a "LIKE" expression. Return true (1) if they are the same and
98 ** false (0) if they are different.
100 static int icuLikeCompare(
101 const uint8_t *zPattern
, /* LIKE pattern */
102 const uint8_t *zString
, /* The UTF-8 string to compare against */
103 const UChar32 uEsc
/* The escape character */
105 static const uint32_t MATCH_ONE
= (uint32_t)'_';
106 static const uint32_t MATCH_ALL
= (uint32_t)'%';
108 int prevEscape
= 0; /* True if the previous character was uEsc */
112 /* Read (and consume) the next character from the input pattern. */
114 SQLITE_ICU_READ_UTF8(zPattern
, uPattern
);
115 if( uPattern
==0 ) break;
117 /* There are now 4 possibilities:
119 ** 1. uPattern is an unescaped match-all character "%",
120 ** 2. uPattern is an unescaped match-one character "_",
121 ** 3. uPattern is an unescaped escape character, or
122 ** 4. uPattern is to be handled as an ordinary character
124 if( !prevEscape
&& uPattern
==MATCH_ALL
){
128 /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
129 ** MATCH_ALL. For each MATCH_ONE, skip one character in the
132 while( (c
=*zPattern
) == MATCH_ALL
|| c
== MATCH_ONE
){
134 if( *zString
==0 ) return 0;
135 SQLITE_ICU_SKIP_UTF8(zString
);
140 if( *zPattern
==0 ) return 1;
143 if( icuLikeCompare(zPattern
, zString
, uEsc
) ){
146 SQLITE_ICU_SKIP_UTF8(zString
);
150 }else if( !prevEscape
&& uPattern
==MATCH_ONE
){
152 if( *zString
==0 ) return 0;
153 SQLITE_ICU_SKIP_UTF8(zString
);
155 }else if( !prevEscape
&& uPattern
==(uint32_t)uEsc
){
162 SQLITE_ICU_READ_UTF8(zString
, uString
);
163 uString
= (uint32_t)u_foldCase((UChar32
)uString
, U_FOLD_CASE_DEFAULT
);
164 uPattern
= (uint32_t)u_foldCase((UChar32
)uPattern
, U_FOLD_CASE_DEFAULT
);
165 if( uString
!=uPattern
){
176 ** Implementation of the like() SQL function. This function implements
177 ** the built-in LIKE operator. The first argument to the function is the
178 ** pattern and the second argument is the string. So, the SQL statement "A LIKE B"
182 ** is implemented as like(B, A). If there is an escape character E, the statement
186 ** "A LIKE B ESCAPE E" is mapped to like(B, A, E).
188 static void icuLikeFunc(
189 sqlite3_context
*context
,
193 const unsigned char *zA
= sqlite3_value_text(argv
[0]);
194 const unsigned char *zB
= sqlite3_value_text(argv
[1]);
197 /* Limit the length of the LIKE or GLOB pattern to avoid problems
198 ** of deep recursion and N*N behavior in patternCompare().
200 if( sqlite3_value_bytes(argv
[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH
){
201 sqlite3_result_error(context
, "LIKE or GLOB pattern too complex", -1);
207 /* The escape character string must consist of a single UTF-8 character.
208 ** Otherwise, return an error.
210 int nE
= sqlite3_value_bytes(argv
[2]);
211 const unsigned char *zE
= sqlite3_value_text(argv
[2]);
214 U8_NEXT(zE
, i
, nE
, uEsc
);
216 sqlite3_result_error(context
,
217 "ESCAPE expression must be a single character", -1);
223 sqlite3_result_int(context
, icuLikeCompare(zA
, zB
, uEsc
));
228 ** This function is called when an ICU function called from within
229 ** the implementation of an SQL scalar function returns an error.
231 ** The scalar function context passed as the first argument is
232 ** loaded with an error message based on the following two args.
234 static void icuFunctionError(
235 sqlite3_context
*pCtx
, /* SQLite scalar function context */
236 const char *zName
, /* Name of ICU function that failed */
237 UErrorCode e
/* Error code returned by ICU function */
240 sqlite3_snprintf(128, zBuf
, "ICU error: %s(): %s", zName
, u_errorName(e
));
242 sqlite3_result_error(pCtx
, zBuf
, -1);
246 ** Function to delete compiled regexp objects. Registered as
247 ** a destructor function with sqlite3_set_auxdata().
249 static void icuRegexpDelete(void *p
){
250 URegularExpression
*pExpr
= (URegularExpression
*)p
;
255 ** Implementation of SQLite REGEXP operator. This scalar function takes
256 ** two arguments. The first is a regular expression pattern to compile;
257 ** the second is a string to match against that pattern. If either
258 ** argument is an SQL NULL, then NULL is returned. Otherwise, the result
259 ** is 1 if the string matches the pattern, or 0 otherwise.
261 ** SQLite maps the REGEXP operator to the regexp() function such
262 ** that the following two are equivalent:
264 ** zString REGEXP zPattern
265 ** regexp(zPattern, zString)
267 ** Uses the following ICU regexp APIs:
273 static void icuRegexpFunc(sqlite3_context
*p
, int nArg
, sqlite3_value
**apArg
){
274 UErrorCode status
= U_ZERO_ERROR
;
275 URegularExpression
*pExpr
;
277 const UChar
*zString
= sqlite3_value_text16(apArg
[1]);
279 (void)nArg
; /* Unused parameter */
281 /* If the left hand side of the regexp operator is NULL,
282 ** then the result is also NULL.
288 pExpr
= sqlite3_get_auxdata(p
, 0);
290 const UChar
*zPattern
= sqlite3_value_text16(apArg
[0]);
294 pExpr
= uregex_open(zPattern
, -1, 0, 0, &status
);
296 if( U_SUCCESS(status
) ){
297 sqlite3_set_auxdata(p
, 0, pExpr
, icuRegexpDelete
);
300 icuFunctionError(p
, "uregex_open", status
);
305 /* Configure the text that the regular expression operates on. */
306 uregex_setText(pExpr
, zString
, -1, &status
);
307 if( !U_SUCCESS(status
) ){
308 icuFunctionError(p
, "uregex_setText", status
);
312 /* Attempt the match */
313 res
= uregex_matches(pExpr
, 0, &status
);
314 if( !U_SUCCESS(status
) ){
315 icuFunctionError(p
, "uregex_matches", status
);
319 /* Set the text that the regular expression operates on to a NULL
320 ** pointer. This is not really necessary, but it is tidier than
321 ** leaving the regular expression object configured with an invalid
322 ** pointer after this function returns.
324 uregex_setText(pExpr
, 0, 0, &status
);
327 sqlite3_result_int(p
, res
? 1 : 0);
331 ** Implementations of scalar functions for case mapping - upper() and
332 ** lower(). Function upper() converts its input to upper-case (ABC).
333 ** Function lower() converts to lower-case (abc).
335 ** ICU provides two types of case mapping, "general" case mapping and
336 ** "language specific". Refer to ICU documentation for the differences between the two.
339 ** To utilise "general" case mapping, the upper() or lower() scalar
340 ** functions are invoked with one argument:
342 ** upper('abc') -> 'ABC'
343 ** lower('ABC') -> 'abc'
345 ** To access ICU "language specific" case mapping, upper() or lower()
346 ** should be invoked with two arguments. The second argument is the name
347 ** of the locale to use. Passing an empty string ("") or SQL NULL value
348 ** as the second argument is the same as invoking the 1 argument version
349 ** of upper() or lower().
351 ** lower('I', 'en_us') -> 'i'
352 ** lower('I', 'tr_tr') -> '\u131' (small dotless i)
354 ** http://www.icu-project.org/userguide/posix.html#case_mappings
356 static void icuCaseFunc16(sqlite3_context
*p
, int nArg
, sqlite3_value
**apArg
){
357 const UChar
*zInput
; /* Pointer to input string */
358 UChar
*zOutput
= 0; /* Pointer to output buffer */
359 int nInput
; /* Size of utf-16 input string in bytes */
360 int nOut
; /* Size of output buffer in bytes */
362 int bToUpper
; /* True for toupper(), false for tolower() */
364 const char *zLocale
= 0;
366 assert(nArg
==1 || nArg
==2);
367 bToUpper
= (sqlite3_user_data(p
)!=0);
369 zLocale
= (const char *)sqlite3_value_text(apArg
[1]);
372 zInput
= sqlite3_value_text16(apArg
[0]);
376 nOut
= nInput
= sqlite3_value_bytes16(apArg
[0]);
378 sqlite3_result_text16(p
, "", 0, SQLITE_STATIC
);
382 for(cnt
=0; cnt
<2; cnt
++){
383 UChar
*zNew
= sqlite3_realloc(zOutput
, nOut
);
385 sqlite3_free(zOutput
);
386 sqlite3_result_error_nomem(p
);
390 status
= U_ZERO_ERROR
;
392 nOut
= 2*u_strToUpper(zOutput
,nOut
/2,zInput
,nInput
/2,zLocale
,&status
);
394 nOut
= 2*u_strToLower(zOutput
,nOut
/2,zInput
,nInput
/2,zLocale
,&status
);
397 if( U_SUCCESS(status
) ){
398 sqlite3_result_text16(p
, zOutput
, nOut
, xFree
);
399 }else if( status
==U_BUFFER_OVERFLOW_ERROR
){
403 icuFunctionError(p
, bToUpper
? "u_strToUpper" : "u_strToLower", status
);
407 assert( 0 ); /* Unreachable */
411 ** Collation sequence destructor function. The pCtx argument points to
412 ** a UCollator structure previously allocated using ucol_open().
414 static void icuCollationDel(void *pCtx
){
415 UCollator
*p
= (UCollator
*)pCtx
;
420 ** Collation sequence comparison function. The pCtx argument points to
421 ** a UCollator structure previously allocated using ucol_open().
423 static int icuCollationColl(
430 UCollationResult res
;
431 UCollator
*p
= (UCollator
*)pCtx
;
432 res
= ucol_strcoll(p
, (UChar
*)zLeft
, nLeft
/2, (UChar
*)zRight
, nRight
/2);
434 case UCOL_LESS
: return -1;
435 case UCOL_GREATER
: return +1;
436 case UCOL_EQUAL
: return 0;
438 assert(!"Unexpected return value from ucol_strcoll()");
443 ** Implementation of the scalar function icu_load_collation().
445 ** This scalar function is used to add ICU collation based collation
446 ** types to an SQLite database connection. It is intended to be called as follows:
449 ** SELECT icu_load_collation(<locale>, <collation-name>);
451 ** Where <locale> is a string containing an ICU locale identifier (e.g.
452 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
453 ** collation sequence to create.
455 static void icuLoadCollation(
458 sqlite3_value
**apArg
460 sqlite3
*db
= (sqlite3
*)sqlite3_user_data(p
);
461 UErrorCode status
= U_ZERO_ERROR
;
462 const char *zLocale
; /* Locale identifier - (eg. "jp_JP") */
463 const char *zName
; /* SQL Collation sequence name (eg. "japanese") */
464 UCollator
*pUCollator
; /* ICU library collation object */
465 int rc
; /* Return code from sqlite3_create_collation_x() */
468 (void)nArg
; /* Unused parameter */
469 zLocale
= (const char *)sqlite3_value_text(apArg
[0]);
470 zName
= (const char *)sqlite3_value_text(apArg
[1]);
472 if( !zLocale
|| !zName
){
476 pUCollator
= ucol_open(zLocale
, &status
);
477 if( !U_SUCCESS(status
) ){
478 icuFunctionError(p
, "ucol_open", status
);
483 rc
= sqlite3_create_collation_v2(db
, zName
, SQLITE_UTF16
, (void *)pUCollator
,
484 icuCollationColl
, icuCollationDel
487 ucol_close(pUCollator
);
488 sqlite3_result_error(p
, "Error registering collation function", -1);
493 ** Register the ICU extension functions with database db.
495 int sqlite3IcuInit(sqlite3
*db
){
496 static const struct IcuScalar
{
497 const char *zName
; /* Function name */
498 unsigned char nArg
; /* Number of arguments */
499 unsigned short enc
; /* Optimal text encoding */
500 unsigned char iContext
; /* sqlite3_user_data() context */
501 void (*xFunc
)(sqlite3_context
*,int,sqlite3_value
**);
503 {"icu_load_collation", 2, SQLITE_UTF8
, 1, icuLoadCollation
},
504 {"regexp", 2, SQLITE_ANY
|SQLITE_DETERMINISTIC
, 0, icuRegexpFunc
},
505 {"lower", 1, SQLITE_UTF16
|SQLITE_DETERMINISTIC
, 0, icuCaseFunc16
},
506 {"lower", 2, SQLITE_UTF16
|SQLITE_DETERMINISTIC
, 0, icuCaseFunc16
},
507 {"upper", 1, SQLITE_UTF16
|SQLITE_DETERMINISTIC
, 1, icuCaseFunc16
},
508 {"upper", 2, SQLITE_UTF16
|SQLITE_DETERMINISTIC
, 1, icuCaseFunc16
},
509 {"lower", 1, SQLITE_UTF8
|SQLITE_DETERMINISTIC
, 0, icuCaseFunc16
},
510 {"lower", 2, SQLITE_UTF8
|SQLITE_DETERMINISTIC
, 0, icuCaseFunc16
},
511 {"upper", 1, SQLITE_UTF8
|SQLITE_DETERMINISTIC
, 1, icuCaseFunc16
},
512 {"upper", 2, SQLITE_UTF8
|SQLITE_DETERMINISTIC
, 1, icuCaseFunc16
},
513 {"like", 2, SQLITE_UTF8
|SQLITE_DETERMINISTIC
, 0, icuLikeFunc
},
514 {"like", 3, SQLITE_UTF8
|SQLITE_DETERMINISTIC
, 0, icuLikeFunc
},
520 for(i
=0; rc
==SQLITE_OK
&& i
<(int)(sizeof(scalars
)/sizeof(scalars
[0])); i
++){
521 const struct IcuScalar
*p
= &scalars
[i
];
522 rc
= sqlite3_create_function(
523 db
, p
->zName
, p
->nArg
, p
->enc
,
524 p
->iContext
? (void*)db
: (void*)0,
534 __declspec(dllexport
)
536 int sqlite3_icu_init(
539 const sqlite3_api_routines
*pApi
541 SQLITE_EXTENSION_INIT2(pApi
)
542 return sqlite3IcuInit(db
);