Updates to the Makefiles for MSVC. Cherrypick of [ac8786f3f9f35cb6].
[sqlite.git] / ext / icu / icu.c
blob33c4aa76f37524efc965624c4ad86f3d96752eb5
/*
** 2007 May 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
**
** This file implements an integration between the ICU library
** ("International Components for Unicode", an open-source library
** for handling unicode data) and SQLite. The integration uses
** ICU to provide the following to SQLite:
**
**   * An implementation of the SQL regexp() function (and hence REGEXP
**     operator) using the ICU uregex_XX() APIs.
**
**   * Implementations of the SQL scalar upper() and lower() functions
**     for case mapping.
**
**   * Integration of ICU and SQLite collation sequences.
**
**   * An implementation of the LIKE operator that uses ICU to
**     provide case-independent matching.
*/
31 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
33 /* Include ICU headers */
34 #include <unicode/utypes.h>
35 #include <unicode/uregex.h>
36 #include <unicode/ustring.h>
37 #include <unicode/ucol.h>
39 #include <assert.h>
41 #ifndef SQLITE_CORE
42 #include "sqlite3ext.h"
43 SQLITE_EXTENSION_INIT1
44 #else
45 #include "sqlite3.h"
46 #endif
/*
** Upper bound, in bytes, on the length of a pattern accepted by the
** LIKE or GLOB operator. A value supplied by the build (for example on
** the compiler command line) takes precedence over this default.
*/
#ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
# define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
#endif
/*
** Thin wrapper around sqlite3_free() that is guaranteed to be a real
** function (never a macro), so that its address can be handed to SQLite
** as a destructor callback.
*/
static void xFree(void *p){
  sqlite3_free(p);
}
/*
** Lookup table used when decoding the lead byte of a multi-byte UTF-8
** character. The table is indexed by (lead byte - 0xc0) and yields the
** payload bits carried by that lead byte. It is a copy of the table in
** the SQLite source file utf8.c.
*/
static const unsigned char icuUtf8Trans1[] = {
  /* 0xc0..0xc7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xc8..0xcf */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xd0..0xd7 */  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  /* 0xd8..0xdf */  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  /* 0xe0..0xe7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xe8..0xef */  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  /* 0xf0..0xf7 */  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  /* 0xf8..0xff */  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};
/*
** Decode one UTF-8 character starting at zIn, store its code point in c
** and advance zIn past the bytes consumed.
**
** zIn must be a modifiable pointer lvalue to unsigned bytes and c an
** unsigned integer lvalue; both are evaluated more than once. Decoding
** of a multi-byte character stops at the first byte that is not a
** continuation byte (0x80..0xbf), so a NUL terminator is never skipped.
**
** The body is wrapped in do{}while(0) so that an invocation followed by
** a semicolon behaves as exactly one statement, even as the unbraced body
** of an if/else.
*/
#define SQLITE_ICU_READ_UTF8(zIn, c)                       \
  do{                                                      \
    (c) = *((zIn)++);                                      \
    if( (c)>=0xc0 ){                                       \
      (c) = icuUtf8Trans1[(c)-0xc0];                       \
      while( (*(zIn) & 0xc0)==0x80 ){                      \
        (c) = ((c)<<6) + (0x3f & *((zIn)++));              \
      }                                                    \
    }                                                      \
  }while(0)
/*
** Advance zIn past one UTF-8 character without decoding it.
**
** zIn must be a modifiable pointer lvalue to unsigned bytes and must not
** point at the NUL terminator (checked by assert()). After a lead byte
** of 0xc0 or more, all following continuation bytes (0x80..0xbf) are
** skipped as well.
**
** The body is wrapped in do{}while(0) so that an invocation followed by
** a semicolon behaves as exactly one statement.
*/
#define SQLITE_ICU_SKIP_UTF8(zIn)                          \
  do{                                                      \
    assert( *(zIn) );                                      \
    if( *((zIn)++)>=0xc0 ){                                \
      while( (*(zIn) & 0xc0)==0x80 ){ (zIn)++; }           \
    }                                                      \
  }while(0)
96 ** Compare two UTF-8 strings for equality where the first string is
97 ** a "LIKE" expression. Return true (1) if they are the same and
98 ** false (0) if they are different.
100 static int icuLikeCompare(
101 const uint8_t *zPattern, /* LIKE pattern */
102 const uint8_t *zString, /* The UTF-8 string to compare against */
103 const UChar32 uEsc /* The escape character */
105 static const uint32_t MATCH_ONE = (uint32_t)'_';
106 static const uint32_t MATCH_ALL = (uint32_t)'%';
108 int prevEscape = 0; /* True if the previous character was uEsc */
110 while( 1 ){
112 /* Read (and consume) the next character from the input pattern. */
113 uint32_t uPattern;
114 SQLITE_ICU_READ_UTF8(zPattern, uPattern);
115 if( uPattern==0 ) break;
117 /* There are now 4 possibilities:
119 ** 1. uPattern is an unescaped match-all character "%",
120 ** 2. uPattern is an unescaped match-one character "_",
121 ** 3. uPattern is an unescaped escape character, or
122 ** 4. uPattern is to be handled as an ordinary character
124 if( !prevEscape && uPattern==MATCH_ALL ){
125 /* Case 1. */
126 uint8_t c;
128 /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
129 ** MATCH_ALL. For each MATCH_ONE, skip one character in the
130 ** test string.
132 while( (c=*zPattern) == MATCH_ALL || c == MATCH_ONE ){
133 if( c==MATCH_ONE ){
134 if( *zString==0 ) return 0;
135 SQLITE_ICU_SKIP_UTF8(zString);
137 zPattern++;
140 if( *zPattern==0 ) return 1;
142 while( *zString ){
143 if( icuLikeCompare(zPattern, zString, uEsc) ){
144 return 1;
146 SQLITE_ICU_SKIP_UTF8(zString);
148 return 0;
150 }else if( !prevEscape && uPattern==MATCH_ONE ){
151 /* Case 2. */
152 if( *zString==0 ) return 0;
153 SQLITE_ICU_SKIP_UTF8(zString);
155 }else if( !prevEscape && uPattern==(uint32_t)uEsc){
156 /* Case 3. */
157 prevEscape = 1;
159 }else{
160 /* Case 4. */
161 uint32_t uString;
162 SQLITE_ICU_READ_UTF8(zString, uString);
163 uString = (uint32_t)u_foldCase((UChar32)uString, U_FOLD_CASE_DEFAULT);
164 uPattern = (uint32_t)u_foldCase((UChar32)uPattern, U_FOLD_CASE_DEFAULT);
165 if( uString!=uPattern ){
166 return 0;
168 prevEscape = 0;
172 return *zString==0;
176 ** Implementation of the like() SQL function. This function implements
177 ** the build-in LIKE operator. The first argument to the function is the
178 ** pattern and the second argument is the string. So, the SQL statements:
180 ** A LIKE B
182 ** is implemented as like(B, A). If there is an escape character E,
184 ** A LIKE B ESCAPE E
186 ** is mapped to like(B, A, E).
188 static void icuLikeFunc(
189 sqlite3_context *context,
190 int argc,
191 sqlite3_value **argv
193 const unsigned char *zA = sqlite3_value_text(argv[0]);
194 const unsigned char *zB = sqlite3_value_text(argv[1]);
195 UChar32 uEsc = 0;
197 /* Limit the length of the LIKE or GLOB pattern to avoid problems
198 ** of deep recursion and N*N behavior in patternCompare().
200 if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
201 sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
202 return;
206 if( argc==3 ){
207 /* The escape character string must consist of a single UTF-8 character.
208 ** Otherwise, return an error.
210 int nE= sqlite3_value_bytes(argv[2]);
211 const unsigned char *zE = sqlite3_value_text(argv[2]);
212 int i = 0;
213 if( zE==0 ) return;
214 U8_NEXT(zE, i, nE, uEsc);
215 if( i!=nE){
216 sqlite3_result_error(context,
217 "ESCAPE expression must be a single character", -1);
218 return;
222 if( zA && zB ){
223 sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
228 ** This function is called when an ICU function called from within
229 ** the implementation of an SQL scalar function returns an error.
231 ** The scalar function context passed as the first argument is
232 ** loaded with an error message based on the following two args.
234 static void icuFunctionError(
235 sqlite3_context *pCtx, /* SQLite scalar function context */
236 const char *zName, /* Name of ICU function that failed */
237 UErrorCode e /* Error code returned by ICU function */
239 char zBuf[128];
240 sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
241 zBuf[127] = '\0';
242 sqlite3_result_error(pCtx, zBuf, -1);
246 ** Function to delete compiled regexp objects. Registered as
247 ** a destructor function with sqlite3_set_auxdata().
249 static void icuRegexpDelete(void *p){
250 URegularExpression *pExpr = (URegularExpression *)p;
251 uregex_close(pExpr);
255 ** Implementation of SQLite REGEXP operator. This scalar function takes
256 ** two arguments. The first is a regular expression pattern to compile
257 ** the second is a string to match against that pattern. If either
258 ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
259 ** is 1 if the string matches the pattern, or 0 otherwise.
261 ** SQLite maps the regexp() function to the regexp() operator such
262 ** that the following two are equivalent:
264 ** zString REGEXP zPattern
265 ** regexp(zPattern, zString)
267 ** Uses the following ICU regexp APIs:
269 ** uregex_open()
270 ** uregex_matches()
271 ** uregex_close()
273 static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
274 UErrorCode status = U_ZERO_ERROR;
275 URegularExpression *pExpr;
276 UBool res;
277 const UChar *zString = sqlite3_value_text16(apArg[1]);
279 (void)nArg; /* Unused parameter */
281 /* If the left hand side of the regexp operator is NULL,
282 ** then the result is also NULL.
284 if( !zString ){
285 return;
288 pExpr = sqlite3_get_auxdata(p, 0);
289 if( !pExpr ){
290 const UChar *zPattern = sqlite3_value_text16(apArg[0]);
291 if( !zPattern ){
292 return;
294 pExpr = uregex_open(zPattern, -1, 0, 0, &status);
296 if( U_SUCCESS(status) ){
297 sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
298 }else{
299 assert(!pExpr);
300 icuFunctionError(p, "uregex_open", status);
301 return;
305 /* Configure the text that the regular expression operates on. */
306 uregex_setText(pExpr, zString, -1, &status);
307 if( !U_SUCCESS(status) ){
308 icuFunctionError(p, "uregex_setText", status);
309 return;
312 /* Attempt the match */
313 res = uregex_matches(pExpr, 0, &status);
314 if( !U_SUCCESS(status) ){
315 icuFunctionError(p, "uregex_matches", status);
316 return;
319 /* Set the text that the regular expression operates on to a NULL
320 ** pointer. This is not really necessary, but it is tidier than
321 ** leaving the regular expression object configured with an invalid
322 ** pointer after this function returns.
324 uregex_setText(pExpr, 0, 0, &status);
326 /* Return 1 or 0. */
327 sqlite3_result_int(p, res ? 1 : 0);
331 ** Implementations of scalar functions for case mapping - upper() and
332 ** lower(). Function upper() converts its input to upper-case (ABC).
333 ** Function lower() converts to lower-case (abc).
335 ** ICU provides two types of case mapping, "general" case mapping and
336 ** "language specific". Refer to ICU documentation for the differences
337 ** between the two.
339 ** To utilise "general" case mapping, the upper() or lower() scalar
340 ** functions are invoked with one argument:
342 ** upper('ABC') -> 'abc'
343 ** lower('abc') -> 'ABC'
345 ** To access ICU "language specific" case mapping, upper() or lower()
346 ** should be invoked with two arguments. The second argument is the name
347 ** of the locale to use. Passing an empty string ("") or SQL NULL value
348 ** as the second argument is the same as invoking the 1 argument version
349 ** of upper() or lower().
351 ** lower('I', 'en_us') -> 'i'
352 ** lower('I', 'tr_tr') -> '\u131' (small dotless i)
354 ** http://www.icu-project.org/userguide/posix.html#case_mappings
356 static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
357 const UChar *zInput; /* Pointer to input string */
358 UChar *zOutput = 0; /* Pointer to output buffer */
359 int nInput; /* Size of utf-16 input string in bytes */
360 int nOut; /* Size of output buffer in bytes */
361 int cnt;
362 int bToUpper; /* True for toupper(), false for tolower() */
363 UErrorCode status;
364 const char *zLocale = 0;
366 assert(nArg==1 || nArg==2);
367 bToUpper = (sqlite3_user_data(p)!=0);
368 if( nArg==2 ){
369 zLocale = (const char *)sqlite3_value_text(apArg[1]);
372 zInput = sqlite3_value_text16(apArg[0]);
373 if( !zInput ){
374 return;
376 nOut = nInput = sqlite3_value_bytes16(apArg[0]);
377 if( nOut==0 ){
378 sqlite3_result_text16(p, "", 0, SQLITE_STATIC);
379 return;
382 for(cnt=0; cnt<2; cnt++){
383 UChar *zNew = sqlite3_realloc(zOutput, nOut);
384 if( zNew==0 ){
385 sqlite3_free(zOutput);
386 sqlite3_result_error_nomem(p);
387 return;
389 zOutput = zNew;
390 status = U_ZERO_ERROR;
391 if( bToUpper ){
392 nOut = 2*u_strToUpper(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
393 }else{
394 nOut = 2*u_strToLower(zOutput,nOut/2,zInput,nInput/2,zLocale,&status);
397 if( U_SUCCESS(status) ){
398 sqlite3_result_text16(p, zOutput, nOut, xFree);
399 }else if( status==U_BUFFER_OVERFLOW_ERROR ){
400 assert( cnt==0 );
401 continue;
402 }else{
403 icuFunctionError(p, bToUpper ? "u_strToUpper" : "u_strToLower", status);
405 return;
407 assert( 0 ); /* Unreachable */
411 ** Collation sequence destructor function. The pCtx argument points to
412 ** a UCollator structure previously allocated using ucol_open().
414 static void icuCollationDel(void *pCtx){
415 UCollator *p = (UCollator *)pCtx;
416 ucol_close(p);
420 ** Collation sequence comparison function. The pCtx argument points to
421 ** a UCollator structure previously allocated using ucol_open().
423 static int icuCollationColl(
424 void *pCtx,
425 int nLeft,
426 const void *zLeft,
427 int nRight,
428 const void *zRight
430 UCollationResult res;
431 UCollator *p = (UCollator *)pCtx;
432 res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
433 switch( res ){
434 case UCOL_LESS: return -1;
435 case UCOL_GREATER: return +1;
436 case UCOL_EQUAL: return 0;
438 assert(!"Unexpected return value from ucol_strcoll()");
439 return 0;
443 ** Implementation of the scalar function icu_load_collation().
445 ** This scalar function is used to add ICU collation based collation
446 ** types to an SQLite database connection. It is intended to be called
447 ** as follows:
449 ** SELECT icu_load_collation(<locale>, <collation-name>);
451 ** Where <locale> is a string containing an ICU locale identifier (i.e.
452 ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
453 ** collation sequence to create.
455 static void icuLoadCollation(
456 sqlite3_context *p,
457 int nArg,
458 sqlite3_value **apArg
460 sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
461 UErrorCode status = U_ZERO_ERROR;
462 const char *zLocale; /* Locale identifier - (eg. "jp_JP") */
463 const char *zName; /* SQL Collation sequence name (eg. "japanese") */
464 UCollator *pUCollator; /* ICU library collation object */
465 int rc; /* Return code from sqlite3_create_collation_x() */
467 assert(nArg==2);
468 (void)nArg; /* Unused parameter */
469 zLocale = (const char *)sqlite3_value_text(apArg[0]);
470 zName = (const char *)sqlite3_value_text(apArg[1]);
472 if( !zLocale || !zName ){
473 return;
476 pUCollator = ucol_open(zLocale, &status);
477 if( !U_SUCCESS(status) ){
478 icuFunctionError(p, "ucol_open", status);
479 return;
481 assert(p);
483 rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
484 icuCollationColl, icuCollationDel
486 if( rc!=SQLITE_OK ){
487 ucol_close(pUCollator);
488 sqlite3_result_error(p, "Error registering collation function", -1);
493 ** Register the ICU extension functions with database db.
495 int sqlite3IcuInit(sqlite3 *db){
496 static const struct IcuScalar {
497 const char *zName; /* Function name */
498 unsigned char nArg; /* Number of arguments */
499 unsigned short enc; /* Optimal text encoding */
500 unsigned char iContext; /* sqlite3_user_data() context */
501 void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
502 } scalars[] = {
503 {"icu_load_collation", 2, SQLITE_UTF8, 1, icuLoadCollation},
504 {"regexp", 2, SQLITE_ANY|SQLITE_DETERMINISTIC, 0, icuRegexpFunc},
505 {"lower", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
506 {"lower", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
507 {"upper", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
508 {"upper", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
509 {"lower", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
510 {"lower", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
511 {"upper", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
512 {"upper", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
513 {"like", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
514 {"like", 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
516 int rc = SQLITE_OK;
517 int i;
520 for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
521 const struct IcuScalar *p = &scalars[i];
522 rc = sqlite3_create_function(
523 db, p->zName, p->nArg, p->enc,
524 p->iContext ? (void*)db : (void*)0,
525 p->xFunc, 0, 0
529 return rc;
532 #if !SQLITE_CORE
533 #ifdef _WIN32
534 __declspec(dllexport)
535 #endif
536 int sqlite3_icu_init(
537 sqlite3 *db,
538 char **pzErrMsg,
539 const sqlite3_api_routines *pApi
541 SQLITE_EXTENSION_INIT2(pApi)
542 return sqlite3IcuInit(db);
544 #endif
546 #endif