# The author disclaims copyright to this source code. In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#
# This is a modified copy of the FTS4 test file "fts4_unicode.test".

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5unicode2

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

proc do_unicode_token_test {tn input res} {
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input
  ] [list {*}$res]]
}

proc do_unicode_token_test2 {tn input res} {
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db "unicode61" $input
  ] [list {*}$res]]
}

proc do_unicode_token_test3 {tn args} {
  set tokenizer [concat unicode61 {*}[lrange $args 0 end-2]]
  set input [lindex $args end-1]
  set res [lindex $args end]
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db $tokenizer $input
  ] [list {*}$res]]
}
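
# Note on the expected values used throughout this file: with the -subst
# option, sqlite3_fts5_tokenize returns a flat list of pairs, each folded
# token followed by the original text it was derived from. That is why
# every token appears twice below, e.g. input {a B} gives {a a b B}.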

do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"

do_unicode_token_test 1.5 "The quick brown fox" {
  the The quick quick brown brown fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  the The quick quick brown brown fox fox
}

do_unicode_token_test2 1.7 {a B c D} {a a b B c c d D}
do_unicode_token_test2 1.8 "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"

do_unicode_token_test2 1.9 "x\uC4x x\uD6x x\uDCx" \
    "xax x\uC4x xox x\uD6x xux x\uDCx"

# Check that diacritics are removed if remove_diacritics=1 is specified
# (it is the default for unicode61), and that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u0301xx"
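# (\u0301 is COMBINING ACUTE ACCENT: the folded token drops it, the
# original text retains it, and the token is not split.)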

# Title-case mappings work.
do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"
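# (\u01C5 is the title-case digraph "Dz with caron"; case folding maps
# it to its lower-case form \u01C6.)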

do_unicode_token_test 1.12 "\u00C1abc\u00C2 \u00D1def\u00C3" \
    "\u00E1abc\u00E2 \u00C1abc\u00C2 \u00F1def\u00E3 \u00D1def\u00C3"

do_unicode_token_test 1.13 "\u00A2abc\u00A3 \u00A4def\u00A5" \
    "abc abc def def"
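# (\u00A2-\u00A5 are currency symbols, not alphanumerics, so they act
# as separators and appear in neither the folded nor the original
# tokens.)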

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
} {
  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
} {
  Added the sqlite3_db_readonly() interface.
} {
  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
  ability to add new PRAGMA statements or to override built-in PRAGMAs.
} {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
} {
  Added support for the FTS4 languageid option.
} {
  Documented support for the FTS4 content option. This feature has actually
  been in the code since version 3.7.9 but is only now considered to be
  officially supported.
} {
  Pending statements no longer block ROLLBACK. Instead, the pending statement
  will return SQLITE_ABORT upon next access after the ROLLBACK.
} {
  Improvements to the handling of CSV inputs in the command-line shell
} {
  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
  connected by OR.
}]

set map(a) [list "\u00C4" "\u00E4"] ; # LATIN LETTER A WITH DIAERESIS
set map(e) [list "\u00CB" "\u00EB"] ; # LATIN LETTER E WITH DIAERESIS
set map(i) [list "\u00CF" "\u00EF"] ; # LATIN LETTER I WITH DIAERESIS
set map(o) [list "\u00D6" "\u00F6"] ; # LATIN LETTER O WITH DIAERESIS
set map(u) [list "\u00DC" "\u00FC"] ; # LATIN LETTER U WITH DIAERESIS
set map(y) [list "\u0178" "\u00FF"] ; # LATIN LETTER Y WITH DIAERESIS
set map(h) [list "\u1E26" "\u1E27"] ; # LATIN LETTER H WITH DIAERESIS
set map(w) [list "\u1E84" "\u1E85"] ; # LATIN LETTER W WITH DIAERESIS
set map(x) [list "\u1E8C" "\u1E8D"] ; # LATIN LETTER X WITH DIAERESIS
foreach k [array names map] {
  lappend mappings [string toupper $k] [lindex $map($k) 0]
  lappend mappings $k [lindex $map($k) 1]
}
proc mapdoc {doc} {
  set doc [regsub -all {[[:space:]]+} $doc " "]
  string map $::mappings [string trim $doc]
}
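
# Quick illustration of mapdoc (a sketch, not one of the tests): it
# collapses runs of whitespace, trims the result, and maps the letters
# above to their diaeresis forms, e.g.:
#
#   mapdoc "  cat   dog "   ;# returns "c\u00E4t d\u00F6g"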

do_test 2.0 {
  execsql { CREATE VIRTUAL TABLE t2 USING fts5(tokenize=unicode61, x); }
  foreach doc $docs {
    set d [mapdoc $doc]
    execsql { INSERT INTO t2 VALUES($d) }
  }
} {}

do_test 2.1 {
  set q [mapdoc "row"]
  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
} [list [mapdoc {
  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
  the same row that contains the maximum x value.
}]]

foreach {tn query snippet} {
  2 "row" {
     ...returns the value of y on the same [row] that contains
     the maximum x value.
  }
  3 "ROW" {
     ...returns the value of y on the same [row] that contains
     the maximum x value.
  }
  4 "rollback" {
     Pending statements no longer block [ROLLBACK]. Instead, the pending
     statement will return SQLITE_ABORT upon...
  }
  5 "rOllback" {
     Pending statements no longer block [ROLLBACK]. Instead, the pending
     statement will return SQLITE_ABORT upon...
  }
  6 "lang*" {
     Added support for the FTS4 [languageid] option.
  }
} {
  do_test 2.$tn {
    set q [mapdoc $query]
    execsql {
      SELECT snippet(t2, -1, '[', ']', '...', 15) FROM t2 WHERE t2 MATCH $q
    }
  } [list [mapdoc $snippet]]
}
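
# (Arguments to snippet() above: column -1 lets snippet() choose the
# column, '[' and ']' delimit matched terms, '...' marks truncated
# context, and 15 is the approximate snippet size in tokens.)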

#-------------------------------------------------------------------------
# Make sure the unicode61 tokenizer does not crash if it is passed a
# NULL pointer.
reset_db
do_execsql_test 3.1 {
  CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x, y);
  INSERT INTO t1 VALUES(NULL, 'a b c');
}

do_execsql_test 3.2 {
  SELECT snippet(t1, -1, '[', ']', '...', 15) FROM t1 WHERE t1 MATCH 'b'
} {{a [b] c}}

do_execsql_test 3.3 {
  BEGIN;
  DELETE FROM t1;
  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 SELECT * FROM t1;
  INSERT INTO t1 VALUES('a b c', NULL);
  INSERT INTO t1 VALUES('a x c', NULL);
  COMMIT;
}

do_execsql_test 3.4 {
  SELECT * FROM t1 WHERE t1 MATCH 'a b';
} {{a b c} {}}

#-------------------------------------------------------------------------
#
reset_db

do_test 4.1 {
  set a "abc\uFFFEdef"
  set b "abc\uD800def"
  set c "\uFFFEdef"
  set d "\uD800def"
  execsql {
    CREATE VIRTUAL TABLE t1 USING fts5(tokenize=unicode61, x);
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }

  execsql "CREATE VIRTUAL TABLE t8 USING fts5(
      a, b, tokenize=\"unicode61 separators '\uFFFE\uD800\u00BF'\"
  )"
} {}

do_test 4.2 {
  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.3 {
  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
  execsql {
    INSERT INTO t1 VALUES($a);
    INSERT INTO t1 VALUES($b);
    INSERT INTO t1 VALUES($c);
    INSERT INTO t1 VALUES($d);
  }
} {}

do_test 4.4 {
  sqlite3_exec_hex db {
    CREATE VIRTUAL TABLE t9 USING fts5(a, b,
      tokenize="unicode61 separators '%C09004'"
    );
    INSERT INTO t9(a) VALUES('abc%88def %89ghi%90');
  }
} {0 {}}
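
# The inputs above are deliberately hostile: unpaired surrogates and
# noncharacter codepoints, plus over-long/invalid UTF-8 byte runs
# (0xF7 0xBF ...). These tests only assert that tokenizing such input
# completes without crashing, not any particular token output.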

#-------------------------------------------------------------------------
#
do_unicode_token_test3 5.1 {tokenchars {}} {
  sqlite3_reset sqlite3_column_int
} {
  sqlite3 sqlite3
  reset reset
  sqlite3 sqlite3
  column column
  int int
}

do_unicode_token_test3 5.2 {tokenchars _} {
  sqlite3_reset sqlite3_column_int
} {
  sqlite3_reset sqlite3_reset
  sqlite3_column_int sqlite3_column_int
}

do_unicode_token_test3 5.3 {separators xyz} {
  Laotianxhorseyrunszfast
} {
  laotian Laotian
  horse horse
  runs runs
  fast fast
}

do_unicode_token_test3 5.4 {tokenchars xyz} {
  Laotianxhorseyrunszfast
} {
  laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  sqlite3_reset sqlite3_reset
  sqlite3_column_int sqlite3_column_int
  honda_phantom honda_phantom
}

do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
  abc abc def def
}

do_unicode_token_test3 5.7 \
  "tokenchars \u2444\u2445" \
  "separators \u05D0\u05D1\u05D2" \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list \
    \u2444fre\u2445sh \u2444fre\u2445sh \
    water water \
    fish fish \
    \u2445timer \u2445timer \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars.
do_unicode_token_test3 5.8 "separators \u0301" \
  "hello\u0301world \u0301helloworld" \
  "helloworld hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars \u0301" \
  "hello\u0301world \u0301helloworld" \
  "helloworld hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.10 "separators \u0301" \
  "remove_diacritics 0" \
  "hello\u0301world \u0301helloworld" \
  "hello\u0301world hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars \u0301" \
  "remove_diacritics 0" \
  "hello\u0301world \u0301helloworld" \
  "hello\u0301world hello\u0301world helloworld helloworld"

#-------------------------------------------------------------------------
#
proc do_tokenize {tokenizer txt} {
  set res [list]
  foreach {b c} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
    lappend res $b
  }
  set res
}

# Argument $lCp must be a list of codepoints (integers) that correspond
# to whitespace characters. This command creates a string $W from the
# codepoints, then tokenizes "${W}hello${W}world${W}" using tokenizer
# $tokenizer. The test passes if the tokenizer successfully extracts the
# two 5-character tokens.
proc do_isspace_test {tn tokenizer lCp} {
  set whitespace [format [string repeat %c [llength $lCp]] {*}$lCp]
  set txt "${whitespace}hello${whitespace}world${whitespace}"
  uplevel [list do_test $tn [list do_tokenize $tokenizer $txt] {hello world}]
}
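
# For example (illustrative only): with lCp {32 160}, [string repeat %c 2]
# yields "%c%c", so the format call builds the two-character string
# "\u0020\u00A0" that is then used as the whitespace separator.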

set tokenizers [list unicode61]
#ifcapable icu { lappend tokenizers icu }

# Some tests to check that the tokenizers correctly identify white-space
# codepoints. All codepoints tested below are of type "Zs" in the
# UnicodeData.txt file.
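# (For reference: 32 is SPACE, 160 NO-BREAK SPACE, 5760 OGHAM SPACE MARK,
# 8192-8202 the typographic spaces EN QUAD through HAIR SPACE, 8239 NARROW
# NO-BREAK SPACE, 8287 MEDIUM MATHEMATICAL SPACE and 12288 IDEOGRAPHIC
# SPACE.)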
foreach T $tokenizers {
  do_isspace_test 6.$T.1  $T 32
  do_isspace_test 6.$T.2  $T 160
  do_isspace_test 6.$T.3  $T 5760
  do_isspace_test 6.$T.4  $T 6158
  do_isspace_test 6.$T.5  $T 8192
  do_isspace_test 6.$T.6  $T 8193
  do_isspace_test 6.$T.7  $T 8194
  do_isspace_test 6.$T.8  $T 8195
  do_isspace_test 6.$T.9  $T 8196
  do_isspace_test 6.$T.10 $T 8197
  do_isspace_test 6.$T.11 $T 8198
  do_isspace_test 6.$T.12 $T 8199
  do_isspace_test 6.$T.13 $T 8200
  do_isspace_test 6.$T.14 $T 8201
  do_isspace_test 6.$T.15 $T 8202
  do_isspace_test 6.$T.16 $T 8239
  do_isspace_test 6.$T.17 $T 8287
  do_isspace_test 6.$T.18 $T 12288

  do_isspace_test 6.$T.19 $T {32 160 5760 6158}
  do_isspace_test 6.$T.20 $T {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T {8287 12288}
}

#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  foreach {tn2 config res} {
    1 ""             "hello*world hello*world"
    2 "separators *" "hello hello world world"
  } {
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}
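
# (U+E000-U+F8FF is the BMP private use area. The config "" case checks
# that a private use character neither splits the token nor changes
# under case folding: "hello*world" comes back as a single token.)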

#-------------------------------------------------------------------------
# Cursory test of remove_diacritics=0.
#
# 00C4;LATIN CAPITAL LETTER A WITH DIAERESIS
# 00D6;LATIN CAPITAL LETTER O WITH DIAERESIS
# 00E4;LATIN SMALL LETTER A WITH DIAERESIS
# 00F6;LATIN SMALL LETTER O WITH DIAERESIS
#
do_execsql_test 8.1.1 "
  CREATE VIRTUAL TABLE t3 USING fts5(
    content, tokenize='unicode61 remove_diacritics 1'
  );
  INSERT INTO t3 VALUES('o');
  INSERT INTO t3 VALUES('a');
  INSERT INTO t3 VALUES('O');
  INSERT INTO t3 VALUES('A');
  INSERT INTO t3 VALUES('\xD6');
  INSERT INTO t3 VALUES('\xC4');
  INSERT INTO t3 VALUES('\xF6');
  INSERT INTO t3 VALUES('\xE4');
"
do_execsql_test 8.1.2 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'o' ORDER BY rowid ASC;
} {1 3 5 7}
do_execsql_test 8.1.3 {
  SELECT rowid FROM t3 WHERE t3 MATCH 'a' ORDER BY rowid ASC;
} {2 4 6 8}
do_execsql_test 8.2.1 {
  CREATE VIRTUAL TABLE t4 USING fts5(
    content, tokenize='unicode61 remove_diacritics 0'
  );
  INSERT INTO t4 SELECT * FROM t3 ORDER BY rowid ASC;
}
do_execsql_test 8.2.2 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'o' ORDER BY rowid ASC;
} {1 3}
do_execsql_test 8.2.3 {
  SELECT rowid FROM t4 WHERE t4 MATCH 'a' ORDER BY rowid ASC;
} {2 4}
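
# With remove_diacritics=1 (table t3) the query 'o' also matches the
# rows containing \xD6 and \xF6, so four rowids are returned. With
# remove_diacritics=0 (table t4) the diacritic forms are indexed
# verbatim and only the plain 'o'/'O' rows match.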

#-------------------------------------------------------------------------
#
foreach {tn sql} {
  1 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 [tokenchars= .]);
    CREATE VIRTUAL TABLE t6 USING fts4(
        tokenize=unicode61 [tokenchars=="] "tokenchars=[]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 [separators=x\xC4]);
  }
  2 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 "tokenchars= .");
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 "tokenchars=[=""]");
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 "separators=x\xC4");
  }
  3 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 'tokenchars= .');
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 'tokenchars=="[]');
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 'separators=x\xC4');
  }
  4 {
    CREATE VIRTUAL TABLE t5 USING fts4(tokenize=unicode61 `tokenchars= .`);
    CREATE VIRTUAL TABLE t6 USING fts4(tokenize=unicode61 `tokenchars=[="]`);
    CREATE VIRTUAL TABLE t7 USING fts4(tokenize=unicode61 `separators=x\xC4`);
  }
} {
  do_execsql_test 9.$tn.0 {
    DROP TABLE IF EXISTS t5;
    DROP TABLE IF EXISTS t5aux;
    DROP TABLE IF EXISTS t6;
    DROP TABLE IF EXISTS t6aux;
    DROP TABLE IF EXISTS t7;
    DROP TABLE IF EXISTS t7aux;
  }
  do_execsql_test 9.$tn.1 $sql

  do_execsql_test 9.$tn.2 {
    CREATE VIRTUAL TABLE t5aux USING fts4aux(t5);
    INSERT INTO t5 VALUES('one two three/four.five.six');
    SELECT * FROM t5aux;
  } {
    four.five.six * 1 1 four.five.six 0 1 1
    {one two three} * 1 1 {one two three} 0 1 1
  }

  do_execsql_test 9.$tn.3 {
    CREATE VIRTUAL TABLE t6aux USING fts4aux(t6);
    INSERT INTO t6 VALUES('alpha=beta"gamma/delta[epsilon]zeta');
    SELECT * FROM t6aux;
  } {
    {alpha=beta"gamma} * 1 1 {alpha=beta"gamma} 0 1 1
    {delta[epsilon]zeta} * 1 1 {delta[epsilon]zeta} 0 1 1
  }

  do_execsql_test 9.$tn.4 {
    CREATE VIRTUAL TABLE t7aux USING fts4aux(t7);
    INSERT INTO t7 VALUES('alephxbeth\xC4gimel');
    SELECT * FROM t7aux;
  } {
    aleph * 1 1 aleph 0 1 1
    beth * 1 1 beth 0 1 1
    gimel * 1 1 gimel 0 1 1
  }
}
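
# The four variants above are the four quoting styles SQLite accepts for
# the tokenizer arguments ([...], "...", '...' and `...`). Each variant
# must yield an equivalent tokenizer, so tests 9.$tn.2 through 9.$tn.4
# expect identical fts4aux term listings for every $tn.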

#-------------------------------------------------------------------------
# Check that multiple options are handled correctly.
#
do_execsql_test 10.1 {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61
    "tokenchars=xyz" "tokenchars=.=" "separators=.=" "separators=xy"
    "separators=a" "separators=a" "tokenchars=a" "tokenchars=a"
  );

  INSERT INTO t1 VALUES('oneatwoxthreeyfour');
  INSERT INTO t1 VALUES('a.single=word');
  CREATE VIRTUAL TABLE t1aux USING fts4aux(t1);
  SELECT * FROM t1aux;
} {
  .single=word * 1 1 .single=word 0 1 1
  four * 1 1 four 0 1 1
  one * 1 1 one 0 1 1
  three * 1 1 three 0 1 1
  two * 1 1 two 0 1 1
}
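
# (Note how the conflicting options above are resolved: a character
# appears to be flippable only away from its default category, and
# flipping it back is a no-op. "separators=xy" turns the default-
# alphanumeric x and y into separators, "separators=.=" does nothing
# because '.' and '=' are not alphanumeric by default, and 'a' stays a
# separator despite the later "tokenchars=a". That yields the term list
# above: one/two/three/four plus the single token .single=word.)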

# Test that case folding happens after tokenization, not before.
#
do_execsql_test 10.2 {
  DROP TABLE IF EXISTS t2;
  CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61 "separators=aB");
  INSERT INTO t2 VALUES('oneatwoBthree');
  INSERT INTO t2 VALUES('onebtwoAthree');
  CREATE VIRTUAL TABLE t2aux USING fts4aux(t2);
  SELECT * FROM t2aux;
} {
  one * 1 1 one 0 1 1
  onebtwoathree * 1 1 onebtwoathree 0 1 1
  three * 1 1 three 0 1 1
  two * 1 1 two 0 1 1
}
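
# ('oneatwoBthree' splits on the literal separators 'a' and 'B', giving
# one/two/three. 'onebtwoAthree' contains neither separator, so it
# survives as a single token that is only then folded to
# 'onebtwoathree'. If folding happened first, both rows would split
# identically.)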

#-------------------------------------------------------------------------
# Test that the tokenchars and separators options work with the
# fts3tokenize table.
#
do_execsql_test 11.1 {
  CREATE VIRTUAL TABLE ft1 USING fts3tokenize(
    "unicode61", "tokenchars=@.", "separators=1234567890"
  );
  SELECT token FROM ft1 WHERE input = 'berlin@street123sydney.road';
} {
  berlin@street sydney.road
}

finish_test