Add tests for the new code on this branch.
[sqlite.git] / ext / fts3 / unicode / mkunicode.tcl
blob58d90c68c72adf86b69aa380f82df0010897aa53
2 source [file join [file dirname [info script]] parseunicode.tcl]
# print_rd --
#
#   Write to stdout the C source of the "remove diacritic" function. The
#   name of the generated function is taken from the global variable
#   ::remove_diacritic.
#
# Arguments:
#   map   List of {codepoint char flag} triples. Element 0 is the integer
#         codepoint, element 1 the replacement character (emitted as a C
#         character literal, or '\0' when empty) and element 2 a boolean
#         that causes the HIBIT marker to be OR-ed into the table entry.
#
# Consecutive map entries that share the same char and flag are merged
# into a single range, provided that every codepoint skipped between them
# has an entry in the global tl_lookup_table array and that the merged
# range covers at most 8 codepoints. Each range is encoded in aDia[] as
# ((first codepoint << 3) + (range size - 1)).
#
proc print_rd {map} {
  global tl_lookup_table
  set aChar  [list]
  set aFlag  [list]
  set lRange [list]

  set nRange 1
  set iFirst [lindex $map 0 0]
  set cPrev  [lindex $map 0 1]
  set fPrev  [lindex $map 0 2]

  foreach m [lrange $map 1 end] {
    foreach {i c f} $m {}

    if {$cPrev == $c && $fPrev==$f} {
      # Check that every codepoint in the gap is present in the fold table.
      for {set j [expr {$iFirst+$nRange}]} {$j<$i} {incr j} {
        if {[info exists tl_lookup_table($j)]==0} break
      }

      if {$j==$i} {
        set nNew [expr {1 + $i - $iFirst}]
        # Only 3 bits are available for the range size.
        if {$nNew<=8} {
          set nRange $nNew
          continue
        }
      }
    }

    lappend lRange [list $iFirst $nRange]
    lappend aChar $cPrev
    lappend aFlag $fPrev

    set iFirst $i
    set cPrev $c
    set fPrev $f
    set nRange 1
  }

  # Flush the range still being accumulated.
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev
  lappend aFlag $fPrev

  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 101 (\"LATIN SMALL LETTER"
  puts "** E\"). The results of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
  puts " unsigned short aDia\[\] = \{"
  puts -nonewline " 0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline " " }
    incr i

    puts -nonewline [format "%5d" [expr {($iCode<<3) + $nRange-1}]]
    puts -nonewline ", "
  }
  puts ""
  puts " \};"
  puts "#define HIBIT ((unsigned char)0x80)"
  puts " unsigned char aChar\[\] = \{"
  puts -nonewline " '\\0', "
  set i 1
  foreach c $aChar f $aFlag {
    if { $f } {
      set str "'$c'|HIBIT, "
    } else {
      set str "'$c', "
    }
    if {$c == ""} { set str "'\\0', " }

    if {($i % 6)==0} {puts "" ; puts -nonewline " " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts " \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
  puts "\}"
}
# print_isdiacritic --
#
#   Write to stdout the C source of a function named $zFunc that returns
#   non-zero when its argument is a diacritical modifier codepoint.
#
# Arguments:
#   zFunc   Name of the generated C function.
#   map     List of {codepoint char flag} triples. An entry contributes a
#           diacritic when its flag is false, its codepoint is non-zero
#           and its char is the empty string.
#
# The generated function tests membership with two 32-bit masks, so the
# selected codepoints must all lie within 64 values of the smallest one.
# An error is raised if that does not hold, or if no codepoint qualifies,
# instead of silently emitting broken C.
#
proc print_isdiacritic {zFunc map} {

  set lCode [list]
  foreach m $map {
    foreach {code char flag} $m {}
    if {$flag} continue
    if {$code && $char == ""} { lappend lCode $code }
  }
  set lCode [lsort -integer $lCode]
  if {[llength $lCode]==0} {
    error "print_isdiacritic: no diacritic codepoints in map"
  }
  set iFirst [lindex $lCode 0]
  set iLast  [lindex $lCode end]
  if {($iLast - $iFirst) >= 64} {
    error "print_isdiacritic: codepoints span more than 64 values"
  }

  # Bit N of i1 (N<32) or bit N-32 of i2 is set for codepoint iFirst+N.
  set i1 0
  set i2 0

  foreach c $lCode {
    set i [expr {$c - $iFirst}]
    if {$i < 32} {
      set i1 [expr {$i1 | (1<<$i)}]
    } else {
      set i2 [expr {$i2 | (1<<($i-32))}]
    }
  }

  puts "/*"
  puts "** Return true if the argument interpreted as a unicode codepoint"
  puts "** is a diacritical modifier character."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  puts " unsigned int mask0 = [format "0x%08X" $i1];"
  puts " unsigned int mask1 = [format "0x%08X" $i2];"

  puts " if( c<$iFirst || c>$iLast ) return 0;"
  puts " return (c < $iFirst+32) ?"
  puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
  puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
  puts "\}"
}
144 #-------------------------------------------------------------------------
# an_load_separator_ranges --
#
#   Load the list of codepoints returned by [an_load_unicodedata_text]
#   for the file named by the global variable unicodedata.txt, and
#   collapse runs of consecutive codepoints into ranges.
#
# Results:
#   A list of {first count} pairs, in the order the codepoints were
#   returned by the loader.
#
proc an_load_separator_ranges {} {
  global unicodedata.txt
  set lCodepoint [an_load_unicodedata_text ${unicodedata.txt}]

  set lResult [list]
  foreach cp $lCodepoint {
    # Extend the open range when cp directly follows it.
    if {[info exists iFirst] && $cp == ($iFirst+$nRange)} {
      incr nRange
      continue
    }

    # Otherwise close the open range (if any) and start a new one.
    if {[info exists iFirst]} {
      lappend lResult [list $iFirst $nRange]
    }
    set iFirst $cp
    set nRange 1
  }

  lappend lResult [list $iFirst $nRange]
  return $lResult
}
# an_print_range_array --
#
#   Write to stdout the C declaration of the aEntry[] array, preceded by
#   a C comment describing its format.
#
# Arguments:
#   lRange   List of {first count} pairs. Each pair is packed into one
#            32-bit value as ((first << 10) + count).
#
# Raises an error if any first value needs more than 22 bits or any count
# needs more than 10 bits, since the pair could not be packed.
#
proc an_print_range_array {lRange} {
  set iFirstMax 0
  set nRangeMax 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
  }
  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}

  puts -nonewline " "
  puts [string trim {
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value
  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint
  ** C. It is not possible to represent a range larger than 1023 codepoints
  ** using this format.
  */
  }]
  puts -nonewline " static const unsigned int aEntry\[\] = \{"
  set i 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set u32 [format "0x%08X" [expr {($iFirst<<10) + $nRange}]]

    # Five values per output line.
    if {($i % 5)==0} {puts "" ; puts -nonewline " "}
    puts -nonewline " $u32,"
    incr i
  }
  puts ""
  puts " \};"
}
# an_print_ascii_bitmap --
#
#   Write to stdout the C declaration of aAscii[4], a 128-bit bitmap in
#   which bit (c & 0x1F) of word (c >> 5) is set for every codepoint
#   c <= 127 covered by one of the ranges in lRange.
#
# Arguments:
#   lRange   List of {first count} pairs.
#
proc an_print_ascii_bitmap {lRange} {
  set aWord [list 0 0 0 0]

  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set iEnd [expr {$iFirst + $nRange}]
    for {set cp $iFirst} {$cp < $iEnd} {incr cp} {
      # Codepoints only increase from here on, so nothing else qualifies.
      if {$cp > 127} break
      set iWord [expr {$cp >> 5}]
      set mask  [expr {1 << ($cp & 0x001F)}]
      lset aWord $iWord [expr {[lindex $aWord $iWord] | $mask}]
    }
  }

  puts " static const unsigned int aAscii\[4\] = \{"
  set line " "
  foreach w $aWord {
    append line [format " 0x%08X," $w]
  }
  puts $line
  puts " \};"
}
# print_isalnum --
#
#   Write to stdout the C source of a function named $zFunc taking a
#   single int codepoint argument.
#
# Arguments:
#   zFunc    Name of the generated C function.
#   lRange   List of ranges handed unchanged to [an_print_range_array]
#            and [an_print_ascii_bitmap], which emit the aEntry[] and
#            aAscii[] tables the generated body reads.
#
228 proc print_isalnum {zFunc lRange} {
229 puts "/*"
230 puts "** Return true if the argument corresponds to a unicode codepoint"
231 puts "** classified as either a letter or a number. Otherwise false."
232 puts "**"
233 puts "** The results are undefined if the value passed to this function"
234 puts "** is less than zero."
235 puts "*/"
236 puts "int ${zFunc}\(int c)\{"
# Emit the two static lookup tables.
237 an_print_range_array $lRange
238 an_print_ascii_bitmap $lRange
# Emit the function body: values below 128 are answered from the aAscii
# bitmap, values below 1<<22 by a binary search of aEntry, and anything
# larger returns 1.
239 puts {
240 if( (unsigned int)c<128 ){
241 return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
242 }else if( (unsigned int)c<(1<<22) ){
243 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
244 int iRes = 0;
245 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
246 int iLo = 0;
247 while( iHi>=iLo ){
248 int iTest = (iHi + iLo) / 2;
249 if( key >= aEntry[iTest] ){
250 iRes = iTest;
251 iLo = iTest+1;
252 }else{
253 iHi = iTest-1;
256 assert( aEntry[0]<key );
257 assert( key>=aEntry[iRes] );
258 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
260 return 1;}
261 puts "\}"
# print_test_isalnum --
#
#   Write to stdout the C source of isalnum_test(), a self-test for the
#   generated function named $zFunc.
#
# Arguments:
#   zFunc    Name of the C function under test.
#   lRange   List of first/count pairs describing the codepoints for
#            which $zFunc is expected to return 0.
#
264 proc print_test_isalnum {zFunc lRange} {
# Mark every codepoint covered by lRange in the local array a.
265 foreach range $lRange {
266 foreach {iFirst nRange} $range {}
267 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
270 puts "static int isalnum_test(int *piCode)\{"
# aAlnum holds one expected result for each codepoint 0..69999: 1 when the
# codepoint is not marked, 0 when it is.
271 puts -nonewline " unsigned char aAlnum\[\] = \{"
272 for {set i 0} {$i < 70000} {incr i} {
273 if {($i % 32)==0} { puts "" ; puts -nonewline " " }
274 set bFlag [expr ![info exists a($i)]]
275 puts -nonewline "${bFlag},"
277 puts ""
278 puts " \};"
# aLargeSep lists the marked codepoints that are 70000 or greater.
280 puts -nonewline " int aLargeSep\[\] = \{"
281 set i 0
282 foreach iSep [lsort -integer [array names a]] {
283 if {$iSep<70000} continue
284 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
285 puts -nonewline " $iSep,"
286 incr i
288 puts ""
289 puts " \};"
# aLargeOther lists the unmarked immediate neighbours (value-1 and value+1)
# of the marked codepoints that are 70000 or greater.
290 puts -nonewline " int aLargeOther\[\] = \{"
291 set i 0
292 foreach iSep [lsort -integer [array names a]] {
293 if {$iSep<70000} continue
294 if {[info exists a([expr $iSep-1])]==0} {
295 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
296 puts -nonewline " [expr $iSep-1],"
297 incr i
299 if {[info exists a([expr $iSep+1])]==0} {
300 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
301 puts -nonewline " [expr $iSep+1],"
302 incr i
305 puts ""
306 puts " \};"
# Emit the loops that compare $zFunc against the three tables. On the
# first mismatch the generated code stores the codepoint in *piCode and
# returns 1; otherwise it returns 0.
308 puts [subst -nocommands {
309 int i;
310 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
311 if( ${zFunc}(i)!=aAlnum[i] ){
312 *piCode = i;
313 return 1;
316 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
317 if( ${zFunc}(aLargeSep[i])!=0 ){
318 *piCode = aLargeSep[i];
319 return 1;
322 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
323 if( ${zFunc}(aLargeOther[i])!=1 ){
324 *piCode = aLargeOther[i];
325 return 1;
329 puts " return 0;"
330 puts "\}"
333 #-------------------------------------------------------------------------
# tl_create_records --
#
#   Compress the global tl_lookup_table array (codepoint -> mapped
#   codepoint) into a list of range records.
#
# Results:
#   A list of {iFirst nIncr nRange nOff} records, where iFirst is the
#   first codepoint of the range, nIncr the step between members (1 or
#   2), nRange the number of members and nOff the constant difference
#   (mapping - codepoint) shared by all members.
#
proc tl_create_records {} {
  global tl_lookup_table

  set lResult [list]
  set start ""
  set offset 0
  set count 0
  set step 0

  foreach code [lsort -integer [array names tl_lookup_table]] {
    set codeOffset [expr {$tl_lookup_table($code) - $code}]

    if {$start ne ""} {
      # Distance from the last member of the open range to this codepoint.
      set gap [expr {$code - ($start + ($step * ($count - 1)))}]

      # A one-member range may still settle on a step of 1 or 2.
      if {$count == 1 && ($gap == 1 || $gap == 2)} {
        set step $gap
      }

      if {$gap == $step && $codeOffset == $offset} {
        incr count
        continue
      }

      # This codepoint does not fit: close the open range.
      if {$count == 1} { set step 1 }
      lappend lResult [list $start $step $count $offset]
    }

    # Open a new range at this codepoint.
    set start $code
    set offset $codeOffset
    set count 1
    set step 1
  }

  lappend lResult [list $start $step $count $offset]
  return $lResult
}
# tl_print_table_header --
#
#   Write to stdout the C comment describing the case-folding rule table,
#   followed by the opening lines of the "struct TableEntry" declaration
#   for the aEntry[] array. The array elements and the closing brace are
#   not written by this procedure.
#
375 proc tl_print_table_header {} {
376 puts -nonewline " "
377 puts [string trim {
378 /* Each entry in the following array defines a rule for folding a range
379 ** of codepoints to lower case. The rule applies to a range of nRange
380 ** codepoints starting at codepoint iCode.
382 ** If the least significant bit in flags is clear, then the rule applies
383 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
384 ** need to be folded). Or, if it is set, then the rule only applies to
385 ** every second codepoint in the range, starting with codepoint C.
387 ** The 7 most significant bits in flags are an index into the aiOff[]
388 ** array. If a specific codepoint C does require folding, then its lower
389 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
391 ** The contents of this array are generated by parsing the CaseFolding.txt
392 ** file distributed as part of the "Unicode Character Database". See
393 ** http://www.unicode.org for details.
# Opening of the struct declaration: a 16-bit iCode plus two 8-bit fields.
396 puts " static const struct TableEntry \{"
397 puts " unsigned short iCode;"
398 puts " unsigned char flags;"
399 puts " unsigned char nRange;"
400 puts " \} aEntry\[\] = \{"
# tl_print_table_entry --
#
#   Write one element of the aEntry[] table to stdout, three elements
#   per output line.
#
# Arguments:
#   togglevar   Name of a variable in the caller's scope holding the
#               column (0, 1 or 2) of the next element. It is created
#               with value 0 if it does not exist.
#   entry       An {iFirst nIncr nRange nOff} record.
#   liOff       List of distinct offsets; the index of nOff within it is
#               stored in the upper bits of the flags field.
#
# Results:
#   Returns 1 without printing anything if iFirst does not fit the
#   16-bit iCode field (the caller must handle the entry some other
#   way), otherwise 0.
#
# Raises an error if nOff is not present in liOff or if the range size
# does not fit the 8-bit nRange field. All validation is done before
# anything is written, so a failure never leaves a partial line behind.
#
proc tl_print_table_entry {togglevar entry liOff} {
  upvar $togglevar t
  foreach {iFirst nIncr nRange nOff} $entry {}

  # iCode is an unsigned short, so 65536 itself is already out of range.
  if {$iFirst >= (1<<16)} { return 1 }

  if {[info exists t]==0} {set t 0}

  set flags 0
  if {$nIncr==2} { set flags 1 ; set nRange [expr {$nRange * 2}] }
  if {$nRange > 255} { error "range size $nRange does not fit in nRange" }

  # Negative offsets are stored modulo 2^16.
  if {$nOff<0} { incr nOff [expr {1<<16}] }

  set idx [lsearch -exact $liOff $nOff]
  if {$idx<0} {error "malfunction generating aiOff"}
  set flags [expr {$flags + $idx*2}]

  if {$t==0} { puts -nonewline " " }
  set txt "{$iFirst, $flags, $nRange},"
  if {$t==2} {
    puts $txt
  } else {
    puts -nonewline [format "% -23s" $txt]
  }
  set t [expr {($t+1)%3}]

  return 0
}
# tl_print_table_footer --
#
#   Terminate the aEntry[] table on stdout. If the variable named by
#   togglevar in the caller's scope is non-zero, the last output line is
#   incomplete and is ended first.
#
proc tl_print_table_footer {togglevar} {
  upvar 1 $togglevar column
  if {$column != 0} {
    puts ""
  }
  puts " \};"
}
# tl_print_if_entry --
#
#   Write to stdout a C "else if" clause that adds nOff to c when c lies
#   in the half-open range starting at iFirst and nRange codepoints long.
#
# Arguments:
#   entry   An {iFirst nIncr nRange nOff} record. Records with nIncr of
#           2 are not supported and raise an error.
#
proc tl_print_if_entry {entry} {
  foreach {iFirst nIncr nRange nOff} $entry {}
  if {$nIncr==2} {
    error "tl_print_if_entry needs improvement!"
  }

  set iEnd [expr {$iFirst + $nRange}]
  puts " else if( c>=$iFirst && c<$iEnd )\{"
  puts " ret = c + $nOff;"
  puts " \}"
}
# tl_generate_ioff_table --
#
#   Collect the distinct offsets used by a list of fold records.
#
# Arguments:
#   lRecord   List of {iFirst nIncr nRange iOff} records.
#
# Results:
#   The distinct offsets sorted as integers, with negative offsets first
#   converted by adding 65536. Raises an error if there are more than
#   128 of them.
#
proc tl_generate_ioff_table {lRecord} {
  set lSeen [list]
  foreach record $lRecord {
    set off [lindex $record 3]
    if {$off < 0} {
      set off [expr {$off + (1<<16)}]
    }
    lappend lSeen $off
  }

  set liOff [lsort -integer -unique $lSeen]
  if {[llength $liOff] > 128} {
    error "Too many distinct ioffs"
  }
  return $liOff
}
# tl_print_ioff_table --
#
#   Write to stdout the C declaration of the aiOff[] array, eight values
#   per output line.
#
# Arguments:
#   liOff   List of offsets to emit, in order.
#
proc tl_print_ioff_table {liOff} {
  puts -nonewline " static const unsigned short aiOff\[\] = \{"
  set nOff [llength $liOff]
  for {set k 0} {$k < $nOff} {incr k} {
    if {$k % 8 == 0} {
      puts ""
      puts -nonewline " "
    }
    puts -nonewline [format "% -7s" "[lindex $liOff $k],"]
  }
  puts ""
  puts " \};"
}
# print_fold --
#
#   Write to stdout the C source of the case-folding function named
#   $zFunc, which takes a codepoint and an eRemoveDiacritic flag.
#
# Arguments:
#   zFunc   Name of the generated C function.
#
# The fold records come from [tl_create_records]. The generated body
# calls the function named by the global variable ::remove_diacritic.
#
472 proc print_fold {zFunc} {
474 set lRecord [tl_create_records]
476 set lHigh [list]
477 puts "/*"
478 puts "** Interpret the argument as a unicode codepoint. If the codepoint"
479 puts "** is an upper case character that has a lower case equivalent,"
480 puts "** return the codepoint corresponding to the lower case version."
481 puts "** Otherwise, return a copy of the argument."
482 puts "**"
483 puts "** The results are undefined if the value passed to this function"
484 puts "** is less than zero."
485 puts "*/"
486 puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
# Emit the aEntry table. Records for which tl_print_table_entry returns 1
# were not written to the table; they are collected in lHigh.
488 set liOff [tl_generate_ioff_table $lRecord]
489 tl_print_table_header
490 foreach entry $lRecord {
491 if {[tl_print_table_entry toggle $entry $liOff]} {
492 lappend lHigh $entry
495 tl_print_table_footer toggle
496 tl_print_ioff_table $liOff
# Emit the lookup code: ASCII is folded arithmetically, other codepoints
# below 65536 by a binary search of aEntry, followed by the optional
# diacritic removal.
498 puts [subst -nocommands {
499 int ret = c;
501 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
503 if( c<128 ){
504 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
505 }else if( c<65536 ){
506 const struct TableEntry *p;
507 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
508 int iLo = 0;
509 int iRes = -1;
511 assert( c>aEntry[0].iCode );
512 while( iHi>=iLo ){
513 int iTest = (iHi + iLo) / 2;
514 int cmp = (c - aEntry[iTest].iCode);
515 if( cmp>=0 ){
516 iRes = iTest;
517 iLo = iTest+1;
518 }else{
519 iHi = iTest-1;
523 assert( iRes>=0 && c>=aEntry[iRes].iCode );
524 p = &aEntry[iRes];
525 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
526 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
527 assert( ret>0 );
530 if( eRemoveDiacritic ){
531 ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
# The records kept out of the table are emitted as explicit else-if clauses.
536 foreach entry $lHigh {
537 tl_print_if_entry $entry
540 puts ""
541 puts " return ret;"
542 puts "\}"
# code --
#
#   Helper for emitting blocks of templated text. Blank lines at the
#   start and whitespace at the end of txt are discarded, the indentation
#   of the first remaining line is removed from every line, and variable
#   and backslash substitution (but not command substitution) is then
#   performed in the caller's scope.
#
# Arguments:
#   txt   Template text.
#
# Results:
#   The substituted text, with each line terminated by a newline.
#
proc code {txt} {
  set txt [string trimright $txt]
  set txt [string trimleft $txt "\n"]
  set n [expr {[string length $txt] - [string length [string trim $txt]]}]
  set ret ""
  foreach L [split $txt "\n"] {
    append ret "[string range $L $n end]\n"
  }
  # Pass the template as a single list element rather than splicing it
  # between braces, so templates containing unbalanced braces still work.
  return [uplevel 1 [list subst -nocommands $ret]]
}
# intarray --
#
#   Format a list of integers as the body of a C array initializer, ten
#   values per line, each value followed by a comma and left-justified
#   in a seven-character column.
#
# Arguments:
#   lInt   List of values to format.
#
# Results:
#   The formatted text, beginning and ending with a newline plus indent.
#
proc intarray {lInt} {
  set out ""
  set nInt [llength $lInt]
  set iStart 0
  while {$iStart < $nInt} {
    append out "\n "
    foreach value [lrange $lInt $iStart [expr {$iStart + 9}]] {
      append out [format "%-7s" "$value, "]
    }
    incr iStart 10
  }
  append out "\n "
  return $out
}
# categories_switch --
#
#   Build the text of one C "case" block that dispatches on the second
#   character of a category name.
#
# Arguments:
#   Cvar      Name of an array in the caller's scope that maps two-letter
#             category names to integer indexes.
#   first     First letter of the categories handled by this block.
#   lSecond   List of the second letters valid after that first letter.
#
# Results:
#   The C text. Each listed second letter sets its own aArray[] slot, and
#   '*' sets the slots of all of them.
#
proc categories_switch {Cvar first lSecond} {
  upvar 1 $Cvar aIndex

  set perLetter ""
  set wildcard ""
  foreach second $lSecond {
    set idx $aIndex($first$second)
    append perLetter " case '$second': aArray\[$idx\] = 1; break;\n"
    append wildcard " aArray\[$idx\] = 1;\n"
  }

  set out "case '$first':\n"
  append out " switch( zCat\[1\] ){\n"
  append out $perLetter
  append out " case '*': \n"
  append out $wildcard
  append out " break;\n"
  append out " default: return 1;"
  append out " }\n"
  append out " break;\n"
  return $out
}
587 # Argument is a list. Each element of which is itself a list of two elements:
589 # * the codepoint
590 # * the category
592 # List elements are sorted in order of codepoint.
# print_categories --
#
#   Write to stdout the C source of sqlite3Fts5UnicodeCatParse(),
#   sqlite3Fts5UnicodeCategory() and sqlite3Fts5UnicodeAscii(), together
#   with the three static tables they read.
#
# Arguments:
#   lMap   List of two-element lists: a codepoint written in hexadecimal
#          without the 0x prefix, and its category name.
#
594 proc print_categories {lMap} {
595 set categories {
596 Cc Cf Cn Cs
597 Ll Lm Lo Lt Lu
598 Mc Me Mn
599 Nd Nl No
600 Pc Pd Pe Pf Pi Po Ps
601 Sc Sk Sm So
602 Zl Zp Zs
604 LC Co
# Number the categories from 1 upwards, in the order listed above.
607 for {set i 0} {$i < [llength $categories]} {incr i} {
608 set C([lindex $categories $i]) [expr 1+$i]
# Build one C case block per leading category letter.
611 set caseC [categories_switch C C {c f n s o}]
612 set caseL [categories_switch C L {l m o t u C}]
613 set caseM [categories_switch C M {c e n}]
614 set caseN [categories_switch C N {d l o}]
615 set caseP [categories_switch C P {c d e f i o s}]
616 set caseS [categories_switch C S {c k m o}]
617 set caseZ [categories_switch C Z {l p s}]
619 set nCat [expr [llength [array names C]] + 1]
# Emit the parser that sets the aArray slots selected by a category string.
620 puts [code {
621 int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){
622 aArray[0] = 1;
623 switch( zCat[0] ){
624 $caseC
625 $caseL
626 $caseM
627 $caseN
628 $caseP
629 $caseS
630 $caseZ
632 return 0;
# Run-length encode lMap into lEntries, a list of first/class/count
# triples. A run continues while codepoints are consecutive and the class
# repeats. A run of class Lu whose second member is Ll becomes class LC,
# which then expects Lu and Ll to alternate.
636 set nRepeat 0
637 set first [lindex $lMap 0 0]
638 set class [lindex $lMap 0 1]
639 set prev -1
641 set CASE(0) "Lu"
642 set CASE(1) "Ll"
644 foreach m $lMap {
645 foreach {codepoint cl} $m {}
646 set codepoint [expr "0x$codepoint"]
# Codepoints at or above 1<<20 are not encoded.
647 if {$codepoint>=(1<<20)} continue
649 set bNew 0
650 if {$codepoint!=($prev+1)} {
651 set bNew 1
652 } elseif {
653 $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01]))
655 incr nRepeat
656 } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} {
657 set class LC
658 incr nRepeat
659 } else {
660 set bNew 1
662 if {$bNew} {
663 lappend lEntries [list $first $class $nRepeat]
664 set nRepeat 1
665 set first $codepoint
666 set class $cl
668 set prev $codepoint
670 if {$nRepeat>0} {
671 lappend lEntries [list $first $class $nRepeat]
# Split each entry into aMap (low 16 bits of the first codepoint) and
# aData (count shifted left by 5, plus the category index). aBlock has 17
# slots indexed by codepoint>>16; a slot that is still zero is set to the
# current length of aMap when an entry of that block or a later one is
# first seen.
674 set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
675 set aMap [list]
676 foreach e $lEntries {
677 foreach {cp class nRepeat} $e {}
678 set block [expr ($cp>>16)]
679 if {$block>0 && [lindex $aBlock $block]==0} {
680 for {set i 1} {$i<=$block} {incr i} {
681 if {[lindex $aBlock $i]==0} {
682 lset aBlock $i [llength $aMap]
686 lappend aMap [expr {$cp & 0xFFFF}]
687 lappend aData [expr {($nRepeat << 5) + $C($class)}]
# Slots that are still zero after the last entry get the final aMap length.
689 for {set i 1} {$i<[llength $aBlock]} {incr i} {
690 if {[lindex $aBlock $i]==0} {
691 lset aBlock $i [llength $aMap]
# Emit the three tables and the functions that read them.
695 set aBlockArray [intarray $aBlock]
696 set aMapArray [intarray $aMap]
697 set aDataArray [intarray $aData]
698 puts [code {
699 static u16 aFts5UnicodeBlock[] = {$aBlockArray};
700 static u16 aFts5UnicodeMap[] = {$aMapArray};
701 static u16 aFts5UnicodeData[] = {$aDataArray};
703 int sqlite3Fts5UnicodeCategory(u32 iCode) {
704 int iRes = -1;
705 int iHi;
706 int iLo;
707 int ret;
708 u16 iKey;
710 if( iCode>=(1<<20) ){
711 return 0;
713 iLo = aFts5UnicodeBlock[(iCode>>16)];
714 iHi = aFts5UnicodeBlock[1+(iCode>>16)];
715 iKey = (iCode & 0xFFFF);
716 while( iHi>iLo ){
717 int iTest = (iHi + iLo) / 2;
718 assert( iTest>=iLo && iTest<iHi );
719 if( iKey>=aFts5UnicodeMap[iTest] ){
720 iRes = iTest;
721 iLo = iTest+1;
722 }else{
723 iHi = iTest;
727 if( iRes<0 ) return 0;
728 if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0;
729 ret = aFts5UnicodeData[iRes] & 0x1F;
730 if( ret!=$C(LC) ) return ret;
731 return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu);
734 void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
735 int i = 0;
736 int iTbl = 0;
737 while( i<128 ){
738 int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
739 int n = (aFts5UnicodeData[iTbl] >> 5) + i;
740 for(; i<128 && i<n; i++){
741 aAscii[i] = (u8)bToken;
743 iTbl++;
745 aAscii[0] = 0; /* 0x00 is never a token character */
# print_test_categories --
#
#   Write to stdout the C source of categories_test(), a self-test for
#   sqlite3Fts5UnicodeCategory() and sqlite3Fts5UnicodeCatParse().
#
# Arguments:
#   lMap   List of two-element lists: a codepoint written in hexadecimal
#          without the 0x prefix, and its category name.
#
750 proc print_test_categories {lMap} {
# Build one C initializer per codepoint below 1<<20.
752 set lCP [list]
753 foreach e $lMap {
754 foreach {cp cat} $e {}
755 if {[expr 0x$cp] < (1<<20)} {
756 lappend lCP "{0x$cp, \"$cat\"}, "
# Lay the initializers out four per line.
760 set aCP "\n"
761 for {set i 0} {$i < [llength $lCP]} {incr i 4} {
762 append aCP " [join [lrange $lCP $i $i+3]]\n"
# Emit the test. For every value below 1000000 the generated code marks
# the expected category slots in aArray (slot 0 for values missing from
# aCP) and reports a failure through *piCode when the slot returned by
# sqlite3Fts5UnicodeCategory() is not marked.
766 puts [code {
767 static int categories_test (int *piCode){
768 struct Codepoint {
769 int iCode;
770 const char *zCat;
771 } aCP[] = {$aCP};
772 int i;
773 int iCP = 0;
775 for(i=0; i<1000000; i++){
776 u8 aArray[40];
777 int cat = 0;
778 int c = 0;
779 memset(aArray, 0, sizeof(aArray));
780 if( aCP[iCP].iCode==i ){
781 sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray);
782 iCP++;
783 }else{
784 aArray[0] = 1;
787 c = sqlite3Fts5UnicodeCategory((u32)i);
788 if( aArray[c]==0 ){
789 *piCode = i;
790 return 1;
794 return 0;
# print_fold_test --
#
#   Write to stdout the C source of fold_test(), a self-test for the
#   generated function named $zFunc.
#
# Arguments:
#   zFunc      Name of the C fold function under test.
#   mappings   List whose elements hold a codepoint at index 0 and a
#              replacement character (possibly empty) at index 1.
#
# For each codepoint 0..69999 the generated aLookup[] table holds two
# values: the entry from the global tl_lookup_table (or the codepoint
# itself if there is none), and that value further mapped through the
# replacement characters in mappings.
#
proc print_fold_test {zFunc mappings} {
  global tl_lookup_table

  # extra(codepoint) is the numeric value of the replacement character,
  # or 0 when the replacement is empty.
  foreach entry $mappings {
    set cp [lindex $entry 0]
    set ch [lindex $entry 1]
    if {$ch eq ""} {
      set extra($cp) 0
    } else {
      scan $ch %c extra($cp)
    }
  }

  puts "static int fold_test(int *piCode)\{"
  puts -nonewline " static int aLookup\[\] = \{"
  for {set code 0} {$code < 70000} {incr code} {
    if {[info exists tl_lookup_table($code)]} {
      set folded $tl_lookup_table($code)
    } else {
      set folded $code
    }
    if {[info exists extra($folded)]} {
      set stripped $extra($folded)
    } else {
      set stripped $folded
    }

    if {$code % 4 == 0} {
      puts ""
      puts -nonewline " "
    }
    puts -nonewline "$folded, $stripped, "
  }

  foreach line [list \
    " \};" \
    " int i;" \
    " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{" \
    " int iCode = (i/2);" \
    " int bFlag = i & 0x0001;" \
    " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{" \
    " *piCode = iCode;" \
    " return 1;" \
    " \}" \
    " \}" \
    " return 0;" \
    "\}" \
  ] {
    puts $line
  }
}
# print_fileheader --
#
#   Write to stdout the banner comment that starts the generated C file,
#   followed by the opening preprocessor lines. Reads the global variable
#   ::generate_fts5_code.
#
839 proc print_fileheader {} {
840 puts [string trim {
842 ** 2012-05-25
844 ** The author disclaims copyright to this source code. In place of
845 ** a legal notice, here is a blessing:
847 ** May you do good and not evil.
848 ** May you find forgiveness for yourself and forgive others.
849 ** May you share freely, never taking more than you give.
851 ******************************************************************************
855 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
858 puts ""
# The FTS3/FTS4 compile-time guards are emitted only when fts5 code is not
# being generated.
859 if {$::generate_fts5_code} {
860 # no-op
861 } else {
862 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
863 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
865 puts ""
866 puts "#include <assert.h>"
867 puts ""
# print_test_main --
#
#   Write to stdout the C source of a main() function that runs the
#   generated self-tests and returns non-zero if any of them failed.
#
# Reads the global variable ::generate_fts5_code. isalnum_test() is
# called only when fts5 code is not being generated, and
# categories_test() only when it is, matching the modes in which the
# driver emits those test functions. The result variable of a test that
# is not run stays 0.
#
proc print_test_main {} {
  puts ""
  puts "#include <stdio.h>"
  puts ""
  puts "int main(int argc, char **argv)\{"
  puts " int r1, r2, r3;"
  puts " int code;"
  puts " r1 = 0;"
  puts " r3 = 0;"
  if {!$::generate_fts5_code} {
    puts " r1 = isalnum_test(&code);"
    puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
    puts " else printf(\"isalnum(): test passed\\n\");"
  }
  puts " r2 = fold_test(&code);"
  puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
  puts " else printf(\"fold(): test passed\\n\");"
  if {$::generate_fts5_code} {
    puts " r3 = categories_test(&code);"
    puts " if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);"
    puts " else printf(\"categories(): test passed\\n\");"
  }
  puts " return (r1 || r2 || r3);"
  puts "\}"
}
893 # Process the command line arguments. Exit early if they are not to
894 # our liking.
# usage --
#
#   Print a one-line usage message to stderr and terminate the script
#   with exit status 1.
#
proc usage {} {
  set msg "Usage: $::argv0 ?-test? ?-fts5? "
  append msg "<CaseFolding.txt file> <UnicodeData.txt file>"
  puts stderr $msg
  exit 1
}
# The last two command line arguments are the input files; anything
# before them must be a recognised option.
if {[llength $argv]<2} usage
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]

set remove_diacritic remove_diacritic
set generate_test_code 0
set generate_fts5_code 0
set function_prefix "sqlite3Fts"
for {set i 0} {$i < [llength $argv]-2} {incr i} {
  switch -- [lindex $argv $i] {
    -test {
      set generate_test_code 1
    }
    -fts5 {
      set function_prefix sqlite3Fts5
      set generate_fts5_code 1
      set remove_diacritic fts5_remove_diacritic
    }
    default {
      usage
    }
  }
}

print_fileheader

# A stand-alone test program has no project headers to provide the
# integer typedefs. The generated category code also uses u32.
if {$::generate_test_code} {
  puts "typedef unsigned short int u16;"
  puts "typedef unsigned char u8;"
  puts "typedef unsigned int u32;"
  puts "#include <string.h>"
}

# Print the isalnum() function to stdout (not generated for fts5).
set lRange [an_load_separator_ranges]
if {$generate_fts5_code==0} {
  print_isalnum ${function_prefix}UnicodeIsalnum $lRange
}

# Leave a gap between the two generated C functions.
puts ""
puts ""

# Load the fold data. This is used by the [rd_XXX] commands
# as well as [print_fold].
tl_load_casefolding_txt ${casefolding.txt}

set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
print_rd $mappings
puts ""
puts ""
print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
puts ""
puts ""

# Print the fold() function to stdout.
print_fold ${function_prefix}UnicodeFold

if {$generate_fts5_code} {
  puts ""
  puts ""
  print_categories [cc_load_unicodedata_text ${unicodedata.txt}]
}

# Print the test routines and main() function to stdout, if -test
# was specified.
if {$::generate_test_code} {
  if {$generate_fts5_code==0} {
    print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
  }
  print_fold_test ${function_prefix}UnicodeFold $mappings
  # categories_test() calls functions that are only generated for fts5.
  if {$generate_fts5_code} {
    print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}]
  }
  print_test_main
}

if {$generate_fts5_code} {
  # no-op
} else {
  puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
  puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"
}