Additional clarification in the comments to sqlite3_stdio.c. No changes
[sqlite.git] / ext / fts3 / unicode / mkunicode.tcl
blob1306629da82a51dd6cee23512e6c33113294f67b
2 source [file join [file dirname [info script]] parseunicode.tcl]
4 proc print_rd {map} {
5 global tl_lookup_table
6 set aChar [list]
7 set lRange [list]
9 set nRange 1
10 set iFirst [lindex $map 0 0]
11 set cPrev [lindex $map 0 1]
12 set fPrev [lindex $map 0 2]
14 foreach m [lrange $map 1 end] {
15 foreach {i c f} $m {}
17 if {$cPrev == $c && $fPrev==$f} {
18 for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
19 if {[info exists tl_lookup_table($j)]==0} break
22 if {$j==$i} {
23 set nNew [expr {(1 + $i - $iFirst)}]
24 if {$nNew<=8} {
25 set nRange $nNew
26 continue
31 lappend lRange [list $iFirst $nRange]
32 lappend aChar $cPrev
33 lappend aFlag $fPrev
35 set iFirst $i
36 set cPrev $c
37 set fPrev $f
38 set nRange 1
40 lappend lRange [list $iFirst $nRange]
41 lappend aChar $cPrev
42 lappend aFlag $fPrev
44 puts "/*"
45 puts "** If the argument is a codepoint corresponding to a lowercase letter"
46 puts "** in the ASCII range with a diacritic added, return the codepoint"
47 puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
48 puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
49 puts "** E\"). The resuls of passing a codepoint that corresponds to an"
50 puts "** uppercase letter are undefined."
51 puts "*/"
52 puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
53 puts " unsigned short aDia\[\] = \{"
54 puts -nonewline " 0, "
55 set i 1
56 foreach r $lRange {
57 foreach {iCode nRange} $r {}
58 if {($i % 8)==0} {puts "" ; puts -nonewline " " }
59 incr i
61 puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
62 puts -nonewline ", "
64 puts ""
65 puts " \};"
66 puts "#define HIBIT ((unsigned char)0x80)"
67 puts " unsigned char aChar\[\] = \{"
68 puts -nonewline " '\\0', "
69 set i 1
70 foreach c $aChar f $aFlag {
71 if { $f } {
72 set str "'$c'|HIBIT, "
73 } else {
74 set str "'$c', "
76 if {$c == ""} { set str "'\\0', " }
78 if {($i % 6)==0} {puts "" ; puts -nonewline " " }
79 incr i
80 puts -nonewline "$str"
82 puts ""
83 puts " \};"
84 puts {
85 unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
86 int iRes = 0;
87 int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
88 int iLo = 0;
89 while( iHi>=iLo ){
90 int iTest = (iHi + iLo) / 2;
91 if( key >= aDia[iTest] ){
92 iRes = iTest;
93 iLo = iTest+1;
94 }else{
95 iHi = iTest-1;
98 assert( key>=aDia[iRes] );
99 if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
100 return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
101 puts "\}"
104 proc print_isdiacritic {zFunc map} {
106 set lCode [list]
107 foreach m $map {
108 foreach {code char flag} $m {}
109 if {$flag} continue
110 if {$code && $char == ""} { lappend lCode $code }
112 set lCode [lsort -integer $lCode]
113 set iFirst [lindex $lCode 0]
114 set iLast [lindex $lCode end]
116 set i1 0
117 set i2 0
119 foreach c $lCode {
120 set i [expr $c - $iFirst]
121 if {$i < 32} {
122 set i1 [expr {$i1 | (1<<$i)}]
123 } else {
124 set i2 [expr {$i2 | (1<<($i-32))}]
128 puts "/*"
129 puts "** Return true if the argument interpreted as a unicode codepoint"
130 puts "** is a diacritical modifier character."
131 puts "*/"
132 puts "int ${zFunc}\(int c)\{"
133 puts " unsigned int mask0 = [format "0x%08X" $i1];"
134 puts " unsigned int mask1 = [format "0x%08X" $i2];"
136 puts " if( c<$iFirst || c>$iLast ) return 0;"
137 puts " return (c < $iFirst+32) ?"
138 puts " (mask0 & ((unsigned int)1 << (c-$iFirst))) :"
139 puts " (mask1 & ((unsigned int)1 << (c-$iFirst-32)));"
140 puts "\}"
144 #-------------------------------------------------------------------------
146 proc an_load_separator_ranges {} {
147 global unicodedata.txt
148 set lSep [an_load_unicodedata_text ${unicodedata.txt}]
149 unset -nocomplain iFirst
150 unset -nocomplain nRange
151 set lRange [list]
152 foreach sep $lSep {
153 if {0==[info exists iFirst]} {
154 set iFirst $sep
155 set nRange 1
156 } elseif { $sep == ($iFirst+$nRange) } {
157 incr nRange
158 } else {
159 lappend lRange [list $iFirst $nRange]
160 set iFirst $sep
161 set nRange 1
164 lappend lRange [list $iFirst $nRange]
165 set lRange
168 proc an_print_range_array {lRange} {
169 set iFirstMax 0
170 set nRangeMax 0
171 foreach range $lRange {
172 foreach {iFirst nRange} $range {}
173 if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
174 if {$nRange > $nRangeMax} {set nRangeMax $nRange}
176 if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
177 if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
179 puts -nonewline " "
180 puts [string trim {
181 /* Each unsigned integer in the following array corresponds to a contiguous
182 ** range of unicode codepoints that are not either letters or numbers (i.e.
183 ** codepoints for which this function should return 0).
185 ** The most significant 22 bits in each 32-bit value contain the first
186 ** codepoint in the range. The least significant 10 bits are used to store
187 ** the size of the range (always at least 1). In other words, the value
188 ** ((C<<22) + N) represents a range of N codepoints starting with codepoint
189 ** C. It is not possible to represent a range larger than 1023 codepoints
190 ** using this format.
193 puts -nonewline " static const unsigned int aEntry\[\] = \{"
194 set i 0
195 foreach range $lRange {
196 foreach {iFirst nRange} $range {}
197 set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]
199 if {($i % 5)==0} {puts "" ; puts -nonewline " "}
200 puts -nonewline " $u32,"
201 incr i
203 puts ""
204 puts " \};"
207 proc an_print_ascii_bitmap {lRange} {
208 foreach range $lRange {
209 foreach {iFirst nRange} $range {}
210 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
211 if {$i<=127} { set a($i) 1 }
215 set aAscii [list 0 0 0 0]
216 foreach key [array names a] {
217 set idx [expr $key >> 5]
218 lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
221 puts " static const unsigned int aAscii\[4\] = \{"
222 puts -nonewline " "
223 foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
224 puts ""
225 puts " \};"
228 proc print_isalnum {zFunc lRange} {
229 puts "/*"
230 puts "** Return true if the argument corresponds to a unicode codepoint"
231 puts "** classified as either a letter or a number. Otherwise false."
232 puts "**"
233 puts "** The results are undefined if the value passed to this function"
234 puts "** is less than zero."
235 puts "*/"
236 puts "int ${zFunc}\(int c)\{"
237 an_print_range_array $lRange
238 an_print_ascii_bitmap $lRange
239 puts {
240 if( (unsigned int)c<128 ){
241 return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
242 }else if( (unsigned int)c<(1<<22) ){
243 unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
244 int iRes = 0;
245 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
246 int iLo = 0;
247 while( iHi>=iLo ){
248 int iTest = (iHi + iLo) / 2;
249 if( key >= aEntry[iTest] ){
250 iRes = iTest;
251 iLo = iTest+1;
252 }else{
253 iHi = iTest-1;
256 assert( aEntry[0]<key );
257 assert( key>=aEntry[iRes] );
258 return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
260 return 1;}
261 puts "\}"
264 proc print_test_isalnum {zFunc lRange} {
265 foreach range $lRange {
266 foreach {iFirst nRange} $range {}
267 for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
270 puts "static int isalnum_test(int *piCode)\{"
271 puts -nonewline " unsigned char aAlnum\[\] = \{"
272 for {set i 0} {$i < 70000} {incr i} {
273 if {($i % 32)==0} { puts "" ; puts -nonewline " " }
274 set bFlag [expr ![info exists a($i)]]
275 puts -nonewline "${bFlag},"
277 puts ""
278 puts " \};"
280 puts -nonewline " int aLargeSep\[\] = \{"
281 set i 0
282 foreach iSep [lsort -integer [array names a]] {
283 if {$iSep<70000} continue
284 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
285 puts -nonewline " $iSep,"
286 incr i
288 puts ""
289 puts " \};"
290 puts -nonewline " int aLargeOther\[\] = \{"
291 set i 0
292 foreach iSep [lsort -integer [array names a]] {
293 if {$iSep<70000} continue
294 if {[info exists a([expr $iSep-1])]==0} {
295 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
296 puts -nonewline " [expr $iSep-1],"
297 incr i
299 if {[info exists a([expr $iSep+1])]==0} {
300 if {($i % 8)==0} { puts "" ; puts -nonewline " " }
301 puts -nonewline " [expr $iSep+1],"
302 incr i
305 puts ""
306 puts " \};"
308 puts [subst -nocommands {
309 int i;
310 for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
311 if( ${zFunc}(i)!=aAlnum[i] ){
312 *piCode = i;
313 return 1;
316 for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
317 if( ${zFunc}(aLargeSep[i])!=0 ){
318 *piCode = aLargeSep[i];
319 return 1;
322 for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
323 if( ${zFunc}(aLargeOther[i])!=1 ){
324 *piCode = aLargeOther[i];
325 return 1;
329 puts " return 0;"
330 puts "\}"
333 #-------------------------------------------------------------------------
335 proc tl_create_records {} {
336 global tl_lookup_table
338 set iFirst ""
339 set nOff 0
340 set nRange 0
341 set nIncr 0
343 set lRecord [list]
344 foreach code [lsort -integer [array names tl_lookup_table]] {
345 set mapping $tl_lookup_table($code)
346 if {$iFirst == ""} {
347 set iFirst $code
348 set nOff [expr $mapping - $code]
349 set nRange 1
350 set nIncr 1
351 } else {
352 set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))]
353 if { $nRange==1 && ($diff==1 || $diff==2) } {
354 set nIncr $diff
357 if {$diff != $nIncr || ($mapping - $code)!=$nOff} {
358 if { $nRange==1 } {set nIncr 1}
359 lappend lRecord [list $iFirst $nIncr $nRange $nOff]
360 set iFirst $code
361 set nOff [expr $mapping - $code]
362 set nRange 1
363 set nIncr 1
364 } else {
365 incr nRange
370 lappend lRecord [list $iFirst $nIncr $nRange $nOff]
372 set lRecord
375 proc tl_print_table_header {} {
376 puts -nonewline " "
377 puts [string trim {
378 /* Each entry in the following array defines a rule for folding a range
379 ** of codepoints to lower case. The rule applies to a range of nRange
380 ** codepoints starting at codepoint iCode.
382 ** If the least significant bit in flags is clear, then the rule applies
383 ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
384 ** need to be folded). Or, if it is set, then the rule only applies to
385 ** every second codepoint in the range, starting with codepoint C.
387 ** The 7 most significant bits in flags are an index into the aiOff[]
388 ** array. If a specific codepoint C does require folding, then its lower
389 ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
391 ** The contents of this array are generated by parsing the CaseFolding.txt
392 ** file distributed as part of the "Unicode Character Database". See
393 ** http://www.unicode.org for details.
396 puts " static const struct TableEntry \{"
397 puts " unsigned short iCode;"
398 puts " unsigned char flags;"
399 puts " unsigned char nRange;"
400 puts " \} aEntry\[\] = \{"
403 proc tl_print_table_entry {togglevar entry liOff} {
404 upvar $togglevar t
405 foreach {iFirst nIncr nRange nOff} $entry {}
407 if {$iFirst > (1<<16)} { return 1 }
409 if {[info exists t]==0} {set t 0}
410 if {$t==0} { puts -nonewline " " }
412 set flags 0
413 if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
414 if {$nOff<0} { incr nOff [expr (1<<16)] }
416 set idx [lsearch $liOff $nOff]
417 if {$idx<0} {error "malfunction generating aiOff"}
418 set flags [expr $flags + $idx*2]
420 set txt "{$iFirst, $flags, $nRange},"
421 if {$t==2} {
422 puts $txt
423 } else {
424 puts -nonewline [format "% -23s" $txt]
426 set t [expr ($t+1)%3]
428 return 0
431 proc tl_print_table_footer {togglevar} {
432 upvar $togglevar t
433 if {$t!=0} {puts ""}
434 puts " \};"
437 proc tl_print_if_entry {entry} {
438 foreach {iFirst nIncr nRange nOff} $entry {}
439 if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
441 puts " else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
442 puts " ret = c + $nOff;"
443 puts " \}"
446 proc tl_generate_ioff_table {lRecord} {
447 foreach entry $lRecord {
448 foreach {iFirst nIncr nRange iOff} $entry {}
449 if {$iOff<0} { incr iOff [expr (1<<16)] }
450 if {[info exists a($iOff)]} continue
451 set a($iOff) 1
454 set liOff [lsort -integer [array names a]]
455 if {[llength $liOff]>128} { error "Too many distinct ioffs" }
456 return $liOff
459 proc tl_print_ioff_table {liOff} {
460 puts -nonewline " static const unsigned short aiOff\[\] = \{"
461 set i 0
462 foreach off $liOff {
463 if {($i % 8)==0} {puts "" ; puts -nonewline " "}
464 puts -nonewline [format "% -7s" "$off,"]
465 incr i
467 puts ""
468 puts " \};"
472 proc print_fold {zFunc} {
474 set lRecord [tl_create_records]
476 set lHigh [list]
477 puts "/*"
478 puts "** Interpret the argument as a unicode codepoint. If the codepoint"
479 puts "** is an upper case character that has a lower case equivalent,"
480 puts "** return the codepoint corresponding to the lower case version."
481 puts "** Otherwise, return a copy of the argument."
482 puts "**"
483 puts "** The results are undefined if the value passed to this function"
484 puts "** is less than zero."
485 puts "*/"
486 puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"
488 set liOff [tl_generate_ioff_table $lRecord]
489 tl_print_table_header
490 foreach entry $lRecord {
491 if {[tl_print_table_entry toggle $entry $liOff]} {
492 lappend lHigh $entry
495 tl_print_table_footer toggle
496 tl_print_ioff_table $liOff
498 puts [subst -nocommands {
499 int ret = c;
501 assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
503 if( c<128 ){
504 if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
505 }else if( c<65536 ){
506 const struct TableEntry *p;
507 int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
508 int iLo = 0;
509 int iRes = -1;
511 assert( c>aEntry[0].iCode );
512 while( iHi>=iLo ){
513 int iTest = (iHi + iLo) / 2;
514 int cmp = (c - aEntry[iTest].iCode);
515 if( cmp>=0 ){
516 iRes = iTest;
517 iLo = iTest+1;
518 }else{
519 iHi = iTest-1;
523 assert( iRes>=0 && c>=aEntry[iRes].iCode );
524 p = &aEntry[iRes];
525 if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
526 ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
527 assert( ret>0 );
530 if( eRemoveDiacritic ){
531 ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
536 foreach entry $lHigh {
537 tl_print_if_entry $entry
540 puts ""
541 puts " return ret;"
542 puts "\}"
545 proc code {txt} {
546 set txt [string trimright $txt]
547 set txt [string trimleft $txt "\n"]
548 set n [expr {[string length $txt] - [string length [string trim $txt]]}]
549 set ret ""
550 foreach L [split $txt "\n"] {
551 append ret "[string range $L $n end]\n"
553 return [uplevel "subst -nocommands {$ret}"]
556 proc intarray {lInt} {
557 set ret ""
558 set n [llength $lInt]
559 for {set i 0} {$i < $n} {incr i 10} {
560 append ret "\n "
561 foreach int [lrange $lInt $i [expr $i+9]] {
562 append ret [format "%-7s" "$int, "]
565 append ret "\n "
566 set ret
569 proc categories_switch {Cvar first lSecond} {
570 upvar $Cvar C
571 set ret ""
572 append ret "case '$first':\n"
573 append ret " switch( zCat\[1\] ){\n"
574 foreach s $lSecond {
575 append ret " case '$s': aArray\[$C($first$s)\] = 1; break;\n"
577 append ret " case '*': \n"
578 foreach s $lSecond {
579 append ret " aArray\[$C($first$s)\] = 1;\n"
581 append ret " break;\n"
582 append ret " default: return 1;"
583 append ret " }\n"
584 append ret " break;\n"
587 # Argument is a list. Each element of which is itself a list of two elements:
589 # * the codepoint
590 # * the category
592 # List elements are sorted in order of codepoint.
594 proc print_categories {lMap} {
595 set categories {
596 Cc Cf Cn Cs
597 Ll Lm Lo Lt Lu
598 Mc Me Mn
599 Nd Nl No
600 Pc Pd Pe Pf Pi Po Ps
601 Sc Sk Sm So
602 Zl Zp Zs
604 LC Co
607 for {set i 0} {$i < [llength $categories]} {incr i} {
608 set C([lindex $categories $i]) [expr 1+$i]
611 set caseC [categories_switch C C {c f n s o}]
612 set caseL [categories_switch C L {l m o t u C}]
613 set caseM [categories_switch C M {c e n}]
614 set caseN [categories_switch C N {d l o}]
615 set caseP [categories_switch C P {c d e f i o s}]
616 set caseS [categories_switch C S {c k m o}]
617 set caseZ [categories_switch C Z {l p s}]
619 set nCat [expr [llength [array names C]] + 1]
620 puts [code {
621 int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){
622 aArray[0] = 1;
623 switch( zCat[0] ){
624 $caseC
625 $caseL
626 $caseM
627 $caseN
628 $caseP
629 $caseS
630 $caseZ
632 default:
633 return 1;
635 return 0;
639 set nRepeat 0
640 set first [lindex $lMap 0 0]
641 set class [lindex $lMap 0 1]
642 set prev -1
644 set CASE(0) "Lu"
645 set CASE(1) "Ll"
647 foreach m $lMap {
648 foreach {codepoint cl} $m {}
649 set codepoint [expr "0x$codepoint"]
650 if {$codepoint>=(1<<20)} continue
652 set bNew 0
653 if {$codepoint!=($prev+1)} {
654 set bNew 1
655 } elseif {
656 $cl==$class || ($class=="LC" && $cl==$CASE([expr $nRepeat & 0x01]))
658 incr nRepeat
659 } elseif {$class=="Lu" && $nRepeat==1 && $cl=="Ll"} {
660 set class LC
661 incr nRepeat
662 } else {
663 set bNew 1
665 if {$bNew} {
666 lappend lEntries [list $first $class $nRepeat]
667 set nRepeat 1
668 set first $codepoint
669 set class $cl
671 set prev $codepoint
673 if {$nRepeat>0} {
674 lappend lEntries [list $first $class $nRepeat]
677 set aBlock [list 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
678 set aMap [list]
679 foreach e $lEntries {
680 foreach {cp class nRepeat} $e {}
681 set block [expr ($cp>>16)]
682 if {$block>0 && [lindex $aBlock $block]==0} {
683 for {set i 1} {$i<=$block} {incr i} {
684 if {[lindex $aBlock $i]==0} {
685 lset aBlock $i [llength $aMap]
689 lappend aMap [expr {$cp & 0xFFFF}]
690 lappend aData [expr {($nRepeat << 5) + $C($class)}]
692 for {set i 1} {$i<[llength $aBlock]} {incr i} {
693 if {[lindex $aBlock $i]==0} {
694 lset aBlock $i [llength $aMap]
698 set aBlockArray [intarray $aBlock]
699 set aMapArray [intarray $aMap]
700 set aDataArray [intarray $aData]
701 puts [code {
702 static u16 aFts5UnicodeBlock[] = {$aBlockArray};
703 static u16 aFts5UnicodeMap[] = {$aMapArray};
704 static u16 aFts5UnicodeData[] = {$aDataArray};
706 int sqlite3Fts5UnicodeCategory(u32 iCode) {
707 int iRes = -1;
708 int iHi;
709 int iLo;
710 int ret;
711 u16 iKey;
713 if( iCode>=(1<<20) ){
714 return 0;
716 iLo = aFts5UnicodeBlock[(iCode>>16)];
717 iHi = aFts5UnicodeBlock[1+(iCode>>16)];
718 iKey = (iCode & 0xFFFF);
719 while( iHi>iLo ){
720 int iTest = (iHi + iLo) / 2;
721 assert( iTest>=iLo && iTest<iHi );
722 if( iKey>=aFts5UnicodeMap[iTest] ){
723 iRes = iTest;
724 iLo = iTest+1;
725 }else{
726 iHi = iTest;
730 if( iRes<0 ) return 0;
731 if( iKey>=(aFts5UnicodeMap[iRes]+(aFts5UnicodeData[iRes]>>5)) ) return 0;
732 ret = aFts5UnicodeData[iRes] & 0x1F;
733 if( ret!=$C(LC) ) return ret;
734 return ((iKey - aFts5UnicodeMap[iRes]) & 0x01) ? $C(Ll) : $C(Lu);
737 void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
738 int i = 0;
739 int iTbl = 0;
740 while( i<128 ){
741 int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
742 int n = (aFts5UnicodeData[iTbl] >> 5) + i;
743 for(; i<128 && i<n; i++){
744 aAscii[i] = (u8)bToken;
746 iTbl++;
748 aAscii[0] = 0; /* 0x00 is never a token character */
753 proc print_test_categories {lMap} {
755 set lCP [list]
756 foreach e $lMap {
757 foreach {cp cat} $e {}
758 if {[expr 0x$cp] < (1<<20)} {
759 lappend lCP "{0x$cp, \"$cat\"}, "
763 set aCP "\n"
764 for {set i 0} {$i < [llength $lCP]} {incr i 4} {
765 append aCP " [join [lrange $lCP $i $i+3]]\n"
769 puts [code {
770 static int categories_test (int *piCode){
771 struct Codepoint {
772 int iCode;
773 const char *zCat;
774 } aCP[] = {$aCP};
775 int i;
776 int iCP = 0;
778 for(i=0; i<1000000; i++){
779 u8 aArray[40];
780 int cat = 0;
781 int c = 0;
782 memset(aArray, 0, sizeof(aArray));
783 if( aCP[iCP].iCode==i ){
784 sqlite3Fts5UnicodeCatParse(aCP[iCP].zCat, aArray);
785 iCP++;
786 }else{
787 aArray[0] = 1;
790 c = sqlite3Fts5UnicodeCategory((u32)i);
791 if( aArray[c]==0 ){
792 *piCode = i;
793 return 1;
797 return 0;
802 proc print_fold_test {zFunc mappings} {
803 global tl_lookup_table
805 foreach m $mappings {
806 set c [lindex $m 1]
807 if {$c == ""} {
808 set extra([lindex $m 0]) 0
809 } else {
810 scan $c %c i
811 set extra([lindex $m 0]) $i
815 puts "static int fold_test(int *piCode)\{"
816 puts -nonewline " static int aLookup\[\] = \{"
817 for {set i 0} {$i < 70000} {incr i} {
819 set expected $i
820 catch { set expected $tl_lookup_table($i) }
821 set expected2 $expected
822 catch { set expected2 $extra($expected2) }
824 if {($i % 4)==0} { puts "" ; puts -nonewline " " }
825 puts -nonewline "$expected, $expected2, "
827 puts " \};"
828 puts " int i;"
829 puts " for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
830 puts " int iCode = (i/2);"
831 puts " int bFlag = i & 0x0001;"
832 puts " if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
833 puts " *piCode = iCode;"
834 puts " return 1;"
835 puts " \}"
836 puts " \}"
837 puts " return 0;"
838 puts "\}"
842 proc print_fileheader {} {
843 puts [string trim {
845 ** 2012-05-25
847 ** The author disclaims copyright to this source code. In place of
848 ** a legal notice, here is a blessing:
850 ** May you do good and not evil.
851 ** May you find forgiveness for yourself and forgive others.
852 ** May you share freely, never taking more than you give.
854 ******************************************************************************
858 ** DO NOT EDIT THIS MACHINE GENERATED FILE.
861 puts ""
862 if {$::generate_fts5_code} {
863 # no-op
864 } else {
865 puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
866 puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)"
868 puts ""
869 puts "#include <assert.h>"
870 puts ""
873 proc print_test_main {} {
874 puts ""
875 puts "#include <stdio.h>"
876 puts ""
877 puts "int main(int argc, char **argv)\{"
878 puts " int r1, r2, r3;"
879 puts " int code;"
880 puts " r3 = 0;"
881 puts " r1 = isalnum_test(&code);"
882 puts " if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
883 puts " else printf(\"isalnum(): test passed\\n\");"
884 puts " r2 = fold_test(&code);"
885 puts " if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
886 puts " else printf(\"fold(): test passed\\n\");"
887 if {$::generate_fts5_code} {
888 puts " r3 = categories_test(&code);"
889 puts " if( r3 ) printf(\"categories(): Problem with code %d\\n\",code);"
890 puts " else printf(\"categories(): test passed\\n\");"
892 puts " return (r1 || r2 || r3);"
893 puts "\}"
896 # Proces the command line arguments. Exit early if they are not to
897 # our liking.
899 proc usage {} {
900 puts -nonewline stderr "Usage: $::argv0 ?-test? ?-fts5? "
901 puts stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
902 exit 1
904 if {[llength $argv]<2} usage
905 set unicodedata.txt [lindex $argv end]
906 set casefolding.txt [lindex $argv end-1]
908 set remove_diacritic remove_diacritic
909 set generate_test_code 0
910 set generate_fts5_code 0
911 set function_prefix "sqlite3Fts"
912 for {set i 0} {$i < [llength $argv]-2} {incr i} {
913 switch -- [lindex $argv $i] {
914 -test {
915 set generate_test_code 1
917 -fts5 {
918 set function_prefix sqlite3Fts5
919 set generate_fts5_code 1
920 set remove_diacritic fts5_remove_diacritic
922 default {
923 usage
928 print_fileheader
930 if {$::generate_test_code} {
931 puts "typedef unsigned short int u16;"
932 puts "typedef unsigned char u8;"
933 puts "#include <string.h>"
936 # Print the isalnum() function to stdout.
938 set lRange [an_load_separator_ranges]
939 if {$generate_fts5_code==0} {
940 print_isalnum ${function_prefix}UnicodeIsalnum $lRange
943 # Leave a gap between the two generated C functions.
945 puts ""
946 puts ""
948 # Load the fold data. This is used by the [rd_XXX] commands
949 # as well as [print_fold].
950 tl_load_casefolding_txt ${casefolding.txt}
952 set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
953 print_rd $mappings
954 puts ""
955 puts ""
956 print_isdiacritic ${function_prefix}UnicodeIsdiacritic $mappings
957 puts ""
958 puts ""
960 # Print the fold() function to stdout.
962 print_fold ${function_prefix}UnicodeFold
964 if {$generate_fts5_code} {
965 puts ""
966 puts ""
967 print_categories [cc_load_unicodedata_text ${unicodedata.txt}]
970 # Print the test routines and main() function to stdout, if -test
971 # was specified.
973 if {$::generate_test_code} {
974 if {$generate_fts5_code==0} {
975 print_test_isalnum ${function_prefix}UnicodeIsalnum $lRange
977 print_fold_test ${function_prefix}UnicodeFold $mappings
978 print_test_categories [cc_load_unicodedata_text ${unicodedata.txt}]
979 print_test_main
982 if {$generate_fts5_code} {
983 # no-op
984 } else {
985 puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
986 puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"