Add the "categories" option to the unicode61 tokenizer in fts5.
[sqlite.git] / ext / fts3 / unicode / parseunicode.tcl
blob966d7bdd3a18d9c629cedeb023822c5ebf7cfd49
2 #--------------------------------------------------------------------------
3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
4 # reads the file and returns a list of mappings required to remove all
5 # diacritical marks from a unicode string. Each mapping is itself a list
6 # consisting of two elements - the unicode codepoint and the single ASCII
7 # character that it should be replaced with, or an empty string if the
8 # codepoint should simply be removed from the input. Examples:
10 # { 224 a } (replace codepoint 224 to "a")
11 # { 769 "" } (remove codepoint 769 from input)
13 # Mappings are only returned for non-upper case codepoints. It is assumed
14 # that the input has already been folded to lower case.
16 proc rd_load_unicodedata_text {zName} {
17 global tl_lookup_table
19 set fd [open $zName]
20 set lField {
21 code
22 character_name
23 general_category
24 canonical_combining_classes
25 bidirectional_category
26 character_decomposition_mapping
27 decimal_digit_value
28 digit_value
29 numeric_value
30 mirrored
31 unicode_1_name
32 iso10646_comment_field
33 uppercase_mapping
34 lowercase_mapping
35 titlecase_mapping
37 set lRet [list]
39 while { ![eof $fd] } {
40 set line [gets $fd]
41 if {$line == ""} continue
43 set fields [split $line ";"]
44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
45 foreach $lField $fields {}
46 if { [llength $character_decomposition_mapping]!=2
47 || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
48 } {
49 continue
52 set iCode [expr "0x$code"]
53 set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
54 set iDia [expr "0x[lindex $character_decomposition_mapping 1]"]
56 if {[info exists tl_lookup_table($iCode)]} continue
58 if { ($iAscii >= 97 && $iAscii <= 122)
59 || ($iAscii >= 65 && $iAscii <= 90)
60 } {
61 lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
62 set dia($iDia) 1
66 foreach d [array names dia] {
67 lappend lRet [list $d ""]
69 set lRet [lsort -integer -index 0 $lRet]
71 close $fd
72 set lRet
75 #-------------------------------------------------------------------------
76 # Parameter $zName must be a path to the file UnicodeData.txt. This command
77 # reads the file and returns a list of codepoints (integers). The list
78 # contains all codepoints in the UnicodeData.txt assigned to any "General
79 # Category" that is not a "Letter" or "Number".
81 proc an_load_unicodedata_text {zName} {
82 set fd [open $zName]
83 set lField {
84 code
85 character_name
86 general_category
87 canonical_combining_classes
88 bidirectional_category
89 character_decomposition_mapping
90 decimal_digit_value
91 digit_value
92 numeric_value
93 mirrored
94 unicode_1_name
95 iso10646_comment_field
96 uppercase_mapping
97 lowercase_mapping
98 titlecase_mapping
100 set lRet [list]
102 while { ![eof $fd] } {
103 set line [gets $fd]
104 if {$line == ""} continue
106 set fields [split $line ";"]
107 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
108 foreach $lField $fields {}
110 set iCode [expr "0x$code"]
111 set bAlnum [expr {
112 [lsearch {L N} [string range $general_category 0 0]] >= 0
113 || $general_category=="Co"
116 if { !$bAlnum } { lappend lRet $iCode }
119 close $fd
120 set lRet
123 proc tl_load_casefolding_txt {zName} {
124 global tl_lookup_table
126 set fd [open $zName]
127 while { ![eof $fd] } {
128 set line [gets $fd]
129 if {[string range $line 0 0] == "#"} continue
130 if {$line == ""} continue
132 foreach x {a b c d} {unset -nocomplain $x}
133 foreach {a b c d} [split $line ";"] {}
135 set a2 [list]
136 set c2 [list]
137 foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
138 foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
139 set b [string trim $b]
140 set d [string trim $d]
142 if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
146 proc cc_load_unicodedata_text {zName} {
147 set fd [open $zName]
148 set lField {
149 code
150 character_name
151 general_category
152 canonical_combining_classes
153 bidirectional_category
154 character_decomposition_mapping
155 decimal_digit_value
156 digit_value
157 numeric_value
158 mirrored
159 unicode_1_name
160 iso10646_comment_field
161 uppercase_mapping
162 lowercase_mapping
163 titlecase_mapping
165 set lRet [list]
167 while { ![eof $fd] } {
168 set line [gets $fd]
169 if {$line == ""} continue
171 set fields [split $line ";"]
172 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
173 foreach $lField $fields {}
175 lappend lRet [list $code $general_category]
178 close $fd
179 set lRet