2 #--------------------------------------------------------------------------
3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
4 # reads the file and returns a list of mappings required to remove all
5 # diacritical marks from a unicode string. Each mapping is itself a list
6 # consisting of two elements - the unicode codepoint and the single ASCII
7 # character that it should be replaced with, or an empty string if the
8 # codepoint should simply be removed from the input. Examples:
10 # { 224 a } (replace codepoint 224 to "a")
11 # { 769 "" } (remove codepoint 769 from input)
13 # Mappings are only returned for non-upper case codepoints. It is assumed
14 # that the input has already been folded to lower case.
16 proc rd_load_unicodedata_text
{zName
} {
17 global tl_lookup_table
24 canonical_combining_classes
25 bidirectional_category
26 character_decomposition_mapping
32 iso10646_comment_field
39 while { ![eof $fd] } {
41 if {$line == ""} continue
43 set fields
[split $line ";"]
44 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
45 foreach $lField $fields {}
46 if { [llength $character_decomposition_mapping]!=2
47 ||
[string is xdigit
[lindex $character_decomposition_mapping 0]]==0
52 set iCode
[expr "0x$code"]
53 set iAscii
[expr "0x[lindex $character_decomposition_mapping 0]"]
54 set iDia
[expr "0x[lindex $character_decomposition_mapping 1]"]
56 if {[info exists tl_lookup_table
($iCode)]} continue
58 if { ($iAscii >= 97 && $iAscii <= 122)
59 ||
($iAscii >= 65 && $iAscii <= 90)
61 lappend lRet
[list $iCode [string tolower
[format %c
$iAscii]]]
66 foreach d
[array names dia
] {
67 lappend lRet
[list $d ""]
69 set lRet
[lsort -integer -index 0 $lRet]
75 #-------------------------------------------------------------------------
76 # Parameter $zName must be a path to the file UnicodeData.txt. This command
77 # reads the file and returns a list of codepoints (integers). The list
78 # contains all codepoints in the UnicodeData.txt assigned to any "General
79 # Category" that is not a "Letter" or "Number".
81 proc an_load_unicodedata_text
{zName
} {
87 canonical_combining_classes
88 bidirectional_category
89 character_decomposition_mapping
95 iso10646_comment_field
102 while { ![eof $fd] } {
104 if {$line == ""} continue
106 set fields
[split $line ";"]
107 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
108 foreach $lField $fields {}
110 set iCode
[expr "0x$code"]
112 [lsearch {L N
} [string range
$general_category 0 0]] >= 0
113 ||
$general_category=="Co"
116 if { !$bAlnum } { lappend lRet
$iCode }
123 proc tl_load_casefolding_txt
{zName
} {
124 global tl_lookup_table
127 while { ![eof $fd] } {
129 if {[string range
$line 0 0] == "#"} continue
130 if {$line == ""} continue
132 foreach x
{a b c d
} {unset -nocomplain $x}
133 foreach {a b c d
} [split $line ";"] {}
137 foreach elem
$a { lappend a2
[expr "0x[string trim $elem]"] }
138 foreach elem
$c { lappend c2
[expr "0x[string trim $elem]"] }
139 set b
[string trim
$b]
140 set d
[string trim
$d]
142 if {$b=="C" ||
$b=="S"} { set tl_lookup_table
($a2) $c2 }
146 proc cc_load_unicodedata_text
{zName
} {
152 canonical_combining_classes
153 bidirectional_category
154 character_decomposition_mapping
160 iso10646_comment_field
167 while { ![eof $fd] } {
169 if {$line == ""} continue
171 set fields
[split $line ";"]
172 if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
173 foreach $lField $fields {}
175 lappend lRet
[list $code $general_category]