ext/fts3/unicode/parseunicode.tcl

   1
   2 #--------------------------------------------------------------------------
   3 # Parameter $zName must be a path to the file UnicodeData.txt. This command
   4 # reads the file and returns a list of mappings required to remove all
   5 # diacritical marks from a unicode string. Each mapping is itself a list
   6 # consisting of two elements - the unicode codepoint and the single ASCII
   7 # character that it should be replaced with, or an empty string if the
   8 # codepoint should simply be removed from the input. Examples:
   9 #
  10 #   { 224 a  }     (replace codepoint 224 to "a")
  11 #   { 769 "" }     (remove codepoint 769 from input)
  12 #
  13 # Mappings are only returned for non-upper case codepoints. It is assumed
  14 # that the input has already been folded to lower case.
  15 #
  16 proc rd_load_unicodedata_text {zName} {
  17   global tl_lookup_table
  18
  19   set fd [open $zName]
  20   set lField {
  21     code
  22     character_name
  23     general_category
  24     canonical_combining_classes
  25     bidirectional_category
  26     character_decomposition_mapping
  27     decimal_digit_value
  28     digit_value
  29     numeric_value
  30     mirrored
  31     unicode_1_name
  32     iso10646_comment_field
  33     uppercase_mapping
  34     lowercase_mapping
  35     titlecase_mapping
  36   }
  37   set lRet [list]
  38
  39   while { ![eof $fd] } {
  40     set line [gets $fd]
  41     if {$line == ""} continue
  42
  43     set fields [split $line ";"]
  44     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
  45     foreach $lField $fields {}
  46     if { [llength $character_decomposition_mapping]!=2
  47       || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
  48     } {
  49       continue
  50     }
  51
  52     set iCode  [expr "0x$code"]
  53     set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
  54     set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
  55
  56     if {[info exists tl_lookup_table($iCode)]} continue
  57
  58     if { ($iAscii >= 97 && $iAscii <= 122)
  59       || ($iAscii >= 65 && $iAscii <= 90)
  60     } {
  61       lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
  62       set dia($iDia) 1
  63     }
  64   }
  65
  66   foreach d [array names dia] {
  67     lappend lRet [list $d ""]
  68   }
  69   set lRet [lsort -integer -index 0 $lRet]
  70
  71   close $fd
  72   set lRet
  73 }
  74
  75 #-------------------------------------------------------------------------
  76 # Parameter $zName must be a path to the file UnicodeData.txt. This command
  77 # reads the file and returns a list of codepoints (integers). The list
  78 # contains all codepoints in the UnicodeData.txt assigned to any "General
  79 # Category" that is not a "Letter" or "Number".
  80 #
  81 proc an_load_unicodedata_text {zName} {
  82   set fd [open $zName]
  83   set lField {
  84     code
  85     character_name
  86     general_category
  87     canonical_combining_classes
  88     bidirectional_category
  89     character_decomposition_mapping
  90     decimal_digit_value
  91     digit_value
  92     numeric_value
  93     mirrored
  94     unicode_1_name
  95     iso10646_comment_field
  96     uppercase_mapping
  97     lowercase_mapping
  98     titlecase_mapping
  99   }
 100   set lRet [list]
 101
 102   while { ![eof $fd] } {
 103     set line [gets $fd]
 104     if {$line == ""} continue
 105
 106     set fields [split $line ";"]
 107     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 108     foreach $lField $fields {}
 109
 110     set iCode [expr "0x$code"]
 111     set bAlnum [expr {
 112          [lsearch {L N} [string range $general_category 0 0]] >= 0
 113       || $general_category=="Co"
 114     }]
 115
 116     if { !$bAlnum } { lappend lRet $iCode }
 117   }
 118
 119   close $fd
 120   set lRet
 121 }
 122
 123 proc tl_load_casefolding_txt {zName} {
 124   global tl_lookup_table
 125
 126   set fd [open $zName]
 127   while { ![eof $fd] } {
 128     set line [gets $fd]
 129     if {[string range $line 0 0] == "#"} continue
 130     if {$line == ""} continue
 131
 132     foreach x {a b c d} {unset -nocomplain $x}
 133     foreach {a b c d} [split $line ";"] {}
 134
 135     set a2 [list]
 136     set c2 [list]
 137     foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
 138     foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
 139     set b [string trim $b]
 140     set d [string trim $d]
 141
 142     if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
 143   }
 144 }
 145
 146 proc cc_load_unicodedata_text {zName} {
 147   set fd [open $zName]
 148   set lField {
 149     code
 150     character_name
 151     general_category
 152     canonical_combining_classes
 153     bidirectional_category
 154     character_decomposition_mapping
 155     decimal_digit_value
 156     digit_value
 157     numeric_value
 158     mirrored
 159     unicode_1_name
 160     iso10646_comment_field
 161     uppercase_mapping
 162     lowercase_mapping
 163     titlecase_mapping
 164   }
 165   set lRet [list]
 166
 167   while { ![eof $fd] } {
 168     set line [gets $fd]
 169     if {$line == ""} continue
 170
 171     set fields [split $line ";"]
 172     if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
 173     foreach $lField $fields {}
 174
 175     lappend lRet [list $code $general_category]
 176   }
 177
 178   close $fd
 179   set lRet
 180 }
 181
 182