string last: fix segfault with invalid index
[jimtcl.git] / parse-unidata.tcl
blob2bf654710efa0120d123bdf96844d471d3e944be
1 #!/usr/bin/env tclsh
3 # Generate UTF-8 case mapping tables
5 # (c) 2010 Steve Bennett <steveb@workware.net.au>
7 # See LICENCE for licence details.
8 #/
10 # Parse the unicode data from: http://unicode.org/Public/UNIDATA/UnicodeData.txt
11 # and http://unicode.org/Public/UNIDATA/EastAsianWidth.txt
12 # to generate case mapping and display width tables
13 set map(lower) {}
14 set map(upper) {}
15 set map(title) {}
16 set map(combining) {}
17 set map(wide) {}
19 set USAGE "Usage: parse-unidata.tcl \[-width\] UnicodeData.txt \[EastAsianWidth.txt\]"
20 set do_width 0
22 if {[lindex $argv 0] eq "-width"} {
23 set do_width 1
24 set argv [lrange $argv 1 end]
27 if {[llength $argv] ni {1 2}} {
28 puts stderr $USAGE
29 exit 1
32 lassign $argv unicodefile widthfile
34 set f [open $unicodefile]
35 while {[gets $f buf] >= 0} {
36 set title ""
37 set lower ""
38 set upper ""
39 lassign [split $buf ";"] code name class x x x x x x x x x upper lower title
40 set codex [string tolower 0x$code]
41 if {[string match M* $class]} {
42 if {![info exists combining]} {
43 set combining $codex
45 continue
46 } elseif {[info exists combining]} {
47 lappend map(combining) $combining $codex
48 unset combining
50 if {$codex <= 0x7f} {
51 continue
53 if {$codex > 0xffff} {
54 break
56 if {![string match L* $class]} {
57 continue
59 if {$upper ne ""} {
60 lappend map(upper) $codex [string tolower 0x$upper]
62 if {$lower ne ""} {
63 lappend map(lower) $codex [string tolower 0x$lower]
65 if {$title ne "" && $title ne $upper} {
66 if {$title eq $code} {
67 set title 0
69 lappend map(title) $codex [string tolower 0x$title]
72 close $f
74 proc output-int-pairs {list} {
75 set n 0
76 foreach {v1 v2} $list {
77 puts -nonewline "\t{ $v1, $v2 },"
78 if {[incr n] % 4 == 0} {
79 puts ""
82 if {$n % 4} {
83 puts ""
87 # Merges adjacent ranges in a list of ranges (lower upper lower upper ...)
88 proc combine-adjacent-ranges {list} {
89 set newlist {}
90 foreach {lower upper} $list {
91 if {[info exists prev_upper]} {
92 if {$lower == $prev_upper + 1} {
93 # combine these
94 set prev_upper $upper
95 continue
96 } else {
97 # can't combine
98 lappend newlist $prev_lower $prev_upper
101 set prev_lower $lower
102 set prev_upper $upper
104 # Now add the last range
105 lappend newlist $prev_lower $prev_upper
106 return $newlist
109 foreach type {upper lower title} {
110 puts "static const struct casemap unicode_case_mapping_$type\[\] = \{"
111 output-int-pairs $map($type)
112 puts "\};\n"
115 if {$do_width} {
116 set f [open $widthfile]
117 while {[gets $f buf] >= 0} {
118 if {[regexp {^([0-9A-Fa-f.]+);W} $buf -> range]} {
119 set range [string tolower $range]
120 lassign [split $range .] lower - upper
121 if {$upper eq ""} {
122 set upper $lower
124 lappend map(wide) 0x$lower 0x$upper
127 close $f
130 foreach type {combining wide} {
131 puts "static const struct utf8range unicode_range_$type\[\] = \{"
132 if {$do_width} {
133 output-int-pairs [combine-adjacent-ranges $map($type)]
134 } else {
135 # Just produce empty width tables in this case
136 output-int-pairs {}
138 puts "\};\n"