1 " Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
2 " The format of the UnicodeData.txt file is explained here:
3 " http://www.unicode.org/Public/5.1.0/ucd/UCD.html
4 " For the other files see the header.
6 " Usage: Vim -S <this-file>
8 " Author: Bram Moolenaar
9 " Last Update: 2010 Jan 12
11 " Parse lines of UnicodeData.txt. Creates a list of lists in s:dataprops.
" Every line of the current buffer is split on ';' and the resulting list of
" fields is appended to s:dataprops as one entry.
12 func! ParseDataToProps()
" Keep going for as long as lnum does not pass the last line of the buffer.
15 while lnum <= line('$')
" Split on ';' with optional surrounding white space.  The third argument
" (keepempty) makes split() keep empty fields at the start and the end.
16 let l = split(getline(lnum), '\s*;\s*', 1)
" Report a line that does not have the expected 15 fields.
18 echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
" Store the list of fields for this line.
21 call add(s:dataprops, l)
26 " Parse lines of CaseFolding.txt. Creates a list of lists in s:foldprops.
" Lines that start with '#' and lines with only white space are skipped.
" Every other line is split on ';' and the resulting list of fields is
" appended to s:foldprops as one entry.
27 func! ParseFoldProps()
" Keep going for as long as lnum does not pass the last line of the buffer.
30 while lnum <= line('$')
31 let line = getline(lnum)
" Only handle lines that are not a comment and not blank.
32 if line !~ '^#' && line !~ '^\s*$'
" Split on ';' with optional surrounding white space.  The third argument
" (keepempty) makes split() keep empty fields at the start and the end.
33 let l = split(line, '\s*;\s*', 1)
" Report a line that does not have the expected 4 fields.
35 echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
" Store the list of fields for this line.
38 call add(s:foldprops, l)
44 " Parse lines of EastAsianWidth.txt. Creates a list of lists in s:widthprops.
" Lines that start with '#' and lines with only white space are skipped.
" Every other line is split on ';' and the resulting list of fields is
" appended to s:widthprops as one entry.
45 func! ParseWidthProps()
" Keep going for as long as lnum does not pass the last line of the buffer.
48 while lnum <= line('$')
49 let line = getline(lnum)
" Only handle lines that are not a comment and not blank.
50 if line !~ '^#' && line !~ '^\s*$'
" Split on ';' with optional surrounding white space.  The third argument
" (keepempty) makes split() keep empty fields at the start and the end.
51 let l = split(line, '\s*;\s*', 1)
" Report a line that does not have the expected 2 fields.
53 echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
" Store the list of fields for this line.
56 call add(s:widthprops, l)
62 " Build the toLower or toUpper table in a new buffer.
" a:name  is appended to "to" to form the name of the buffer and of the table;
"         the script calls this with "Lower" and with "Upper".
" a:index is the index of the field that holds the converted character.
" Entries that fit together are merged into one range, every range is turned
" into a table line by Range().
64 func! BuildCaseTable(name, index)
" Prepending '0x' and adding zero turns the hex text of a field into a number.
" n comes from the first field of entry p, nl from the field at a:index.
72 let n = ('0x' . p[0]) + 0
73 let nl = ('0x' . p[a:index]) + 0
" The current range is continued when one was started (start >= 0), the
" difference nl - n equals add, and either step is still zero or the distance
" from end to n equals step.
74 if start >= 0 && add == nl - n && (step == 0 || n - end == step)
75 " continue with same range.
80 " produce previous range
81 call Range(ranges, start, end, step, add)
" Produce a range once more; presumably this is the last range that is still
" pending when all entries have been handled -- TODO confirm.
91 call Range(ranges, start, end, step, add)
94 " New buffer to put the result in.
" The buffer gets the name "to" followed by a:name.
96 exe "file to" . a:name
" The first line is the start of the table declaration.
97 call setline(1, "static convertStruct to" . a:name . "[] =")
" Put all the range lines below the last line.
99 call append('$', ranges)
100 call setline('$', getline('$')[:-2]) " remove last comma
" Setting the line just below the last line adds a new line.
101 call setline(line('$') + 1, "};")
105 " Build the foldCase table in a new buffer.
" Only entries with 'C' or 'S' in the second field are used.  Entries that
" fit together are merged into one range, every range is turned into a table
" line by Range().
107 func! BuildFoldTable()
" Skip entries with another value in the second field (presumably 'C' and 'S'
" are the common and simple folding status of CaseFolding.txt -- TODO confirm).
114 if p[1] == 'C' || p[1] == 'S'
" Prepending '0x' and adding zero turns the hex text of a field into a number.
" n comes from the first field of entry p, nl from the third field.
115 let n = ('0x' . p[0]) + 0
116 let nl = ('0x' . p[2]) + 0
" The current range is continued when one was started (start >= 0), the
" difference nl - n equals add, and either step is still zero or the distance
" from end to n equals step.
117 if start >= 0 && add == nl - n && (step == 0 || n - end == step)
118 " continue with same range.
123 " produce previous range
124 call Range(ranges, start, end, step, add)
" Produce a range once more; presumably this is the last range that is still
" pending when all entries have been handled -- TODO confirm.
134 call Range(ranges, start, end, step, add)
137 " New buffer to put the result in.
" The first line is the start of the table declaration.
140 call setline(1, "static convertStruct foldCase[] =")
" Put all the range lines below the last line.
142 call append('$', ranges)
143 call setline('$', getline('$')[:-2]) " remove last comma
" Setting the line just below the last line adds a new line.
144 call setline(line('$') + 1, "};")
" Format one range as a table line and add it to the list a:ranges.
" The line is a Tab followed by "{0xSTART,0xEND,STEP,ADD}," where a:start and
" a:end are written in hex, a:step and a:add in decimal.
" A step of zero is written as -1.
148 func! Range(ranges, start, end, step, add)
149 let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
" a:ranges is a List, adding to it changes the list of the caller.
150 call add(a:ranges, s)
153 " Build the combining table.
" Only entries with 'Mn', 'Mc' or 'Me' in the third field are used.
" Characters that directly follow each other are merged into one range.
" The result is put in a buffer as a "struct interval" table.
155 func! BuildCombiningTable()
" Skip entries with another value in the third field (presumably this is the
" general category from UnicodeData.txt -- TODO confirm).
160 if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
" Prepending '0x' and adding zero turns the hex text of the first field into
" a number.
161 let n = ('0x' . p[0]) + 0
" The current range is continued when one was started (start >= 0) and n
" directly follows its end.
162 if start >= 0 && end + 1 == n
163 " continue with same range.
167 " produce previous range
" Both ends of the range are written in hex with at least four digits.
168 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
" Produce a range once more; presumably this is the last range that is still
" pending when all entries have been handled -- TODO confirm.
176 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
179 " New buffer to put the result in.
" The first two lines are the start of the table declaration.
182 call setline(1, " static struct interval combining[] =")
183 call setline(2, " {")
" Put all the range lines below the last line.
184 call append('$', ranges)
185 call setline('$', getline('$')[:-2]) " remove last comma
" Setting the line just below the last line adds a new line.
186 call setline(line('$') + 1, " };")
190 " Build the double width or ambiguous width table in a new buffer.
191 " Uses s:widthprops and s:dataprops.
" a:pattern   is matched against the first character of the second field of
"             each s:widthprops entry; the script passes '[WF]' and 'A'.
" a:tableName is used as the name of the buffer and of the table.
192 func! BuildWidthTable(pattern, tableName)
197 for p in s:widthprops
" Only use entries where the first character of the second field matches.
198 if p[1][0] =~ a:pattern
200 " It is a range. We don't check for composing char then.
" The first field is split at the two dots that separate both ends.
201 let rng = split(p[0], '\.\.')
203 echoerr "Cannot parse range: '" . p[0] . "' in width table"
" Prepending '0x' and adding zero turns hex text into a number.
" n is the first character of the range, n_last the last one.
205 let n = ('0x' . rng[0]) + 0
206 let n_last = ('0x' . rng[1]) + 0
" Here the first field holds one character instead of a range.
208 let n = ('0x' . p[0]) + 0
211 " Find this char in the data table.
" dn is the character of the s:dataprops entry at index dataidx.
213 let dn = ('0x' . s:dataprops[dataidx][0]) + 0
" Not finding the character is an error only when this is not a range.
219 if dn != n && n_last == n
220 echoerr "Cannot find character " . n . " in data table"
222 " Only use the char when it's not a composing char.
223 " But use all chars from a range.
224 let dp = s:dataprops[dataidx]
" The third field of the data entry is compared with 'Mn', 'Mc' and 'Me'.
225 if n_last > n || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
" The current range is continued when one was started (start >= 0) and n
" directly follows its end.
226 if start >= 0 && end + 1 == n
227 " continue with same range.
230 " produce previous range
" Both ends of the range are written in hex with at least four digits.
231 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
" Produce a range once more; presumably this is the last range that is still
" pending when all entries have been handled -- TODO confirm.
240 call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
243 " New buffer to put the result in.
" The buffer gets the name of the table.
245 exe "file " . a:tableName
" The first two lines are the start of the table declaration.
246 call setline(1, " static struct interval " . a:tableName . "[] =")
247 call setline(2, " {")
" Put all the range lines below the last line.
248 call append('$', ranges)
249 call setline('$', getline('$')[:-2]) " remove last comma
" Setting the line just below the last line adds a new line.
250 call setline(line('$') + 1, " };")
" Main part of the script: edit each Unicode text file, parse it and build
" the tables from the parsed lines.
" The order matters: every Parse function reads the buffer that was edited
" just before it is called.
" NOTE(review): the first URL uses host "unicode.org", the other two use
" "www.unicode.org" -- confirm whether this difference is intentional.
256 " Edit the Unicode text file. Requires the netrw plugin.
257 edit http://unicode.org/Public/UNIDATA/UnicodeData.txt
259 " Parse each line, create a list of lists.
260 call ParseDataToProps()
262 " Build the toLower table.
" 13 is the index of the field with the converted character.
263 call BuildCaseTable("Lower", 13)
265 " Build the toUpper table.
" 12 is the index of the field with the converted character.
266 call BuildCaseTable("Upper", 12)
268 " Build the ranges of composing chars.
269 call BuildCombiningTable()
271 " Edit the case folding text file. Requires the netrw plugin.
272 edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
274 " Parse each line, create a list of lists.
275 call ParseFoldProps()
277 " Build the foldCase table.
278 call BuildFoldTable()
280 " Edit the width text file. Requires the netrw plugin.
281 edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
283 " Parse each line, create a list of lists.
284 call ParseWidthProps()
286 " Build the double width table.
" Uses the entries where the width field starts with W or F.
287 call BuildWidthTable('[WF]', 'doublewidth')
289 " Build the ambiguous width table.
" Uses the entries where the width field starts with A.
290 call BuildWidthTable('A', 'ambiguous')