3 ## Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ## Author: Glenn Morris <rgm@gnu.org>
7 ## This file is part of GNU Emacs.
9 ## GNU Emacs is free software: you can redistribute it and/or modify
10 ## it under the terms of the GNU General Public License as published by
11 ## the Free Software Foundation, either version 3 of the License, or
12 ## (at your option) any later version.
14 ## GNU Emacs is distributed in the hope that it will be useful,
15 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ## GNU General Public License for more details.
19 ## You should have received a copy of the GNU General Public License
20 ## along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
24 ## This script takes as input Unicode's Blocks.txt
25 ## (http://www.unicode.org/Public/UNIDATA/Blocks.txt)
26 ## and produces output for Emacs's lisp/international/charscript.el.
28 ## It lumps together all the blocks belonging to the same language.
29 ## E.g., "Basic Latin", "Latin-1 Supplement", "Latin Extended-A",
30 ## etc. are all lumped together under "latin".
32 ## The Unicode blocks actually extend past some of these ranges with
33 ## undefined codepoints.
35 ## For additional details, see <http://debbugs.gnu.org/20789#11>.
37 ## Things to do after installing a new version of Blocks.txt:
38 ## Check the output against the old output.
39 ## Adjust the alias array, and the name2alias function for any new
40 ## entries, if necessary.
41 ## Check fix_start (and fix_end) to see if entries need adding/removing.
42 ## Review the hard-coded splits at the end of the main body.
## Lookup tables used while converting Blocks.txt entries.
## They are set up once, before any input record is read.
BEGIN {
    ## Hard-coded names.  See name2alias for the rest.
    ## Keys are lower-cased Unicode block names; values are the script
    ## symbols that name2alias returns for them.
    alias["ipa extensions"] = "phonetic"
    alias["letterlike symbols"] = "symbol"
    alias["number forms"] = "symbol"
    alias["miscellaneous technical"] = "symbol"
    alias["control pictures"] = "symbol"
    alias["optical character recognition"] = "symbol"
    alias["enclosed alphanumerics"] = "symbol"
    alias["box drawing"] = "symbol"
    alias["block elements"] = "symbol"
    alias["miscellaneous symbols"] = "symbol"
    alias["cjk strokes"] = "cjk-misc"
    alias["cjk symbols and punctuation"] = "cjk-misc"
    alias["halfwidth and fullwidth forms"] = "cjk-misc"
    alias["common indic number forms"] = "north-indic-number"

    ## Range-start overrides: a block whose first code point (as written
    ## in Blocks.txt) equals the key is recorded as starting at the value.
    fix_start["0080"] = "00A0"
    ## Define fix_end here if you need it.
}
## From admin/charsets/.
## With gawk's --non-decimal-data switch we wouldn't need this.
##
## decode_hex(str): return the number denoted by the hexadecimal string
## STR (no "0x" prefix; upper- or lower-case digits).  An empty string
## yields 0.  A character that is not a hex digit contributes a zero
## digit rather than raising an error.
## The parameters after STR are local variables, not arguments.
function decode_hex(str   , n, len, i, c, d) {
    n = 0
    len = length(str)
    for (i = 1; i <= len; i++)
    {
        c = substr(str, i, 1)
        if (c >= "0" && c <= "9")
            n = n * 16 + (c - "0")
        else
        {
            ## index() is 1..6 for a..f and 0 for anything else.
            d = index("abcdef", tolower(c))
            n = n * 16 + (d ? d + 9 : 0)
        }
    }
    return n
}
## name2alias(name): map a Unicode block name to the script symbol used
## in the generated table.
##
## NAME is matched case-insensitively (it is lower-cased first, because
## the alias keys and every pattern below are lowercase).
## Returns:
##   - the hard-coded alias[] entry, if one exists;
##   - otherwise the symbol of the first matching pattern below (the
##     order matters: more specific patterns precede general ones);
##   - 0 for blocks that must not appear in the output (surrogates,
##     private use, variation selectors, specials, tags);
##   - otherwise a name derived from NAME by dropping generic suffixes,
##     singularising a few plurals and joining words with hyphens.
## W and W2 are unused locals, retained so the signature is unchanged.
function name2alias(name   , w, w2) {
    name = tolower(name)

    if (alias[name]) return alias[name]
    else if (name ~ /for symbols/) return "symbol"
    else if (name ~ /latin|combining .* marks|spacing modifier|tone letters|alphabetic presentation/) return "latin"
    else if (name ~ /cjk|yijing|enclosed ideograph|kangxi/) return "han"
    else if (name ~ /arabic/) return "arabic"
    else if (name ~ /^greek/) return "greek"
    else if (name ~ /^coptic/) return "coptic"
    else if (name ~ /cuneiform number/) return "cuneiform-numbers-and-punctuation"
    else if (name ~ /cuneiform/) return "cuneiform"
    else if (name ~ /mathematical alphanumeric symbol/) return "mathematical"
    else if (name ~ /punctuation|mathematical|arrows|currency|superscript|small form variants|geometric|dingbats|enclosed|alchemical|pictograph|emoticon|transport/) return "symbol"
    else if (name ~ /canadian aboriginal/) return "canadian-aboriginal"
    else if (name ~ /katakana|hiragana/) return "kana"
    else if (name ~ /myanmar/) return "burmese"
    else if (name ~ /hangul/) return "hangul"
    else if (name ~ /khmer/) return "khmer"
    else if (name ~ /braille/) return "braille"
    else if (name ~ /^yi/) return "yi"
    else if (name ~ /surrogates|private use|variation selectors/) return 0
    else if (name ~ /^(specials|tags)$/) return 0
    else if (name ~ /linear b/) return "linear-b"
    else if (name ~ /aramaic/) return "aramaic"
    else if (name ~ /rumi num/) return "rumi-number"
    else if (name ~ /duployan|shorthand/) return "duployan-shorthand"
    else if (name ~ /sutton signwriting/) return "sutton-sign-writing"

    ## No explicit rule: derive the symbol from the block name itself.
    sub(/ (extended|extensions|supplement).*/, "", name)
    sub(/numbers/, "number", name)
    sub(/numerals/, "numeral", name)
    sub(/symbols/, "symbol", name)
    sub(/forms$/, "form", name)
    sub(/tiles$/, "tile", name)
    ## The trailing space is part of the pattern: it removes the word
    ## "new" together with its separator and leaves names that merely
    ## begin with those letters untouched.
    sub(/^new /, "", name)
    sub(/ (characters|hieroglyphs|cursive)$/, "", name)
    ## The result is printed as a single Lisp symbol, so it must not
    ## contain spaces.
    gsub(/ /, "-", name)

    return name
}
## Per-record processing of a Blocks.txt line.
## NOTE(review): the rule header, the braces and several statements of
## this section are not visible in this excerpt; the comments below
## describe only the statements that are.
##
## Locate the ".." separator inside the first field.
132 sep =
index($
1, "..")
## s is the text before the separator, e the text after it.
## NOTE(review): len is not assigned in the visible lines; presumably it
## is the length of the first field -- confirm.
134 s =
substr($
1,1,sep
-1)
135 e =
substr($
1,sep
+2,len
-sep
-2)
## Record the range for entry i, preferring an override from
## fix_start / fix_end when one exists for the value just read.
139 start
[i
] = fix_start
[s
] ? fix_start
[s
] : s
140 end[i
] = fix_end
[e
] ? fix_end
[e
]: e
## Script symbol for this entry, derived from its block name.
143 alt
[i
] = name2alias
(name
[i
])
151 ## Combine adjacent ranges with the same name.
## "Adjacent" means: same symbol as the previous entry, and this start
## is exactly one code point past the previous end.
152 if (alt
[i
] == alt
[i
-1] && decode_hex
(start
[i
]) ==
1 + decode_hex
(end[i
-1]))
## The previous entry's name becomes "previous, current".
155 name
[i
-1] =
(name
[i
-1] ", " name
[i
])
159 ## Some hard-coded splits.
## Entries starting at 0370, FB00 and FF00 get special handling.
## NOTE(review): the bodies of these three branches are not visible here.
160 if (start
[i
] ==
"0370")
172 else if (start
[i
] ==
"FB00")
184 else if (start
[i
] ==
"FF00")
## Write the generated Emacs Lisp to standard output.
## NOTE(review): the enclosing rule and the loop that advances j are not
## visible in this excerpt.
##
## File header, then the opening of a `dolist' over a quoted list.
203 print ";;; charscript.el --- character script table"
204 print ";;; Automatically generated from admin/unidata/Blocks.txt"
205 print "(let (script-list)"
206 print " (dolist (elt '("
## One list element per entry j: (#xSTART #xEND SYMBOL).
210 printf(" (#x%s #x%s %s)", start
[j
], end[j
], alt
[j
])
211 ## Fuzz to decide whether worth printing original name as a comment.
## The name is appended as a Lisp comment only when it is non-empty,
## differs from the symbol once lower-cased, and the symbol contains
## no hyphen.
212 if (name
[j
] && alt
[j
] != tolower(name
[j
]) && alt
[j
] !~
/-/)
213 printf(" ; %s", name
[j
])
## Loop body of the emitted Lisp: store each symbol in
## char-script-table for its range and collect the distinct symbols in
## script-list, which is finally put (reversed) into extra slot 0.
218 print " (set-char-table-range char-script-table"
219 print " (cons (car elt) (nth 1 elt)) (nth 2 elt))"
220 print " (or (memq (nth 2 elt) script-list)"
221 print " (setq script-list (cons (nth 2 elt) script-list))))"
222 print " (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))"
224 print "(provide 'charscript)"