* make-dist: Updates related to nt/.
[emacs.git] / admin / unidata / blocks.awk
blobbf9a942489255e15f32eedb8ba704fa41b5965ab
1 #!/usr/bin/awk -f
3 ## Copyright (C) 2015-2016 Free Software Foundation, Inc.
5 ## Author: Glenn Morris <rgm@gnu.org>
7 ## This file is part of GNU Emacs.
9 ## GNU Emacs is free software: you can redistribute it and/or modify
10 ## it under the terms of the GNU General Public License as published by
11 ## the Free Software Foundation, either version 3 of the License, or
12 ## (at your option) any later version.
14 ## GNU Emacs is distributed in the hope that it will be useful,
15 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ## GNU General Public License for more details.
19 ## You should have received a copy of the GNU General Public License
20 ## along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
22 ### Commentary:
24 ## This script takes as input Unicode's Blocks.txt
25 ## (http://www.unicode.org/Public/UNIDATA/Blocks.txt)
26 ## and produces output for Emacs's lisp/international/charscript.el.
28 ## It lumps together all the blocks belonging to the same language.
29 ## E.g., "Basic Latin", "Latin-1 Supplement", "Latin Extended-A",
30 ## etc. are all lumped together under "latin".
32 ## The Unicode blocks actually extend past some of these ranges with
33 ## undefined codepoints.
35 ## For additional details, see <http://debbugs.gnu.org/20789#11>.
37 ## Things to do after installing a new version of Blocks.txt:
38 ## Check the output against the old output.
39 ## Adjust the alias array, and the name2alias function for any new
40 ## entries, if necessary.
41 ## Check fix_start (and fix_end) to see if entries need adding/removing.
42 ## Review the hard-coded splits at the end of the main body.
44 ### Code:
## Set up the lookup tables used by the rest of the script.
BEGIN {
    init_tables()
}

## Fill in the global arrays alias, tohex and fix_start.
## The trailing parameters are local scratch variables; callers pass none.
function init_tables(    blocks, count, k)
{
    ## Hard-coded names.  See name2alias for the rest.
    alias["ipa extensions"] = "phonetic"
    alias["common indic number forms"] = "north-indic-number"

    count = split("letterlike symbols;number forms;miscellaneous technical;control pictures;optical character recognition;enclosed alphanumerics;box drawing;block elements;miscellaneous symbols", blocks, ";")
    for (k = 1; k <= count; k++)
        alias[blocks[k]] = "symbol"

    count = split("cjk strokes;cjk symbols and punctuation;halfwidth and fullwidth forms", blocks, ";")
    for (k = 1; k <= count; k++)
        alias[blocks[k]] = "cjk-misc"

    ## Numeric values of the hexadecimal letter digits: a = 10 ... f = 15.
    count = split("a;b;c;d;e;f", blocks, ";")
    for (k = 1; k <= count; k++)
        tohex[blocks[k]] = 9 + k

    ## Replacement start codepoints, keyed by the start read from the input.
    fix_start["0080"] = "00A0"
    ## Define fix_end here if you need it.
}
74 ## From admin/charsets/.
75 ## With gawk's --non-decimal-data switch we wouldn't need this.
## Return the numeric value of the hexadecimal string STR.
## Letters may be in either case.  A character that is not a hex digit
## counts as a zero digit.  The trailing parameters are locals.
function decode_hex(str,    value, pos, ch)
{
    value = 0
    for (pos = 1; pos <= length(str); pos++)
    {
        ch = tolower(substr(str, pos, 1))
        ## index() gives 1..15 for "1".."f", and 0 for "0" or anything else.
        value = value * 16 + index("123456789abcdef", ch)
    }
    return value
}
## Map the Unicode block NAME to the name this script prints for it.
## Returns a string, or 0 for blocks that get no entry (surrogates,
## private use, variation selectors, specials, tags).
##
## Lookup order: the hard-coded alias table, then the pattern rules
## below (the first match wins, so the order matters), and finally a
## generic clean-up of the lower-cased block name.
function name2alias(name)
{
    name = tolower(name)

    ## Test membership first: a bare alias[name] reference would create
    ## an empty element in the alias table for every name looked up.
    if ((name in alias) && alias[name]) return alias[name]

    if (name ~ /for symbols/) return "symbol"
    if (name ~ /latin|combining .* marks|spacing modifier|tone letters|alphabetic presentation/) return "latin"
    if (name ~ /cjk|yijing|enclosed ideograph|kangxi/) return "han"
    if (name ~ /arabic/) return "arabic"
    if (name ~ /^greek/) return "greek"
    if (name ~ /^coptic/) return "coptic"
    ## The more specific cuneiform and mathematical rules must come
    ## before the general ones that would also match.
    if (name ~ /cuneiform number/) return "cuneiform-numbers-and-punctuation"
    if (name ~ /cuneiform/) return "cuneiform"
    if (name ~ /mathematical alphanumeric symbol/) return "mathematical"
    if (name ~ /punctuation|mathematical|arrows|currency|superscript|small form variants|geometric|dingbats|enclosed|alchemical|pictograph|emoticon|transport/) return "symbol"
    if (name ~ /canadian aboriginal/) return "canadian-aboriginal"
    if (name ~ /katakana|hiragana/) return "kana"
    if (name ~ /myanmar/) return "burmese"
    if (name ~ /hangul/) return "hangul"
    if (name ~ /khmer/) return "khmer"
    if (name ~ /braille/) return "braille"
    if (name ~ /^yi /) return "yi"
    ## Blocks that produce no entry at all.
    if (name ~ /surrogates|private use|variation selectors/) return 0
    if (name ~ /^(specials|tags)$/) return 0
    if (name ~ /linear b/) return "linear-b"
    if (name ~ /aramaic/) return "aramaic"
    if (name ~ /rumi num/) return "rumi-number"
    if (name ~ /duployan|shorthand/) return "duployan-shorthand"
    if (name ~ /sutton signwriting/) return "sutton-sign-writing"

    ## Generic fallback: drop qualifiers, singularize, and hyphenate.
    sub(/ (extended|extensions|supplement).*/, "", name)
    sub(/numbers/, "number", name)
    sub(/numerals/, "numeral", name)
    sub(/symbols/, "symbol", name)
    sub(/forms$/, "form", name)
    sub(/tiles$/, "tile", name)
    sub(/^new /, "", name)
    sub(/ (characters|hieroglyphs|cursive)$/, "", name)
    gsub(/ /, "-", name)

    return name
}
## Main rule: runs for every input line that starts with a hex digit.
## It takes the range from the first field, looks up the alias for the
## rest of the line, and records both in start[]/end[]/name[]/alt[]
## at index i.
## NOTE(review): no statement that changes the index i is visible in
## this listing, and the embedded line numbers skip 138, 146-147, 153,
## 156-157, 163, 167, etc.  Presumably short lines (lone braces and the
## statements stepping i) were lost in extraction -- confirm against
## the upstream file before relying on this text.
131 /^[0-9A-F]/ {
## Split the first field at "..": s is the text before it.
132 sep = index($1, "..")
133 len = length($1)
134 s = substr($1,1,sep-1)
## e is the text after "..", minus the last character of the field
## (presumably the ";" that ends the range in Blocks.txt -- confirm).
135 e = substr($1,sep+2,len-sep-2)
## Blank the range field and strip leading spaces, so that $0 holds
## only the remainder of the line (stored below as the block name).
136 $1 = ""
137 sub(/^ */, "", $0)
## Use the fix_start/fix_end replacement when one is defined for this
## value, otherwise the value read from the input.
139 start[i] = fix_start[s] ? fix_start[s] : s
140 end[i] = fix_end[e] ? fix_end[e]: e
141 name[i] = $0
143 alt[i] = name2alias(name[i])
## name2alias returns 0 for blocks that get no entry; skip those.
145 if (!alt[i])
148 next
151 ## Combine adjacent ranges with the same name.
## "Adjacent" means this start is exactly one past the previous end.
152 if (alt[i] == alt[i-1] && decode_hex(start[i]) == 1 + decode_hex(end[i-1]))
154 end[i-1] = end[i]
155 name[i-1] = (name[i-1] ", " name[i])
159 ## Some hard-coded splits.
## The range starting at 0370 is cut to end at 03E1, followed by
## 03E2-03EF as "coptic" and 03F0-03FF as "greek".
## NOTE(review): each new start[i] below presumably goes into a fresh
## slot; the statements advancing i are not visible here -- confirm.
160 if (start[i] == "0370")
162 end[i] = "03E1"
164 start[i] = "03E2"
165 end[i] = "03EF"
166 alt[i] = "coptic"
168 start[i] = "03F0"
169 end[i] = "03FF"
170 alt[i] = "greek"
## The range starting at FB00 is cut to end at FB06, followed by
## FB13-FB17 as "armenian" and FB1D-FB4F as "hebrew".
172 else if (start[i] == "FB00")
174 end[i] = "FB06"
176 start[i] = "FB13"
177 end[i] = "FB17"
178 alt[i] = "armenian"
180 start[i] = "FB1D"
181 end[i] = "FB4F"
182 alt[i] = "hebrew"
## The range starting at FF00 is cut to end at FF60, followed by
## FF61-FF9F as "kana", FFA0-FFDF as "hangul", FFE0-FFEF as "cjk-misc".
184 else if (start[i] == "FF00")
186 end[i] = "FF60"
188 start[i] = "FF61"
189 end[i] = "FF9F"
190 alt[i] = "kana"
192 start[i] = "FFA0"
193 end[i] = "FFDF"
194 alt[i] = "hangul"
196 start[i] = "FFE0"
197 end[i] = "FFEF"
198 alt[i] = "cjk-misc"
## Write the generated Lisp: a fixed preamble, one list element per
## recorded range (entries 1..i), and the code that installs the
## ranges in char-script-table.
END {
    print ";;; charscript.el --- character script table"
    print ";;; Automatically generated from admin/unidata/Blocks.txt"
    print "(let (script-list)"
    print " (dolist (elt '("

    j = 1
    while (j <= i)
    {
        entry = sprintf(" (#x%s #x%s %s)", start[j], end[j], alt[j])
        ## Fuzz to decide whether worth printing original name as a comment.
        show_name = (name[j] && alt[j] != tolower(name[j]) && alt[j] !~ /-/)
        if (show_name)
            entry = entry sprintf(" ; %s", name[j])
        printf("%s\n", entry)
        j++
    }

    print " ))"
    print " (set-char-table-range char-script-table"
    print " (cons (car elt) (nth 1 elt)) (nth 2 elt))"
    print " (or (memq (nth 2 elt) script-list)"
    print " (setq script-list (cons (nth 2 elt) script-list))))"
    print " (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))"
    print ""
    print "(provide 'charscript)"
}