admin/unidata/blocks.awk

   1 #!/usr/bin/awk -f
   2
   3 ## Copyright (C) 2015-2016 Free Software Foundation, Inc.
   4
   5 ## Author: Glenn Morris <rgm@gnu.org>
   6
   7 ## This file is part of GNU Emacs.
   8
   9 ## GNU Emacs is free software: you can redistribute it and/or modify
  10 ## it under the terms of the GNU General Public License as published by
  11 ## the Free Software Foundation, either version 3 of the License, or
  12 ## (at your option) any later version.
  13
  14 ## GNU Emacs is distributed in the hope that it will be useful,
  15 ## but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 ## GNU General Public License for more details.
  18
  19 ## You should have received a copy of the GNU General Public License
  20 ## along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 ### Commentary:
  23
  24 ## This script takes as input Unicode's Blocks.txt
  25 ## (http://www.unicode.org/Public/UNIDATA/Blocks.txt)
  26 ## and produces output for Emacs's lisp/international/charscript.el.
  27
  28 ## It lumps together all the blocks belonging to the same language.
  29 ## E.g., "Basic Latin", "Latin-1 Supplement", "Latin Extended-A",
  30 ## etc. are all lumped together under "latin".
  31
  32 ## The Unicode blocks actually extend past some of these ranges with
  33 ## undefined codepoints.
  34
  35 ## For additional details, see <http://debbugs.gnu.org/20789#11>.
  36
  37 ## Things to do after installing a new version of Blocks.txt:
  38 ## Check the output against the old output.
  39 ## Adjust the alias array, and the name2alias function for any new
  40 ## entries, if necessary.
  41 ## Check fix_start (and fix_end) to see if entries need adding/removing.
  42 ## Review the hard-coded splits at the end of the main body.
  43
  44 ### Code:
  45
  46 BEGIN {
  47     ## Hard-coded names.  See name2alias for the rest.
  48     alias["ipa extensions"] = "phonetic"
  49     alias["letterlike symbols"] = "symbol"
  50     alias["number forms"] = "symbol"
  51     alias["miscellaneous technical"] = "symbol"
  52     alias["control pictures"] = "symbol"
  53     alias["optical character recognition"] = "symbol"
  54     alias["enclosed alphanumerics"] = "symbol"
  55     alias["box drawing"] = "symbol"
  56     alias["block elements"] = "symbol"
  57     alias["miscellaneous symbols"] = "symbol"
  58     alias["cjk strokes"] = "cjk-misc"
  59     alias["cjk symbols and punctuation"] = "cjk-misc"
  60     alias["halfwidth and fullwidth forms"] = "cjk-misc"
  61     alias["common indic number forms"] = "north-indic-number"
  62
  63     tohex["a"] = 10
  64     tohex["b"] = 11
  65     tohex["c"] = 12
  66     tohex["d"] = 13
  67     tohex["e"] = 14
  68     tohex["f"] = 15
  69
  70     fix_start["0080"] = "00A0"
  71     ## Define fix_end here if you need it.
  72 }
  73
  74 ## From admin/charsets/.
  75 ## With gawk's --non-decimal-data switch we wouldn't need this.
  76 function decode_hex(str   , n, len, i, c) {
  77   n = 0
  78   len = length(str)
  79   for (i = 1; i <= len; i++)
  80     {
  81       c = substr (str, i, 1)
  82       if (c >= "0" && c <= "9")
  83         n = n * 16 + (c - "0")
  84       else
  85         n = n * 16 + tohex[tolower(c)]
  86     }
  87   return n
  88 }
  89
  90 function name2alias(name   , w, w2) {
  91     name = tolower(name)
  92     if (alias[name]) return alias[name]
  93     else if (name ~ /for symbols/) return "symbol"
  94     else if (name ~ /latin|combining .* marks|spacing modifier|tone letters|alphabetic presentation/) return "latin"
  95     else if (name ~ /cjk|yijing|enclosed ideograph|kangxi/) return "han"
  96     else if (name ~ /arabic/) return "arabic"
  97     else if (name ~ /^greek/) return "greek"
  98     else if (name ~ /^coptic/) return "coptic"
  99     else if (name ~ /cuneiform number/) return "cuneiform-numbers-and-punctuation"
 100     else if (name ~ /cuneiform/) return "cuneiform"
 101     else if (name ~ /mathematical alphanumeric symbol/) return "mathematical"
 102     else if (name ~ /punctuation|mathematical|arrows|currency|superscript|small form variants|geometric|dingbats|enclosed|alchemical|pictograph|emoticon|transport/) return "symbol"
 103     else if (name ~ /canadian aboriginal/) return "canadian-aboriginal"
 104     else if (name ~ /katakana|hiragana/) return "kana"
 105     else if (name ~ /myanmar/) return "burmese"
 106     else if (name ~ /hangul/) return "hangul"
 107     else if (name ~ /khmer/) return "khmer"
 108     else if (name ~ /braille/) return "braille"
 109     else if (name ~ /^yi /) return "yi"
 110     else if (name ~ /surrogates|private use|variation selectors/) return 0
 111     else if (name ~/^(specials|tags)$/) return 0
 112     else if (name ~ /linear b/) return "linear-b"
 113     else if (name ~ /aramaic/) return "aramaic"
 114     else if (name ~ /rumi num/) return "rumi-number"
 115     else if (name ~ /duployan|shorthand/) return "duployan-shorthand"
 116     else if (name ~ /sutton signwriting/) return "sutton-sign-writing"
 117
 118     sub(/ (extended|extensions|supplement).*/, "", name)
 119     sub(/numbers/, "number", name)
 120     sub(/numerals/, "numeral", name)
 121     sub(/symbols/, "symbol", name)
 122     sub(/forms$/, "form", name)
 123     sub(/tiles$/, "tile", name)
 124     sub(/^new /, "", name)
 125     sub(/ (characters|hieroglyphs|cursive)$/, "", name)
 126     gsub(/ /, "-", name)
 127
 128     return name
 129 }
 130
 131 /^[0-9A-F]/ {
 132     sep = index($1, "..")
 133     len = length($1)
 134     s = substr($1,1,sep-1)
 135     e = substr($1,sep+2,len-sep-2)
 136     $1 = ""
 137     sub(/^ */, "", $0)
 138     i++
 139     start[i] = fix_start[s] ? fix_start[s] : s
 140     end[i] = fix_end[e] ? fix_end[e]: e
 141     name[i] = $0
 142
 143     alt[i] = name2alias(name[i])
 144
 145     if (!alt[i])
 146     {
 147         i--
 148         next
 149     }
 150
 151     ## Combine adjacent ranges with the same name.
 152     if (alt[i] == alt[i-1] && decode_hex(start[i]) == 1 + decode_hex(end[i-1]))
 153     {
 154         end[i-1] = end[i]
 155         name[i-1] = (name[i-1] ", " name[i])
 156         i--
 157     }
 158
 159     ## Some hard-coded splits.
 160     if (start[i] == "0370")
 161     {
 162         end[i] = "03E1"
 163         i++
 164         start[i] = "03E2"
 165         end[i] = "03EF"
 166         alt[i] = "coptic"
 167         i++
 168         start[i] = "03F0"
 169         end[i] = "03FF"
 170         alt[i] = "greek"
 171     }
 172     else if (start[i] == "FB00")
 173     {
 174         end[i] = "FB06"
 175         i++
 176         start[i] = "FB13"
 177         end[i] = "FB17"
 178         alt[i] = "armenian"
 179         i++
 180         start[i] = "FB1D"
 181         end[i] = "FB4F"
 182         alt[i] = "hebrew"
 183     }
 184     else if (start[i] == "FF00")
 185     {
 186         end[i] = "FF60"
 187         i++
 188         start[i] = "FF61"
 189         end[i] = "FF9F"
 190         alt[i] = "kana"
 191         i++
 192         start[i] = "FFA0"
 193         end[i] = "FFDF"
 194         alt[i] = "hangul"
 195         i++
 196         start[i] = "FFE0"
 197         end[i] = "FFEF"
 198         alt[i] = "cjk-misc"
 199     }
 200 }
 201
 202 END {
 203     print ";;; charscript.el --- character script table"
 204     print ";;; Automatically generated from admin/unidata/Blocks.txt"
 205     print "(let (script-list)"
 206     print "  (dolist (elt '("
 207
 208     for (j=1;j<=i;j++)
 209     {
 210         printf("    (#x%s #x%s %s)", start[j], end[j], alt[j])
 211         ## Fuzz to decide whether worth printing original name as a comment.
 212         if (name[j] && alt[j] != tolower(name[j]) && alt[j] !~ /-/)
 213             printf(" ; %s", name[j])
 214         printf("\n")
 215     }
 216
 217     print "    ))"
 218     print "    (set-char-table-range char-script-table"
 219     print "                       (cons (car elt) (nth 1 elt)) (nth 2 elt))"
 220     print "    (or (memq (nth 2 elt) script-list)"
 221     print "     (setq script-list (cons (nth 2 elt) script-list))))"
 222     print "  (set-char-table-extra-slot char-script-table 0 (nreverse script-list)))"
 223     print ""
 224     print "(provide 'charscript)"
 225 }