admin/charsets/mapconv

   1 #!/bin/sh
   2
   3 # Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   4 #   National Institute of Advanced Industrial Science and Technology (AIST)
   5 #   Registration Number H13PRO009
   6
   7 # This file is part of GNU Emacs.
   8
   9 # GNU Emacs is free software: you can redistribute it and/or modify
  10 # it under the terms of the GNU General Public License as published by
  11 # the Free Software Foundation, either version 3 of the License, or
  12 # (at your option) any later version.
  13
  14 # GNU Emacs is distributed in the hope that it will be useful,
  15 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 # GNU General Public License for more details.
  18
  19 # You should have received a copy of the GNU General Public License
  20 # along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 # Commentary:
  23
# Convert a charset map in one of various formats into this form:
#       0xXX 0xYYYY
# where,
#   XX is a code point of the charset in hexadecimal,
#   YYYY is the corresponding Unicode character code in hexadecimal.
# Arguments are:
#   $1: source map file
#   $2: address pattern for sed (optionally with substitution command)
#   $3: format of source map file
#       GLIBC-1 GLIBC-2 GLIBC-2-7 CZYBORRA IANA UNICODE UNICODE2 YASUOKA
#       MICROSOFT KANJI-DATABASE
#   $4: awk script (optional; the output is left unfiltered if omitted)
  35
  36 BASE=`basename $1`
  37
  38 case "$3" in
  39     GLIBC*)
  40         SOURCE="glibc-2.3.2/localedata/charmaps/${BASE}";;
  41     CZYBORRA)
  42         SOURCE="http://czyborra.com/charsets/${BASE}";;
  43     IANA)
  44         SOURCE="http://www.iana.org/assignments/charset-reg/${BASE}";;
  45     UNICODE)
  46         SOURCE="http://www.unicode.org/Public/MAPPINGS/.../${BASE}";;
  47     UNICODE2)
  48         SOURCE="http://www.unicode.org/Public/MAPPINGS/.../${BASE}";;
  49     YASUOKA)
  50         SOURCE="http://kanji.zinbun.kyoto-u.ac.jp/~yasuoka/.../${BASE}";;
  51     MICROSOFT)
  52         SOURCE="http://www.microsoft.com/globaldev/reference/oem/${BASE}";;
  53     KANJI-DATABASE)
  54         SOURCE="data at http://sourceforge.net/cvs/?group_id=26261";;
  55     *)
  56         echo "Unknown file type: $3";
  57         exit 1;;
  58 esac
  59
  60 echo "# Generated from $SOURCE"
  61
  62 if [ -n "$4" ] ; then
  63     if [ -f "$4" ] ; then
  64         AWKPROG="gawk -f $4"
  65     else
  66         echo "Awk program does not exist: $4"
  67         exit 1
  68     fi
  69 else
  70     AWKPROG=cat
  71 fi
  72
  73 if [ "$3" == "GLIBC-1" ] ; then
  74     # Source format is:
  75     #   <UYYYY> /xXX
  76     sed -n -e "$2 p" < $1 \
  77         | sed -e 's,<U\([^>]*\)>[       ]*/x\(..\).*,0x\2 0x\1,' \
  78         | sort | ${AWKPROG}
  79 elif [ "$3" == "GLIBC-2" ] ; then
  80     # Source format is:
  81     #   <UYYYY> /xXX/xZZ
  82     sed -n -e "$2 p" < $1 \
  83         | sed -e 's,<U\([^>]*\)>[       ]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
  84         | sort | ${AWKPROG}
  85 elif [ "$3" == "GLIBC-2-7" ] ; then
  86     # Source format is:
  87     #   <UYYYY> /xXX/xZZ
  88     # We must drop MSBs of XX and ZZ
  89     sed -n -e "$2 p" < $1 \
  90         | sed -e 's/xa/x2/g' -e 's/xb/x3/g' -e 's/xc/x4/g' \
  91               -e 's/xd/x5/g' -e 's/xe/x6/g' -e 's/xf/x7/g' \
  92               -e 's,<U\([^>]*\)>[       ]*/x\(..\)/x\(..\).*,0x\2\3 0x\1,' \
  93         | tee temp \
  94         | sort | ${AWKPROG}
  95 elif [ "$3" == "CZYBORRA" ] ; then
  96     # Source format is:
  97     #   =XX     U+YYYY
  98     zcat $1 | sed -n -e "$2 p" \
  99         | sed -e 's/=\(..\)[^U]*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
 100         | sort | ${AWKPROG}
 101 elif [ "$3" == "IANA" ] ; then
 102     # Source format is:
 103     #   0xXX    0xYYYY
 104     sed -n -e "$2 p" < $1 \
 105         | sed -e 's/\(0x[0-9A-Fa-f]*\)[^0]*\(0x[0-9A-Fa-f]*\).*/\1 \2/' \
 106         | sort | ${AWKPROG}
 107 elif [ "$3" == "UNICODE" ] ; then
 108     # Source format is:
 109     #   YYYY    XX
 110     sed -n -e "$2 p" < $1 \
 111         | sed -e 's/\([0-9A-F]*\)[^0-9A-F]*\([0-9A-F]*\).*/0x\2 0x\1/' \
 112         | sort | ${AWKPROG}
 113 elif [ "$3" == "UNICODE2" ] ; then
 114     # Source format is:
 115     #   0xXXXX  0xYYYY  # ...
 116     sed -n -e "$2 p" < $1 \
 117         | sed -e 's/\([0-9A-Fx]*\)[^0]*\([0-9A-Fx]*\).*/\1 \2/' \
 118         | ${AWKPROG} | sort -n -k 4,4
 119 elif [ "$3" == "YASUOKA" ] ; then
 120     # Source format is:
 121     # YYYY      0-XXXX (XXXX is a Kuten code)
 122     sed -n -e "$2 p" < $1 \
 123         | sed -e 's/\([0-9A-F]*\)[^0]*0-\([0-9]*\).*/0x\2 0x\1/' \
 124         | sort | ${AWKPROG}
 125 elif [ "$3" == "MICROSOFT" ] ; then
 126     # Source format is:
 127     # XX = U+YYYY
 128     sed -n -e "$2 p" < $1 \
 129         | sed -e 's/\([0-9A-F]*\).*U+\([0-9A-F]*\).*/0x\1 0x\2/' \
 130         | sort | ${AWKPROG}
 131 elif [ "$3" == "KANJI-DATABASE" ] ; then
 132     # Source format is:
 133     # C?-XXXX U+YYYYY .....
 134     sed -n -e "$2 p" < $1 \
 135         | sed -e 's/...\(....\) U+\([0-9A-F]*\).*/0x\1 0x\2/' \
 136         | sort | ${AWKPROG}
 137 else
 138     echo "Invalid arguments"
 139     exit 1
 140 fi
 141
 142 # arch-tag: c33acb47-7eb6-4872-b871-15e1447e8f0e