lisp/international/mule-conf.el

   1 ;;; mule-conf.el --- configure multilingual environment
   2
   3 ;; Copyright (C) 1997-2014 Free Software Foundation, Inc.
   4 ;; Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   5 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
   6 ;;   Registration Number H14PRO021
   7 ;; Copyright (C) 2003
   8 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
   9 ;;   Registration Number H13PRO009
  10
  11 ;; Keywords: i18n, mule, multilingual, character set, coding system
  12
  13 ;; This file is part of GNU Emacs.
  14
  15 ;; GNU Emacs is free software: you can redistribute it and/or modify
  16 ;; it under the terms of the GNU General Public License as published by
  17 ;; the Free Software Foundation, either version 3 of the License, or
  18 ;; (at your option) any later version.
  19
  20 ;; GNU Emacs is distributed in the hope that it will be useful,
  21 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  22 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  23 ;; GNU General Public License for more details.
  24
  25 ;; You should have received a copy of the GNU General Public License
  26 ;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
  27
  28 ;;; Commentary:
  29
  30 ;; This file defines the Emacs charsets and some basic coding systems.
  31 ;; Other coding systems are defined in the files in directory
  32 ;; lisp/language.
  33
  34 ;;; Code:
  35
  36 ;;; Remarks
  37
  38 ;; The ISO-IR registry is at http://www.itscj.ipsj.or.jp/ISO-IR/.
  39 ;; Standards docs equivalent to iso-2022 and iso-8859 are at
  40 ;; http://www.ecma.ch/.
  41
  42 ;; FWIW, http://www.microsoft.com/globaldev/ lists the following for
  43 ;; MS Windows, which are presumably the only charsets we really need
  44 ;; to worry about on such systems:
  45 ;; `OEM codepages': 437, 720, 737, 775, 850, 852, 855, 857, 858, 862, 866
  46 ;; `Windows codepages': 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257,
  47 ;;                      1258, 874, 932, 936, 949, 950
  48
  49 ;;; Definitions of character sets.
  50
  51 ;; The charsets `ascii', `unicode', `emacs' and `eight-bit' are already
  52 ;; defined in charset.c as below:
  53 ;;
  54 ;; (define-charset 'ascii
  55 ;;   ""
  56 ;;   :dimension 1
  57 ;;   :code-space [0 127]
  58 ;;   :iso-final-char ?B
  59 ;;   :ascii-compatible-p t
  60 ;;   :emacs-mule-id 0
  61 ;;   :code-offset 0)
  62 ;;
  63 ;; (define-charset 'unicode
  64 ;;   ""
  65 ;;   :dimension 3
  66 ;;   :code-space [0 255 0 255 0 16]
  67 ;;   :ascii-compatible-p t
  68 ;;   :code-offset 0)
  69 ;;
  70 ;; (define-charset 'emacs
  71 ;;   ""
  72 ;;   :dimension 3
  73 ;;   :code-space [0 255 0 255 0 63]
  74 ;;   :ascii-compatible-p t
  75 ;;   :supplementary-p t
  76 ;;   :code-offset 0)
  77 ;;
  78 ;; (define-charset 'eight-bit
  79 ;;   ""
  80 ;;   :dimension 1
  81 ;;   :code-space [128 255]
  82 ;;   :code-offset #x3FFF80)
  83 ;;
  84 ;; We now set :docstring, :short-name, and :long-name properties.
  85
  86 (put-charset-property
  87  'ascii :docstring "ASCII (ISO646 IRV)")
  88 (put-charset-property
  89  'ascii :short-name "ASCII")
  90 (put-charset-property
  91  'ascii :long-name "ASCII (ISO646 IRV)")
  92 (put-charset-property
  93  'iso-8859-1 :docstring "Latin-1 (ISO/IEC 8859-1)")
  94 (put-charset-property
  95  'iso-8859-1 :short-name "Latin-1")
  96 (put-charset-property
  97  'iso-8859-1 :long-name "Latin-1")
  98 (put-charset-property
  99  'unicode :docstring "Unicode (ISO10646)")
 100 (put-charset-property
 101  'unicode :short-name "Unicode")
 102 (put-charset-property
 103  'unicode :long-name "Unicode (ISO10646)")
 104 (put-charset-property
 105  'emacs :docstring "Full Emacs charset (excluding eight bit chars)")
 106 (put-charset-property
 107  'emacs :short-name "Emacs")
 108 (put-charset-property
 109  'emacs :long-name "Emacs")
 110
 111 (put-charset-property 'eight-bit :docstring "Raw bytes 128-255")
 112 (put-charset-property 'eight-bit :short-name "Raw bytes")
 113
 114 (define-charset-alias 'ucs 'unicode)
 115
 116 (define-charset 'latin-iso8859-1
 117   "Right-Hand Part of ISO/IEC 8859/1 (Latin-1): ISO-IR-100"
 118   :short-name "RHP of Latin-1"
 119   :long-name "RHP of ISO/IEC 8859/1 (Latin-1): ISO-IR-100"
 120   :iso-final-char ?A
 121   :emacs-mule-id 129
 122   :code-space [32 127]
 123   :code-offset 160)
 124
 125 ;; Name perhaps not ideal, but is XEmacs-compatible.
 126 (define-charset 'control-1
 127   "8-bit control code (0x80..0x9F)"
 128   :short-name "8-bit control code"
 129   :code-space [128 159]
 130   :code-offset 128)
 131
 132 (define-charset 'eight-bit-control
 133   "Raw bytes in the range 0x80..0x9F (usually produced from invalid encodings)"
 134   :short-name "Raw bytes 0x80..0x9F"
 135   :supplementary-p t
 136   :code-space [128 159]
 137   :code-offset #x3FFF80)                ; see character.h
 138
 139 (define-charset 'eight-bit-graphic
 140   "Raw bytes in the range 0xA0..0xFF (usually produced from invalid encodings)"
 141   :short-name "Raw bytes 0xA0..0xFF"
 142   :supplementary-p t
 143   :code-space [160 255]
 144   :code-offset #x3FFFA0)                ; see character.h
 145
 146 (defmacro define-iso-single-byte-charset (symbol iso-symbol name nickname
 147                                                  iso-ir iso-final
 148                                                  emacs-mule-id map)
       ;; Expand to the definition of one 8-bit charset and, optionally, of
       ;; its 7-bit "right-hand part".  Every argument is spliced into the
       ;; expansion and evaluated there, which is why the callers below
       ;; quote the charset symbols.
       ;;   SYMBOL      charset with code space 0..255, given MAP as its :map.
       ;;   ISO-SYMBOL  if non-nil, a second charset with code space 32..127,
       ;;               defined as codes 160..255 of SYMBOL shifted by -128.
       ;;   NAME and NICKNAME  build the docstrings and short/long names.
       ;;   ISO-IR      if non-nil, a number appended as "ISO-IR-%d".
       ;;   ISO-FINAL and EMACS-MULE-ID  are passed on to ISO-SYMBOL only.
 149   `(progn
 150      (define-charset ,symbol
 151        ,name
 152        :short-name ,nickname
 153        :long-name ,name
 154        :ascii-compatible-p t
 155        :code-space [0 255]
 156        :map ,map)
          ;; The right-hand part is only defined when ISO-SYMBOL is non-nil.
 157      (if ,iso-symbol
 158          (define-charset ,iso-symbol
                ;; Docstring: mention the ISO-IR number only when one is given.
 159            (if ,iso-ir
 160                (format "Right-Hand Part of %s (%s): ISO-IR-%d"
 161                        ,name ,nickname ,iso-ir)
 162              (format "Right-Hand Part of %s (%s)" ,name ,nickname))
 163            :short-name (format "RHP of %s" ,name)
 164            :long-name (format "RHP of %s (%s)" ,name ,nickname)
 165            :iso-final-char ,iso-final
 166            :emacs-mule-id ,emacs-mule-id
 167            :code-space [32 127]
 168            :subset (list ,symbol 160 255 -128)))))
 169
 170 (define-iso-single-byte-charset 'iso-8859-2 'latin-iso8859-2
 171   "ISO/IEC 8859/2" "Latin-2" 101 ?B 130 "8859-2")
 172
 173 (define-iso-single-byte-charset 'iso-8859-3 'latin-iso8859-3
 174   "ISO/IEC 8859/3" "Latin-3" 109 ?C 131 "8859-3")
 175
 176 (define-iso-single-byte-charset 'iso-8859-4 'latin-iso8859-4
 177   "ISO/IEC 8859/4" "Latin-4" 110 ?D 132 "8859-4")
 178
 179 (define-iso-single-byte-charset 'iso-8859-5 'cyrillic-iso8859-5
 180   "ISO/IEC 8859/5" "Latin/Cyrillic" 144 ?L 140 "8859-5")
 181
 182 (define-iso-single-byte-charset 'iso-8859-6 'arabic-iso8859-6
 183   "ISO/IEC 8859/6" "Latin/Arabic" 127 ?G 135 "8859-6")
 184
 185 (define-iso-single-byte-charset 'iso-8859-7 'greek-iso8859-7
 186   "ISO/IEC 8859/7" "Latin/Greek" 126 ?F 134 "8859-7")
 187
 188 (define-iso-single-byte-charset 'iso-8859-8 'hebrew-iso8859-8
 189   "ISO/IEC 8859/8" "Latin/Hebrew" 138 ?H 136 "8859-8")
 190
 191 (define-iso-single-byte-charset 'iso-8859-9 'latin-iso8859-9
 192   "ISO/IEC 8859/9" "Latin-5" 148 ?M 141 "8859-9")
 193
 194 (define-iso-single-byte-charset 'iso-8859-10 'latin-iso8859-10
 195   "ISO/IEC 8859/10" "Latin-6" 157 ?V nil "8859-10")
 196
 197 ;; http://www.nectec.or.th/it-standards/iso8859-11/
 198 ;; http://www.cwi.nl/~dik/english/codes/8859.html says this is tis-620
 199 ;; plus nbsp
 200 (define-iso-single-byte-charset 'iso-8859-11 'thai-iso8859-11
 201   "ISO/IEC 8859/11" "Latin/Thai" 166 ?T nil "8859-11")
 202
 203 ;; 8859-12 doesn't (yet?) exist.
 204
 205 (define-iso-single-byte-charset 'iso-8859-13 'latin-iso8859-13
 206   "ISO/IEC 8859/13" "Latin-7" 179 ?Y nil "8859-13")
 207
 208 (define-iso-single-byte-charset 'iso-8859-14 'latin-iso8859-14
 209   "ISO/IEC 8859/14" "Latin-8" 199 ?_ 143 "8859-14")
 210
 211 (define-iso-single-byte-charset 'iso-8859-15 'latin-iso8859-15
 212   "ISO/IEC 8859/15" "Latin-9" 203 ?b 142 "8859-15")
 213
 214 (define-iso-single-byte-charset 'iso-8859-16 'latin-iso8859-16
 215   "ISO/IEC 8859/16" "Latin-10" 226 ?f nil "8859-16")
 216
 217 ;; No point in keeping it around.
 218 (fmakunbound 'define-iso-single-byte-charset)
 219
 220 ;; Can this be shared with 8859-11?
 221 ;; N.b. not all of these are defined in Unicode.
 222 (define-charset 'thai-tis620
 223   "TIS620.2533"
 224   :short-name "TIS620.2533"
 225   :iso-final-char ?T
 226   :emacs-mule-id 133
 227   :code-space [32 127]
 228   :code-offset #x0E00)
 229
 230 ;; Fixme: document this; cf. `thai-tis620' above.
 231 (define-charset 'tis620-2533
 232   "TIS620.2533"
 233   :short-name "TIS620.2533"
 234   :ascii-compatible-p t
 235   :code-space [0 255]
 236   :superset '(ascii eight-bit-control (thai-tis620 . 128)))
 237
 238 (define-charset 'jisx0201
 239   "JISX0201"
 240   :short-name "JISX0201"
 241   :code-space [0 #xDF]
 242   :map "JISX0201")
 243
 244 (define-charset 'latin-jisx0201
       ;; Defined as a slice of `jisx0201': parent codes 33..126 are reused
       ;; with offset 0, i.e. at the same code points.
 245   "Roman Part of JISX0201.1976"
 246   :short-name "JISX0201 Roman"
 247   :long-name "Japanese Roman (JISX0201.1976)"
 248   :iso-final-char ?J
 249   :emacs-mule-id  138
 250   :supplementary-p t
 251   :code-space [33 126]
 252   :subset '(jisx0201 33 126 0))
 253
 254 (define-charset 'katakana-jisx0201
       ;; Defined as a slice of `jisx0201': parent codes 161..254 are shifted
       ;; by -128 onto this charset's code space 33..126.
       ;; NOTE(review): `jisx0201' declares :code-space [0 #xDF], so parent
       ;; codes above #xDF presumably have no mapping -- confirm.
 255   "Katakana Part of JISX0201.1976"
 256   :short-name "JISX0201 Katakana"
 257   :long-name "Japanese Katakana (JISX0201.1976)"
 258   :iso-final-char ?I
 259   :emacs-mule-id  137
 260   :supplementary-p t
 261   :code-space [33 126]
 262   :subset '(jisx0201 161 254 -128))
 263
 264 (define-charset 'chinese-gb2312
 265   "GB2312 Chinese simplified: ISO-IR-58"
 266   :short-name "GB2312"
 267   :long-name "GB2312: ISO-IR-58"
 268   :iso-final-char ?A
 269   :emacs-mule-id 145
 270   :code-space [33 126 33 126]
 271   :code-offset #x110000
 272   :unify-map "GB2312")
 273
 274 (define-charset 'chinese-gbk
 275   "GBK Chinese simplified."
 276   :short-name "GBK"
 277   :code-space [#x40 #xFE #x81 #xFE]
 278   :code-offset #x160000
 279   :unify-map "GBK")
 280 (define-charset-alias 'cp936 'chinese-gbk)
 281 (define-charset-alias 'windows-936 'chinese-gbk)
 282
 283 (define-charset 'chinese-cns11643-1
 284   "CNS11643 Plane 1 Chinese traditional: ISO-IR-171"
 285   :short-name "CNS11643-1"
 286   :long-name "CNS11643-1 (Chinese traditional): ISO-IR-171"
 287   :iso-final-char ?G
 288   :emacs-mule-id  149
 289   :code-space [33 126 33 126]
 290   :code-offset #x114000
 291   :unify-map "CNS-1")
 292
 293 (define-charset 'chinese-cns11643-2
 294   "CNS11643 Plane 2 Chinese traditional: ISO-IR-172"
 295   :short-name "CNS11643-2"
 296   :long-name "CNS11643-2 (Chinese traditional): ISO-IR-172"
 297   :iso-final-char ?H
 298   :emacs-mule-id  150
 299   :code-space [33 126 33 126]
 300   :code-offset #x118000
 301   :unify-map "CNS-2")
 302
 303 (define-charset 'chinese-cns11643-3
 304   "CNS11643 Plane 3 Chinese Traditional: ISO-IR-183"
 305   :short-name  "CNS11643-3"
 306   :long-name "CNS11643-3 (Chinese traditional): ISO-IR-183"
 307   :iso-final-char ?I
 308   :code-space [33 126 33 126]
 309   :emacs-mule-id  246
 310   :code-offset #x11C000
 311   :unify-map "CNS-3")
 312
 313 (define-charset 'chinese-cns11643-4
 314   "CNS11643 Plane 4 Chinese Traditional: ISO-IR-184"
 315   :short-name  "CNS11643-4"
 316   :long-name "CNS11643-4 (Chinese traditional): ISO-IR-184"
 317   :iso-final-char ?J
 318   :emacs-mule-id  247
 319   :code-space [33 126 33 126]
 320   :code-offset #x120000
 321   :unify-map "CNS-4")
 322
 323 (define-charset 'chinese-cns11643-5
 324   "CNS11643 Plane 5 Chinese Traditional: ISO-IR-185"
 325   :short-name  "CNS11643-5"
 326   :long-name "CNS11643-5 (Chinese traditional): ISO-IR-185"
 327   :iso-final-char ?K
 328   :emacs-mule-id  248
 329   :code-space [33 126 33 126]
 330   :code-offset #x124000
 331   :unify-map "CNS-5")
 332
 333 (define-charset 'chinese-cns11643-6
 334   "CNS11643 Plane 6 Chinese Traditional: ISO-IR-186"
 335   :short-name  "CNS11643-6"
 336   :long-name "CNS11643-6 (Chinese traditional): ISO-IR-186"
 337   :iso-final-char ?L
 338   :emacs-mule-id 249
 339   :code-space [33 126 33 126]
 340   :code-offset #x128000
 341   :unify-map "CNS-6")
 342
 343 (define-charset 'chinese-cns11643-7
 344   "CNS11643 Plane 7 Chinese Traditional: ISO-IR-187"
 345   :short-name  "CNS11643-7"
 346   :long-name "CNS11643-7 (Chinese traditional): ISO-IR-187"
 347   :iso-final-char ?M
 348   :emacs-mule-id 250
 349   :code-space [33 126 33 126]
 350   :code-offset #x12C000
 351   :unify-map "CNS-7")
 352
 353 (define-charset 'big5
 354   "Big5 (Chinese traditional)"
 355   :short-name "Big5"
 356   :code-space [#x40 #xFE #xA1 #xFE]
 357   :code-offset #x130000
 358   :unify-map "BIG5")
 359 ;; Fixme: AKA cp950 according to
 360 ;; <URL:http://www.microsoft.com/globaldev/reference/WinCP.asp>.  Is
 361 ;; that correct?
 362
 363 (define-charset 'chinese-big5-1
       ;; Level-1 half of Big5, repacked into a 94x94 code space.
 364   "Frequently used part (A141-C67E) of Big5 (Chinese traditional)"
 365   :short-name "Big5 (Level-1)"
       ;; The range must agree with the docstring; #x7F is not a valid Big5
       ;; trail byte, so the upper bound is C67E.
 366   :long-name "Big5 (Level-1) A141-C67E"
 367   :iso-final-char ?0
 368   :emacs-mule-id 152
 369   :supplementary-p t
 370   :code-space [#x21 #x7E #x21 #x7E]
 371   :code-offset #x135000
 372   :unify-map "BIG5-1")
 373
 374 (define-charset 'chinese-big5-2
 375   "Less frequently used part (C940-FEFE) of Big5 (Chinese traditional)"
 376   :short-name "Big5 (Level-2)"
 377   :long-name "Big5 (Level-2) C940-FEFE"
 378   :iso-final-char ?1
 379   :emacs-mule-id  153
 380   :supplementary-p t
 381   :code-space [#x21 #x7E #x21 #x7E]
 382   :code-offset #x137800
 383   :unify-map "BIG5-2")
 384
 385 (define-charset 'japanese-jisx0208
 386   "JISX0208.1983/1990 Japanese Kanji: ISO-IR-87"
 387   :short-name "JISX0208"
 388   :long-name "JISX0208.1983/1990 (Japanese): ISO-IR-87"
 389   :iso-final-char ?B
 390   :emacs-mule-id 146
 391   :code-space [33 126 33 126]
 392   :code-offset #x140000
 393   :unify-map "JISX0208")
 394
 395 (define-charset 'japanese-jisx0208-1978
 396   "JISX0208.1978 Japanese Kanji (so called \"old JIS\"): ISO-IR-42"
 397   :short-name "JISX0208.1978"
 398   :long-name  "JISX0208.1978 (JISC6226.1978): ISO-IR-42"
 399   :iso-final-char ?@
 400   :emacs-mule-id  144
 401   :code-space [33 126 33 126]
 402   :code-offset #x144000
 403   :unify-map "JISC6226")
 404
 405 (define-charset 'japanese-jisx0212
 406   "JISX0212 Japanese supplement: ISO-IR-159"
 407   :short-name "JISX0212"
 408   :long-name "JISX0212 (Japanese): ISO-IR-159"
 409   :iso-final-char ?D
 410   :emacs-mule-id 148
 411   :code-space [33 126 33 126]
 412   :code-offset #x148000
 413   :unify-map "JISX0212")
 414
 415 ;; Note that jisx0213 contains characters not in Unicode (3.2?).  It's
 416 ;; arguable whether it should have a unify-map.
 417 (define-charset 'japanese-jisx0213-1
 418   "JISX0213.2000 Plane 1 (Japanese)"
 419   :short-name "JISX0213-1"
 420   :iso-final-char ?O
 421   :emacs-mule-id  151
 422   :unify-map "JISX2131"
 423   :code-space [33 126 33 126]
 424   :code-offset #x14C000)
 425
 426 (define-charset 'japanese-jisx0213-2
 427   "JISX0213.2000 Plane 2 (Japanese)"
 428   :short-name "JISX0213-2"
 429   :iso-final-char ?P
 430   :emacs-mule-id 254
 431   :unify-map "JISX2132"
 432   :code-space [33 126 33 126]
 433   :code-offset #x150000)
 434
 435 (define-charset 'japanese-jisx0213-a
 436   "JISX0213.2004 adds these characters to JISX0213.2000."
 437   :short-name "JISX0213A"
 438   :dimension 2
 439   :code-space [33 126 33 126]
 440   :supplementary-p t
 441   :map "JISX213A")
 442
 443 (define-charset 'japanese-jisx0213.2004-1
 444   "JISX0213.2004 Plane1 (Japanese)"
 445   :short-name "JISX0213.2004-1"
 446   :dimension 2
 447   :code-space [33 126 33 126]
 448   :iso-final-char ?Q
 449   :superset '(japanese-jisx0213-a japanese-jisx0213-1))
 450
 451 (define-charset 'katakana-sjis
 452   "Katakana part of Shift-JIS"
 453   :dimension 1
 454   :code-space [#xA1 #xDF]
 455   :subset '(jisx0201 #xA1 #xDF 0)
 456   :supplementary-p t)
 457
 458 (define-charset 'cp932-2-byte
 459   "2-byte part of CP932"
 460   :dimension 2
 461   :map "CP932-2BYTE"
 462   :code-space [#x40 #xFC #x81 #xFC]
 463   :supplementary-p t)
 464
 465 (define-charset 'cp932
 466   "CP932 (Microsoft shift-jis)"
 467   :code-space [#x00 #xFF #x00 #xFE]
 468   :short-name "CP932"
 469   :superset '(ascii katakana-sjis cp932-2-byte))
 470
 471 (define-charset 'korean-ksc5601
 472   "KSC5601 Korean Hangul and Hanja: ISO-IR-149"
 473   :short-name "KSC5601"
 474   :long-name "KSC5601 (Korean): ISO-IR-149"
 475   :iso-final-char ?C
 476   :emacs-mule-id 147
 477   :code-space [33 126 33 126]
 478   :code-offset #x279f94                 ; ... #x27c217
 479   :unify-map "KSC5601")
 480
 481 (define-charset 'big5-hkscs
       ;; Same code space as `big5', but with its own code-offset range and
       ;; unify map.
 482   "Big5-HKSCS (Chinese traditional, Hong Kong supplement)"
 483   :short-name "Big5-HKSCS"              ; distinct from `big5', whose short name is "Big5"
 484   :code-space [#x40 #xFE #xA1 #xFE]
 485   :code-offset #x27c218                 ; ... #x280839
 486   :unify-map "BIG5-HKSCS")
 487
 488 (define-charset 'cp949-2-byte
 489   "2-byte part of CP949"
 490   :dimension 2
 491   :map "CP949-2BYTE"
 492   :code-space [#x41 #xFE #x81 #xFD]
 493   :supplementary-p t)
 494
 495 (define-charset 'cp949
 496   "CP949 (Korean)"
 497   :short-name "CP949"
 498   :long-name  "CP949 (Korean)"
 499   :code-space [#x00 #xFE #x00 #xFD]
 500   :superset '(ascii cp949-2-byte))
 501
 502 (define-charset 'chinese-sisheng
 503   "SiSheng characters for PinYin/ZhuYin"
 504   :short-name "SiSheng"
 505   :long-name "SiSheng (PinYin/ZhuYin)"
 506   :iso-final-char ?0
 507   :emacs-mule-id 160
 508   :code-space [33 126]
 509   :unify-map "MULE-sisheng"
 510   :supplementary-p t
 511   :code-offset #x200000)
 512
 513 ;; A subset of the 1989 version of IPA.  It consists of the consonant
 514 ;; signs used in English, French, German and Italian, and all vowel
 515 ;; signs in the table.  [says old MULE doc]
 516 (define-charset 'ipa
 517   "IPA (International Phonetic Association)"
 518   :short-name "IPA"
 519   :iso-final-char ?0
 520   :emacs-mule-id  161
 521   :unify-map "MULE-ipa"
 522   :code-space [32 127]
 523   :supplementary-p t
 524   :code-offset #x200080)
 525
 526 (define-charset 'viscii
 527   "VISCII1.1"
 528   :short-name "VISCII"
 529   :long-name "VISCII 1.1"
 530   :code-space [0 255]
 531   :map "VISCII")
 532
 533 (define-charset 'vietnamese-viscii-lower
 534   "VISCII1.1 lower-case"
 535   :short-name "VISCII lower"
 536   :long-name "VISCII lower-case"
 537   :iso-final-char ?1
 538   :emacs-mule-id  162
 539   :code-space [32 127]
 540   :code-offset #x200200
 541   :supplementary-p t
 542   :unify-map "MULE-lviscii")
 543
 544 (define-charset 'vietnamese-viscii-upper
 545   "VISCII1.1 upper-case"
 546   :short-name "VISCII upper"
 547   :long-name "VISCII upper-case"
 548   :iso-final-char ?2
 549   :emacs-mule-id  163
 550   :code-space [32 127]
 551   :code-offset #x200280
 552   :supplementary-p t
 553   :unify-map "MULE-uviscii")
 554
 555 (define-charset 'vscii
 556   "VSCII1.1 (TCVN-5712 VN1)"
 557   :short-name "VSCII"
 558   :code-space [0 255]
 559   :map "VSCII")
 560
 561 (define-charset-alias 'tcvn-5712 'vscii)
 562
 563 ;; Fixme: see note in tcvn.map about combining characters
 564 (define-charset 'vscii-2
 565   "VSCII-2 (TCVN-5712 VN2)"
 566   :code-space [0 255]
 567   :map "VSCII-2")
 568
 569 (define-charset 'koi8-r
 570   "KOI8-R"
 571   :short-name "KOI8-R"
 572   :ascii-compatible-p t
 573   :code-space [0 255]
 574   :map "KOI8-R")
 575
 576 (define-charset-alias 'koi8 'koi8-r)
 577
 578 (define-charset 'alternativnyj
 579   "ALTERNATIVNYJ"
 580   :short-name "alternativnyj"
 581   :ascii-compatible-p t
 582   :code-space [0 255]
 583   :map "ALTERNATIVNYJ")
 584
 585 (define-charset 'cp866
 586   "CP866"
 587   :short-name "cp866"
 588   :ascii-compatible-p t
 589   :code-space [0 255]
 590   :map "IBM866")
 591 (define-charset-alias 'ibm866 'cp866)
 592
 593 (define-charset 'koi8-u
 594   "KOI8-U"
 595   :short-name "KOI8-U"
 596   :ascii-compatible-p t
 597   :code-space [0 255]
 598   :map "KOI8-U")
 599
 600 (define-charset 'koi8-t
 601   "KOI8-T"
 602   :short-name "KOI8-T"
 603   :ascii-compatible-p t
 604   :code-space [0 255]
 605   :map "KOI8-T")
 606
 607 (define-charset 'georgian-ps
 608   "GEORGIAN-PS"
 609   :short-name "GEORGIAN-PS"
 610   :ascii-compatible-p t
 611   :code-space [0 255]
 612   :map "KA-PS")
 613
 614 (define-charset 'georgian-academy
 615   "GEORGIAN-ACADEMY"
 616   :short-name "GEORGIAN-ACADEMY"
 617   :ascii-compatible-p t
 618   :code-space [0 255]
 619   :map "KA-ACADEMY")
 620
 621 (define-charset 'windows-1250
 622   "WINDOWS-1250 (Central Europe)"
 623   :short-name "WINDOWS-1250"
 624   :ascii-compatible-p t
 625   :code-space [0 255]
 626   :map "CP1250")
 627 (define-charset-alias 'cp1250 'windows-1250)
 628
 629 (define-charset 'windows-1251
 630   "WINDOWS-1251 (Cyrillic)"
 631   :short-name "WINDOWS-1251"
 632   :ascii-compatible-p t
 633   :code-space [0 255]
 634   :map "CP1251")
 635 (define-charset-alias 'cp1251 'windows-1251)
 636
 637 (define-charset 'windows-1252
 638   "WINDOWS-1252 (Latin I)"
 639   :short-name "WINDOWS-1252"
 640   :ascii-compatible-p t
 641   :code-space [0 255]
 642   :map "CP1252")
 643 (define-charset-alias 'cp1252 'windows-1252)
 644
 645 (define-charset 'windows-1253
 646   "WINDOWS-1253 (Greek)"
 647   :short-name "WINDOWS-1253"
 648   :ascii-compatible-p t
 649   :code-space [0 255]
 650   :map "CP1253")
 651 (define-charset-alias 'cp1253 'windows-1253)
 652
 653 (define-charset 'windows-1254
 654   "WINDOWS-1254 (Turkish)"
 655   :short-name "WINDOWS-1254"
 656   :ascii-compatible-p t
 657   :code-space [0 255]
 658   :map "CP1254")
 659 (define-charset-alias 'cp1254 'windows-1254)
 660
 661 (define-charset 'windows-1255
 662   "WINDOWS-1255 (Hebrew)"
 663   :short-name "WINDOWS-1255"
 664   :ascii-compatible-p t
 665   :code-space [0 255]
 666   :map "CP1255")
 667 (define-charset-alias 'cp1255 'windows-1255)
 668
 669 (define-charset 'windows-1256
 670   "WINDOWS-1256 (Arabic)"
 671   :short-name "WINDOWS-1256"
 672   :ascii-compatible-p t
 673   :code-space [0 255]
 674   :map "CP1256")
 675 (define-charset-alias 'cp1256 'windows-1256)
 676
 677 (define-charset 'windows-1257
 678   "WINDOWS-1257 (Baltic)"
 679   :short-name "WINDOWS-1257"
 680   :ascii-compatible-p t
 681   :code-space [0 255]
 682   :map "CP1257")
 683 (define-charset-alias 'cp1257 'windows-1257)
 684
 685 (define-charset 'windows-1258
 686   "WINDOWS-1258 (Viet Nam)"
 687   :short-name "WINDOWS-1258"
 688   :ascii-compatible-p t
 689   :code-space [0 255]
 690   :map "CP1258")
 691 (define-charset-alias 'cp1258 'windows-1258)
 692
 693 (define-charset 'next
 694   "NEXT"
 695   :short-name "NEXT"
 696   :ascii-compatible-p t
 697   :code-space [0 255]
 698   :map "NEXTSTEP")
 699
 700 (define-charset 'cp1125
 701   "CP1125"
 702   :short-name "CP1125"
 703   :code-space [0 255]
 704   :ascii-compatible-p t
 705   :map "CP1125")
 706 (define-charset-alias 'ruscii 'cp1125)
 707 ;; Original name for cp1125, says Serhii Hlodin <hlodin@lutsk.bank.gov.ua>
 708 (define-charset-alias 'cp866u 'cp1125)
 709
 710 ;; Fixme: Cf. iconv; http://czyborra.com/charsets/codepages.html
 711 ;; shows this as not ASCII compatible, with various graphics in
 712 ;; 0x01-0x1F.
 713 (define-charset 'cp437
 714   "CP437 (MS-DOS United States, Australia, New Zealand, South Africa)"
 715   :short-name "CP437"
 716   :code-space [0 255]
 717   :ascii-compatible-p t
 718   :map "IBM437")
 719
 720 (define-charset 'cp720
 721   "CP720 (Arabic)"
 722   :short-name "CP720"
 723   :code-space [0 255]
 724   :ascii-compatible-p t
 725   :map "CP720")
 726
 727 (define-charset 'cp737
 728   "CP737 (PC Greek)"
 729   :short-name "CP737"
 730   :code-space [0 255]
 731   :ascii-compatible-p t
 732   :map "CP737")
 733
 734 (define-charset 'cp775
 735   "CP775 (PC Baltic)"
 736   :short-name "CP775"
 737   :code-space [0 255]
 738   :ascii-compatible-p t
 739   :map "CP775")
 740
 741 (define-charset 'cp851
 742   "CP851 (Greek)"
 743   :short-name "CP851"
 744   :code-space [0 255]
 745   :ascii-compatible-p t
 746   :map "IBM851")
 747
 748 (define-charset 'cp852
 749   "CP852 (MS-DOS Latin-2)"
 750   :short-name "CP852"
 751   :code-space [0 255]
 752   :ascii-compatible-p t
 753   :map "IBM852")
 754
 755 (define-charset 'cp855
 756   "CP855 (IBM Cyrillic)"
 757   :short-name "CP855"
 758   :code-space [0 255]
 759   :ascii-compatible-p t
 760   :map "IBM855")
 761
 762 (define-charset 'cp857
 763   "CP857 (IBM Turkish)"
 764   :short-name "CP857"
 765   :code-space [0 255]
 766   :ascii-compatible-p t
 767   :map "IBM857")
 768
 769 (define-charset 'cp858
 770   "CP858 (Multilingual Latin I + Euro)"
 771   :short-name "CP858"
 772   :code-space [0 255]
 773   :ascii-compatible-p t
 774   :map "CP858")
 775 (define-charset-alias 'cp00858 'cp858)  ; IANA has IBM00858/CP00858
 776
 777 (define-charset 'cp860
 778   "CP860 (MS-DOS Portuguese)"
 779   :short-name "CP860"
 780   :code-space [0 255]
 781   :ascii-compatible-p t
 782   :map "IBM860")
 783
 784 (define-charset 'cp861
 785   "CP861 (MS-DOS Icelandic)"
 786   :short-name "CP861"
 787   :code-space [0 255]
 788   :ascii-compatible-p t
 789   :map "IBM861")
 790
 791 (define-charset 'cp862
 792   "CP862 (PC Hebrew)"
 793   :short-name "CP862"
 794   :code-space [0 255]
 795   :ascii-compatible-p t
 796   :map "IBM862")
 797
 798 (define-charset 'cp863
 799   "CP863 (MS-DOS Canadian French)"
 800   :short-name "CP863"
 801   :code-space [0 255]
 802   :ascii-compatible-p t
 803   :map "IBM863")
 804
 805 (define-charset 'cp864
 806   "CP864 (PC Arabic)"
 807   :short-name "CP864"
 808   :code-space [0 255]
 809   :ascii-compatible-p t
 810   :map "IBM864")
 811
 812 (define-charset 'cp865
 813   "CP865 (MS-DOS Nordic)"
 814   :short-name "CP865"
 815   :code-space [0 255]
 816   :ascii-compatible-p t
 817   :map "IBM865")
 818
 819 (define-charset 'cp869
 820   "CP869 (IBM Modern Greek)"
 821   :short-name "CP869"
 822   :code-space [0 255]
 823   :ascii-compatible-p t
 824   :map "IBM869")
 825
 826 (define-charset 'cp874
 827   "CP874 (IBM Thai)"
 828   :short-name "CP874"
 829   :code-space [0 255]
 830   :ascii-compatible-p t
 831   :map "IBM874")
 832
 833 ;; For Arabic, we need three different types of character sets.
 834 ;; Digits are of direction left-to-right and of width 1-column.
 835 ;; Others are of direction right-to-left and of width 1-column or
 836 ;; 2-column.
 837 (define-charset 'arabic-digit
 838   "Arabic digit"
 839   :short-name "Arabic digit"
 840   :iso-final-char ?2
 841   :emacs-mule-id 164
 842   :supplementary-p t
 843   :code-space [34 42]
 844   :code-offset #x0600)
 845
 846 (define-charset 'arabic-1-column
 847   "Arabic 1-column"
 848   :short-name "Arabic 1-col"
 849   :long-name "Arabic 1-column"
 850   :iso-final-char ?3
 851   :emacs-mule-id 165
 852   :supplementary-p t
 853   :code-space [33 126]
 854   :code-offset #x200100)
 855
 856 (define-charset 'arabic-2-column
 857   "Arabic 2-column"
 858   :short-name "Arabic 2-col"
 859   :long-name "Arabic 2-column"
 860   :iso-final-char ?4
 861   :emacs-mule-id 224
 862   :supplementary-p t
 863   :code-space [33 126]
 864   :code-offset #x200180)
 865
 866 ;; Lao script.
 867 ;; Codes 0x21..0x7E are mapped to Unicode U+0E81..U+0EDF.
 868 ;; Not all of them are defined in Unicode.
 869 (define-charset 'lao
 870   "Lao characters (ISO10646 0E81..0EDF)"
 871   :short-name "Lao"
 872   :iso-final-char ?1
 873   :emacs-mule-id 167
 874   :supplementary-p t
 875   :code-space [33 126]
 876   :code-offset #x0E81)
 877
 878 (define-charset 'mule-lao
       ;; 8-bit composite charset: `ascii' and `eight-bit-control' are taken
       ;; as they are, and `lao' is included with an offset of 128.
       ;; The docstring and short name are the same as those of `lao'.
 879   "Lao characters (ISO10646 0E81..0EDF)"
 880   :short-name "Lao"
 881   :code-space [0 255]
 882   :supplementary-p t
 883   :superset '(ascii eight-bit-control (lao . 128)))
 884
 885
 886 ;; Indian scripts.  Symbolic charset for data exchange.  Glyphs are
 887 ;; not assigned.  They are automatically converted to each Indian
 888 ;; script which IS-13194 supports.
 889
 890 (define-charset 'indian-is13194
 891   "7-bit representation of IS 13194 (ISCII) for Devanagari"
 892   :short-name "IS 13194 (DEV)"
 893   :long-name "Indian IS 13194 (DEV)"
 894   :iso-final-char ?5
 895   :emacs-mule-id 225
 896   :supplementary-p t
 897   :code-space [33 126]
 898   :code-offset #x180000
 899   :unify-map "MULE-is13194")
 900
 901 (let ((code-offset #x180100))
       ;; CODE-OFFSET is a running counter shared by both loops: every charset
       ;; defined here has code space 0..255 and the counter advances by #x100
       ;; after each one, so the AKRUTI charsets continue directly after the
       ;; eleven CDAC ones (i.e. from #x180C00).

       ;; One charset named SCRIPT-cdac per script.
 902   (dolist (script '(devanagari sanskrit bengali tamil telugu assamese
 903                                oriya kannada malayalam gujarati punjabi))
 904     (define-charset (intern (format "%s-cdac" script))
 905       (format "Glyphs of %s script for CDAC font.  Subset of `indian-glyph'."
 906               (capitalize (symbol-name script)))
 907       :short-name (format "CDAC %s glyphs" (capitalize (symbol-name script)))
 908       :supplementary-p t
 909       :code-space [0 255]
 910       :code-offset code-offset)
 911     (setq code-offset (+ code-offset #x100)))
 912
       ;; One charset named SCRIPT-akruti per script; note the script list
       ;; differs from the CDAC one (no sanskrit or assamese).
 913   (dolist (script '(devanagari bengali punjabi gujarati
 914                                oriya tamil telugu kannada malayalam))
 915     (define-charset (intern (format "%s-akruti" script))
 916       (format "Glyphs of %s script for AKRUTI font.  Subset of `indian-glyph'."
 917               (capitalize (symbol-name script)))
 918       :short-name (format "AKRUTI %s glyphs" (capitalize (symbol-name script)))
 919       :supplementary-p t
 920       :code-space [0 255]
 921       :code-offset code-offset)
 922     (setq code-offset (+ code-offset #x100))))
 923
 924 (define-charset 'indian-glyph
 925   "Glyphs for Indian characters."
 926   :short-name "Indian glyph"
 927   :iso-final-char ?4
 928   :emacs-mule-id 240
 929   :supplementary-p t
 930   :code-space [32 127 32 127]
 931   :code-offset #x180100)
 932
 933 ;; Actual Glyph for 1-column width.
 934 (define-charset 'indian-1-column
       ;; NOTE(review): :emacs-mule-id (251) and :code-offset (#x184000) are
       ;; the same as in `indian-2-column'; only :iso-final-char differs.
       ;; Presumably one of the two ids is a leftover -- confirm before
       ;; relying on emacs-mule decoding of this charset.
 935   "Indian charset for 1-column width glyphs."
 936   :short-name "Indian 1-col"
 937   :long-name "Indian 1 Column"
 938   :iso-final-char ?6
 939   :emacs-mule-id  251
 940   :supplementary-p t
 941   :code-space [33 126 33 126]
 942   :code-offset #x184000)
 943
 944 ;; Actual Glyph for 2-column width.
 945 (define-charset 'indian-2-column
       ;; NOTE(review): :emacs-mule-id (251) and :code-offset (#x184000) are
       ;; the same as in `indian-1-column'; only :iso-final-char differs.
       ;; Presumably the two charsets are meant to overlap -- confirm.
 946   "Indian charset for 2-column width glyphs."
 947   :short-name "Indian 2-col"
 948   :long-name "Indian 2 Column"
 949   :iso-final-char ?5
 950   :emacs-mule-id  251
 951   :supplementary-p t
 952   :code-space [33 126 33 126]
 953   :code-offset #x184000)
 954
 955 (define-charset 'tibetan
 956   "Tibetan characters"
 957   ;; The names mark this as the 2-column set; cf. `tibetan-1-column'.
 958   :short-name "Tibetan 2-col"
 959   :long-name "Tibetan 2 column"
 960   :iso-final-char ?7                    ; specified once only
 961   :emacs-mule-id 252
 962   :unify-map "MULE-tibetan"
 963   :supplementary-p t
 964   :code-space [33 126 33 37]
 965   :code-offset #x190000)
 966
 967 (define-charset 'tibetan-1-column
       ;; NOTE(review): :code-space and :code-offset are identical to those
       ;; of `tibetan', so both charsets cover the same characters;
       ;; presumably intentional -- confirm.
 968   "Tibetan 1 column glyph"
 969   :short-name "Tibetan 1-col"
 970   :long-name "Tibetan 1 column"
 971   :iso-final-char ?8
 972   :emacs-mule-id 241
 973   :supplementary-p t
 974   :code-space [33 126 33 37]
 975   :code-offset #x190000)
 976
 977 ;; Subsets of Unicode.
 978 (define-charset 'mule-unicode-2500-33ff
 979   "Unicode characters of the range U+2500..U+33FF."
 980   :short-name "Unicode subset 2"
 981   :long-name "Unicode subset (U+2500..U+33FF)"
 982   :iso-final-char ?2
 983   :emacs-mule-id 242
 984   :supplementary-p t
 985   :code-space [#x20 #x7f #x20 #x47]
 986   :code-offset #x2500)
 987
 988 (define-charset 'mule-unicode-e000-ffff
 989   "Unicode characters of the range U+E000..U+FFFF."
 990   :short-name "Unicode subset 3"
       ;; Range written as in the sibling `mule-unicode-*' charsets.
 991   :long-name "Unicode subset (U+E000..U+FFFF)"
 992   :iso-final-char ?3
 993   :emacs-mule-id 243
 994   :supplementary-p t
 995   :code-space [#x20 #x7F #x20 #x75]
 996   :code-offset #xE000
       ;; 30015 is #x753F, the code point that maps to U+FFFF; the rest of
       ;; the last row of the code space is left unused.
 997   :max-code 30015)                      ; U+FFFF
 998
 999 (define-charset 'mule-unicode-0100-24ff
1000   "Unicode characters of the range U+0100..U+24FF."
1001   :short-name "Unicode subset"
1002   :long-name "Unicode subset (U+0100..U+24FF)"
1003   :iso-final-char ?1
1004   :emacs-mule-id 244
1005   :supplementary-p t
1006   :code-space [#x20 #x7F #x20 #x7F]     ; 96 x 96 = #x2400 code points
1007   :code-offset #x100)                   ; #x100 + #x2400 - 1 = #x24FF
1008
1009 (define-charset 'unicode-bmp
1010   "Unicode Basic Multilingual Plane (U+0000..U+FFFF)"
1011   :short-name "Unicode BMP"
1012   :code-space [0 255 0 255]             ; 256 x 256 = #x10000 code points
1013   :code-offset 0)
1014
1015 (define-charset 'unicode-smp
1016   "Unicode Supplementary Multilingual Plane (U+10000..U+1FFFF)"
1017   :short-name "Unicode SMP"
1018   :code-space [0 255 0 255]             ; 256 x 256 = #x10000 code points
1019   :code-offset #x10000)
1020
1021 (define-charset 'unicode-sip
1022   "Unicode Supplementary Ideographic Plane (U+20000..U+2FFFF)"
1023   :short-name "Unicode SIP"
1024   :code-space [0 255 0 255]             ; 256 x 256 = #x10000 code points
1025   :code-offset #x20000)
1026
1027 (define-charset 'unicode-ssp
1028   "Unicode Supplementary Special-purpose Plane (U+E0000..U+EFFFF)"
1029   :short-name "Unicode SSP"
1030   :code-space [0 255 0 255]             ; 256 x 256 = #x10000 code points
1031   :code-offset #xE0000)
1032
1033 (define-charset 'ethiopic
1034   "Ethiopic characters for Amharic and Tigrigna."
1035   :short-name "Ethiopic"
1036   :long-name "Ethiopic characters"
1037   :iso-final-char ?3
1038   :emacs-mule-id  245
1039   :supplementary-p t
1040   :unify-map "MULE-ethiopic"            ; `unify-charset' is called on `ethiopic' later in this file
1041   :code-space [33 126 33 126]           ; two bytes, each 33..126 (94 x 94)
1042   :code-offset #x1A0000)
1043
1044 (define-charset 'mac-roman
1045   "Mac Roman charset"
1046   :short-name "Mac Roman"
1047   :ascii-compatible-p t
1048   :code-space [0 255]                   ; single byte, 0..255
1049   :map "MACINTOSH")                     ; table-driven rather than :code-offset
1050
1051 ;; Fixme: modern EBCDIC variants, e.g. IBM00924?
1052 (define-charset 'ebcdic-us
1053   "US version of EBCDIC"
1054   :short-name "EBCDIC-US"
1055   :code-space [0 255]                   ; single byte, 0..255
1056   :mime-charset 'ebcdic-us
1057   :map "EBCDICUS")
1058
1059 (define-charset 'ebcdic-uk
1060   "UK version of EBCDIC"
1061   :short-name "EBCDIC-UK"
1062   :code-space [0 255]                   ; single byte, 0..255
1063   :mime-charset 'ebcdic-uk
1064   :map "EBCDICUK")
1065
1066 (define-charset 'ibm1047
1067   ;; Says groff:
1068   "IBM1047, `EBCDIC Latin 1/Open Systems' used by OS/390 Unix."
1069   :short-name "IBM1047"
1070   :code-space [0 255]                   ; single byte, 0..255
1071   :mime-charset 'ibm1047
1072   :map "IBM1047")
1073 (define-charset-alias 'cp1047 'ibm1047) ; `cp1047' names the same charset
1074
1075 (define-charset 'hp-roman8
1076   "Encoding used by Hewlett-Packard printer software"
1077   :short-name "HP-ROMAN8"
1078   :ascii-compatible-p t
1079   :code-space [0 255]                   ; single byte, 0..255
1080   :map "HP-ROMAN8")
1081
1082 ;; To make a coding system with this, a pre-write-conversion should
1083 ;; account for the commented-out multi-valued code points in
1084 ;; stdenc.map.
1085 (define-charset 'adobe-standard-encoding
1086   "Adobe `standard encoding' used in PostScript"
1087   :short-name "ADOBE-STANDARD-ENCODING"
1088   :code-space [#x20 255]                ; single byte, #x20..255
1089   :map "stdenc")
1090
1091 (define-charset 'symbol
1092   "Adobe symbol encoding used in PostScript"
1093   :short-name "ADOBE-SYMBOL"
1094   :code-space [#x20 255]                ; single byte, #x20..255
1095   :map "symbol")
1096
1097 (define-charset 'ibm850
1098   "DOS codepage 850 (Latin-1)"
1099   :short-name "IBM850"
1100   :ascii-compatible-p t
1101   :code-space [0 255]                   ; single byte, 0..255
1102   :map "IBM850")
1103 (define-charset-alias 'cp850 'ibm850)   ; `cp850' names the same charset
1104
1105 (define-charset 'mik
1106   "Bulgarian DOS codepage"
1107   :short-name "MIK"
1108   :ascii-compatible-p t
1109   :code-space [0 255]                   ; single byte, 0..255
1110   :map "MIK")
1111
1112 (define-charset 'ptcp154
1113   "ParaType codepage (Asian Cyrillic)"
1114   :short-name "PT154"
1115   :ascii-compatible-p t
1116   :code-space [0 255]                   ; single byte, 0..255
1117   :mime-charset 'pt154
1118   :map "PTCP154")
1119 (define-charset-alias 'pt154 'ptcp154)  ; two further names for `ptcp154'
1120 (define-charset-alias 'cp154 'ptcp154)
1121
1122 (define-charset 'gb18030-2-byte
1123   "GB18030 2-byte (0x814E..0xFEFE)"
1124   :code-space [#x40 #xFE #x81 #xFE]     ; low byte #x40..#xFE, high byte #x81..#xFE
1125   :supplementary-p t
1126   :map "GB180302")
1127
1128 (define-charset 'gb18030-4-byte-bmp
1129   "GB18030 4-byte for BMP (0x81308130-0x8431A439)"
1130   :code-space [#x30 #x39 #x81 #xFE #x30 #x39 #x81 #x84] ; four bytes, lowest first
1131   :supplementary-p t
1132   :map "GB180304")
1133
1134 (define-charset 'gb18030-4-byte-smp
1135   "GB18030 4-byte for SMP (0x90308130-0xE3329A35)"
1136   :code-space [#x30 #x39 #x81 #xFE #x30 #x39 #x90 #xE3]
1137   :min-code '(#x9030 . #x8130)          ; (HIGH-16-BITS . LOW-16-BITS) of 0x90308130
1138   :max-code '(#xE332 . #x9A35)          ; likewise for 0xE3329A35
1139   :supplementary-p t
1140   :code-offset #x10000)
1141
1142 (define-charset 'gb18030-4-byte-ext-1
1143   "GB18030 4-byte (0x8431A530-0x8F39FE39)"
1144   :code-space [#x30 #x39 #x81 #xFE #x30 #x39 #x84 #x8F]
1145   :min-code '(#x8431 . #xA530)
1146   :max-code '(#x8F39 . #xFE39)
1147   :supplementary-p t
1148   :code-offset #x200000                 ; ... #x22484B
1149   )
1150
1151 (define-charset 'gb18030-4-byte-ext-2
1152   "GB18030 4-byte (0xE3329A36-0xFE39FE39)"
1153   :code-space [#x30 #x39 #x81 #xFE #x30 #x39 #xE3 #xFE]
1154   :min-code '(#xE332 . #x9A36)          ; starts right after the max code of `gb18030-4-byte-smp'
1155   :max-code '(#xFE39 . #xFE39)
1156   :supplementary-p t
1157   :code-offset #x22484C                 ; ... #x279f93
1158   )
1159
1160 (define-charset 'gb18030
1161   "GB18030"
1162   :code-space [#x00 #xFF #x00 #xFE #x00 #xFE #x00 #xFE]
1163   :min-code 0
1164   :max-code '(#xFE39 . #xFE39)
1165   :superset '(ascii gb18030-2-byte      ; `ascii' plus the five gb18030-* charsets defined above
1166                     gb18030-4-byte-bmp gb18030-4-byte-smp
1167                     gb18030-4-byte-ext-1 gb18030-4-byte-ext-2))
1168
1169 (define-charset 'chinese-cns11643-15
1170   "CNS11643 Plane 15 Chinese Traditional"
1171   :short-name  "CNS11643-15"
1172   :long-name "CNS11643-15 (Chinese traditional)"
1173   :code-space [33 126 33 126]           ; two bytes, each 33..126 (94 x 94)
1174   :code-offset #x27A000)
1175
1176 ;; Unify the charsets listed below, one call each, in the listed order.
1177 (dolist (cs '(chinese-gb2312
1178               chinese-gbk
1179               chinese-cns11643-1
1180               chinese-cns11643-2
1181               chinese-cns11643-3
1182               chinese-cns11643-4
1183               chinese-cns11643-5
1184               chinese-cns11643-6
1185               chinese-cns11643-7
1186               big5
1187               chinese-big5-1
1188               chinese-big5-2
1189               big5-hkscs
1190               korean-ksc5601
1191               vietnamese-viscii-lower vietnamese-viscii-upper
1192               chinese-sisheng
1193               ipa
1194               tibetan
1195               ethiopic
1196               indian-is13194
1197               japanese-jisx0208-1978 japanese-jisx0208
1198               japanese-jisx0212
1199               japanese-jisx0213-1 japanese-jisx0213-2))
1200   ;; Only the charset is passed; no optional arguments are supplied.
1201   (unify-charset cs))
1202
1203 \f
1204 ;; These are tables for translating characters on decoding and
1205 ;; encoding.
1206 ;; Fixme: these aren't used now -- should they be?
1207 ;; Both are reset to nil here.
1208 (setq standard-translation-table-for-decode nil
1209       standard-translation-table-for-encode nil)
1210 \f
1211 ;;; Make fundamental coding systems.
1212
1213 ;; The coding systems `no-conversion' and `undecided' are already
1214 ;; defined in coding.c as below:
1215 ;;
1216 ;; (define-coding-system 'no-conversion
1217 ;;   "..."
1218 ;;   :coding-type 'raw-text
1219 ;;   ...)
1220 ;; (define-coding-system 'undecided
1221 ;;   "..."
1222 ;;   :coding-type 'undecided
1223 ;;   ...)
1224
1225 (define-coding-system-alias 'binary 'no-conversion)
1226 (define-coding-system-alias 'unix 'undecided-unix) ; short names for the three `undecided-*' variants
1227 (define-coding-system-alias 'dos 'undecided-dos)
1228 (define-coding-system-alias 'mac 'undecided-mac)
1229
1230 (define-coding-system 'prefer-utf-8
1231   "Like `undecided' but prefer UTF-8 when appropriate.
1232 On decoding, if the source contains 8-bit codes and they all
1233 are valid UTF-8 sequences, detect the source as UTF-8 encoding
1234 regardless of the coding priority.
1235 On encoding, if the source contains non-ASCII characters, encode them
1236 by UTF-8."
1237   :coding-type 'undecided
1238   :mnemonic ?-
1239   :charset-list '(emacs)
1240   :prefer-utf-8 t)                      ; requests the UTF-8 preference described in the docstring
1241
1242 (define-coding-system 'raw-text
1243   "Raw text, which means text contains random 8-bit codes.
1244 Encoding text with this coding system produces the actual byte
1245 sequence of the text in buffers and strings.  An exception is made for
1246 characters from the `eight-bit' character set.  Each of them is encoded
1247 into a single byte.
1248
1249 When you visit a file with this coding, the file is read into a
1250 unibyte buffer as is (except for EOL format), thus each byte of a file
1251 is treated as a character."
1252   :coding-type 'raw-text
1253   :for-unibyte t                        ; docstring: files are read into a unibyte buffer
1254   :mnemonic ?t)
1255
1256 (define-coding-system 'no-conversion-multibyte
1257   "Like `no-conversion' but don't read a file into a unibyte buffer."
1258   :coding-type 'raw-text                ; no :for-unibyte here, unlike `raw-text'
1259   :eol-type 'unix
1260   :mnemonic ?=)
1261
1262 (define-coding-system 'iso-latin-1
1263   "ISO 2022 based 8-bit encoding for Latin-1 (MIME:ISO-8859-1)."
1264   :coding-type 'charset
1265   :mnemonic ?1
1266   :charset-list '(iso-8859-1)
1267   :mime-charset 'iso-8859-1)
1268
1269 (define-coding-system-alias 'iso-8859-1 'iso-latin-1) ; the MIME name as an alias
1270 (define-coding-system-alias 'latin-1 'iso-latin-1)
1271
1272 ;; Coding systems not specific to any one language environment.
1273
1274 (define-coding-system 'emacs-mule
1275  "Emacs 21 internal format used in buffer and string."
1276  :coding-type 'emacs-mule
1277  :charset-list 'emacs-mule              ; a bare symbol here, not a list of charsets
1278  :mnemonic ?M)
1279
1280 (define-coding-system 'utf-8
1281   "UTF-8 (no signature (BOM))"
1282   :coding-type 'utf-8
1283   :mnemonic ?U
1284   :charset-list '(unicode)
1285   :mime-charset 'utf-8)                 ; no :bom property, matching the docstring
1286
1287 (define-coding-system 'utf-8-with-signature
1288   "UTF-8 (with signature (BOM))"
1289   :coding-type 'utf-8
1290   :mnemonic ?U
1291   :charset-list '(unicode)
1292   :bom t)
1293
1294 (define-coding-system 'utf-8-auto
1295   "UTF-8 (auto-detect signature (BOM))"
1296   :coding-type 'utf-8
1297   :mnemonic ?U
1298   :charset-list '(unicode)
1299   :bom '(utf-8-with-signature . utf-8)) ; pair of the two coding systems defined just above
1300
1301 (define-coding-system-alias 'mule-utf-8 'utf-8)
1302
1303 (define-coding-system 'utf-8-emacs
1304   "Support for all Emacs characters (including non-Unicode characters)."
1305   :coding-type 'utf-8
1306   :mnemonic ?U
1307   :charset-list '(emacs))               ; `emacs', where plain `utf-8' lists `unicode'
1308
1309 ;; The encoding used internally.  This encoding is meant to be able to save
1310 ;; any multibyte buffer without losing information.  It can change between
1311 ;; Emacs releases, though, so it should only be used for internal files.
1312 (define-coding-system-alias 'emacs-internal 'utf-8-emacs-unix)
1313
1314 (define-coding-system 'utf-16le
1315   "UTF-16LE (little endian, no signature (BOM))."
1316   :coding-type 'utf-16
1317   :mnemonic ?U
1318   :charset-list '(unicode)
1319   :endian 'little
1320   :mime-text-unsuitable t
1321   :mime-charset 'utf-16le)
1322
1323 (define-coding-system 'utf-16be
1324   "UTF-16BE (big endian, no signature (BOM))."
1325   :coding-type 'utf-16
1326   :mnemonic ?U
1327   :charset-list '(unicode)
1328   :endian 'big
1329   :mime-text-unsuitable t
1330   :mime-charset 'utf-16be)
1331
1332 (define-coding-system 'utf-16le-with-signature
1333   "UTF-16 (little endian, with signature (BOM))."
1334   :coding-type 'utf-16
1335   :mnemonic ?U
1336   :charset-list '(unicode)
1337   :bom t
1338   :endian 'little
1339   :mime-text-unsuitable t
1340   :mime-charset 'utf-16)                ; MIME name is plain `utf-16', not `utf-16le'
1341
1342 (define-coding-system 'utf-16be-with-signature
1343   "UTF-16 (big endian, with signature (BOM))."
1344   :coding-type 'utf-16
1345   :mnemonic ?U
1346   :charset-list '(unicode)
1347   :bom t
1348   :endian 'big
1349   :mime-text-unsuitable t
1350   :mime-charset 'utf-16)                ; MIME name is plain `utf-16', not `utf-16be'
1351
1352 (define-coding-system 'utf-16
1353   "UTF-16 (detect endian on decoding, use big endian on encoding with BOM)."
1354   :coding-type 'utf-16
1355   :mnemonic ?U
1356   :charset-list '(unicode)
1357   :bom '(utf-16le-with-signature . utf-16be-with-signature)
1358   :endian 'big                          ; encoding side is big endian, per the docstring
1359   :mime-text-unsuitable t
1360   :mime-charset 'utf-16)
1361
1362 ;; Backwards compatibility (old names, also used by Mule-UCS).  We
1363 ;; prefer the MIME names.
1364 (define-coding-system-alias 'utf-16-le 'utf-16le-with-signature) ; note: the with-signature variants
1365 (define-coding-system-alias 'utf-16-be 'utf-16be-with-signature)
1366
1367
1368 (define-coding-system 'iso-2022-7bit
1369   "ISO 2022 based 7-bit encoding using only G0."
1370   :coding-type 'iso-2022
1371   :mnemonic ?J
1372   :charset-list 'iso-2022
1373   :designation [(ascii t) nil nil nil]  ; only the first (G0) slot is used; the rest are nil
1374   :flags '(short ascii-at-eol ascii-at-cntl 7-bit designation composition))
1375
1376 (define-coding-system 'iso-2022-7bit-ss2
1377   "ISO 2022 based 7-bit encoding using SS2 for 96-charset."
1378   :coding-type 'iso-2022
1379   :mnemonic ?$
1380   :charset-list 'iso-2022
1381   :designation [(ascii 94) nil (nil 96) nil] ; first and third slots used
1382   :flags '(short ascii-at-eol ascii-at-cntl 7-bit
1383                  designation single-shift composition))
1384
1385 (define-coding-system 'iso-2022-7bit-lock
1386   "ISO-2022 coding system using Locking-Shift for 96-charset."
1387   :coding-type 'iso-2022
1388   :mnemonic ?&
1389   :charset-list 'iso-2022
1390   :designation [(ascii 94) (nil 96) nil nil] ; first and second slots used
1391   :flags '(ascii-at-eol ascii-at-cntl 7-bit
1392                         designation locking-shift composition))
1393
1394 (define-coding-system-alias 'iso-2022-int-1 'iso-2022-7bit-lock)
1395
1396 (define-coding-system 'iso-2022-7bit-lock-ss2
1397   "Mixture of ISO-2022-JP, ISO-2022-KR, and ISO-2022-CN."
1398   :coding-type 'iso-2022
1399   :mnemonic ?i
1400   :charset-list '(ascii                 ; explicit list, unlike the `iso-2022' symbol used by siblings
1401                   japanese-jisx0208 japanese-jisx0208-1978 latin-jisx0201
1402                   korean-ksc5601
1403                   chinese-gb2312
1404                   chinese-cns11643-1 chinese-cns11643-2 chinese-cns11643-3
1405                   chinese-cns11643-4 chinese-cns11643-5 chinese-cns11643-6
1406                   chinese-cns11643-7)
1407   :designation [(ascii 94)              ; four slots, one per line group below
1408                 (nil korean-ksc5601 chinese-gb2312 chinese-cns11643-1 96)
1409                 (nil chinese-cns11643-2)
1410                 (nil chinese-cns11643-3 chinese-cns11643-4 chinese-cns11643-5
1411                      chinese-cns11643-6 chinese-cns11643-7)]
1412   :flags '(short ascii-at-eol ascii-at-cntl 7-bit locking-shift
1413                  single-shift init-bol))
1414
1415 (define-coding-system-alias 'iso-2022-cjk 'iso-2022-7bit-lock-ss2)
1416
1417 (define-coding-system 'iso-2022-8bit-ss2
1418   "ISO 2022 based 8-bit encoding using SS2 for 96-charset."
1419   :coding-type 'iso-2022
1420   :mnemonic ?@
1421   :charset-list 'iso-2022
1422   :designation [(ascii 94) nil (nil 96) nil] ; same designation as `iso-2022-7bit-ss2'
1423   :flags '(ascii-at-eol ascii-at-cntl designation single-shift composition)) ; no `7-bit' or `short' flag
1424
1425 (define-coding-system 'compound-text
1426   "Compound text based generic encoding.
1427 This coding system is an extension of X's \"Compound Text Encoding\".
1428 It encodes many characters using the normal ISO-2022 designation sequences,
1429 but it doesn't support extended segments of CTEXT."
1430   :coding-type 'iso-2022
1431   :mnemonic ?x
1432   :charset-list 'iso-2022
1433   :designation [(ascii 94) (latin-iso8859-1 katakana-jisx0201 96) nil nil]
1434   :flags '(ascii-at-eol ascii-at-cntl long-form
1435                         designation locking-shift single-shift composition)
1436   ;; Fixme: this isn't a valid MIME charset and has to be
1437   ;; special-cased elsewhere  -- fx
1438   :mime-charset 'x-ctext)
1439
1440 (define-coding-system-alias  'x-ctext 'compound-text) ; two shorter names for `compound-text'
1441 (define-coding-system-alias  'ctext 'compound-text)
1442
1443 ;; Same as compound-text, but doesn't produce composition escape
1444 ;; sequences.  Used in post-read and pre-write conversions of
1445 ;; compound-text-with-extensions, see mule.el.  Note that this should
1446 ;; not have a mime-charset property, to prevent it from showing up
1447 ;; close to the beginning of coding systems ordered by priority.
1448 (define-coding-system 'ctext-no-compositions
1449  "Compound text based generic encoding.
1450
1451 Like `compound-text', but does not produce escape sequences for compositions."
1452   :coding-type 'iso-2022
1453   :mnemonic ?x
1454   :charset-list 'iso-2022
1455   :designation [(ascii 94) (latin-iso8859-1 katakana-jisx0201 96) nil nil]
1456   :flags '(ascii-at-eol ascii-at-cntl   ; `compound-text' flags minus `long-form' and `composition'
1457                         designation locking-shift single-shift))
1458
1459 (define-coding-system 'compound-text-with-extensions
1460   "Compound text encoding with ICCCM Extended Segment extensions.
1461
1462 See the variables `ctext-standard-encodings' and
1463 `ctext-non-standard-encodings-alist' for details about how
1464 extended segments are handled.
1465
1466 This coding system should be used only for X selections.  It is inappropriate
1467 for decoding and encoding files, process I/O, etc."
1468   :coding-type 'iso-2022
1469   :mnemonic ?x
1470   :charset-list 'iso-2022
1471   :designation [(ascii 94) (latin-iso8859-1 katakana-jisx0201 96) nil nil]
1472   :flags '(ascii-at-eol ascii-at-cntl long-form ; `compound-text' flags minus `composition'
1473                         designation locking-shift single-shift)
1474   :post-read-conversion 'ctext-post-read-conversion
1475   :pre-write-conversion 'ctext-pre-write-conversion
1476   :mime-charset 'x-ctext)
1477
1478 (define-coding-system-alias
1479   'x-ctext-with-extensions 'compound-text-with-extensions)
1480 (define-coding-system-alias
1481   'ctext-with-extensions 'compound-text-with-extensions)
1482
1483 (define-coding-system 'us-ascii
1484   "Encode ASCII as-is and encode non-ASCII characters to `?'."
1485   :coding-type 'charset
1486   :mnemonic ?-
1487   :charset-list '(ascii)
1488   :default-char ??                      ; the `?' replacement named in the docstring
1489   :mime-charset 'us-ascii)
1490
1491 (define-coding-system-alias 'iso-safe 'us-ascii)
1492
1493 (define-coding-system 'utf-7
1494   "UTF-7 encoding of Unicode (RFC 2152)."
1495   :coding-type 'utf-8                   ; declared as utf-8; conversion hooks are attached below
1496   :mnemonic ?U
1497   :mime-charset 'utf-7
1498   :charset-list '(unicode)
1499   :pre-write-conversion 'utf-7-pre-write-conversion
1500   :post-read-conversion 'utf-7-post-read-conversion)
1501
1502 (define-coding-system 'utf-7-imap
1503   "UTF-7 encoding of Unicode, IMAP version (RFC 2060)"
1504   :coding-type 'utf-8                   ; as for `utf-7', with IMAP-specific hooks
1505   :mnemonic ?u
1506   :charset-list '(unicode)              ; no :mime-charset, unlike `utf-7'
1507   :pre-write-conversion 'utf-7-imap-pre-write-conversion
1508   :post-read-conversion 'utf-7-imap-post-read-conversion)
1509
1510 ;; Use us-ascii for terminal output unless some other coding system
1511 ;; is specified explicitly.
1512 (set-safe-terminal-coding-system-internal 'us-ascii)
1513
1514 ;; The other coding systems are defined in the language-specific
1515 ;; files under lisp/language.
1516
1517 ;; Normally, set coding system to `undecided' before reading a file.
1518 ;; Compiled Emacs Lisp files (*.elc) are not decoded at all,
1519 ;; but we regard them as containing multibyte characters.
1520 ;; Tar files are not decoded at all, but we treat them as raw bytes.
1521
1522 (setq file-coding-system-alist          ; entries are (REGEXP . VALUE)
1523       (mapcar (lambda (arg) (cons (purecopy (car arg)) (cdr arg))) ; purecopy only the regexp
1524               '(("\\.elc\\'" . utf-8-emacs)
1525                 ("\\.el\\'" . prefer-utf-8)
1526                 ("\\.utf\\(-8\\)?\\'" . utf-8)
1527                 ("\\.xml\\'" . xml-find-file-coding-system)
1528                 ;; We use raw-text for reading loaddefs.el so that if it
1529                 ;; happens to have DOS or Mac EOLs, they are converted to
1530                 ;; newlines.  This is required to make the special treatment
1531                 ;; of the "\ newline" combination in loaddefs.el, which marks
1532                 ;; the beginning of a doc string, work.
1533                 ("\\(\\`\\|/\\)loaddefs\\.el\\'" . (raw-text . raw-text-unix))
1534                 ("\\.tar\\'" . (no-conversion . no-conversion))
1535                 ("\\.po[tx]?\\'\\|\\.po\\." . po-find-file-coding-system)
1536                 ("\\.\\(tex\\|ltx\\|dtx\\|drv\\)\\'" . latexenc-find-file-coding-system)
1537                 ("" . (undecided . nil))))) ; empty regexp: matches any file name
1538
1539 \f
1540 ;;; Setting coding categories and their priorities.
1541
1542 ;; This setting is only for reading Emacs Lisp source files that
1543 ;; contain multilingual text while dumping Emacs.  More appropriate
1544 ;; values are set by the command `set-language-environment' for each
1545 ;; language environment.
1546
1547 (set-coding-system-priority
1548  'iso-latin-1                           ; arguments are given in priority order
1549  'utf-8
1550  'iso-2022-7bit
1551  )
1552
1553 \f
1554 ;;; Miscellaneous settings.
1555
1556 ;; Make all multibyte characters self-insert.
1557 (set-char-table-range (nth 1 global-map) ; second element of `global-map' -- presumably its char-table; confirm
1558                       (cons 128 (max-char)) ; characters 128 through (max-char)
1559                       'self-insert-command)
1560
1561 (let ((code ?\221))
1562   ;; Set the entries for the six consecutive codes ?\221..?\226
1563   ;; (octal; #x91..#x96) of `latin-extra-code-table' to t.
1564   (while (<= code ?\226)
1565     (aset latin-extra-code-table code t)
1566     (setq code (1+ code))))
1567
1568 ;; The old code-pages library is obsoleted by coding systems based on
1569 ;; the charsets defined in this file, but it might still be required
1570 ;; by user code, so declare the feature as provided here.
1571 (provide 'code-pages)
1572
1573 ;;; mule-conf.el ends here