1 ;;; code-pages.el --- coding systems for assorted codepages -*-coding: utf-8;-*-
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
4 ;; Copyright (C) 2004, 2005
5 ;; National Institute of Advanced Industrial Science and Technology (AIST)
6 ;; Registration Number H14PRO021
8 ;; Author: Dave Love <fx@gnu.org>
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 ;; Boston, MA 02110-1301, USA.
30 ;; Definitions of miscellaneous 8-bit coding systems based on ASCII
31 ;; (we can't cope properly with EBCDIC, for instance), mainly for PC
32 ;; `code pages'. They are decoded into Latin-1 and mule-unicode
33 ;; charsets rather than (lossily) into single iso8859 charsets à la
34 ;; codepage.el. The utility `cp-make-coding-system' derives them from
37 ;; Those covered are: cp437, cp737, cp720, cp775, cp850, cp851, cp852,
38 ;; cp855, cp857, cp860, cp861, cp862, cp863, cp864, cp865, cp866,
39 ;; cp869, cp874, cp1125, windows-1250, windows-1253, windows-1254,
40 ;; windows-1255, windows-1256, windows-1257, windows-1258, next,
41 ;; iso-8859-6, iso-8859-10, iso-8859-11, iso-8859-16, koi8-t,
42 ;; georgian-ps. This is meant to include all the single-byte ones
43 ;; relevant to GNU (used in glibc-defined locales); we don't yet get
44 ;; all the multibyte ones in base Emacs.
46 ;; Note that various of these can clash with definitions in
47 ;; codepage.el; we try to avoid damage from that. A few CPs from
48 ;; codepage.el (770, 773, 774) aren't covered (in the absence of
49 ;; translation tables to Unicode).
51 ;; Compile this, to avoid loading `ccl' at runtime.
53 ;; Although the tables used here aren't very big, it might be worth
54 ;; splitting the file and autoloading the coding systems if/when my
55 ;; (or similar) autoloading code is installed.
59 ;; The defsubsts here are just so that language files can use
60 ;; `cp-make-coding-system' and not require functions from this file
;; Build a translation table from a 256-slot vector: the visible stores
;; set slot I to I itself and slot 128+I to element I of V.  The
;; resulting table's extra slot 0 serves as the table used for encoding.
;; NOTE(review): the loop forms that bind `i' are not visible in this
;; view -- presumably they run I over 0..127; confirm.
63 (defsubst cp-make-translation-table
(v)
64 "Return a translation table made from 128-long vector V.
65 V comprises characters encodable by mule-utf-8."
66 (let ((encoding-vector (make-vector 256 0)))
68 (aset encoding-vector i i
))
70 (aset encoding-vector
(+ i
128) (aref v i
)))
71 ;; Add equivalent characters to the encoder so that we can unify
73 (let* ((tab (make-translation-table-from-vector encoding-vector
))
74 ;; Translation table used for encoding:
75 (encode-table (char-table-extra-slot tab
0)))
;; Walk `ucs-mule-to-mule-unicode': when the value V stored for key C
;; already has an entry C1 in the encode table, give C that same entry.
76 (map-char-table (lambda (c v
)
78 (let ((c1 (aref encode-table v
)))
79 (if c1
; we encode that unicode
80 (aset encode-table c c1
)))))
81 ucs-mule-to-mule-unicode
)
;; Collect ranges of valid codes as (START . END) conses pushed onto
;; `pairs'.  The state starts with a range 0..127 already open and with
;; `i' at 128; V is indexed at I-128, and a non-nil element starts or
;; extends the current range.
;; NOTE(review): the binding of `pairs' and the loop head are not
;; visible in this view -- presumably `i' runs up to 255; confirm.
84 (defsubst cp-valid-codes
(v)
85 "Derive a valid-codes list for translation vector V.
86 See `make-coding-system'."
88 (i 128) ; index into v
89 (start 0) ; start of a valid range
90 (end 127)) ; end of a valid range
92 (if (aref v
(- i
128)) ; start or extend range
95 (unless start
(setq start i
)))
97 (push (cons start end
) pairs
))
;; Flush a range that is still open (`start' non-nil).
100 (if start
(push (cons start end
) pairs
))
103 ;; Fix things that have been, or might be, done by codepage.el.
104 (eval-after-load "codepage"
107 ;; Semi-dummy version for the stuff in codepage.el which we don't
108 ;; define here. (Used by mule-diag.)
109 (defun cp-supported-codepages ()
110 "Return an alist of supported codepages.
112 Each association in the alist has the form (NNN . CHARSET), where NNN is the
113 codepage number, and CHARSET is the MULE charset which is the closest match
114 for the character set supported by that codepage.
116 A codepage NNN is supported if a variable called `cpNNN-decode-table' exists,
117 is a vector, and has a charset property."
;; This version does not inspect any `cpNNN-decode-table' variables: it
;; returns a fixed alist for codepages 774, 770 and 773, each paired
;; with `latin-iso8859-4'.
118 '(("774" . latin-iso8859-4
) ("770" . latin-iso8859-4
)
119 ("773" . latin-iso8859-4
)))
121 ;; A version which doesn't override the coding systems set up by this
122 ;; file. It could still be used for the few missing ones from
124 (defun codepage-setup (codepage)
125 "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.
127 These coding systems are meant for encoding and decoding 8-bit non-ASCII
128 characters used by the IBM codepages, typically in conjunction with files
129 read/written by MS-DOS software, or for display on the MS-DOS terminal."
;; Read CODEPAGE with completion over `cp-supported-codepages':
;; case is ignored, a match is required, and "437" is the default.
131 (let ((completion-ignore-case t
)
132 (candidates (cp-supported-codepages)))
133 (list (completing-read "Setup DOS Codepage (default 437): " candidates
134 nil t nil nil
"437"))))
;; Only create the coding system when the symbol `cpCODEPAGE' does not
;; already name one, so an existing definition is left untouched.
135 (let ((cp (format "cp%s" codepage
)))
136 (unless (coding-system-p (intern cp
))
137 (cp-make-coding-systems-for-codepage
138 cp
(cp-charset-for-codepage cp
) (cp-offset-for-codepage cp
))))))
141 ;; Macro to allow ccl compilation at byte-compile time, avoiding
144 (defmacro cp-make-coding-system
(name v
&optional doc-string mnemonic
)
145 "Make coding system NAME for an 8-bit, extended-ASCII character set.
146 V is a 128-long vector of characters to translate the upper half of
147 the character set. DOC-STRING and MNEMONIC are used as the
148 corresponding args of `make-coding-system'. If MNEMONIC isn't given,
150 Return an updated `non-iso-charset-alist'."
;; ENCODER and DECODER are the symbols `encode-NAME' and `decode-NAME';
;; the expansion below defines translation tables under those names.
151 (let* ((encoder (intern (format "encode-%s" name
)))
152 (decoder (intern (format "decode-%s" name
)))
;; Decoding side: choose the charset id in r0 for the code in r1
;; (ASCII when r1 < 128), translate through the DECODER table and then
;; through `utf-translation-table-for-decode', and write the character.
158 (if (r1 < 128) ;; ASCII
159 (r0 = ,(charset-id 'ascii
))
161 (r0 = ,(charset-id 'eight-bit-control
))
162 (r0 = ,(charset-id 'eight-bit-graphic
))))
163 (translate-character ,decoder r0 r1
)
164 ;; Allow fragmentation on decoding -- relevant for
165 ;; Cyrillic, Greek and, possibly, Arabic and Hebrew.
166 (translate-character utf-translation-table-for-decode r0 r1
)
167 (write-multibyte-character r0 r1
)
;; Encoding side: read a character, translate it through the ENCODER
;; table, then test the resulting charset id in r0 against ascii,
;; eight-bit-graphic and eight-bit-control.
173 (read-multibyte-character r0 r1
)
174 (translate-character ,encoder r0 r1
)
175 (if (r0 != ,(charset-id 'ascii
))
176 (if (r0 != ,(charset-id 'eight-bit-graphic
))
177 (if (r0 != ,(charset-id 'eight-bit-control
))
179 (write-repeat r1
)))))))
;; Expansion: derive the translation table and the valid-codes list
;; from V, then register the decode table and, from its extra slot 0,
;; the encode table under the DECODER and ENCODER names.
180 `(let ((translation-table (cp-make-translation-table ,v
))
181 (codes (cp-valid-codes ,v
)))
182 (define-translation-table ',decoder translation-table
)
183 (define-translation-table ',encoder
184 (char-table-extra-slot translation-table
0))
186 ',name
4 ,(or mnemonic ?
*)
187 (or ,doc-string
(format "%s encoding" ',name
))
188 (cons ,ccl-decoder
,ccl-encoder
)
189 (list (cons 'safe-chars
(get ',encoder
'translation-table
))
190 (cons 'valid-codes codes
)
191 (cons 'mime-charset
',name
)
192 ;; For Quail translation. Fixme: this should really be
193 ;; a separate table that only translates the coding
194 ;; system's safe-chars.
195 (cons 'translation-table-for-input
'ucs-mule-to-mule-unicode
)))
;; Look up NAME's entry in `non-iso-charset-alist'; the visible code
;; pushes a new (NAME . ELT) entry and evaluates to the alist.
196 (let ((slot (assq ',name non-iso-charset-alist
))
197 (elt (list nil
; charset list
199 (let (l) ; code range
200 (dolist (elt (reverse codes
))
205 (push (cons ',name elt
) non-iso-charset-alist
)
207 non-iso-charset-alist
)))))
209 (eval-when-compile (defvar non-iso-charset-alist
))
211 ;; These tables were mostly derived by running something like
212 ;; `recode -f cpxxx/..utf-8' on a binary file filled by
213 ;; `(dotimes (i 128) (insert ?? ?\\ (+ 128 i) ?\n))' and then
214 ;; exchanging the ?\� entries for nil. iconv was used instead in some
217 ;; Fixme: Do better for mode-line mnemonics?
219 ;;;###autoload(autoload-coding-system 'cp437 '(require 'code-pages))
220 (cp-make-coding-system
351 ;;;###autoload(autoload-coding-system 'cp737 '(require 'code-pages))
352 (cp-make-coding-system
482 (coding-system-put 'cp737
'mime-charset nil
) ; not in IANA list
484 ;;;###autoload(autoload-coding-system 'cp775 '(require 'code-pages))
485 (cp-make-coding-system
616 ;;;###autoload(autoload-coding-system 'cp850 '(require 'code-pages))
617 (cp-make-coding-system
748 ;;;###autoload(autoload-coding-system 'cp851 '(require 'code-pages))
749 (cp-make-coding-system
880 ;;;###autoload(autoload-coding-system 'cp852 '(require 'code-pages))
881 (cp-make-coding-system
1012 ;;;###autoload(autoload-coding-system 'cp855 '(require 'code-pages))
1013 (cp-make-coding-system
1144 ;;;###autoload(autoload-coding-system 'cp857 '(require 'code-pages))
1145 (cp-make-coding-system
1276 ;;;###autoload(autoload-coding-system 'cp860 '(require 'code-pages))
1277 (cp-make-coding-system
1408 ;;;###autoload(autoload-coding-system 'cp861 '(require 'code-pages))
1409 (cp-make-coding-system
1540 ;;;###autoload(autoload-coding-system 'cp862 '(require 'code-pages))
1541 (cp-make-coding-system
1672 ;;;###autoload(autoload-coding-system 'cp863 '(require 'code-pages))
1673 (cp-make-coding-system
1804 ;;;###autoload(autoload-coding-system 'cp864 '(require 'code-pages))
1805 (cp-make-coding-system
1936 ;;;###autoload(autoload-coding-system 'cp865 '(require 'code-pages))
1937 (cp-make-coding-system
2068 ;;;###autoload(autoload-coding-system 'cp866 '(require 'code-pages))
2069 (cp-make-coding-system
2202 ;;;###autoload(autoload-coding-system 'cp869 '(require 'code-pages))
2203 (cp-make-coding-system
2334 ;;;###autoload(autoload-coding-system 'cp874 '(require 'code-pages))
2335 (cp-make-coding-system
2466 ;;;###autoload(autoload-coding-system 'windows-1250 '(require 'code-pages))
2467 (cp-make-coding-system
2598 ;;;###autoload(autoload-coding-system 'windows-1253 '(require 'code-pages))
2599 (cp-make-coding-system
2731 ;;;###autoload(autoload-coding-system 'windows-1254 '(require 'code-pages))
2732 (cp-make-coding-system
2864 ;;;###autoload(autoload-coding-system 'windows-1255 '(require 'code-pages))
2865 (cp-make-coding-system
2997 ;;;###autoload(autoload-coding-system 'windows-1256 '(require 'code-pages))
2998 (cp-make-coding-system
3130 ;;;###autoload(autoload-coding-system 'windows-1257 '(require 'code-pages))
3131 (cp-make-coding-system
3262 ;;;###autoload(autoload-coding-system 'windows-1258 '(require 'code-pages))
3263 (cp-make-coding-system
3394 ;;;###autoload(autoload-coding-system 'next '(require 'code-pages))
3395 (cp-make-coding-system
3525 "NeXTstep encoding." ?N
)
3527 ;;;###autoload(autoload-coding-system 'koi8-t '(require 'code-pages))
3528 (cp-make-coding-system
3529 koi8-t
; used by glibc for tg_TJ
3658 "Unicode-based KOI8-T encoding for Cyrillic")
3659 (coding-system-put 'koi8-t
'mime-charset nil
) ; not in the IANA list
3660 (define-coding-system-alias 'cyrillic-koi8-t
'koi8-t
)
3662 ;; Online final ISO draft:
3664 ;; http://www.evertype.com/standards/iso8859/fdis8859-16-en.pdf
3666 ;; Equivalent National Standard:
3667 ;; Romanian Standard SR 14111:1998, Romanian Standards Institution
3672 ;; "This set of coded graphic characters is intended for use in data and
3673 ;; text processing applications and also for information interchange. The
3674 ;; set contains graphic characters used for general purpose applications in
3675 ;; typical office environments in at least the following languages:
3676 ;; Albanian, Croatian, English, Finnish, French, German, Hungarian, Irish
3677 ;; Gaelic (new orthography), Italian, Latin, Polish, Romanian, and
3678 ;; Slovenian. This set of coded graphic characters may be regarded as a
3679 ;; version of an 8-bit code according to ISO/IEC 2022 or ISO/IEC 4873 at
3680 ;; level 1." [ISO 8859-16:2001(E), p. 1]
3682 ;; This charset is suitable for use in MIME text body parts.
3684 ;; ISO 8859-16 was primarily designed for single-byte encoding the Romanian
3685 ;; language. The UTF-8 charset is the preferred and in today's MIME software
3686 ;; more widely implemented encoding suitable for Romanian.
3687 ;;;###autoload(autoload-coding-system 'iso-8859-16 '(require 'code-pages))
3688 (cp-make-coding-system
3689 iso-latin-10
; consistent with, e.g. Latin-1
3690 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3691 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3788 "Unicode-based encoding for Latin-10 (MIME: ISO-8859-16)"
3790 (coding-system-put 'iso-latin-10
'mime-charset
'iso-8859-16
)
3791 (define-coding-system-alias 'iso-8859-16
'iso-latin-10
)
3792 (define-coding-system-alias 'latin-10
'iso-latin-10
)
3794 ;; Unicode-based alternative which has the possible advantage of
3795 ;; having its relative sparseness specified.
3796 ;;;###autoload(autoload-coding-system 'iso-8859-6 '(require 'code-pages))
3797 (cp-make-coding-system
3798 ;; The base system uses arabic-iso-8bit, but that's not a MIME charset.
3800 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3801 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3853 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3854 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3855 nil nil nil nil nil nil nil nil nil nil nil
]
3856 "Unicode-based Arabic ISO/IEC 8859-6 (MIME: ISO-8859-6)"
3858 (define-coding-system-alias 'arabic-iso-8bit
'iso-8859-6
)
3860 ;;;###autoload(autoload-coding-system 'iso-8859-10 '(require 'code-pages))
3861 (cp-make-coding-system
3863 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3864 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3961 "Unicode-based encoding for Latin-6 (MIME: ISO-8859-10)")
3962 (coding-system-put 'iso-latin-6
'mime-charset
'iso-8859-10
)
3963 (define-coding-system-alias 'iso-8859-10
'iso-latin-6
)
3964 (define-coding-system-alias 'latin-6
'iso-latin-6
)
3966 ;; used by lt_LT, lv_LV, mi_NZ
3967 ;;;###autoload(autoload-coding-system 'iso-8859-13 '(require 'code-pages))
3968 (cp-make-coding-system
3970 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3971 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4069 "Unicode-based encoding for Latin-7 (MIME: ISO-8859-13)"
4070 ?l
) ;; Lithuanian/Latvian
4071 (coding-system-put 'iso-latin-7
'mime-charset
'iso-8859-13
)
4072 (define-coding-system-alias 'iso-8859-13
'iso-latin-7
)
4073 (define-coding-system-alias 'latin-7
'iso-latin-7
)
4075 ;; Fixme: check on the C1 characters which libiconv includes. They
4076 ;; are reproduced below, but are probably wrong. I can't find an
4077 ;; official definition of georgian-ps.
4078 ;;;###autoload(autoload-coding-system 'georgian-ps '(require 'code-pages))
4079 (cp-make-coding-system
4080 georgian-ps
; used by glibc for ka_GE
4210 (coding-system-put 'georgian-ps
'mime-charset nil
) ; not in IANA list
4212 ;;;###autoload(autoload-coding-system 'cp720 '(require 'code-pages))
4213 ;; From http://www.microsoft.com/globaldev/reference/oem/720.htm
4214 (cp-make-coding-system
4344 (coding-system-put 'cp720
'mime-charset nil
) ; not in IANA list
4346 ;;;###autoload(autoload-coding-system 'cp1125 '(require 'code-pages))
4347 ;; http://oss.software.ibm.com/cvs/icu/charset/data/ucm/ibm-1125_P100-2000.ucm
4348 (cp-make-coding-system
4478 (define-coding-system-alias 'ruscii
'cp1125
)
4479 ;; Original name for cp1125, says Serhii Hlodin <hlodin@lutsk.bank.gov.ua>
4480 (define-coding-system-alias 'cp866u
'cp1125
)
4481 (coding-system-put 'cp1125
'mime-charset nil
)
4483 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: Bulgarian DOS
4484 ;; codepage. Table at
4485 ;; <URL:http://czyborra.com/charsets/bulgarian-mik.txt.gz>.
4486 ;;;###autoload(autoload-coding-system 'mik '(require 'code-pages))
4487 (cp-make-coding-system
4489 [?А ?Б ?В ?Г ?Д ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц
4490 ?Ч ?Ш ?Щ ?Ъ ?Ы ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н
4491 ?о ?п ?р ?с ?т ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я ?└ ?┴ ?┬ ?├ ?─
4492 ?┼ ?╣ ?║ ?╚ ?╔ ?╩ ?╦ ?╠ ?═ ?╬ ?┐ ?░ ?▒ ?▓ ?│ ?┤ ?№ ?§ ?╗ ?╝ ?┘ ?┌ ?█
4493 ?▄ ?▌ ?▐ ?▀ ?α ?β ?Γ ?π ?Σ ?σ ?μ ?τ ?Φ ?Θ ?Ω ?δ ?∞ ?∅ ?∈ ?∩ ?≡ ?± ?≥
4494 ?≤ ?⌠ ?⌡ ?÷ ?≈ ?° ?∙ ?· ?√ ?ⁿ ?² ?■ ?
])
4495 (coding-system-put 'mik
'mime-charset nil
)
4497 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: similar to CP1251
4498 ;; and used for some non-Slavic Cyrillic languages. Table found at
4499 ;; <URL:ftp://ftp.logic.ru/pub/logic/linux/cyr-asian/PT154>. See also
4500 ;; <URL:http://lists.w3.org/Archives/Public/ietf-charsets/2002AprJun/0092.html>,
4501 ;; which suggests it's used in an Asian Cyrillic context.
4502 ;;;###autoload(autoload-coding-system 'pt154 '(require 'code-pages))
4503 (cp-make-coding-system
4505 [?Җ ?Ғ ?Ӯ ?ғ ?„ ?… ?Ҷ ?Ү ?Ҳ ?ү ?Ҡ ?Ӣ ?Ң ?Қ ?Һ ?Ҹ ?җ ?‘ ?’ ?“ ?” ?• ?–
4506 ?— ?ҳ ?ҷ ?ҡ ?ӣ ?ң ?қ ?һ ?ҹ ? ?Ў ?ў ?Ј ?Ө ?Ҙ ?Ұ ?§ ?Ё ?© ?Ә ?\« ?¬ ?ӯ
4507 ?® ?Ҝ ?° ?ұ ?І ?і ?ҙ ?ө ?¶ ?· ?ё ?№ ?ә ?\» ?ј ?Ҫ ?ҫ ?ҝ ?А ?Б ?В ?Г ?Д
4508 ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц ?Ч ?Ш ?Щ ?Ъ ?Ы
4509 ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н ?о ?п ?р ?с ?т
4510 ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я
])
4512 ;;;###autoload(autoload-coding-system 'iso-8859-11 '(require 'code-pages))
4513 (cp-make-coding-system
4515 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4516 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4517 ? ?ก ?ข ?ฃ ?ค ?ฅ ?ฆ ?ง ?จ ?ฉ ?ช ?ซ ?ฌ ?ญ ?ฎ ?ฏ
4518 ?ฐ ?ฑ ?ฒ ?ณ ?ด ?ต ?ถ ?ท ?ธ ?น ?บ ?ป ?ผ ?ฝ ?พ ?ฟ
4519 ?ภ ?ม ?ย ?ร ?ฤ ?ล ?ฦ ?ว ?ศ ?ษ ?ส ?ห ?ฬ ?อ ?ฮ ?ฯ
4520 ?ะ ?ั ?า ?ำ ?ิ ?ี ?ึ ?ื ?ุ ?ู ?ฺ nil nil nil nil ?฿
4521 ?เ ?แ ?โ ?ใ ?ไ ?ๅ ?ๆ ?็ ?่ ?้ ?๊ ?๋ ?์ ?ํ ?๎ ?๏
4522 ?๐ ?๑ ?๒ ?๓ ?๔ ?๕ ?๖ ?๗ ?๘ ?๙ ?๚ ?๛ nil nil nil nil
]
4523 "ISO-8859-11. This is `thai-tis620' with the addition of no-break-space.")
4526 (let ((w (intern (format "windows-125%d" i
)))
4527 (c (intern (format "cp125%d" i
))))
4528 ;; Define cp125* as aliases for all windows-125*, so on Windows
4529 ;; we can just concat "cp" to the ANSI codepage we get from the system
4530 ;; and not have to worry about whether it should be "cp" or "windows-".
4531 (when (coding-system-p w
)
4532 (define-coding-system-alias c w
)
4533 ;; Compatibility with codepage.el, though cp... are not the
4535 (if (not (assq c non-iso-charset-alist
))
4536 (let ((slot (assq w non-iso-charset-alist
)))
4538 (push (cons c
(cdr slot
)) non-iso-charset-alist
)))))))
4540 (provide 'code-pages
)
4542 ;;; arch-tag: 8b6e3c73-b271-4198-866d-ea6d0ceff1b2
4543 ;;; code-pages.el ends here