1 ;;; code-pages.el --- coding systems for assorted codepages -*-coding: utf-8;-*-
3 ;; Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 2004, 2005, 2006, 2007
6 ;; National Institute of Advanced Industrial Science and Technology (AIST)
7 ;; Registration Number H14PRO021
9 ;; Author: Dave Love <fx@gnu.org>
12 ;; This file is part of GNU Emacs.
14 ;; GNU Emacs is free software; you can redistribute it and/or modify
15 ;; it under the terms of the GNU General Public License as published by
16 ;; the Free Software Foundation; either version 2, or (at your option)
19 ;; GNU Emacs is distributed in the hope that it will be useful,
20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ;; GNU General Public License for more details.
24 ;; You should have received a copy of the GNU General Public License
25 ;; along with GNU Emacs; see the file COPYING. If not, write to the
26 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
27 ;; Boston, MA 02110-1301, USA.
31 ;; Definitions of miscellaneous 8-bit coding systems based on ASCII
32 ;; (we can't cope properly with EBCDIC, for instance), mainly for PC
33 ;; `code pages'. They are decoded into Latin-1 and mule-unicode
34 ;; charsets rather than (lossily) into single iso8859 charsets à la
35 ;; codepage.el. The utility `cp-make-coding-system' derives them from
38 ;; Those covered are: cp437, cp737, cp720, cp775, cp850, cp851, cp852,
39 ;; cp855, cp857, cp860, cp861, cp862, cp863, cp864, cp865, cp866,
40 ;; cp869, cp874, cp1125, windows-1250, windows-1253, windows-1254,
41 ;; windows-1255, windows-1256, windows-1257, windows-1258, next,
42 ;; iso-8859-6, iso-8859-10, iso-8859-11, iso-8859-16, koi8-t,
43 ;; georgian-ps. This is meant to include all the single-byte ones
44 ;; relevant to GNU (used in glibc-defined locales); we don't yet get
45 ;; all the multibyte ones in base Emacs.
47 ;; Note that various of these can clash with definitions in
48 ;; codepage.el; we try to avoid damage from that. A few CPs from
49 ;; codepage.el (770, 773, 774) aren't covered (in the absence of
50 ;; translation tables to Unicode).
52 ;; Compile this, to avoid loading `ccl' at runtime.
54 ;; Although the tables used here aren't very big, it might be worth
55 ;; splitting the file and autoloading the coding systems if/when my
56 ;; (or similar) autoloading code is installed.
60 ;; The defsubsts here are just so that language files can use
61 ;; `cp-make-coding-system' and not require functions from this file
;; Build the translation table for an 8-bit code page from the vector V.
;; Extra slot 0 of the table made by `make-translation-table-from-vector'
;; is used below as the table for encoding.
;; NOTE(review): the loop forms that bind `i' are not visible in this
;; excerpt; only the visible steps are annotated.
64 (defsubst cp-make-translation-table
(v)
65 "Return a translation table made from 128-long vector V.
66 V comprises characters encodable by mule-utf-8."
;; 256-element vector, initially all zeros.
67 (let ((encoding-vector (make-vector 256 0)))
;; Element I is set to I itself.
69 (aset encoding-vector i i
))
;; Element I+128 is set to the Ith element of V.
71 (aset encoding-vector
(+ i
128) (aref v i
)))
72 ;; Add equivalent characters to the encoder so that we can unify
74 (let* ((tab (make-translation-table-from-vector encoding-vector
))
75 ;; Translation table used for encoding:
76 (encode-table (char-table-extra-slot tab
0)))
;; Walk `ucs-mule-to-mule-unicode': for each character C mapped to V there,
;; if the encode table has an entry C1 for V, give C that same entry.
77 (map-char-table (lambda (c v
)
79 (let ((c1 (aref encode-table v
)))
80 (if c1
; we encode that unicode
81 (aset encode-table c c1
)))))
82 ucs-mule-to-mule-unicode
)
;; Collect the valid byte ranges for V as (START . END) conses pushed
;; onto `pairs'.
;; NOTE(review): the enclosing binding and loop forms are not visible in
;; this excerpt; only the visible steps are annotated.
85 (defsubst cp-valid-codes
(v)
86 "Derive a valid-codes list for translation vector V.
87 See `make-coding-system'."
;; Scan state: I starts at 128 and indexes V at I-128; START and END
;; start out as 0 and 127.
89 (i 128) ; index into v
90 (start 0) ; start of a valid range
91 (end 127)) ; end of a valid range
;; A non-nil element of V means code I belongs to a valid range.
93 (if (aref v
(- i
128)) ; start or extend range
;; Open a new range at I only when none is currently open.
96 (unless start
(setq start i
)))
;; Push the current (START . END) range onto `pairs'.
98 (push (cons start end
) pairs
))
;; A range is still open (START non-nil): push it as well.
;; NOTE(review): presumably this runs once the scan has finished -- confirm.
101 (if start
(push (cons start end
) pairs
))
104 ;; Fix things that have been, or might be, done by codepage.el.
105 (eval-after-load "codepage"
108 ;; Semi-dummy version for the stuff in codepage.el which we don't
109 ;; define here. (Used by mule-diag.)
(defun cp-supported-codepages ()
  "Return an alist of the codepages that are left to codepage.el.

Each association in the alist has the form (NNN . CHARSET), where NNN is the
codepage number as a string, and CHARSET is the MULE charset which is the
closest match for the character set supported by that codepage.

This is a fixed list: no `cpNNN-decode-table' variables are examined.  It
names only the codepages which this file does not define itself (774, 770
and 773), each associated with `latin-iso8859-4'."
  ;; Constant alist; the element order is kept as it always was, since
  ;; `codepage-setup' offers these entries as completion candidates.
  '(("774" . latin-iso8859-4)
    ("770" . latin-iso8859-4)
    ("773" . latin-iso8859-4)))
122 ;; A version which doesn't override the coding systems set up by this
123 ;; file. It could still be used for the few missing ones from
;; NOTE(review): the line that opens the argument-reading form is not
;; visible in this excerpt; only the visible steps are annotated.
125 (defun codepage-setup (codepage)
126 "Create a coding system cpCODEPAGE to support the IBM codepage CODEPAGE.
128 These coding systems are meant for encoding and decoding 8-bit non-ASCII
129 characters used by the IBM codepages, typically in conjunction with files
130 read/written by MS-DOS software, or for display on the MS-DOS terminal."
;; Read the codepage with completion over `cp-supported-codepages',
;; ignoring case and requiring a match; "437" is the default value.
132 (let ((completion-ignore-case t
)
133 (candidates (cp-supported-codepages)))
134 (list (completing-read "Setup DOS Codepage (default 437): " candidates
135 nil t nil nil
"437"))))
;; CP is the coding system name, "cp" followed by CODEPAGE.
136 (let ((cp (format "cp%s" codepage
)))
;; Do nothing when a coding system of that name already exists, so the
;; ones set up by this file are not overridden.
137 (unless (coding-system-p (intern cp
))
138 (cp-make-coding-systems-for-codepage
139 cp
(cp-charset-for-codepage cp
) (cp-offset-for-codepage cp
))))))
142 ;; Macro to allow ccl compilation at byte-compile time, avoiding
;; NOTE(review): several lines of this macro (parts of the CCL programs and
;; the head of the coding-system definition call) are not visible in this
;; excerpt; only the visible steps are annotated.
145 (defmacro cp-make-coding-system
(name v
&optional doc-string mnemonic
)
146 "Make coding system NAME for an 8-bit, extended-ASCII character set.
147 V is a 128-long vector of characters to translate the upper half of
148 the character set. DOC-STRING and MNEMONIC are used as the
149 corresponding args of `make-coding-system'. If MNEMONIC isn't given,
151 Return an updated `non-iso-charset-alist'."
;; ENCODER and DECODER are the symbols `encode-NAME' and `decode-NAME';
;; the expansion below defines translation tables under these names.
152 (let* ((encoder (intern (format "encode-%s" name
)))
153 (decoder (intern (format "decode-%s" name
)))
;; Decoding (visible part): r0 receives a charset id chosen from the value
;; in r1 -- `ascii' below 128, otherwise `eight-bit-control' or
;; `eight-bit-graphic' (the test choosing between those two is not visible).
159 (if (r1 < 128) ;; ASCII
160 (r0 = ,(charset-id 'ascii
))
162 (r0 = ,(charset-id 'eight-bit-control
))
163 (r0 = ,(charset-id 'eight-bit-graphic
))))
;; Translate through the DECODER table for this coding system.
164 (translate-character ,decoder r0 r1
)
165 ;; Allow fragmentation on decoding -- relevant for
166 ;; Cyrillic, Greek and, possibly, Arabic and Hebrew.
167 (translate-character utf-translation-table-for-decode r0 r1
)
168 (write-multibyte-character r0 r1
)
;; Encoding (visible part): read a character and translate it through the
;; ENCODER table, then test whether the resulting charset in r0 differs
;; from `ascii', `eight-bit-graphic' and `eight-bit-control'.  What is done
;; when all three tests succeed is on a line not visible here.
174 (read-multibyte-character r0 r1
)
175 (translate-character ,encoder r0 r1
)
176 (if (r0 != ,(charset-id 'ascii
))
177 (if (r0 != ,(charset-id 'eight-bit-graphic
))
178 (if (r0 != ,(charset-id 'eight-bit-control
))
180 (write-repeat r1
)))))))
;; The expansion: build the translation table and the valid-codes list
;; from V, then register the decode table and, from its extra slot 0, the
;; encode table under the DECODER and ENCODER names.
181 `(let ((translation-table (cp-make-translation-table ,v
))
182 (codes (cp-valid-codes ,v
)))
183 (define-translation-table ',decoder translation-table
)
184 (define-translation-table ',encoder
185 (char-table-extra-slot translation-table
0))
187 ',name
4 ,(or mnemonic ?
*)
188 (or ,doc-string
(format "%s encoding" ',name
))
189 (cons ,ccl-decoder
,ccl-encoder
)
190 (list (cons 'safe-chars
(get ',encoder
'translation-table
))
191 (cons 'valid-codes codes
)
192 (cons 'mime-charset
',name
)
193 ;; For Quail translation. Fixme: this should really be
194 ;; a separate table that only translates the coding
195 ;; system's safe-chars.
196 (cons 'translation-table-for-input
'ucs-mule-to-mule-unicode
)))
;; Look up NAME's entry in `non-iso-charset-alist'; a new (NAME . ELT)
;; entry is pushed below and the alist itself is the value of the form.
197 (let ((slot (assq ',name non-iso-charset-alist
))
198 (elt (list nil
; charset list
200 (let (l) ; code range
201 (dolist (elt (reverse codes
))
206 (push (cons ',name elt
) non-iso-charset-alist
)
208 non-iso-charset-alist
)))))
210 (eval-when-compile (defvar non-iso-charset-alist
))
212 ;; These tables were mostly derived by running something like
213 ;; `recode -f cpxxx/..utf-8' on a binary file filled by
214 ;; `(dotimes (i 128) (insert ?? ?\\ (+ 128 i) ?\n))' and then
215 ;; exchanging the ?\� entries for nil. iconv was used instead in some
218 ;; Fixme: Do better for mode-line mnemonics?
220 ;;;###autoload(autoload-coding-system 'cp437 '(require 'code-pages))
221 (cp-make-coding-system
352 ;;;###autoload(autoload-coding-system 'cp737 '(require 'code-pages))
353 (cp-make-coding-system
483 (coding-system-put 'cp737
'mime-charset nil
) ; not in IANA list
485 ;;;###autoload(autoload-coding-system 'cp775 '(require 'code-pages))
486 (cp-make-coding-system
617 ;;;###autoload(autoload-coding-system 'cp850 '(require 'code-pages))
618 (cp-make-coding-system
749 ;;;###autoload(autoload-coding-system 'cp851 '(require 'code-pages))
750 (cp-make-coding-system
881 ;;;###autoload(autoload-coding-system 'cp852 '(require 'code-pages))
882 (cp-make-coding-system
1013 ;;;###autoload(autoload-coding-system 'cp855 '(require 'code-pages))
1014 (cp-make-coding-system
1145 ;;;###autoload(autoload-coding-system 'cp857 '(require 'code-pages))
1146 (cp-make-coding-system
1277 ;;;###autoload(autoload-coding-system 'cp858 '(require 'code-pages))
1278 (cp-make-coding-system
1409 ;;;###autoload(autoload-coding-system 'cp860 '(require 'code-pages))
1410 (cp-make-coding-system
1541 ;;;###autoload(autoload-coding-system 'cp861 '(require 'code-pages))
1542 (cp-make-coding-system
1673 ;;;###autoload(autoload-coding-system 'cp862 '(require 'code-pages))
1674 (cp-make-coding-system
1805 ;;;###autoload(autoload-coding-system 'cp863 '(require 'code-pages))
1806 (cp-make-coding-system
1937 ;;;###autoload(autoload-coding-system 'cp864 '(require 'code-pages))
1938 (cp-make-coding-system
2069 ;;;###autoload(autoload-coding-system 'cp865 '(require 'code-pages))
2070 (cp-make-coding-system
2201 ;;;###autoload(autoload-coding-system 'cp866 '(require 'code-pages))
2202 (cp-make-coding-system
2335 ;;;###autoload(autoload-coding-system 'cp869 '(require 'code-pages))
2336 (cp-make-coding-system
2467 ;;;###autoload(autoload-coding-system 'cp874 '(require 'code-pages))
2468 (cp-make-coding-system
2599 ;;;###autoload(autoload-coding-system 'windows-1250 '(require 'code-pages))
2600 ;;;###autoload(autoload-coding-system 'cp1250 '(require 'code-pages))
2601 (cp-make-coding-system
2732 ;;;###autoload(autoload-coding-system 'windows-1253 '(require 'code-pages))
2733 ;;;###autoload(autoload-coding-system 'cp1253 '(require 'code-pages))
2734 (cp-make-coding-system
2866 ;;;###autoload(autoload-coding-system 'windows-1254 '(require 'code-pages))
2867 ;;;###autoload(autoload-coding-system 'cp1254 '(require 'code-pages))
2868 (cp-make-coding-system
3000 ;;;###autoload(autoload-coding-system 'windows-1255 '(require 'code-pages))
3001 ;;;###autoload(autoload-coding-system 'cp1255 '(require 'code-pages))
3002 (cp-make-coding-system
3134 ;;;###autoload(autoload-coding-system 'windows-1256 '(require 'code-pages))
3135 ;;;###autoload(autoload-coding-system 'cp1256 '(require 'code-pages))
3136 (cp-make-coding-system
3268 ;;;###autoload(autoload-coding-system 'windows-1257 '(require 'code-pages))
3269 ;;;###autoload(autoload-coding-system 'cp1257 '(require 'code-pages))
3270 (cp-make-coding-system
3401 ;;;###autoload(autoload-coding-system 'windows-1258 '(require 'code-pages))
3402 ;;;###autoload(autoload-coding-system 'cp1258 '(require 'code-pages))
3403 (cp-make-coding-system
3534 ;;;###autoload(autoload-coding-system 'next '(require 'code-pages))
3535 (cp-make-coding-system
3665 "NeXTstep encoding." ?N
)
3667 ;;;###autoload(autoload-coding-system 'koi8-t '(require 'code-pages))
3668 (cp-make-coding-system
3669 koi8-t
; used by glibc for tg_TJ
3798 "Unicode-based KOI8-T encoding for Cyrillic")
3799 (coding-system-put 'koi8-t
'mime-charset nil
) ; not in the IANA list
3800 (define-coding-system-alias 'cyrillic-koi8-t
'koi8-t
)
3802 ;; Online final ISO draft:
3804 ;; http://www.evertype.com/standards/iso8859/fdis8859-16-en.pdf
3806 ;; Equivalent National Standard:
3807 ;; Romanian Standard SR 14111:1998, Romanian Standards Institution
3812 ;; "This set of coded graphic characters is intended for use in data and
3813 ;; text processing applications and also for information interchange. The
3814 ;; set contains graphic characters used for general purpose applications in
3815 ;; typical office environments in at least the following languages:
3816 ;; Albanian, Croatian, English, Finnish, French, German, Hungarian, Irish
3817 ;; Gaelic (new orthography), Italian, Latin, Polish, Romanian, and
3818 ;; Slovenian. This set of coded graphic characters may be regarded as a
3819 ;; version of an 8-bit code according to ISO/IEC 2022 or ISO/IEC 4873 at
3820 ;; level 1." [ISO 8859-16:2001(E), p. 1]
3822 ;; This charset is suitable for use in MIME text body parts.
3824 ;; ISO 8859-16 was primarily designed for single-byte encoding the Romanian
3825 ;; language. The UTF-8 charset is the preferred and in today's MIME software
3826 ;; more widely implemented encoding suitable for Romanian.
3827 ;;;###autoload(autoload-coding-system 'iso-8859-16 '(require 'code-pages))
3828 (cp-make-coding-system
3829 iso-latin-10
; consistent with, e.g. Latin-1
3830 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3831 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3928 "Unicode-based encoding for Latin-10 (MIME: ISO-8859-16)"
3930 (coding-system-put 'iso-latin-10
'mime-charset
'iso-8859-16
)
3931 (define-coding-system-alias 'iso-8859-16
'iso-latin-10
)
3932 (define-coding-system-alias 'latin-10
'iso-latin-10
)
3934 ;; Unicode-based alternative which has the possible advantage of
3935 ;; having its relative sparseness specified.
3936 ;;;###autoload(autoload-coding-system 'iso-8859-6 '(require 'code-pages))
3937 (cp-make-coding-system
3938 ;; The base system uses arabic-iso-8bit, but that's not a MIME charset.
3940 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
3941 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4025 nil nil nil nil nil nil nil nil nil nil nil nil nil
]
4026 "Unicode-based Arabic ISO/IEC 8859-6 (MIME: ISO-8859-6)"
4028 (define-coding-system-alias 'arabic-iso-8bit
'iso-8859-6
)
4030 ;;;###autoload(autoload-coding-system 'iso-8859-10 '(require 'code-pages))
4031 (cp-make-coding-system
4033 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4034 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4131 "Unicode-based encoding for Latin-6 (MIME: ISO-8859-10)")
4132 (coding-system-put 'iso-latin-6
'mime-charset
'iso-8859-10
)
4133 (define-coding-system-alias 'iso-8859-10
'iso-latin-6
)
4134 (define-coding-system-alias 'latin-6
'iso-latin-6
)
4136 ;; used by lt_LT, lv_LV, mi_NZ
4137 ;;;###autoload(autoload-coding-system 'iso-8859-13 '(require 'code-pages))
4138 (cp-make-coding-system
4140 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4141 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4239 "Unicode-based encoding for Latin-7 (MIME: ISO-8859-13)"
4240 ?l
) ;; Lithuanian/Latvian
4241 (coding-system-put 'iso-latin-7
'mime-charset
'iso-8859-13
)
4242 (define-coding-system-alias 'iso-8859-13
'iso-latin-7
)
4243 (define-coding-system-alias 'latin-7
'iso-latin-7
)
4245 ;; Fixme: check on the C1 characters which libiconv includes. They
4246 ;; are reproduced below, but are probably wrong. I can't find an
4247 ;; official definition of georgian-ps.
4248 ;;;###autoload(autoload-coding-system 'georgian-ps '(require 'code-pages))
4249 (cp-make-coding-system
4250 georgian-ps
; used by glibc for ka_GE
4380 (coding-system-put 'georgian-ps
'mime-charset nil
) ; not in IANA list
4382 ;;;###autoload(autoload-coding-system 'cp720 '(require 'code-pages))
4383 ;; From http://www.microsoft.com/globaldev/reference/oem/720.htm
4384 (cp-make-coding-system
4514 (coding-system-put 'cp720
'mime-charset nil
) ; not in IANA list
4516 ;;;###autoload(autoload-coding-system 'cp1125 '(require 'code-pages))
4517 ;; http://oss.software.ibm.com/cvs/icu/charset/data/ucm/ibm-1125_P100-2000.ucm
4518 (cp-make-coding-system
4648 (define-coding-system-alias 'ruscii
'cp1125
)
4649 ;; Original name for cp1125, says Serhii Hlodin <hlodin@lutsk.bank.gov.ua>
4650 (define-coding-system-alias 'cp866u
'cp1125
)
4651 (coding-system-put 'cp1125
'mime-charset nil
)
4653 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: Bulgarian DOS
4654 ;; codepage. Table at
4655 ;; <http://en.wikipedia.org/wiki/MIK_Code_page>.
4656 ;;;###autoload(autoload-coding-system 'mik '(require 'code-pages))
4657 (cp-make-coding-system
4659 [?А ?Б ?В ?Г ?Д ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц
4660 ?Ч ?Ш ?Щ ?Ъ ?Ы ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н
4661 ?о ?п ?р ?с ?т ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я ?└ ?┴ ?┬ ?├ ?─
4662 ?┼ ?╣ ?║ ?╚ ?╔ ?╩ ?╦ ?╠ ?═ ?╬ ?┐ ?░ ?▒ ?▓ ?│ ?┤ ?№ ?§ ?╗ ?╝ ?┘ ?┌ ?█
4663 ?▄ ?▌ ?▐ ?▀ ?α ?ß ?Γ ?π ?Σ ?σ ?µ ?τ ?Φ ?Θ ?Ω ?δ ?∞ ?φ ?ε ?∩ ?≡ ?± ?≥
4664 ?≤ ?⌠ ?⌡ ?÷ ?≈ ?° ?∙ ?· ?√ ?ⁿ ?² ?■ ?
])
4665 (coding-system-put 'mik
'mime-charset nil
)
4667 ;; Suggested by Anton Zinoviev <anton@lml.bas.bg>: similar to CP1251
4668 ;; and used for some non-Slavic Cyrillic languages. Table found at
4669 ;; <URL:ftp://ftp.logic.ru/pub/logic/linux/cyr-asian/PT154>. See also
4670 ;; <URL:http://lists.w3.org/Archives/Public/ietf-charsets/2002AprJun/0092.html>,
4671 ;; which suggests it's used in an Asian Cyrillic context.
4672 ;;;###autoload(autoload-coding-system 'pt154 '(require 'code-pages))
4673 (cp-make-coding-system
4675 [?Җ ?Ғ ?Ӯ ?ғ ?„ ?… ?Ҷ ?Ү ?Ҳ ?ү ?Ҡ ?Ӣ ?Ң ?Қ ?Һ ?Ҹ ?җ ?‘ ?’ ?“ ?” ?• ?–
4676 ?— ?ҳ ?ҷ ?ҡ ?ӣ ?ң ?қ ?һ ?ҹ ? ?Ў ?ў ?Ј ?Ө ?Ҙ ?Ұ ?§ ?Ё ?© ?Ә ?\« ?¬ ?ӯ
4677 ?® ?Ҝ ?° ?ұ ?І ?і ?ҙ ?ө ?¶ ?· ?ё ?№ ?ә ?\» ?ј ?Ҫ ?ҫ ?ҝ ?А ?Б ?В ?Г ?Д
4678 ?Е ?Ж ?З ?И ?Й ?К ?Л ?М ?Н ?О ?П ?Р ?С ?Т ?У ?Ф ?Х ?Ц ?Ч ?Ш ?Щ ?Ъ ?Ы
4679 ?Ь ?Э ?Ю ?Я ?а ?б ?в ?г ?д ?е ?ж ?з ?и ?й ?к ?л ?м ?н ?о ?п ?р ?с ?т
4680 ?у ?ф ?х ?ц ?ч ?ш ?щ ?ъ ?ы ?ь ?э ?ю ?я
])
4682 ;;;###autoload(autoload-coding-system 'iso-8859-11 '(require 'code-pages))
4683 (cp-make-coding-system
4685 [nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4686 nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil nil
4687 ? ?ก ?ข ?ฃ ?ค ?ฅ ?ฆ ?ง ?จ ?ฉ ?ช ?ซ ?ฌ ?ญ ?ฎ ?ฏ
4688 ?ฐ ?ฑ ?ฒ ?ณ ?ด ?ต ?ถ ?ท ?ธ ?น ?บ ?ป ?ผ ?ฝ ?พ ?ฟ
4689 ?ภ ?ม ?ย ?ร ?ฤ ?ล ?ฦ ?ว ?ศ ?ษ ?ส ?ห ?ฬ ?อ ?ฮ ?ฯ
4690 ?ะ ?ั ?า ?ำ ?ิ ?ี ?ึ ?ื ?ุ ?ู ?ฺ nil nil nil nil ?฿
4691 ?เ ?แ ?โ ?ใ ?ไ ?ๅ ?ๆ ?็ ?่ ?้ ?๊ ?๋ ?์ ?ํ ?๎ ?๏
4692 ?๐ ?๑ ?๒ ?๓ ?๔ ?๕ ?๖ ?๗ ?๘ ?๙ ?๚ ?๛ nil nil nil nil
]
4693 "ISO-8859-11. This is `thai-tis620' with the addition of no-break-space.")
4696 (let ((w (intern (format "windows-125%d" i
)))
4697 (c (intern (format "cp125%d" i
))))
4698 ;; Define cp125* as aliases for all windows-125*, so on Windows
4699 ;; we can just concat "cp" to the ANSI codepage we get from the system
4700 ;; and not have to worry about whether it should be "cp" or "windows-".
4701 (when (coding-system-p w
)
4702 (define-coding-system-alias c w
)
4703 ;; Compatibility with codepage.el, though cp... are not the
4705 (if (not (assq c non-iso-charset-alist
))
4706 (let ((slot (assq w non-iso-charset-alist
)))
4708 (push (cons c
(cdr slot
)) non-iso-charset-alist
)))))))
4710 (provide 'code-pages
)
4712 ;;; arch-tag: 8b6e3c73-b271-4198-866d-ea6d0ceff1b2
4713 ;;; code-pages.el ends here