Update copyright years.
[emacs.git] / lisp / language / china-util.el
blobb1bf656289168952a5192cc90791040e93398965
1 ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
3 ;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1997, 2001, 2002, 2003, 2004, 2005, 2006
6 ;; National Institute of Advanced Industrial Science and Technology (AIST)
7 ;; Registration Number H14PRO021
9 ;; Keywords: mule, multilingual, Chinese
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; any later version.
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 ;; Boston, MA 02110-1301, USA.
28 ;;; Commentary:
30 ;;; Code:
32 ;; Hz/ZW/EUC-TW encoding stuff
34 ;; HZ is an encoding method for Chinese character set GB2312 used
35 ;; widely in Internet. It is very similar to 7-bit environment of
36 ;; ISO-2022. The difference is that HZ uses the sequence "~{" and
37 ;; "~}" for designating GB2312 and ASCII respectively, hence, it
38 ;; doesn't uses ESC (0x1B) code.
40 ;; ZW is another encoding method for Chinese character set GB2312. It
41 ;; encodes Chinese characters line by line by starting each line with
42 ;; the sequence "zW". It also uses only 7-bit as HZ.
44 ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
45 ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
46 ;; a single shift escape followed by three bytes: the first gives the
47 ;; plane, the second and third the character code. Note that characters
48 ;; of plane 1 are (redundantly) accessible with a single shift escape
49 ;; also.
51 ;; ISO-2022 escape sequence to designate GB2312.
52 (defvar iso2022-gb-designation "\e$A")
53 ;; HZ escape sequence to designate GB2312.
54 (defvar hz-gb-designnation "~{")
55 ;; ISO-2022 escape sequence to designate ASCII.
56 (defvar iso2022-ascii-designation "\e(B")
57 ;; HZ escape sequence to designate ASCII.
58 (defvar hz-ascii-designnation "~}")
59 ;; Regexp of ZW sequence to start GB2312.
60 (defvar zw-start-gb "^zW")
61 ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
62 (defvar hz/zw-start-gb
63 (concat hz-gb-designnation "\\|" zw-start-gb "\\|[^\0-\177]"))
65 (defvar decode-hz-line-continuation nil
66 "Flag to tell if we should care line continuation convention of Hz.")
68 (defconst hz-set-msb-table
69 (eval-when-compile
70 (let ((chars nil)
71 (i 0))
72 (while (< i 33)
73 (push i chars)
74 (setq i (1+ i)))
75 (while (< i 127)
76 (push (+ i 128) chars)
77 (setq i (1+ i)))
78 (apply 'string (nreverse chars)))))
80 ;;;###autoload
81 (defun decode-hz-region (beg end)
82 "Decode HZ/ZW encoded text in the current region.
83 Return the length of resulting text."
84 (interactive "r")
85 (save-excursion
86 (save-restriction
87 (let (pos ch)
88 (narrow-to-region beg end)
90 ;; We, at first, convert HZ/ZW to `euc-china',
91 ;; then decode it.
93 ;; "~\n" -> "\n", "~~" -> "~"
94 (goto-char (point-min))
95 (while (search-forward "~" nil t)
96 (setq ch (following-char))
97 (if (or (= ch ?\n) (= ch ?~)) (delete-char -1)))
99 ;; "^zW...\n" -> Chinese GB2312
100 ;; "~{...~}" -> Chinese GB2312
101 (goto-char (point-min))
102 (setq beg nil)
103 (while (re-search-forward hz/zw-start-gb nil t)
104 (setq pos (match-beginning 0)
105 ch (char-after pos))
106 ;; Record the first position to start conversion.
107 (or beg (setq beg pos))
108 (end-of-line)
109 (setq end (point))
110 (if (>= ch 128) ; 8bit GB2312
112 (goto-char pos)
113 (delete-char 2)
114 (setq end (- end 2))
115 (if (= ch ?z) ; ZW -> euc-china
116 (progn
117 (translate-region (point) end hz-set-msb-table)
118 (goto-char end))
119 (if (search-forward hz-ascii-designnation
120 (if decode-hz-line-continuation nil end)
122 (delete-char -2))
123 (setq end (point))
124 (translate-region pos (point) hz-set-msb-table))))
125 (if beg
126 (decode-coding-region beg end 'euc-china)))
127 (- (point-max) (point-min)))))
129 ;;;###autoload
130 (defun decode-hz-buffer ()
131 "Decode HZ/ZW encoded text in the current buffer."
132 (interactive)
133 (decode-hz-region (point-min) (point-max)))
135 ;;;###autoload
136 (defun encode-hz-region (beg end)
137 "Encode the text in the current region to HZ.
138 Return the length of resulting text."
139 (interactive "r")
140 (save-excursion
141 (save-restriction
142 (narrow-to-region beg end)
144 ;; "~" -> "~~"
145 (goto-char (point-min))
146 (while (search-forward "~" nil t) (insert ?~))
148 ;; Chinese GB2312 -> "~{...~}"
149 (goto-char (point-min))
150 (if (re-search-forward "\\cc" nil t)
151 (let (pos)
152 (goto-char (setq pos (match-beginning 0)))
153 (encode-coding-region pos (point-max) 'iso-2022-7bit)
154 (goto-char pos)
155 (while (search-forward iso2022-gb-designation nil t)
156 (delete-char -3)
157 (insert hz-gb-designnation))
158 (goto-char pos)
159 (while (search-forward iso2022-ascii-designation nil t)
160 (delete-char -3)
161 (insert hz-ascii-designnation))))
162 (- (point-max) (point-min)))))
164 ;;;###autoload
165 (defun encode-hz-buffer ()
166 "Encode the text in the current buffer to HZ."
167 (interactive)
168 (encode-hz-region (point-min) (point-max)))
170 ;; The following sets up a translation table (big5-to-cns) from Big 5
171 ;; to CNS encoding, using some auxiliary functions to make the code
172 ;; more readable.
174 ;; Many kudos to Himi! The used code has been adapted from his
175 ;; mule-ucs package.
177 (eval-when-compile
178 (defun big5-to-flat-code (num)
179 "Convert NUM in Big 5 encoding to a `flat code'.
180 0xA140 will be mapped to position 0, 0xA141 to position 1, etc.
181 There are no gaps in the flat code."
183 (let ((hi (/ num 256))
184 (lo (% num 256)))
185 (+ (* 157 (- hi #xa1))
186 (- lo (if (>= lo #xa1) 98 64)))))
188 (defun flat-code-to-big5 (num)
189 "Convert NUM from a `flat code' to Big 5 encoding.
190 This is the inverse function of `big5-to-flat-code'."
192 (let ((hi (/ num 157))
193 (lo (% num 157)))
194 (+ (* 256 (+ hi #xa1))
195 (+ lo (if (< lo 63) 64 98)))))
197 (defun euc-to-flat-code (num)
198 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
199 0x2121 will be mapped to position 0, 0x2122 to position 1, etc.
200 There are no gaps in the flat code."
202 (let ((hi (/ num 256))
203 (lo (% num 256)))
204 (+ (* 94 (- hi #x21))
205 (- lo #x21))))
207 (defun flat-code-to-euc (num)
208 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
209 The inverse function of `euc-to-flat-code'. The high and low bytes are
210 returned in a list."
212 (let ((hi (/ num 94))
213 (lo (% num 94)))
214 (list (+ hi #x21) (+ lo #x21))))
216 (defun expand-euc-big5-alist (alist)
217 "Create a translation table and fills it with data given in ALIST.
218 Elements of ALIST can be either given as
220 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
222 or as
224 (euc-character . big5-charcode)
226 The former maps a range of glyphs in an EUC charset (where STARTCHAR
227 is in GL representation) to a certain range of Big 5 encoded
228 characters, the latter maps a single glyph. Glyphs which can't be
229 mapped will be represented with the byte 0xFF.
231 The return value is the filled translation table."
233 (let ((chartable (make-char-table 'translation-table #xFF))
234 char
235 big5
238 codepoint
239 charset)
240 (dolist (elem alist)
241 (setq char (car elem)
242 big5 (cdr elem))
243 (cond ((and (consp char)
244 (consp big5))
245 (setq i (big5-to-flat-code (car big5))
246 end (big5-to-flat-code (cdr big5))
247 codepoint (euc-to-flat-code (cdr char))
248 charset (car char))
249 (while (>= end i)
250 (aset chartable
251 (decode-big5-char (flat-code-to-big5 i))
252 (apply (function make-char)
253 charset
254 (flat-code-to-euc codepoint)))
255 (setq i (1+ i)
256 codepoint (1+ codepoint))))
257 ((and (char-valid-p char)
258 (numberp big5))
259 (setq i (decode-big5-char big5))
260 (aset chartable i char))
262 (error "Unknown slot type: %S" elem))))
263 ;; the return value
264 chartable)))
266 ;; All non-CNS encodings are commented out.
268 (define-translation-table 'big5-to-cns
269 (eval-when-compile
270 (expand-euc-big5-alist
272 ;; Symbols
273 ((chinese-cns11643-1 . #x2121) . (#xA140 . #xA1F5))
274 (?\e$(G"X\e(B . #xA1F6)
275 (?\e$(G"W\e(B . #xA1F7)
276 ((chinese-cns11643-1 . #x2259) . (#xA1F8 . #xA2AE))
277 ((chinese-cns11643-1 . #x2421) . (#xA2AF . #xA3BF))
278 ;; Control codes (vendor dependent)
279 ((chinese-cns11643-1 . #x4221) . (#xA3C0 . #xA3E0))
280 ;; Level 1 Ideographs
281 ((chinese-cns11643-1 . #x4421) . (#xA440 . #xACFD))
282 (?\e$(GWS\e(B . #xACFE)
283 ((chinese-cns11643-1 . #x5323) . (#xAD40 . #xAFCF))
284 ((chinese-cns11643-1 . #x5754) . (#xAFD0 . #xBBC7))
285 ((chinese-cns11643-1 . #x6B51) . (#xBBC8 . #xBE51))
286 (?\e$(GkP\e(B . #xBE52)
287 ((chinese-cns11643-1 . #x6F5C) . (#xBE53 . #xC1AA))
288 ((chinese-cns11643-1 . #x7536) . (#xC1AB . #xC2CA))
289 (?\e$(Gu5\e(B . #xC2CB)
290 ((chinese-cns11643-1 . #x7737) . (#xC2CC . #xC360))
291 ((chinese-cns11643-1 . #x782E) . (#xC361 . #xC3B8))
292 (?\e$(Gxe\e(B . #xC3B9)
293 (?\e$(Gxd\e(B . #xC3BA)
294 ((chinese-cns11643-1 . #x7866) . (#xC3BB . #xC455))
295 (?\e$(Gx-\e(B . #xC456)
296 ((chinese-cns11643-1 . #x7962) . (#xC457 . #xC67E))
297 ;; Symbols
298 ((chinese-cns11643-1 . #x2621) . (#xC6A1 . #xC6BE))
299 ;; Radicals
300 (?\e$(G'#\e(B . #xC6BF)
301 (?\e$(G'$\e(B . #xC6C0)
302 (?\e$(G'&\e(B . #xC6C1)
303 (?\e$(G'(\e(B . #xC6C2)
304 (?\e$(G'-\e(B . #xC6C3)
305 (?\e$(G'.\e(B . #xC6C4)
306 (?\e$(G'/\e(B . #xC6C5)
307 (?\e$(G'4\e(B . #xC6C6)
308 (?\e$(G'7\e(B . #xC6C7)
309 (?\e$(G':\e(B . #xC6C8)
310 (?\e$(G'<\e(B . #xC6C9)
311 (?\e$(G'B\e(B . #xC6CA)
312 (?\e$(G'G\e(B . #xC6CB)
313 (?\e$(G'N\e(B . #xC6CC)
314 (?\e$(G'S\e(B . #xC6CD)
315 (?\e$(G'T\e(B . #xC6CE)
316 (?\e$(G'U\e(B . #xC6CF)
317 (?\e$(G'Y\e(B . #xC6D0)
318 (?\e$(G'Z\e(B . #xC6D1)
319 (?\e$(G'a\e(B . #xC6D2)
320 (?\e$(G'f\e(B . #xC6D3)
321 (?\e$(G()\e(B . #xC6D4)
322 (?\e$(G(*\e(B . #xC6D5)
323 (?\e$(G(c\e(B . #xC6D6)
324 (?\e$(G(l\e(B . #xC6D7)
325 ;; Diacritical Marks
326 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
327 ;; Japanese Kana Supplement
328 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
329 ;; Japanese Hiragana
330 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
331 ;; Japanese Katakana
332 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
333 ;; Cyrillic Characters
334 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
335 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
336 ;; Special Chinese Characters
337 (?\e$(J!#\e(B . #xC879)
338 (?\e$(J!$\e(B . #xC87B)
339 (?\e$(J!*\e(B . #xC87D)
340 (?\e$(J!R\e(B . #xC8A2)
342 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
343 ; (?\e$B"L\e(B . #xC8CD)
344 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
345 ; (?\e$(D"C\e(B . #xC8CE)
347 ;; GB 2312 characters
348 ; (?\e$A!d\e(B . #xC8CF)
349 ; (?\e$A!e\e(B . #xC8D0)
350 ;;;;; C8D1 - Japanese `(\e$B3t\e(B)'
351 ; (?\e$A!m\e(B . #xC8D2)
352 ;;;;; C8D2 - Tel.
354 ;; Level 2 Ideographs
355 ((chinese-cns11643-2 . #x2121) . (#xC940 . #xC949))
356 (?\e$(GDB\e(B . #xC94A);; a duplicate of #xA461
357 ((chinese-cns11643-2 . #x212B) . (#xC94B . #xC96B))
358 ((chinese-cns11643-2 . #x214D) . (#xC96C . #xC9BD))
359 (?\e$(H!L\e(B . #xC9BE)
360 ((chinese-cns11643-2 . #x217D) . (#xC9BF . #xC9EC))
361 ((chinese-cns11643-2 . #x224E) . (#xC9ED . #xCAF6))
362 (?\e$(H"M\e(B . #xCAF7)
363 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
364 (?\e$(H>c\e(B . #xD6CC)
365 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
366 (?\e$(H?j\e(B . #xD77A)
367 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
368 (?\e$(H7o\e(B . #xDADF)
369 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
370 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
371 (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1
372 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
373 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
374 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
375 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
376 (?\e$(HUK\e(B . #xEBF1)
377 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
378 (?\e$(HW"\e(B . #xECDE)
379 ((chinese-cns11643-2 . #x5C6A) . (#xECDF . #xEDA9))
380 ((chinese-cns11643-2 . #x5D75) . (#xEDAA . #xEEEA))
381 (?\e$(Hd/\e(B . #xEEEB)
382 ((chinese-cns11643-2 . #x6039) . (#xEEEC . #xF055))
383 (?\e$(H]t\e(B . #xF056)
384 ((chinese-cns11643-2 . #x6243) . (#xF057 . #xF0CA))
385 (?\e$(HZ(\e(B . #xF0CB)
386 ((chinese-cns11643-2 . #x6337) . (#xF0CC . #xF162))
387 ((chinese-cns11643-2 . #x6430) . (#xF163 . #xF16A))
388 (?\e$(Hga\e(B . #xF16B)
389 ((chinese-cns11643-2 . #x6438) . (#xF16C . #xF267))
390 (?\e$(Hi4\e(B . #xF268)
391 ((chinese-cns11643-2 . #x6573) . (#xF269 . #xF2C2))
392 ((chinese-cns11643-2 . #x664E) . (#xF2C3 . #xF374))
393 ((chinese-cns11643-2 . #x6762) . (#xF375 . #xF465))
394 ((chinese-cns11643-2 . #x6935) . (#xF466 . #xF4B4))
395 (?\e$(HfM\e(B . #xF4B5)
396 ((chinese-cns11643-2 . #x6962) . (#xF4B6 . #xF4FC))
397 ((chinese-cns11643-2 . #x6A4C) . (#xF4FD . #xF662))
398 (?\e$(HjK\e(B . #xF663)
399 ((chinese-cns11643-2 . #x6C52) . (#xF664 . #xF976))
400 ((chinese-cns11643-2 . #x7167) . (#xF977 . #xF9C3))
401 (?\e$(Hqf\e(B . #xF9C4)
402 (?\e$(Hr4\e(B . #xF9C5)
403 (?\e$(Hr@\e(B . #xF9C6)
404 ((chinese-cns11643-2 . #x7235) . (#xF9C7 . #xF9D1))
405 ((chinese-cns11643-2 . #x7241) . (#xF9D2 . #xF9D5))
407 ;; Additional Ideographs
408 (?\e$(IC7\e(B . #xF9D6)
409 (?\e$(IOP\e(B . #xF9D7)
410 (?\e$(IDN\e(B . #xF9D8)
411 (?\e$(IPJ\e(B . #xF9D9)
412 (?\e$(I,]\e(B . #xF9DA)
413 (?\e$(I=~\e(B . #xF9DB)
414 (?\e$(IK\\e(B . #xF9DC)
420 (provide 'china-util)
422 ;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
423 ;;; china-util.el ends here