1 ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
3 ;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1997, 2001, 2002, 2003, 2004, 2005, 2006
6 ;; National Institute of Advanced Industrial Science and Technology (AIST)
7 ;; Registration Number H14PRO021
9 ;; Keywords: mule, multilingual, Chinese
11 ;; This file is part of GNU Emacs.
13 ;; GNU Emacs is free software; you can redistribute it and/or modify
14 ;; it under the terms of the GNU General Public License as published by
15 ;; the Free Software Foundation; either version 2, or (at your option)
18 ;; GNU Emacs is distributed in the hope that it will be useful,
19 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
20 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 ;; GNU General Public License for more details.
23 ;; You should have received a copy of the GNU General Public License
24 ;; along with GNU Emacs; see the file COPYING. If not, write to the
25 ;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
26 ;; Boston, MA 02110-1301, USA.
32 ;; Hz/ZW/EUC-TW encoding stuff
34 ;; HZ is an encoding method for Chinese character set GB2312 used
35 ;; widely in Internet. It is very similar to 7-bit environment of
36 ;; ISO-2022. The difference is that HZ uses the sequence "~{" and
37 ;; "~}" for designating GB2312 and ASCII respectively, hence, it
38 ;; doesn't uses ESC (0x1B) code.
40 ;; ZW is another encoding method for Chinese character set GB2312. It
41 ;; encodes Chinese characters line by line by starting each line with
42 ;; the sequence "zW". It also uses only 7-bit as HZ.
44 ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
45 ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
46 ;; a single shift escape followed by three bytes: the first gives the
47 ;; plane, the second and third the character code. Note that characters
48 ;; of plane 1 are (redundantly) accessible with a single shift escape
51 ;; ISO-2022 escape sequence to designate GB2312.
52 (defvar iso2022-gb-designation
"\e$A")
53 ;; HZ escape sequence to designate GB2312.
54 (defvar hz-gb-designnation
"~{")
55 ;; ISO-2022 escape sequence to designate ASCII.
56 (defvar iso2022-ascii-designation
"\e(B")
57 ;; HZ escape sequence to designate ASCII.
58 (defvar hz-ascii-designnation
"~}")
59 ;; Regexp of ZW sequence to start GB2312.
60 (defvar zw-start-gb
"^zW")
61 ;; Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
62 (defvar hz
/zw-start-gb
63 (concat hz-gb-designnation
"\\|" zw-start-gb
"\\|[^\0-\177]"))
65 (defvar decode-hz-line-continuation nil
66 "Flag to tell if we should care line continuation convention of Hz.")
68 (defconst hz-set-msb-table
76 (push (+ i
128) chars
)
78 (apply 'string
(nreverse chars
)))))
81 (defun decode-hz-region (beg end
)
82 "Decode HZ/ZW encoded text in the current region.
83 Return the length of resulting text."
88 (narrow-to-region beg end
)
90 ;; We, at first, convert HZ/ZW to `euc-china',
93 ;; "~\n" -> "\n", "~~" -> "~"
94 (goto-char (point-min))
95 (while (search-forward "~" nil t
)
96 (setq ch
(following-char))
97 (if (or (= ch ?
\n) (= ch ?~
)) (delete-char -
1)))
99 ;; "^zW...\n" -> Chinese GB2312
100 ;; "~{...~}" -> Chinese GB2312
101 (goto-char (point-min))
103 (while (re-search-forward hz
/zw-start-gb nil t
)
104 (setq pos
(match-beginning 0)
106 ;; Record the first position to start conversion.
107 (or beg
(setq beg pos
))
110 (if (>= ch
128) ; 8bit GB2312
115 (if (= ch ?z
) ; ZW -> euc-china
117 (translate-region (point) end hz-set-msb-table
)
119 (if (search-forward hz-ascii-designnation
120 (if decode-hz-line-continuation nil end
)
124 (translate-region pos
(point) hz-set-msb-table
))))
126 (decode-coding-region beg end
'euc-china
)))
127 (- (point-max) (point-min)))))
130 (defun decode-hz-buffer ()
131 "Decode HZ/ZW encoded text in the current buffer."
133 (decode-hz-region (point-min) (point-max)))
136 (defun encode-hz-region (beg end
)
137 "Encode the text in the current region to HZ.
138 Return the length of resulting text."
142 (narrow-to-region beg end
)
145 (goto-char (point-min))
146 (while (search-forward "~" nil t
) (insert ?~
))
148 ;; Chinese GB2312 -> "~{...~}"
149 (goto-char (point-min))
150 (if (re-search-forward "\\cc" nil t
)
152 (goto-char (setq pos
(match-beginning 0)))
153 (encode-coding-region pos
(point-max) 'iso-2022-7bit
)
155 (while (search-forward iso2022-gb-designation nil t
)
157 (insert hz-gb-designnation
))
159 (while (search-forward iso2022-ascii-designation nil t
)
161 (insert hz-ascii-designnation
))))
162 (- (point-max) (point-min)))))
165 (defun encode-hz-buffer ()
166 "Encode the text in the current buffer to HZ."
168 (encode-hz-region (point-min) (point-max)))
170 ;; The following sets up a translation table (big5-to-cns) from Big 5
171 ;; to CNS encoding, using some auxiliary functions to make the code
174 ;; Many kudos to Himi! The used code has been adapted from his
178 (defun big5-to-flat-code (num)
179 "Convert NUM in Big 5 encoding to a `flat code'.
180 0xA140 will be mapped to position 0, 0xA141 to position 1, etc.
181 There are no gaps in the flat code."
183 (let ((hi (/ num
256))
185 (+ (* 157 (- hi
#xa1
))
186 (- lo
(if (>= lo
#xa1
) 98 64)))))
188 (defun flat-code-to-big5 (num)
189 "Convert NUM from a `flat code' to Big 5 encoding.
190 This is the inverse function of `big5-to-flat-code'."
192 (let ((hi (/ num
157))
194 (+ (* 256 (+ hi
#xa1
))
195 (+ lo
(if (< lo
63) 64 98)))))
197 (defun euc-to-flat-code (num)
198 "Convert NUM in EUC encoding (in GL representation) to a `flat code'.
199 0x2121 will be mapped to position 0, 0x2122 to position 1, etc.
200 There are no gaps in the flat code."
202 (let ((hi (/ num
256))
204 (+ (* 94 (- hi
#x21
))
207 (defun flat-code-to-euc (num)
208 "Convert NUM from a `flat code' to EUC encoding (in GL representation).
209 The inverse function of `euc-to-flat-code'. The high and low bytes are
212 (let ((hi (/ num
94))
214 (list (+ hi
#x21
) (+ lo
#x21
))))
216 (defun expand-euc-big5-alist (alist)
217 "Create a translation table and fills it with data given in ALIST.
218 Elements of ALIST can be either given as
220 ((euc-charset . startchar) . (big5-range-begin . big5-range-end))
224 (euc-character . big5-charcode)
226 The former maps a range of glyphs in an EUC charset (where STARTCHAR
227 is in GL representation) to a certain range of Big 5 encoded
228 characters, the latter maps a single glyph. Glyphs which can't be
229 mapped will be represented with the byte 0xFF.
231 The return value is the filled translation table."
233 (let ((chartable (make-char-table 'translation-table
#xFF
))
241 (setq char
(car elem
)
243 (cond ((and (consp char
)
245 (setq i
(big5-to-flat-code (car big5
))
246 end
(big5-to-flat-code (cdr big5
))
247 codepoint
(euc-to-flat-code (cdr char
))
251 (decode-big5-char (flat-code-to-big5 i
))
252 (apply (function make-char
)
254 (flat-code-to-euc codepoint
)))
256 codepoint
(1+ codepoint
))))
257 ((and (char-valid-p char
)
259 (setq i
(decode-big5-char big5
))
260 (aset chartable i char
))
262 (error "Unknown slot type: %S" elem
))))
266 ;; All non-CNS encodings are commented out.
268 (define-translation-table 'big5-to-cns
270 (expand-euc-big5-alist
273 ((chinese-cns11643-1 .
#x2121
) .
(#xA140 .
#xA1F5
))
274 (?
\e$
(G"X\e(B . #xA1F6)
275 (?\e$(G"W
\e(B .
#xA1F7
)
276 ((chinese-cns11643-1 .
#x2259
) .
(#xA1F8 .
#xA2AE
))
277 ((chinese-cns11643-1 .
#x2421
) .
(#xA2AF .
#xA3BF
))
278 ;; Control codes (vendor dependent)
279 ((chinese-cns11643-1 .
#x4221
) .
(#xA3C0 .
#xA3E0
))
280 ;; Level 1 Ideographs
281 ((chinese-cns11643-1 .
#x4421
) .
(#xA440 .
#xACFD
))
282 (?
\e$
(GWS\e(B .
#xACFE
)
283 ((chinese-cns11643-1 .
#x5323
) .
(#xAD40 .
#xAFCF
))
284 ((chinese-cns11643-1 .
#x5754
) .
(#xAFD0 .
#xBBC7
))
285 ((chinese-cns11643-1 .
#x6B51
) .
(#xBBC8 .
#xBE51
))
286 (?
\e$
(GkP\e(B .
#xBE52
)
287 ((chinese-cns11643-1 .
#x6F5C
) .
(#xBE53 .
#xC1AA
))
288 ((chinese-cns11643-1 .
#x7536
) .
(#xC1AB .
#xC2CA
))
289 (?
\e$
(Gu5\e(B .
#xC2CB
)
290 ((chinese-cns11643-1 .
#x7737
) .
(#xC2CC .
#xC360
))
291 ((chinese-cns11643-1 .
#x782E
) .
(#xC361 .
#xC3B8
))
292 (?
\e$
(Gxe\e(B .
#xC3B9
)
293 (?
\e$
(Gxd\e(B .
#xC3BA
)
294 ((chinese-cns11643-1 .
#x7866
) .
(#xC3BB .
#xC455
))
295 (?
\e$
(Gx-\e(B .
#xC456
)
296 ((chinese-cns11643-1 .
#x7962
) .
(#xC457 .
#xC67E
))
298 ((chinese-cns11643-1 .
#x2621
) .
(#xC6A1 .
#xC6BE
))
300 (?
\e$
(G'#\e(B .
#xC6BF
)
301 (?
\e$
(G'$
\e(B .
#xC6C0
)
302 (?
\e$
(G'&\e(B .
#xC6C1
)
303 (?
\e$
(G'(\e(B .
#xC6C2
)
304 (?
\e$
(G'-
\e(B .
#xC6C3
)
305 (?
\e$
(G'.
\e(B .
#xC6C4
)
306 (?
\e$
(G'/\e(B .
#xC6C5
)
307 (?
\e$
(G'4\e(B .
#xC6C6
)
308 (?
\e$
(G'7\e(B .
#xC6C7
)
309 (?
\e$
(G':\e(B .
#xC6C8
)
310 (?
\e$
(G'<\e(B .
#xC6C9
)
311 (?
\e$
(G'B
\e(B .
#xC6CA
)
312 (?
\e$
(G'G
\e(B .
#xC6CB
)
313 (?
\e$
(G'N
\e(B .
#xC6CC
)
314 (?
\e$
(G'S
\e(B .
#xC6CD
)
315 (?
\e$
(G'T
\e(B .
#xC6CE
)
316 (?
\e$
(G'U
\e(B .
#xC6CF
)
317 (?
\e$
(G'Y
\e(B .
#xC6D0
)
318 (?
\e$
(G'Z
\e(B .
#xC6D1
)
319 (?
\e$
(G'a
\e(B .
#xC6D2
)
320 (?
\e$
(G'f
\e(B .
#xC6D3
)
321 (?
\e$
(G()\e(B .
#xC6D4
)
322 (?
\e$
(G(*\e(B .
#xC6D5
)
323 (?
\e$
(G(c\e(B .
#xC6D6
)
324 (?
\e$
(G(l\e(B .
#xC6D7
)
326 ; ((japanese-jisx0208 . #x212F) . (#xC6D8 . #xC6D9))
327 ;; Japanese Kana Supplement
328 ; ((japanese-jisx0208 . #x2133) . (#xC6DA . #xC6E3))
330 ; ((japanese-jisx0208 . #x2421) . (#xC6E7 . #xC77A))
332 ; ((japanese-jisx0208 . #x2521) . (#xC77B . #xC7F2))
333 ;; Cyrillic Characters
334 ; ((japanese-jisx0208 . #x2721) . (#xC7F3 . #xC854))
335 ; ((japanese-jisx0208 . #x2751) . (#xC855 . #xC875))
336 ;; Special Chinese Characters
337 (?
\e$
(J!#\e(B .
#xC879
)
338 (?
\e$
(J!$
\e(B .
#xC87B
)
339 (?
\e$
(J!*\e(B .
#xC87D
)
340 (?
\e$
(J!R
\e(B .
#xC8A2
)
342 ;; JIS X 0208 NOT SIGN (cf. U+00AC)
343 ; (?\e$B"L\e(B . #xC8CD)
344 ;; JIS X 0212 BROKEN BAR (cf. U+00A6)
345 ; (?\e$(D"C\e(B . #xC8CE)
347 ;; GB 2312 characters
348 ; (?\e$A!d\e(B . #xC8CF)
349 ; (?\e$A!e\e(B . #xC8D0)
350 ;;;;; C8D1 - Japanese `(\e$B3t\e(B)'
351 ; (?\e$A!m\e(B . #xC8D2)
354 ;; Level 2 Ideographs
355 ((chinese-cns11643-2 .
#x2121
) .
(#xC940 .
#xC949
))
356 (?
\e$
(GDB\e(B .
#xC94A
);; a duplicate of #xA461
357 ((chinese-cns11643-2 .
#x212B
) .
(#xC94B .
#xC96B
))
358 ((chinese-cns11643-2 .
#x214D
) .
(#xC96C .
#xC9BD
))
359 (?
\e$
(H!L
\e(B .
#xC9BE
)
360 ((chinese-cns11643-2 .
#x217D
) .
(#xC9BF .
#xC9EC
))
361 ((chinese-cns11643-2 .
#x224E
) .
(#xC9ED .
#xCAF6
))
362 (?
\e$
(H"M\e(B . #xCAF7)
363 ((chinese-cns11643-2 . #x2439) . (#xCAF8 . #xD6CB))
364 (?\e$(H>c\e(B . #xD6CC)
365 ((chinese-cns11643-2 . #x3770) . (#xD6CD . #xD779))
366 (?\e$(H?j\e(B . #xD77A)
367 ((chinese-cns11643-2 . #x387E) . (#xD77B . #xDADE))
368 (?\e$(H7o\e(B . #xDADF)
369 ((chinese-cns11643-2 . #x3E64) . (#xDAE0 . #xDBA6))
370 ((chinese-cns11643-2 . #x3F6B) . (#xDBA7 . #xDDFB))
371 (?\e$(HAv\e(B . #xDDFC);; a duplicate of #xDCD1
372 ((chinese-cns11643-2 . #x4424) . (#xDDFD . #xE8A2))
373 ((chinese-cns11643-2 . #x554C) . (#xE8A3 . #xE975))
374 ((chinese-cns11643-2 . #x5723) . (#xE976 . #xEB5A))
375 ((chinese-cns11643-2 . #x5A29) . (#xEB5B . #xEBF0))
376 (?\e$(HUK\e(B . #xEBF1)
377 ((chinese-cns11643-2 . #x5B3F) . (#xEBF2 . #xECDD))
378 (?\e$(HW"\e(B .
#xECDE
)
379 ((chinese-cns11643-2 .
#x5C6A
) .
(#xECDF .
#xEDA9
))
380 ((chinese-cns11643-2 .
#x5D75
) .
(#xEDAA .
#xEEEA
))
381 (?
\e$
(Hd/\e(B .
#xEEEB
)
382 ((chinese-cns11643-2 .
#x6039
) .
(#xEEEC .
#xF055
))
383 (?
\e$
(H]t
\e(B .
#xF056
)
384 ((chinese-cns11643-2 .
#x6243
) .
(#xF057 .
#xF0CA
))
385 (?
\e$
(HZ(\e(B .
#xF0CB
)
386 ((chinese-cns11643-2 .
#x6337
) .
(#xF0CC .
#xF162
))
387 ((chinese-cns11643-2 .
#x6430
) .
(#xF163 .
#xF16A
))
388 (?
\e$
(Hga\e(B .
#xF16B
)
389 ((chinese-cns11643-2 .
#x6438
) .
(#xF16C .
#xF267
))
390 (?
\e$
(Hi4\e(B .
#xF268
)
391 ((chinese-cns11643-2 .
#x6573
) .
(#xF269 .
#xF2C2
))
392 ((chinese-cns11643-2 .
#x664E
) .
(#xF2C3 .
#xF374
))
393 ((chinese-cns11643-2 .
#x6762
) .
(#xF375 .
#xF465
))
394 ((chinese-cns11643-2 .
#x6935
) .
(#xF466 .
#xF4B4
))
395 (?
\e$
(HfM\e(B .
#xF4B5
)
396 ((chinese-cns11643-2 .
#x6962
) .
(#xF4B6 .
#xF4FC
))
397 ((chinese-cns11643-2 .
#x6A4C
) .
(#xF4FD .
#xF662
))
398 (?
\e$
(HjK\e(B .
#xF663
)
399 ((chinese-cns11643-2 .
#x6C52
) .
(#xF664 .
#xF976
))
400 ((chinese-cns11643-2 .
#x7167
) .
(#xF977 .
#xF9C3
))
401 (?
\e$
(Hqf\e(B .
#xF9C4
)
402 (?
\e$
(Hr4\e(B .
#xF9C5
)
403 (?
\e$
(Hr@\e(B .
#xF9C6
)
404 ((chinese-cns11643-2 .
#x7235
) .
(#xF9C7 .
#xF9D1
))
405 ((chinese-cns11643-2 .
#x7241
) .
(#xF9D2 .
#xF9D5
))
407 ;; Additional Ideographs
408 (?
\e$
(IC7\e(B .
#xF9D6
)
409 (?
\e$
(IOP\e(B .
#xF9D7
)
410 (?
\e$
(IDN\e(B .
#xF9D8
)
411 (?
\e$
(IPJ\e(B .
#xF9D9
)
412 (?
\e$
(I,]\e(B .
#xF9DA
)
413 (?
\e$
(I=~
\e(B .
#xF9DB
)
414 (?
\e$
(IK\
\e(B .
#xF9DC
)
420 (provide 'china-util
)
422 ;;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
423 ;;; china-util.el ends here