1 ;;; china-util.el --- utilities for Chinese -*- coding: iso-2022-7bit -*-
3 ;; Copyright (C) 1995, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
4 ;; Free Software Foundation, Inc.
5 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 ;; 2005, 2006, 2007, 2008, 2009, 2010
7 ;; National Institute of Advanced Industrial Science and Technology (AIST)
8 ;; Registration Number H14PRO021
10 ;; National Institute of Advanced Industrial Science and Technology (AIST)
11 ;; Registration Number H13PRO009
13 ;; Keywords: mule, multilingual, Chinese
15 ;; This file is part of GNU Emacs.
17 ;; GNU Emacs is free software: you can redistribute it and/or modify
18 ;; it under the terms of the GNU General Public License as published by
19 ;; the Free Software Foundation, either version 3 of the License, or
20 ;; (at your option) any later version.
22 ;; GNU Emacs is distributed in the hope that it will be useful,
23 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
24 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 ;; GNU General Public License for more details.
27 ;; You should have received a copy of the GNU General Public License
28 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
34 ;; Hz/ZW/EUC-TW encoding stuff
;; HZ is an encoding method for the Chinese character set GB2312 that
;; is widely used on the Internet.  It is very similar to the 7-bit
;; environment of ISO-2022.  The difference is that HZ uses the
;; sequences "~{" and "~}" to designate GB2312 and ASCII respectively;
;; hence, it doesn't use the ESC (0x1B) code.
;; ZW is another encoding method for the Chinese character set GB2312.
;; It encodes Chinese characters line by line, starting each such line
;; with the sequence "zW".  Like HZ, it uses only 7-bit codes.
;; EUC-TW is similar to EUC-KS or EUC-JP.  Its main character set is
;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
;; a single shift escape followed by three bytes: the first gives the
;; plane, the second and third the character code.  Note that characters
;; of plane 1 are (redundantly) accessible with a single shift escape
;; also.
;; NOTE: the spelling "designnation" in two of the names below is
;; historical; the names are kept as they are because other code
;; refers to them.

(defvar iso2022-gb-designation "\e$A"
  "ISO-2022 escape sequence to designate GB2312.")

(defvar hz-gb-designnation "~{"
  "HZ escape sequence to designate GB2312.
This is a literal string, not a regexp.")

(defvar iso2022-ascii-designation "\e(B"
  "ISO-2022 escape sequence to designate ASCII.")

(defvar hz-ascii-designnation "~}"
  "HZ escape sequence to designate ASCII.
This is a literal string, not a regexp.")

(defvar zw-start-gb "^zW"
  "Regexp of ZW sequence to start GB2312.")

(defvar hz/zw-start-gb
  ;; `hz-gb-designnation' is a literal string, so quote it before
  ;; splicing it into the regexp; `zw-start-gb' is already a regexp.
  (concat (regexp-quote hz-gb-designnation)
          "\\|" zw-start-gb
          "\\|[^\0-\177]")
  "Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
It matches the HZ designation `hz-gb-designnation', the ZW line
prefix `zw-start-gb', or any character whose code is above 127.")

(defvar decode-hz-line-continuation nil
  "Flag to tell if we should care line continuation convention of Hz.
When nil, the search for the closing `hz-ascii-designnation' of a
GB2312 run is limited to the line on which the run starts.
When non-nil, the search is not limited to that line.")
(defconst hz-set-msb-table
  (eval-when-compile
    (let ((chars nil))
      ;; Codes 0..32 (control characters and space) map to themselves;
      ;; codes 33..126 map to the raw eight-bit character whose code is
      ;; the original code plus 128, i.e. with the top bit set.
      (dotimes (i 127)
        (push (if (< i 33)
                  i
                (decode-char 'eight-bit (+ i 128)))
              chars))
      (apply 'string (nreverse chars))))
  "Translation table that sets the most significant bit of 7-bit codes.
The value is a string of 127 characters, indexed by character code,
for use as the TABLE argument of `translate-region'.  Codes 0 to 32
are mapped to themselves; codes 33 to 126 are mapped to the
`eight-bit' character with code N + 128.")
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
When called from Lisp, BEG and END delimit the text to decode.
The text is first rewritten into `euc-china' form and then decoded
with `decode-coding-region'.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let ((case-fold-search nil)      ; "zW" and the designations are
                                        ; case-sensitive; without this a
                                        ; line starting with "zw" or "ZW"
                                        ; would match `zw-start-gb'.
            pos ch)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ/ZW to `euc-china',
        ;; then decode it.

        ;; "~\n" -> "\n", "~~" -> "~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (if (or (= ch ?\n) (= ch ?~))
              (delete-char -1)))

        ;; "^zW...\n" -> Chinese GB2312
        ;; "~{...~}"  -> Chinese GB2312
        (goto-char (point-min))
        ;; From here on BEG is reused to remember where the first
        ;; GB2312 run starts; it stays nil if there is nothing to decode.
        (setq beg nil)
        (while (re-search-forward hz/zw-start-gb nil t)
          (setq pos (match-beginning 0)
                ch (char-after pos))
          ;; Record the first position to start conversion.
          (or beg (setq beg pos))
          (end-of-line)
          (setq end (point))
          (if (>= ch 128)               ; 8bit GB2312
              ;; Already in 8-bit form; scanning resumes at end of line.
              nil
            ;; Remove the two-character introducer ("zW" or "~{").
            (goto-char pos)
            (delete-char 2)
            (setq end (- end 2))
            (if (= ch ?z)               ; ZW -> euc-china
                (progn
                  ;; A ZW run always extends to the end of its line.
                  (translate-region (point) end hz-set-msb-table)
                  (goto-char end))
              ;; HZ: the run extends up to the closing designation,
              ;; which is removed when found.
              (if (search-forward hz-ascii-designnation
                                  (if decode-hz-line-continuation nil end)
                                  t)
                  (delete-char -2))
              (setq end (point))
              (translate-region pos (point) hz-set-msb-table))))
        (if beg
            (decode-coding-region beg end 'euc-china)))
      (- (point-max) (point-min)))))
(defun decode-hz-buffer ()
  "Decode HZ/ZW encoded text in the current buffer.
This calls `decode-hz-region' on the accessible portion of the
buffer and returns its value."
  (interactive)
  (decode-hz-region (point-min) (point-max)))
(defun encode-hz-region (beg end)
  "Encode the text in the current region to HZ.
When called from Lisp, BEG and END delimit the text to encode.
Return the length of resulting text."
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)
      ;; The escape sequences searched for below are case-sensitive.
      (let ((case-fold-search nil))

        ;; "~" -> "~~"
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (insert ?~))

        ;; Chinese GB2312 -> "~{...~}"
        (goto-char (point-min))
        (if (re-search-forward "\\cc" nil t)
            (let ((pos (match-beginning 0)))
              ;; Encode everything from the first Chinese character on
              ;; as ISO-2022, then rewrite the ISO-2022 designations
              ;; into their HZ counterparts.
              (goto-char pos)
              (encode-coding-region pos (point-max) 'iso-2022-7bit)
              (goto-char pos)
              (while (search-forward iso2022-gb-designation nil t)
                (delete-region (match-beginning 0) (match-end 0))
                (insert hz-gb-designnation))
              (goto-char pos)
              (while (search-forward iso2022-ascii-designation nil t)
                (delete-region (match-beginning 0) (match-end 0))
                (insert hz-ascii-designnation)))))
      (- (point-max) (point-min)))))
(defun encode-hz-buffer ()
  "Encode the text in the current buffer to HZ.
This calls `encode-hz-region' on the accessible portion of the
buffer and returns its value."
  (interactive)
  (encode-hz-region (point-min) (point-max)))
(defun post-read-decode-hz (len)
  "Decode LEN characters of HZ/ZW text following point.
The buffer's modified flag is restored afterwards, and
`last-coding-system-used' is bound locally so that the decoding
does not change its outer value.
Return the value of `decode-hz-region', the length of the decoded text.
NOTE(review): presumably installed as a post-read conversion
function of a coding system -- confirm against the caller."
  (let ((pos (point))
        (modified (buffer-modified-p))
        last-coding-system-used)
    (prog1
        (decode-hz-region pos (+ pos len))
      (set-buffer-modified-p modified))))
(defun pre-write-encode-hz (from to)
  "Put the HZ encoding of the text FROM..TO into a new current buffer.
A fresh buffer named \" *temp*\" is generated and made current.
If FROM is a string, that string is the text and TO is ignored;
otherwise the text between FROM and TO of the previously current
buffer is copied.  The copy is then encoded with `encode-hz-region',
with `last-coding-system-used' bound locally so that the encoding
does not change its outer value.
Return nil.
NOTE(review): presumably installed as a pre-write conversion
function of a coding system -- confirm against the caller."
  (let ((buf (current-buffer)))
    ;; Deliberately leave the work buffer current on return.
    (set-buffer (generate-new-buffer " *temp*"))
    (if (stringp from)
        (insert from)
      (insert-buffer-substring buf from to))
    (let (last-coding-system-used)
      (encode-hz-region (point-min) (point-max)))
    nil))
;; Announce the feature so that `require' of `china-util' succeeds.
(provide 'china-util)
194 ;; arch-tag: 5a47b084-b9ac-420e-8191-70c5b3a14836
195 ;;; china-util.el ends here