Merge branch 'master' into comment-cache
[emacs.git] / lisp / language / china-util.el
blob955c2999b8c2a0f0f657b54bac2ec6bc12e5dc60
1 ;;; china-util.el --- utilities for Chinese -*- coding: utf-8 -*-
3 ;; Copyright (C) 1995, 2001-2017 Free Software Foundation, Inc.
4 ;; Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
5 ;; 2005, 2006, 2007, 2008, 2009, 2010, 2011
6 ;; National Institute of Advanced Industrial Science and Technology (AIST)
7 ;; Registration Number H14PRO021
8 ;; Copyright (C) 2003
9 ;; National Institute of Advanced Industrial Science and Technology (AIST)
10 ;; Registration Number H13PRO009
12 ;; Keywords: mule, multilingual, Chinese
14 ;; This file is part of GNU Emacs.
16 ;; GNU Emacs is free software: you can redistribute it and/or modify
17 ;; it under the terms of the GNU General Public License as published by
18 ;; the Free Software Foundation, either version 3 of the License, or
19 ;; (at your option) any later version.
21 ;; GNU Emacs is distributed in the hope that it will be useful,
22 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
23 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 ;; GNU General Public License for more details.
26 ;; You should have received a copy of the GNU General Public License
27 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
29 ;;; Commentary:
31 ;;; Code:
33 ;; Hz/ZW/EUC-TW encoding stuff
35 ;; HZ is an encoding method for the Chinese character set GB2312 that
36 ;; is widely used on the Internet. It is very similar to the 7-bit
37 ;; environment of ISO-2022. The difference is that HZ uses the
38 ;; sequences "~{" and "~}" for designating GB2312 and ASCII
39 ;; respectively; hence, it doesn't use the ESC (0x1B) code.
41 ;; ZW is another encoding method for the Chinese character set GB2312. It
42 ;; encodes Chinese characters line by line by starting each line with
43 ;; the sequence "zW". Like HZ, it uses only 7-bit codes.
45 ;; EUC-TW is similar to EUC-KS or EUC-JP. Its main character set is
46 ;; plane 1 of CNS 11643; characters of planes 2 to 7 are accessed with
47 ;; a single shift escape followed by three bytes: the first gives the
48 ;; plane, the second and third the character code. Note that characters
49 ;; of plane 1 are (redundantly) accessible with a single shift escape
50 ;; also.
(defvar iso2022-gb-designation "\e$A"
  "ISO-2022 escape sequence to designate GB2312.")

(defvar hz-gb-designation "~{"
  "HZ escape sequence to designate GB2312.")

(defvar iso2022-ascii-designation "\e(B"
  "ISO-2022 escape sequence to designate ASCII.")

(defvar hz-ascii-designation "~}"
  "HZ escape sequence to designate ASCII.")

(defvar zw-start-gb "^zW"
  "Regexp of ZW sequence to start GB2312.")

(defvar hz/zw-start-gb
  (concat hz-gb-designation "\\|" zw-start-gb "\\|[^\0-\177]")
  "Regexp for start of GB2312 in an encoding mixture of HZ and ZW.
It matches the HZ GB2312 designation, the ZW line prefix, or any
character outside the range 0-127.")

(defvar decode-hz-line-continuation nil
  "Non-nil means honor the line continuation convention of HZ.
When this is non-nil, `decode-hz-region' deletes a \"~\" that is
immediately followed by a newline, together with that newline.")
(defconst hz-set-msb-table
  (eval-when-compile
    ;; Indices 0..126: control characters and space (0..32) stay as
    ;; they are; every other code is replaced by the raw eight-bit
    ;; character whose code is 128 higher.
    (apply #'string
           (mapcar (lambda (code)
                     (if (< code 33)
                         code
                       (decode-char 'eight-bit (+ code 128))))
                   (number-sequence 0 126))))
  "String of 127 characters, indexed by character code 0 through 126.
Codes 0 through 32 map to themselves; codes 33 through 126 map to
the `eight-bit' character for that code plus 128.")
;;;###autoload
(defun decode-hz-region (beg end)
  "Decode HZ/ZW encoded text in the current region.
Return the length of resulting text.

BEG and END delimit the text to decode.  Each \"~{\" ... \"~}\" run
is decoded as GB2312, \"~~\" becomes a single \"~\", and, when
`decode-hz-line-continuation' is non-nil, \"~\" followed by a
newline is removed.  Any other character following \"~\" is left
untouched."
  (interactive "r")
  (save-excursion
    (save-restriction
      (let (pos ch)
        (narrow-to-region beg end)

        ;; We, at first, convert HZ to `iso-2022-7bit', then decode it.
        ;; NOTE(review): only "~"-introduced sequences are processed
        ;; here; no ZW-specific handling is visible in this function.
        (goto-char (point-min))
        (while (search-forward "~" nil t)
          (setq ch (following-char))
          (cond ((= ch ?{)
                 ;; "~{" -> GB2312 designation; delete both characters.
                 (delete-region (1- (point)) (1+ (point)))
                 (setq pos (point))
                 (insert iso2022-gb-designation)
                 ;; Skip the run of two-byte GB2312 codes.
                 (if (looking-at "\\([!-}][!-~]\\)*")
                     (goto-char (match-end 0)))
                 ;; "~}" is optional: the run is closed either way.
                 (if (looking-at hz-ascii-designation)
                     (delete-region (match-beginning 0) (match-end 0)))
                 (insert iso2022-ascii-designation)
                 (decode-coding-region pos (point) 'iso-2022-7bit))

                ;; "~~" -> "~"
                ((= ch ?~)
                 (delete-char 1))

                ;; "~\n" -> ""
                ((and (= ch ?\n)
                      decode-hz-line-continuation)
                 (delete-region (1- (point)) (1+ (point))))

                ;; A lone "~" ends the region: there is nothing to
                ;; skip, and `forward-char' would signal an error.
                ((eobp))

                ;; "~" followed by anything else: step over that char.
                (t
                 (forward-char 1)))))

      ;; Still narrowed to the (now decoded) region here.
      (- (point-max) (point-min)))))
;;;###autoload
(defun decode-hz-buffer ()
  "Decode HZ/ZW encoded text in the current buffer.
This calls `decode-hz-region' on the text between `point-min' and
`point-max' and returns its value."
  (interactive)
  (let ((hz-start (point-min))
        (hz-finish (point-max)))
    (decode-hz-region hz-start hz-finish)))
(defvar hz-category-table nil
  "Category table in which category `c' marks HZ-encodable characters.
It is nil until `encode-hz-region' creates it on first use.")

;;;###autoload
(defun encode-hz-region (beg end)
  "Encode the text in the current region to HZ.
Return the length of resulting text.

BEG and END delimit the text to encode.  \"~\" is written as \"~~\",
characters that are in neither `ascii' nor `chinese-gb2312' are
written as \\uXXXX or \\UXXXXXXXX escapes, and GB2312 runs are
bracketed with `hz-gb-designation' and `hz-ascii-designation'."
  (interactive "r")
  (unless hz-category-table
    ;; Populate a local table first so that an error part-way through
    ;; cannot leave a half-initialized table cached in the variable.
    (let ((table (make-category-table)))
      (with-category-table table
        (define-category ?c "hz encodable")
        (map-charset-chars #'modify-category-entry 'ascii ?c)
        (map-charset-chars #'modify-category-entry 'chinese-gb2312 ?c))
      (setq hz-category-table table)))
  (save-excursion
    (save-restriction
      (narrow-to-region beg end)
      (with-category-table hz-category-table
        ;; ~ -> ~~
        (goto-char (point-min))
        (while (search-forward "~" nil t) (insert ?~))

        ;; ESC -> ESC ESC, so that literal ESC characters can be told
        ;; apart from the escape sequences the encoder emits below.
        (goto-char (point-min))
        (while (search-forward "\e" nil t) (insert ?\e))

        ;; Non-ASCII-GB2312 -> \uXXXX
        (goto-char (point-min))
        (while (re-search-forward "\\Cc" nil t)
          (let ((ch (preceding-char)))
            (delete-char -1)
            (insert (format (if (< ch #x10000) "\\u%04X" "\\U%08X") ch))))

        ;; Prefer chinese-gb2312 for Chinese characters.
        (put-text-property (point-min) (point-max) 'charset 'chinese-gb2312)
        (encode-coding-region (point-min) (point-max) 'iso-2022-7bit)

        ;; ESC $ A ... ESC ( B -> ~{ ... ~}
        ;; ESC ESC -> ESC
        (goto-char (point-min))
        (while (search-forward "\e" nil t)
          (cond ((= (following-char) ?\e)
                 ;; ESC ESC -> ESC
                 (delete-char 1))

                ((progn (forward-char -1)
                        (looking-at iso2022-gb-designation))
                 (delete-region (match-beginning 0) (match-end 0))
                 (insert hz-gb-designation)
                 ;; Delete the closing designation only if it was
                 ;; really found; after a failed search the match data
                 ;; is stale and point is at the end of the region.
                 (when (search-forward iso2022-ascii-designation nil 'move)
                   (delete-region (match-beginning 0) (match-end 0)))
                 (insert hz-ascii-designation))

                ;; Some other escape sequence: step back over the ESC
                ;; so that the next search does not find it again.
                (t
                 (forward-char 1)))))
      (- (point-max) (point-min)))))
;;;###autoload
(defun encode-hz-buffer ()
  "Encode the text in the current buffer to HZ.
This calls `encode-hz-region' on the text between `point-min' and
`point-max' and returns its value."
  (interactive)
  (let ((hz-start (point-min))
        (hz-finish (point-max)))
    (encode-hz-region hz-start hz-finish)))
;;;###autoload
(defun post-read-decode-hz (len)
  "Decode as HZ the LEN characters that follow point.
Return the value of `decode-hz-region' for that text.  The
buffer's modified flag is put back to what it was on entry, and
`last-coding-system-used' is bound locally around the decoding."
  (let ((hz-origin (point))
        (hz-was-modified (buffer-modified-p))
        (last-coding-system-used nil)
        hz-decoded-length)
    (setq hz-decoded-length
          (decode-hz-region hz-origin (+ hz-origin len)))
    ;; Decoding edits the buffer; undo its effect on the modified flag.
    (set-buffer-modified-p hz-was-modified)
    hz-decoded-length))
;;;###autoload
(defun pre-write-encode-hz (from to)
  "Encode to HZ the text specified by FROM and TO in a new buffer.
If FROM is a string, it is the text to encode and TO is ignored;
otherwise FROM and TO are positions in the current buffer.  The
text is copied into a newly generated buffer named \" *temp*\",
which is encoded with `encode-hz-region' and left current.
Always return nil."
  (let ((hz-source-buffer (current-buffer))
        (hz-work-buffer (generate-new-buffer " *temp*")))
    ;; The work buffer is deliberately left current on return.
    (set-buffer hz-work-buffer)
    (cond ((stringp from)
           (insert from))
          (t
           (insert-buffer-substring hz-source-buffer from to)))
    (let ((last-coding-system-used nil))
      (encode-hz-region 1 (point-max))))
  nil)
206 (provide 'china-util)
208 ;;; china-util.el ends here