(ucs-tables): Require when compiling.
[emacs.git] / lisp / international / utf-8.el
blob0ee120021da4868b4ab4faf7499e07dc935c9662
1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
6 ;; Keywords: multilingual, Unicode, UTF-8
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
13 ;; any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
25 ;;; Commentary:
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
28 ;; following character sets:
30 ;; ascii
31 ;; eight-bit-control
32 ;; latin-iso8859-1
33 ;; mule-unicode-0100-24ff
34 ;; mule-unicode-2500-33ff
35 ;; mule-unicode-e000-ffff
37 ;; Characters of other character sets cannot be encoded with
38 ;; mule-utf-8.
40 ;; On decoding, Unicode characters that do not fit in the above character
41 ;; sets are handled as `eight-bit-control' or `eight-bit-graphic'
42 ;; characters to retain original information (i.e. original byte
43 ;; sequence).
45 ;; scalar | utf-8
46 ;; value | 1st byte | 2nd byte | 3rd byte
47 ;; --------------------+-----------+-----------+----------
48 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
49 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
50 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
52 ;;; Code:
54 (eval-when-compile (require 'ucs-tables))
56 (define-ccl-program ccl-decode-mule-utf-8
;; CCL decoder for `mule-utf-8': reads UTF-8 bytes and writes Emacs
;; multibyte characters.  The first byte read (r0) selects a 1-, 2-,
;; 3- or 4-byte branch.  Sequences whose scalar value falls outside
;; the charsets tabulated below are written out byte by byte as
;; eight-bit-control / eight-bit-graphic characters.
;;
;; NOTE(review): no branch checks that the trailing bytes are
;; continuation bytes (10xx xxxx) or rejects overlong forms; malformed
;; input is decoded as if it were well-formed.
;;
58 ;; charset | bytes in utf-8 | bytes in emacs
59 ;; -----------------------+----------------+---------------
60 ;; ascii | 1 | 1
61 ;; -----------------------+----------------+---------------
62 ;; eight-bit-control | 2 | 2
63 ;; latin-iso8859-1 | 2 | 2
64 ;; -----------------------+----------------+---------------
65 ;; mule-unicode-0100-24ff | 2 | 4
66 ;; (< 0800) | |
67 ;; -----------------------+----------------+---------------
68 ;; mule-unicode-0100-24ff | 3 | 4
69 ;; (>= 0800) | |
70 ;; mule-unicode-2500-33ff | 3 | 4
71 ;; mule-unicode-e000-ffff | 3 | 4
73 ;; Thus magnification factor is two.
;; The leading 2 of the program below is that magnification factor.
75 `(2
76 ((loop
77 (read r0)
79 ;; 1byte encoding, i.e., ascii
80 (if (r0 < #x80)
81 (write r0)
83 ;; 2byte encoding
;; Taken for every r0 in #x80..#xdf, i.e. also for bytes below #xc0
;; that are not valid 2-byte lead bytes.
84 (if (r0 < #xe0)
85 ((read r1)
;; Combine the low 5 bits of the lead byte with the low 6 bits of
;; the second byte.
86 (r0 &= #x1f)
87 (r0 <<= 6)
88 (r1 &= #x3f)
89 (r1 += r0)
90 ;; now r1 holds scalar value
92 ;; eight-bit-control
93 (if (r1 < 160)
94 ((r0 = ,(charset-id 'eight-bit-control))
95 (write-multibyte-character r0 r1))
97 ;; latin-iso8859-1
98 (if (r1 < 256)
99 ((r0 = ,(charset-id 'latin-iso8859-1))
100 (r1 -= 128) ; #xa0..#xff -> 7-bit code #x20..#x7f
101 (write-multibyte-character r0 r1))
103 ;; mule-unicode-0100-24ff (< 0800)
;; The offset from U+0100 is split into quotient and remainder by
;; 96; each is biased by 32 and the two are packed as
;; ((quotient + 32) << 7) + (remainder + 32).
104 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
105 (r1 -= #x0100)
106 (r2 = (((r1 / 96) + 32) << 7))
107 (r1 %= 96)
108 (r1 += (r2 + 32))
109 (write-multibyte-character r0 r1)))))
111 ;; 3byte encoding
112 (if (r0 < #xf0)
113 ((read r1 r2)
;; r3 = 4 bits from r0, 6 bits from r1, 6 bits from r2.
114 (r3 = ((r0 & #x0f) << 12))
115 (r3 += ((r1 & #x3f) << 6))
116 (r3 += (r2 & #x3f))
117 ;; now r3 holds scalar value
119 ;; mule-unicode-0100-24ff (>= 0800)
;; In each mule-unicode branch below, `//=' leaves the quotient in r3
;; and the remainder in r7 (CCL's documented behaviour for `//'),
;; which is why r7 is read on the following line.
;; NOTE(review): an overlong 3-byte form (scalar < #x0800) also lands
;; in this branch.
120 (if (r3 < #x2500)
121 ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
122 (r3 -= #x0100)
123 (r3 //= 96)
124 (r1 = (r7 + 32))
125 (r1 += ((r3 + 32) << 7))
126 (write-multibyte-character r0 r1))
128 ;; mule-unicode-2500-33ff
129 (if (r3 < #x3400)
130 ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
131 (r3 -= #x2500)
132 (r3 //= 96)
133 (r1 = (r7 + 32))
134 (r1 += ((r3 + 32) << 7))
135 (write-multibyte-character r0 r1))
137 ;; U+3400 .. U+DFFF
138 ;; keep those bytes as eight-bit-{control|graphic}
139 (if (r3 < #xe000)
140 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
141 (r3 = ,(charset-id 'eight-bit-graphic))
142 (write-multibyte-character r3 r0)
;; No else branch here: r3 still holds eight-bit-graphic when
;; r1 >= #xa0.
143 (if (r1 < #xa0)
144 (r3 = ,(charset-id 'eight-bit-control)))
145 (write-multibyte-character r3 r1)
;; r3 may have been switched to eight-bit-control above, so both
;; alternatives are set explicitly for r2.
146 (if (r2 < #xa0)
147 (r3 = ,(charset-id 'eight-bit-control))
148 (r3 = ,(charset-id 'eight-bit-graphic)))
149 (write-multibyte-character r3 r2))
151 ;; mule-unicode-e000-ffff
152 ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
153 (r3 -= #xe000)
154 (r3 //= 96)
155 (r1 = (r7 + 32))
156 (r1 += ((r3 + 32) << 7))
157 (write-multibyte-character r0 r1))))))
159 ;; 4byte encoding
160 ;; keep those bytes as eight-bit-{control|graphic}
;; Taken for every r0 >= #xf0; exactly three more bytes are read.
161 ((read r1 r2 r3)
162 ;; r0 >= #xf0, thus eight-bit-graphic
163 (r4 = ,(charset-id 'eight-bit-graphic))
164 (write-multibyte-character r4 r0)
;; r4 stays eight-bit-graphic when r1 >= #xa0.
165 (if (r1 < #xa0)
166 (r4 = ,(charset-id 'eight-bit-control)))
167 (write-multibyte-character r4 r1)
168 (if (r2 < #xa0)
169 (r4 = ,(charset-id 'eight-bit-control))
170 (r4 = ,(charset-id 'eight-bit-graphic)))
171 (write-multibyte-character r4 r2)
172 (if (r3 < #xa0)
173 (r4 = ,(charset-id 'eight-bit-control))
174 (r4 = ,(charset-id 'eight-bit-graphic)))
175 (write-multibyte-character r4 r3)))))
177 (repeat))))
179 "CCL program to decode UTF-8 into ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*.")
181 (define-ccl-program ccl-encode-mule-utf-8
;; CCL encoder for `mule-utf-8': reads one Emacs multibyte character
;; at a time (charset id in r0, code in r1) and writes its UTF-8
;; bytes.  Characters are first passed through the
;; `ucs-mule-8859-to-mule-unicode' translation table (defined outside
;; this file; presumably in ucs-tables -- confirm).
;;
;; NOTE(review): the program header on the next line was missing from
;; this copy, which left the form with an unmatched closing paren
;; after (repeat) and the ,(charset-id ...) forms outside any
;; backquote.  It is restored with buffer magnification 1, on the
;; grounds that no supported charset yields more UTF-8 bytes than it
;; occupies in Emacs (1 -> 1, 2 -> 2, 4 -> 2 or 3); confirm against
;; upstream.
182 `(1
183 (loop
184 (read-multibyte-character r0 r1)
186 (translate-character ucs-mule-8859-to-mule-unicode r0 r1)
188 (if (r0 == ,(charset-id 'ascii))
189 (write r1)
191 (if (r0 == ,(charset-id 'latin-iso8859-1))
192 ;; r1 scalar utf-8
193 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
194 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
195 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
;; r1 is the 7-bit code (#x20..#x7f); bit 6 of r1 decides between
;; lead byte #xc2 and #xc3.
196 ((r0 = (((r1 & #x40) >> 6) | #xc2))
197 (r1 &= #x3f)
198 (r1 |= #x80)
199 (write r0 r1))
201 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
;; Undo the packing ((hi + 32) << 7) + (lo + 32): r0 = hi * 96,
;; r1 = lo + 32, so scalar = r0 + r1 - 32 + charset base.
202 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
203 ;; #x3f80 == (0011 1111 1000 0000)b
204 (r1 &= #x7f)
205 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
206 ;; now r1 holds scalar value
207 (if (r1 < #x0800)
208 ;; 2byte encoding
209 ((r0 = (((r1 & #x07c0) >> 6) | #xc0))
210 ;; #x07c0 == (0000 0111 1100 0000)b
211 (r1 &= #x3f)
212 (r1 |= #x80)
213 (write r0 r1))
214 ;; 3byte encoding
;; r2 (low 6 bits) is computed before r1 is overwritten with the
;; middle 6 bits.
215 ((r0 = (((r1 & #xf000) >> 12) | #xe0))
216 (r2 = ((r1 & #x3f) | #x80))
217 (r1 &= #x0fc0)
218 (r1 >>= 6)
219 (r1 |= #x80)
220 (write r0 r1 r2))))
222 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
;; Scalars from #x2500 up always take the 3-byte form.
223 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
224 (r1 &= #x7f)
225 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
226 (r0 = (((r1 & #xf000) >> 12) | #xe0))
227 (r2 = ((r1 & #x3f) | #x80))
228 (r1 &= #x0fc0)
229 (r1 >>= 6)
230 (r1 |= #x80)
231 (write r0 r1 r2))
233 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
234 ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
235 (r1 &= #x7f)
236 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
237 (r0 = (((r1 & #xf000) >> 12) | #xe0))
238 (r2 = ((r1 & #x3f) | #x80))
239 (r1 &= #x0fc0)
240 (r1 >>= 6)
241 (r1 |= #x80)
242 (write r0 r1 r2))
244 (if (r0 == ,(charset-id 'eight-bit-control))
245 ;; r1 scalar utf-8
246 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
247 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
248 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
;; NOTE(review): the table shows the 2-byte UTF-8 form, but the code
;; emits only the single raw byte r1.  That round-trips bytes kept
;; from undecodable sequences, but not U+0080..U+009F read from a
;; 2-byte sequence.  Behaviour left unchanged; confirm intent.
249 (write r1)
251 (if (r0 == ,(charset-id 'eight-bit-graphic))
252 ;; r1 scalar utf-8
253 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
254 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
255 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
;; NOTE(review): as for eight-bit-control, the raw byte is written,
;; not the 2-byte form tabulated here.
256 (write r1)
258 ;; unsupported character.
259 ;; output U+FFFD, which is `ef bf bd' in UTF-8
260 ;; actually it never reach here
261 ((write #xef)
262 (write #xbf)
263 (write #xbd)))))))))
264 (repeat)))
266 "CCL program to encode ascii, eight-bit-control, latin-iso8859-1 and mule-unicode-*. into UTF-8.")
;; Define the `mule-utf-8' coding system on top of the two CCL
;; programs, and make `utf-8' an alias for it.  Per the
;; `make-coding-system' documentation, TYPE 4 selects a coding system
;; implemented by CCL programs and ?u is the mode-line mnemonic.
268 (make-coding-system
269 'mule-utf-8 4 ?u
270 "UTF-8 encoding for Emacs-supported Unicode characters.
271 Supported character sets are:
272 ascii
273 eight-bit-control
274 eight-bit-graphic
275 latin-iso8859-1
276 mule-unicode-0100-24ff
277 mule-unicode-2500-33ff
278 mule-unicode-e000-ffff
280 Unicode characters out of these ranges are decoded
281 into eight-bit-control or eight-bit-graphic."
;; (DECODER . ENCODER) pair of CCL programs.
283 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
;; Property alist.  It is backquoted only so that the lambda below can
;; be spliced in with a comma.
284 `((safe-charsets
285 ascii
286 eight-bit-control
287 eight-bit-graphic
288 latin-iso8859-1
289 latin-iso8859-15
290 latin-iso8859-14
291 latin-iso8859-9
292 hebrew-iso8859-8
293 greek-iso8859-7
294 cyrillic-iso8859-5
295 latin-iso8859-4
296 latin-iso8859-3
297 latin-iso8859-2
298 mule-unicode-0100-24ff
299 mule-unicode-2500-33ff
300 mule-unicode-e000-ffff)
301 (mime-charset . utf-8)
302 ;; Kluge to ensure the translation table is loaded.
;; This entry must be a plain (pre-write-conversion . FUNCTION) pair.
;; A second, nested backquote here would make the element a
;; backquote form instead, and the property would never be set.
;; `&rest' is used because `pre-write-conversion' functions are
;; documented as being called with two arguments (FROM and TO); they
;; are ignored here.
303 (pre-write-conversion . ,(lambda (&rest junk) (require 'ucs-tables)))))
305 (define-coding-system-alias 'utf-8 'mule-utf-8)