1 ;;; utf-8.el --- Limited UTF-8 decoding/encoding support
3 ;; Copyright (C) 2001 Electrotechnical Laboratory, JAPAN.
4 ;; Licensed to the Free Software Foundation.
6 ;; Keywords: multilingual, Unicode, UTF-8, i18n
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software; you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation; either version 2, or (at your option)
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs; see the file COPYING. If not, write to the
22 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 ;; Boston, MA 02111-1307, USA.
27 ;; The coding-system `mule-utf-8' supports encoding/decoding of the
28 ;; following character sets to and from UTF-8:
33 ;; mule-unicode-0100-24ff
34 ;; mule-unicode-2500-33ff
35 ;; mule-unicode-e000-ffff
37 ;; Characters of other character sets cannot be encoded with
38 ;; mule-utf-8. Note that the mule-unicode charsets currently lack
39 ;; case and syntax information, so things like `downcase' will only
40 ;; work for characters from ASCII and Latin-1.
42 ;; On decoding, Unicode characters that do not fit into the above
43 ;; character sets are handled as `eight-bit-control' or
44 ;; `eight-bit-graphic' characters to retain the information about the
45 ;; original byte sequence.
47 ;; UTF-8 is defined in RFC 2279. A sketch of the encoding is:
50 ;; value | 1st byte | 2nd byte | 3rd byte
51 ;; --------------------+-----------+-----------+----------
52 ;; 0000 0000 0xxx xxxx | 0xxx xxxx | |
53 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
54 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
58 (define-ccl-program ccl-decode-mule-utf-8
60 ;; charset | bytes in utf-8 | bytes in emacs
61 ;; -----------------------+----------------+---------------
63 ;; -----------------------+----------------+---------------
64 ;; eight-bit-control | 2 | 2
65 ;; latin-iso8859-1 | 2 | 2
66 ;; -----------------------+----------------+---------------
67 ;; mule-unicode-0100-24ff | 2 | 4
69 ;; -----------------------+----------------+---------------
70 ;; mule-unicode-0100-24ff | 3 | 4
72 ;; mule-unicode-2500-33ff | 3 | 4
73 ;; mule-unicode-e000-ffff | 3 | 4
75 ;; Thus magnification factor is two.
81 ;; 1byte encoding, i.e., ascii
92 ;; now r1 holds scalar value
96 ((r0 = ,(charset-id 'eight-bit-control
))
97 (write-multibyte-character r0 r1
))
101 ((r0 = ,(charset-id 'latin-iso8859-1
))
103 (write-multibyte-character r0 r1
))
105 ;; mule-unicode-0100-24ff (< 0800)
106 ((r0 = ,(charset-id 'mule-unicode-0100-24ff
))
108 (r2 = (((r1 / 96) + 32) << 7))
111 (write-multibyte-character r0 r1
)))))
116 (r3 = ((r0 & #x0f
) << 12))
117 (r3 += ((r1 & #x3f
) << 6))
119 ;; now r3 holds scalar value
121 ;; mule-unicode-0100-24ff (>= 0800)
123 ((r0 = ,(charset-id 'mule-unicode-0100-24ff
))
127 (r1 += ((r3 + 32) << 7))
128 (write-multibyte-character r0 r1
))
130 ;; mule-unicode-2500-33ff
132 ((r0 = ,(charset-id 'mule-unicode-2500-33ff
))
136 (r1 += ((r3 + 32) << 7))
137 (write-multibyte-character r0 r1
))
140 ;; keep those bytes as eight-bit-{control|graphic}
142 (;; #xe0 <= r0 < #xf0, so r0 is eight-bit-graphic
143 (r3 = ,(charset-id 'eight-bit-graphic
))
144 (write-multibyte-character r3 r0
)
146 (r3 = ,(charset-id 'eight-bit-control
)))
147 (write-multibyte-character r3 r1
)
149 (r3 = ,(charset-id 'eight-bit-control
))
150 (r3 = ,(charset-id 'eight-bit-graphic
)))
151 (write-multibyte-character r3 r2
))
153 ;; mule-unicode-e000-ffff
154 ((r0 = ,(charset-id 'mule-unicode-e000-ffff
))
158 (r1 += ((r3 + 32) << 7))
159 (write-multibyte-character r0 r1
))))))
162 ;; keep those bytes as eight-bit-{control|graphic}
164 ;; r0 >= #xf0, thus eight-bit-graphic
165 (r4 = ,(charset-id 'eight-bit-graphic
))
166 (write-multibyte-character r4 r0
)
168 (r4 = ,(charset-id 'eight-bit-control
)))
169 (write-multibyte-character r4 r1
)
171 (r4 = ,(charset-id 'eight-bit-control
))
172 (r4 = ,(charset-id 'eight-bit-graphic
)))
173 (write-multibyte-character r4 r2
)
175 (r4 = ,(charset-id 'eight-bit-control
))
176 (r4 = ,(charset-id 'eight-bit-graphic
)))
177 (write-multibyte-character r4 r3
)))))
181 "CCL program to decode UTF-8.
182 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
183 mule-unicode-*. Encodings of un-representable Unicode characters are
184 decoded as is into eight-bit-control and eight-bit-graphic
187 (define-ccl-program ccl-encode-mule-utf-8
190 (read-multibyte-character r0 r1
)
192 (if (r0 == ,(charset-id 'ascii
))
195 (if (r0 == ,(charset-id 'latin-iso8859-1
))
197 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
198 ;; 20 0000 0000 1010 0000 1100 0010 1010 0000
199 ;; 7f 0000 0000 1111 1111 1100 0011 1011 1111
200 ((r0 = (((r1 & #x40
) >> 6) |
#xc2
))
205 (if (r0 == ,(charset-id 'mule-unicode-0100-24ff
))
206 ((r0 = ((((r1 & #x3f80
) >> 7) -
32) * 96))
207 ;; #x3f80 == (0011 1111 1000 0000)b
209 (r1 += (r0 + 224)) ; 224 == -32 + #x0100
210 ;; now r1 holds scalar value
213 ((r0 = (((r1 & #x07c0
) >> 6) |
#xc0
))
214 ;; #x07c0 == (0000 0111 1100 0000)b
219 ((r0 = (((r1 & #xf000
) >> 12) |
#xe0
))
220 (r2 = ((r1 & #x3f
) |
#x80
))
226 (if (r0 == ,(charset-id 'mule-unicode-2500-33ff
))
227 ((r0 = ((((r1 & #x3f80
) >> 7) -
32) * 96))
229 (r1 += (r0 + 9440)) ; 9440 == -32 + #x2500
230 (r0 = (((r1 & #xf000
) >> 12) |
#xe0
))
231 (r2 = ((r1 & #x3f
) |
#x80
))
237 (if (r0 == ,(charset-id 'mule-unicode-e000-ffff
))
238 ((r0 = ((((r1 & #x3f80
) >> 7) -
32) * 96))
240 (r1 += (r0 + 57312)) ; 57312 == -32 + #xe000
241 (r0 = (((r1 & #xf000
) >> 12) |
#xe0
))
242 (r2 = ((r1 & #x3f
) |
#x80
))
248 (if (r0 == ,(charset-id 'eight-bit-control
))
250 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
251 ;; 80 0000 0000 1000 0000 1100 0010 1000 0000
252 ;; 9f 0000 0000 1001 1111 1100 0010 1001 1111
255 (if (r0 == ,(charset-id 'eight-bit-graphic
))
257 ;; 0000 0yyy yyxx xxxx 110y yyyy 10xx xxxx
258 ;; a0 0000 0000 1010 0000 1100 0010 1010 0000
259 ;; ff 0000 0000 1111 1111 1100 0011 1011 1111
262 ;; Unsupported character.
263 ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
269 "CCL program to encode into UTF-8.
270 Only characters from the charsets ascii, eight-bit-control,
271 eight-bit-graphic, latin-iso8859-1 and mule-unicode-* are recognized.
272 Others are encoded as U+FFFD.")
276 "UTF-8 encoding for Emacs-supported Unicode characters.
277 The supported Emacs character sets are:
282 mule-unicode-0100-24ff
283 mule-unicode-2500-33ff
284 mule-unicode-e000-ffff
286 Unicode characters out of the ranges U+0000-U+33FF and U+E000-U+FFFF
287 are decoded into sequences of eight-bit-control and eight-bit-graphic
288 characters to preserve their byte sequences. Emacs characters out of
289 these ranges are encoded into U+FFFD.
291 Note that, currently, characters in the mule-unicode charsets have no
292 syntax and case information. Thus, for instance, upper- and
293 lower-casing commands won't work with them."
295 '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8
)
301 mule-unicode-0100-24ff
302 mule-unicode-2500-33ff
303 mule-unicode-e000-ffff
)
304 (mime-charset . utf-8
)
305 (coding-category . coding-category-utf-8
)
306 (valid-codes (0 .
255))))
;; Register `utf-8' as an alias name for the `mule-utf-8' coding system.
308 (define-coding-system-alias 'utf-8
'mule-utf-8
)