lisp/international/utf-8.el

   1 ;;; utf-8.el --- UTF-8 decoding/encoding support -*- coding: iso-2022-7bit -*-
   2
   3 ;; Copyright (C) 2001, 2002, 2003, 2004  Free Software Foundation, Inc.
   4 ;; Copyright (C) 2001, 2002, 2003, 2004
   5 ;;   National Institute of Advanced Industrial Science and Technology (AIST)
   6 ;;   Registration Number H14PRO021
   7
   8 ;; Author: TAKAHASHI Naoto  <ntakahas@m17n.org>
   9 ;; Maintainer: FSF
  10 ;; Keywords: multilingual, Unicode, UTF-8, i18n
  11
  12 ;; This file is part of GNU Emacs.
  13
  14 ;; GNU Emacs is free software; you can redistribute it and/or modify
  15 ;; it under the terms of the GNU General Public License as published by
  16 ;; the Free Software Foundation; either version 2, or (at your option)
  17 ;; any later version.
  18
  19 ;; GNU Emacs is distributed in the hope that it will be useful,
  20 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 ;; GNU General Public License for more details.
  23
  24 ;; You should have received a copy of the GNU General Public License
  25 ;; along with GNU Emacs; see the file COPYING.  If not, write to the
  26 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  27 ;; Boston, MA 02111-1307, USA.
  28
  29 ;;; Commentary:
  30
  31 ;; The coding-system `mule-utf-8' basically supports encoding/decoding
  32 ;; of the following character sets to and from UTF-8:
  33 ;;
  34 ;;   ascii
  35 ;;   eight-bit-control
  36 ;;   latin-iso8859-1
  37 ;;   mule-unicode-0100-24ff
  38 ;;   mule-unicode-2500-33ff
  39 ;;   mule-unicode-e000-ffff
  40 ;;
  41 ;; On decoding, Unicode characters that do not fit into the above
  42 ;; character sets are handled as `eight-bit-control' or
  43 ;; `eight-bit-graphic' characters to retain the information about the
  44 ;; original byte sequence and text properties record the corresponding
  45 ;; unicode.
  46 ;;
  47 ;; Fixme: note that reading and writing invalid utf-8 may not be
  48 ;; idempotent -- to represent the bytes to fix that needs a new charset.
  49 ;;
  50 ;; Characters from other character sets can be encoded with mule-utf-8
  51 ;; by populating the translation table
  52 ;; `utf-translation-table-for-encode'.  Hash tables
  53 ;; `utf-subst-table-for-decode' and `utf-subst-table-for-encode' are
  54 ;; used to support encoding and decoding of about a quarter of the CJK
  55 ;; space between U+3400 and U+DFFF.
  56
  57 ;; UTF-8 is defined in RFC 3629.  A sketch of the encoding is:
  58
  59 ;;        scalar       |               utf-8
  60 ;;        value        | 1st byte  | 2nd byte  | 3rd byte
  61 ;; --------------------+-----------+-----------+----------
  62 ;; 0000 0000 0xxx xxxx | 0xxx xxxx |           |
  63 ;; 0000 0yyy yyxx xxxx | 110y yyyy | 10xx xxxx |
  64 ;; zzzz yyyy yyxx xxxx | 1110 zzzz | 10yy yyyy | 10xx xxxx
  65
  66 ;;; Code:
  67
  68 (defvar ucs-mule-to-mule-unicode (make-char-table 'translation-table nil)
  69   "Char table mapping characters to latin-iso8859-1 or mule-unicode-*.
  70
  71 If `unify-8859-on-encoding-mode' is non-nil, this table populates the
  72 translation-table named `utf-translation-table-for-encode'.")
  73
  74 (define-translation-table 'utf-translation-table-for-encode)
  75
  76
  77 ;; Map Cyrillic and Greek to iso-8859 charsets, which take half the
  78 ;; space of mule-unicode.  For Latin scripts this isn't very
  79 ;; important.  Hebrew and Arabic might go here too when there's proper
  80 ;; support for them.
  81
  82 (defvar utf-fragmentation-table (make-char-table 'translation-table nil)
  83   "Char-table normally mapping non-Latin mule-unicode-* chars to iso-8859-*.
  84
  85 If `utf-fragment-on-decoding' is non-nil, this table populates the
  86 translation-table named `utf-translation-table-for-decode'")
  87
  88 (defvar utf-defragmentation-table (make-char-table 'translation-table nil)
  89   "Char-table for reverse mapping of `utf-fragmentation-table'.
  90
  91 If `utf-fragment-on-decoding' is non-nil and
  92 `unify-8859-on-encoding-mode' is nil, this table populates the
  93 translation-table named `utf-translation-table-for-encode'")
  94
  95 (define-translation-table 'utf-translation-table-for-decode)
  96
  97
  98 (defvar ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
  99   "Hash table mapping Emacs CJK character sets to Unicode code points.
 100
 101 If `utf-translate-cjk-mode' is non-nil, this table populates the
 102 translation-hash-table named `utf-subst-table-for-encode'.")
 103
 104 (define-translation-hash-table 'utf-subst-table-for-encode
 105   ucs-mule-cjk-to-unicode)
 106
 107 (defvar ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)
 108   "Hash table mapping Unicode code points to Emacs CJK character sets.
 109
 110 If `utf-translate-cjk-mode' is non-nil, this table populates the
 111 translation-hash-table named `utf-subst-table-for-decode'.")
 112
 113 (define-translation-hash-table 'utf-subst-table-for-decode
 114   ucs-unicode-to-mule-cjk)
 115
 116 (mapc
 117  (lambda (pair)
 118    (aset utf-fragmentation-table (car pair) (cdr pair))
 119    (aset utf-defragmentation-table (cdr pair) (car pair)))
 120  '((?\e$,1&d\e(B . ?\e,F4\e(B) (?\e$,1&e\e(B . ?\e,F5\e(B) (?\e$,1&f\e(B . ?\e,F6\e(B) (?\e$,1&h\e(B . ?\e,F8\e(B) (?\e$,1&i\e(B . ?\e,F9\e(B)
 121    (?\e$,1&j\e(B . ?\e,F:\e(B) (?\e$,1&l\e(B . ?\e,F<\e(B) (?\e$,1&n\e(B . ?\e,F>\e(B) (?\e$,1&o\e(B . ?\e,F?\e(B) (?\e$,1&p\e(B . ?\e,F@\e(B)
 122    (?\e$,1&q\e(B . ?\e,FA\e(B) (?\e$,1&r\e(B . ?\e,FB\e(B) (?\e$,1&s\e(B . ?\e,FC\e(B) (?\e$,1&t\e(B . ?\e,FD\e(B) (?\e$,1&u\e(B . ?\e,FE\e(B)
 123    (?\e$,1&v\e(B . ?\e,FF\e(B) (?\e$,1&w\e(B . ?\e,FG\e(B) (?\e$,1&x\e(B . ?\e,FH\e(B) (?\e$,1&y\e(B . ?\e,FI\e(B) (?\e$,1&z\e(B . ?\e,FJ\e(B)
 124    (?\e$,1&{\e(B . ?\e,FK\e(B) (?\e$,1&|\e(B . ?\e,FL\e(B) (?\e$,1&}\e(B . ?\e,FM\e(B) (?\e$,1&~\e(B . ?\e,FN\e(B) (?\e$,1&\x7f\e(B . ?\e,FO\e(B)
 125    (?\e$,1' \e(B . ?\e,FP\e(B) (?\e$,1'!\e(B . ?\e,FQ\e(B) (?\e$,1'#\e(B . ?\e,FS\e(B) (?\e$,1'$\e(B . ?\e,FT\e(B) (?\e$,1'%\e(B . ?\e,FU\e(B)
 126    (?\e$,1'&\e(B . ?\e,FV\e(B) (?\e$,1''\e(B . ?\e,FW\e(B) (?\e$,1'(\e(B . ?\e,FX\e(B) (?\e$,1')\e(B . ?\e,FY\e(B) (?\e$,1'*\e(B . ?\e,FZ\e(B)
 127    (?\e$,1'+\e(B . ?\e,F[\e(B) (?\e$,1',\e(B . ?\e,F\\e(B) (?\e$,1'-\e(B . ?\e,F]\e(B) (?\e$,1'.\e(B . ?\e,F^\e(B) (?\e$,1'/\e(B . ?\e,F_\e(B)
 128    (?\e$,1'0\e(B . ?\e,F`\e(B) (?\e$,1'1\e(B . ?\e,Fa\e(B) (?\e$,1'2\e(B . ?\e,Fb\e(B) (?\e$,1'3\e(B . ?\e,Fc\e(B) (?\e$,1'4\e(B . ?\e,Fd\e(B)
 129    (?\e$,1'5\e(B . ?\e,Fe\e(B) (?\e$,1'6\e(B . ?\e,Ff\e(B) (?\e$,1'7\e(B . ?\e,Fg\e(B) (?\e$,1'8\e(B . ?\e,Fh\e(B) (?\e$,1'9\e(B . ?\e,Fi\e(B)
 130    (?\e$,1':\e(B . ?\e,Fj\e(B) (?\e$,1';\e(B . ?\e,Fk\e(B) (?\e$,1'<\e(B . ?\e,Fl\e(B) (?\e$,1'=\e(B . ?\e,Fm\e(B) (?\e$,1'>\e(B . ?\e,Fn\e(B)
 131    (?\e$,1'?\e(B . ?\e,Fo\e(B) (?\e$,1'@\e(B . ?\e,Fp\e(B) (?\e$,1'A\e(B . ?\e,Fq\e(B) (?\e$,1'B\e(B . ?\e,Fr\e(B) (?\e$,1'C\e(B . ?\e,Fs\e(B)
 132    (?\e$,1'D\e(B . ?\e,Ft\e(B) (?\e$,1'E\e(B . ?\e,Fu\e(B) (?\e$,1'F\e(B . ?\e,Fv\e(B) (?\e$,1'G\e(B . ?\e,Fw\e(B) (?\e$,1'H\e(B . ?\e,Fx\e(B)
 133    (?\e$,1'I\e(B . ?\e,Fy\e(B) (?\e$,1'J\e(B . ?\e,Fz\e(B) (?\e$,1'K\e(B . ?\e,F{\e(B) (?\e$,1'L\e(B . ?\e,F|\e(B) (?\e$,1'M\e(B . ?\e,F}\e(B)
 134    (?\e$,1'N\e(B . ?\e,F~\e(B)
 135
 136    (?\e$,1(!\e(B . ?\e,L!\e(B) (?\e$,1("\e(B . ?\e,L"\e(B) (?\e$,1(#\e(B . ?\e,L#\e(B) (?\e$,1($\e(B . ?\e,L$\e(B)
 137    (?\e$,1(%\e(B . ?\e,L%\e(B) (?\e$,1(&\e(B . ?\e,L&\e(B) (?\e$,1('\e(B . ?\e,L'\e(B) (?\e$,1((\e(B . ?\e,L(\e(B) (?\e$,1()\e(B . ?\e,L)\e(B)
 138    (?\e$,1(*\e(B . ?\e,L*\e(B) (?\e$,1(+\e(B . ?\e,L+\e(B) (?\e$,1(,\e(B . ?\e,L,\e(B) (?\e$,1(.\e(B . ?\e,L.\e(B) (?\e$,1(/\e(B . ?\e,L/\e(B)
 139    (?\e$,1(0\e(B . ?\e,L0\e(B) (?\e$,1(1\e(B . ?\e,L1\e(B) (?\e$,1(2\e(B . ?\e,L2\e(B) (?\e$,1(3\e(B . ?\e,L3\e(B) (?\e$,1(4\e(B . ?\e,L4\e(B)
 140    (?\e$,1(5\e(B . ?\e,L5\e(B) (?\e$,1(6\e(B . ?\e,L6\e(B) (?\e$,1(7\e(B . ?\e,L7\e(B) (?\e$,1(8\e(B . ?\e,L8\e(B) (?\e$,1(9\e(B . ?\e,L9\e(B)
 141    (?\e$,1(:\e(B . ?\e,L:\e(B) (?\e$,1(;\e(B . ?\e,L;\e(B) (?\e$,1(<\e(B . ?\e,L<\e(B) (?\e$,1(=\e(B . ?\e,L=\e(B) (?\e$,1(>\e(B . ?\e,L>\e(B)
 142    (?\e$,1(?\e(B . ?\e,L?\e(B) (?\e$,1(@\e(B . ?\e,L@\e(B) (?\e$,1(A\e(B . ?\e,LA\e(B) (?\e$,1(B\e(B . ?\e,LB\e(B) (?\e$,1(C\e(B . ?\e,LC\e(B)
 143    (?\e$,1(D\e(B . ?\e,LD\e(B) (?\e$,1(E\e(B . ?\e,LE\e(B) (?\e$,1(F\e(B . ?\e,LF\e(B) (?\e$,1(G\e(B . ?\e,LG\e(B) (?\e$,1(H\e(B . ?\e,LH\e(B)
 144    (?\e$,1(I\e(B . ?\e,LI\e(B) (?\e$,1(J\e(B . ?\e,LJ\e(B) (?\e$,1(K\e(B . ?\e,LK\e(B) (?\e$,1(L\e(B . ?\e,LL\e(B) (?\e$,1(M\e(B . ?\e,LM\e(B)
 145    (?\e$,1(N\e(B . ?\e,LN\e(B) (?\e$,1(O\e(B . ?\e,LO\e(B) (?\e$,1(P\e(B . ?\e,LP\e(B) (?\e$,1(Q\e(B . ?\e,LQ\e(B) (?\e$,1(R\e(B . ?\e,LR\e(B)
 146    (?\e$,1(S\e(B . ?\e,LS\e(B) (?\e$,1(T\e(B . ?\e,LT\e(B) (?\e$,1(U\e(B . ?\e,LU\e(B) (?\e$,1(V\e(B . ?\e,LV\e(B) (?\e$,1(W\e(B . ?\e,LW\e(B)
 147    (?\e$,1(X\e(B . ?\e,LX\e(B) (?\e$,1(Y\e(B . ?\e,LY\e(B) (?\e$,1(Z\e(B . ?\e,LZ\e(B) (?\e$,1([\e(B . ?\e,L[\e(B) (?\e$,1(\\e(B . ?\e,L\\e(B)
 148    (?\e$,1(]\e(B . ?\e,L]\e(B) (?\e$,1(^\e(B . ?\e,L^\e(B) (?\e$,1(_\e(B . ?\e,L_\e(B) (?\e$,1(`\e(B . ?\e,L`\e(B) (?\e$,1(a\e(B . ?\e,La\e(B)
 149    (?\e$,1(b\e(B . ?\e,Lb\e(B) (?\e$,1(c\e(B . ?\e,Lc\e(B) (?\e$,1(d\e(B . ?\e,Ld\e(B) (?\e$,1(e\e(B . ?\e,Le\e(B) (?\e$,1(f\e(B . ?\e,Lf\e(B)
 150    (?\e$,1(g\e(B . ?\e,Lg\e(B) (?\e$,1(h\e(B . ?\e,Lh\e(B) (?\e$,1(i\e(B . ?\e,Li\e(B) (?\e$,1(j\e(B . ?\e,Lj\e(B) (?\e$,1(k\e(B . ?\e,Lk\e(B)
 151    (?\e$,1(l\e(B . ?\e,Ll\e(B) (?\e$,1(m\e(B . ?\e,Lm\e(B) (?\e$,1(n\e(B . ?\e,Ln\e(B) (?\e$,1(o\e(B . ?\e,Lo\e(B) (?\e$,1(q\e(B . ?\e,Lq\e(B)
 152    (?\e$,1(r\e(B . ?\e,Lr\e(B) (?\e$,1(s\e(B . ?\e,Ls\e(B) (?\e$,1(t\e(B . ?\e,Lt\e(B) (?\e$,1(u\e(B . ?\e,Lu\e(B) (?\e$,1(v\e(B . ?\e,Lv\e(B)
 153    (?\e$,1(w\e(B . ?\e,Lw\e(B) (?\e$,1(x\e(B . ?\e,Lx\e(B) (?\e$,1(y\e(B . ?\e,Ly\e(B) (?\e$,1(z\e(B . ?\e,Lz\e(B) (?\e$,1({\e(B . ?\e,L{\e(B)
 154    (?\e$,1(|\e(B . ?\e,L|\e(B) (?\e$,1(~\e(B . ?\e,L~\e(B) (?\e$,1(\x7f\e(B . ?\e,L\x7f\e(B)))
 155
 156
 157 (defcustom utf-fragment-on-decoding nil
 158   "Whether or not to decode some chars in UTF-8/16 text into iso8859 charsets.
 159 Setting this means that the relevant Cyrillic and Greek characters are
 160 decoded into the iso8859 charsets rather than into
 161 mule-unicode-0100-24ff.  The iso8859 charsets take half as much space
 162 in the buffer, but using them may affect how the buffer can be re-encoded
 163 and may require a different input method to search for them, for instance.
 164 See `unify-8859-on-decoding-mode' and `unify-8859-on-encoding-mode'
 165 for mechanisms to make this largely transparent.
 166
 167 Setting this variable outside customize has no effect."
 168   :set (lambda (s v)
 169          (if v
 170              (progn
 171                (define-translation-table 'utf-translation-table-for-decode
 172                  utf-fragmentation-table)
 173                ;; Even if unify-8859-on-encoding-mode is off, make
 174                ;; mule-utf-* encode characters in
 175                ;; utf-fragmentation-table.
 176                (unless (eq (get 'utf-translation-table-for-encode
 177                                 'translation-table)
 178                            ucs-mule-to-mule-unicode)
 179                  (define-translation-table 'utf-translation-table-for-encode
 180                    utf-defragmentation-table)))
 181            (define-translation-table 'utf-translation-table-for-decode)
 182            ;; When unify-8859-on-encoding-mode is off, be sure to make
 183            ;; mule-utf-* disabled for characters in
 184            ;; utf-fragmentation-table.
 185            (unless (eq (get 'utf-translation-table-for-encode
 186                             'translation-table)
 187                        ucs-mule-to-mule-unicode)
 188              (define-translation-table 'utf-translation-table-for-encode)))
 189          (set-default s v))
 190   :version "22.1"
 191   :type 'boolean
 192   :group 'mule)
 193
 194
 195 (defconst utf-translate-cjk-charsets '(chinese-gb2312
 196                                        chinese-big5-1 chinese-big5-2
 197                                        japanese-jisx0208 japanese-jisx0212
 198                                        katakana-jisx0201
 199                                        korean-ksc5601)
 200   "List of charsets supported by `utf-translate-cjk-mode'.")
 201
 202 (defvar utf-translate-cjk-lang-env nil
 203   "Language environment in which tables for `utf-translate-cjk-mode' is loaded.
 204 The value nil means that the tables are not yet loaded.")
 205
 206 (defvar utf-translate-cjk-unicode-range)
 207
 208 ;; String generated from utf-translate-cjk-unicode-range.  It is
 209 ;; suitable for an argument to skip-chars-forward.
 210 (defvar utf-translate-cjk-unicode-range-string nil)
 211
 212 (defun utf-translate-cjk-set-unicode-range (range)
 213   (setq utf-translate-cjk-unicode-range range)
 214   (setq utf-translate-cjk-unicode-range-string
 215         (let ((decode-char-no-trans
 216                #'(lambda (x)
 217                    (cond ((< x #x100) (make-char 'latin-iso8859-1 x))
 218                          ((< x #x2500)
 219                           (setq x (- x #x100))
 220                           (make-char 'mule-unicode-0100-24ff
 221                                      (+ (/ x 96) 32) (+ (% x 96) 32)))
 222                          ((< x #x3400)
 223                           (setq x (- x #x2500))
 224                           (make-char 'mule-unicode-2500-33ff
 225                                      (+ (/ x 96) 32) (+ (% x 96) 32)))
 226                          (t
 227                           (setq x (- x #xe000))
 228                           (make-char 'mule-unicode-e000-ffff
 229                                      (+ (/ x 96) 32) (+ (% x 96) 32))))))
 230               ranges from to)
 231           (dolist (elt range)
 232             (setq from (max #xA0 (car elt)) to (min #xffff (cdr elt)))
 233             (if (and (>= to #x3400) (< to #xE000))
 234                 (setq to #x33FF))
 235             (cond ((< from #x100)
 236                    (if (>= to #xE000)
 237                        (setq ranges (cons (cons #xE000 to) ranges)
 238                              to #x33FF))
 239                    (if (>= to #x2500)
 240                        (setq ranges (cons (cons #x2500 to) ranges)
 241                              to #x24FF))
 242                    (if (>= to #x100)
 243                        (setq ranges (cons (cons #x100 to) ranges)
 244                              to #xFF)))
 245                   ((< from #x2500)
 246                    (if (>= to #xE000)
 247                        (setq ranges (cons (cons #xE000 to) ranges)
 248                              to #x33FF))
 249                    (if (>= to #x2500)
 250                        (setq ranges (cons (cons #x2500 to) ranges)
 251                              to #x24FF)))
 252                   ((< from #x3400)
 253                    (if (>= to #xE000)
 254                        (setq ranges (cons (cons #xE000 to) ranges)
 255                              to #x33FF))))
 256             (if (<= from to)
 257                 (setq ranges (cons (cons from to) ranges))))
 258           (mapconcat #'(lambda (x)
 259                          (format "%c-%c"
 260                                  (funcall decode-char-no-trans (car x))
 261                                  (funcall decode-char-no-trans (cdr x))))
 262                      ranges "")))
 263   ;; These forces loading and settting tables for
 264   ;; utf-translate-cjk-mode.
 265   (setq utf-translate-cjk-lang-env nil
 266         ucs-mule-cjk-to-unicode (make-hash-table :test 'eq)
 267         ucs-unicode-to-mule-cjk (make-hash-table :test 'eq)))
 268
 269 (defcustom utf-translate-cjk-unicode-range '((#x2e80 . #xd7a3)
 270                                              (#xff00 . #xffef))
 271   "List of Unicode code ranges supported by `utf-translate-cjk-mode'.
 272 Setting this variable directly does not take effect;
 273 use either \\[customize] or the function
 274 `utf-translate-cjk-set-unicode-range'."
 275   :version "22.1"
 276   :type '(repeat (cons integer integer))
 277   :set (lambda (symbol value)
 278          (utf-translate-cjk-set-unicode-range value))
 279   :group 'mule)
 280
 281 ;; Return non-nil if CODE-POINT is in `utf-translate-cjk-unicode-range'.
 282 (defsubst utf-translate-cjk-substitutable-p (code-point)
 283   (let ((tail utf-translate-cjk-unicode-range)
 284         elt)
 285     (while tail
 286       (setq elt (car tail) tail (cdr tail))
 287       (if (and (>= code-point (car elt)) (<= code-point (cdr elt)))
 288           (setq tail nil)
 289         (setq elt nil)))
 290     elt))
 291
 292 (defun utf-translate-cjk-load-tables ()
 293   "Load tables for `utf-translate-cjk-mode'."
 294   ;; Fixme: Allow the use of the CJK charsets to be
 295   ;; customized by reordering and possible omission.
 296   (let ((redefined (< (hash-table-size ucs-mule-cjk-to-unicode) 43000)))
 297     (if redefined
 298         ;; Redefine them with realistic initial sizes and a
 299         ;; smallish rehash size to avoid wasting significant
 300         ;; space after they're built.
 301         (setq ucs-mule-cjk-to-unicode
 302               (make-hash-table :test 'eq :size 43000 :rehash-size 1000)
 303               ucs-unicode-to-mule-cjk
 304               (make-hash-table :test 'eq :size 21500 :rehash-size 1000)))
 305
 306     ;; Load the files explicitly, to avoid having to keep
 307     ;; around the large tables they contain (as well as the
 308     ;; ones which get built).
 309     ;; Here we bind coding-system-for-read to nil so that coding tags
 310     ;; in the files are respected even if the files are not yet
 311     ;; byte-compiled
 312     (let ((coding-system-for-read nil))
 313       (cond ((string= "Korean" current-language-environment)
 314              (load "subst-jis")
 315              (load "subst-big5")
 316              (load "subst-gb2312")
 317              (load "subst-ksc"))
 318             ((string= "Chinese-BIG5" current-language-environment)
 319              (load "subst-jis")
 320              (load "subst-ksc")
 321              (load "subst-gb2312")
 322              (load "subst-big5"))
 323             ((string= "Chinese-GB" current-language-environment)
 324              (load "subst-jis")
 325              (load "subst-ksc")
 326              (load "subst-big5")
 327              (load "subst-gb2312"))
 328             (t
 329              (load "subst-ksc")
 330              (load "subst-gb2312")
 331              (load "subst-big5")
 332              (load "subst-jis")))) ; jis covers as much as big5, gb2312
 333
 334     (when redefined
 335       (define-translation-hash-table 'utf-subst-table-for-decode
 336         ucs-unicode-to-mule-cjk)
 337       (define-translation-hash-table 'utf-subst-table-for-encode
 338         ucs-mule-cjk-to-unicode)
 339       (set-char-table-extra-slot (get 'utf-translation-table-for-encode
 340                                       'translation-table)
 341                                  1 ucs-mule-cjk-to-unicode))
 342
 343     (setq utf-translate-cjk-lang-env current-language-environment)))
 344
 345 (defun utf-lookup-subst-table-for-decode (code-point)
 346   (if (and utf-translate-cjk-mode
 347            (not utf-translate-cjk-lang-env)
 348            (utf-translate-cjk-substitutable-p code-point))
 349       (utf-translate-cjk-load-tables))
 350   (gethash code-point
 351            (get 'utf-subst-table-for-decode 'translation-hash-table)))
 352
 353
 354 (defun utf-lookup-subst-table-for-encode (char)
 355   (if (and utf-translate-cjk-mode
 356            (not utf-translate-cjk-lang-env)
 357            (memq (char-charset char) utf-translate-cjk-charsets))
 358       (utf-translate-cjk-load-tables))
 359   (gethash char
 360            (get 'utf-subst-table-for-encode 'translation-hash-table)))
 361
 362 (define-minor-mode utf-translate-cjk-mode
 363   "Toggle whether UTF based coding systems de/encode CJK characters.
 364 If ARG is an integer, enable if ARG is positive and disable if
 365 zero or negative.  This is a minor mode.
 366 Enabling this allows the coding systems mule-utf-8,
 367 mule-utf-16le and mule-utf-16be to encode characters in the charsets
 368 `korean-ksc5601', `chinese-gb2312', `chinese-big5-1',
 369 `chinese-big5-2', `japanese-jisx0208' and `japanese-jisx0212', and to
 370 decode the corresponding unicodes into such characters.
 371
 372 Where the charsets overlap, the one preferred for decoding is chosen
 373 according to the language environment in effect when this option is
 374 turned on: ksc5601 for Korean, gb2312 for Chinese-GB, big5 for
 375 Chinese-Big5 and jisx for other environments.
 376
 377 This mode is on by default.  If you are not interested in CJK
 378 characters and want to avoid some overhead on encoding/decoding
 379 by the above coding systems, you can customize the user option
 380 `utf-translate-cjk-mode' to nil."
 381   :init-value t
 382   :version "22.1"
 383   :type 'boolean
 384   :group 'mule
 385   :global t
 386   (if utf-translate-cjk-mode
 387       (progn
 388         (define-translation-hash-table 'utf-subst-table-for-decode
 389           ucs-unicode-to-mule-cjk)
 390         (define-translation-hash-table 'utf-subst-table-for-encode
 391           ucs-mule-cjk-to-unicode)
 392         (set-char-table-extra-slot (get 'utf-translation-table-for-encode
 393                                         'translation-table)
 394                                    1 ucs-mule-cjk-to-unicode))
 395     (define-translation-hash-table 'utf-subst-table-for-decode
 396       (make-hash-table :test 'eq))
 397     (define-translation-hash-table 'utf-subst-table-for-encode
 398       (make-hash-table :test 'eq))
 399     (set-char-table-extra-slot (get 'utf-translation-table-for-encode
 400                                     'translation-table)
 401                                1 nil))
 402
 403   ;; Update safe-chars of mule-utf-* coding systems.
 404   (dolist (elt (coding-system-list t))
 405     (if (string-match "^mule-utf" (symbol-name elt))
 406         (let ((safe-charsets (coding-system-get elt 'safe-charsets))
 407               (safe-chars (coding-system-get elt 'safe-chars))
 408               (need-update nil))
 409           (dolist (charset utf-translate-cjk-charsets)
 410             (unless (eq utf-translate-cjk-mode (memq charset safe-charsets))
 411               (setq safe-charsets
 412                     (if utf-translate-cjk-mode
 413                         (cons charset safe-charsets)
 414                       (delq charset safe-charsets))
 415                     need-update t)
 416               (aset safe-chars (make-char charset) utf-translate-cjk-mode)))
 417           (when need-update
 418             (coding-system-put elt 'safe-charsets safe-charsets)
 419             (define-coding-system-internal elt))))))
 420
 421 (define-ccl-program ccl-mule-utf-untrans
 422   ;; R0 is an untranslatable Unicode code-point (U+3500..U+DFFF or
 423   ;; U+10000..U+10FFFF) or an invaid byte (#x00..#xFF).  Write
 424   ;; eight-bit-control/graphic sequence (2 to 4 chars) representing
 425   ;; UTF-8 sequence of r0.  Registers r4, r5, r6 are modified.
 426   ;;
 427   ;; This is a subrountine because we assume that this is called very
 428   ;; rarely (so we don't have to worry about the overhead of the
 429   ;; call).
 430   `(0
 431     ((r5 = ,(charset-id 'eight-bit-control))
 432      (r6 = ,(charset-id 'eight-bit-graphic))
 433      (if (r0 < #x100)
 434          ((r4 = ((r0 >> 6) | #xC0))
 435           (write-multibyte-character r6 r4))
 436        ((if (r0 < #x10000)
 437             ((r4 = ((r0 >> 12) | #xE0))
 438              (write-multibyte-character r6 r4))
 439           ((r4 = ((r0 >> 18) | #xF0))
 440            (write-multibyte-character r6 r4)
 441            (r4 = (((r0 >> 12) & #x3F) | #x80))
 442            (if (r4 < #xA0)
 443                (write-multibyte-character r5 r4)
 444              (write-multibyte-character r6 r4))))
 445         (r4 = (((r0 >> 6) & #x3F) | #x80))
 446         (if (r4 < #xA0)
 447             (write-multibyte-character r5 r4)
 448           (write-multibyte-character r6 r4))))
 449      (r4 = ((r0 & #x3F) | #x80))
 450      (if (r4 < #xA0)
 451          (write-multibyte-character r5 r4)
 452        (write-multibyte-character r6 r4)))))
 453
 454 (define-ccl-program ccl-decode-mule-utf-8
 455   ;;
 456   ;;        charset         | bytes in utf-8 | bytes in emacs
 457   ;; -----------------------+----------------+---------------
 458   ;;         ascii          |       1        |       1
 459   ;; -----------------------+----------------+---------------
 460   ;;    eight-bit-control   |       2        |       2
 461   ;;    eight-bit-graphic   |       2        |       1
 462   ;;     latin-iso8859-1    |       2        |       2
 463   ;; -----------------------+----------------+---------------
 464   ;; mule-unicode-0100-24ff |       2        |       4
 465   ;;        (< 0800)        |                |
 466   ;; -----------------------+----------------+---------------
 467   ;; mule-unicode-0100-24ff |       3        |       4
 468   ;;        (>= 8000)       |                |
 469   ;; mule-unicode-2500-33ff |       3        |       4
 470   ;; mule-unicode-e000-ffff |       3        |       4
 471   ;; -----------------------+----------------+---------------
 472   ;;      invalid byte      |       1        |       2
 473   ;;
 474   ;; Thus magnification factor is two.
 475   ;;
 476   `(2
 477     ((r6 = ,(charset-id 'latin-iso8859-1))
 478      (read r0)
 479      (loop
 480       (if (r0 < #x80)
 481           ;; 1-byte encoding, i.e., ascii
 482           (write-read-repeat r0))
 483       (if (r0 < #xc2)
 484           ;; continuation byte (invalid here) or 1st byte of overlong
 485           ;; 2-byte sequence.
 486           ((call ccl-mule-utf-untrans)
 487            (r6 = ,(charset-id 'latin-iso8859-1))
 488            (read r0)
 489            (repeat)))
 490
 491       ;; Read the 2nd byte.
 492       (read r1)
 493       (if ((r1 & #b11000000) != #b10000000) ; Invalid 2nd byte
 494           ((call ccl-mule-utf-untrans)
 495            (r6 = ,(charset-id 'latin-iso8859-1))
 496            ;; Handle it in the next loop.
 497            (r0 = r1)
 498            (repeat)))
 499
 500       (if (r0 < #xe0)
 501           ;; 2-byte encoding 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
 502           ((r1 &= #x3F)
 503            (r1 |= ((r0 & #x1F) << 6))
 504            ;; Now r1 holds scalar value.  We don't have to check
 505            ;; `overlong sequence' because r0 >= 0xC2.
 506
 507            (if (r1 >= 256)
 508                ;; mule-unicode-0100-24ff (< 0800)
 509                ((r0 = r1)
 510                 (lookup-integer utf-subst-table-for-decode r0 r1)
 511                 (if (r7 == 0)
 512                     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 513                      (r1 -= #x0100)
 514                      (r2 = (((r1 / 96) + 32) << 7))
 515                      (r1 %= 96)
 516                      (r1 += (r2 + 32))
 517                      (translate-character
 518                       utf-translation-table-for-decode r0 r1)))
 519                 (write-multibyte-character r0 r1)
 520                 (read r0)
 521                 (repeat))
 522              (if (r1 >= 160)
 523                  ;; latin-iso8859-1
 524                  ((r0 = r1)
 525                   (lookup-integer utf-subst-table-for-decode r0 r1)
 526                   (if (r7 == 0)
 527                       ((r1 -= 128)
 528                        (write-multibyte-character r6 r1))
 529                     ((write-multibyte-character r0 r1)))
 530                   (read r0)
 531                   (repeat))
 532                ;; eight-bit-control
 533                ((r0 = ,(charset-id 'eight-bit-control))
 534                 (write-multibyte-character r0 r1)
 535                 (read r0)
 536                 (repeat))))))
 537
 538       ;; Read the 3rd bytes.
 539       (read r2)
 540       (if ((r2 & #b11000000) != #b10000000) ; Invalid 3rd byte
 541           ((call ccl-mule-utf-untrans)
 542            (r0 = r1)
 543            (call ccl-mule-utf-untrans)
 544            (r6 = ,(charset-id 'latin-iso8859-1))
 545            ;; Handle it in the next loop.
 546            (r0 = r2)
 547            (repeat)))
 548
 549       (if (r0 < #xF0)
 550           ;; 3byte encoding
 551           ;; zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
 552           ((r3 = ((r0 & #xF) << 12))
 553            (r3 |= ((r1 & #x3F) << 6))
 554            (r3 |= (r2 & #x3F))
 555
 556            (if (r3 < #x800)             ; `overlong sequence'
 557                ((call ccl-mule-utf-untrans)
 558                 (r0 = r1)
 559                 (call ccl-mule-utf-untrans)
 560                 (r0 = r2)
 561                 (call ccl-mule-utf-untrans)
 562                 (r6 = ,(charset-id 'latin-iso8859-1))
 563                 (read r0)
 564                 (repeat)))
 565
 566            (if (r3 < #x2500)
 567                ;; mule-unicode-0100-24ff (>= 0800)
 568                ((r0 = r3)
 569                 (lookup-integer utf-subst-table-for-decode r0 r1)
 570                 (if (r7 == 0)
 571                     ((r0 = ,(charset-id 'mule-unicode-0100-24ff))
 572                      (r3 -= #x0100)
 573                      (r3 //= 96)
 574                      (r1 = (r7 + 32))
 575                      (r1 += ((r3 + 32) << 7))
 576                      (translate-character
 577                       utf-translation-table-for-decode r0 r1)))
 578                 (write-multibyte-character r0 r1)
 579                 (read r0)
 580                 (repeat)))
 581
 582            (if (r3 < #x3400)
 583                ;; mule-unicode-2500-33ff
 584                ((r0 = r3)               ; don't zap r3
 585                 (lookup-integer utf-subst-table-for-decode r0 r1)
 586                 (if (r7 == 0)
 587                     ((r0 = ,(charset-id 'mule-unicode-2500-33ff))
 588                      (r3 -= #x2500)
 589                      (r3 //= 96)
 590                      (r1 = (r7 + 32))
 591                      (r1 += ((r3 + 32) << 7))))
 592                 (write-multibyte-character r0 r1)
 593                 (read r0)
 594                 (repeat)))
 595
 596            (if (r3 < #xE000)
 597                ;; Try to convert to CJK chars, else
 598                ;; keep them as eight-bit-{control|graphic}.
 599                ((r0 = r3)
 600                 (lookup-integer utf-subst-table-for-decode r3 r1)
 601                 (if r7
 602                     ;; got a translation
 603                     ((write-multibyte-character r3 r1)
 604                      (read r0)
 605                      (repeat))
 606                   ((call ccl-mule-utf-untrans)
 607                    (r6 = ,(charset-id 'latin-iso8859-1))
 608                    (read r0)
 609                    (repeat)))))
 610
 611            ;; mule-unicode-e000-ffff
 612            ;; Fixme: fffe and ffff are invalid.
 613            (r0 = r3)            ; don't zap r3
 614            (lookup-integer utf-subst-table-for-decode r0 r1)
 615            (if (r7 == 0)
 616                ((r0 = ,(charset-id 'mule-unicode-e000-ffff))
 617                 (r3 -= #xe000)
 618                 (r3 //= 96)
 619                 (r1 = (r7 + 32))
 620                 (r1 += ((r3 + 32) << 7))))
 621            (write-multibyte-character r0 r1)
 622            (read r0)
 623            (repeat)))
 624
 625       ;; Read the 4th bytes.
 626       (read r3)
 627       (if ((r3 & #b11000000) != #b10000000) ; Invalid 4th byte
 628           ((call ccl-mule-utf-untrans)
 629            (r0 = r1)
 630            (call ccl-mule-utf-untrans)
 631            (r0 = r2)
 632            (call ccl-mule-utf-untrans)
 633            (r6 = ,(charset-id 'latin-iso8859-1))
 634            ;; Handle it in the next loop.
 635            (r0 = r3)
 636            (repeat)))
 637
 638       (if (r0 < #xF8)
 639           ;; 4-byte encoding:
 640           ;; wwwzzzzzzyyyyyyxxxxxx = 11110www 10zzzzzz 10yyyyyy 10xxxxxx
 641           ;; keep those bytes as eight-bit-{control|graphic}
 642           ;; Fixme: allow lookup in utf-subst-table-for-decode.
 643           ((r4 = ((r0 & #x7) << 18))
 644            (r4 |= ((r1 & #x3F) << 12))
 645            (r4 |= ((r2 & #x3F) << 6))
 646            (r4 |= (r3 & #x3F))
 647
 648            (if (r4 < #x10000)           ; `overlong sequence'
 649                ((call ccl-mule-utf-untrans)
 650                 (r0 = r1)
 651                 (call ccl-mule-utf-untrans)
 652                 (r0 = r2)
 653                 (call ccl-mule-utf-untrans)
 654                 (r0 = r3)
 655                 (call ccl-mule-utf-untrans))
 656              ((r0 = r4)
 657               (call ccl-mule-utf-untrans))))
 658
 659         ;; Unsupported sequence.
 660         ((call ccl-mule-utf-untrans)
 661          (r0 = r1)
 662          (call ccl-mule-utf-untrans)
 663          (r0 = r2)
 664          (call ccl-mule-utf-untrans)
 665          (r0 = r3)
 666          (call ccl-mule-utf-untrans)))
 667       (r6 = ,(charset-id 'latin-iso8859-1))
 668       (read r0)
 669       (repeat)))
 670
 671
 672     ;; At EOF...
 673     (if (r0 >= 0)
 674         ;; r0 >= #x80
 675         ((call ccl-mule-utf-untrans)
 676          (if (r1 >= 0)
 677              ((r0 = r1)
 678               (call ccl-mule-utf-untrans)
 679               (if (r2 >= 0)
 680                   ((r0 = r2)
 681                    (call ccl-mule-utf-untrans)
 682                    (if (r3 >= 0)
 683                        ((r0 = r3)
 684                         (call ccl-mule-utf-untrans))))))))))
 685
 686   "CCL program to decode UTF-8.
 687 Basic decoding is done into the charsets ascii, latin-iso8859-1 and
 688 mule-unicode-*, but see also `utf-fragmentation-table' and
 689 `ucs-mule-cjk-to-unicode'.
 690 Encodings of un-representable Unicode characters are decoded asis into
 691 eight-bit-control and eight-bit-graphic characters.")
 692
 693 (define-ccl-program ccl-mule-utf-8-encode-untrans
 694   ;; UTF-8 decoder generates an UTF-8 sequence represented by a
 695   ;; sequence eight-bit-control/graphic chars for an untranslatable
 696   ;; character and an invalid byte.
 697   ;;
 698   ;; This CCL parses that sequence (the first byte is already in r1),
 699   ;; writes out the original bytes of that sequence, and sets r5 to
 700   ;; -1.
 701   ;;
 702   ;; If the eight-bit-control/graphic sequence is shorter than what r1
 703   ;; suggests, it sets r5 and r6 to the last character read that
 704   ;; should be handled by the next loop of a caller.
 705   ;;
 706   ;; Note: For UTF-8 validation, we only check if a character is
 707   ;; eight-bit-control/graphic or not.  It may result in incorrect
 708   ;; handling of random binary data, but such a data can't be encoded
 709   ;; by UTF-8 anyway.  At least, UTF-8 decoders doesn't generate such
 710   ;; a sequence even if a source contains invalid byte-sequence.
 711   `(0
 712     (;; Read the 2nd byte.
 713      (read-multibyte-character r5 r6)
 714      (r0 = (r5 != ,(charset-id 'eight-bit-control)))
 715      (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
 716          ((write r1)                    ; invalid UTF-8
 717           (r1 = -1)
 718           (end)))
 719
 720      (if (r1 <= #xC3)
 721          ;; 2-byte sequence for an originally invalid byte.
 722          ((r6 &= #x3F)
 723           (r6 |= ((r1 & #x1F) << 6))
 724           (write r6)
 725           (r5 = -1)
 726           (end)))
 727
 728      (write r1 r6)
 729      (r2 = r1)
 730      (r1 = -1)
 731      ;; Read the 3rd byte.
 732      (read-multibyte-character r5 r6)
 733      (r0 = (r5 != ,(charset-id 'eight-bit-control)))
 734      (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
 735          (end))                         ; invalid UTF-8
 736      (write r6)
 737      (if (r2 < #xF0)
 738          ;; 3-byte sequence for an untranslated character.
 739          ((r5 = -1)
 740           (end)))
 741      ;; Read the 4th byte.
 742      (read-multibyte-character r5 r6)
 743      (r0 = (r5 != ,(charset-id 'eight-bit-control)))
 744      (if ((r5 != ,(charset-id 'eight-bit-graphic)) & r0)
 745          (end))                 ; invalid UTF-8
 746      ;; 4-byte sequence for an untranslated character.
 747      (write r6)
 748      (r5 = -1)
 749      (end))
 750
 751     ;; At EOF...
 752     ((r5 = -1)
 753      (if (r1 >= 0)
 754          (write r1)))))
 755
 756 (define-ccl-program ccl-encode-mule-utf-8
 757   `(1
 758     ((r5 = -1)
 759      (loop
 760       (if (r5 < 0)
 761           (read-multibyte-character r0 r1)
 762         ;; Pre-read character is in r5 (charset-ID) and r6 (code-point).
 763         ((r0 = r5)
 764          (r1 = r6)
 765          (r5 = -1)))
 766       (translate-character utf-translation-table-for-encode r0 r1)
 767
 768       (if (r0 == ,(charset-id 'ascii))
 769           (write-repeat r1))
 770
 771       (if (r0 == ,(charset-id 'latin-iso8859-1))
 772           ;; r1          scalar                  utf-8
 773           ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 774           ;; 20    0000 0000 1010 0000    1100 0010 1010 0000
 775           ;; 7f    0000 0000 1111 1111    1100 0011 1011 1111
 776           ((write ((r1 >> 6) | #xc2))
 777            (r1 &= #x3f)
 778            (r1 |= #x80)
 779            (write-repeat r1)))
 780
 781       (if (r0 == ,(charset-id 'mule-unicode-0100-24ff))
 782           ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 783            ;; #x3f80 == (0011 1111 1000 0000)b
 784            (r1 &= #x7f)
 785            (r1 += (r0 + 224))           ; 240 == -32 + #x0100
 786            ;; now r1 holds scalar value
 787            (if (r1 < #x0800)
 788                ;; 2byte encoding
 789                ((write ((r1 >> 6) | #xC0))
 790                 (r1 &= #x3F)
 791                 (r1 |= #x80)
 792                 (write-repeat r1))
 793              ;; 3byte encoding
 794              ((write ((r1 >> 12) | #xE0))
 795               (write  (((r1 & #x0FC0) >> 6) | #x80))
 796               (r1 &= #x3F)
 797               (r1 |= #x80)
 798               (write-repeat r1)))))
 799
 800       (if (r0 == ,(charset-id 'mule-unicode-2500-33ff))
 801           ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 802            (r1 &= #x7f)
 803            (r1 += (r0 + 9440))          ; 9440 == -32 + #x2500
 804            ;; now r1 holds scalar value
 805            (write ((r1 >> 12) | #xE0))
 806            (write  (((r1 & #x0FC0) >> 6) | #x80))
 807            (r1 &= #x3F)
 808            (r1 |= #x80)
 809            (write-repeat r1)))
 810
 811       (if (r0 == ,(charset-id 'mule-unicode-e000-ffff))
 812           ((r0 = ((((r1 & #x3f80) >> 7) - 32) * 96))
 813            (r1 &= #x7f)
 814            (r1 += (r0 + 57312))         ; 57312 == -32 + #xe000
 815            ;; now r1 holds scalar value
 816            (write ((r1 >> 12) | #xE0))
 817            (write  (((r1 & #x0FC0) >> 6) | #x80))
 818            (r1 &= #x3F)
 819            (r1 |= #x80)
 820            (write-repeat r1)))
 821
 822       (if (r0 == ,(charset-id 'eight-bit-control))
 823           ;; r1          scalar                  utf-8
 824           ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 825           ;; 80    0000 0000 1000 0000    1100 0010 1000 0000
 826           ;; 9f    0000 0000 1001 1111    1100 0010 1001 1111
 827           ((write #xC2)
 828            (write-repeat r1)))
 829
 830       (if (r0 == ,(charset-id 'eight-bit-graphic))
 831           ;; r1          scalar                  utf-8
 832           ;;       0000 0yyy yyxx xxxx    110y yyyy 10xx xxxx
 833           ;; a0    0000 0000 1010 0000    1100 0010 1010 0000
 834           ;; ff    0000 0000 1111 1111    1101 1111 1011 1111
 835           ((r0 = (r1 >= #xC0))
 836            (r0 &= (r1 <= #xC3))
 837            (r4 = (r1 >= #xE1))
 838            (r4 &= (r1 <= #xF7))
 839            (r0 |= r4)
 840            (if r0
 841                ((call ccl-mule-utf-8-encode-untrans)
 842                 (repeat))
 843              (write-repeat r1))))
 844
 845       (lookup-character utf-subst-table-for-encode r0 r1)
 846       (if r7            ; lookup succeeded
 847           (if (r0 < #x800)
 848               ;; 2byte encoding
 849               ((write ((r0 >> 6) | #xC0))
 850                (r0 = ((r0 & #x3F) | #x80))
 851                (write-repeat r0))
 852             ;; 3byte encoding
 853             ((write ((r0 >> 12) | #xE0))
 854              (write  (((r0 & #x0FC0) >> 6) | #x80))
 855              (r0 = ((r0 & #x3F) | #x80))
 856              (write-repeat r0))))
 857
 858       ;; Unsupported character.
 859       ;; Output U+FFFD, which is `ef bf bd' in UTF-8.
 860       (write #xef)
 861       (write #xbf)
 862       (write-repeat #xbd))))
 863   "CCL program to encode into UTF-8.")
 864
 865
 866 (define-ccl-program ccl-untranslated-to-ucs
 867   `(0
 868     (if (r1 == 0)
 869         nil
 870       (if (r0 <= #xC3)                  ; 2-byte encoding
 871           ((r0 = ((r0 & #x3) << 6))
 872            (r0 |= (r1 & #x3F))
 873            (r1 = 2))
 874         (if (r2 == 0)
 875             (r1 = 0)
 876           (if (r0 < #xF0)               ; 3-byte encoding, as above
 877               ((r0 = ((r0 & #xF) << 12))
 878                (r0 |= ((r1 & #x3F) << 6))
 879                (r0 |= (r2 & #x3F))
 880                (r1 = 3))
 881             (if (r3 == 0)
 882                 (r1 = 0)
 883               ((r0 = ((r0 & #x7) << 18))
 884                (r0 |= ((r1 & #x3F) << 12))
 885                (r0 |= ((r2 & #x3F) << 6))
 886                (r0 |= (r3 & #x3F))
 887                (r1 = 4))))))))
 888   "Decode 2-, 3-, or 4-byte sequences in r0, r1, r2 [,r3] to unicodes in r0.
 889 Set r1 to the byte length.  r0 == 0 for invalid sequence.")
 890
 891 (defvar utf-8-ccl-regs (make-vector 8 0))
 892
 893 (defsubst utf-8-untranslated-to-ucs ()
 894   "Return the UCS code for an untranslated sequence of raw bytes t point.
 895 Only for 3- or 4-byte sequences."
 896   (aset utf-8-ccl-regs 0 (or (char-after) 0))
 897   (aset utf-8-ccl-regs 1 (or (char-after (1+ (point))) 0))
 898   (aset utf-8-ccl-regs 2 (or (char-after (+ 2 (point))) 0))
 899   (aset utf-8-ccl-regs 3 (or (char-after (+ 3 (point))) 0))
 900   (ccl-execute 'ccl-untranslated-to-ucs utf-8-ccl-regs))
 901
 902 (defun utf-8-help-echo (window object position)
 903   (format "Untranslated Unicode U+%04X"
 904           (get-char-property position 'untranslated-utf-8 object)))
 905
 906 ;; We compose the untranslatable sequences into a single character,
 907 ;; and move point to the next character.
 908 ;; This is infelicitous for editing, because there's currently no
 909 ;; mechanism for treating compositions as atomic, but is OK for
 910 ;; display.  They are composed to U+FFFD with help-echo which
 911 ;; indicates the unicodes they represent.  This function GCs too much.
 912
 913 ;; If utf-translate-cjk-mode is non-nil, this function is called with
 914 ;; HASH-TABLE which translates CJK characters into some of CJK
 915 ;; charsets.
 916
 917 (defsubst utf-8-compose (hash-table)
 918   "Put a suitable composition on an untranslatable sequence at point.
 919 If HASH-TABLE is non-nil, try to translate CJK characters by it at first.
 920 Move point to the end of the sequence."
 921   (utf-8-untranslated-to-ucs)
 922   (let ((l (aref utf-8-ccl-regs 1))
 923         ch)
 924     (if (> l 0)
 925         (if (and hash-table
 926                  (setq ch (gethash (aref utf-8-ccl-regs 0)  hash-table)))
 927             (progn
 928               (insert ch)
 929               (delete-region (point) (min (point-max) (+ l (point)))))
 930           (setq ch (aref utf-8-ccl-regs 0))
 931           (put-text-property (point) (min (point-max) (+ l (point)))
 932                              'untranslated-utf-8 ch)
 933           (put-text-property (point) (min (point-max) (+ l (point)))
 934                              'help-echo 'utf-8-help-echo)
 935           (if (= l 2)
 936               (put-text-property (point) (min (point-max) (+ l (point)))
 937                                  'display (format "\\%03o" ch))
 938             (compose-region (point) (+ l (point)) ?\e$,3u=\e(B))
 939           (forward-char l))
 940       (forward-char 1))))
 941
 942 (defcustom utf-8-compose-scripts nil
 943   "*Non-nil means compose various scripts on decoding utf-8 text."
 944   :group 'mule
 945   :version "22.1"
 946   :type 'boolean)
 947
 948 (defun utf-8-post-read-conversion (length)
 949   "Compose untranslated utf-8 sequences into single characters.
 950 If `utf-translate-cjk-mode' is non-nil, tries to translate CJK characters.
 951 Also compose particular scripts if `utf-8-compose-scripts' is non-nil."
 952   (save-excursion
 953     (save-restriction
 954       (narrow-to-region (point) (+ (point) length))
 955       ;; Can't do eval-when-compile to insert a multibyte constant
 956       ;; version of the string in the loop, since it's always loaded as
 957       ;; unibyte from a byte-compiled file.
 958       (let ((range (string-as-multibyte "^\xc0-\xc3\xe1-\xf7"))
 959             (buffer-multibyte enable-multibyte-characters)
 960             hash-table ch)
 961         (set-buffer-multibyte t)
 962         (when utf-translate-cjk-mode
 963           (unless utf-translate-cjk-lang-env
 964             ;; Check these characters in utf-translate-cjk-range.
 965             ;; We may have to translate them to CJK charsets.
 966             (skip-chars-forward
 967              (concat range utf-translate-cjk-unicode-range-string))
 968             (unless (eobp)
 969               (utf-translate-cjk-load-tables)
 970               (setq range
 971                     (concat range utf-translate-cjk-unicode-range-string)))
 972             (setq hash-table (get 'utf-subst-table-for-decode
 973                                   'translation-hash-table))))
 974         (while (and (skip-chars-forward range)
 975                     (not (eobp)))
 976           (setq ch (following-char))
 977           (if (< ch 256)
 978               (utf-8-compose hash-table)
 979             (if (and hash-table
 980                      (setq ch (gethash (encode-char ch 'ucs) hash-table)))
 981                 (progn
 982                   (insert ch)
 983                   (delete-char 1))
 984               (forward-char 1))))
 985         (or buffer-multibyte
 986             (set-buffer-multibyte nil)))
 987
 988       (when (and utf-8-compose-scripts (> length 1))
 989         ;; These currently have definitions which cover the relevant
 990         ;; unicodes.  We could avoid loading thai-util &c by checking
 991         ;; whether the region contains any characters with the appropriate
 992         ;; categories.  There aren't yet Unicode-based rules for Tibetan.
 993         (diacritic-compose-region (point-max) (point-min))
 994         (thai-compose-region (point-max) (point-min))
 995         (lao-compose-region (point-max) (point-min))
 996         (devanagari-compose-region (point-max) (point-min))
 997         (malayalam-compose-region (point-max) (point-min))
 998         (tamil-compose-region (point-max) (point-min)))
 999       (- (point-max) (point-min)))))
1000
1001 (defun utf-8-pre-write-conversion (beg end)
1002   "Prepare for `utf-translate-cjk-mode' to encode text between BEG and END.
1003 This is used as a post-read-conversion of utf-8 coding system."
1004   (if (and utf-translate-cjk-mode
1005            (not utf-translate-cjk-lang-env)
1006            (save-excursion
1007              (goto-char beg)
1008              (re-search-forward "\\cc\\|\\cj\\|\\ch" end t)))
1009       (utf-translate-cjk-load-tables))
1010   nil)
1011
1012 (make-coding-system
1013  'mule-utf-8 4 ?u
1014  "UTF-8 encoding for Emacs-supported Unicode characters.
1015 It supports Unicode characters of these ranges:
1016     U+0000..U+33FF, U+E000..U+FFFF.
1017 They correspond to these Emacs character sets:
1018     ascii, latin-iso8859-1, mule-unicode-0100-24ff,
1019     mule-unicode-2500-33ff, mule-unicode-e000-ffff
1020
1021 On decoding (e.g. reading a file), Unicode characters not in the above
1022 ranges are decoded into sequences of eight-bit-control and
1023 eight-bit-graphic characters to preserve their byte sequences.  The
1024 byte sequence is preserved on i/o for valid utf-8, but not necessarily
1025 for invalid utf-8.
1026
1027 On encoding (e.g. writing a file), Emacs characters not belonging to
1028 any of the character sets listed above are encoded into the UTF-8 byte
1029 sequence representing U+FFFD (REPLACEMENT CHARACTER)."
1030
1031  '(ccl-decode-mule-utf-8 . ccl-encode-mule-utf-8)
1032  `((safe-charsets
1033     ascii
1034     eight-bit-control
1035     eight-bit-graphic
1036     latin-iso8859-1
1037     mule-unicode-0100-24ff
1038     mule-unicode-2500-33ff
1039     mule-unicode-e000-ffff
1040     ,@(if utf-translate-cjk-mode
1041           utf-translate-cjk-charsets))
1042    (mime-charset . utf-8)
1043    (coding-category . coding-category-utf-8)
1044    (valid-codes (0 . 255))
1045    (pre-write-conversion . utf-8-pre-write-conversion)
1046    (post-read-conversion . utf-8-post-read-conversion)
1047    (translation-table-for-encode . utf-translation-table-for-encode)
1048    (dependency unify-8859-on-encoding-mode
1049                unify-8859-on-decoding-mode
1050                utf-fragment-on-decoding
1051                utf-translate-cjk-mode)))
1052
1053 (define-coding-system-alias 'utf-8 'mule-utf-8)
1054
1055 ;; I think this needs special private charsets defined for the
1056 ;; untranslated sequences, if it's going to work well.
1057
1058 ;;; (defun utf-8-compose-function (pos to pattern &optional string)
1059 ;;;   (let* ((prop (get-char-property pos 'composition string))
1060 ;;;      (l (and prop (- (cadr prop) (car prop)))))
1061 ;;;     (cond ((and l (> l (- to pos)))
1062 ;;;        (delete-region pos to))
1063 ;;;       ((and (> (char-after pos) 224)
1064 ;;;             (< (char-after pos) 256)
1065 ;;;             (save-restriction
1066 ;;;               (narrow-to-region pos to)
1067 ;;;               (utf-8-compose)))
1068 ;;;        t))))
1069
1070 ;;; (dotimes (i 96)
1071 ;;;   (aset composition-function-table
1072 ;;;     (+ 128 i)
1073 ;;;     `((,(string-as-multibyte "[\200-\237\240-\377]")
1074 ;;;        . utf-8-compose-function))))
1075
1076 ;;; arch-tag: b08735b7-753b-4ae6-b754-0f3efe4515c5
1077 ;;; utf-8.el ends here