src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  21 Boston, MA 02110-1301, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      const unsigned char *source;
 151      unsigned char *destination;
 152      int src_bytes, dst_bytes;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 159
 160   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 161   internal multibyte format to CODING.  The resulting unibyte text
 162   goes to a place pointed to by DESTINATION, the length of which
 163   should not exceed DST_BYTES.
 164
 165   These functions set the information about original and encoded texts
 166   in the members `produced', `produced_char', `consumed', and
 167   `consumed_char' of the structure *CODING.  They also set the member
 168   `result' to one of CODING_FINISH_XXX indicating how the encoding
 169   finished.
 170
 171   DST_BYTES zero means that the source area and destination area are
 172   overlapped, which means that we can produce encoded text until it
 173   reaches the head of the not-yet-encoded source text.
 174
 175   Below is a template for these functions.  */
 176 #if 0
 177 static void
 178 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 179      struct coding_system *coding;
 180      unsigned char *source, *destination;
 181      int src_bytes, dst_bytes;
 182 {
 183   ...
 184 }
 185 #endif
 186
 187 /*** COMMONLY USED MACROS ***/
 188
 189 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 190    get one and two bytes from the source text respectively.  If there
 191    are not enough bytes in the source, they set coding->result to
 192    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'.
 193    The caller should set variables `coding', `src' and `src_end' to
 194    appropriate pointers in advance.  These macros are called from
 195    decoding routines `decode_coding_XXX'; unibyte source is assumed.  */
 196
 197 #define ONE_MORE_BYTE(c1)                                       \
 198   do {                                                          \
 199     if (src >= src_end)                                         \
 200       {                                                         \
 201         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 202         goto label_end_of_loop;                                 \
 203       }                                                         \
 204     c1 = *src++;                                                \
 205   } while (0)
 206
 207 #define TWO_MORE_BYTES(c1, c2)                                  \
 208   do {                                                          \
 209     if (src + 1 >= src_end)   /* Fewer than two bytes left.  */ \
 210       {                                                         \
 211         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 212         goto label_end_of_loop;                                 \
 213       }                                                         \
 214     c1 = *src++;                                                \
 215     c2 = *src++;                                                \
 216   } while (0)
 217
 218
 219 /* Like ONE_MORE_BYTE, but if MULTIBYTEP is nonzero, the two-byte form
 220    LEADING_CODE_8_BIT_CONTROL BYTE is read as one byte, BYTE - 0x20.  */
 221
 222 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 223   do {                                                          \
 224     if (src >= src_end)                                         \
 225       {                                                         \
 226         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 227         goto label_end_of_loop;                                 \
 228       }                                                         \
 229     c1 = *src++;                                                \
 230     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 231       c1 = *src++ - 0x20;  /* NOTE(review): unchecked read.  */ \
 232   } while (0)
 233
 234 /* Set C to the next character of the source text pointed to by `src'.
 235    If there are not enough characters in the source, jump to
 236    `label_end_of_loop'.  The caller should set variables `coding',
 237    `src', `src_end', and `translation_table' to appropriate values
 238    in advance.  This macro is used in encoding routines
 239    `encode_coding_XXX', thus it assumes that the source text is in
 240    multibyte form except for 8-bit characters.  8-bit characters are
 241    in multibyte form if coding->src_multibyte is nonzero, else they
 242    are represented by a single byte.  */
 243
 244 #define ONE_MORE_CHAR(c)                                        \
 245   do {                                                          \
 246     int len = src_end - src;                                    \
 247     int bytes;                                                  \
 248     if (len <= 0)                                               \
 249       {                                                         \
 250         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 251         goto label_end_of_loop;                                 \
 252       }                                                         \
 253     if (coding->src_multibyte                                   \
 254         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 255       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 256     else                                                        \
 257       c = *src, bytes = 1;      /* A single raw byte.  */       \
 258     if (!NILP (translation_table))                              \
 259       c = translate_char (translation_table, c, -1, 0, 0);      \
 260     src += bytes;                                               \
 261   } while (0)
 262
 263
 264 /* Produce a multibyte form of character C to `dst'.  Jump to
 265    `label_end_of_loop' if there's not enough space at `dst' (the limit
 266    is `dst_end', or `src' if `dst_bytes' is zero).
 267
 268    If we are now in the middle of a composition sequence, C is written
 269    to `dst' only for COMPOSITION_RELATIVE and COMPOSITION_WITH_RULE,
 270    and is recorded in coding->cmp_data->data unless COMPOSITION_RELATIVE.
 271
 272    This macro is used in decoding routines.  */
 273
 274 #define EMIT_CHAR(c)                                                    \
 275   do {                                                                  \
 276     if (! COMPOSING_P (coding)                                          \
 277         || coding->composing == COMPOSITION_RELATIVE                    \
 278         || coding->composing == COMPOSITION_WITH_RULE)                  \
 279       {                                                                 \
 280         int bytes = CHAR_BYTES (c);                                     \
 281         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 282           {                                                             \
 283             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 284             goto label_end_of_loop;                                     \
 285           }                                                             \
 286         dst += CHAR_STRING (c, dst);                                    \
 287         coding->produced_char++;                                        \
 288       }                                                                 \
 289                                                                         \
 290     if (COMPOSING_P (coding)                                            \
 291         && coding->composing != COMPOSITION_RELATIVE)                   \
 292       {                                                                 \
 293         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 294         coding->composition_rule_follows                                \
 295           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 296       }                                                                 \
 297   } while (0)
 298
 299 /* Emit one byte C to `dst', or jump to `label_end_of_loop' if no room.  */
 300 #define EMIT_ONE_BYTE(c)                                        \
 301   do {                                                          \
 302     if (dst >= (dst_bytes ? dst_end : src))                     \
 303       {                                                         \
 304         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 305         goto label_end_of_loop;                                 \
 306       }                                                         \
 307     *dst++ = c;                                                 \
 308   } while (0)
 309 /* Emit bytes C1 and C2 to `dst', or neither if both do not fit.  */
 310 #define EMIT_TWO_BYTES(c1, c2)                                  \
 311   do {                                                          \
 312     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 313       {                                                         \
 314         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 315         goto label_end_of_loop;                                 \
 316       }                                                         \
 317     *dst++ = c1, *dst++ = c2;                                   \
 318   } while (0)
 319 /* Emit the bytes in [FROM, TO) to `dst', advancing FROM, if all fit.  */
 320 #define EMIT_BYTES(from, to)                                    \
 321   do {                                                          \
 322     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 323       {                                                         \
 324         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 325         goto label_end_of_loop;                                 \
 326       }                                                         \
 327     while (from < to)                                           \
 328       *dst++ = *from++;                                         \
 329   } while (0)
 330
 331 \f
 332 /*** 1. Preamble ***/
 333
 334 #ifdef emacs
 335 #include <config.h>
 336 #endif
 337
 338 #include <stdio.h>
 339
 340 #ifdef emacs
 341
 342 #include "lisp.h"
 343 #include "buffer.h"
 344 #include "charset.h"
 345 #include "composite.h"
 346 #include "ccl.h"
 347 #include "coding.h"
 348 #include "window.h"
 349 #include "intervals.h"
 350
 351 #else  /* not emacs */
 352
 353 #include "mulelib.h"
 354
 355 #endif /* not emacs */
 356
 357 Lisp_Object Qcoding_system, Qeol_type;
 358 Lisp_Object Qbuffer_file_coding_system;
 359 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 360 Lisp_Object Qno_conversion, Qundecided;
 361 Lisp_Object Qcoding_system_history;
 362 Lisp_Object Qsafe_chars;
 363 Lisp_Object Qvalid_codes;
 364
 365 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 366 Lisp_Object Qcall_process, Qcall_process_region;
 367 Lisp_Object Qstart_process, Qopen_network_stream;
 368 Lisp_Object Qtarget_idx;
 369
 370 /* If a symbol has this property, evaluate the value to define the
 371    symbol as a coding system.  */
 372 Lisp_Object Qcoding_system_define_form;
 373
 374 Lisp_Object Vselect_safe_coding_system_function;
 375
 376 int coding_system_require_warning;
 377
 378 /* Mnemonic string for each format of end-of-line.  */
 379 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 380 /* Mnemonic string to indicate format of end-of-line is not yet
 381    decided.  */
 382 Lisp_Object eol_mnemonic_undecided;
 383
 384 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 385    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 386 int system_eol_type;
 387
 388 #ifdef emacs
 389
 390 /* Information about which coding system is safe for which chars.
 391    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 392
 393    GENERIC-LIST is a list of generic coding systems which can encode
 394    any characters.
 395
 396    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 397    corresponding char table that contains safe chars.  */
 398 Lisp_Object Vcoding_system_safe_chars;
 399
 400 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 401
 402 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 403
 404 /* Coding system emacs-mule and raw-text are for converting only
 405    end-of-line format.  */
 406 Lisp_Object Qemacs_mule, Qraw_text;
 407
 408 Lisp_Object Qutf_8;
 409
 410 /* Coding-systems are handed between Emacs Lisp programs and C internal
 411    routines by the following three variables.  */
 412 /* Coding-system for reading files and receiving data from process.  */
 413 Lisp_Object Vcoding_system_for_read;
 414 /* Coding-system for writing files and sending data to process.  */
 415 Lisp_Object Vcoding_system_for_write;
 416 /* Coding-system actually used in the latest I/O.  */
 417 Lisp_Object Vlast_coding_system_used;
 418
 419 /* A vector of length 256 which contains information about special
 420    Latin codes (especially for dealing with Microsoft codes).  */
 421 Lisp_Object Vlatin_extra_code_table;
 422
 423 /* Flag to inhibit code conversion of end-of-line format.  */
 424 int inhibit_eol_conversion;
 425
 426 /* Flag to inhibit ISO2022 escape sequence detection.  */
 427 int inhibit_iso_escape_detection;
 428
 429 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 430 int inherit_process_coding_system;
 431
 432 /* Coding system to be used to encode text for terminal display.  */
 433 struct coding_system terminal_coding;
 434
 435 /* Coding system to be used to encode text for terminal display when
 436    terminal coding system is nil.  */
 437 struct coding_system safe_terminal_coding;
 438
 439 /* Coding system of what is sent from terminal keyboard.  */
 440 struct coding_system keyboard_coding;
 441
 442 /* Default coding system to be used to write a file.  */
 443 struct coding_system default_buffer_file_coding;
 444
 445 Lisp_Object Vfile_coding_system_alist;
 446 Lisp_Object Vprocess_coding_system_alist;
 447 Lisp_Object Vnetwork_coding_system_alist;
 448
 449 Lisp_Object Vlocale_coding_system;
 450
 451 #endif /* emacs */
 452
 453 Lisp_Object Qcoding_category, Qcoding_category_index;
 454
 455 /* List of symbols `coding-category-xxx' ordered by priority.  */
 456 Lisp_Object Vcoding_category_list;
 457
 458 /* Table of coding categories (Lisp symbols).  */
 459 Lisp_Object Vcoding_category_table;
 460
 461 /* Table of the symbol name of each coding category.  */
 462 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 463   "coding-category-emacs-mule",
 464   "coding-category-sjis",
 465   "coding-category-iso-7",
 466   "coding-category-iso-7-tight",
 467   "coding-category-iso-8-1",
 468   "coding-category-iso-8-2",
 469   "coding-category-iso-7-else",
 470   "coding-category-iso-8-else",
 471   "coding-category-ccl",
 472   "coding-category-big5",
 473   "coding-category-utf-8",
 474   "coding-category-utf-16-be",
 475   "coding-category-utf-16-le",
 476   "coding-category-raw-text",
 477   "coding-category-binary"
 478 };
 479
 480 /* Table of pointers to coding systems corresponding to each coding
 481    category.  */
 482 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 483
 484 /* Table of coding category masks.  Nth element is a mask for a coding
 485    category of which priority is Nth.  */
 486 static
 487 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 488
 489 /* Flag to tell if we look up translation table on character code
 490    conversion.  */
 491 Lisp_Object Venable_character_translation;
 492 /* Standard translation table to look up on decoding (reading).  */
 493 Lisp_Object Vstandard_translation_table_for_decode;
 494 /* Standard translation table to look up on encoding (writing).  */
 495 Lisp_Object Vstandard_translation_table_for_encode;
 496
 497 Lisp_Object Qtranslation_table;
 498 Lisp_Object Qtranslation_table_id;
 499 Lisp_Object Qtranslation_table_for_decode;
 500 Lisp_Object Qtranslation_table_for_encode;
 501
 502 /* Alist of charsets vs revision number.  */
 503 Lisp_Object Vcharset_revision_alist;
 504
 505 /* Default coding systems used for process I/O.  */
 506 Lisp_Object Vdefault_process_coding_system;
 507
 508 /* Char table for translating Quail and self-inserting input.  */
 509 Lisp_Object Vtranslation_table_for_input;
 510
 511 /* Global flag to tell that we can't call post-read-conversion and
 512    pre-write-conversion functions.  Usually the value is zero, but it
 513    is set to 1 temporarily while such functions are running.  This is
 514    to avoid infinite recursive call.  */
 515 static int inhibit_pre_post_conversion;
 516
 517 Lisp_Object Qchar_coding_system;
 518
 519 /* Return the `safe-chars' property of CODING_SYSTEM (symbol) if it is
 520    a char table, else Qt.  Don't check the validity of CODING_SYSTEM.  */
 521
 522 Lisp_Object
 523 coding_safe_chars (coding_system)
 524      Lisp_Object coding_system;
 525 {
 526   Lisp_Object coding_spec, plist, safe_chars;
 527
 528   coding_spec = Fget (coding_system, Qcoding_system);
 529   plist = XVECTOR (coding_spec)->contents[3];   /* Slot 3 is the plist.  */
 530   safe_chars = Fplist_get (plist, Qsafe_chars);
 531   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 532 }
 533 /* Nonzero if SAFE_CHARS is Qt or its entry for character C is non-nil.  */
 534 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 535   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 536
 537 \f
 538 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 539
 540 /* Emacs' internal format for representation of multiple character
 541    sets is a kind of multi-byte encoding, i.e. characters are
 542    represented by variable-length sequences of one-byte codes.
 543
 544    ASCII characters and control characters (e.g. `tab', `newline') are
 545    represented by one-byte sequences which are their ASCII codes, in
 546    the range 0x00 through 0x7F.
 547
 548    8-bit characters of the range 0x80..0x9F are represented by
 549    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 550    code + 0x20).
 551
 552    8-bit characters of the range 0xA0..0xFF are represented by
 553    one-byte sequences which are their 8-bit code.
 554
 555    The other characters are represented by a sequence of `base
 556    leading-code', optional `extended leading-code', and one or two
 557    `position-code's.  The length of the sequence is determined by the
 558    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 559    whereas extended leading-code and position-code take the range 0xA0
 560    through 0xFF.  See `charset.h' for more details about leading-code
 561    and position-code.
 562
 563    --- CODE RANGE of Emacs' internal format ---
 564    character set        range
 565    -------------        -----
 566    ascii                0x00..0x7F
 567    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 568    eight-bit-graphic    0xA0..0xFF
 569    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 570    ---------------------------------------------
 571
 572    As this is the internal character representation, the format is
 573    usually not used externally (i.e. in a file or in a data sent to a
 574    process).  But, it is possible to have a text externally in this
 575    format (i.e. by encoding by the coding system `emacs-mule').
 576
 577    In that case, a sequence of one-byte codes has a slightly different
 578    form.
 579
 580    Firstly, all characters in eight-bit-control are represented by
 581    one-byte sequences which are their 8-bit code.
 582
 583    Next, character composition data are represented by the byte
 584    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 585    where,
 586         METHOD is 0xF0 plus one of composition method (enum
 587         composition_method),
 588
 589         BYTES is 0xA0 plus the byte length of these composition data,
 590
 591         CHARS is 0xA0 plus the number of characters composed by these
 592         data,
 593
 594         COMPONENTs are characters of multibyte form or composition
 595         rules encoded by two-byte of ASCII codes.
 596
 597    In addition, for backward compatibility, the following formats are
 598    also recognized as composition data on decoding.
 599
 600    0x80 MSEQ ...
 601    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 602
 603    Here,
 604         MSEQ is a multibyte form but in this special format:
 605           ASCII: 0xA0 ASCII_CODE+0x80,
 606           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 607         RULE is a one byte code of the range 0xA0..0xF0 that
 608         represents a composition rule.
 609   */
 610
 611 enum emacs_code_class_type emacs_code_class[256];
 612
 613 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 614    Return CODING_CATEGORY_MASK_EMACS_MULE if the text between SRC and
 615    SRC_END may be in Emacs' internal format, else return 0.  */
 616
 617 static int
 618 detect_coding_emacs_mule (src, src_end, multibytep)
 619       unsigned char *src, *src_end;
 620       int multibytep;
 621 {
 622   /* ONE_MORE_BYTE_CHECK_MULTIBYTE needs `coding' to store a result.  */
 623   struct coding_system scratch_coding;
 624   struct coding_system *coding = &scratch_coding;
 625   int in_composition = 0;
 626   unsigned char ch;
 627
 628   for (;;)
 629     {
 630       ONE_MORE_BYTE_CHECK_MULTIBYTE (ch, multibytep);
 631
 632       if (in_composition)
 633         {
 634           if (ch > 0xA0)
 635             ch -= 0x20;
 636           else if (ch < 0xA0)
 637             in_composition = 0;
 638           else
 639             {
 640               ONE_MORE_BYTE_CHECK_MULTIBYTE (ch, multibytep);
 641               ch &= 0x7F;
 642             }
 643         }
 644
 645       if (ch == 0x80)
 646         /* Old leading code for a composite character.  */
 647         in_composition = 1;
 648       else if (ch > 0x80 && ch < 0xA0)
 649         {
 650           /* CH must begin a valid multibyte sequence; skip over it.  */
 651           unsigned char *head = src - 1;
 652           int nbytes;
 653
 654           if (!UNIBYTE_STR_AS_MULTIBYTE_P (head, src_end - head, nbytes))
 655             return 0;
 656           src = head + nbytes;
 657         }
 658       else if (ch < 0x20
 659                && (ch == ISO_CODE_ESC
 660                    || ch == ISO_CODE_SI
 661                    || ch == ISO_CODE_SO))
 662         {
 663           /* ISO_CODE_ESC, ISO_CODE_SI and ISO_CODE_SO rule out emacs-mule.  */
 664           return 0;
 665         }
 666     }
 667  label_end_of_loop:
 668   return CODING_CATEGORY_MASK_EMACS_MULE;
 669 }
 670
 671
 672 /* Record the starting position START and METHOD of one composition.  */
 673
 674 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 675   do {                                                          \
 676     struct composition_data *cmp_data = coding->cmp_data;       \
 677     int *data = cmp_data->data + cmp_data->used;                \
 678     coding->cmp_data_start = cmp_data->used;                    \
 679     data[0] = -1;  /* Set by CODING_ADD_COMPOSITION_END.  */    \
 680     data[1] = cmp_data->char_offset + start;                    \
 681     data[3] = (int) method;                                     \
 682     cmp_data->used += 4;  /* Reserve 4 header slots.  */        \
 683   } while (0)
 684
 685 /* Record the length and ending position END of the current composition.  */
 686
 687 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 688   do {                                                          \
 689     struct composition_data *cmp_data = coding->cmp_data;       \
 690     int *data = cmp_data->data + coding->cmp_data_start;        \
 691     data[0] = cmp_data->used - coding->cmp_data_start;          \
 692     data[2] = cmp_data->char_offset + end;                      \
 693   } while (0)
 694
 695 /* Record one COMPONENT (alternate character or composition rule).  */
 696
 697 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 698   do {                                                                  \
 699     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 700     if (coding->cmp_data->used - coding->cmp_data_start                 \
 701         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 702       {  /* Maximum bunch length reached: end here.  */                 \
 703         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 704         coding->composing = COMPOSITION_NO;                             \
 705       }                                                                 \
 706   } while (0)
 707
 708
 709 /* Get one byte from the data pointed to by SRC and increment SRC.  If
 710    SRC is not less than SRC_END, return -1 without incrementing SRC.  */
 711
 712 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 713
 714
 715 /* Decode a character represented as a component of composition
 716    sequence of Emacs 20 style at SRC.  Set C to that character, store
 717    its multibyte form sequence at P, and set P to the end of that
 718    sequence.  If no valid character is found, set C to -1.
        A component is either the byte 0xA0 followed by an ASCII code
        plus 0x80 (the second byte is rejected unless it is >= 0xA0), or
        a multibyte form whose leading code was incremented by 0x20.
        SRC, SRC_END and CODING are taken from the expansion site.  */
 719
 720 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 721   do {                                                          \
 722     int bytes;                                                  \
 723                                                                 \
 724     c = SAFE_ONE_MORE_BYTE ();                                  \
 725     if (c < 0)                                                  \
 726       break;                                                    \
 727     if (CHAR_HEAD_P (c))                                        \
 728       c = -1;                                                   \
 729     else if (c == 0xA0)                                         \
 730       {                                                         \
 731         c = SAFE_ONE_MORE_BYTE ();                              \
 732         if (c < 0xA0)                                           \
 733           c = -1;                                               \
 734         else                                                    \
 735           {                                                     \
 736             c -= 0x80;  /* ASCII was stored as C + 0x80.  */    \
 737             *p++ = c;                                           \
 738           }                                                     \
 739       }                                                         \
 740     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 741       {                                                         \
 742         unsigned char *p0 = p;                                  \
 743                                                                 \
 744         c -= 0x20;                                              \
 745         *p++ = c;                                               \
 746         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 747         while (--bytes)                                         \
 748           {                                                     \
 749             c = SAFE_ONE_MORE_BYTE ();                          \
 750             if (c < 0)                                          \
 751               break;                                            \
 752             *p++ = c;                                           \
 753           }                                                     \
 754         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 755             || (coding->flags /* We are recovering a file.  */  \
 756                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 757                 && ! CHAR_HEAD_P (p0[1])))                      \
 758           c = STRING_CHAR (p0, bytes);                          \
 759         else                                                    \
 760           c = -1;                                               \
 761       }                                                         \
 762     else                                                        \
 763       c = -1;                                                   \
 764   } while (0)
 765
 766
 767 /* Decode a composition rule represented as a component of composition
 768    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 769    valid rule is found, set C to -1.
        The rule byte minus 0xA0 must lie in 0..80; it is split into
        GREF (quotient by 9) and NREF (remainder by 9), which are
        variables of the expansion site, and the two are combined with
        COMPOSITION_ENCODE_RULE.  When the source is exhausted
        SAFE_ONE_MORE_BYTE yields -1, which also ends up as C == -1.  */
 770
 771 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 772   do {                                                  \
 773     c = SAFE_ONE_MORE_BYTE ();                          \
 774     c -= 0xA0;                                          \
 775     if (c < 0 || c >= 81)                               \
 776       c = -1;                                           \
 777     else                                                \
 778       {                                                 \
 779         gref = c / 9, nref = c % 9;                     \
 780         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 781       }                                                 \
 782   } while (0)
 783
 784
 785 /* Decode composition sequence encoded by `emacs-mule' at the source
 786    pointed by SRC.  SRC_END is the end of source.  Store information
 787    of the composition in CODING->cmp_data.
 788
 789    For backward compatibility, decode also a composition sequence of
 790    Emacs 20 style.  In that case, the composition sequence contains
 791    characters that should be extracted into a buffer or string.  Store
 792    those characters at *DESTINATION in multibyte form.
 793
 794    If we encounter an invalid byte sequence, return 0.
 795    If we encounter an insufficient source or destination, or
 796    insufficient space in CODING->cmp_data, return -1; in the last
        case CODING->result is set to CODING_FINISH_INSUFFICIENT_CMP.
 797    Otherwise, return consumed bytes in the source.  When characters
        were stored at *DESTINATION, *DESTINATION is advanced past them
        and CODING->produced_char is increased by their number.
 798
 799 */
 800 static INLINE int
 801 decode_composition_emacs_mule (coding, src, src_end,
 802                                destination, dst_end, dst_bytes)
 803      struct coding_system *coding;
 804      const unsigned char *src, *src_end;
 805      unsigned char **destination, *dst_end;
 806      int dst_bytes;
 807 {
 808   unsigned char *dst = *destination;
 809   int method, data_len, nchars;
 810   const unsigned char *src_base = src++;
 811   /* Store components of composition.  */
 812   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 813   int ncomponent;
 814   /* Store multibyte form of characters to be composed.  This is for
 815      Emacs 20 style composition sequence.  */
 816   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 817   unsigned char *bufp = buf;
 818   int c, i, gref, nref;
 819
 820   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 821       >= COMPOSITION_DATA_SIZE)
 822     {
 823       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 824       return -1;
 825     }
 826
       /* The byte after the leading one selects the format: 0xF0 + METHOD
          introduces the current format, anything else is tried as the
          Emacs 20 style.  */
 827   ONE_MORE_BYTE (c);
 828   if (c - 0xF0 >= COMPOSITION_RELATIVE
 829            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 830     {
 831       int with_rule;
 832
 833       method = c - 0xF0;
 834       with_rule = (method == COMPOSITION_WITH_RULE
 835                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
           /* Byte length of the whole sequence, counted from SRC_BASE.  */
 836       ONE_MORE_BYTE (c);
 837       data_len = c - 0xA0;
 838       if (data_len < 4
 839           || src_base + data_len > src_end)
 840         return 0;
 841       ONE_MORE_BYTE (c);
 842       nchars = c - 0xA0;
           /* Reject a non-positive count of composed characters; testing
              the raw byte C here would let counts <= 0 through.  */
 843       if (nchars < 1)
 844         return 0;
 845       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 846         {
 847           /* If it is longer than this, it can't be valid.  */
 848           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 849             return 0;
 850
 851           if (ncomponent % 2 && with_rule)
 852             {
                   /* A rule is two bytes: 32 + GREF, then 32 + NREF.  */
 853               ONE_MORE_BYTE (gref);
 854               gref -= 32;
 855               ONE_MORE_BYTE (nref);
 856               nref -= 32;
 857               c = COMPOSITION_ENCODE_RULE (gref, nref);
 858             }
 859           else
 860             {
 861               int bytes;
 862               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 863                   || (coding->flags /* We are recovering a file.  */
 864                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 865                       && ! CHAR_HEAD_P (src[1])))
 866                 c = STRING_CHAR (src, bytes);
 867               else
 868                 c = *src, bytes = 1;
 869               src += bytes;
 870             }
 871           component[ncomponent] = c;
 872         }
 873     }
 874   else
 875     {
 876       /* This may be an old Emacs 20 style format.  See the comment at
 877          the section 2 of this file.  */
 878       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 879       if (src == src_end
 880           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 881         goto label_end_of_loop;
 882
           /* Confine the component decoders to the bytes before the
              next character head.  */
 883       src_end = src;
 884       src = src_base + 1;
 885       if (c < 0xC0)
 886         {
 887           method = COMPOSITION_RELATIVE;
 888           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 889             {
 890               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 891               if (c < 0)
 892                 break;
 893               component[ncomponent++] = c;
 894             }
 895           if (ncomponent < 2)
 896             return 0;
 897           nchars = ncomponent;
 898         }
 899       else if (c == 0xFF)
 900         {
 901           method = COMPOSITION_WITH_RULE;
 902           src++;
 903           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 904           if (c < 0)
 905             return 0;
 906           component[0] = c;
 907           for (ncomponent = 1;
 908                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 909             {
 910               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 911               if (c < 0)
 912                 break;
 913               component[ncomponent++] = c;
 914               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 915               if (c < 0)
 916                 break;
 917               component[ncomponent++] = c;
 918             }
 919           if (ncomponent < 3)
 920             return 0;
 921           nchars = (ncomponent + 1) / 2;
 922         }
 923       else
 924         return 0;
 925     }
 926
       /* Record the composition only when the characters collected in BUF
          (Emacs 20 style only) fit in the destination.  */
 927   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 928     {
 929       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 930       for (i = 0; i < ncomponent; i++)
 931         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 932       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 933       if (buf < bufp)
 934         {
 935           unsigned char *p = buf;
 936           EMIT_BYTES (p, bufp);
 937           *destination += bufp - buf;
 938           coding->produced_char += nchars;
 939         }
 940       return (src - src_base);
 941     }
 942  label_end_of_loop:
 943   return -1;
 944 }
 945
 946 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
        DESTINATION.  CR and LF are handled according to
        CODING->eol_type, a 0x80 byte is handed to
        decode_composition_emacs_mule when CODING->cmp_data is set, and
        a byte that does not start a valid multibyte form is emitted as
        the form CHAR_STRING produces for it.  On return
        CODING->consumed, CODING->consumed_char, CODING->produced and
        CODING->produced_char are updated; CODING->result is set here on
        inconsistent EOL or insufficient destination.  */
 947
 948 static void
 949 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 950      struct coding_system *coding;
 951      const unsigned char *source;
 952      unsigned char *destination;
 953      int src_bytes, dst_bytes;
 954 {
 955   const unsigned char *src = source;
 956   const unsigned char *src_end = source + src_bytes;
 957   unsigned char *dst = destination;
 958   unsigned char *dst_end = destination + dst_bytes;
 959   /* SRC_BASE remembers the start position in source in each loop.
 960      The loop will be exited when there's not enough source code, or
 961      when there's not enough destination area to produce a
 962      character.  */
 963   const unsigned char *src_base;
 964
 965   coding->produced_char = 0;
 966   while ((src_base = src) < src_end)
 967     {
 968       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 969       const unsigned char *p;
 970       int bytes;
 971
 972       if (*src == '\r')
 973         {
 974           int c = *src++;
 975
 976           if (coding->eol_type == CODING_EOL_CR)
 977             c = '\n';
 978           else if (coding->eol_type == CODING_EOL_CRLF)
 979             {
                   /* ONE_MORE_BYTE is defined elsewhere; presumably it
                      leaves through label_end_of_loop when the source is
                      exhausted -- confirm.  A CR not followed by LF is
                      kept as CR and the peeked byte is pushed back.  */
 980               ONE_MORE_BYTE (c);
 981               if (c != '\n')
 982                 {
 983                   src--;
 984                   c = '\r';
 985                 }
 986             }
 987           *dst++ = c;
 988           coding->produced_char++;
 989           continue;
 990         }
 991       else if (*src == '\n')
 992         {
 993           if ((coding->eol_type == CODING_EOL_CR
 994                || coding->eol_type == CODING_EOL_CRLF)
 995               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 996             {
 997               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 998               goto label_end_of_loop;
 999             }
1000           *dst++ = *src++;
1001           coding->produced_char++;
1002           continue;
1003         }
1004       else if (*src == 0x80 && coding->cmp_data)
1005         {
1006           /* Start of composition data.  */
1007           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1008                                                          &dst, dst_end,
1009                                                          dst_bytes);
1010           if (consumed < 0)
1011             goto label_end_of_loop;
1012           else if (consumed > 0)
1013             {
1014               src += consumed;
1015               continue;
1016             }
               /* CONSUMED is 0: not a valid composition sequence, so emit
                  the 0x80 byte itself as one character.  */
1017           bytes = CHAR_STRING (*src, tmp);
1018           p = tmp;
1019           src++;
1020         }
1021       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1022                || (coding->flags /* We are recovering a file.  */
1023                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1024                    && ! CHAR_HEAD_P (src[1])))
1025         {
1026           p = src;
1027           src += bytes;
1028         }
1029       else
1030         {
1031           int i, c;
1032
               /* Walk over the trailing bytes the head byte calls for.
                  If a character head shows up too early, emit only the
                  first byte (through CHAR_STRING) and resume right after
                  it; otherwise copy the BYTES source bytes as they are.  */
1033           bytes = BYTES_BY_CHAR_HEAD (*src);
1034           src++;
1035           for (i = 1; i < bytes; i++)
1036             {
1037               ONE_MORE_BYTE (c);
1038               if (CHAR_HEAD_P (c))
1039                 break;
1040             }
1041           if (i < bytes)
1042             {
1043               bytes = CHAR_STRING (*src_base, tmp);
1044               p = tmp;
1045               src = src_base + 1;
1046             }
1047           else
1048             {
1049               p = src_base;
1050             }
1051         }
           /* With DST_BYTES zero the limit is SRC rather than DST_END
              (presumably source and destination share one buffer --
              confirm against the callers).  */
1052       if (dst + bytes >= (dst_bytes ? dst_end : src))
1053         {
1054           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1055           break;
1056         }
1057       while (bytes--) *dst++ = *p++;
1058       coding->produced_char++;
1059     }
1060  label_end_of_loop:
       /* SRC_BASE, not SRC: a character abandoned half way is not counted
          as consumed.  */
1061   coding->consumed = coding->consumed_char = src_base - source;
1062   coding->produced = dst - destination;
1063 }
1064
1065
1066 /* Encode composition data stored at DATA into a special byte sequence
1067    starting by 0x80.  Update CODING->cmp_data_start and maybe
1068    CODING->cmp_data for the next call.
        The sequence is 0x80, 0xF0 + METHOD (DATA[3]), 0xA0 + the byte
        length of the sequence built in BUF, 0xA0 + the number of
        composed characters (DATA[2] - DATA[1]), and then the components
        DATA[4] .. DATA[DATA[0] - 1] in the form CHAR_STRING produces.
        For the two rule-based methods every second component is a rule
        and is written as the two bytes 0x20 + GREF and 0x20 + NREF.
        DST, DST_END, DST_BYTES, SRC and the label `label_end_of_loop'
        come from the expansion site; when the destination is too small
        CODING->result is set to CODING_FINISH_INSUFFICIENT_DST and
        control jumps to that label.  */
1069
1070 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1071   do {                                                                  \
1072     unsigned char buf[1024], *p0 = buf, *p;                             \
1073     int len = data[0];                                                  \
1074     int i;                                                              \
1075                                                                         \
1076     buf[0] = 0x80;                                                      \
1077     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1078     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1079     p = buf + 4;                                                        \
1080     if (data[3] == COMPOSITION_WITH_RULE                                \
1081         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1082       {                                                                 \
1083         p += CHAR_STRING (data[4], p);                                  \
1084         for (i = 5; i < len; i += 2)                                    \
1085           {                                                             \
1086             int gref, nref;                                             \
1087              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1088             *p++ = 0x20 + gref;                                         \
1089             *p++ = 0x20 + nref;                                         \
1090             p += CHAR_STRING (data[i + 1], p);                          \
1091           }                                                             \
1092       }                                                                 \
1093     else                                                                \
1094       {                                                                 \
1095         for (i = 4; i < len; i++)                                       \
1096           p += CHAR_STRING (data[i], p);                                \
1097       }                                                                 \
1098     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1099                                                                         \
1100     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1101       {                                                                 \
1102         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1103         goto label_end_of_loop;                                         \
1104       }                                                                 \
1105     while (p0 < p)                                                      \
1106       *dst++ = *p0++;                                                   \
1107     coding->cmp_data_start += data[0];                                  \
1108     if (coding->cmp_data_start == coding->cmp_data->used                \
1109         && coding->cmp_data->next)                                      \
1110       {                                                                 \
1111         coding->cmp_data = coding->cmp_data->next;                      \
1112         coding->cmp_data_start = 0;                                     \
1113       }                                                                 \
1114   } while (0)
1115
1116
     /* Forward declaration: encode_coding_emacs_mule below calls
        encode_eol when there is no composition data to encode.  */
1117 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1118                             unsigned char *, int, int));
1119
     /* Encode SRC_BYTES bytes at SOURCE into DESTINATION in the
        emacs-mule format.  When CODING has no recorded composition data
        the whole job is delegated to encode_eol.  Otherwise characters
        are processed one at a time: a composition that starts at the
        current character is announced first with
        ENCODE_COMPOSITION_EMACS_MULE, `\n' is turned into CR LF or CR
        according to CODING->eol_type, and all other characters are
        copied.  On return CODING->consumed, CODING->produced and
        CODING->produced_char are set; CODING->consumed_char is advanced
        by one for each character encoded.  */
1120 static void
1121 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1122      struct coding_system *coding;
1123      const unsigned char *source;
1124      unsigned char *destination;
1125      int src_bytes, dst_bytes;
1126 {
1127   const unsigned char *src = source;
1128   const unsigned char *src_end = source + src_bytes;
1129   unsigned char *dst = destination;
1130   unsigned char *dst_end = destination + dst_bytes;
1131   const unsigned char *src_base;
1132   int c;
1133   int char_offset;
1134   int *data;
1135
       /* Not referenced by name in this function body; presumably read
          by ONE_MORE_CHAR, which is defined elsewhere -- confirm.  */
1136   Lisp_Object translation_table;
1137
1138   translation_table = Qnil;
1139
1140   /* Optimization for the case that there's no composition.  */
1141   if (!coding->cmp_data || coding->cmp_data->used == 0)
1142     {
1143       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1144       return;
1145     }
1146
1147   char_offset = coding->cmp_data->char_offset;
1148   data = coding->cmp_data->data + coding->cmp_data_start;
       /* The loop has no exit of its own: it ends through
          `label_end_of_loop', which ENCODE_COMPOSITION_EMACS_MULE jumps
          to on insufficient destination (ONE_MORE_CHAR and the EMIT_*
          macros, defined elsewhere, presumably do likewise -- confirm).  */
1149   while (1)
1150     {
1151       src_base = src;
1152
1153       /* If SRC starts a composition, encode the information about the
1154          composition in advance.  */
1155       if (coding->cmp_data_start < coding->cmp_data->used
1156           && char_offset + coding->consumed_char == data[1])
1157         {
1158           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have moved on to the next cmp_data block,
                  so reload both cached values.  */
1159           char_offset = coding->cmp_data->char_offset;
1160           data = coding->cmp_data->data + coding->cmp_data_start;
1161         }
1162
1163       ONE_MORE_CHAR (c);
1164       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1165                         || coding->eol_type == CODING_EOL_CR))
1166         {
1167           if (coding->eol_type == CODING_EOL_CRLF)
1168             EMIT_TWO_BYTES ('\r', c);
1169           else
1170             EMIT_ONE_BYTE ('\r');
1171         }
1172       else if (SINGLE_BYTE_CHAR_P (c))
1173         {
1174           if (coding->flags && ! ASCII_BYTE_P (c))
1175             {
1176               /* As we are auto saving, retain the multibyte form for
1177                  8-bit chars.  */
1178               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1179               int bytes = CHAR_STRING (c, buf);
1180
1181               if (bytes == 1)
1182                 EMIT_ONE_BYTE (buf[0]);
1183               else
1184                 EMIT_TWO_BYTES (buf[0], buf[1]);
1185             }
1186           else
1187             EMIT_ONE_BYTE (c);
1188         }
1189       else
             /* Any other character: copy its source bytes unchanged.  */
1190         EMIT_BYTES (src_base, src);
1191       coding->consumed_char++;
1192     }
1193  label_end_of_loop:
1194   coding->consumed = src_base - source;
1195   coding->produced = coding->produced_char = dst - destination;
1196   return;
1197 }
1198
1199 \f
1200 /*** 3. ISO2022 handlers ***/
1201
1202 /* The following note describes the coding system ISO2022 briefly.
1203    Since the intention of this note is to help understand the
1204    functions in this file, some parts are NOT ACCURATE or are OVERLY
1205    SIMPLIFIED.  For thorough understanding, please refer to the
1206    original document of ISO2022.  This is equivalent to the standard
1207    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1208
1209    ISO2022 provides many mechanisms to encode several character sets
1210    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1211    is encoded using bytes less than 128.  This may make the encoded
1212    text a little bit longer, but the text passes more easily through
1213    several types of gateway, some of which strip off the MSB (Most
1214    Significant Bit).
1215
1216    There are two kinds of character sets: control character sets and
1217    graphic character sets.  The former contain control characters such
1218    as `newline' and `escape' to provide control functions (control
1219    functions are also provided by escape sequences).  The latter
1220    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1221    two control character sets and many graphic character sets.
1222
1223    Graphic character sets are classified into one of the following
1224    four classes, according to the number of bytes (DIMENSION) and
1225    number of characters in one dimension (CHARS) of the set:
1226    - DIMENSION1_CHARS94
1227    - DIMENSION1_CHARS96
1228    - DIMENSION2_CHARS94
1229    - DIMENSION2_CHARS96
1230
1231    In addition, each character set is assigned an identification tag,
1232    unique for each set, called the "final character" (denoted as <F>
1233    hereafter).  The <F> of each character set is decided by ECMA(*)
1234    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1235    (0x30..0x3F are for private use only).
1236
1237    Note (*): ECMA = European Computer Manufacturers Association
1238
1239    Here are examples of graphic character sets [NAME(<F>)]:
1240         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1241         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1242         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1243         o DIMENSION2_CHARS96 -- none for the moment
1244
1245    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1246         C0 [0x00..0x1F] -- control character plane 0
1247         GL [0x20..0x7F] -- graphic character plane 0
1248         C1 [0x80..0x9F] -- control character plane 1
1249         GR [0xA0..0xFF] -- graphic character plane 1
1250
1251    A control character set is directly designated and invoked to C0 or
1252    C1 by an escape sequence.  The most common case is that:
1253    - ISO646's  control character set is designated/invoked to C0, and
1254    - ISO6429's control character set is designated/invoked to C1,
1255    and usually these designations/invocations are omitted in encoded
1256    text.  In a 7-bit environment, only C0 can be used, and a control
1257    character for C1 is encoded by an appropriate escape sequence to
1258    fit into the environment.  All control characters for C1 are
1259    defined to have corresponding escape sequences.
1260
1261    A graphic character set is at first designated to one of four
1262    graphic registers (G0 through G3), then these graphic registers are
1263    invoked to GL or GR.  These designations and invocations can be
1264    done independently.  The most common case is that G0 is invoked to
1265    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1266    these invocations and designations are omitted in encoded text.
1267    In a 7-bit environment, only GL can be used.
1268
1269    When a graphic character set of CHARS94 is invoked to GL, codes
1270    0x20 and 0x7F of the GL area work as control characters SPACE and
1271    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1272    be used.
1273
1274    There are two ways of invocation: locking-shift and single-shift.
1275    With locking-shift, the invocation lasts until the next different
1276    invocation, whereas with single-shift, the invocation affects the
1277    following character only and doesn't affect the locking-shift
1278    state.  Invocations are done by the following control characters or
1279    escape sequences:
1280
1281    ----------------------------------------------------------------------
1282    abbrev  function                  cntrl escape seq   description
1283    ----------------------------------------------------------------------
1284    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1285    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1286    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1287    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1288    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1289    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1290    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1291    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1292    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1293    ----------------------------------------------------------------------
1294    (*) These are not used by any known coding system.
1295
1296    Control characters for these functions are defined by macros
1297    ISO_CODE_XXX in `coding.h'.
1298
1299    Designations are done by the following escape sequences:
1300    ----------------------------------------------------------------------
1301    escape sequence      description
1302    ----------------------------------------------------------------------
1303    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1304    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1305    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1306    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1307    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1308    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1309    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1310    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1311    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1312    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1313    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1314    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1315    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1316    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1317    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1318    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1319    ----------------------------------------------------------------------
1320
1321    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1322    of dimension 1, chars 94, and final character <F>, etc...
1323
1324    Note (*): Although these designations are not allowed in ISO2022,
1325    Emacs accepts them on decoding, and produces them on encoding
1326    CHARS96 character sets in a coding system which is characterized as
1327    7-bit environment, non-locking-shift, and non-single-shift.
1328
1329    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1330    '(' can be omitted.  We refer to this as "short-form" hereafter.
1331
1332    Now you may notice that there are a lot of ways of encoding the
1333    same multilingual text in ISO2022.  Actually, there exist many
1334    coding systems such as Compound Text (used in X11's inter-client
1335    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1336    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1337    localized platforms), and all of these are variants of ISO2022.
1338
1339    In addition to the above, Emacs handles two more kinds of escape
1340    sequences: ISO6429's direction specification and Emacs' private
1341    sequence for specifying character composition.
1342
1343    ISO6429's direction specification takes the following form:
1344         o CSI ']'      -- end of the current direction
1345         o CSI '0' ']'  -- end of the current direction
1346         o CSI '1' ']'  -- start of left-to-right text
1347         o CSI '2' ']'  -- start of right-to-left text
1348    The control character CSI (0x9B: control sequence introducer) is
1349    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1350
1351    Character composition specification takes the following form:
1352         o ESC '0' -- start relative composition
1353         o ESC '1' -- end composition
1354         o ESC '2' -- start rule-base composition (*)
1355         o ESC '3' -- start relative composition with alternate chars  (**)
1356         o ESC '4' -- start rule-base composition with alternate chars  (**)
1357   Since these are not standard escape sequences of any ISO standard,
1358   the use of them with these meanings is restricted to Emacs only.
1359
1360   (*) This form is used only in Emacs 20.5 and older versions,
1361   but the newer versions can safely decode it.
1362   (**) This form is used only in Emacs 21.1 and newer versions,
1363   and the older versions can't decode it.
1364
1365   Here's a list of example usages of these composition escape
1366   sequences (categorized by `enum composition_method').
1367
1368   COMPOSITION_RELATIVE:
1369         ESC 0 CHAR [ CHAR ] ESC 1
1370   COMPOSITION_WITH_RULE:
1371         ESC 2 CHAR [ RULE CHAR ] ESC 1
1372   COMPOSITION_WITH_ALTCHARS:
1373         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1374   COMPOSITION_WITH_RULE_ALTCHARS:
1375         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1376
     /* Table of 256 `enum iso_code_class_type' entries -- presumably one
        per byte value, giving that byte's ISO2022 code class; it is not
        initialized here (TODO confirm where it is filled in).  */
1377 enum iso_code_class_type iso_code_class[256];
1378
     /* Non-zero if coding_system_table[IDX] exists, CHARSET is
        CHARSET_ASCII or C is a safe character of that coding system, and
        the coding system has a requested designation for CHARSET.  As a
        side effect the macro may assign to `safe_chars', a variable that
        must exist at the expansion site.  */
1379 #define CHARSET_OK(idx, charset, c)                                     \
1380   (coding_system_table[idx]                                             \
1381    && (charset == CHARSET_ASCII                                         \
1382        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1383            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1384    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1385                                               charset)                  \
1386        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1387
     /* Non-zero if the initial designation that coding_system_table[IDX]
        has for register 1 (presumably graphic register G1 -- confirm)
        is not negative.  The table entry itself is not checked for
        being null.  */
1388 #define SHIFT_OUT_OK(idx) \
1389   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1390
     /* Non-zero unless the `composing' member of coding_system_table[IDX]
        is COMPOSITION_DISABLED.  The table entry itself is not checked
        for being null.  */
1391 #define COMPOSITION_OK(idx)     \
1392   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1393
1394 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1395    Check if the text in SRC..SRC_END is encoded in ISO2022.  If it is,
1396    return an integer in which any appropriate flag bits among:
1397         CODING_CATEGORY_MASK_ISO_7
1398         CODING_CATEGORY_MASK_ISO_7_TIGHT
1399         CODING_CATEGORY_MASK_ISO_8_1
1400         CODING_CATEGORY_MASK_ISO_8_2
1401         CODING_CATEGORY_MASK_ISO_7_ELSE
1402         CODING_CATEGORY_MASK_ISO_8_ELSE
1403    are set.  If a code which should never appear in ISO2022 is found,
1404    return 0.

        The result is MASK & MASK_FOUND: MASK starts as
        CODING_CATEGORY_MASK_ISO and loses the categories that the input
        rules out; MASK_FOUND collects the categories for which some
        evidence was seen.  Scanning stops early once MASK becomes 0.

        MULTIBYTEP is handed to ONE_MORE_BYTE_CHECK_MULTIBYTE for every
        byte that is read.  While `inhibit_iso_escape_detection' is
        nonzero, ESC, SO, SI, CSI, SS2 and SS3 contribute no evidence.  */
1405
1406 static int
1407 detect_coding_iso2022 (src, src_end, multibytep)
1408      unsigned char *src, *src_end;
1409      int multibytep;
1410 {
       /* Categories not yet ruled out, and categories positively seen.  */
1411   int mask = CODING_CATEGORY_MASK_ISO;
1412   int mask_found = 0;
       /* NOTE(review): nothing in the visible body assigns SHIFT_OUT
          after this initialization, so the `shift_out == 1' test under
          ISO_CODE_SI looks unreachable -- confirm intent.  */
1413   int reg[4], shift_out = 0, single_shifting = 0;
1414   int c, c1, charset;
1415   /* Dummy for ONE_MORE_BYTE.  */
1416   struct coding_system dummy_coding;
1417   struct coding_system *coding = &dummy_coding;
       /* Scratch variable assigned inside CHARSET_OK.  */
1418   Lisp_Object safe_chars;
1419
1420   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1421   while (mask && src < src_end)
1422     {
1423       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
         /* Re-entered from the 0xA0..0xFF run scan below, which reads
            one byte beyond the run into C.  */
1424     retry:
1425       switch (c)
1426         {
1427         case ISO_CODE_ESC:
1428           if (inhibit_iso_escape_detection)
1429             break;
1430           single_shifting = 0;
1431           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1432           if (c >= '(' && c <= '/')
1433             {
1434               /* Designation sequence for a charset of dimension 1.  */
1435               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1436               if (c1 < ' ' || c1 >= 0x80
1437                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1438                 /* Invalid designation sequence.  Just ignore.  */
1439                 break;
1440               reg[(c - '(') % 4] = charset;
1441             }
1442           else if (c == '$')
1443             {
1444               /* Designation sequence for a charset of dimension 2.  */
1445               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1446               if (c >= '@' && c <= 'B')
1447                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1448                 reg[0] = charset = iso_charset_table[1][0][c];
1449               else if (c >= '(' && c <= '/')
1450                 {
1451                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1452                   if (c1 < ' ' || c1 >= 0x80
1453                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1454                     /* Invalid designation sequence.  Just ignore.  */
1455                     break;
1456                   reg[(c - '(') % 4] = charset;
1457                 }
1458               else
1459                 /* Invalid designation sequence.  Just ignore.  */
1460                 break;
1461             }
1462           else if (c == 'N' || c == 'O')
1463             {
1464               /* ESC <Fe> for SS2 or SS3.  */
1465               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1466               break;
1467             }
1468           else if (c >= '0' && c <= '4')
1469             {
1470               /* ESC <Fp> for start/end composition.  */
                   /* Each category stays a candidate only if its coding
                      system does not have composition disabled.  */
1471               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1472                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1473               else
1474                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1475               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1476                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1477               else
1478                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1479               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1480                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1481               else
1482                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1483               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1484                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1485               else
1486                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1487               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1488                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1489               else
1490                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1491               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1492                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1493               else
1494                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1495               break;
1496             }
1497           else
1498             /* Invalid escape sequence.  Just ignore.  */
1499             break;
1500
1501           /* We found a valid designation sequence for CHARSET.  */
1502           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1503           c = MAKE_CHAR (charset, 0, 0);
1504           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1505             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1506           else
1507             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1508           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1509             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1510           else
1511             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1512           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1513             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1514           else
1515             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1516           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1517             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1518           else
1519             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1520           break;
1521
1522         case ISO_CODE_SO:
1523           if (inhibit_iso_escape_detection)
1524             break;
1525           single_shifting = 0;
1526           if (shift_out == 0
1527               && (reg[1] >= 0
1528                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1529                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1530             {
1531               /* Locking shift out.  */
1532               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1533               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1534             }
1535           break;
1536
1537         case ISO_CODE_SI:
1538           if (inhibit_iso_escape_detection)
1539             break;
1540           single_shifting = 0;
1541           if (shift_out == 1)
1542             {
1543               /* Locking shift in.  */
1544               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1545               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1546             }
1547           break;
1548
1549         case ISO_CODE_CSI:
1550           single_shifting = 0;
               /* Fall through: CSI shares the handling below.  */
1551         case ISO_CODE_SS2:
1552         case ISO_CODE_SS3:
1553           {
1554             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1555
1556             if (inhibit_iso_escape_detection)
1557               break;
1558             if (c != ISO_CODE_CSI)
1559               {
                     /* SS2 or SS3: the 8-bit categories remain candidates
                        only if their coding systems use single shift.  */
1560                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1561                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1562                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1563                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1564                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1565                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1566                 single_shifting = 1;
1567               }
1568             if (VECTORP (Vlatin_extra_code_table)
1569                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1570               {
1571                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572                     & CODING_FLAG_ISO_LATIN_EXTRA)
1573                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575                     & CODING_FLAG_ISO_LATIN_EXTRA)
1576                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577               }
1578             mask &= newmask;
1579             mask_found |= newmask;
1580           }
1581           break;
1582
1583         default:
1584           if (c < 0x80)
1585             {
1586               single_shifting = 0;
1587               break;
1588             }
1589           else if (c < 0xA0)
1590             {
                   /* A byte in 0x80..0x9F is tolerated only when
                      Vlatin_extra_code_table lists it; otherwise the text
                      is not ISO2022 and detection fails at once.  */
1591               single_shifting = 0;
1592               if (VECTORP (Vlatin_extra_code_table)
1593                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1594                 {
1595                   int newmask = 0;
1596
1597                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1598                       & CODING_FLAG_ISO_LATIN_EXTRA)
1599                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1600                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1601                       & CODING_FLAG_ISO_LATIN_EXTRA)
1602                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1603                   mask &= newmask;
1604                   mask_found |= newmask;
1605                 }
1606               else
1607                 return 0;
1608             }
1609           else
1610             {
1611               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1612                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1613               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1614               /* Check the length of succeeding codes of the range
1615                  0xA0..0xFF.  If the byte length is odd, we exclude
1616                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1617                  when we are not single shifting.  */
1618               if (!single_shifting
1619                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1620                 {
                       /* I counts the bytes of the run, starting with the
                          one already in hand.  */
1621                   int i = 1;
1622
1623                   c = -1;
1624                   while (src < src_end)
1625                     {
1626                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1627                       if (c < 0xA0)
1628                         break;
1629                       i++;
1630                     }
1631
1632                   if (i & 1 && src < src_end)
1633                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1634                   else
1635                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1636                   if (c >= 0)
1637                     /* This means that we have read one extra byte.  */
1638                     goto retry;
1639                 }
1640             }
1641           break;
1642         }
1643     }
       /* Presumably the target ONE_MORE_BYTE_CHECK_MULTIBYTE jumps to
          when the input is exhausted; the label is not named anywhere
          else in this function -- confirm against the macro.  */
1644  label_end_of_loop:
1645   return (mask & mask_found);
1646 }
1647
1648 /* Decode a character of which charset is CHARSET, the 1st position
1649    code is C1, the 2nd position code is C2, and return the decoded
1650    character code.  If the variable `translation_table' is non-nil,
1651    return the translated code.
        `translation_table' is not a macro argument; it is taken from the
        scope in which the macro is expanded.  When it is nil the result
        is MAKE_CHAR (CHARSET, C1, C2); otherwise it is whatever
        translate_char returns for the same three values.  */
1652
1653 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1654   (NILP (translation_table)                     \
1655    ? MAKE_CHAR (charset, c1, c2)                \
1656    : translate_char (translation_table, -1, charset, c1, c2))
1657
1658 /* Set designation state into CODING.
        Look up the charset identified by DIMENSION, CHARS and FINAL_CHAR
        through ISO_CHARSET_TABLE and, if it is acceptable, record it as
        the designation of register REG in `coding'.  The charset is
        acceptable when it is not negative and either the designation
        CODING requests for it equals REG, or the character built from it
        passes CODING_SAFE_CHAR_P against `safe_chars'.  If
        CODING_MODE_DIRECTION is set in coding->mode and the charset has a
        reverse charset, the reverse charset is recorded instead.

        Control jumps to `label_invalid_code' when
          - FINAL_CHAR is below '0' or is 128 or more;
          - the charset is not acceptable (REG is then remembered in
            last_invalid_designation_register); or
          - ASCII is designated to register 0 while
            last_invalid_designation_register is 0, so that the sequence
            is kept as is.

        `coding', `safe_chars' and `label_invalid_code' are taken from
        the scope in which the macro is expanded.  */
1659 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1660   do {                                                                     \
1661     int charset, c;                                                        \
1662                                                                            \
1663     if (final_char < '0' || final_char >= 128)                             \
1664       goto label_invalid_code;                                             \
1665     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1666                                  make_number (chars),                      \
1667                                  make_number (final_char));                \
1668     c = MAKE_CHAR (charset, 0, 0);                                         \
1669     if (charset >= 0                                                       \
1670         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1671             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1672       {                                                                    \
1673         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1674             && reg == 0                                                    \
1675             && charset == CHARSET_ASCII)                                   \
1676           {                                                                \
1677             /* We should insert this designation sequence as is so         \
1678                that it is surely written back to a file.  */               \
1679             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1680             goto label_invalid_code;                                       \
1681           }                                                                \
1682         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1683         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1684             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1685           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1686         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1687       }                                                                    \
1688     else                                                                   \
1689       {                                                                    \
1690         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1691         goto label_invalid_code;                                           \
1692       }                                                                    \
1693   } while (0)
1694
1695 /* Allocate a memory block for storing information about compositions.
1696    The block is chained to the already allocated blocks.
        CHAR_OFFSET is stored in the new block's `char_offset' member and
        its `used' count starts at 0.  On return coding->cmp_data points
        at the new block, coding->cmp_data_start is 0 and
        coding->composing is COMPOSITION_NO.  */
1697
1698 void
1699 coding_allocate_composition_data (coding, char_offset)
1700      struct coding_system *coding;
1701      int char_offset;
1702 {
1703   struct composition_data *cmp_data
1704     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1705
1706   cmp_data->char_offset = char_offset;
1707   cmp_data->used = 0;
       /* Link the new block after the block CODING currently points at,
          if there is one.  */
1708   cmp_data->prev = coding->cmp_data;
1709   cmp_data->next = NULL;
1710   if (coding->cmp_data)
1711     coding->cmp_data->next = cmp_data;
       /* The new block becomes CODING's current block.  */
1712   coding->cmp_data = cmp_data;
1713   coding->cmp_data_start = 0;
1714   coding->composing = COMPOSITION_NO;
1715 }
1716
1717 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1718    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1719    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1720    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1721    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC.  One of three things happens:
          - If coding->composing is COMPOSITION_DISABLED, ESC and the low
            seven bits of C1 are stored at DST and counted as two
            produced characters.
          - If CODING is not composing yet, the room left in
            coding->cmp_data is checked.  Without enough room,
            coding->result becomes CODING_FINISH_INSUFFICIENT_CMP and
            control jumps to `label_end_of_loop'.  Otherwise
            coding->composing is set from C1 and the start of the
            composition is recorded at coding->produced_char.
          - If CODING is already composing with one of the two ALTCHARS
            methods, the method is switched to COMPOSITION_RELATIVE.
        `coding', `dst' and `label_end_of_loop' are taken from the scope
        in which the macro is expanded.
1722   */
1723
1724 #define DECODE_COMPOSITION_START(c1)                                       \
1725   do {                                                                     \
1726     if (coding->composing == COMPOSITION_DISABLED)                         \
1727       {                                                                    \
1728         *dst++ = ISO_CODE_ESC;                                             \
1729         *dst++ = c1 & 0x7f;                                                \
1730         coding->produced_char += 2;                                        \
1731       }                                                                    \
1732     else if (!COMPOSING_P (coding))                                        \
1733       {                                                                    \
1734         /* This is surely the start of a composition.  We must be sure     \
1735            that coding->cmp_data has enough space to store the             \
1736            information about the composition.  If not, terminate the       \
1737            current decoding loop, allocate one more memory block for       \
1738            coding->cmp_data in the caller, then start the decoding         \
1739            loop again.  We can't allocate memory here directly because     \
1740            it may cause buffer/string relocation.  */                      \
1741         if (!coding->cmp_data                                              \
1742             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1743                 >= COMPOSITION_DATA_SIZE))                                 \
1744           {                                                                \
1745             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1746             goto label_end_of_loop;                                        \
1747           }                                                                \
1748         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1749                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1750                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1751                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1752         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1753                                       coding->composing);                  \
1754         coding->composition_rule_follows = 0;                              \
1755       }                                                                    \
1756     else                                                                   \
1757       {                                                                    \
1758         /* We are already handling a composition.  If the method is        \
1759            the following two, the codes following the current escape       \
1760            sequence are actual characters stored in a buffer.  */          \
1761         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1762             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1763           {                                                                \
1764             coding->composing = COMPOSITION_RELATIVE;                      \
1765             coding->composition_rule_follows = 0;                          \
1766           }                                                                \
1767       }                                                                    \
1768   } while (0)
1769
1770 /* Handle composition end sequence ESC 1.
        If CODING is not composing, ESC and C1 are stored at DST and
        counted as two produced characters.  Otherwise the end of the
        composition is recorded at coding->produced_char and
        coding->composing is reset to COMPOSITION_NO.
        `coding' and `dst' are taken from the scope in which the macro
        is expanded.  */
1771
1772 #define DECODE_COMPOSITION_END(c1)                                      \
1773   do {                                                                  \
1774     if (! COMPOSING_P (coding))                                         \
1775       {                                                                 \
1776         *dst++ = ISO_CODE_ESC;                                          \
1777         *dst++ = c1;                                                    \
1778         coding->produced_char += 2;                                     \
1779       }                                                                 \
1780     else                                                                \
1781       {                                                                 \
1782         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1783         coding->composing = COMPOSITION_NO;                             \
1784       }                                                                 \
1785   } while (0)
1786
1787 /* Decode a composition rule from the byte C1 (and maybe one more byte
1788    from SRC) and store one encoded composition rule in
1789    coding->cmp_data.
        C1 must be an lvalue: 32 is subtracted from it in place.  After
        that subtraction
          - a value below 81 is the old one-byte format, split into
            C1 / 9 and C1 % 9, each remapped from 4 to 10;
          - a value from 81 to 92 is the new two-byte format: one more
            byte is read into `c2' with ONE_MORE_BYTE and the rule is
            built from C1 - 81 and c2 - 32;
          - any other value stores rule 0.
        Finally coding->composition_rule_follows is cleared.  `coding'
        and `c2' are taken from the scope in which the macro is
        expanded.  */
1790
1791 #define DECODE_COMPOSITION_RULE(c1)                                     \
1792   do {                                                                  \
1793     int rule = 0;                                                       \
1794     (c1) -= 32;                                                         \
1795     if (c1 < 81)                /* old format (before ver.21) */        \
1796       {                                                                 \
1797         int gref = (c1) / 9;                                            \
1798         int nref = (c1) % 9;                                            \
1799         if (gref == 4) gref = 10;                                       \
1800         if (nref == 4) nref = 10;                                       \
1801         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1802       }                                                                 \
1803     else if (c1 < 93)           /* new format (after ver.21) */         \
1804       {                                                                 \
1805         ONE_MORE_BYTE (c2);                                             \
1806         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1807       }                                                                 \
1808     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1809     coding->composition_rule_follows = 0;                               \
1810   } while (0)
1811
1812
1813 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1814
1815 static void
1816 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1817      struct coding_system *coding;
1818      const unsigned char *source;
1819      unsigned char *destination;
1820      int src_bytes, dst_bytes;
1821 {
1822   const unsigned char *src = source;
1823   const unsigned char *src_end = source + src_bytes;
1824   unsigned char *dst = destination;
1825   unsigned char *dst_end = destination + dst_bytes;
1826   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1827   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1828   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1829   /* SRC_BASE remembers the start position in source in each loop.
1830      The loop will be exited when there's not enough source code
1831      (within macro ONE_MORE_BYTE), or when there's not enough
1832      destination area to produce a character (within macro
1833      EMIT_CHAR).  */
1834   const unsigned char *src_base;
1835   int c, charset;
1836   Lisp_Object translation_table;
1837   Lisp_Object safe_chars;
1838
1839   safe_chars = coding_safe_chars (coding->symbol);
1840
1841   if (NILP (Venable_character_translation))
1842     translation_table = Qnil;
1843   else
1844     {
1845       translation_table = coding->translation_table_for_decode;
1846       if (NILP (translation_table))
1847         translation_table = Vstandard_translation_table_for_decode;
1848     }
1849
1850   coding->result = CODING_FINISH_NORMAL;
1851
1852   while (1)
1853     {
1854       int c1, c2 = 0;
1855
1856       src_base = src;
1857       ONE_MORE_BYTE (c1);
1858
1859       /* We produce no character or one character.  */
1860       switch (iso_code_class [c1])
1861         {
1862         case ISO_0x20_or_0x7F:
1863           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1864             {
1865               DECODE_COMPOSITION_RULE (c1);
1866               continue;
1867             }
1868           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1869             {
1870               /* This is SPACE or DEL.  */
1871               charset = CHARSET_ASCII;
1872               break;
1873             }
1874           /* This is a graphic character, we fall down ...  */
1875
1876         case ISO_graphic_plane_0:
1877           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1878             {
1879               DECODE_COMPOSITION_RULE (c1);
1880               continue;
1881             }
1882           charset = charset0;
1883           break;
1884
1885         case ISO_0xA0_or_0xFF:
1886           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1887               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1888             goto label_invalid_code;
1889           /* This is a graphic character, we fall down ... */
1890
1891         case ISO_graphic_plane_1:
1892           if (charset1 < 0)
1893             goto label_invalid_code;
1894           charset = charset1;
1895           break;
1896
1897         case ISO_control_0:
1898           if (COMPOSING_P (coding))
1899             DECODE_COMPOSITION_END ('1');
1900
1901           /* All ISO2022 control characters in this class have the
1902              same representation in Emacs internal format.  */
1903           if (c1 == '\n'
1904               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1905               && (coding->eol_type == CODING_EOL_CR
1906                   || coding->eol_type == CODING_EOL_CRLF))
1907             {
1908               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1909               goto label_end_of_loop;
1910             }
1911           charset = CHARSET_ASCII;
1912           break;
1913
1914         case ISO_control_1:
1915           if (COMPOSING_P (coding))
1916             DECODE_COMPOSITION_END ('1');
1917           goto label_invalid_code;
1918
1919         case ISO_carriage_return:
1920           if (COMPOSING_P (coding))
1921             DECODE_COMPOSITION_END ('1');
1922
1923           if (coding->eol_type == CODING_EOL_CR)
1924             c1 = '\n';
1925           else if (coding->eol_type == CODING_EOL_CRLF)
1926             {
1927               ONE_MORE_BYTE (c1);
1928               if (c1 != ISO_CODE_LF)
1929                 {
1930                   src--;
1931                   c1 = '\r';
1932                 }
1933             }
1934           charset = CHARSET_ASCII;
1935           break;
1936
1937         case ISO_shift_out:
1938           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1939               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1940             goto label_invalid_code;
1941           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1942           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1943           continue;
1944
1945         case ISO_shift_in:
1946           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1947             goto label_invalid_code;
1948           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1949           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950           continue;
1951
1952         case ISO_single_shift_2_7:
1953         case ISO_single_shift_2:
1954           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1955             goto label_invalid_code;
1956           /* SS2 is handled as an escape sequence of ESC 'N' */
1957           c1 = 'N';
1958           goto label_escape_sequence;
1959
1960         case ISO_single_shift_3:
1961           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962             goto label_invalid_code;
1963           /* SS3 is handled as an escape sequence of ESC 'O' */
1964           c1 = 'O';
1965           goto label_escape_sequence;
1966
1967         case ISO_control_sequence_introducer:
1968           /* CSI is handled as an escape sequence of ESC '[' ...  */
1969           c1 = '[';
1970           goto label_escape_sequence;
1971
1972         case ISO_escape:
1973           ONE_MORE_BYTE (c1);
1974         label_escape_sequence:
1975           /* Escape sequences handled by Emacs are invocation,
1976              designation, direction specification, and character
1977              composition specification.  */
1978           switch (c1)
1979             {
1980             case '&':           /* revision of following character set */
1981               ONE_MORE_BYTE (c1);
1982               if (!(c1 >= '@' && c1 <= '~'))
1983                 goto label_invalid_code;
1984               ONE_MORE_BYTE (c1);
1985               if (c1 != ISO_CODE_ESC)
1986                 goto label_invalid_code;
1987               ONE_MORE_BYTE (c1);
1988               goto label_escape_sequence;
1989
1990             case '$':           /* designation of 2-byte character set */
1991               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1992                 goto label_invalid_code;
1993               ONE_MORE_BYTE (c1);
1994               if (c1 >= '@' && c1 <= 'B')
1995                 {       /* designation of JISX0208.1978, GB2312.1980,
1996                            or JISX0208.1980 */
1997                   DECODE_DESIGNATION (0, 2, 94, c1);
1998                 }
1999               else if (c1 >= 0x28 && c1 <= 0x2B)
2000                 {       /* designation of DIMENSION2_CHARS94 character set */
2001                   ONE_MORE_BYTE (c2);
2002                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2003                 }
2004               else if (c1 >= 0x2C && c1 <= 0x2F)
2005                 {       /* designation of DIMENSION2_CHARS96 character set */
2006                   ONE_MORE_BYTE (c2);
2007                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2008                 }
2009               else
2010                 goto label_invalid_code;
2011               /* We must update these variables now.  */
2012               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2013               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2014               continue;
2015
2016             case 'n':           /* invocation of locking-shift-2 */
2017               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2018                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2019                 goto label_invalid_code;
2020               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2021               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2022               continue;
2023
2024             case 'o':           /* invocation of locking-shift-3 */
2025               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2026                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2027                 goto label_invalid_code;
2028               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2029               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2030               continue;
2031
2032             case 'N':           /* invocation of single-shift-2 */
2033               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2034                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2035                 goto label_invalid_code;
2036               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2037               ONE_MORE_BYTE (c1);
2038               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2039                 goto label_invalid_code;
2040               break;
2041
2042             case 'O':           /* invocation of single-shift-3 */
2043               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2044                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2045                 goto label_invalid_code;
2046               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2047               ONE_MORE_BYTE (c1);
2048               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2049                 goto label_invalid_code;
2050               break;
2051
2052             case '0': case '2': case '3': case '4': /* start composition */
2053               DECODE_COMPOSITION_START (c1);
2054               continue;
2055
2056             case '1':           /* end composition */
2057               DECODE_COMPOSITION_END (c1);
2058               continue;
2059
2060             case '[':           /* specification of direction */
2061               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2062                 goto label_invalid_code;
2063               /* For the moment, nested direction is not supported.
2064                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2065                  left-to-right, and nonzero means right-to-left.  */
2066               ONE_MORE_BYTE (c1);
2067               switch (c1)
2068                 {
2069                 case ']':       /* end of the current direction */
2070                   coding->mode &= ~CODING_MODE_DIRECTION;
2071
2072                 case '0':       /* end of the current direction */
2073                 case '1':       /* start of left-to-right direction */
2074                   ONE_MORE_BYTE (c1);
2075                   if (c1 == ']')
2076                     coding->mode &= ~CODING_MODE_DIRECTION;
2077                   else
2078                     goto label_invalid_code;
2079                   break;
2080
2081                 case '2':       /* start of right-to-left direction */
2082                   ONE_MORE_BYTE (c1);
2083                   if (c1 == ']')
2084                     coding->mode |= CODING_MODE_DIRECTION;
2085                   else
2086                     goto label_invalid_code;
2087                   break;
2088
2089                 default:
2090                   goto label_invalid_code;
2091                 }
2092               continue;
2093
2094             case '%':
2095               if (COMPOSING_P (coding))
2096                 DECODE_COMPOSITION_END ('1');
2097               ONE_MORE_BYTE (c1);
2098               if (c1 == '/')
2099                 {
2100                   /* CTEXT extended segment:
2101                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2102                      We keep these bytes as is for the moment.
2103                      They may be decoded by post-read-conversion.  */
2104                   int dim, M, L;
2105                   int size, required;
2106                   int produced_chars;
2107
2108                   ONE_MORE_BYTE (dim);
2109                   ONE_MORE_BYTE (M);
2110                   ONE_MORE_BYTE (L);
2111                   size = ((M - 128) * 128) + (L - 128);
2112                   required = 8 + size * 2;
2113                   if (dst + required > (dst_bytes ? dst_end : src))
2114                     goto label_end_of_loop;
2115                   *dst++ = ISO_CODE_ESC;
2116                   *dst++ = '%';
2117                   *dst++ = '/';
2118                   *dst++ = dim;
2119                   produced_chars = 4;
2120                   dst += CHAR_STRING (M, dst), produced_chars++;
2121                   dst += CHAR_STRING (L, dst), produced_chars++;
2122                   while (size-- > 0)
2123                     {
2124                       ONE_MORE_BYTE (c1);
2125                       dst += CHAR_STRING (c1, dst), produced_chars++;
2126                     }
2127                   coding->produced_char += produced_chars;
2128                 }
2129               else if (c1 == 'G')
2130                 {
2131                   unsigned char *d = dst;
2132                   int produced_chars;
2133
2134                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2135                      ESC % G --UTF-8-BYTES-- ESC % @
2136                      We keep these bytes as is for the moment.
2137                      They may be decoded by post-read-conversion.  */
2138                   if (d + 6 > (dst_bytes ? dst_end : src))
2139                     goto label_end_of_loop;
2140                   *d++ = ISO_CODE_ESC;
2141                   *d++ = '%';
2142                   *d++ = 'G';
2143                   produced_chars = 3;
2144                   while (d + 1 < (dst_bytes ? dst_end : src))
2145                     {
2146                       ONE_MORE_BYTE (c1);
2147                       if (c1 == ISO_CODE_ESC
2148                           && src + 1 < src_end
2149                           && src[0] == '%'
2150                           && src[1] == '@')
2151                         {
2152                           src += 2;
2153                           break;
2154                         }
2155                       d += CHAR_STRING (c1, d), produced_chars++;
2156                     }
2157                   if (d + 3 > (dst_bytes ? dst_end : src))
2158                     goto label_end_of_loop;
2159                   *d++ = ISO_CODE_ESC;
2160                   *d++ = '%';
2161                   *d++ = '@';
2162                   dst = d;
2163                   coding->produced_char += produced_chars + 3;
2164                 }
2165               else
2166                 goto label_invalid_code;
2167               continue;
2168
2169             default:
2170               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2171                 goto label_invalid_code;
2172               if (c1 >= 0x28 && c1 <= 0x2B)
2173                 {       /* designation of DIMENSION1_CHARS94 character set */
2174                   ONE_MORE_BYTE (c2);
2175                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2176                 }
2177               else if (c1 >= 0x2C && c1 <= 0x2F)
2178                 {       /* designation of DIMENSION1_CHARS96 character set */
2179                   ONE_MORE_BYTE (c2);
2180                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2181                 }
2182               else
2183                 goto label_invalid_code;
2184               /* We must update these variables now.  */
2185               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2186               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2187               continue;
2188             }
2189         }
2190
2191       /* Now we know CHARSET and 1st position code C1 of a character.
2192          Produce a multibyte sequence for that character while getting
2193          2nd position code C2 if necessary.  */
2194       if (CHARSET_DIMENSION (charset) == 2)
2195         {
2196           ONE_MORE_BYTE (c2);
2197           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2198             /* C2 is not in a valid range.  */
2199             goto label_invalid_code;
2200         }
2201       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2202       EMIT_CHAR (c);
2203       continue;
2204
2205     label_invalid_code:
2206       coding->errors++;
2207       if (COMPOSING_P (coding))
2208         DECODE_COMPOSITION_END ('1');
2209       src = src_base;
2210       c = *src++;
2211       if (! NILP (translation_table))
2212         c = translate_char (translation_table, c, 0, 0, 0);
2213       EMIT_CHAR (c);
2214     }
2215
2216  label_end_of_loop:
2217   coding->consumed = coding->consumed_char = src_base - source;
2218   coding->produced = dst - destination;
2219   return;
2220 }
2221
2222
2223 /* ISO2022 encoding stuff.  */
2224
/*
   Just saying "ISO2022" is not enough for encoding; we have to
   specify more details.  In Emacs, each ISO2022 coding system
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use Single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in `coding->flags' as flag bits
   defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
   details.
*/
2243
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.

   The bytes produced are, in order:
     ESC '&' ('@' + REVISION)   only when the revision number recorded
                                in CODING for CHARSET is less than 255;
     ESC;
     '$'                        only when CHARSET is not 1-dimensional;
     the REG'th character of "()*+" (94-char set) or of ",-./"
     (otherwise), called the intermediate character;
     <final-char> of CHARSET.

   The intermediate character is omitted (short form) only when
   CHARSET is a multi-dimensional 94-char set, REG is 0, <final-char>
   is '@', 'A', or 'B', and CODING has CODING_FLAG_ISO_SHORT_FORM set.

   Finally CHARSET is recorded in CODING as the charset currently
   designated to REG.

   The macro writes through the variable `dst' of the invoking scope.
   Each argument is parenthesized in the expansion, but is evaluated
   more than once, so it must be free of side effects.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *const desig_inter_94_ = "()*+";                         \
    const char *const desig_inter_96_ = ",-./";                         \
    const unsigned char desig_final_                                    \
      = CHARSET_ISO_FINAL_CHAR (charset);                               \
    const int desig_revision_                                           \
      = CODING_SPEC_ISO_REVISION_NUMBER ((coding), (charset));          \
    const int desig_is_94_ = (CHARSET_CHARS (charset) == 94);           \
    const int desig_multi_dim_ = (CHARSET_DIMENSION (charset) != 1);    \
                                                                        \
    if (desig_revision_ < 255)                                          \
      {                                                                 \
        /* Announce the revision of the charset first.  */              \
        *dst++ = ISO_CODE_ESC;                                          \
        *dst++ = '&';                                                   \
        *dst++ = '@' + desig_revision_;                                 \
      }                                                                 \
    *dst++ = ISO_CODE_ESC;                                              \
    if (desig_multi_dim_)                                               \
      *dst++ = '$';                                                     \
    /* Emit the intermediate character unless the short form applies. */ \
    if (! (desig_multi_dim_                                             \
           && desig_is_94_                                              \
           && ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)            \
           && (reg) == 0                                                \
           && desig_final_ >= '@' && desig_final_ <= 'B'))              \
      *dst++ = (unsigned char) ((desig_is_94_                           \
                                 ? desig_inter_94_                      \
                                 : desig_inter_96_)[(reg)]);            \
    *dst++ = desig_final_;                                              \
    CODING_SPEC_ISO_DESIGNATION ((coding), (reg)) = (charset);          \
  } while (0)
2286
/* The two macros below emit the code for an ISO2022 single-shift
   function at `dst' (advancing it) and mark CODING as being in the
   single-shifting state.  When CODING_FLAG_ISO_SEVEN_BITS is set in
   `coding->flags' the two-byte escape sequence (ESC N or ESC O) is
   produced; otherwise the single control byte ISO_CODE_SS2 or
   ISO_CODE_SS3 is produced.  Both use the variables `coding' and
   `dst' of the invoking scope.  */

/* Single-shift-2.  */
#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0)      \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

/* Single-shift-3.  */
#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS) == 0)      \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2308
/* The four macros below emit the code for an ISO2022 locking-shift
   function at `dst' (advancing it) and record in CODING which graphic
   register is now invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC n         register 2
     ENCODE_LOCKING_SHIFT_3   ESC o         register 3

   All of them use the variables `coding' and `dst' of the invoking
   scope.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2336
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The byte written depends on the current state of `coding':
     - in single-shifting state: C1 with the 8th bit cleared when
       CODING_FLAG_ISO_SEVEN_BITS is set, otherwise with the 8th bit
       set; the single-shifting state is then cleared;
     - CHARSET is the charset of graphic plane 0: C1 with the 8th bit
       cleared;
     - CHARSET is the charset of graphic plane 1: C1 with the 8th bit
       set;
     - otherwise encode_invocation_designation is called and the
       checks above are repeated.

   The macro uses the variables `coding' and `dst' of the invoking
   scope.  CHARSET and C1 are parenthesized in the expansion so that
   expression arguments (e.g. `a | b') are masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = (c1) & 0x7F;                                         \
        else                                                            \
          *dst++ = (c1) | 0x80;                                         \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we       \
       must invoke it, or, at first, designate it to some graphic       \
       register.  Then repeat the loop to actually produce the          \
       character.  */                                                   \
    dst = encode_invocation_designation ((charset), coding, dst);       \
  } while (1)
2369
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   Two bytes are written, chosen from the current state of `coding':
     - in single-shifting state: C1 and C2 with the 8th bit cleared
       when CODING_FLAG_ISO_SEVEN_BITS is set, otherwise with the 8th
       bit set; the single-shifting state is then cleared;
     - CHARSET is the charset of graphic plane 0: C1 and C2 with the
       8th bit cleared;
     - CHARSET is the charset of graphic plane 1: C1 and C2 with the
       8th bit set;
     - otherwise encode_invocation_designation is called and the
       checks above are repeated.

   The macro uses the variables `coding' and `dst' of the invoking
   scope.  CHARSET, C1 and C2 are parenthesized in the expansion so
   that expression arguments are masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = (c1) & 0x7F;                                       \
            *dst++ = (c2) & 0x7F;                                       \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = (c1) | 0x80;                                       \
            *dst++ = (c2) | 0x80;                                       \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
      {                                                                 \
        *dst++ = (c1) & 0x7F;                                           \
        *dst++ = (c2) & 0x7F;                                           \
        break;                                                          \
      }                                                                 \
    if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
      {                                                                 \
        *dst++ = (c1) | 0x80;                                           \
        *dst++ = (c2) | 0x80;                                           \
        break;                                                          \
      }                                                                 \
    /* Since CHARSET is not yet invoked to any graphic planes, we       \
       must invoke it, or, at first, designate it to some graphic       \
       register.  Then repeat the loop to actually produce the          \
       character.  */                                                   \
    dst = encode_invocation_designation ((charset), coding, dst);       \
  } while (1)
2402
/* Produce codes for character C at `dst'.

   C is split into its charset and position codes with SPLIT_CHAR.
   If the charset is not defined, the first position code is written
   as is, followed by the second one when it is not negative.
   Otherwise the character is handed to
   ENCODE_ISO_CHARACTER_DIMENSION1 or ENCODE_ISO_CHARACTER_DIMENSION2
   according to the dimension of its charset, after these
   substitutions controlled by `coding->flags':
     CODING_FLAG_ISO_USE_ROMAN:  CHARSET_ASCII is replaced by
                                 charset_latin_jisx0201;
     CODING_FLAG_ISO_USE_OLDJIS: charset_jisx0208 is replaced by
                                 charset_jisx0208_1978.

   The macro uses the variables `coding' and `dst' of the invoking
   scope.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int iso_charset_, iso_pos1_, iso_pos2_;                             \
                                                                        \
    SPLIT_CHAR (c, iso_charset_, iso_pos1_, iso_pos2_);                 \
    if (! CHARSET_DEFINED_P (iso_charset_))                             \
      {                                                                 \
        /* Unknown charset: pass the position codes through.  */        \
        *dst++ = iso_pos1_;                                             \
        if (iso_pos2_ >= 0)                                             \
          *dst++ = iso_pos2_;                                           \
      }                                                                 \
    else if (CHARSET_DIMENSION (iso_charset_) == 1)                     \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && iso_charset_ == CHARSET_ASCII)                           \
          iso_charset_ = charset_latin_jisx0201;                        \
        ENCODE_ISO_CHARACTER_DIMENSION1 (iso_charset_, iso_pos1_);      \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && iso_charset_ == charset_jisx0208)                        \
          iso_charset_ = charset_jisx0208_1978;                         \
        ENCODE_ISO_CHARACTER_DIMENSION2 (iso_charset_, iso_pos1_,       \
                                         iso_pos2_);                    \
      }                                                                 \
  } while (0)
2432
2433
/* Instead of encoding character C, encode
   CODING_REPLACEMENT_CHARACTER in its place: twice when the width of
   C's charset is greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_repeat_ = (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) ? 2 : 1; \
                                                                        \
    while (unsafe_repeat_-- > 0)                                        \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2442
2443
2444 /* Produce designation and invocation codes at a place pointed by DST
2445    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2446    Return new DST.  */
2447
2448 unsigned char *
2449 encode_invocation_designation (charset, coding, dst)
2450      int charset;
2451      struct coding_system *coding;
2452      unsigned char *dst;
2453 {
2454   int reg;                      /* graphic register number */
2455
2456   /* At first, check designations.  */
2457   for (reg = 0; reg < 4; reg++)
2458     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2459       break;
2460
2461   if (reg >= 4)
2462     {
2463       /* CHARSET is not yet designated to any graphic registers.  */
2464       /* At first check the requested designation.  */
2465       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2466       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2467         /* Since CHARSET requests no special designation, designate it
2468            to graphic register 0.  */
2469         reg = 0;
2470
2471       ENCODE_DESIGNATION (charset, reg, coding);
2472     }
2473
2474   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2475       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2476     {
2477       /* Since the graphic register REG is not invoked to any graphic
2478          planes, invoke it to graphic plane 0.  */
2479       switch (reg)
2480         {
2481         case 0:                 /* graphic register 0 */
2482           ENCODE_SHIFT_IN;
2483           break;
2484
2485         case 1:                 /* graphic register 1 */
2486           ENCODE_SHIFT_OUT;
2487           break;
2488
2489         case 2:                 /* graphic register 2 */
2490           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2491             ENCODE_SINGLE_SHIFT_2;
2492           else
2493             ENCODE_LOCKING_SHIFT_2;
2494           break;
2495
2496         case 3:                 /* graphic register 3 */
2497           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2498             ENCODE_SINGLE_SHIFT_3;
2499           else
2500             ENCODE_LOCKING_SHIFT_3;
2501           break;
2502         }
2503     }
2504
2505   return dst;
2506 }
2507
/* Produce 2-byte codes for encoded composition rule RULE.  RULE is
   decoded with COMPOSITION_DECODE_RULE into two reference values;
   the first byte is 32 + 81 plus the first value, the second byte is
   32 plus the second value.  `dst' of the invoking scope is advanced
   by two.  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int rule_gref_, rule_nref_;                                 \
                                                                \
    COMPOSITION_DECODE_RULE (rule, rule_gref_, rule_nref_);     \
    dst[0] = 32 + 81 + rule_gref_;                              \
    dst[1] = 32 + rule_nref_;                                   \
    dst += 2;                                                   \
  } while (0)
2517
/* Produce codes for indicating the start of a composition sequence
   (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
   which specify information about the composition.  See the comment
   in coding.h for the format of DATA.

   DATA[3] is stored in `coding->composing'.  ESC 0 is produced for
   COMPOSITION_RELATIVE, ESC 3 for COMPOSITION_WITH_ALTCHARS, and
   ESC 4 for anything else.  In the latter two cases
   `coding->cmp_data_index' is set to `coding->cmp_data_start' + 4
   and `coding->composition_rule_follows' is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    (coding)->composing = (data)[3];                                    \
    dst[0] = ISO_CODE_ESC;                                              \
    if ((coding)->composing == COMPOSITION_RELATIVE)                    \
      dst[1] = '0';                                                     \
    else                                                                \
      {                                                                 \
        if ((coding)->composing == COMPOSITION_WITH_ALTCHARS)           \
          dst[1] = '3';                                                 \
        else                                                            \
          dst[1] = '4';                                                 \
        (coding)->cmp_data_index = (coding)->cmp_data_start + 4;        \
        (coding)->composition_rule_follows = 0;                         \
      }                                                                 \
    dst += 2;                                                           \
  } while (0)
2537
/* Produce codes (ESC 1) for indicating the end of the current
   composition.  `coding->composing' is reset to COMPOSITION_NO and
   `coding->cmp_data_start' is advanced by DATA[0].  If that reaches
   the `used' count of the current composition data block and the
   block has a successor, CODING switches to the successor and
   restarts at its offset 0.  */

#define ENCODE_COMPOSITION_END(coding, data)                            \
  do {                                                                  \
    dst[0] = ISO_CODE_ESC;                                              \
    dst[1] = '1';                                                       \
    dst += 2;                                                           \
    (coding)->composing = COMPOSITION_NO;                               \
    (coding)->cmp_data_start += (data)[0];                              \
    if ((coding)->cmp_data_start == (coding)->cmp_data->used)           \
      {                                                                 \
        if ((coding)->cmp_data->next)                                   \
          {                                                             \
            (coding)->cmp_data = (coding)->cmp_data->next;              \
            (coding)->cmp_data_start = 0;                               \
          }                                                             \
      }                                                                 \
  } while (0)
2553
/* Produce composition start sequence ESC 0 and mark CODING as
   composing with COMPOSITION_RELATIVE.  Here, this sequence doesn't
   mean the start of a new composition but means that we have just
   produced components (alternate chars and composition rules) of the
   composition and the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    (coding)->composing = COMPOSITION_RELATIVE;         \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2565
/* The following three macros produce codes for indicating direction
   of text.  They use the variables `coding' and `dst' of the invoking
   scope and are to be used as statements.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER produces ESC [ when CODING has
   CODING_FLAG_ISO_SEVEN_BITS set, and the single byte ISO_CODE_CSI
   otherwise.  The flag is tested as a bit of `coding->flags' (as the
   single-shift macros do), not compared for equality, because other
   flag bits are normally set as well.

   ENCODE_DIRECTION_R2L and ENCODE_DIRECTION_L2R produce the
   introducer followed by `2 ]' (start of right-to-left text) and
   `0 ]' (end of the current direction) respectively.  */

#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2581
/* Produce codes for designation and invocation to reset the graphic
   planes and registers to initial state: a shift-in if graphic plane
   0 does not currently have register 0 invoked, then, for each of the
   four graphic registers whose initial designation is not negative
   and differs from the current designation, a designation of the
   initial charset.  Uses `coding' and `dst' of the invoking scope.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reset_reg_ = 0;                                                     \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    while (reset_reg_ < 4)                                                  \
      {                                                                     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_) >= 0   \
            && (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_)    \
                != CODING_SPEC_ISO_DESIGNATION (coding, reset_reg_)))       \
          {                                                                 \
            ENCODE_DESIGNATION                                              \
              (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reset_reg_),    \
               reset_reg_, coding);                                         \
          }                                                                 \
        reset_reg_++;                                                       \
      }                                                                     \
  } while (0)
2596
2597 /* Produce designation sequences of charsets in the line started from
2598    SRC to a place pointed by DST, and return updated DST.
2599
2600    If the current block ends before any end-of-line, we may fail to
2601    find all the necessary designations.  */
2602
2603 static unsigned char *
2604 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2605      struct coding_system *coding;
2606      Lisp_Object translation_table;
2607      const unsigned char *src, *src_end;
2608      unsigned char *dst;
2609 {
2610   int charset, c, found = 0, reg;
2611   /* Table of charsets to be designated to each graphic register.  */
2612   int r[4];
2613
2614   for (reg = 0; reg < 4; reg++)
2615     r[reg] = -1;
2616
2617   while (found < 4)
2618     {
2619       ONE_MORE_CHAR (c);
2620       if (c == '\n')
2621         break;
2622
2623       charset = CHAR_CHARSET (c);
2624       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2625       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2626         {
2627           found++;
2628           r[reg] = charset;
2629         }
2630     }
2631
2632  label_end_of_loop:
2633   if (found)
2634     {
2635       for (reg = 0; reg < 4; reg++)
2636         if (r[reg] >= 0
2637             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2638           ENCODE_DESIGNATION (r[reg], reg, coding);
2639     }
2640
2641   return dst;
2642 }
2643
2644 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2645
2646 static void
2647 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2648      struct coding_system *coding;
2649      const unsigned char *source;
2650      unsigned char *destination;
2651      int src_bytes, dst_bytes;
2652 {
2653   const unsigned char *src = source;
2654   const unsigned char *src_end = source + src_bytes;
2655   unsigned char *dst = destination;
2656   unsigned char *dst_end = destination + dst_bytes;
2657   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2658      from DST_END to assure overflow checking is necessary only at the
2659      head of loop.  */
2660   unsigned char *adjusted_dst_end = dst_end - 19;
2661   /* SRC_BASE remembers the start position in source in each loop.
2662      The loop will be exited when there's not enough source text to
2663      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2664      there's not enough destination area to produce encoded codes
2665      (within macro EMIT_BYTES).  */
2666   const unsigned char *src_base;
2667   int c;
2668   Lisp_Object translation_table;
2669   Lisp_Object safe_chars;
2670
2671   if (coding->flags & CODING_FLAG_ISO_SAFE)
2672     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2673
2674   safe_chars = coding_safe_chars (coding->symbol);
2675
2676   if (NILP (Venable_character_translation))
2677     translation_table = Qnil;
2678   else
2679     {
2680       translation_table = coding->translation_table_for_encode;
2681       if (NILP (translation_table))
2682         translation_table = Vstandard_translation_table_for_encode;
2683     }
2684
2685   coding->consumed_char = 0;
2686   coding->errors = 0;
2687   while (1)
2688     {
2689       src_base = src;
2690
2691       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2692         {
2693           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2694           break;
2695         }
2696
2697       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2698           && CODING_SPEC_ISO_BOL (coding))
2699         {
2700           /* We have to produce designation sequences if any now.  */
2701           dst = encode_designation_at_bol (coding, translation_table,
2702                                            src, src_end, dst);
2703           CODING_SPEC_ISO_BOL (coding) = 0;
2704         }
2705
2706       /* Check composition start and end.  */
2707       if (coding->composing != COMPOSITION_DISABLED
2708           && coding->cmp_data_start < coding->cmp_data->used)
2709         {
2710           struct composition_data *cmp_data = coding->cmp_data;
2711           int *data = cmp_data->data + coding->cmp_data_start;
2712           int this_pos = cmp_data->char_offset + coding->consumed_char;
2713
2714           if (coding->composing == COMPOSITION_RELATIVE)
2715             {
2716               if (this_pos == data[2])
2717                 {
2718                   ENCODE_COMPOSITION_END (coding, data);
2719                   cmp_data = coding->cmp_data;
2720                   data = cmp_data->data + coding->cmp_data_start;
2721                 }
2722             }
2723           else if (COMPOSING_P (coding))
2724             {
2725               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2726               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2727                 /* We have consumed components of the composition.
2728                    What follows in SRC is the composition's base
2729                    text.  */
2730                 ENCODE_COMPOSITION_FAKE_START (coding);
2731               else
2732                 {
2733                   int c = cmp_data->data[coding->cmp_data_index++];
2734                   if (coding->composition_rule_follows)
2735                     {
2736                       ENCODE_COMPOSITION_RULE (c);
2737                       coding->composition_rule_follows = 0;
2738                     }
2739                   else
2740                     {
2741                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2742                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2743                         ENCODE_UNSAFE_CHARACTER (c);
2744                       else
2745                         ENCODE_ISO_CHARACTER (c);
2746                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2747                         coding->composition_rule_follows = 1;
2748                     }
2749                   continue;
2750                 }
2751             }
2752           if (!COMPOSING_P (coding))
2753             {
2754               if (this_pos == data[1])
2755                 {
2756                   ENCODE_COMPOSITION_START (coding, data);
2757                   continue;
2758                 }
2759             }
2760         }
2761
2762       ONE_MORE_CHAR (c);
2763
2764       /* Now encode the character C.  */
2765       if (c < 0x20 || c == 0x7F)
2766         {
2767           if (c == '\r')
2768             {
2769               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2770                 {
2771                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2772                     ENCODE_RESET_PLANE_AND_REGISTER;
2773                   *dst++ = c;
2774                   continue;
2775                 }
2776               /* fall down to treat '\r' as '\n' ...  */
2777               c = '\n';
2778             }
2779           if (c == '\n')
2780             {
2781               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2782                 ENCODE_RESET_PLANE_AND_REGISTER;
2783               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2784                 bcopy (coding->spec.iso2022.initial_designation,
2785                        coding->spec.iso2022.current_designation,
2786                        sizeof coding->spec.iso2022.initial_designation);
2787               if (coding->eol_type == CODING_EOL_LF
2788                   || coding->eol_type == CODING_EOL_UNDECIDED)
2789                 *dst++ = ISO_CODE_LF;
2790               else if (coding->eol_type == CODING_EOL_CRLF)
2791                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2792               else
2793                 *dst++ = ISO_CODE_CR;
2794               CODING_SPEC_ISO_BOL (coding) = 1;
2795             }
2796           else
2797             {
2798               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2799                 ENCODE_RESET_PLANE_AND_REGISTER;
2800               *dst++ = c;
2801             }
2802         }
2803       else if (ASCII_BYTE_P (c))
2804         ENCODE_ISO_CHARACTER (c);
2805       else if (SINGLE_BYTE_CHAR_P (c))
2806         {
2807           *dst++ = c;
2808           coding->errors++;
2809         }
2810       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2811                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2812         ENCODE_UNSAFE_CHARACTER (c);
2813       else
2814         ENCODE_ISO_CHARACTER (c);
2815
2816       coding->consumed_char++;
2817     }
2818
2819  label_end_of_loop:
2820   coding->consumed = src_base - source;
2821   coding->produced = coding->produced_char = dst - destination;
2822 }
2823
2824 \f
2825 /*** 4. SJIS and BIG5 handlers ***/
2826
2827 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2828    quite widely.  So, for the moment, Emacs supports them in the bare
2829    C code.  But, in the future, they may be supported only by CCL.  */
2830
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, but the two position-codes are divided and
   shifted so that they fit in the ranges below.
2837
2838    --- CODE RANGE of SJIS ---
2839    (character set)      (range)
2840    ASCII                0x00 .. 0x7F
2841    KATAKANA-JISX0201    0xA1 .. 0xDF
2842    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2843             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2844    -------------------------------
2845
2846 */
2847
2848 /* BIG5 is a coding system encoding two character sets: ASCII and
2849    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2850    character set and is encoded in two bytes.
2851
2852    --- CODE RANGE of BIG5 ---
2853    (character set)      (range)
2854    ASCII                0x00 .. 0x7F
2855    Big5 (1st byte)      0xA1 .. 0xFE
2856         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2857    --------------------------
2858
2859    Since the number of characters in Big5 is larger than maximum
2860    characters in Emacs' charset (96x96), it can't be handled as one
2861    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2862    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2863    contains frequently used characters and the latter contains less
2864    frequently used characters.  */
2865
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd bytes of the character in the BIG5 coding
   system.  C1 and C2 are the 1st and 2nd position-codes of Emacs'
   internal format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansion, so an arbitrary
   expression may be given for an input argument.  Input arguments are
   evaluated more than once, so they must not have side effects.  All
   input arguments are read before any output argument is assigned,
   hence the same variable may be given as both input and output
   (e.g. `DECODE_BIG5 (c1, c2, charset, c1, c2)').  */

/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0xA1..0xFE (0xFF - 0xA1 values) or 0x40..0x7E
   (0x7F - 0x40 values).  */
#define BIG5_SAME_ROW ((0xFF - 0xA1) + (0x7F - 0x40))

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character counted from (0xA1, 0x40).  */     \
    unsigned int big5_index_                                            \
      = (((b1) - 0xA1) * BIG5_SAME_ROW                                  \
         + (b2) - ((b2) < 0x7F ? 0x40 : 0x62));                         \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the first row of big5-2.  */      \
        big5_index_ -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                   \
      }                                                                 \
    (c1) = big5_index_ / (0xFF - 0xA1) + 0x21;                          \
    (c2) = big5_index_ % (0xFF - 0xA1) + 0x21;                          \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    /* Linear index of the character within its Emacs charset.  */      \
    unsigned int big5_index_                                            \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      big5_index_ += BIG5_SAME_ROW * (0xC9 - 0xA1);                     \
    (b1) = big5_index_ / BIG5_SAME_ROW + 0xA1;                          \
    (b2) = big5_index_ % BIG5_SAME_ROW;                                 \
    /* Skip the gap 0x7F..0xA0 between the two 2nd-byte ranges.  */     \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2898
2899 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2900    Check if a text is encoded in SJIS.  If it is, return
2901    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2902
2903 static int
2904 detect_coding_sjis (src, src_end, multibytep)
2905      unsigned char *src, *src_end;
2906      int multibytep;
2907 {
2908   int c;
2909   /* Dummy for ONE_MORE_BYTE.  */
2910   struct coding_system dummy_coding;
2911   struct coding_system *coding = &dummy_coding;
2912
2913   while (1)
2914     {
2915       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2916       if (c < 0x80)
2917         continue;
2918       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2919         return 0;
2920       if (c <= 0x9F || c >= 0xE0)
2921         {
2922           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2923           if (c < 0x40 || c == 0x7F || c > 0xFC)
2924             return 0;
2925         }
2926     }
2927  label_end_of_loop:
2928   return CODING_CATEGORY_MASK_SJIS;
2929 }
2930
2931 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2932    Check if a text is encoded in BIG5.  If it is, return
2933    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2934
2935 static int
2936 detect_coding_big5 (src, src_end, multibytep)
2937      unsigned char *src, *src_end;
2938      int multibytep;
2939 {
2940   int c;
2941   /* Dummy for ONE_MORE_BYTE.  */
2942   struct coding_system dummy_coding;
2943   struct coding_system *coding = &dummy_coding;
2944
2945   while (1)
2946     {
2947       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2948       if (c < 0x80)
2949         continue;
2950       if (c < 0xA1 || c > 0xFE)
2951         return 0;
2952       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2953       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2954         return 0;
2955     }
2956  label_end_of_loop:
2957   return CODING_CATEGORY_MASK_BIG5;
2958 }
2959
2960 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2961    Check if a text is encoded in UTF-8.  If it is, return
2962    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2963
/* Nonzero if the bits of octet C selected by MASK are equal to BITS.  */
#define UTF_8_OCTET_BITS_P(c, mask, bits) (((c) & (mask)) == (bits))

/* C below 0x80 is a complete one-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* 10xxxxxx: an octet following a leading octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_BITS_P (c, 0xC0, 0x80)
/* 110xxxxx, 1110xxxx, 11110xxx, 111110xx, 1111110x: the leading octet
   of a sequence of two to six octets.  */
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_BITS_P (c, 0xFE, 0xFC)
2971
2972 static int
2973 detect_coding_utf_8 (src, src_end, multibytep)
2974      unsigned char *src, *src_end;
2975      int multibytep;
2976 {
2977   unsigned char c;
2978   int seq_maybe_bytes;
2979   /* Dummy for ONE_MORE_BYTE.  */
2980   struct coding_system dummy_coding;
2981   struct coding_system *coding = &dummy_coding;
2982
2983   while (1)
2984     {
2985       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2986       if (UTF_8_1_OCTET_P (c))
2987         continue;
2988       else if (UTF_8_2_OCTET_LEADING_P (c))
2989         seq_maybe_bytes = 1;
2990       else if (UTF_8_3_OCTET_LEADING_P (c))
2991         seq_maybe_bytes = 2;
2992       else if (UTF_8_4_OCTET_LEADING_P (c))
2993         seq_maybe_bytes = 3;
2994       else if (UTF_8_5_OCTET_LEADING_P (c))
2995         seq_maybe_bytes = 4;
2996       else if (UTF_8_6_OCTET_LEADING_P (c))
2997         seq_maybe_bytes = 5;
2998       else
2999         return 0;
3000
3001       do
3002         {
3003           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3004           if (!UTF_8_EXTRA_OCTET_P (c))
3005             return 0;
3006           seq_maybe_bytes--;
3007         }
3008       while (seq_maybe_bytes > 0);
3009     }
3010
3011  label_end_of_loop:
3012   return CODING_CATEGORY_MASK_UTF_8;
3013 }
3014
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the first
   two bytes are 0xFE 0xFF, return CODING_CATEGORY_MASK_UTF_16_BE; if
   they are 0xFF 0xFE, return CODING_CATEGORY_MASK_UTF_16_LE; else
   return 0.  */
3020
/* Nonzero if VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, i.e. in 0xD800..0xDBFF.  The
   top six bits are compared (mask 0xFC00) so that neither a low
   surrogate nor a code above 0xDFFF is matched.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3030
3031 static int
3032 detect_coding_utf_16 (src, src_end, multibytep)
3033      unsigned char *src, *src_end;
3034      int multibytep;
3035 {
3036   unsigned char c1, c2;
3037   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3038   struct coding_system dummy_coding;
3039   struct coding_system *coding = &dummy_coding;
3040
3041   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3042   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3043
3044   if ((c1 == 0xFF) && (c2 == 0xFE))
3045     return CODING_CATEGORY_MASK_UTF_16_LE;
3046   else if ((c1 == 0xFE) && (c2 == 0xFF))
3047     return CODING_CATEGORY_MASK_UTF_16_BE;
3048
3049  label_end_of_loop:
3050   return 0;
3051 }
3052
3053 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3054    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3055
3056 static void
3057 decode_coding_sjis_big5 (coding, source, destination,
3058                          src_bytes, dst_bytes, sjis_p)
3059      struct coding_system *coding;
3060      const unsigned char *source;
3061      unsigned char  *destination;
3062      int src_bytes, dst_bytes;
3063      int sjis_p;
3064 {
3065   const unsigned char *src = source;
3066   const unsigned char *src_end = source + src_bytes;
3067   unsigned char *dst = destination;
3068   unsigned char *dst_end = destination + dst_bytes;
3069   /* SRC_BASE remembers the start position in source in each loop.
3070      The loop will be exited when there's not enough source code
3071      (within macro ONE_MORE_BYTE), or when there's not enough
3072      destination area to produce a character (within macro
3073      EMIT_CHAR).  */
3074   const unsigned char *src_base;
3075   Lisp_Object translation_table;
3076
3077   if (NILP (Venable_character_translation))
3078     translation_table = Qnil;
3079   else
3080     {
3081       translation_table = coding->translation_table_for_decode;
3082       if (NILP (translation_table))
3083         translation_table = Vstandard_translation_table_for_decode;
3084     }
3085
3086   coding->produced_char = 0;
3087   while (1)
3088     {
3089       int c, charset, c1, c2 = 0;
3090
3091       src_base = src;
3092       ONE_MORE_BYTE (c1);
3093
3094       if (c1 < 0x80)
3095         {
3096           charset = CHARSET_ASCII;
3097           if (c1 < 0x20)
3098             {
3099               if (c1 == '\r')
3100                 {
3101                   if (coding->eol_type == CODING_EOL_CRLF)
3102                     {
3103                       ONE_MORE_BYTE (c2);
3104                       if (c2 == '\n')
3105                         c1 = c2;
3106                       else
3107                         /* To process C2 again, SRC is subtracted by 1.  */
3108                         src--;
3109                     }
3110                   else if (coding->eol_type == CODING_EOL_CR)
3111                     c1 = '\n';
3112                 }
3113               else if (c1 == '\n'
3114                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3115                        && (coding->eol_type == CODING_EOL_CR
3116                            || coding->eol_type == CODING_EOL_CRLF))
3117                 {
3118                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3119                   goto label_end_of_loop;
3120                 }
3121             }
3122         }
3123       else
3124         {
3125           if (sjis_p)
3126             {
3127               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3128                 goto label_invalid_code;
3129               if (c1 <= 0x9F || c1 >= 0xE0)
3130                 {
3131                   /* SJIS -> JISX0208 */
3132                   ONE_MORE_BYTE (c2);
3133                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3134                     goto label_invalid_code;
3135                   DECODE_SJIS (c1, c2, c1, c2);
3136                   charset = charset_jisx0208;
3137                 }
3138               else
3139                 /* SJIS -> JISX0201-Kana */
3140                 charset = charset_katakana_jisx0201;
3141             }
3142           else
3143             {
3144               /* BIG5 -> Big5 */
3145               if (c1 < 0xA0 || c1 > 0xFE)
3146                 goto label_invalid_code;
3147               ONE_MORE_BYTE (c2);
3148               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3149                 goto label_invalid_code;
3150               DECODE_BIG5 (c1, c2, charset, c1, c2);
3151             }
3152         }
3153
3154       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3155       EMIT_CHAR (c);
3156       continue;
3157
3158     label_invalid_code:
3159       coding->errors++;
3160       src = src_base;
3161       c = *src++;
3162       EMIT_CHAR (c);
3163     }
3164
3165  label_end_of_loop:
3166   coding->consumed = coding->consumed_char = src_base - source;
3167   coding->produced = dst - destination;
3168   return;
3169 }
3170
3171 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3172    This function can encode charsets `ascii', `katakana-jisx0201',
3173    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3174    are sure that all these charsets are registered as official charset
3175    (i.e. do not have extended leading-codes).  Characters of other
3176    charsets are produced without any encoding.  If SJIS_P is 1, encode
3177    SJIS text, else encode BIG5 text.  */
3178
3179 static void
3180 encode_coding_sjis_big5 (coding, source, destination,
3181                          src_bytes, dst_bytes, sjis_p)
3182      struct coding_system *coding;
3183      unsigned char *source, *destination;
3184      int src_bytes, dst_bytes;
3185      int sjis_p;
3186 {
3187   unsigned char *src = source;
3188   unsigned char *src_end = source + src_bytes;
3189   unsigned char *dst = destination;
3190   unsigned char *dst_end = destination + dst_bytes;
3191   /* SRC_BASE remembers the start position in source in each loop.
3192      The loop will be exited when there's not enough source text to
3193      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3194      there's not enough destination area to produce encoded codes
3195      (within macro EMIT_BYTES).  */
3196   unsigned char *src_base;
3197   Lisp_Object translation_table;
3198
3199   if (NILP (Venable_character_translation))
3200     translation_table = Qnil;
3201   else
3202     {
3203       translation_table = coding->translation_table_for_encode;
3204       if (NILP (translation_table))
3205         translation_table = Vstandard_translation_table_for_encode;
3206     }
3207
3208   while (1)
3209     {
3210       int c, charset, c1, c2;
3211
3212       src_base = src;
3213       ONE_MORE_CHAR (c);
3214
3215       /* Now encode the character C.  */
3216       if (SINGLE_BYTE_CHAR_P (c))
3217         {
3218           switch (c)
3219             {
3220             case '\r':
3221               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3222                 {
3223                   EMIT_ONE_BYTE (c);
3224                   break;
3225                 }
3226               c = '\n';
3227             case '\n':
3228               if (coding->eol_type == CODING_EOL_CRLF)
3229                 {
3230                   EMIT_TWO_BYTES ('\r', c);
3231                   break;
3232                 }
3233               else if (coding->eol_type == CODING_EOL_CR)
3234                 c = '\r';
3235             default:
3236               EMIT_ONE_BYTE (c);
3237             }
3238         }
3239       else
3240         {
3241           SPLIT_CHAR (c, charset, c1, c2);
3242           if (sjis_p)
3243             {
3244               if (charset == charset_jisx0208
3245                   || charset == charset_jisx0208_1978)
3246                 {
3247                   ENCODE_SJIS (c1, c2, c1, c2);
3248                   EMIT_TWO_BYTES (c1, c2);
3249                 }
3250               else if (charset == charset_katakana_jisx0201)
3251                 EMIT_ONE_BYTE (c1 | 0x80);
3252               else if (charset == charset_latin_jisx0201)
3253                 EMIT_ONE_BYTE (c1);
3254               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3255                 {
3256                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3257                   if (CHARSET_WIDTH (charset) > 1)
3258                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3259                 }
3260               else
3261                 /* There's no way other than producing the internal
3262                    codes as is.  */
3263                 EMIT_BYTES (src_base, src);
3264             }
3265           else
3266             {
3267               if (charset == charset_big5_1 || charset == charset_big5_2)
3268                 {
3269                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3270                   EMIT_TWO_BYTES (c1, c2);
3271                 }
3272               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3273                 {
3274                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3275                   if (CHARSET_WIDTH (charset) > 1)
3276                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3277                 }
3278               else
3279                 /* There's no way other than producing the internal
3280                    codes as is.  */
3281                 EMIT_BYTES (src_base, src);
3282             }
3283         }
3284       coding->consumed_char++;
3285     }
3286
3287  label_end_of_loop:
3288   coding->consumed = src_base - source;
3289   coding->produced = coding->produced_char = dst - destination;
3290 }
3291
3292 \f
3293 /*** 5. CCL handlers ***/
3294
3295 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3296    Check if a text is encoded in a coding system of which
3297    encoder/decoder are written in CCL program.  If it is, return
3298    CODING_CATEGORY_MASK_CCL, else return 0.  */
3299
3300 static int
3301 detect_coding_ccl (src, src_end, multibytep)
3302      unsigned char *src, *src_end;
3303      int multibytep;
3304 {
3305   unsigned char *valid;
3306   int c;
3307   /* Dummy for ONE_MORE_BYTE.  */
3308   struct coding_system dummy_coding;
3309   struct coding_system *coding = &dummy_coding;
3310
3311   /* No coding system is assigned to coding-category-ccl.  */
3312   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3313     return 0;
3314
3315   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3316   while (1)
3317     {
3318       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3319       if (! valid[c])
3320         return 0;
3321     }
3322  label_end_of_loop:
3323   return CODING_CATEGORY_MASK_CCL;
3324 }
3325
3326 \f
3327 /*** 6. End-of-line handlers ***/
3328
3329 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3330
3331 static void
3332 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3333      struct coding_system *coding;
3334      const unsigned char *source;
3335      unsigned char *destination;
3336      int src_bytes, dst_bytes;
3337 {
3338   const unsigned char *src = source;
3339   unsigned char *dst = destination;
3340   const unsigned char *src_end = src + src_bytes;
3341   unsigned char *dst_end = dst + dst_bytes;
3342   Lisp_Object translation_table;
3343   /* SRC_BASE remembers the start position in source in each loop.
3344      The loop will be exited when there's not enough source code
3345      (within macro ONE_MORE_BYTE), or when there's not enough
3346      destination area to produce a character (within macro
3347      EMIT_CHAR).  */
3348   const unsigned char *src_base;
3349   int c;
3350
3351   translation_table = Qnil;
3352   switch (coding->eol_type)
3353     {
3354     case CODING_EOL_CRLF:
3355       while (1)
3356         {
3357           src_base = src;
3358           ONE_MORE_BYTE (c);
3359           if (c == '\r')
3360             {
3361               ONE_MORE_BYTE (c);
3362               if (c != '\n')
3363                 {
3364                   src--;
3365                   c = '\r';
3366                 }
3367             }
3368           else if (c == '\n'
3369                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3370             {
3371               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3372               goto label_end_of_loop;
3373             }
3374           EMIT_CHAR (c);
3375         }
3376       break;
3377
3378     case CODING_EOL_CR:
3379       while (1)
3380         {
3381           src_base = src;
3382           ONE_MORE_BYTE (c);
3383           if (c == '\n')
3384             {
3385               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3386                 {
3387                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3388                   goto label_end_of_loop;
3389                 }
3390             }
3391           else if (c == '\r')
3392             c = '\n';
3393           EMIT_CHAR (c);
3394         }
3395       break;
3396
3397     default:                    /* no need for EOL handling */
3398       while (1)
3399         {
3400           src_base = src;
3401           ONE_MORE_BYTE (c);
3402           EMIT_CHAR (c);
3403         }
3404     }
3405
3406  label_end_of_loop:
3407   coding->consumed = coding->consumed_char = src_base - source;
3408   coding->produced = dst - destination;
3409   return;
3410 }
3411
3412 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3413    format of end-of-line according to `coding->eol_type'.  It also
3414    convert multibyte form 8-bit characters to unibyte if
3415    CODING->src_multibyte is nonzero.  If `coding->mode &
3416    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3417    also means end-of-line.  */
3418
3419 static void
3420 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3421      struct coding_system *coding;
3422      const unsigned char *source;
3423      unsigned char *destination;
3424      int src_bytes, dst_bytes;
3425 {
3426   const unsigned char *src = source;
3427   unsigned char *dst = destination;
3428   const unsigned char *src_end = src + src_bytes;
3429   unsigned char *dst_end = dst + dst_bytes;
3430   Lisp_Object translation_table;
3431   /* SRC_BASE remembers the start position in source in each loop.
3432      The loop will be exited when there's not enough source text to
3433      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3434      there's not enough destination area to produce encoded codes
3435      (within macro EMIT_BYTES).  */
3436   const unsigned char *src_base;
3437   unsigned char *tmp;
3438   int c;
3439   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3440
3441   translation_table = Qnil;
3442   if (coding->src_multibyte
3443       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3444     {
3445       src_end--;
3446       src_bytes--;
3447       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3448     }
3449
3450   if (coding->eol_type == CODING_EOL_CRLF)
3451     {
3452       while (src < src_end)
3453         {
3454           src_base = src;
3455           c = *src++;
3456           if (c >= 0x20)
3457             EMIT_ONE_BYTE (c);
3458           else if (c == '\n' || (c == '\r' && selective_display))
3459             EMIT_TWO_BYTES ('\r', '\n');
3460           else
3461             EMIT_ONE_BYTE (c);
3462         }
3463       src_base = src;
3464     label_end_of_loop:
3465       ;
3466     }
3467   else
3468     {
3469       if (!dst_bytes || src_bytes <= dst_bytes)
3470         {
3471           safe_bcopy (src, dst, src_bytes);
3472           src_base = src_end;
3473           dst += src_bytes;
3474         }
3475       else
3476         {
3477           if (coding->src_multibyte
3478               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3479             dst_bytes--;
3480           safe_bcopy (src, dst, dst_bytes);
3481           src_base = src + dst_bytes;
3482           dst = destination + dst_bytes;
3483           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3484         }
3485       if (coding->eol_type == CODING_EOL_CR)
3486         {
3487           for (tmp = destination; tmp < dst; tmp++)
3488             if (*tmp == '\n') *tmp = '\r';
3489         }
3490       else if (selective_display)
3491         {
3492           for (tmp = destination; tmp < dst; tmp++)
3493             if (*tmp == '\r') *tmp = '\n';
3494         }
3495     }
3496   if (coding->src_multibyte)
3497     dst = destination + str_as_unibyte (destination, dst - destination);
3498
3499   coding->consumed = src_base - source;
3500   coding->produced = dst - destination;
3501   coding->produced_char = coding->produced;
3502 }
3503
3504 \f
3505 /*** 7. C library functions ***/
3506
3507 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3508    has a property `coding-system'.  The value of this property is a
3509    vector of length 5 (called the coding-vector).  Among elements of
3510    this vector, the first (element[0]) and the fifth (element[4])
3511    carry important information for decoding/encoding.  Before
3512    decoding/encoding, this information should be set in fields of a
3513    structure of type `coding_system'.
3514
3515    The value of the property `coding-system' can be a symbol of another
3516    subsidiary coding-system.  In that case, Emacs gets coding-vector
3517    from that symbol.
3518
   `element[0]' contains information to be set in `coding->type'.  The
   values and their meanings are as follows:
3521
3522    0 -- coding_type_emacs_mule
3523    1 -- coding_type_sjis
3524    2 -- coding_type_iso2022
3525    3 -- coding_type_big5
3526    4 -- coding_type_ccl encoder/decoder written in CCL
3527    nil -- coding_type_no_conversion
3528    t -- coding_type_undecided (automatic conversion on decoding,
3529                                no-conversion on encoding)
3530
3531    `element[4]' contains information to be set in `coding->flags' and
3532    `coding->spec'.  The meaning varies by `coding->type'.
3533
3534    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3535    of length 32 (of which the first 13 sub-elements are used now).
3536    Meanings of these sub-elements are:
3537
3538    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3539         If the value is an integer of valid charset, the charset is
3540         assumed to be designated to graphic register N initially.
3541
        If the value is negative, it is the negation of a charset
        which reserves graphic register N, which means that the charset
        is not designated initially but should be designated to graphic
        register N just before encoding a character in that charset.
3546
3547         If the value is nil, graphic register N is never used on
3548         encoding.
3549
3550    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3551         Each value takes t or nil.  See the section ISO2022 of
3552         `coding.h' for more information.
3553
3554    If `coding->type' is `coding_type_big5', element[4] is t to denote
3555    BIG5-ETen or nil to denote BIG5-HKU.
3556
3557    If `coding->type' takes the other value, element[4] is ignored.
3558
3559    Emacs Lisp's coding systems also carry information about format of
3560    end-of-line in a value of property `eol-type'.  If the value is
3561    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3562    means CODING_EOL_CR.  If it is not integer, it should be a vector
3563    of subsidiary coding systems of which property `eol-type' has one
3564    of the above values.
3565
3566 */
3567
3568 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3569    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3570    is setup so that no conversion is necessary and return -1, else
3571    return 0.  */
3572
3573 int
3574 setup_coding_system (coding_system, coding)
3575      Lisp_Object coding_system;
3576      struct coding_system *coding;
3577 {
3578   Lisp_Object coding_spec, coding_type, eol_type, plist;
3579   Lisp_Object val;
3580
3581   /* At first, zero clear all members.  */
3582   bzero (coding, sizeof (struct coding_system));
3583
3584   /* Initialize some fields required for all kinds of coding systems.  */
3585   coding->symbol = coding_system;
3586   coding->heading_ascii = -1;
3587   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3588   coding->composing = COMPOSITION_DISABLED;
3589   coding->cmp_data = NULL;
3590
3591   if (NILP (coding_system))
3592     goto label_invalid_coding_system;
3593
3594   coding_spec = Fget (coding_system, Qcoding_system);
3595
3596   if (!VECTORP (coding_spec)
3597       || XVECTOR (coding_spec)->size != 5
3598       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3599     goto label_invalid_coding_system;
3600
3601   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3602   if (VECTORP (eol_type))
3603     {
3604       coding->eol_type = CODING_EOL_UNDECIDED;
3605       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3606     }
3607   else if (XFASTINT (eol_type) == 1)
3608     {
3609       coding->eol_type = CODING_EOL_CRLF;
3610       coding->common_flags
3611         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3612     }
3613   else if (XFASTINT (eol_type) == 2)
3614     {
3615       coding->eol_type = CODING_EOL_CR;
3616       coding->common_flags
3617         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3618     }
3619   else
3620     coding->eol_type = CODING_EOL_LF;
3621
3622   coding_type = XVECTOR (coding_spec)->contents[0];
3623   /* Try short cut.  */
3624   if (SYMBOLP (coding_type))
3625     {
3626       if (EQ (coding_type, Qt))
3627         {
3628           coding->type = coding_type_undecided;
3629           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3630         }
3631       else
3632         coding->type = coding_type_no_conversion;
3633       /* Initialize this member.  Any thing other than
3634          CODING_CATEGORY_IDX_UTF_16_BE and
3635          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3636          special treatment in detect_eol.  */
3637       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3638
3639       return 0;
3640     }
3641
3642   /* Get values of coding system properties:
3643      `post-read-conversion', `pre-write-conversion',
3644      `translation-table-for-decode', `translation-table-for-encode'.  */
3645   plist = XVECTOR (coding_spec)->contents[3];
3646   /* Pre & post conversion functions should be disabled if
3647      inhibit_eol_conversion is nonzero.  This is the case that a code
3648      conversion function is called while those functions are running.  */
3649   if (! inhibit_pre_post_conversion)
3650     {
3651       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3652       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3653     }
3654   val = Fplist_get (plist, Qtranslation_table_for_decode);
3655   if (SYMBOLP (val))
3656     val = Fget (val, Qtranslation_table_for_decode);
3657   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3658   val = Fplist_get (plist, Qtranslation_table_for_encode);
3659   if (SYMBOLP (val))
3660     val = Fget (val, Qtranslation_table_for_encode);
3661   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3662   val = Fplist_get (plist, Qcoding_category);
3663   if (!NILP (val))
3664     {
3665       val = Fget (val, Qcoding_category_index);
3666       if (INTEGERP (val))
3667         coding->category_idx = XINT (val);
3668       else
3669         goto label_invalid_coding_system;
3670     }
3671   else
3672     goto label_invalid_coding_system;
3673
3674   /* If the coding system has non-nil `composition' property, enable
3675      composition handling.  */
3676   val = Fplist_get (plist, Qcomposition);
3677   if (!NILP (val))
3678     coding->composing = COMPOSITION_NO;
3679
3680   switch (XFASTINT (coding_type))
3681     {
3682     case 0:
3683       coding->type = coding_type_emacs_mule;
3684       coding->common_flags
3685         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3686       if (!NILP (coding->post_read_conversion))
3687         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3688       if (!NILP (coding->pre_write_conversion))
3689         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3690       break;
3691
3692     case 1:
3693       coding->type = coding_type_sjis;
3694       coding->common_flags
3695         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3696       break;
3697
3698     case 2:
3699       coding->type = coding_type_iso2022;
3700       coding->common_flags
3701         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3702       {
3703         Lisp_Object val, temp;
3704         Lisp_Object *flags;
3705         int i, charset, reg_bits = 0;
3706
3707         val = XVECTOR (coding_spec)->contents[4];
3708
3709         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3710           goto label_invalid_coding_system;
3711
3712         flags = XVECTOR (val)->contents;
3713         coding->flags
3714           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3715              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3716              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3717              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3718              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3719              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3720              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3721              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3722              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3723              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3724              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3725              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3726              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3727              );
3728
3729         /* Invoke graphic register 0 to plane 0.  */
3730         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3731         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3732         CODING_SPEC_ISO_INVOCATION (coding, 1)
3733           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3734         /* Not single shifting at first.  */
3735         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3736         /* Beginning of buffer should also be regarded as bol. */
3737         CODING_SPEC_ISO_BOL (coding) = 1;
3738
3739         for (charset = 0; charset <= MAX_CHARSET; charset++)
3740           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3741         val = Vcharset_revision_alist;
3742         while (CONSP (val))
3743           {
3744             charset = get_charset_id (Fcar_safe (XCAR (val)));
3745             if (charset >= 0
3746                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3747                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3748               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3749             val = XCDR (val);
3750           }
3751
3752         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3753            FLAGS[REG] can be one of below:
3754                 integer CHARSET: CHARSET occupies register I,
3755                 t: designate nothing to REG initially, but can be used
3756                   by any charsets,
3757                 list of integer, nil, or t: designate the first
3758                   element (if integer) to REG initially, the remaining
3759                   elements (if integer) is designated to REG on request,
3760                   if an element is t, REG can be used by any charsets,
3761                 nil: REG is never used.  */
3762         for (charset = 0; charset <= MAX_CHARSET; charset++)
3763           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3764             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3765         for (i = 0; i < 4; i++)
3766           {
3767             if ((INTEGERP (flags[i])
3768                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3769                 || (charset = get_charset_id (flags[i])) >= 0)
3770               {
3771                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3772                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3773               }
3774             else if (EQ (flags[i], Qt))
3775               {
3776                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3777                 reg_bits |= 1 << i;
3778                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3779               }
3780             else if (CONSP (flags[i]))
3781               {
3782                 Lisp_Object tail;
3783                 tail = flags[i];
3784
3785                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3786                 if ((INTEGERP (XCAR (tail))
3787                      && (charset = XINT (XCAR (tail)),
3788                          CHARSET_VALID_P (charset)))
3789                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3790                   {
3791                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3792                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3793                   }
3794                 else
3795                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3796                 tail = XCDR (tail);
3797                 while (CONSP (tail))
3798                   {
3799                     if ((INTEGERP (XCAR (tail))
3800                          && (charset = XINT (XCAR (tail)),
3801                              CHARSET_VALID_P (charset)))
3802                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3803                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3804                         = i;
3805                     else if (EQ (XCAR (tail), Qt))
3806                       reg_bits |= 1 << i;
3807                     tail = XCDR (tail);
3808                   }
3809               }
3810             else
3811               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3812
3813             CODING_SPEC_ISO_DESIGNATION (coding, i)
3814               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3815           }
3816
3817         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3818           {
3819             /* REG 1 can be used only by locking shift in 7-bit env.  */
3820             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3821               reg_bits &= ~2;
3822             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3823               /* Without any shifting, only REG 0 and 1 can be used.  */
3824               reg_bits &= 3;
3825           }
3826
3827         if (reg_bits)
3828           for (charset = 0; charset <= MAX_CHARSET; charset++)
3829             {
3830               if (CHARSET_DEFINED_P (charset)
3831                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3832                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3833                 {
3834                   /* There exist some default graphic registers to be
3835                      used by CHARSET.  */
3836
3837                   /* We had better avoid designating a charset of
3838                      CHARS96 to REG 0 as far as possible.  */
3839                   if (CHARSET_CHARS (charset) == 96)
3840                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3841                       = (reg_bits & 2
3842                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3843                   else
3844                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3845                       = (reg_bits & 1
3846                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3847                 }
3848             }
3849       }
3850       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3851       coding->spec.iso2022.last_invalid_designation_register = -1;
3852       break;
3853
3854     case 3:
3855       coding->type = coding_type_big5;
3856       coding->common_flags
3857         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3858       coding->flags
3859         = (NILP (XVECTOR (coding_spec)->contents[4])
3860            ? CODING_FLAG_BIG5_HKU
3861            : CODING_FLAG_BIG5_ETEN);
3862       break;
3863
3864     case 4:
3865       coding->type = coding_type_ccl;
3866       coding->common_flags
3867         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3868       {
3869         val = XVECTOR (coding_spec)->contents[4];
3870         if (! CONSP (val)
3871             || setup_ccl_program (&(coding->spec.ccl.decoder),
3872                                   XCAR (val)) < 0
3873             || setup_ccl_program (&(coding->spec.ccl.encoder),
3874                                   XCDR (val)) < 0)
3875           goto label_invalid_coding_system;
3876
3877         bzero (coding->spec.ccl.valid_codes, 256);
3878         val = Fplist_get (plist, Qvalid_codes);
3879         if (CONSP (val))
3880           {
3881             Lisp_Object this;
3882
3883             for (; CONSP (val); val = XCDR (val))
3884               {
3885                 this = XCAR (val);
3886                 if (INTEGERP (this)
3887                     && XINT (this) >= 0 && XINT (this) < 256)
3888                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3889                 else if (CONSP (this)
3890                          && INTEGERP (XCAR (this))
3891                          && INTEGERP (XCDR (this)))
3892                   {
3893                     int start = XINT (XCAR (this));
3894                     int end = XINT (XCDR (this));
3895
3896                     if (start >= 0 && start <= end && end < 256)
3897                       while (start <= end)
3898                         coding->spec.ccl.valid_codes[start++] = 1;
3899                   }
3900               }
3901           }
3902       }
3903       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3904       coding->spec.ccl.cr_carryover = 0;
3905       coding->spec.ccl.eight_bit_carryover[0] = 0;
3906       break;
3907
3908     case 5:
3909       coding->type = coding_type_raw_text;
3910       break;
3911
3912     default:
3913       goto label_invalid_coding_system;
3914     }
3915   return 0;
3916
3917  label_invalid_coding_system:
3918   coding->type = coding_type_no_conversion;
3919   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3920   coding->common_flags = 0;
3921   coding->eol_type = CODING_EOL_LF;
3922   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3923   return -1;
3924 }
3925
3926 /* Free memory blocks allocated for storing composition information.  */
3927
3928 void
3929 coding_free_composition_data (coding)
3930      struct coding_system *coding;
3931 {
3932   struct composition_data *cmp_data = coding->cmp_data, *next;
3933
3934   if (!cmp_data)
3935     return;
3936   /* Memory blocks are chained.  At first, rewind to the first, then,
3937      free blocks one by one.  */
3938   while (cmp_data->prev)
3939     cmp_data = cmp_data->prev;
3940   while (cmp_data)
3941     {
3942       next = cmp_data->next;
3943       xfree (cmp_data);
3944       cmp_data = next;
3945     }
3946   coding->cmp_data = NULL;
3947 }
3948
3949 /* Set `char_offset' member of all memory blocks pointed by
3950    coding->cmp_data to POS.  */
3951
3952 void
3953 coding_adjust_composition_offset (coding, pos)
3954      struct coding_system *coding;
3955      int pos;
3956 {
3957   struct composition_data *cmp_data;
3958
3959   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3960     cmp_data->char_offset = pos;
3961 }
3962
3963 /* Setup raw-text or one of its subsidiaries in the structure
3964    coding_system CODING according to the already setup value eol_type
3965    in CODING.  CODING should be setup for some coding system in
3966    advance.  */
3967
3968 void
3969 setup_raw_text_coding_system (coding)
3970      struct coding_system *coding;
3971 {
3972   if (coding->type != coding_type_raw_text)
3973     {
3974       coding->symbol = Qraw_text;
3975       coding->type = coding_type_raw_text;
3976       if (coding->eol_type != CODING_EOL_UNDECIDED)
3977         {
3978           Lisp_Object subsidiaries;
3979           subsidiaries = Fget (Qraw_text, Qeol_type);
3980
3981           if (VECTORP (subsidiaries)
3982               && XVECTOR (subsidiaries)->size == 3)
3983             coding->symbol
3984               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3985         }
3986       setup_coding_system (coding->symbol, coding);
3987     }
3988   return;
3989 }
3990
3991 /* Emacs has a mechanism to automatically detect a coding system if it
3992    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3993    it's impossible to distinguish some coding systems accurately
3994    because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
3996
3997    o coding-category-emacs-mule
3998
3999         The category for a coding system which has the same code range
4000         as Emacs' internal format.  Assigned the coding-system (Lisp
4001         symbol) `emacs-mule' by default.
4002
4003    o coding-category-sjis
4004
4005         The category for a coding system which has the same code range
4006         as SJIS.  Assigned the coding-system (Lisp
4007         symbol) `japanese-shift-jis' by default.
4008
4009    o coding-category-iso-7
4010
4011         The category for a coding system which has the same code range
4012         as ISO2022 of 7-bit environment.  This doesn't use any locking
4013         shift and single shift functions.  This can encode/decode all
4014         charsets.  Assigned the coding-system (Lisp symbol)
4015         `iso-2022-7bit' by default.
4016
4017    o coding-category-iso-7-tight
4018
4019         Same as coding-category-iso-7 except that this can
4020         encode/decode only the specified charsets.
4021
4022    o coding-category-iso-8-1
4023
4024         The category for a coding system which has the same code range
4025         as ISO2022 of 8-bit environment and graphic plane 1 used only
4026         for DIMENSION1 charset.  This doesn't use any locking shift
4027         and single shift functions.  Assigned the coding-system (Lisp
4028         symbol) `iso-latin-1' by default.
4029
4030    o coding-category-iso-8-2
4031
4032         The category for a coding system which has the same code range
4033         as ISO2022 of 8-bit environment and graphic plane 1 used only
4034         for DIMENSION2 charset.  This doesn't use any locking shift
4035         and single shift functions.  Assigned the coding-system (Lisp
4036         symbol) `japanese-iso-8bit' by default.
4037
4038    o coding-category-iso-7-else
4039
4040         The category for a coding system which has the same code range
4041         as ISO2022 of 7-bit environment but uses locking shift or
4042         single shift functions.  Assigned the coding-system (Lisp
4043         symbol) `iso-2022-7bit-lock' by default.
4044
4045    o coding-category-iso-8-else
4046
4047         The category for a coding system which has the same code range
4048         as ISO2022 of 8-bit environment but uses locking shift or
4049         single shift functions.  Assigned the coding-system (Lisp
4050         symbol) `iso-2022-8bit-ss2' by default.
4051
4052    o coding-category-big5
4053
4054         The category for a coding system which has the same code range
4055         as BIG5.  Assigned the coding-system (Lisp symbol)
4056         `cn-big5' by default.
4057
4058    o coding-category-utf-8
4059
4060         The category for a coding system which has the same code range
4061         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4062         symbol) `utf-8' by default.
4063
4064    o coding-category-utf-16-be
4065
4066         The category for a coding system in which a text has an
4067         Unicode signature (cf. Unicode Standard) in the order of BIG
4068         endian at the head.  Assigned the coding-system (Lisp symbol)
4069         `utf-16-be' by default.
4070
4071    o coding-category-utf-16-le
4072
4073         The category for a coding system in which a text has an
4074         Unicode signature (cf. Unicode Standard) in the order of
4075         LITTLE endian at the head.  Assigned the coding-system (Lisp
4076         symbol) `utf-16-le' by default.
4077
4078    o coding-category-ccl
4079
4080         The category for a coding system of which encoder/decoder is
4081         written in CCL programs.  The default value is nil, i.e., no
4082         coding system is assigned.
4083
4084    o coding-category-binary
4085
4086         The category for a coding system not categorized in any of the
4087         above.  Assigned the coding-system (Lisp symbol)
4088         `no-conversion' by default.
4089
4090    Each of them is a Lisp symbol and the value is an actual
4091    `coding-system' (this is also a Lisp symbol) assigned by a user.
4092    What Emacs does actually is to detect a category of coding system.
4093    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4094    decide a single possible category, it selects a category of the
4095    highest priority.  Priorities of categories are also specified by a
4096    user in a Lisp variable `coding-category-list'.
4097
4098 */
4099
/* Table indexed by a byte value.  detect_coding_mask skips the
   leading bytes of a text whose entry here is nonzero; it also
   rewrites the entries for ISO_CODE_SO, ISO_CODE_SI and ISO_CODE_ESC
   while it runs.  */
static int ascii_skip_code[256];
4102
4103 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4104    If it detects possible coding systems, return an integer in which
4105    appropriate flag bits are set.  Flag bits are defined by macros
4106    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4107    it should point the table `coding_priorities'.  In that case, only
4108    the flag bit for a coding system of the highest priority is set in
4109    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4110    range 0x80..0x9F are in multibyte form.
4111
4112    How many ASCII characters are at the head is returned as *SKIP.  */
4113
4114 static int
4115 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4116      unsigned char *source;
4117      int src_bytes, *priorities, *skip;
4118      int multibytep;
4119 {
4120   register unsigned char c;
4121   unsigned char *src = source, *src_end = source + src_bytes;
4122   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4123   int i;
4124
4125   /* At first, skip all ASCII characters and control characters except
4126      for three ISO2022 specific control characters.  */
4127   ascii_skip_code[ISO_CODE_SO] = 0;
4128   ascii_skip_code[ISO_CODE_SI] = 0;
4129   ascii_skip_code[ISO_CODE_ESC] = 0;
4130
4131  label_loop_detect_coding:
4132   while (src < src_end && ascii_skip_code[*src]) src++;
4133   *skip = src - source;
4134
4135   if (src >= src_end)
4136     /* We found nothing other than ASCII.  There's nothing to do.  */
4137     return 0;
4138
4139   c = *src;
4140   /* The text seems to be encoded in some multilingual coding system.
4141      Now, try to find in which coding system the text is encoded.  */
4142   if (c < 0x80)
4143     {
4144       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4145       /* C is an ISO2022 specific control code of C0.  */
4146       mask = detect_coding_iso2022 (src, src_end, multibytep);
4147       if (mask == 0)
4148         {
4149           /* No valid ISO2022 code follows C.  Try again.  */
4150           src++;
4151           if (c == ISO_CODE_ESC)
4152             ascii_skip_code[ISO_CODE_ESC] = 1;
4153           else
4154             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4155           goto label_loop_detect_coding;
4156         }
4157       if (priorities)
4158         {
4159           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4160             {
4161               if (mask & priorities[i])
4162                 return priorities[i];
4163             }
4164           return CODING_CATEGORY_MASK_RAW_TEXT;
4165         }
4166     }
4167   else
4168     {
4169       int try;
4170
4171       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4172         c = src[1] - 0x20;
4173
4174       if (c < 0xA0)
4175         {
4176           /* C is the first byte of SJIS character code,
4177              or a leading-code of Emacs' internal format (emacs-mule),
4178              or the first byte of UTF-16.  */
4179           try = (CODING_CATEGORY_MASK_SJIS
4180                   | CODING_CATEGORY_MASK_EMACS_MULE
4181                   | CODING_CATEGORY_MASK_UTF_16_BE
4182                   | CODING_CATEGORY_MASK_UTF_16_LE);
4183
4184           /* Or, if C is a special latin extra code,
4185              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4186              or is an ISO2022 control-sequence-introducer (CSI),
4187              we should also consider the possibility of ISO2022 codings.  */
4188           if ((VECTORP (Vlatin_extra_code_table)
4189                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4190               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4191               || (c == ISO_CODE_CSI
4192                   && (src < src_end
4193                       && (*src == ']'
4194                           || ((*src == '0' || *src == '1' || *src == '2')
4195                               && src + 1 < src_end
4196                               && src[1] == ']')))))
4197             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4198                      | CODING_CATEGORY_MASK_ISO_8BIT);
4199         }
4200       else
4201         /* C is a character of ISO2022 in graphic plane right,
4202            or a SJIS's 1-byte character code (i.e. JISX0201),
4203            or the first byte of BIG5's 2-byte code,
4204            or the first byte of UTF-8/16.  */
4205         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4206                 | CODING_CATEGORY_MASK_ISO_8BIT
4207                 | CODING_CATEGORY_MASK_SJIS
4208                 | CODING_CATEGORY_MASK_BIG5
4209                 | CODING_CATEGORY_MASK_UTF_8
4210                 | CODING_CATEGORY_MASK_UTF_16_BE
4211                 | CODING_CATEGORY_MASK_UTF_16_LE);
4212
4213       /* Or, we may have to consider the possibility of CCL.  */
4214       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4215           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4216               ->spec.ccl.valid_codes)[c])
4217         try |= CODING_CATEGORY_MASK_CCL;
4218
4219       mask = 0;
4220       utf16_examined_p = iso2022_examined_p = 0;
4221       if (priorities)
4222         {
4223           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4224             {
4225               if (!iso2022_examined_p
4226                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4227                 {
4228                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4229                   iso2022_examined_p = 1;
4230                 }
4231               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4232                 mask |= detect_coding_sjis (src, src_end, multibytep);
4233               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4234                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4235               else if (!utf16_examined_p
4236                        && (priorities[i] & try &
4237                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4238                 {
4239                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4240                   utf16_examined_p = 1;
4241                 }
4242               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4243                 mask |= detect_coding_big5 (src, src_end, multibytep);
4244               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4245                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4246               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4247                 mask |= detect_coding_ccl (src, src_end, multibytep);
4248               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4249                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4250               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4251                 mask |= CODING_CATEGORY_MASK_BINARY;
4252               if (mask & priorities[i])
4253                 return priorities[i];
4254             }
4255           return CODING_CATEGORY_MASK_RAW_TEXT;
4256         }
4257       if (try & CODING_CATEGORY_MASK_ISO)
4258         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4259       if (try & CODING_CATEGORY_MASK_SJIS)
4260         mask |= detect_coding_sjis (src, src_end, multibytep);
4261       if (try & CODING_CATEGORY_MASK_BIG5)
4262         mask |= detect_coding_big5 (src, src_end, multibytep);
4263       if (try & CODING_CATEGORY_MASK_UTF_8)
4264         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4265       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4266         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4267       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4268         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4269       if (try & CODING_CATEGORY_MASK_CCL)
4270         mask |= detect_coding_ccl (src, src_end, multibytep);
4271     }
4272   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4273 }
4274
4275 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4276    The information of the detected coding system is set in CODING.  */
4277
4278 void
4279 detect_coding (coding, src, src_bytes)
4280      struct coding_system *coding;
4281      const unsigned char *src;
4282      int src_bytes;
4283 {
4284   unsigned int idx;
4285   int skip, mask;
4286   Lisp_Object val;
4287
4288   val = Vcoding_category_list;
4289   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4290                              coding->src_multibyte);
4291   coding->heading_ascii = skip;
4292
4293   if (!mask) return;
4294
4295   /* We found a single coding system of the highest priority in MASK.  */
4296   idx = 0;
4297   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4298   if (! mask)
4299     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4300
4301   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4302
4303   if (coding->eol_type != CODING_EOL_UNDECIDED)
4304     {
4305       Lisp_Object tmp;
4306
4307       tmp = Fget (val, Qeol_type);
4308       if (VECTORP (tmp))
4309         val = XVECTOR (tmp)->contents[coding->eol_type];
4310     }
4311
4312   /* Setup this new coding system while preserving some slots.  */
4313   {
4314     int src_multibyte = coding->src_multibyte;
4315     int dst_multibyte = coding->dst_multibyte;
4316
4317     setup_coding_system (val, coding);
4318     coding->src_multibyte = src_multibyte;
4319     coding->dst_multibyte = dst_multibyte;
4320     coding->heading_ascii = skip;
4321   }
4322 }
4323
4324 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4325    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4326    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4327
4328    How many non-eol characters are at the head is returned as *SKIP.  */
4329
4330 #define MAX_EOL_CHECK_COUNT 3
4331
4332 static int
4333 detect_eol_type (source, src_bytes, skip)
4334      unsigned char *source;
4335      int src_bytes, *skip;
4336 {
4337   unsigned char *src = source, *src_end = src + src_bytes;
4338   unsigned char c;
4339   int total = 0;                /* How many end-of-lines are found so far.  */
4340   int eol_type = CODING_EOL_UNDECIDED;
4341   int this_eol_type;
4342
4343   *skip = 0;
4344
4345   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4346     {
4347       c = *src++;
4348       if (c == '\n' || c == '\r')
4349         {
4350           if (*skip == 0)
4351             *skip = src - 1 - source;
4352           total++;
4353           if (c == '\n')
4354             this_eol_type = CODING_EOL_LF;
4355           else if (src >= src_end || *src != '\n')
4356             this_eol_type = CODING_EOL_CR;
4357           else
4358             this_eol_type = CODING_EOL_CRLF, src++;
4359
4360           if (eol_type == CODING_EOL_UNDECIDED)
4361             /* This is the first end-of-line.  */
4362             eol_type = this_eol_type;
4363           else if (eol_type != this_eol_type)
4364             {
4365               /* The found type is different from what found before.  */
4366               eol_type = CODING_EOL_INCONSISTENT;
4367               break;
4368             }
4369         }
4370     }
4371
4372   if (*skip == 0)
4373     *skip = src_end - source;
4374   return eol_type;
4375 }
4376
4377 /* Like detect_eol_type, but detect EOL type in 2-octet
4378    big-endian/little-endian format for coding systems utf-16-be and
4379    utf-16-le.  */
4380
4381 static int
4382 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4383      unsigned char *source;
4384      int src_bytes, *skip, big_endian_p;
4385 {
4386   unsigned char *src = source, *src_end = src + src_bytes;
4387   unsigned int c1, c2;
4388   int total = 0;                /* How many end-of-lines are found so far.  */
4389   int eol_type = CODING_EOL_UNDECIDED;
4390   int this_eol_type;
4391   int msb, lsb;
4392
4393   if (big_endian_p)
4394     msb = 0, lsb = 1;
4395   else
4396     msb = 1, lsb = 0;
4397
4398   *skip = 0;
4399
4400   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4401     {
4402       c1 = (src[msb] << 8) | (src[lsb]);
4403       src += 2;
4404
4405       if (c1 == '\n' || c1 == '\r')
4406         {
4407           if (*skip == 0)
4408             *skip = src - 2 - source;
4409           total++;
4410           if (c1 == '\n')
4411             {
4412               this_eol_type = CODING_EOL_LF;
4413             }
4414           else
4415             {
4416               if ((src + 1) >= src_end)
4417                 {
4418                   this_eol_type = CODING_EOL_CR;
4419                 }
4420               else
4421                 {
4422                   c2 = (src[msb] << 8) | (src[lsb]);
4423                   if (c2 == '\n')
4424                     this_eol_type = CODING_EOL_CRLF, src += 2;
4425                   else
4426                     this_eol_type = CODING_EOL_CR;
4427                 }
4428             }
4429
4430           if (eol_type == CODING_EOL_UNDECIDED)
4431             /* This is the first end-of-line.  */
4432             eol_type = this_eol_type;
4433           else if (eol_type != this_eol_type)
4434             {
4435               /* The found type is different from what found before.  */
4436               eol_type = CODING_EOL_INCONSISTENT;
4437               break;
4438             }
4439         }
4440     }
4441
4442   if (*skip == 0)
4443     *skip = src_end - source;
4444   return eol_type;
4445 }
4446
4447 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4448    is encoded.  If it detects an appropriate format of end-of-line, it
4449    sets the information in *CODING.  */
4450
4451 void
4452 detect_eol (coding, src, src_bytes)
4453      struct coding_system *coding;
4454      const unsigned char *src;
4455      int src_bytes;
4456 {
4457   Lisp_Object val;
4458   int skip;
4459   int eol_type;
4460
4461   switch (coding->category_idx)
4462     {
4463     case CODING_CATEGORY_IDX_UTF_16_BE:
4464       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4465       break;
4466     case CODING_CATEGORY_IDX_UTF_16_LE:
4467       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4468       break;
4469     default:
4470       eol_type = detect_eol_type (src, src_bytes, &skip);
4471       break;
4472     }
4473
4474   if (coding->heading_ascii > skip)
4475     coding->heading_ascii = skip;
4476   else
4477     skip = coding->heading_ascii;
4478
4479   if (eol_type == CODING_EOL_UNDECIDED)
4480     return;
4481   if (eol_type == CODING_EOL_INCONSISTENT)
4482     {
4483 #if 0
4484       /* This code is suppressed until we find a better way to
4485          distinguish raw text file and binary file.  */
4486
4487       /* If we have already detected that the coding is raw-text, the
4488          coding should actually be no-conversion.  */
4489       if (coding->type == coding_type_raw_text)
4490         {
4491           setup_coding_system (Qno_conversion, coding);
4492           return;
4493         }
4494       /* Else, let's decode only text code anyway.  */
4495 #endif /* 0 */
4496       eol_type = CODING_EOL_LF;
4497     }
4498
4499   val = Fget (coding->symbol, Qeol_type);
4500   if (VECTORP (val) && XVECTOR (val)->size == 3)
4501     {
4502       int src_multibyte = coding->src_multibyte;
4503       int dst_multibyte = coding->dst_multibyte;
4504       struct composition_data *cmp_data = coding->cmp_data;
4505
4506       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4507       coding->src_multibyte = src_multibyte;
4508       coding->dst_multibyte = dst_multibyte;
4509       coding->heading_ascii = skip;
4510       coding->cmp_data = cmp_data;
4511     }
4512 }
4513
/* Extra room (in bytes) added to every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to estimate the size
   of a decoding buffer for CODING (a pointer to struct
   coding_system): 3 for ISO2022, the magnification recorded in the
   CCL decoder for CCL-based coding systems, 2 otherwise.  CODING is
   parenthesized so that any pointer expression can be passed.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4522
4523 /* Return maximum size (bytes) of a buffer enough for decoding
4524    SRC_BYTES of text encoded in CODING.  */
4525
4526 int
4527 decoding_buffer_size (coding, src_bytes)
4528      struct coding_system *coding;
4529      int src_bytes;
4530 {
4531   return (src_bytes * DECODING_BUFFER_MAG (coding)
4532           + CONVERSION_BUFFER_EXTRA_ROOM);
4533 }
4534
4535 /* Return maximum size (bytes) of a buffer enough for encoding
4536    SRC_BYTES of text to CODING.  */
4537
4538 int
4539 encoding_buffer_size (coding, src_bytes)
4540      struct coding_system *coding;
4541      int src_bytes;
4542 {
4543   int magnification;
4544
4545   if (coding->type == coding_type_ccl)
4546     {
4547       magnification = coding->spec.ccl.encoder.buf_magnification;
4548       if (coding->eol_type == CODING_EOL_CRLF)
4549         magnification *= 2;
4550     }
4551   else if (CODING_REQUIRE_ENCODING (coding))
4552     magnification = 3;
4553   else
4554     magnification = 1;
4555
4556   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4557 }
4558
/* Working buffer for code conversion.  It is set up by
   allocate_conversion_buffer, grown by extend_conversion_buffer and
   released by free_conversion_buffer.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated at DATA.  */
  int on_stack;                 /* 1 if allocated by alloca, 0 if
                                   allocated by xmalloc/xrealloc.  */
  unsigned char *data;          /* The memory itself.  */
};
4566
/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   The memory comes from alloca if LEN is less than MAX_ALLOCA, from
   xmalloc otherwise; BUF.on_stack records which.  This must stay a
   macro because alloca memory belongs to the calling function.

   BUF and LEN are parenthesized so that any expression may be
   passed, but note that LEN is still evaluated more than once.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4582
4583 /* Double the allocated memory for *BUF.  */
4584 static void
4585 extend_conversion_buffer (buf)
4586      struct conversion_buffer *buf;
4587 {
4588   if (buf->on_stack)
4589     {
4590       unsigned char *save = buf->data;
4591       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4592       bcopy (save, buf->data, buf->size);
4593       buf->on_stack = 0;
4594     }
4595   else
4596     {
4597       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4598     }
4599   buf->size *= 2;
4600 }
4601
4602 /* Free the allocated memory for BUF if it is not on stack.  */
4603 static void
4604 free_conversion_buffer (buf)
4605      struct conversion_buffer *buf;
4606 {
4607   if (!buf->on_stack)
4608     xfree (buf->data);
4609 }
4610
/* Run the CCL program of CODING on the SRC_BYTES bytes at SOURCE,
   storing the output at DESTINATION.  The encoder program is used if
   ENCODEP is nonzero, the decoder program otherwise.  DST_BYTES is
   handed to ccl_driver as the destination size (reduced by the bytes
   carried over from the previous call, if any).

   Set CODING->produced and CODING->produced_char here, let
   ccl_driver set CODING->consumed, and set CODING->result to one of
   CODING_FINISH_XXX according to the final status of the CCL
   program.  Return CODING->result.  */

int
ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
     struct coding_system *coding;
     unsigned char *source, *destination;
     int src_bytes, dst_bytes, encodep;
{
  struct ccl_program *ccl
    = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
  unsigned char *dst = destination;

  ccl->suppress_error = coding->suppress_error;
  ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
  if (encodep)
    {
      /* On encoding, EOL format is converted within ccl_driver.  For
         that, setup proper information in the structure CCL.  */
      ccl->eol_type = coding->eol_type;
      if (ccl->eol_type ==CODING_EOL_UNDECIDED)
        ccl->eol_type = CODING_EOL_LF;
      ccl->cr_consumed = coding->spec.ccl.cr_carryover;
      ccl->eight_bit_control = coding->dst_multibyte;
    }
  else
    ccl->eight_bit_control = 1;
  ccl->multibyte = coding->src_multibyte;
  if (coding->spec.ccl.eight_bit_carryover[0] != 0)
    {
      /* Move carryover bytes (a NUL-terminated sequence saved by a
         previous call) to DESTINATION.  */
      unsigned char *p = coding->spec.ccl.eight_bit_carryover;
      while (*p)
        *dst++ = *p++;
      coding->spec.ccl.eight_bit_carryover[0] = 0;
      /* A zero DST_BYTES is left untouched.  */
      if (dst_bytes)
        dst_bytes -= dst - destination;
    }

  /* The bytes produced are those of ccl_driver plus the carryover
     bytes already stored in front of them.  */
  coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
                                  &(coding->consumed))
                      + dst - destination);

  if (encodep)
    {
      /* The encoded result is counted byte by byte.  */
      coding->produced_char = coding->produced;
      coding->spec.ccl.cr_carryover = ccl->cr_consumed;
    }
  else if (!ccl->eight_bit_control)
    {
      /* The produced bytes form a valid multibyte sequence.  */
      coding->produced_char
        = multibyte_chars_in_text (destination, coding->produced);
      coding->spec.ccl.eight_bit_carryover[0] = 0;
    }
  else
    {
      /* On decoding, the destination should always be multibyte.
         But, the CCL program might have generated an invalid
         multibyte sequence.  Here we make such a sequence valid as
         multibyte.  */
      int bytes
        = dst_bytes ? dst_bytes : source + coding->consumed - destination;

      if ((coding->consumed < src_bytes
           || !ccl->last_block)
          && coding->produced >= 1
          && destination[coding->produced - 1] >= 0x80)
        {
          /* We should not convert the trailing 8-bit codes to
             multibyte form even if they don't form a valid
             multibyte sequence.  They may form a valid sequence in
             the next call.  */
          int carryover = 0;

          /* Look back at most three bytes for a byte in the range
             0x80..0x9F; CARRYOVER becomes the number of bytes from
             that byte to the end of the produced text.  */
          if (destination[coding->produced - 1] < 0xA0)
            carryover = 1;
          else if (coding->produced >= 2)
            {
              if (destination[coding->produced - 2] >= 0x80)
                {
                  if (destination[coding->produced - 2] < 0xA0)
                    carryover = 2;
                  else if (coding->produced >= 3
                           && destination[coding->produced - 3] >= 0x80
                           && destination[coding->produced - 3] < 0xA0)
                    carryover = 3;
                }
            }
          if (carryover > 0)
            {
              /* Save the tail as a NUL-terminated sequence for the
                 next call and exclude it from the produced text.  */
              BCOPY_SHORT (destination + coding->produced - carryover,
                           coding->spec.ccl.eight_bit_carryover,
                           carryover);
              coding->spec.ccl.eight_bit_carryover[carryover] = 0;
              coding->produced -= carryover;
            }
        }
      coding->produced = str_as_multibyte (destination, bytes,
                                           coding->produced,
                                           &(coding->produced_char));
    }

  /* Translate the final status of the CCL program into a
     CODING_FINISH_XXX code.  */
  switch (ccl->status)
    {
    case CCL_STAT_SUSPEND_BY_SRC:
      coding->result = CODING_FINISH_INSUFFICIENT_SRC;
      break;
    case CCL_STAT_SUSPEND_BY_DST:
      coding->result = CODING_FINISH_INSUFFICIENT_DST;
      break;
    case CCL_STAT_QUIT:
    case CCL_STAT_INVALID_CMD:
      coding->result = CODING_FINISH_INTERRUPT;
      break;
    default:
      coding->result = CODING_FINISH_NORMAL;
      break;
    }
  return coding->result;
}
4729
4730 /* Decode EOL format of the text at PTR of BYTES length destructively
4731    according to CODING->eol_type.  This is called after the CCL
4732    program produced a decoded text at PTR.  If we do CRLF->LF
4733    conversion, update CODING->produced and CODING->produced_char.  */
4734
4735 static void
4736 decode_eol_post_ccl (coding, ptr, bytes)
4737      struct coding_system *coding;
4738      unsigned char *ptr;
4739      int bytes;
4740 {
4741   Lisp_Object val, saved_coding_symbol;
4742   unsigned char *pend = ptr + bytes;
4743   int dummy;
4744
4745   /* Remember the current coding system symbol.  We set it back when
4746      an inconsistent EOL is found so that `last-coding-system-used' is
4747      set to the coding system that doesn't specify EOL conversion.  */
4748   saved_coding_symbol = coding->symbol;
4749
4750   coding->spec.ccl.cr_carryover = 0;
4751   if (coding->eol_type == CODING_EOL_UNDECIDED)
4752     {
4753       /* Here, to avoid the call of setup_coding_system, we directly
4754          call detect_eol_type.  */
4755       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4756       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4757         coding->eol_type = CODING_EOL_LF;
4758       if (coding->eol_type != CODING_EOL_UNDECIDED)
4759         {
4760           val = Fget (coding->symbol, Qeol_type);
4761           if (VECTORP (val) && XVECTOR (val)->size == 3)
4762             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4763         }
4764       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4765     }
4766
4767   if (coding->eol_type == CODING_EOL_LF
4768       || coding->eol_type == CODING_EOL_UNDECIDED)
4769     {
4770       /* We have nothing to do.  */
4771       ptr = pend;
4772     }
4773   else if (coding->eol_type == CODING_EOL_CRLF)
4774     {
4775       unsigned char *pstart = ptr, *p = ptr;
4776
4777       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4778           && *(pend - 1) == '\r')
4779         {
4780           /* If the last character is CR, we can't handle it here
4781              because LF will be in the not-yet-decoded source text.
4782              Record that the CR is not yet processed.  */
4783           coding->spec.ccl.cr_carryover = 1;
4784           coding->produced--;
4785           coding->produced_char--;
4786           pend--;
4787         }
4788       while (ptr < pend)
4789         {
4790           if (*ptr == '\r')
4791             {
4792               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4793                 {
4794                   *p++ = '\n';
4795                   ptr += 2;
4796                 }
4797               else
4798                 {
4799                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4800                     goto undo_eol_conversion;
4801                   *p++ = *ptr++;
4802                 }
4803             }
4804           else if (*ptr == '\n'
4805                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4806             goto undo_eol_conversion;
4807           else
4808             *p++ = *ptr++;
4809           continue;
4810
4811         undo_eol_conversion:
4812           /* We have faced with inconsistent EOL format at PTR.
4813              Convert all LFs before PTR back to CRLFs.  */
4814           for (p--, ptr--; p >= pstart; p--)
4815             {
4816               if (*p == '\n')
4817                 *ptr-- = '\n', *ptr-- = '\r';
4818               else
4819                 *ptr-- = *p;
4820             }
4821           /*  If carryover is recorded, cancel it because we don't
4822               convert CRLF anymore.  */
4823           if (coding->spec.ccl.cr_carryover)
4824             {
4825               coding->spec.ccl.cr_carryover = 0;
4826               coding->produced++;
4827               coding->produced_char++;
4828               pend++;
4829             }
4830           p = ptr = pend;
4831           coding->eol_type = CODING_EOL_LF;
4832           coding->symbol = saved_coding_symbol;
4833         }
4834       if (p < pend)
4835         {
4836           /* As each two-byte sequence CRLF was converted to LF, (PEND
4837              - P) is the number of deleted characters.  */
4838           coding->produced -= pend - p;
4839           coding->produced_char -= pend - p;
4840         }
4841     }
4842   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4843     {
4844       unsigned char *p = ptr;
4845
4846       for (; ptr < pend; ptr++)
4847         {
4848           if (*ptr == '\r')
4849             *ptr = '\n';
4850           else if (*ptr == '\n'
4851                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4852             {
4853               for (; p < ptr; p++)
4854                 {
4855                   if (*p == '\n')
4856                     *p = '\r';
4857                 }
4858               ptr = pend;
4859               coding->eol_type = CODING_EOL_LF;
4860               coding->symbol = saved_coding_symbol;
4861             }
4862         }
4863     }
4864 }
4865
/* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
   decoding, it may detect coding system and format of end-of-line if
   those are not yet decided.  The source should be unibyte, the
   result is multibyte if CODING->dst_multibyte is nonzero, else
   unibyte.

   The counters CODING->produced, produced_char, consumed,
   consumed_char and errors are reset on entry.  The value returned
   is CODING->result.  */

int
decode_coding (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  /* Number of bytes put at the head of DESTINATION for a CR carried
     over from the previous call (CCL-based coding systems only).  */
  int extra = 0;

  if (coding->type == coding_type_undecided)
    detect_coding (coding, source, src_bytes);

  /* For CCL-based coding systems, EOL detection is left to
     decode_eol_post_ccl, which works on the decoded text.  */
  if (coding->eol_type == CODING_EOL_UNDECIDED
      && coding->type != coding_type_ccl)
    {
      detect_eol (coding, source, src_bytes);
      /* We had better recover the original eol format if we
         encounter an inconsistent eol format while decoding.  */
      coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
    }

  coding->produced = coding->produced_char = 0;
  coding->consumed = coding->consumed_char = 0;
  coding->errors = 0;
  coding->result = CODING_FINISH_NORMAL;

  /* Dispatch to the decoder for the type of CODING.  */
  switch (coding->type)
    {
    case coding_type_sjis:
      decode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 1);
      break;

    case coding_type_iso2022:
      decode_coding_iso2022 (coding, source, destination,
                             src_bytes, dst_bytes);
      break;

    case coding_type_big5:
      decode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 0);
      break;

    case coding_type_emacs_mule:
      decode_coding_emacs_mule (coding, source, destination,
                                src_bytes, dst_bytes);
      break;

    case coding_type_ccl:
      if (coding->spec.ccl.cr_carryover)
        {
          /* Put the CR which was not processed by the previous call
             of decode_eol_post_ccl in DESTINATION.  It will be
             decoded together with the following LF by the call to
             decode_eol_post_ccl below.  */
          *destination = '\r';
          coding->produced++;
          coding->produced_char++;
          dst_bytes--;
          extra = coding->spec.ccl.cr_carryover;
        }
      ccl_coding_driver (coding, source, destination + extra,
                         src_bytes, dst_bytes, 0);
      if (coding->eol_type != CODING_EOL_LF)
        {
          /* ccl_coding_driver has set the counters anew; count the
             carried-over CR again before decoding EOL.  */
          coding->produced += extra;
          coding->produced_char += extra;
          decode_eol_post_ccl (coding, destination, coding->produced);
        }
      break;

    default:
      decode_eol (coding, source, destination, src_bytes, dst_bytes);
    }

  /* On the last block, a shortage of source is not a problem if all
     the source was consumed anyway.  */
  if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
      && coding->mode & CODING_MODE_LAST_BLOCK
      && coding->consumed == src_bytes)
    coding->result = CODING_FINISH_NORMAL;

  /* Otherwise, on the last block, the unconsumed tail of the source
     can't be decoded by any later call.  Count one error and pass
     the remaining bytes to DESTINATION one by one through
     CHAR_STRING.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
    {
      const unsigned char *src = source + coding->consumed;
      unsigned char *dst = destination + coding->produced;

      src_bytes -= coding->consumed;
      coding->errors++;
      /* NOTE(review): DECODE_COMPOSITION_END is defined elsewhere
         and presumably refers to the locals above by name -- confirm
         before renaming them.  */
      if (COMPOSING_P (coding))
        DECODE_COMPOSITION_END ('1');
      while (src_bytes--)
        {
          int c = *src++;
          dst += CHAR_STRING (c, dst);
          coding->produced_char++;
        }
      coding->consumed = coding->consumed_char = src - source;
      coding->produced = dst - destination;
      coding->result = CODING_FINISH_NORMAL;
    }

  /* The caller wants a unibyte result; convert the produced text in
     place and count characters as bytes.  */
  if (!coding->dst_multibyte)
    {
      coding->produced = str_as_unibyte (destination, coding->produced);
      coding->produced_char = coding->produced;
    }

  return coding->result;
}
4981
/* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
   multibyteness of the source is CODING->src_multibyte, the
   multibyteness of the result is always unibyte.

   The counters CODING->produced, produced_char, consumed,
   consumed_char and errors are reset on entry.  The value returned
   is CODING->result.  */

int
encode_coding (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  coding->produced = coding->produced_char = 0;
  coding->consumed = coding->consumed_char = 0;
  coding->errors = 0;
  coding->result = CODING_FINISH_NORMAL;

  /* Dispatch to the encoder for the type of CODING.  */
  switch (coding->type)
    {
    case coding_type_sjis:
      encode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 1);
      break;

    case coding_type_iso2022:
      encode_coding_iso2022 (coding, source, destination,
                             src_bytes, dst_bytes);
      break;

    case coding_type_big5:
      encode_coding_sjis_big5 (coding, source, destination,
                               src_bytes, dst_bytes, 0);
      break;

    case coding_type_emacs_mule:
      encode_coding_emacs_mule (coding, source, destination,
                                src_bytes, dst_bytes);
      break;

    case coding_type_ccl:
      ccl_coding_driver (coding, source, destination,
                         src_bytes, dst_bytes, 1);
      break;

    default:
      encode_eol (coding, source, destination, src_bytes, dst_bytes);
    }

  /* On the last block, source that the encoder could not consume
     can't be completed by any later call.  Finish the output and
     copy the remaining bytes as they are.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
    {
      const unsigned char *src = source + coding->consumed;
      unsigned char *dst = destination + coding->produced;

      /* NOTE(review): ENCODE_RESET_PLANE_AND_REGISTER is defined
         elsewhere and presumably writes through the local DST --
         confirm before renaming the locals above.  */
      if (coding->type == coding_type_iso2022)
        ENCODE_RESET_PLANE_AND_REGISTER;
      /* Emit ESC '1' if a composition is still in progress.  */
      if (COMPOSING_P (coding))
        *dst++ = ISO_CODE_ESC, *dst++ = '1';
      if (coding->consumed < src_bytes)
        {
          int len = src_bytes - coding->consumed;

          BCOPY_SHORT (src, dst, len);
          if (coding->src_multibyte)
            len = str_as_unibyte (dst, len);
          dst += len;
          coding->consumed = src_bytes;
        }
      coding->produced = coding->produced_char = dst - destination;
      coding->result = CODING_FINISH_NORMAL;
    }

  /* A shortage of source is not a problem if all the source was
     consumed anyway.  */
  if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
      && coding->consumed == src_bytes)
    coding->result = CODING_FINISH_NORMAL;

  return coding->result;
}
5059
5060 /* Scan text in the region between *BEG and *END (byte positions),
5061    skip characters which we don't have to decode by coding system
5062    CODING at the head and tail, then set *BEG and *END to the region
5063    of the text we actually have to convert.  The caller should move
5064    the gap out of the region in advance if the region is from a
5065    buffer.
5066
5067    If STR is not NULL, *BEG and *END are indices into STR.  */
5068
5069 static void
5070 shrink_decoding_region (beg, end, coding, str)
5071      int *beg, *end;
5072      struct coding_system *coding;
5073      unsigned char *str;
5074 {
5075   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5076   int eol_conversion;
5077   Lisp_Object translation_table;
5078
5079   if (coding->type == coding_type_ccl
5080       || coding->type == coding_type_undecided
5081       || coding->eol_type != CODING_EOL_LF
5082       || !NILP (coding->post_read_conversion)
5083       || coding->composing != COMPOSITION_DISABLED)
5084     {
5085       /* We can't skip any data.  */
5086       return;
5087     }
5088   if (coding->type == coding_type_no_conversion
5089       || coding->type == coding_type_raw_text
5090       || coding->type == coding_type_emacs_mule)
5091     {
5092       /* We need no conversion, but don't have to skip any data here.
5093          Decoding routine handles them effectively anyway.  */
5094       return;
5095     }
5096
5097   translation_table = coding->translation_table_for_decode;
5098   if (NILP (translation_table) && !NILP (Venable_character_translation))
5099     translation_table = Vstandard_translation_table_for_decode;
5100   if (CHAR_TABLE_P (translation_table))
5101     {
5102       int i;
5103       for (i = 0; i < 128; i++)
5104         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5105           break;
5106       if (i < 128)
5107         /* Some ASCII character should be translated.  We give up
5108            shrinking.  */
5109         return;
5110     }
5111
5112   if (coding->heading_ascii >= 0)
5113     /* Detection routine has already found how much we can skip at the
5114        head.  */
5115     *beg += coding->heading_ascii;
5116
5117   if (str)
5118     {
5119       begp_orig = begp = str + *beg;
5120       endp_orig = endp = str + *end;
5121     }
5122   else
5123     {
5124       begp_orig = begp = BYTE_POS_ADDR (*beg);
5125       endp_orig = endp = begp + *end - *beg;
5126     }
5127
5128   eol_conversion = (coding->eol_type == CODING_EOL_CR
5129                     || coding->eol_type == CODING_EOL_CRLF);
5130
5131   switch (coding->type)
5132     {
5133     case coding_type_sjis:
5134     case coding_type_big5:
5135       /* We can skip all ASCII characters at the head.  */
5136       if (coding->heading_ascii < 0)
5137         {
5138           if (eol_conversion)
5139             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5140           else
5141             while (begp < endp && *begp < 0x80) begp++;
5142         }
5143       /* We can skip all ASCII characters at the tail except for the
5144          second byte of SJIS or BIG5 code.  */
5145       if (eol_conversion)
5146         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5147       else
5148         while (begp < endp && endp[-1] < 0x80) endp--;
5149       /* Do not consider LF as ascii if preceded by CR, since that
5150          confuses eol decoding. */
5151       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5152         endp++;
5153       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5154         endp++;
5155       break;
5156
5157     case coding_type_iso2022:
5158       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5159         /* We can't skip any data.  */
5160         break;
5161       if (coding->heading_ascii < 0)
5162         {
5163           /* We can skip all ASCII characters at the head except for a
5164              few control codes.  */
5165           while (begp < endp && (c = *begp) < 0x80
5166                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5167                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5168                  && (!eol_conversion || c != ISO_CODE_LF))
5169             begp++;
5170         }
5171       switch (coding->category_idx)
5172         {
5173         case CODING_CATEGORY_IDX_ISO_8_1:
5174         case CODING_CATEGORY_IDX_ISO_8_2:
5175           /* We can skip all ASCII characters at the tail.  */
5176           if (eol_conversion)
5177             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5178           else
5179             while (begp < endp && endp[-1] < 0x80) endp--;
5180           /* Do not consider LF as ascii if preceded by CR, since that
5181              confuses eol decoding. */
5182           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5183             endp++;
5184           break;
5185
5186         case CODING_CATEGORY_IDX_ISO_7:
5187         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5188           {
5189             /* We can skip all characters at the tail except for 8-bit
5190                codes and ESC and the following 2-byte at the tail.  */
5191             unsigned char *eight_bit = NULL;
5192
5193             if (eol_conversion)
5194               while (begp < endp
5195                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5196                 {
5197                   if (!eight_bit && c & 0x80) eight_bit = endp;
5198                   endp--;
5199                 }
5200             else
5201               while (begp < endp
5202                      && (c = endp[-1]) != ISO_CODE_ESC)
5203                 {
5204                   if (!eight_bit && c & 0x80) eight_bit = endp;
5205                   endp--;
5206                 }
5207             /* Do not consider LF as ascii if preceded by CR, since that
5208                confuses eol decoding. */
5209             if (begp < endp && endp < endp_orig
5210                 && endp[-1] == '\r' && endp[0] == '\n')
5211               endp++;
5212             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5213               {
5214                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5215                   /* This is an ASCII designation sequence.  We can
5216                      surely skip the tail.  But, if we have
5217                      encountered an 8-bit code, skip only the codes
5218                      after that.  */
5219                   endp = eight_bit ? eight_bit : endp + 2;
5220                 else
5221                   /* Hmmm, we can't skip the tail.  */
5222                   endp = endp_orig;
5223               }
5224             else if (eight_bit)
5225               endp = eight_bit;
5226           }
5227         }
5228       break;
5229
5230     default:
5231       abort ();
5232     }
5233   *beg += begp - begp_orig;
5234   *end += endp - endp_orig;
5235   return;
5236 }
5237
5238 /* Counterpart of shrink_decoding_region for encoding.  Narrow the
5239    byte range *BEG..*END by stepping over leading and trailing ASCII
5240    bytes.  STR, if non-NULL, is the base address that *BEG and *END
5241    index into; otherwise *BEG is a byte position in the current
5242    buffer.  *BEG and *END are left untouched if nothing can be
5243    skipped.  */
5244
5245 static void
5246 shrink_encoding_region (beg, end, coding, str)
5247      int *beg, *end;
5248      struct coding_system *coding;
5249      unsigned char *str;
5250 {
5251   unsigned char *head, *head_start, *tail, *tail_start;
5252   Lisp_Object table;
5253   int eol_conversion;
5254
5255   /* A CCL program, CR or CRLF eol conversion, or pending composition
5256      data each rule out skipping anything.  */
5257   if (coding->type == coding_type_ccl)
5258     return;
5259   if (coding->eol_type == CODING_EOL_CRLF
5260       || coding->eol_type == CODING_EOL_CR)
5261     return;
5262   if (coding->cmp_data && coding->cmp_data->used > 0)
5263     return;
5264
5265   /* These types need no conversion; the encoding routine handles
5266      them effectively anyway, so no shrinking is attempted.  */
5267   switch (coding->type)
5268     {
5269     case coding_type_no_conversion:
5270     case coding_type_raw_text:
5271     case coding_type_emacs_mule:
5272     case coding_type_undecided:
5273       return;
5274     default:
5275       break;
5276     }
5277
5278   /* Give up if the translation table in effect translates any ASCII
5279      character.  */
5280   table = coding->translation_table_for_encode;
5281   if (NILP (table) && !NILP (Venable_character_translation))
5282     table = Vstandard_translation_table_for_encode;
5283   if (CHAR_TABLE_P (table))
5284     {
5285       int code;
5286
5287       for (code = 0; code < 128; code++)
5288         if (!NILP (CHAR_TABLE_REF (table, code)))
5289           return;
5290     }
5291
5292   head_start = head = str ? str + *beg : BYTE_POS_ADDR (*beg);
5293   tail_start = tail = head + (*end - *beg);
5294
5295   /* Always zero at this point, because the CR and CRLF eol types
5296      already returned above.  */
5297   eol_conversion = (coding->eol_type == CODING_EOL_CR
5298                     || coding->eol_type == CODING_EOL_CRLF);
5299
5300   /* coding->pre_write_conversion is not checked here because the
5301      caller is expected to have handled it already.  */
5302   if (coding->type != coding_type_iso2022
5303       && coding->type != coding_type_sjis
5304       && coding->type != coding_type_big5)
5305     abort ();
5306
5307   /* ISO2022 whose initial designation 0 is not ASCII allows no
5308      skipping at all.  */
5309   if (coding->type != coding_type_iso2022
5310       || CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) == CHARSET_ASCII)
5311     {
5312       if (coding->type == coding_type_iso2022
5313           && (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL))
5314         {
5315           /* Skip the leading ASCII run only up to the last position
5316              in it that directly follows a newline.  */
5317           unsigned char *line_start = head;
5318
5319           for (; head < tail && *head < 0x80; head++)
5320             if (*head == '\n')
5321               line_start = head + 1;
5322           head = line_start;
5323         }
5324       else if (eol_conversion)
5325         while (head < tail && *head < 0x80 && *head != '\n') head++;
5326       else
5327         while (head < tail && *head < 0x80) head++;
5328
5329       /* Then drop ASCII bytes from the tail.  */
5330       if (eol_conversion)
5331         while (head < tail && tail[-1] < 0x80 && tail[-1] != '\n') tail--;
5332       else
5333         while (head < tail && tail[-1] < 0x80) tail--;
5334     }
5335
5336   /* Move *BEG forward and *END backward by the number of bytes skipped.  */
5337   *beg += head - head_start;
5338   *end += tail - tail_start;
5339   return;
5340 }
5341
5342 /* Shrinking has some overhead, so SHRINK_CONVERSION_REGION leaves
5343    regions of at most this many bytes alone.  */
5344 static int shrink_conversion_region_threshhold = 1024;
5345
5346 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5347   do {                                                                  \
5348     if (*(end) - *(beg) <= shrink_conversion_region_threshhold)         \
5349       break;                                                            \
5350     if (encodep)                                                        \
5351       shrink_encoding_region (beg, end, coding, str);                   \
5352     else                                                                \
5353       shrink_decoding_region (beg, end, coding, str);                   \
5354   } while (0)
5355
5356 /* Unwind handler.  ARG is (CODING BUFFER ...): clear the inhibit flag for
5357    pre/post conversion, set Vlast_coding_system_used to CODING, kill BUFFERs.  */
5358 static Lisp_Object
5359 code_convert_region_unwind (arg)
5360      Lisp_Object arg;
5361 {
5362   Lisp_Object rest = XCDR (arg);
5363   Vlast_coding_system_used = XCAR (arg);
5364   inhibit_pre_post_conversion = 0;
5365   for (; ! NILP (rest); rest = XCDR (rest))
5366     Fkill_buffer (XCAR (rest));
5367   return Qnil;
5368 }
5369
5370 /* Record every composition found between FROM and TO of OBJ in the
5371    memory blocks reachable from CODING->cmp_data.  OBJ is a buffer or
5372    a string, defaulting to the current buffer.  */
5373
5374 void
5375 coding_save_composition (coding, from, to, obj)
5376      struct coding_system *coding;
5377      int from, to;
5378      Lisp_Object obj;
5379 {
5380   Lisp_Object prop;
5381   int start, end;
5382
5383   if (coding->composing == COMPOSITION_DISABLED)
5384     return;
5385   if (!coding->cmp_data)
5386     coding_allocate_composition_data (coding, from);
5387
5388   /* Find the first composition; if it begins before FROM, search once
5389      more from its end.  Quit when one extends beyond TO.  */
5390   if (!find_composition (from, to, &start, &end, &prop, obj) || end > to)
5391     return;
5392   if (start < from)
5393     if (!find_composition (end, to, &start, &end, &prop, obj) || end > to)
5394       return;
5395
5396   coding->composing = COMPOSITION_NO;
5397   for (;;)
5398     {
5399       if (COMPOSITION_VALID_P (start, end, prop))
5400         {
5401           enum composition_method method = COMPOSITION_METHOD (prop);
5402           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5403               >= COMPOSITION_DATA_SIZE)
5404             coding_allocate_composition_data (coding, from);
5405           /* Relative compositions need only their start and end;
5406              every other method records its components as well.  */
5407           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5408           if (method != COMPOSITION_RELATIVE)
5409             {
5410               Lisp_Object comps = COMPOSITION_COMPONENTS (prop);
5411               int idx, count;
5412
5413               if (CONSP (comps))
5414                 {
5415                   for (; CONSP (comps); comps = XCDR (comps))
5416                     CODING_ADD_COMPOSITION_COMPONENT
5417                       (coding, XINT (XCAR (comps)));
5418                 }
5419               else if (STRINGP (comps))
5420                 {
5421                   for (idx = 0, count = SCHARS (comps); idx < count; idx++)
5422                     CODING_ADD_COMPOSITION_COMPONENT
5423                       (coding, XINT (Faref (comps, make_number (idx))));
5424                 }
5425               else if (VECTORP (comps))
5426                 {
5427                   for (idx = 0, count = XVECTOR (comps)->size; idx < count; idx++)
5428                     CODING_ADD_COMPOSITION_COMPONENT
5429                       (coding, XINT (XVECTOR (comps)->contents[idx]));
5430                 }
5431               else              /* INTEGERP (comps) */
5432                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (comps));
5433             }
5434           CODING_ADD_COMPOSITION_END (coding, end - from);
5435         }
5436       /* Continue with the next composition, if any lies within TO.  */
5437       start = end;
5438       if (start >= to || !find_composition (start, to, &start, &end, &prop, obj)
5439           || end > to)
5440         break;
5441     }
5442
5443   /* Rewind CODING->cmp_data to the first block of the chain.  */
5444   while (coding->cmp_data->prev)
5445     coding->cmp_data = coding->cmp_data->prev;
5446   coding->cmp_data_start = 0;
5447 }
5448
5449 /* Reflect the composition information saved in CODING->cmp_data to
5450    OBJ.  OBJ is a buffer or a string, defaults to the current buffer.
5451    Each record occupies DATA[0] (at least 4) ints: DATA[1] and DATA[2]
5452    are the positions handed to compose_text, DATA[3] is the composition
5453    method, and the rest are components.  Processing of a block stops
5454    at the first record that is invalid.  */
5455
5456 void
5457 coding_restore_composition (coding, obj)
5458      struct coding_system *coding;
5459      Lisp_Object obj;
5460 {
5461   struct composition_data *cmp_data = coding->cmp_data;
5462
5463   if (!cmp_data)
5464     return;
5465
5466   /* Start from the first block of the chain.  */
5467   while (cmp_data->prev)
5468     cmp_data = cmp_data->prev;
5469
5470   for (; cmp_data; cmp_data = cmp_data->next)
5471     {
5472       int i;
5473
5474       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5475            i += cmp_data->data[i])
5476         {
5477           int *data = cmp_data->data + i;
5478           enum composition_method method;
5479           Lisp_Object components = Qnil;
5480
5481           if (data[0] < 4 || i + data[0] > cmp_data->used)
5482             /* Invalid: header incomplete or record overruns the block.  */
5483             break;
5484           method = (enum composition_method) data[3];
5485           if (method != COMPOSITION_RELATIVE)
5486             {
5487               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5488               int len = data[0] - 4, j;
5489
5490               if (method == COMPOSITION_WITH_RULE_ALTCHARS && len % 2 == 0)
5491                 len --;
5492               if (len < 1 || len > MAX_COMPOSITION_COMPONENTS * 2 - 1)
5493                 /* Invalid composition data; LEN must fit in ARGS.  */
5494                 break;
5495               for (j = 0; j < len; j++)
5496                 args[j] = make_number (data[4 + j]);
5497               components = (method == COMPOSITION_WITH_ALTCHARS
5498                             ? Fstring (len, args)
5499                             : Fvector (len, args));
5500             }
5501           compose_text (data[1], data[2], components, Qnil, obj);
5502         }
5503     }
5504 }
5505
5506 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5507    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) of
5508    the current buffer by coding system CODING.  The return value is
5509    always 0 and currently carries no meaning.
5510
5511    The numbers of characters and bytes consumed and produced are
5512    recorded in CODING (consumed, consumed_char, produced, produced_char;
5513    the early return for a fully skipped region sets only produced*).
5514
5515    If REPLACE is nonzero, we do various things as if the original text
5516    is deleted and a new text is inserted.  See the comments in
5517    replace_range (insdel.c) to know what we are doing.
5518
5519    If REPLACE is zero, it is assumed that the source text is unibyte.
5520    Otherwise, it is assumed that the source text is multibyte.  */
5521
5522 int
5523 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5524      int from, from_byte, to, to_byte, encodep, replace;
5525      struct coding_system *coding;
5526 {
5527   int len = to - from, len_byte = to_byte - from_byte;
5528   int nchars_del = 0, nbytes_del = 0;
5529   int require, inserted, inserted_byte;
5530   int head_skip, tail_skip, total_skip = 0; /* Bytes skipped by shrinking.  */
5531   Lisp_Object saved_coding_symbol;
5532   int first = 1;                /* Nonzero until REQUIRE is re-estimated.  */
5533   unsigned char *src, *dst;
5534   Lisp_Object deletion;
5535   int orig_point = PT, orig_len = len; /* For restoring point at the end.  */
5536   int prev_Z;
5537   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5538
5539   deletion = Qnil;
5540   saved_coding_symbol = coding->symbol;
5541
5542   if (from < PT && PT < to)
5543     {
5544       TEMP_SET_PT_BOTH (from, from_byte);
5545       orig_point = from;
5546     }
5547
5548   if (replace)
5549     {
5550       int saved_from = from;
5551       int saved_inhibit_modification_hooks;
5552
5553       prepare_to_modify_buffer (from, to, &from);
5554       if (saved_from != from)
5555         {
5556           to = from + len;
5557           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5558           len_byte = to_byte - from_byte;
5559         }
5560
5561       /* The code conversion routine can not preserve text properties
5562          for now.  So, we must remove all text properties in the
5563          region.  Here, we must suppress all modification hooks.  */
5564       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5565       inhibit_modification_hooks = 1;
5566       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5567       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5568     }
5569
5570   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5571     {
5572       /* We must detect encoding of text and eol format.  */
5573
5574       if (from < GPT && to > GPT)
5575         move_gap_both (from, from_byte);
5576       if (coding->type == coding_type_undecided)
5577         {
5578           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5579           if (coding->type == coding_type_undecided)
5580             {
5581               /* It seems that the text contains only ASCII, but we
5582                  should not leave it undecided because the deeper
5583                  decoding routine (decode_coding) tries to detect the
5584                  encodings again in vain.  */
5585               coding->type = coding_type_emacs_mule;
5586               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5587               /* As emacs-mule decoder will handle composition, we
5588                  need this setting to allocate coding->cmp_data
5589                  later.  */
5590               coding->composing = COMPOSITION_NO;
5591             }
5592         }
5593       if (coding->eol_type == CODING_EOL_UNDECIDED
5594           && coding->type != coding_type_ccl)
5595         {
5596           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5597           if (coding->eol_type == CODING_EOL_UNDECIDED)
5598             coding->eol_type = CODING_EOL_LF;
5599           /* We had better recover the original eol format if we
5600              encounter an inconsistent eol format while decoding.  */
5601           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5602         }
5603     }
5604
5605   /* Now we convert the text.  */
5606
5607   /* For encoding, we must process pre-write-conversion in advance.  */
5608   if (! inhibit_pre_post_conversion
5609       && encodep
5610       && SYMBOLP (coding->pre_write_conversion)
5611       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5612     {
5613       /* The function in pre-write-conversion may put a new text in a
5614          new buffer.  */
5615       struct buffer *prev = current_buffer;
5616       Lisp_Object new;
5617
5618       record_unwind_protect (code_convert_region_unwind,
5619                              Fcons (Vlast_coding_system_used, Qnil));
5620       /* We should not call any more pre-write/post-read-conversion
5621          functions while this pre-write-conversion is running.  */
5622       inhibit_pre_post_conversion = 1;
5623       call2 (coding->pre_write_conversion,
5624              make_number (from), make_number (to));
5625       inhibit_pre_post_conversion = 0;
5626       /* Discard the unwind protect.  */
5627       specpdl_ptr--;
5628
5629       if (current_buffer != prev)
5630         {
5631           len = ZV - BEGV;
5632           new = Fcurrent_buffer ();
5633           set_buffer_internal_1 (prev);
5634           del_range_2 (from, from_byte, to, to_byte, 0);
5635           TEMP_SET_PT_BOTH (from, from_byte);
5636           insert_from_buffer (XBUFFER (new), 1, len, 0);
5637           Fkill_buffer (new);
5638           if (orig_point >= to)
5639             orig_point += len - orig_len;
5640           else if (orig_point > from)
5641             orig_point = from;
5642           orig_len = len;
5643           to = from + len;
5644           from_byte = CHAR_TO_BYTE (from);
5645           to_byte = CHAR_TO_BYTE (to);
5646           len_byte = to_byte - from_byte;
5647           TEMP_SET_PT_BOTH (from, from_byte);
5648         }
5649     }
5650
5651   if (replace)
5652     {
5653       if (! EQ (current_buffer->undo_list, Qt))
5654         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5655       else
5656         {
5657           nchars_del = to - from;
5658           nbytes_del = to_byte - from_byte;
5659         }
5660     }
5661
5662   if (coding->composing != COMPOSITION_DISABLED)
5663     {
5664       if (encodep)
5665         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5666       else
5667         coding_allocate_composition_data (coding, from);
5668     }
5669
5670   /* Try to skip the leading and trailing ASCIIs.  We can't skip them
5671      if we must run CCL program or there are compositions to
5672      encode.  */
5673   if (coding->type != coding_type_ccl
5674       && (! coding->cmp_data || coding->cmp_data->used == 0))
5675     {
5676       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5677
5678       if (from < GPT && GPT < to)
5679         move_gap_both (from, from_byte);
5680       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5681       if (from_byte == to_byte
5682           && (encodep || NILP (coding->post_read_conversion))
5683           && ! CODING_REQUIRE_FLUSHING (coding))
5684         {
5685           coding->produced = len_byte;
5686           coding->produced_char = len;
5687           if (!replace)
5688             /* We must record and adjust for this new text now.  */
5689             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5690           coding_free_composition_data (coding);
5691           return 0;
5692         }
5693
5694       head_skip = from_byte - from_byte_orig;
5695       tail_skip = to_byte_orig - to_byte;
5696       total_skip = head_skip + tail_skip;
5697       from += head_skip;
5698       to -= tail_skip;
5699       len -= total_skip; len_byte -= total_skip;
5700     }
5701
5702   /* For conversion, we must put the gap before the text in addition to
5703      making the gap larger for efficient decoding.  The required gap
5704      size starts from 2000 which is the magic number used in make_gap.
5705      But, after one batch of conversion, it will be incremented if we
5706      find that it is not enough.  */
5707   require = 2000;
5708
5709   if (GAP_SIZE  < require)
5710     make_gap (require - GAP_SIZE);
5711   move_gap_both (from, from_byte);
5712
5713   inserted = inserted_byte = 0;
5714
5715   GAP_SIZE += len_byte;
5716   ZV -= len;
5717   Z -= len;
5718   ZV_BYTE -= len_byte;
5719   Z_BYTE -= len_byte;
5720
5721   if (GPT - BEG < BEG_UNCHANGED)
5722     BEG_UNCHANGED = GPT - BEG;
5723   if (Z - GPT < END_UNCHANGED)
5724     END_UNCHANGED = Z - GPT;
5725
5726   if (!encodep && coding->src_multibyte)
5727     {
5728       /* Decoding routines expects that the source text is unibyte.
5729          We must convert 8-bit characters of multibyte form to
5730          unibyte.  */
5731       int len_byte_orig = len_byte;
5732       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5733       if (len_byte < len_byte_orig)
5734         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5735                     len_byte);
5736       coding->src_multibyte = 0;
5737     }
5738
5739   for (;;)
5740     {
5741       int result;
5742
5743       /* The buffer memory is now:
5744          +--------+converted-text+---------+-------original-text-------+---+
5745          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5746                   |<---------------------- GAP ----------------------->|  */
5747       src = GAP_END_ADDR - len_byte;
5748       dst = GPT_ADDR + inserted_byte;
5749
5750       if (encodep)
5751         result = encode_coding (coding, src, dst, len_byte, 0);
5752       else
5753         {
5754           if (coding->composing != COMPOSITION_DISABLED)
5755             coding->cmp_data->char_offset = from + inserted;
5756           result = decode_coding (coding, src, dst, len_byte, 0);
5757         }
5758
5759       /* The buffer memory is now:
5760          +--------+-------converted-text----+--+------original-text----+---+
5761          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5762                   |<---------------------- GAP ----------------------->|  */
5763
5764       inserted += coding->produced_char;
5765       inserted_byte += coding->produced;
5766       len_byte -= coding->consumed;
5767
5768       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5769         {
5770           coding_allocate_composition_data (coding, from + inserted);
5771           continue;
5772         }
5773
5774       src += coding->consumed;
5775       dst += coding->produced;
5776
5777       if (result == CODING_FINISH_NORMAL)
5778         {
5779           src += len_byte;
5780           break;
5781         }
5782       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5783         {
5784           unsigned char *pend = dst, *p = pend - inserted_byte;
5785           Lisp_Object eol_type;
5786
5787           /* Encode LFs back to the original eol format (CR or CRLF).  */
5788           if (coding->eol_type == CODING_EOL_CR)
5789             {
5790               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5791             }
5792           else
5793             {
5794               int count = 0;
5795
5796               while (p < pend) if (*p++ == '\n') count++;
5797               if (src - dst < count)
5798                 {
5799                   /* We don't have sufficient room for encoding LFs
5800                      back to CRLF.  We must record converted and
5801                      not-yet-converted text back to the buffer
5802                      content, enlarge the gap, then record them out of
5803                      the buffer contents again.  */
5804                   int add = len_byte + inserted_byte;
5805
5806                   GAP_SIZE -= add;
5807                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5808                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5809                   make_gap (count - GAP_SIZE);
5810                   GAP_SIZE += add;
5811                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5812                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5813                   /* Don't forget to update SRC, DST, and PEND.  */
5814                   src = GAP_END_ADDR - len_byte;
5815                   dst = GPT_ADDR + inserted_byte;
5816                   pend = dst;
5817                 }
5818               inserted += count;
5819               inserted_byte += count;
5820               coding->produced += count;
5821               p = dst = pend + count;
5822               while (count)
5823                 {
5824                   *--p = *--pend;
5825                   if (*p == '\n') count--, *--p = '\r';
5826                 }
5827             }
5828
5829           /* Suppress eol-format conversion in the further conversion.  */
5830           coding->eol_type = CODING_EOL_LF;
5831
5832           /* Set the coding system symbol to that for Unix-like EOL.  */
5833           eol_type = Fget (saved_coding_symbol, Qeol_type);
5834           if (VECTORP (eol_type)
5835               && XVECTOR (eol_type)->size == 3
5836               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5837             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5838           else
5839             coding->symbol = saved_coding_symbol;
5840
5841           continue;
5842         }
5843       if (len_byte <= 0)
5844         {
5845           if (coding->type != coding_type_ccl
5846               || coding->mode & CODING_MODE_LAST_BLOCK)
5847             break;
5848           coding->mode |= CODING_MODE_LAST_BLOCK;
5849           continue;
5850         }
5851       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5852         {
5853           /* The source text ends in invalid codes.  Let's just
5854              make them valid buffer contents, and finish conversion.  */
5855           if (multibyte_p)
5856             {
5857               unsigned char *start = dst;
5858
5859               inserted += len_byte;
5860               while (len_byte--)
5861                 {
5862                   int c = *src++;
5863                   dst += CHAR_STRING (c, dst);
5864                 }
5865
5866               inserted_byte += dst - start;
5867             }
5868           else
5869             {
5870               inserted += len_byte;
5871               inserted_byte += len_byte;
5872               while (len_byte--)
5873                 *dst++ = *src++;
5874             }
5875           break;
5876         }
5877       if (result == CODING_FINISH_INTERRUPT)
5878         {
5879           /* The conversion procedure was interrupted by a user.  */
5880           break;
5881         }
5882       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5883       if (coding->consumed < 1)
5884         {
5885           /* It's quite strange to require more memory without
5886              consuming any bytes.  Perhaps CCL program bug.  */
5887           break;
5888         }
5889       if (first)
5890         {
5891           /* We have just done the first batch of conversion which was
5892              stopped because of insufficient gap.  Let's reconsider the
5893              required gap size (i.e. SRC - DST) now.
5894
5895              We have converted ORIG bytes (== coding->consumed) into
5896              NEW bytes (coding->produced).  To convert the remaining
5897              LEN bytes, we may need REQUIRE bytes of gap, where:
5898                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5899                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5900              Here, we are sure that NEW >= ORIG.  */
5901
5902           if (coding->produced <= coding->consumed)
5903             {
5904               /* This happens because of CCL-based coding system with
5905                  eol-type CRLF.  */
5906               require = 0;
5907             }
5908           else
5909             {
5910               float ratio = coding->produced - coding->consumed;
5911               ratio /= coding->consumed;
5912               require = len_byte * ratio;
5913             }
5914           first = 0;
5915         }
5916       if ((src - dst) < (require + 2000))
5917         {
5918           /* See the comment above the previous call of make_gap.  */
5919           int add = len_byte + inserted_byte;
5920
5921           GAP_SIZE -= add;
5922           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5923           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5924           make_gap (require + 2000);
5925           GAP_SIZE += add;
5926           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5927           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5928         }
5929     }
5930   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5931
5932   if (encodep && coding->dst_multibyte)
5933     {
5934       /* The output is unibyte.  We must convert 8-bit characters to
5935          multibyte form.  */
5936       if (inserted_byte * 2 > GAP_SIZE)
5937         {
5938           GAP_SIZE -= inserted_byte;
5939           ZV += inserted_byte; Z += inserted_byte;
5940           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5941           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5942           make_gap (inserted_byte - GAP_SIZE);
5943           GAP_SIZE += inserted_byte;
5944           ZV -= inserted_byte; Z -= inserted_byte;
5945           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5946           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5947         }
5948       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5949     }
5950
5951   /* If we shrank the conversion area, adjust it now.  */
5952   if (total_skip > 0)
5953     {
5954       if (tail_skip > 0)
5955         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5956       inserted += total_skip; inserted_byte += total_skip;
5957       GAP_SIZE += total_skip;
5958       GPT -= head_skip; GPT_BYTE -= head_skip;
5959       ZV -= total_skip; ZV_BYTE -= total_skip;
5960       Z -= total_skip; Z_BYTE -= total_skip;
5961       from -= head_skip; from_byte -= head_skip;
5962       to += tail_skip; to_byte += tail_skip;
5963     }
5964
5965   prev_Z = Z;
5966   if (! EQ (current_buffer->undo_list, Qt))
5967     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5968   else
5969     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5970                                  inserted, inserted_byte);
5971   inserted = Z - prev_Z;
5972
5973   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5974     coding_restore_composition (coding, Fcurrent_buffer ());
5975   coding_free_composition_data (coding);
5976
5977   if (! inhibit_pre_post_conversion
5978       && ! encodep && ! NILP (coding->post_read_conversion))
5979     {
5980       Lisp_Object val;
5981       Lisp_Object saved_coding_system;
5982
5983       if (from != PT)
5984         TEMP_SET_PT_BOTH (from, from_byte);
5985       prev_Z = Z;
5986       record_unwind_protect (code_convert_region_unwind,
5987                              Fcons (Vlast_coding_system_used, Qnil));
5988       saved_coding_system = Vlast_coding_system_used;
5989       Vlast_coding_system_used = coding->symbol;
5990       /* We should not call any more pre-write/post-read-conversion
5991          functions while this post-read-conversion is running.  */
5992       inhibit_pre_post_conversion = 1;
5993       val = call1 (coding->post_read_conversion, make_number (inserted));
5994       inhibit_pre_post_conversion = 0;
5995       coding->symbol = Vlast_coding_system_used;
5996       Vlast_coding_system_used = saved_coding_system;
5997       /* Discard the unwind protect.  */
5998       specpdl_ptr--;
5999       CHECK_NUMBER (val);
6000       inserted += Z - prev_Z;
6001     }
6002
6003   if (orig_point >= from)
6004     {
6005       if (orig_point >= from + orig_len)
6006         orig_point += inserted - orig_len;
6007       else
6008         orig_point = from;
6009       TEMP_SET_PT (orig_point);
6010     }
6011
6012   if (replace)
6013     {
6014       signal_after_change (from, to - from, inserted);
6015       update_compositions (from, from + inserted, CHECK_BORDER);
6016     }
6017
6018   {
6019     coding->consumed = to_byte - from_byte;
6020     coding->consumed_char = to - from;
6021     coding->produced = inserted_byte;
6022     coding->produced_char = inserted;
6023   }
6024
6025   return 0;
6026 }
6027
6028 /* Name (or base name) of work buffer for code conversion.  */
6029 static Lisp_Object Vcode_conversion_workbuf_name;
6030
6031 /* Set the current buffer to the working buffer prepared for
6032    code-conversion.  MULTIBYTE specifies the multibyteness of the
6033    buffer.  Return the buffer we set if it must be killed after use.
6034    Otherwise return Qnil.  */
6035
6036 static Lisp_Object
6037 set_conversion_work_buffer (multibyte)
6038      int multibyte;
6039 {
6040   Lisp_Object buffer, buffer_to_kill;
6041   struct buffer *buf;
6042
6043   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6044   buf = XBUFFER (buffer);
6045   if (buf == current_buffer)
6046     {
6047       /* As we are already in the work buffer, we must generate a new
6048          buffer for the work.  */
6049       Lisp_Object name;
6050
6051       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6052       buffer = buffer_to_kill = Fget_buffer_create (name);
6053       buf = XBUFFER (buffer);
6054     }
6055   else
6056     buffer_to_kill = Qnil;
6057
6058   delete_all_overlays (buf);
6059   buf->directory = current_buffer->directory;
6060   buf->read_only = Qnil;
6061   buf->filename = Qnil;
6062   buf->undo_list = Qt;
6063   eassert (buf->overlays_before == NULL);
6064   eassert (buf->overlays_after == NULL);
6065   set_buffer_internal (buf);
6066   if (BEG != BEGV || Z != ZV)
6067     Fwiden ();
6068   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6069   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6070   return buffer_to_kill;
6071 }
6072
6073 Lisp_Object
6074 run_pre_post_conversion_on_str (str, coding, encodep)
6075      Lisp_Object str;
6076      struct coding_system *coding;
6077      int encodep;
6078 {
6079   int count = SPECPDL_INDEX ();
6080   struct gcpro gcpro1, gcpro2;
6081   int multibyte = STRING_MULTIBYTE (str);
6082   Lisp_Object old_deactivate_mark;
6083   Lisp_Object buffer_to_kill;
6084   Lisp_Object unwind_arg;
6085
6086   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6087   /* It is not crucial to specbind this.  */
6088   old_deactivate_mark = Vdeactivate_mark;
6089   GCPRO2 (str, old_deactivate_mark);
6090
6091   /* We must insert the contents of STR as is without
6092      unibyte<->multibyte conversion.  For that, we adjust the
6093      multibyteness of the working buffer to that of STR.  */
6094   buffer_to_kill = set_conversion_work_buffer (multibyte);
6095   if (NILP (buffer_to_kill))
6096     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6097   else
6098     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6099   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6100
6101   insert_from_string (str, 0, 0,
6102                       SCHARS (str), SBYTES (str), 0);
6103   UNGCPRO;
6104   inhibit_pre_post_conversion = 1;
6105   if (encodep)
6106     {
6107       struct buffer *prev = current_buffer;
6108
6109       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6110       if (prev != current_buffer)
6111         /* We must kill the current buffer too.  */
6112         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6113     }
6114   else
6115     {
6116       Vlast_coding_system_used = coding->symbol;
6117       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6118       call1 (coding->post_read_conversion, make_number (Z - BEG));
6119       coding->symbol = Vlast_coding_system_used;
6120     }
6121   inhibit_pre_post_conversion = 0;
6122   Vdeactivate_mark = old_deactivate_mark;
6123   str = make_buffer_string (BEG, Z, 1);
6124   return unbind_to (count, str);
6125 }
6126
6127
6128 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6129    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6130    is intended that this function is called from encode_terminal_code,
6131    the pre-write-conversion function is run by safe_call and thus
6132    "Error during redisplay: ..." is logged when an error occurs.
6133
6134    Store the resulting text in *STR and set CODING->produced_char and
6135    CODING->produced to the number of characters and bytes
6136    respectively.  If the size of *STR is too small, enlarge it by
6137    xrealloc and update *STR and *SIZE.  */
6138
6139 void
6140 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6141      unsigned char **str;
6142      int *size, nchars, nbytes;
6143      struct coding_system *coding;
6144 {
6145   struct gcpro gcpro1, gcpro2;
6146   struct buffer *cur = current_buffer;
6147   struct buffer *prev;
6148   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6149   Lisp_Object args[3];
6150   Lisp_Object buffer_to_kill;
6151
6152   /* It is not crucial to specbind this.  */
6153   old_deactivate_mark = Vdeactivate_mark;
6154   old_last_coding_system_used = Vlast_coding_system_used;
6155   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6156
6157   /* We must insert the contents of STR as is without
6158      unibyte<->multibyte conversion.  For that, we adjust the
6159      multibyteness of the working buffer to that of STR.  */
6160   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6161   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6162   UNGCPRO;
6163   inhibit_pre_post_conversion = 1;
6164   prev = current_buffer;
6165   args[0] = coding->pre_write_conversion;
6166   args[1] = make_number (BEG);
6167   args[2] = make_number (Z);
6168   safe_call (3, args);
6169   inhibit_pre_post_conversion = 0;
6170   Vdeactivate_mark = old_deactivate_mark;
6171   Vlast_coding_system_used = old_last_coding_system_used;
6172   coding->produced_char = Z - BEG;
6173   coding->produced = Z_BYTE - BEG_BYTE;
6174   if (coding->produced > *size)
6175     {
6176       *size = coding->produced;
6177       *str = xrealloc (*str, *size);
6178     }
6179   if (BEG < GPT && GPT < Z)
6180     move_gap (BEG);
6181   bcopy (BEG_ADDR, *str, coding->produced);
6182   coding->src_multibyte
6183     = ! NILP (current_buffer->enable_multibyte_characters);
6184   if (prev != current_buffer)
6185     Fkill_buffer (Fcurrent_buffer ());
6186   set_buffer_internal (cur);
6187   if (! NILP (buffer_to_kill))
6188     Fkill_buffer (buffer_to_kill);
6189 }
6190
6191
6192 Lisp_Object
6193 decode_coding_string (str, coding, nocopy)
6194      Lisp_Object str;
6195      struct coding_system *coding;
6196      int nocopy;
6197 {
6198   int len;
6199   struct conversion_buffer buf;
6200   int from, to_byte;
6201   Lisp_Object saved_coding_symbol;
6202   int result;
6203   int require_decoding;
6204   int shrinked_bytes = 0;
6205   Lisp_Object newstr;
6206   int consumed, consumed_char, produced, produced_char;
6207
6208   from = 0;
6209   to_byte = SBYTES (str);
6210
6211   saved_coding_symbol = coding->symbol;
6212   coding->src_multibyte = STRING_MULTIBYTE (str);
6213   coding->dst_multibyte = 1;
6214   if (CODING_REQUIRE_DETECTION (coding))
6215     {
6216       /* See the comments in code_convert_region.  */
6217       if (coding->type == coding_type_undecided)
6218         {
6219           detect_coding (coding, SDATA (str), to_byte);
6220           if (coding->type == coding_type_undecided)
6221             {
6222               coding->type = coding_type_emacs_mule;
6223               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6224               /* As emacs-mule decoder will handle composition, we
6225                  need this setting to allocate coding->cmp_data
6226                  later.  */
6227               coding->composing = COMPOSITION_NO;
6228             }
6229         }
6230       if (coding->eol_type == CODING_EOL_UNDECIDED
6231           && coding->type != coding_type_ccl)
6232         {
6233           saved_coding_symbol = coding->symbol;
6234           detect_eol (coding, SDATA (str), to_byte);
6235           if (coding->eol_type == CODING_EOL_UNDECIDED)
6236             coding->eol_type = CODING_EOL_LF;
6237           /* We had better recover the original eol format if we
6238              encounter an inconsistent eol format while decoding.  */
6239           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6240         }
6241     }
6242
6243   if (coding->type == coding_type_no_conversion
6244       || coding->type == coding_type_raw_text)
6245     coding->dst_multibyte = 0;
6246
6247   require_decoding = CODING_REQUIRE_DECODING (coding);
6248
6249   if (STRING_MULTIBYTE (str))
6250     {
6251       /* Decoding routines expect the source text to be unibyte.  */
6252       str = Fstring_as_unibyte (str);
6253       to_byte = SBYTES (str);
6254       nocopy = 1;
6255       coding->src_multibyte = 0;
6256     }
6257
6258   /* Try to skip the heading and tailing ASCIIs.  */
6259   if (require_decoding && coding->type != coding_type_ccl)
6260     {
6261       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6262                                 0);
6263       if (from == to_byte)
6264         require_decoding = 0;
6265       shrinked_bytes = from + (SBYTES (str) - to_byte);
6266     }
6267
6268   if (!require_decoding
6269       && !(SYMBOLP (coding->post_read_conversion)
6270            && !NILP (Ffboundp (coding->post_read_conversion))))
6271     {
6272       coding->consumed = SBYTES (str);
6273       coding->consumed_char = SCHARS (str);
6274       if (coding->dst_multibyte)
6275         {
6276           str = Fstring_as_multibyte (str);
6277           nocopy = 1;
6278         }
6279       coding->produced = SBYTES (str);
6280       coding->produced_char = SCHARS (str);
6281       return (nocopy ? str : Fcopy_sequence (str));
6282     }
6283
6284   if (coding->composing != COMPOSITION_DISABLED)
6285     coding_allocate_composition_data (coding, from);
6286   len = decoding_buffer_size (coding, to_byte - from);
6287   allocate_conversion_buffer (buf, len);
6288
6289   consumed = consumed_char = produced = produced_char = 0;
6290   while (1)
6291     {
6292       result = decode_coding (coding, SDATA (str) + from + consumed,
6293                               buf.data + produced, to_byte - from - consumed,
6294                               buf.size - produced);
6295       consumed += coding->consumed;
6296       consumed_char += coding->consumed_char;
6297       produced += coding->produced;
6298       produced_char += coding->produced_char;
6299       if (result == CODING_FINISH_NORMAL
6300           || result == CODING_FINISH_INTERRUPT
6301           || (result == CODING_FINISH_INSUFFICIENT_SRC
6302               && coding->consumed == 0))
6303         break;
6304       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6305         coding_allocate_composition_data (coding, from + produced_char);
6306       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6307         extend_conversion_buffer (&buf);
6308       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6309         {
6310           Lisp_Object eol_type;
6311
6312           /* Recover the original EOL format.  */
6313           if (coding->eol_type == CODING_EOL_CR)
6314             {
6315               unsigned char *p;
6316               for (p = buf.data; p < buf.data + produced; p++)
6317                 if (*p == '\n') *p = '\r';
6318             }
6319           else if (coding->eol_type == CODING_EOL_CRLF)
6320             {
6321               int num_eol = 0;
6322               unsigned char *p0, *p1;
6323               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6324                 if (*p0 == '\n') num_eol++;
6325               if (produced + num_eol >= buf.size)
6326                 extend_conversion_buffer (&buf);
6327               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6328                 {
6329                   *--p1 = *--p0;
6330                   if (*p0 == '\n') *--p1 = '\r';
6331                 }
6332               produced += num_eol;
6333               produced_char += num_eol;
6334             }
6335           /* Suppress eol-format conversion in the further conversion.  */
6336           coding->eol_type = CODING_EOL_LF;
6337
6338           /* Set the coding system symbol to that for Unix-like EOL.  */
6339           eol_type = Fget (saved_coding_symbol, Qeol_type);
6340           if (VECTORP (eol_type)
6341               && XVECTOR (eol_type)->size == 3
6342               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6343             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6344           else
6345             coding->symbol = saved_coding_symbol;
6346
6347
6348         }
6349     }
6350
6351   coding->consumed = consumed;
6352   coding->consumed_char = consumed_char;
6353   coding->produced = produced;
6354   coding->produced_char = produced_char;
6355
6356   if (coding->dst_multibyte)
6357     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6358                                            produced + shrinked_bytes);
6359   else
6360     newstr = make_uninit_string (produced + shrinked_bytes);
6361   if (from > 0)
6362     STRING_COPYIN (newstr, 0, SDATA (str), from);
6363   STRING_COPYIN (newstr, from, buf.data, produced);
6364   if (shrinked_bytes > from)
6365     STRING_COPYIN (newstr, from + produced,
6366                    SDATA (str) + to_byte,
6367                    shrinked_bytes - from);
6368   free_conversion_buffer (&buf);
6369
6370   coding->consumed += shrinked_bytes;
6371   coding->consumed_char += shrinked_bytes;
6372   coding->produced += shrinked_bytes;
6373   coding->produced_char += shrinked_bytes;
6374
6375   if (coding->cmp_data && coding->cmp_data->used)
6376     coding_restore_composition (coding, newstr);
6377   coding_free_composition_data (coding);
6378
6379   if (SYMBOLP (coding->post_read_conversion)
6380       && !NILP (Ffboundp (coding->post_read_conversion)))
6381     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6382
6383   return newstr;
6384 }
6385
6386 Lisp_Object
6387 encode_coding_string (str, coding, nocopy)
6388      Lisp_Object str;
6389      struct coding_system *coding;
6390      int nocopy;
6391 {
6392   int len;
6393   struct conversion_buffer buf;
6394   int from, to, to_byte;
6395   int result;
6396   int shrinked_bytes = 0;
6397   Lisp_Object newstr;
6398   int consumed, consumed_char, produced, produced_char;
6399
6400   if (SYMBOLP (coding->pre_write_conversion)
6401       && !NILP (Ffboundp (coding->pre_write_conversion)))
6402     {
6403       str = run_pre_post_conversion_on_str (str, coding, 1);
6404       /* As STR is just newly generated, we don't have to copy it
6405          anymore.  */
6406       nocopy = 1;
6407     }
6408
6409   from = 0;
6410   to = SCHARS (str);
6411   to_byte = SBYTES (str);
6412
6413   /* Encoding routines determine the multibyteness of the source text
6414      by coding->src_multibyte.  */
6415   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6416   coding->dst_multibyte = 0;
6417   if (! CODING_REQUIRE_ENCODING (coding))
6418     goto no_need_of_encoding;
6419
6420   if (coding->composing != COMPOSITION_DISABLED)
6421     coding_save_composition (coding, from, to, str);
6422
6423   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6424      if we must run CCL program or there are compositions to
6425      encode.  */
6426   if (coding->type != coding_type_ccl
6427       && (! coding->cmp_data || coding->cmp_data->used == 0))
6428     {
6429       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6430                                 1);
6431       if (from == to_byte)
6432         {
6433           coding_free_composition_data (coding);
6434           goto no_need_of_encoding;
6435         }
6436       shrinked_bytes = from + (SBYTES (str) - to_byte);
6437     }
6438
6439   len = encoding_buffer_size (coding, to_byte - from);
6440   allocate_conversion_buffer (buf, len);
6441
6442   consumed = consumed_char = produced = produced_char = 0;
6443   while (1)
6444     {
6445       result = encode_coding (coding, SDATA (str) + from + consumed,
6446                               buf.data + produced, to_byte - from - consumed,
6447                               buf.size - produced);
6448       consumed += coding->consumed;
6449       consumed_char += coding->consumed_char;
6450       produced += coding->produced;
6451       produced_char += coding->produced_char;
6452       if (result == CODING_FINISH_NORMAL
6453           || result == CODING_FINISH_INTERRUPT
6454           || (result == CODING_FINISH_INSUFFICIENT_SRC
6455               && coding->consumed == 0))
6456         break;
6457       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6458       extend_conversion_buffer (&buf);
6459     }
6460
6461   coding->consumed = consumed;
6462   coding->consumed_char = consumed_char;
6463   coding->produced = produced;
6464   coding->produced_char = produced_char;
6465
6466   newstr = make_uninit_string (produced + shrinked_bytes);
6467   if (from > 0)
6468     STRING_COPYIN (newstr, 0, SDATA (str), from);
6469   STRING_COPYIN (newstr, from, buf.data, produced);
6470   if (shrinked_bytes > from)
6471     STRING_COPYIN (newstr, from + produced,
6472                    SDATA (str) + to_byte,
6473                    shrinked_bytes - from);
6474
6475   free_conversion_buffer (&buf);
6476   coding_free_composition_data (coding);
6477
6478   return newstr;
6479
6480  no_need_of_encoding:
6481   coding->consumed = SBYTES (str);
6482   coding->consumed_char = SCHARS (str);
6483   if (STRING_MULTIBYTE (str))
6484     {
6485       if (nocopy)
6486         /* We are sure that STR doesn't contain a multibyte
6487            character.  */
6488         STRING_SET_UNIBYTE (str);
6489       else
6490         {
6491           str = Fstring_as_unibyte (str);
6492           nocopy = 1;
6493         }
6494     }
6495   coding->produced = SBYTES (str);
6496   coding->produced_char = SCHARS (str);
6497   return (nocopy ? str : Fcopy_sequence (str));
6498 }
6499
6500 \f
6501 #ifdef emacs
6502 /*** 8. Emacs Lisp library functions ***/
6503
6504 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6505        doc: /* Return t if OBJECT is nil or a coding-system.
6506 See the documentation of `make-coding-system' for information
6507 about coding-system objects.  */)
6508      (obj)
6509      Lisp_Object obj;
6510 {
6511   if (NILP (obj))
6512     return Qt;
6513   if (!SYMBOLP (obj))
6514     return Qnil;
6515   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6516     return Qt;
6517   /* Get coding-spec vector for OBJ.  */
6518   obj = Fget (obj, Qcoding_system);
6519   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6520           ? Qt : Qnil);
6521 }
6522
6523 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6524        Sread_non_nil_coding_system, 1, 1, 0,
6525        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6526      (prompt)
6527      Lisp_Object prompt;
6528 {
6529   Lisp_Object val;
6530   do
6531     {
6532       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6533                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6534     }
6535   while (SCHARS (val) == 0);
6536   return (Fintern (val, Qnil));
6537 }
6538
6539 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6540        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6541 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6542      (prompt, default_coding_system)
6543      Lisp_Object prompt, default_coding_system;
6544 {
6545   Lisp_Object val;
6546   if (SYMBOLP (default_coding_system))
6547     default_coding_system = SYMBOL_NAME (default_coding_system);
6548   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6549                           Qt, Qnil, Qcoding_system_history,
6550                           default_coding_system, Qnil);
6551   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6552 }
6553
6554 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6555        1, 1, 0,
6556        doc: /* Check validity of CODING-SYSTEM.
6557 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6558 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6559 The value of this property should be a vector of length 5.  */)
6560      (coding_system)
6561      Lisp_Object coding_system;
6562 {
6563   Lisp_Object define_form;
6564
6565   define_form = Fget (coding_system, Qcoding_system_define_form);
6566   if (! NILP (define_form))
6567     {
6568       Fput (coding_system, Qcoding_system_define_form, Qnil);
6569       safe_eval (define_form);
6570     }
6571   if (!NILP (Fcoding_system_p (coding_system)))
6572     return coding_system;
6573   while (1)
6574     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6575 }
6576 \f
6577 Lisp_Object
6578 detect_coding_system (src, src_bytes, highest, multibytep)
6579      const unsigned char *src;
6580      int src_bytes, highest;
6581      int multibytep;
6582 {
6583   int coding_mask, eol_type;
6584   Lisp_Object val, tmp;
6585   int dummy;
6586
6587   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6588   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6589   if (eol_type == CODING_EOL_INCONSISTENT)
6590     eol_type = CODING_EOL_UNDECIDED;
6591
6592   if (!coding_mask)
6593     {
6594       val = Qundecided;
6595       if (eol_type != CODING_EOL_UNDECIDED)
6596         {
6597           Lisp_Object val2;
6598           val2 = Fget (Qundecided, Qeol_type);
6599           if (VECTORP (val2))
6600             val = XVECTOR (val2)->contents[eol_type];
6601         }
6602       return (highest ? val : Fcons (val, Qnil));
6603     }
6604
6605   /* At first, gather possible coding systems in VAL.  */
6606   val = Qnil;
6607   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6608     {
6609       Lisp_Object category_val, category_index;
6610
6611       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6612       category_val = Fsymbol_value (XCAR (tmp));
6613       if (!NILP (category_val)
6614           && NATNUMP (category_index)
6615           && (coding_mask & (1 << XFASTINT (category_index))))
6616         {
6617           val = Fcons (category_val, val);
6618           if (highest)
6619             break;
6620         }
6621     }
6622   if (!highest)
6623     val = Fnreverse (val);
6624
6625   /* Then, replace the elements with subsidiary coding systems.  */
6626   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6627     {
6628       if (eol_type != CODING_EOL_UNDECIDED
6629           && eol_type != CODING_EOL_INCONSISTENT)
6630         {
6631           Lisp_Object eol;
6632           eol = Fget (XCAR (tmp), Qeol_type);
6633           if (VECTORP (eol))
6634             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6635         }
6636     }
6637   return (highest ? XCAR (val) : val);
6638 }
6639
6640 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6641        2, 3, 0,
6642        doc: /* Detect how the byte sequence in the region is encoded.
6643 Return a list of possible coding systems used on decoding a byte
6644 sequence containing the bytes in the region between START and END when
6645 the coding system `undecided' is specified.  The list is ordered by
6646 priority decided in the current language environment.
6647
6648 If only ASCII characters are found, it returns a list of single element
6649 `undecided' or its subsidiary coding system according to a detected
6650 end-of-line format.
6651
6652 If optional argument HIGHEST is non-nil, return the coding system of
6653 highest priority.  */)
6654      (start, end, highest)
6655      Lisp_Object start, end, highest;
6656 {
6657   int from, to;
6658   int from_byte, to_byte;
6659   int include_anchor_byte = 0;
6660
6661   CHECK_NUMBER_COERCE_MARKER (start);
6662   CHECK_NUMBER_COERCE_MARKER (end);
6663
6664   validate_region (&start, &end);
6665   from = XINT (start), to = XINT (end);
6666   from_byte = CHAR_TO_BYTE (from);
6667   to_byte = CHAR_TO_BYTE (to);
6668
6669   if (from < GPT && to >= GPT)
6670     move_gap_both (to, to_byte);
6671   /* If we an anchor byte `\0' follows the region, we include it in
6672      the detecting source.  Then code detectors can handle the tailing
6673      byte sequence more accurately.
6674
6675      Fix me: This is not a perfect solution.  It is better that we
6676      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6677   */
6678   if (to == Z || (to == GPT && GAP_SIZE > 0))
6679     include_anchor_byte = 1;
6680   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6681                                to_byte - from_byte + include_anchor_byte,
6682                                !NILP (highest),
6683                                !NILP (current_buffer
6684                                       ->enable_multibyte_characters));
6685 }
6686
6687 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6688        1, 2, 0,
6689        doc: /* Detect how the byte sequence in STRING is encoded.
6690 Return a list of possible coding systems used on decoding a byte
6691 sequence containing the bytes in STRING when the coding system
6692 `undecided' is specified.  The list is ordered by priority decided in
6693 the current language environment.
6694
6695 If only ASCII characters are found, it returns a list of single element
6696 `undecided' or its subsidiary coding system according to a detected
6697 end-of-line format.
6698
6699 If optional argument HIGHEST is non-nil, return the coding system of
6700 highest priority.  */)
6701      (string, highest)
6702      Lisp_Object string, highest;
6703 {
6704   CHECK_STRING (string);
6705
6706   return detect_coding_system (SDATA (string),
6707                                /* "+ 1" is to include the anchor byte
6708                                   `\0'.  With this, code detectors can
6709                                   handle the tailing bytes more
6710                                   accurately.  */
6711                                SBYTES (string) + 1,
6712                                !NILP (highest),
6713                                STRING_MULTIBYTE (string));
6714 }
6715
6716 /*  Subroutine for Ffind_coding_systems_region_internal.
6717
6718     Return a list of coding systems that safely encode the multibyte
6719     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6720     possible coding systems.  If it is nil, it means that we have not
6721     yet found any coding systems.
6722
6723     WORK_TABLE a char-table of which element is set to t once the
6724     element is looked up.
6725
6726     If a non-ASCII single byte char is found, set
6727     *single_byte_char_found to 1.  */
6728
6729 static Lisp_Object
6730 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6731      unsigned char *p, *pend;
6732      Lisp_Object safe_codings, work_table;
6733      int *single_byte_char_found;
6734 {
6735   int c, len;
6736   Lisp_Object val, ch;
6737   Lisp_Object prev, tail;
6738
6739   if (NILP (safe_codings))
6740     goto done_safe_codings;
6741   while (p < pend)
6742     {
6743       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6744       p += len;
6745       if (ASCII_BYTE_P (c))
6746         /* We can ignore ASCII characters here.  */
6747         continue;
6748       if (SINGLE_BYTE_CHAR_P (c))
6749         *single_byte_char_found = 1;
6750       /* Check the safe coding systems for C.  */
6751       ch = make_number (c);
6752       val = Faref (work_table, ch);
6753       if (EQ (val, Qt))
6754         /* This element was already checked.  Ignore it.  */
6755         continue;
6756       /* Remember that we checked this element.  */
6757       Faset (work_table, ch, Qt);
6758
6759       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6760         {
6761           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6762           int encodable;
6763
6764           elt = XCAR (tail);
6765           if (CONSP (XCDR (elt)))
6766             {
6767               /* This entry has this format now:
6768                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6769                           ACCEPT-LATIN-EXTRA ) */
6770               val = XCDR (elt);
6771               encodable = ! NILP (Faref (XCAR (val), ch));
6772               if (! encodable)
6773                 {
6774                   val = XCDR (val);
6775                   translation_table = XCAR (val);
6776                   hash_table = XCAR (XCDR (val));
6777                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6778                 }
6779             }
6780           else
6781             {
6782               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6783               encodable = ! NILP (Faref (XCDR (elt), ch));
6784               if (! encodable)
6785                 {
6786                   /* Transform the format to:
6787                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6788                        ACCEPT-LATIN-EXTRA )  */
6789                   val = Fget (XCAR (elt), Qcoding_system);
6790                   translation_table
6791                     = Fplist_get (AREF (val, 3),
6792                                   Qtranslation_table_for_encode);
6793                   if (SYMBOLP (translation_table))
6794                     translation_table = Fget (translation_table,
6795                                               Qtranslation_table);
6796                   hash_table
6797                     = (CHAR_TABLE_P (translation_table)
6798                        ? XCHAR_TABLE (translation_table)->extras[1]
6799                        : Qnil);
6800                   accept_latin_extra
6801                     = ((EQ (AREF (val, 0), make_number (2))
6802                         && VECTORP (AREF (val, 4)))
6803                        ? AREF (AREF (val, 4), 16)
6804                        : Qnil);
6805                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6806                                         translation_table, hash_table,
6807                                         accept_latin_extra));
6808                 }
6809             }
6810
6811           if (! encodable
6812               && ((CHAR_TABLE_P (translation_table)
6813                    && ! NILP (Faref (translation_table, ch)))
6814                   || (HASH_TABLE_P (hash_table)
6815                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6816                   || (SINGLE_BYTE_CHAR_P (c)
6817                       && ! NILP (accept_latin_extra)
6818                       && VECTORP (Vlatin_extra_code_table)
6819                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6820             encodable = 1;
6821           if (encodable)
6822             prev = tail;
6823           else
6824             {
6825               /* Exclude this coding system from SAFE_CODINGS.  */
6826               if (EQ (tail, safe_codings))
6827                 {
6828                   safe_codings = XCDR (safe_codings);
6829                   if (NILP (safe_codings))
6830                     goto done_safe_codings;
6831                 }
6832               else
6833                 XSETCDR (prev, XCDR (tail));
6834             }
6835         }
6836     }
6837
6838  done_safe_codings:
6839   /* If the above loop was terminated before P reaches PEND, it means
6840      SAFE_CODINGS was set to nil.  If we have not yet found an
6841      non-ASCII single-byte char, check it now.  */
6842   if (! *single_byte_char_found)
6843     while (p < pend)
6844       {
6845         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6846         p += len;
6847         if (! ASCII_BYTE_P (c)
6848             && SINGLE_BYTE_CHAR_P (c))
6849           {
6850             *single_byte_char_found = 1;
6851             break;
6852           }
6853       }
6854   return safe_codings;
6855 }
6856
6857 DEFUN ("find-coding-systems-region-internal",
6858        Ffind_coding_systems_region_internal,
6859        Sfind_coding_systems_region_internal, 2, 2, 0,
6860        doc: /* Internal use only.  */)
6861      (start, end)
6862      Lisp_Object start, end;
6863 {
6864   Lisp_Object work_table, safe_codings;
6865   int non_ascii_p = 0;
6866   int single_byte_char_found = 0;
6867   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6868
6869   if (STRINGP (start))
6870     {
6871       if (!STRING_MULTIBYTE (start))
6872         return Qt;
6873       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6874       p2 = p2end = p1end;
6875       if (SCHARS (start) != SBYTES (start))
6876         non_ascii_p = 1;
6877     }
6878   else
6879     {
6880       int from, to, stop;
6881
6882       CHECK_NUMBER_COERCE_MARKER (start);
6883       CHECK_NUMBER_COERCE_MARKER (end);
6884       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6885         args_out_of_range (start, end);
6886       if (NILP (current_buffer->enable_multibyte_characters))
6887         return Qt;
6888       from = CHAR_TO_BYTE (XINT (start));
6889       to = CHAR_TO_BYTE (XINT (end));
6890       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6891       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6892       if (stop == to)
6893         p2 = p2end = p1end;
6894       else
6895         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6896       if (XINT (end) - XINT (start) != to - from)
6897         non_ascii_p = 1;
6898     }
6899
6900   if (!non_ascii_p)
6901     {
6902       /* We are sure that the text contains no multibyte character.
6903          Check if it contains eight-bit-graphic.  */
6904       p = p1;
6905       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6906       if (p == p1end)
6907         {
6908           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6909           if (p == p2end)
6910             return Qt;
6911         }
6912     }
6913
6914   /* The text contains non-ASCII characters.  */
6915
6916   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6917   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6918
6919   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6920                                     &single_byte_char_found);
6921   if (p2 < p2end)
6922     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6923                                       &single_byte_char_found);
6924   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6925     safe_codings = Qt;
6926   else
6927     {
6928       /* Turn safe_codings to a list of coding systems... */
6929       Lisp_Object val;
6930
6931       if (single_byte_char_found)
6932         /* ... and append these for eight-bit chars.  */
6933         val = Fcons (Qraw_text,
6934                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6935       else
6936         /* ... and append generic coding systems.  */
6937         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6938
6939       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6940         val = Fcons (XCAR (XCAR (safe_codings)), val);
6941       safe_codings = val;
6942     }
6943
6944   return safe_codings;
6945 }
6946
6947
6948 /* Search from position POS for such characters that are unencodable
6949    accoding to SAFE_CHARS, and return a list of their positions.  P
6950    points where in the memory the character at POS exists.  Limit the
6951    search at PEND or when Nth unencodable characters are found.
6952
6953    If SAFE_CHARS is a char table, an element for an unencodable
6954    character is nil.
6955
6956    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6957
6958    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6959    eight-bit-graphic characters are unencodable.  */
6960
6961 static Lisp_Object
6962 unencodable_char_position (safe_chars, pos, p, pend, n)
6963      Lisp_Object safe_chars;
6964      int pos;
6965      unsigned char *p, *pend;
6966      int n;
6967 {
6968   Lisp_Object pos_list;
6969
6970   pos_list = Qnil;
6971   while (p < pend)
6972     {
6973       int len;
6974       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6975
6976       if (c >= 128
6977           && (CHAR_TABLE_P (safe_chars)
6978               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6979               : (NILP (safe_chars) || c < 256)))
6980         {
6981           pos_list = Fcons (make_number (pos), pos_list);
6982           if (--n <= 0)
6983             break;
6984         }
6985       pos++;
6986       p += len;
6987     }
6988   return Fnreverse (pos_list);
6989 }
6990
6991
6992 DEFUN ("unencodable-char-position", Funencodable_char_position,
6993        Sunencodable_char_position, 3, 5, 0,
6994        doc: /*
6995 Return position of first un-encodable character in a region.
6996 START and END specfiy the region and CODING-SYSTEM specifies the
6997 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6998
6999 If optional 4th argument COUNT is non-nil, it specifies at most how
7000 many un-encodable characters to search.  In this case, the value is a
7001 list of positions.
7002
7003 If optional 5th argument STRING is non-nil, it is a string to search
7004 for un-encodable characters.  In that case, START and END are indexes
7005 to the string.  */)
7006      (start, end, coding_system, count, string)
7007      Lisp_Object start, end, coding_system, count, string;
7008 {
7009   int n;
7010   Lisp_Object safe_chars;
7011   struct coding_system coding;
7012   Lisp_Object positions;
7013   int from, to;
7014   unsigned char *p, *pend;
7015
7016   if (NILP (string))
7017     {
7018       validate_region (&start, &end);
7019       from = XINT (start);
7020       to = XINT (end);
7021       if (NILP (current_buffer->enable_multibyte_characters))
7022         return Qnil;
7023       p = CHAR_POS_ADDR (from);
7024       if (to == GPT)
7025         pend = GPT_ADDR;
7026       else
7027         pend = CHAR_POS_ADDR (to);
7028     }
7029   else
7030     {
7031       CHECK_STRING (string);
7032       CHECK_NATNUM (start);
7033       CHECK_NATNUM (end);
7034       from = XINT (start);
7035       to = XINT (end);
7036       if (from > to
7037           || to > SCHARS (string))
7038         args_out_of_range_3 (string, start, end);
7039       if (! STRING_MULTIBYTE (string))
7040         return Qnil;
7041       p = SDATA (string) + string_char_to_byte (string, from);
7042       pend = SDATA (string) + string_char_to_byte (string, to);
7043     }
7044
7045   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7046
7047   if (NILP (count))
7048     n = 1;
7049   else
7050     {
7051       CHECK_NATNUM (count);
7052       n = XINT (count);
7053     }
7054
7055   if (coding.type == coding_type_no_conversion
7056       || coding.type == coding_type_raw_text)
7057     return Qnil;
7058
7059   if (coding.type == coding_type_undecided)
7060     safe_chars = Qnil;
7061   else
7062     safe_chars = coding_safe_chars (coding_system);
7063
7064   if (STRINGP (string)
7065       || from >= GPT || to <= GPT)
7066     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7067   else
7068     {
7069       Lisp_Object args[2];
7070
7071       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7072       n -= XINT (Flength (args[0]));
7073       if (n <= 0)
7074         positions = args[0];
7075       else
7076         {
7077           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7078                                                pend, n);
7079           positions = Fappend (2, args);
7080         }
7081     }
7082
7083   return  (NILP (count) ? Fcar (positions) : positions);
7084 }
7085
7086
7087 Lisp_Object
7088 code_convert_region1 (start, end, coding_system, encodep)
7089      Lisp_Object start, end, coding_system;
7090      int encodep;
7091 {
7092   struct coding_system coding;
7093   int from, to;
7094
7095   CHECK_NUMBER_COERCE_MARKER (start);
7096   CHECK_NUMBER_COERCE_MARKER (end);
7097   CHECK_SYMBOL (coding_system);
7098
7099   validate_region (&start, &end);
7100   from = XFASTINT (start);
7101   to = XFASTINT (end);
7102
7103   if (NILP (coding_system))
7104     return make_number (to - from);
7105
7106   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7107     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7108
7109   coding.mode |= CODING_MODE_LAST_BLOCK;
7110   coding.src_multibyte = coding.dst_multibyte
7111     = !NILP (current_buffer->enable_multibyte_characters);
7112   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7113                        &coding, encodep, 1);
7114   Vlast_coding_system_used = coding.symbol;
7115   return make_number (coding.produced_char);
7116 }
7117
7118 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7119        3, 3, "r\nzCoding system: ",
7120        doc: /* Decode the current region from the specified coding system.
7121 When called from a program, takes three arguments:
7122 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7123 This function sets `last-coding-system-used' to the precise coding system
7124 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7125 not fully specified.)
7126 It returns the length of the decoded text.  */)
7127      (start, end, coding_system)
7128      Lisp_Object start, end, coding_system;
7129 {
7130   return code_convert_region1 (start, end, coding_system, 0);
7131 }
7132
7133 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7134        3, 3, "r\nzCoding system: ",
7135        doc: /* Encode the current region into the specified coding system.
7136 When called from a program, takes three arguments:
7137 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7138 This function sets `last-coding-system-used' to the precise coding system
7139 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7140 not fully specified.)
7141 It returns the length of the encoded text.  */)
7142      (start, end, coding_system)
7143      Lisp_Object start, end, coding_system;
7144 {
7145   return code_convert_region1 (start, end, coding_system, 1);
7146 }
7147
7148 Lisp_Object
7149 code_convert_string1 (string, coding_system, nocopy, encodep)
7150      Lisp_Object string, coding_system, nocopy;
7151      int encodep;
7152 {
7153   struct coding_system coding;
7154
7155   CHECK_STRING (string);
7156   CHECK_SYMBOL (coding_system);
7157
7158   if (NILP (coding_system))
7159     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7160
7161   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7162     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7163
7164   coding.mode |= CODING_MODE_LAST_BLOCK;
7165   string = (encodep
7166             ? encode_coding_string (string, &coding, !NILP (nocopy))
7167             : decode_coding_string (string, &coding, !NILP (nocopy)));
7168   Vlast_coding_system_used = coding.symbol;
7169
7170   return string;
7171 }
7172
7173 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7174        2, 3, 0,
7175        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7176 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7177 if the decoding operation is trivial.
7178 This function sets `last-coding-system-used' to the precise coding system
7179 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7180 not fully specified.)  */)
7181      (string, coding_system, nocopy)
7182      Lisp_Object string, coding_system, nocopy;
7183 {
7184   return code_convert_string1 (string, coding_system, nocopy, 0);
7185 }
7186
7187 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7188        2, 3, 0,
7189        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7190 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7191 if the encoding operation is trivial.
7192 This function sets `last-coding-system-used' to the precise coding system
7193 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7194 not fully specified.)  */)
7195      (string, coding_system, nocopy)
7196      Lisp_Object string, coding_system, nocopy;
7197 {
7198   return code_convert_string1 (string, coding_system, nocopy, 1);
7199 }
7200
7201 /* Encode or decode STRING according to CODING_SYSTEM.
7202    Do not set Vlast_coding_system_used.
7203
7204    This function is called only from macros DECODE_FILE and
7205    ENCODE_FILE, thus we ignore character composition.  */
7206
7207 Lisp_Object
7208 code_convert_string_norecord (string, coding_system, encodep)
7209      Lisp_Object string, coding_system;
7210      int encodep;
7211 {
7212   struct coding_system coding;
7213
7214   CHECK_STRING (string);
7215   CHECK_SYMBOL (coding_system);
7216
7217   if (NILP (coding_system))
7218     return string;
7219
7220   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7221     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7222
7223   coding.composing = COMPOSITION_DISABLED;
7224   coding.mode |= CODING_MODE_LAST_BLOCK;
7225   return (encodep
7226           ? encode_coding_string (string, &coding, 1)
7227           : decode_coding_string (string, &coding, 1));
7228 }
7229 \f
7230 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7231        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7232 Return the corresponding character.  */)
7233      (code)
7234      Lisp_Object code;
7235 {
7236   unsigned char c1, c2, s1, s2;
7237   Lisp_Object val;
7238
7239   CHECK_NUMBER (code);
7240   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7241   if (s1 == 0)
7242     {
7243       if (s2 < 0x80)
7244         XSETFASTINT (val, s2);
7245       else if (s2 >= 0xA0 || s2 <= 0xDF)
7246         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7247       else
7248         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7249     }
7250   else
7251     {
7252       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7253           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7254         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7255       DECODE_SJIS (s1, s2, c1, c2);
7256       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7257     }
7258   return val;
7259 }
7260
7261 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7262        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7263 Return the corresponding code in SJIS.  */)
7264      (ch)
7265      Lisp_Object ch;
7266 {
7267   int charset, c1, c2, s1, s2;
7268   Lisp_Object val;
7269
7270   CHECK_NUMBER (ch);
7271   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7272   if (charset == CHARSET_ASCII)
7273     {
7274       val = ch;
7275     }
7276   else if (charset == charset_jisx0208
7277            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7278     {
7279       ENCODE_SJIS (c1, c2, s1, s2);
7280       XSETFASTINT (val, (s1 << 8) | s2);
7281     }
7282   else if (charset == charset_katakana_jisx0201
7283            && c1 > 0x20 && c2 < 0xE0)
7284     {
7285       XSETFASTINT (val, c1 | 0x80);
7286     }
7287   else
7288     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7289   return val;
7290 }
7291
7292 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7293        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7294 Return the corresponding character.  */)
7295      (code)
7296      Lisp_Object code;
7297 {
7298   int charset;
7299   unsigned char b1, b2, c1, c2;
7300   Lisp_Object val;
7301
7302   CHECK_NUMBER (code);
7303   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7304   if (b1 == 0)
7305     {
7306       if (b2 >= 0x80)
7307         error ("Invalid BIG5 code: %x", XFASTINT (code));
7308       val = code;
7309     }
7310   else
7311     {
7312       if ((b1 < 0xA1 || b1 > 0xFE)
7313           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7314         error ("Invalid BIG5 code: %x", XFASTINT (code));
7315       DECODE_BIG5 (b1, b2, charset, c1, c2);
7316       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7317     }
7318   return val;
7319 }
7320
7321 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7322        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7323 Return the corresponding character code in Big5.  */)
7324      (ch)
7325      Lisp_Object ch;
7326 {
7327   int charset, c1, c2, b1, b2;
7328   Lisp_Object val;
7329
7330   CHECK_NUMBER (ch);
7331   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7332   if (charset == CHARSET_ASCII)
7333     {
7334       val = ch;
7335     }
7336   else if ((charset == charset_big5_1
7337             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7338            || (charset == charset_big5_2
7339                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7340     {
7341       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7342       XSETFASTINT (val, (b1 << 8) | b2);
7343     }
7344   else
7345     error ("Can't encode to Big5: %d", XFASTINT (ch));
7346   return val;
7347 }
7348 \f
7349 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7350        Sset_terminal_coding_system_internal, 1, 1, 0,
7351        doc: /* Internal use only.  */)
7352      (coding_system)
7353      Lisp_Object coding_system;
7354 {
7355   CHECK_SYMBOL (coding_system);
7356   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7357   /* We had better not send unsafe characters to terminal.  */
7358   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7359   /* Character composition should be disabled.  */
7360   terminal_coding.composing = COMPOSITION_DISABLED;
7361   /* Error notification should be suppressed.  */
7362   terminal_coding.suppress_error = 1;
7363   terminal_coding.src_multibyte = 1;
7364   terminal_coding.dst_multibyte = 0;
7365   return Qnil;
7366 }
7367
7368 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7369        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7370        doc: /* Internal use only.  */)
7371      (coding_system)
7372      Lisp_Object coding_system;
7373 {
7374   CHECK_SYMBOL (coding_system);
7375   setup_coding_system (Fcheck_coding_system (coding_system),
7376                        &safe_terminal_coding);
7377   /* Character composition should be disabled.  */
7378   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7379   /* Error notification should be suppressed.  */
7380   safe_terminal_coding.suppress_error = 1;
7381   safe_terminal_coding.src_multibyte = 1;
7382   safe_terminal_coding.dst_multibyte = 0;
7383   return Qnil;
7384 }
7385
7386 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7387        Sterminal_coding_system, 0, 0, 0,
7388        doc: /* Return coding system specified for terminal output.  */)
7389      ()
7390 {
7391   return terminal_coding.symbol;
7392 }
7393
7394 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7395        Sset_keyboard_coding_system_internal, 1, 1, 0,
7396        doc: /* Internal use only.  */)
7397      (coding_system)
7398      Lisp_Object coding_system;
7399 {
7400   CHECK_SYMBOL (coding_system);
7401   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7402   /* Character composition should be disabled.  */
7403   keyboard_coding.composing = COMPOSITION_DISABLED;
7404   return Qnil;
7405 }
7406
7407 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7408        Skeyboard_coding_system, 0, 0, 0,
7409        doc: /* Return coding system specified for decoding keyboard input.  */)
7410      ()
7411 {
7412   return keyboard_coding.symbol;
7413 }
7414
7415 \f
7416 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7417        Sfind_operation_coding_system,  1, MANY, 0,
7418        doc: /* Choose a coding system for an operation based on the target name.
7419 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7420 DECODING-SYSTEM is the coding system to use for decoding
7421 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7422 for encoding (in case OPERATION does encoding).
7423
7424 The first argument OPERATION specifies an I/O primitive:
7425   For file I/O, `insert-file-contents' or `write-region'.
7426   For process I/O, `call-process', `call-process-region', or `start-process'.
7427   For network I/O, `open-network-stream'.
7428
7429 The remaining arguments should be the same arguments that were passed
7430 to the primitive.  Depending on which primitive, one of those arguments
7431 is selected as the TARGET.  For example, if OPERATION does file I/O,
7432 whichever argument specifies the file name is TARGET.
7433
7434 TARGET has a meaning which depends on OPERATION:
7435   For file I/O, TARGET is a file name.
7436   For process I/O, TARGET is a process name.
7437   For network I/O, TARGET is a service name or a port number
7438
7439 This function looks up what specified for TARGET in,
7440 `file-coding-system-alist', `process-coding-system-alist',
7441 or `network-coding-system-alist' depending on OPERATION.
7442 They may specify a coding system, a cons of coding systems,
7443 or a function symbol to call.
7444 In the last case, we call the function with one argument,
7445 which is a list of all the arguments given to this function.
7446
7447 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7448      (nargs, args)
7449      int nargs;
7450      Lisp_Object *args;
7451 {
7452   Lisp_Object operation, target_idx, target, val;
7453   register Lisp_Object chain;
7454
7455   if (nargs < 2)
7456     error ("Too few arguments");
7457   operation = args[0];
7458   if (!SYMBOLP (operation)
7459       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7460     error ("Invalid first argument");
7461   if (nargs < 1 + XINT (target_idx))
7462     error ("Too few arguments for operation: %s",
7463            SDATA (SYMBOL_NAME (operation)));
7464   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7465      argument to write-region) is string, it must be treated as a
7466      target file name.  */
7467   if (EQ (operation, Qwrite_region)
7468       && nargs > 5
7469       && STRINGP (args[5]))
7470     target_idx = make_number (4);
7471   target = args[XINT (target_idx) + 1];
7472   if (!(STRINGP (target)
7473         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7474     error ("Invalid argument %d", XINT (target_idx) + 1);
7475
7476   chain = ((EQ (operation, Qinsert_file_contents)
7477             || EQ (operation, Qwrite_region))
7478            ? Vfile_coding_system_alist
7479            : (EQ (operation, Qopen_network_stream)
7480               ? Vnetwork_coding_system_alist
7481               : Vprocess_coding_system_alist));
7482   if (NILP (chain))
7483     return Qnil;
7484
7485   for (; CONSP (chain); chain = XCDR (chain))
7486     {
7487       Lisp_Object elt;
7488       elt = XCAR (chain);
7489
7490       if (CONSP (elt)
7491           && ((STRINGP (target)
7492                && STRINGP (XCAR (elt))
7493                && fast_string_match (XCAR (elt), target) >= 0)
7494               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7495         {
7496           val = XCDR (elt);
7497           /* Here, if VAL is both a valid coding system and a valid
7498              function symbol, we return VAL as a coding system.  */
7499           if (CONSP (val))
7500             return val;
7501           if (! SYMBOLP (val))
7502             return Qnil;
7503           if (! NILP (Fcoding_system_p (val)))
7504             return Fcons (val, val);
7505           if (! NILP (Ffboundp (val)))
7506             {
7507               val = call1 (val, Flist (nargs, args));
7508               if (CONSP (val))
7509                 return val;
7510               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7511                 return Fcons (val, val);
7512             }
7513           return Qnil;
7514         }
7515     }
7516   return Qnil;
7517 }
7518
7519 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7520        Supdate_coding_systems_internal, 0, 0, 0,
7521        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7522 When values of any coding categories are changed, you must
7523 call this function.  */)
7524      ()
7525 {
7526   int i;
7527
7528   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7529     {
7530       Lisp_Object val;
7531
7532       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7533       if (!NILP (val))
7534         {
7535           if (! coding_system_table[i])
7536             coding_system_table[i] = ((struct coding_system *)
7537                                       xmalloc (sizeof (struct coding_system)));
7538           setup_coding_system (val, coding_system_table[i]);
7539         }
7540       else if (coding_system_table[i])
7541         {
7542           xfree (coding_system_table[i]);
7543           coding_system_table[i] = NULL;
7544         }
7545     }
7546
7547   return Qnil;
7548 }
7549
7550 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7551        Sset_coding_priority_internal, 0, 0, 0,
7552        doc: /* Update internal database for the current value of `coding-category-list'.
7553 This function is internal use only.  */)
7554      ()
7555 {
7556   int i = 0, idx;
7557   Lisp_Object val;
7558
7559   val = Vcoding_category_list;
7560
7561   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7562     {
7563       if (! SYMBOLP (XCAR (val)))
7564         break;
7565       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7566       if (idx >= CODING_CATEGORY_IDX_MAX)
7567         break;
7568       coding_priorities[i++] = (1 << idx);
7569       val = XCDR (val);
7570     }
7571   /* If coding-category-list is valid and contains all coding
7572      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7573      the following code saves Emacs from crashing.  */
7574   while (i < CODING_CATEGORY_IDX_MAX)
7575     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7576
7577   return Qnil;
7578 }
7579
7580 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7581        Sdefine_coding_system_internal, 1, 1, 0,
7582        doc: /* Register CODING-SYSTEM as a base coding system.
7583 This function is internal use only.  */)
7584      (coding_system)
7585      Lisp_Object coding_system;
7586 {
7587   Lisp_Object safe_chars, slot;
7588
7589   if (NILP (Fcheck_coding_system (coding_system)))
7590     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7591   safe_chars = coding_safe_chars (coding_system);
7592   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7593     error ("No valid safe-chars property for %s",
7594            SDATA (SYMBOL_NAME (coding_system)));
7595   if (EQ (safe_chars, Qt))
7596     {
7597       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7598         XSETCAR (Vcoding_system_safe_chars,
7599                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7600     }
7601   else
7602     {
7603       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7604       if (NILP (slot))
7605         XSETCDR (Vcoding_system_safe_chars,
7606                  nconc2 (XCDR (Vcoding_system_safe_chars),
7607                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7608       else
7609         XSETCDR (slot, safe_chars);
7610     }
7611   return Qnil;
7612 }
7613
7614 #endif /* emacs */
7615
7616 \f
7617 /*** 9. Post-amble ***/
7618
7619 void
7620 init_coding_once ()
7621 {
7622   int i;
7623
7624   /* Emacs' internal format specific initialize routine.  */
7625   for (i = 0; i <= 0x20; i++)
7626     emacs_code_class[i] = EMACS_control_code;
7627   emacs_code_class[0x0A] = EMACS_linefeed_code;
7628   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7629   for (i = 0x21 ; i < 0x7F; i++)
7630     emacs_code_class[i] = EMACS_ascii_code;
7631   emacs_code_class[0x7F] = EMACS_control_code;
7632   for (i = 0x80; i < 0xFF; i++)
7633     emacs_code_class[i] = EMACS_invalid_code;
7634   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7635   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7636   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7637   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7638
7639   /* ISO2022 specific initialize routine.  */
7640   for (i = 0; i < 0x20; i++)
7641     iso_code_class[i] = ISO_control_0;
7642   for (i = 0x21; i < 0x7F; i++)
7643     iso_code_class[i] = ISO_graphic_plane_0;
7644   for (i = 0x80; i < 0xA0; i++)
7645     iso_code_class[i] = ISO_control_1;
7646   for (i = 0xA1; i < 0xFF; i++)
7647     iso_code_class[i] = ISO_graphic_plane_1;
7648   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7649   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7650   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7651   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7652   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7653   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7654   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7655   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7656   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7657   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7658
7659   setup_coding_system (Qnil, &keyboard_coding);
7660   setup_coding_system (Qnil, &terminal_coding);
7661   setup_coding_system (Qnil, &safe_terminal_coding);
7662   setup_coding_system (Qnil, &default_buffer_file_coding);
7663
7664   bzero (coding_system_table, sizeof coding_system_table);
7665
7666   bzero (ascii_skip_code, sizeof ascii_skip_code);
7667   for (i = 0; i < 128; i++)
7668     ascii_skip_code[i] = 1;
7669
7670 #if defined (MSDOS) || defined (WINDOWSNT)
7671   system_eol_type = CODING_EOL_CRLF;
7672 #else
7673   system_eol_type = CODING_EOL_LF;
7674 #endif
7675
7676   inhibit_pre_post_conversion = 0;
7677 }
7678
7679 #ifdef emacs
7680
7681 void
7682 syms_of_coding ()
7683 {
       /* Set up the Lisp side of this file: intern and staticpro the
          symbols used here, attach symbol properties with Fput, register
          the primitives with defsubr, and declare variables with
          DEFVAR_LISP / DEFVAR_BOOL, giving each one its initial value.
          Takes no arguments and returns nothing.  */
7684   staticpro (&Vcode_conversion_workbuf_name);
7685   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7686
7687   Qtarget_idx = intern ("target-idx");
7688   staticpro (&Qtarget_idx);
7689
7690   Qcoding_system_history = intern ("coding-system-history");
7691   staticpro (&Qcoding_system_history);
7692   Fset (Qcoding_system_history, Qnil);
7693
       /* The `target-idx' property stored below is the zero-based position
          of the target argument, as the per-call comments spell out
          (first argument -> 0, third -> 2, fourth -> 3).
          NOTE(review): Qinsert_file_contents and Qwrite_region are not
          assigned in this function; presumably they are set up elsewhere
          before this runs -- confirm.  */
7694   /* Target FILENAME is the first argument.  */
7695   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7696   /* Target FILENAME is the third argument.  */
7697   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7698
7699   Qcall_process = intern ("call-process");
7700   staticpro (&Qcall_process);
7701   /* Target PROGRAM is the first argument.  */
7702   Fput (Qcall_process, Qtarget_idx, make_number (0));
7703
7704   Qcall_process_region = intern ("call-process-region");
7705   staticpro (&Qcall_process_region);
7706   /* Target PROGRAM is the third argument.  */
7707   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7708
7709   Qstart_process = intern ("start-process");
7710   staticpro (&Qstart_process);
7711   /* Target PROGRAM is the third argument.  */
7712   Fput (Qstart_process, Qtarget_idx, make_number (2));
7713
7714   Qopen_network_stream = intern ("open-network-stream");
7715   staticpro (&Qopen_network_stream);
7716   /* Target SERVICE is the fourth argument.  */
7717   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7718
7719   Qcoding_system = intern ("coding-system");
7720   staticpro (&Qcoding_system);
7721
7722   Qeol_type = intern ("eol-type");
7723   staticpro (&Qeol_type);
7724
7725   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7726   staticpro (&Qbuffer_file_coding_system);
7727
7728   Qpost_read_conversion = intern ("post-read-conversion");
7729   staticpro (&Qpost_read_conversion);
7730
7731   Qpre_write_conversion = intern ("pre-write-conversion");
7732   staticpro (&Qpre_write_conversion);
7733
7734   Qno_conversion = intern ("no-conversion");
7735   staticpro (&Qno_conversion);
7736
7737   Qundecided = intern ("undecided");
7738   staticpro (&Qundecided);
7739
7740   Qcoding_system_p = intern ("coding-system-p");
7741   staticpro (&Qcoding_system_p);
7742
7743   Qcoding_system_error = intern ("coding-system-error");
7744   staticpro (&Qcoding_system_error);
7745
       /* Give `coding-system-error' its condition list (the symbol itself
          followed by Qerror) and its message string.  */
7746   Fput (Qcoding_system_error, Qerror_conditions,
7747         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7748   Fput (Qcoding_system_error, Qerror_message,
7749         build_string ("Invalid coding system"));
7750
7751   Qcoding_category = intern ("coding-category");
7752   staticpro (&Qcoding_category);
7753   Qcoding_category_index = intern ("coding-category-index");
7754   staticpro (&Qcoding_category_index);
7755
       /* Fill a vector of CODING_CATEGORY_IDX_MAX slots with one symbol per
          entry of coding_category_name[], and record each symbol's index
          in its `coding-category-index' property.  */
7756   Vcoding_category_table
7757     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7758   staticpro (&Vcoding_category_table);
7759   {
7760     int i;
7761     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7762       {
7763         XVECTOR (Vcoding_category_table)->contents[i]
7764           = intern (coding_category_name[i]);
7765         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7766               Qcoding_category_index, make_number (i));
7767       }
7768   }
7769
       /* Starts out as the cons cell (nil . nil).  */
7770   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7771   staticpro (&Vcoding_system_safe_chars);
7772
7773   Qtranslation_table = intern ("translation-table");
7774   staticpro (&Qtranslation_table);
       /* NOTE(review): Qchar_table_extra_slots is read here, ahead of the
          assignment to it further down in this function; this only works
          if it was already set before this function runs -- confirm.  */
7775   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7776
7777   Qtranslation_table_id = intern ("translation-table-id");
7778   staticpro (&Qtranslation_table_id);
7779
7780   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7781   staticpro (&Qtranslation_table_for_decode);
7782
7783   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7784   staticpro (&Qtranslation_table_for_encode);
7785
7786   Qsafe_chars = intern ("safe-chars");
7787   staticpro (&Qsafe_chars);
7788
7789   Qchar_coding_system = intern ("char-coding-system");
7790   staticpro (&Qchar_coding_system);
7791
7792   /* Intern this now in case it isn't already done.
7793      Setting this variable twice is harmless.
7794      But don't staticpro it here--that is done in alloc.c.  */
7795   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7796   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7797   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7798
7799   Qvalid_codes = intern ("valid-codes");
7800   staticpro (&Qvalid_codes);
7801
7802   Qemacs_mule = intern ("emacs-mule");
7803   staticpro (&Qemacs_mule);
7804
7805   Qraw_text = intern ("raw-text");
7806   staticpro (&Qraw_text);
7807
7808   Qutf_8 = intern ("utf-8");
7809   staticpro (&Qutf_8);
7810
7811   Qcoding_system_define_form = intern ("coding-system-define-form");
7812   staticpro (&Qcoding_system_define_form);
7813
       /* Hand each primitive's subr object to defsubr.  */
7814   defsubr (&Scoding_system_p);
7815   defsubr (&Sread_coding_system);
7816   defsubr (&Sread_non_nil_coding_system);
7817   defsubr (&Scheck_coding_system);
7818   defsubr (&Sdetect_coding_region);
7819   defsubr (&Sdetect_coding_string);
7820   defsubr (&Sfind_coding_systems_region_internal);
7821   defsubr (&Sunencodable_char_position);
7822   defsubr (&Sdecode_coding_region);
7823   defsubr (&Sencode_coding_region);
7824   defsubr (&Sdecode_coding_string);
7825   defsubr (&Sencode_coding_string);
7826   defsubr (&Sdecode_sjis_char);
7827   defsubr (&Sencode_sjis_char);
7828   defsubr (&Sdecode_big5_char);
7829   defsubr (&Sencode_big5_char);
7830   defsubr (&Sset_terminal_coding_system_internal);
7831   defsubr (&Sset_safe_terminal_coding_system_internal);
7832   defsubr (&Sterminal_coding_system);
7833   defsubr (&Sset_keyboard_coding_system_internal);
7834   defsubr (&Skeyboard_coding_system);
7835   defsubr (&Sfind_operation_coding_system);
7836   defsubr (&Supdate_coding_systems_internal);
7837   defsubr (&Sset_coding_priority_internal);
7838   defsubr (&Sdefine_coding_system_internal);
7839
       /* Variable declarations.  Each DEFVAR_* call pairs a Lisp variable
          name with the address of a C variable and carries the variable's
          doc text; the assignment right after it supplies the initial
          value.  */
7840   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7841                doc: /* List of coding systems.
7842
7843 Do not alter the value of this variable manually.  This variable should be
7844 updated by the functions `make-coding-system' and
7845 `define-coding-system-alias'.  */);
7846   Vcoding_system_list = Qnil;
7847
7848   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7849                doc: /* Alist of coding system names.
7850 Each element is one element list of coding system name.
7851 This variable is given to `completing-read' as TABLE argument.
7852
7853 Do not alter the value of this variable manually.  This variable should be
7854 updated by the functions `make-coding-system' and
7855 `define-coding-system-alias'.  */);
7856   Vcoding_system_alist = Qnil;
7857
7858   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7859                doc: /* List of coding-categories (symbols) ordered by priority.
7860
7861 On detecting a coding system, Emacs tries code detection algorithms
7862 associated with each coding-category one by one in this order.  When
7863 one algorithm agrees with a byte sequence of source text, the coding
7864 system bound to the corresponding coding-category is selected.
7865
7866 Don't modify this variable directly, but use `set-coding-priority'.  */);
7867   {
7868     int i;
7869
         /* Cons from the last table entry down to the first, so the
            finished list holds the category symbols in index order.  */
7870     Vcoding_category_list = Qnil;
7871     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7872       Vcoding_category_list
7873         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7874                  Vcoding_category_list);
7875   }
7876
7877   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7878                doc: /* Specify the coding system for read operations.
7879 It is useful to bind this variable with `let', but do not set it globally.
7880 If the value is a coding system, it is used for decoding on read operation.
7881 If not, an appropriate element is used from one of the coding system alists:
7882 There are three such tables, `file-coding-system-alist',
7883 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7884   Vcoding_system_for_read = Qnil;
7885
7886   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7887                doc: /* Specify the coding system for write operations.
7888 Programs bind this variable with `let', but you should not set it globally.
7889 If the value is a coding system, it is used for encoding of output,
7890 when writing it to a file and when sending it to a file or subprocess.
7891
7892 If this does not specify a coding system, an appropriate element
7893 is used from one of the coding system alists:
7894 There are three such tables, `file-coding-system-alist',
7895 `process-coding-system-alist', and `network-coding-system-alist'.
7896 For output to files, if the above procedure does not specify a coding system,
7897 the value of `buffer-file-coding-system' is used.  */);
7898   Vcoding_system_for_write = Qnil;
7899
7900   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7901                doc: /* Coding system used in the latest file or process I/O.
7902 Also set by `encode-coding-region', `decode-coding-region',
7903 `encode-coding-string' and `decode-coding-string'.  */);
7904   Vlast_coding_system_used = Qnil;
7905
7906   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7907                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7908 See info node `Coding Systems' and info node `Text and Binary' concerning
7909 such conversion.  */);
7910   inhibit_eol_conversion = 0;
7911
7912   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7913                doc: /* Non-nil means process buffer inherits coding system of process output.
7914 Bind it to t if the process output is to be treated as if it were a file
7915 read from some filesystem.  */);
7916   inherit_process_coding_system = 0;
7917
7918   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7919                doc: /* Alist to decide a coding system to use for a file I/O operation.
7920 The format is ((PATTERN . VAL) ...),
7921 where PATTERN is a regular expression matching a file name,
7922 VAL is a coding system, a cons of coding systems, or a function symbol.
7923 If VAL is a coding system, it is used for both decoding and encoding
7924 the file contents.
7925 If VAL is a cons of coding systems, the car part is used for decoding,
7926 and the cdr part is used for encoding.
7927 If VAL is a function symbol, the function must return a coding system
7928 or a cons of coding systems which are used as above.  The function gets
7929 the arguments with which `find-operation-coding-system' was called.
7930
7931 See also the function `find-operation-coding-system'
7932 and the variable `auto-coding-alist'.  */);
7933   Vfile_coding_system_alist = Qnil;
7934
7935   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7936     doc: /* Alist to decide a coding system to use for a process I/O operation.
7937 The format is ((PATTERN . VAL) ...),
7938 where PATTERN is a regular expression matching a program name,
7939 VAL is a coding system, a cons of coding systems, or a function symbol.
7940 If VAL is a coding system, it is used for both decoding what received
7941 from the program and encoding what sent to the program.
7942 If VAL is a cons of coding systems, the car part is used for decoding,
7943 and the cdr part is used for encoding.
7944 If VAL is a function symbol, the function must return a coding system
7945 or a cons of coding systems which are used as above.
7946
7947 See also the function `find-operation-coding-system'.  */);
7948   Vprocess_coding_system_alist = Qnil;
7949
7950   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7951     doc: /* Alist to decide a coding system to use for a network I/O operation.
7952 The format is ((PATTERN . VAL) ...),
7953 where PATTERN is a regular expression matching a network service name
7954 or is a port number to connect to,
7955 VAL is a coding system, a cons of coding systems, or a function symbol.
7956 If VAL is a coding system, it is used for both decoding what received
7957 from the network stream and encoding what sent to the network stream.
7958 If VAL is a cons of coding systems, the car part is used for decoding,
7959 and the cdr part is used for encoding.
7960 If VAL is a function symbol, the function must return a coding system
7961 or a cons of coding systems which are used as above.
7962
7963 See also the function `find-operation-coding-system'.  */);
7964   Vnetwork_coding_system_alist = Qnil;
7965
7966   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7967                doc: /* Coding system to use with system messages.
7968 Also used for decoding keyboard input on X Window system.  */);
7969   Vlocale_coding_system = Qnil;
7970
7971   /* The eol mnemonics are reset in startup.el system-dependently.  */
7972   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7973                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7974   eol_mnemonic_unix = build_string (":");
7975
7976   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7977                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7978   eol_mnemonic_dos = build_string ("\\");
7979
7980   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7981                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7982   eol_mnemonic_mac = build_string ("/");
7983
       /* Initially the same string as eol_mnemonic_unix above.  */
7984   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7985                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7986   eol_mnemonic_undecided = build_string (":");
7987
7988   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7989                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7990   Venable_character_translation = Qt;
7991
7992   DEFVAR_LISP ("standard-translation-table-for-decode",
7993                &Vstandard_translation_table_for_decode,
7994                doc: /* Table for translating characters while decoding.  */);
7995   Vstandard_translation_table_for_decode = Qnil;
7996
7997   DEFVAR_LISP ("standard-translation-table-for-encode",
7998                &Vstandard_translation_table_for_encode,
7999                doc: /* Table for translating characters while encoding.  */);
8000   Vstandard_translation_table_for_encode = Qnil;
8001
       /* Note the naming: the Lisp variable is `charset-revision-table'
          while the C variable is Vcharset_revision_alist.  */
8002   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8003                doc: /* Alist of charsets vs revision numbers.
8004 While encoding, if a charset (car part of an element) is found,
8005 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8006   Vcharset_revision_alist = Qnil;
8007
8008   DEFVAR_LISP ("default-process-coding-system",
8009                &Vdefault_process_coding_system,
8010                doc: /* Cons of coding systems used for process I/O by default.
8011 The car part is used for decoding a process output,
8012 the cdr part is used for encoding a text to be sent to a process.  */);
8013   Vdefault_process_coding_system = Qnil;
8014
8015   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8016                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8017 This is a vector of length 256.
8018 If Nth element is non-nil, the existence of code N in a file
8019 \(or output of subprocess) doesn't prevent it to be detected as
8020 a coding system of ISO 2022 variant which has a flag
8021 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8022 or reading output of a subprocess.
8023 Only 128th through 159th elements has a meaning.  */);
8024   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8025
       /* Set to nil here even though the doc text names a different
          default; presumably that value is installed elsewhere -- confirm.  */
8026   DEFVAR_LISP ("select-safe-coding-system-function",
8027                &Vselect_safe_coding_system_function,
8028                doc: /* Function to call to select safe coding system for encoding a text.
8029
8030 If set, this function is called to force a user to select a proper
8031 coding system which can encode the text in the case that a default
8032 coding system used in each operation can't encode the text.
8033
8034 The default value is `select-safe-coding-system' (which see).  */);
8035   Vselect_safe_coding_system_function = Qnil;
8036
8037   DEFVAR_BOOL ("coding-system-require-warning",
8038                &coding_system_require_warning,
8039                doc: /* Internal use only.
8040 If non-nil, on writing a file, `select-safe-coding-system-function' is
8041 called even if `coding-system-for-write' is non-nil.  The command
8042 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8043   coding_system_require_warning = 0;
8044
8045
8046   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8047                &inhibit_iso_escape_detection,
8048                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8049
8050 By default, on reading a file, Emacs tries to detect how the text is
8051 encoded.  This code detection is sensitive to escape sequences.  If
8052 the sequence is valid as ISO2022, the code is determined as one of
8053 the ISO2022 encodings, and the file is decoded by the corresponding
8054 coding system (e.g. `iso-2022-7bit').
8055
8056 However, there may be a case that you want to read escape sequences in
8057 a file as is.  In such a case, you can set this variable to non-nil.
8058 Then, as the code detection ignores any escape sequences, no file is
8059 detected as encoded in some ISO2022 encoding.  The result is that all
8060 escape sequences become visible in a buffer.
8061
8062 The default value is nil, and it is strongly recommended not to change
8063 it.  That is because many Emacs Lisp source files that contain
8064 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8065 in Emacs's distribution, and they won't be decoded correctly on
8066 reading if you suppress escape sequence detection.
8067
8068 The other way to read escape sequences in a file without decoding is
8069 to explicitly specify some coding system that doesn't use ISO2022's
8070 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8071   inhibit_iso_escape_detection = 0;
8072
8073   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8074                doc: /* Char table for translating self-inserting characters.
8075 This is applied to the result of input methods, not their input.  See also
8076 `keyboard-translate-table'.  */);
8077     Vtranslation_table_for_input = Qnil;
8078 }
8079
8080 char *
8081 emacs_strerror (error_number)
8082      int error_number;
8083 {
8084   /* Return strerror's text for ERROR_NUMBER.  If Vlocale_coding_system is
8085      non-nil, first pass it through code_convert_string_norecord.  */
8086   Lisp_Object decoded;
8087   char *message;
8088
8089   synchronize_system_messages_locale ();
8090   message = strerror (error_number);
8091   if (NILP (Vlocale_coding_system))
8092     return message;
8093
8094   decoded = code_convert_string_norecord (build_string (message),
8095                                           Vlocale_coding_system, 0);
8096   /* NOTE(review): presumably points into DECODED, not a copy -- confirm.  */
8097   return (char *) SDATA (decoded);
8098 }
8099
8100 #endif /* emacs */
8101
8102 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8103    (do not change this comment) */