src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 3, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  24 Boston, MA 02110-1301, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-mule) handlers
  31   3. ISO2022 handlers
  32   4. Shift-JIS and BIG5 handlers
  33   5. CCL handlers
  34   6. End-of-line handlers
  35   7. C library functions
  36   8. Emacs Lisp library functions
  37   9. Post-amble
  38
  39 */
  40
  41 /*** 0. General comments ***/
  42
  43
  44 /*** GENERAL NOTE on CODING SYSTEMS ***
  45
  46   A coding system is an encoding mechanism for one or more character
  47   sets.  Here's a list of coding systems which Emacs can handle.  When
  48   we say "decode", it means converting some other coding system to
  49   Emacs' internal format (emacs-mule), and when we say "encode",
  50   it means converting the coding system emacs-mule to some other
  51   coding system.
  52
  53   0. Emacs' internal format (emacs-mule)
  54
  55   Emacs itself holds a multi-lingual character in buffers and strings
  56   in a special format.  Details are described in section 2.
  57
  58   1. ISO2022
  59
  60   The most famous coding system for multiple character sets.  X's
  61   Compound Text, various EUCs (Extended Unix Code), and coding
  62   systems used in Internet communication such as ISO-2022-JP are
  63   all variants of ISO2022.  Details are described in section 3.
  64
  65   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  66
  67   A coding system to encode character sets: ASCII, JISX0201, and
  68   JISX0208.  Widely used for PC's in Japan.  Details are described in
  69   section 4.
  70
  71   3. BIG5
  72
  73   A coding system to encode the character sets ASCII and Big5.  Widely
  74   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  75   described in section 4.  In this file, when we write "BIG5"
  76   (all uppercase), we mean the coding system, and when we write
  77   "Big5" (capitalized), we mean the character set.
  78
  79   4. Raw text
  80
  81   A coding system for text containing random 8-bit code.  Emacs does
  82   no code conversion on such text except for end-of-line format.
  83
  84   5. Other
  85
  86   If a user wants to read/write text encoded in a coding system not
  87   listed above, he can supply a decoder and an encoder for it as CCL
  88   (Code Conversion Language) programs.  Emacs executes the CCL program
  89   while reading/writing.
  90
  91   Emacs represents a coding system by a Lisp symbol that has a property
  92   `coding-system'.  But, before actually using the coding system, the
  93   information about it is set in a structure of type `struct
  94   coding_system' for rapid processing.  See section 6 for more details.
  95
  96 */
  97
  98 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  99
 100   How end-of-line of text is encoded depends on the operating system.
 101   For instance, Unix's format is just one byte of `line-feed' code,
 102   whereas DOS's format is two-byte sequence of `carriage-return' and
 103   `line-feed' codes.  MacOS's format is usually one byte of
 104   `carriage-return'.
 105
 106   Since text character encoding and end-of-line encoding are
 107   independent, any coding system described above can have any
 108   end-of-line format.  So Emacs has information about end-of-line
 109   format in each coding-system.  See section 6 for more details.
 110
 111 */
 112
 113 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 114
 115   These functions check if a text between SRC and SRC_END is encoded
 116   in the coding system category XXX.  Each returns an integer value in
 117   which appropriate flag bits for the category XXX are set.  The flag
 118   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 119   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 120   of the range 0x80..0x9F are in multibyte form.  */
 121 #if 0
 122 int
 123 detect_coding_emacs_mule (src, src_end, multibytep)
 124      unsigned char *src, *src_end;
 125      int multibytep;
 126 {
 127   ...
 128 }
 129 #endif
 130
 131 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 132
 133   These functions decode SRC_BYTES length of unibyte text at SOURCE
 134   encoded in CODING to Emacs' internal format.  The resulting
 135   multibyte text goes to a place pointed to by DESTINATION, the length
 136   of which should not exceed DST_BYTES.
 137
 138   These functions set the information about original and decoded texts
 139   in the members `produced', `produced_char', `consumed', and
 140   `consumed_char' of the structure *CODING.  They also set the member
 141   `result' to one of CODING_FINISH_XXX indicating how the decoding
 142   finished.
 143
 144   DST_BYTES zero means that the source area and destination area are
 145   overlapped, which means that we can produce a decoded text until it
 146   reaches the head of the not-yet-decoded source text.
 147
 148   Below is a template for these functions.  */
 149 #if 0
 150 static void
 151 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 152      struct coding_system *coding;
 153      const unsigned char *source;
 154      unsigned char *destination;
 155      int src_bytes, dst_bytes;
 156 {
 157   ...
 158 }
 159 #endif
 160
 161 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 162
 163   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 164   internal multibyte format to CODING.  The resulting unibyte text
 165   goes to a place pointed to by DESTINATION, the length of which
 166   should not exceed DST_BYTES.
 167
 168   These functions set the information about original and encoded texts
 169   in the members `produced', `produced_char', `consumed', and
 170   `consumed_char' of the structure *CODING.  They also set the member
 171   `result' to one of CODING_FINISH_XXX indicating how the encoding
 172   finished.
 173
 174   DST_BYTES zero means that the source area and destination area are
 175   overlapped, which means that we can produce encoded text until it
  176   reaches the head of the not-yet-encoded source text.
 177
 178   Below is a template for these functions.  */
 179 #if 0
 180 static void
 181 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 182      struct coding_system *coding;
 183      unsigned char *source, *destination;
 184      int src_bytes, dst_bytes;
 185 {
 186   ...
 187 }
 188 #endif
 189
 190 /*** COMMONLY USED MACROS ***/
 191
  192 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  193    get one and two bytes from the source text respectively, storing
  194    them in C1 (and C2) and advancing `src' past them.  If there are
  195    not enough bytes in the source, they set `coding->result' to
  196    CODING_FINISH_INSUFFICIENT_SRC and jump to `label_end_of_loop'
  197    without consuming anything.  The caller should set variables
  198    `coding', `src' and `src_end' to appropriate pointers in advance.
        These macros are called from decoding routines `decode_coding_XXX',
        thus it is assumed that the source text is unibyte.  */
  199
  200 #define ONE_MORE_BYTE(c1)                                       \
  201   do {                                                          \
  202     if (src >= src_end)                                         \
  203       {                                                         \
  204         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  205         goto label_end_of_loop;                                 \
  206       }                                                         \
  207     c1 = *src++;                                                \
  208   } while (0)
  209
  210 #define TWO_MORE_BYTES(c1, c2)                                  \
  211   do {                                                          \
  212     if (src + 1 >= src_end)                                     \
  213       {                                                         \
  214         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  215         goto label_end_of_loop;                                 \
  216       }                                                         \
  217     c1 = *src++;                                                \
  218     c2 = *src++;                                                \
  219   } while (0)
 220
 221
 222 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 223    form if MULTIBYTEP is nonzero.  In addition, if SRC is not less
 224    than SRC_END, return with RET.  */
 225
 226 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret)      \
 227   do {                                                          \
 228     if (src >= src_end)                                         \
 229       {                                                         \
 230         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 231         return ret;                                             \
 232       }                                                         \
 233     c1 = *src++;                                                \
 234     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 235       c1 = *src++ - 0x20;                                       \
 236   } while (0)
 237
  238 /* Set C to the next character at the source text pointed to by `src'
  239    and advance `src' past it.  If there are not enough characters in
  240    the source, set `coding->result' to CODING_FINISH_INSUFFICIENT_SRC
  241    and jump to `label_end_of_loop'.  The caller should set variables
  242    `coding', `src', `src_end', and `translation_table' to appropriate
  243    values in advance.  If `translation_table' is not nil, C is the
  244    result of passing the character through translate_char.
  245
  246    This macro is used in encoding routines `encode_coding_XXX', thus
        it assumes that the source text is in multibyte form except for
        8-bit characters.  8-bit characters are in multibyte form if
        coding->src_multibyte is nonzero, else they are represented by a
        single byte.

        The expansion declares block-local variables `len' and `bytes', so
        C must not be an expression that refers to caller variables of
        those names.  */
  247
  248 #define ONE_MORE_CHAR(c)                                        \
  249   do {                                                          \
  250     int len = src_end - src;                                    \
  251     int bytes;                                                  \
  252     if (len <= 0)                                               \
  253       {                                                         \
  254         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  255         goto label_end_of_loop;                                 \
  256       }                                                         \
  257     if (coding->src_multibyte                                   \
  258         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  259       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  260     else                                                        \
  261       c = *src, bytes = 1;                                      \
  262     if (!NILP (translation_table))                              \
  263       c = translate_char (translation_table, c, -1, 0, 0);      \
  264     src += bytes;                                               \
  265   } while (0)
 266
 267
  268 /* Produce a multibyte form of character C to `dst'.  Jump to
  269    `label_end_of_loop' (with `coding->result' set to
  270    CODING_FINISH_INSUFFICIENT_DST) if there's not enough space at
  271    `dst'.  The limit of the destination is `dst_end' if `dst_bytes'
  272    is nonzero, otherwise `src' (the source and destination areas
  273    overlap).
  274
  275    If we are now in the middle of a composition sequence, the decoded
  276    character may be ALTCHAR (for the current composition).  In that
        case, the character goes to coding->cmp_data->data instead of
        `dst'.  More precisely: C is written to `dst' unless a composition
        other than COMPOSITION_RELATIVE or COMPOSITION_WITH_RULE is in
        progress, and C is recorded as a composition component whenever a
        composition other than COMPOSITION_RELATIVE is in progress (so for
        COMPOSITION_WITH_RULE both happen).  After recording a component,
        coding->composition_rule_follows is set to whether
        coding->composing differs from COMPOSITION_WITH_ALTCHARS.

        C appears several times in the expansion, so it should have no
        side effects.

        This macro is used in decoding routines.  */
  277
  278 #define EMIT_CHAR(c)                                                    \
  279   do {                                                                  \
  280     if (! COMPOSING_P (coding)                                          \
  281         || coding->composing == COMPOSITION_RELATIVE                    \
  282         || coding->composing == COMPOSITION_WITH_RULE)                  \
  283       {                                                                 \
  284         int bytes = CHAR_BYTES (c);                                     \
  285         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  286           {                                                             \
  287             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  288             goto label_end_of_loop;                                     \
  289           }                                                             \
  290         dst += CHAR_STRING (c, dst);                                    \
  291         coding->produced_char++;                                        \
  292       }                                                                 \
  293                                                                         \
  294     if (COMPOSING_P (coding)                                            \
  295         && coding->composing != COMPOSITION_RELATIVE)                   \
  296       {                                                                 \
  297         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  298         coding->composition_rule_follows                                \
  299           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  300       }                                                                 \
  301   } while (0)
 302
  303
     /* Store the byte C at `dst' and advance `dst'.  If `dst' has already
        reached its limit -- `dst_end' if `dst_bytes' is nonzero, else
        `src' -- set `coding->result' to CODING_FINISH_INSUFFICIENT_DST and
        jump to `label_end_of_loop' without storing anything.  */
  304 #define EMIT_ONE_BYTE(c)                                        \
  305   do {                                                          \
  306     if (dst >= (dst_bytes ? dst_end : src))                     \
  307       {                                                         \
  308         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  309         goto label_end_of_loop;                                 \
  310       }                                                         \
  311     *dst++ = c;                                                 \
  312   } while (0)
  313
     /* Store the bytes C1 and C2, in that order, at `dst' and advance
        `dst' by two.  If fewer than two bytes are available before the
        limit of the destination (`dst_end' if `dst_bytes' is nonzero, else
        `src'), set `coding->result' to CODING_FINISH_INSUFFICIENT_DST and
        jump to `label_end_of_loop' without storing either byte.  */
  314 #define EMIT_TWO_BYTES(c1, c2)                                  \
  315   do {                                                          \
  316     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
  317       {                                                         \
  318         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  319         goto label_end_of_loop;                                 \
  320       }                                                         \
  321     *dst++ = c1, *dst++ = c2;                                   \
  322   } while (0)
  323
     /* Copy the bytes from FROM up to (but not including) TO to `dst',
        advancing both `dst' and FROM; FROM must therefore be a modifiable
        pointer variable, and it is equal to TO once the copy is done.  If
        the TO - FROM bytes do not fit before the limit of the destination
        (`dst_end' if `dst_bytes' is nonzero, else `src'), set
        `coding->result' to CODING_FINISH_INSUFFICIENT_DST and jump to
        `label_end_of_loop' without copying anything.  */
  324 #define EMIT_BYTES(from, to)                                    \
  325   do {                                                          \
  326     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
  327       {                                                         \
  328         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
  329         goto label_end_of_loop;                                 \
  330       }                                                         \
  331     while (from < to)                                           \
  332       *dst++ = *from++;                                         \
  333   } while (0)
 334
 335 \f
 336 /*** 1. Preamble ***/
 337
 338 #ifdef emacs
 339 #include <config.h>
 340 #endif
 341
 342 #include <stdio.h>
 343
 344 #ifdef emacs
 345
 346 #include "lisp.h"
 347 #include "buffer.h"
 348 #include "charset.h"
 349 #include "composite.h"
 350 #include "ccl.h"
 351 #include "coding.h"
 352 #include "window.h"
 353 #include "intervals.h"
 354
 355 #else  /* not emacs */
 356
 357 #include "mulelib.h"
 358
 359 #endif /* not emacs */
  360
     /* Lisp objects referred to from C in this file.  They are only
        declared here; where they get their values is not shown in this
        part of the file.  Of these, coding_safe_chars below uses
        Qcoding_system as the symbol property that holds a coding system's
        spec vector, and Qsafe_chars as a key in the plist stored in that
        vector.  */
  361 Lisp_Object Qcoding_system, Qeol_type;
  362 Lisp_Object Qbuffer_file_coding_system;
  363 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  364 Lisp_Object Qno_conversion, Qundecided;
  365 Lisp_Object Qcoding_system_history;
  366 Lisp_Object Qsafe_chars;
  367 Lisp_Object Qvalid_codes;
  368 Lisp_Object Qascii_incompatible;
  369
  370 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  371 Lisp_Object Qcall_process, Qcall_process_region;
  372 Lisp_Object Qstart_process, Qopen_network_stream;
  373 Lisp_Object Qtarget_idx;
 374
 375 /* If a symbol has this property, evaluate the value to define the
 376    symbol as a coding system.  */
 377 Lisp_Object Qcoding_system_define_form;
 378
 379 Lisp_Object Vselect_safe_coding_system_function;
 380
 381 int coding_system_require_warning;
 382
 383 /* Mnemonic string for each format of end-of-line.  */
 384 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 385 /* Mnemonic string to indicate format of end-of-line is not yet
 386    decided.  */
 387 Lisp_Object eol_mnemonic_undecided;
 388
 389 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 390    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.
 391    This has an effect only for external encoding (i.e. for output to
 392    file and process), not for in-buffer or Lisp string encoding.  */
 393 int system_eol_type;
 394
 395 #ifdef emacs
 396
 397 /* Information about which coding system is safe for which chars.
 398    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 399
 400    GENERIC-LIST is a list of generic coding systems which can encode
 401    any characters.
 402
 403    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 404    corresponding char table that contains safe chars.  */
 405 Lisp_Object Vcoding_system_safe_chars;
 406
 407 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 408
 409 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 410
 411 /* Coding system emacs-mule and raw-text are for converting only
 412    end-of-line format.  */
 413 Lisp_Object Qemacs_mule, Qraw_text;
 414
 415 Lisp_Object Qutf_8;
 416
 417 /* Coding-systems are handed between Emacs Lisp programs and C internal
 418    routines by the following three variables.  */
 419 /* Coding-system for reading files and receiving data from process.  */
 420 Lisp_Object Vcoding_system_for_read;
 421 /* Coding-system for writing files and sending data to process.  */
 422 Lisp_Object Vcoding_system_for_write;
 423 /* Coding-system actually used in the latest I/O.  */
 424 Lisp_Object Vlast_coding_system_used;
 425
 426 /* A vector of length 256 which contains information about special
 427    Latin codes (especially for dealing with Microsoft codes).  */
 428 Lisp_Object Vlatin_extra_code_table;
 429
 430 /* Flag to inhibit code conversion of end-of-line format.  */
 431 int inhibit_eol_conversion;
 432
 433 /* Flag to inhibit ISO2022 escape sequence detection.  */
 434 int inhibit_iso_escape_detection;
 435
 436 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 437 int inherit_process_coding_system;
 438
 439 /* Coding system to be used to encode text for terminal display.  */
 440 struct coding_system terminal_coding;
 441
 442 /* Coding system to be used to encode text for terminal display when
 443    terminal coding system is nil.  */
 444 struct coding_system safe_terminal_coding;
 445
 446 /* Coding system of what is sent from terminal keyboard.  */
 447 struct coding_system keyboard_coding;
 448
 449 /* Default coding system to be used to write a file.  */
 450 struct coding_system default_buffer_file_coding;
 451
 452 Lisp_Object Vfile_coding_system_alist;
 453 Lisp_Object Vprocess_coding_system_alist;
 454 Lisp_Object Vnetwork_coding_system_alist;
 455
 456 Lisp_Object Vlocale_coding_system;
 457
 458 #endif /* emacs */
 459
 460 Lisp_Object Qcoding_category, Qcoding_category_index;
 461
 462 /* List of symbols `coding-category-xxx' ordered by priority.  */
 463 Lisp_Object Vcoding_category_list;
 464
 465 /* Table of coding categories (Lisp symbols).  */
 466 Lisp_Object Vcoding_category_table;
 467
  468 /* Table of the names of the symbols for each coding category.  The
        array is sized by CODING_CATEGORY_IDX_MAX and 15 names are given;
        presumably the order must match the CODING_CATEGORY_IDX_XXX
        indices -- TODO confirm against coding.h.  */
  469 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  470   "coding-category-emacs-mule",
  471   "coding-category-sjis",
  472   "coding-category-iso-7",
  473   "coding-category-iso-7-tight",
  474   "coding-category-iso-8-1",
  475   "coding-category-iso-8-2",
  476   "coding-category-iso-7-else",
  477   "coding-category-iso-8-else",
  478   "coding-category-ccl",
  479   "coding-category-big5",
  480   "coding-category-utf-8",
  481   "coding-category-utf-16-be",
  482   "coding-category-utf-16-le",
  483   "coding-category-raw-text",
  484   "coding-category-binary"
  485 };
 486
 487 /* Table of pointers to coding systems corresponding to each coding
 488    categories.  */
 489 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 490
 491 /* Table of coding category masks.  Nth element is a mask for a coding
 492    category of which priority is Nth.  */
 493 static
 494 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 495
 496 /* Flag to tell if we look up translation table on character code
 497    conversion.  */
 498 Lisp_Object Venable_character_translation;
 499 /* Standard translation table to look up on decoding (reading).  */
 500 Lisp_Object Vstandard_translation_table_for_decode;
 501 /* Standard translation table to look up on encoding (writing).  */
 502 Lisp_Object Vstandard_translation_table_for_encode;
 503
 504 Lisp_Object Qtranslation_table;
 505 Lisp_Object Qtranslation_table_id;
 506 Lisp_Object Qtranslation_table_for_decode;
 507 Lisp_Object Qtranslation_table_for_encode;
 508
 509 /* Alist of charsets vs revision number.  */
 510 Lisp_Object Vcharset_revision_alist;
 511
 512 /* Default coding systems used for process I/O.  */
 513 Lisp_Object Vdefault_process_coding_system;
 514
 515 /* Char table for translating Quail and self-inserting input.  */
 516 Lisp_Object Vtranslation_table_for_input;
 517
 518 /* Global flag to tell that we can't call post-read-conversion and
 519    pre-write-conversion functions.  Usually the value is zero, but it
 520    is set to 1 temporarily while such functions are running.  This is
 521    to avoid infinite recursive call.  */
 522 static int inhibit_pre_post_conversion;
 523
 524 Lisp_Object Qchar_coding_system;
 525
 526 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 527    its validity.  */
 528
 529 Lisp_Object
 530 coding_safe_chars (coding_system)
 531      Lisp_Object coding_system;
 532 {
 533   Lisp_Object coding_spec, plist, safe_chars;
 534
 535   coding_spec = Fget (coding_system, Qcoding_system);
 536   plist = XVECTOR (coding_spec)->contents[3];
 537   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 538   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 539 }
  540
     /* Nonzero if character C is safe according to SAFE_CHARS: either
        SAFE_CHARS is t (which is what coding_safe_chars returns when there
        is no char table), or it is a char table whose entry for C is
        non-nil.  SAFE_CHARS appears twice in the expansion, so it should
        have no side effects.  */
  541 #define CODING_SAFE_CHAR_P(safe_chars, c) \
  542   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 543
 544 \f
 545 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 546
 547 /* Emacs' internal format for representation of multiple character
 548    sets is a kind of multi-byte encoding, i.e. characters are
 549    represented by variable-length sequences of one-byte codes.
 550
 551    ASCII characters and control characters (e.g. `tab', `newline') are
 552    represented by one-byte sequences which are their ASCII codes, in
 553    the range 0x00 through 0x7F.
 554
 555    8-bit characters of the range 0x80..0x9F are represented by
 556    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 557    code + 0x20).
 558
 559    8-bit characters of the range 0xA0..0xFF are represented by
 560    one-byte sequences which are their 8-bit code.
 561
 562    The other characters are represented by a sequence of `base
 563    leading-code', optional `extended leading-code', and one or two
 564    `position-code's.  The length of the sequence is determined by the
 565    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 566    whereas extended leading-code and position-code take the range 0xA0
 567    through 0xFF.  See `charset.h' for more details about leading-code
 568    and position-code.
 569
 570    --- CODE RANGE of Emacs' internal format ---
 571    character set        range
 572    -------------        -----
 573    ascii                0x00..0x7F
 574    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  575    eight-bit-graphic    0xA0..0xFF
 576    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 577    ---------------------------------------------
 578
 579    As this is the internal character representation, the format is
 580    usually not used externally (i.e. in a file or in a data sent to a
 581    process).  But, it is possible to have a text externally in this
 582    format (i.e. by encoding by the coding system `emacs-mule').
 583
 584    In that case, a sequence of one-byte codes has a slightly different
 585    form.
 586
 587    Firstly, all characters in eight-bit-control are represented by
 588    one-byte sequences which are their 8-bit code.
 589
 590    Next, character composition data are represented by the byte
 591    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 592    where,
 593         METHOD is 0xF0 plus one of composition method (enum
 594         composition_method),
 595
 596         BYTES is 0xA0 plus the byte length of these composition data,
 597
 598         CHARS is 0xA0 plus the number of characters composed by these
 599         data,
 600
  601         COMPONENTs are characters in multibyte form or composition
  602         rules encoded as two bytes of ASCII codes.
 603
 604    In addition, for backward compatibility, the following formats are
 605    also recognized as composition data on decoding.
 606
 607    0x80 MSEQ ...
 608    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 609
 610    Here,
  611         MSEQ is a multibyte form but in this special format:
 612           ASCII: 0xA0 ASCII_CODE+0x80,
 613           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 614         RULE is a one byte code of the range 0xA0..0xF0 that
 615         represents a composition rule.
 616   */
  617
     /* Table of 256 code classes.  NOTE(review): presumably indexed by
        byte value and giving the class of that byte as a code of Emacs'
        internal format; it is neither filled in nor read in this part of
        the file -- confirm against its users.  */
  618 enum emacs_code_class_type emacs_code_class[256];
 619
  620 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  621    Check if a text is encoded in Emacs' internal format.  If it is,
  622    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.

        The text examined is the bytes from SRC up to (but not including)
        SRC_END.  If MULTIBYTEP is nonzero, 8-bit codes of the range
        0x80..0x9F are in multibyte form.  The scan stops with 0 at the
        first byte that rules the format out; reaching SRC_END without
        finding such a byte yields CODING_CATEGORY_MASK_EMACS_MULE.  */
  623
  624 static int
  625 detect_coding_emacs_mule (src, src_end, multibytep)
  626       unsigned char *src, *src_end;
  627       int multibytep;
  628 {
  629   unsigned char c;
  630   int composing = 0;
  631   /* ONE_MORE_BYTE_CHECK_MULTIBYTE stores a result code through
          `coding'; give it a scratch structure whose contents are never
          looked at.  */
  632   struct coding_system dummy_coding;
  633   struct coding_system *coding = &dummy_coding;
  634
  635   while (1)
  636     {
           /* When the source is exhausted here, the macro returns
              CODING_CATEGORY_MASK_EMACS_MULE from this function: nothing
              contradicting the format has been seen.  */
  637       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
  638                                      CODING_CATEGORY_MASK_EMACS_MULE);
  639       if (composing)
  640         {
               /* Inside an old-style composition sequence (see the MSEQ
                  description above).  A byte below 0xA0 ends the
                  sequence.  */
  641           if (c < 0xA0)
  642             composing = 0;
  643           else if (c == 0xA0)
  644             {
                   /* 0xA0 introduces an ASCII character stored as
                      ASCII_CODE+0x80.  If the source ends right after
                      0xA0, the macro returns 0 from this function.  */
  645               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
  646               c &= 0x7F;
  647             }
  648           else
                 /* Any other byte is a code shifted up by 0x20; undo the
                    shift before classifying it below.  */
  649             c -= 0x20;
  650         }
  651
  652       if (c < 0x20)
  653         {
               /* Among the control codes, ESC, SI and SO rule out this
                  format.  */
  654           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  655             return 0;
  656         }
  657       else if (c >= 0x80 && c < 0xA0)
  658         {
  659           if (c == 0x80)
  660             /* Old leading code for a composite character.  */
  661             composing = 1;
  662           else
  663             {
                   /* Any other byte in 0x81..0x9F must start a valid
                      multibyte sequence, as judged by
                      UNIBYTE_STR_AS_MULTIBYTE_P on the text beginning at
                      the byte before `src'.  That macro is relied on to
                      set BYTES to the length of the sequence, which is
                      then skipped as a whole.  */
  664               unsigned char *src_base = src - 1;
  665               int bytes;
  666
  667               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  668                                                bytes))
  669                 return 0;
  670               src = src_base + bytes;
  671             }
  672         }
  673     }
  674 }
 675
 676
 677 /* Record the starting position START and METHOD of one composition.  */
 678
 679 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 680   do {                                                          \
 681     struct composition_data *cmp_data = coding->cmp_data;       \
 682     int *data = cmp_data->data + cmp_data->used;                \
 683     coding->cmp_data_start = cmp_data->used;                    \
 684     data[0] = -1;                                               \
 685     data[1] = cmp_data->char_offset + start;                    \
 686     data[3] = (int) method;                                     \
 687     cmp_data->used += 4;                                        \
 688   } while (0)
 689
 690 /* Record the ending position END of the current composition.  */
 691
 692 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 693   do {                                                          \
 694     struct composition_data *cmp_data = coding->cmp_data;       \
 695     int *data = cmp_data->data + coding->cmp_data_start;        \
 696     data[0] = cmp_data->used - coding->cmp_data_start;          \
 697     data[2] = cmp_data->char_offset + end;                      \
 698   } while (0)
 699
 700 /* Record one COMPONENT (alternate character or composition rule).  */
 701
 702 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 703   do {                                                                  \
 704     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 705     if (coding->cmp_data->used - coding->cmp_data_start                 \
 706         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 707       {                                                                 \
 708         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 709         coding->composing = COMPOSITION_NO;                             \
 710       }                                                                 \
 711   } while (0)
 712
 713
  714 /* Get one byte from the data pointed to by SRC and increment SRC.  If
  715    SRC is not less than SRC_END, yield -1 without incrementing SRC.
        The caller's variables `src' and `src_end' are used directly.  */
  716
  717 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 718
 719
 720 /* Decode a character represented as a component of composition
 721    sequence of Emacs 20 style at SRC.  Set C to that character, store
 722    its multibyte form sequence at P, and set P to the end of that
 723    sequence.  If no valid character is found, set C to -1.  */
 724
 725 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 726   do {                                                          \
 727     int bytes;                                                  \
 728                                                                 \
 729     c = SAFE_ONE_MORE_BYTE ();                                  \
 730     if (c < 0)                                                  \
 731       break;                                                    \
 732     if (CHAR_HEAD_P (c))                                        \
 733       c = -1;                                                   \
 734     else if (c == 0xA0)                                         \
 735       {                                                         \
 736         c = SAFE_ONE_MORE_BYTE ();                              \
 737         if (c < 0xA0)                                           \
 738           c = -1;                                               \
 739         else                                                    \
 740           {                                                     \
 741             c -= 0x80;                                          \
 742             *p++ = c;                                           \
 743           }                                                     \
 744       }                                                         \
 745     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 746       {                                                         \
 747         unsigned char *p0 = p;                                  \
 748                                                                 \
 749         c -= 0x20;                                              \
 750         *p++ = c;                                               \
 751         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 752         while (--bytes)                                         \
 753           {                                                     \
 754             c = SAFE_ONE_MORE_BYTE ();                          \
 755             if (c < 0)                                          \
 756               break;                                            \
 757             *p++ = c;                                           \
 758           }                                                     \
 759         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 760             || (coding->flags /* We are recovering a file.  */  \
 761                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 762                 && ! CHAR_HEAD_P (p0[1])))                      \
 763           c = STRING_CHAR (p0, bytes);                          \
 764         else                                                    \
 765           c = -1;                                               \
 766       }                                                         \
 767     else                                                        \
 768       c = -1;                                                   \
 769   } while (0)
 770
 771
 772 /* Decode a composition rule represented as a component of composition
 773    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 774    valid rule is found, set C to -1.  */
 775
 776 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 777   do {                                                  \
 778     c = SAFE_ONE_MORE_BYTE ();                          \
 779     c -= 0xA0;                                          \
 780     if (c < 0 || c >= 81)                               \
 781       c = -1;                                           \
 782     else                                                \
 783       {                                                 \
 784         gref = c / 9, nref = c % 9;                     \
 785         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 786       }                                                 \
 787   } while (0)
 788
 789
 790 /* Decode composition sequence encoded by `emacs-mule' at the source
 791    pointed by SRC.  SRC_END is the end of source.  Store information
 792    of the composition in CODING->cmp_data.
 793
 794    For backward compatibility, decode also a composition sequence of
 795    Emacs 20 style.  In that case, the composition sequence contains
 796    characters that should be extracted into a buffer or string.  Store
 797    those characters at *DESTINATION in multibyte form.
 798
 799    If we encounter an invalid byte sequence, return 0.
 800    If we encounter an insufficient source or destination, or
 801    insufficient space in CODING->cmp_data, return 1.
 802    Otherwise, return consumed bytes in the source.
 803
 804 */
 805 static INLINE int
 806 decode_composition_emacs_mule (coding, src, src_end,
 807                                destination, dst_end, dst_bytes)
 808      struct coding_system *coding;
 809      const unsigned char *src, *src_end;
 810      unsigned char **destination, *dst_end;
 811      int dst_bytes;
 812 {
 813   unsigned char *dst = *destination;
 814   int method, data_len, nchars;
 815   const unsigned char *src_base = src++;
 816   /* Store components of composition.  */
 817   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 818   int ncomponent;
 819   /* Store multibyte form of characters to be composed.  This is for
 820      Emacs 20 style composition sequence.  */
 821   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 822   unsigned char *bufp = buf;
 823   int c, i, gref, nref;
 824
 825   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 826       >= COMPOSITION_DATA_SIZE)
 827     {
 828       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 829       return -1;
 830     }
 831
 832   ONE_MORE_BYTE (c);
 833   if (c - 0xF0 >= COMPOSITION_RELATIVE
 834            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 835     {
 836       int with_rule;
 837
 838       method = c - 0xF0;
 839       with_rule = (method == COMPOSITION_WITH_RULE
 840                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 841       ONE_MORE_BYTE (c);
 842       data_len = c - 0xA0;
 843       if (data_len < 4
 844           || src_base + data_len > src_end)
 845         return 0;
 846       ONE_MORE_BYTE (c);
 847       nchars = c - 0xA0;
 848       if (c < 1)
 849         return 0;
 850       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 851         {
 852           /* If it is longer than this, it can't be valid.  */
 853           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 854             return 0;
 855
 856           if (ncomponent % 2 && with_rule)
 857             {
 858               ONE_MORE_BYTE (gref);
 859               gref -= 32;
 860               ONE_MORE_BYTE (nref);
 861               nref -= 32;
 862               c = COMPOSITION_ENCODE_RULE (gref, nref);
 863             }
 864           else
 865             {
 866               int bytes;
 867               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 868                   || (coding->flags /* We are recovering a file.  */
 869                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 870                       && ! CHAR_HEAD_P (src[1])))
 871                 c = STRING_CHAR (src, bytes);
 872               else
 873                 c = *src, bytes = 1;
 874               src += bytes;
 875             }
 876           component[ncomponent] = c;
 877         }
 878     }
 879   else if (c >= 0x80)
 880     {
 881       /* This may be an old Emacs 20 style format.  See the comment at
 882          the section 2 of this file.  */
 883       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 884       if (src == src_end
 885           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 886         goto label_end_of_loop;
 887
 888       src_end = src;
 889       src = src_base + 1;
 890       if (c < 0xC0)
 891         {
 892           method = COMPOSITION_RELATIVE;
 893           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 894             {
 895               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 896               if (c < 0)
 897                 break;
 898               component[ncomponent++] = c;
 899             }
 900           if (ncomponent < 2)
 901             return 0;
 902           nchars = ncomponent;
 903         }
 904       else if (c == 0xFF)
 905         {
 906           method = COMPOSITION_WITH_RULE;
 907           src++;
 908           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 909           if (c < 0)
 910             return 0;
 911           component[0] = c;
 912           for (ncomponent = 1;
 913                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 914             {
 915               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 916               if (c < 0)
 917                 break;
 918               component[ncomponent++] = c;
 919               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 920               if (c < 0)
 921                 break;
 922               component[ncomponent++] = c;
 923             }
 924           if (ncomponent < 3)
 925             return 0;
 926           nchars = (ncomponent + 1) / 2;
 927         }
 928       else
 929         return 0;
 930     }
 931   else
 932     return 0;
 933
 934   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 935     {
 936       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 937       for (i = 0; i < ncomponent; i++)
 938         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 939       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 940       if (buf < bufp)
 941         {
 942           unsigned char *p = buf;
 943           EMIT_BYTES (p, bufp);
 944           *destination += bufp - buf;
 945           coding->produced_char += nchars;
 946         }
 947       return (src - src_base);
 948     }
 949  label_end_of_loop:
 950   return -1;
 951 }
 952
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SRC_BYTES bytes of `emacs-mule' text at SOURCE into
   DESTINATION.  DST_BYTES is the size of the destination area; when
   it is zero the current source position is used as the output limit
   instead of DESTINATION + DST_BYTES.

   On return, CODING->consumed and CODING->consumed_char are both set
   to the number of source bytes handled, CODING->produced to the
   number of bytes stored, and CODING->produced_char to the number of
   characters stored.  CODING->result is set here when an
   inconsistent end-of-line or a full destination stops the loop.  */

static void
decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* SRC_BASE remembers the start position in source in each loop.
     The loop will be exited when there's not enough source code, or
     when there's not enough destination area to produce a
     character.  */
  const unsigned char *src_base;

  coding->produced_char = 0;
  while ((src_base = src) < src_end)
    {
      /* TMP holds the multibyte form built for a byte that cannot be
         copied as is; P points at the bytes to copy, BYTES is their
         count.  */
      unsigned char tmp[MAX_MULTIBYTE_LENGTH];
      const unsigned char *p;
      int bytes;

      if (*src == '\r')
        {
          int c = *src++;

          /* A lone CR is a newline for CR-style text; for CRLF-style
             text it is one only when LF follows, otherwise the CR is
             kept and the byte after it is looked at again.  */
          if (coding->eol_type == CODING_EOL_CR)
            c = '\n';
          else if (coding->eol_type == CODING_EOL_CRLF)
            {
              ONE_MORE_BYTE (c);
              if (c != '\n')
                {
                  src--;
                  c = '\r';
                }
            }
          /* NOTE(review): this store is not preceded by a check of
             the remaining destination space -- confirm that callers
             guarantee room.  */
          *dst++ = c;
          coding->produced_char++;
          continue;
        }
      else if (*src == '\n')
        {
          /* A bare LF in text declared as CR or CRLF is an
             inconsistency; stop if the caller asked to be told.  */
          if ((coding->eol_type == CODING_EOL_CR
               || coding->eol_type == CODING_EOL_CRLF)
              && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
            {
              coding->result = CODING_FINISH_INCONSISTENT_EOL;
              goto label_end_of_loop;
            }
          /* NOTE(review): stored without a destination space check,
             like the CR case above.  */
          *dst++ = *src++;
          coding->produced_char++;
          continue;
        }
      else if (*src == 0x80 && coding->cmp_data)
        {
          /* Start of composition data.  */
          int consumed  = decode_composition_emacs_mule (coding, src, src_end,
                                                         &dst, dst_end,
                                                         dst_bytes);
          /* Negative: not enough source, destination or composition
             space.  Positive: that many source bytes were handled.
             Zero: not a valid composition, so the 0x80 byte is
             decoded below as an ordinary character.  */
          if (consumed < 0)
            goto label_end_of_loop;
          else if (consumed > 0)
            {
              src += consumed;
              continue;
            }
          bytes = CHAR_STRING (*src, tmp);
          p = tmp;
          src++;
        }
      else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
               || (coding->flags /* We are recovering a file.  */
                   && src[0] == LEADING_CODE_8_BIT_CONTROL
                   && ! CHAR_HEAD_P (src[1])))
        {
          /* A well-formed multibyte sequence: copy it unchanged.  */
          p = src;
          src += bytes;
        }
      else
        {
          int i, c;

          /* Not a well-formed sequence.  Read as many bytes as the
             leading byte announces, stopping early at the next
             character head.  */
          bytes = BYTES_BY_CHAR_HEAD (*src);
          src++;
          for (i = 1; i < bytes; i++)
            {
              ONE_MORE_BYTE (c);
              if (CHAR_HEAD_P (c))
                break;
            }
          if (i < bytes)
            {
              /* The sequence is cut short: convert only its first
                 byte with CHAR_STRING and resume right after it.  */
              bytes = CHAR_STRING (*src_base, tmp);
              p = tmp;
              src = src_base + 1;
            }
          else
            {
              /* Full length was read: copy the bytes as they are.  */
              p = src_base;
            }
        }
      /* The limit is DST_END when a destination size was given,
         otherwise the current source position.  */
      if (dst + bytes >= (dst_bytes ? dst_end : src))
        {
          coding->result = CODING_FINISH_INSUFFICIENT_DST;
          break;
        }
      while (bytes--) *dst++ = *p++;
      coding->produced_char++;
    }
 label_end_of_loop:
  /* SRC_BASE is the start of the first unit that was not completely
     handled, so everything before it counts as consumed.  */
  coding->consumed = coding->consumed_char = src_base - source;
  coding->produced = dst - destination;
}
1071
1072
/* Encode the composition record at DATA as the special byte sequence
   that introduces a composition in `emacs-mule' and append it at DST:

        0x80 METHOD COMPONENTS-BYTES COMPOSED-CHARS COMPONENT ...

   METHOD is 0xF0 + DATA[3], COMPONENTS-BYTES is 0xA0 + the length of
   the whole sequence, and COMPOSED-CHARS is 0xA0 + (DATA[2] - DATA[1]).
   For the rule-based methods every component after the first is
   preceded by a rule, written as two bytes (0x20 + GREF, 0x20 + NREF).

   If the destination cannot take the sequence, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.
   Otherwise advance CODING->cmp_data_start past this record and, when
   the current CODING->cmp_data is used up and has a successor, move
   on to it.

   SRC, DST, DST_END and DST_BYTES are taken from the scope where the
   macro is expanded.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char seq[1024];                                            \
    unsigned char *seq_end = seq + 4;                                   \
    unsigned char *seq_ptr;                                             \
    int ndata = data[0];                                                \
    int idx;                                                            \
                                                                        \
    seq[0] = 0x80;                                                      \
    seq[1] = 0xF0 + data[3];    /* METHOD */                            \
    seq[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
    if (data[3] != COMPOSITION_WITH_RULE                                \
        && data[3] != COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        /* Components are characters only.  */                          \
        idx = 4;                                                        \
        while (idx < ndata)                                             \
          {                                                             \
            seq_end += CHAR_STRING (data[idx], seq_end);                \
            idx++;                                                      \
          }                                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* First character, then (rule, character) pairs.  */           \
        seq_end += CHAR_STRING (data[4], seq_end);                      \
        for (idx = 5; idx < ndata; idx += 2)                            \
          {                                                             \
            int gref, nref;                                             \
                                                                        \
            COMPOSITION_DECODE_RULE (data[idx], gref, nref);            \
            *seq_end++ = 0x20 + gref;                                   \
            *seq_end++ = 0x20 + nref;                                   \
            seq_end += CHAR_STRING (data[idx + 1], seq_end);            \
          }                                                             \
      }                                                                 \
    seq[2] = 0xA0 + (seq_end - seq);    /* COMPONENTS-BYTES */          \
                                                                        \
    if (dst + (seq_end - seq) + 4 > (dst_bytes ? dst_end : src))        \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (seq_ptr = seq; seq_ptr < seq_end; seq_ptr++)                   \
      *dst++ = *seq_ptr;                                                \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1122
1123
/* Forward declaration: encode_coding_emacs_mule below hands the whole
   conversion to encode_eol when there is no composition data.  The
   arguments are (CODING, SOURCE, DESTINATION, SRC_BYTES, DST_BYTES),
   in the order used by that call.  */
static void encode_eol P_ ((struct coding_system *, const unsigned char *,
                            unsigned char *, int, int));
1126
/* Encode SRC_BYTES bytes at SOURCE into `emacs-mule' at DESTINATION,
   whose size is DST_BYTES.  Composition records found in
   CODING->cmp_data are written out in front of the character they
   start at, and newlines are converted according to
   CODING->eol_type.

   On return, CODING->consumed is the number of source bytes handled,
   CODING->consumed_char has been advanced by one per character
   handled, and CODING->produced and CODING->produced_char are both
   the number of bytes stored.  When there is no composition data the
   whole job is delegated to encode_eol.  */
static void
encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
     struct coding_system *coding;
     const unsigned char *source;
     unsigned char *destination;
     int src_bytes, dst_bytes;
{
  const unsigned char *src = source;
  const unsigned char *src_end = source + src_bytes;
  unsigned char *dst = destination;
  unsigned char *dst_end = destination + dst_bytes;
  /* Start of the character handled by the current iteration.  */
  const unsigned char *src_base;
  int c;
  /* CHAR_OFFSET and DATA cache fields of the current composition
     data; both are refreshed after each record is written.  */
  int char_offset;
  int *data;

  /* NOTE(review): not referenced directly in this function;
     presumably read by the ONE_MORE_CHAR macro -- confirm.  */
  Lisp_Object translation_table;

  translation_table = Qnil;

  /* Optimization for the case that there's no composition.  */
  if (!coding->cmp_data || coding->cmp_data->used == 0)
    {
      encode_eol (coding, source, destination, src_bytes, dst_bytes);
      return;
    }

  char_offset = coding->cmp_data->char_offset;
  data = coding->cmp_data->data + coding->cmp_data_start;
  /* The loop has no exit of its own.  NOTE(review): it is presumably
     left through a jump to label_end_of_loop made by ONE_MORE_CHAR,
     the EMIT_* macros or ENCODE_COMPOSITION_EMACS_MULE when source
     or destination runs out -- confirm.  */
  while (1)
    {
      src_base = src;

      /* If SRC starts a composition, encode the information about the
         composition in advance.  */
      if (coding->cmp_data_start < coding->cmp_data->used
          && char_offset + coding->consumed_char == data[1])
        {
          ENCODE_COMPOSITION_EMACS_MULE (coding, data);
          /* The macro may have moved to the next composition data.  */
          char_offset = coding->cmp_data->char_offset;
          data = coding->cmp_data->data + coding->cmp_data_start;
        }

      ONE_MORE_CHAR (c);
      if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
                        || coding->eol_type == CODING_EOL_CR))
        {
          /* Newline becomes CR LF or a lone CR.  */
          if (coding->eol_type == CODING_EOL_CRLF)
            EMIT_TWO_BYTES ('\r', c);
          else
            EMIT_ONE_BYTE ('\r');
        }
      else if (SINGLE_BYTE_CHAR_P (c))
        {
          if (coding->flags && ! ASCII_BYTE_P (c))
            {
              /* As we are auto saving, retain the multibyte form for
                 8-bit chars.  */
              unsigned char buf[MAX_MULTIBYTE_LENGTH];
              int bytes = CHAR_STRING (c, buf);

              if (bytes == 1)
                EMIT_ONE_BYTE (buf[0]);
              else
                EMIT_TWO_BYTES (buf[0], buf[1]);
            }
          else
            EMIT_ONE_BYTE (c);
        }
      else
        /* A multibyte character: copy its source bytes unchanged.  */
        EMIT_BYTES (src_base, src);
      coding->consumed_char++;
    }
 label_end_of_loop:
  coding->consumed = src_base - source;
  coding->produced = coding->produced_char = dst - destination;
  return;
}
1205
1206 \f
1207 /*** 3. ISO2022 handlers ***/
1208
1209 /* The following note describes the coding system ISO2022 briefly.
1210    Since the intention of this note is to help understand the
1211    functions in this file, some parts are NOT ACCURATE or are OVERLY
1212    SIMPLIFIED.  For thorough understanding, please refer to the
1213    original document of ISO2022.  This is equivalent to the standard
1214    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1215
1216    ISO2022 provides many mechanisms to encode several character sets
1217    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1218    is encoded using bytes less than 128.  This may make the encoded
1219    text a little bit longer, but the text passes more easily through
1220    several types of gateway, some of which strip off the MSB (Most
1221    Significant Bit).
1222
1223    There are two kinds of character sets: control character sets and
1224    graphic character sets.  The former contain control characters such
1225    as `newline' and `escape' to provide control functions (control
1226    functions are also provided by escape sequences).  The latter
1227    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1228    two control character sets and many graphic character sets.
1229
1230    Graphic character sets are classified into one of the following
1231    four classes, according to the number of bytes (DIMENSION) and
1232    number of characters in one dimension (CHARS) of the set:
1233    - DIMENSION1_CHARS94
1234    - DIMENSION1_CHARS96
1235    - DIMENSION2_CHARS94
1236    - DIMENSION2_CHARS96
1237
1238    In addition, each character set is assigned an identification tag,
1239    unique for each set, called the "final character" (denoted as <F>
1240    hereafter).  The <F> of each character set is decided by ECMA(*)
1241    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1242    (0x30..0x3F are for private use only).
1243
1244    Note (*): ECMA = European Computer Manufacturers Association
1245
1246    Here are examples of graphic character sets [NAME(<F>)]:
1247         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1248         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1249         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1250         o DIMENSION2_CHARS96 -- none for the moment
1251
1252    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1253         C0 [0x00..0x1F] -- control character plane 0
1254         GL [0x20..0x7F] -- graphic character plane 0
1255         C1 [0x80..0x9F] -- control character plane 1
1256         GR [0xA0..0xFF] -- graphic character plane 1
1257
1258    A control character set is directly designated and invoked to C0 or
1259    C1 by an escape sequence.  The most common case is that:
1260    - ISO646's  control character set is designated/invoked to C0, and
1261    - ISO6429's control character set is designated/invoked to C1,
1262    and usually these designations/invocations are omitted in encoded
1263    text.  In a 7-bit environment, only C0 can be used, and a control
1264    character for C1 is encoded by an appropriate escape sequence to
1265    fit into the environment.  All control characters for C1 are
1266    defined to have corresponding escape sequences.
1267
1268    A graphic character set is at first designated to one of four
1269    graphic registers (G0 through G3), then these graphic registers are
1270    invoked to GL or GR.  These designations and invocations can be
1271    done independently.  The most common case is that G0 is invoked to
1272    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1273    these invocations and designations are omitted in encoded text.
1274    In a 7-bit environment, only GL can be used.
1275
1276    When a graphic character set of CHARS94 is invoked to GL, codes
1277    0x20 and 0x7F of the GL area work as control characters SPACE and
1278    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1279    be used.
1280
1281    There are two ways of invocation: locking-shift and single-shift.
1282    With locking-shift, the invocation lasts until the next different
1283    invocation, whereas with single-shift, the invocation affects the
1284    following character only and doesn't affect the locking-shift
1285    state.  Invocations are done by the following control characters or
1286    escape sequences:
1287
1288    ----------------------------------------------------------------------
1289    abbrev  function                  cntrl escape seq   description
1290    ----------------------------------------------------------------------
1291    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1292    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1293    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1294    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1295    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1296    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1297    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1298    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1299    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1300    ----------------------------------------------------------------------
1301    (*) These are not used by any known coding system.
1302
1303    Control characters for these functions are defined by macros
1304    ISO_CODE_XXX in `coding.h'.
1305
1306    Designations are done by the following escape sequences:
1307    ----------------------------------------------------------------------
1308    escape sequence      description
1309    ----------------------------------------------------------------------
1310    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1311    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1312    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1313    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1314    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1315    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1316    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1317    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1318    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1319    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1320    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1321    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1322    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1323    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1324    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1325    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1326    ----------------------------------------------------------------------
1327
1328    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1329    of dimension 1, chars 94, and final character <F>, etc...
1330
1331    Note (*): Although these designations are not allowed in ISO2022,
1332    Emacs accepts them on decoding, and produces them on encoding
1333    CHARS96 character sets in a coding system which is characterized as
1334    7-bit environment, non-locking-shift, and non-single-shift.
1335
1336    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1337    '(' can be omitted.  We refer to this as "short-form" hereafter.
1338
1339    Now you may notice that there are a lot of ways of encoding the
1340    same multilingual text in ISO2022.  Actually, there exist many
1341    coding systems such as Compound Text (used in X11's inter client
   communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1343    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1344    localized platforms), and all of these are variants of ISO2022.
1345
1346    In addition to the above, Emacs handles two more kinds of escape
1347    sequences: ISO6429's direction specification and Emacs' private
1348    sequence for specifying character composition.
1349
1350    ISO6429's direction specification takes the following form:
1351         o CSI ']'      -- end of the current direction
1352         o CSI '0' ']'  -- end of the current direction
1353         o CSI '1' ']'  -- start of left-to-right text
1354         o CSI '2' ']'  -- start of right-to-left text
1355    The control character CSI (0x9B: control sequence introducer) is
1356    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1357
1358    Character composition specification takes the following form:
1359         o ESC '0' -- start relative composition
1360         o ESC '1' -- end composition
1361         o ESC '2' -- start rule-base composition (*)
1362         o ESC '3' -- start relative composition with alternate chars  (**)
1363         o ESC '4' -- start rule-base composition with alternate chars  (**)
1364   Since these are not standard escape sequences of any ISO standard,
1365   the use of them with these meanings is restricted to Emacs only.
1366
1367   (*) This form is used only in Emacs 20.5 and older versions,
1368   but the newer versions can safely decode it.
1369   (**) This form is used only in Emacs 21.1 and newer versions,
1370   and the older versions can't decode it.
1371
1372   Here's a list of example usages of these composition escape
1373   sequences (categorized by `enum composition_method').
1374
1375   COMPOSITION_RELATIVE:
1376         ESC 0 CHAR [ CHAR ] ESC 1
1377   COMPOSITION_WITH_RULE:
1378         ESC 2 CHAR [ RULE CHAR ] ESC 1
1379   COMPOSITION_WITH_ALTCHARS:
1380         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1381   COMPOSITION_WITH_RULE_ALTCHARS:
1382         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1383
/* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
   presumably indexed by byte value and filled in elsewhere; it is
   neither initialized nor read in this part of the file -- confirm.  */
enum iso_code_class_type iso_code_class[256];
1385
/* Non-zero if all of the following hold for the coding system stored
   at coding_system_table[IDX]:
     - the table entry is not null;
     - CHARSET is CHARSET_ASCII, or C passes CODING_SAFE_CHAR_P against
       the safe-character table of that coding system;
     - the coding system's requested designation for CHARSET is not
       CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION.

   Side effect: unless the entry is null or CHARSET is CHARSET_ASCII,
   the variable `safe_chars' of the expansion site is assigned the
   result of coding_safe_chars for that coding system.

   CHARSET is parenthesized in the comparison so that an argument
   containing a low-precedence operator (e.g. a conditional
   expression) is compared as a whole.  */
#define CHARSET_OK(idx, charset, c)                                     \
  (coding_system_table[idx]                                             \
   && ((charset) == CHARSET_ASCII                                       \
       || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
           CODING_SAFE_CHAR_P (safe_chars, c)))                         \
   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
                                              charset)                  \
       != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1394
/* Non-zero if CODING_SPEC_ISO_INITIAL_DESIGNATION for
   coding_system_table[IDX] and register argument 1 (presumably
   graphic register G1) is not negative.  NOTE(review): unlike
   CHARSET_OK, the table entry is not tested for null here -- confirm
   that callers only pass indices whose entry is set.  */
#define SHIFT_OUT_OK(idx) \
  (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1397
/* Non-zero unless the `composing' member of coding_system_table[IDX]
   is COMPOSITION_DISABLED.  NOTE(review): the table entry is
   dereferenced without a null test -- confirm that callers only pass
   indices whose entry is set.  */
#define COMPOSITION_OK(idx)     \
  (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1400
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in ISO2022.  If it is, return an
   integer in which the appropriate flag bits, any of:
        CODING_CATEGORY_MASK_ISO_7
        CODING_CATEGORY_MASK_ISO_7_TIGHT
        CODING_CATEGORY_MASK_ISO_8_1
        CODING_CATEGORY_MASK_ISO_8_2
        CODING_CATEGORY_MASK_ISO_7_ELSE
        CODING_CATEGORY_MASK_ISO_8_ELSE
   are set.  If a code which should never appear in ISO2022 is found,
   return 0.  */
1412
1413 static int
1414 detect_coding_iso2022 (src, src_end, multibytep)
1415      unsigned char *src, *src_end;
1416      int multibytep;
1417 {
1418   int mask = CODING_CATEGORY_MASK_ISO;
1419   int mask_found = 0;
1420   int reg[4], shift_out = 0, single_shifting = 0;
1421   int c, c1, charset;
1422   /* Dummy for ONE_MORE_BYTE.  */
1423   struct coding_system dummy_coding;
1424   struct coding_system *coding = &dummy_coding;
1425   Lisp_Object safe_chars;
1426
1427   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1428   while (mask)
1429     {
1430       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1431     retry:
1432       switch (c)
1433         {
1434         case ISO_CODE_ESC:
1435           if (inhibit_iso_escape_detection)
1436             break;
1437           single_shifting = 0;
1438           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1439           if (c >= '(' && c <= '/')
1440             {
1441               /* Designation sequence for a charset of dimension 1.  */
1442               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found);
1443               if (c1 < ' ' || c1 >= 0x80
1444                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1445                 /* Invalid designation sequence.  Just ignore.  */
1446                 break;
1447               reg[(c - '(') % 4] = charset;
1448             }
1449           else if (c == '$')
1450             {
1451               /* Designation sequence for a charset of dimension 2.  */
1452               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found);
1453               if (c >= '@' && c <= 'B')
1454                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1455                 reg[0] = charset = iso_charset_table[1][0][c];
1456               else if (c >= '(' && c <= '/')
1457                 {
1458                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep,
1459                                                  mask & mask_found);
1460                   if (c1 < ' ' || c1 >= 0x80
1461                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1462                     /* Invalid designation sequence.  Just ignore.  */
1463                     break;
1464                   reg[(c - '(') % 4] = charset;
1465                 }
1466               else
1467                 /* Invalid designation sequence.  Just ignore.  */
1468                 break;
1469             }
1470           else if (c == 'N' || c == 'O')
1471             {
1472               /* ESC <Fe> for SS2 or SS3.  */
1473               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1474               break;
1475             }
1476           else if (c >= '0' && c <= '4')
1477             {
1478               /* ESC <Fp> for start/end composition.  */
1479               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1480                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1481               else
1482                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1483               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1484                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1485               else
1486                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1487               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1488                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1489               else
1490                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1491               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1492                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1493               else
1494                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1495               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1496                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1497               else
1498                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1499               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1500                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1501               else
1502                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1503               break;
1504             }
1505           else
1506             /* Invalid escape sequence.  Just ignore.  */
1507             break;
1508
1509           /* We found a valid designation sequence for CHARSET.  */
1510           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1511           c = MAKE_CHAR (charset, 0, 0);
1512           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1513             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1514           else
1515             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1516           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1517             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1518           else
1519             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1520           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1521             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1522           else
1523             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1524           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1525             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1526           else
1527             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1528           break;
1529
1530         case ISO_CODE_SO:
1531           if (inhibit_iso_escape_detection)
1532             break;
1533           single_shifting = 0;
1534           if (shift_out == 0
1535               && (reg[1] >= 0
1536                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1537                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1538             {
1539               /* Locking shift out.  */
1540               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1541               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1542             }
1543           break;
1544
1545         case ISO_CODE_SI:
1546           if (inhibit_iso_escape_detection)
1547             break;
1548           single_shifting = 0;
1549           if (shift_out == 1)
1550             {
1551               /* Locking shift in.  */
1552               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1553               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1554             }
1555           break;
1556
1557         case ISO_CODE_CSI:
1558           single_shifting = 0;
1559         case ISO_CODE_SS2:
1560         case ISO_CODE_SS3:
1561           {
1562             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1563
1564             if (inhibit_iso_escape_detection)
1565               break;
1566             if (c != ISO_CODE_CSI)
1567               {
1568                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1569                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1570                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1571                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1572                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1573                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1574                 single_shifting = 1;
1575               }
1576             if (VECTORP (Vlatin_extra_code_table)
1577                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1578               {
1579                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1580                     & CODING_FLAG_ISO_LATIN_EXTRA)
1581                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1582                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1583                     & CODING_FLAG_ISO_LATIN_EXTRA)
1584                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1585               }
1586             mask &= newmask;
1587             mask_found |= newmask;
1588           }
1589           break;
1590
1591         default:
1592           if (c < 0x80)
1593             {
1594               single_shifting = 0;
1595               break;
1596             }
1597           else if (c < 0xA0)
1598             {
1599               single_shifting = 0;
1600               if (VECTORP (Vlatin_extra_code_table)
1601                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1602                 {
1603                   int newmask = 0;
1604
1605                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1606                       & CODING_FLAG_ISO_LATIN_EXTRA)
1607                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1608                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1609                       & CODING_FLAG_ISO_LATIN_EXTRA)
1610                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1611                   mask &= newmask;
1612                   mask_found |= newmask;
1613                 }
1614               else
1615                 return 0;
1616             }
1617           else
1618             {
1619               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1620                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1621               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1622               /* Check the length of succeeding codes of the range
1623                  0xA0..0FF.  If the byte length is odd, we exclude
1624                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1625                  when we are not single shifting.  */
1626               if (!single_shifting
1627                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1628                 {
1629                   int i = 1;
1630
1631                   c = -1;
1632                   while (src < src_end)
1633                     {
1634                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep,
1635                                                      mask & mask_found);
1636                       if (c < 0xA0)
1637                         break;
1638                       i++;
1639                     }
1640
1641                   if (i & 1 && src < src_end)
1642                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1643                   else
1644                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1645                   if (c >= 0)
1646                     /* This means that we have read one extra byte.  */
1647                     goto retry;
1648                 }
1649             }
1650           break;
1651         }
1652     }
1653   return (mask & mask_found);
1654 }
1655
1656 /* Decode a character of which charset is CHARSET, the 1st position
1657    code is C1, the 2nd position code is C2, and return the decoded
1658    character code.  If the variable `translation_table' is non-nil,
1659    return the code translated through it by translate_char instead.
        `translation_table' must be visible where the macro is expanded.  */
1660
1661 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1662   (NILP (translation_table)                     \
1663    ? MAKE_CHAR (charset, c1, c2)                \
1664    : translate_char (translation_table, -1, charset, c1, c2))
1665
1666 /* Set designation state into CODING: designate to register REG the
        charset that ISO_CHARSET_TABLE gives for DIMENSION, CHARS and
        FINAL_CHAR.

        Control jumps to `label_invalid_code' without changing the
        designation when
          - FINAL_CHAR is below '0' or not below 128,
          - no charset is found, or the charset is neither requested for
            REG by CODING nor safe according to `safe_chars' (REG is then
            remembered in last_invalid_designation_register),
          - ASCII is designated to register 0 while
            last_invalid_designation_register is 0 (the field is reset to
            -1 first).
        Otherwise last_invalid_designation_register is reset to -1 and the
        charset is stored as the designation of REG; when
        CODING_MODE_DIRECTION is set in coding->mode and the charset has a
        reverse charset, the reverse charset is stored instead.

        The expanding scope must provide `coding', `safe_chars' and the
        label `label_invalid_code'.  The macro declares its own `charset'
        and `c', which hide variables of the same name in the caller, and
        it evaluates REG and FINAL_CHAR more than once.  */
1667 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1668   do {                                                                     \
1669     int charset, c;                                                        \
1670                                                                            \
1671     if (final_char < '0' || final_char >= 128)                             \
1672       goto label_invalid_code;                                             \
1673     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1674                                  make_number (chars),                      \
1675                                  make_number (final_char));                \
1676     c = MAKE_CHAR (charset, 0, 0);                                         \
1677     if (charset >= 0                                                       \
1678         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1679             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1680       {                                                                    \
1681         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1682             && reg == 0                                                    \
1683             && charset == CHARSET_ASCII)                                   \
1684           {                                                                \
1685             /* We should insert this designation sequence as is so         \
1686                that it is surely written back to a file.  */               \
1687             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1688             goto label_invalid_code;                                       \
1689           }                                                                \
1690         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1691         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1692             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1693           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1694         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1695       }                                                                    \
1696     else                                                                   \
1697       {                                                                    \
1698         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1699         goto label_invalid_code;                                           \
1700       }                                                                    \
1701   } while (0)
1702
1703 /* Allocate a new block for recording composition information and
1704    append it to the chain of blocks already attached to CODING.
        CHAR_OFFSET is stored in the new block, whose `used' count starts
        at 0.  On return the new block is coding->cmp_data,
        coding->cmp_data_start is 0 and coding->composing is
        COMPOSITION_NO.  */
1705
1706 void
1707 coding_allocate_composition_data (coding, char_offset)
1708      struct coding_system *coding;
1709      int char_offset;
1710 {
1711   struct composition_data *tail = coding->cmp_data, *block;
1712
1713   block = (struct composition_data *) xmalloc (sizeof *block);
1714   block->used = 0;
1715   block->char_offset = char_offset;
1716   block->next = NULL;
1717   block->prev = tail;
1718   if (tail != NULL)
1719     tail->next = block;
1720   coding->composing = COMPOSITION_NO;
1721   coding->cmp_data_start = 0;
1722   coding->cmp_data = block;
1723 }
1724
1725 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1726    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1727    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1728    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1729    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

        C1 is the byte that follows ESC.  Three cases are distinguished:
          - coding->composing is COMPOSITION_DISABLED: ESC and the low
            seven bits of C1 are written to DST as they are, and
            coding->produced_char is advanced by 2;
          - no composition is in progress: coding->composing is set to the
            method selected by C1 and the start of the composition is
            recorded.  If coding->cmp_data is missing or too full,
            coding->result becomes CODING_FINISH_INSUFFICIENT_CMP and
            control jumps to `label_end_of_loop';
          - a composition is already in progress: an ALTCHARS method is
            switched to COMPOSITION_RELATIVE, any other method is left
            alone.
        The expanding scope must provide `coding', `dst' and the label
        `label_end_of_loop'.
1730   */
1731
1732 #define DECODE_COMPOSITION_START(c1)                                       \
1733   do {                                                                     \
1734     if (coding->composing == COMPOSITION_DISABLED)                         \
1735       {                                                                    \
1736         *dst++ = ISO_CODE_ESC;                                             \
1737         *dst++ = c1 & 0x7f;                                                \
1738         coding->produced_char += 2;                                        \
1739       }                                                                    \
1740     else if (!COMPOSING_P (coding))                                        \
1741       {                                                                    \
1742         /* This is surely the start of a composition.  We must be sure     \
1743            that coding->cmp_data has enough space to store the             \
1744            information about the composition.  If not, terminate the       \
1745            current decoding loop, allocate one more memory block for       \
1746            coding->cmp_data in the caller, then start the decoding         \
1747            loop again.  We can't allocate memory here directly because     \
1748            it may cause buffer/string relocation.  */                      \
1749         if (!coding->cmp_data                                              \
1750             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1751                 >= COMPOSITION_DATA_SIZE))                                 \
1752           {                                                                \
1753             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1754             goto label_end_of_loop;                                        \
1755           }                                                                \
1756         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1757                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1758                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1759                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1760         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1761                                       coding->composing);                  \
1762         coding->composition_rule_follows = 0;                              \
1763       }                                                                    \
1764     else                                                                   \
1765       {                                                                    \
1766         /* We are already handling a composition.  If the method is        \
1767            the following two, the codes following the current escape       \
1768            sequence are actual characters stored in a buffer.  */          \
1769         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1770             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1771           {                                                                \
1772             coding->composing = COMPOSITION_RELATIVE;                      \
1773             coding->composition_rule_follows = 0;                          \
1774           }                                                                \
1775       }                                                                    \
1776   } while (0)
1777
1778 /* Handle composition end sequence ESC 1.  C1 is the final byte of
        the sequence ('1' in every use visible in this file section).
        If no composition is in progress, ESC and C1 are written to DST as
        they are and coding->produced_char is advanced by 2.  Otherwise
        the end of the composition is recorded at the current
        coding->produced_char and coding->composing becomes
        COMPOSITION_NO.  The expanding scope must provide `coding' and
        `dst'.
        NOTE(review): DST is not compared with the end of the destination
        here -- confirm that callers guarantee room for two bytes.  */
1779
1780 #define DECODE_COMPOSITION_END(c1)                                      \
1781   do {                                                                  \
1782     if (! COMPOSING_P (coding))                                         \
1783       {                                                                 \
1784         *dst++ = ISO_CODE_ESC;                                          \
1785         *dst++ = c1;                                                    \
1786         coding->produced_char += 2;                                     \
1787       }                                                                 \
1788     else                                                                \
1789       {                                                                 \
1790         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1791         coding->composing = COMPOSITION_NO;                             \
1792       }                                                                 \
1793   } while (0)
1794
1795 /* Decode a composition rule from the byte C1 (and maybe one more byte
1796    from SRC) and store one encoded composition rule in
1797    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  After the subtraction:
          - a value below 81 is the old one-byte format; the two
            reference points are C1 / 9 and C1 % 9, with the value 4
            replaced by 10;
          - a value from 81 to 92 is the new two-byte format; one more
            byte is read into the caller's `c2' with ONE_MORE_BYTE and the
            rule is encoded from C1 - 81 and C2 - 32;
          - for any other value the rule stays 0.
        The rule is then added as a composition component and
        coding->composition_rule_follows is cleared.  The expanding scope
        must provide `coding' and `c2'.  */
1798
1799 #define DECODE_COMPOSITION_RULE(c1)                                     \
1800   do {                                                                  \
1801     int rule = 0;                                                       \
1802     (c1) -= 32;                                                         \
1803     if (c1 < 81)                /* old format (before ver.21) */        \
1804       {                                                                 \
1805         int gref = (c1) / 9;                                            \
1806         int nref = (c1) % 9;                                            \
1807         if (gref == 4) gref = 10;                                       \
1808         if (nref == 4) nref = 10;                                       \
1809         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1810       }                                                                 \
1811     else if (c1 < 93)           /* new format (after ver.21) */         \
1812       {                                                                 \
1813         ONE_MORE_BYTE (c2);                                             \
1814         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1815       }                                                                 \
1816     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1817     coding->composition_rule_follows = 0;                               \
1818   } while (0)
1819
1820
1821 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1822
1823 static void
1824 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1825      struct coding_system *coding;
1826      const unsigned char *source;
1827      unsigned char *destination;
1828      int src_bytes, dst_bytes;
1829 {
1830   const unsigned char *src = source;
1831   const unsigned char *src_end = source + src_bytes;
1832   unsigned char *dst = destination;
1833   unsigned char *dst_end = destination + dst_bytes;
1834   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1835   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1836   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1837   /* SRC_BASE remembers the start position in source in each loop.
1838      The loop will be exited when there's not enough source code
1839      (within macro ONE_MORE_BYTE), or when there's not enough
1840      destination area to produce a character (within macro
1841      EMIT_CHAR).  */
1842   const unsigned char *src_base;
1843   int c, charset;
1844   Lisp_Object translation_table;
1845   Lisp_Object safe_chars;
1846
1847   safe_chars = coding_safe_chars (coding->symbol);
1848
1849   if (NILP (Venable_character_translation))
1850     translation_table = Qnil;
1851   else
1852     {
1853       translation_table = coding->translation_table_for_decode;
1854       if (NILP (translation_table))
1855         translation_table = Vstandard_translation_table_for_decode;
1856     }
1857
1858   coding->result = CODING_FINISH_NORMAL;
1859
1860   while (1)
1861     {
1862       int c1, c2 = 0;
1863
1864       src_base = src;
1865       ONE_MORE_BYTE (c1);
1866
1867       /* We produce no character or one character.  */
1868       switch (iso_code_class [c1])
1869         {
1870         case ISO_0x20_or_0x7F:
1871           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1872             {
1873               DECODE_COMPOSITION_RULE (c1);
1874               continue;
1875             }
1876           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1877             {
1878               /* This is SPACE or DEL.  */
1879               charset = CHARSET_ASCII;
1880               break;
1881             }
1882           /* This is a graphic character, we fall down ...  */
1883
1884         case ISO_graphic_plane_0:
1885           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1886             {
1887               DECODE_COMPOSITION_RULE (c1);
1888               continue;
1889             }
1890           charset = charset0;
1891           break;
1892
1893         case ISO_0xA0_or_0xFF:
1894           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1895               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1896             goto label_invalid_code;
1897           /* This is a graphic character, we fall down ... */
1898
1899         case ISO_graphic_plane_1:
1900           if (charset1 < 0)
1901             goto label_invalid_code;
1902           charset = charset1;
1903           break;
1904
1905         case ISO_control_0:
1906           if (COMPOSING_P (coding))
1907             DECODE_COMPOSITION_END ('1');
1908
1909           /* All ISO2022 control characters in this class have the
1910              same representation in Emacs internal format.  */
1911           if (c1 == '\n'
1912               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1913               && (coding->eol_type == CODING_EOL_CR
1914                   || coding->eol_type == CODING_EOL_CRLF))
1915             {
1916               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1917               goto label_end_of_loop;
1918             }
1919           charset = CHARSET_ASCII;
1920           break;
1921
1922         case ISO_control_1:
1923           if (COMPOSING_P (coding))
1924             DECODE_COMPOSITION_END ('1');
1925           goto label_invalid_code;
1926
1927         case ISO_carriage_return:
1928           if (COMPOSING_P (coding))
1929             DECODE_COMPOSITION_END ('1');
1930
1931           if (coding->eol_type == CODING_EOL_CR)
1932             c1 = '\n';
1933           else if (coding->eol_type == CODING_EOL_CRLF)
1934             {
1935               ONE_MORE_BYTE (c1);
1936               if (c1 != ISO_CODE_LF)
1937                 {
1938                   src--;
1939                   c1 = '\r';
1940                 }
1941             }
1942           charset = CHARSET_ASCII;
1943           break;
1944
1945         case ISO_shift_out:
1946           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1947               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1948             goto label_invalid_code;
1949           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1950           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1951           continue;
1952
1953         case ISO_shift_in:
1954           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1955             goto label_invalid_code;
1956           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1957           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1958           continue;
1959
1960         case ISO_single_shift_2_7:
1961         case ISO_single_shift_2:
1962           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1963             goto label_invalid_code;
1964           /* SS2 is handled as an escape sequence of ESC 'N' */
1965           c1 = 'N';
1966           goto label_escape_sequence;
1967
1968         case ISO_single_shift_3:
1969           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1970             goto label_invalid_code;
1971           /* SS3 is handled as an escape sequence of ESC 'O' */
1972           c1 = 'O';
1973           goto label_escape_sequence;
1974
1975         case ISO_control_sequence_introducer:
1976           /* CSI is handled as an escape sequence of ESC '[' ...  */
1977           c1 = '[';
1978           goto label_escape_sequence;
1979
1980         case ISO_escape:
1981           ONE_MORE_BYTE (c1);
1982         label_escape_sequence:
1983           /* Escape sequences handled by Emacs are invocation,
1984              designation, direction specification, and character
1985              composition specification.  */
1986           switch (c1)
1987             {
1988             case '&':           /* revision of following character set */
1989               ONE_MORE_BYTE (c1);
1990               if (!(c1 >= '@' && c1 <= '~'))
1991                 goto label_invalid_code;
1992               ONE_MORE_BYTE (c1);
1993               if (c1 != ISO_CODE_ESC)
1994                 goto label_invalid_code;
1995               ONE_MORE_BYTE (c1);
1996               goto label_escape_sequence;
1997
1998             case '$':           /* designation of 2-byte character set */
1999               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2000                 goto label_invalid_code;
2001               ONE_MORE_BYTE (c1);
2002               if (c1 >= '@' && c1 <= 'B')
2003                 {       /* designation of JISX0208.1978, GB2312.1980,
2004                            or JISX0208.1980 */
2005                   DECODE_DESIGNATION (0, 2, 94, c1);
2006                 }
2007               else if (c1 >= 0x28 && c1 <= 0x2B)
2008                 {       /* designation of DIMENSION2_CHARS94 character set */
2009                   ONE_MORE_BYTE (c2);
2010                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2011                 }
2012               else if (c1 >= 0x2C && c1 <= 0x2F)
2013                 {       /* designation of DIMENSION2_CHARS96 character set */
2014                   ONE_MORE_BYTE (c2);
2015                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2016                 }
2017               else
2018                 goto label_invalid_code;
2019               /* We must update these variables now.  */
2020               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2021               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2022               continue;
2023
2024             case 'n':           /* invocation of locking-shift-2 */
2025               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2026                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2027                 goto label_invalid_code;
2028               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2029               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2030               continue;
2031
2032             case 'o':           /* invocation of locking-shift-3 */
2033               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2034                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2035                 goto label_invalid_code;
2036               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2037               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2038               continue;
2039
2040             case 'N':           /* invocation of single-shift-2 */
2041               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2042                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2043                 goto label_invalid_code;
2044               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2045               ONE_MORE_BYTE (c1);
2046               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2047                 goto label_invalid_code;
2048               break;
2049
2050             case 'O':           /* invocation of single-shift-3 */
2051               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2052                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2053                 goto label_invalid_code;
2054               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2055               ONE_MORE_BYTE (c1);
2056               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2057                 goto label_invalid_code;
2058               break;
2059
2060             case '0': case '2': case '3': case '4': /* start composition */
2061               DECODE_COMPOSITION_START (c1);
2062               continue;
2063
2064             case '1':           /* end composition */
2065               DECODE_COMPOSITION_END (c1);
2066               continue;
2067
2068             case '[':           /* specification of direction */
2069               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2070                 goto label_invalid_code;
2071               /* For the moment, nested direction is not supported.
2072                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2073                  left-to-right, and nonzero means right-to-left.  */
2074               ONE_MORE_BYTE (c1);
2075               switch (c1)
2076                 {
2077                 case ']':       /* end of the current direction */
2078                   coding->mode &= ~CODING_MODE_DIRECTION;
2079
2080                 case '0':       /* end of the current direction */
2081                 case '1':       /* start of left-to-right direction */
2082                   ONE_MORE_BYTE (c1);
2083                   if (c1 == ']')
2084                     coding->mode &= ~CODING_MODE_DIRECTION;
2085                   else
2086                     goto label_invalid_code;
2087                   break;
2088
2089                 case '2':       /* start of right-to-left direction */
2090                   ONE_MORE_BYTE (c1);
2091                   if (c1 == ']')
2092                     coding->mode |= CODING_MODE_DIRECTION;
2093                   else
2094                     goto label_invalid_code;
2095                   break;
2096
2097                 default:
2098                   goto label_invalid_code;
2099                 }
2100               continue;
2101
2102             case '%':
2103               if (COMPOSING_P (coding))
2104                 DECODE_COMPOSITION_END ('1');
2105               ONE_MORE_BYTE (c1);
2106               if (c1 == '/')
2107                 {
2108                   /* CTEXT extended segment:
2109                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2110                      We keep these bytes as is for the moment.
2111                      They may be decoded by post-read-conversion.  */
2112                   int dim, M, L;
2113                   int size, required;
2114                   int produced_chars;
2115
2116                   ONE_MORE_BYTE (dim);
2117                   ONE_MORE_BYTE (M);
2118                   ONE_MORE_BYTE (L);
2119                   size = ((M - 128) * 128) + (L - 128);
2120                   required = 8 + size * 2;
2121                   if (dst + required > (dst_bytes ? dst_end : src))
2122                     goto label_end_of_loop;
2123                   *dst++ = ISO_CODE_ESC;
2124                   *dst++ = '%';
2125                   *dst++ = '/';
2126                   *dst++ = dim;
2127                   produced_chars = 4;
2128                   dst += CHAR_STRING (M, dst), produced_chars++;
2129                   dst += CHAR_STRING (L, dst), produced_chars++;
2130                   while (size-- > 0)
2131                     {
2132                       ONE_MORE_BYTE (c1);
2133                       dst += CHAR_STRING (c1, dst), produced_chars++;
2134                     }
2135                   coding->produced_char += produced_chars;
2136                 }
2137               else if (c1 == 'G')
2138                 {
2139                   unsigned char *d = dst;
2140                   int produced_chars;
2141
2142                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2143                      ESC % G --UTF-8-BYTES-- ESC % @
2144                      We keep these bytes as is for the moment.
2145                      They may be decoded by post-read-conversion.  */
2146                   if (d + 6 > (dst_bytes ? dst_end : src))
2147                     goto label_end_of_loop;
2148                   *d++ = ISO_CODE_ESC;
2149                   *d++ = '%';
2150                   *d++ = 'G';
2151                   produced_chars = 3;
2152                   while (d + 1 < (dst_bytes ? dst_end : src))
2153                     {
2154                       ONE_MORE_BYTE (c1);
2155                       if (c1 == ISO_CODE_ESC
2156                           && src + 1 < src_end
2157                           && src[0] == '%'
2158                           && src[1] == '@')
2159                         {
2160                           src += 2;
2161                           break;
2162                         }
2163                       d += CHAR_STRING (c1, d), produced_chars++;
2164                     }
2165                   if (d + 3 > (dst_bytes ? dst_end : src))
2166                     goto label_end_of_loop;
2167                   *d++ = ISO_CODE_ESC;
2168                   *d++ = '%';
2169                   *d++ = '@';
2170                   dst = d;
2171                   coding->produced_char += produced_chars + 3;
2172                 }
2173               else
2174                 goto label_invalid_code;
2175               continue;
2176
2177             default:
2178               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2179                 goto label_invalid_code;
2180               if (c1 >= 0x28 && c1 <= 0x2B)
2181                 {       /* designation of DIMENSION1_CHARS94 character set */
2182                   ONE_MORE_BYTE (c2);
2183                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2184                 }
2185               else if (c1 >= 0x2C && c1 <= 0x2F)
2186                 {       /* designation of DIMENSION1_CHARS96 character set */
2187                   ONE_MORE_BYTE (c2);
2188                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2189                 }
2190               else
2191                 goto label_invalid_code;
2192               /* We must update these variables now.  */
2193               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2194               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2195               continue;
2196             }
2197         }
2198
2199       /* Now we know CHARSET and 1st position code C1 of a character.
2200          Produce a multibyte sequence for that character while getting
2201          2nd position code C2 if necessary.  */
2202       if (CHARSET_DIMENSION (charset) == 2)
2203         {
2204           ONE_MORE_BYTE (c2);
2205           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2206             /* C2 is not in a valid range.  */
2207             goto label_invalid_code;
2208         }
2209       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2210       EMIT_CHAR (c);
2211       continue;
2212
2213     label_invalid_code:
2214       coding->errors++;
2215       if (COMPOSING_P (coding))
2216         DECODE_COMPOSITION_END ('1');
2217       src = src_base;
2218       c = *src++;
2219       if (! NILP (translation_table))
2220         c = translate_char (translation_table, c, 0, 0, 0);
2221       EMIT_CHAR (c);
2222     }
2223
2224  label_end_of_loop:
2225   coding->consumed = coding->consumed_char = src_base - source;
2226   coding->produced = dst - destination;
2227   return;
2228 }
2229
2230
2231 /* ISO2022 encoding stuff.  */
2232
2233 /*
2234    It is not enough to say just "ISO2022" on encoding; we have to
2235    specify more details.  In Emacs, each ISO2022 coding system
2236    variant has the following specifications:
2237         1. Initial designation to G0 through G3.
2238         2. Allows short-form designation?
2239         3. ASCII should be designated to G0 before control characters?
2240         4. ASCII should be designated to G0 at end of line?
2241         5. 7-bit environment or 8-bit environment?
2242         6. Use locking-shift?
2243         7. Use single-shift?
2244    And the following two are only for Japanese:
2245         8. Use ASCII in place of JIS0201-1976-Roman?
2246         9. Use JISX0208-1983 in place of JISX0208-1978?
2247    These specifications are encoded in `coding->flags' as flag bits
2248    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2249    details.
2250 */
2251
2252 /* Produce codes (escape sequence) for designating CHARSET to graphic
2253    register REG at DST, and increment DST.  DST is not an argument: it is
2254    the variable of that name at the point of expansion.  If <final-char>
2255    of CHARSET is '@', 'A', or 'B' and the coding system CODING allows,
2256    produce designation sequence of short-form (see the condition below).  */
2257 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2258   do {                                                                  \
2259     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2260     char *intermediate_char_94 = "()*+";                                \
2261     char *intermediate_char_96 = ",-./";                                \
2262     int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
2263                                                                         \
2264     if (revision < 255)  /* prefix ESC & ('@' + revision) */            \
2265       {                                                                 \
2266         *dst++ = ISO_CODE_ESC;                                          \
2267         *dst++ = '&';                                                   \
2268         *dst++ = '@' + revision;                                        \
2269       }                                                                 \
2270     *dst++ = ISO_CODE_ESC;                                              \
2271     if (CHARSET_DIMENSION (charset) == 1)                               \
2272       {                                                                 \
2273         if (CHARSET_CHARS (charset) == 94)                              \
2274           *dst++ = (unsigned char) (intermediate_char_94[reg]);         \
2275         else                                                            \
2276           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2277       }                                                                 \
2278     else                                                                \
2279       {                                                                 \
2280         *dst++ = '$';                                                   \
2281         if (CHARSET_CHARS (charset) == 94)                              \
2282           { /* keep the intermediate char unless short form applies */  \
2283             if (! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
2284                 || reg != 0                                             \
2285                 || final_char < '@' || final_char > 'B')                \
2286               *dst++ = (unsigned char) (intermediate_char_94[reg]);     \
2287           }                                                             \
2288         else                                                            \
2289           *dst++ = (unsigned char) (intermediate_char_96[reg]);         \
2290       }                                                                 \
2291     *dst++ = final_char;                                                \
2292     CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset; /* remember */ \
2293   } while (0)
2294
2295 /* The following two macros produce codes (control character or escape
2296    sequence) for ISO2022 single-shift functions (single-shift-2 and
2297    single-shift-3): ESC 'N' / ESC 'O' when CODING_FLAG_ISO_SEVEN_BITS is set in
2298    `coding->flags', else ISO_CODE_SS2 / ISO_CODE_SS3.  Both set the single-shifting flag.  */
2299 #define ENCODE_SINGLE_SHIFT_2                           \
2300   do {                                                  \
2301     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2302       *dst++ = ISO_CODE_ESC, *dst++ = 'N';              \
2303     else                                                \
2304       *dst++ = ISO_CODE_SS2;                            \
2305     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2306   } while (0)
2307 /* Same as above for single-shift-3; `coding' and `dst' come from the expansion site.  */
2308 #define ENCODE_SINGLE_SHIFT_3                           \
2309   do {                                                  \
2310     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2311       *dst++ = ISO_CODE_ESC, *dst++ = 'O';              \
2312     else                                                \
2313       *dst++ = ISO_CODE_SS3;                            \
2314     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2315   } while (0)
2316
2317 /* The following four macros produce codes (control character or
2318    escape sequence) for ISO2022 locking-shift functions (shift-in,
2319    shift-out, locking-shift-2, and locking-shift-3).  Each one also records
2320    in CODING that graphic register 0, 1, 2, or 3 respectively is invoked to plane 0.  */
2321 #define ENCODE_SHIFT_IN                         \
2322   do {                                          \
2323     *dst++ = ISO_CODE_SI;                       \
2324     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2325   } while (0)
2326
2327 #define ENCODE_SHIFT_OUT                        \
2328   do {                                          \
2329     *dst++ = ISO_CODE_SO;                       \
2330     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2331   } while (0)
2332 /* Locking-shift-2 and -3 are always produced as escape sequences (ESC 'n', ESC 'o').  */
2333 #define ENCODE_LOCKING_SHIFT_2                  \
2334   do {                                          \
2335     *dst++ = ISO_CODE_ESC, *dst++ = 'n';        \
2336     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2; \
2337   } while (0)
2338
2339 #define ENCODE_LOCKING_SHIFT_3                  \
2340   do {                                          \
2341     *dst++ = ISO_CODE_ESC, *dst++ = 'o';        \
2342     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3; \
2343   } while (0)
2344
2345 /* Produce codes for a DIMENSION1 character whose character set is
2346    CHARSET and whose position-code is C1.  Designation and invocation
2347    sequences are also produced in advance if necessary.  The body is a
2348    `do ... while (1)' loop: each branch that emits the character leaves it with `break'.  */
2349 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2350   do {                                                                  \
2351     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2352       { /* flag set by ENCODE_SINGLE_SHIFT_2 or _3 */                   \
2353         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2354           *dst++ = c1 & 0x7F;                                           \
2355         else                                                            \
2356           *dst++ = c1 | 0x80;                                           \
2357         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2358         break;                                                          \
2359       }                                                                 \
2360     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2361       { /* CHARSET is invoked to plane 0 */                             \
2362         *dst++ = c1 & 0x7F;                                             \
2363         break;                                                          \
2364       }                                                                 \
2365     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2366       { /* CHARSET is invoked to plane 1 */                             \
2367         *dst++ = c1 | 0x80;                                             \
2368         break;                                                          \
2369       }                                                                 \
2370     else                                                                \
2371       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2372          must invoke it, or, at first, designate it to some graphic     \
2373          register.  Then repeat the loop to actually produce the        \
2374          character.  */                                                 \
2375       dst = encode_invocation_designation (charset, coding, dst);       \
2376   } while (1)
2377
2378 /* Produce codes for a DIMENSION2 character whose character set is
2379    CHARSET and whose position-codes are C1 and C2.  Designation and
2380    invocation codes are also produced in advance if necessary.  The body is a
2381    `do ... while (1)' loop: each branch that emits the two bytes leaves it with `break'.  */
2382 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2383   do {                                                                  \
2384     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2385       { /* flag set by ENCODE_SINGLE_SHIFT_2 or _3 */                   \
2386         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2387           *dst++ = c1 & 0x7F, *dst++ = c2 & 0x7F;                       \
2388         else                                                            \
2389           *dst++ = c1 | 0x80, *dst++ = c2 | 0x80;                       \
2390         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2391         break;                                                          \
2392       }                                                                 \
2393     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
2394       { /* CHARSET is invoked to plane 0 */                             \
2395         *dst++ = c1 & 0x7F, *dst++= c2 & 0x7F;                          \
2396         break;                                                          \
2397       }                                                                 \
2398     else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))      \
2399       { /* CHARSET is invoked to plane 1 */                             \
2400         *dst++ = c1 | 0x80, *dst++= c2 | 0x80;                          \
2401         break;                                                          \
2402       }                                                                 \
2403     else                                                                \
2404       /* Since CHARSET is not yet invoked to any graphic planes, we     \
2405          must invoke it, or, at first, designate it to some graphic     \
2406          register.  Then repeat the loop to actually produce the        \
2407          character.  */                                                 \
2408       dst = encode_invocation_designation (charset, coding, dst);       \
2409   } while (1)
2410 /* Produce codes for character C at DST after splitting it with SPLIT_CHAR.  */
2411 #define ENCODE_ISO_CHARACTER(c)                                 \
2412   do {                                                          \
2413     int charset, c1, c2;                                        \
2414                                                                 \
2415     SPLIT_CHAR (c, charset, c1, c2);                            \
2416     if (CHARSET_DEFINED_P (charset))                            \
2417       {                                                         \
2418         if (CHARSET_DIMENSION (charset) == 1)                   \
2419           { /* USE_ROMAN: encode ASCII as latin-jisx0201 */     \
2420             if (charset == CHARSET_ASCII                        \
2421                 && coding->flags & CODING_FLAG_ISO_USE_ROMAN)   \
2422               charset = charset_latin_jisx0201;                 \
2423             ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);      \
2424           }                                                     \
2425         else                                                    \
2426           { /* USE_OLDJIS: encode jisx0208 as jisx0208-1978 */  \
2427             if (charset == charset_jisx0208                     \
2428                 && coding->flags & CODING_FLAG_ISO_USE_OLDJIS)  \
2429               charset = charset_jisx0208_1978;                  \
2430             ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);  \
2431           }                                                     \
2432       }                                                         \
2433     else                                                        \
2434       { /* undefined charset: emit C1 and non-negative C2 */    \
2435         *dst++ = c1;                                            \
2436         if (c2 >= 0)                                            \
2437           *dst++ = c2;                                          \
2438       }                                                         \
2439   } while (0)
2440
2441
2442 /* Instead of encoding character C, produce one or two `?'s: CODING_REPLACEMENT_CHARACTER
2443    is encoded once, and a second time when the charset of C has a width greater than 1.  */
2444 #define ENCODE_UNSAFE_CHARACTER(c)                              \
2445   do {                                                          \
2446     ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
2447     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                   \
2448       ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);      \
2449   } while (0)
2450
2451
2452 /* Produce designation and invocation codes at a place pointed by DST
2453    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2454    Return new DST.  DST is written without any bound check.  Nothing is
2455    produced when CHARSET is already designated and its register is invoked to plane 0 or 1.  */
2456 unsigned char *
2457 encode_invocation_designation (charset, coding, dst)
2458      int charset;
2459      struct coding_system *coding;
2460      unsigned char *dst;
2461 {
2462   int reg;                      /* graphic register number */
2463
2464   /* At first, check designations.  */
2465   for (reg = 0; reg < 4; reg++)
2466     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2467       break;
2468
2469   if (reg >= 4)
2470     {
2471       /* CHARSET is not yet designated to any graphic registers.  */
2472       /* At first check the requested designation.  */
2473       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2474       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2475         /* Since CHARSET requests no special designation, designate it
2476            to graphic register 0.  */
2477         reg = 0;
2478
2479       ENCODE_DESIGNATION (charset, reg, coding);
2480     }
2481
2482   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2483       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2484     {
2485       /* Since the graphic register REG is not invoked to any graphic
2486          planes, invoke it to graphic plane 0.  */
2487       switch (reg)
2488         {
2489         case 0:                 /* graphic register 0 */
2490           ENCODE_SHIFT_IN;
2491           break;
2492
2493         case 1:                 /* graphic register 1 */
2494           ENCODE_SHIFT_OUT;
2495           break;
2496
2497         case 2:                 /* graphic register 2 */
2498           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2499             ENCODE_SINGLE_SHIFT_2;
2500           else
2501             ENCODE_LOCKING_SHIFT_2;
2502           break;
2503
2504         case 3:                 /* graphic register 3 */
2505           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2506             ENCODE_SINGLE_SHIFT_3;
2507           else
2508             ENCODE_LOCKING_SHIFT_3;
2509           break;
2510         }                       /* no default case; assumes REG is 0..3 -- TODO confirm */
2511     }
2512
2513   return dst;
2514 }
2515
2516 /* Produce 2-byte codes for encoded composition rule RULE: the first byte is
2517    32 + 81 + GREF and the second 32 + NREF, as obtained from COMPOSITION_DECODE_RULE.  */
2518 #define ENCODE_COMPOSITION_RULE(rule)           \
2519   do {                                          \
2520     int gref, nref;                             \
2521     COMPOSITION_DECODE_RULE (rule, gref, nref); \
2522     *dst++ = 32 + 81 + gref;                    \
2523     *dst++ = 32 + nref;                         \
2524   } while (0)
2525
2526 /* Produce codes for indicating the start of a composition sequence
2527    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
2528    which specify information about the composition.  See the comment
2529    in coding.h for the format of DATA.  DATA[3] is stored in CODING->composing;
2530    for ESC 3 and ESC 4, CODING->cmp_data_index is set to CODING->cmp_data_start + 4.  */
2531 #define ENCODE_COMPOSITION_START(coding, data)                          \
2532   do {                                                                  \
2533     coding->composing = data[3];                                        \
2534     *dst++ = ISO_CODE_ESC;                                              \
2535     if (coding->composing == COMPOSITION_RELATIVE)                      \
2536       *dst++ = '0';                                                     \
2537     else                                                                \
2538       {                                                                 \
2539         *dst++ = (coding->composing == COMPOSITION_WITH_ALTCHARS        \
2540                   ? '3' : '4');                                         \
2541         coding->cmp_data_index = coding->cmp_data_start + 4;            \
2542         coding->composition_rule_follows = 0;                           \
2543       }                                                                 \
2544   } while (0)
2545
2546 /* Produce codes for indicating the end of the current composition (ESC 1).  Also advance
2547    CODING->cmp_data_start by DATA[0] and step to the next composition data block once this one is used up.  */
2548 #define ENCODE_COMPOSITION_END(coding, data)                    \
2549   do {                                                          \
2550     *dst++ = ISO_CODE_ESC;                                      \
2551     *dst++ = '1';                                               \
2552     coding->cmp_data_start += data[0];                          \
2553     coding->composing = COMPOSITION_NO;                         \
2554     if (coding->cmp_data_start == coding->cmp_data->used        \
2555         && coding->cmp_data->next)                              \
2556       {                                                         \
2557         coding->cmp_data = coding->cmp_data->next;              \
2558         coding->cmp_data_start = 0;                             \
2559       }                                                         \
2560   } while (0)
2561
2562 /* Produce composition start sequence ESC 0.  Here, this sequence
2563    doesn't mean the start of a new composition but means that we have
2564    just produced components (alternate chars and composition rules) of
2565    the composition and the actual text follows in SRC.  CODING->composing
2566    is switched to COMPOSITION_RELATIVE.  */
2567 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2568   do {                                          \
2569     *dst++ = ISO_CODE_ESC;                      \
2570     *dst++ = '0';                               \
2571     coding->composing = COMPOSITION_RELATIVE;   \
2572   } while (0)
2573
2574 /* The following three macros produce codes for indicating direction
2575    of text.  Each is one statement; DST and CODING are taken from the expansion site.  */
2576 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2577   do { /* flags is a bit mask: test the bit only */     \
2578     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2579       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2580     else                                                \
2581       *dst++ = ISO_CODE_CSI;                            \
2582   } while (0)
2583 /* Start of right-to-left direction: control sequence introducer, '2', ']'.  */
2584 #define ENCODE_DIRECTION_R2L    \
2585   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2'; *dst++ = ']'; } while (0)
2586 /* End of the current direction: control sequence introducer, '0', ']'.  */
2587 #define ENCODE_DIRECTION_L2R    \
2588   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0'; *dst++ = ']'; } while (0)
2589
2590 /* Produce codes for designation and invocation to reset the graphic
2591    planes and registers to initial state.  Registers whose initial designation is negative are left alone.  */
2592 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2593   do {                                                                      \
2594     int reg;  /* graphic register number */                                 \
2595     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2596       ENCODE_SHIFT_IN;                                                      \
2597     for (reg = 0; reg < 4; reg++)  /* restore changed designations */       \
2598       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0            \
2599           && (CODING_SPEC_ISO_DESIGNATION (coding, reg)                     \
2600               != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg)))        \
2601         ENCODE_DESIGNATION                                                  \
2602           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2603   } while (0)
2604
2605 /* Produce designation sequences of charsets in the line started from
2606    SRC to a place pointed by DST, and return updated DST.  Scanning stops
2607    at a newline, or once each of the four graphic registers has a charset.
2608    If the current block ends before any end-of-line, we may fail to
2609    find all the necessary designations.  TRANSLATION_TABLE and SRC_END are not
2610    referenced directly here; presumably ONE_MORE_CHAR uses them -- TODO confirm.  */
2611 static unsigned char *
2612 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2613      struct coding_system *coding;
2614      Lisp_Object translation_table;
2615      const unsigned char *src, *src_end;
2616      unsigned char *dst;
2617 {
2618   int charset, c, found = 0, reg;       /* FOUND counts the entries of R that are set */
2619   /* Table of charsets to be designated to each graphic register.  */
2620   int r[4];
2621   /* -1 marks a register for which no charset has been found yet.  */
2622   for (reg = 0; reg < 4; reg++)
2623     r[reg] = -1;
2624
2625   while (found < 4)
2626     {
2627       ONE_MORE_CHAR (c);
2628       if (c == '\n')
2629         break;
2630
2631       charset = CHAR_CHARSET (c);
2632       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2633       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2634         {                       /* only the first charset found for a register is kept */
2635           found++;
2636           r[reg] = charset;
2637         }
2638     }
2639   /* NOTE(review): this label is presumably the target of ONE_MORE_CHAR when the source runs out -- confirm.  */
2640  label_end_of_loop:
2641   if (found)
2642     {
2643       for (reg = 0; reg < 4; reg++)
2644         if (r[reg] >= 0
2645             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2646           ENCODE_DESIGNATION (r[reg], reg, coding);
2647     }
2648
2649   return dst;
2650 }
2651
2652 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2653
2654 static void
2655 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2656      struct coding_system *coding;
2657      const unsigned char *source;
2658      unsigned char *destination;
2659      int src_bytes, dst_bytes;
2660 {
2661   const unsigned char *src = source;
2662   const unsigned char *src_end = source + src_bytes;
2663   unsigned char *dst = destination;
2664   unsigned char *dst_end = destination + dst_bytes;
2665   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2666      from DST_END to assure overflow checking is necessary only at the
2667      head of loop.  */
2668   unsigned char *adjusted_dst_end = dst_end - 19;
2669   /* SRC_BASE remembers the start position in source in each loop.
2670      The loop will be exited when there's not enough source text to
2671      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2672      there's not enough destination area to produce encoded codes
2673      (within macro EMIT_BYTES).  */
2674   const unsigned char *src_base;
2675   int c;
2676   Lisp_Object translation_table;
2677   Lisp_Object safe_chars;
2678
2679   if (coding->flags & CODING_FLAG_ISO_SAFE)
2680     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2681
2682   safe_chars = coding_safe_chars (coding->symbol);
2683
2684   if (NILP (Venable_character_translation))
2685     translation_table = Qnil;
2686   else
2687     {
2688       translation_table = coding->translation_table_for_encode;
2689       if (NILP (translation_table))
2690         translation_table = Vstandard_translation_table_for_encode;
2691     }
2692
2693   coding->consumed_char = 0;
2694   coding->errors = 0;
2695   while (1)
2696     {
2697       src_base = src;
2698
2699       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2700         {
2701           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2702           break;
2703         }
2704
2705       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2706           && CODING_SPEC_ISO_BOL (coding))
2707         {
2708           /* We have to produce designation sequences if any now.  */
2709           dst = encode_designation_at_bol (coding, translation_table,
2710                                            src, src_end, dst);
2711           CODING_SPEC_ISO_BOL (coding) = 0;
2712         }
2713
2714       /* Check composition start and end.  */
2715       if (coding->composing != COMPOSITION_DISABLED
2716           && coding->cmp_data_start < coding->cmp_data->used)
2717         {
2718           struct composition_data *cmp_data = coding->cmp_data;
2719           int *data = cmp_data->data + coding->cmp_data_start;
2720           int this_pos = cmp_data->char_offset + coding->consumed_char;
2721
2722           if (coding->composing == COMPOSITION_RELATIVE)
2723             {
2724               if (this_pos == data[2])
2725                 {
2726                   ENCODE_COMPOSITION_END (coding, data);
2727                   cmp_data = coding->cmp_data;
2728                   data = cmp_data->data + coding->cmp_data_start;
2729                 }
2730             }
2731           else if (COMPOSING_P (coding))
2732             {
2733               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2734               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2735                 /* We have consumed components of the composition.
2736                    What follows in SRC is the composition's base
2737                    text.  */
2738                 ENCODE_COMPOSITION_FAKE_START (coding);
2739               else
2740                 {
2741                   int c = cmp_data->data[coding->cmp_data_index++];
2742                   if (coding->composition_rule_follows)
2743                     {
2744                       ENCODE_COMPOSITION_RULE (c);
2745                       coding->composition_rule_follows = 0;
2746                     }
2747                   else
2748                     {
2749                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2750                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2751                         ENCODE_UNSAFE_CHARACTER (c);
2752                       else
2753                         ENCODE_ISO_CHARACTER (c);
2754                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2755                         coding->composition_rule_follows = 1;
2756                     }
2757                   continue;
2758                 }
2759             }
2760           if (!COMPOSING_P (coding))
2761             {
2762               if (this_pos == data[1])
2763                 {
2764                   ENCODE_COMPOSITION_START (coding, data);
2765                   continue;
2766                 }
2767             }
2768         }
2769
2770       ONE_MORE_CHAR (c);
2771
2772       /* Now encode the character C.  */
2773       if (c < 0x20 || c == 0x7F)
2774         {
2775           if (c == '\r')
2776             {
2777               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2778                 {
2779                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2780                     ENCODE_RESET_PLANE_AND_REGISTER;
2781                   *dst++ = c;
2782                   continue;
2783                 }
2784               /* fall down to treat '\r' as '\n' ...  */
2785               c = '\n';
2786             }
2787           if (c == '\n')
2788             {
2789               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2790                 ENCODE_RESET_PLANE_AND_REGISTER;
2791               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2792                 bcopy (coding->spec.iso2022.initial_designation,
2793                        coding->spec.iso2022.current_designation,
2794                        sizeof coding->spec.iso2022.initial_designation);
2795               if (coding->eol_type == CODING_EOL_LF
2796                   || coding->eol_type == CODING_EOL_UNDECIDED)
2797                 *dst++ = ISO_CODE_LF;
2798               else if (coding->eol_type == CODING_EOL_CRLF)
2799                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2800               else
2801                 *dst++ = ISO_CODE_CR;
2802               CODING_SPEC_ISO_BOL (coding) = 1;
2803             }
2804           else
2805             {
2806               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2807                 ENCODE_RESET_PLANE_AND_REGISTER;
2808               *dst++ = c;
2809             }
2810         }
2811       else if (ASCII_BYTE_P (c))
2812         ENCODE_ISO_CHARACTER (c);
2813       else if (SINGLE_BYTE_CHAR_P (c))
2814         {
2815           *dst++ = c;
2816           coding->errors++;
2817         }
2818       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2819                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2820         ENCODE_UNSAFE_CHARACTER (c);
2821       else
2822         ENCODE_ISO_CHARACTER (c);
2823
2824       coding->consumed_char++;
2825     }
2826
2827  label_end_of_loop:
2828   coding->consumed = src_base - source;
2829   coding->produced = coding->produced_char = dst - destination;
2830 }
2831
2832 \f
2833 /*** 4. SJIS and BIG5 handlers ***/
2834
2835 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2836    quite widely.  So, for the moment, Emacs supports them in the bare
2837    C code.  But, in the future, they may be supported only by CCL.  */
2838
2839 /* SJIS is a coding system encoding three character sets: ASCII, right
2840    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2841    as is.  A character of charset katakana-jisx0201 is encoded by
2842    "position-code + 0x80".  A character of charset japanese-jisx0208
2843    is encoded in 2-byte but two position-codes are divided and shifted
2844    so that it fits in the range below.
2845
2846    --- CODE RANGE of SJIS ---
2847    (character set)      (range)
2848    ASCII                0x00 .. 0x7F
2849    KATAKANA-JISX0201    0xA1 .. 0xDF
2850    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2851             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2852    -------------------------------
2853
2854 */
2855
2856 /* BIG5 is a coding system encoding two character sets: ASCII and
2857    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2858    character set and is encoded in two bytes.
2859
2860    --- CODE RANGE of BIG5 ---
2861    (character set)      (range)
2862    ASCII                0x00 .. 0x7F
2863    Big5 (1st byte)      0xA1 .. 0xFE
2864         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2865    --------------------------
2866
2867    Since the number of characters in Big5 is larger than maximum
2868    characters in Emacs' charset (96x96), it can't be handled as one
2869    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2870    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2871    contains frequently used characters and the latter contains less
2872    frequently used characters.  */
2873
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd bytes of the character in the BIG5 coding
   system.  C1 and C2 are the 1st and 2nd position-codes of Emacs'
   internal format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every argument is parenthesized in the expansions, so a caller may
   pass an arbitrary expression as an input.  All inputs are read
   before any output is stored, so an output may be the same variable
   as an input, as in DECODE_BIG5 (c1, c2, charset, c1, c2).  Input
   arguments may be evaluated more than once; do not pass expressions
   with side effects.  Neither macro validates its inputs.  */

/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte ranges over 0x40..0x7E and 0xA1..0xFE.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Convert the BIG5 byte pair B1, B2 to CHARSET and the position-codes
   C1, C2.  Lead bytes below 0xC9 belong to `charset_big5_1', the
   others to `charset_big5_2'.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    /* Linear index of the character within the whole BIG5 table.  */  \
    unsigned int temp                                                   \
      = ((b1) - 0xA1) * BIG5_SAME_ROW                                   \
        + (b2) - ((b2) < 0x7F ? 0x40 : 0x62);                           \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the 2nd charset.  */ \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Convert CHARSET and the position-codes C1, C2 to the BIG5 byte pair
   B1, B2.  This is the inverse of DECODE_BIG5.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    /* Column within the row: the first 0x3F columns map to            \
       0x40..0x7E, the remaining ones to 0xA1..0xFE.  */                \
    temp %= BIG5_SAME_ROW;                                              \
    (b2) = temp + (temp < 0x3F ? 0x40 : 0x62);                          \
  } while (0)
2906
2907 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2908    Check if a text is encoded in SJIS.  If it is, return
2909    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2910
2911 static int
2912 detect_coding_sjis (src, src_end, multibytep)
2913      unsigned char *src, *src_end;
2914      int multibytep;
2915 {
2916   int c;
2917   /* Dummy for ONE_MORE_BYTE.  */
2918   struct coding_system dummy_coding;
2919   struct coding_system *coding = &dummy_coding;
2920
2921   while (1)
2922     {
2923       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS);
2924       if (c < 0x80)
2925         continue;
2926       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2927         return 0;
2928       if (c <= 0x9F || c >= 0xE0)
2929         {
2930           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2931           if (c < 0x40 || c == 0x7F || c > 0xFC)
2932             return 0;
2933         }
2934     }
2935 }
2936
2937 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2938    Check if a text is encoded in BIG5.  If it is, return
2939    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2940
2941 static int
2942 detect_coding_big5 (src, src_end, multibytep)
2943      unsigned char *src, *src_end;
2944      int multibytep;
2945 {
2946   int c;
2947   /* Dummy for ONE_MORE_BYTE.  */
2948   struct coding_system dummy_coding;
2949   struct coding_system *coding = &dummy_coding;
2950
2951   while (1)
2952     {
2953       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5);
2954       if (c < 0x80)
2955         continue;
2956       if (c < 0xA1 || c > 0xFE)
2957         return 0;
2958       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
2959       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2960         return 0;
2961     }
2962 }
2963
2964 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2965    Check if a text is encoded in UTF-8.  If it is, return
2966    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2967
2968 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
2969 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
2970 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
2971 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
2972 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
2973 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
2974 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)
2975
2976 static int
2977 detect_coding_utf_8 (src, src_end, multibytep)
2978      unsigned char *src, *src_end;
2979      int multibytep;
2980 {
2981   unsigned char c;
2982   int seq_maybe_bytes;
2983   /* Dummy for ONE_MORE_BYTE.  */
2984   struct coding_system dummy_coding;
2985   struct coding_system *coding = &dummy_coding;
2986
2987   while (1)
2988     {
2989       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8);
2990       if (UTF_8_1_OCTET_P (c))
2991         continue;
2992       else if (UTF_8_2_OCTET_LEADING_P (c))
2993         seq_maybe_bytes = 1;
2994       else if (UTF_8_3_OCTET_LEADING_P (c))
2995         seq_maybe_bytes = 2;
2996       else if (UTF_8_4_OCTET_LEADING_P (c))
2997         seq_maybe_bytes = 3;
2998       else if (UTF_8_5_OCTET_LEADING_P (c))
2999         seq_maybe_bytes = 4;
3000       else if (UTF_8_6_OCTET_LEADING_P (c))
3001         seq_maybe_bytes = 5;
3002       else
3003         return 0;
3004
3005       do
3006         {
3007           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0);
3008           if (!UTF_8_EXTRA_OCTET_P (c))
3009             return 0;
3010           seq_maybe_bytes--;
3011         }
3012       while (seq_maybe_bytes > 0);
3013     }
3014 }
3015
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-16 by looking at its first two
   bytes.  If they are FE FF, return CODING_CATEGORY_MASK_UTF_16_BE;
   if they are FF FE, return CODING_CATEGORY_MASK_UTF_16_LE; else
   return 0.  */

/* Nonzero if the 16-bit value VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high surrogate, i.e. in 0xD800..0xDBFF.  The
   top six bits identify the surrogate block, hence the 0xFC00 mask.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low surrogate, i.e. in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3031
3032 static int
3033 detect_coding_utf_16 (src, src_end, multibytep)
3034      unsigned char *src, *src_end;
3035      int multibytep;
3036 {
3037   unsigned char c1, c2;
3038   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3039   struct coding_system dummy_coding;
3040   struct coding_system *coding = &dummy_coding;
3041
3042   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0);
3043   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0);
3044
3045   if ((c1 == 0xFF) && (c2 == 0xFE))
3046     return CODING_CATEGORY_MASK_UTF_16_LE;
3047   else if ((c1 == 0xFE) && (c2 == 0xFF))
3048     return CODING_CATEGORY_MASK_UTF_16_BE;
3049   return 0;
3050 }
3051
3052 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3053    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3054
3055 static void
3056 decode_coding_sjis_big5 (coding, source, destination,
3057                          src_bytes, dst_bytes, sjis_p)
3058      struct coding_system *coding;
3059      const unsigned char *source;
3060      unsigned char  *destination;
3061      int src_bytes, dst_bytes;
3062      int sjis_p;
3063 {
3064   const unsigned char *src = source;
3065   const unsigned char *src_end = source + src_bytes;
3066   unsigned char *dst = destination;
3067   unsigned char *dst_end = destination + dst_bytes;
3068   /* SRC_BASE remembers the start position in source in each loop.
3069      The loop will be exited when there's not enough source code
3070      (within macro ONE_MORE_BYTE), or when there's not enough
3071      destination area to produce a character (within macro
3072      EMIT_CHAR).  */
3073   const unsigned char *src_base;
3074   Lisp_Object translation_table;
3075
3076   if (NILP (Venable_character_translation))
3077     translation_table = Qnil;
3078   else
3079     {
3080       translation_table = coding->translation_table_for_decode;
3081       if (NILP (translation_table))
3082         translation_table = Vstandard_translation_table_for_decode;
3083     }
3084
3085   coding->produced_char = 0;
3086   while (1)
3087     {
3088       int c, charset, c1, c2 = 0;
3089
3090       src_base = src;
3091       ONE_MORE_BYTE (c1);
3092
3093       if (c1 < 0x80)
3094         {
3095           charset = CHARSET_ASCII;
3096           if (c1 < 0x20)
3097             {
3098               if (c1 == '\r')
3099                 {
3100                   if (coding->eol_type == CODING_EOL_CRLF)
3101                     {
3102                       ONE_MORE_BYTE (c2);
3103                       if (c2 == '\n')
3104                         c1 = c2;
3105                       else
3106                         /* To process C2 again, SRC is subtracted by 1.  */
3107                         src--;
3108                     }
3109                   else if (coding->eol_type == CODING_EOL_CR)
3110                     c1 = '\n';
3111                 }
3112               else if (c1 == '\n'
3113                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3114                        && (coding->eol_type == CODING_EOL_CR
3115                            || coding->eol_type == CODING_EOL_CRLF))
3116                 {
3117                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3118                   goto label_end_of_loop;
3119                 }
3120             }
3121         }
3122       else
3123         {
3124           if (sjis_p)
3125             {
3126               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3127                 goto label_invalid_code;
3128               if (c1 <= 0x9F || c1 >= 0xE0)
3129                 {
3130                   /* SJIS -> JISX0208 */
3131                   ONE_MORE_BYTE (c2);
3132                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3133                     goto label_invalid_code;
3134                   DECODE_SJIS (c1, c2, c1, c2);
3135                   charset = charset_jisx0208;
3136                 }
3137               else
3138                 /* SJIS -> JISX0201-Kana */
3139                 charset = charset_katakana_jisx0201;
3140             }
3141           else
3142             {
3143               /* BIG5 -> Big5 */
3144               if (c1 < 0xA0 || c1 > 0xFE)
3145                 goto label_invalid_code;
3146               ONE_MORE_BYTE (c2);
3147               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3148                 goto label_invalid_code;
3149               DECODE_BIG5 (c1, c2, charset, c1, c2);
3150             }
3151         }
3152
3153       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3154       EMIT_CHAR (c);
3155       continue;
3156
3157     label_invalid_code:
3158       coding->errors++;
3159       src = src_base;
3160       c = *src++;
3161       EMIT_CHAR (c);
3162     }
3163
3164  label_end_of_loop:
3165   coding->consumed = coding->consumed_char = src_base - source;
3166   coding->produced = dst - destination;
3167   return;
3168 }
3169
3170 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3171    This function can encode charsets `ascii', `katakana-jisx0201',
3172    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3173    are sure that all these charsets are registered as official charset
3174    (i.e. do not have extended leading-codes).  Characters of other
3175    charsets are produced without any encoding.  If SJIS_P is 1, encode
3176    SJIS text, else encode BIG5 text.  */
3177
3178 static void
3179 encode_coding_sjis_big5 (coding, source, destination,
3180                          src_bytes, dst_bytes, sjis_p)
3181      struct coding_system *coding;
3182      unsigned char *source, *destination;
3183      int src_bytes, dst_bytes;
3184      int sjis_p;
3185 {
3186   unsigned char *src = source;
3187   unsigned char *src_end = source + src_bytes;
3188   unsigned char *dst = destination;
3189   unsigned char *dst_end = destination + dst_bytes;
3190   /* SRC_BASE remembers the start position in source in each loop.
3191      The loop will be exited when there's not enough source text to
3192      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3193      there's not enough destination area to produce encoded codes
3194      (within macro EMIT_BYTES).  */
3195   unsigned char *src_base;
3196   Lisp_Object translation_table;
3197
3198   if (NILP (Venable_character_translation))
3199     translation_table = Qnil;
3200   else
3201     {
3202       translation_table = coding->translation_table_for_encode;
3203       if (NILP (translation_table))
3204         translation_table = Vstandard_translation_table_for_encode;
3205     }
3206
3207   while (1)
3208     {
3209       int c, charset, c1, c2;
3210
3211       src_base = src;
3212       ONE_MORE_CHAR (c);
3213
3214       /* Now encode the character C.  */
3215       if (SINGLE_BYTE_CHAR_P (c))
3216         {
3217           switch (c)
3218             {
3219             case '\r':
3220               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3221                 {
3222                   EMIT_ONE_BYTE (c);
3223                   break;
3224                 }
3225               c = '\n';
3226             case '\n':
3227               if (coding->eol_type == CODING_EOL_CRLF)
3228                 {
3229                   EMIT_TWO_BYTES ('\r', c);
3230                   break;
3231                 }
3232               else if (coding->eol_type == CODING_EOL_CR)
3233                 c = '\r';
3234             default:
3235               EMIT_ONE_BYTE (c);
3236             }
3237         }
3238       else
3239         {
3240           SPLIT_CHAR (c, charset, c1, c2);
3241           if (sjis_p)
3242             {
3243               if (charset == charset_jisx0208
3244                   || charset == charset_jisx0208_1978)
3245                 {
3246                   ENCODE_SJIS (c1, c2, c1, c2);
3247                   EMIT_TWO_BYTES (c1, c2);
3248                 }
3249               else if (charset == charset_katakana_jisx0201)
3250                 EMIT_ONE_BYTE (c1 | 0x80);
3251               else if (charset == charset_latin_jisx0201)
3252                 EMIT_ONE_BYTE (c1);
3253               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3254                 {
3255                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3256                   if (CHARSET_WIDTH (charset) > 1)
3257                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3258                 }
3259               else
3260                 /* There's no way other than producing the internal
3261                    codes as is.  */
3262                 EMIT_BYTES (src_base, src);
3263             }
3264           else
3265             {
3266               if (charset == charset_big5_1 || charset == charset_big5_2)
3267                 {
3268                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3269                   EMIT_TWO_BYTES (c1, c2);
3270                 }
3271               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3272                 {
3273                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3274                   if (CHARSET_WIDTH (charset) > 1)
3275                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3276                 }
3277               else
3278                 /* There's no way other than producing the internal
3279                    codes as is.  */
3280                 EMIT_BYTES (src_base, src);
3281             }
3282         }
3283       coding->consumed_char++;
3284     }
3285
3286  label_end_of_loop:
3287   coding->consumed = src_base - source;
3288   coding->produced = coding->produced_char = dst - destination;
3289 }
3290
3291 \f
3292 /*** 5. CCL handlers ***/
3293
3294 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3295    Check if a text is encoded in a coding system of which
3296    encoder/decoder are written in CCL program.  If it is, return
3297    CODING_CATEGORY_MASK_CCL, else return 0.  */
3298
3299 static int
3300 detect_coding_ccl (src, src_end, multibytep)
3301      unsigned char *src, *src_end;
3302      int multibytep;
3303 {
3304   unsigned char *valid;
3305   int c;
3306   /* Dummy for ONE_MORE_BYTE.  */
3307   struct coding_system dummy_coding;
3308   struct coding_system *coding = &dummy_coding;
3309
3310   /* No coding system is assigned to coding-category-ccl.  */
3311   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3312     return 0;
3313
3314   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3315   while (1)
3316     {
3317       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL);
3318       if (! valid[c])
3319         return 0;
3320     }
3321 }
3322
3323 \f
3324 /*** 6. End-of-line handlers ***/
3325
3326 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3327
3328 static void
3329 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3330      struct coding_system *coding;
3331      const unsigned char *source;
3332      unsigned char *destination;
3333      int src_bytes, dst_bytes;
3334 {
3335   const unsigned char *src = source;
3336   unsigned char *dst = destination;
3337   const unsigned char *src_end = src + src_bytes;
3338   unsigned char *dst_end = dst + dst_bytes;
3339   Lisp_Object translation_table;
3340   /* SRC_BASE remembers the start position in source in each loop.
3341      The loop will be exited when there's not enough source code
3342      (within macro ONE_MORE_BYTE), or when there's not enough
3343      destination area to produce a character (within macro
3344      EMIT_CHAR).  */
3345   const unsigned char *src_base;
3346   int c;
3347
3348   translation_table = Qnil;
3349   switch (coding->eol_type)
3350     {
3351     case CODING_EOL_CRLF:
3352       while (1)
3353         {
3354           src_base = src;
3355           ONE_MORE_BYTE (c);
3356           if (c == '\r')
3357             {
3358               ONE_MORE_BYTE (c);
3359               if (c != '\n')
3360                 {
3361                   src--;
3362                   c = '\r';
3363                 }
3364             }
3365           else if (c == '\n'
3366                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3367             {
3368               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3369               goto label_end_of_loop;
3370             }
3371           EMIT_CHAR (c);
3372         }
3373       break;
3374
3375     case CODING_EOL_CR:
3376       while (1)
3377         {
3378           src_base = src;
3379           ONE_MORE_BYTE (c);
3380           if (c == '\n')
3381             {
3382               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3383                 {
3384                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3385                   goto label_end_of_loop;
3386                 }
3387             }
3388           else if (c == '\r')
3389             c = '\n';
3390           EMIT_CHAR (c);
3391         }
3392       break;
3393
3394     default:                    /* no need for EOL handling */
3395       while (1)
3396         {
3397           src_base = src;
3398           ONE_MORE_BYTE (c);
3399           EMIT_CHAR (c);
3400         }
3401     }
3402
3403  label_end_of_loop:
3404   coding->consumed = coding->consumed_char = src_base - source;
3405   coding->produced = dst - destination;
3406   return;
3407 }
3408
3409 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3410    format of end-of-line according to `coding->eol_type'.  It also
3411    convert multibyte form 8-bit characters to unibyte if
3412    CODING->src_multibyte is nonzero.  If `coding->mode &
3413    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3414    also means end-of-line.  */
3415
3416 static void
3417 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3418      struct coding_system *coding;
3419      const unsigned char *source;
3420      unsigned char *destination;
3421      int src_bytes, dst_bytes;
3422 {
3423   const unsigned char *src = source;
3424   unsigned char *dst = destination;
3425   const unsigned char *src_end = src + src_bytes;
3426   unsigned char *dst_end = dst + dst_bytes;
3427   Lisp_Object translation_table;
3428   /* SRC_BASE remembers the start position in source in each loop.
3429      The loop will be exited when there's not enough source text to
3430      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3431      there's not enough destination area to produce encoded codes
3432      (within macro EMIT_BYTES).  */
3433   const unsigned char *src_base;
3434   unsigned char *tmp;
3435   int c;
3436   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3437
3438   translation_table = Qnil;
3439   if (coding->src_multibyte
3440       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3441     {
3442       src_end--;
3443       src_bytes--;
3444       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3445     }
3446
3447   if (coding->eol_type == CODING_EOL_CRLF)
3448     {
3449       while (src < src_end)
3450         {
3451           src_base = src;
3452           c = *src++;
3453           if (c >= 0x20)
3454             EMIT_ONE_BYTE (c);
3455           else if (c == '\n' || (c == '\r' && selective_display))
3456             EMIT_TWO_BYTES ('\r', '\n');
3457           else
3458             EMIT_ONE_BYTE (c);
3459         }
3460       src_base = src;
3461     label_end_of_loop:
3462       ;
3463     }
3464   else
3465     {
3466       if (!dst_bytes || src_bytes <= dst_bytes)
3467         {
3468           safe_bcopy (src, dst, src_bytes);
3469           src_base = src_end;
3470           dst += src_bytes;
3471         }
3472       else
3473         {
3474           if (coding->src_multibyte
3475               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3476             dst_bytes--;
3477           safe_bcopy (src, dst, dst_bytes);
3478           src_base = src + dst_bytes;
3479           dst = destination + dst_bytes;
3480           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3481         }
3482       if (coding->eol_type == CODING_EOL_CR)
3483         {
3484           for (tmp = destination; tmp < dst; tmp++)
3485             if (*tmp == '\n') *tmp = '\r';
3486         }
3487       else if (selective_display)
3488         {
3489           for (tmp = destination; tmp < dst; tmp++)
3490             if (*tmp == '\r') *tmp = '\n';
3491         }
3492     }
3493   if (coding->src_multibyte)
3494     dst = destination + str_as_unibyte (destination, dst - destination);
3495
3496   coding->consumed = src_base - source;
3497   coding->produced = dst - destination;
3498   coding->produced_char = coding->produced;
3499 }
3500
3501 \f
3502 /*** 7. C library functions ***/
3503
3504 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3505    has a property `coding-system'.  The value of this property is a
3506    vector of length 5 (called the coding-vector).  Among elements of
3507    this vector, the first (element[0]) and the fifth (element[4])
3508    carry important information for decoding/encoding.  Before
3509    decoding/encoding, this information should be set in fields of a
3510    structure of type `coding_system'.
3511
3512    The value of the property `coding-system' can be a symbol of another
3513    subsidiary coding-system.  In that case, Emacs gets coding-vector
3514    from that symbol.
3515
3516    `element[0]' contains information to be set in `coding->type'.  The
3517    value and its meaning is as follows:
3518
3519    0 -- coding_type_emacs_mule
3520    1 -- coding_type_sjis
3521    2 -- coding_type_iso2022
3522    3 -- coding_type_big5
3523    4 -- coding_type_ccl encoder/decoder written in CCL
3524    nil -- coding_type_no_conversion
3525    t -- coding_type_undecided (automatic conversion on decoding,
3526                                no-conversion on encoding)
3527
3528    `element[4]' contains information to be set in `coding->flags' and
3529    `coding->spec'.  The meaning varies by `coding->type'.
3530
3531    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3532    of length 32 (of which the first 13 sub-elements are used now).
3533    Meanings of these sub-elements are:
3534
3535    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3536         If the value is an integer of valid charset, the charset is
3537         assumed to be designated to graphic register N initially.
3538
3539         If the value is minus, it is a minus value of charset which
3540         reserves graphic register N, which means that the charset is
3541         not designated initially but should be designated to graphic
3542         register N just before encoding a character in that charset.
3543
3544         If the value is nil, graphic register N is never used on
3545         encoding.
3546
3547    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3548         Each value takes t or nil.  See the section ISO2022 of
3549         `coding.h' for more information.
3550
3551    If `coding->type' is `coding_type_big5', element[4] is t to denote
3552    BIG5-ETen or nil to denote BIG5-HKU.
3553
3554    If `coding->type' takes the other value, element[4] is ignored.
3555
3556    Emacs Lisp's coding systems also carry information about format of
3557    end-of-line in a value of property `eol-type'.  If the value is
3558    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3559    means CODING_EOL_CR.  If it is not integer, it should be a vector
3560    of subsidiary coding systems of which property `eol-type' has one
3561    of the above values.
3562
3563 */
3564
3565 /* Extract information for decoding/encoding from CODING_SYSTEM and
3566    set it in CODING.  If CODING_SYSTEM is nil or invalid, CODING is
3567    setup so that no conversion is necessary; return -1 when it is
3568    invalid and non-nil, else return 0.  */
3569
3570 int
3571 setup_coding_system (coding_system, coding)
3572      Lisp_Object coding_system;
3573      struct coding_system *coding;
3574 {
3575   Lisp_Object coding_spec, coding_type, eol_type, plist;
3576   Lisp_Object val;
3577
3578   /* At first, zero clear all members.  */
3579   bzero (coding, sizeof (struct coding_system));
3580
3581   /* Initialize some fields required for all kinds of coding systems.  */
3582   coding->symbol = coding_system;
3583   coding->heading_ascii = -1;
3584   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3585   coding->composing = COMPOSITION_DISABLED;
3586   coding->cmp_data = NULL;
3587
3588   if (NILP (coding_system))
3589     goto label_invalid_coding_system;  /* Yields 0, not -1, for nil.  */
3590
3591   coding_spec = Fget (coding_system, Qcoding_system);
3592
3593   if (!VECTORP (coding_spec)
3594       || XVECTOR (coding_spec)->size != 5
3595       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3596     goto label_invalid_coding_system;
3597
3598   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3599   if (VECTORP (eol_type))      /* Eol format is still to be detected.  */
3600     {
3601       coding->eol_type = CODING_EOL_UNDECIDED;
3602       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3603       if (system_eol_type != CODING_EOL_LF)
3604         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3605     }
3606   else if (XFASTINT (eol_type) == 1)  /* NOTE(review): no INTEGERP check.  */
3607     {
3608       coding->eol_type = CODING_EOL_CRLF;
3609       coding->common_flags
3610         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3611     }
3612   else if (XFASTINT (eol_type) == 2)
3613     {
3614       coding->eol_type = CODING_EOL_CR;
3615       coding->common_flags
3616         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3617     }
3618   else                         /* Every other value is treated as LF.  */
3619     {
3620       coding->common_flags = 0;
3621       coding->eol_type = CODING_EOL_LF;
3622     }
3623
3624   coding_type = XVECTOR (coding_spec)->contents[0];
3625   /* Try short cut.  */
3626   if (SYMBOLP (coding_type))
3627     {
3628       if (EQ (coding_type, Qt))
3629         {
3630           coding->type = coding_type_undecided;
3631           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3632         }
3633       else
3634         coding->type = coding_type_no_conversion;
3635       /* Initialize this member.  Any thing other than
3636          CODING_CATEGORY_IDX_UTF_16_BE and
3637          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3638          special treatment in detect_eol.  */
3639       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3640
3641       return 0;                /* The plist is not consulted here.  */
3642     }
3643
3644   /* Get values of coding system properties:
3645      `post-read-conversion', `pre-write-conversion',
3646      `translation-table-for-decode', `translation-table-for-encode'.  */
3647   plist = XVECTOR (coding_spec)->contents[3];
3648   /* Pre & post conversion functions should be disabled if
3649      inhibit_pre_post_conversion is nonzero.  This is the case that a code
3650      conversion function is called while those functions are running.  */
3651   if (! inhibit_pre_post_conversion)
3652     {
3653       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3654       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3655     }
3656   val = Fplist_get (plist, Qtranslation_table_for_decode);
3657   if (SYMBOLP (val))
3658     val = Fget (val, Qtranslation_table_for_decode);
3659   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3660   val = Fplist_get (plist, Qtranslation_table_for_encode);
3661   if (SYMBOLP (val))
3662     val = Fget (val, Qtranslation_table_for_encode);
3663   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3664   val = Fplist_get (plist, Qcoding_category);
3665   if (!NILP (val))
3666     {
3667       val = Fget (val, Qcoding_category_index);
3668       if (INTEGERP (val))
3669         coding->category_idx = XINT (val);
3670       else
3671         goto label_invalid_coding_system;
3672     }
3673   else                         /* A Qcoding_category entry is mandatory.  */
3674     goto label_invalid_coding_system;
3675
3676   /* If the coding system has non-nil `composition' property, enable
3677      composition handling.  */
3678   val = Fplist_get (plist, Qcomposition);
3679   if (!NILP (val))
3680     coding->composing = COMPOSITION_NO;
3681
3682   /* If the coding system is ascii-incompatible, record it in
3683      common_flags.   */
3684   val = Fplist_get (plist, Qascii_incompatible);
3685   if (! NILP (val))
3686     coding->common_flags |= CODING_ASCII_INCOMPATIBLE_MASK;
3687
3688   switch (XFASTINT (coding_type))  /* NOTE(review): no INTEGERP check.  */
3689     {
3690     case 0:
3691       coding->type = coding_type_emacs_mule;
3692       coding->common_flags
3693         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3694       if (!NILP (coding->post_read_conversion))
3695         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;  /* NOTE(review): already set above.  */
3696       if (!NILP (coding->pre_write_conversion))
3697         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;  /* NOTE(review): already set above.  */
3698       break;
3699
3700     case 1:
3701       coding->type = coding_type_sjis;
3702       coding->common_flags
3703         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3704       break;
3705
3706     case 2:
3707       coding->type = coding_type_iso2022;
3708       coding->common_flags
3709         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3710       {
3711         Lisp_Object val, temp;
3712         Lisp_Object *flags;
3713         int i, charset, reg_bits = 0;
3714
3715         val = XVECTOR (coding_spec)->contents[4];
3716
3717         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3718           goto label_invalid_coding_system;
3719
3720         flags = XVECTOR (val)->contents;
3721         coding->flags
3722           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3723              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3724              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3725              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3726              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3727              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3728              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3729              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3730              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3731              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3732              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3733              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3734              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3735              );
3736
3737         /* Invoke graphic register 0 to plane 0.  */
3738         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3739         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3740         CODING_SPEC_ISO_INVOCATION (coding, 1)
3741           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3742         /* Not single shifting at first.  */
3743         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3744         /* Beginning of buffer should also be regarded as bol. */
3745         CODING_SPEC_ISO_BOL (coding) = 1;
3746
3747         for (charset = 0; charset <= MAX_CHARSET; charset++)
3748           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3749         val = Vcharset_revision_alist;
3750         while (CONSP (val))
3751           {
3752             charset = get_charset_id (Fcar_safe (XCAR (val)));
3753             if (charset >= 0
3754                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3755                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3756               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3757             val = XCDR (val);
3758           }
3759
3760         /* Checks FLAGS[REG] (REG = 0, 1, 2, 3) and decide designations.
3761            FLAGS[REG] can be one of below:
3762                 integer CHARSET: CHARSET occupies register REG,
3763                 t: designate nothing to REG initially, but can be used
3764                   by any charsets,
3765                 list of integer, nil, or t: designate the first
3766                   element (if integer) to REG initially, the remaining
3767                   elements (if integer) are designated to REG on request,
3768                   if an element is t, REG can be used by any charsets,
3769                 nil: REG is never used.  */
3770         for (charset = 0; charset <= MAX_CHARSET; charset++)
3771           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3772             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3773         for (i = 0; i < 4; i++)
3774           {
3775             if ((INTEGERP (flags[i])
3776                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3777                 || (charset = get_charset_id (flags[i])) >= 0)
3778               {
3779                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3780                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3781               }
3782             else if (EQ (flags[i], Qt))
3783               {
3784                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3785                 reg_bits |= 1 << i;  /* Register I is open to any charset.  */
3786                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3787               }
3788             else if (CONSP (flags[i]))
3789               {
3790                 Lisp_Object tail;
3791                 tail = flags[i];
3792
3793                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3794                 if ((INTEGERP (XCAR (tail))
3795                      && (charset = XINT (XCAR (tail)),
3796                          CHARSET_VALID_P (charset)))
3797                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3798                   {
3799                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3800                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3801                   }
3802                 else
3803                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3804                 tail = XCDR (tail);
3805                 while (CONSP (tail))
3806                   {
3807                     if ((INTEGERP (XCAR (tail))
3808                          && (charset = XINT (XCAR (tail)),
3809                              CHARSET_VALID_P (charset)))
3810                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3811                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3812                         = i;
3813                     else if (EQ (XCAR (tail), Qt))
3814                       reg_bits |= 1 << i;
3815                     tail = XCDR (tail);
3816                   }
3817               }
3818             else
3819               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3820
3821             CODING_SPEC_ISO_DESIGNATION (coding, i)
3822               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3823           }
3824
3825         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3826           {
3827             /* REG 1 can be used only by locking shift in 7-bit env.  */
3828             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3829               reg_bits &= ~2;
3830             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3831               /* Without any shifting, only REG 0 and 1 can be used.  */
3832               reg_bits &= 3;
3833           }
3834
3835         if (reg_bits)
3836           for (charset = 0; charset <= MAX_CHARSET; charset++)
3837             {
3838               if (CHARSET_DEFINED_P (charset)
3839                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3840                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3841                 {
3842                   /* There exist some default graphic registers to be
3843                      used by CHARSET.  */
3844
3845                   /* We had better avoid designating a charset of
3846                      CHARS96 to REG 0 as far as possible.  */
3847                   if (CHARSET_CHARS (charset) == 96)
3848                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3849                       = (reg_bits & 2
3850                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3851                   else
3852                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3853                       = (reg_bits & 1
3854                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3855                 }
3856             }
3857       }
3858       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3859       coding->spec.iso2022.last_invalid_designation_register = -1;
3860       break;
3861
3862     case 3:
3863       coding->type = coding_type_big5;
3864       coding->common_flags
3865         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3866       coding->flags
3867         = (NILP (XVECTOR (coding_spec)->contents[4])
3868            ? CODING_FLAG_BIG5_HKU
3869            : CODING_FLAG_BIG5_ETEN);
3870       break;
3871
3872     case 4:
3873       coding->type = coding_type_ccl;
3874       coding->common_flags
3875         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3876       {
3877         val = XVECTOR (coding_spec)->contents[4];  /* (DECODER . ENCODER) */
3878         if (! CONSP (val)
3879             || setup_ccl_program (&(coding->spec.ccl.decoder),
3880                                   XCAR (val)) < 0
3881             || setup_ccl_program (&(coding->spec.ccl.encoder),
3882                                   XCDR (val)) < 0)
3883           goto label_invalid_coding_system;
3884
3885         bzero (coding->spec.ccl.valid_codes, 256);
3886         val = Fplist_get (plist, Qvalid_codes);
3887         if (CONSP (val))
3888           {
3889             Lisp_Object this;
3890
3891             for (; CONSP (val); val = XCDR (val))
3892               {
3893                 this = XCAR (val);  /* Either CODE or (START . END).  */
3894                 if (INTEGERP (this)
3895                     && XINT (this) >= 0 && XINT (this) < 256)
3896                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3897                 else if (CONSP (this)
3898                          && INTEGERP (XCAR (this))
3899                          && INTEGERP (XCDR (this)))
3900                   {
3901                     int start = XINT (XCAR (this));
3902                     int end = XINT (XCDR (this));
3903
3904                     if (start >= 0 && start <= end && end < 256)
3905                       while (start <= end)
3906                         coding->spec.ccl.valid_codes[start++] = 1;
3907                   }
3908               }
3909           }
3910       }
3911       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3912       coding->spec.ccl.cr_carryover = 0;
3913       coding->spec.ccl.eight_bit_carryover[0] = 0;
3914       break;
3915
3916     case 5:
3917       coding->type = coding_type_raw_text;  /* No common_flags bit is added.  */
3918       break;
3919
3920     default:
3921       goto label_invalid_coding_system;
3922     }
3923   return 0;
3924
3925  label_invalid_coding_system:
3926   coding->type = coding_type_no_conversion;
3927   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3928   coding->common_flags = 0;
3929   coding->eol_type = CODING_EOL_UNDECIDED;
3930   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3931   return NILP (coding_system) ? 0 : -1;  /* nil is not reported as an error.  */
3932 }
3933
3934 /* Free memory blocks allocated for storing composition information.  */
3935
3936 void
3937 coding_free_composition_data (coding)
3938      struct coding_system *coding;
3939 {
3940   struct composition_data *cmp_data = coding->cmp_data, *next;
3941
3942   if (!cmp_data)
3943     return;                    /* Nothing to free.  */
3944   /* Memory blocks are chained.  At first, rewind to the first, then,
3945      free blocks one by one.  */
3946   while (cmp_data->prev)
3947     cmp_data = cmp_data->prev;
3948   while (cmp_data)
3949     {
3950       next = cmp_data->next;   /* Fetch the link before freeing the block.  */
3951       xfree (cmp_data);
3952       cmp_data = next;
3953     }
3954   coding->cmp_data = NULL;     /* Do not leave a pointer to freed memory.  */
3955 }
3956
3957 /* Set `char_offset' member of the block coding->cmp_data and of every
3958    block reachable from it through `next' to POS.  */
3959
3960 void
3961 coding_adjust_composition_offset (coding, pos)
3962      struct coding_system *coding;
3963      int pos;
3964 {
3965   struct composition_data *cmp_data;
3966
3967   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3968     cmp_data->char_offset = pos;  /* Blocks before cmp_data are not visited.  */
3969 }
3970
3971 /* Setup raw-text or one of its subsidiaries in the structure
3972    coding_system CODING according to the already setup value eol_type
3973    in CODING.  CODING should be setup for some coding system in
3974    advance.  */
3975
3976 void
3977 setup_raw_text_coding_system (coding)
3978      struct coding_system *coding;
3979 {
3980   if (coding->type != coding_type_raw_text)  /* Otherwise nothing to do.  */
3981     {
3982       coding->symbol = Qraw_text;
3983       coding->type = coding_type_raw_text;
3984       if (coding->eol_type != CODING_EOL_UNDECIDED)
3985         {
3986           Lisp_Object subsidiaries;
3987           subsidiaries = Fget (Qraw_text, Qeol_type);
3988
3989           if (VECTORP (subsidiaries)
3990               && XVECTOR (subsidiaries)->size == 3)
3991             coding->symbol     /* NOTE(review): assumes eol_type is 0..2.  */
3992               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3993         }
3994       setup_coding_system (coding->symbol, coding);  /* Resets all of CODING.  */
3995     }
3996   return;
3997 }
3998
3999 /* Emacs has a mechanism to automatically detect a coding system if it
4000    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4001    it's impossible to distinguish some coding systems accurately
4002    because they use the same range of codes.  So, at first, coding
4003    systems are categorized into the categories listed below:
4004
4005    o coding-category-emacs-mule
4006
4007         The category for a coding system which has the same code range
4008         as Emacs' internal format.  Assigned the coding-system (Lisp
4009         symbol) `emacs-mule' by default.
4010
4011    o coding-category-sjis
4012
4013         The category for a coding system which has the same code range
4014         as SJIS.  Assigned the coding-system (Lisp
4015         symbol) `japanese-shift-jis' by default.
4016
4017    o coding-category-iso-7
4018
4019         The category for a coding system which has the same code range
4020         as ISO2022 of 7-bit environment.  This doesn't use any locking
4021         shift and single shift functions.  This can encode/decode all
4022         charsets.  Assigned the coding-system (Lisp symbol)
4023         `iso-2022-7bit' by default.
4024
4025    o coding-category-iso-7-tight
4026
4027         Same as coding-category-iso-7 except that this can
4028         encode/decode only the specified charsets.
4029
4030    o coding-category-iso-8-1
4031
4032         The category for a coding system which has the same code range
4033         as ISO2022 of 8-bit environment and graphic plane 1 used only
4034         for DIMENSION1 charset.  This doesn't use any locking shift
4035         and single shift functions.  Assigned the coding-system (Lisp
4036         symbol) `iso-latin-1' by default.
4037
4038    o coding-category-iso-8-2
4039
4040         The category for a coding system which has the same code range
4041         as ISO2022 of 8-bit environment and graphic plane 1 used only
4042         for DIMENSION2 charset.  This doesn't use any locking shift
4043         and single shift functions.  Assigned the coding-system (Lisp
4044         symbol) `japanese-iso-8bit' by default.
4045
4046    o coding-category-iso-7-else
4047
4048         The category for a coding system which has the same code range
4049         as ISO2022 of 7-bit environment but uses locking shift or
4050         single shift functions.  Assigned the coding-system (Lisp
4051         symbol) `iso-2022-7bit-lock' by default.
4052
4053    o coding-category-iso-8-else
4054
4055         The category for a coding system which has the same code range
4056         as ISO2022 of 8-bit environment but uses locking shift or
4057         single shift functions.  Assigned the coding-system (Lisp
4058         symbol) `iso-2022-8bit-ss2' by default.
4059
4060    o coding-category-big5
4061
4062         The category for a coding system which has the same code range
4063         as BIG5.  Assigned the coding-system (Lisp symbol)
4064         `cn-big5' by default.
4065
4066    o coding-category-utf-8
4067
4068         The category for a coding system which has the same code range
4069         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4070         symbol) `utf-8' by default.
4071
4072    o coding-category-utf-16-be
4073
4074         The category for a coding system in which a text has a
4075         Unicode signature (cf. Unicode Standard) in the order of BIG
4076         endian at the head.  Assigned the coding-system (Lisp symbol)
4077         `utf-16-be' by default.
4078
4079    o coding-category-utf-16-le
4080
4081         The category for a coding system in which a text has a
4082         Unicode signature (cf. Unicode Standard) in the order of
4083         LITTLE endian at the head.  Assigned the coding-system (Lisp
4084         symbol) `utf-16-le' by default.
4085
4086    o coding-category-ccl
4087
4088         The category for a coding system of which encoder/decoder is
4089         written in CCL programs.  The default value is nil, i.e., no
4090         coding system is assigned.
4091
4092    o coding-category-binary
4093
4094         The category for a coding system not categorized in any of the
4095         above.  Assigned the coding-system (Lisp symbol)
4096         `no-conversion' by default.
4097
4098    Each of them is a Lisp symbol and the value is an actual
4099    `coding-system' (this is also a Lisp symbol) assigned by a user.
4100    What Emacs does actually is to detect a category of coding system.
4101    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4102    decide a single possible category, it selects a category of the
4103    highest priority.  Priorities of categories are also specified by a
4104    user in a Lisp variable `coding-category-list'.
4105
4106 */
4107
4108 static  /* Nonzero at index C: detect_coding_mask skips byte C at the head.  */
4109 int ascii_skip_code[256];
4110
4111 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4112    If it detects possible coding systems, return an integer in which
4113    appropriate flag bits are set.  Flag bits are defined by macros
4114    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4115    it should point to the table `coding_priorities'.  In that case, only
4116    the flag bit for a coding system of the highest priority is set in
4117    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4118    range 0x80..0x9F are in multibyte form.
4119
4120    How many ASCII characters are at the head is returned as *SKIP.  */
4121
4122 static int
4123 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4124      unsigned char *source;
4125      int src_bytes, *priorities, *skip;
4126      int multibytep;
4127 {
4128   register unsigned char c;
4129   unsigned char *src = source, *src_end = source + src_bytes;
4130   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4131   int i;
4132
4133   /* At first, skip all ASCII characters and control characters except
4134      for three ISO2022 specific control characters.  */
4135   ascii_skip_code[ISO_CODE_SO] = 0;
4136   ascii_skip_code[ISO_CODE_SI] = 0;
4137   ascii_skip_code[ISO_CODE_ESC] = 0;
4138
4139  label_loop_detect_coding:
4140   while (src < src_end && ascii_skip_code[*src]) src++;
4141   *skip = src - source;
4142
4143   if (src >= src_end)
4144     /* We found nothing other than ASCII.  There's nothing to do.  */
4145     return 0;
4146
4147   c = *src;
4148   /* The text seems to be encoded in some multilingual coding system.
4149      Now, try to find in which coding system the text is encoded.  */
4150   if (c < 0x80)
4151     {
4152       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4153       /* C is an ISO2022 specific control code of C0.  */
4154       mask = detect_coding_iso2022 (src, src_end, multibytep);
4155       if (mask == 0)
4156         {
4157           /* No valid ISO2022 code follows C.  Try again.  */
4158           src++;
4159           if (c == ISO_CODE_ESC)
4160             ascii_skip_code[ISO_CODE_ESC] = 1;
4161           else
4162             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4163           goto label_loop_detect_coding;
4164         }
4165       if (priorities)
4166         {
4167           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4168             {
4169               if (mask & priorities[i])
4170                 return priorities[i];
4171             }
4172           return CODING_CATEGORY_MASK_RAW_TEXT;  /* No priority entry matched.  */
4173         }
4174     }
4175   else
4176     {
4177       int try;
4178
4179       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4180         c = src[1] - 0x20;     /* NOTE(review): src[1] is read unchecked.  */
4181
4182       if (c < 0xA0)
4183         {
4184           /* C is the first byte of SJIS character code,
4185              or a leading-code of Emacs' internal format (emacs-mule),
4186              or the first byte of UTF-16.  */
4187           try = (CODING_CATEGORY_MASK_SJIS
4188                   | CODING_CATEGORY_MASK_EMACS_MULE
4189                   | CODING_CATEGORY_MASK_UTF_16_BE
4190                   | CODING_CATEGORY_MASK_UTF_16_LE);
4191
4192           /* Or, if C is a special latin extra code,
4193              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4194              or is an ISO2022 control-sequence-introducer (CSI),
4195              we should also consider the possibility of ISO2022 codings.  */
4196           if ((VECTORP (Vlatin_extra_code_table)
4197                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4198               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4199               || (c == ISO_CODE_CSI  /* NOTE(review): *src is >= 0x80 here (SRC not advanced), so the tests below look unsatisfiable -- confirm.  */
4200                   && (src < src_end
4201                       && (*src == ']'
4202                           || ((*src == '0' || *src == '1' || *src == '2')
4203                               && src + 1 < src_end
4204                               && src[1] == ']')))))
4205             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4206                      | CODING_CATEGORY_MASK_ISO_8BIT);
4207         }
4208       else
4209         /* C is a character of ISO2022 in graphic plane right,
4210            or a SJIS's 1-byte character code (i.e. JISX0201),
4211            or the first byte of BIG5's 2-byte code,
4212            or the first byte of UTF-8/16.  */
4213         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4214                 | CODING_CATEGORY_MASK_ISO_8BIT
4215                 | CODING_CATEGORY_MASK_SJIS
4216                 | CODING_CATEGORY_MASK_BIG5
4217                 | CODING_CATEGORY_MASK_UTF_8
4218                 | CODING_CATEGORY_MASK_UTF_16_BE
4219                 | CODING_CATEGORY_MASK_UTF_16_LE);
4220
4221       /* Or, we may have to consider the possibility of CCL.  */
4222       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4223           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4224               ->spec.ccl.valid_codes)[c])
4225         try |= CODING_CATEGORY_MASK_CCL;
4226
4227       mask = 0;
4228       utf16_examined_p = iso2022_examined_p = 0;
4229       if (priorities)
4230         {
4231           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4232             {
4233               if (!iso2022_examined_p
4234                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4235                 {
4236                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4237                   iso2022_examined_p = 1;  /* Run the ISO2022 detector once.  */
4238                 }
4239               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4240                 mask |= detect_coding_sjis (src, src_end, multibytep);
4241               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4242                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4243               else if (!utf16_examined_p
4244                        && (priorities[i] & try &
4245                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4246                 {
4247                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4248                   utf16_examined_p = 1;  /* Run the UTF-16 detector once.  */
4249                 }
4250               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4251                 mask |= detect_coding_big5 (src, src_end, multibytep);
4252               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4253                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4254               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4255                 mask |= detect_coding_ccl (src, src_end, multibytep);
4256               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4257                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4258               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4259                 mask |= CODING_CATEGORY_MASK_BINARY;
4260               if (mask & priorities[i])
4261                 return priorities[i];  /* First match in priority order wins.  */
4262             }
4263           return CODING_CATEGORY_MASK_RAW_TEXT;  /* No priority entry matched.  */
4264         }
4265       if (try & CODING_CATEGORY_MASK_ISO)
4266         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4267       if (try & CODING_CATEGORY_MASK_SJIS)
4268         mask |= detect_coding_sjis (src, src_end, multibytep);
4269       if (try & CODING_CATEGORY_MASK_BIG5)
4270         mask |= detect_coding_big5 (src, src_end, multibytep);
4271       if (try & CODING_CATEGORY_MASK_UTF_8)
4272         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4273       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4274         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4275       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4276         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4277       if (try & CODING_CATEGORY_MASK_CCL)
4278         mask |= detect_coding_ccl (src, src_end, multibytep);
4279     }                          /* Control gets past here only if PRIORITIES is NULL.  */
4280   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4281 }
4282
4283 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4284    The information of the detected coding system is set in CODING.  */
4285
4286 void
4287 detect_coding (coding, src, src_bytes)
4288      struct coding_system *coding;
4289      const unsigned char *src;
4290      int src_bytes;
4291 {
4292   unsigned int idx;
4293   int skip, mask;
4294   Lisp_Object val;
4295
4296   val = Vcoding_category_list; /* NOTE(review): overwritten before any use.  */
4297   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4298                              coding->src_multibyte);
4299   coding->heading_ascii = skip;
4300
4301   if (!mask) return;           /* No category was detected; keep CODING.  */
4302
4303   /* We found a single coding system of the highest priority in MASK.  */
4304   idx = 0;
4305   while (mask && ! (mask & 1)) mask >>= 1, idx++;  /* IDX: lowest set bit.  */
4306   if (! mask)
4307     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4308
4309   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4310
4311   if (coding->eol_type != CODING_EOL_UNDECIDED)
4312     {
4313       Lisp_Object tmp;
4314
4315       tmp = Fget (val, Qeol_type);
4316       if (VECTORP (tmp))       /* Pick the entry for the known eol type.  */
4317         val = XVECTOR (tmp)->contents[coding->eol_type];
4318     }
4319
4320   /* Setup this new coding system while preserving some slots.  */
4321   {
4322     int src_multibyte = coding->src_multibyte;
4323     int dst_multibyte = coding->dst_multibyte;
4324
4325     setup_coding_system (val, coding);  /* Zero-clears CODING first.  */
4326     coding->src_multibyte = src_multibyte;
4327     coding->dst_multibyte = dst_multibyte;
4328     coding->heading_ascii = skip;
4329   }
4330 }
4331
4332 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4333    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4334    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4335
4336    How many non-eol characters are at the head is returned as *SKIP.  */
4337
4338 #define MAX_EOL_CHECK_COUNT 3
4339
4340 static int
4341 detect_eol_type (source, src_bytes, skip)
4342      unsigned char *source;
4343      int src_bytes, *skip;
4344 {
4345   unsigned char *src = source, *src_end = src + src_bytes;
4346   unsigned char c;
4347   int total = 0;                /* How many end-of-lines are found so far.  */
4348   int eol_type = CODING_EOL_UNDECIDED;
4349   int this_eol_type;
4350
4351   *skip = 0;
4352
4353   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4354     {
4355       c = *src++;
4356       if (c == '\n' || c == '\r')
4357         {
4358           if (*skip == 0)
4359             *skip = src - 1 - source;
4360           total++;
4361           if (c == '\n')
4362             this_eol_type = CODING_EOL_LF;
4363           else if (src >= src_end || *src != '\n')
4364             this_eol_type = CODING_EOL_CR;
4365           else
4366             this_eol_type = CODING_EOL_CRLF, src++;
4367
4368           if (eol_type == CODING_EOL_UNDECIDED)
4369             /* This is the first end-of-line.  */
4370             eol_type = this_eol_type;
4371           else if (eol_type != this_eol_type)
4372             {
4373               /* The found type is different from what found before.  */
4374               eol_type = CODING_EOL_INCONSISTENT;
4375               break;
4376             }
4377         }
4378     }
4379
4380   if (*skip == 0)
4381     *skip = src_end - source;
4382   return eol_type;
4383 }
4384
4385 /* Like detect_eol_type, but detect EOL type in 2-octet
4386    big-endian/little-endian format for coding systems utf-16-be and
4387    utf-16-le.  */
4388
4389 static int
4390 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4391      unsigned char *source;
4392      int src_bytes, *skip, big_endian_p;
4393 {
4394   unsigned char *src = source, *src_end = src + src_bytes;
4395   unsigned int c1, c2;
4396   int total = 0;                /* How many end-of-lines are found so far.  */
4397   int eol_type = CODING_EOL_UNDECIDED;
4398   int this_eol_type;
4399   int msb, lsb;
4400
4401   if (big_endian_p)
4402     msb = 0, lsb = 1;
4403   else
4404     msb = 1, lsb = 0;
4405
4406   *skip = 0;
4407
4408   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4409     {
4410       c1 = (src[msb] << 8) | (src[lsb]);
4411       src += 2;
4412
4413       if (c1 == '\n' || c1 == '\r')
4414         {
4415           if (*skip == 0)
4416             *skip = src - 2 - source;
4417           total++;
4418           if (c1 == '\n')
4419             {
4420               this_eol_type = CODING_EOL_LF;
4421             }
4422           else
4423             {
4424               if ((src + 1) >= src_end)
4425                 {
4426                   this_eol_type = CODING_EOL_CR;
4427                 }
4428               else
4429                 {
4430                   c2 = (src[msb] << 8) | (src[lsb]);
4431                   if (c2 == '\n')
4432                     this_eol_type = CODING_EOL_CRLF, src += 2;
4433                   else
4434                     this_eol_type = CODING_EOL_CR;
4435                 }
4436             }
4437
4438           if (eol_type == CODING_EOL_UNDECIDED)
4439             /* This is the first end-of-line.  */
4440             eol_type = this_eol_type;
4441           else if (eol_type != this_eol_type)
4442             {
4443               /* The found type is different from what found before.  */
4444               eol_type = CODING_EOL_INCONSISTENT;
4445               break;
4446             }
4447         }
4448     }
4449
4450   if (*skip == 0)
4451     *skip = src_end - source;
4452   return eol_type;
4453 }
4454
4455 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4456    is encoded.  If it detects an appropriate format of end-of-line, it
4457    sets the information in *CODING.  */
4458
4459 void
4460 detect_eol (coding, src, src_bytes)
4461      struct coding_system *coding;
4462      const unsigned char *src;
4463      int src_bytes;
4464 {
4465   Lisp_Object val;
4466   int skip;
4467   int eol_type;
4468
4469   switch (coding->category_idx)
4470     {
4471     case CODING_CATEGORY_IDX_UTF_16_BE:
4472       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4473       break;
4474     case CODING_CATEGORY_IDX_UTF_16_LE:
4475       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4476       break;
4477     default:
4478       eol_type = detect_eol_type (src, src_bytes, &skip);
4479       break;
4480     }
4481
4482   if (coding->heading_ascii > skip)
4483     coding->heading_ascii = skip;
4484   else
4485     skip = coding->heading_ascii;
4486
4487   if (eol_type == CODING_EOL_UNDECIDED)
4488     return;
4489   if (eol_type == CODING_EOL_INCONSISTENT)
4490     {
4491 #if 0
4492       /* This code is suppressed until we find a better way to
4493          distinguish raw text file and binary file.  */
4494
4495       /* If we have already detected that the coding is raw-text, the
4496          coding should actually be no-conversion.  */
4497       if (coding->type == coding_type_raw_text)
4498         {
4499           setup_coding_system (Qno_conversion, coding);
4500           return;
4501         }
4502       /* Else, let's decode only text code anyway.  */
4503 #endif /* 0 */
4504       eol_type = CODING_EOL_LF;
4505     }
4506
4507   val = Fget (coding->symbol, Qeol_type);
4508   if (VECTORP (val) && XVECTOR (val)->size == 3)
4509     {
4510       int src_multibyte = coding->src_multibyte;
4511       int dst_multibyte = coding->dst_multibyte;
4512       struct composition_data *cmp_data = coding->cmp_data;
4513
4514       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4515       coding->src_multibyte = src_multibyte;
4516       coding->dst_multibyte = dst_multibyte;
4517       coding->heading_ascii = skip;
4518       coding->cmp_data = cmp_data;
4519     }
4520 }
4521
/* Number of bytes added on top of every conversion buffer size
   estimate.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of a source text is multiplied to
   estimate the size of a buffer for decoding it by CODING (a pointer
   to struct coding_system): 3 for ISO2022, the magnification recorded
   in the CCL decoder program for CCL, and 2 for everything else.
   CODING may be evaluated more than once.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4530
4531 /* Return maximum size (bytes) of a buffer enough for decoding
4532    SRC_BYTES of text encoded in CODING.  */
4533
4534 int
4535 decoding_buffer_size (coding, src_bytes)
4536      struct coding_system *coding;
4537      int src_bytes;
4538 {
4539   return (src_bytes * DECODING_BUFFER_MAG (coding)
4540           + CONVERSION_BUFFER_EXTRA_ROOM);
4541 }
4542
4543 /* Return maximum size (bytes) of a buffer enough for encoding
4544    SRC_BYTES of text to CODING.  */
4545
4546 int
4547 encoding_buffer_size (coding, src_bytes)
4548      struct coding_system *coding;
4549      int src_bytes;
4550 {
4551   int magnification;
4552
4553   if (coding->type == coding_type_ccl)
4554     {
4555       magnification = coding->spec.ccl.encoder.buf_magnification;
4556       if (coding->eol_type == CODING_EOL_CRLF)
4557         magnification *= 2;
4558     }
4559   else if (CODING_REQUIRE_ENCODING (coding))
4560     magnification = 3;
4561   else
4562     magnification = 1;
4563
4564   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4565 }
4566
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated for DATA.  */
  int on_stack;                 /* 1 if DATA was allocated by alloca,
                                   0 if it was allocated by xmalloc.  */
  unsigned char *data;          /* The memory itself.  */
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   The memory comes from alloca if LEN is less than MAX_ALLOCA, and
   from xmalloc otherwise; BUF.on_stack records which.

   This has to stay a macro: memory obtained by alloca must belong to
   the stack frame of the function that uses the buffer.  BUF must be
   an lvalue of type struct conversion_buffer (not a pointer).  Both
   BUF and LEN are evaluated more than once, so they must not have
   side effects.  */
#define allocate_conversion_buffer(buf, len)              \
  do {                                                    \
    if ((len) < MAX_ALLOCA)                               \
      {                                                   \
        (buf).data = (unsigned char *) alloca (len);      \
        (buf).on_stack = 1;                               \
      }                                                   \
    else                                                  \
      {                                                   \
        (buf).data = (unsigned char *) xmalloc (len);     \
        (buf).on_stack = 0;                               \
      }                                                   \
    (buf).size = (len);                                   \
  } while (0)
4590
4591 /* Double the allocated memory for *BUF.  */
4592 static void
4593 extend_conversion_buffer (buf)
4594      struct conversion_buffer *buf;
4595 {
4596   if (buf->on_stack)
4597     {
4598       unsigned char *save = buf->data;
4599       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4600       bcopy (save, buf->data, buf->size);
4601       buf->on_stack = 0;
4602     }
4603   else
4604     {
4605       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4606     }
4607   buf->size *= 2;
4608 }
4609
4610 /* Free the allocated memory for BUF if it is not on stack.  */
4611 static void
4612 free_conversion_buffer (buf)
4613      struct conversion_buffer *buf;
4614 {
4615   if (!buf->on_stack)
4616     xfree (buf->data);
4617 }
4618
4619 int
4620 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4621      struct coding_system *coding;
4622      unsigned char *source, *destination;
4623      int src_bytes, dst_bytes, encodep;
4624 {
4625   struct ccl_program *ccl
4626     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4627   unsigned char *dst = destination;
4628
4629   ccl->suppress_error = coding->suppress_error;
4630   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4631   if (encodep)
4632     {
4633       /* On encoding, EOL format is converted within ccl_driver.  For
4634          that, setup proper information in the structure CCL.  */
4635       ccl->eol_type = coding->eol_type;
4636       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4637         ccl->eol_type = CODING_EOL_LF;
4638       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4639       ccl->eight_bit_control = coding->dst_multibyte;
4640     }
4641   else
4642     ccl->eight_bit_control = 1;
4643   ccl->multibyte = coding->src_multibyte;
4644   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4645     {
4646       /* Move carryover bytes to DESTINATION.  */
4647       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4648       while (*p)
4649         *dst++ = *p++;
4650       coding->spec.ccl.eight_bit_carryover[0] = 0;
4651       if (dst_bytes)
4652         dst_bytes -= dst - destination;
4653     }
4654
4655   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4656                                   &(coding->consumed))
4657                       + dst - destination);
4658
4659   if (encodep)
4660     {
4661       coding->produced_char = coding->produced;
4662       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4663     }
4664   else if (!ccl->eight_bit_control)
4665     {
4666       /* The produced bytes forms a valid multibyte sequence. */
4667       coding->produced_char
4668         = multibyte_chars_in_text (destination, coding->produced);
4669       coding->spec.ccl.eight_bit_carryover[0] = 0;
4670     }
4671   else
4672     {
4673       /* On decoding, the destination should always multibyte.  But,
4674          CCL program might have been generated an invalid multibyte
4675          sequence.  Here we make such a sequence valid as
4676          multibyte.  */
4677       int bytes
4678         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4679
4680       if ((coding->consumed < src_bytes
4681            || !ccl->last_block)
4682           && coding->produced >= 1
4683           && destination[coding->produced - 1] >= 0x80)
4684         {
4685           /* We should not convert the tailing 8-bit codes to
4686              multibyte form even if they doesn't form a valid
4687              multibyte sequence.  They may form a valid sequence in
4688              the next call.  */
4689           int carryover = 0;
4690
4691           if (destination[coding->produced - 1] < 0xA0)
4692             carryover = 1;
4693           else if (coding->produced >= 2)
4694             {
4695               if (destination[coding->produced - 2] >= 0x80)
4696                 {
4697                   if (destination[coding->produced - 2] < 0xA0)
4698                     carryover = 2;
4699                   else if (coding->produced >= 3
4700                            && destination[coding->produced - 3] >= 0x80
4701                            && destination[coding->produced - 3] < 0xA0)
4702                     carryover = 3;
4703                 }
4704             }
4705           if (carryover > 0)
4706             {
4707               BCOPY_SHORT (destination + coding->produced - carryover,
4708                            coding->spec.ccl.eight_bit_carryover,
4709                            carryover);
4710               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4711               coding->produced -= carryover;
4712             }
4713         }
4714       coding->produced = str_as_multibyte (destination, bytes,
4715                                            coding->produced,
4716                                            &(coding->produced_char));
4717     }
4718
4719   switch (ccl->status)
4720     {
4721     case CCL_STAT_SUSPEND_BY_SRC:
4722       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4723       break;
4724     case CCL_STAT_SUSPEND_BY_DST:
4725       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4726       break;
4727     case CCL_STAT_QUIT:
4728     case CCL_STAT_INVALID_CMD:
4729       coding->result = CODING_FINISH_INTERRUPT;
4730       break;
4731     default:
4732       coding->result = CODING_FINISH_NORMAL;
4733       break;
4734     }
4735   return coding->result;
4736 }
4737
4738 /* Decode EOL format of the text at PTR of BYTES length destructively
4739    according to CODING->eol_type.  This is called after the CCL
4740    program produced a decoded text at PTR.  If we do CRLF->LF
4741    conversion, update CODING->produced and CODING->produced_char.  */
4742
4743 static void
4744 decode_eol_post_ccl (coding, ptr, bytes)
4745      struct coding_system *coding;
4746      unsigned char *ptr;
4747      int bytes;
4748 {
4749   Lisp_Object val, saved_coding_symbol;
4750   unsigned char *pend = ptr + bytes;
4751   int dummy;
4752
4753   /* Remember the current coding system symbol.  We set it back when
4754      an inconsistent EOL is found so that `last-coding-system-used' is
4755      set to the coding system that doesn't specify EOL conversion.  */
4756   saved_coding_symbol = coding->symbol;
4757
4758   coding->spec.ccl.cr_carryover = 0;
4759   if (coding->eol_type == CODING_EOL_UNDECIDED)
4760     {
4761       /* Here, to avoid the call of setup_coding_system, we directly
4762          call detect_eol_type.  */
4763       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4764       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4765         coding->eol_type = CODING_EOL_LF;
4766       if (coding->eol_type != CODING_EOL_UNDECIDED)
4767         {
4768           val = Fget (coding->symbol, Qeol_type);
4769           if (VECTORP (val) && XVECTOR (val)->size == 3)
4770             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4771         }
4772       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4773     }
4774
4775   if (coding->eol_type == CODING_EOL_LF
4776       || coding->eol_type == CODING_EOL_UNDECIDED)
4777     {
4778       /* We have nothing to do.  */
4779       ptr = pend;
4780     }
4781   else if (coding->eol_type == CODING_EOL_CRLF)
4782     {
4783       unsigned char *pstart = ptr, *p = ptr;
4784
4785       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4786           && *(pend - 1) == '\r')
4787         {
4788           /* If the last character is CR, we can't handle it here
4789              because LF will be in the not-yet-decoded source text.
4790              Record that the CR is not yet processed.  */
4791           coding->spec.ccl.cr_carryover = 1;
4792           coding->produced--;
4793           coding->produced_char--;
4794           pend--;
4795         }
4796       while (ptr < pend)
4797         {
4798           if (*ptr == '\r')
4799             {
4800               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4801                 {
4802                   *p++ = '\n';
4803                   ptr += 2;
4804                 }
4805               else
4806                 {
4807                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4808                     goto undo_eol_conversion;
4809                   *p++ = *ptr++;
4810                 }
4811             }
4812           else if (*ptr == '\n'
4813                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4814             goto undo_eol_conversion;
4815           else
4816             *p++ = *ptr++;
4817           continue;
4818
4819         undo_eol_conversion:
4820           /* We have faced with inconsistent EOL format at PTR.
4821              Convert all LFs before PTR back to CRLFs.  */
4822           for (p--, ptr--; p >= pstart; p--)
4823             {
4824               if (*p == '\n')
4825                 *ptr-- = '\n', *ptr-- = '\r';
4826               else
4827                 *ptr-- = *p;
4828             }
4829           /*  If carryover is recorded, cancel it because we don't
4830               convert CRLF anymore.  */
4831           if (coding->spec.ccl.cr_carryover)
4832             {
4833               coding->spec.ccl.cr_carryover = 0;
4834               coding->produced++;
4835               coding->produced_char++;
4836               pend++;
4837             }
4838           p = ptr = pend;
4839           coding->eol_type = CODING_EOL_LF;
4840           coding->symbol = saved_coding_symbol;
4841         }
4842       if (p < pend)
4843         {
4844           /* As each two-byte sequence CRLF was converted to LF, (PEND
4845              - P) is the number of deleted characters.  */
4846           coding->produced -= pend - p;
4847           coding->produced_char -= pend - p;
4848         }
4849     }
4850   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4851     {
4852       unsigned char *p = ptr;
4853
4854       for (; ptr < pend; ptr++)
4855         {
4856           if (*ptr == '\r')
4857             *ptr = '\n';
4858           else if (*ptr == '\n'
4859                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4860             {
4861               for (; p < ptr; p++)
4862                 {
4863                   if (*p == '\n')
4864                     *p = '\r';
4865                 }
4866               ptr = pend;
4867               coding->eol_type = CODING_EOL_LF;
4868               coding->symbol = saved_coding_symbol;
4869             }
4870         }
4871     }
4872 }
4873
4874 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4875    decoding, it may detect coding system and format of end-of-line if
4876    those are not yet decided.  The source should be unibyte, the
4877    result is multibyte if CODING->dst_multibyte is nonzero, else
4878    unibyte.  */
4879
4880 int
4881 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4882      struct coding_system *coding;
4883      const unsigned char *source;
4884      unsigned char *destination;
4885      int src_bytes, dst_bytes;
4886 {
4887   int extra = 0;
4888
4889   if (coding->type == coding_type_undecided)
4890     detect_coding (coding, source, src_bytes);
4891
4892   if (coding->eol_type == CODING_EOL_UNDECIDED
4893       && coding->type != coding_type_ccl)
4894     {
4895       detect_eol (coding, source, src_bytes);
4896       /* We had better recover the original eol format if we
4897          encounter an inconsistent eol format while decoding.  */
4898       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4899     }
4900
4901   coding->produced = coding->produced_char = 0;
4902   coding->consumed = coding->consumed_char = 0;
4903   coding->errors = 0;
4904   coding->result = CODING_FINISH_NORMAL;
4905
4906   switch (coding->type)
4907     {
4908     case coding_type_sjis:
4909       decode_coding_sjis_big5 (coding, source, destination,
4910                                src_bytes, dst_bytes, 1);
4911       break;
4912
4913     case coding_type_iso2022:
4914       decode_coding_iso2022 (coding, source, destination,
4915                              src_bytes, dst_bytes);
4916       break;
4917
4918     case coding_type_big5:
4919       decode_coding_sjis_big5 (coding, source, destination,
4920                                src_bytes, dst_bytes, 0);
4921       break;
4922
4923     case coding_type_emacs_mule:
4924       decode_coding_emacs_mule (coding, source, destination,
4925                                 src_bytes, dst_bytes);
4926       break;
4927
4928     case coding_type_ccl:
4929       if (coding->spec.ccl.cr_carryover)
4930         {
4931           /* Put the CR which was not processed by the previous call
4932              of decode_eol_post_ccl in DESTINATION.  It will be
4933              decoded together with the following LF by the call to
4934              decode_eol_post_ccl below.  */
4935           *destination = '\r';
4936           coding->produced++;
4937           coding->produced_char++;
4938           dst_bytes--;
4939           extra = coding->spec.ccl.cr_carryover;
4940         }
4941       ccl_coding_driver (coding, source, destination + extra,
4942                          src_bytes, dst_bytes, 0);
4943       if (coding->eol_type != CODING_EOL_LF)
4944         {
4945           coding->produced += extra;
4946           coding->produced_char += extra;
4947           decode_eol_post_ccl (coding, destination, coding->produced);
4948         }
4949       break;
4950
4951     default:
4952       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4953     }
4954
4955   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4956       && coding->mode & CODING_MODE_LAST_BLOCK
4957       && coding->consumed == src_bytes)
4958     coding->result = CODING_FINISH_NORMAL;
4959
4960   if (coding->mode & CODING_MODE_LAST_BLOCK
4961       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4962     {
4963       const unsigned char *src = source + coding->consumed;
4964       unsigned char *dst = destination + coding->produced;
4965
4966       src_bytes -= coding->consumed;
4967       coding->errors++;
4968       if (COMPOSING_P (coding))
4969         DECODE_COMPOSITION_END ('1');
4970       while (src_bytes--)
4971         {
4972           int c = *src++;
4973           dst += CHAR_STRING (c, dst);
4974           coding->produced_char++;
4975         }
4976       coding->consumed = coding->consumed_char = src - source;
4977       coding->produced = dst - destination;
4978       coding->result = CODING_FINISH_NORMAL;
4979     }
4980
4981   if (!coding->dst_multibyte)
4982     {
4983       coding->produced = str_as_unibyte (destination, coding->produced);
4984       coding->produced_char = coding->produced;
4985     }
4986
4987   return coding->result;
4988 }
4989
4990 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4991    multibyteness of the source is CODING->src_multibyte, the
4992    multibyteness of the result is always unibyte.  */
4993
4994 int
4995 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4996      struct coding_system *coding;
4997      const unsigned char *source;
4998      unsigned char *destination;
4999      int src_bytes, dst_bytes;
5000 {
5001   coding->produced = coding->produced_char = 0;
5002   coding->consumed = coding->consumed_char = 0;
5003   coding->errors = 0;
5004   coding->result = CODING_FINISH_NORMAL;
5005   if (coding->eol_type == CODING_EOL_UNDECIDED)
5006     coding->eol_type = CODING_EOL_LF;
5007
5008   switch (coding->type)
5009     {
5010     case coding_type_sjis:
5011       encode_coding_sjis_big5 (coding, source, destination,
5012                                src_bytes, dst_bytes, 1);
5013       break;
5014
5015     case coding_type_iso2022:
5016       encode_coding_iso2022 (coding, source, destination,
5017                              src_bytes, dst_bytes);
5018       break;
5019
5020     case coding_type_big5:
5021       encode_coding_sjis_big5 (coding, source, destination,
5022                                src_bytes, dst_bytes, 0);
5023       break;
5024
5025     case coding_type_emacs_mule:
5026       encode_coding_emacs_mule (coding, source, destination,
5027                                 src_bytes, dst_bytes);
5028       break;
5029
5030     case coding_type_ccl:
5031       ccl_coding_driver (coding, source, destination,
5032                          src_bytes, dst_bytes, 1);
5033       break;
5034
5035     default:
5036       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5037     }
5038
5039   if (coding->mode & CODING_MODE_LAST_BLOCK
5040       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5041     {
5042       const unsigned char *src = source + coding->consumed;
5043       unsigned char *dst = destination + coding->produced;
5044
5045       if (coding->type == coding_type_iso2022)
5046         ENCODE_RESET_PLANE_AND_REGISTER;
5047       if (COMPOSING_P (coding))
5048         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5049       if (coding->consumed < src_bytes)
5050         {
5051           int len = src_bytes - coding->consumed;
5052
5053           BCOPY_SHORT (src, dst, len);
5054           if (coding->src_multibyte)
5055             len = str_as_unibyte (dst, len);
5056           dst += len;
5057           coding->consumed = src_bytes;
5058         }
5059       coding->produced = coding->produced_char = dst - destination;
5060       coding->result = CODING_FINISH_NORMAL;
5061     }
5062
5063   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5064       && coding->consumed == src_bytes)
5065     coding->result = CODING_FINISH_NORMAL;
5066
5067   return coding->result;
5068 }
5069
5070 /* Scan text in the region between *BEG and *END (byte positions),
5071    skip characters which we don't have to decode by coding system
5072    CODING at the head and tail, then set *BEG and *END to the region
5073    of the text we actually have to convert.  The caller should move
5074    the gap out of the region in advance if the region is from a
5075    buffer.
5076
5077    If STR is not NULL, *BEG and *END are indices into STR.  */
5078
5079 static void
5080 shrink_decoding_region (beg, end, coding, str)
5081      int *beg, *end;
5082      struct coding_system *coding;
5083      unsigned char *str;
5084 {
5085   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5086   int eol_conversion;
5087   Lisp_Object translation_table;
5088
5089   if (coding->type == coding_type_ccl
5090       || coding->type == coding_type_undecided
5091       || coding->eol_type != CODING_EOL_LF
5092       || !NILP (coding->post_read_conversion)
5093       || coding->composing != COMPOSITION_DISABLED)
5094     {
5095       /* We can't skip any data.  */
5096       return;
5097     }
5098   if (coding->type == coding_type_no_conversion
5099       || coding->type == coding_type_raw_text
5100       || coding->type == coding_type_emacs_mule)
5101     {
5102       /* We need no conversion, but don't have to skip any data here.
5103          Decoding routine handles them effectively anyway.  */
5104       return;
5105     }
5106
5107   translation_table = coding->translation_table_for_decode;
5108   if (NILP (translation_table) && !NILP (Venable_character_translation))
5109     translation_table = Vstandard_translation_table_for_decode;
5110   if (CHAR_TABLE_P (translation_table))
5111     {
5112       int i;
5113       for (i = 0; i < 128; i++)
5114         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5115           break;
5116       if (i < 128)
5117         /* Some ASCII character should be translated.  We give up
5118            shrinking.  */
5119         return;
5120     }
5121
5122   if (coding->heading_ascii >= 0)
5123     /* Detection routine has already found how much we can skip at the
5124        head.  */
5125     *beg += coding->heading_ascii;
5126
5127   if (str)
5128     {
5129       begp_orig = begp = str + *beg;
5130       endp_orig = endp = str + *end;
5131     }
5132   else
5133     {
5134       begp_orig = begp = BYTE_POS_ADDR (*beg);
5135       endp_orig = endp = begp + *end - *beg;
5136     }
5137
5138   eol_conversion = (coding->eol_type == CODING_EOL_CR
5139                     || coding->eol_type == CODING_EOL_CRLF);
5140
5141   switch (coding->type)
5142     {
5143     case coding_type_sjis:
5144     case coding_type_big5:
5145       /* We can skip all ASCII characters at the head.  */
5146       if (coding->heading_ascii < 0)
5147         {
5148           if (eol_conversion)
5149             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5150           else
5151             while (begp < endp && *begp < 0x80) begp++;
5152         }
5153       /* We can skip all ASCII characters at the tail except for the
5154          second byte of SJIS or BIG5 code.  */
5155       if (eol_conversion)
5156         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5157       else
5158         while (begp < endp && endp[-1] < 0x80) endp--;
5159       /* Do not consider LF as ascii if preceded by CR, since that
5160          confuses eol decoding. */
5161       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5162         endp++;
5163       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5164         endp++;
5165       break;
5166
5167     case coding_type_iso2022:
5168       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5169         /* We can't skip any data.  */
5170         break;
5171       if (coding->heading_ascii < 0)
5172         {
5173           /* We can skip all ASCII characters at the head except for a
5174              few control codes.  */
5175           while (begp < endp && (c = *begp) < 0x80
5176                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5177                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5178                  && (!eol_conversion || c != ISO_CODE_LF))
5179             begp++;
5180         }
5181       switch (coding->category_idx)
5182         {
5183         case CODING_CATEGORY_IDX_ISO_8_1:
5184         case CODING_CATEGORY_IDX_ISO_8_2:
5185           /* We can skip all ASCII characters at the tail.  */
5186           if (eol_conversion)
5187             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5188           else
5189             while (begp < endp && endp[-1] < 0x80) endp--;
5190           /* Do not consider LF as ascii if preceded by CR, since that
5191              confuses eol decoding. */
5192           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5193             endp++;
5194           break;
5195
5196         case CODING_CATEGORY_IDX_ISO_7:
5197         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5198           {
5199             /* We can skip all characters at the tail except for 8-bit
5200                codes and ESC and the following 2-byte at the tail.  */
5201             unsigned char *eight_bit = NULL;
5202
5203             if (eol_conversion)
5204               while (begp < endp
5205                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5206                 {
5207                   if (!eight_bit && c & 0x80) eight_bit = endp;
5208                   endp--;
5209                 }
5210             else
5211               while (begp < endp
5212                      && (c = endp[-1]) != ISO_CODE_ESC)
5213                 {
5214                   if (!eight_bit && c & 0x80) eight_bit = endp;
5215                   endp--;
5216                 }
5217             /* Do not consider LF as ascii if preceded by CR, since that
5218                confuses eol decoding. */
5219             if (begp < endp && endp < endp_orig
5220                 && endp[-1] == '\r' && endp[0] == '\n')
5221               endp++;
5222             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5223               {
5224                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5225                   /* This is an ASCII designation sequence.  We can
5226                      surely skip the tail.  But, if we have
5227                      encountered an 8-bit code, skip only the codes
5228                      after that.  */
5229                   endp = eight_bit ? eight_bit : endp + 2;
5230                 else
5231                   /* Hmmm, we can't skip the tail.  */
5232                   endp = endp_orig;
5233               }
5234             else if (eight_bit)
5235               endp = eight_bit;
5236           }
5237         }
5238       break;
5239
5240     default:
5241       abort ();
5242     }
5243   *beg += begp - begp_orig;
5244   *end += endp - endp_orig;
5245   return;
5246 }
5247
5248 /* Like shrink_decoding_region but for encoding.  *BEG and *END are
5249    byte offsets into STR if STR is non-NULL, else they are resolved
5250    with BYTE_POS_ADDR (presumably byte positions in the current
5251    buffer).  On return *BEG is advanced past the leading ASCII run and
5252    *END moved back before the trailing one when skipping them is safe
5253    for CODING; otherwise both are left unchanged.  */
5254
5255 static void
5256 shrink_encoding_region (beg, end, coding, str)
5257      int *beg, *end;
5258      struct coding_system *coding;
5259      unsigned char *str;
5260 {
5261   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5262   Lisp_Object translation_table;
5263
5264   if (coding->type == coding_type_ccl
5265       || coding->eol_type == CODING_EOL_CRLF
5266       || coding->eol_type == CODING_EOL_CR
5267       || (coding->cmp_data && coding->cmp_data->used > 0))
5268     {
5269       /* We can't skip any data.  */
5270       return;
5271     }
5272   if (coding->type == coding_type_no_conversion
5273       || coding->type == coding_type_raw_text
5274       || coding->type == coding_type_emacs_mule
5275       || coding->type == coding_type_undecided)
5276     {
5277       /* We need no conversion, but don't have to skip any data here.
5278          Encoding routine handles them effectively anyway.  */
5279       return;
5280     }
5281
5282   translation_table = coding->translation_table_for_encode;
5283   if (NILP (translation_table) && !NILP (Venable_character_translation))
5284     translation_table = Vstandard_translation_table_for_encode;
5285   if (CHAR_TABLE_P (translation_table))
5286     {
5287       int i;
5288       for (i = 0; i < 128; i++)
5289         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5290           break;
5291       if (i < 128)
5292         /* Some ASCII character should be translated.  We give up
5293            shrinking.  */
5294         return;
5295     }
5296
5297   if (str)
5298     {
5299       begp_orig = begp = str + *beg;
5300       endp_orig = endp = str + *end;
5301     }
5302   else
5303     {
5304       begp_orig = begp = BYTE_POS_ADDR (*beg);
5305       endp_orig = endp = begp + *end - *beg;
5306     }
5307
5308   /* Coding systems whose eol_type is CR or CRLF were rejected at the
5309      top, so no end-of-line conversion is pending and '\n' needs no
5310      special treatment in the scans below.  Also, we don't have to
5311      check coding->pre_write_conversion because the caller is expected
5312      to have handled it already.  */
5313   switch (coding->type)
5314     {
5315     case coding_type_iso2022:
5316       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5317         /* We can't skip any data.  */
5318         break;
5319       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5320         {
5321           /* Skip whole lines only: BOL tracks the last line start
5322              seen inside the leading ASCII run.  */
5323           unsigned char *bol = begp;
5324           while (begp < endp && *begp < 0x80)
5325             {
5326               begp++;
5327               if (begp[-1] == '\n')
5328                 bol = begp;
5329             }
5330           begp = bol;
5331           goto label_skip_tail;
5332         }
5333       /* fall down ... */
5334
5335     case coding_type_sjis:
5336     case coding_type_big5:
5337       /* We can skip all ASCII characters at the head and tail.  */
5338       while (begp < endp && *begp < 0x80) begp++;
5339     label_skip_tail:
5340       while (begp < endp && *(endp - 1) < 0x80) endp--;
5341       break;
5342
5343     default:
5344       abort ();
5345     }
5346
5347   *beg += begp - begp_orig;
5348   *end += endp - endp_orig;
5349   return;
5350 }
5351
5352 /* As shrinking conversion region requires some overhead, we don't try
5353    shrinking unless the length of conversion region (*END - *BEG as
5354    computed by SHRINK_CONVERSION_REGION) exceeds this value.  */
5355 static int shrink_conversion_region_threshhold = 1024;
5356 /* Apply the shrinker chosen by ENCODEP when *END - *BEG exceeds the threshold.  */
5357 #define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
5358   do {                                                                  \
5359     if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
5360       {                                                                 \
5361         if (!(encodep)) shrink_decoding_region (beg, end, coding, str); \
5362         else shrink_encoding_region (beg, end, coding, str);            \
5363       }                                                                 \
5364   } while (0)
5365
5366 /* Handler passed to record_unwind_protect.  ARG is a list (CODING
5367    BUFFER ...): CODING is stored back in Vlast_coding_system_used and
5368    every remaining element is a buffer to kill.  Returns Qnil.  */
5369 static Lisp_Object
5370 code_convert_region_unwind (arg)
5371      Lisp_Object arg;
5372 {
5373   struct gcpro gcpro1;
5374
5375   GCPRO1 (arg);
5376   Vlast_coding_system_used = XCAR (arg);
5377   inhibit_pre_post_conversion = 0;
5378   /* Step past the element just consumed, then kill what follows.  */
5379   while (arg = XCDR (arg), ! NILP (arg))
5380     Fkill_buffer (XCAR (arg));
5381   UNGCPRO;
5382   return Qnil;
5383 }
5384
5385 /* Record all compositions found between FROM and TO of OBJ in the
5386    memory blocks pointed to by CODING->cmp_data.  OBJ is a buffer or a
5387    string, defaults to the current buffer.  */
5388
5389 void
5390 coding_save_composition (coding, from, to, obj)
5391      struct coding_system *coding;
5392      int from, to;
5393      Lisp_Object obj;
5394 {
5395   int start, end;
5396   Lisp_Object prop;
5397
5398   if (coding->composing == COMPOSITION_DISABLED)
5399     return;
5400   if (!coding->cmp_data)
5401     coding_allocate_composition_data (coding, from);
5402
5403   /* Find the first composition lying wholly inside FROM..TO; one that
5404      starts before FROM is passed over in favor of its successor.  */
5405   if (!find_composition (from, to, &start, &end, &prop, obj) || end > to)
5406     return;
5407   if (start < from)
5408     if (!find_composition (end, to, &start, &end, &prop, obj) || end > to)
5409       return;
5410
5411   coding->composing = COMPOSITION_NO;
5412   do
5413     {
5414       if (COMPOSITION_VALID_P (start, end, prop))
5415         {
5416           enum composition_method method = COMPOSITION_METHOD (prop);
5417
5418           if (COMPOSITION_DATA_SIZE
5419               <= coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH)
5420             coding_allocate_composition_data (coding, from);
5421           /* A relative composition is recorded by its start and end
5422              positions alone; any other method stores components too.  */
5423           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5424           if (method != COMPOSITION_RELATIVE)
5425             {
5426               Lisp_Object ch, val = COMPOSITION_COMPONENTS (prop);
5427               int i, len;
5428
5429               if (CONSP (val))
5430                 for (; CONSP (val); val = XCDR (val))
5431                   {
5432                     ch = XCAR (val);
5433                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5434                   }
5435               else if (STRINGP (val))
5436                 for (i = 0, len = SCHARS (val); i < len; i++)
5437                   {
5438                     ch = Faref (val, make_number (i));
5439                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5440                   }
5441               else if (VECTORP (val))
5442                 for (i = 0, len = XVECTOR (val)->size; i < len; i++)
5443                   {
5444                     ch = XVECTOR (val)->contents[i];
5445                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5446                   }
5447               else              /* INTEGERP (val) */
5448                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5449             }
5450           CODING_ADD_COMPOSITION_END (coding, end - from);
5451         }
5452       start = end;
5453     }
5454   while (start < to
5455          && find_composition (start, to, &start, &end, &prop, obj)
5456          && end <= to);
5457
5458   /* Make coding->cmp_data point to the first memory block.  */
5459   while (coding->cmp_data->prev)
5460     coding->cmp_data = coding->cmp_data->prev;
5461   coding->cmp_data_start = 0;
5462 }
5463
5464 /* Reflect the saved information about compositions to OBJ.
5465    CODING->cmp_data points to a memory block for the information.  OBJ
5466    is a buffer or a string, defaults to the current buffer.  Handling
5467    of a block stops at its first malformed record.  */
5468
5469 void
5470 coding_restore_composition (coding, obj)
5471      struct coding_system *coding;
5472      Lisp_Object obj;
5473 {
5474   struct composition_data *cmp_data = coding->cmp_data;
5475
5476   if (!cmp_data)
5477     return;
5478   while (cmp_data->prev)
5479     cmp_data = cmp_data->prev;
5480   for (; cmp_data; cmp_data = cmp_data->next)
5481     {
5482       int i;
5483
5484       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5485            i += cmp_data->data[i])
5486         {
5487           int *data = cmp_data->data + i;
5488           enum composition_method method;
5489           Lisp_Object components;
5490
5491           /* The 4-int header must be complete and the record must fit
5492              in the used part of the block before DATA[3] is read.  */
5493           if (data[0] < 4 || i + data[0] > cmp_data->used)
5494             break;
5495           method = (enum composition_method) data[3];
5496           if (method == COMPOSITION_RELATIVE)
5497             components = Qnil;
5498           else
5499             {
5500               int len = data[0] - 4, j;
5501               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5502
5503               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5504                   && len % 2 == 0)
5505                 len --;
5506               /* Invalid composition data: no component at all, or
5507                  more components than ARGS can hold.  */
5508               if (len < 1 || len > (int) (sizeof args / sizeof args[0]))
5509                 break;
5510               for (j = 0; j < len; j++)
5511                 args[j] = make_number (data[4 + j]);
5512               components = (method == COMPOSITION_WITH_ALTCHARS
5513                             ? Fstring (len, args)
5514                             : Fvector (len, args));
5515             }
5516           compose_text (data[1], data[2], components, Qnil, obj);
5517         }
5518     }
5519 }
5520
5521 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5522    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5523    coding system CODING, and return the status code of code conversion
5524    (currently, this value has no meaning; every return path yields 0).
5525
5526    How many characters (and bytes) are converted to how many
5527    characters (and bytes) are recorded in members of the structure
5528    CODING (produced, produced_char; plus consumed, consumed_char on the final return).
5529
5530    If REPLACE is nonzero, we do various things as if the original text
5531    is deleted and a new text is inserted.  See the comments in
5532    replace_range (insdel.c) to know what we are doing.
5533
5534    If REPLACE is zero, it is assumed that the source text is unibyte.
5535    Otherwise, it is assumed that the source text is multibyte.  */
5536
5537 int
5538 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5539      int from, from_byte, to, to_byte, encodep, replace;
5540      struct coding_system *coding;
5541 {
5542   int len = to - from, len_byte = to_byte - from_byte;
5543   int nchars_del = 0, nbytes_del = 0; /* Set only when REPLACE and undo_list is t.  */
5544   int require, inserted, inserted_byte;
5545   int head_skip, tail_skip, total_skip = 0;
5546   Lisp_Object saved_coding_symbol;
5547   int first = 1; /* Nonzero until REQUIRE has been re-estimated.  */
5548   unsigned char *src, *dst;
5549   Lisp_Object deletion;
5550   int orig_point = PT, orig_len = len;
5551   int prev_Z;
5552   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5553   /* HEAD_SKIP and TAIL_SKIP are assigned only in the shrinking step below and are read only when TOTAL_SKIP > 0.  */
5554   deletion = Qnil;
5555   saved_coding_symbol = coding->symbol;
5556   /* If point is strictly inside the region, move it to FROM.  */
5557   if (from < PT && PT < to)
5558     {
5559       TEMP_SET_PT_BOTH (from, from_byte);
5560       orig_point = from;
5561     }
5562
5563   if (replace)
5564     {
5565       int saved_from = from;
5566       int saved_inhibit_modification_hooks;
5567
5568       prepare_to_modify_buffer (from, to, &from);
5569       if (saved_from != from)
5570         {
5571           to = from + len;
5572           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5573           len_byte = to_byte - from_byte;
5574         }
5575
5576       /* The code conversion routine can not preserve text properties
5577          for now.  So, we must remove all text properties in the
5578          region.  Here, we must suppress all modification hooks.  */
5579       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5580       inhibit_modification_hooks = 1;
5581       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5582       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5583     }
5584
5585   coding->heading_ascii = 0;
5586
5587   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5588     {
5589       /* We must detect encoding of text and eol format.  */
5590
5591       if (from < GPT && to > GPT)
5592         move_gap_both (from, from_byte);
5593       if (coding->type == coding_type_undecided)
5594         {
5595           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5596           if (coding->type == coding_type_undecided)
5597             {
5598               /* It seems that the text contains only ASCII, but we
5599                  should not leave it undecided because the deeper
5600                  decoding routine (decode_coding) tries to detect the
5601                  encodings again in vain.  */
5602               coding->type = coding_type_emacs_mule;
5603               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5604               /* As emacs-mule decoder will handle composition, we
5605                  need this setting to allocate coding->cmp_data
5606                  later.  */
5607               coding->composing = COMPOSITION_NO;
5608             }
5609         }
5610       if (coding->eol_type == CODING_EOL_UNDECIDED
5611           && coding->type != coding_type_ccl)
5612         {
5613           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5614           if (coding->eol_type == CODING_EOL_UNDECIDED)
5615             coding->eol_type = CODING_EOL_LF;
5616           /* We had better recover the original eol format if we
5617              encounter an inconsistent eol format while decoding.  */
5618           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5619         }
5620     }
5621
5622   /* Now we convert the text.  */
5623
5624   /* For encoding, we must process pre-write-conversion in advance.  */
5625   if (! inhibit_pre_post_conversion
5626       && encodep
5627       && SYMBOLP (coding->pre_write_conversion)
5628       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5629     {
5630       /* The function in pre-write-conversion may put a new text in a
5631          new buffer.  */
5632       struct buffer *prev = current_buffer;
5633       Lisp_Object new;
5634
5635       record_unwind_protect (code_convert_region_unwind,
5636                              Fcons (Vlast_coding_system_used, Qnil));
5637       /* We should not call any more pre-write/post-read-conversion
5638          functions while this pre-write-conversion is running.  */
5639       inhibit_pre_post_conversion = 1;
5640       call2 (coding->pre_write_conversion,
5641              make_number (from), make_number (to));
5642       inhibit_pre_post_conversion = 0;
5643       /* Discard the unwind protect.  */
5644       specpdl_ptr--;
5645       /* If the function left us in another buffer, replace FROM..TO with LEN characters from it and kill it.  */
5646       if (current_buffer != prev)
5647         {
5648           len = ZV - BEGV;
5649           new = Fcurrent_buffer ();
5650           set_buffer_internal_1 (prev);
5651           del_range_2 (from, from_byte, to, to_byte, 0);
5652           TEMP_SET_PT_BOTH (from, from_byte);
5653           insert_from_buffer (XBUFFER (new), 1, len, 0);
5654           Fkill_buffer (new);
5655           if (orig_point >= to)
5656             orig_point += len - orig_len;
5657           else if (orig_point > from)
5658             orig_point = from;
5659           orig_len = len;
5660           to = from + len;
5661           from_byte = CHAR_TO_BYTE (from);
5662           to_byte = CHAR_TO_BYTE (to);
5663           len_byte = to_byte - from_byte;
5664           TEMP_SET_PT_BOTH (from, from_byte);
5665         }
5666     }
5667
5668   if (replace)
5669     {
5670       if (! EQ (current_buffer->undo_list, Qt))
5671         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5672       else
5673         {
5674           nchars_del = to - from;
5675           nbytes_del = to_byte - from_byte;
5676         }
5677     }
5678
5679   if (coding->composing != COMPOSITION_DISABLED)
5680     {
5681       if (encodep)
5682         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5683       else
5684         coding_allocate_composition_data (coding, from);
5685     }
5686
5687   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5688      if we must run CCL program or there are compositions to
5689      encode.  */
5690   if (coding->type != coding_type_ccl
5691       && (! coding->cmp_data || coding->cmp_data->used == 0))
5692     {
5693       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5694
5695       if (from < GPT && GPT < to)
5696         move_gap_both (from, from_byte);
5697       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5698       if (from_byte == to_byte
5699           && (encodep || NILP (coding->post_read_conversion))
5700           && ! CODING_REQUIRE_FLUSHING (coding))
5701         {
5702           coding->produced = len_byte;
5703           coding->produced_char = len;
5704           if (!replace)
5705             /* We must record and adjust for this new text now.  */
5706             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5707           coding_free_composition_data (coding);
5708           return 0;
5709         }
5710       /* The skips are byte counts but also adjust the character positions below (presumably valid because the skipped bytes are ASCII).  */
5711       head_skip = from_byte - from_byte_orig;
5712       tail_skip = to_byte_orig - to_byte;
5713       total_skip = head_skip + tail_skip;
5714       from += head_skip;
5715       to -= tail_skip;
5716       len -= total_skip; len_byte -= total_skip;
5717     }
5718
5719   /* For conversion, we must put the gap before the text in addition to
5720      making the gap larger for efficient decoding.  The required gap
5721      size starts from 2000 which is the magic number used in make_gap.
5722      But, after one batch of conversion, it will be incremented if we
5723      find that it is not enough.  */
5724   require = 2000;
5725
5726   if (GAP_SIZE  < require)
5727     make_gap (require - GAP_SIZE);
5728   move_gap_both (from, from_byte);
5729
5730   inserted = inserted_byte = 0;
5731   /* Account the source text as part of the gap; its bytes stay at the gap's end (see the diagrams in the loop below).  */
5732   GAP_SIZE += len_byte;
5733   ZV -= len;
5734   Z -= len;
5735   ZV_BYTE -= len_byte;
5736   Z_BYTE -= len_byte;
5737
5738   if (GPT - BEG < BEG_UNCHANGED)
5739     BEG_UNCHANGED = GPT - BEG;
5740   if (Z - GPT < END_UNCHANGED)
5741     END_UNCHANGED = Z - GPT;
5742
5743   if (!encodep && coding->src_multibyte)
5744     {
5745       /* Decoding routines expects that the source text is unibyte.
5746          We must convert 8-bit characters of multibyte form to
5747          unibyte.  */
5748       int len_byte_orig = len_byte;
5749       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5750       if (len_byte < len_byte_orig)
5751         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5752                     len_byte);
5753       coding->src_multibyte = 0;
5754     }
5755
5756   for (;;)
5757     {
5758       int result;
5759
5760       /* The buffer memory is now:
5761          +--------+converted-text+---------+-------original-text-------+---+
5762          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5763                   |<---------------------- GAP ----------------------->|  */
5764       src = GAP_END_ADDR - len_byte;
5765       dst = GPT_ADDR + inserted_byte;
5766
5767       if (encodep)
5768         result = encode_coding (coding, src, dst, len_byte, 0);
5769       else
5770         {
5771           if (coding->composing != COMPOSITION_DISABLED)
5772             coding->cmp_data->char_offset = from + inserted;
5773           result = decode_coding (coding, src, dst, len_byte, 0);
5774         }
5775
5776       /* The buffer memory is now:
5777          +--------+-------converted-text----+--+------original-text----+---+
5778          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5779                   |<---------------------- GAP ----------------------->|  */
5780
5781       inserted += coding->produced_char;
5782       inserted_byte += coding->produced;
5783       len_byte -= coding->consumed;
5784
5785       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5786         {
5787           coding_allocate_composition_data (coding, from + inserted);
5788           continue;
5789         }
5790
5791       src += coding->consumed;
5792       dst += coding->produced;
5793
5794       if (result == CODING_FINISH_NORMAL)
5795         {
5796           src += len_byte;
5797           break;
5798         }
5799       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5800         {
5801           unsigned char *pend = dst, *p = pend - inserted_byte;
5802           Lisp_Object eol_type;
5803
5804           /* Encode LFs back to the original eol format (CR or CRLF).  */
5805           if (coding->eol_type == CODING_EOL_CR)
5806             {
5807               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5808             }
5809           else
5810             {
5811               int count = 0;
5812
5813               while (p < pend) if (*p++ == '\n') count++;
5814               if (src - dst < count)
5815                 {
5816                   /* We don't have sufficient room for encoding LFs
5817                      back to CRLF.  We must record converted and
5818                      not-yet-converted text back to the buffer
5819                      content, enlarge the gap, then record them out of
5820                      the buffer contents again.  */
5821                   int add = len_byte + inserted_byte;
5822
5823                   GAP_SIZE -= add;
5824                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5825                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5826                   make_gap (count - GAP_SIZE);
5827                   GAP_SIZE += add;
5828                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5829                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5830                   /* Don't forget to update SRC, DST, and PEND.  */
5831                   src = GAP_END_ADDR - len_byte;
5832                   dst = GPT_ADDR + inserted_byte;
5833                   pend = dst;
5834                 }
5835               inserted += count;
5836               inserted_byte += count;
5837               coding->produced += count;
5838               p = dst = pend + count;
5839               while (count)
5840                 {
5841                   *--p = *--pend;
5842                   if (*p == '\n') count--, *--p = '\r';
5843                 }
5844             }
5845
5846           /* Suppress eol-format conversion in the further conversion.  */
5847           coding->eol_type = CODING_EOL_LF;
5848
5849           /* Set the coding system symbol to that for Unix-like EOL.  */
5850           eol_type = Fget (saved_coding_symbol, Qeol_type);
5851           if (VECTORP (eol_type)
5852               && XVECTOR (eol_type)->size == 3
5853               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5854             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5855           else
5856             coding->symbol = saved_coding_symbol;
5857
5858           continue;
5859         }
5860       if (len_byte <= 0)
5861         {
5862           if (coding->type != coding_type_ccl
5863               || coding->mode & CODING_MODE_LAST_BLOCK)
5864             break;
5865           coding->mode |= CODING_MODE_LAST_BLOCK;
5866           continue;
5867         }
5868       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5869         {
5870           /* The source text ends in invalid codes.  Let's just
5871              make them valid buffer contents, and finish conversion.  */
5872           if (multibyte_p)
5873             {
5874               unsigned char *start = dst;
5875
5876               inserted += len_byte;
5877               while (len_byte--)
5878                 {
5879                   int c = *src++;
5880                   dst += CHAR_STRING (c, dst);
5881                 }
5882
5883               inserted_byte += dst - start;
5884             }
5885           else
5886             {
5887               inserted += len_byte;
5888               inserted_byte += len_byte;
5889               while (len_byte--)
5890                 *dst++ = *src++;
5891             }
5892           break;
5893         }
5894       if (result == CODING_FINISH_INTERRUPT)
5895         {
5896           /* The conversion procedure was interrupted by a user.  */
5897           break;
5898         }
5899       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5900       if (coding->consumed < 1)
5901         {
5902           /* It's quite strange to require more memory without
5903              consuming any bytes.  Perhaps CCL program bug.  */
5904           break;
5905         }
5906       if (first)
5907         {
5908           /* We have just done the first batch of conversion which was
5909              stopped because of insufficient gap.  Let's reconsider the
5910              required gap size (i.e. SRC - DST) now.
5911
5912              We have converted ORIG bytes (== coding->consumed) into
5913              NEW bytes (coding->produced).  To convert the remaining
5914              LEN bytes, we may need REQUIRE bytes of gap, where:
5915                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5916                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5917              Here, we are sure that NEW >= ORIG.  */
5918
5919           if (coding->produced <= coding->consumed)
5920             {
5921               /* This happens because of CCL-based coding system with
5922                  eol-type CRLF.  */
5923               require = 0;
5924             }
5925           else
5926             {
5927               float ratio = coding->produced - coding->consumed;
5928               ratio /= coding->consumed;
5929               require = len_byte * ratio;
5930             }
5931           first = 0;
5932         }
5933       if ((src - dst) < (require + 2000))
5934         {
5935           /* See the comment above the previous call of make_gap.  */
5936           int add = len_byte + inserted_byte;
5937
5938           GAP_SIZE -= add;
5939           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5940           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5941           make_gap (require + 2000);
5942           GAP_SIZE += add;
5943           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5944           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5945         }
5946     }
5947   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5948
5949   if (encodep && coding->dst_multibyte)
5950     {
5951       /* The output is unibyte.  We must convert 8-bit characters to
5952          multibyte form.  */
5953       if (inserted_byte * 2 > GAP_SIZE)
5954         {
5955           GAP_SIZE -= inserted_byte;
5956           ZV += inserted_byte; Z += inserted_byte;
5957           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5958           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5959           make_gap (inserted_byte - GAP_SIZE);
5960           GAP_SIZE += inserted_byte;
5961           ZV -= inserted_byte; Z -= inserted_byte;
5962           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5963           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5964         }
5965       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5966     }
5967
5968   /* If we shrank the conversion area, adjust it now.  */
5969   if (total_skip > 0)
5970     {
5971       if (tail_skip > 0)
5972         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5973       inserted += total_skip; inserted_byte += total_skip;
5974       GAP_SIZE += total_skip;
5975       GPT -= head_skip; GPT_BYTE -= head_skip;
5976       ZV -= total_skip; ZV_BYTE -= total_skip;
5977       Z -= total_skip; Z_BYTE -= total_skip;
5978       from -= head_skip; from_byte -= head_skip;
5979       to += tail_skip; to_byte += tail_skip;
5980     }
5981
5982   prev_Z = Z;
5983   if (! EQ (current_buffer->undo_list, Qt))
5984     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5985   else
5986     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5987                                  inserted, inserted_byte);
5988   inserted = Z - prev_Z;
5989
5990   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5991     coding_restore_composition (coding, Fcurrent_buffer ());
5992   coding_free_composition_data (coding);
5993
5994   if (! inhibit_pre_post_conversion
5995       && ! encodep && ! NILP (coding->post_read_conversion))
5996     {
5997       Lisp_Object val;
5998       Lisp_Object saved_coding_system;
5999
6000       if (from != PT)
6001         TEMP_SET_PT_BOTH (from, from_byte);
6002       prev_Z = Z;
6003       record_unwind_protect (code_convert_region_unwind,
6004                              Fcons (Vlast_coding_system_used, Qnil));
6005       saved_coding_system = Vlast_coding_system_used;
6006       Vlast_coding_system_used = coding->symbol;
6007       /* We should not call any more pre-write/post-read-conversion
6008          functions while this post-read-conversion is running.  */
6009       inhibit_pre_post_conversion = 1;
6010       val = call1 (coding->post_read_conversion, make_number (inserted));
6011       inhibit_pre_post_conversion = 0;
6012       coding->symbol = Vlast_coding_system_used;
6013       Vlast_coding_system_used = saved_coding_system;
6014       /* Discard the unwind protect.  */
6015       specpdl_ptr--;
6016       CHECK_NUMBER (val);
6017       inserted += Z - prev_Z;
6018     }
6019
6020   if (orig_point >= from)
6021     {
6022       if (orig_point >= from + orig_len)
6023         orig_point += inserted - orig_len;
6024       else
6025         orig_point = from;
6026       TEMP_SET_PT (orig_point);
6027     }
6028
6029   if (replace)
6030     {
6031       signal_after_change (from, to - from, inserted);
6032       update_compositions (from, from + inserted, CHECK_BORDER);
6033     }
6034
6035   {
6036     coding->consumed = to_byte - from_byte;
6037     coding->consumed_char = to - from;
6038     coding->produced = inserted_byte;
6039     coding->produced_char = inserted;
6040   }
6041
6042   return 0;
6043 }
6044
6045 /* Name (or base name) of work buffer for code conversion.
        set_conversion_work_buffer passes it to Fget_buffer_create, and
        also to Fgenerate_new_buffer_name when the buffer of exactly
        this name is already the current buffer.  */
6046 static Lisp_Object Vcode_conversion_workbuf_name;
6047
6048 /* Set the current buffer to the working buffer prepared for
6049    code-conversion.  MULTIBYTE specifies the multibyteness of the
6050    buffer.  Return the buffer we set if it must be killed after use.
6051    Otherwise return Qnil.  */
6052
6053 static Lisp_Object
6054 set_conversion_work_buffer (multibyte)
6055      int multibyte;
6056 {
6057   Lisp_Object buffer, buffer_to_kill;
6058   struct buffer *buf;
6059
6060   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6061   buf = XBUFFER (buffer);
6062   if (buf == current_buffer)
6063     {
6064       /* As we are already in the work buffer, we must generate a new
6065          buffer for the work.  */
6066       Lisp_Object name;
6067
6068       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6069       buffer = buffer_to_kill = Fget_buffer_create (name);
6070       buf = XBUFFER (buffer);
6071     }
6072   else
6073     buffer_to_kill = Qnil;
6074
6075   delete_all_overlays (buf);
6076   buf->directory = current_buffer->directory;
6077   buf->read_only = Qnil;
6078   buf->filename = Qnil;
6079   buf->undo_list = Qt;
6080   eassert (buf->overlays_before == NULL);
6081   eassert (buf->overlays_after == NULL);
6082   set_buffer_internal (buf);
6083   if (BEG != BEGV || Z != ZV)
6084     Fwiden ();
6085   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
6086   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6087   return buffer_to_kill;
6088 }
6089
6090 Lisp_Object
6091 run_pre_post_conversion_on_str (str, coding, encodep)
6092      Lisp_Object str;
6093      struct coding_system *coding;
6094      int encodep;
6095 {
6096   int count = SPECPDL_INDEX ();
6097   struct gcpro gcpro1, gcpro2;
6098   int multibyte = STRING_MULTIBYTE (str);
6099   Lisp_Object old_deactivate_mark;
6100   Lisp_Object buffer_to_kill;
6101   Lisp_Object unwind_arg;
6102
6103   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6104   /* It is not crucial to specbind this.  */
6105   old_deactivate_mark = Vdeactivate_mark;
6106   GCPRO2 (str, old_deactivate_mark);
6107
6108   /* We must insert the contents of STR as is without
6109      unibyte<->multibyte conversion.  For that, we adjust the
6110      multibyteness of the working buffer to that of STR.  */
6111   buffer_to_kill = set_conversion_work_buffer (multibyte);
6112   if (NILP (buffer_to_kill))
6113     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6114   else
6115     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6116   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6117
6118   insert_from_string (str, 0, 0,
6119                       SCHARS (str), SBYTES (str), 0);
6120   UNGCPRO;
6121   inhibit_pre_post_conversion = 1;
6122   if (encodep)
6123     {
6124       struct buffer *prev = current_buffer;
6125
6126       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6127       if (prev != current_buffer)
6128         /* We must kill the current buffer too.  */
6129         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6130     }
6131   else
6132     {
6133       Vlast_coding_system_used = coding->symbol;
6134       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6135       call1 (coding->post_read_conversion, make_number (Z - BEG));
6136       coding->symbol = Vlast_coding_system_used;
6137     }
6138   inhibit_pre_post_conversion = 0;
6139   Vdeactivate_mark = old_deactivate_mark;
6140   str = make_buffer_string (BEG, Z, 1);
6141   return unbind_to (count, str);
6142 }
6143
6144
6145 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6146    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6147    is intended that this function is called from encode_terminal_code,
6148    the pre-write-conversion function is run by safe_call and thus
6149    "Error during redisplay: ..." is logged when an error occurs.
6150
6151    Store the resulting text in *STR and set CODING->produced_char and
6152    CODING->produced to the number of characters and bytes
6153    respectively.  If the size of *STR is too small, enlarge it by
6154    xrealloc and update *STR and *SIZE.  */
6155
6156 void
6157 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6158      unsigned char **str;
6159      int *size, nchars, nbytes;
6160      struct coding_system *coding;
6161 {
6162   struct gcpro gcpro1, gcpro2;
6163   struct buffer *cur = current_buffer;
6164   struct buffer *prev;
6165   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6166   Lisp_Object args[3];
6167   Lisp_Object buffer_to_kill;
6168
6169   /* It is not crucial to specbind this.  */
6170   old_deactivate_mark = Vdeactivate_mark;
6171   old_last_coding_system_used = Vlast_coding_system_used;
6172   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6173
6174   /* We must insert the contents of STR as is without
6175      unibyte<->multibyte conversion.  For that, we adjust the
6176      multibyteness of the working buffer to that of STR.  */
6177   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6178   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6179   UNGCPRO;
6180   inhibit_pre_post_conversion = 1;
6181   prev = current_buffer;
6182   args[0] = coding->pre_write_conversion;
6183   args[1] = make_number (BEG);
6184   args[2] = make_number (Z);
6185   safe_call (3, args);
6186   inhibit_pre_post_conversion = 0;
6187   Vdeactivate_mark = old_deactivate_mark;
6188   Vlast_coding_system_used = old_last_coding_system_used;
6189   coding->produced_char = Z - BEG;
6190   coding->produced = Z_BYTE - BEG_BYTE;
6191   if (coding->produced > *size)
6192     {
6193       *size = coding->produced;
6194       *str = xrealloc (*str, *size);
6195     }
6196   if (BEG < GPT && GPT < Z)
6197     move_gap (BEG);
6198   bcopy (BEG_ADDR, *str, coding->produced);
6199   coding->src_multibyte
6200     = ! NILP (current_buffer->enable_multibyte_characters);
6201   if (prev != current_buffer)
6202     Fkill_buffer (Fcurrent_buffer ());
6203   set_buffer_internal (cur);
6204   if (! NILP (buffer_to_kill))
6205     Fkill_buffer (buffer_to_kill);
6206 }
6207
6208
/* Decode the contents of string STR according to CODING and return
   the resulting string.

   A multibyte STR is first converted with Fstring_as_unibyte, since
   the decoding routines expect unibyte source text.  If CODING
   requires detection, the text type and/or the end-of-line format
   are detected from the bytes of STR first.

   If nothing needs decoding and CODING has no post-read-conversion
   function (a symbol that is fbound), STR is returned as is when
   NOCOPY is nonzero and a copy of it otherwise; NOCOPY is forced to 1
   whenever STR has been replaced by a converted string on the way.

   Otherwise the text is decoded into a conversion buffer, a new
   string is built from it, compositions are restored, and the
   post-read-conversion function, if any, is run on the result via
   run_pre_post_conversion_on_str.

   CODING->consumed, CODING->consumed_char, CODING->produced and
   CODING->produced_char are set to the totals for STR.  */
6209 Lisp_Object
6210 decode_coding_string (str, coding, nocopy)
6211      Lisp_Object str;
6212      struct coding_system *coding;
6213      int nocopy;
6214 {
6215   int len;
6216   struct conversion_buffer buf;
6217   int from, to_byte;
6218   Lisp_Object saved_coding_symbol;
6219   int result;
6220   int require_decoding;
       /* Number of bytes skipped at the head and the tail of STR by
          SHRINK_CONVERSION_REGION; they are copied to the result
          unchanged.  */
6221   int shrinked_bytes = 0;
6222   Lisp_Object newstr;
       /* Running totals over all calls of decode_coding in the loop
          below.  */
6223   int consumed, consumed_char, produced, produced_char;
6224
6225   from = 0;
6226   to_byte = SBYTES (str);
6227
6228   saved_coding_symbol = coding->symbol;
6229   coding->src_multibyte = STRING_MULTIBYTE (str);
6230   coding->dst_multibyte = 1;
6231   coding->heading_ascii = 0;
6232
6233   if (CODING_REQUIRE_DETECTION (coding))
6234     {
6235       /* See the comments in code_convert_region.  */
6236       if (coding->type == coding_type_undecided)
6237         {
6238           detect_coding (coding, SDATA (str), to_byte);
6239           if (coding->type == coding_type_undecided)
6240             {
6241               coding->type = coding_type_emacs_mule;
6242               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6243               /* As emacs-mule decoder will handle composition, we
6244                  need this setting to allocate coding->cmp_data
6245                  later.  */
6246               coding->composing = COMPOSITION_NO;
6247             }
6248         }
6249       if (coding->eol_type == CODING_EOL_UNDECIDED
6250           && coding->type != coding_type_ccl)
6251         {
6252           saved_coding_symbol = coding->symbol;
6253           detect_eol (coding, SDATA (str), to_byte);
6254           if (coding->eol_type == CODING_EOL_UNDECIDED)
6255             coding->eol_type = CODING_EOL_LF;
6256           /* We had better recover the original eol format if we
6257              encounter an inconsistent eol format while decoding.  */
6258           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6259         }
6260     }
6261
       /* For these two types the result is built with
          make_uninit_string below instead of
          make_uninit_multibyte_string.  */
6262   if (coding->type == coding_type_no_conversion
6263       || coding->type == coding_type_raw_text)
6264     coding->dst_multibyte = 0;
6265
6266   require_decoding = CODING_REQUIRE_DECODING (coding);
6267
6268   if (STRING_MULTIBYTE (str))
6269     {
6270       /* Decoding routines expect the source text to be unibyte.  */
6271       str = Fstring_as_unibyte (str);
6272       to_byte = SBYTES (str);
6273       nocopy = 1;
6274       coding->src_multibyte = 0;
6275     }
6276
6277   /* Try to skip the heading and tailing ASCIIs.  */
6278   if (require_decoding && coding->type != coding_type_ccl)
6279     {
6280       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6281                                 0);
           /* The whole of STR was skipped, so nothing is left to
              decode.  */
6282       if (from == to_byte)
6283         require_decoding = 0;
6284       shrinked_bytes = from + (SBYTES (str) - to_byte);
6285     }
6286
       /* Fast path: no decoding is needed and there is no
          post-read-conversion function to run.  */
6287   if (!require_decoding
6288       && !(SYMBOLP (coding->post_read_conversion)
6289            && !NILP (Ffboundp (coding->post_read_conversion))))
6290     {
6291       coding->consumed = SBYTES (str);
6292       coding->consumed_char = SCHARS (str);
6293       if (coding->dst_multibyte)
6294         {
6295           str = Fstring_as_multibyte (str);
6296           nocopy = 1;
6297         }
6298       coding->produced = SBYTES (str);
6299       coding->produced_char = SCHARS (str);
6300       return (nocopy ? str : Fcopy_sequence (str));
6301     }
6302
6303   if (coding->composing != COMPOSITION_DISABLED)
6304     coding_allocate_composition_data (coding, from);
6305   len = decoding_buffer_size (coding, to_byte - from);
6306   allocate_conversion_buffer (buf, len);
6307
       /* Decode step by step, accumulating the totals.  Each pass
          continues after the bytes already consumed and appends after
          the bytes already produced; the reason decode_coding stopped
          decides whether we are done or must first provide more
          composition data, a larger buffer, or EOL recovery.  */
6308   consumed = consumed_char = produced = produced_char = 0;
6309   while (1)
6310     {
6311       result = decode_coding (coding, SDATA (str) + from + consumed,
6312                               buf.data + produced, to_byte - from - consumed,
6313                               buf.size - produced);
6314       consumed += coding->consumed;
6315       consumed_char += coding->consumed_char;
6316       produced += coding->produced;
6317       produced_char += coding->produced_char;
6318       if (result == CODING_FINISH_NORMAL
6319           || result == CODING_FINISH_INTERRUPT
6320           || (result == CODING_FINISH_INSUFFICIENT_SRC
6321               && coding->consumed == 0))
6322         break;
6323       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6324         coding_allocate_composition_data (coding, from + produced_char);
6325       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6326         extend_conversion_buffer (&buf);
6327       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6328         {
6329           Lisp_Object eol_type;
6330
6331           /* Recover the original EOL format.  */
6332           if (coding->eol_type == CODING_EOL_CR)
6333             {
6334               unsigned char *p;
6335               for (p = buf.data; p < buf.data + produced; p++)
6336                 if (*p == '\n') *p = '\r';
6337             }
6338           else if (coding->eol_type == CODING_EOL_CRLF)
6339             {
6340               int num_eol = 0;
6341               unsigned char *p0, *p1;
                   /* Count the LFs produced so far; each one needs
                      one more byte for the CR.  */
6342               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6343                 if (*p0 == '\n') num_eol++;
6344               if (produced + num_eol >= buf.size)
6345                 extend_conversion_buffer (&buf);
                   /* Expand in place, copying backward from the end
                      and putting a CR in front of every LF.  */
6346               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6347                 {
6348                   *--p1 = *--p0;
6349                   if (*p0 == '\n') *--p1 = '\r';
6350                 }
6351               produced += num_eol;
6352               produced_char += num_eol;
6353             }
6354           /* Suppress eol-format conversion in the further conversion.  */
6355           coding->eol_type = CODING_EOL_LF;
6356
6357           /* Set the coding system symbol to that for Unix-like EOL.  */
6358           eol_type = Fget (saved_coding_symbol, Qeol_type);
6359           if (VECTORP (eol_type)
6360               && XVECTOR (eol_type)->size == 3
6361               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6362             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6363           else
6364             coding->symbol = saved_coding_symbol;
6365
6366
6367         }
6368     }
6369
6370   coding->consumed = consumed;
6371   coding->consumed_char = consumed_char;
6372   coding->produced = produced;
6373   coding->produced_char = produced_char;
6374
       /* Assemble the result: the skipped head of STR, the decoded
          text, then the skipped tail of STR.  */
6375   if (coding->dst_multibyte)
6376     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6377                                            produced + shrinked_bytes);
6378   else
6379     newstr = make_uninit_string (produced + shrinked_bytes);
6380   if (from > 0)
6381     STRING_COPYIN (newstr, 0, SDATA (str), from);
6382   STRING_COPYIN (newstr, from, buf.data, produced);
6383   if (shrinked_bytes > from)
6384     STRING_COPYIN (newstr, from + produced,
6385                    SDATA (str) + to_byte,
6386                    shrinked_bytes - from);
6387   free_conversion_buffer (&buf);
6388
       /* Account for the skipped head and tail; every skipped byte is
          counted as one character.  */
6389   coding->consumed += shrinked_bytes;
6390   coding->consumed_char += shrinked_bytes;
6391   coding->produced += shrinked_bytes;
6392   coding->produced_char += shrinked_bytes;
6393
6394   if (coding->cmp_data && coding->cmp_data->used)
6395     coding_restore_composition (coding, newstr);
6396   coding_free_composition_data (coding);
6397
6398   if (SYMBOLP (coding->post_read_conversion)
6399       && !NILP (Ffboundp (coding->post_read_conversion)))
6400     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6401
6402   return newstr;
6403 }
6404
/* Encode the contents of string STR according to CODING and return
   the resulting string.

   If CODING has a pre-write-conversion function (a symbol that is
   fbound), it is first run on STR via run_pre_post_conversion_on_str,
   and the encoding below works on the string it returns.

   If no encoding is needed, STR is returned as is when NOCOPY is
   nonzero (a multibyte STR is set to unibyte in place) and a copy of
   it otherwise.  Otherwise the text is encoded into a conversion
   buffer and a new string is built with make_uninit_string.

   CODING->src_multibyte and CODING->dst_multibyte are set here;
   CODING->consumed, CODING->consumed_char, CODING->produced and
   CODING->produced_char are set from the conversion.  */
6405 Lisp_Object
6406 encode_coding_string (str, coding, nocopy)
6407      Lisp_Object str;
6408      struct coding_system *coding;
6409      int nocopy;
6410 {
6411   int len;
6412   struct conversion_buffer buf;
6413   int from, to, to_byte;
6414   int result;
       /* Number of bytes skipped at the head and the tail of STR by
          SHRINK_CONVERSION_REGION; they are copied to the result
          unchanged.  */
6415   int shrinked_bytes = 0;
6416   Lisp_Object newstr;
       /* Running totals over all calls of encode_coding in the loop
          below.  */
6417   int consumed, consumed_char, produced, produced_char;
6418
6419   if (SYMBOLP (coding->pre_write_conversion)
6420       && !NILP (Ffboundp (coding->pre_write_conversion)))
6421     {
6422       str = run_pre_post_conversion_on_str (str, coding, 1);
6423       /* As STR is just newly generated, we don't have to copy it
6424          anymore.  */
6425       nocopy = 1;
6426     }
6427
6428   from = 0;
6429   to = SCHARS (str);
6430   to_byte = SBYTES (str);
6431
6432   /* Encoding routines determine the multibyteness of the source text
6433      by coding->src_multibyte.  */
6434   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6435   coding->dst_multibyte = 0;
6436   if (! CODING_REQUIRE_ENCODING (coding))
6437     goto no_need_of_encoding;
6438
6439   if (coding->composing != COMPOSITION_DISABLED)
6440     coding_save_composition (coding, from, to, str);
6441
6442   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6443      if we must run CCL program or there are compositions to
6444      encode.  */
6445   coding->heading_ascii = 0;
6446   if (coding->type != coding_type_ccl
6447       && (! coding->cmp_data || coding->cmp_data->used == 0))
6448     {
6449       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6450                                 1);
           /* The whole of STR was skipped, so nothing is left to
              encode.  */
6451       if (from == to_byte)
6452         {
6453           coding_free_composition_data (coding);
6454           goto no_need_of_encoding;
6455         }
6456       shrinked_bytes = from + (SBYTES (str) - to_byte);
6457     }
6458
6459   len = encoding_buffer_size (coding, to_byte - from);
6460   allocate_conversion_buffer (buf, len);
6461
       /* Encode step by step, accumulating the totals.  Each pass
          continues after the bytes already consumed and appends after
          the bytes already produced.  */
6462   consumed = consumed_char = produced = produced_char = 0;
6463   while (1)
6464     {
6465       result = encode_coding (coding, SDATA (str) + from + consumed,
6466                               buf.data + produced, to_byte - from - consumed,
6467                               buf.size - produced);
6468       consumed += coding->consumed;
6469       consumed_char += coding->consumed_char;
6470       produced += coding->produced;
6471       produced_char += coding->produced_char;
6472       if (result == CODING_FINISH_NORMAL
6473           || result == CODING_FINISH_INTERRUPT
6474           || (result == CODING_FINISH_INSUFFICIENT_SRC
6475               && coding->consumed == 0))
6476         break;
6477       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6478       extend_conversion_buffer (&buf);
6479     }
6480
6481   coding->consumed = consumed;
6482   coding->consumed_char = consumed_char;
6483   coding->produced = produced;
6484   coding->produced_char = produced_char;
6485
       /* Assemble the result: the skipped head of STR, the encoded
          text, then the skipped tail of STR.  */
6486   newstr = make_uninit_string (produced + shrinked_bytes);
6487   if (from > 0)
6488     STRING_COPYIN (newstr, 0, SDATA (str), from);
6489   STRING_COPYIN (newstr, from, buf.data, produced);
6490   if (shrinked_bytes > from)
6491     STRING_COPYIN (newstr, from + produced,
6492                    SDATA (str) + to_byte,
6493                    shrinked_bytes - from);
6494
6495   free_conversion_buffer (&buf);
6496   coding_free_composition_data (coding);
6497
6498   return newstr;
6499
       /* Reached when CODING requires no encoding or when the whole of
          STR was skipped above: the bytes of STR are the result.  */
6500  no_need_of_encoding:
6501   coding->consumed = SBYTES (str);
6502   coding->consumed_char = SCHARS (str);
6503   if (STRING_MULTIBYTE (str))
6504     {
6505       if (nocopy)
6506         /* We are sure that STR doesn't contain a multibyte
6507            character.  */
6508         STRING_SET_UNIBYTE (str);
6509       else
6510         {
6511           str = Fstring_as_unibyte (str);
6512           nocopy = 1;
6513         }
6514     }
6515   coding->produced = SBYTES (str);
6516   coding->produced_char = SCHARS (str);
6517   return (nocopy ? str : Fcopy_sequence (str));
6518 }
6519
6520 \f
6521 #ifdef emacs
6522 /*** 8. Emacs Lisp library functions ***/
6523
6524 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6525        doc: /* Return t if OBJECT is nil or a coding-system.
6526 See the documentation of `make-coding-system' for information
6527 about coding-system objects.  */)
6528      (obj)
6529      Lisp_Object obj;
6530 {
6531   if (NILP (obj))
6532     return Qt;
6533   if (!SYMBOLP (obj))
6534     return Qnil;
6535   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6536     return Qt;
6537   /* Get coding-spec vector for OBJ.  */
6538   obj = Fget (obj, Qcoding_system);
6539   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6540           ? Qt : Qnil);
6541 }
6542
6543 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6544        Sread_non_nil_coding_system, 1, 1, 0,
6545        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6546      (prompt)
6547      Lisp_Object prompt;
6548 {
6549   Lisp_Object val;
6550   do
6551     {
6552       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6553                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6554     }
6555   while (SCHARS (val) == 0);
6556   return (Fintern (val, Qnil));
6557 }
6558
6559 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6560        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6561 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6562      (prompt, default_coding_system)
6563      Lisp_Object prompt, default_coding_system;
6564 {
6565   Lisp_Object val;
6566   if (SYMBOLP (default_coding_system))
6567     default_coding_system = SYMBOL_NAME (default_coding_system);
6568   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6569                           Qt, Qnil, Qcoding_system_history,
6570                           default_coding_system, Qnil);
6571   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6572 }
6573
6574 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6575        1, 1, 0,
6576        doc: /* Check validity of CODING-SYSTEM.
6577 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6578 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6579 The value of this property should be a vector of length 5.  */)
6580      (coding_system)
6581      Lisp_Object coding_system;
6582 {
6583   Lisp_Object define_form;
6584
6585   define_form = Fget (coding_system, Qcoding_system_define_form);
6586   if (! NILP (define_form))
6587     {
6588       Fput (coding_system, Qcoding_system_define_form, Qnil);
6589       safe_eval (define_form);
6590     }
6591   if (!NILP (Fcoding_system_p (coding_system)))
6592     return coding_system;
6593   xsignal1 (Qcoding_system_error, coding_system);
6594 }
6595 \f
6596 Lisp_Object
6597 detect_coding_system (src, src_bytes, highest, multibytep)
6598      const unsigned char *src;
6599      int src_bytes, highest;
6600      int multibytep;
6601 {
6602   int coding_mask, eol_type;
6603   Lisp_Object val, tmp;
6604   int dummy;
6605
6606   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6607   eol_type  = detect_eol_type (src, src_bytes, &dummy);
6608   if (eol_type == CODING_EOL_INCONSISTENT)
6609     eol_type = CODING_EOL_UNDECIDED;
6610
6611   if (!coding_mask)
6612     {
6613       val = Qundecided;
6614       if (eol_type != CODING_EOL_UNDECIDED)
6615         {
6616           Lisp_Object val2;
6617           val2 = Fget (Qundecided, Qeol_type);
6618           if (VECTORP (val2))
6619             val = XVECTOR (val2)->contents[eol_type];
6620         }
6621       return (highest ? val : Fcons (val, Qnil));
6622     }
6623
6624   /* At first, gather possible coding systems in VAL.  */
6625   val = Qnil;
6626   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6627     {
6628       Lisp_Object category_val, category_index;
6629
6630       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6631       category_val = Fsymbol_value (XCAR (tmp));
6632       if (!NILP (category_val)
6633           && NATNUMP (category_index)
6634           && (coding_mask & (1 << XFASTINT (category_index))))
6635         {
6636           val = Fcons (category_val, val);
6637           if (highest)
6638             break;
6639         }
6640     }
6641   if (!highest)
6642     val = Fnreverse (val);
6643
6644   /* Then, replace the elements with subsidiary coding systems.  */
6645   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6646     {
6647       if (eol_type != CODING_EOL_UNDECIDED
6648           && eol_type != CODING_EOL_INCONSISTENT)
6649         {
6650           Lisp_Object eol;
6651           eol = Fget (XCAR (tmp), Qeol_type);
6652           if (VECTORP (eol))
6653             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6654         }
6655     }
6656   return (highest ? XCAR (val) : val);
6657 }
6658
6659 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6660        2, 3, 0,
6661        doc: /* Detect how the byte sequence in the region is encoded.
6662 Return a list of possible coding systems used on decoding a byte
6663 sequence containing the bytes in the region between START and END when
6664 the coding system `undecided' is specified.  The list is ordered by
6665 priority decided in the current language environment.
6666
6667 If only ASCII characters are found (except for such ISO-2022 control
6668 characters ISO-2022 as ESC), it returns a list of single element
6669 `undecided' or its subsidiary coding system according to a detected
6670 end-of-line format.
6671
6672 If optional argument HIGHEST is non-nil, return the coding system of
6673 highest priority.  */)
6674      (start, end, highest)
6675      Lisp_Object start, end, highest;
6676 {
6677   int from, to;
6678   int from_byte, to_byte;
6679   int include_anchor_byte = 0;
6680
6681   CHECK_NUMBER_COERCE_MARKER (start);
6682   CHECK_NUMBER_COERCE_MARKER (end);
6683
6684   validate_region (&start, &end);
6685   from = XINT (start), to = XINT (end);
6686   from_byte = CHAR_TO_BYTE (from);
6687   to_byte = CHAR_TO_BYTE (to);
6688
6689   if (from < GPT && to >= GPT)
6690     move_gap_both (to, to_byte);
6691   /* If we an anchor byte `\0' follows the region, we include it in
6692      the detecting source.  Then code detectors can handle the tailing
6693      byte sequence more accurately.
6694
6695      Fix me: This is not a perfect solution.  It is better that we
6696      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6697   */
6698   if (to == Z || (to == GPT && GAP_SIZE > 0))
6699     include_anchor_byte = 1;
6700   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6701                                to_byte - from_byte + include_anchor_byte,
6702                                !NILP (highest),
6703                                !NILP (current_buffer
6704                                       ->enable_multibyte_characters));
6705 }
6706
6707 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6708        1, 2, 0,
6709        doc: /* Detect how the byte sequence in STRING is encoded.
6710 Return a list of possible coding systems used on decoding a byte
6711 sequence containing the bytes in STRING when the coding system
6712 `undecided' is specified.  The list is ordered by priority decided in
6713 the current language environment.
6714
6715 If only ASCII characters are found (except for such ISO-2022 control
6716 characters ISO-2022 as ESC), it returns a list of single element
6717 `undecided' or its subsidiary coding system according to a detected
6718 end-of-line format.
6719
6720 If optional argument HIGHEST is non-nil, return the coding system of
6721 highest priority.  */)
6722      (string, highest)
6723      Lisp_Object string, highest;
6724 {
6725   CHECK_STRING (string);
6726
6727   return detect_coding_system (SDATA (string),
6728                                /* "+ 1" is to include the anchor byte
6729                                   `\0'.  With this, code detectors can
6730                                   handle the tailing bytes more
6731                                   accurately.  */
6732                                SBYTES (string) + 1,
6733                                !NILP (highest),
6734                                STRING_MULTIBYTE (string));
6735 }
6736
6737 /*  Subroutine for Ffind_coding_systems_region_internal.
6738
6739     Return a list of coding systems that safely encode the multibyte
6740     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6741     possible coding systems.  If it is nil, it means that we have not
6742     yet found any coding systems.
6743
6744     WORK_TABLE is a char-table whose elements are set to t once
6745     they have been looked up.
6746
6747     If a non-ASCII single byte char is found, set
6748     *single_byte_char_found to 1.  */
6749
6750 static Lisp_Object
6751 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6752      unsigned char *p, *pend;
6753      Lisp_Object safe_codings, work_table;
6754      int *single_byte_char_found;
6755 {
6756   int c, len;
6757   Lisp_Object val, ch;
6758   Lisp_Object prev, tail;
6759
6760   if (NILP (safe_codings))
6761     goto done_safe_codings;
6762   while (p < pend)
6763     {
6764       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6765       p += len;
6766       if (ASCII_BYTE_P (c))
6767         /* We can ignore ASCII characters here.  */
6768         continue;
6769       if (SINGLE_BYTE_CHAR_P (c))
6770         *single_byte_char_found = 1;
6771       /* Check the safe coding systems for C.  */
6772       ch = make_number (c);
6773       val = Faref (work_table, ch);
6774       if (EQ (val, Qt))
6775         /* This element was already checked.  Ignore it.  */
6776         continue;
6777       /* Remember that we checked this element.  */
6778       Faset (work_table, ch, Qt);
6779
6780       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6781         {
6782           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6783           int encodable;
6784
6785           elt = XCAR (tail);
6786           if (CONSP (XCDR (elt)))
6787             {
6788               /* This entry has this format now:
6789                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6790                           ACCEPT-LATIN-EXTRA ) */
6791               val = XCDR (elt);
6792               encodable = ! NILP (Faref (XCAR (val), ch));
6793               if (! encodable)
6794                 {
6795                   val = XCDR (val);
6796                   translation_table = XCAR (val);
6797                   hash_table = XCAR (XCDR (val));
6798                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6799                 }
6800             }
6801           else
6802             {
6803               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6804               encodable = ! NILP (Faref (XCDR (elt), ch));
6805               if (! encodable)
6806                 {
6807                   /* Transform the format to:
6808                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6809                        ACCEPT-LATIN-EXTRA )  */
6810                   val = Fget (XCAR (elt), Qcoding_system);
6811                   translation_table
6812                     = Fplist_get (AREF (val, 3),
6813                                   Qtranslation_table_for_encode);
6814                   if (SYMBOLP (translation_table))
6815                     translation_table = Fget (translation_table,
6816                                               Qtranslation_table);
6817                   hash_table
6818                     = (CHAR_TABLE_P (translation_table)
6819                        ? XCHAR_TABLE (translation_table)->extras[1]
6820                        : Qnil);
6821                   accept_latin_extra
6822                     = ((EQ (AREF (val, 0), make_number (2))
6823                         && VECTORP (AREF (val, 4)))
6824                        ? AREF (AREF (val, 4), 16)
6825                        : Qnil);
6826                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6827                                         translation_table, hash_table,
6828                                         accept_latin_extra));
6829                 }
6830             }
6831
6832           if (! encodable
6833               && ((CHAR_TABLE_P (translation_table)
6834                    && ! NILP (Faref (translation_table, ch)))
6835                   || (HASH_TABLE_P (hash_table)
6836                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6837                   || (SINGLE_BYTE_CHAR_P (c)
6838                       && ! NILP (accept_latin_extra)
6839                       && VECTORP (Vlatin_extra_code_table)
6840                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6841             encodable = 1;
6842           if (encodable)
6843             prev = tail;
6844           else
6845             {
6846               /* Exclude this coding system from SAFE_CODINGS.  */
6847               if (EQ (tail, safe_codings))
6848                 {
6849                   safe_codings = XCDR (safe_codings);
6850                   if (NILP (safe_codings))
6851                     goto done_safe_codings;
6852                 }
6853               else
6854                 XSETCDR (prev, XCDR (tail));
6855             }
6856         }
6857     }
6858
6859  done_safe_codings:
6860   /* If the above loop was terminated before P reaches PEND, it means
6861      SAFE_CODINGS was set to nil.  If we have not yet found an
6862      non-ASCII single-byte char, check it now.  */
6863   if (! *single_byte_char_found)
6864     while (p < pend)
6865       {
6866         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6867         p += len;
6868         if (! ASCII_BYTE_P (c)
6869             && SINGLE_BYTE_CHAR_P (c))
6870           {
6871             *single_byte_char_found = 1;
6872             break;
6873           }
6874       }
6875   return safe_codings;
6876 }
6877
6878 DEFUN ("find-coding-systems-region-internal",
6879        Ffind_coding_systems_region_internal,
6880        Sfind_coding_systems_region_internal, 2, 2, 0,
6881        doc: /* Internal use only.  */)
6882      (start, end)
6883      Lisp_Object start, end;
6884 {
6885   Lisp_Object work_table, safe_codings;
6886   int non_ascii_p = 0;
6887   int single_byte_char_found = 0;
6888   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6889
6890   if (STRINGP (start))
6891     {
6892       if (!STRING_MULTIBYTE (start))
6893         return Qt;
6894       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6895       p2 = p2end = p1end;
6896       if (SCHARS (start) != SBYTES (start))
6897         non_ascii_p = 1;
6898     }
6899   else
6900     {
6901       int from, to, stop;
6902
6903       CHECK_NUMBER_COERCE_MARKER (start);
6904       CHECK_NUMBER_COERCE_MARKER (end);
6905       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6906         args_out_of_range (start, end);
6907       if (NILP (current_buffer->enable_multibyte_characters))
6908         return Qt;
6909       from = CHAR_TO_BYTE (XINT (start));
6910       to = CHAR_TO_BYTE (XINT (end));
6911       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6912       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6913       if (stop == to)
6914         p2 = p2end = p1end;
6915       else
6916         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6917       if (XINT (end) - XINT (start) != to - from)
6918         non_ascii_p = 1;
6919     }
6920
6921   if (!non_ascii_p)
6922     {
6923       /* We are sure that the text contains no multibyte character.
6924          Check if it contains eight-bit-graphic.  */
6925       p = p1;
6926       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6927       if (p == p1end)
6928         {
6929           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6930           if (p == p2end)
6931             return Qt;
6932         }
6933     }
6934
6935   /* The text contains non-ASCII characters.  */
6936
6937   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6938   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6939
6940   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6941                                     &single_byte_char_found);
6942   if (p2 < p2end)
6943     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6944                                       &single_byte_char_found);
6945   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6946     safe_codings = Qt;
6947   else
6948     {
6949       /* Turn safe_codings to a list of coding systems... */
6950       Lisp_Object val;
6951
6952       if (single_byte_char_found)
6953         /* ... and append these for eight-bit chars.  */
6954         val = Fcons (Qraw_text,
6955                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6956       else
6957         /* ... and append generic coding systems.  */
6958         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6959
6960       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6961         val = Fcons (XCAR (XCAR (safe_codings)), val);
6962       safe_codings = val;
6963     }
6964
6965   return safe_codings;
6966 }
6967
6968
6969 /* Search from position POS for such characters that are unencodable
6970    accoding to SAFE_CHARS, and return a list of their positions.  P
6971    points where in the memory the character at POS exists.  Limit the
6972    search at PEND or when Nth unencodable characters are found.
6973
6974    If SAFE_CHARS is a char table, an element for an unencodable
6975    character is nil.
6976
6977    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6978
6979    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6980    eight-bit-graphic characters are unencodable.  */
6981
6982 static Lisp_Object
6983 unencodable_char_position (safe_chars, pos, p, pend, n)
6984      Lisp_Object safe_chars;
6985      int pos;
6986      unsigned char *p, *pend;
6987      int n;
6988 {
6989   Lisp_Object pos_list;
6990
6991   pos_list = Qnil;
6992   while (p < pend)
6993     {
6994       int len;
6995       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6996
6997       if (c >= 128
6998           && (CHAR_TABLE_P (safe_chars)
6999               ? NILP (CHAR_TABLE_REF (safe_chars, c))
7000               : (NILP (safe_chars) || c < 256)))
7001         {
7002           pos_list = Fcons (make_number (pos), pos_list);
7003           if (--n <= 0)
7004             break;
7005         }
7006       pos++;
7007       p += len;
7008     }
7009   return Fnreverse (pos_list);
7010 }
7011
7012
7013 DEFUN ("unencodable-char-position", Funencodable_char_position,
7014        Sunencodable_char_position, 3, 5, 0,
7015        doc: /*
7016 Return position of first un-encodable character in a region.
7017 START and END specfiy the region and CODING-SYSTEM specifies the
7018 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7019
7020 If optional 4th argument COUNT is non-nil, it specifies at most how
7021 many un-encodable characters to search.  In this case, the value is a
7022 list of positions.
7023
7024 If optional 5th argument STRING is non-nil, it is a string to search
7025 for un-encodable characters.  In that case, START and END are indexes
7026 to the string.  */)
7027      (start, end, coding_system, count, string)
7028      Lisp_Object start, end, coding_system, count, string;
7029 {
7030   int n;
7031   Lisp_Object safe_chars;
7032   struct coding_system coding;
7033   Lisp_Object positions;
7034   int from, to;
7035   unsigned char *p, *pend;
7036
7037   if (NILP (string))
7038     {
7039       validate_region (&start, &end);
7040       from = XINT (start);
7041       to = XINT (end);
7042       if (NILP (current_buffer->enable_multibyte_characters))
7043         return Qnil;
7044       p = CHAR_POS_ADDR (from);
7045       if (to == GPT)
7046         pend = GPT_ADDR;
7047       else
7048         pend = CHAR_POS_ADDR (to);
7049     }
7050   else
7051     {
7052       CHECK_STRING (string);
7053       CHECK_NATNUM (start);
7054       CHECK_NATNUM (end);
7055       from = XINT (start);
7056       to = XINT (end);
7057       if (from > to
7058           || to > SCHARS (string))
7059         args_out_of_range_3 (string, start, end);
7060       if (! STRING_MULTIBYTE (string))
7061         return Qnil;
7062       p = SDATA (string) + string_char_to_byte (string, from);
7063       pend = SDATA (string) + string_char_to_byte (string, to);
7064     }
7065
7066   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7067
7068   if (NILP (count))
7069     n = 1;
7070   else
7071     {
7072       CHECK_NATNUM (count);
7073       n = XINT (count);
7074     }
7075
7076   if (coding.type == coding_type_no_conversion
7077       || coding.type == coding_type_raw_text)
7078     return Qnil;
7079
7080   if (coding.type == coding_type_undecided)
7081     safe_chars = Qnil;
7082   else
7083     safe_chars = coding_safe_chars (coding_system);
7084
7085   if (STRINGP (string)
7086       || from >= GPT || to <= GPT)
7087     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7088   else
7089     {
7090       Lisp_Object args[2];
7091
7092       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7093       n -= XINT (Flength (args[0]));
7094       if (n <= 0)
7095         positions = args[0];
7096       else
7097         {
7098           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7099                                                pend, n);
7100           positions = Fappend (2, args);
7101         }
7102     }
7103
7104   return  (NILP (count) ? Fcar (positions) : positions);
7105 }
7106
7107
7108 Lisp_Object
7109 code_convert_region1 (start, end, coding_system, encodep)
7110      Lisp_Object start, end, coding_system;
7111      int encodep;
7112 {
7113   struct coding_system coding;
7114   int from, to;
7115
7116   CHECK_NUMBER_COERCE_MARKER (start);
7117   CHECK_NUMBER_COERCE_MARKER (end);
7118   CHECK_SYMBOL (coding_system);
7119
7120   validate_region (&start, &end);
7121   from = XFASTINT (start);
7122   to = XFASTINT (end);
7123
7124   if (NILP (coding_system))
7125     return make_number (to - from);
7126
7127   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7128     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7129
7130   coding.mode |= CODING_MODE_LAST_BLOCK;
7131   coding.src_multibyte = coding.dst_multibyte
7132     = !NILP (current_buffer->enable_multibyte_characters);
7133   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7134                        &coding, encodep, 1);
7135   Vlast_coding_system_used = coding.symbol;
7136   return make_number (coding.produced_char);
7137 }
7138
7139 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7140        3, 3, "r\nzCoding system: ",
7141        doc: /* Decode the current region from the specified coding system.
7142 When called from a program, takes three arguments:
7143 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7144 This function sets `last-coding-system-used' to the precise coding system
7145 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7146 not fully specified.)
7147 It returns the length of the decoded text.  */)
7148      (start, end, coding_system)
7149      Lisp_Object start, end, coding_system;
7150 {
7151   return code_convert_region1 (start, end, coding_system, 0);
7152 }
7153
7154 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7155        3, 3, "r\nzCoding system: ",
7156        doc: /* Encode the current region into the specified coding system.
7157 When called from a program, takes three arguments:
7158 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7159 This function sets `last-coding-system-used' to the precise coding system
7160 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7161 not fully specified.)
7162 It returns the length of the encoded text.  */)
7163      (start, end, coding_system)
7164      Lisp_Object start, end, coding_system;
7165 {
7166   return code_convert_region1 (start, end, coding_system, 1);
7167 }
7168
7169 Lisp_Object
7170 code_convert_string1 (string, coding_system, nocopy, encodep)
7171      Lisp_Object string, coding_system, nocopy;
7172      int encodep;
7173 {
7174   struct coding_system coding;
7175
7176   CHECK_STRING (string);
7177   CHECK_SYMBOL (coding_system);
7178
7179   if (NILP (coding_system))
7180     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7181
7182   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7183     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7184
7185   coding.mode |= CODING_MODE_LAST_BLOCK;
7186   string = (encodep
7187             ? encode_coding_string (string, &coding, !NILP (nocopy))
7188             : decode_coding_string (string, &coding, !NILP (nocopy)));
7189   Vlast_coding_system_used = coding.symbol;
7190
7191   return string;
7192 }
7193
7194 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7195        2, 3, 0,
7196        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7197 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7198 if the decoding operation is trivial.
7199 This function sets `last-coding-system-used' to the precise coding system
7200 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7201 not fully specified.)  */)
7202      (string, coding_system, nocopy)
7203      Lisp_Object string, coding_system, nocopy;
7204 {
7205   return code_convert_string1 (string, coding_system, nocopy, 0);
7206 }
7207
7208 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7209        2, 3, 0,
7210        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7211 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7212 if the encoding operation is trivial.
7213 This function sets `last-coding-system-used' to the precise coding system
7214 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7215 not fully specified.)  */)
7216      (string, coding_system, nocopy)
7217      Lisp_Object string, coding_system, nocopy;
7218 {
7219   return code_convert_string1 (string, coding_system, nocopy, 1);
7220 }
7221
7222 /* Encode or decode STRING according to CODING_SYSTEM.
7223    Do not set Vlast_coding_system_used.
7224
7225    This function is called only from macros DECODE_FILE and
7226    ENCODE_FILE, thus we ignore character composition.  */
7227
7228 Lisp_Object
7229 code_convert_string_norecord (string, coding_system, encodep)
7230      Lisp_Object string, coding_system;
7231      int encodep;
7232 {
7233   struct coding_system coding;
7234
7235   CHECK_STRING (string);
7236   CHECK_SYMBOL (coding_system);
7237
7238   if (NILP (coding_system))
7239     return string;
7240
7241   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7242     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7243
7244   coding.composing = COMPOSITION_DISABLED;
7245   coding.mode |= CODING_MODE_LAST_BLOCK;
7246   return (encodep
7247           ? encode_coding_string (string, &coding, 1)
7248           : decode_coding_string (string, &coding, 1));
7249 }
7250 \f
7251 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7252        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7253 Return the corresponding character.  */)
7254      (code)
7255      Lisp_Object code;
7256 {
7257   unsigned char c1, c2, s1, s2;
7258   Lisp_Object val;
7259
7260   CHECK_NUMBER (code);
7261   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7262   if (s1 == 0)
7263     {
7264       if (s2 < 0x80)
7265         XSETFASTINT (val, s2);
7266       else if (s2 >= 0xA0 || s2 <= 0xDF)
7267         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7268       else
7269         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7270     }
7271   else
7272     {
7273       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7274           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7275         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7276       DECODE_SJIS (s1, s2, c1, c2);
7277       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7278     }
7279   return val;
7280 }
7281
7282 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7283        doc: /* Encode a Japanese character CH to shift_jis encoding.
7284 Return the corresponding code in SJIS.  */)
7285      (ch)
7286      Lisp_Object ch;
7287 {
7288   int charset, c1, c2, s1, s2;
7289   Lisp_Object val;
7290
7291   CHECK_NUMBER (ch);
7292   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7293   if (charset == CHARSET_ASCII)
7294     {
7295       val = ch;
7296     }
7297   else if (charset == charset_jisx0208
7298            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7299     {
7300       ENCODE_SJIS (c1, c2, s1, s2);
7301       XSETFASTINT (val, (s1 << 8) | s2);
7302     }
7303   else if (charset == charset_katakana_jisx0201
7304            && c1 > 0x20 && c2 < 0xE0)
7305     {
7306       XSETFASTINT (val, c1 | 0x80);
7307     }
7308   else
7309     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7310   return val;
7311 }
7312
7313 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7314        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7315 Return the corresponding character.  */)
7316      (code)
7317      Lisp_Object code;
7318 {
7319   int charset;
7320   unsigned char b1, b2, c1, c2;
7321   Lisp_Object val;
7322
7323   CHECK_NUMBER (code);
7324   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7325   if (b1 == 0)
7326     {
7327       if (b2 >= 0x80)
7328         error ("Invalid BIG5 code: %x", XFASTINT (code));
7329       val = code;
7330     }
7331   else
7332     {
7333       if ((b1 < 0xA1 || b1 > 0xFE)
7334           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7335         error ("Invalid BIG5 code: %x", XFASTINT (code));
7336       DECODE_BIG5 (b1, b2, charset, c1, c2);
7337       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7338     }
7339   return val;
7340 }
7341
7342 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7343        doc: /* Encode the Big5 character CH to BIG5 coding system.
7344 Return the corresponding character code in Big5.  */)
7345      (ch)
7346      Lisp_Object ch;
7347 {
7348   int charset, c1, c2, b1, b2;
7349   Lisp_Object val;
7350
7351   CHECK_NUMBER (ch);
7352   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7353   if (charset == CHARSET_ASCII)
7354     {
7355       val = ch;
7356     }
7357   else if ((charset == charset_big5_1
7358             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7359            || (charset == charset_big5_2
7360                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7361     {
7362       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7363       XSETFASTINT (val, (b1 << 8) | b2);
7364     }
7365   else
7366     error ("Can't encode to Big5: %d", XFASTINT (ch));
7367   return val;
7368 }
7369 \f
7370 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7371        Sset_terminal_coding_system_internal, 1, 1, 0,
7372        doc: /* Internal use only.  */)
7373      (coding_system)
7374      Lisp_Object coding_system;
7375 {
7376   CHECK_SYMBOL (coding_system);
7377   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7378   /* We had better not send unsafe characters to terminal.  */
7379   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7380   /* Character composition should be disabled.  */
7381   terminal_coding.composing = COMPOSITION_DISABLED;
7382   /* Error notification should be suppressed.  */
7383   terminal_coding.suppress_error = 1;
7384   terminal_coding.src_multibyte = 1;
7385   terminal_coding.dst_multibyte = 0;
7386   return Qnil;
7387 }
7388
7389 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7390        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7391        doc: /* Internal use only.  */)
7392      (coding_system)
7393      Lisp_Object coding_system;
7394 {
7395   CHECK_SYMBOL (coding_system);
7396   setup_coding_system (Fcheck_coding_system (coding_system),
7397                        &safe_terminal_coding);
7398   /* Character composition should be disabled.  */
7399   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7400   /* Error notification should be suppressed.  */
7401   safe_terminal_coding.suppress_error = 1;
7402   safe_terminal_coding.src_multibyte = 1;
7403   safe_terminal_coding.dst_multibyte = 0;
7404   return Qnil;
7405 }
7406
7407 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7408        Sterminal_coding_system, 0, 0, 0,
7409        doc: /* Return coding system specified for terminal output.  */)
7410      ()
7411 {
7412   return terminal_coding.symbol;
7413 }
7414
7415 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7416        Sset_keyboard_coding_system_internal, 1, 1, 0,
7417        doc: /* Internal use only.  */)
7418      (coding_system)
7419      Lisp_Object coding_system;
7420 {
7421   CHECK_SYMBOL (coding_system);
7422   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7423   /* Character composition should be disabled.  */
7424   keyboard_coding.composing = COMPOSITION_DISABLED;
7425   return Qnil;
7426 }
7427
7428 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7429        Skeyboard_coding_system, 0, 0, 0,
7430        doc: /* Return coding system specified for decoding keyboard input.  */)
7431      ()
7432 {
7433   return keyboard_coding.symbol;
7434 }
7435
7436 \f
7437 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7438        Sfind_operation_coding_system,  1, MANY, 0,
7439        doc: /* Choose a coding system for an operation based on the target name.
7440 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7441 DECODING-SYSTEM is the coding system to use for decoding
7442 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7443 for encoding (in case OPERATION does encoding).
7444
7445 The first argument OPERATION specifies an I/O primitive:
7446   For file I/O, `insert-file-contents' or `write-region'.
7447   For process I/O, `call-process', `call-process-region', or `start-process'.
7448   For network I/O, `open-network-stream'.
7449
7450 The remaining arguments should be the same arguments that were passed
7451 to the primitive.  Depending on which primitive, one of those arguments
7452 is selected as the TARGET.  For example, if OPERATION does file I/O,
7453 whichever argument specifies the file name is TARGET.
7454
7455 TARGET has a meaning which depends on OPERATION:
7456   For file I/O, TARGET is a file name (except for the special case below).
7457   For process I/O, TARGET is a process name.
7458   For network I/O, TARGET is a service name or a port number
7459
7460 This function looks up what specified for TARGET in,
7461 `file-coding-system-alist', `process-coding-system-alist',
7462 or `network-coding-system-alist' depending on OPERATION.
7463 They may specify a coding system, a cons of coding systems,
7464 or a function symbol to call.
7465 In the last case, we call the function with one argument,
7466 which is a list of all the arguments given to this function.
7467
7468 If OPERATION is `insert-file-contents', the argument corresponding to
7469 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
7470 file name to look up, and BUFFER is a buffer that contains the file's
7471 contents (not yet decoded).  If `file-coding-system-alist' specifies a
7472 function to call for FILENAME, that function should examine the
7473 contents of BUFFER instead of reading the file.
7474
7475 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
7476      (nargs, args)
7477      int nargs;
7478      Lisp_Object *args;
7479 {
7480   Lisp_Object operation, target_idx, target, val;
7481   register Lisp_Object chain;
7482
7483   if (nargs < 2)
7484     error ("Too few arguments");
7485   operation = args[0];
7486   if (!SYMBOLP (operation)
7487       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7488     error ("Invalid first argument");
7489   if (nargs < 1 + XINT (target_idx))
7490     error ("Too few arguments for operation: %s",
7491            SDATA (SYMBOL_NAME (operation)));
7492   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7493      argument to write-region) is string, it must be treated as a
7494      target file name.  */
7495   if (EQ (operation, Qwrite_region)
7496       && nargs > 5
7497       && STRINGP (args[5]))
7498     target_idx = make_number (4);
7499   target = args[XINT (target_idx) + 1];
7500   if (!(STRINGP (target)
7501         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
7502             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
7503         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7504     error ("Invalid argument %d", XINT (target_idx) + 1);
7505   if (CONSP (target))
7506     target = XCAR (target);
7507
7508   chain = ((EQ (operation, Qinsert_file_contents)
7509             || EQ (operation, Qwrite_region))
7510            ? Vfile_coding_system_alist
7511            : (EQ (operation, Qopen_network_stream)
7512               ? Vnetwork_coding_system_alist
7513               : Vprocess_coding_system_alist));
7514   if (NILP (chain))
7515     return Qnil;
7516
7517   for (; CONSP (chain); chain = XCDR (chain))
7518     {
7519       Lisp_Object elt;
7520       elt = XCAR (chain);
7521
7522       if (CONSP (elt)
7523           && ((STRINGP (target)
7524                && STRINGP (XCAR (elt))
7525                && fast_string_match (XCAR (elt), target) >= 0)
7526               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7527         {
7528           val = XCDR (elt);
7529           /* Here, if VAL is both a valid coding system and a valid
7530              function symbol, we return VAL as a coding system.  */
7531           if (CONSP (val))
7532             return val;
7533           if (! SYMBOLP (val))
7534             return Qnil;
7535           if (! NILP (Fcoding_system_p (val)))
7536             return Fcons (val, val);
7537           if (! NILP (Ffboundp (val)))
7538             {
7539               /* We use call1 rather than safe_call1
7540                  so as to get bug reports about functions called here
7541                  which don't handle the current interface.  */
7542               val = call1 (val, Flist (nargs, args));
7543               if (CONSP (val))
7544                 return val;
7545               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7546                 return Fcons (val, val);
7547             }
7548           return Qnil;
7549         }
7550     }
7551   return Qnil;
7552 }
7553
7554 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7555        Supdate_coding_systems_internal, 0, 0, 0,
7556        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7557 When values of any coding categories are changed, you must
7558 call this function.  */)
7559      ()
7560 {
7561   int i;
7562
7563   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7564     {
7565       Lisp_Object val;
7566
7567       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7568       if (!NILP (val))
7569         {
7570           if (! coding_system_table[i])
7571             coding_system_table[i] = ((struct coding_system *)
7572                                       xmalloc (sizeof (struct coding_system)));
7573           setup_coding_system (val, coding_system_table[i]);
7574         }
7575       else if (coding_system_table[i])
7576         {
7577           xfree (coding_system_table[i]);
7578           coding_system_table[i] = NULL;
7579         }
7580     }
7581
7582   return Qnil;
7583 }
7584
7585 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7586        Sset_coding_priority_internal, 0, 0, 0,
7587        doc: /* Update internal database for the current value of `coding-category-list'.
7588 This function is internal use only.  */)
7589      ()
7590 {
7591   int i = 0, idx;
7592   Lisp_Object val;
7593
7594   val = Vcoding_category_list;
7595
7596   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7597     {
7598       if (! SYMBOLP (XCAR (val)))
7599         break;
7600       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7601       if (idx >= CODING_CATEGORY_IDX_MAX)
7602         break;
7603       coding_priorities[i++] = (1 << idx);
7604       val = XCDR (val);
7605     }
7606   /* If coding-category-list is valid and contains all coding
7607      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7608      the following code saves Emacs from crashing.  */
7609   while (i < CODING_CATEGORY_IDX_MAX)
7610     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7611
7612   return Qnil;
7613 }
7614
7615 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7616        Sdefine_coding_system_internal, 1, 1, 0,
7617        doc: /* Register CODING-SYSTEM as a base coding system.
7618 This function is internal use only.  */)
7619      (coding_system)
7620      Lisp_Object coding_system;
7621 {
7622   Lisp_Object safe_chars, slot;
7623
7624   if (NILP (Fcheck_coding_system (coding_system)))
7625     xsignal1 (Qcoding_system_error, coding_system);
7626
7627   safe_chars = coding_safe_chars (coding_system);
7628   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7629     error ("No valid safe-chars property for %s",
7630            SDATA (SYMBOL_NAME (coding_system)));
7631
7632   if (EQ (safe_chars, Qt))
7633     {
7634       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7635         XSETCAR (Vcoding_system_safe_chars,
7636                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7637     }
7638   else
7639     {
7640       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7641       if (NILP (slot))
7642         XSETCDR (Vcoding_system_safe_chars,
7643                  nconc2 (XCDR (Vcoding_system_safe_chars),
7644                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7645       else
7646         XSETCDR (slot, safe_chars);
7647     }
7648   return Qnil;
7649 }
7650
7651 #endif /* emacs */
7652
7653 \f
7654 /*** 9. Post-amble ***/
7655
7656 void
7657 init_coding_once ()
7658 {
7659   int i;
7660
7661   /* Emacs' internal format specific initialize routine.  */
7662   for (i = 0; i <= 0x20; i++)
7663     emacs_code_class[i] = EMACS_control_code;
7664   emacs_code_class[0x0A] = EMACS_linefeed_code;
7665   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7666   for (i = 0x21 ; i < 0x7F; i++)
7667     emacs_code_class[i] = EMACS_ascii_code;
7668   emacs_code_class[0x7F] = EMACS_control_code;
7669   for (i = 0x80; i < 0xFF; i++)
7670     emacs_code_class[i] = EMACS_invalid_code;
7671   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7672   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7673   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7674   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7675
7676   /* ISO2022 specific initialize routine.  */
7677   for (i = 0; i < 0x20; i++)
7678     iso_code_class[i] = ISO_control_0;
7679   for (i = 0x21; i < 0x7F; i++)
7680     iso_code_class[i] = ISO_graphic_plane_0;
7681   for (i = 0x80; i < 0xA0; i++)
7682     iso_code_class[i] = ISO_control_1;
7683   for (i = 0xA1; i < 0xFF; i++)
7684     iso_code_class[i] = ISO_graphic_plane_1;
7685   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7686   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7687   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7688   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7689   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7690   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7691   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7692   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7693   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7694   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7695
7696   setup_coding_system (Qnil, &keyboard_coding);
7697   setup_coding_system (Qnil, &terminal_coding);
7698   setup_coding_system (Qnil, &safe_terminal_coding);
7699   setup_coding_system (Qnil, &default_buffer_file_coding);
7700
7701   bzero (coding_system_table, sizeof coding_system_table);
7702
7703   bzero (ascii_skip_code, sizeof ascii_skip_code);
7704   for (i = 0; i < 128; i++)
7705     ascii_skip_code[i] = 1;
7706
7707 #if defined (MSDOS) || defined (WINDOWSNT)
7708   system_eol_type = CODING_EOL_CRLF;
7709 #else
7710   system_eol_type = CODING_EOL_LF;
7711 #endif
7712
7713   inhibit_pre_post_conversion = 0;
7714 }
7715
7716 #ifdef emacs
7717
7718 void
7719 syms_of_coding ()
7720 {
7721   staticpro (&Vcode_conversion_workbuf_name);
7722   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7723
7724   Qtarget_idx = intern ("target-idx");
7725   staticpro (&Qtarget_idx);
7726
7727   Qcoding_system_history = intern ("coding-system-history");
7728   staticpro (&Qcoding_system_history);
7729   Fset (Qcoding_system_history, Qnil);
7730
7731   /* Target FILENAME is the first argument.  */
7732   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7733   /* Target FILENAME is the third argument.  */
7734   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7735
7736   Qcall_process = intern ("call-process");
7737   staticpro (&Qcall_process);
7738   /* Target PROGRAM is the first argument.  */
7739   Fput (Qcall_process, Qtarget_idx, make_number (0));
7740
7741   Qcall_process_region = intern ("call-process-region");
7742   staticpro (&Qcall_process_region);
7743   /* Target PROGRAM is the third argument.  */
7744   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7745
7746   Qstart_process = intern ("start-process");
7747   staticpro (&Qstart_process);
7748   /* Target PROGRAM is the third argument.  */
7749   Fput (Qstart_process, Qtarget_idx, make_number (2));
7750
7751   Qopen_network_stream = intern ("open-network-stream");
7752   staticpro (&Qopen_network_stream);
7753   /* Target SERVICE is the fourth argument.  */
7754   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7755
7756   Qcoding_system = intern ("coding-system");
7757   staticpro (&Qcoding_system);
7758
7759   Qeol_type = intern ("eol-type");
7760   staticpro (&Qeol_type);
7761
7762   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7763   staticpro (&Qbuffer_file_coding_system);
7764
7765   Qpost_read_conversion = intern ("post-read-conversion");
7766   staticpro (&Qpost_read_conversion);
7767
7768   Qpre_write_conversion = intern ("pre-write-conversion");
7769   staticpro (&Qpre_write_conversion);
7770
7771   Qno_conversion = intern ("no-conversion");
7772   staticpro (&Qno_conversion);
7773
7774   Qundecided = intern ("undecided");
7775   staticpro (&Qundecided);
7776
7777   Qcoding_system_p = intern ("coding-system-p");
7778   staticpro (&Qcoding_system_p);
7779
7780   Qcoding_system_error = intern ("coding-system-error");
7781   staticpro (&Qcoding_system_error);
7782
7783   Fput (Qcoding_system_error, Qerror_conditions,
7784         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7785   Fput (Qcoding_system_error, Qerror_message,
7786         build_string ("Invalid coding system"));
7787
7788   Qcoding_category = intern ("coding-category");
7789   staticpro (&Qcoding_category);
7790   Qcoding_category_index = intern ("coding-category-index");
7791   staticpro (&Qcoding_category_index);
7792
7793   Vcoding_category_table
7794     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7795   staticpro (&Vcoding_category_table);
7796   {
7797     int i;
7798     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7799       {
7800         XVECTOR (Vcoding_category_table)->contents[i]
7801           = intern (coding_category_name[i]);
7802         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7803               Qcoding_category_index, make_number (i));
7804       }
7805   }
7806
7807   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7808   staticpro (&Vcoding_system_safe_chars);
7809
7810   Qtranslation_table = intern ("translation-table");
7811   staticpro (&Qtranslation_table);
7812   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7813
7814   Qtranslation_table_id = intern ("translation-table-id");
7815   staticpro (&Qtranslation_table_id);
7816
7817   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7818   staticpro (&Qtranslation_table_for_decode);
7819
7820   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7821   staticpro (&Qtranslation_table_for_encode);
7822
7823   Qsafe_chars = intern ("safe-chars");
7824   staticpro (&Qsafe_chars);
7825
7826   Qchar_coding_system = intern ("char-coding-system");
7827   staticpro (&Qchar_coding_system);
7828
7829   /* Intern this now in case it isn't already done.
7830      Setting this variable twice is harmless.
7831      But don't staticpro it here--that is done in alloc.c.  */
7832   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7833   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7834   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7835
7836   Qvalid_codes = intern ("valid-codes");
7837   staticpro (&Qvalid_codes);
7838
7839   Qascii_incompatible = intern ("ascii-incompatible");
7840   staticpro (&Qascii_incompatible);
7841
7842   Qemacs_mule = intern ("emacs-mule");
7843   staticpro (&Qemacs_mule);
7844
7845   Qraw_text = intern ("raw-text");
7846   staticpro (&Qraw_text);
7847
7848   Qutf_8 = intern ("utf-8");
7849   staticpro (&Qutf_8);
7850
7851   Qcoding_system_define_form = intern ("coding-system-define-form");
7852   staticpro (&Qcoding_system_define_form);
7853
7854   defsubr (&Scoding_system_p);
7855   defsubr (&Sread_coding_system);
7856   defsubr (&Sread_non_nil_coding_system);
7857   defsubr (&Scheck_coding_system);
7858   defsubr (&Sdetect_coding_region);
7859   defsubr (&Sdetect_coding_string);
7860   defsubr (&Sfind_coding_systems_region_internal);
7861   defsubr (&Sunencodable_char_position);
7862   defsubr (&Sdecode_coding_region);
7863   defsubr (&Sencode_coding_region);
7864   defsubr (&Sdecode_coding_string);
7865   defsubr (&Sencode_coding_string);
7866   defsubr (&Sdecode_sjis_char);
7867   defsubr (&Sencode_sjis_char);
7868   defsubr (&Sdecode_big5_char);
7869   defsubr (&Sencode_big5_char);
7870   defsubr (&Sset_terminal_coding_system_internal);
7871   defsubr (&Sset_safe_terminal_coding_system_internal);
7872   defsubr (&Sterminal_coding_system);
7873   defsubr (&Sset_keyboard_coding_system_internal);
7874   defsubr (&Skeyboard_coding_system);
7875   defsubr (&Sfind_operation_coding_system);
7876   defsubr (&Supdate_coding_systems_internal);
7877   defsubr (&Sset_coding_priority_internal);
7878   defsubr (&Sdefine_coding_system_internal);
7879
7880   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7881                doc: /* List of coding systems.
7882
7883 Do not alter the value of this variable manually.  This variable should be
7884 updated by the functions `make-coding-system' and
7885 `define-coding-system-alias'.  */);
7886   Vcoding_system_list = Qnil;
7887
7888   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7889                doc: /* Alist of coding system names.
7890 Each element is one element list of coding system name.
7891 This variable is given to `completing-read' as TABLE argument.
7892
7893 Do not alter the value of this variable manually.  This variable should be
7894 updated by the functions `make-coding-system' and
7895 `define-coding-system-alias'.  */);
7896   Vcoding_system_alist = Qnil;
7897
7898   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7899                doc: /* List of coding-categories (symbols) ordered by priority.
7900
7901 On detecting a coding system, Emacs tries code detection algorithms
7902 associated with each coding-category one by one in this order.  When
7903 one algorithm agrees with a byte sequence of source text, the coding
7904 system bound to the corresponding coding-category is selected.
7905
7906 Don't modify this variable directly, but use `set-coding-priority'.  */);
7907   {
7908     int i;
7909
7910     Vcoding_category_list = Qnil;
7911     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7912       Vcoding_category_list
7913         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7914                  Vcoding_category_list);
7915   }
7916
7917   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7918                doc: /* Specify the coding system for read operations.
7919 It is useful to bind this variable with `let', but do not set it globally.
7920 If the value is a coding system, it is used for decoding on read operation.
7921 If not, an appropriate element is used from one of the coding system alists:
7922 There are three such tables, `file-coding-system-alist',
7923 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7924   Vcoding_system_for_read = Qnil;
7925
7926   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7927                doc: /* Specify the coding system for write operations.
7928 Programs bind this variable with `let', but you should not set it globally.
7929 If the value is a coding system, it is used for encoding of output,
7930 when writing it to a file and when sending it to a file or subprocess.
7931
7932 If this does not specify a coding system, an appropriate element
7933 is used from one of the coding system alists:
7934 There are three such tables, `file-coding-system-alist',
7935 `process-coding-system-alist', and `network-coding-system-alist'.
7936 For output to files, if the above procedure does not specify a coding system,
7937 the value of `buffer-file-coding-system' is used.  */);
7938   Vcoding_system_for_write = Qnil;
7939
7940   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7941                doc: /* Coding system used in the latest file or process I/O.
7942 Also set by `encode-coding-region', `decode-coding-region',
7943 `encode-coding-string' and `decode-coding-string'.  */);
7944   Vlast_coding_system_used = Qnil;
7945
7946   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7947                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7948 See info node `Coding Systems' and info node `Text and Binary' concerning
7949 such conversion.  */);
7950   inhibit_eol_conversion = 0;
7951
7952   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7953                doc: /* Non-nil means process buffer inherits coding system of process output.
7954 Bind it to t if the process output is to be treated as if it were a file
7955 read from some filesystem.  */);
7956   inherit_process_coding_system = 0;
7957
7958   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7959                doc: /* Alist to decide a coding system to use for a file I/O operation.
7960 The format is ((PATTERN . VAL) ...),
7961 where PATTERN is a regular expression matching a file name,
7962 VAL is a coding system, a cons of coding systems, or a function symbol.
7963 If VAL is a coding system, it is used for both decoding and encoding
7964 the file contents.
7965 If VAL is a cons of coding systems, the car part is used for decoding,
7966 and the cdr part is used for encoding.
7967 If VAL is a function symbol, the function must return a coding system
7968 or a cons of coding systems which are used as above.  The function is
7969 called with an argument that is a list of the arguments with which
7970 `find-operation-coding-system' was called.
7971
7972 See also the function `find-operation-coding-system'
7973 and the variable `auto-coding-alist'.  */);
7974   Vfile_coding_system_alist = Qnil;
7975
7976   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7977     doc: /* Alist to decide a coding system to use for a process I/O operation.
7978 The format is ((PATTERN . VAL) ...),
7979 where PATTERN is a regular expression matching a program name,
7980 VAL is a coding system, a cons of coding systems, or a function symbol.
7981 If VAL is a coding system, it is used for both decoding what received
7982 from the program and encoding what sent to the program.
7983 If VAL is a cons of coding systems, the car part is used for decoding,
7984 and the cdr part is used for encoding.
7985 If VAL is a function symbol, the function must return a coding system
7986 or a cons of coding systems which are used as above.
7987
7988 See also the function `find-operation-coding-system'.  */);
7989   Vprocess_coding_system_alist = Qnil;
7990
7991   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7992     doc: /* Alist to decide a coding system to use for a network I/O operation.
7993 The format is ((PATTERN . VAL) ...),
7994 where PATTERN is a regular expression matching a network service name
7995 or is a port number to connect to,
7996 VAL is a coding system, a cons of coding systems, or a function symbol.
7997 If VAL is a coding system, it is used for both decoding what received
7998 from the network stream and encoding what sent to the network stream.
7999 If VAL is a cons of coding systems, the car part is used for decoding,
8000 and the cdr part is used for encoding.
8001 If VAL is a function symbol, the function must return a coding system
8002 or a cons of coding systems which are used as above.
8003
8004 See also the function `find-operation-coding-system'.  */);
8005   Vnetwork_coding_system_alist = Qnil;
8006
8007   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8008                doc: /* Coding system to use with system messages.
8009 Also used for decoding keyboard input on X Window system.  */);
8010   Vlocale_coding_system = Qnil;
8011
8012   /* The eol mnemonics are reset in startup.el system-dependently.  */
8013   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8014                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8015   eol_mnemonic_unix = build_string (":");
8016
8017   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8018                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8019   eol_mnemonic_dos = build_string ("\\");
8020
8021   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8022                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8023   eol_mnemonic_mac = build_string ("/");
8024
8025   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8026                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
8027   eol_mnemonic_undecided = build_string (":");
8028
8029   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8030                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
8031   Venable_character_translation = Qt;
8032
8033   DEFVAR_LISP ("standard-translation-table-for-decode",
8034                &Vstandard_translation_table_for_decode,
8035                doc: /* Table for translating characters while decoding.  */);
8036   Vstandard_translation_table_for_decode = Qnil;
8037
8038   DEFVAR_LISP ("standard-translation-table-for-encode",
8039                &Vstandard_translation_table_for_encode,
8040                doc: /* Table for translating characters while encoding.  */);
8041   Vstandard_translation_table_for_encode = Qnil;
8042
8043   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8044                doc: /* Alist of charsets vs revision numbers.
8045 While encoding, if a charset (car part of an element) is found,
8046 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8047   Vcharset_revision_alist = Qnil;
8048
8049   DEFVAR_LISP ("default-process-coding-system",
8050                &Vdefault_process_coding_system,
8051                doc: /* Cons of coding systems used for process I/O by default.
8052 The car part is used for decoding a process output,
8053 the cdr part is used for encoding a text to be sent to a process.  */);
8054   Vdefault_process_coding_system = Qnil;
8055
8056   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8057                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8058 This is a vector of length 256.
8059 If Nth element is non-nil, the existence of code N in a file
8060 \(or output of subprocess) doesn't prevent it to be detected as
8061 a coding system of ISO 2022 variant which has a flag
8062 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8063 or reading output of a subprocess.
8064 Only 128th through 159th elements has a meaning.  */);
8065   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8066
8067   DEFVAR_LISP ("select-safe-coding-system-function",
8068                &Vselect_safe_coding_system_function,
8069                doc: /* Function to call to select safe coding system for encoding a text.
8070
8071 If set, this function is called to force a user to select a proper
8072 coding system which can encode the text in the case that a default
8073 coding system used in each operation can't encode the text.
8074
8075 The default value is `select-safe-coding-system' (which see).  */);
8076   Vselect_safe_coding_system_function = Qnil;
8077
8078   DEFVAR_BOOL ("coding-system-require-warning",
8079                &coding_system_require_warning,
8080                doc: /* Internal use only.
8081 If non-nil, on writing a file, `select-safe-coding-system-function' is
8082 called even if `coding-system-for-write' is non-nil.  The command
8083 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8084   coding_system_require_warning = 0;
8085
8086
8087   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8088                &inhibit_iso_escape_detection,
8089                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8090
8091 By default, on reading a file, Emacs tries to detect how the text is
8092 encoded.  This code detection is sensitive to escape sequences.  If
8093 the sequence is valid as ISO2022, the code is determined as one of
8094 the ISO2022 encodings, and the file is decoded by the corresponding
8095 coding system (e.g. `iso-2022-7bit').
8096
8097 However, there may be a case that you want to read escape sequences in
8098 a file as is.  In such a case, you can set this variable to non-nil.
8099 Then, as the code detection ignores any escape sequences, no file is
8100 detected as encoded in some ISO2022 encoding.  The result is that all
8101 escape sequences become visible in a buffer.
8102
8103 The default value is nil, and it is strongly recommended not to change
8104 it.  That is because many Emacs Lisp source files that contain
8105 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8106 in Emacs's distribution, and they won't be decoded correctly on
8107 reading if you suppress escape sequence detection.
8108
8109 The other way to read escape sequences in a file without decoding is
8110 to explicitly specify some coding system that doesn't use ISO2022's
8111 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8112   inhibit_iso_escape_detection = 0;
8113
8114   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8115                doc: /* Char table for translating self-inserting characters.
8116 This is applied to the result of input methods, not their input.  See also
8117 `keyboard-translate-table'.  */);
8118     Vtranslation_table_for_input = Qnil;
8119 }
8120
8121 char *
8122 emacs_strerror (error_number)
8123      int error_number;
8124 {
8125   char *str;
8126
8127   synchronize_system_messages_locale ();
8128   str = strerror (error_number);
8129
8130   if (! NILP (Vlocale_coding_system))
8131     {
8132       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8133                                                       Vlocale_coding_system,
8134                                                       0);
8135       str = (char *) SDATA (dec);
8136     }
8137
8138   return str;
8139 }
8140
8141 #endif /* emacs */
8142
8143 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8144    (do not change this comment) */