src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002,2003  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
  120 detect_coding_XXX (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      const unsigned char *source;
 151      unsigned char *destination;
 152      int src_bytes, dst_bytes;
 153 {
 154   ...
 155 }
 156 #endif
 157
 158 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 159
 160   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 161   internal multibyte format to CODING.  The resulting unibyte text
 162   goes to a place pointed to by DESTINATION, the length of which
 163   should not exceed DST_BYTES.
 164
 165   These functions set the information about original and encoded texts
 166   in the members `produced', `produced_char', `consumed', and
 167   `consumed_char' of the structure *CODING.  They also set the member
 168   `result' to one of CODING_FINISH_XXX indicating how the encoding
 169   finished.
 170
 171   DST_BYTES zero means that the source area and destination area are
 172   overlapped, which means that we can produce encoded text until it
  173   reaches the head of the not-yet-encoded source text.
 174
 175   Below is a template for these functions.  */
 176 #if 0
 177 static void
 178 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 179      struct coding_system *coding;
 180      unsigned char *source, *destination;
 181      int src_bytes, dst_bytes;
 182 {
 183   ...
 184 }
 185 #endif
 186
 187 /*** COMMONLY USED MACROS ***/
 188
 189 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  190    get one and two bytes from the source text respectively.
 191    If there are not enough bytes in the source, they jump to
 192    `label_end_of_loop'.  The caller should set variables `coding',
  193    `src' and `src_end' to appropriate pointers in advance.  These
 194    macros are called from decoding routines `decode_coding_XXX', thus
 195    it is assumed that the source text is unibyte.  */
 196
 197 #define ONE_MORE_BYTE(c1)                                       \
 198   do {                                                          \
 199     if (src >= src_end)                                         \
 200       {                                                         \
 201         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 202         goto label_end_of_loop;                                 \
 203       }                                                         \
 204     c1 = *src++;                                                \
 205   } while (0)
 206
 207 #define TWO_MORE_BYTES(c1, c2)                                  \
 208   do {                                                          \
 209     if (src + 1 >= src_end)                                     \
 210       {                                                         \
 211         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 212         goto label_end_of_loop;                                 \
 213       }                                                         \
 214     c1 = *src++;                                                \
 215     c2 = *src++;                                                \
 216   } while (0)
 217
 218
 219 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 220    form if MULTIBYTEP is nonzero.  */
 221
 222 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 223   do {                                                          \
 224     if (src >= src_end)                                         \
 225       {                                                         \
 226         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 227         goto label_end_of_loop;                                 \
 228       }                                                         \
 229     c1 = *src++;                                                \
 230     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 231       c1 = *src++ - 0x20;                                       \
 232   } while (0)
 233
  234 /* Set C to the next character at the source text pointed by `src'.
  235    If there are not enough characters in the source, jump to
  236    `label_end_of_loop'.  The caller should set variables `coding',
  237    `src', `src_end', and `translation_table' to appropriate values
  238    in advance.  This macro is used in encoding routines
  239    `encode_coding_XXX', thus it assumes that the source text is in
  240    multibyte form except for 8-bit characters.  8-bit characters are
  241    in multibyte form if coding->src_multibyte is nonzero, else they
  242    are represented by a single byte.
        If `translation_table' is non-nil, C is passed through
        translate_char before `src' is advanced.  The macro declares the
        block-local variables `len' and `bytes', so the argument C must
        not be spelled with either of those names.  */
  243
  244 #define ONE_MORE_CHAR(c)                                        \
  245   do {                                                          \
  246     int len = src_end - src;    /* Bytes left in the source.  */ \
  247     int bytes;                  /* Bytes that C occupies.  */   \
  248     if (len <= 0)                                               \
  249       {                                                         \
  250         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
  251         goto label_end_of_loop;                                 \
  252       }                                                         \
  253     if (coding->src_multibyte                                   \
  254         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
  255       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
  256     else                                                        \
  257       c = *src, bytes = 1;      /* Lone raw byte.  */           \
  258     if (!NILP (translation_table))                              \
  259       c = translate_char (translation_table, c, -1, 0, 0);      \
  260     src += bytes;               /* Consume the character.  */   \
  261   } while (0)
 262
 263
  264 /* Produce a multibyte form of character C to `dst'.  Jump to
  265    `label_end_of_loop' (with coding->result set to
        CODING_FINISH_INSUFFICIENT_DST) if there's not enough space at
        `dst'.  The output limit is `dst_end', or `src' when `dst_bytes'
        is zero.
  266
  267    If we are now in the middle of a composition sequence, the decoded
  268    character may be ALTCHAR (for the current composition).  In that
  269    case, the character goes to coding->cmp_data->data instead of
  270    `dst'.  More precisely: C is written to `dst' (and
        coding->produced_char is incremented) when no composition is in
        progress or the method is COMPOSITION_RELATIVE or
        COMPOSITION_WITH_RULE; C is recorded as a composition component
        when a composition is in progress and the method is not
        COMPOSITION_RELATIVE.  With COMPOSITION_WITH_RULE both happen.
  271
  272    This macro is used in decoding routines.  It declares a
        block-local variable `bytes', so the argument C must not be
        spelled `bytes'.

        NOTE(review): CODING_ADD_COMPOSITION_COMPONENT may reset
        coding->composing to COMPOSITION_NO when a bunch fills up, and
        coding->composition_rule_follows is computed after that call --
        confirm the resulting value is the intended one in that case.  */
  273
  274 #define EMIT_CHAR(c)                                                    \
  275   do {                                                                  \
  276     if (! COMPOSING_P (coding)                                          \
  277         || coding->composing == COMPOSITION_RELATIVE                    \
  278         || coding->composing == COMPOSITION_WITH_RULE)                  \
  279       {                                                                 \
  280         int bytes = CHAR_BYTES (c);                                     \
  281         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
  282           {                                                             \
  283             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
  284             goto label_end_of_loop;                                     \
  285           }                                                             \
  286         dst += CHAR_STRING (c, dst);                                    \
  287         coding->produced_char++;                                        \
  288       }                                                                 \
  289                                                                         \
  290     if (COMPOSING_P (coding)                                            \
  291         && coding->composing != COMPOSITION_RELATIVE)                   \
  292       {                                                                 \
  293         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
  294         coding->composition_rule_follows                                \
  295           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
  296       }                                                                 \
  297   } while (0)
 298
 299
 300 #define EMIT_ONE_BYTE(c)                                        \
 301   do {                                                          \
 302     if (dst >= (dst_bytes ? dst_end : src))                     \
 303       {                                                         \
 304         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 305         goto label_end_of_loop;                                 \
 306       }                                                         \
 307     *dst++ = c;                                                 \
 308   } while (0)
 309
 310 #define EMIT_TWO_BYTES(c1, c2)                                  \
 311   do {                                                          \
 312     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 313       {                                                         \
 314         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 315         goto label_end_of_loop;                                 \
 316       }                                                         \
 317     *dst++ = c1, *dst++ = c2;                                   \
 318   } while (0)
 319
 320 #define EMIT_BYTES(from, to)                                    \
 321   do {                                                          \
 322     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 323       {                                                         \
 324         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 325         goto label_end_of_loop;                                 \
 326       }                                                         \
 327     while (from < to)                                           \
 328       *dst++ = *from++;                                         \
 329   } while (0)
 330
 331 \f
 332 /*** 1. Preamble ***/
 333
 334 #ifdef emacs
 335 #include <config.h>
 336 #endif
 337
 338 #include <stdio.h>
 339
 340 #ifdef emacs
 341
 342 #include "lisp.h"
 343 #include "buffer.h"
 344 #include "charset.h"
 345 #include "composite.h"
 346 #include "ccl.h"
 347 #include "coding.h"
 348 #include "window.h"
 349 #include "intervals.h"
 350
 351 #else  /* not emacs */
 352
 353 #include "mulelib.h"
 354
 355 #endif /* not emacs */
 356
  357 Lisp_Object Qcoding_system, Qeol_type;
     /* Qcoding_system is the property that coding_safe_chars reads from
        a coding system symbol to obtain its spec; Qsafe_chars (below) is
        the key it looks up in that spec's property list.
        NOTE(review): the other Q* objects in this group are presumably
        Lisp symbols with the corresponding names -- the code that
        initializes them is not in view; confirm there.  */
  358 Lisp_Object Qbuffer_file_coding_system;
  359 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  360 Lisp_Object Qno_conversion, Qundecided;
  361 Lisp_Object Qcoding_system_history;
  362 Lisp_Object Qsafe_chars;
  363 Lisp_Object Qvalid_codes;
  364
     /* These two are only declared here; they are defined in another
        file.  */
  365 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  366 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  367 Lisp_Object Qstart_process, Qopen_network_stream;
  368 Lisp_Object Qtarget_idx;
  369
  370 /* If a symbol has this property, evaluate the value to define the
  371    symbol as a coding system.  */
  372 Lisp_Object Qcoding_system_define_form;
  373
     /* NOTE(review): presumably a Lisp function used to pick a safe
        coding system; no use is visible in this part of the file --
        confirm.  */
  374 Lisp_Object Vselect_safe_coding_system_function;
  375
     /* NOTE(review): meaning not visible in this part of the file;
        document at the point where it is tested.  */
  376 int coding_system_require_warning;
  377
  378 /* Mnemonic string for each format of end-of-line.  */
  379 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  380 /* Mnemonic string to indicate format of end-of-line is not yet
  381    decided.  */
  382 Lisp_Object eol_mnemonic_undecided;
  383
  384 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
  385    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
  386 int system_eol_type;
 387
  388 #ifdef emacs
     /* Everything down to the matching #endif is compiled only when
        `emacs' is defined.  */
  389
  390 /* Information about which coding system is safe for which chars.
  391    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
  392
  393    GENERIC-LIST is a list of generic coding systems which can encode
  394    any characters.
  395
  396    NON-GENERIC-ALIST is an alist of non-generic coding systems vs the
  397    corresponding char table that contains safe chars.  */
  398 Lisp_Object Vcoding_system_safe_chars;
  399
     /* NOTE(review): presumably the list and alist of all defined coding
        systems; where they are filled in is not in view -- confirm.  */
  400 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  401
  402 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  403
  404 /* Coding system emacs-mule and raw-text are for converting only
  405    end-of-line format.  */
  406 Lisp_Object Qemacs_mule, Qraw_text;
  407
  408 Lisp_Object Qutf_8;
  409
  410 /* Coding-systems are handed between Emacs Lisp programs and C internal
  411    routines by the following three variables.  */
  412 /* Coding-system for reading files and receiving data from process.  */
  413 Lisp_Object Vcoding_system_for_read;
  414 /* Coding-system for writing files and sending data to process.  */
  415 Lisp_Object Vcoding_system_for_write;
  416 /* Coding-system actually used in the latest I/O.  */
  417 Lisp_Object Vlast_coding_system_used;
  418
  419 /* A vector of length 256 which contains information about special
  420    Latin codes (especially for dealing with Microsoft codes).  */
  421 Lisp_Object Vlatin_extra_code_table;
  422
  423 /* Flag to inhibit code conversion of end-of-line format.  */
  424 int inhibit_eol_conversion;
  425
  426 /* Flag to inhibit ISO2022 escape sequence detection.  */
  427 int inhibit_iso_escape_detection;
  428
  429 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  430 int inherit_process_coding_system;
  431
  432 /* Coding system to be used to encode text for terminal display.  */
  433 struct coding_system terminal_coding;
  434
  435 /* Coding system to be used to encode text for terminal display when
  436    terminal coding system is nil.  */
  437 struct coding_system safe_terminal_coding;
  438
  439 /* Coding system of what is sent from terminal keyboard.  */
  440 struct coding_system keyboard_coding;
  441
  442 /* Default coding system to be used to write a file.  */
  443 struct coding_system default_buffer_file_coding;
  444
     /* NOTE(review): presumably alists that select coding systems for
        file, process and network I/O respectively; the element format is
        not visible here -- confirm.  */
  445 Lisp_Object Vfile_coding_system_alist;
  446 Lisp_Object Vprocess_coding_system_alist;
  447 Lisp_Object Vnetwork_coding_system_alist;
  448
  449 Lisp_Object Vlocale_coding_system;
  450
  451 #endif /* emacs */
 452
  453 Lisp_Object Qcoding_category, Qcoding_category_index;
  454
  455 /* List of symbols `coding-category-xxx' ordered by priority.  */
  456 Lisp_Object Vcoding_category_list;
  457
  458 /* Table of coding categories (Lisp symbols).  */
  459 Lisp_Object Vcoding_category_table;
  460
  461 /* Table of symbol names, one for each coding category.  The array
        is sized by CODING_CATEGORY_IDX_MAX and fifteen names are given.
        NOTE(review): the order presumably has to match the category
        index constants defined outside this file -- confirm before
        reordering or adding entries.  */
  462 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
  463   "coding-category-emacs-mule",
  464   "coding-category-sjis",
  465   "coding-category-iso-7",
  466   "coding-category-iso-7-tight",
  467   "coding-category-iso-8-1",
  468   "coding-category-iso-8-2",
  469   "coding-category-iso-7-else",
  470   "coding-category-iso-8-else",
  471   "coding-category-ccl",
  472   "coding-category-big5",
  473   "coding-category-utf-8",
  474   "coding-category-utf-16-be",
  475   "coding-category-utf-16-le",
  476   "coding-category-raw-text",
  477   "coding-category-binary"
  478 };
  479
  480 /* Table of pointers to the coding system corresponding to each
  481    coding category.  */
  482 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
  483
  484 /* Table of coding category masks.  Nth element is a mask for a coding
  485    category of which priority is Nth.  */
  486 static
  487 int coding_priorities[CODING_CATEGORY_IDX_MAX];
  488
  489 /* Flag to tell if we look up translation table on character code
  490    conversion.  */
  491 Lisp_Object Venable_character_translation;
  492 /* Standard translation table to look up on decoding (reading).  */
  493 Lisp_Object Vstandard_translation_table_for_decode;
  494 /* Standard translation table to look up on encoding (writing).  */
  495 Lisp_Object Vstandard_translation_table_for_encode;
  496
  497 Lisp_Object Qtranslation_table;
  498 Lisp_Object Qtranslation_table_id;
  499 Lisp_Object Qtranslation_table_for_decode;
  500 Lisp_Object Qtranslation_table_for_encode;
  501
  502 /* Alist of charsets vs revision number.  */
  503 Lisp_Object Vcharset_revision_alist;
  504
  505 /* Default coding systems used for process I/O.  */
  506 Lisp_Object Vdefault_process_coding_system;
  507
  508 /* Char table for translating Quail and self-inserting input.  */
  509 Lisp_Object Vtranslation_table_for_input;
  510
  511 /* Global flag to tell that we can't call post-read-conversion and
  512    pre-write-conversion functions.  Usually the value is zero, but it
  513    is set to 1 temporarily while such functions are running.  This is
  514    to avoid infinitely recursive calls.  */
  515 static int inhibit_pre_post_conversion;
  516
  517 Lisp_Object Qchar_coding_system;
 518
 519 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 520    its validity.  */
 521
 522 Lisp_Object
 523 coding_safe_chars (coding_system)
 524      Lisp_Object coding_system;
 525 {
 526   Lisp_Object coding_spec, plist, safe_chars;
 527
 528   coding_spec = Fget (coding_system, Qcoding_system);
 529   plist = XVECTOR (coding_spec)->contents[3];
 530   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 531   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 532 }
 533
 534 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 535   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 536
 537 \f
 538 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 539
 540 /* Emacs' internal format for representation of multiple character
 541    sets is a kind of multi-byte encoding, i.e. characters are
 542    represented by variable-length sequences of one-byte codes.
 543
 544    ASCII characters and control characters (e.g. `tab', `newline') are
 545    represented by one-byte sequences which are their ASCII codes, in
 546    the range 0x00 through 0x7F.
 547
 548    8-bit characters of the range 0x80..0x9F are represented by
 549    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 550    code + 0x20).
 551
 552    8-bit characters of the range 0xA0..0xFF are represented by
 553    one-byte sequences which are their 8-bit code.
 554
 555    The other characters are represented by a sequence of `base
 556    leading-code', optional `extended leading-code', and one or two
 557    `position-code's.  The length of the sequence is determined by the
 558    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 559    whereas extended leading-code and position-code take the range 0xA0
 560    through 0xFF.  See `charset.h' for more details about leading-code
 561    and position-code.
 562
 563    --- CODE RANGE of Emacs' internal format ---
 564    character set        range
 565    -------------        -----
 566    ascii                0x00..0x7F
 567    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  568    eight-bit-graphic    0xA0..0xFF
 569    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 570    ---------------------------------------------
 571
 572    As this is the internal character representation, the format is
  573    usually not used externally (i.e. in a file or in data sent to a
 574    process).  But, it is possible to have a text externally in this
 575    format (i.e. by encoding by the coding system `emacs-mule').
 576
 577    In that case, a sequence of one-byte codes has a slightly different
 578    form.
 579
 580    Firstly, all characters in eight-bit-control are represented by
 581    one-byte sequences which are their 8-bit code.
 582
 583    Next, character composition data are represented by the byte
 584    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 585    where,
 586         METHOD is 0xF0 plus one of composition method (enum
 587         composition_method),
 588
 589         BYTES is 0xA0 plus the byte length of these composition data,
 590
 591         CHARS is 0xA0 plus the number of characters composed by these
 592         data,
 593
 594         COMPONENTs are characters of multibyte form or composition
  595         rules encoded as two bytes of ASCII codes.
 596
 597    In addition, for backward compatibility, the following formats are
 598    also recognized as composition data on decoding.
 599
 600    0x80 MSEQ ...
 601    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 602
 603    Here,
  604         MSEQ is a multibyte form but in this special format:
 605           ASCII: 0xA0 ASCII_CODE+0x80,
 606           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 607         RULE is a one byte code of the range 0xA0..0xF0 that
 608         represents a composition rule.
 609   */
  610
     /* Table of 256 enum emacs_code_class_type entries.
        NOTE(review): presumably indexed by byte value to classify the
        bytes of emacs-mule text; the code that fills and reads it is not
        in view -- confirm.  */
  611 enum emacs_code_class_type emacs_code_class[256];
 612
  613 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
  614    Check if a text is encoded in Emacs' internal format.  If it is,
  615    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
        The text between SRC and SRC_END is scanned byte by byte.  It is
        rejected (0) as soon as an ESC, SI or SO code is seen, or a byte
        in 0x81..0x9F does not begin a sequence accepted by
        UNIBYTE_STR_AS_MULTIBYTE_P.  Running out of input at either
        ONE_MORE_BYTE_CHECK_MULTIBYTE call ends the scan with the
        positive answer.  MULTIBYTEP is handed to that macro unchanged.  */
  616
  617 static int
  618 detect_coding_emacs_mule (src, src_end, multibytep)
  619       unsigned char *src, *src_end;
  620       int multibytep;
  621 {
  622   unsigned char c;
       /* Nonzero while scanning the bytes that follow a 0x80 byte.  */
  623   int composing = 0;
  624   /* ONE_MORE_BYTE_CHECK_MULTIBYTE stores into coding->result and
          expects a variable named `coding', so give it a scratch
          structure to write to; the stored value is never read here.  */
  625   struct coding_system dummy_coding;
  626   struct coding_system *coding = &dummy_coding;
  627
  628   while (1)
  629     {
  630       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
  631
           /* Inside a composition sequence the byte is transformed before
              the checks below: a byte below 0xA0 ends the sequence, 0xA0
              stands for the next byte with its high bit cleared, and any
              other byte is lowered by 0x20.  */
  632       if (composing)
  633         {
  634           if (c < 0xA0)
  635             composing = 0;
  636           else if (c == 0xA0)
  637             {
  638               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
  639               c &= 0x7F;
  640             }
  641           else
  642             c -= 0x20;
  643         }
  644
  645       if (c < 0x20)
  646         {
               /* These three control codes disqualify the text.  */
  647           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
  648             return 0;
  649         }
  650       else if (c >= 0x80 && c < 0xA0)
  651         {
  652           if (c == 0x80)
  653             /* Old leading code for a composite character.  */
  654             composing = 1;
  655           else
  656             {
                   /* Re-examine the sequence starting at the byte just
                      before the read position; if it is acceptable, skip
                      all of its bytes, otherwise give up.
                      NOTE(review): when the macro consumed two bytes for
                      C (MULTIBYTEP case) or C was rewritten by the
                      composing branch above, `src - 1' is not literally
                      the byte held in C -- confirm this is intended.  */
  657               unsigned char *src_base = src - 1;
  658               int bytes;
  659
  660               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
  661                                                bytes))
  662                 return 0;
  663               src = src_base + bytes;
  664             }
  665         }
  666     }
       /* Reached only through the `goto' inside
          ONE_MORE_BYTE_CHECK_MULTIBYTE, i.e. when the input ran out.  */
  667  label_end_of_loop:
  668   return CODING_CATEGORY_MASK_EMACS_MULE;
  669 }
 670
 671
  672 /* Record the starting position START and METHOD of one composition.
        A four-int header is opened at the current end of
        CODING->cmp_data->data: slot 0 is set to -1 for now, slot 1 gets
        START plus cmp_data->char_offset, slot 3 gets METHOD, and slot 2
        is not written here (CODING_ADD_COMPOSITION_END stores the end
        position there and the real length in slot 0).  The index of the
        header is remembered in CODING->cmp_data_start.  The macro
        declares block-local variables `cmp_data' and `data'.  */
  673
  674 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
  675   do {                                                          \
  676     struct composition_data *cmp_data = coding->cmp_data;       \
  677     int *data = cmp_data->data + cmp_data->used;                \
  678     coding->cmp_data_start = cmp_data->used;                    \
  679     data[0] = -1;               /* Length not known yet.  */    \
  680     data[1] = cmp_data->char_offset + start;                    \
  681     data[3] = (int) method;                                     \
  682     cmp_data->used += 4;        /* Header takes four ints.  */  \
  683   } while (0)
 684
  685 /* Record the ending position END of the current composition.
        This completes the header found at index CODING->cmp_data_start
        in CODING->cmp_data->data: slot 0 becomes the number of ints used
        since that index (the header included), and slot 2 becomes END
        plus cmp_data->char_offset.  The macro declares block-local
        variables `cmp_data' and `data'.  */
  686
  687 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
  688   do {                                                          \
  689     struct composition_data *cmp_data = coding->cmp_data;       \
  690     int *data = cmp_data->data + coding->cmp_data_start;        \
  691     data[0] = cmp_data->used - coding->cmp_data_start;          \
  692     data[2] = cmp_data->char_offset + end;                      \
  693   } while (0)
 694
  695 /* Record one COMPONENT (alternate character or composition rule).
        COMPONENT is appended to CODING->cmp_data->data.  If that brings
        the current composition's data (counted from
        CODING->cmp_data_start) to COMPOSITION_DATA_MAX_BUNCH_LENGTH
        ints, the composition is closed at once, using
        CODING->produced_char as its end position, and CODING->composing
        is reset to COMPOSITION_NO.  */
  696
  697 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
  698   do {                                                                  \
  699     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
  700     if (coding->cmp_data->used - coding->cmp_data_start                 \
  701         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
  702       {                                                                 \
  703         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
  704         coding->composing = COMPOSITION_NO;                             \
  705       }                                                                 \
  706   } while (0)
 707
 708
 709 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 710    is not less than SRC_END, return -1 without incrementing Src.  */
 711
 712 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 713
 714
 715 /* Decode a character represented as a component of composition
 716    sequence of Emacs 20 style at SRC.  Set C to that character, store
 717    its multibyte form sequence at P, and set P to the end of that
 718    sequence.  If no valid character is found, set C to -1.
        The first byte selects one of these cases:
        - the source is exhausted: C is left negative;
        - the byte satisfies CHAR_HEAD_P: C is set to -1;
        - the byte is 0xA0: the next byte must be >= 0xA0; that byte
          minus 0xA0 is stored at P as a one-byte form and becomes C;
        - the byte B is such that B - 0x20 is a base leading code:
          B - 0x20 is stored as the leading code, the following bytes
          (as many as BYTES_BY_CHAR_HEAD requires, or fewer if the source
          ends) are copied as is, and the stored sequence is validated
          before being converted with STRING_CHAR;
        - anything else: C is set to -1.
        When C ends up negative, bytes may already have been consumed
        from SRC and stored at P.  The macro uses `coding', `src' and
        `src_end' of the place where it is expanded.  */
 719
 720 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 721   do {                                                          \
 722     int bytes;                                                  \
 723                                                                 \
 724     c = SAFE_ONE_MORE_BYTE ();                                  \
 725     if (c < 0)                                                  \
 726       break;                                                    \
 727     if (CHAR_HEAD_P (c))                                        \
 728       c = -1;                                                   \
 729     else if (c == 0xA0)                                         \
 730       {                                                         \
 731         c = SAFE_ONE_MORE_BYTE ();                              \
 732         if (c < 0xA0)                                           \
 733           c = -1;                                               \
 734         else                                                    \
 735           {                                                     \
 736             c -= 0xA0;                                          \
 737             *p++ = c;                                           \
 738           }                                                     \
 739       }                                                         \
 740     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 741       {                                                         \
 742         unsigned char *p0 = p;                                  \
 743                                                                 \
 744         c -= 0x20;                                              \
 745         *p++ = c;                                               \
 746         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 747         while (--bytes)                                         \
 748           {                                                     \
 749             c = SAFE_ONE_MORE_BYTE ();                          \
 750             if (c < 0)                                          \
 751               break;                                            \
 752             *p++ = c;                                           \
 753           }                                                     \
 754         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 755             || (coding->flags /* We are recovering a file.  */  \
 756                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 757                 && ! CHAR_HEAD_P (p0[1])))                      \
 758           c = STRING_CHAR (p0, bytes);                          \
 759         else                                                    \
 760           c = -1;                                               \
 761       }                                                         \
 762     else                                                        \
 763       c = -1;                                                   \
 764   } while (0)
 765
 766
 767 /* Decode a composition rule represented as a component of composition
 768    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 769    valid rule is found, set C to -1.
        The rule is one byte whose value minus 0xA0 must lie in 0..80;
        the quotient and remainder of that value divided by 9 are stored
        in `gref' and `nref' (variables of the expansion site) and
        combined with COMPOSITION_ENCODE_RULE.  An exhausted source also
        yields -1.  */
 770
 771 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 772   do {                                                  \
 773     c = SAFE_ONE_MORE_BYTE ();                          \
 774     c -= 0xA0;                                          \
 775     if (c < 0 || c >= 81)                               \
 776       c = -1;                                           \
 777     else                                                \
 778       {                                                 \
 779         gref = c / 9, nref = c % 9;                     \
 780         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 781       }                                                 \
 782   } while (0)
 783
 784
 785 /* Decode composition sequence encoded by `emacs-mule' at the source
 786    pointed by SRC.  SRC_END is the end of source.  Store information
 787    of the composition in CODING->cmp_data.
 788
 789    For backward compatibility, decode also a composition sequence of
 790    Emacs 20 style.  In that case, the composition sequence contains
 791    characters that should be extracted into a buffer or string.  Store
 792    those characters at *DESTINATION in multibyte form.
 793
 794    If we encounter an invalid byte sequence, return 0.
 795    If we encounter an insufficient source or destination, or
 796    insufficient space in CODING->cmp_data, return -1.
 797    Otherwise, return consumed bytes in the source.
 798
        SRC must point at the introducing byte (the caller checks for
        0x80).  The current-style header that follows it is
        0xF0+METHOD, 0xA0+LENGTH, 0xA0+NCHARS, where LENGTH counts bytes
        from SRC; any other byte after the introducer is tried as an
        Emacs 20 style sequence.  When insufficient space in
        CODING->cmp_data is detected, CODING->result is set to
        CODING_FINISH_INSUFFICIENT_CMP.
 799 */
 800 static INLINE int
 801 decode_composition_emacs_mule (coding, src, src_end,
 802                                destination, dst_end, dst_bytes)
 803      struct coding_system *coding;
 804      const unsigned char *src, *src_end;
 805      unsigned char **destination, *dst_end;
 806      int dst_bytes;
 807 {
 808   unsigned char *dst = *destination;
 809   int method, data_len, nchars;
 810   const unsigned char *src_base = src++;
 811   /* Store components of composition.  */
 812   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 813   int ncomponent;
 814   /* Store multibyte form of characters to be composed.  This is for
 815      Emacs 20 style composition sequence.  */
 816   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 817   unsigned char *bufp = buf;
 818   int c, i, gref, nref;
 819
       /* Refuse to start unless an entry of maximum length is certain to
          fit in the current composition data block.  */
 820   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 821       >= COMPOSITION_DATA_SIZE)
 822     {
 823       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 824       return -1;
 825     }
 826
       /* NOTE(review): ONE_MORE_BYTE is defined elsewhere; presumably it
          jumps to label_end_of_loop when the source is exhausted.  */
 827   ONE_MORE_BYTE (c);
 828   if (c - 0xF0 >= COMPOSITION_RELATIVE
 829            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 830     {
 831       int with_rule;
 832
 833       method = c - 0xF0;
 834       with_rule = (method == COMPOSITION_WITH_RULE
 835                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 836       ONE_MORE_BYTE (c);
 837       data_len = c - 0xA0;
 838       if (data_len < 4
 839           || src_base + data_len > src_end)
 840         return 0;
 841       ONE_MORE_BYTE (c);
 842       nchars = c - 0xA0;
           /* The decoded count, not the raw byte, must be positive: any
              byte below 0xA1 would otherwise give a zero or negative
              number of composed characters.  */
 843       if (nchars < 1)
 844         return 0;
 845       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 846         {
 847           /* If it is longer than this, it can't be valid.  */
 848           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 849             return 0;
 850
               /* With rules, odd-numbered components are rules encoded
                  as two bytes, each offset by 32.  */
 851           if (ncomponent % 2 && with_rule)
 852             {
 853               ONE_MORE_BYTE (gref);
 854               gref -= 32;
 855               ONE_MORE_BYTE (nref);
 856               nref -= 32;
 857               c = COMPOSITION_ENCODE_RULE (gref, nref);
 858             }
 859           else
 860             {
 861               int bytes;
 862               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 863                   || (coding->flags /* We are recovering a file.  */
 864                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 865                       && ! CHAR_HEAD_P (src[1])))
 866                 c = STRING_CHAR (src, bytes);
 867               else
 868                 c = *src, bytes = 1;
 869               src += bytes;
 870             }
 871           component[ncomponent] = c;
 872         }
 873     }
 874   else
 875     {
 876       /* This may be an old Emacs 20 style format.  See the comment at
 877          the section 2 of this file.  */
           /* Find the end of the sequence: the next byte that is a
              character head.  If the source ends first and more blocks
              may follow, ask for more source.  */
 878       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 879       if (src == src_end
 880           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 881         goto label_end_of_loop;
 882
           /* Restrict decoding to the sequence just delimited and restart
              right after the introducing byte.  */
 883       src_end = src;
 884       src = src_base + 1;
 885       if (c < 0xC0)
 886         {
 887           method = COMPOSITION_RELATIVE;
 888           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 889             {
 890               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 891               if (c < 0)
 892                 break;
 893               component[ncomponent++] = c;
 894             }
 895           if (ncomponent < 2)
 896             return 0;
 897           nchars = ncomponent;
 898         }
 899       else if (c == 0xFF)
 900         {
 901           method = COMPOSITION_WITH_RULE;
               /* Skip the 0xFF marker itself.  */
 902           src++;
 903           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 904           if (c < 0)
 905             return 0;
 906           component[0] = c;
 907           for (ncomponent = 1;
 908                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 909             {
 910               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 911               if (c < 0)
 912                 break;
 913               component[ncomponent++] = c;
 914               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 915               if (c < 0)
 916                 break;
 917               component[ncomponent++] = c;
 918             }
 919           if (ncomponent < 3)
 920             return 0;
 921           nchars = (ncomponent + 1) / 2;
 922         }
 923       else
 924         return 0;
 925     }
 926
       /* BUF is non-empty only for the Emacs 20 style; in that case the
          extracted characters must fit in the destination.  */
 927   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 928     {
 929       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 930       for (i = 0; i < ncomponent; i++)
 931         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 932       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 933       if (buf < bufp)
 934         {
 935           unsigned char *p = buf;
 936           EMIT_BYTES (p, bufp);
 937           *destination += bufp - buf;
 938           coding->produced_char += nchars;
 939         }
 940       return (src - src_base);
 941     }
 942  label_end_of_loop:
 943   return -1;
 944 }
 945
 946 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
     /* Decode SRC_BYTES bytes of emacs-mule text at SOURCE into
        DESTINATION.  End-of-line sequences are converted according to
        CODING->eol_type, composition data introduced by 0x80 is handed
        to decode_composition_emacs_mule when CODING->cmp_data is set,
        valid multibyte sequences are copied as is, and a byte that does
        not start a valid sequence is converted on its own with
        CHAR_STRING.  On return CODING->consumed, CODING->consumed_char,
        CODING->produced and CODING->produced_char are updated;
        CODING->result is set when the loop stops for an inconsistent
        end-of-line or for lack of destination space.  */
 947
 948 static void
 949 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 950      struct coding_system *coding;
 951      const unsigned char *source;
 952      unsigned char *destination;
 953      int src_bytes, dst_bytes;
 954 {
 955   const unsigned char *src = source;
 956   const unsigned char *src_end = source + src_bytes;
 957   unsigned char *dst = destination;
 958   unsigned char *dst_end = destination + dst_bytes;
 959   /* SRC_BASE remembers the start position in source in each loop.
 960      The loop will be exited when there's not enough source code, or
 961      when there's not enough destination area to produce a
 962      character.  */
 963   const unsigned char *src_base;
 964
 965   coding->produced_char = 0;
 966   while ((src_base = src) < src_end)
 967     {
 968       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 969       const unsigned char *p;
 970       int bytes;
 971
 972       if (*src == '\r')
 973         {
 974           int c = *src++;
 975
 976           if (coding->eol_type == CODING_EOL_CR)
 977             c = '\n';
 978           else if (coding->eol_type == CODING_EOL_CRLF)
 979             {
                   /* Only a CR followed by LF is an end of line; otherwise
                      put the byte back and keep the CR.  */
 980               ONE_MORE_BYTE (c);
 981               if (c != '\n')
 982                 {
 983                   src--;
 984                   c = '\r';
 985                 }
 986             }
 987           *dst++ = c;
 988           coding->produced_char++;
 989           continue;
 990         }
 991       else if (*src == '\n')
 992         {
 993           if ((coding->eol_type == CODING_EOL_CR
 994                || coding->eol_type == CODING_EOL_CRLF)
 995               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 996             {
 997               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 998               goto label_end_of_loop;
 999             }
1000           *dst++ = *src++;
1001           coding->produced_char++;
1002           continue;
1003         }
1004       else if (*src == 0x80 && coding->cmp_data)
1005         {
1006           /* Start of composition data.  */
1007           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1008                                                          &dst, dst_end,
1009                                                          dst_bytes);
               /* Negative: stop and wait for more source or space.
                  Positive: that many source bytes were handled.
                  Zero: invalid sequence; emit the byte as a character.  */
1010           if (consumed < 0)
1011             goto label_end_of_loop;
1012           else if (consumed > 0)
1013             {
1014               src += consumed;
1015               continue;
1016             }
1017           bytes = CHAR_STRING (*src, tmp);
1018           p = tmp;
1019           src++;
1020         }
1021       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1022                || (coding->flags /* We are recovering a file.  */
1023                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1024                    && ! CHAR_HEAD_P (src[1])))
1025         {
1026           p = src;
1027           src += bytes;
1028         }
1029       else
1030         {
1031           int i, c;
1032
               /* Not a valid multibyte form.  Scan as many bytes as the
                  leading byte announces; if a character head interrupts
                  the sequence, convert only the first byte, otherwise
                  copy the scanned bytes unchanged.  */
1033           bytes = BYTES_BY_CHAR_HEAD (*src);
1034           src++;
1035           for (i = 1; i < bytes; i++)
1036             {
1037               ONE_MORE_BYTE (c);
1038               if (CHAR_HEAD_P (c))
1039                 break;
1040             }
1041           if (i < bytes)
1042             {
1043               bytes = CHAR_STRING (*src_base, tmp);
1044               p = tmp;
1045               src = src_base + 1;
1046             }
1047           else
1048             {
1049               p = src_base;
1050             }
1051         }
           /* When DST_BYTES is zero the limit is SRC instead of DST_END.
              NOTE(review): presumably that is the in-place conversion
              case -- confirm against the callers.  */
1052       if (dst + bytes >= (dst_bytes ? dst_end : src))
1053         {
1054           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1055           break;
1056         }
1057       while (bytes--) *dst++ = *p++;
1058       coding->produced_char++;
1059     }
1060  label_end_of_loop:
       /* Report progress up to the start of the last unfinished unit.  */
1061   coding->consumed = coding->consumed_char = src_base - source;
1062   coding->produced = dst - destination;
1063 }
1064
1065
1066 /* Encode composition data stored at DATA into a special byte sequence
1067    starting by 0x80.  Update CODING->cmp_data_start and maybe
1068    CODING->cmp_data for the next call.
        DATA[0] is the length of the entry, DATA[1] and DATA[2] delimit
        the composed characters, DATA[3] is the method and the components
        start at DATA[4].  The emitted bytes are
          0x80, 0xF0+METHOD, 0xA0+BYTES, 0xA0+NCHARS, COMPONENT...
        where BYTES is the length of the whole sequence and NCHARS is
        DATA[2] - DATA[1].  For the two methods with rules, each rule
        between components is emitted as the two bytes 0x20+GREF and
        0x20+NREF.
        The macro uses `dst', `dst_end', `dst_bytes' and `src' of the
        expansion site; if the destination is too small it sets
        CODING->result to CODING_FINISH_INSUFFICIENT_DST and jumps to
        `label_end_of_loop' there.  */
1069
1070 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1071   do {                                                                  \
1072     unsigned char buf[1024], *p0 = buf, *p;                             \
1073     int len = data[0];                                                  \
1074     int i;                                                              \
1075                                                                         \
1076     buf[0] = 0x80;                                                      \
1077     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1078     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1079     p = buf + 4;                                                        \
1080     if (data[3] == COMPOSITION_WITH_RULE                                \
1081         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1082       {                                                                 \
1083         p += CHAR_STRING (data[4], p);                                  \
1084         for (i = 5; i < len; i += 2)                                    \
1085           {                                                             \
1086             int gref, nref;                                             \
1087              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1088             *p++ = 0x20 + gref;                                         \
1089             *p++ = 0x20 + nref;                                         \
1090             p += CHAR_STRING (data[i + 1], p);                          \
1091           }                                                             \
1092       }                                                                 \
1093     else                                                                \
1094       {                                                                 \
1095         for (i = 4; i < len; i++)                                       \
1096           p += CHAR_STRING (data[i], p);                                \
1097       }                                                                 \
1098     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1099                                                                         \
1100     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1101       {                                                                 \
1102         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1103         goto label_end_of_loop;                                         \
1104       }                                                                 \
1105     while (p0 < p)                                                      \
1106       *dst++ = *p0++;                                                   \
1107     coding->cmp_data_start += data[0];                                  \
1108     if (coding->cmp_data_start == coding->cmp_data->used                \
1109         && coding->cmp_data->next)                                      \
1110       {                                                                 \
1111         coding->cmp_data = coding->cmp_data->next;                      \
1112         coding->cmp_data_start = 0;                                     \
1113       }                                                                 \
1114   } while (0)
1115
1116
1117 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1118                             unsigned char *, int, int));
1119
     /* Encode SRC_BYTES bytes at SOURCE into emacs-mule at DESTINATION.
        If CODING carries no composition data the work is delegated to
        encode_eol.  Otherwise the text is copied character by character:
        composition data is emitted (ENCODE_COMPOSITION_EMACS_MULE) just
        before the character at which a composition starts, newlines are
        converted according to CODING->eol_type, and other characters are
        written in their existing byte form.  On return CODING->consumed,
        CODING->produced and CODING->produced_char are updated, and
        CODING->consumed_char has been advanced by the number of
        characters encoded.  */
1120 static void
1121 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1122      struct coding_system *coding;
1123      const unsigned char *source;
1124      unsigned char *destination;
1125      int src_bytes, dst_bytes;
1126 {
1127   const unsigned char *src = source;
1128   const unsigned char *src_end = source + src_bytes;
1129   unsigned char *dst = destination;
1130   unsigned char *dst_end = destination + dst_bytes;
1131   const unsigned char *src_base;
1132   int c;
1133   int char_offset;
1134   int *data;
1135
       /* NOTE(review): not referenced directly below; presumably required
          by the ONE_MORE_CHAR macro -- confirm at its definition.  */
1136   Lisp_Object translation_table;
1137
1138   translation_table = Qnil;
1139
1140   /* Optimization for the case that there's no composition.  */
1141   if (!coding->cmp_data || coding->cmp_data->used == 0)
1142     {
1143       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1144       return;
1145     }
1146
1147   char_offset = coding->cmp_data->char_offset;
1148   data = coding->cmp_data->data + coding->cmp_data_start;
       /* The loop has no exit test of its own; it is left through
          label_end_of_loop (explicitly in ENCODE_COMPOSITION_EMACS_MULE,
          and presumably from ONE_MORE_CHAR and the EMIT_* macros).  */
1149   while (1)
1150     {
1151       src_base = src;
1152
1153       /* If SRC starts a composition, encode the information about the
1154          composition in advance.  */
1155       if (coding->cmp_data_start < coding->cmp_data->used
1156           && char_offset + coding->consumed_char == data[1])
1157         {
1158           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
               /* The macro may have moved on to the next data block, so
                  reload the values derived from it.  */
1159           char_offset = coding->cmp_data->char_offset;
1160           data = coding->cmp_data->data + coding->cmp_data_start;
1161         }
1162
1163       ONE_MORE_CHAR (c);
1164       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1165                         || coding->eol_type == CODING_EOL_CR))
1166         {
1167           if (coding->eol_type == CODING_EOL_CRLF)
1168             EMIT_TWO_BYTES ('\r', c);
1169           else
1170             EMIT_ONE_BYTE ('\r');
1171         }
1172       else if (SINGLE_BYTE_CHAR_P (c))
1173         {
1174           if (coding->flags && ! ASCII_BYTE_P (c))
1175             {
1176               /* As we are auto saving, retain the multibyte form for
1177                  8-bit chars.  */
1178               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1179               int bytes = CHAR_STRING (c, buf);
1180
1181               if (bytes == 1)
1182                 EMIT_ONE_BYTE (buf[0]);
1183               else
1184                 EMIT_TWO_BYTES (buf[0], buf[1]);
1185             }
1186           else
1187             EMIT_ONE_BYTE (c);
1188         }
1189       else
             /* Copy the character's source bytes unchanged.  */
1190         EMIT_BYTES (src_base, src);
1191       coding->consumed_char++;
1192     }
1193  label_end_of_loop:
1194   coding->consumed = src_base - source;
1195   coding->produced = coding->produced_char = dst - destination;
1196   return;
1197 }
1198
1199 \f
1200 /*** 3. ISO2022 handlers ***/
1201
1202 /* The following note describes the coding system ISO2022 briefly.
1203    Since the intention of this note is to help understand the
1204    functions in this file, some parts are NOT ACCURATE or are OVERLY
1205    SIMPLIFIED.  For thorough understanding, please refer to the
1206    original document of ISO2022.  This is equivalent to the standard
1207    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1208
1209    ISO2022 provides many mechanisms to encode several character sets
1210    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1211    is encoded using bytes less than 128.  This may make the encoded
1212    text a little bit longer, but the text passes more easily through
1213    several types of gateway, some of which strip off the MSB (Most
1214    Significant Bit).
1215
1216    There are two kinds of character sets: control character sets and
1217    graphic character sets.  The former contain control characters such
1218    as `newline' and `escape' to provide control functions (control
1219    functions are also provided by escape sequences).  The latter
1220    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1221    two control character sets and many graphic character sets.
1222
1223    Graphic character sets are classified into one of the following
1224    four classes, according to the number of bytes (DIMENSION) and
1225    number of characters in one dimension (CHARS) of the set:
1226    - DIMENSION1_CHARS94
1227    - DIMENSION1_CHARS96
1228    - DIMENSION2_CHARS94
1229    - DIMENSION2_CHARS96
1230
1231    In addition, each character set is assigned an identification tag,
1232    unique for each set, called the "final character" (denoted as <F>
1233    hereafter).  The <F> of each character set is decided by ECMA(*)
1234    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1235    (0x30..0x3F are for private use only).
1236
1237    Note (*): ECMA = European Computer Manufacturers Association
1238
1239    Here are examples of graphic character sets [NAME(<F>)]:
1240         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1241         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1242         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1243         o DIMENSION2_CHARS96 -- none for the moment
1244
1245    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1246         C0 [0x00..0x1F] -- control character plane 0
1247         GL [0x20..0x7F] -- graphic character plane 0
1248         C1 [0x80..0x9F] -- control character plane 1
1249         GR [0xA0..0xFF] -- graphic character plane 1
1250
1251    A control character set is directly designated and invoked to C0 or
1252    C1 by an escape sequence.  The most common case is that:
1253    - ISO646's  control character set is designated/invoked to C0, and
1254    - ISO6429's control character set is designated/invoked to C1,
1255    and usually these designations/invocations are omitted in encoded
1256    text.  In a 7-bit environment, only C0 can be used, and a control
1257    character for C1 is encoded by an appropriate escape sequence to
1258    fit into the environment.  All control characters for C1 are
1259    defined to have corresponding escape sequences.
1260
1261    A graphic character set is at first designated to one of four
1262    graphic registers (G0 through G3), then these graphic registers are
1263    invoked to GL or GR.  These designations and invocations can be
1264    done independently.  The most common case is that G0 is invoked to
1265    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1266    these invocations and designations are omitted in encoded text.
1267    In a 7-bit environment, only GL can be used.
1268
1269    When a graphic character set of CHARS94 is invoked to GL, codes
1270    0x20 and 0x7F of the GL area work as control characters SPACE and
1271    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1272    be used.
1273
1274    There are two ways of invocation: locking-shift and single-shift.
1275    With locking-shift, the invocation lasts until the next different
1276    invocation, whereas with single-shift, the invocation affects the
1277    following character only and doesn't affect the locking-shift
1278    state.  Invocations are done by the following control characters or
1279    escape sequences:
1280
1281    ----------------------------------------------------------------------
1282    abbrev  function                  cntrl escape seq   description
1283    ----------------------------------------------------------------------
1284    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1285    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1286    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1287    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1288    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1289    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1290    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1291    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1292    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1293    ----------------------------------------------------------------------
1294    (*) These are not used by any known coding system.
1295
1296    Control characters for these functions are defined by macros
1297    ISO_CODE_XXX in `coding.h'.
1298
1299    Designations are done by the following escape sequences:
1300    ----------------------------------------------------------------------
1301    escape sequence      description
1302    ----------------------------------------------------------------------
1303    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1304    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1305    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1306    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1307    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1308    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1309    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1310    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1311    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1312    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1313    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1314    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1315    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1316    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1317    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1318    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1319    ----------------------------------------------------------------------
1320
1321    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1322    of dimension 1, chars 94, and final character <F>, etc...
1323
1324    Note (*): Although these designations are not allowed in ISO2022,
1325    Emacs accepts them on decoding, and produces them on encoding
1326    CHARS96 character sets in a coding system which is characterized as
1327    7-bit environment, non-locking-shift, and non-single-shift.
1328
1329    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1330    '(' can be omitted.  We refer to this as "short-form" hereafter.
1331
1332    Now you may notice that there are a lot of ways of encoding the
1333    same multilingual text in ISO2022.  Actually, there exist many
1334    coding systems such as Compound Text (used in X11's inter client
1335    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1336    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1337    localized platforms), and all of these are variants of ISO2022.
1338
1339    In addition to the above, Emacs handles two more kinds of escape
1340    sequences: ISO6429's direction specification and Emacs' private
1341    sequence for specifying character composition.
1342
1343    ISO6429's direction specification takes the following form:
1344         o CSI ']'      -- end of the current direction
1345         o CSI '0' ']'  -- end of the current direction
1346         o CSI '1' ']'  -- start of left-to-right text
1347         o CSI '2' ']'  -- start of right-to-left text
1348    The control character CSI (0x9B: control sequence introducer) is
1349    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1350
1351    Character composition specification takes the following form:
1352         o ESC '0' -- start relative composition
1353         o ESC '1' -- end composition
1354         o ESC '2' -- start rule-base composition (*)
1355         o ESC '3' -- start relative composition with alternate chars  (**)
1356         o ESC '4' -- start rule-base composition with alternate chars  (**)
1357   Since these are not standard escape sequences of any ISO standard,
1358   the use of them with these meanings is restricted to Emacs only.
1359
1360   (*) This form is used only in Emacs 20.5 and older versions,
1361   but the newer versions can safely decode it.
1362   (**) This form is used only in Emacs 21.1 and newer versions,
1363   and the older versions can't decode it.
1364
1365   Here's a list of example usages of these composition escape
1366   sequences (categorized by `enum composition_method').
1367
1368   COMPOSITION_RELATIVE:
1369         ESC 0 CHAR [ CHAR ] ESC 1
1370   COMPOSITION_WITH_RULE:
1371         ESC 2 CHAR [ RULE CHAR ] ESC 1
1372   COMPOSITION_WITH_ALTCHARS:
1373         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1374   COMPOSITION_WITH_RULE_ALTCHARS:
1375         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1376
     /* Table of ISO code classes with 256 entries.
        NOTE(review): presumably indexed by byte value -- confirm against
        the code that fills and reads it.  */
1377 enum iso_code_class_type iso_code_class[256];
1378
     /* Non-zero if the coding system at coding_system_table[IDX] exists,
        C is acceptable to it (CHARSET is CHARSET_ASCII, or C satisfies
        CODING_SAFE_CHAR_P for that coding system's safe chars), and the
        coding system has a requested designation for CHARSET.
        As a side effect the variable `safe_chars' of the expansion site
        is assigned whenever CHARSET is not CHARSET_ASCII.  */
1379 #define CHARSET_OK(idx, charset, c)                                     \
1380   (coding_system_table[idx]                                             \
1381    && (charset == CHARSET_ASCII                                         \
1382        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1383            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1384    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1385                                               charset)                  \
1386        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1387
     /* Non-zero if the coding system at coding_system_table[IDX] has a
        non-negative initial designation for register 1.
        NOTE(review): presumably register 1 is the graphic register G1
        that shift-out invokes -- confirm in coding.h.  */
1388 #define SHIFT_OUT_OK(idx) \
1389   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1390
     /* Non-zero unless composition is disabled for the coding system at
        coding_system_table[IDX].  Unlike CHARSET_OK, this does not check
        that the table entry is non-null.  */
1391 #define COMPOSITION_OK(idx)     \
1392   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1393
1394 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1395    Check if a text is encoded in ISO2022.  If it is, return an
1396    integer in which appropriate flag bits any of:
1397         CODING_CATEGORY_MASK_ISO_7
1398         CODING_CATEGORY_MASK_ISO_7_TIGHT
1399         CODING_CATEGORY_MASK_ISO_8_1
1400         CODING_CATEGORY_MASK_ISO_8_2
1401         CODING_CATEGORY_MASK_ISO_7_ELSE
1402         CODING_CATEGORY_MASK_ISO_8_ELSE
1403    are set.  If a code which should never appear in ISO2022 is found,
1404    returns 0.  */
1405
1406 static int
1407 detect_coding_iso2022 (src, src_end, multibytep)
1408      unsigned char *src, *src_end;
1409      int multibytep;
1410 {
1411   int mask = CODING_CATEGORY_MASK_ISO;
1412   int mask_found = 0;
1413   int reg[4], shift_out = 0, single_shifting = 0;
1414   int c, c1, charset;
1415   /* Dummy for ONE_MORE_BYTE.  */
1416   struct coding_system dummy_coding;
1417   struct coding_system *coding = &dummy_coding;
1418   Lisp_Object safe_chars;
1419
1420   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1421   while (mask && src < src_end)
1422     {
1423       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1424     retry:
1425       switch (c)
1426         {
1427         case ISO_CODE_ESC:
1428           if (inhibit_iso_escape_detection)
1429             break;
1430           single_shifting = 0;
1431           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1432           if (c >= '(' && c <= '/')
1433             {
1434               /* Designation sequence for a charset of dimension 1.  */
1435               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1436               if (c1 < ' ' || c1 >= 0x80
1437                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1438                 /* Invalid designation sequence.  Just ignore.  */
1439                 break;
1440               reg[(c - '(') % 4] = charset;
1441             }
1442           else if (c == '$')
1443             {
1444               /* Designation sequence for a charset of dimension 2.  */
1445               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1446               if (c >= '@' && c <= 'B')
1447                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1448                 reg[0] = charset = iso_charset_table[1][0][c];
1449               else if (c >= '(' && c <= '/')
1450                 {
1451                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1452                   if (c1 < ' ' || c1 >= 0x80
1453                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1454                     /* Invalid designation sequence.  Just ignore.  */
1455                     break;
1456                   reg[(c - '(') % 4] = charset;
1457                 }
1458               else
1459                 /* Invalid designation sequence.  Just ignore.  */
1460                 break;
1461             }
1462           else if (c == 'N' || c == 'O')
1463             {
1464               /* ESC <Fe> for SS2 or SS3.  */
1465               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1466               break;
1467             }
1468           else if (c >= '0' && c <= '4')
1469             {
1470               /* ESC <Fp> for start/end composition.  */
1471               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1472                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1473               else
1474                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1475               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1476                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1477               else
1478                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1479               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1480                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1481               else
1482                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1483               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1484                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1485               else
1486                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1487               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1488                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1489               else
1490                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1491               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1492                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1493               else
1494                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1495               break;
1496             }
1497           else
1498             /* Invalid escape sequence.  Just ignore.  */
1499             break;
1500
1501           /* We found a valid designation sequence for CHARSET.  */
1502           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1503           c = MAKE_CHAR (charset, 0, 0);
1504           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1505             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1506           else
1507             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1508           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1509             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1510           else
1511             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1512           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1513             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1514           else
1515             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1516           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1517             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1518           else
1519             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1520           break;
1521
1522         case ISO_CODE_SO:
1523           if (inhibit_iso_escape_detection)
1524             break;
1525           single_shifting = 0;
1526           if (shift_out == 0
1527               && (reg[1] >= 0
1528                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1529                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1530             {
1531               /* Locking shift out.  */
1532               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1533               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1534             }
1535           break;
1536
1537         case ISO_CODE_SI:
1538           if (inhibit_iso_escape_detection)
1539             break;
1540           single_shifting = 0;
1541           if (shift_out == 1)
1542             {
1543               /* Locking shift in.  */
1544               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1545               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1546             }
1547           break;
1548
1549         case ISO_CODE_CSI:
1550           single_shifting = 0;
1551         case ISO_CODE_SS2:
1552         case ISO_CODE_SS3:
1553           {
1554             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1555
1556             if (inhibit_iso_escape_detection)
1557               break;
1558             if (c != ISO_CODE_CSI)
1559               {
1560                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1561                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1562                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1563                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1564                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1565                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1566                 single_shifting = 1;
1567               }
1568             if (VECTORP (Vlatin_extra_code_table)
1569                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1570               {
1571                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1572                     & CODING_FLAG_ISO_LATIN_EXTRA)
1573                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1574                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1575                     & CODING_FLAG_ISO_LATIN_EXTRA)
1576                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1577               }
1578             mask &= newmask;
1579             mask_found |= newmask;
1580           }
1581           break;
1582
1583         default:
1584           if (c < 0x80)
1585             {
1586               single_shifting = 0;
1587               break;
1588             }
1589           else if (c < 0xA0)
1590             {
1591               single_shifting = 0;
1592               if (VECTORP (Vlatin_extra_code_table)
1593                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1594                 {
1595                   int newmask = 0;
1596
1597                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1598                       & CODING_FLAG_ISO_LATIN_EXTRA)
1599                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1600                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1601                       & CODING_FLAG_ISO_LATIN_EXTRA)
1602                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1603                   mask &= newmask;
1604                   mask_found |= newmask;
1605                 }
1606               else
1607                 return 0;
1608             }
1609           else
1610             {
1611               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1612                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1613               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1614               /* Check the length of succeeding codes of the range
1615                  0xA0..0FF.  If the byte length is odd, we exclude
1616                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1617                  when we are not single shifting.  */
1618               if (!single_shifting
1619                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1620                 {
1621                   int i = 1;
1622
1623                   c = -1;
1624                   while (src < src_end)
1625                     {
1626                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1627                       if (c < 0xA0)
1628                         break;
1629                       i++;
1630                     }
1631
1632                   if (i & 1 && src < src_end)
1633                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1634                   else
1635                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1636                   if (c >= 0)
1637                     /* This means that we have read one extra byte.  */
1638                     goto retry;
1639                 }
1640             }
1641           break;
1642         }
1643     }
1644  label_end_of_loop:
1645   return (mask & mask_found);
1646 }
1647
/* Produce the character code for the character of charset CHARSET
   whose 1st position code is C1 and whose 2nd position code is C2.
   When the variable `translation_table' is non-nil, the result is
   the code obtained by translating the character through that table
   with translate_char; otherwise it is the plain MAKE_CHAR code.  */

#define DECODE_ISO_CHARACTER(charset, c1, c2)                   \
  (! NILP (translation_table)                                   \
   ? translate_char (translation_table, -1, charset, c1, c2)    \
   : MAKE_CHAR (charset, c1, c2))
1657
/* Set designation state into CODING: designate to graphic register
   REG the charset identified by DIMENSION, CHARS and FINAL_CHAR.

   The designation is accepted when the charset is known and either
   CODING requests it for REG or its representative character is
   accepted by CODING_SAFE_CHAR_P.  Otherwise REG is recorded in
   coding->spec.iso2022.last_invalid_designation_register and control
   jumps to label_invalid_code.  Control also jumps there when
   FINAL_CHAR is outside '0'..127, and for the one special case
   commented in the body.

   The expansion site must provide the variables `coding' and
   `safe_chars' and the label `label_invalid_code'.  The macro
   declares its own locals named `charset' and `c', which shadow any
   outer variables of those names, so the arguments must not mention
   such variables.  The arguments may be evaluated more than once.  */
#define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
  do {                                                                     \
    int charset, c;                                                        \
                                                                           \
    if ((final_char) < '0' || (final_char) >= 128)                         \
      goto label_invalid_code;                                             \
    charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
                                 make_number (chars),                      \
                                 make_number (final_char));                \
    if (charset < 0)                                                       \
      {                                                                    \
        /* Unknown charset; don't build a character from it.  */           \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
    c = MAKE_CHAR (charset, 0, 0);                                         \
    if (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) != (reg)   \
        && ! CODING_SAFE_CHAR_P (safe_chars, c))                           \
      {                                                                    \
        coding->spec.iso2022.last_invalid_designation_register = (reg);    \
        goto label_invalid_code;                                           \
      }                                                                    \
    if (coding->spec.iso2022.last_invalid_designation_register == 0        \
        && (reg) == 0                                                      \
        && charset == CHARSET_ASCII)                                       \
      {                                                                    \
        /* We should insert this designation sequence as is so             \
           that it is surely written back to a file.  */                   \
        coding->spec.iso2022.last_invalid_designation_register = -1;       \
        goto label_invalid_code;                                           \
      }                                                                    \
    coding->spec.iso2022.last_invalid_designation_register = -1;           \
    /* In the direction mode selected by CODING_MODE_DIRECTION, use        \
       the reverse charset when one exists.  */                            \
    if ((coding->mode & CODING_MODE_DIRECTION)                             \
        && CHARSET_REVERSE_CHARSET (charset) >= 0)                         \
      charset = CHARSET_REVERSE_CHARSET (charset);                         \
    CODING_SPEC_ISO_DESIGNATION (coding, (reg)) = charset;                 \
  } while (0)
1694
1695 /* Allocate a memory block for storing information about compositions.
1696    The block is chained to the already allocated blocks.  */
1697
1698 void
1699 coding_allocate_composition_data (coding, char_offset)
1700      struct coding_system *coding;
1701      int char_offset;
1702 {
1703   struct composition_data *cmp_data
1704     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1705
1706   cmp_data->char_offset = char_offset;
1707   cmp_data->used = 0;
1708   cmp_data->prev = coding->cmp_data;
1709   cmp_data->next = NULL;
1710   if (coding->cmp_data)
1711     coding->cmp_data->next = cmp_data;
1712   coding->cmp_data = cmp_data;
1713   coding->cmp_data_start = 0;
1714   coding->composing = COMPOSITION_NO;
1715 }
1716
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

   C1 is the byte following ESC.  The expansion site must provide the
   variables `coding' and `dst' and the label `label_end_of_loop'.
   C1 may be evaluated more than once.

   NOTE(review): when composition is disabled, two bytes are stored
   through DST without any check against the end of the destination
   visible in this macro -- confirm that callers guarantee the room.
  */

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    if (coding->composing == COMPOSITION_DISABLED)                         \
      {                                                                    \
        /* Composition is disabled: copy the escape sequence to the        \
           destination as is.  */                                          \
        *dst++ = ISO_CODE_ESC;                                             \
        *dst++ = (c1) & 0x7f;                                              \
        coding->produced_char += 2;                                        \
      }                                                                    \
    else if (!COMPOSING_P (coding))                                        \
      {                                                                    \
        /* This is surely the start of a composition.  We must be sure     \
           that coding->cmp_data has enough space to store the             \
           information about the composition.  If not, terminate the       \
           current decoding loop, allocate one more memory block for       \
           coding->cmp_data in the caller, then start the decoding         \
           loop again.  We can't allocate memory here directly because     \
           it may cause buffer/string relocation.  */                      \
        if (!coding->cmp_data                                              \
            || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
                >= COMPOSITION_DATA_SIZE))                                 \
          {                                                                \
            coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
            goto label_end_of_loop;                                        \
          }                                                                \
        coding->composing = ((c1) == '0' ? COMPOSITION_RELATIVE            \
                             : (c1) == '2' ? COMPOSITION_WITH_RULE         \
                             : (c1) == '3' ? COMPOSITION_WITH_ALTCHARS     \
                             : COMPOSITION_WITH_RULE_ALTCHARS);            \
        CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
                                      coding->composing);                  \
        coding->composition_rule_follows = 0;                              \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* We are already handling a composition.  If the method is        \
           one of the following two, the codes following the current       \
           escape sequence are actual characters stored in a buffer.  */   \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
            || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
          {                                                                \
            coding->composing = COMPOSITION_RELATIVE;                      \
            coding->composition_rule_follows = 0;                          \
          }                                                                \
      }                                                                    \
  } while (0)
1769
/* Handle composition end sequence ESC 1.  C1 is the byte following
   ESC.  While a composition is in progress, record its end at the
   current produced-character count and leave composing state.
   Otherwise there is nothing to end, so the two bytes ESC C1 are
   copied to the destination as they are.  The expansion site must
   provide the variables `coding' and `dst'.  */

#define DECODE_COMPOSITION_END(c1)                                      \
  do {                                                                  \
    if (COMPOSING_P (coding))                                           \
      {                                                                 \
        CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
        coding->composing = COMPOSITION_NO;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = c1;                                                    \
        dst += 2;                                                       \
        coding->produced_char += 2;                                     \
      }                                                                 \
  } while (0)
1786
/* Decode a composition rule from the byte C1 (and maybe one more byte
   fetched from SRC into `c2') and store the encoded rule as a
   composition component of CODING.

   C1 must be a modifiable lvalue; 32 is subtracted from it in place.
   A resulting value below 81 is an old-format (before ver.21) rule
   packing the two reference points as VALUE / 9 and VALUE % 9, where
   4 stands for 10.  A value in 81..92 is a new-format rule whose
   second reference point comes from the next byte.  Any other value
   stores rule 0.  Afterwards coding->composition_rule_follows is
   cleared.  The expansion site must provide the variables `coding'
   and `c2' and whatever ONE_MORE_BYTE needs.  */

#define DECODE_COMPOSITION_RULE(c1)                                     \
  do {                                                                  \
    int rule = 0;                                                       \
    (c1) -= 32;                                                         \
    if ((c1) >= 81)                                                     \
      {                                                                 \
        if ((c1) < 93)          /* new format (after ver.21) */         \
          {                                                             \
            ONE_MORE_BYTE (c2);                                         \
            rule = COMPOSITION_ENCODE_RULE ((c1) - 81, c2 - 32);        \
          }                                                             \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int quot = (c1) / 9;                                            \
        int rem = (c1) % 9;                                             \
        int gref = quot == 4 ? 10 : quot;                               \
        int nref = rem == 4 ? 10 : rem;                                 \
        rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
      }                                                                 \
    CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
    coding->composition_rule_follows = 0;                               \
  } while (0)
1811
1812
1813 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1814
1815 static void
1816 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1817      struct coding_system *coding;
1818      const unsigned char *source;
1819      unsigned char *destination;
1820      int src_bytes, dst_bytes;
1821 {
1822   const unsigned char *src = source;
1823   const unsigned char *src_end = source + src_bytes;
1824   unsigned char *dst = destination;
1825   unsigned char *dst_end = destination + dst_bytes;
1826   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1827   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1828   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1829   /* SRC_BASE remembers the start position in source in each loop.
1830      The loop will be exited when there's not enough source code
1831      (within macro ONE_MORE_BYTE), or when there's not enough
1832      destination area to produce a character (within macro
1833      EMIT_CHAR).  */
1834   const unsigned char *src_base;
1835   int c, charset;
1836   Lisp_Object translation_table;
1837   Lisp_Object safe_chars;
1838
1839   safe_chars = coding_safe_chars (coding->symbol);
1840
1841   if (NILP (Venable_character_translation))
1842     translation_table = Qnil;
1843   else
1844     {
1845       translation_table = coding->translation_table_for_decode;
1846       if (NILP (translation_table))
1847         translation_table = Vstandard_translation_table_for_decode;
1848     }
1849
1850   coding->result = CODING_FINISH_NORMAL;
1851
1852   while (1)
1853     {
1854       int c1, c2 = 0;
1855
1856       src_base = src;
1857       ONE_MORE_BYTE (c1);
1858
1859       /* We produce no character or one character.  */
1860       switch (iso_code_class [c1])
1861         {
1862         case ISO_0x20_or_0x7F:
1863           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1864             {
1865               DECODE_COMPOSITION_RULE (c1);
1866               continue;
1867             }
1868           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1869             {
1870               /* This is SPACE or DEL.  */
1871               charset = CHARSET_ASCII;
1872               break;
1873             }
1874           /* This is a graphic character, we fall down ...  */
1875
1876         case ISO_graphic_plane_0:
1877           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1878             {
1879               DECODE_COMPOSITION_RULE (c1);
1880               continue;
1881             }
1882           charset = charset0;
1883           break;
1884
1885         case ISO_0xA0_or_0xFF:
1886           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1887               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1888             goto label_invalid_code;
1889           /* This is a graphic character, we fall down ... */
1890
1891         case ISO_graphic_plane_1:
1892           if (charset1 < 0)
1893             goto label_invalid_code;
1894           charset = charset1;
1895           break;
1896
1897         case ISO_control_0:
1898           if (COMPOSING_P (coding))
1899             DECODE_COMPOSITION_END ('1');
1900
1901           /* All ISO2022 control characters in this class have the
1902              same representation in Emacs internal format.  */
1903           if (c1 == '\n'
1904               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1905               && (coding->eol_type == CODING_EOL_CR
1906                   || coding->eol_type == CODING_EOL_CRLF))
1907             {
1908               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1909               goto label_end_of_loop;
1910             }
1911           charset = CHARSET_ASCII;
1912           break;
1913
1914         case ISO_control_1:
1915           if (COMPOSING_P (coding))
1916             DECODE_COMPOSITION_END ('1');
1917           goto label_invalid_code;
1918
1919         case ISO_carriage_return:
1920           if (COMPOSING_P (coding))
1921             DECODE_COMPOSITION_END ('1');
1922
1923           if (coding->eol_type == CODING_EOL_CR)
1924             c1 = '\n';
1925           else if (coding->eol_type == CODING_EOL_CRLF)
1926             {
1927               ONE_MORE_BYTE (c1);
1928               if (c1 != ISO_CODE_LF)
1929                 {
1930                   src--;
1931                   c1 = '\r';
1932                 }
1933             }
1934           charset = CHARSET_ASCII;
1935           break;
1936
1937         case ISO_shift_out:
1938           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1939               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1940             goto label_invalid_code;
1941           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1942           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1943           continue;
1944
1945         case ISO_shift_in:
1946           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1947             goto label_invalid_code;
1948           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1949           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1950           continue;
1951
1952         case ISO_single_shift_2_7:
1953         case ISO_single_shift_2:
1954           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1955             goto label_invalid_code;
1956           /* SS2 is handled as an escape sequence of ESC 'N' */
1957           c1 = 'N';
1958           goto label_escape_sequence;
1959
1960         case ISO_single_shift_3:
1961           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1962             goto label_invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
1964           c1 = 'O';
1965           goto label_escape_sequence;
1966
1967         case ISO_control_sequence_introducer:
1968           /* CSI is handled as an escape sequence of ESC '[' ...  */
1969           c1 = '[';
1970           goto label_escape_sequence;
1971
1972         case ISO_escape:
1973           ONE_MORE_BYTE (c1);
1974         label_escape_sequence:
1975           /* Escape sequences handled by Emacs are invocation,
1976              designation, direction specification, and character
1977              composition specification.  */
1978           switch (c1)
1979             {
1980             case '&':           /* revision of following character set */
1981               ONE_MORE_BYTE (c1);
1982               if (!(c1 >= '@' && c1 <= '~'))
1983                 goto label_invalid_code;
1984               ONE_MORE_BYTE (c1);
1985               if (c1 != ISO_CODE_ESC)
1986                 goto label_invalid_code;
1987               ONE_MORE_BYTE (c1);
1988               goto label_escape_sequence;
1989
1990             case '$':           /* designation of 2-byte character set */
1991               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1992                 goto label_invalid_code;
1993               ONE_MORE_BYTE (c1);
1994               if (c1 >= '@' && c1 <= 'B')
1995                 {       /* designation of JISX0208.1978, GB2312.1980,
1996                            or JISX0208.1980 */
1997                   DECODE_DESIGNATION (0, 2, 94, c1);
1998                 }
1999               else if (c1 >= 0x28 && c1 <= 0x2B)
2000                 {       /* designation of DIMENSION2_CHARS94 character set */
2001                   ONE_MORE_BYTE (c2);
2002                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2003                 }
2004               else if (c1 >= 0x2C && c1 <= 0x2F)
2005                 {       /* designation of DIMENSION2_CHARS96 character set */
2006                   ONE_MORE_BYTE (c2);
2007                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2008                 }
2009               else
2010                 goto label_invalid_code;
2011               /* We must update these variables now.  */
2012               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2013               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2014               continue;
2015
2016             case 'n':           /* invocation of locking-shift-2 */
2017               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2018                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2019                 goto label_invalid_code;
2020               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2021               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2022               continue;
2023
2024             case 'o':           /* invocation of locking-shift-3 */
2025               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2026                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2027                 goto label_invalid_code;
2028               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2029               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2030               continue;
2031
2032             case 'N':           /* invocation of single-shift-2 */
2033               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2034                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2035                 goto label_invalid_code;
2036               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2037               ONE_MORE_BYTE (c1);
2038               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2039                 goto label_invalid_code;
2040               break;
2041
2042             case 'O':           /* invocation of single-shift-3 */
2043               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2044                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2045                 goto label_invalid_code;
2046               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2047               ONE_MORE_BYTE (c1);
2048               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2049                 goto label_invalid_code;
2050               break;
2051
2052             case '0': case '2': case '3': case '4': /* start composition */
2053               DECODE_COMPOSITION_START (c1);
2054               continue;
2055
2056             case '1':           /* end composition */
2057               DECODE_COMPOSITION_END (c1);
2058               continue;
2059
2060             case '[':           /* specification of direction */
2061               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2062                 goto label_invalid_code;
2063               /* For the moment, nested direction is not supported.
2064                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2065                  left-to-right, and nonzero means right-to-left.  */
2066               ONE_MORE_BYTE (c1);
2067               switch (c1)
2068                 {
2069                 case ']':       /* end of the current direction */
2070                   coding->mode &= ~CODING_MODE_DIRECTION;
2071
2072                 case '0':       /* end of the current direction */
2073                 case '1':       /* start of left-to-right direction */
2074                   ONE_MORE_BYTE (c1);
2075                   if (c1 == ']')
2076                     coding->mode &= ~CODING_MODE_DIRECTION;
2077                   else
2078                     goto label_invalid_code;
2079                   break;
2080
2081                 case '2':       /* start of right-to-left direction */
2082                   ONE_MORE_BYTE (c1);
2083                   if (c1 == ']')
2084                     coding->mode |= CODING_MODE_DIRECTION;
2085                   else
2086                     goto label_invalid_code;
2087                   break;
2088
2089                 default:
2090                   goto label_invalid_code;
2091                 }
2092               continue;
2093
2094             case '%':
2095               if (COMPOSING_P (coding))
2096                 DECODE_COMPOSITION_END ('1');
2097               ONE_MORE_BYTE (c1);
2098               if (c1 == '/')
2099                 {
2100                   /* CTEXT extended segment:
2101                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2102                      We keep these bytes as is for the moment.
2103                      They may be decoded by post-read-conversion.  */
2104                   int dim, M, L;
2105                   int size, required;
2106                   int produced_chars;
2107
2108                   ONE_MORE_BYTE (dim);
2109                   ONE_MORE_BYTE (M);
2110                   ONE_MORE_BYTE (L);
2111                   size = ((M - 128) * 128) + (L - 128);
2112                   required = 8 + size * 2;
2113                   if (dst + required > (dst_bytes ? dst_end : src))
2114                     goto label_end_of_loop;
2115                   *dst++ = ISO_CODE_ESC;
2116                   *dst++ = '%';
2117                   *dst++ = '/';
2118                   *dst++ = dim;
2119                   produced_chars = 4;
2120                   dst += CHAR_STRING (M, dst), produced_chars++;
2121                   dst += CHAR_STRING (L, dst), produced_chars++;
2122                   while (size-- > 0)
2123                     {
2124                       ONE_MORE_BYTE (c1);
2125                       dst += CHAR_STRING (c1, dst), produced_chars++;
2126                     }
2127                   coding->produced_char += produced_chars;
2128                 }
2129               else if (c1 == 'G')
2130                 {
2131                   unsigned char *d = dst;
2132                   int produced_chars;
2133
2134                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2135                      ESC % G --UTF-8-BYTES-- ESC % @
2136                      We keep these bytes as is for the moment.
2137                      They may be decoded by post-read-conversion.  */
2138                   if (d + 6 > (dst_bytes ? dst_end : src))
2139                     goto label_end_of_loop;
2140                   *d++ = ISO_CODE_ESC;
2141                   *d++ = '%';
2142                   *d++ = 'G';
2143                   produced_chars = 3;
2144                   while (d + 1 < (dst_bytes ? dst_end : src))
2145                     {
2146                       ONE_MORE_BYTE (c1);
2147                       if (c1 == ISO_CODE_ESC
2148                           && src + 1 < src_end
2149                           && src[0] == '%'
2150                           && src[1] == '@')
2151                         {
2152                           src += 2;
2153                           break;
2154                         }
2155                       d += CHAR_STRING (c1, d), produced_chars++;
2156                     }
2157                   if (d + 3 > (dst_bytes ? dst_end : src))
2158                     goto label_end_of_loop;
2159                   *d++ = ISO_CODE_ESC;
2160                   *d++ = '%';
2161                   *d++ = '@';
2162                   dst = d;
2163                   coding->produced_char += produced_chars + 3;
2164                 }
2165               else
2166                 goto label_invalid_code;
2167               continue;
2168
2169             default:
2170               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2171                 goto label_invalid_code;
2172               if (c1 >= 0x28 && c1 <= 0x2B)
2173                 {       /* designation of DIMENSION1_CHARS94 character set */
2174                   ONE_MORE_BYTE (c2);
2175                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2176                 }
2177               else if (c1 >= 0x2C && c1 <= 0x2F)
2178                 {       /* designation of DIMENSION1_CHARS96 character set */
2179                   ONE_MORE_BYTE (c2);
2180                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2181                 }
2182               else
2183                 goto label_invalid_code;
2184               /* We must update these variables now.  */
2185               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2186               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2187               continue;
2188             }
2189         }
2190
2191       /* Now we know CHARSET and 1st position code C1 of a character.
2192          Produce a multibyte sequence for that character while getting
2193          2nd position code C2 if necessary.  */
2194       if (CHARSET_DIMENSION (charset) == 2)
2195         {
2196           ONE_MORE_BYTE (c2);
2197           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2198             /* C2 is not in a valid range.  */
2199             goto label_invalid_code;
2200         }
2201       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2202       EMIT_CHAR (c);
2203       continue;
2204
2205     label_invalid_code:
2206       coding->errors++;
2207       if (COMPOSING_P (coding))
2208         DECODE_COMPOSITION_END ('1');
2209       src = src_base;
2210       c = *src++;
2211       EMIT_CHAR (c);
2212     }
2213
2214  label_end_of_loop:
2215   coding->consumed = coding->consumed_char = src_base - source;
2216   coding->produced = dst - destination;
2217   return;
2218 }
2219
2220
2221 /* ISO2022 encoding stuff.  */
2222
2223 /*
2224    For encoding, it is not enough to say just "ISO2022"; we have to
2225    specify more details.  In Emacs, each ISO2022 coding system
2226    variant has the following specifications:
2227         1. Initial designation to G0 through G3.
2228         2. Allows short-form designation?
2229         3. ASCII should be designated to G0 before control characters?
2230         4. ASCII should be designated to G0 at end of line?
2231         5. 7-bit environment or 8-bit environment?
2232         6. Use locking-shift?
2233         7. Use Single-shift?
2234    And the following two are only for Japanese:
2235         8. Use ASCII in place of JIS0201-1976-Roman?
2236         9. Use JISX0208-1983 in place of JISX0208-1978?
2237    These specifications are encoded in `coding->flags' as flag bits
2238    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2239    details.
2240 */
2241
2242 /* Produce the escape sequence that designates CHARSET to graphic
2243    register REG at DST (a variable of the expansion site) and advance
2244    DST.  The short form, which omits the intermediate character, is used
2245    only when CODING allows it, for a two-dimensional 94-character set
2246    designated to register 0 whose <final-char> is '@', 'A', or 'B'.  */
2247 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
2248   do {                                                                  \
2249     unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
2250     const char *intermediate_char_94 = "()*+";                          \
2251     const char *intermediate_char_96 = ",-./";                          \
2252     int revision = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);   \
2253     /* When REVISION < 255, first emit ESC & ('@' + REVISION).  */      \
2254     if (revision < 255)                                                 \
2255       {                                                                 \
2256         *dst++ = ISO_CODE_ESC;                                          \
2257         *dst++ = '&';                                                   \
2258         *dst++ = '@' + revision;                                        \
2259       }                                                                 \
2260     *dst++ = ISO_CODE_ESC;                                              \
2261     if (CHARSET_DIMENSION (charset) == 1)                               \
2262       {                                                                 \
2263         if (CHARSET_CHARS (charset) == 94)                              \
2264           *dst++ = (unsigned char) (intermediate_char_94[(reg)]);       \
2265         else                                                            \
2266           *dst++ = (unsigned char) (intermediate_char_96[(reg)]);       \
2267       }                                                                 \
2268     else                                                                \
2269       {                                                                 \
2270         *dst++ = '$';                                                   \
2271         if (CHARSET_CHARS (charset) == 94)                              \
2272           {                                                             \
2273             if (! ((coding)->flags & CODING_FLAG_ISO_SHORT_FORM)        \
2274                 || (reg) != 0                                           \
2275                 || final_char < '@' || final_char > 'B')                \
2276               *dst++ = (unsigned char) (intermediate_char_94[(reg)]);   \
2277           }                                                             \
2278         else                                                            \
2279           *dst++ = (unsigned char) (intermediate_char_96[(reg)]);       \
2280       }                                                                 \
2281     *dst++ = final_char;                                                \
2282     CODING_SPEC_ISO_DESIGNATION (coding, reg) = (charset);              \
2283   } while (0)
2284
2285 /* The two macros below emit at DST the ISO2022 single-shift functions
2286    single-shift-2 and single-shift-3: ESC N or ESC O when CODING has
2287    CODING_FLAG_ISO_SEVEN_BITS set, else the single control code
2288    ISO_CODE_SS2 or ISO_CODE_SS3.  Both then mark CODING as single-shifting.  */
2289 #define ENCODE_SINGLE_SHIFT_2                           \
2290   do {                                                  \
2291     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
2292       dst[0] = ISO_CODE_SS2, dst += 1;                  \
2293     else                                                \
2294       dst[0] = ISO_CODE_ESC, dst[1] = 'N', dst += 2;    \
2295     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2296   } while (0)
2297
2298 #define ENCODE_SINGLE_SHIFT_3                           \
2299   do {                                                  \
2300     if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)) \
2301       dst[0] = ISO_CODE_SS3, dst += 1;                  \
2302     else                                                \
2303       dst[0] = ISO_CODE_ESC, dst[1] = 'O', dst += 2;    \
2304     CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;       \
2305   } while (0)
2306
2307 /* The four macros below emit at DST the ISO2022 locking-shift
2308    functions shift-in, shift-out, locking-shift-2 and locking-shift-3,
2309    and record through CODING_SPEC_ISO_INVOCATION (coding, 0) that
2310    graphic register 0, 1, 2 or 3 respectively is the one now invoked.  */
2311 #define ENCODE_SHIFT_IN                         \
2312   do {                                          \
2313     dst[0] = ISO_CODE_SI, dst += 1;             \
2314     CODING_SPEC_ISO_INVOCATION (coding, 0) = 0; \
2315   } while (0)
2316
2317 #define ENCODE_SHIFT_OUT                        \
2318   do {                                          \
2319     dst[0] = ISO_CODE_SO, dst += 1;             \
2320     CODING_SPEC_ISO_INVOCATION (coding, 0) = 1; \
2321   } while (0)
2322
2323 #define ENCODE_LOCKING_SHIFT_2                          \
2324   do {                                                  \
2325     dst[0] = ISO_CODE_ESC, dst[1] = 'n', dst += 2;      \
2326     CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
2327   } while (0)
2328
2329 #define ENCODE_LOCKING_SHIFT_3                          \
2330   do {                                                  \
2331     dst[0] = ISO_CODE_ESC, dst[1] = 'o', dst += 2;      \
2332     CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
2333   } while (0)
2334
2335 /* Produce codes for a DIMENSION1 character whose character set is
2336    CHARSET and whose position-code is C1.  Designation and invocation
2337    sequences are also produced in advance if necessary.  The expansion
2338    uses the variables `coding' and `dst' of the enclosing scope.  */
2339 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
2340   do {                                                                  \
2341     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2342       {                                                                 \
2343         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2344           *dst++ = (c1) & 0x7F;                                         \
2345         else                                                            \
2346           *dst++ = (c1) | 0x80;                                         \
2347         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2348         break;                                                          \
2349       }                                                                 \
2350     if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
2351       {                                                                 \
2352         *dst++ = (c1) & 0x7F;                                           \
2353         break;                                                          \
2354       }                                                                 \
2355     if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
2356       {                                                                 \
2357         *dst++ = (c1) | 0x80;                                           \
2358         break;                                                          \
2359       }                                                                 \
2360     /* Since CHARSET is not yet invoked to any graphic planes, we       \
2361        must invoke it, or, at first, designate it to some graphic       \
2362        register.  Then repeat the loop to actually produce the          \
2363        character.  Every branch above leaves the loop with `break',     \
2364        so only this path comes back around.  */                         \
2365     dst = encode_invocation_designation (charset, coding, dst);         \
2366   } while (1)
2367
2368 /* Produce codes for a DIMENSION2 character whose character set is
2369    CHARSET and whose position-codes are C1 and C2.  Designation and
2370    invocation codes are also produced in advance if necessary.  The
2371    expansion uses the variables `coding' and `dst' of the enclosing scope.  */
2372 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
2373   do {                                                                  \
2374     if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
2375       {                                                                 \
2376         if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
2377           *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                   \
2378         else                                                            \
2379           *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                   \
2380         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
2381         break;                                                          \
2382       }                                                                 \
2383     if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))         \
2384       {                                                                 \
2385         *dst++ = (c1) & 0x7F, *dst++ = (c2) & 0x7F;                     \
2386         break;                                                          \
2387       }                                                                 \
2388     if ((charset) == CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))         \
2389       {                                                                 \
2390         *dst++ = (c1) | 0x80, *dst++ = (c2) | 0x80;                     \
2391         break;                                                          \
2392       }                                                                 \
2393     /* Since CHARSET is not yet invoked to any graphic planes, we       \
2394        must invoke it, or, at first, designate it to some graphic       \
2395        register.  Then repeat the loop to actually produce the          \
2396        character.  Every branch above leaves the loop with `break',     \
2397        so only this path comes back around.  */                         \
2398     dst = encode_invocation_designation (charset, coding, dst);         \
2399   } while (1)
2400
2401 /* Produce codes for character C at DST.  When the charset of C is not
2402    defined, its codes C1 (and C2 if non-negative) are written as is.  */
2403 #define ENCODE_ISO_CHARACTER(c)                                 \
2404   do {                                                          \
2405     int charset, c1, c2;                                        \
2406                                                                 \
2407     SPLIT_CHAR (c, charset, c1, c2);                            \
2408     if (! CHARSET_DEFINED_P (charset))                          \
2409       {                                                         \
2410         *dst++ = c1;                                            \
2411         if (c2 >= 0)                                            \
2412           *dst++ = c2;                                          \
2413       }                                                         \
2414     else if (CHARSET_DIMENSION (charset) != 1)                  \
2415       {                                                         \
2416         if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)        \
2417             && charset == charset_jisx0208)                     \
2418           charset = charset_jisx0208_1978;                      \
2419         ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);      \
2420       }                                                         \
2421     else                                                        \
2422       {                                                         \
2423         /* One-dimensional charset.  */                         \
2424         if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)         \
2425             && charset == CHARSET_ASCII)                        \
2426           charset = charset_latin_jisx0201;                     \
2427         ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);          \
2428       }                                                         \
2429   } while (0)
2430
2431
2432 /* Instead of encoding character C, produce CODING_REPLACEMENT_CHARACTER
2433    once, or twice when the charset of C has a width greater than 1.  */
2434 #define ENCODE_UNSAFE_CHARACTER(c)                              \
2435   do {                                                          \
2436     if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1)                   \
2437       ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);      \
2438     ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
2439   } while (0)
2440
2441
2442 /* Emit at DST whatever designation and invocation codes are needed
2443    before a character of CHARSET can be produced, and update the
2444    element `spec.iso2022' of *CODING to match.  Return the new DST.  */
2445
2446 unsigned char *
2447 encode_invocation_designation (charset, coding, dst)
2448      int charset;
2449      struct coding_system *coding;
2450      unsigned char *dst;
2451 {
2452   int reg = 0;                  /* graphic register number */
2453
2454   /* First look for a register CHARSET is already designated to;
2455      REG ends up as 4 when there is none.  */
2456   while (reg < 4 && charset != CODING_SPEC_ISO_DESIGNATION (coding, reg))
2457     reg++;
2458
2459   if (reg == 4)
2460     {
2461       /* CHARSET is not yet designated to any graphic register.
2462          Designate it to the register it requests, or to register 0
2463          when it requests none.  */
2464       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2465       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2466         reg = 0;
2467
2468       ENCODE_DESIGNATION (charset, reg, coding);
2469     }
2470
2471   /* Nothing more to do when REG is already invoked to a graphic plane.  */
2472   if (CODING_SPEC_ISO_INVOCATION (coding, 0) == reg
2473       || CODING_SPEC_ISO_INVOCATION (coding, 1) == reg)
2474     return dst;
2475
2476   /* Otherwise invoke REG.  Registers 2 and 3 use a single shift when
2477      CODING allows one, and a locking shift when it does not.  */
2478   switch (reg)
2479     {
2480     case 0:
2481       ENCODE_SHIFT_IN;
2482       break;
2483
2484     case 1:
2485       ENCODE_SHIFT_OUT;
2486       break;
2487
2488     case 2:
2489       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2490         ENCODE_SINGLE_SHIFT_2;
2491       else
2492         ENCODE_LOCKING_SHIFT_2;
2493       break;
2494
2495     case 3:
2496       if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2497         ENCODE_SINGLE_SHIFT_3;
2498       else
2499         ENCODE_LOCKING_SHIFT_3;
2500       break;
2501     }
2502
2503   return dst;
2504 }
2505
2506 /* Split composition rule RULE into GREF and NREF and produce one byte
2507    for each at DST: 32 + 81 + GREF, then 32 + NREF.  */
2508 #define ENCODE_COMPOSITION_RULE(rule)                   \
2509   do {                                                  \
2510     int gref, nref;                                     \
2511     COMPOSITION_DECODE_RULE (rule, gref, nref);         \
2512     dst[0] = 32 + 81 + gref, dst[1] = 32 + nref;        \
2513     dst += 2;                                           \
2514   } while (0)
2515
2516 /* Produce codes for indicating the start of a composition sequence
2517    (ESC 0, ESC 3, or ESC 4).  DATA points to an array of integers
2518    which specify information about the composition; DATA[3] becomes
2519    the composing state of CODING.  See the comment in coding.h for
2520    the format of DATA.  */
2521 #define ENCODE_COMPOSITION_START(coding, data)                          \
2522   do {                                                                  \
2523     (coding)->composing = (data)[3];                                    \
2524     *dst++ = ISO_CODE_ESC;                                              \
2525     if ((coding)->composing == COMPOSITION_RELATIVE)                    \
2526       *dst++ = '0';                                                     \
2527     else                                                                \
2528       {                                                                 \
2529         *dst++ = ((coding)->composing == COMPOSITION_WITH_ALTCHARS      \
2530                   ? '3' : '4');                                         \
2531         (coding)->cmp_data_index = (coding)->cmp_data_start + 4;        \
2532         (coding)->composition_rule_follows = 0;                         \
2533       }                                                                 \
2534   } while (0)
2535
2536 /* Produce codes (ESC 1) for indicating the end of the current
2537    composition, and advance CODING's cmp_data_start by DATA[0].  */
2538 #define ENCODE_COMPOSITION_END(coding, data)                    \
2539   do {                                                          \
2540     *dst++ = ISO_CODE_ESC;                                      \
2541     *dst++ = '1';                                               \
2542     (coding)->cmp_data_start += (data)[0];                      \
2543     (coding)->composing = COMPOSITION_NO;                       \
2544     if ((coding)->cmp_data_start == (coding)->cmp_data->used    \
2545         && (coding)->cmp_data->next)                            \
2546       {                                                         \
2547         (coding)->cmp_data = (coding)->cmp_data->next;          \
2548         (coding)->cmp_data_start = 0;                           \
2549       }                                                         \
2550   } while (0)
2551
2552 /* Produce composition start sequence ESC 0 at DST and set the
2553    composing state of CODING to COMPOSITION_RELATIVE.  Here, this
2554    sequence doesn't mean the start of a new composition but means that
2555    we have just produced components (alternate chars and composition
2556    rules) of the composition and the actual text follows in SRC.  */
2557 #define ENCODE_COMPOSITION_FAKE_START(coding)   \
2558   do {                                          \
2559     *dst++ = ISO_CODE_ESC;                      \
2560     *dst++ = '0';                               \
2561     (coding)->composing = COMPOSITION_RELATIVE; \
2562   } while (0)
2563
2564 /* The following three macros produce codes for indicating direction
2565    of text.  Each expands to a single statement that writes at DST.  */
2566 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
2567   do {                                                  \
2568     if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
2569       *dst++ = ISO_CODE_ESC, *dst++ = '[';              \
2570     else                                                \
2571       *dst++ = ISO_CODE_CSI;                            \
2572   } while (0)
2573
2574 #define ENCODE_DIRECTION_R2L    \
2575   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '2'; *dst++ = ']'; } while (0)
2576
2577 #define ENCODE_DIRECTION_L2R    \
2578   do { ENCODE_CONTROL_SEQUENCE_INTRODUCER; *dst++ = '0'; *dst++ = ']'; } while (0)
2579
2580 /* Emit at DST the shift-in and the designations that bring the graphic
2581    planes and registers back to the initial state recorded in CODING.  */
2582 #define ENCODE_RESET_PLANE_AND_REGISTER                                     \
2583   do {                                                                      \
2584     int reg = 0;                                                            \
2585     if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
2586       ENCODE_SHIFT_IN;                                                      \
2587     for (; reg < 4; reg++)                                                  \
2588       if ((CODING_SPEC_ISO_DESIGNATION (coding, reg)                        \
2589            != CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))            \
2590           && CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) >= 0)        \
2591         ENCODE_DESIGNATION                                                  \
2592           (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
2593   } while (0)
2594
2595 /* Scan the characters of the line that starts at SRC, stopping at a
2596    newline, and produce at DST designation sequences for the charsets
2597    on it that request a graphic register; the first charset seen for a
2598    register wins.  Return the updated DST.  If the current block ends
2599    before any end-of-line, some necessary designations may be missed.  */
2600
2601 static unsigned char *
2602 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2603      struct coding_system *coding;
2604      Lisp_Object translation_table; /* NOTE(review): not referenced directly below; presumably read by ONE_MORE_CHAR -- confirm.  */
2605      const unsigned char *src, *src_end; /* SRC is a by-value copy, so the caller's own position is not moved by this look-ahead.  */
2606      unsigned char *dst;
2607 {
2608   int charset, c, found = 0, reg; /* FOUND counts the registers claimed so far.  */
2609   /* R[reg] is the charset to designate to graphic register REG, or -1.  */
2610   int r[4];
2611   /* At first no register is claimed.  */
2612   for (reg = 0; reg < 4; reg++)
2613     r[reg] = -1;
2614   /* Scanning can stop early once all four registers are claimed.  */
2615   while (found < 4)
2616     {
2617       ONE_MORE_CHAR (c); /* NOTE(review): presumably fetches the next character from SRC into C -- confirm.  */
2618       if (c == '\n')
2619         break;
2620       /* Claim the register requested by C's charset unless it is taken.  */
2621       charset = CHAR_CHARSET (c);
2622       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2623       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2624         {
2625           found++;
2626           r[reg] = charset;
2627         }
2628     }
2629   /* NOTE(review): no visible goto targets this label; presumably ONE_MORE_CHAR jumps here -- confirm.  */
2630  label_end_of_loop:
2631   if (found)
2632     {
2633       for (reg = 0; reg < 4; reg++)
2634         if (r[reg] >= 0
2635             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg]) /* skip a register that already holds the charset */
2636           ENCODE_DESIGNATION (r[reg], reg, coding);
2637     }
2638
2639   return dst;
2640 }
2641
2642 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2643
2644 static void
2645 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2646      struct coding_system *coding;
2647      const unsigned char *source;
2648      unsigned char *destination;
2649      int src_bytes, dst_bytes;
2650 {
2651   const unsigned char *src = source;
2652   const unsigned char *src_end = source + src_bytes;
2653   unsigned char *dst = destination;
2654   unsigned char *dst_end = destination + dst_bytes;
2655   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2656      from DST_END to assure overflow checking is necessary only at the
2657      head of loop.  */
2658   unsigned char *adjusted_dst_end = dst_end - 19;
2659   /* SRC_BASE remembers the start position in source in each loop.
2660      The loop will be exited when there's not enough source text to
2661      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2662      there's not enough destination area to produce encoded codes
2663      (within macro EMIT_BYTES).  */
2664   const unsigned char *src_base;
2665   int c;
2666   Lisp_Object translation_table;
2667   Lisp_Object safe_chars;
2668
2669   if (coding->flags & CODING_FLAG_ISO_SAFE)
2670     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2671
2672   safe_chars = coding_safe_chars (coding->symbol);
2673
2674   if (NILP (Venable_character_translation))
2675     translation_table = Qnil;
2676   else
2677     {
2678       translation_table = coding->translation_table_for_encode;
2679       if (NILP (translation_table))
2680         translation_table = Vstandard_translation_table_for_encode;
2681     }
2682
2683   coding->consumed_char = 0;
2684   coding->errors = 0;
2685   while (1)
2686     {
2687       src_base = src;
2688
2689       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2690         {
2691           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2692           break;
2693         }
2694
2695       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2696           && CODING_SPEC_ISO_BOL (coding))
2697         {
2698           /* We have to produce designation sequences if any now.  */
2699           dst = encode_designation_at_bol (coding, translation_table,
2700                                            src, src_end, dst);
2701           CODING_SPEC_ISO_BOL (coding) = 0;
2702         }
2703
2704       /* Check composition start and end.  */
2705       if (coding->composing != COMPOSITION_DISABLED
2706           && coding->cmp_data_start < coding->cmp_data->used)
2707         {
2708           struct composition_data *cmp_data = coding->cmp_data;
2709           int *data = cmp_data->data + coding->cmp_data_start;
2710           int this_pos = cmp_data->char_offset + coding->consumed_char;
2711
2712           if (coding->composing == COMPOSITION_RELATIVE)
2713             {
2714               if (this_pos == data[2])
2715                 {
2716                   ENCODE_COMPOSITION_END (coding, data);
2717                   cmp_data = coding->cmp_data;
2718                   data = cmp_data->data + coding->cmp_data_start;
2719                 }
2720             }
2721           else if (COMPOSING_P (coding))
2722             {
2723               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2724               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2725                 /* We have consumed components of the composition.
2726                    What follows in SRC is the composition's base
2727                    text.  */
2728                 ENCODE_COMPOSITION_FAKE_START (coding);
2729               else
2730                 {
2731                   int c = cmp_data->data[coding->cmp_data_index++];
2732                   if (coding->composition_rule_follows)
2733                     {
2734                       ENCODE_COMPOSITION_RULE (c);
2735                       coding->composition_rule_follows = 0;
2736                     }
2737                   else
2738                     {
2739                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2740                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2741                         ENCODE_UNSAFE_CHARACTER (c);
2742                       else
2743                         ENCODE_ISO_CHARACTER (c);
2744                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2745                         coding->composition_rule_follows = 1;
2746                     }
2747                   continue;
2748                 }
2749             }
2750           if (!COMPOSING_P (coding))
2751             {
2752               if (this_pos == data[1])
2753                 {
2754                   ENCODE_COMPOSITION_START (coding, data);
2755                   continue;
2756                 }
2757             }
2758         }
2759
2760       ONE_MORE_CHAR (c);
2761
2762       /* Now encode the character C.  */
2763       if (c < 0x20 || c == 0x7F)
2764         {
2765           if (c == '\r')
2766             {
2767               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2768                 {
2769                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2770                     ENCODE_RESET_PLANE_AND_REGISTER;
2771                   *dst++ = c;
2772                   continue;
2773                 }
2774               /* fall down to treat '\r' as '\n' ...  */
2775               c = '\n';
2776             }
2777           if (c == '\n')
2778             {
2779               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2780                 ENCODE_RESET_PLANE_AND_REGISTER;
2781               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2782                 bcopy (coding->spec.iso2022.initial_designation,
2783                        coding->spec.iso2022.current_designation,
2784                        sizeof coding->spec.iso2022.initial_designation);
2785               if (coding->eol_type == CODING_EOL_LF
2786                   || coding->eol_type == CODING_EOL_UNDECIDED)
2787                 *dst++ = ISO_CODE_LF;
2788               else if (coding->eol_type == CODING_EOL_CRLF)
2789                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2790               else
2791                 *dst++ = ISO_CODE_CR;
2792               CODING_SPEC_ISO_BOL (coding) = 1;
2793             }
2794           else
2795             {
2796               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2797                 ENCODE_RESET_PLANE_AND_REGISTER;
2798               *dst++ = c;
2799             }
2800         }
2801       else if (ASCII_BYTE_P (c))
2802         ENCODE_ISO_CHARACTER (c);
2803       else if (SINGLE_BYTE_CHAR_P (c))
2804         {
2805           *dst++ = c;
2806           coding->errors++;
2807         }
2808       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2809                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2810         ENCODE_UNSAFE_CHARACTER (c);
2811       else
2812         ENCODE_ISO_CHARACTER (c);
2813
2814       coding->consumed_char++;
2815     }
2816
2817  label_end_of_loop:
2818   coding->consumed = src_base - source;
2819   coding->produced = coding->produced_char = dst - destination;
2820 }
2821
2822 \f
2823 /*** 4. SJIS and BIG5 handlers ***/
2824
2825 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2826    quite widely.  So, for the moment, Emacs supports them in the bare
2827    C code.  But, in the future, they may be supported only by CCL.  */
2828
2829 /* SJIS is a coding system encoding three character sets: ASCII, right
2830    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2831    as is.  A character of charset katakana-jisx0201 is encoded by
2832    "position-code + 0x80".  A character of charset japanese-jisx0208
2833    is encoded in 2-byte but two position-codes are divided and shifted
2834    so that it fits in the range below.
2835
2836    --- CODE RANGE of SJIS ---
2837    (character set)      (range)
2838    ASCII                0x00 .. 0x7F
2839    KATAKANA-JISX0201    0xA1 .. 0xDF
2840    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2841             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2842    -------------------------------
2843
2844 */
2845
2846 /* BIG5 is a coding system encoding two character sets: ASCII and
2847    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2848    character set and is encoded in two bytes.
2849
2850    --- CODE RANGE of BIG5 ---
2851    (character set)      (range)
2852    ASCII                0x00 .. 0x7F
2853    Big5 (1st byte)      0xA1 .. 0xFE
2854         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2855    --------------------------
2856
2857    Since the number of characters in Big5 is larger than maximum
2858    characters in Emacs' charset (96x96), it can't be handled as one
2859    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2860    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2861    contains frequently used characters and the latter contains less
2862    frequently used characters.  */
2863
2864 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2865    are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
2866    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2867    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  */
2868
/* Number of Big5 characters which have the same code in 1st byte:
   the 2nd byte takes 0xA1..0xFE (0xFF - 0xA1 values) or 0x40..0x7E
   (0x7F - 0x40 values).  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

/* Decode the BIG5 byte pair B1, B2.  Store charset_big5_1 (when B1 is
   below 0xC9) or charset_big5_2 in CHARSET, and the two
   position-codes in C1 and C2.

   The linear index TEMP is computed from B1 and B2, and B1 is tested,
   before C1 and C2 are stored, so a caller may pass the same
   variables for (B1, B2) and (C1, C2).  Every argument is
   parenthesized so that an expression argument is not re-associated
   by operator precedence.  B1 and B2 are still evaluated more than
   once; do not pass expressions with side effects.  */
#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((b1) - 0xA1) * BIG5_SAME_ROW                                   \
        + (b2) - ((b2) < 0x7F ? 0x40 : 0x62);                           \
    if ((b1) < 0xC9)                                                    \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

/* Inverse of DECODE_BIG5: from CHARSET and the position-codes C1 and
   C2, store the two BIG5 bytes in B1 and B2.  TEMP is computed before
   B1 and B2 are stored, so (C1, C2) and (B1, B2) may be the same
   variables.  B2 is evaluated more than once.  */
#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2896
2897 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2898    Check if a text is encoded in SJIS.  If it is, return
2899    CODING_CATEGORY_MASK_SJIS, else return 0.  */
2900
2901 static int
2902 detect_coding_sjis (src, src_end, multibytep)
2903      unsigned char *src, *src_end;
2904      int multibytep;
2905 {
2906   int c;
2907   /* Dummy for ONE_MORE_BYTE.  */
2908   struct coding_system dummy_coding;
2909   struct coding_system *coding = &dummy_coding;
2910
2911   while (1)
2912     {
2913       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2914       if (c < 0x80)
2915         continue;
2916       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2917         return 0;
2918       if (c <= 0x9F || c >= 0xE0)
2919         {
2920           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2921           if (c < 0x40 || c == 0x7F || c > 0xFC)
2922             return 0;
2923         }
2924     }
2925  label_end_of_loop:
2926   return CODING_CATEGORY_MASK_SJIS;
2927 }
2928
2929 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2930    Check if a text is encoded in BIG5.  If it is, return
2931    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2932
2933 static int
2934 detect_coding_big5 (src, src_end, multibytep)
2935      unsigned char *src, *src_end;
2936      int multibytep;
2937 {
2938   int c;
2939   /* Dummy for ONE_MORE_BYTE.  */
2940   struct coding_system dummy_coding;
2941   struct coding_system *coding = &dummy_coding;
2942
2943   while (1)
2944     {
2945       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2946       if (c < 0x80)
2947         continue;
2948       if (c < 0xA1 || c > 0xFE)
2949         return 0;
2950       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2951       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2952         return 0;
2953     }
2954  label_end_of_loop:
2955   return CODING_CATEGORY_MASK_BIG5;
2956 }
2957
2958 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2959    Check if a text is encoded in UTF-8.  If it is, return
2960    CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2961
/* Nonzero if the bits of C selected by MASK are equal to BITS.  */
#define UTF_8_OCTET_MATCH_P(c, mask, bits) (((c) & (mask)) == (bits))

/* Classification of one octet of a UTF-8 sequence.  A leading octet
   of an N-octet sequence starts with N one-bits followed by a
   zero-bit; a trailing (extra) octet starts with the bits 10.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     UTF_8_OCTET_MATCH_P (c, 0xC0, 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xE0, 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF0, 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xF8, 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFC, 0xF8)
#define UTF_8_6_OCTET_LEADING_P(c) UTF_8_OCTET_MATCH_P (c, 0xFE, 0xFC)
2969
2970 static int
2971 detect_coding_utf_8 (src, src_end, multibytep)
2972      unsigned char *src, *src_end;
2973      int multibytep;
2974 {
2975   unsigned char c;
2976   int seq_maybe_bytes;
2977   /* Dummy for ONE_MORE_BYTE.  */
2978   struct coding_system dummy_coding;
2979   struct coding_system *coding = &dummy_coding;
2980
2981   while (1)
2982     {
2983       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2984       if (UTF_8_1_OCTET_P (c))
2985         continue;
2986       else if (UTF_8_2_OCTET_LEADING_P (c))
2987         seq_maybe_bytes = 1;
2988       else if (UTF_8_3_OCTET_LEADING_P (c))
2989         seq_maybe_bytes = 2;
2990       else if (UTF_8_4_OCTET_LEADING_P (c))
2991         seq_maybe_bytes = 3;
2992       else if (UTF_8_5_OCTET_LEADING_P (c))
2993         seq_maybe_bytes = 4;
2994       else if (UTF_8_6_OCTET_LEADING_P (c))
2995         seq_maybe_bytes = 5;
2996       else
2997         return 0;
2998
2999       do
3000         {
3001           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3002           if (!UTF_8_EXTRA_OCTET_P (c))
3003             return 0;
3004           seq_maybe_bytes--;
3005         }
3006       while (seq_maybe_bytes > 0);
3007     }
3008
3009  label_end_of_loop:
3010   return CODING_CATEGORY_MASK_UTF_8;
3011 }
3012
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text starts with a UTF-16 byte order mark.  If the
   first two bytes are 0xFE 0xFF, return
   CODING_CATEGORY_MASK_UTF_16_BE (Big Endian); if they are 0xFF 0xFE,
   return CODING_CATEGORY_MASK_UTF_16_LE (Little Endian); else
   return 0.  */
3018
/* Nonzero if the 16-bit code unit VAL is 0xFFFE or 0xFFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.
   Compare the upper six bits: masking with 0xD800 itself would also
   accept 0xDC00..0xDFFF and 0xFFFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.
   Compare the upper six bits: masking with 0xDC00 itself would also
   accept 0xFC00..0xFFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3028
3029 static int
3030 detect_coding_utf_16 (src, src_end, multibytep)
3031      unsigned char *src, *src_end;
3032      int multibytep;
3033 {
3034   unsigned char c1, c2;
3035   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3036   struct coding_system dummy_coding;
3037   struct coding_system *coding = &dummy_coding;
3038
3039   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3040   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3041
3042   if ((c1 == 0xFF) && (c2 == 0xFE))
3043     return CODING_CATEGORY_MASK_UTF_16_LE;
3044   else if ((c1 == 0xFE) && (c2 == 0xFF))
3045     return CODING_CATEGORY_MASK_UTF_16_BE;
3046
3047  label_end_of_loop:
3048   return 0;
3049 }
3050
3051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3052    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3053
3054 static void
3055 decode_coding_sjis_big5 (coding, source, destination,
3056                          src_bytes, dst_bytes, sjis_p)
3057      struct coding_system *coding;
3058      const unsigned char *source;
3059      unsigned char  *destination;
3060      int src_bytes, dst_bytes;
3061      int sjis_p;
3062 {
3063   const unsigned char *src = source;
3064   const unsigned char *src_end = source + src_bytes;
3065   unsigned char *dst = destination;
3066   unsigned char *dst_end = destination + dst_bytes;
3067   /* SRC_BASE remembers the start position in source in each loop.
3068      The loop will be exited when there's not enough source code
3069      (within macro ONE_MORE_BYTE), or when there's not enough
3070      destination area to produce a character (within macro
3071      EMIT_CHAR).  */
3072   const unsigned char *src_base;
3073   Lisp_Object translation_table;
3074
3075   if (NILP (Venable_character_translation))
3076     translation_table = Qnil;
3077   else
3078     {
3079       translation_table = coding->translation_table_for_decode;
3080       if (NILP (translation_table))
3081         translation_table = Vstandard_translation_table_for_decode;
3082     }
3083
3084   coding->produced_char = 0;
3085   while (1)
3086     {
3087       int c, charset, c1, c2 = 0;
3088
3089       src_base = src;
3090       ONE_MORE_BYTE (c1);
3091
3092       if (c1 < 0x80)
3093         {
3094           charset = CHARSET_ASCII;
3095           if (c1 < 0x20)
3096             {
3097               if (c1 == '\r')
3098                 {
3099                   if (coding->eol_type == CODING_EOL_CRLF)
3100                     {
3101                       ONE_MORE_BYTE (c2);
3102                       if (c2 == '\n')
3103                         c1 = c2;
3104                       else
3105                         /* To process C2 again, SRC is subtracted by 1.  */
3106                         src--;
3107                     }
3108                   else if (coding->eol_type == CODING_EOL_CR)
3109                     c1 = '\n';
3110                 }
3111               else if (c1 == '\n'
3112                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3113                        && (coding->eol_type == CODING_EOL_CR
3114                            || coding->eol_type == CODING_EOL_CRLF))
3115                 {
3116                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3117                   goto label_end_of_loop;
3118                 }
3119             }
3120         }
3121       else
3122         {
3123           if (sjis_p)
3124             {
3125               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3126                 goto label_invalid_code;
3127               if (c1 <= 0x9F || c1 >= 0xE0)
3128                 {
3129                   /* SJIS -> JISX0208 */
3130                   ONE_MORE_BYTE (c2);
3131                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3132                     goto label_invalid_code;
3133                   DECODE_SJIS (c1, c2, c1, c2);
3134                   charset = charset_jisx0208;
3135                 }
3136               else
3137                 /* SJIS -> JISX0201-Kana */
3138                 charset = charset_katakana_jisx0201;
3139             }
3140           else
3141             {
3142               /* BIG5 -> Big5 */
3143               if (c1 < 0xA0 || c1 > 0xFE)
3144                 goto label_invalid_code;
3145               ONE_MORE_BYTE (c2);
3146               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3147                 goto label_invalid_code;
3148               DECODE_BIG5 (c1, c2, charset, c1, c2);
3149             }
3150         }
3151
3152       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3153       EMIT_CHAR (c);
3154       continue;
3155
3156     label_invalid_code:
3157       coding->errors++;
3158       src = src_base;
3159       c = *src++;
3160       EMIT_CHAR (c);
3161     }
3162
3163  label_end_of_loop:
3164   coding->consumed = coding->consumed_char = src_base - source;
3165   coding->produced = dst - destination;
3166   return;
3167 }
3168
3169 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3170    This function can encode charsets `ascii', `katakana-jisx0201',
3171    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3172    are sure that all these charsets are registered as official charset
3173    (i.e. do not have extended leading-codes).  Characters of other
3174    charsets are produced without any encoding.  If SJIS_P is 1, encode
3175    SJIS text, else encode BIG5 text.  */
3176
3177 static void
3178 encode_coding_sjis_big5 (coding, source, destination,
3179                          src_bytes, dst_bytes, sjis_p)
3180      struct coding_system *coding;
3181      unsigned char *source, *destination;
3182      int src_bytes, dst_bytes;
3183      int sjis_p;
3184 {
3185   unsigned char *src = source;
3186   unsigned char *src_end = source + src_bytes;
3187   unsigned char *dst = destination;
3188   unsigned char *dst_end = destination + dst_bytes;
3189   /* SRC_BASE remembers the start position in source in each loop.
3190      The loop will be exited when there's not enough source text to
3191      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3192      there's not enough destination area to produce encoded codes
3193      (within macro EMIT_BYTES).  */
3194   unsigned char *src_base;
3195   Lisp_Object translation_table;
3196
3197   if (NILP (Venable_character_translation))
3198     translation_table = Qnil;
3199   else
3200     {
3201       translation_table = coding->translation_table_for_encode;
3202       if (NILP (translation_table))
3203         translation_table = Vstandard_translation_table_for_encode;
3204     }
3205
3206   while (1)
3207     {
3208       int c, charset, c1, c2;
3209
3210       src_base = src;
3211       ONE_MORE_CHAR (c);
3212
3213       /* Now encode the character C.  */
3214       if (SINGLE_BYTE_CHAR_P (c))
3215         {
3216           switch (c)
3217             {
3218             case '\r':
3219               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3220                 {
3221                   EMIT_ONE_BYTE (c);
3222                   break;
3223                 }
3224               c = '\n';
3225             case '\n':
3226               if (coding->eol_type == CODING_EOL_CRLF)
3227                 {
3228                   EMIT_TWO_BYTES ('\r', c);
3229                   break;
3230                 }
3231               else if (coding->eol_type == CODING_EOL_CR)
3232                 c = '\r';
3233             default:
3234               EMIT_ONE_BYTE (c);
3235             }
3236         }
3237       else
3238         {
3239           SPLIT_CHAR (c, charset, c1, c2);
3240           if (sjis_p)
3241             {
3242               if (charset == charset_jisx0208
3243                   || charset == charset_jisx0208_1978)
3244                 {
3245                   ENCODE_SJIS (c1, c2, c1, c2);
3246                   EMIT_TWO_BYTES (c1, c2);
3247                 }
3248               else if (charset == charset_katakana_jisx0201)
3249                 EMIT_ONE_BYTE (c1 | 0x80);
3250               else if (charset == charset_latin_jisx0201)
3251                 EMIT_ONE_BYTE (c1);
3252               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3253                 {
3254                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3255                   if (CHARSET_WIDTH (charset) > 1)
3256                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3257                 }
3258               else
3259                 /* There's no way other than producing the internal
3260                    codes as is.  */
3261                 EMIT_BYTES (src_base, src);
3262             }
3263           else
3264             {
3265               if (charset == charset_big5_1 || charset == charset_big5_2)
3266                 {
3267                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3268                   EMIT_TWO_BYTES (c1, c2);
3269                 }
3270               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3271                 {
3272                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3273                   if (CHARSET_WIDTH (charset) > 1)
3274                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3275                 }
3276               else
3277                 /* There's no way other than producing the internal
3278                    codes as is.  */
3279                 EMIT_BYTES (src_base, src);
3280             }
3281         }
3282       coding->consumed_char++;
3283     }
3284
3285  label_end_of_loop:
3286   coding->consumed = src_base - source;
3287   coding->produced = coding->produced_char = dst - destination;
3288 }
3289
3290 \f
3291 /*** 5. CCL handlers ***/
3292
3293 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3294    Check if a text is encoded in a coding system of which
3295    encoder/decoder are written in CCL program.  If it is, return
3296    CODING_CATEGORY_MASK_CCL, else return 0.  */
3297
3298 static int
3299 detect_coding_ccl (src, src_end, multibytep)
3300      unsigned char *src, *src_end;
3301      int multibytep;
3302 {
3303   unsigned char *valid;
3304   int c;
3305   /* Dummy for ONE_MORE_BYTE.  */
3306   struct coding_system dummy_coding;
3307   struct coding_system *coding = &dummy_coding;
3308
3309   /* No coding system is assigned to coding-category-ccl.  */
3310   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3311     return 0;
3312
3313   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3314   while (1)
3315     {
3316       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3317       if (! valid[c])
3318         return 0;
3319     }
3320  label_end_of_loop:
3321   return CODING_CATEGORY_MASK_CCL;
3322 }
3323
3324 \f
3325 /*** 6. End-of-line handlers ***/
3326
3327 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3328
3329 static void
3330 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3331      struct coding_system *coding;
3332      const unsigned char *source;
3333      unsigned char *destination;
3334      int src_bytes, dst_bytes;
3335 {
3336   const unsigned char *src = source;
3337   unsigned char *dst = destination;
3338   const unsigned char *src_end = src + src_bytes;
3339   unsigned char *dst_end = dst + dst_bytes;
3340   Lisp_Object translation_table;
3341   /* SRC_BASE remembers the start position in source in each loop.
3342      The loop will be exited when there's not enough source code
3343      (within macro ONE_MORE_BYTE), or when there's not enough
3344      destination area to produce a character (within macro
3345      EMIT_CHAR).  */
3346   const unsigned char *src_base;
3347   int c;
3348
3349   translation_table = Qnil;
3350   switch (coding->eol_type)
3351     {
3352     case CODING_EOL_CRLF:
3353       while (1)
3354         {
3355           src_base = src;
3356           ONE_MORE_BYTE (c);
3357           if (c == '\r')
3358             {
3359               ONE_MORE_BYTE (c);
3360               if (c != '\n')
3361                 {
3362                   src--;
3363                   c = '\r';
3364                 }
3365             }
3366           else if (c == '\n'
3367                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3368             {
3369               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3370               goto label_end_of_loop;
3371             }
3372           EMIT_CHAR (c);
3373         }
3374       break;
3375
3376     case CODING_EOL_CR:
3377       while (1)
3378         {
3379           src_base = src;
3380           ONE_MORE_BYTE (c);
3381           if (c == '\n')
3382             {
3383               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3384                 {
3385                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3386                   goto label_end_of_loop;
3387                 }
3388             }
3389           else if (c == '\r')
3390             c = '\n';
3391           EMIT_CHAR (c);
3392         }
3393       break;
3394
3395     default:                    /* no need for EOL handling */
3396       while (1)
3397         {
3398           src_base = src;
3399           ONE_MORE_BYTE (c);
3400           EMIT_CHAR (c);
3401         }
3402     }
3403
3404  label_end_of_loop:
3405   coding->consumed = coding->consumed_char = src_base - source;
3406   coding->produced = dst - destination;
3407   return;
3408 }
3409
3410 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3411    format of end-of-line according to `coding->eol_type'.  It also
3412    convert multibyte form 8-bit characters to unibyte if
3413    CODING->src_multibyte is nonzero.  If `coding->mode &
3414    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3415    also means end-of-line.  */
3416
3417 static void
3418 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3419      struct coding_system *coding;
3420      const unsigned char *source;
3421      unsigned char *destination;
3422      int src_bytes, dst_bytes;
3423 {
3424   const unsigned char *src = source;
3425   unsigned char *dst = destination;
3426   const unsigned char *src_end = src + src_bytes;
3427   unsigned char *dst_end = dst + dst_bytes;
3428   Lisp_Object translation_table;
3429   /* SRC_BASE remembers the start position in source in each loop.
3430      The loop will be exited when there's not enough source text to
3431      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3432      there's not enough destination area to produce encoded codes
3433      (within macro EMIT_BYTES).  */
3434   const unsigned char *src_base;
3435   unsigned char *tmp;
3436   int c;
3437   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3438
3439   translation_table = Qnil;
3440   if (coding->src_multibyte
3441       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3442     {
3443       src_end--;
3444       src_bytes--;
3445       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3446     }
3447
3448   if (coding->eol_type == CODING_EOL_CRLF)
3449     {
3450       while (src < src_end)
3451         {
3452           src_base = src;
3453           c = *src++;
3454           if (c >= 0x20)
3455             EMIT_ONE_BYTE (c);
3456           else if (c == '\n' || (c == '\r' && selective_display))
3457             EMIT_TWO_BYTES ('\r', '\n');
3458           else
3459             EMIT_ONE_BYTE (c);
3460         }
3461       src_base = src;
3462     label_end_of_loop:
3463       ;
3464     }
3465   else
3466     {
3467       if (!dst_bytes || src_bytes <= dst_bytes)
3468         {
3469           safe_bcopy (src, dst, src_bytes);
3470           src_base = src_end;
3471           dst += src_bytes;
3472         }
3473       else
3474         {
3475           if (coding->src_multibyte
3476               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3477             dst_bytes--;
3478           safe_bcopy (src, dst, dst_bytes);
3479           src_base = src + dst_bytes;
3480           dst = destination + dst_bytes;
3481           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3482         }
3483       if (coding->eol_type == CODING_EOL_CR)
3484         {
3485           for (tmp = destination; tmp < dst; tmp++)
3486             if (*tmp == '\n') *tmp = '\r';
3487         }
3488       else if (selective_display)
3489         {
3490           for (tmp = destination; tmp < dst; tmp++)
3491             if (*tmp == '\r') *tmp = '\n';
3492         }
3493     }
3494   if (coding->src_multibyte)
3495     dst = destination + str_as_unibyte (destination, dst - destination);
3496
3497   coding->consumed = src_base - source;
3498   coding->produced = dst - destination;
3499   coding->produced_char = coding->produced;
3500 }
3501
3502 \f
3503 /*** 7. C library functions ***/
3504
3505 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3506    has a property `coding-system'.  The value of this property is a
3507    vector of length 5 (called the coding-vector).  Among elements of
3508    this vector, the first (element[0]) and the fifth (element[4])
3509    carry important information for decoding/encoding.  Before
3510    decoding/encoding, this information should be set in fields of a
3511    structure of type `coding_system'.
3512
3513    The value of the property `coding-system' can be a symbol of another
3514    subsidiary coding-system.  In that case, Emacs gets coding-vector
3515    from that symbol.
3516
   `element[0]' contains information to be set in `coding->type'.  The
   possible values and their meanings are as follows:
3519
3520    0 -- coding_type_emacs_mule
3521    1 -- coding_type_sjis
3522    2 -- coding_type_iso2022
3523    3 -- coding_type_big5
3524    4 -- coding_type_ccl encoder/decoder written in CCL
3525    nil -- coding_type_no_conversion
3526    t -- coding_type_undecided (automatic conversion on decoding,
3527                                no-conversion on encoding)
3528
3529    `element[4]' contains information to be set in `coding->flags' and
3530    `coding->spec'.  The meaning varies by `coding->type'.
3531
3532    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3533    of length 32 (of which the first 13 sub-elements are used now).
3534    Meanings of these sub-elements are:
3535
3536    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3537         If the value is an integer of valid charset, the charset is
3538         assumed to be designated to graphic register N initially.
3539
        If the value is negative, it is the negation of a charset
        which reserves graphic register N; that is, the charset is
        not designated initially but should be designated to graphic
        register N just before encoding a character in that charset.
3544
3545         If the value is nil, graphic register N is never used on
3546         encoding.
3547
3548    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3549         Each value takes t or nil.  See the section ISO2022 of
3550         `coding.h' for more information.
3551
3552    If `coding->type' is `coding_type_big5', element[4] is t to denote
3553    BIG5-ETen or nil to denote BIG5-HKU.
3554
3555    If `coding->type' takes the other value, element[4] is ignored.
3556
3557    Emacs Lisp's coding systems also carry information about format of
3558    end-of-line in a value of property `eol-type'.  If the value is
3559    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3560    means CODING_EOL_CR.  If it is not integer, it should be a vector
3561    of subsidiary coding systems of which property `eol-type' has one
3562    of the above values.
3563
3564 */
3565
3566 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3567    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3568    is setup so that no conversion is necessary and return -1, else
3569    return 0.  */
3570
3571 int
3572 setup_coding_system (coding_system, coding)
3573      Lisp_Object coding_system;
3574      struct coding_system *coding;
3575 {
3576   Lisp_Object coding_spec, coding_type, eol_type, plist;
3577   Lisp_Object val;
3578
3579   /* At first, zero clear all members.  */
3580   bzero (coding, sizeof (struct coding_system));
3581
3582   /* Initialize some fields required for all kinds of coding systems.  */
3583   coding->symbol = coding_system;
3584   coding->heading_ascii = -1;
3585   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3586   coding->composing = COMPOSITION_DISABLED;
3587   coding->cmp_data = NULL;
3588
3589   if (NILP (coding_system))
3590     goto label_invalid_coding_system;
3591
3592   coding_spec = Fget (coding_system, Qcoding_system);
3593
3594   if (!VECTORP (coding_spec)
3595       || XVECTOR (coding_spec)->size != 5
3596       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3597     goto label_invalid_coding_system;
3598
3599   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3600   if (VECTORP (eol_type))
3601     {
3602       coding->eol_type = CODING_EOL_UNDECIDED;
3603       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3604     }
3605   else if (XFASTINT (eol_type) == 1)
3606     {
3607       coding->eol_type = CODING_EOL_CRLF;
3608       coding->common_flags
3609         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3610     }
3611   else if (XFASTINT (eol_type) == 2)
3612     {
3613       coding->eol_type = CODING_EOL_CR;
3614       coding->common_flags
3615         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3616     }
3617   else
3618     coding->eol_type = CODING_EOL_LF;
3619
3620   coding_type = XVECTOR (coding_spec)->contents[0];
3621   /* Try short cut.  */
3622   if (SYMBOLP (coding_type))
3623     {
3624       if (EQ (coding_type, Qt))
3625         {
3626           coding->type = coding_type_undecided;
3627           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3628         }
3629       else
3630         coding->type = coding_type_no_conversion;
3631       /* Initialize this member.  Any thing other than
3632          CODING_CATEGORY_IDX_UTF_16_BE and
3633          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3634          special treatment in detect_eol.  */
3635       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3636
3637       return 0;
3638     }
3639
3640   /* Get values of coding system properties:
3641      `post-read-conversion', `pre-write-conversion',
3642      `translation-table-for-decode', `translation-table-for-encode'.  */
3643   plist = XVECTOR (coding_spec)->contents[3];
3644   /* Pre & post conversion functions should be disabled if
3645      inhibit_eol_conversion is nonzero.  This is the case that a code
3646      conversion function is called while those functions are running.  */
3647   if (! inhibit_pre_post_conversion)
3648     {
3649       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3650       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3651     }
3652   val = Fplist_get (plist, Qtranslation_table_for_decode);
3653   if (SYMBOLP (val))
3654     val = Fget (val, Qtranslation_table_for_decode);
3655   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3656   val = Fplist_get (plist, Qtranslation_table_for_encode);
3657   if (SYMBOLP (val))
3658     val = Fget (val, Qtranslation_table_for_encode);
3659   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3660   val = Fplist_get (plist, Qcoding_category);
3661   if (!NILP (val))
3662     {
3663       val = Fget (val, Qcoding_category_index);
3664       if (INTEGERP (val))
3665         coding->category_idx = XINT (val);
3666       else
3667         goto label_invalid_coding_system;
3668     }
3669   else
3670     goto label_invalid_coding_system;
3671
3672   /* If the coding system has non-nil `composition' property, enable
3673      composition handling.  */
3674   val = Fplist_get (plist, Qcomposition);
3675   if (!NILP (val))
3676     coding->composing = COMPOSITION_NO;
3677
3678   switch (XFASTINT (coding_type))
3679     {
3680     case 0:
3681       coding->type = coding_type_emacs_mule;
3682       coding->common_flags
3683         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3684       if (!NILP (coding->post_read_conversion))
3685         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3686       if (!NILP (coding->pre_write_conversion))
3687         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3688       break;
3689
3690     case 1:
3691       coding->type = coding_type_sjis;
3692       coding->common_flags
3693         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3694       break;
3695
3696     case 2:
3697       coding->type = coding_type_iso2022;
3698       coding->common_flags
3699         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3700       {
3701         Lisp_Object val, temp;
3702         Lisp_Object *flags;
3703         int i, charset, reg_bits = 0;
3704
3705         val = XVECTOR (coding_spec)->contents[4];
3706
3707         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3708           goto label_invalid_coding_system;
3709
3710         flags = XVECTOR (val)->contents;
3711         coding->flags
3712           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3713              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3714              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3715              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3716              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3717              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3718              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3719              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3720              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3721              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3722              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3723              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3724              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3725              );
3726
3727         /* Invoke graphic register 0 to plane 0.  */
3728         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3729         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3730         CODING_SPEC_ISO_INVOCATION (coding, 1)
3731           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3732         /* Not single shifting at first.  */
3733         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3734         /* Beginning of buffer should also be regarded as bol. */
3735         CODING_SPEC_ISO_BOL (coding) = 1;
3736
3737         for (charset = 0; charset <= MAX_CHARSET; charset++)
3738           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3739         val = Vcharset_revision_alist;
3740         while (CONSP (val))
3741           {
3742             charset = get_charset_id (Fcar_safe (XCAR (val)));
3743             if (charset >= 0
3744                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3745                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3746               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3747             val = XCDR (val);
3748           }
3749
3750         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3751            FLAGS[REG] can be one of below:
3752                 integer CHARSET: CHARSET occupies register I,
3753                 t: designate nothing to REG initially, but can be used
3754                   by any charsets,
3755                 list of integer, nil, or t: designate the first
3756                   element (if integer) to REG initially, the remaining
3757                   elements (if integer) is designated to REG on request,
3758                   if an element is t, REG can be used by any charsets,
3759                 nil: REG is never used.  */
3760         for (charset = 0; charset <= MAX_CHARSET; charset++)
3761           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3762             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3763         for (i = 0; i < 4; i++)
3764           {
3765             if ((INTEGERP (flags[i])
3766                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3767                 || (charset = get_charset_id (flags[i])) >= 0)
3768               {
3769                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3770                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3771               }
3772             else if (EQ (flags[i], Qt))
3773               {
3774                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3775                 reg_bits |= 1 << i;
3776                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3777               }
3778             else if (CONSP (flags[i]))
3779               {
3780                 Lisp_Object tail;
3781                 tail = flags[i];
3782
3783                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3784                 if ((INTEGERP (XCAR (tail))
3785                      && (charset = XINT (XCAR (tail)),
3786                          CHARSET_VALID_P (charset)))
3787                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3788                   {
3789                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3790                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3791                   }
3792                 else
3793                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3794                 tail = XCDR (tail);
3795                 while (CONSP (tail))
3796                   {
3797                     if ((INTEGERP (XCAR (tail))
3798                          && (charset = XINT (XCAR (tail)),
3799                              CHARSET_VALID_P (charset)))
3800                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3801                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3802                         = i;
3803                     else if (EQ (XCAR (tail), Qt))
3804                       reg_bits |= 1 << i;
3805                     tail = XCDR (tail);
3806                   }
3807               }
3808             else
3809               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3810
3811             CODING_SPEC_ISO_DESIGNATION (coding, i)
3812               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3813           }
3814
3815         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3816           {
3817             /* REG 1 can be used only by locking shift in 7-bit env.  */
3818             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3819               reg_bits &= ~2;
3820             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3821               /* Without any shifting, only REG 0 and 1 can be used.  */
3822               reg_bits &= 3;
3823           }
3824
3825         if (reg_bits)
3826           for (charset = 0; charset <= MAX_CHARSET; charset++)
3827             {
3828               if (CHARSET_DEFINED_P (charset)
3829                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3830                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3831                 {
3832                   /* There exist some default graphic registers to be
3833                      used by CHARSET.  */
3834
3835                   /* We had better avoid designating a charset of
3836                      CHARS96 to REG 0 as far as possible.  */
3837                   if (CHARSET_CHARS (charset) == 96)
3838                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3839                       = (reg_bits & 2
3840                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3841                   else
3842                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3843                       = (reg_bits & 1
3844                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3845                 }
3846             }
3847       }
3848       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3849       coding->spec.iso2022.last_invalid_designation_register = -1;
3850       break;
3851
3852     case 3:
3853       coding->type = coding_type_big5;
3854       coding->common_flags
3855         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3856       coding->flags
3857         = (NILP (XVECTOR (coding_spec)->contents[4])
3858            ? CODING_FLAG_BIG5_HKU
3859            : CODING_FLAG_BIG5_ETEN);
3860       break;
3861
3862     case 4:
3863       coding->type = coding_type_ccl;
3864       coding->common_flags
3865         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3866       {
3867         val = XVECTOR (coding_spec)->contents[4];
3868         if (! CONSP (val)
3869             || setup_ccl_program (&(coding->spec.ccl.decoder),
3870                                   XCAR (val)) < 0
3871             || setup_ccl_program (&(coding->spec.ccl.encoder),
3872                                   XCDR (val)) < 0)
3873           goto label_invalid_coding_system;
3874
3875         bzero (coding->spec.ccl.valid_codes, 256);
3876         val = Fplist_get (plist, Qvalid_codes);
3877         if (CONSP (val))
3878           {
3879             Lisp_Object this;
3880
3881             for (; CONSP (val); val = XCDR (val))
3882               {
3883                 this = XCAR (val);
3884                 if (INTEGERP (this)
3885                     && XINT (this) >= 0 && XINT (this) < 256)
3886                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3887                 else if (CONSP (this)
3888                          && INTEGERP (XCAR (this))
3889                          && INTEGERP (XCDR (this)))
3890                   {
3891                     int start = XINT (XCAR (this));
3892                     int end = XINT (XCDR (this));
3893
3894                     if (start >= 0 && start <= end && end < 256)
3895                       while (start <= end)
3896                         coding->spec.ccl.valid_codes[start++] = 1;
3897                   }
3898               }
3899           }
3900       }
3901       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3902       coding->spec.ccl.cr_carryover = 0;
3903       coding->spec.ccl.eight_bit_carryover[0] = 0;
3904       break;
3905
3906     case 5:
3907       coding->type = coding_type_raw_text;
3908       break;
3909
3910     default:
3911       goto label_invalid_coding_system;
3912     }
3913   return 0;
3914
3915  label_invalid_coding_system:
3916   coding->type = coding_type_no_conversion;
3917   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3918   coding->common_flags = 0;
3919   coding->eol_type = CODING_EOL_LF;
3920   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3921   return -1;
3922 }
3923
3924 /* Free memory blocks allocated for storing composition information.  */
3925
3926 void
3927 coding_free_composition_data (coding)
3928      struct coding_system *coding;
3929 {
3930   struct composition_data *cmp_data = coding->cmp_data, *next;
3931
3932   if (!cmp_data)
3933     return;
3934   /* Memory blocks are chained.  At first, rewind to the first, then,
3935      free blocks one by one.  */
3936   while (cmp_data->prev)
3937     cmp_data = cmp_data->prev;
3938   while (cmp_data)
3939     {
3940       next = cmp_data->next;
3941       xfree (cmp_data);
3942       cmp_data = next;
3943     }
3944   coding->cmp_data = NULL;
3945 }
3946
3947 /* Set `char_offset' member of all memory blocks pointed by
3948    coding->cmp_data to POS.  */
3949
3950 void
3951 coding_adjust_composition_offset (coding, pos)
3952      struct coding_system *coding;
3953      int pos;
3954 {
3955   struct composition_data *cmp_data;
3956
3957   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3958     cmp_data->char_offset = pos;
3959 }
3960
3961 /* Setup raw-text or one of its subsidiaries in the structure
3962    coding_system CODING according to the already setup value eol_type
3963    in CODING.  CODING should be setup for some coding system in
3964    advance.  */
3965
3966 void
3967 setup_raw_text_coding_system (coding)
3968      struct coding_system *coding;
3969 {
3970   if (coding->type != coding_type_raw_text)
3971     {
3972       coding->symbol = Qraw_text;
3973       coding->type = coding_type_raw_text;
3974       if (coding->eol_type != CODING_EOL_UNDECIDED)
3975         {
3976           Lisp_Object subsidiaries;
3977           subsidiaries = Fget (Qraw_text, Qeol_type);
3978
3979           if (VECTORP (subsidiaries)
3980               && XVECTOR (subsidiaries)->size == 3)
3981             coding->symbol
3982               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3983         }
3984       setup_coding_system (coding->symbol, coding);
3985     }
3986   return;
3987 }
3988
3989 /* Emacs has a mechanism to automatically detect a coding system if it
3990    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3991    it's impossible to distinguish some coding systems accurately
3992    because they use the same range of codes.  So, at first, coding
3993    systems are grouped into the categories listed below:
3994
3995    o coding-category-emacs-mule
3996
3997         The category for a coding system which has the same code range
3998         as Emacs' internal format.  Assigned the coding-system (Lisp
3999         symbol) `emacs-mule' by default.
4000
4001    o coding-category-sjis
4002
4003         The category for a coding system which has the same code range
4004         as SJIS.  Assigned the coding-system (Lisp
4005         symbol) `japanese-shift-jis' by default.
4006
4007    o coding-category-iso-7
4008
4009         The category for a coding system which has the same code range
4010         as ISO2022 of 7-bit environment.  This doesn't use any locking
4011         shift and single shift functions.  This can encode/decode all
4012         charsets.  Assigned the coding-system (Lisp symbol)
4013         `iso-2022-7bit' by default.
4014
4015    o coding-category-iso-7-tight
4016
4017         Same as coding-category-iso-7 except that this can
4018         encode/decode only the specified charsets.
4019
4020    o coding-category-iso-8-1
4021
4022         The category for a coding system which has the same code range
4023         as ISO2022 of 8-bit environment and graphic plane 1 used only
4024         for DIMENSION1 charset.  This doesn't use any locking shift
4025         and single shift functions.  Assigned the coding-system (Lisp
4026         symbol) `iso-latin-1' by default.
4027
4028    o coding-category-iso-8-2
4029
4030         The category for a coding system which has the same code range
4031         as ISO2022 of 8-bit environment and graphic plane 1 used only
4032         for DIMENSION2 charset.  This doesn't use any locking shift
4033         and single shift functions.  Assigned the coding-system (Lisp
4034         symbol) `japanese-iso-8bit' by default.
4035
4036    o coding-category-iso-7-else
4037
4038         The category for a coding system which has the same code range
4039         as ISO2022 of 7-bit environment but uses locking shift or
4040         single shift functions.  Assigned the coding-system (Lisp
4041         symbol) `iso-2022-7bit-lock' by default.
4042
4043    o coding-category-iso-8-else
4044
4045         The category for a coding system which has the same code range
4046         as ISO2022 of 8-bit environment but uses locking shift or
4047         single shift functions.  Assigned the coding-system (Lisp
4048         symbol) `iso-2022-8bit-ss2' by default.
4049
4050    o coding-category-big5
4051
4052         The category for a coding system which has the same code range
4053         as BIG5.  Assigned the coding-system (Lisp symbol)
4054         `cn-big5' by default.
4055
4056    o coding-category-utf-8
4057
4058         The category for a coding system which has the same code range
4059         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4060         symbol) `utf-8' by default.
4061
4062    o coding-category-utf-16-be
4063
4064         The category for a coding system in which a text has a
4065         Unicode signature (cf. Unicode Standard) in the order of BIG
4066         endian at the head.  Assigned the coding-system (Lisp symbol)
4067         `utf-16-be' by default.
4068
4069    o coding-category-utf-16-le
4070
4071         The category for a coding system in which a text has a
4072         Unicode signature (cf. Unicode Standard) in the order of
4073         LITTLE endian at the head.  Assigned the coding-system (Lisp
4074         symbol) `utf-16-le' by default.
4075
4076    o coding-category-ccl
4077
4078         The category for a coding system of which encoder/decoder is
4079         written in CCL programs.  The default value is nil, i.e., no
4080         coding system is assigned.
4081
4082    o coding-category-binary
4083
4084         The category for a coding system not categorized in any of the
4085         above.  Assigned the coding-system (Lisp symbol)
4086         `no-conversion' by default.
4087
4088    Each of them is a Lisp symbol and the value is an actual
4089    `coding-system' (this is also a Lisp symbol) assigned by a user.
4090    What Emacs does actually is to detect a category of coding system.
4091    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4092    decide a single possible category, it selects a category of the
4093    highest priority.  Priorities of categories are also specified by a
4094    user in a Lisp variable `coding-category-list'.
4095
4096 */
4097
/* Table indexed by byte value.  detect_coding_mask skips over leading
   bytes whose entry is nonzero, and toggles the entries for the
   ISO2022 control codes SO, SI and ESC while it runs.  */
static int ascii_skip_code[256];
4100
4101 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4102    If it detects possible coding systems, return an integer in which
4103    appropriate flag bits are set.  Flag bits are defined by macros
4104    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4105    it should point the table `coding_priorities'.  In that case, only
4106    the flag bit for a coding system of the highest priority is set in
4107    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4108    range 0x80..0x9F are in multibyte form.
4109
4110    How many ASCII characters are at the head is returned as *SKIP.  */
4111
4112 static int
4113 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4114      unsigned char *source;
4115      int src_bytes, *priorities, *skip;
4116      int multibytep;
4117 {
4118   register unsigned char c;
4119   unsigned char *src = source, *src_end = source + src_bytes;
4120   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4121   int i;
4122
4123   /* At first, skip all ASCII characters and control characters except
4124      for three ISO2022 specific control characters.  */
4125   ascii_skip_code[ISO_CODE_SO] = 0;
4126   ascii_skip_code[ISO_CODE_SI] = 0;
4127   ascii_skip_code[ISO_CODE_ESC] = 0;
4128
4129  label_loop_detect_coding:
4130   while (src < src_end && ascii_skip_code[*src]) src++;
4131   *skip = src - source;
4132
4133   if (src >= src_end)
4134     /* We found nothing other than ASCII.  There's nothing to do.  */
4135     return 0;
4136
4137   c = *src;
4138   /* The text seems to be encoded in some multilingual coding system.
4139      Now, try to find in which coding system the text is encoded.  */
4140   if (c < 0x80)
4141     {
4142       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4143       /* C is an ISO2022 specific control code of C0.  */
4144       mask = detect_coding_iso2022 (src, src_end, multibytep);
4145       if (mask == 0)
4146         {
4147           /* No valid ISO2022 code follows C.  Try again.  */
4148           src++;
4149           if (c == ISO_CODE_ESC)
4150             ascii_skip_code[ISO_CODE_ESC] = 1;
4151           else
4152             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4153           goto label_loop_detect_coding;
4154         }
4155       if (priorities)
4156         {
4157           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4158             {
4159               if (mask & priorities[i])
4160                 return priorities[i];
4161             }
4162           return CODING_CATEGORY_MASK_RAW_TEXT;
4163         }
4164     }
4165   else
4166     {
4167       int try;
4168
4169       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4170         c = src[1] - 0x20;
4171
4172       if (c < 0xA0)
4173         {
4174           /* C is the first byte of SJIS character code,
4175              or a leading-code of Emacs' internal format (emacs-mule),
4176              or the first byte of UTF-16.  */
4177           try = (CODING_CATEGORY_MASK_SJIS
4178                   | CODING_CATEGORY_MASK_EMACS_MULE
4179                   | CODING_CATEGORY_MASK_UTF_16_BE
4180                   | CODING_CATEGORY_MASK_UTF_16_LE);
4181
4182           /* Or, if C is a special latin extra code,
4183              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4184              or is an ISO2022 control-sequence-introducer (CSI),
4185              we should also consider the possibility of ISO2022 codings.  */
4186           if ((VECTORP (Vlatin_extra_code_table)
4187                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4188               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4189               || (c == ISO_CODE_CSI
4190                   && (src < src_end
4191                       && (*src == ']'
4192                           || ((*src == '0' || *src == '1' || *src == '2')
4193                               && src + 1 < src_end
4194                               && src[1] == ']')))))
4195             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4196                      | CODING_CATEGORY_MASK_ISO_8BIT);
4197         }
4198       else
4199         /* C is a character of ISO2022 in graphic plane right,
4200            or a SJIS's 1-byte character code (i.e. JISX0201),
4201            or the first byte of BIG5's 2-byte code,
4202            or the first byte of UTF-8/16.  */
4203         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4204                 | CODING_CATEGORY_MASK_ISO_8BIT
4205                 | CODING_CATEGORY_MASK_SJIS
4206                 | CODING_CATEGORY_MASK_BIG5
4207                 | CODING_CATEGORY_MASK_UTF_8
4208                 | CODING_CATEGORY_MASK_UTF_16_BE
4209                 | CODING_CATEGORY_MASK_UTF_16_LE);
4210
4211       /* Or, we may have to consider the possibility of CCL.  */
4212       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4213           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4214               ->spec.ccl.valid_codes)[c])
4215         try |= CODING_CATEGORY_MASK_CCL;
4216
4217       mask = 0;
4218       utf16_examined_p = iso2022_examined_p = 0;
4219       if (priorities)
4220         {
4221           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4222             {
4223               if (!iso2022_examined_p
4224                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4225                 {
4226                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4227                   iso2022_examined_p = 1;
4228                 }
4229               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4230                 mask |= detect_coding_sjis (src, src_end, multibytep);
4231               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4232                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4233               else if (!utf16_examined_p
4234                        && (priorities[i] & try &
4235                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4236                 {
4237                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4238                   utf16_examined_p = 1;
4239                 }
4240               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4241                 mask |= detect_coding_big5 (src, src_end, multibytep);
4242               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4243                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4244               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4245                 mask |= detect_coding_ccl (src, src_end, multibytep);
4246               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4247                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4248               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4249                 mask |= CODING_CATEGORY_MASK_BINARY;
4250               if (mask & priorities[i])
4251                 return priorities[i];
4252             }
4253           return CODING_CATEGORY_MASK_RAW_TEXT;
4254         }
4255       if (try & CODING_CATEGORY_MASK_ISO)
4256         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4257       if (try & CODING_CATEGORY_MASK_SJIS)
4258         mask |= detect_coding_sjis (src, src_end, multibytep);
4259       if (try & CODING_CATEGORY_MASK_BIG5)
4260         mask |= detect_coding_big5 (src, src_end, multibytep);
4261       if (try & CODING_CATEGORY_MASK_UTF_8)
4262         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4263       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4264         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4265       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4266         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4267       if (try & CODING_CATEGORY_MASK_CCL)
4268         mask |= detect_coding_ccl (src, src_end, multibytep);
4269     }
4270   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4271 }
4272
4273 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4274    The information of the detected coding system is set in CODING.  */
4275
4276 void
4277 detect_coding (coding, src, src_bytes)
4278      struct coding_system *coding;
4279      const unsigned char *src;
4280      int src_bytes;
4281 {
4282   unsigned int idx;
4283   int skip, mask;
4284   Lisp_Object val;
4285
4286   val = Vcoding_category_list;
4287   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4288                              coding->src_multibyte);
4289   coding->heading_ascii = skip;
4290
4291   if (!mask) return;
4292
4293   /* We found a single coding system of the highest priority in MASK.  */
4294   idx = 0;
4295   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4296   if (! mask)
4297     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4298
4299   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4300
4301   if (coding->eol_type != CODING_EOL_UNDECIDED)
4302     {
4303       Lisp_Object tmp;
4304
4305       tmp = Fget (val, Qeol_type);
4306       if (VECTORP (tmp))
4307         val = XVECTOR (tmp)->contents[coding->eol_type];
4308     }
4309
4310   /* Setup this new coding system while preserving some slots.  */
4311   {
4312     int src_multibyte = coding->src_multibyte;
4313     int dst_multibyte = coding->dst_multibyte;
4314
4315     setup_coding_system (val, coding);
4316     coding->src_multibyte = src_multibyte;
4317     coding->dst_multibyte = dst_multibyte;
4318     coding->heading_ascii = skip;
4319   }
4320 }
4321
4322 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4323    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4324    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4325
4326    How many non-eol characters are at the head is returned as *SKIP.  */
4327
4328 #define MAX_EOL_CHECK_COUNT 3
4329
4330 static int
4331 detect_eol_type (source, src_bytes, skip)
4332      unsigned char *source;
4333      int src_bytes, *skip;
4334 {
4335   unsigned char *src = source, *src_end = src + src_bytes;
4336   unsigned char c;
4337   int total = 0;                /* How many end-of-lines are found so far.  */
4338   int eol_type = CODING_EOL_UNDECIDED;
4339   int this_eol_type;
4340
4341   *skip = 0;
4342
4343   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4344     {
4345       c = *src++;
4346       if (c == '\n' || c == '\r')
4347         {
4348           if (*skip == 0)
4349             *skip = src - 1 - source;
4350           total++;
4351           if (c == '\n')
4352             this_eol_type = CODING_EOL_LF;
4353           else if (src >= src_end || *src != '\n')
4354             this_eol_type = CODING_EOL_CR;
4355           else
4356             this_eol_type = CODING_EOL_CRLF, src++;
4357
4358           if (eol_type == CODING_EOL_UNDECIDED)
4359             /* This is the first end-of-line.  */
4360             eol_type = this_eol_type;
4361           else if (eol_type != this_eol_type)
4362             {
4363               /* The found type is different from what found before.  */
4364               eol_type = CODING_EOL_INCONSISTENT;
4365               break;
4366             }
4367         }
4368     }
4369
4370   if (*skip == 0)
4371     *skip = src_end - source;
4372   return eol_type;
4373 }
4374
4375 /* Like detect_eol_type, but detect EOL type in 2-octet
4376    big-endian/little-endian format for coding systems utf-16-be and
4377    utf-16-le.  */
4378
4379 static int
4380 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4381      unsigned char *source;
4382      int src_bytes, *skip, big_endian_p;
4383 {
4384   unsigned char *src = source, *src_end = src + src_bytes;
4385   unsigned int c1, c2;
4386   int total = 0;                /* How many end-of-lines are found so far.  */
4387   int eol_type = CODING_EOL_UNDECIDED;
4388   int this_eol_type;
4389   int msb, lsb;
4390
4391   if (big_endian_p)
4392     msb = 0, lsb = 1;
4393   else
4394     msb = 1, lsb = 0;
4395
4396   *skip = 0;
4397
4398   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4399     {
4400       c1 = (src[msb] << 8) | (src[lsb]);
4401       src += 2;
4402
4403       if (c1 == '\n' || c1 == '\r')
4404         {
4405           if (*skip == 0)
4406             *skip = src - 2 - source;
4407           total++;
4408           if (c1 == '\n')
4409             {
4410               this_eol_type = CODING_EOL_LF;
4411             }
4412           else
4413             {
4414               if ((src + 1) >= src_end)
4415                 {
4416                   this_eol_type = CODING_EOL_CR;
4417                 }
4418               else
4419                 {
4420                   c2 = (src[msb] << 8) | (src[lsb]);
4421                   if (c2 == '\n')
4422                     this_eol_type = CODING_EOL_CRLF, src += 2;
4423                   else
4424                     this_eol_type = CODING_EOL_CR;
4425                 }
4426             }
4427
4428           if (eol_type == CODING_EOL_UNDECIDED)
4429             /* This is the first end-of-line.  */
4430             eol_type = this_eol_type;
4431           else if (eol_type != this_eol_type)
4432             {
4433               /* The found type is different from what found before.  */
4434               eol_type = CODING_EOL_INCONSISTENT;
4435               break;
4436             }
4437         }
4438     }
4439
4440   if (*skip == 0)
4441     *skip = src_end - source;
4442   return eol_type;
4443 }
4444
4445 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4446    is encoded.  If it detects an appropriate format of end-of-line, it
4447    sets the information in *CODING.  */
4448
4449 void
4450 detect_eol (coding, src, src_bytes)
4451      struct coding_system *coding;
4452      const unsigned char *src;
4453      int src_bytes;
4454 {
4455   Lisp_Object val;
4456   int skip;
4457   int eol_type;
4458
4459   switch (coding->category_idx)
4460     {
4461     case CODING_CATEGORY_IDX_UTF_16_BE:
4462       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4463       break;
4464     case CODING_CATEGORY_IDX_UTF_16_LE:
4465       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4466       break;
4467     default:
4468       eol_type = detect_eol_type (src, src_bytes, &skip);
4469       break;
4470     }
4471
4472   if (coding->heading_ascii > skip)
4473     coding->heading_ascii = skip;
4474   else
4475     skip = coding->heading_ascii;
4476
4477   if (eol_type == CODING_EOL_UNDECIDED)
4478     return;
4479   if (eol_type == CODING_EOL_INCONSISTENT)
4480     {
4481 #if 0
4482       /* This code is suppressed until we find a better way to
4483          distinguish raw text file and binary file.  */
4484
4485       /* If we have already detected that the coding is raw-text, the
4486          coding should actually be no-conversion.  */
4487       if (coding->type == coding_type_raw_text)
4488         {
4489           setup_coding_system (Qno_conversion, coding);
4490           return;
4491         }
4492       /* Else, let's decode only text code anyway.  */
4493 #endif /* 0 */
4494       eol_type = CODING_EOL_LF;
4495     }
4496
4497   val = Fget (coding->symbol, Qeol_type);
4498   if (VECTORP (val) && XVECTOR (val)->size == 3)
4499     {
4500       int src_multibyte = coding->src_multibyte;
4501       int dst_multibyte = coding->dst_multibyte;
4502       struct composition_data *cmp_data = coding->cmp_data;
4503
4504       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4505       coding->src_multibyte = src_multibyte;
4506       coding->dst_multibyte = dst_multibyte;
4507       coding->heading_ascii = skip;
4508       coding->cmp_data = cmp_data;
4509     }
4510 }
4511
/* Extra bytes added on top of the estimated size of a conversion
   buffer.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the size of a text may grow when it is decoded by
   CODING (a pointer to struct coding_system): 3 for ISO2022, the
   magnification declared by the decoder program for CCL, 2 for
   everything else.  CODING is parenthesized so that any pointer
   expression, e.g. `&cs', can be passed.  */
#define DECODING_BUFFER_MAG(coding)                       \
  ((coding)->type == coding_type_iso2022                  \
   ? 3                                                    \
   : ((coding)->type == coding_type_ccl                   \
      ? (coding)->spec.ccl.decoder.buf_magnification      \
      : 2))
4520
4521 /* Return maximum size (bytes) of a buffer enough for decoding
4522    SRC_BYTES of text encoded in CODING.  */
4523
4524 int
4525 decoding_buffer_size (coding, src_bytes)
4526      struct coding_system *coding;
4527      int src_bytes;
4528 {
4529   return (src_bytes * DECODING_BUFFER_MAG (coding)
4530           + CONVERSION_BUFFER_EXTRA_ROOM);
4531 }
4532
4533 /* Return maximum size (bytes) of a buffer enough for encoding
4534    SRC_BYTES of text to CODING.  */
4535
4536 int
4537 encoding_buffer_size (coding, src_bytes)
4538      struct coding_system *coding;
4539      int src_bytes;
4540 {
4541   int magnification;
4542
4543   if (coding->type == coding_type_ccl)
4544     {
4545       magnification = coding->spec.ccl.encoder.buf_magnification;
4546       if (coding->eol_type == CODING_EOL_CRLF)
4547         magnification *= 2;
4548     }
4549   else if (CODING_REQUIRE_ENCODING (coding))
4550     magnification = 3;
4551   else
4552     magnification = 1;
4553
4554   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4555 }
4556
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated at DATA.  */
  int on_stack;                 /* 1 if allocated by alloca.  */
  unsigned char *data;          /* The memory itself.  */
};

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer, not
   a pointer to it).  Requests smaller than MAX_ALLOCA are served by
   alloca, so the memory belongs to the calling function's frame;
   larger ones come from xmalloc.

   Both arguments are parenthesized, and LEN is evaluated exactly once,
   so an argument with side effects is safe.  */
#define allocate_conversion_buffer(buf, len)                            \
  do {                                                                  \
    int conversion_buffer_len_ = (len);                                 \
    if (conversion_buffer_len_ < MAX_ALLOCA)                            \
      {                                                                 \
        (buf).data = (unsigned char *) alloca (conversion_buffer_len_); \
        (buf).on_stack = 1;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        (buf).data = (unsigned char *) xmalloc (conversion_buffer_len_); \
        (buf).on_stack = 0;                                             \
      }                                                                 \
    (buf).size = conversion_buffer_len_;                                \
  } while (0)
4580
4581 /* Double the allocated memory for *BUF.  */
4582 static void
4583 extend_conversion_buffer (buf)
4584      struct conversion_buffer *buf;
4585 {
4586   if (buf->on_stack)
4587     {
4588       unsigned char *save = buf->data;
4589       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4590       bcopy (save, buf->data, buf->size);
4591       buf->on_stack = 0;
4592     }
4593   else
4594     {
4595       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4596     }
4597   buf->size *= 2;
4598 }
4599
4600 /* Free the allocated memory for BUF if it is not on stack.  */
4601 static void
4602 free_conversion_buffer (buf)
4603      struct conversion_buffer *buf;
4604 {
4605   if (!buf->on_stack)
4606     xfree (buf->data);
4607 }
4608
4609 int
4610 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4611      struct coding_system *coding;
4612      unsigned char *source, *destination;
4613      int src_bytes, dst_bytes, encodep;
4614 {
4615   struct ccl_program *ccl
4616     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4617   unsigned char *dst = destination;
4618
4619   ccl->suppress_error = coding->suppress_error;
4620   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4621   if (encodep)
4622     {
4623       /* On encoding, EOL format is converted within ccl_driver.  For
4624          that, setup proper information in the structure CCL.  */
4625       ccl->eol_type = coding->eol_type;
4626       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4627         ccl->eol_type = CODING_EOL_LF;
4628       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4629       ccl->eight_bit_control = coding->dst_multibyte;
4630     }
4631   else
4632     ccl->eight_bit_control = 1;
4633   ccl->multibyte = coding->src_multibyte;
4634   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4635     {
4636       /* Move carryover bytes to DESTINATION.  */
4637       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4638       while (*p)
4639         *dst++ = *p++;
4640       coding->spec.ccl.eight_bit_carryover[0] = 0;
4641       if (dst_bytes)
4642         dst_bytes -= dst - destination;
4643     }
4644
4645   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4646                                   &(coding->consumed))
4647                       + dst - destination);
4648
4649   if (encodep)
4650     {
4651       coding->produced_char = coding->produced;
4652       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4653     }
4654   else if (!ccl->eight_bit_control)
4655     {
4656       /* The produced bytes forms a valid multibyte sequence. */
4657       coding->produced_char
4658         = multibyte_chars_in_text (destination, coding->produced);
4659       coding->spec.ccl.eight_bit_carryover[0] = 0;
4660     }
4661   else
4662     {
4663       /* On decoding, the destination should always multibyte.  But,
4664          CCL program might have been generated an invalid multibyte
4665          sequence.  Here we make such a sequence valid as
4666          multibyte.  */
4667       int bytes
4668         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4669
4670       if ((coding->consumed < src_bytes
4671            || !ccl->last_block)
4672           && coding->produced >= 1
4673           && destination[coding->produced - 1] >= 0x80)
4674         {
4675           /* We should not convert the tailing 8-bit codes to
4676              multibyte form even if they doesn't form a valid
4677              multibyte sequence.  They may form a valid sequence in
4678              the next call.  */
4679           int carryover = 0;
4680
4681           if (destination[coding->produced - 1] < 0xA0)
4682             carryover = 1;
4683           else if (coding->produced >= 2)
4684             {
4685               if (destination[coding->produced - 2] >= 0x80)
4686                 {
4687                   if (destination[coding->produced - 2] < 0xA0)
4688                     carryover = 2;
4689                   else if (coding->produced >= 3
4690                            && destination[coding->produced - 3] >= 0x80
4691                            && destination[coding->produced - 3] < 0xA0)
4692                     carryover = 3;
4693                 }
4694             }
4695           if (carryover > 0)
4696             {
4697               BCOPY_SHORT (destination + coding->produced - carryover,
4698                            coding->spec.ccl.eight_bit_carryover,
4699                            carryover);
4700               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4701               coding->produced -= carryover;
4702             }
4703         }
4704       coding->produced = str_as_multibyte (destination, bytes,
4705                                            coding->produced,
4706                                            &(coding->produced_char));
4707     }
4708
4709   switch (ccl->status)
4710     {
4711     case CCL_STAT_SUSPEND_BY_SRC:
4712       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4713       break;
4714     case CCL_STAT_SUSPEND_BY_DST:
4715       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4716       break;
4717     case CCL_STAT_QUIT:
4718     case CCL_STAT_INVALID_CMD:
4719       coding->result = CODING_FINISH_INTERRUPT;
4720       break;
4721     default:
4722       coding->result = CODING_FINISH_NORMAL;
4723       break;
4724     }
4725   return coding->result;
4726 }
4727
4728 /* Decode EOL format of the text at PTR of BYTES length destructively
4729    according to CODING->eol_type.  This is called after the CCL
4730    program produced a decoded text at PTR.  If we do CRLF->LF
4731    conversion, update CODING->produced and CODING->produced_char.  */
4732
4733 static void
4734 decode_eol_post_ccl (coding, ptr, bytes)
4735      struct coding_system *coding;
4736      unsigned char *ptr;
4737      int bytes;
4738 {
4739   Lisp_Object val, saved_coding_symbol;
4740   unsigned char *pend = ptr + bytes;
4741   int dummy;
4742
4743   /* Remember the current coding system symbol.  We set it back when
4744      an inconsistent EOL is found so that `last-coding-system-used' is
4745      set to the coding system that doesn't specify EOL conversion.  */
4746   saved_coding_symbol = coding->symbol;
4747
4748   coding->spec.ccl.cr_carryover = 0;
4749   if (coding->eol_type == CODING_EOL_UNDECIDED)
4750     {
4751       /* Here, to avoid the call of setup_coding_system, we directly
4752          call detect_eol_type.  */
4753       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4754       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4755         coding->eol_type = CODING_EOL_LF;
4756       if (coding->eol_type != CODING_EOL_UNDECIDED)
4757         {
4758           val = Fget (coding->symbol, Qeol_type);
4759           if (VECTORP (val) && XVECTOR (val)->size == 3)
4760             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4761         }
4762       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4763     }
4764
4765   if (coding->eol_type == CODING_EOL_LF
4766       || coding->eol_type == CODING_EOL_UNDECIDED)
4767     {
4768       /* We have nothing to do.  */
4769       ptr = pend;
4770     }
4771   else if (coding->eol_type == CODING_EOL_CRLF)
4772     {
4773       unsigned char *pstart = ptr, *p = ptr;
4774
4775       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4776           && *(pend - 1) == '\r')
4777         {
4778           /* If the last character is CR, we can't handle it here
4779              because LF will be in the not-yet-decoded source text.
4780              Record that the CR is not yet processed.  */
4781           coding->spec.ccl.cr_carryover = 1;
4782           coding->produced--;
4783           coding->produced_char--;
4784           pend--;
4785         }
4786       while (ptr < pend)
4787         {
4788           if (*ptr == '\r')
4789             {
4790               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4791                 {
4792                   *p++ = '\n';
4793                   ptr += 2;
4794                 }
4795               else
4796                 {
4797                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4798                     goto undo_eol_conversion;
4799                   *p++ = *ptr++;
4800                 }
4801             }
4802           else if (*ptr == '\n'
4803                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4804             goto undo_eol_conversion;
4805           else
4806             *p++ = *ptr++;
4807           continue;
4808
4809         undo_eol_conversion:
4810           /* We have faced with inconsistent EOL format at PTR.
4811              Convert all LFs before PTR back to CRLFs.  */
4812           for (p--, ptr--; p >= pstart; p--)
4813             {
4814               if (*p == '\n')
4815                 *ptr-- = '\n', *ptr-- = '\r';
4816               else
4817                 *ptr-- = *p;
4818             }
4819           /*  If carryover is recorded, cancel it because we don't
4820               convert CRLF anymore.  */
4821           if (coding->spec.ccl.cr_carryover)
4822             {
4823               coding->spec.ccl.cr_carryover = 0;
4824               coding->produced++;
4825               coding->produced_char++;
4826               pend++;
4827             }
4828           p = ptr = pend;
4829           coding->eol_type = CODING_EOL_LF;
4830           coding->symbol = saved_coding_symbol;
4831         }
4832       if (p < pend)
4833         {
4834           /* As each two-byte sequence CRLF was converted to LF, (PEND
4835              - P) is the number of deleted characters.  */
4836           coding->produced -= pend - p;
4837           coding->produced_char -= pend - p;
4838         }
4839     }
4840   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4841     {
4842       unsigned char *p = ptr;
4843
4844       for (; ptr < pend; ptr++)
4845         {
4846           if (*ptr == '\r')
4847             *ptr = '\n';
4848           else if (*ptr == '\n'
4849                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4850             {
4851               for (; p < ptr; p++)
4852                 {
4853                   if (*p == '\n')
4854                     *p = '\r';
4855                 }
4856               ptr = pend;
4857               coding->eol_type = CODING_EOL_LF;
4858               coding->symbol = saved_coding_symbol;
4859             }
4860         }
4861     }
4862 }
4863
4864 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4865    decoding, it may detect coding system and format of end-of-line if
4866    those are not yet decided.  The source should be unibyte, the
4867    result is multibyte if CODING->dst_multibyte is nonzero, else
4868    unibyte.  */
4869
4870 int
4871 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4872      struct coding_system *coding;
4873      const unsigned char *source;
4874      unsigned char *destination;
4875      int src_bytes, dst_bytes;
4876 {
4877   int extra = 0;
4878
4879   if (coding->type == coding_type_undecided)
4880     detect_coding (coding, source, src_bytes);
4881
4882   if (coding->eol_type == CODING_EOL_UNDECIDED
4883       && coding->type != coding_type_ccl)
4884     {
4885       detect_eol (coding, source, src_bytes);
4886       /* We had better recover the original eol format if we
4887          encounter an inconsistent eol format while decoding.  */
4888       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4889     }
4890
4891   coding->produced = coding->produced_char = 0;
4892   coding->consumed = coding->consumed_char = 0;
4893   coding->errors = 0;
4894   coding->result = CODING_FINISH_NORMAL;
4895
4896   switch (coding->type)
4897     {
4898     case coding_type_sjis:
4899       decode_coding_sjis_big5 (coding, source, destination,
4900                                src_bytes, dst_bytes, 1);
4901       break;
4902
4903     case coding_type_iso2022:
4904       decode_coding_iso2022 (coding, source, destination,
4905                              src_bytes, dst_bytes);
4906       break;
4907
4908     case coding_type_big5:
4909       decode_coding_sjis_big5 (coding, source, destination,
4910                                src_bytes, dst_bytes, 0);
4911       break;
4912
4913     case coding_type_emacs_mule:
4914       decode_coding_emacs_mule (coding, source, destination,
4915                                 src_bytes, dst_bytes);
4916       break;
4917
4918     case coding_type_ccl:
4919       if (coding->spec.ccl.cr_carryover)
4920         {
4921           /* Put the CR which was not processed by the previous call
4922              of decode_eol_post_ccl in DESTINATION.  It will be
4923              decoded together with the following LF by the call to
4924              decode_eol_post_ccl below.  */
4925           *destination = '\r';
4926           coding->produced++;
4927           coding->produced_char++;
4928           dst_bytes--;
4929           extra = coding->spec.ccl.cr_carryover;
4930         }
4931       ccl_coding_driver (coding, source, destination + extra,
4932                          src_bytes, dst_bytes, 0);
4933       if (coding->eol_type != CODING_EOL_LF)
4934         {
4935           coding->produced += extra;
4936           coding->produced_char += extra;
4937           decode_eol_post_ccl (coding, destination, coding->produced);
4938         }
4939       break;
4940
4941     default:
4942       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4943     }
4944
4945   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4946       && coding->mode & CODING_MODE_LAST_BLOCK
4947       && coding->consumed == src_bytes)
4948     coding->result = CODING_FINISH_NORMAL;
4949
4950   if (coding->mode & CODING_MODE_LAST_BLOCK
4951       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4952     {
4953       const unsigned char *src = source + coding->consumed;
4954       unsigned char *dst = destination + coding->produced;
4955
4956       src_bytes -= coding->consumed;
4957       coding->errors++;
4958       if (COMPOSING_P (coding))
4959         DECODE_COMPOSITION_END ('1');
4960       while (src_bytes--)
4961         {
4962           int c = *src++;
4963           dst += CHAR_STRING (c, dst);
4964           coding->produced_char++;
4965         }
4966       coding->consumed = coding->consumed_char = src - source;
4967       coding->produced = dst - destination;
4968       coding->result = CODING_FINISH_NORMAL;
4969     }
4970
4971   if (!coding->dst_multibyte)
4972     {
4973       coding->produced = str_as_unibyte (destination, coding->produced);
4974       coding->produced_char = coding->produced;
4975     }
4976
4977   return coding->result;
4978 }
4979
4980 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4981    multibyteness of the source is CODING->src_multibyte, the
4982    multibyteness of the result is always unibyte.  */
4983
4984 int
4985 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4986      struct coding_system *coding;
4987      const unsigned char *source;
4988      unsigned char *destination;
4989      int src_bytes, dst_bytes;
4990 {
4991   coding->produced = coding->produced_char = 0;
4992   coding->consumed = coding->consumed_char = 0;
4993   coding->errors = 0;
4994   coding->result = CODING_FINISH_NORMAL;
4995
4996   switch (coding->type)
4997     {
4998     case coding_type_sjis:
4999       encode_coding_sjis_big5 (coding, source, destination,
5000                                src_bytes, dst_bytes, 1);
5001       break;
5002
5003     case coding_type_iso2022:
5004       encode_coding_iso2022 (coding, source, destination,
5005                              src_bytes, dst_bytes);
5006       break;
5007
5008     case coding_type_big5:
5009       encode_coding_sjis_big5 (coding, source, destination,
5010                                src_bytes, dst_bytes, 0);
5011       break;
5012
5013     case coding_type_emacs_mule:
5014       encode_coding_emacs_mule (coding, source, destination,
5015                                 src_bytes, dst_bytes);
5016       break;
5017
5018     case coding_type_ccl:
5019       ccl_coding_driver (coding, source, destination,
5020                          src_bytes, dst_bytes, 1);
5021       break;
5022
5023     default:
5024       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5025     }
5026
5027   if (coding->mode & CODING_MODE_LAST_BLOCK
5028       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5029     {
5030       const unsigned char *src = source + coding->consumed;
5031       unsigned char *dst = destination + coding->produced;
5032
5033       if (coding->type == coding_type_iso2022)
5034         ENCODE_RESET_PLANE_AND_REGISTER;
5035       if (COMPOSING_P (coding))
5036         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5037       if (coding->consumed < src_bytes)
5038         {
5039           int len = src_bytes - coding->consumed;
5040
5041           BCOPY_SHORT (src, dst, len);
5042           if (coding->src_multibyte)
5043             len = str_as_unibyte (dst, len);
5044           dst += len;
5045           coding->consumed = src_bytes;
5046         }
5047       coding->produced = coding->produced_char = dst - destination;
5048       coding->result = CODING_FINISH_NORMAL;
5049     }
5050
5051   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5052       && coding->consumed == src_bytes)
5053     coding->result = CODING_FINISH_NORMAL;
5054
5055   return coding->result;
5056 }
5057
5058 /* Scan text in the region between *BEG and *END (byte positions),
5059    skip characters which we don't have to decode by coding system
5060    CODING at the head and tail, then set *BEG and *END to the region
5061    of the text we actually have to convert.  The caller should move
5062    the gap out of the region in advance if the region is from a
5063    buffer.
5064
5065    If STR is not NULL, *BEG and *END are indices into STR.  */
5066
5067 static void
5068 shrink_decoding_region (beg, end, coding, str)
5069      int *beg, *end;
5070      struct coding_system *coding;
5071      unsigned char *str;
5072 {
5073   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5074   int eol_conversion;
5075   Lisp_Object translation_table;
5076
5077   if (coding->type == coding_type_ccl
5078       || coding->type == coding_type_undecided
5079       || coding->eol_type != CODING_EOL_LF
5080       || !NILP (coding->post_read_conversion)
5081       || coding->composing != COMPOSITION_DISABLED)
5082     {
5083       /* We can't skip any data.  */
5084       return;
5085     }
5086   if (coding->type == coding_type_no_conversion
5087       || coding->type == coding_type_raw_text
5088       || coding->type == coding_type_emacs_mule)
5089     {
5090       /* We need no conversion, but don't have to skip any data here.
5091          Decoding routine handles them effectively anyway.  */
5092       return;
5093     }
5094
5095   translation_table = coding->translation_table_for_decode;
5096   if (NILP (translation_table) && !NILP (Venable_character_translation))
5097     translation_table = Vstandard_translation_table_for_decode;
5098   if (CHAR_TABLE_P (translation_table))
5099     {
5100       int i;
5101       for (i = 0; i < 128; i++)
5102         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5103           break;
5104       if (i < 128)
5105         /* Some ASCII character should be translated.  We give up
5106            shrinking.  */
5107         return;
5108     }
5109
5110   if (coding->heading_ascii >= 0)
5111     /* Detection routine has already found how much we can skip at the
5112        head.  */
5113     *beg += coding->heading_ascii;
5114
5115   if (str)
5116     {
5117       begp_orig = begp = str + *beg;
5118       endp_orig = endp = str + *end;
5119     }
5120   else
5121     {
5122       begp_orig = begp = BYTE_POS_ADDR (*beg);
5123       endp_orig = endp = begp + *end - *beg;
5124     }
5125
5126   eol_conversion = (coding->eol_type == CODING_EOL_CR
5127                     || coding->eol_type == CODING_EOL_CRLF);
5128
5129   switch (coding->type)
5130     {
5131     case coding_type_sjis:
5132     case coding_type_big5:
5133       /* We can skip all ASCII characters at the head.  */
5134       if (coding->heading_ascii < 0)
5135         {
5136           if (eol_conversion)
5137             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5138           else
5139             while (begp < endp && *begp < 0x80) begp++;
5140         }
5141       /* We can skip all ASCII characters at the tail except for the
5142          second byte of SJIS or BIG5 code.  */
5143       if (eol_conversion)
5144         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5145       else
5146         while (begp < endp && endp[-1] < 0x80) endp--;
5147       /* Do not consider LF as ascii if preceded by CR, since that
5148          confuses eol decoding. */
5149       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5150         endp++;
5151       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5152         endp++;
5153       break;
5154
5155     case coding_type_iso2022:
5156       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5157         /* We can't skip any data.  */
5158         break;
5159       if (coding->heading_ascii < 0)
5160         {
5161           /* We can skip all ASCII characters at the head except for a
5162              few control codes.  */
5163           while (begp < endp && (c = *begp) < 0x80
5164                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5165                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5166                  && (!eol_conversion || c != ISO_CODE_LF))
5167             begp++;
5168         }
5169       switch (coding->category_idx)
5170         {
5171         case CODING_CATEGORY_IDX_ISO_8_1:
5172         case CODING_CATEGORY_IDX_ISO_8_2:
5173           /* We can skip all ASCII characters at the tail.  */
5174           if (eol_conversion)
5175             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5176           else
5177             while (begp < endp && endp[-1] < 0x80) endp--;
5178           /* Do not consider LF as ascii if preceded by CR, since that
5179              confuses eol decoding. */
5180           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5181             endp++;
5182           break;
5183
5184         case CODING_CATEGORY_IDX_ISO_7:
5185         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5186           {
5187             /* We can skip all characters at the tail except for 8-bit
5188                codes and ESC and the following 2-byte at the tail.  */
5189             unsigned char *eight_bit = NULL;
5190
5191             if (eol_conversion)
5192               while (begp < endp
5193                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5194                 {
5195                   if (!eight_bit && c & 0x80) eight_bit = endp;
5196                   endp--;
5197                 }
5198             else
5199               while (begp < endp
5200                      && (c = endp[-1]) != ISO_CODE_ESC)
5201                 {
5202                   if (!eight_bit && c & 0x80) eight_bit = endp;
5203                   endp--;
5204                 }
5205             /* Do not consider LF as ascii if preceded by CR, since that
5206                confuses eol decoding. */
5207             if (begp < endp && endp < endp_orig
5208                 && endp[-1] == '\r' && endp[0] == '\n')
5209               endp++;
5210             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5211               {
5212                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5213                   /* This is an ASCII designation sequence.  We can
5214                      surely skip the tail.  But, if we have
5215                      encountered an 8-bit code, skip only the codes
5216                      after that.  */
5217                   endp = eight_bit ? eight_bit : endp + 2;
5218                 else
5219                   /* Hmmm, we can't skip the tail.  */
5220                   endp = endp_orig;
5221               }
5222             else if (eight_bit)
5223               endp = eight_bit;
5224           }
5225         }
5226       break;
5227
5228     default:
5229       abort ();
5230     }
5231   *beg += begp - begp_orig;
5232   *end += endp - endp_orig;
5233   return;
5234 }
5235
5236 /* Like shrink_decoding_region but for encoding.  */
5237
5238 static void
5239 shrink_encoding_region (beg, end, coding, str)
5240      int *beg, *end;
5241      struct coding_system *coding;
5242      unsigned char *str;
5243 {
5244   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5245   int eol_conversion;
5246   Lisp_Object translation_table;
5247
5248   if (coding->type == coding_type_ccl
5249       || coding->eol_type == CODING_EOL_CRLF
5250       || coding->eol_type == CODING_EOL_CR
5251       || (coding->cmp_data && coding->cmp_data->used > 0))
5252     {
5253       /* We can't skip any data.  */
5254       return;
5255     }
5256   if (coding->type == coding_type_no_conversion
5257       || coding->type == coding_type_raw_text
5258       || coding->type == coding_type_emacs_mule
5259       || coding->type == coding_type_undecided)
5260     {
5261       /* We need no conversion, but don't have to skip any data here.
5262          Encoding routine handles them effectively anyway.  */
5263       return;
5264     }
5265
5266   translation_table = coding->translation_table_for_encode;
5267   if (NILP (translation_table) && !NILP (Venable_character_translation))
5268     translation_table = Vstandard_translation_table_for_encode;
5269   if (CHAR_TABLE_P (translation_table))
5270     {
5271       int i;
5272       for (i = 0; i < 128; i++)
5273         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5274           break;
5275       if (i < 128)
5276         /* Some ASCII character should be translated.  We give up
5277            shrinking.  */
5278         return;
5279     }
5280
5281   if (str)
5282     {
5283       begp_orig = begp = str + *beg;
5284       endp_orig = endp = str + *end;
5285     }
5286   else
5287     {
5288       begp_orig = begp = BYTE_POS_ADDR (*beg);
5289       endp_orig = endp = begp + *end - *beg;
5290     }
5291
5292   eol_conversion = (coding->eol_type == CODING_EOL_CR
5293                     || coding->eol_type == CODING_EOL_CRLF);
5294
5295   /* Here, we don't have to check coding->pre_write_conversion because
5296      the caller is expected to have handled it already.  */
5297   switch (coding->type)
5298     {
5299     case coding_type_iso2022:
5300       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5301         /* We can't skip any data.  */
5302         break;
5303       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5304         {
5305           unsigned char *bol = begp;
5306           while (begp < endp && *begp < 0x80)
5307             {
5308               begp++;
5309               if (begp[-1] == '\n')
5310                 bol = begp;
5311             }
5312           begp = bol;
5313           goto label_skip_tail;
5314         }
5315       /* fall down ... */
5316
5317     case coding_type_sjis:
5318     case coding_type_big5:
5319       /* We can skip all ASCII characters at the head and tail.  */
5320       if (eol_conversion)
5321         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5322       else
5323         while (begp < endp && *begp < 0x80) begp++;
5324     label_skip_tail:
5325       if (eol_conversion)
5326         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5327       else
5328         while (begp < endp && *(endp - 1) < 0x80) endp--;
5329       break;
5330
5331     default:
5332       abort ();
5333     }
5334
5335   *beg += begp - begp_orig;
5336   *end += endp - endp_orig;
5337   return;
5338 }
5339
/* Shrinking the conversion region has a cost of its own, so it is not
   attempted unless the region is longer than this value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END of STR for CODING when it is longer
   than shrink_conversion_region_threshhold, using
   shrink_encoding_region if ENCODEP is nonzero and
   shrink_decoding_region otherwise.  Shorter regions are left
   untouched.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (*(end) - *(beg) <= shrink_conversion_region_threshhold)       \
        break;                                                          \
      if (encodep)                                                      \
        shrink_encoding_region (beg, end, coding, str);                 \
      else                                                              \
        shrink_decoding_region (beg, end, coding, str);                 \
    }                                                                   \
  while (0)
5353
5354 static Lisp_Object
5355 code_convert_region_unwind (arg)
5356      Lisp_Object arg;
5357 {
5358   inhibit_pre_post_conversion = 0;
5359   Vlast_coding_system_used = arg;
5360   return Qnil;
5361 }
5362
5363 /* Store information about all compositions in the range FROM and TO
5364    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5365    buffer or a string, defaults to the current buffer.  */
5366
5367 void
5368 coding_save_composition (coding, from, to, obj)
5369      struct coding_system *coding;
5370      int from, to;
5371      Lisp_Object obj;
5372 {
5373   Lisp_Object prop;
5374   int start, end;
5375
5376   if (coding->composing == COMPOSITION_DISABLED)
5377     return;
5378   if (!coding->cmp_data)
5379     coding_allocate_composition_data (coding, from);
5380   if (!find_composition (from, to, &start, &end, &prop, obj)
5381       || end > to)
5382     return;
5383   if (start < from
5384       && (!find_composition (end, to, &start, &end, &prop, obj)
5385           || end > to))
5386     return;
5387   coding->composing = COMPOSITION_NO;
5388   do
5389     {
5390       if (COMPOSITION_VALID_P (start, end, prop))
5391         {
5392           enum composition_method method = COMPOSITION_METHOD (prop);
5393           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5394               >= COMPOSITION_DATA_SIZE)
5395             coding_allocate_composition_data (coding, from);
5396           /* For relative composition, we remember start and end
5397              positions, for the other compositions, we also remember
5398              components.  */
5399           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5400           if (method != COMPOSITION_RELATIVE)
5401             {
5402               /* We must store a*/
5403               Lisp_Object val, ch;
5404
5405               val = COMPOSITION_COMPONENTS (prop);
5406               if (CONSP (val))
5407                 while (CONSP (val))
5408                   {
5409                     ch = XCAR (val), val = XCDR (val);
5410                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5411                   }
5412               else if (VECTORP (val) || STRINGP (val))
5413                 {
5414                   int len = (VECTORP (val)
5415                              ? XVECTOR (val)->size : SCHARS (val));
5416                   int i;
5417                   for (i = 0; i < len; i++)
5418                     {
5419                       ch = (STRINGP (val)
5420                             ? Faref (val, make_number (i))
5421                             : XVECTOR (val)->contents[i]);
5422                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5423                     }
5424                 }
5425               else              /* INTEGERP (val) */
5426                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5427             }
5428           CODING_ADD_COMPOSITION_END (coding, end - from);
5429         }
5430       start = end;
5431     }
5432   while (start < to
5433          && find_composition (start, to, &start, &end, &prop, obj)
5434          && end <= to);
5435
5436   /* Make coding->cmp_data point to the first memory block.  */
5437   while (coding->cmp_data->prev)
5438     coding->cmp_data = coding->cmp_data->prev;
5439   coding->cmp_data_start = 0;
5440 }
5441
5442 /* Reflect the saved information about compositions to OBJ.
5443    CODING->cmp_data points to a memory block for the information.  OBJ
5444    is a buffer or a string, defaults to the current buffer.  */
5445
5446 void
5447 coding_restore_composition (coding, obj)
5448      struct coding_system *coding;
5449      Lisp_Object obj;
5450 {
5451   struct composition_data *cmp_data = coding->cmp_data;
5452
5453   if (!cmp_data)
5454     return;
5455
5456   while (cmp_data->prev)
5457     cmp_data = cmp_data->prev;
5458
5459   while (cmp_data)
5460     {
5461       int i;
5462
5463       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5464            i += cmp_data->data[i])
5465         {
5466           int *data = cmp_data->data + i;
5467           enum composition_method method = (enum composition_method) data[3];
5468           Lisp_Object components;
5469
5470           if (data[0] < 0 || i + data[0] > cmp_data->used)
5471             /* Invalid composition data.  */
5472             break;
5473
5474           if (method == COMPOSITION_RELATIVE)
5475             components = Qnil;
5476           else
5477             {
5478               int len = data[0] - 4, j;
5479               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5480
5481               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5482                   && len % 2 == 0)
5483                 len --;
5484               if (len < 1)
5485                 /* Invalid composition data.  */
5486                 break;
5487               for (j = 0; j < len; j++)
5488                 args[j] = make_number (data[4 + j]);
5489               components = (method == COMPOSITION_WITH_ALTCHARS
5490                             ? Fstring (len, args)
5491                             : Fvector (len, args));
5492             }
5493           compose_text (data[1], data[2], components, Qnil, obj);
5494         }
5495       cmp_data = cmp_data->next;
5496     }
5497 }
5498
5499 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5500    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5501    coding system CODING, and return the status code of code conversion
5502    (currently, this value has no meaning; every return path yields 0).
5503
5504    How many characters (and bytes) are converted to how many
5505    characters (and bytes) are recorded in members of the structure
5506    CODING: `produced' and `produced_char' on every return path, and
        also `consumed' and `consumed_char' when the function runs to
        its end.
5507
5508    If REPLACE is nonzero, we do various things as if the original text
5509    is deleted and a new text is inserted.  See the comments in
5510    replace_range (insdel.c) to know what we are doing.
5511
5512    If REPLACE is zero, it is assumed that the source text is unibyte.
5513    Otherwise, it is assumed that the source text is multibyte.  */
5514
5515 int
5516 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5517      int from, from_byte, to, to_byte, encodep, replace;
5518      struct coding_system *coding;
5519 {
5520   int len = to - from, len_byte = to_byte - from_byte;
5521   int nchars_del = 0, nbytes_del = 0;
5522   int require, inserted, inserted_byte;
       /* HEAD_SKIP and TAIL_SKIP are assigned only when the region is
          shrunk below; they are read only when TOTAL_SKIP > 0.  */
5523   int head_skip, tail_skip, total_skip = 0;
5524   Lisp_Object saved_coding_symbol;
5525   int first = 1;
5526   unsigned char *src, *dst;
5527   Lisp_Object deletion;
5528   int orig_point = PT, orig_len = len;
5529   int prev_Z;
5530   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5531
5532   deletion = Qnil;
5533   saved_coding_symbol = coding->symbol;
5534
       /* If point is strictly inside the region, move it to FROM and
          remember FROM as the position to restore at the end.  */
5535   if (from < PT && PT < to)
5536     {
5537       TEMP_SET_PT_BOTH (from, from_byte);
5538       orig_point = from;
5539     }
5540
5541   if (replace)
5542     {
5543       int saved_from = from;
5544       int saved_inhibit_modification_hooks;
5545
           /* FROM is passed by address and may come back changed; if
              so, recompute TO and the byte positions from it.  */
5546       prepare_to_modify_buffer (from, to, &from);
5547       if (saved_from != from)
5548         {
5549           to = from + len;
5550           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5551           len_byte = to_byte - from_byte;
5552         }
5553
5554       /* The code conversion routine can not preserve text properties
5555          for now.  So, we must remove all text properties in the
5556          region.  Here, we must suppress all modification hooks.  */
5557       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5558       inhibit_modification_hooks = 1;
5559       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5560       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5561     }
5562
5563   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5564     {
5565       /* We must detect encoding of text and eol format.  */
5566
5567       if (from < GPT && to > GPT)
5568         move_gap_both (from, from_byte);
5569       if (coding->type == coding_type_undecided)
5570         {
5571           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5572           if (coding->type == coding_type_undecided)
5573             {
5574               /* It seems that the text contains only ASCII, but we
5575                  should not leave it undecided because the deeper
5576                  decoding routine (decode_coding) tries to detect the
5577                  encodings again in vain.  */
5578               coding->type = coding_type_emacs_mule;
5579               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5580               /* As emacs-mule decoder will handle composition, we
5581                  need this setting to allocate coding->cmp_data
5582                  later.  */
5583               coding->composing = COMPOSITION_NO;
5584             }
5585         }
5586       if (coding->eol_type == CODING_EOL_UNDECIDED
5587           && coding->type != coding_type_ccl)
5588         {
5589           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5590           if (coding->eol_type == CODING_EOL_UNDECIDED)
5591             coding->eol_type = CODING_EOL_LF;
5592           /* We had better recover the original eol format if we
5593              encounter an inconsistent eol format while decoding.  */
5594           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5595         }
5596     }
5597
5598   /* Now we convert the text.  */
5599
5600   /* For encoding, we must process pre-write-conversion in advance.  */
5601   if (! inhibit_pre_post_conversion
5602       && encodep
5603       && SYMBOLP (coding->pre_write_conversion)
5604       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5605     {
5606       /* The function in pre-write-conversion may put a new text in a
5607          new buffer.  */
5608       struct buffer *prev = current_buffer;
5609       Lisp_Object new;
5610
5611       record_unwind_protect (code_convert_region_unwind,
5612                              Vlast_coding_system_used);
5613       /* We should not call any more pre-write/post-read-conversion
5614          functions while this pre-write-conversion is running.  */
5615       inhibit_pre_post_conversion = 1;
5616       call2 (coding->pre_write_conversion,
5617              make_number (from), make_number (to));
5618       inhibit_pre_post_conversion = 0;
5619       /* Discard the unwind protect.  */
5620       specpdl_ptr--;
5621
5622       if (current_buffer != prev)
5623         {
               /* The function left us in another buffer.  Replace the
                  region FROM..TO of the original buffer with that
                  buffer's text from position 1 up to its accessible
                  length, kill that buffer, and recompute the region
                  bounds and the position of point to restore.  */
5624           len = ZV - BEGV;
5625           new = Fcurrent_buffer ();
5626           set_buffer_internal_1 (prev);
5627           del_range_2 (from, from_byte, to, to_byte, 0);
5628           TEMP_SET_PT_BOTH (from, from_byte);
5629           insert_from_buffer (XBUFFER (new), 1, len, 0);
5630           Fkill_buffer (new);
5631           if (orig_point >= to)
5632             orig_point += len - orig_len;
5633           else if (orig_point > from)
5634             orig_point = from;
5635           orig_len = len;
5636           to = from + len;
5637           from_byte = CHAR_TO_BYTE (from);
5638           to_byte = CHAR_TO_BYTE (to);
5639           len_byte = to_byte - from_byte;
5640           TEMP_SET_PT_BOTH (from, from_byte);
5641         }
5642     }
5643
5644   if (replace)
5645     {
           /* Unless undo recording is off (undo_list is t), keep a copy
              of the original text in DELETION; otherwise only its sizes
              are kept.  These are handed to adjust_after_replace or
              adjust_after_replace_noundo after the conversion.  */
5646       if (! EQ (current_buffer->undo_list, Qt))
5647         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5648       else
5649         {
5650           nchars_del = to - from;
5651           nbytes_del = to_byte - from_byte;
5652         }
5653     }
5654
5655   if (coding->composing != COMPOSITION_DISABLED)
5656     {
5657       if (encodep)
5658         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5659       else
5660         coding_allocate_composition_data (coding, from);
5661     }
5662
5663   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5664      if we must run CCL program or there are compositions to
5665      encode.  */
5666   if (coding->type != coding_type_ccl
5667       && (! coding->cmp_data || coding->cmp_data->used == 0))
5668     {
5669       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5670
5671       if (from < GPT && GPT < to)
5672         move_gap_both (from, from_byte);
5673       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
           /* The whole region was skipped and there is nothing left to
              run or flush: report the text as produced unchanged and
              return without converting.  */
5674       if (from_byte == to_byte
5675           && (encodep || NILP (coding->post_read_conversion))
5676           && ! CODING_REQUIRE_FLUSHING (coding))
5677         {
5678           coding->produced = len_byte;
5679           coding->produced_char = len;
5680           if (!replace)
5681             /* We must record and adjust for this new text now.  */
5682             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5683           coding_free_composition_data (coding);
5684           return 0;
5685         }
5686
           /* The skip counts are byte counts, but they are applied to
              the character positions FROM and TO as well, i.e. each
              skipped byte is counted as one character.  */
5687       head_skip = from_byte - from_byte_orig;
5688       tail_skip = to_byte_orig - to_byte;
5689       total_skip = head_skip + tail_skip;
5690       from += head_skip;
5691       to -= tail_skip;
5692       len -= total_skip; len_byte -= total_skip;
5693     }
5694
5695   /* For conversion, we must put the gap before the text in addition to
5696      making the gap larger for efficient decoding.  The required gap
5697      size starts from 2000 which is the magic number used in make_gap.
5698      But, after one batch of conversion, it will be incremented if we
5699      find that it is not enough.  */
5700   require = 2000;
5701
5702   if (GAP_SIZE  < require)
5703     make_gap (require - GAP_SIZE);
5704   move_gap_both (from, from_byte);
5705
5706   inserted = inserted_byte = 0;
5707
       /* Count the LEN_BYTE source bytes as part of the gap and take
          them out of the buffer sizes.  The conversion loop reads them
          from GAP_END_ADDR - len_byte and writes the result at
          GPT_ADDR.  */
5708   GAP_SIZE += len_byte;
5709   ZV -= len;
5710   Z -= len;
5711   ZV_BYTE -= len_byte;
5712   Z_BYTE -= len_byte;
5713
5714   if (GPT - BEG < BEG_UNCHANGED)
5715     BEG_UNCHANGED = GPT - BEG;
5716   if (Z - GPT < END_UNCHANGED)
5717     END_UNCHANGED = Z - GPT;
5718
5719   if (!encodep && coding->src_multibyte)
5720     {
5721       /* Decoding routines expects that the source text is unibyte.
5722          We must convert 8-bit characters of multibyte form to
5723          unibyte.  */
5724       int len_byte_orig = len_byte;
5725       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, move it so that it again ends at
              GAP_END_ADDR.  */
5726       if (len_byte < len_byte_orig)
5727         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5728                     len_byte);
5729       coding->src_multibyte = 0;
5730     }
5731
5732   for (;;)
5733     {
5734       int result;
5735
5736       /* The buffer memory is now:
5737          +--------+converted-text+---------+-------original-text-------+---+
5738          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5739                   |<---------------------- GAP ----------------------->|  */
5740       src = GAP_END_ADDR - len_byte;
5741       dst = GPT_ADDR + inserted_byte;
5742
5743       if (encodep)
5744         result = encode_coding (coding, src, dst, len_byte, 0);
5745       else
5746         {
5747           if (coding->composing != COMPOSITION_DISABLED)
5748             coding->cmp_data->char_offset = from + inserted;
5749           result = decode_coding (coding, src, dst, len_byte, 0);
5750         }
5751
5752       /* The buffer memory is now:
5753          +--------+-------converted-text----+--+------original-text----+---+
5754          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5755                   |<---------------------- GAP ----------------------->|  */
5756
5757       inserted += coding->produced_char;
5758       inserted_byte += coding->produced;
5759       len_byte -= coding->consumed;
5760
5761       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5762         {
               /* Allocate more composition data and convert the rest.  */
5763           coding_allocate_composition_data (coding, from + inserted);
5764           continue;
5765         }
5766
5767       src += coding->consumed;
5768       dst += coding->produced;
5769
5770       if (result == CODING_FINISH_NORMAL)
5771         {
5772           src += len_byte;
5773           break;
5774         }
5775       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5776         {
5777           unsigned char *pend = dst, *p = pend - inserted_byte;
5778           Lisp_Object eol_type;
5779
5780           /* Encode LFs back to the original eol format (CR or CRLF).  */
5781           if (coding->eol_type == CODING_EOL_CR)
5782             {
5783               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5784             }
5785           else
5786             {
5787               int count = 0;
5788
                   /* COUNT is the number of LFs in the converted text,
                      i.e. the number of extra bytes CRLF needs.  */
5789               while (p < pend) if (*p++ == '\n') count++;
5790               if (src - dst < count)
5791                 {
5792                   /* We don't have sufficient room for encoding LFs
5793                      back to CRLF.  We must record converted and
5794                      not-yet-converted text back to the buffer
5795                      content, enlarge the gap, then record them out of
5796                      the buffer contents again.  */
5797                   int add = len_byte + inserted_byte;
5798
5799                   GAP_SIZE -= add;
5800                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5801                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5802                   make_gap (count - GAP_SIZE);
5803                   GAP_SIZE += add;
5804                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5805                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5806                   /* Don't forget to update SRC, DST, and PEND.  */
5807                   src = GAP_END_ADDR - len_byte;
5808                   dst = GPT_ADDR + inserted_byte;
5809                   pend = dst;
5810                 }
5811               inserted += count;
5812               inserted_byte += count;
5813               coding->produced += count;
                   /* Copy the converted text backward to its final
                      place, putting a CR before each LF, until all
                      COUNT LFs have been handled.  */
5814               p = dst = pend + count;
5815               while (count)
5816                 {
5817                   *--p = *--pend;
5818                   if (*p == '\n') count--, *--p = '\r';
5819                 }
5820             }
5821
5822           /* Suppress eol-format conversion in the further conversion.  */
5823           coding->eol_type = CODING_EOL_LF;
5824
5825           /* Set the coding system symbol to that for Unix-like EOL.  */
5826           eol_type = Fget (saved_coding_symbol, Qeol_type);
5827           if (VECTORP (eol_type)
5828               && XVECTOR (eol_type)->size == 3
5829               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5830             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5831           else
5832             coding->symbol = saved_coding_symbol;
5833
5834           continue;
5835         }
5836       if (len_byte <= 0)
5837         {
               /* All source bytes are consumed.  A CCL coding system
                  gets one more pass with CODING_MODE_LAST_BLOCK set
                  before we stop.  */
5838           if (coding->type != coding_type_ccl
5839               || coding->mode & CODING_MODE_LAST_BLOCK)
5840             break;
5841           coding->mode |= CODING_MODE_LAST_BLOCK;
5842           continue;
5843         }
5844       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5845         {
5846           /* The source text ends in invalid codes.  Let's just
5847              make them valid buffer contents, and finish conversion.  */
5848           if (multibyte_p)
5849             {
5850               unsigned char *start = dst;
5851
5852               inserted += len_byte;
5853               while (len_byte--)
5854                 {
5855                   int c = *src++;
5856                   dst += CHAR_STRING (c, dst);
5857                 }
5858
5859               inserted_byte += dst - start;
5860             }
5861           else
5862             {
5863               inserted += len_byte;
5864               inserted_byte += len_byte;
5865               while (len_byte--)
5866                 *dst++ = *src++;
5867             }
5868           break;
5869         }
5870       if (result == CODING_FINISH_INTERRUPT)
5871         {
5872           /* The conversion procedure was interrupted by a user.  */
5873           break;
5874         }
5875       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5876       if (coding->consumed < 1)
5877         {
5878           /* It's quite strange to require more memory without
5879              consuming any bytes.  Perhaps CCL program bug.  */
5880           break;
5881         }
5882       if (first)
5883         {
5884           /* We have just done the first batch of conversion which was
5885              stopped because of insufficient gap.  Let's reconsider the
5886              required gap size (i.e. SRC - DST) now.
5887
5888              We have converted ORIG bytes (== coding->consumed) into
5889              NEW bytes (coding->produced).  To convert the remaining
5890              LEN bytes, we may need REQUIRE bytes of gap, where:
5891                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5892                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5893              Here, we are sure that NEW >= ORIG.  */
5894
5895           if (coding->produced <= coding->consumed)
5896             {
5897               /* This happens because of CCL-based coding system with
5898                  eol-type CRLF.  */
5899               require = 0;
5900             }
5901           else
5902             {
5903               float ratio = coding->produced - coding->consumed;
5904               ratio /= coding->consumed;
5905               require = len_byte * ratio;
5906             }
5907           first = 0;
5908         }
5909       if ((src - dst) < (require + 2000))
5910         {
5911           /* See the comment above the previous call of make_gap.  */
5912           int add = len_byte + inserted_byte;
5913
5914           GAP_SIZE -= add;
5915           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5916           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5917           make_gap (require + 2000);
5918           GAP_SIZE += add;
5919           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5920           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5921         }
5922     }
5923   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5924
5925   if (encodep && coding->dst_multibyte)
5926     {
5927       /* The output is unibyte.  We must convert 8-bit characters to
5928          multibyte form.  */
5929       if (inserted_byte * 2 > GAP_SIZE)
5930         {
               /* Same put-back / make_gap / take-out sequence as in the
                  conversion loop, here for the converted text only.  */
5931           GAP_SIZE -= inserted_byte;
5932           ZV += inserted_byte; Z += inserted_byte;
5933           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5934           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5935           make_gap (inserted_byte - GAP_SIZE);
5936           GAP_SIZE += inserted_byte;
5937           ZV -= inserted_byte; Z -= inserted_byte;
5938           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5939           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5940         }
5941       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5942     }
5943
5944   /* If we shrank the conversion area, adjust it now.  */
5945   if (total_skip > 0)
5946     {
           /* Copy the skipped tail so that it directly follows the
              converted text, then extend the counts and positions to
              cover the skipped head and tail again.  */
5947       if (tail_skip > 0)
5948         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5949       inserted += total_skip; inserted_byte += total_skip;
5950       GAP_SIZE += total_skip;
5951       GPT -= head_skip; GPT_BYTE -= head_skip;
5952       ZV -= total_skip; ZV_BYTE -= total_skip;
5953       Z -= total_skip; Z_BYTE -= total_skip;
5954       from -= head_skip; from_byte -= head_skip;
5955       to += tail_skip; to_byte += tail_skip;
5956     }
5957
5958   prev_Z = Z;
5959   if (! EQ (current_buffer->undo_list, Qt))
5960     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5961   else
5962     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5963                                  inserted, inserted_byte);
       /* Take the number of inserted characters from the actual change
          of Z across the adjustment.  */
5964   inserted = Z - prev_Z;
5965
5966   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5967     coding_restore_composition (coding, Fcurrent_buffer ());
5968   coding_free_composition_data (coding);
5969
5970   if (! inhibit_pre_post_conversion
5971       && ! encodep && ! NILP (coding->post_read_conversion))
5972     {
5973       Lisp_Object val;
5974       Lisp_Object saved_coding_system;
5975
5976       if (from != PT)
5977         TEMP_SET_PT_BOTH (from, from_byte);
5978       prev_Z = Z;
5979       record_unwind_protect (code_convert_region_unwind,
5980                              Vlast_coding_system_used);
           /* Vlast_coding_system_used is set to CODING's symbol for the
              duration of the call; whatever it holds afterwards becomes
              CODING's symbol, and the previous value is restored.  */
5981       saved_coding_system = Vlast_coding_system_used;
5982       Vlast_coding_system_used = coding->symbol;
5983       /* We should not call any more pre-write/post-read-conversion
5984          functions while this post-read-conversion is running.  */
5985       inhibit_pre_post_conversion = 1;
5986       val = call1 (coding->post_read_conversion, make_number (inserted));
5987       inhibit_pre_post_conversion = 0;
5988       coding->symbol = Vlast_coding_system_used;
5989       Vlast_coding_system_used = saved_coding_system;
5990       /* Discard the unwind protect.  */
5991       specpdl_ptr--;
           /* VAL is only checked to be a number; the new length is
              derived from the change of Z instead.  */
5992       CHECK_NUMBER (val);
5993       inserted += Z - prev_Z;
5994     }
5995
       /* Restore point: shift it by the change in length if it was at
          or after the end of the original region, otherwise put it at
          FROM.  A point before FROM is left alone.  */
5996   if (orig_point >= from)
5997     {
5998       if (orig_point >= from + orig_len)
5999         orig_point += inserted - orig_len;
6000       else
6001         orig_point = from;
6002       TEMP_SET_PT (orig_point);
6003     }
6004
6005   if (replace)
6006     {
6007       signal_after_change (from, to - from, inserted);
6008       update_compositions (from, from + inserted, CHECK_BORDER);
6009     }
6010
       /* Report the totals for the whole region through CODING.  */
6011   {
6012     coding->consumed = to_byte - from_byte;
6013     coding->consumed_char = to - from;
6014     coding->produced = inserted_byte;
6015     coding->produced_char = inserted;
6016   }
6017
6018   return 0;
6019 }
6020
6021 /* Name (or base name) of work buffer for code conversion.
        set_conversion_work_buffer hands this value to
        Fget_buffer_create to obtain that buffer.  */
6022 static Lisp_Object Vcode_conversion_workbuf_name;
6023
6024 /* Set the current buffer to the working buffer prepared for
6025    code-conversion.  MULTIBYTE specifies the multibyteness of the
6026    buffer.

        The buffer is obtained by passing Vcode_conversion_workbuf_name
        to Fget_buffer_create.  Its overlays are deleted, it is widened
        if it is narrowed, and all of its text is deleted before the
        multibyteness is set.  Its directory is copied from the buffer
        that is current on entry.

        Return the work buffer.  It is left as the current buffer; the
        buffer that was current on entry is not restored here.  */
6027
6028 static struct buffer *
6029 set_conversion_work_buffer (multibyte)
6030      int multibyte;
6031 {
6032   Lisp_Object buffer;
6033   struct buffer *buf;
6034
6035   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6036   buf = XBUFFER (buffer);
       /* Reset state that may be left over from an earlier use of the
          work buffer.  */
6037   delete_all_overlays (buf);
6038   buf->directory = current_buffer->directory;
6039   buf->read_only = Qnil;
6040   buf->filename = Qnil;
       /* An undo list of t makes code_convert_region take its
          "noundo" adjustment path; presumably it disables undo
          recording in general.  */
6041   buf->undo_list = Qt;
6042   eassert (buf->overlays_before == NULL);
6043   eassert (buf->overlays_after == NULL);
6044   set_buffer_internal (buf);
       /* Remove any narrowing before deleting from BEG to Z.  */
6045   if (BEG != BEGV || Z != ZV)
6046     Fwiden ();
6047   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
       /* The multibyteness is switched only after the text is gone.  */
6048   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6049   return buf;
6050 }
6051
/* Run a conversion function of CODING on the text of STR and return
   the resulting text as a string.

   STR is inserted into the conversion work buffer.  If ENCODEP is
   nonzero, CODING->pre_write_conversion is called with the buffer
   bounds BEG and Z.  Otherwise point is moved to BEG and
   CODING->post_read_conversion is called with the number of
   characters in the buffer; Vlast_coding_system_used is set to
   CODING->symbol before that call and CODING->symbol is updated
   from it afterwards.

   The returned string is made from whatever the work buffer holds
   once the function has run.  The buffer that was current on entry
   is restored through the unwind-protect record made here.  */
6052 Lisp_Object
6053 run_pre_post_conversion_on_str (str, coding, encodep)
6054      Lisp_Object str;
6055      struct coding_system *coding;
6056      int encodep;
6057 {
6058   int count = SPECPDL_INDEX ();
6059   struct gcpro gcpro1, gcpro2;
6060   int multibyte = STRING_MULTIBYTE (str);
6061   struct buffer *buf;		/* Declared but not referenced below.  */
6062   Lisp_Object old_deactivate_mark;
6063
       /* Both records are undone by the unbind_to at the end.  */
6064   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6065   record_unwind_protect (code_convert_region_unwind,
6066                          Vlast_coding_system_used);
6067   /* It is not crucial to specbind this.  */
6068   old_deactivate_mark = Vdeactivate_mark;
6069   GCPRO2 (str, old_deactivate_mark);
6070
6071   /* We must insert the contents of STR as is without
6072      unibyte<->multibyte conversion.  For that, we adjust the
6073      multibyteness of the working buffer to that of STR.  */
6074   set_conversion_work_buffer (multibyte);
6075
6076   insert_from_string (str, 0, 0,
6077                       SCHARS (str), SBYTES (str), 0);
6078   UNGCPRO;
       /* Keep conversion functions from being run recursively while
          the one called below is running.  */
6079   inhibit_pre_post_conversion = 1;
6080   if (encodep)
6081     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6082   else
6083     {
6084       Vlast_coding_system_used = coding->symbol;
6085       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6086       call1 (coding->post_read_conversion, make_number (Z - BEG));
           /* Pick up whatever value Vlast_coding_system_used has after
              the call.  */
6087       coding->symbol = Vlast_coding_system_used;
6088     }
6089   inhibit_pre_post_conversion = 0;
6090   Vdeactivate_mark = old_deactivate_mark;
       /* The result is the whole text of the work buffer.  */
6091   str = make_buffer_string (BEG, Z, 1);
6092   return unbind_to (count, str);
6093 }
6094
6095
6096 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6097    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6098    is intended that this function is called from encode_terminal_code,
6099    the pre-write-conversion function is run by safe_call and thus
6100    "Error during redisplay: ..." is logged when an error occurs.
6101
6102    Store the resulting text in *STR and set CODING->produced_char and
6103    CODING->produced to the number of characters and bytes
6104    respectively.  If the size of *STR is too small, enlarge it by
6105    xrealloc and update *STR and *SIZE.

        The text is processed in the conversion work buffer.  On
        return CODING->src_multibyte reflects the multibyteness of that
        buffer after the function has run, Vdeactivate_mark and
        Vlast_coding_system_used have the values they had on entry, and
        the buffer that was current on entry is current again.

        Note that the function name is spelled "conversin".  */
6106
6107 void
6108 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6109      unsigned char **str;
6110      int *size, nchars, nbytes;
6111      struct coding_system *coding;
6112 {
6113   struct gcpro gcpro1, gcpro2;
6114   struct buffer *cur = current_buffer;
6115   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6116   Lisp_Object args[3];
6117
6118   /* It is not crucial to specbind this.  */
6119   old_deactivate_mark = Vdeactivate_mark;
6120   old_last_coding_system_used = Vlast_coding_system_used;
6121   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6122
6123   /* We must insert the contents of STR as is without
6124      unibyte<->multibyte conversion.  For that, we adjust the
6125      multibyteness of the working buffer to that of STR.  */
6126   set_conversion_work_buffer (coding->src_multibyte);
6127   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6128   UNGCPRO;
6129   inhibit_pre_post_conversion = 1;
       /* Call the function with the bounds of the work buffer.  */
6130   args[0] = coding->pre_write_conversion;
6131   args[1] = make_number (BEG);
6132   args[2] = make_number (Z);
6133   safe_call (3, args);
6134   inhibit_pre_post_conversion = 0;
6135   Vdeactivate_mark = old_deactivate_mark;
6136   Vlast_coding_system_used = old_last_coding_system_used;
       /* The result is whatever text the work buffer holds now.  */
6137   coding->produced_char = Z - BEG;
6138   coding->produced = Z_BYTE - BEG_BYTE;
6139   if (coding->produced > *size)
6140     {
6141       *size = coding->produced;
6142       *str = xrealloc (*str, *size);
6143     }
       /* Move the gap out of the middle of the text before copying the
          text out as a single run starting at BEG_ADDR.  */
6144   if (BEG < GPT && GPT < Z)
6145     move_gap (BEG);
6146   bcopy (BEG_ADDR, *str, coding->produced);
6147   coding->src_multibyte
6148     = ! NILP (current_buffer->enable_multibyte_characters);
6149   set_buffer_internal (cur);
6150 }
6151
6152
/* Decode the text of STR according to CODING and return the decoded
   text as a string.

   When CODING still requires detection, the text encoding and the
   end-of-line format are detected from the bytes of STR first.  If
   nothing needs decoding and CODING has no usable
   post-read-conversion function, the text is returned undecoded;
   NOCOPY nonzero permits returning the string at hand instead of a
   copy of it.

   On return CODING->consumed, CODING->consumed_char,
   CODING->produced and CODING->produced_char describe the whole
   conversion, including the text at either end that was copied
   without being decoded.  A usable post-read-conversion function is
   run on the decoded string last.  */
6153 Lisp_Object
6154 decode_coding_string (str, coding, nocopy)
6155      Lisp_Object str;
6156      struct coding_system *coding;
6157      int nocopy;
6158 {
6159   int len;
6160   struct conversion_buffer buf;
6161   int from, to_byte;
6162   Lisp_Object saved_coding_symbol;
6163   int result;
6164   int require_decoding;
6165   int shrinked_bytes = 0;
6166   Lisp_Object newstr;
6167   int consumed, consumed_char, produced, produced_char;
6168
       /* FROM and TO_BYTE are byte offsets into STR that bound the part
          still to be decoded.  */
6169   from = 0;
6170   to_byte = SBYTES (str);
6171
6172   saved_coding_symbol = coding->symbol;
6173   coding->src_multibyte = STRING_MULTIBYTE (str);
6174   coding->dst_multibyte = 1;
6175   if (CODING_REQUIRE_DETECTION (coding))
6176     {
6177       /* See the comments in code_convert_region.  */
6178       if (coding->type == coding_type_undecided)
6179         {
6180           detect_coding (coding, SDATA (str), to_byte);
               /* Detection found nothing: fall back to emacs-mule.  */
6181           if (coding->type == coding_type_undecided)
6182             {
6183               coding->type = coding_type_emacs_mule;
6184               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6185               /* As emacs-mule decoder will handle composition, we
6186                  need this setting to allocate coding->cmp_data
6187                  later.  */
6188               coding->composing = COMPOSITION_NO;
6189             }
6190         }
6191       if (coding->eol_type == CODING_EOL_UNDECIDED
6192           && coding->type != coding_type_ccl)
6193         {
6194           saved_coding_symbol = coding->symbol;
6195           detect_eol (coding, SDATA (str), to_byte);
               /* Still undecided: treat the text as having LF ends.  */
6196           if (coding->eol_type == CODING_EOL_UNDECIDED)
6197             coding->eol_type = CODING_EOL_LF;
6198           /* We had better recover the original eol format if we
6199              encounter an inconsistent eol format while decoding.  */
6200           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6201         }
6202     }
6203
6204   if (coding->type == coding_type_no_conversion
6205       || coding->type == coding_type_raw_text)
6206     coding->dst_multibyte = 0;
6207
6208   require_decoding = CODING_REQUIRE_DECODING (coding);
6209
6210   if (STRING_MULTIBYTE (str))
6211     {
6212       /* Decoding routines expect the source text to be unibyte.  */
6213       str = Fstring_as_unibyte (str);
6214       to_byte = SBYTES (str);
           /* NOTE(review): STR presumably no longer refers to the
              caller's string here, so no further copy is requested --
              confirm against Fstring_as_unibyte.  */
6215       nocopy = 1;
6216       coding->src_multibyte = 0;
6217     }
6218
6219   /* Try to skip the heading and tailing ASCIIs.  */
6220   if (require_decoding && coding->type != coding_type_ccl)
6221     {
6222       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6223                                 0);
6224       if (from == to_byte)
6225         require_decoding = 0;
           /* Bytes skipped at the head plus bytes skipped at the tail.  */
6226       shrinked_bytes = from + (SBYTES (str) - to_byte);
6227     }
6228
       /* Nothing to decode and no function to run afterwards: hand the
          text back, changing only its multibyteness if required.  */
6229   if (!require_decoding
6230       && !(SYMBOLP (coding->post_read_conversion)
6231            && !NILP (Ffboundp (coding->post_read_conversion))))
6232     {
6233       coding->consumed = SBYTES (str);
6234       coding->consumed_char = SCHARS (str);
6235       if (coding->dst_multibyte)
6236         {
6237           str = Fstring_as_multibyte (str);
6238           nocopy = 1;
6239         }
6240       coding->produced = SBYTES (str);
6241       coding->produced_char = SCHARS (str);
6242       return (nocopy ? str : Fcopy_sequence (str));
6243     }
6244
6245   if (coding->composing != COMPOSITION_DISABLED)
6246     coding_allocate_composition_data (coding, from);
6247   len = decoding_buffer_size (coding, to_byte - from);
6248   allocate_conversion_buffer (buf, len);
6249
       /* Decode in as many passes as needed.  Each pass continues where
          the previous one stopped, both in the source and in BUF; the
          local counters accumulate the per-pass results.  */
6250   consumed = consumed_char = produced = produced_char = 0;
6251   while (1)
6252     {
6253       result = decode_coding (coding, SDATA (str) + from + consumed,
6254                               buf.data + produced, to_byte - from - consumed,
6255                               buf.size - produced);
6256       consumed += coding->consumed;
6257       consumed_char += coding->consumed_char;
6258       produced += coding->produced;
6259       produced_char += coding->produced_char;
           /* Stop on normal completion, or when more source is wanted
              although this pass consumed nothing.  */
6260       if (result == CODING_FINISH_NORMAL
6261           || (result == CODING_FINISH_INSUFFICIENT_SRC
6262               && coding->consumed == 0))
6263         break;
6264       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6265         coding_allocate_composition_data (coding, from + produced_char);
6266       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6267         extend_conversion_buffer (&buf);
6268       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6269         {
6270           Lisp_Object eol_type;
6271
6272           /* Recover the original EOL format.  */
6273           if (coding->eol_type == CODING_EOL_CR)
6274             {
                   /* Turn every LF produced so far back into CR.  */
6275               unsigned char *p;
6276               for (p = buf.data; p < buf.data + produced; p++)
6277                 if (*p == '\n') *p = '\r';
6278             }
6279           else if (coding->eol_type == CODING_EOL_CRLF)
6280             {
                   /* Count the LFs produced so far, make sure BUF has
                      room for as many CRs, then move the text toward
                      the end of BUF while putting a CR in front of
                      each LF.  The copy runs backward from the end of
                      the produced text.  */
6281               int num_eol = 0;
6282               unsigned char *p0, *p1;
6283               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6284                 if (*p0 == '\n') num_eol++;
6285               if (produced + num_eol >= buf.size)
6286                 extend_conversion_buffer (&buf);
6287               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6288                 {
6289                   *--p1 = *--p0;
6290                   if (*p0 == '\n') *--p1 = '\r';
6291                 }
6292               produced += num_eol;
6293               produced_char += num_eol;
6294             }
6295           /* Suppress eol-format conversion in the further conversion.  */
6296           coding->eol_type = CODING_EOL_LF;
6297
6298           /* Set the coding system symbol to that for Unix-like EOL.  */
6299           eol_type = Fget (saved_coding_symbol, Qeol_type);
6300           if (VECTORP (eol_type)
6301               && XVECTOR (eol_type)->size == 3
6302               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6303             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6304           else
6305             coding->symbol = saved_coding_symbol;
6306
6307
6308         }
6309     }
6310
6311   coding->consumed = consumed;
6312   coding->consumed_char = consumed_char;
6313   coding->produced = produced;
6314   coding->produced_char = produced_char;
6315
       /* Assemble the result: the skipped head of STR, the decoded
          text from BUF, then the skipped tail of STR.  */
6316   if (coding->dst_multibyte)
6317     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6318                                            produced + shrinked_bytes);
6319   else
6320     newstr = make_uninit_string (produced + shrinked_bytes);
6321   if (from > 0)
6322     STRING_COPYIN (newstr, 0, SDATA (str), from);
6323   STRING_COPYIN (newstr, from, buf.data, produced);
       /* SHRINKED_BYTES - FROM is the number of bytes skipped at the
          tail.  */
6324   if (shrinked_bytes > from)
6325     STRING_COPYIN (newstr, from + produced,
6326                    SDATA (str) + to_byte,
6327                    shrinked_bytes - from);
6328   free_conversion_buffer (&buf);
6329
       /* The skipped text is counted as one character per byte, in the
          source as well as in the result.  */
6330   coding->consumed += shrinked_bytes;
6331   coding->consumed_char += shrinked_bytes;
6332   coding->produced += shrinked_bytes;
6333   coding->produced_char += shrinked_bytes;
6334
6335   if (coding->cmp_data && coding->cmp_data->used)
6336     coding_restore_composition (coding, newstr);
6337   coding_free_composition_data (coding);
6338
       /* Run the post-read-conversion function only if it is a symbol
          with a function definition.  */
6339   if (SYMBOLP (coding->post_read_conversion)
6340       && !NILP (Ffboundp (coding->post_read_conversion)))
6341     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6342
6343   return newstr;
6344 }
6345
/* Encode the text of STR according to CODING and return the encoded
   text as a string.

   A pre-write-conversion function of CODING, if it is a symbol with
   a function definition, is run on STR first and the rest of the
   work is done on its result.  If CODING requires no encoding, or
   if the whole text is skipped by SHRINK_CONVERSION_REGION, the text
   is returned unencoded; NOCOPY nonzero permits returning the string
   at hand instead of a copy of it.

   After an actual encoding run, CODING->consumed,
   CODING->consumed_char, CODING->produced and CODING->produced_char
   hold the totals accumulated over the calls to encode_coding.  */
6346 Lisp_Object
6347 encode_coding_string (str, coding, nocopy)
6348      Lisp_Object str;
6349      struct coding_system *coding;
6350      int nocopy;
6351 {
6352   int len;
6353   struct conversion_buffer buf;
6354   int from, to, to_byte;
6355   int result;
6356   int shrinked_bytes = 0;
6357   Lisp_Object newstr;
6358   int consumed, consumed_char, produced, produced_char;
6359
6360   if (SYMBOLP (coding->pre_write_conversion)
6361       && !NILP (Ffboundp (coding->pre_write_conversion)))
6362     str = run_pre_post_conversion_on_str (str, coding, 1);
6363
       /* FROM and TO_BYTE are byte offsets into STR; TO is the length
          of STR in characters.  */
6364   from = 0;
6365   to = SCHARS (str);
6366   to_byte = SBYTES (str);
6367
6368   /* Encoding routines determine the multibyteness of the source text
6369      by coding->src_multibyte.  */
6370   coding->src_multibyte = STRING_MULTIBYTE (str);
6371   coding->dst_multibyte = 0;
6372   if (! CODING_REQUIRE_ENCODING (coding))
6373     {
           /* No encoding needed: return the text, as unibyte.  */
6374       coding->consumed = SBYTES (str);
6375       coding->consumed_char = SCHARS (str);
6376       if (STRING_MULTIBYTE (str))
6377         {
6378           str = Fstring_as_unibyte (str);
6379           nocopy = 1;
6380         }
6381       coding->produced = SBYTES (str);
6382       coding->produced_char = SCHARS (str);
6383       return (nocopy ? str : Fcopy_sequence (str));
6384     }
6385
6386   if (coding->composing != COMPOSITION_DISABLED)
6387     coding_save_composition (coding, from, to, str);
6388
6389   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
6390      if we must run CCL program or there are compositions to
6391      encode.  */
6392   if (coding->type != coding_type_ccl
6393       && (! coding->cmp_data || coding->cmp_data->used == 0))
6394     {
6395       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6396                                 1);
6397       if (from == to_byte)
6398         {
               /* Everything was skipped, so there is nothing to encode.
                  NOTE(review): this path assigns none of the
                  CODING->consumed/produced counters and does not
                  convert a multibyte STR to unibyte, unlike the early
                  return above -- confirm that callers do not rely on
                  either.  */
6399           coding_free_composition_data (coding);
6400           return (nocopy ? str : Fcopy_sequence (str));
6401         }
           /* Bytes skipped at the head plus bytes skipped at the tail.  */
6402       shrinked_bytes = from + (SBYTES (str) - to_byte);
6403     }
6404
6405   len = encoding_buffer_size (coding, to_byte - from);
6406   allocate_conversion_buffer (buf, len);
6407
       /* Encode in as many passes as needed.  Each pass continues where
          the previous one stopped, both in the source and in BUF; the
          local counters accumulate the per-pass results.  */
6408   consumed = consumed_char = produced = produced_char = 0;
6409   while (1)
6410     {
6411       result = encode_coding (coding, SDATA (str) + from + consumed,
6412                               buf.data + produced, to_byte - from - consumed,
6413                               buf.size - produced);
6414       consumed += coding->consumed;
6415       consumed_char += coding->consumed_char;
6416       produced += coding->produced;
6417       produced_char += coding->produced_char;
           /* Stop on normal completion, on interruption, or when more
              source is wanted although this pass consumed nothing.  */
6418       if (result == CODING_FINISH_NORMAL
6419           || result == CODING_FINISH_INTERRUPT
6420           || (result == CODING_FINISH_INSUFFICIENT_SRC
6421               && coding->consumed == 0))
6422         break;
6423       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6424       extend_conversion_buffer (&buf);
6425     }
6426
       /* NOTE(review): unlike decode_coding_string, these counters are
          not increased by SHRINKED_BYTES afterwards -- confirm whether
          that difference is intended.  */
6427   coding->consumed = consumed;
6428   coding->consumed_char = consumed_char;
6429   coding->produced = produced;
6430   coding->produced_char = produced_char;
6431
       /* Assemble the result: the skipped head of STR, the encoded
          text from BUF, then the skipped tail of STR.  */
6432   newstr = make_uninit_string (produced + shrinked_bytes);
6433   if (from > 0)
6434     STRING_COPYIN (newstr, 0, SDATA (str), from);
6435   STRING_COPYIN (newstr, from, buf.data, produced);
       /* SHRINKED_BYTES - FROM is the number of bytes skipped at the
          tail.  */
6436   if (shrinked_bytes > from)
6437     STRING_COPYIN (newstr, from + produced,
6438                    SDATA (str) + to_byte,
6439                    shrinked_bytes - from);
6440
6441   free_conversion_buffer (&buf);
6442   coding_free_composition_data (coding);
6443
6444   return newstr;
6445 }
6446
6447 \f
6448 #ifdef emacs
6449 /*** 8. Emacs Lisp library functions ***/
6450
6451 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6452        doc: /* Return t if OBJECT is nil or a coding-system.
6453 See the documentation of `make-coding-system' for information
6454 about coding-system objects.  */)
6455      (obj)
6456      Lisp_Object obj;
6457 {
6458   if (NILP (obj))
6459     return Qt;
6460   if (!SYMBOLP (obj))
6461     return Qnil;
       /* A symbol that carries a definition form is accepted without
          looking at its coding-spec vector; Fcheck_coding_system is
          the function that evaluates such a form.  */
6462   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6463     return Qt;
6464   /* Get coding-spec vector for OBJ.  */
6465   obj = Fget (obj, Qcoding_system);
       /* The coding-spec must be a vector of exactly five elements.  */
6466   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6467           ? Qt : Qnil);
6468 }
6469
6470 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6471        Sread_non_nil_coding_system, 1, 1, 0,
6472        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6473      (prompt)
6474      Lisp_Object prompt;
6475 {
6476   Lisp_Object val;
       /* Ask again for as long as the input is empty.  Completion is
          done against Vcoding_system_alist.  */
6477   do
6478     {
6479       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6480                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6481     }
6482   while (SCHARS (val) == 0);
       /* The input is a string; return the symbol of that name.  */
6483   return (Fintern (val, Qnil));
6484 }
6485
6486 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6487        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6488 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6489      (prompt, default_coding_system)
6490      Lisp_Object prompt, default_coding_system;
6491 {
6492   Lisp_Object val;
       /* A symbol given as the default is handed to Fcompleting_read
          as its name, i.e. as a string.  */
6493   if (SYMBOLP (default_coding_system))
6494     default_coding_system = SYMBOL_NAME (default_coding_system);
6495   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6496                           Qt, Qnil, Qcoding_system_history,
6497                           default_coding_system, Qnil);
       /* An empty result yields nil; anything else is interned.  */
6498   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6499 }
6500
6501 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6502        1, 1, 0,
6503        doc: /* Check validity of CODING-SYSTEM.
6504 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6505 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6506 The value of this property should be a vector of length 5.  */)
6507      (coding_system)
6508      Lisp_Object coding_system;
6509 {
6510   Lisp_Object define_form;
6511
       /* If CODING-SYSTEM carries a definition form, evaluate it now.
          The property is cleared before the form is evaluated, so the
          form is evaluated at most once from here.  */
6512   define_form = Fget (coding_system, Qcoding_system_define_form);
6513   if (! NILP (define_form))
6514     {
6515       Fput (coding_system, Qcoding_system_define_form, Qnil);
6516       safe_eval (define_form);
6517     }
6518   if (!NILP (Fcoding_system_p (coding_system)))
6519     return coding_system;
       /* Invalid.  The signal is raised in a loop, presumably so that
          the function never falls off its end should Fsignal return.  */
6520   while (1)
6521     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6522 }
6523 \f
/* Detect how the SRC_BYTES bytes starting at SRC are encoded.

   If HIGHEST is nonzero, return the candidate coding system of the
   highest priority, or nil if there is no candidate.  Otherwise
   return a list of all candidates, in the order in which their
   categories appear in Vcoding_category_list.  MULTIBYTEP is handed
   to detect_coding_mask unchanged.

   When detect_coding_mask reports no category at all, the answer is
   `undecided', or its subsidiary for the detected end-of-line format
   if one was detected.  Likewise, when an end-of-line format was
   detected, each candidate is replaced by its subsidiary for that
   format if it has an `eol-type' vector.  */
6524 Lisp_Object
6525 detect_coding_system (src, src_bytes, highest, multibytep)
6526      const unsigned char *src;
6527      int src_bytes, highest;
6528      int multibytep;
6529 {
6530   int coding_mask, eol_type;
6531   Lisp_Object val, tmp;
6532   int dummy;
6533
6534   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6535   eol_type  = detect_eol_type (src, src_bytes, &dummy);
       /* An inconsistent end-of-line format is treated like an
          undetected one.  */
6536   if (eol_type == CODING_EOL_INCONSISTENT)
6537     eol_type = CODING_EOL_UNDECIDED;
6538
6539   if (!coding_mask)
6540     {
6541       val = Qundecided;
6542       if (eol_type != CODING_EOL_UNDECIDED)
6543         {
6544           Lisp_Object val2;
6545           val2 = Fget (Qundecided, Qeol_type);
6546           if (VECTORP (val2))
6547             val = XVECTOR (val2)->contents[eol_type];
6548         }
6549       return (highest ? val : Fcons (val, Qnil));
6550     }
6551
6552   /* At first, gather possible coding systems in VAL.  */
6553   val = Qnil;
6554   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6555     {
6556       Lisp_Object category_val, category_index;
6557
6558       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6559       category_val = Fsymbol_value (XCAR (tmp));
           /* Keep the category's coding system only if it has one and
              the category's bit is set in CODING_MASK.  */
6560       if (!NILP (category_val)
6561           && NATNUMP (category_index)
6562           && (coding_mask & (1 << XFASTINT (category_index))))
6563         {
6564           val = Fcons (category_val, val);
               /* The first match is the one of highest priority.  */
6565           if (highest)
6566             break;
6567         }
6568     }
       /* The candidates were pushed in front, so restore list order.  */
6569   if (!highest)
6570     val = Fnreverse (val);
6571
6572   /* Then, replace the elements with subsidiary coding systems.  */
6573   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6574     {
6575       if (eol_type != CODING_EOL_UNDECIDED
6576           && eol_type != CODING_EOL_INCONSISTENT)
6577         {
6578           Lisp_Object eol;
6579           eol = Fget (XCAR (tmp), Qeol_type);
6580           if (VECTORP (eol))
6581             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6582         }
6583     }
       /* VAL is still nil if no category in Vcoding_category_list
          supplied a candidate although CODING_MASK was nonzero.  XCAR
          must not be applied to nil, so return nil in that case.  */
6584   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6585 }
6586
6587 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6588        2, 3, 0,
6589        doc: /* Detect how the byte sequence in the region is encoded.
6590 Return a list of possible coding systems used on decoding a byte
6591 sequence containing the bytes in the region between START and END when
6592 the coding system `undecided' is specified.  The list is ordered by
6593 priority decided in the current language environment.
6594
6595 If only ASCII characters are found, it returns a list of single element
6596 `undecided' or its subsidiary coding system according to a detected
6597 end-of-line format.
6598
6599 If optional argument HIGHEST is non-nil, return the coding system of
6600 highest priority.  */)
6601      (start, end, highest)
6602      Lisp_Object start, end, highest;
6603 {
6604   int from, to;
6605   int from_byte, to_byte;
6606   int include_anchor_byte = 0;
6607
6608   CHECK_NUMBER_COERCE_MARKER (start);
6609   CHECK_NUMBER_COERCE_MARKER (end);
6610
6611   validate_region (&start, &end);
6612   from = XINT (start), to = XINT (end);
6613   from_byte = CHAR_TO_BYTE (from);
6614   to_byte = CHAR_TO_BYTE (to);
6615
       /* If the region reaches across the gap, move the gap to the end
          of the region so that the bytes of the region can be scanned
          as one run.  */
6616   if (from < GPT && to >= GPT)
6617     move_gap_both (to, to_byte);
6618   /* If an anchor byte `\0' follows the region, we include it in
6619      the detecting source.  Then code detectors can handle the tailing
6620      byte sequence more accurately.
6621
6622      Fix me: This is not a perfect solution.  It is better that we
6623      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6624   */
6625   if (to == Z || (to == GPT && GAP_SIZE > 0))
6626     include_anchor_byte = 1;
       /* The multibyteness of the current buffer tells the detector
          how to read the bytes.  */
6627   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6628                                to_byte - from_byte + include_anchor_byte,
6629                                !NILP (highest),
6630                                !NILP (current_buffer
6631                                       ->enable_multibyte_characters));
6632 }
6633
6634 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6635        1, 2, 0,
6636        doc: /* Detect how the byte sequence in STRING is encoded.
6637 Return a list of possible coding systems used on decoding a byte
6638 sequence containing the bytes in STRING when the coding system
6639 `undecided' is specified.  The list is ordered by priority decided in
6640 the current language environment.
6641
6642 If only ASCII characters are found, it returns a list of single element
6643 `undecided' or its subsidiary coding system according to a detected
6644 end-of-line format.
6645
6646 If optional argument HIGHEST is non-nil, return the coding system of
6647 highest priority.  */)
6648      (string, highest)
6649      Lisp_Object string, highest;
6650 {
6651   CHECK_STRING (string);
6652
       /* All the work is done by detect_coding_system; the
          multibyteness of STRING tells it how to read the bytes.  */
6653   return detect_coding_system (SDATA (string),
6654                                /* "+ 1" is to include the anchor byte
6655                                   `\0'.  With this, code detectors can
6656                                   handle the tailing bytes more
6657                                   accurately.  */
6658                                SBYTES (string) + 1,
6659                                !NILP (highest),
6660                                STRING_MULTIBYTE (string));
6661 }
6662
6663 /*  Subroutine for Ffind_coding_systems_region_internal.
6664
6665     Return a list of coding systems that safely encode the multibyte
6666     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6667     possible coding systems.  If it is nil, it means that we have not
6668     yet found any coding systems.
6669
6670     WORK_TABLE is a char-table of which an element is set to t once
6671     the element is looked up.
6672
6673     If a non-ASCII single byte char is found, set
6674     *single_byte_char_found to 1.

         SAFE_CODINGS is modified destructively: entries are unlinked
         from the list, and an entry of the form (CODING . SAFE-CHARS)
         is replaced by a five-element list the first time it fails
         the plain safe-chars test.  The return value is what remains
         of the list.  */
6675
6676 static Lisp_Object
6677 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6678      unsigned char *p, *pend;
6679      Lisp_Object safe_codings, work_table;
6680      int *single_byte_char_found;
6681 {
6682   int c, len;
6683   Lisp_Object val, ch;
6684   Lisp_Object prev, tail;
6685
       /* With no candidate left there is nothing to filter; only the
          search for a single-byte character below remains.  */
6686   if (NILP (safe_codings))
6687     goto done_safe_codings;
6688   while (p < pend)
6689     {
6690       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6691       p += len;
6692       if (ASCII_BYTE_P (c))
6693         /* We can ignore ASCII characters here.  */
6694         continue;
6695       if (SINGLE_BYTE_CHAR_P (c))
6696         *single_byte_char_found = 1;
6697       /* Check the safe coding systems for C.  */
6698       ch = make_number (c);
6699       val = Faref (work_table, ch);
6700       if (EQ (val, Qt))
6701         /* This element was already checked.  Ignore it.  */
6702         continue;
6703       /* Remember that we checked this element.  */
6704       Faset (work_table, ch, Qt);
6705
           /* Walk the candidates.  PREV trails TAIL and names the last
              entry that was kept, so that a rejected entry can be
              unlinked.  */
6706       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6707         {
6708           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6709           int encodable;
6710
6711           elt = XCAR (tail);
6712           if (CONSP (XCDR (elt)))
6713             {
6714               /* This entry has this format now:
6715                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6716                           ACCEPT-LATIN-EXTRA ) */
6717               val = XCDR (elt);
6718               encodable = ! NILP (Faref (XCAR (val), ch));
                   /* The remaining fields are needed only for the
                      fallback tests below.  */
6719               if (! encodable)
6720                 {
6721                   val = XCDR (val);
6722                   translation_table = XCAR (val);
6723                   hash_table = XCAR (XCDR (val));
6724                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6725                 }
6726             }
6727           else
6728             {
6729               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6730               encodable = ! NILP (Faref (XCDR (elt), ch));
6731               if (! encodable)
6732                 {
6733                   /* Transform the format to:
6734                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6735                        ACCEPT-LATIN-EXTRA )  */
6736                   val = Fget (XCAR (elt), Qcoding_system);
6737                   translation_table
6738                     = Fplist_get (AREF (val, 3),
6739                                   Qtranslation_table_for_encode);
6740                   if (SYMBOLP (translation_table))
6741                     translation_table = Fget (translation_table,
6742                                               Qtranslation_table);
6743                   hash_table
6744                     = (CHAR_TABLE_P (translation_table)
6745                        ? XCHAR_TABLE (translation_table)->extras[1]
6746                        : Qnil);
                       /* NOTE(review): presumably type 2 in slot 0 of
                          the coding-spec means ISO-2022, and element 16
                          of the vector in slot 4 is its flag for
                          accepting Latin extra codes -- confirm.  */
6747                   accept_latin_extra
6748                     = ((EQ (AREF (val, 0), make_number (2))
6749                         && VECTORP (AREF (val, 4)))
6750                        ? AREF (AREF (val, 4), 16)
6751                        : Qnil);
6752                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6753                                         translation_table, hash_table,
6754                                         accept_latin_extra));
6755                 }
6756             }
6757
               /* A character missing from SAFE-CHARS is still accepted
                  if the translation table or the hash table has a
                  non-nil entry for it, or if it is a single-byte
                  character listed in Vlatin_extra_code_table and the
                  coding system accepts such codes.  */
6758           if (! encodable
6759               && ((CHAR_TABLE_P (translation_table)
6760                    && ! NILP (Faref (translation_table, ch)))
6761                   || (HASH_TABLE_P (hash_table)
6762                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6763                   || (SINGLE_BYTE_CHAR_P (c)
6764                       && ! NILP (accept_latin_extra)
6765                       && VECTORP (Vlatin_extra_code_table)
6766                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6767             encodable = 1;
6768           if (encodable)
6769             prev = tail;
6770           else
6771             {
6772               /* Exclude this coding system from SAFE_CODINGS.  */
6773               if (EQ (tail, safe_codings))
6774                 {
                       /* The head is dropped by advancing the list
                          itself; stop early once nothing is left.  */
6775                   safe_codings = XCDR (safe_codings);
6776                   if (NILP (safe_codings))
6777                     goto done_safe_codings;
6778                 }
6779               else
6780                 XSETCDR (prev, XCDR (tail));
6781             }
6782         }
6783     }
6784
6785  done_safe_codings:
6786   /* If the above loop was terminated before P reaches PEND, it means
6787      SAFE_CODINGS was set to nil.  If we have not yet found an
6788      non-ASCII single-byte char, check it now.  */
6789   if (! *single_byte_char_found)
6790     while (p < pend)
6791       {
6792         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6793         p += len;
6794         if (! ASCII_BYTE_P (c)
6795             && SINGLE_BYTE_CHAR_P (c))
6796           {
6797             *single_byte_char_found = 1;
6798             break;
6799           }
6800       }
6801   return safe_codings;
6802 }
6803
DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 2, 0,
       doc: /* Internal use only.  */)
     (start, end)
     Lisp_Object start, end;
{
  /* Compute which coding systems can safely encode some text.

     If START is a string, the text is that string and END is not
     looked at.  Otherwise START and END are positions (or markers)
     delimiting a region of the current buffer.

     The value is t when the text is unibyte, when it is pure ASCII, or
     when the filtered list is still EQ to the cdr of
     Vcoding_system_safe_chars; otherwise it is a freshly consed list
     of coding system symbols.  */
  Lisp_Object work_table, safe_codings;
  int non_ascii_p = 0;
  int single_byte_char_found = 0;
  /* The text is scanned as two byte ranges, [P1, P1END) and
     [P2, P2END).  The second range is empty unless the buffer gap
     lies strictly inside the region.  */
  const unsigned char *p1, *p1end, *p2, *p2end, *p;

  if (STRINGP (start))
    {
      /* A unibyte string needs no further checking.  */
      if (!STRING_MULTIBYTE (start))
        return Qt;
      p1 = SDATA (start), p1end = p1 + SBYTES (start);
      p2 = p2end = p1end;
      /* More bytes than characters means there is at least one
         multibyte sequence.  */
      if (SCHARS (start) != SBYTES (start))
        non_ascii_p = 1;
    }
  else
    {
      int from, to, stop;

      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      /* Likewise, a unibyte buffer needs no further checking.  */
      if (NILP (current_buffer->enable_multibyte_characters))
        return Qt;
      from = CHAR_TO_BYTE (XINT (start));
      to = CHAR_TO_BYTE (XINT (end));
      /* Split the region at the gap when the gap is inside it.  */
      stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
      p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
      if (stop == to)
        p2 = p2end = p1end;
      else
        p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
      /* Character length differs from byte length: some character
         occupies more than one byte.  */
      if (XINT (end) - XINT (start) != to - from)
        non_ascii_p = 1;
    }

  if (!non_ascii_p)
    {
      /* We are sure that the text contains no multibyte character.
         Check if it contains eight-bit-graphic.  */
      p = p1;
      for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
      if (p == p1end)
        {
          for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
          /* Both ranges are pure ASCII.  */
          if (p == p2end)
            return Qt;
        }
    }

  /* The text contains non-ASCII characters.  */

  work_table = Fmake_char_table (Qchar_coding_system, Qnil);
  /* Work on a copy of the alist stored in the cdr of
     Vcoding_system_safe_chars; find_safe_codings returns the list
     that remains after its filtering.  */
  safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));

  safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
                                    &single_byte_char_found);
  if (p2 < p2end)
    safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
                                      &single_byte_char_found);
  if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
    safe_codings = Qt;
  else
    {
      /* Turn safe_codings to a list of coding systems... */
      Lisp_Object val;

      if (single_byte_char_found)
        /* ... and append these for eight-bit chars.  */
        val = Fcons (Qraw_text,
                     Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
      else
        /* ... and append generic coding systems.  */
        val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));

      /* Each remaining element is a cons whose car is the coding
         system symbol; push those cars in front of VAL (which
         reverses their order).  */
      for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
        val = Fcons (XCAR (XCAR (safe_codings)), val);
      safe_codings = val;
    }

  return safe_codings;
}
6893
6894
/* Search from position POS for characters that are unencodable
   according to SAFE_CHARS, and return a list of their positions.  P
   points to where in memory the character at POS is stored.  Stop the
   search at PEND, or as soon as N unencodable characters have been found.

   If SAFE_CHARS is a char table, an element for an unencodable
   character is nil.

   If SAFE_CHARS is nil, all non-ASCII characters are unencodable.

   Otherwise, SAFE_CHARS is t, and only eight-bit-control and
   eight-bit-graphic characters are unencodable.  */
6907
6908 static Lisp_Object
6909 unencodable_char_position (safe_chars, pos, p, pend, n)
6910      Lisp_Object safe_chars;
6911      int pos;
6912      unsigned char *p, *pend;
6913      int n;
6914 {
6915   Lisp_Object pos_list;
6916
6917   pos_list = Qnil;
6918   while (p < pend)
6919     {
6920       int len;
6921       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6922
6923       if (c >= 128
6924           && (CHAR_TABLE_P (safe_chars)
6925               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6926               : (NILP (safe_chars) || c < 256)))
6927         {
6928           pos_list = Fcons (make_number (pos), pos_list);
6929           if (--n <= 0)
6930             break;
6931         }
6932       pos++;
6933       p += len;
6934     }
6935   return Fnreverse (pos_list);
6936 }
6937
6938
DEFUN ("unencodable-char-position", Funencodable_char_position,
       Sunencodable_char_position, 3, 5, 0,
       doc: /*
Return position of first un-encodable character in a region.
START and END specify the region and CODING-SYSTEM specifies the
encoding to check.  Return nil if CODING-SYSTEM does encode the region.

If optional 4th argument COUNT is non-nil, it specifies at most how
many un-encodable characters to search.  In this case, the value is a
list of positions.

If optional 5th argument STRING is non-nil, it is a string to search
for un-encodable characters.  In that case, START and END are indexes
to the string.  */)
     (start, end, coding_system, count, string)
     Lisp_Object start, end, coding_system, count, string;
{
  int n;
  Lisp_Object safe_chars;
  struct coding_system coding;
  Lisp_Object positions;
  int from, to;
  /* P and PEND delimit the bytes of the text to examine.  */
  unsigned char *p, *pend;

  if (NILP (string))
    {
      /* Search the current buffer.  */
      validate_region (&start, &end);
      from = XINT (start);
      to = XINT (end);
      /* In a unibyte buffer nothing is reported.  */
      if (NILP (current_buffer->enable_multibyte_characters))
        return Qnil;
      p = CHAR_POS_ADDR (from);
      /* When the region ends exactly at the gap, its last byte is
         the one just before the gap.  */
      if (to == GPT)
        pend = GPT_ADDR;
      else
        pend = CHAR_POS_ADDR (to);
    }
  else
    {
      /* Search STRING; START and END are character indexes into it.  */
      CHECK_STRING (string);
      CHECK_NATNUM (start);
      CHECK_NATNUM (end);
      from = XINT (start);
      to = XINT (end);
      if (from > to
          || to > SCHARS (string))
        args_out_of_range_3 (string, start, end);
      /* In a unibyte string nothing is reported.  */
      if (! STRING_MULTIBYTE (string))
        return Qnil;
      p = SDATA (string) + string_char_to_byte (string, from);
      pend = SDATA (string) + string_char_to_byte (string, to);
    }

  setup_coding_system (Fcheck_coding_system (coding_system), &coding);

  /* Without COUNT, look for just one character.  */
  if (NILP (count))
    n = 1;
  else
    {
      CHECK_NATNUM (count);
      n = XINT (count);
    }

  /* For these two coding types nothing is reported as unencodable.  */
  if (coding.type == coding_type_no_conversion
      || coding.type == coding_type_raw_text)
    return Qnil;

  /* A nil SAFE_CHARS makes unencodable_char_position report every
     character of code 128 or above.  */
  if (coding.type == coding_type_undecided)
    safe_chars = Qnil;
  else
    safe_chars = coding_safe_chars (coding_system);

  if (STRINGP (string)
      || from >= GPT || to <= GPT)
    /* The text is contiguous in memory: one scan is enough.  */
    positions = unencodable_char_position (safe_chars, from, p, pend, n);
  else
    {
      /* The gap lies inside the region: scan the part before the
         gap, then, if fewer than N positions were found, the part
         after it.  */
      Lisp_Object args[2];

      args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
      n -= XINT (Flength (args[0]));
      if (n <= 0)
        positions = args[0];
      else
        {
          args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
                                               pend, n);
          positions = Fappend (2, args);
        }
    }

  /* Without COUNT the value is a single position (or nil), not a
     list.  */
  return  (NILP (count) ? Fcar (positions) : positions);
}
7032
7033
7034 Lisp_Object
7035 code_convert_region1 (start, end, coding_system, encodep)
7036      Lisp_Object start, end, coding_system;
7037      int encodep;
7038 {
7039   struct coding_system coding;
7040   int from, to;
7041
7042   CHECK_NUMBER_COERCE_MARKER (start);
7043   CHECK_NUMBER_COERCE_MARKER (end);
7044   CHECK_SYMBOL (coding_system);
7045
7046   validate_region (&start, &end);
7047   from = XFASTINT (start);
7048   to = XFASTINT (end);
7049
7050   if (NILP (coding_system))
7051     return make_number (to - from);
7052
7053   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7054     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7055
7056   coding.mode |= CODING_MODE_LAST_BLOCK;
7057   coding.src_multibyte = coding.dst_multibyte
7058     = !NILP (current_buffer->enable_multibyte_characters);
7059   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7060                        &coding, encodep, 1);
7061   Vlast_coding_system_used = coding.symbol;
7062   return make_number (coding.produced_char);
7063 }
7064
7065 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7066        3, 3, "r\nzCoding system: ",
7067        doc: /* Decode the current region from the specified coding system.
7068 When called from a program, takes three arguments:
7069 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7070 This function sets `last-coding-system-used' to the precise coding system
7071 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7072 not fully specified.)
7073 It returns the length of the decoded text.  */)
7074      (start, end, coding_system)
7075      Lisp_Object start, end, coding_system;
7076 {
7077   return code_convert_region1 (start, end, coding_system, 0);
7078 }
7079
7080 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7081        3, 3, "r\nzCoding system: ",
7082        doc: /* Encode the current region into the specified coding system.
7083 When called from a program, takes three arguments:
7084 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7085 This function sets `last-coding-system-used' to the precise coding system
7086 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7087 not fully specified.)
7088 It returns the length of the encoded text.  */)
7089      (start, end, coding_system)
7090      Lisp_Object start, end, coding_system;
7091 {
7092   return code_convert_region1 (start, end, coding_system, 1);
7093 }
7094
7095 Lisp_Object
7096 code_convert_string1 (string, coding_system, nocopy, encodep)
7097      Lisp_Object string, coding_system, nocopy;
7098      int encodep;
7099 {
7100   struct coding_system coding;
7101
7102   CHECK_STRING (string);
7103   CHECK_SYMBOL (coding_system);
7104
7105   if (NILP (coding_system))
7106     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7107
7108   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7109     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7110
7111   coding.mode |= CODING_MODE_LAST_BLOCK;
7112   string = (encodep
7113             ? encode_coding_string (string, &coding, !NILP (nocopy))
7114             : decode_coding_string (string, &coding, !NILP (nocopy)));
7115   Vlast_coding_system_used = coding.symbol;
7116
7117   return string;
7118 }
7119
7120 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7121        2, 3, 0,
7122        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7123 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7124 if the decoding operation is trivial.
7125 This function sets `last-coding-system-used' to the precise coding system
7126 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7127 not fully specified.)  */)
7128      (string, coding_system, nocopy)
7129      Lisp_Object string, coding_system, nocopy;
7130 {
7131   return code_convert_string1 (string, coding_system, nocopy, 0);
7132 }
7133
7134 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7135        2, 3, 0,
7136        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7137 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7138 if the encoding operation is trivial.
7139 This function sets `last-coding-system-used' to the precise coding system
7140 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7141 not fully specified.)  */)
7142      (string, coding_system, nocopy)
7143      Lisp_Object string, coding_system, nocopy;
7144 {
7145   return code_convert_string1 (string, coding_system, nocopy, 1);
7146 }
7147
7148 /* Encode or decode STRING according to CODING_SYSTEM.
7149    Do not set Vlast_coding_system_used.
7150
7151    This function is called only from macros DECODE_FILE and
7152    ENCODE_FILE, thus we ignore character composition.  */
7153
7154 Lisp_Object
7155 code_convert_string_norecord (string, coding_system, encodep)
7156      Lisp_Object string, coding_system;
7157      int encodep;
7158 {
7159   struct coding_system coding;
7160
7161   CHECK_STRING (string);
7162   CHECK_SYMBOL (coding_system);
7163
7164   if (NILP (coding_system))
7165     return string;
7166
7167   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7168     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7169
7170   coding.composing = COMPOSITION_DISABLED;
7171   coding.mode |= CODING_MODE_LAST_BLOCK;
7172   return (encodep
7173           ? encode_coding_string (string, &coding, 1)
7174           : decode_coding_string (string, &coding, 1));
7175 }
7176 \f
7177 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7178        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7179 Return the corresponding character.  */)
7180      (code)
7181      Lisp_Object code;
7182 {
7183   unsigned char c1, c2, s1, s2;
7184   Lisp_Object val;
7185
7186   CHECK_NUMBER (code);
7187   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7188   if (s1 == 0)
7189     {
7190       if (s2 < 0x80)
7191         XSETFASTINT (val, s2);
7192       else if (s2 >= 0xA0 || s2 <= 0xDF)
7193         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7194       else
7195         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7196     }
7197   else
7198     {
7199       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7200           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7201         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7202       DECODE_SJIS (s1, s2, c1, c2);
7203       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7204     }
7205   return val;
7206 }
7207
7208 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7209        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7210 Return the corresponding code in SJIS.  */)
7211      (ch)
7212      Lisp_Object ch;
7213 {
7214   int charset, c1, c2, s1, s2;
7215   Lisp_Object val;
7216
7217   CHECK_NUMBER (ch);
7218   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7219   if (charset == CHARSET_ASCII)
7220     {
7221       val = ch;
7222     }
7223   else if (charset == charset_jisx0208
7224            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7225     {
7226       ENCODE_SJIS (c1, c2, s1, s2);
7227       XSETFASTINT (val, (s1 << 8) | s2);
7228     }
7229   else if (charset == charset_katakana_jisx0201
7230            && c1 > 0x20 && c2 < 0xE0)
7231     {
7232       XSETFASTINT (val, c1 | 0x80);
7233     }
7234   else
7235     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7236   return val;
7237 }
7238
7239 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7240        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7241 Return the corresponding character.  */)
7242      (code)
7243      Lisp_Object code;
7244 {
7245   int charset;
7246   unsigned char b1, b2, c1, c2;
7247   Lisp_Object val;
7248
7249   CHECK_NUMBER (code);
7250   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7251   if (b1 == 0)
7252     {
7253       if (b2 >= 0x80)
7254         error ("Invalid BIG5 code: %x", XFASTINT (code));
7255       val = code;
7256     }
7257   else
7258     {
7259       if ((b1 < 0xA1 || b1 > 0xFE)
7260           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7261         error ("Invalid BIG5 code: %x", XFASTINT (code));
7262       DECODE_BIG5 (b1, b2, charset, c1, c2);
7263       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7264     }
7265   return val;
7266 }
7267
7268 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7269        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7270 Return the corresponding character code in Big5.  */)
7271      (ch)
7272      Lisp_Object ch;
7273 {
7274   int charset, c1, c2, b1, b2;
7275   Lisp_Object val;
7276
7277   CHECK_NUMBER (ch);
7278   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7279   if (charset == CHARSET_ASCII)
7280     {
7281       val = ch;
7282     }
7283   else if ((charset == charset_big5_1
7284             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7285            || (charset == charset_big5_2
7286                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7287     {
7288       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7289       XSETFASTINT (val, (b1 << 8) | b2);
7290     }
7291   else
7292     error ("Can't encode to Big5: %d", XFASTINT (ch));
7293   return val;
7294 }
7295 \f
7296 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7297        Sset_terminal_coding_system_internal, 1, 1, 0,
7298        doc: /* Internal use only.  */)
7299      (coding_system)
7300      Lisp_Object coding_system;
7301 {
7302   CHECK_SYMBOL (coding_system);
7303   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7304   /* We had better not send unsafe characters to terminal.  */
7305   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7306   /* Character composition should be disabled.  */
7307   terminal_coding.composing = COMPOSITION_DISABLED;
7308   /* Error notification should be suppressed.  */
7309   terminal_coding.suppress_error = 1;
7310   terminal_coding.src_multibyte = 1;
7311   terminal_coding.dst_multibyte = 0;
7312   return Qnil;
7313 }
7314
7315 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7316        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7317        doc: /* Internal use only.  */)
7318      (coding_system)
7319      Lisp_Object coding_system;
7320 {
7321   CHECK_SYMBOL (coding_system);
7322   setup_coding_system (Fcheck_coding_system (coding_system),
7323                        &safe_terminal_coding);
7324   /* Character composition should be disabled.  */
7325   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7326   /* Error notification should be suppressed.  */
7327   safe_terminal_coding.suppress_error = 1;
7328   safe_terminal_coding.src_multibyte = 1;
7329   safe_terminal_coding.dst_multibyte = 0;
7330   return Qnil;
7331 }
7332
7333 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7334        Sterminal_coding_system, 0, 0, 0,
7335        doc: /* Return coding system specified for terminal output.  */)
7336      ()
7337 {
7338   return terminal_coding.symbol;
7339 }
7340
7341 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7342        Sset_keyboard_coding_system_internal, 1, 1, 0,
7343        doc: /* Internal use only.  */)
7344      (coding_system)
7345      Lisp_Object coding_system;
7346 {
7347   CHECK_SYMBOL (coding_system);
7348   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7349   /* Character composition should be disabled.  */
7350   keyboard_coding.composing = COMPOSITION_DISABLED;
7351   return Qnil;
7352 }
7353
7354 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7355        Skeyboard_coding_system, 0, 0, 0,
7356        doc: /* Return coding system specified for decoding keyboard input.  */)
7357      ()
7358 {
7359   return keyboard_coding.symbol;
7360 }
7361
7362 \f
7363 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7364        Sfind_operation_coding_system,  1, MANY, 0,
7365        doc: /* Choose a coding system for an operation based on the target name.
7366 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7367 DECODING-SYSTEM is the coding system to use for decoding
7368 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7369 for encoding (in case OPERATION does encoding).
7370
7371 The first argument OPERATION specifies an I/O primitive:
7372   For file I/O, `insert-file-contents' or `write-region'.
7373   For process I/O, `call-process', `call-process-region', or `start-process'.
7374   For network I/O, `open-network-stream'.
7375
7376 The remaining arguments should be the same arguments that were passed
7377 to the primitive.  Depending on which primitive, one of those arguments
7378 is selected as the TARGET.  For example, if OPERATION does file I/O,
7379 whichever argument specifies the file name is TARGET.
7380
7381 TARGET has a meaning which depends on OPERATION:
7382   For file I/O, TARGET is a file name.
7383   For process I/O, TARGET is a process name.
7384   For network I/O, TARGET is a service name or a port number
7385
7386 This function looks up what specified for TARGET in,
7387 `file-coding-system-alist', `process-coding-system-alist',
7388 or `network-coding-system-alist' depending on OPERATION.
7389 They may specify a coding system, a cons of coding systems,
7390 or a function symbol to call.
7391 In the last case, we call the function with one argument,
7392 which is a list of all the arguments given to this function.
7393
7394 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7395      (nargs, args)
7396      int nargs;
7397      Lisp_Object *args;
7398 {
7399   Lisp_Object operation, target_idx, target, val;
7400   register Lisp_Object chain;
7401
7402   if (nargs < 2)
7403     error ("Too few arguments");
7404   operation = args[0];
7405   if (!SYMBOLP (operation)
7406       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7407     error ("Invalid first argument");
7408   if (nargs < 1 + XINT (target_idx))
7409     error ("Too few arguments for operation: %s",
7410            SDATA (SYMBOL_NAME (operation)));
7411   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7412      argument to write-region) is string, it must be treated as a
7413      target file name.  */
7414   if (EQ (operation, Qwrite_region)
7415       && nargs > 5
7416       && STRINGP (args[5]))
7417     target_idx = make_number (4);
7418   target = args[XINT (target_idx) + 1];
7419   if (!(STRINGP (target)
7420         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7421     error ("Invalid argument %d", XINT (target_idx) + 1);
7422
7423   chain = ((EQ (operation, Qinsert_file_contents)
7424             || EQ (operation, Qwrite_region))
7425            ? Vfile_coding_system_alist
7426            : (EQ (operation, Qopen_network_stream)
7427               ? Vnetwork_coding_system_alist
7428               : Vprocess_coding_system_alist));
7429   if (NILP (chain))
7430     return Qnil;
7431
7432   for (; CONSP (chain); chain = XCDR (chain))
7433     {
7434       Lisp_Object elt;
7435       elt = XCAR (chain);
7436
7437       if (CONSP (elt)
7438           && ((STRINGP (target)
7439                && STRINGP (XCAR (elt))
7440                && fast_string_match (XCAR (elt), target) >= 0)
7441               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7442         {
7443           val = XCDR (elt);
7444           /* Here, if VAL is both a valid coding system and a valid
7445              function symbol, we return VAL as a coding system.  */
7446           if (CONSP (val))
7447             return val;
7448           if (! SYMBOLP (val))
7449             return Qnil;
7450           if (! NILP (Fcoding_system_p (val)))
7451             return Fcons (val, val);
7452           if (! NILP (Ffboundp (val)))
7453             {
7454               val = call1 (val, Flist (nargs, args));
7455               if (CONSP (val))
7456                 return val;
7457               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7458                 return Fcons (val, val);
7459             }
7460           return Qnil;
7461         }
7462     }
7463   return Qnil;
7464 }
7465
7466 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7467        Supdate_coding_systems_internal, 0, 0, 0,
7468        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7469 When values of any coding categories are changed, you must
7470 call this function.  */)
7471      ()
7472 {
7473   int i;
7474
7475   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7476     {
7477       Lisp_Object val;
7478
7479       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7480       if (!NILP (val))
7481         {
7482           if (! coding_system_table[i])
7483             coding_system_table[i] = ((struct coding_system *)
7484                                       xmalloc (sizeof (struct coding_system)));
7485           setup_coding_system (val, coding_system_table[i]);
7486         }
7487       else if (coding_system_table[i])
7488         {
7489           xfree (coding_system_table[i]);
7490           coding_system_table[i] = NULL;
7491         }
7492     }
7493
7494   return Qnil;
7495 }
7496
7497 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7498        Sset_coding_priority_internal, 0, 0, 0,
7499        doc: /* Update internal database for the current value of `coding-category-list'.
7500 This function is internal use only.  */)
7501      ()
7502 {
7503   int i = 0, idx;
7504   Lisp_Object val;
7505
7506   val = Vcoding_category_list;
7507
7508   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7509     {
7510       if (! SYMBOLP (XCAR (val)))
7511         break;
7512       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7513       if (idx >= CODING_CATEGORY_IDX_MAX)
7514         break;
7515       coding_priorities[i++] = (1 << idx);
7516       val = XCDR (val);
7517     }
7518   /* If coding-category-list is valid and contains all coding
7519      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7520      the following code saves Emacs from crashing.  */
7521   while (i < CODING_CATEGORY_IDX_MAX)
7522     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7523
7524   return Qnil;
7525 }
7526
7527 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7528        Sdefine_coding_system_internal, 1, 1, 0,
7529        doc: /* Register CODING-SYSTEM as a base coding system.
7530 This function is internal use only.  */)
7531      (coding_system)
7532      Lisp_Object coding_system;
7533 {
7534   Lisp_Object safe_chars, slot;
7535
7536   if (NILP (Fcheck_coding_system (coding_system)))
7537     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7538   safe_chars = coding_safe_chars (coding_system);
7539   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7540     error ("No valid safe-chars property for %s",
7541            SDATA (SYMBOL_NAME (coding_system)));
7542   if (EQ (safe_chars, Qt))
7543     {
7544       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7545         XSETCAR (Vcoding_system_safe_chars,
7546                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7547     }
7548   else
7549     {
7550       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7551       if (NILP (slot))
7552         XSETCDR (Vcoding_system_safe_chars,
7553                  nconc2 (XCDR (Vcoding_system_safe_chars),
7554                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7555       else
7556         XSETCDR (slot, safe_chars);
7557     }
7558   return Qnil;
7559 }
7560
7561 #endif /* emacs */
7562
7563 \f
7564 /*** 9. Post-amble ***/
7565
/* Fill in the byte-class tables for the emacs-mule and ISO2022
   handlers, reset the global coding structures to the nil coding
   system, clear the coding system table, build the ASCII skip table,
   and choose the system end-of-line type.  */
void
init_coding_once ()
{
  int i;

  /* Emacs' internal format specific initialize routine.  */
  /* The loops set a default class for each range; the single-index
     assignments that follow them override individual entries, so
     the order of these statements matters.  */
  for (i = 0; i <= 0x20; i++)
    emacs_code_class[i] = EMACS_control_code;
  emacs_code_class[0x0A] = EMACS_linefeed_code;
  emacs_code_class[0x0D] = EMACS_carriage_return_code;
  for (i = 0x21 ; i < 0x7F; i++)
    emacs_code_class[i] = EMACS_ascii_code;
  emacs_code_class[0x7F] = EMACS_control_code;
  /* NOTE(review): this loop stops before 0xFF, so that entry is not
     assigned here -- confirm that is intended.  */
  for (i = 0x80; i < 0xFF; i++)
    emacs_code_class[i] = EMACS_invalid_code;
  emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
  emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
  emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;

  /* ISO2022 specific initialize routine.  */
  /* 0x20, 0x7F, 0xA0 and 0xFF are skipped by the loops and given
     their own classes afterwards; the special control codes then
     override the generic control class.  */
  for (i = 0; i < 0x20; i++)
    iso_code_class[i] = ISO_control_0;
  for (i = 0x21; i < 0x7F; i++)
    iso_code_class[i] = ISO_graphic_plane_0;
  for (i = 0x80; i < 0xA0; i++)
    iso_code_class[i] = ISO_control_1;
  for (i = 0xA1; i < 0xFF; i++)
    iso_code_class[i] = ISO_graphic_plane_1;
  iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
  iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
  iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
  iso_code_class[ISO_CODE_SO] = ISO_shift_out;
  iso_code_class[ISO_CODE_SI] = ISO_shift_in;
  iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
  iso_code_class[ISO_CODE_ESC] = ISO_escape;
  iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
  iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
  iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;

  /* Set up the global coding structures with the nil coding
     system.  */
  setup_coding_system (Qnil, &keyboard_coding);
  setup_coding_system (Qnil, &terminal_coding);
  setup_coding_system (Qnil, &safe_terminal_coding);
  setup_coding_system (Qnil, &default_buffer_file_coding);

  bzero (coding_system_table, sizeof coding_system_table);

  /* Mark the first 128 byte values (the ASCII range) in
     ascii_skip_code; every other entry stays zero.  */
  bzero (ascii_skip_code, sizeof ascii_skip_code);
  for (i = 0; i < 128; i++)
    ascii_skip_code[i] = 1;

  /* CRLF line ends on MSDOS and WINDOWSNT builds, LF elsewhere.  */
#if defined (MSDOS) || defined (WINDOWSNT)
  system_eol_type = CODING_EOL_CRLF;
#else
  system_eol_type = CODING_EOL_LF;
#endif

  inhibit_pre_post_conversion = 0;
}
7625
7626 #ifdef emacs
7627
7628 void
7629 syms_of_coding ()
7630 {
7631   staticpro (&Vcode_conversion_workbuf_name);
7632   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7633
7634   Qtarget_idx = intern ("target-idx");
7635   staticpro (&Qtarget_idx);
7636
7637   Qcoding_system_history = intern ("coding-system-history");
7638   staticpro (&Qcoding_system_history);
7639   Fset (Qcoding_system_history, Qnil);
7640
7641   /* Target FILENAME is the first argument.  */
7642   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7643   /* Target FILENAME is the third argument.  */
7644   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7645
7646   Qcall_process = intern ("call-process");
7647   staticpro (&Qcall_process);
7648   /* Target PROGRAM is the first argument.  */
7649   Fput (Qcall_process, Qtarget_idx, make_number (0));
7650
7651   Qcall_process_region = intern ("call-process-region");
7652   staticpro (&Qcall_process_region);
7653   /* Target PROGRAM is the third argument.  */
7654   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7655
7656   Qstart_process = intern ("start-process");
7657   staticpro (&Qstart_process);
7658   /* Target PROGRAM is the third argument.  */
7659   Fput (Qstart_process, Qtarget_idx, make_number (2));
7660
7661   Qopen_network_stream = intern ("open-network-stream");
7662   staticpro (&Qopen_network_stream);
7663   /* Target SERVICE is the fourth argument.  */
7664   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7665
7666   Qcoding_system = intern ("coding-system");
7667   staticpro (&Qcoding_system);
7668
7669   Qeol_type = intern ("eol-type");
7670   staticpro (&Qeol_type);
7671
7672   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7673   staticpro (&Qbuffer_file_coding_system);
7674
7675   Qpost_read_conversion = intern ("post-read-conversion");
7676   staticpro (&Qpost_read_conversion);
7677
7678   Qpre_write_conversion = intern ("pre-write-conversion");
7679   staticpro (&Qpre_write_conversion);
7680
7681   Qno_conversion = intern ("no-conversion");
7682   staticpro (&Qno_conversion);
7683
7684   Qundecided = intern ("undecided");
7685   staticpro (&Qundecided);
7686
7687   Qcoding_system_p = intern ("coding-system-p");
7688   staticpro (&Qcoding_system_p);
7689
7690   Qcoding_system_error = intern ("coding-system-error");
7691   staticpro (&Qcoding_system_error);
7692
7693   Fput (Qcoding_system_error, Qerror_conditions,
7694         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7695   Fput (Qcoding_system_error, Qerror_message,
7696         build_string ("Invalid coding system"));
7697
7698   Qcoding_category = intern ("coding-category");
7699   staticpro (&Qcoding_category);
7700   Qcoding_category_index = intern ("coding-category-index");
7701   staticpro (&Qcoding_category_index);
7702
7703   Vcoding_category_table
7704     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7705   staticpro (&Vcoding_category_table);
7706   {
7707     int i;
7708     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7709       {
7710         XVECTOR (Vcoding_category_table)->contents[i]
7711           = intern (coding_category_name[i]);
7712         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7713               Qcoding_category_index, make_number (i));
7714       }
7715   }
7716
7717   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7718   staticpro (&Vcoding_system_safe_chars);
7719
7720   Qtranslation_table = intern ("translation-table");
7721   staticpro (&Qtranslation_table);
7722   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7723
7724   Qtranslation_table_id = intern ("translation-table-id");
7725   staticpro (&Qtranslation_table_id);
7726
7727   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7728   staticpro (&Qtranslation_table_for_decode);
7729
7730   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7731   staticpro (&Qtranslation_table_for_encode);
7732
7733   Qsafe_chars = intern ("safe-chars");
7734   staticpro (&Qsafe_chars);
7735
7736   Qchar_coding_system = intern ("char-coding-system");
7737   staticpro (&Qchar_coding_system);
7738
7739   /* Intern this now in case it isn't already done.
7740      Setting this variable twice is harmless.
7741      But don't staticpro it here--that is done in alloc.c.  */
7742   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7743   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7744   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7745
7746   Qvalid_codes = intern ("valid-codes");
7747   staticpro (&Qvalid_codes);
7748
7749   Qemacs_mule = intern ("emacs-mule");
7750   staticpro (&Qemacs_mule);
7751
7752   Qraw_text = intern ("raw-text");
7753   staticpro (&Qraw_text);
7754
7755   Qutf_8 = intern ("utf-8");
7756   staticpro (&Qutf_8);
7757
7758   Qcoding_system_define_form = intern ("coding-system-define-form");
7759   staticpro (&Qcoding_system_define_form);
7760
7761   defsubr (&Scoding_system_p);
7762   defsubr (&Sread_coding_system);
7763   defsubr (&Sread_non_nil_coding_system);
7764   defsubr (&Scheck_coding_system);
7765   defsubr (&Sdetect_coding_region);
7766   defsubr (&Sdetect_coding_string);
7767   defsubr (&Sfind_coding_systems_region_internal);
7768   defsubr (&Sunencodable_char_position);
7769   defsubr (&Sdecode_coding_region);
7770   defsubr (&Sencode_coding_region);
7771   defsubr (&Sdecode_coding_string);
7772   defsubr (&Sencode_coding_string);
7773   defsubr (&Sdecode_sjis_char);
7774   defsubr (&Sencode_sjis_char);
7775   defsubr (&Sdecode_big5_char);
7776   defsubr (&Sencode_big5_char);
7777   defsubr (&Sset_terminal_coding_system_internal);
7778   defsubr (&Sset_safe_terminal_coding_system_internal);
7779   defsubr (&Sterminal_coding_system);
7780   defsubr (&Sset_keyboard_coding_system_internal);
7781   defsubr (&Skeyboard_coding_system);
7782   defsubr (&Sfind_operation_coding_system);
7783   defsubr (&Supdate_coding_systems_internal);
7784   defsubr (&Sset_coding_priority_internal);
7785   defsubr (&Sdefine_coding_system_internal);
7786
7787   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7788                doc: /* List of coding systems.
7789
7790 Do not alter the value of this variable manually.  This variable should be
7791 updated by the functions `make-coding-system' and
7792 `define-coding-system-alias'.  */);
7793   Vcoding_system_list = Qnil;
7794
7795   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7796                doc: /* Alist of coding system names.
7797 Each element is one element list of coding system name.
7798 This variable is given to `completing-read' as TABLE argument.
7799
7800 Do not alter the value of this variable manually.  This variable should be
7801 updated by the functions `make-coding-system' and
7802 `define-coding-system-alias'.  */);
7803   Vcoding_system_alist = Qnil;
7804
7805   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7806                doc: /* List of coding-categories (symbols) ordered by priority.
7807
7808 On detecting a coding system, Emacs tries code detection algorithms
7809 associated with each coding-category one by one in this order.  When
7810 one algorithm agrees with a byte sequence of source text, the coding
7811 system bound to the corresponding coding-category is selected.  */);
7812   {
7813     int i;
7814
7815     Vcoding_category_list = Qnil;
7816     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7817       Vcoding_category_list
7818         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7819                  Vcoding_category_list);
7820   }
7821
7822   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7823                doc: /* Specify the coding system for read operations.
7824 It is useful to bind this variable with `let', but do not set it globally.
7825 If the value is a coding system, it is used for decoding on read operation.
7826 If not, an appropriate element is used from one of the coding system alists:
7827 There are three such tables, `file-coding-system-alist',
7828 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7829   Vcoding_system_for_read = Qnil;
7830
7831   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7832                doc: /* Specify the coding system for write operations.
7833 Programs bind this variable with `let', but you should not set it globally.
7834 If the value is a coding system, it is used for encoding of output,
7835 when writing it to a file and when sending it to a file or subprocess.
7836
7837 If this does not specify a coding system, an appropriate element
7838 is used from one of the coding system alists:
7839 There are three such tables, `file-coding-system-alist',
7840 `process-coding-system-alist', and `network-coding-system-alist'.
7841 For output to files, if the above procedure does not specify a coding system,
7842 the value of `buffer-file-coding-system' is used.  */);
7843   Vcoding_system_for_write = Qnil;
7844
7845   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7846                doc: /* Coding system used in the latest file or process I/O.
7847 Also set by `encode-coding-region', `decode-coding-region',
7848 `encode-coding-string' and `decode-coding-string'.  */);
7849   Vlast_coding_system_used = Qnil;
7850
7851   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7852                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7853 See info node `Coding Systems' and info node `Text and Binary' concerning
7854 such conversion.  */);
7855   inhibit_eol_conversion = 0;
7856
7857   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7858                doc: /* Non-nil means process buffer inherits coding system of process output.
7859 Bind it to t if the process output is to be treated as if it were a file
7860 read from some filesystem.  */);
7861   inherit_process_coding_system = 0;
7862
7863   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7864                doc: /* Alist to decide a coding system to use for a file I/O operation.
7865 The format is ((PATTERN . VAL) ...),
7866 where PATTERN is a regular expression matching a file name,
7867 VAL is a coding system, a cons of coding systems, or a function symbol.
7868 If VAL is a coding system, it is used for both decoding and encoding
7869 the file contents.
7870 If VAL is a cons of coding systems, the car part is used for decoding,
7871 and the cdr part is used for encoding.
7872 If VAL is a function symbol, the function must return a coding system
7873 or a cons of coding systems which are used as above.  The function gets
7874 the arguments with which `find-operation-coding-system' was called.
7875
7876 See also the function `find-operation-coding-system'
7877 and the variable `auto-coding-alist'.  */);
7878   Vfile_coding_system_alist = Qnil;
7879
7880   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7881     doc: /* Alist to decide a coding system to use for a process I/O operation.
7882 The format is ((PATTERN . VAL) ...),
7883 where PATTERN is a regular expression matching a program name,
7884 VAL is a coding system, a cons of coding systems, or a function symbol.
7885 If VAL is a coding system, it is used for both decoding what received
7886 from the program and encoding what sent to the program.
7887 If VAL is a cons of coding systems, the car part is used for decoding,
7888 and the cdr part is used for encoding.
7889 If VAL is a function symbol, the function must return a coding system
7890 or a cons of coding systems which are used as above.
7891
7892 See also the function `find-operation-coding-system'.  */);
7893   Vprocess_coding_system_alist = Qnil;
7894
7895   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7896     doc: /* Alist to decide a coding system to use for a network I/O operation.
7897 The format is ((PATTERN . VAL) ...),
7898 where PATTERN is a regular expression matching a network service name
7899 or is a port number to connect to,
7900 VAL is a coding system, a cons of coding systems, or a function symbol.
7901 If VAL is a coding system, it is used for both decoding what received
7902 from the network stream and encoding what sent to the network stream.
7903 If VAL is a cons of coding systems, the car part is used for decoding,
7904 and the cdr part is used for encoding.
7905 If VAL is a function symbol, the function must return a coding system
7906 or a cons of coding systems which are used as above.
7907
7908 See also the function `find-operation-coding-system'.  */);
7909   Vnetwork_coding_system_alist = Qnil;
7910
7911   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7912                doc: /* Coding system to use with system messages.
7913 Also used for decoding keyboard input on X Window system.  */);
7914   Vlocale_coding_system = Qnil;
7915
7916   /* The eol mnemonics are reset in startup.el system-dependently.  */
7917   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7918                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7919   eol_mnemonic_unix = build_string (":");
7920
7921   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7922                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7923   eol_mnemonic_dos = build_string ("\\");
7924
7925   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7926                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7927   eol_mnemonic_mac = build_string ("/");
7928
7929   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7930                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7931   eol_mnemonic_undecided = build_string (":");
7932
7933   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7934                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7935   Venable_character_translation = Qt;
7936
7937   DEFVAR_LISP ("standard-translation-table-for-decode",
7938                &Vstandard_translation_table_for_decode,
7939                doc: /* Table for translating characters while decoding.  */);
7940   Vstandard_translation_table_for_decode = Qnil;
7941
7942   DEFVAR_LISP ("standard-translation-table-for-encode",
7943                &Vstandard_translation_table_for_encode,
7944                doc: /* Table for translating characters while encoding.  */);
7945   Vstandard_translation_table_for_encode = Qnil;
7946
7947   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7948                doc: /* Alist of charsets vs revision numbers.
7949 While encoding, if a charset (car part of an element) is found,
7950 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7951   Vcharset_revision_alist = Qnil;
7952
7953   DEFVAR_LISP ("default-process-coding-system",
7954                &Vdefault_process_coding_system,
7955                doc: /* Cons of coding systems used for process I/O by default.
7956 The car part is used for decoding a process output,
7957 the cdr part is used for encoding a text to be sent to a process.  */);
7958   Vdefault_process_coding_system = Qnil;
7959
7960   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7961                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7962 This is a vector of length 256.
7963 If Nth element is non-nil, the existence of code N in a file
7964 \(or output of subprocess) doesn't prevent it to be detected as
7965 a coding system of ISO 2022 variant which has a flag
7966 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7967 or reading output of a subprocess.
7968 Only 128th through 159th elements has a meaning.  */);
7969   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7970
7971   DEFVAR_LISP ("select-safe-coding-system-function",
7972                &Vselect_safe_coding_system_function,
7973                doc: /* Function to call to select safe coding system for encoding a text.
7974
7975 If set, this function is called to force a user to select a proper
7976 coding system which can encode the text in the case that a default
7977 coding system used in each operation can't encode the text.
7978
7979 The default value is `select-safe-coding-system' (which see).  */);
7980   Vselect_safe_coding_system_function = Qnil;
7981
7982   DEFVAR_BOOL ("coding-system-require-warning",
7983                &coding_system_require_warning,
7984                doc: /* Internal use only.
7985 If non-nil, on writing a file, `select-safe-coding-system-function' is
7986 called even if `coding-system-for-write' is non-nil.  The command
7987 `universal-coding-system-argument' binds this variable to t temporarily.  */);
7988   coding_system_require_warning = 0;
7989
7990
7991   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7992                &inhibit_iso_escape_detection,
7993                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7994
7995 By default, on reading a file, Emacs tries to detect how the text is
7996 encoded.  This code detection is sensitive to escape sequences.  If
7997 the sequence is valid as ISO2022, the code is determined as one of
7998 the ISO2022 encodings, and the file is decoded by the corresponding
7999 coding system (e.g. `iso-2022-7bit').
8000
8001 However, there may be a case that you want to read escape sequences in
8002 a file as is.  In such a case, you can set this variable to non-nil.
8003 Then, as the code detection ignores any escape sequences, no file is
8004 detected as encoded in some ISO2022 encoding.  The result is that all
8005 escape sequences become visible in a buffer.
8006
8007 The default value is nil, and it is strongly recommended not to change
8008 it.  That is because many Emacs Lisp source files that contain
8009 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8010 in Emacs's distribution, and they won't be decoded correctly on
8011 reading if you suppress escape sequence detection.
8012
8013 The other way to read escape sequences in a file without decoding is
8014 to explicitly specify some coding system that doesn't use ISO2022's
8015 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8016   inhibit_iso_escape_detection = 0;
8017
8018   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8019                doc: /* Char table for translating self-inserting characters.
8020 This is applied to the result of input methods, not their input.  See also
8021 `keyboard-translate-table'.  */);
8022     Vtranslation_table_for_input = Qnil;
8023 }
8024
8025 char *
8026 emacs_strerror (error_number)
8027      int error_number;
8028 {
8029   char *str;
8030
8031   synchronize_system_messages_locale ();
8032   str = strerror (error_number);
8033
8034   if (! NILP (Vlocale_coding_system))
8035     {
8036       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8037                                                       Vlocale_coding_system,
8038                                                       0);
8039       str = (char *) SDATA (dec);
8040     }
8041
8042   return str;
8043 }
8044
8045 #endif /* emacs */
8046
8047 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8048    (do not change this comment) */