src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   4      National Institute of Advanced Industrial Science and Technology (AIST)
   5      Registration Number H14PRO021
   6
   7 This file is part of GNU Emacs.
   8
   9 GNU Emacs is free software; you can redistribute it and/or modify
  10 it under the terms of the GNU General Public License as published by
  11 the Free Software Foundation; either version 2, or (at your option)
  12 any later version.
  13
  14 GNU Emacs is distributed in the hope that it will be useful,
  15 but WITHOUT ANY WARRANTY; without even the implied warranty of
  16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17 GNU General Public License for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GNU Emacs; see the file COPYING.  If not, write to
  21 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  22 Boston, MA 02110-1301, USA.  */
  23
  24 /*** TABLE OF CONTENTS ***
  25
  26   0. General comments
  27   1. Preamble
  28   2. Emacs' internal format (emacs-mule) handlers
  29   3. ISO2022 handlers
  30   4. Shift-JIS and BIG5 handlers
  31   5. CCL handlers
  32   6. End-of-line handlers
  33   7. C library functions
  34   8. Emacs Lisp library functions
  35   9. Post-amble
  36
  37 */
  38
  39 /*** 0. General comments ***/
  40
  41
  42 /*** GENERAL NOTE on CODING SYSTEMS ***
  43
  44   A coding system is an encoding mechanism for one or more character
  45   sets.  Here's a list of coding systems which Emacs can handle.  When
  46   we say "decode", it means converting some other coding system to
  47   Emacs' internal format (emacs-mule), and when we say "encode",
  48   it means converting the coding system emacs-mule to some other
  49   coding system.
  50
  51   0. Emacs' internal format (emacs-mule)
  52
  53   Emacs itself holds a multi-lingual character in buffers and strings
  54   in a special format.  Details are described in section 2.
  55
  56   1. ISO2022
  57
  58   The most famous coding system for multiple character sets.  X's
  59   Compound Text, various EUCs (Extended Unix Code), and coding
  60   systems used in Internet communication such as ISO-2022-JP are
  61   all variants of ISO2022.  Details are described in section 3.
  62
  63   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  64
  65   A coding system to encode character sets: ASCII, JISX0201, and
  66   JISX0208.  Widely used for PC's in Japan.  Details are described in
  67   section 4.
  68
  69   3. BIG5
  70
  71   A coding system to encode the character sets ASCII and Big5.  Widely
  72   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  73   described in section 4.  In this file, when we write "BIG5"
  74   (all uppercase), we mean the coding system, and when we write
  75   "Big5" (capitalized), we mean the character set.
  76
  77   4. Raw text
  78
  79   A coding system for text containing random 8-bit code.  Emacs does
  80   no code conversion on such text except for end-of-line format.
  81
  82   5. Other
  83
  84   If a user wants to read/write text encoded in a coding system not
  85   listed above, he can supply a decoder and an encoder for it as CCL
  86   (Code Conversion Language) programs.  Emacs executes the CCL program
  87   while reading/writing.
  88
  89   Emacs represents a coding system by a Lisp symbol that has a property
  90   `coding-system'.  But, before actually using the coding system, the
  91   information about it is set in a structure of type `struct
  92   coding_system' for rapid processing.  See section 6 for more details.
  93
  94 */
  95
  96 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  97
  98   How end-of-line of text is encoded depends on the operating system.
  99   For instance, Unix's format is just one byte of `line-feed' code,
 100   whereas DOS's format is two-byte sequence of `carriage-return' and
 101   `line-feed' codes.  MacOS's format is usually one byte of
 102   `carriage-return'.
 103
 104   Since text character encoding and end-of-line encoding are
 105   independent, any coding system described above can have any
 106   end-of-line format.  So Emacs has information about end-of-line
 107   format in each coding-system.  See section 6 for more details.
 108
 109 */
 110
 111 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 112
 113   These functions check if a text between SRC and SRC_END is encoded
 114   in the coding system category XXX.  Each returns an integer value in
 115   which appropriate flag bits for the category XXX are set.  The flag
 116   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 117   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 118   of the range 0x80..0x9F are in multibyte form.  */
 119 #if 0
 120 int
 121 detect_coding_emacs_mule (src, src_end, multibytep)
 122      unsigned char *src, *src_end;
 123      int multibytep;
 124 {
 125   ...
 126 }
 127 #endif
 128
 129 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 130
 131   These functions decode SRC_BYTES length of unibyte text at SOURCE
 132   encoded in CODING to Emacs' internal format.  The resulting
 133   multibyte text goes to a place pointed to by DESTINATION, the length
 134   of which should not exceed DST_BYTES.
 135
 136   These functions set the information about original and decoded texts
 137   in the members `produced', `produced_char', `consumed', and
 138   `consumed_char' of the structure *CODING.  They also set the member
 139   `result' to one of CODING_FINISH_XXX indicating how the decoding
 140   finished.
 141
 142   DST_BYTES zero means that the source area and destination area are
 143   overlapped, which means that we can produce a decoded text until it
 144   reaches the head of the not-yet-decoded source text.
 145
 146   Below is a template for these functions.  */
 147 #if 0
 148 static void
 149 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 150      struct coding_system *coding;
 151      const unsigned char *source;
 152      unsigned char *destination;
 153      int src_bytes, dst_bytes;
 154 {
 155   ...
 156 }
 157 #endif
 158
 159 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 160
 161   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 162   internal multibyte format to CODING.  The resulting unibyte text
 163   goes to a place pointed to by DESTINATION, the length of which
 164   should not exceed DST_BYTES.
 165
 166   These functions set the information about original and encoded texts
 167   in the members `produced', `produced_char', `consumed', and
 168   `consumed_char' of the structure *CODING.  They also set the member
 169   `result' to one of CODING_FINISH_XXX indicating how the encoding
 170   finished.
 171
 172   DST_BYTES zero means that the source area and destination area are
 173   overlapped, which means that we can produce encoded text until it
 174   reaches the head of the not-yet-encoded source text.
 175
 176   Below is a template for these functions.  */
 177 #if 0
 178 static void
 179 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 180      struct coding_system *coding;
 181      unsigned char *source, *destination;
 182      int src_bytes, dst_bytes;
 183 {
 184   ...
 185 }
 186 #endif
 187
 188 /*** COMMONLY USED MACROS ***/
 189
 190 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
 191    get one and two bytes from the source text respectively.  If there
 192    are not enough bytes, they read nothing, set coding->result to
 193    CODING_FINISH_INSUFFICIENT_SRC, and jump to `label_end_of_loop'.
 194    The caller should set variables `coding', `src' and `src_end' to
 195    appropriate pointers in advance.  These macros are called from
 196    decoding routines `decode_coding_XXX', thus it is assumed that the
        source text is unibyte.  */
 197
 198 #define ONE_MORE_BYTE(c1)                                       \
 199   do {                                                          \
 200     if (src >= src_end)         /* Source exhausted.  */        \
 201       {                                                         \
 202         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 203         goto label_end_of_loop;                                 \
 204       }                                                         \
 205     c1 = *src++;                /* Consume one byte.  */        \
 206   } while (0)
 207
 208 #define TWO_MORE_BYTES(c1, c2)                                  \
 209   do {                                                          \
         /* Store the next two source bytes in C1 and C2.  */       \
 210     if (src + 1 >= src_end)     /* Fewer than two left.  */     \
 211       {                                                         \
 212         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 213         goto label_end_of_loop;                                 \
 214       }                                                         \
 215     c1 = *src++;                                                \
 216     c2 = *src++;                                                \
 217   } while (0)
 218
 219
 220 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 221    form if MULTIBYTEP is nonzero.  In that case, when the byte read is
        LEADING_CODE_8_BIT_CONTROL, one more byte is read and C1 is set to
        that byte minus 0x20.
        NOTE(review): that second byte is read without comparing `src'
        against `src_end'; presumably callers guarantee that the source
        never ends in the middle of such a sequence -- confirm.  */
 222
 223 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 224   do {                                                          \
 225     if (src >= src_end)                                         \
 226       {                                                         \
 227         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 228         goto label_end_of_loop;                                 \
 229       }                                                         \
 230     c1 = *src++;                                                \
 231     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 232       c1 = *src++ - 0x20;       /* Recover the 8-bit code.  */  \
 233   } while (0)
 234
 235 /* Set C to the next character at the source text pointed by `src'
 236    and advance `src' past it.  If no source is left, set
 237    coding->result to CODING_FINISH_INSUFFICIENT_SRC and jump to
 238    `label_end_of_loop'.  The caller should set variables `coding',
 239    `src', `src_end', and `translation_table' to appropriate values
 240    in advance; if `translation_table' is non-nil, C is the result of
 241    passing the character through translate_char.  This macro is used
 242    in encoding routines `encode_coding_XXX', thus it assumes that the
 243    source text is in multibyte form except for 8-bit characters.
        8-bit characters are in multibyte form if coding->src_multibyte
        is nonzero, else they are represented by a single byte.  */
 244
 245 #define ONE_MORE_CHAR(c)                                        \
 246   do {                                                          \
 247     int len = src_end - src;                                    \
 248     int bytes;                                                  \
 249     if (len <= 0)                                               \
 250       {                                                         \
 251         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 252         goto label_end_of_loop;                                 \
 253       }                                                         \
 254     if (coding->src_multibyte                                   \
 255         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 256       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 257     else                                                        \
 258       c = *src, bytes = 1;      /* A single raw byte.  */       \
 259     if (!NILP (translation_table))                              \
 260       c = translate_char (translation_table, c, -1, 0, 0);      \
 261     src += bytes;                                               \
 262   } while (0)
 263
 264
 265 /* Produce a multibyte form of character C to `dst'.  Jump to
 266    `label_end_of_loop' if there's not enough space at `dst'.
        In that case coding->result is set to
        CODING_FINISH_INSUFFICIENT_DST.  The space ends at `dst_end', or
        at `src' when `dst_bytes' is zero.  On success
        coding->produced_char is incremented.
 267
 268    If we are now in the middle of a composition sequence, the decoded
 269    character may be ALTCHAR (for the current composition).  In that
 270    case, the character goes to coding->cmp_data->data instead of
 271    `dst'.
        More precisely: C is written to `dst' when no composition is in
        progress or the method is COMPOSITION_RELATIVE or
        COMPOSITION_WITH_RULE, and it is recorded as a composition
        component for every method except COMPOSITION_RELATIVE.
 272
 273    This macro is used in decoding routines.  */
 274
 275 #define EMIT_CHAR(c)                                                    \
 276   do {                                                                  \
 277     if (! COMPOSING_P (coding)                                          \
 278         || coding->composing == COMPOSITION_RELATIVE                    \
 279         || coding->composing == COMPOSITION_WITH_RULE)                  \
 280       {                                                                 \
 281         int bytes = CHAR_BYTES (c);                                     \
 282         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 283           {                                                             \
 284             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 285             goto label_end_of_loop;                                     \
 286           }                                                             \
 287         dst += CHAR_STRING (c, dst);                                    \
 288         coding->produced_char++;                                        \
 289       }                                                                 \
 290                                                                         \
 291     if (COMPOSING_P (coding)                                            \
 292         && coding->composing != COMPOSITION_RELATIVE)                   \
 293       {                                                                 \
 294         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 295         coding->composition_rule_follows                                \
 296           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 297       }                                                                 \
 298   } while (0)
 299
 300
 301 #define EMIT_ONE_BYTE(c)                                        \
 302   do {                                                          \
         /* Store byte C at `dst' unless the output is full.  */    \
 303     if (dst >= (dst_bytes ? dst_end : src))                     \
 304       {                                                         \
 305         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 306         goto label_end_of_loop;                                 \
 307       }                                                         \
 308     *dst++ = c;                                                 \
 309   } while (0)
 310
 311 #define EMIT_TWO_BYTES(c1, c2)                                  \
 312   do {                                                          \
         /* Store bytes C1 and C2 at `dst', or neither of them.  */ \
 313     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 314       {                                                         \
 315         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 316         goto label_end_of_loop;                                 \
 317       }                                                         \
 318     *dst++ = c1, *dst++ = c2;                                   \
 319   } while (0)
 320
 321 #define EMIT_BYTES(from, to)                                    \
 322   do {                                                          \
         /* Copy [FROM, TO) to `dst', advancing FROM up to TO.  */  \
 323     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 324       {                                                         \
 325         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 326         goto label_end_of_loop;                                 \
 327       }                                                         \
 328     while (from < to)                                           \
 329       *dst++ = *from++;                                         \
 330   } while (0)
 331
 332 \f
 333 /*** 1. Preamble ***/
 334
 335 #ifdef emacs
 336 #include <config.h>
 337 #endif
 338
 339 #include <stdio.h>
 340
 341 #ifdef emacs
 342
 343 #include "lisp.h"
 344 #include "buffer.h"
 345 #include "charset.h"
 346 #include "composite.h"
 347 #include "ccl.h"
 348 #include "coding.h"
 349 #include "window.h"
 350 #include "intervals.h"
 351
 352 #else  /* not emacs */
 353
 354 #include "mulelib.h"
 355
 356 #endif /* not emacs */
 357
 358 Lisp_Object Qcoding_system, Qeol_type;
 359 Lisp_Object Qbuffer_file_coding_system;
 360 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 361 Lisp_Object Qno_conversion, Qundecided;
 362 Lisp_Object Qcoding_system_history;
 363 Lisp_Object Qsafe_chars;
 364 Lisp_Object Qvalid_codes;
     /* Of the objects above, coding_safe_chars uses Qcoding_system as the
        property under which a coding system symbol keeps its spec vector,
        and Qsafe_chars as a key in the plist held by that vector.
        NOTE(review): the others are presumably symbols initialized
        elsewhere in this file -- confirm.  */
 365
 366 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 367 Lisp_Object Qcall_process, Qcall_process_region;
 368 Lisp_Object Qstart_process, Qopen_network_stream;
 369 Lisp_Object Qtarget_idx;
 370
 371 /* If a symbol has this property, evaluate the value to define the
 372    symbol as a coding system.  */
 373 Lisp_Object Qcoding_system_define_form;
 374
 375 Lisp_Object Vselect_safe_coding_system_function;
 376
     /* NOTE(review): the meaning of this flag is not visible in this
        part of the file -- confirm where it is read and set.  */
 377 int coding_system_require_warning;
 378
 379 /* Mnemonic string for each format of end-of-line.  */
 380 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 381 /* Mnemonic string to indicate that the format of end-of-line is not
 382    yet decided.  */
 383 Lisp_Object eol_mnemonic_undecided;
 384
 385 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 386    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 387 int system_eol_type;
 388
 389 #ifdef emacs
 390
 391 /* Information about which coding system is safe for which chars.
 392    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 393
 394    GENERIC-LIST is a list of generic coding systems which can encode
 395    any character.
 396
 397    NON-GENERIC-ALIST is an alist of non-generic coding systems vs the
 398    corresponding char table that contains safe chars.  */
 399 Lisp_Object Vcoding_system_safe_chars;
 400
     /* NOTE(review): presumably the list and the alist of all defined
        coding systems -- confirm where they are set up.  */
 401 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 402
 403 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 404
 405 /* Coding systems emacs-mule and raw-text are for converting only
 406    end-of-line format.  */
 407 Lisp_Object Qemacs_mule, Qraw_text;
 408
 409 Lisp_Object Qutf_8;
 410
 411 /* Coding-systems are handed between Emacs Lisp programs and C internal
 412    routines by the following three variables.  */
 413 /* Coding-system for reading files and receiving data from process.  */
 414 Lisp_Object Vcoding_system_for_read;
 415 /* Coding-system for writing files and sending data to process.  */
 416 Lisp_Object Vcoding_system_for_write;
 417 /* Coding-system actually used in the latest I/O.  */
 418 Lisp_Object Vlast_coding_system_used;
 419
 420 /* A vector of length 256 which contains information about special
 421    Latin codes (especially for dealing with Microsoft codes).  */
 422 Lisp_Object Vlatin_extra_code_table;
 423
 424 /* Flag to inhibit code conversion of end-of-line format.  */
 425 int inhibit_eol_conversion;
 426
 427 /* Flag to inhibit ISO2022 escape sequence detection.  */
 428 int inhibit_iso_escape_detection;
 429
 430 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 431 int inherit_process_coding_system;
 432
 433 /* Coding system to be used to encode text for terminal display.  */
 434 struct coding_system terminal_coding;
 435
 436 /* Coding system to be used to encode text for terminal display when
 437    terminal coding system is nil.  */
 438 struct coding_system safe_terminal_coding;
 439
 440 /* Coding system of what is sent from terminal keyboard.  */
 441 struct coding_system keyboard_coding;
 442
 443 /* Default coding system to be used to write a file.  */
 444 struct coding_system default_buffer_file_coding;
 445
     /* NOTE(review): presumably alists consulted to choose a coding
        system for file, process, and network I/O respectively --
        confirm against the code that reads them.  */
 446 Lisp_Object Vfile_coding_system_alist;
 447 Lisp_Object Vprocess_coding_system_alist;
 448 Lisp_Object Vnetwork_coding_system_alist;
 449
 450 Lisp_Object Vlocale_coding_system;
 451
 452 #endif /* emacs */
 453
 454 Lisp_Object Qcoding_category, Qcoding_category_index;
 455
 456 /* List of symbols `coding-category-xxx' ordered by priority.  */
 457 Lisp_Object Vcoding_category_list;
 458
 459 /* Table of coding categories (Lisp symbols).  */
 460 Lisp_Object Vcoding_category_table;
 461
 462 /* Table of the symbol name of each coding category.  The array has
        CODING_CATEGORY_IDX_MAX slots, of which fifteen are initialized
        here.  NOTE(review): presumably indexed by the coding category
        index constants, so the order of the strings must match them --
        confirm against coding.h.  */
 463 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 464   "coding-category-emacs-mule",
 465   "coding-category-sjis",
 466   "coding-category-iso-7",
 467   "coding-category-iso-7-tight",
 468   "coding-category-iso-8-1",
 469   "coding-category-iso-8-2",
 470   "coding-category-iso-7-else",
 471   "coding-category-iso-8-else",
 472   "coding-category-ccl",
 473   "coding-category-big5",
 474   "coding-category-utf-8",
 475   "coding-category-utf-16-be",
 476   "coding-category-utf-16-le",
 477   "coding-category-raw-text",
 478   "coding-category-binary"
 479 };
 480
 481 /* Table of pointers to coding systems corresponding to each coding
 482    category.  */
 483 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 484
 485 /* Table of coding category masks.  Nth element is a mask for a coding
 486    category of which priority is Nth.  */
 487 static
 488 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 489
 490 /* Flag to tell if we look up translation table on character code
 491    conversion.  */
 492 Lisp_Object Venable_character_translation;
 493 /* Standard translation table to look up on decoding (reading).  */
 494 Lisp_Object Vstandard_translation_table_for_decode;
 495 /* Standard translation table to look up on encoding (writing).  */
 496 Lisp_Object Vstandard_translation_table_for_encode;
 497
 498 Lisp_Object Qtranslation_table;
 499 Lisp_Object Qtranslation_table_id;
 500 Lisp_Object Qtranslation_table_for_decode;
 501 Lisp_Object Qtranslation_table_for_encode;
 502
 503 /* Alist of charsets vs revision number.  */
 504 Lisp_Object Vcharset_revision_alist;
 505
 506 /* Default coding systems used for process I/O.  */
 507 Lisp_Object Vdefault_process_coding_system;
 508
 509 /* Char table for translating Quail and self-inserting input.  */
 510 Lisp_Object Vtranslation_table_for_input;
 511
 512 /* Global flag to tell that we can't call post-read-conversion and
 513    pre-write-conversion functions.  Usually the value is zero, but it
 514    is set to 1 temporarily while such functions are running.  This is
 515    to avoid infinitely recursive calls.  */
 516 static int inhibit_pre_post_conversion;
 517
 518 Lisp_Object Qchar_coding_system;
 519
 520 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 521    its validity.
        The property is looked up in the plist stored in element 3 of the
        vector found under CODING_SYSTEM's Qcoding_system property.  The
        value is returned if it is a char table; otherwise Qt is returned
        (CODING_SAFE_CHAR_P below accepts every character when given
        Qt).  */
 522
 523 Lisp_Object
 524 coding_safe_chars (coding_system)
 525      Lisp_Object coding_system;
 526 {
 527   Lisp_Object coding_spec, plist, safe_chars;
 528
 529   coding_spec = Fget (coding_system, Qcoding_system);
 530   plist = XVECTOR (coding_spec)->contents[3];
 531   safe_chars = Fplist_get (plist, Qsafe_chars);
 532   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 533 }
 534
     /* Nonzero if SAFE_CHARS is Qt or has a non-nil entry for character C.  */
 535 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 536   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 537
 538 \f
 539 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 540
 541 /* Emacs' internal format for representation of multiple character
 542    sets is a kind of multi-byte encoding, i.e. characters are
 543    represented by variable-length sequences of one-byte codes.
 544
 545    ASCII characters and control characters (e.g. `tab', `newline') are
 546    represented by one-byte sequences which are their ASCII codes, in
 547    the range 0x00 through 0x7F.
 548
 549    8-bit characters of the range 0x80..0x9F are represented by
 550    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 551    code + 0x20).
 552
 553    8-bit characters of the range 0xA0..0xFF are represented by
 554    one-byte sequences which are their 8-bit code.
 555
 556    The other characters are represented by a sequence of `base
 557    leading-code', optional `extended leading-code', and one or two
 558    `position-code's.  The length of the sequence is determined by the
 559    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 560    whereas extended leading-code and position-code take the range 0xA0
 561    through 0xFF.  See `charset.h' for more details about leading-code
 562    and position-code.
 563
 564    --- CODE RANGE of Emacs' internal format ---
 565    character set        range
 566    -------------        -----
 567    ascii                0x00..0x7F
 568    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
 569    eight-bit-graphic    0xA0..0xFF
 570    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 571    ---------------------------------------------
 572
 573    As this is the internal character representation, the format is
 574    usually not used externally (i.e. in a file or in a data sent to a
 575    process).  But, it is possible to have a text externally in this
 576    format (i.e. by encoding by the coding system `emacs-mule').
 577
 578    In that case, a sequence of one-byte codes has a slightly different
 579    form.
 580
 581    Firstly, all characters in eight-bit-control are represented by
 582    one-byte sequences which are their 8-bit code.
 583
 584    Next, character composition data are represented by the byte
 585    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 586    where,
 587         METHOD is 0xF0 plus one of composition method (enum
 588         composition_method),
 589
 590         BYTES is 0xA0 plus the byte length of these composition data,
 591
 592         CHARS is 0xA0 plus the number of characters composed by these
 593         data,
 594
 595         COMPONENTs are characters of multibyte form or composition
 596         rules encoded as two bytes of ASCII codes.
 597
 598    In addition, for backward compatibility, the following formats are
 599    also recognized as composition data on decoding.
 600
 601    0x80 MSEQ ...
 602    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 603
 604    Here,
 605         MSEQ is a multibyte form but in this special format:
 606           ASCII: 0xA0 ASCII_CODE+0x80,
 607           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 608         RULE is a one byte code of the range 0xA0..0xF0 that
 609         represents a composition rule.
 610   */
 611
 612 enum emacs_code_class_type emacs_code_class[256];
     /* NOTE(review): presumably one class per possible byte value,
        filled in by initialization code outside this excerpt --
        confirm.  */
 613
 614 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 615    Check if a text is encoded in Emacs' internal format.  If it is,
 616    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.
        The text is rejected as soon as it contains ESC, SI, or SO, or a
        code in the range 0x81..0x9F for which UNIBYTE_STR_AS_MULTIBYTE_P
        fails at that position.  Reaching SRC_END without such a
        rejection counts as a match.  */
 617
 618 static int
 619 detect_coding_emacs_mule (src, src_end, multibytep)
 620       unsigned char *src, *src_end;
 621       int multibytep;
 622 {
 623   unsigned char c;
 624   int composing = 0;
 625   /* Dummy for ONE_MORE_BYTE.  */
 626   struct coding_system dummy_coding;
 627   struct coding_system *coding = &dummy_coding;
 628
 629   while (1)
 630     {
           /* Jumps to label_end_of_loop once the source is used up.  */
 631       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 632
 633       if (composing)
 634         {
               /* Inside old-style composition data (started by 0x80):
                  undo the special MSEQ form described in the comment at
                  the head of this section.  */
 635           if (c < 0xA0)
 636             composing = 0;
 637           else if (c == 0xA0)
 638             {
                   /* 0xA0 is followed by ASCII_CODE+0x80.  */
 639               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 640               c &= 0x7F;
 641             }
 642           else
                 /* LEADING_CODE+0x20: recover the leading code.  */
 643             c -= 0x20;
 644         }
 645
 646       if (c < 0x20)
 647         {
               /* NOTE(review): presumably these control codes are taken
                  as a sign of ISO2022 text -- confirm.  */
 648           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 649             return 0;
 650         }
 651       else if (c >= 0x80 && c < 0xA0)
 652         {
 653           if (c == 0x80)
 654             /* Old leading code for a composite character.  */
 655             composing = 1;
 656           else
 657             {
                   /* Validate the sequence that starts at the byte just
                      before SRC, then continue after it.  BYTES is set
                      by UNIBYTE_STR_AS_MULTIBYTE_P (presumably to the
                      length of that sequence -- confirm).  */
 658               unsigned char *src_base = src - 1;
 659               int bytes;
 660
 661               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 662                                                bytes))
 663                 return 0;
 664               src = src_base + bytes;
 665             }
 666         }
 667     }
 668  label_end_of_loop:
       /* The source ran out without any rejection above.  */
 669   return CODING_CATEGORY_MASK_EMACS_MULE;
 670 }
 671
 672
 673 /* Record the starting position START and METHOD of one composition.
        This opens a four-slot header at the current end of
        coding->cmp_data->data: slot 0 is set to -1 for now, slot 1 to
        START plus cmp_data->char_offset, and slot 3 to METHOD; slot 2
        is not written here.  coding->cmp_data_start remembers where the
        header begins, and cmp_data->used advances by 4.  */
 674
 675 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 676   do {                                                          \
 677     struct composition_data *cmp_data = coding->cmp_data;       \
 678     int *data = cmp_data->data + cmp_data->used;                \
 679     coding->cmp_data_start = cmp_data->used;                    \
 680     data[0] = -1;                                               \
 681     data[1] = cmp_data->char_offset + start;                    \
 682     data[3] = (int) method;                                     \
 683     cmp_data->used += 4;                                        \
 684   } while (0)
 685
 686 /* Record the ending position END of the current composition.
        In the header that starts at coding->cmp_data_start, slot 0
        becomes the number of data slots used since that start and
        slot 2 becomes END plus cmp_data->char_offset.  */
 687
 688 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 689   do {                                                          \
 690     struct composition_data *cmp_data = coding->cmp_data;       \
 691     int *data = cmp_data->data + coding->cmp_data_start;        \
 692     data[0] = cmp_data->used - coding->cmp_data_start;          \
 693     data[2] = cmp_data->char_offset + end;                      \
 694   } while (0)
 695
 696 /* Record one COMPONENT (alternate character or composition rule).
        If that brings the data recorded for the current composition to
        COMPOSITION_DATA_MAX_BUNCH_LENGTH slots, the composition is
        closed at coding->produced_char and coding->composing is reset
        to COMPOSITION_NO.  */
 697
 698 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 699   do {                                                                  \
 700     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 701     if (coding->cmp_data->used - coding->cmp_data_start                 \
 702         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 703       {                                                                 \
 704         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 705         coding->composing = COMPOSITION_NO;                             \
 706       }                                                                 \
 707   } while (0)
 708
 709
 710 /* Fetch the byte at SRC and advance SRC past it.  When SRC has
 711    already reached SRC_END, yield -1 and leave SRC untouched.  */
 712
 713 #define SAFE_ONE_MORE_BYTE() (src < src_end ? *src++ : -1)
 714
 715
 716 /* Decode a character represented as a component of composition
 717    sequence of Emacs 20 style at SRC.  Set C to that character, store
 718    its multibyte form sequence at P, and set P to the end of that
 719    sequence.  If no valid character is found, set C to -1.  An ASCII
 720    component is the byte 0xA0 followed by the character code + 0x80.  */
 721 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 722   do {                                                          \
 723     int bytes;                                                  \
 724                                                                 \
 725     c = SAFE_ONE_MORE_BYTE ();                                  \
 726     if (c < 0)                                                  \
 727       break;                                                    \
 728     if (CHAR_HEAD_P (c))                                        \
 729       c = -1;                                                   \
 730     else if (c == 0xA0)                                         \
 731       {                                                         \
 732         c = SAFE_ONE_MORE_BYTE ();                              \
 733         if (c < 0xA0)                                           \
 734           c = -1;                                               \
 735         else                                                    \
 736           {                                                     \
 737             c -= 0x80;  /* 0xA0..0xFF -> ASCII 0x20..0x7F.  */  \
 738             *p++ = c;                                           \
 739           }                                                     \
 740       }                                                         \
 741     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 742       {                                                         \
 743         unsigned char *p0 = p;                                  \
 744                                                                 \
 745         c -= 0x20;                                              \
 746         *p++ = c;                                               \
 747         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 748         while (--bytes)                                         \
 749           {                                                     \
 750             c = SAFE_ONE_MORE_BYTE ();                          \
 751             if (c < 0)                                          \
 752               break;                                            \
 753             *p++ = c;                                           \
 754           }                                                     \
 755         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 756             || (coding->flags /* We are recovering a file.  */  \
 757                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 758                 && ! CHAR_HEAD_P (p0[1])))                      \
 759           c = STRING_CHAR (p0, bytes);                          \
 760         else                                                    \
 761           c = -1;                                               \
 762       }                                                         \
 763     else                                                        \
 764       c = -1;                                                   \
 765   } while (0)
 766
 767
 768 /* Decode a composition rule represented as a component of composition
 769    sequence of Emacs 20 style at SRC.  Set C to the rule.  If no
 770    valid rule is found, set C to -1.  The rule byte minus 0xA0 must be
 771    in 0..80; it is split into the caller's GREF (/ 9) and NREF (% 9).  */
 772 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 773   do {                                                  \
 774     c = SAFE_ONE_MORE_BYTE ();                          \
 775     c -= 0xA0;                                          \
 776     if (c < 0 || c >= 81)                               \
 777       c = -1;                                           \
 778     else                                                \
 779       {                                                 \
 780         gref = c / 9, nref = c % 9;                     \
 781         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 782       }                                                 \
 783   } while (0)
 784
 785
 786 /* Decode composition sequence encoded by `emacs-mule' at the source
 787    pointed by SRC.  SRC_END is the end of source.  Store information
 788    of the composition in CODING->cmp_data.
 789
 790    For backward compatibility, decode also a composition sequence of
 791    Emacs 20 style.  In that case, the composition sequence contains
 792    characters that should be extracted into a buffer or string.  Store
 793    those characters at *DESTINATION in multibyte form.
 794
 795    If we encounter an invalid byte sequence, return 0.
 796    If we encounter an insufficient source or destination, or
 797    insufficient space in CODING->cmp_data, return -1.
 798    Otherwise, return consumed bytes in the source.
 799
 800 */
 801 static INLINE int
 802 decode_composition_emacs_mule (coding, src, src_end,
 803                                destination, dst_end, dst_bytes)
 804      struct coding_system *coding;
 805      const unsigned char *src, *src_end;
 806      unsigned char **destination, *dst_end;
 807      int dst_bytes;
 808 {
 809   unsigned char *dst = *destination;
 810   int method, data_len, nchars;
 811   const unsigned char *src_base = src++;
 812   /* Store components of composition.  */
 813   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 814   int ncomponent;
 815   /* Store multibyte form of characters to be composed.  This is for
 816      Emacs 20 style composition sequence.  */
 817   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 818   unsigned char *bufp = buf;
 819   int c, i, gref, nref;
 820
 821   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 822       >= COMPOSITION_DATA_SIZE)
 823     {
 824       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 825       return -1;
 826     }
 827
 828   ONE_MORE_BYTE (c);
 829   if (c - 0xF0 >= COMPOSITION_RELATIVE
 830            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 831     {
 832       int with_rule;
 833
 834       method = c - 0xF0;
 835       with_rule = (method == COMPOSITION_WITH_RULE
 836                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 837       ONE_MORE_BYTE (c);
 838       data_len = c - 0xA0;
 839       if (data_len < 4
 840           || src_base + data_len > src_end)
 841         return 0;
 842       ONE_MORE_BYTE (c);
 843       nchars = c - 0xA0;
 844       if (nchars < 1)   /* Test the decoded count, not the raw byte.  */
 845         return 0;
 846       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 847         {
 848           /* If it is longer than this, it can't be valid.  */
 849           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 850             return 0;
 851
 852           if (ncomponent % 2 && with_rule)
 853             {
 854               ONE_MORE_BYTE (gref);
 855               gref -= 32;
 856               ONE_MORE_BYTE (nref);
 857               nref -= 32;
 858               c = COMPOSITION_ENCODE_RULE (gref, nref);
 859             }
 860           else
 861             {
 862               int bytes;
 863               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 864                   || (coding->flags /* We are recovering a file.  */
 865                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 866                       && ! CHAR_HEAD_P (src[1])))
 867                 c = STRING_CHAR (src, bytes);
 868               else
 869                 c = *src, bytes = 1;
 870               src += bytes;
 871             }
 872           component[ncomponent] = c;
 873         }
 874     }
 875   else
 876     {
 877       /* This may be an old Emacs 20 style format.  See the comment at
 878          the section 2 of this file.  */
 879       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 880       if (src == src_end
 881           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 882         goto label_end_of_loop;
 883
 884       src_end = src;
 885       src = src_base + 1;
 886       if (c < 0xC0)
 887         {
 888           method = COMPOSITION_RELATIVE;
 889           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 890             {
 891               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 892               if (c < 0)
 893                 break;
 894               component[ncomponent++] = c;
 895             }
 896           if (ncomponent < 2)
 897             return 0;
 898           nchars = ncomponent;
 899         }
 900       else if (c == 0xFF)
 901         {
 902           method = COMPOSITION_WITH_RULE;
 903           src++;
 904           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 905           if (c < 0)
 906             return 0;
 907           component[0] = c;
 908           for (ncomponent = 1;
 909                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 910             {
 911               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 912               if (c < 0)
 913                 break;
 914               component[ncomponent++] = c;
 915               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 916               if (c < 0)
 917                 break;
 918               component[ncomponent++] = c;
 919             }
 920           if (ncomponent < 3)
 921             return 0;
 922           nchars = (ncomponent + 1) / 2;
 923         }
 924       else
 925         return 0;
 926     }
 927
 928   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 929     {
 930       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 931       for (i = 0; i < ncomponent; i++)
 932         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 933       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 934       if (buf < bufp)
 935         {
 936           unsigned char *p = buf;
 937           EMIT_BYTES (p, bufp);
 938           *destination += bufp - buf;
 939           coding->produced_char += nchars;
 940         }
 941       return (src - src_base);
 942     }
 943  label_end_of_loop:
 944   return -1;
 945 }
 946
 947 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
 948    Decode emacs-mule text, handling EOL bytes and composition data.  */
 949 static void
 950 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 951      struct coding_system *coding;
 952      const unsigned char *source;
 953      unsigned char *destination;
 954      int src_bytes, dst_bytes;
 955 {
 956   const unsigned char *src = source;
 957   const unsigned char *src_end = source + src_bytes;
 958   unsigned char *dst = destination;
 959   unsigned char *dst_end = destination + dst_bytes;
 960   /* SRC_BASE remembers the start position in source in each loop.
 961      The loop will be exited when there's not enough source code, or
 962      when there's not enough destination area to produce a
 963      character.  */
 964   const unsigned char *src_base;
 965   /* Characters produced are counted from zero on every call.  */
 966   coding->produced_char = 0;
 967   while ((src_base = src) < src_end)
 968     {
 969       unsigned char tmp[MAX_MULTIBYTE_LENGTH];
 970       const unsigned char *p;
 971       int bytes;
 972       /* Branches either emit and `continue', or set P and BYTES.  */
 973       if (*src == '\r')
 974         {
 975           int c = *src++;
 976           /* Map CR (or CR LF) to LF as the EOL type requires.  */
 977           if (coding->eol_type == CODING_EOL_CR)
 978             c = '\n';
 979           else if (coding->eol_type == CODING_EOL_CRLF)
 980             {
 981               ONE_MORE_BYTE (c);
 982               if (c != '\n')
 983                 {
 984                   src--;
 985                   c = '\r';
 986                 }
 987             }
 988           *dst++ = c;
 989           coding->produced_char++;
 990           continue;
 991         }
 992       else if (*src == '\n')
 993         {
 994           if ((coding->eol_type == CODING_EOL_CR
 995                || coding->eol_type == CODING_EOL_CRLF)
 996               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 997             {
 998               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 999               goto label_end_of_loop;
1000             }
1001           *dst++ = *src++;
1002           coding->produced_char++;
1003           continue;
1004         }
1005       else if (*src == 0x80 && coding->cmp_data)
1006         {
1007           /* Start of composition data.  */
1008           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
1009                                                          &dst, dst_end,
1010                                                          dst_bytes);
1011           if (consumed < 0)
1012             goto label_end_of_loop;
1013           else if (consumed > 0)
1014             {
1015               src += consumed;
1016               continue;
1017             }
1018           bytes = CHAR_STRING (*src, tmp);
1019           p = tmp;
1020           src++;
1021         }
1022       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1023                || (coding->flags /* We are recovering a file.  */
1024                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1025                    && ! CHAR_HEAD_P (src[1])))
1026         {
1027           p = src;
1028           src += bytes;
1029         }
1030       else
1031         {
1032           int i, c;
1033           /* Not a valid multibyte form; scan the expected trailing bytes.  */
1034           bytes = BYTES_BY_CHAR_HEAD (*src);
1035           src++;
1036           for (i = 1; i < bytes; i++)
1037             {
1038               ONE_MORE_BYTE (c);
1039               if (CHAR_HEAD_P (c))
1040                 break;
1041             }
1042           if (i < bytes)
1043             {
1044               bytes = CHAR_STRING (*src_base, tmp);
1045               p = tmp;
1046               src = src_base + 1;
1047             }
1048           else
1049             {
1050               p = src_base;
1051             }
1052         }
1053       if (dst + bytes >= (dst_bytes ? dst_end : src))
1054         {
1055           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1056           break;
1057         }
1058       while (bytes--) *dst++ = *p++;
1059       coding->produced_char++;
1060     }
1061  label_end_of_loop:
1062   coding->consumed = coding->consumed_char = src_base - source;
1063   coding->produced = dst - destination;
1064 }
1065
1066
1067 /* Encode composition data stored at DATA into a special byte sequence
1068    starting with 0x80 and store it at DST.  Update CODING->cmp_data_start
1069    and maybe CODING->cmp_data for the next call.  If the destination is
1070    too small, set CODING->result and jump to label_end_of_loop.  */
1071 #define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
1072   do {                                                                  \
1073     unsigned char buf[1024], *p0 = buf, *p;                             \
1074     int len = data[0];                                                  \
1075     int i;                                                              \
1076                                                                         \
1077     buf[0] = 0x80;                                                      \
1078     buf[1] = 0xF0 + data[3];    /* METHOD */                            \
1079     buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */           \
1080     p = buf + 4;                                                        \
1081     if (data[3] == COMPOSITION_WITH_RULE                                \
1082         || data[3] == COMPOSITION_WITH_RULE_ALTCHARS)                   \
1083       {                                                                 \
1084         p += CHAR_STRING (data[4], p);                                  \
1085         for (i = 5; i < len; i += 2)                                    \
1086           {                                                             \
1087             int gref, nref;                                             \
1088              COMPOSITION_DECODE_RULE (data[i], gref, nref);             \
1089             *p++ = 0x20 + gref;                                         \
1090             *p++ = 0x20 + nref;                                         \
1091             p += CHAR_STRING (data[i + 1], p);                          \
1092           }                                                             \
1093       }                                                                 \
1094     else                                                                \
1095       {                                                                 \
1096         for (i = 4; i < len; i++)                                       \
1097           p += CHAR_STRING (data[i], p);                                \
1098       }                                                                 \
1099     buf[2] = 0xA0 + (p - buf);  /* COMPONENTS-BYTES */                  \
1100                                                                         \
1101     if (dst + (p - buf) + 4 > (dst_bytes ? dst_end : src))              \
1102       {                                                                 \
1103         coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
1104         goto label_end_of_loop;                                         \
1105       }                                                                 \
1106     while (p0 < p)                                                      \
1107       *dst++ = *p0++;                                                   \
1108     coding->cmp_data_start += data[0];                                  \
1109     if (coding->cmp_data_start == coding->cmp_data->used                \
1110         && coding->cmp_data->next)                                      \
1111       {                                                                 \
1112         coding->cmp_data = coding->cmp_data->next;                      \
1113         coding->cmp_data_start = 0;                                     \
1114       }                                                                 \
1115   } while (0)
1116
1117
1118 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1119                             unsigned char *, int, int));
1120 /* Encode to emacs-mule: emit composition data and convert EOL.  */
1121 static void
1122 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1123      struct coding_system *coding;
1124      const unsigned char *source;
1125      unsigned char *destination;
1126      int src_bytes, dst_bytes;
1127 {
1128   const unsigned char *src = source;
1129   const unsigned char *src_end = source + src_bytes;
1130   unsigned char *dst = destination;
1131   unsigned char *dst_end = destination + dst_bytes;
1132   const unsigned char *src_base;
1133   int c;
1134   int char_offset;
1135   int *data;
1136
1137   Lisp_Object translation_table;
1138
1139   translation_table = Qnil;
1140
1141   /* Optimization for the case that there's no composition.  */
1142   if (!coding->cmp_data || coding->cmp_data->used == 0)
1143     {
1144       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1145       return;
1146     }
1147
1148   char_offset = coding->cmp_data->char_offset;
1149   data = coding->cmp_data->data + coding->cmp_data_start;
1150   while (1)
1151     {
1152       src_base = src;
1153
1154       /* If SRC starts a composition, encode the information about the
1155          composition in advance.  */
1156       if (coding->cmp_data_start < coding->cmp_data->used
1157           && char_offset + coding->consumed_char == data[1])
1158         {
1159           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1160           char_offset = coding->cmp_data->char_offset;
1161           data = coding->cmp_data->data + coding->cmp_data_start;
1162         }
1163       /* NOTE(review): no explicit loop exit; presumably ONE_MORE_CHAR and EMIT_* jump to label_end_of_loop -- confirm.  */
1164       ONE_MORE_CHAR (c);
1165       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1166                         || coding->eol_type == CODING_EOL_CR))
1167         {
1168           if (coding->eol_type == CODING_EOL_CRLF)
1169             EMIT_TWO_BYTES ('\r', c);
1170           else
1171             EMIT_ONE_BYTE ('\r');
1172         }
1173       else if (SINGLE_BYTE_CHAR_P (c))
1174         {
1175           if (coding->flags && ! ASCII_BYTE_P (c))
1176             {
1177               /* As we are auto saving, retain the multibyte form for
1178                  8-bit chars.  */
1179               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1180               int bytes = CHAR_STRING (c, buf);
1181
1182               if (bytes == 1)
1183                 EMIT_ONE_BYTE (buf[0]);
1184               else
1185                 EMIT_TWO_BYTES (buf[0], buf[1]);
1186             }
1187           else
1188             EMIT_ONE_BYTE (c);
1189         }
1190       else
1191         EMIT_BYTES (src_base, src);
1192       coding->consumed_char++;
1193     }
1194  label_end_of_loop:
1195   coding->consumed = src_base - source;
1196   coding->produced = coding->produced_char = dst - destination;
1197   return;
1198 }
1199
1200 \f
1201 /*** 3. ISO2022 handlers ***/
1202
1203 /* The following note describes the coding system ISO2022 briefly.
1204    Since the intention of this note is to help understand the
1205    functions in this file, some parts are NOT ACCURATE or are OVERLY
1206    SIMPLIFIED.  For thorough understanding, please refer to the
1207    original document of ISO2022.  This is equivalent to the standard
1208    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1209
1210    ISO2022 provides many mechanisms to encode several character sets
1211    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1212    is encoded using bytes less than 128.  This may make the encoded
1213    text a little bit longer, but the text passes more easily through
1214    several types of gateway, some of which strip off the MSB (Most
1215    Significant Bit).
1216
1217    There are two kinds of character sets: control character sets and
1218    graphic character sets.  The former contain control characters such
1219    as `newline' and `escape' to provide control functions (control
1220    functions are also provided by escape sequences).  The latter
1221    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1222    two control character sets and many graphic character sets.
1223
1224    Graphic character sets are classified into one of the following
1225    four classes, according to the number of bytes (DIMENSION) and
1226    number of characters in one dimension (CHARS) of the set:
1227    - DIMENSION1_CHARS94
1228    - DIMENSION1_CHARS96
1229    - DIMENSION2_CHARS94
1230    - DIMENSION2_CHARS96
1231
1232    In addition, each character set is assigned an identification tag,
1233    unique for each set, called the "final character" (denoted as <F>
1234    hereafter).  The <F> of each character set is decided by ECMA(*)
1235    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1236    (0x30..0x3F are for private use only).
1237
1238    Note (*): ECMA = European Computer Manufacturers Association
1239
1240    Here are examples of graphic character sets [NAME(<F>)]:
1241         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1242         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1243         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1244         o DIMENSION2_CHARS96 -- none for the moment
1245
1246    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1247         C0 [0x00..0x1F] -- control character plane 0
1248         GL [0x20..0x7F] -- graphic character plane 0
1249         C1 [0x80..0x9F] -- control character plane 1
1250         GR [0xA0..0xFF] -- graphic character plane 1
1251
1252    A control character set is directly designated and invoked to C0 or
1253    C1 by an escape sequence.  The most common case is that:
1254    - ISO646's  control character set is designated/invoked to C0, and
1255    - ISO6429's control character set is designated/invoked to C1,
1256    and usually these designations/invocations are omitted in encoded
1257    text.  In a 7-bit environment, only C0 can be used, and a control
1258    character for C1 is encoded by an appropriate escape sequence to
1259    fit into the environment.  All control characters for C1 are
1260    defined to have corresponding escape sequences.
1261
1262    A graphic character set is at first designated to one of four
1263    graphic registers (G0 through G3), then these graphic registers are
1264    invoked to GL or GR.  These designations and invocations can be
1265    done independently.  The most common case is that G0 is invoked to
1266    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1267    these invocations and designations are omitted in encoded text.
1268    In a 7-bit environment, only GL can be used.
1269
1270    When a graphic character set of CHARS94 is invoked to GL, codes
1271    0x20 and 0x7F of the GL area work as control characters SPACE and
1272    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1273    be used.
1274
1275    There are two ways of invocation: locking-shift and single-shift.
1276    With locking-shift, the invocation lasts until the next different
1277    invocation, whereas with single-shift, the invocation affects the
1278    following character only and doesn't affect the locking-shift
1279    state.  Invocations are done by the following control characters or
1280    escape sequences:
1281
1282    ----------------------------------------------------------------------
1283    abbrev  function                  cntrl escape seq   description
1284    ----------------------------------------------------------------------
1285    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1286    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1287    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1288    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1289    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1290    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1291    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1292    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1293    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1294    ----------------------------------------------------------------------
1295    (*) These are not used by any known coding system.
1296
1297    Control characters for these functions are defined by macros
1298    ISO_CODE_XXX in `coding.h'.
1299
1300    Designations are done by the following escape sequences:
1301    ----------------------------------------------------------------------
1302    escape sequence      description
1303    ----------------------------------------------------------------------
1304    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1305    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1306    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1307    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1308    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1309    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1310    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1311    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1312    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1313    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1314    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1315    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1316    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1317    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1318    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1319    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1320    ----------------------------------------------------------------------
1321
1322    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1323    of dimension 1, chars 94, and final character <F>, etc...
1324
1325    Note (*): Although these designations are not allowed in ISO2022,
1326    Emacs accepts them on decoding, and produces them on encoding
1327    CHARS96 character sets in a coding system which is characterized as
1328    7-bit environment, non-locking-shift, and non-single-shift.
1329
1330    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1331    '(' can be omitted.  We refer to this as "short-form" hereafter.
1332
1333    Now you may notice that there are a lot of ways of encoding the
1334    same multilingual text in ISO2022.  Actually, there exist many
1335    coding systems such as Compound Text (used in X11's inter client
1336    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1337    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1338    localized platforms), and all of these are variants of ISO2022.
1339
1340    In addition to the above, Emacs handles two more kinds of escape
1341    sequences: ISO6429's direction specification and Emacs' private
1342    sequence for specifying character composition.
1343
1344    ISO6429's direction specification takes the following form:
1345         o CSI ']'      -- end of the current direction
1346         o CSI '0' ']'  -- end of the current direction
1347         o CSI '1' ']'  -- start of left-to-right text
1348         o CSI '2' ']'  -- start of right-to-left text
1349    The control character CSI (0x9B: control sequence introducer) is
1350    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1351
1352    Character composition specification takes the following form:
1353         o ESC '0' -- start relative composition
1354         o ESC '1' -- end composition
1355         o ESC '2' -- start rule-base composition (*)
1356         o ESC '3' -- start relative composition with alternate chars  (**)
1357         o ESC '4' -- start rule-base composition with alternate chars  (**)
1358   Since these are not standard escape sequences of any ISO standard,
1359   the use of them with these meanings is restricted to Emacs only.
1360
1361   (*) This form is used only in Emacs 20.5 and older versions,
1362   but the newer versions can safely decode it.
1363   (**) This form is used only in Emacs 21.1 and newer versions,
1364   and the older versions can't decode it.
1365
1366   Here's a list of example usages of these composition escape
1367   sequences (categorized by `enum composition_method').
1368
1369   COMPOSITION_RELATIVE:
1370         ESC 0 CHAR [ CHAR ] ESC 1
1371   COMPOSITION_WITH_RULE:
1372         ESC 2 CHAR [ RULE CHAR ] ESC 1
1373   COMPOSITION_WITH_ALTCHARS:
1374         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1375   COMPOSITION_WITH_RULE_ALTCHARS:
1376         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1377
1378 enum iso_code_class_type iso_code_class[256]; /* 256 entries; presumably indexed by byte value -- confirm.  */
1379 /* Nonzero if coding_system_table[IDX] is set, C is safe for it (or CHARSET is ASCII), and CHARSET has a requested designation.  Assigns the caller's `safe_chars'.  */
1380 #define CHARSET_OK(idx, charset, c)                                     \
1381   (coding_system_table[idx]                                             \
1382    && (charset == CHARSET_ASCII                                         \
1383        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1384            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1385    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1386                                               charset)                  \
1387        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1388 /* Nonzero if the initial designation of register 1 for coding system IDX is >= 0.  */
1389 #define SHIFT_OUT_OK(idx) \
1390   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1391 /* Nonzero unless composition is disabled for coding system IDX; coding_system_table[IDX] is dereferenced unchecked.  */
1392 #define COMPOSITION_OK(idx)     \
1393   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1394
1395 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1396    Check whether the bytes from SRC up to SRC_END may be encoded in
1397    ISO2022.  If so, return an integer in which some of the flag bits:
1398         CODING_CATEGORY_MASK_ISO_7
1399         CODING_CATEGORY_MASK_ISO_7_TIGHT
1400         CODING_CATEGORY_MASK_ISO_8_1
1401         CODING_CATEGORY_MASK_ISO_8_2
1402         CODING_CATEGORY_MASK_ISO_7_ELSE
1403         CODING_CATEGORY_MASK_ISO_8_ELSE
1404    are set.  If a code which should never appear in ISO2022 is found,
1405    return 0.  MULTIBYTEP is only handed to ONE_MORE_BYTE_CHECK_MULTIBYTE.
1406    The result is MASK (categories not ruled out) & MASK_FOUND (categories seen).  */
1407 static int
1408 detect_coding_iso2022 (src, src_end, multibytep)
1409      unsigned char *src, *src_end;
1410      int multibytep;
1411 {
1412   int mask = CODING_CATEGORY_MASK_ISO;  /* Categories not ruled out so far.  */
1413   int mask_found = 0;  /* Categories for which evidence was seen.  */
1414   int reg[4], shift_out = 0, single_shifting = 0;
1415   int c, c1, charset;
1416   /* Dummy for ONE_MORE_BYTE.  */
1417   struct coding_system dummy_coding;
1418   struct coding_system *coding = &dummy_coding;
1419   Lisp_Object safe_chars;  /* Assigned as a side effect of CHARSET_OK.  */
1420   /* NOTE(review): shift_out is never assigned after its initialization within this function.  */
1421   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;  /* Charsets designated so far.  */
1422   while (mask && src < src_end)
1423     {
1424       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1425     retry:  /* Re-dispatch on a byte that was already read into C.  */
1426       switch (c)
1427         {
1428         case ISO_CODE_ESC:
1429           if (inhibit_iso_escape_detection)
1430             break;
1431           single_shifting = 0;
1432           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1433           if (c >= '(' && c <= '/')
1434             {
1435               /* Designation sequence for a charset of dimension 1.  */
1436               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1437               if (c1 < ' ' || c1 >= 0x80
1438                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1439                 /* Invalid designation sequence.  Just ignore.  */
1440                 break;
1441               reg[(c - '(') % 4] = charset;  /* '(' ')' '*' '+' and ',' '-' '.' '/' map to 0..3.  */
1442             }
1443           else if (c == '$')
1444             {
1445               /* Designation sequence for a charset of dimension 2.  */
1446               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1447               if (c >= '@' && c <= 'B')
1448                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1449                 reg[0] = charset = iso_charset_table[1][0][c];
1450               else if (c >= '(' && c <= '/')
1451                 {
1452                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1453                   if (c1 < ' ' || c1 >= 0x80
1454                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1455                     /* Invalid designation sequence.  Just ignore.  */
1456                     break;
1457                   reg[(c - '(') % 4] = charset;
1458                 }
1459               else
1460                 /* Invalid designation sequence.  Just ignore.  */
1461                 break;
1462             }
1463           else if (c == 'N' || c == 'O')
1464             {
1465               /* ESC <Fe> for SS2 or SS3.  */
1466               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1467               break;
1468             }
1469           else if (c >= '0' && c <= '4')
1470             {
1471               /* ESC <Fp> for start/end composition.  */
1472               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1473                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1474               else
1475                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1476               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1477                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1478               else
1479                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1480               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1481                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1482               else
1483                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1484               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1485                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1486               else
1487                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1488               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1489                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1490               else
1491                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1492               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1493                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1494               else
1495                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1496               break;
1497             }
1498           else
1499             /* Invalid escape sequence.  Just ignore.  */
1500             break;
1501
1502           /* We found a valid designation sequence for CHARSET.  */
1503           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1504           c = MAKE_CHAR (charset, 0, 0);  /* Char of CHARSET handed to the CHARSET_OK tests.  */
1505           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1506             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1507           else
1508             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1509           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1510             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1511           else
1512             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1513           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1514             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1515           else
1516             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1517           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1518             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1519           else
1520             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1521           break;
1522
1523         case ISO_CODE_SO:
1524           if (inhibit_iso_escape_detection)
1525             break;
1526           single_shifting = 0;
1527           if (shift_out == 0  /* shift_out is not updated in this branch.  */
1528               && (reg[1] >= 0
1529                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1530                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1531             {
1532               /* Locking shift out.  */
1533               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1534               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1535             }
1536           break;
1537
1538         case ISO_CODE_SI:
1539           if (inhibit_iso_escape_detection)
1540             break;
1541           single_shifting = 0;
1542           if (shift_out == 1)  /* NOTE(review): cannot hold unless shift_out is set outside the visible code.  */
1543             {
1544               /* Locking shift in.  */
1545               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1546               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1547             }
1548           break;
1549
1550         case ISO_CODE_CSI:
1551           single_shifting = 0;  /* Fall through.  */
1552         case ISO_CODE_SS2:
1553         case ISO_CODE_SS3:
1554           {
1555             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1556
1557             if (inhibit_iso_escape_detection)
1558               break;
1559             if (c != ISO_CODE_CSI)  /* SS2 or SS3.  */
1560               {
1561                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1562                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1563                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1564                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1565                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1566                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1567                 single_shifting = 1;
1568               }
1569             if (VECTORP (Vlatin_extra_code_table)
1570                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1571               {
1572                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1573                     & CODING_FLAG_ISO_LATIN_EXTRA)
1574                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1575                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1576                     & CODING_FLAG_ISO_LATIN_EXTRA)
1577                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1578               }
1579             mask &= newmask;
1580             mask_found |= newmask;
1581           }
1582           break;
1583
1584         default:
1585           if (c < 0x80)
1586             {
1587               single_shifting = 0;
1588               break;
1589             }
1590           else if (c < 0xA0)
1591             {
1592               single_shifting = 0;
1593               if (VECTORP (Vlatin_extra_code_table)
1594                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1595                 {
1596                   int newmask = 0;
1597
1598                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1599                       & CODING_FLAG_ISO_LATIN_EXTRA)
1600                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1601                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1602                       & CODING_FLAG_ISO_LATIN_EXTRA)
1603                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1604                   mask &= newmask;
1605                   mask_found |= newmask;
1606                 }
1607               else
1608                 return 0;  /* Byte in 0x80..0x9F not marked in Vlatin_extra_code_table.  */
1609             }
1610           else
1611             {
1612               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1613                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1614               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1615               /* Check the length of succeeding codes of the range
1616                  0xA0..0xFF.  If the byte length is odd, we exclude
1617                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1618                  when we are not single shifting.  */
1619               if (!single_shifting
1620                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1621                 {
1622                   int i = 1;  /* Length of the run, counting the byte already in C.  */
1623
1624                   c = -1;  /* Stays negative if no further byte gets read.  */
1625                   while (src < src_end)
1626                     {
1627                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1628                       if (c < 0xA0)
1629                         break;
1630                       i++;
1631                     }
1632
1633                   if (i & 1 && src < src_end)  /* Odd run, and input remains after it.  */
1634                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1635                   else
1636                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1637                   if (c >= 0)
1638                     /* This means that we have read one extra byte.  */
1639                     goto retry;
1640                 }
1641             }
1642           break;
1643         }
1644     }
1645  label_end_of_loop:  /* No goto here names it; presumably the target of ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
1646   return (mask & mask_found);
1647 }
1648
1649 /* Decode a character of which charset is CHARSET, the 1st position
1650    code is C1, the 2nd position code is C2, and return the decoded
1651    character code.  If the variable `translation_table' is non-nil,
1652    return the code produced by translate_char with that table instead.  */
1653
1654 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1655   (NILP (translation_table) /* Variable of the using scope.  */ \
1656    ? MAKE_CHAR (charset, c1, c2)                \
1657    : translate_char (translation_table, -1, charset, c1, c2))
1658 /* Look up the charset given by DIMENSION, CHARS and FINAL_CHAR and designate it to register REG:
1659    set designation state into CODING, or jump to label_invalid_code if that is refused.  */
1660 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1661   do {                                                                     \
1662     int charset, c;                                                        \
1663     /* Uses `coding', `safe_chars', label_invalid_code of the using scope.  */ \
1664     if (final_char < '0' || final_char >= 128)                             \
1665       goto label_invalid_code;                                             \
1666     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1667                                  make_number (chars),                      \
1668                                  make_number (final_char));                \
1669     c = MAKE_CHAR (charset, 0, 0);     /* For the safe-char test below.  */ \
1670     if (charset >= 0                                                       \
1671         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1672             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1673       {                                                                    \
1674         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1675             && reg == 0                                                    \
1676             && charset == CHARSET_ASCII)                                   \
1677           {                                                                \
1678             /* We should insert this designation sequence as is so         \
1679                that it is surely written back to a file.  */               \
1680             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1681             goto label_invalid_code;                                       \
1682           }                                                                \
1683         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1684         if ((coding->mode & CODING_MODE_DIRECTION) /* Direction bit set.  */ \
1685             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1686           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1687         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1688       }                                                                    \
1689     else                                                                   \
1690       {                                                                    \
1691         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1692         goto label_invalid_code;                                           \
1693       }                                                                    \
1694   } while (0)
1695
1696 /* Allocate a memory block for storing information about compositions,
1697    starting empty with CHAR_OFFSET recorded in it.  The block is chained
1698    after CODING->cmp_data, which is then made to point to the new block.  */
1699 void
1700 coding_allocate_composition_data (coding, char_offset)
1701      struct coding_system *coding;
1702      int char_offset;
1703 {
1704   struct composition_data *cmp_data
1705     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1706
1707   cmp_data->char_offset = char_offset;
1708   cmp_data->used = 0;  /* Nothing is stored in the new block yet.  */
1709   cmp_data->prev = coding->cmp_data;  /* May be NULL for the first block.  */
1710   cmp_data->next = NULL;
1711   if (coding->cmp_data)
1712     coding->cmp_data->next = cmp_data;
1713   coding->cmp_data = cmp_data;  /* The new block becomes the current one.  */
1714   coding->cmp_data_start = 0;
1715   coding->composing = COMPOSITION_NO;
1716 }
1717
1718 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1719    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1720    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1721    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1722    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1
1723    C1 is the byte after ESC.  Uses `coding', `dst' and label_end_of_loop of the using scope.  */
1724
1725 #define DECODE_COMPOSITION_START(c1)                                       \
1726   do {                                                                     \
1727     if (coding->composing == COMPOSITION_DISABLED)                         \
1728       {                                                                    \
1729         *dst++ = ISO_CODE_ESC;         /* Copy the sequence to `dst'.  */   \
1730         *dst++ = c1 & 0x7f;                                                \
1731         coding->produced_char += 2;                                        \
1732       }                                                                    \
1733     else if (!COMPOSING_P (coding))                                        \
1734       {                                                                    \
1735         /* This is surely the start of a composition.  We must be sure     \
1736            that coding->cmp_data has enough space to store the             \
1737            information about the composition.  If not, terminate the       \
1738            current decoding loop, allocate one more memory block for       \
1739            coding->cmp_data in the caller, then start the decoding         \
1740            loop again.  We can't allocate memory here directly because     \
1741            it may cause buffer/string relocation.  */                      \
1742         if (!coding->cmp_data                                              \
1743             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1744                 >= COMPOSITION_DATA_SIZE))                                 \
1745           {                                                                \
1746             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1747             goto label_end_of_loop;                                        \
1748           }                                                                \
1749         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1750                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1751                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1752                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1753         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1754                                       coding->composing);                  \
1755         coding->composition_rule_follows = 0;                              \
1756       }                                                                    \
1757     else                                                                   \
1758       {                                                                    \
1759         /* We are already handling a composition.  If the method is        \
1760            the following two, the codes following the current escape       \
1761            sequence are actual characters stored in a buffer.  */          \
1762         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1763             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1764           {                                                                \
1765             coding->composing = COMPOSITION_RELATIVE;                      \
1766             coding->composition_rule_follows = 0;                          \
1767           }                                                                \
1768       }                                                                    \
1769   } while (0)
1770
1771 /* Handle composition end sequence ESC 1.  C1 is the byte after ESC.  */
1772 /* Uses `coding' and `dst' of the using scope.  */
1773 #define DECODE_COMPOSITION_END(c1)                                      \
1774   do {                                                                  \
1775     if (! COMPOSING_P (coding))                                         \
1776       {                                                                 \
1777         *dst++ = ISO_CODE_ESC;      /* Copy the sequence to `dst'.  */   \
1778         *dst++ = c1;                                                    \
1779         coding->produced_char += 2;                                     \
1780       }                                                                 \
1781     else                                                                \
1782       {                                                                 \
1783         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1784         coding->composing = COMPOSITION_NO;                             \
1785       }                                                                 \
1786   } while (0)
1787
1788 /* Decode a composition rule from the byte C1 (and maybe one more byte
1789    from SRC) and store one encoded composition rule in
1790    coding->cmp_data.  C1 must be an lvalue: 32 is subtracted from it.  */
1791 /* The optional second byte is read into `c2' of the using scope.  */
1792 #define DECODE_COMPOSITION_RULE(c1)                                     \
1793   do {                                                                  \
1794     int rule = 0;               /* Stays 0 if C1 - 32 is 93 or more.  */ \
1795     (c1) -= 32;                                                         \
1796     if (c1 < 81)                /* old format (before ver.21) */        \
1797       {                                                                 \
1798         int gref = (c1) / 9;                                            \
1799         int nref = (c1) % 9;                                            \
1800         if (gref == 4) gref = 10;                                       \
1801         if (nref == 4) nref = 10;                                       \
1802         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1803       }                                                                 \
1804     else if (c1 < 93)           /* new format (after ver.21) */         \
1805       {                                                                 \
1806         ONE_MORE_BYTE (c2);                                             \
1807         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1808       }                                                                 \
1809     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1810     coding->composition_rule_follows = 0;                               \
1811   } while (0)
1812
1813
1814 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1815
1816 static void
1817 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1818      struct coding_system *coding;
1819      const unsigned char *source;
1820      unsigned char *destination;
1821      int src_bytes, dst_bytes;
1822 {
1823   const unsigned char *src = source;
1824   const unsigned char *src_end = source + src_bytes;
1825   unsigned char *dst = destination;
1826   unsigned char *dst_end = destination + dst_bytes;
1827   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1828   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1829   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1830   /* SRC_BASE remembers the start position in source in each loop.
1831      The loop will be exited when there's not enough source code
1832      (within macro ONE_MORE_BYTE), or when there's not enough
1833      destination area to produce a character (within macro
1834      EMIT_CHAR).  */
1835   const unsigned char *src_base;
1836   int c, charset;
1837   Lisp_Object translation_table;
1838   Lisp_Object safe_chars;
1839
1840   safe_chars = coding_safe_chars (coding->symbol);
1841
1842   if (NILP (Venable_character_translation))
1843     translation_table = Qnil;
1844   else
1845     {
1846       translation_table = coding->translation_table_for_decode;
1847       if (NILP (translation_table))
1848         translation_table = Vstandard_translation_table_for_decode;
1849     }
1850
1851   coding->result = CODING_FINISH_NORMAL;
1852
1853   while (1)
1854     {
1855       int c1, c2 = 0;
1856
1857       src_base = src;
1858       ONE_MORE_BYTE (c1);
1859
1860       /* We produce no character or one character.  */
1861       switch (iso_code_class [c1])
1862         {
1863         case ISO_0x20_or_0x7F:
1864           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1865             {
1866               DECODE_COMPOSITION_RULE (c1);
1867               continue;
1868             }
1869           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1870             {
1871               /* This is SPACE or DEL.  */
1872               charset = CHARSET_ASCII;
1873               break;
1874             }
1875           /* This is a graphic character, we fall down ...  */
1876
1877         case ISO_graphic_plane_0:
1878           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1879             {
1880               DECODE_COMPOSITION_RULE (c1);
1881               continue;
1882             }
1883           charset = charset0;
1884           break;
1885
1886         case ISO_0xA0_or_0xFF:
1887           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1888               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1889             goto label_invalid_code;
1890           /* This is a graphic character, we fall down ... */
1891
1892         case ISO_graphic_plane_1:
1893           if (charset1 < 0)
1894             goto label_invalid_code;
1895           charset = charset1;
1896           break;
1897
1898         case ISO_control_0:
1899           if (COMPOSING_P (coding))
1900             DECODE_COMPOSITION_END ('1');
1901
1902           /* All ISO2022 control characters in this class have the
1903              same representation in Emacs internal format.  */
1904           if (c1 == '\n'
1905               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1906               && (coding->eol_type == CODING_EOL_CR
1907                   || coding->eol_type == CODING_EOL_CRLF))
1908             {
1909               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1910               goto label_end_of_loop;
1911             }
1912           charset = CHARSET_ASCII;
1913           break;
1914
1915         case ISO_control_1:
1916           if (COMPOSING_P (coding))
1917             DECODE_COMPOSITION_END ('1');
1918           goto label_invalid_code;
1919
1920         case ISO_carriage_return:
1921           if (COMPOSING_P (coding))
1922             DECODE_COMPOSITION_END ('1');
1923
1924           if (coding->eol_type == CODING_EOL_CR)
1925             c1 = '\n';
1926           else if (coding->eol_type == CODING_EOL_CRLF)
1927             {
1928               ONE_MORE_BYTE (c1);
1929               if (c1 != ISO_CODE_LF)
1930                 {
1931                   src--;
1932                   c1 = '\r';
1933                 }
1934             }
1935           charset = CHARSET_ASCII;
1936           break;
1937
1938         case ISO_shift_out:
1939           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1940               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1941             goto label_invalid_code;
1942           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1943           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1944           continue;
1945
1946         case ISO_shift_in:
1947           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1948             goto label_invalid_code;
1949           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1950           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1951           continue;
1952
1953         case ISO_single_shift_2_7:
1954         case ISO_single_shift_2:
1955           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1956             goto label_invalid_code;
1957           /* SS2 is handled as an escape sequence of ESC 'N' */
1958           c1 = 'N';
1959           goto label_escape_sequence;
1960
1961         case ISO_single_shift_3:
1962           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1963             goto label_invalid_code;
1964           /* SS3 is handled as an escape sequence of ESC 'O' */
1965           c1 = 'O';
1966           goto label_escape_sequence;
1967
1968         case ISO_control_sequence_introducer:
1969           /* CSI is handled as an escape sequence of ESC '[' ...  */
1970           c1 = '[';
1971           goto label_escape_sequence;
1972
1973         case ISO_escape:
1974           ONE_MORE_BYTE (c1);
1975         label_escape_sequence:
1976           /* Escape sequences handled by Emacs are invocation,
1977              designation, direction specification, and character
1978              composition specification.  */
1979           switch (c1)
1980             {
1981             case '&':           /* revision of following character set */
1982               ONE_MORE_BYTE (c1);
1983               if (!(c1 >= '@' && c1 <= '~'))
1984                 goto label_invalid_code;
1985               ONE_MORE_BYTE (c1);
1986               if (c1 != ISO_CODE_ESC)
1987                 goto label_invalid_code;
1988               ONE_MORE_BYTE (c1);
1989               goto label_escape_sequence;
1990
1991             case '$':           /* designation of 2-byte character set */
1992               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1993                 goto label_invalid_code;
1994               ONE_MORE_BYTE (c1);
1995               if (c1 >= '@' && c1 <= 'B')
1996                 {       /* designation of JISX0208.1978, GB2312.1980,
1997                            or JISX0208.1980 */
1998                   DECODE_DESIGNATION (0, 2, 94, c1);
1999                 }
2000               else if (c1 >= 0x28 && c1 <= 0x2B)
2001                 {       /* designation of DIMENSION2_CHARS94 character set */
2002                   ONE_MORE_BYTE (c2);
2003                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
2004                 }
2005               else if (c1 >= 0x2C && c1 <= 0x2F)
2006                 {       /* designation of DIMENSION2_CHARS96 character set */
2007                   ONE_MORE_BYTE (c2);
2008                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
2009                 }
2010               else
2011                 goto label_invalid_code;
2012               /* We must update these variables now.  */
2013               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2014               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2015               continue;
2016
2017             case 'n':           /* invocation of locking-shift-2 */
2018               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2019                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2020                 goto label_invalid_code;
2021               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
2022               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2023               continue;
2024
2025             case 'o':           /* invocation of locking-shift-3 */
2026               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
2027                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2028                 goto label_invalid_code;
2029               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2030               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2031               continue;
2032
2033             case 'N':           /* invocation of single-shift-2 */
2034               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2035                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2036                 goto label_invalid_code;
2037               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2038               ONE_MORE_BYTE (c1);
2039               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2040                 goto label_invalid_code;
2041               break;
2042
2043             case 'O':           /* invocation of single-shift-3 */
2044               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2045                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2046                 goto label_invalid_code;
2047               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2048               ONE_MORE_BYTE (c1);
2049               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2050                 goto label_invalid_code;
2051               break;
2052
2053             case '0': case '2': case '3': case '4': /* start composition */
2054               DECODE_COMPOSITION_START (c1);
2055               continue;
2056
2057             case '1':           /* end composition */
2058               DECODE_COMPOSITION_END (c1);
2059               continue;
2060
2061             case '[':           /* specification of direction */
2062               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2063                 goto label_invalid_code;
2064               /* For the moment, nested direction is not supported.
2065                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2066                  left-to-right, and nonzero means right-to-left.  */
2067               ONE_MORE_BYTE (c1);
2068               switch (c1)
2069                 {
2070                 case ']':       /* end of the current direction */
2071                   coding->mode &= ~CODING_MODE_DIRECTION;
2072
2073                 case '0':       /* end of the current direction */
2074                 case '1':       /* start of left-to-right direction */
2075                   ONE_MORE_BYTE (c1);
2076                   if (c1 == ']')
2077                     coding->mode &= ~CODING_MODE_DIRECTION;
2078                   else
2079                     goto label_invalid_code;
2080                   break;
2081
2082                 case '2':       /* start of right-to-left direction */
2083                   ONE_MORE_BYTE (c1);
2084                   if (c1 == ']')
2085                     coding->mode |= CODING_MODE_DIRECTION;
2086                   else
2087                     goto label_invalid_code;
2088                   break;
2089
2090                 default:
2091                   goto label_invalid_code;
2092                 }
2093               continue;
2094
2095             case '%':
2096               if (COMPOSING_P (coding))
2097                 DECODE_COMPOSITION_END ('1');
2098               ONE_MORE_BYTE (c1);
2099               if (c1 == '/')
2100                 {
2101                   /* CTEXT extended segment:
2102                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2103                      We keep these bytes as is for the moment.
2104                      They may be decoded by post-read-conversion.  */
2105                   int dim, M, L;
2106                   int size, required;
2107                   int produced_chars;
2108
2109                   ONE_MORE_BYTE (dim);
2110                   ONE_MORE_BYTE (M);
2111                   ONE_MORE_BYTE (L);
2112                   size = ((M - 128) * 128) + (L - 128);
2113                   required = 8 + size * 2;
2114                   if (dst + required > (dst_bytes ? dst_end : src))
2115                     goto label_end_of_loop;
2116                   *dst++ = ISO_CODE_ESC;
2117                   *dst++ = '%';
2118                   *dst++ = '/';
2119                   *dst++ = dim;
2120                   produced_chars = 4;
2121                   dst += CHAR_STRING (M, dst), produced_chars++;
2122                   dst += CHAR_STRING (L, dst), produced_chars++;
2123                   while (size-- > 0)
2124                     {
2125                       ONE_MORE_BYTE (c1);
2126                       dst += CHAR_STRING (c1, dst), produced_chars++;
2127                     }
2128                   coding->produced_char += produced_chars;
2129                 }
2130               else if (c1 == 'G')
2131                 {
2132                   unsigned char *d = dst;
2133                   int produced_chars;
2134
2135                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2136                      ESC % G --UTF-8-BYTES-- ESC % @
2137                      We keep these bytes as is for the moment.
2138                      They may be decoded by post-read-conversion.  */
2139                   if (d + 6 > (dst_bytes ? dst_end : src))
2140                     goto label_end_of_loop;
2141                   *d++ = ISO_CODE_ESC;
2142                   *d++ = '%';
2143                   *d++ = 'G';
2144                   produced_chars = 3;
2145                   while (d + 1 < (dst_bytes ? dst_end : src))
2146                     {
2147                       ONE_MORE_BYTE (c1);
2148                       if (c1 == ISO_CODE_ESC
2149                           && src + 1 < src_end
2150                           && src[0] == '%'
2151                           && src[1] == '@')
2152                         {
2153                           src += 2;
2154                           break;
2155                         }
2156                       d += CHAR_STRING (c1, d), produced_chars++;
2157                     }
2158                   if (d + 3 > (dst_bytes ? dst_end : src))
2159                     goto label_end_of_loop;
2160                   *d++ = ISO_CODE_ESC;
2161                   *d++ = '%';
2162                   *d++ = '@';
2163                   dst = d;
2164                   coding->produced_char += produced_chars + 3;
2165                 }
2166               else
2167                 goto label_invalid_code;
2168               continue;
2169
2170             default:
2171               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2172                 goto label_invalid_code;
2173               if (c1 >= 0x28 && c1 <= 0x2B)
2174                 {       /* designation of DIMENSION1_CHARS94 character set */
2175                   ONE_MORE_BYTE (c2);
2176                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2177                 }
2178               else if (c1 >= 0x2C && c1 <= 0x2F)
2179                 {       /* designation of DIMENSION1_CHARS96 character set */
2180                   ONE_MORE_BYTE (c2);
2181                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2182                 }
2183               else
2184                 goto label_invalid_code;
2185               /* We must update these variables now.  */
2186               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2187               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2188               continue;
2189             }
2190         }
2191
2192       /* Now we know CHARSET and 1st position code C1 of a character.
2193          Produce a multibyte sequence for that character while getting
2194          2nd position code C2 if necessary.  */
2195       if (CHARSET_DIMENSION (charset) == 2)
2196         {
2197           ONE_MORE_BYTE (c2);
2198           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2199             /* C2 is not in a valid range.  */
2200             goto label_invalid_code;
2201         }
2202       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2203       EMIT_CHAR (c);
2204       continue;
2205
2206     label_invalid_code:
2207       coding->errors++;
2208       if (COMPOSING_P (coding))
2209         DECODE_COMPOSITION_END ('1');
2210       src = src_base;
2211       c = *src++;
2212       if (! NILP (translation_table))
2213         c = translate_char (translation_table, c, 0, 0, 0);
2214       EMIT_CHAR (c);
2215     }
2216
2217  label_end_of_loop:
2218   coding->consumed = coding->consumed_char = src_base - source;
2219   coding->produced = dst - destination;
2220   return;
2221 }
2222
2223
2224 /* ISO2022 encoding stuff.  */
2225
2226 /*
2227    It is not enough to say just "ISO2022" on encoding, we have to
2228    specify more details.  In Emacs, each ISO2022 coding system
2229    variant has the following specifications:
2230         1. Initial designation to G0 through G3.
2231         2. Allows short-form designation?
2232         3. ASCII should be designated to G0 before control characters?
2233         4. ASCII should be designated to G0 at end of line?
2234         5. 7-bit environment or 8-bit environment?
2235         6. Use locking-shift?
2236         7. Use Single-shift?
2237    And the following two are only for Japanese:
2238         8. Use ASCII in place of JIS0201-1976-Roman?
2239         9. Use JISX0208-1983 in place of JISX0208-1978?
2240    These specifications are encoded in `coding->flags' as flag bits
2241    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2242    details.
2243 */
2244
/* Write at DST, advancing DST past it, the escape sequence that
   designates CHARSET to graphic register REG, then record CHARSET as
   the current designation of REG in CODING.

   The sequence is laid out as follows:
     - when the revision number registered for CHARSET is below 255,
       a leading `ESC & <'@' + revision>';
     - ESC;
     - `$' for a charset whose dimension is not 1;
     - an intermediate character selected by REG, taken from "()*+"
       for a 94-character set and from ",-./" otherwise;
     - the <final-char> of CHARSET.

   The intermediate character is left out (short-form designation)
   only for a multi-dimension 94-character set designated to register
   0 whose <final-char> is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    unsigned char final_char = CHARSET_ISO_FINAL_CHAR (charset);        \
    int revision = CODING_SPEC_ISO_REVISION_NUMBER(coding, charset);    \
                                                                        \
    /* Optional revision announcement.  */                              \
    if (revision < 255)                                                 \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + revision;                                        \
        dst += 3;                                                       \
      }                                                                 \
                                                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
                                                                        \
    /* Intermediate character, unless the short form applies.  */       \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) (intermediate_char_96[reg]);             \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_char < '@' || final_char > 'B')                   \
      *dst++ = (unsigned char) (intermediate_char_94[reg]);             \
                                                                        \
    *dst++ = final_char;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2287
/* The next two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3) at DST and advance DST.  When
   CODING_FLAG_ISO_SEVEN_BITS is set in `coding->flags' the two-byte
   escape sequence (ESC N or ESC O) is produced; otherwise the
   one-byte control code (ISO_CODE_SS2 or ISO_CODE_SS3) is produced.
   In both cases the single-shifting state of CODING is set to 1
   afterwards.  */

#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)

#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2309
/* The next four macros emit the ISO2022 locking-shift functions at
   DST (advancing DST) and record in CODING which graphic register is
   thereby invoked to graphic plane 0:

     ENCODE_SHIFT_IN          ISO_CODE_SI   register 0
     ENCODE_SHIFT_OUT         ISO_CODE_SO   register 1
     ENCODE_LOCKING_SHIFT_2   ESC n         register 2
     ENCODE_LOCKING_SHIFT_3   ESC o         register 3  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    *dst++ = ISO_CODE_SI;                               \
  } while (0)

#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    *dst++ = ISO_CODE_SO;                               \
  } while (0)

#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'n';                                       \
  } while (0)

#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
    *dst++ = ISO_CODE_ESC;                              \
    *dst++ = 'o';                                       \
  } while (0)
2337
/* Emit at DST the single byte for a DIMENSION1 character of CHARSET
   whose position-code is C1, first producing any designation and
   invocation sequences that are still needed.

   The macro loops until the byte can be written:
     - right after a single shift, C1 is written with the eighth bit
       cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set (otherwise),
       and the single-shifting state is cleared;
     - when CHARSET is the charset of graphic plane 0, C1 is written
       with the eighth bit cleared;
     - when CHARSET is the charset of graphic plane 1, C1 is written
       with the eighth bit set;
     - otherwise encode_invocation_designation is called and the
       checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                      \
        && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)         \
        && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))        \
      {                                                                 \
        /* CHARSET is not reachable yet: designate and/or invoke it,    \
           then go around again to emit the character itself.  */       \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        *dst++ = ((coding->flags & CODING_FLAG_ISO_SEVEN_BITS)          \
                  ? c1 & 0x7F : c1 | 0x80);                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      *dst++ = c1 | 0x80;                                               \
    break;                                                              \
  } while (1)
2370
/* Emit at DST the two bytes for a DIMENSION2 character of CHARSET
   whose position-codes are C1 and C2, first producing any designation
   and invocation codes that are still needed.

   The macro loops until the bytes can be written:
     - right after a single shift, both codes are written with the
       eighth bit cleared (CODING_FLAG_ISO_SEVEN_BITS set) or set
       (otherwise), and the single-shifting state is cleared;
     - when CHARSET is the charset of graphic plane 0, both codes are
       written with the eighth bit cleared;
     - when CHARSET is the charset of graphic plane 1, both codes are
       written with the eighth bit set;
     - otherwise encode_invocation_designation is called and the
       checks are repeated.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    if (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                      \
        && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)         \
        && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))        \
      {                                                                 \
        /* CHARSET is not reachable yet: designate and/or invoke it,    \
           then go around again to emit the character itself.  */       \
        dst = encode_invocation_designation (charset, coding, dst);     \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
    break;                                                              \
  } while (1)
2403
/* Encode character C at DST, advancing DST.  C is first split into
   its charset and position codes with SPLIT_CHAR.

   - A character of an undefined charset is passed through: its first
     code is written as is, followed by the second code when that one
     is non-negative.
   - A character of a one-dimensional charset goes through
     ENCODE_ISO_CHARACTER_DIMENSION1; ASCII is first replaced by
     charset_latin_jisx0201 when CODING_FLAG_ISO_USE_ROMAN is set.
   - Any other character goes through ENCODE_ISO_CHARACTER_DIMENSION2;
     charset_jisx0208 is first replaced by charset_jisx0208_1978 when
     CODING_FLAG_ISO_USE_OLDJIS is set.  */

#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
  } while (0)
2433
2434
/* Encode CODING_REPLACEMENT_CHARACTER in place of character C: twice
   when the charset of C has a width greater than 1, once otherwise.  */

#define ENCODE_UNSAFE_CHARACTER(c)                                      \
  do {                                                                  \
    int unsafe_repeat_ = CHARSET_WIDTH (CHAR_CHARSET (c)) > 1 ? 2 : 1;  \
                                                                        \
    while (unsafe_repeat_-- > 0)                                        \
      ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);              \
  } while (0)
2443
2444
2445 /* Produce designation and invocation codes at a place pointed by DST
2446    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2447    Return new DST.  */
2448
2449 unsigned char *
2450 encode_invocation_designation (charset, coding, dst)
2451      int charset;
2452      struct coding_system *coding;
2453      unsigned char *dst;
2454 {
2455   int reg;                      /* graphic register number */
2456
2457   /* At first, check designations.  */
2458   for (reg = 0; reg < 4; reg++)
2459     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2460       break;
2461
2462   if (reg >= 4)
2463     {
2464       /* CHARSET is not yet designated to any graphic registers.  */
2465       /* At first check the requested designation.  */
2466       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2467       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2468         /* Since CHARSET requests no special designation, designate it
2469            to graphic register 0.  */
2470         reg = 0;
2471
2472       ENCODE_DESIGNATION (charset, reg, coding);
2473     }
2474
2475   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2476       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2477     {
2478       /* Since the graphic register REG is not invoked to any graphic
2479          planes, invoke it to graphic plane 0.  */
2480       switch (reg)
2481         {
2482         case 0:                 /* graphic register 0 */
2483           ENCODE_SHIFT_IN;
2484           break;
2485
2486         case 1:                 /* graphic register 1 */
2487           ENCODE_SHIFT_OUT;
2488           break;
2489
2490         case 2:                 /* graphic register 2 */
2491           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2492             ENCODE_SINGLE_SHIFT_2;
2493           else
2494             ENCODE_LOCKING_SHIFT_2;
2495           break;
2496
2497         case 3:                 /* graphic register 3 */
2498           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2499             ENCODE_SINGLE_SHIFT_3;
2500           else
2501             ENCODE_LOCKING_SHIFT_3;
2502           break;
2503         }
2504     }
2505
2506   return dst;
2507 }
2508
/* Emit the 2-byte code for the encoded composition rule RULE at DST
   and advance DST by two.  RULE is decoded with
   COMPOSITION_DECODE_RULE into GREF and NREF; the first byte is
   32 + 81 + GREF and the second is 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
2518
/* Emit at DST the code announcing the start of a composition
   sequence.  DATA points to an array of integers describing the
   composition (see the comment in coding.h for its format); DATA[3]
   is stored as the composition method in `coding->composing'.

   For COMPOSITION_RELATIVE the code is ESC 0.  For any other method
   it is ESC 3 (COMPOSITION_WITH_ALTCHARS) or ESC 4 (otherwise), and
   in addition `coding->cmp_data_index' is set to
   `coding->cmp_data_start + 4' and `coding->composition_rule_follows'
   is cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2538
/* Emit ESC 1 at DST to mark the end of the current composition, and
   step the composition bookkeeping of CODING past it:
   `coding->cmp_data_start' is advanced by DATA[0] and
   `coding->composing' becomes COMPOSITION_NO.  If that advance
   reaches the used size of the current `coding->cmp_data' and a next
   one is chained to it, CODING switches to the next one and restarts
   at offset 0.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2554
/* Emit the composition start sequence ESC 0 at DST and switch
   `coding->composing' to COMPOSITION_RELATIVE.  This does not open a
   new composition: it is used once the components of a composition
   (alternate chars and composition rules) have been produced, to
   introduce the actual text that follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2566
/* The following three macros produce codes for indicating direction
   of text.  Each one writes at DST and advances DST; each is a
   complete statement and is to be used as `MACRO;'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER emits the control sequence
   introducer: the two bytes ESC [ when CODING_FLAG_ISO_SEVEN_BITS is
   set in `coding->flags', the single byte ISO_CODE_CSI otherwise.
   The flag is tested with `&', like everywhere else in this file, so
   that the result does not depend on which other flags are set.

   ENCODE_DIRECTION_R2L emits the introducer followed by `2 ]', and
   ENCODE_DIRECTION_L2R emits the introducer followed by `0 ]'.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)

#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)

#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2582
/* Emit at DST the invocation and designation codes that bring the
   graphic planes and registers back to their initial state: a
   shift-in when graphic plane 0 does not currently invoke register 0,
   then, for each of the four registers that has an initial
   designation (a non-negative value) differing from its current one,
   the designation of that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
                                                                            \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        /* No initial designation for this register.  */                    \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)          \
          continue;                                                         \
        /* Already in its initial state.  */                                \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
2597
2598 /* Produce designation sequences of charsets in the line started from
2599    SRC to a place pointed by DST, and return updated DST.
2600
2601    If the current block ends before any end-of-line, we may fail to
2602    find all the necessary designations.  */
2603
2604 static unsigned char *
2605 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2606      struct coding_system *coding;
2607      Lisp_Object translation_table;
2608      const unsigned char *src, *src_end;
2609      unsigned char *dst;
2610 {
2611   int charset, c, found = 0, reg;
2612   /* Table of charsets to be designated to each graphic register.  */
2613   int r[4];
2614
2615   for (reg = 0; reg < 4; reg++)
2616     r[reg] = -1;
2617
2618   while (found < 4)
2619     {
2620       ONE_MORE_CHAR (c);
2621       if (c == '\n')
2622         break;
2623
2624       charset = CHAR_CHARSET (c);
2625       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2626       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2627         {
2628           found++;
2629           r[reg] = charset;
2630         }
2631     }
2632
2633  label_end_of_loop:
2634   if (found)
2635     {
2636       for (reg = 0; reg < 4; reg++)
2637         if (r[reg] >= 0
2638             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2639           ENCODE_DESIGNATION (r[reg], reg, coding);
2640     }
2641
2642   return dst;
2643 }
2644
2645 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2646
2647 static void
2648 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2649      struct coding_system *coding;
2650      const unsigned char *source;
2651      unsigned char *destination;
2652      int src_bytes, dst_bytes;
2653 {
2654   const unsigned char *src = source;
2655   const unsigned char *src_end = source + src_bytes;
2656   unsigned char *dst = destination;
2657   unsigned char *dst_end = destination + dst_bytes;
2658   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2659      from DST_END to assure overflow checking is necessary only at the
2660      head of loop.  */
2661   unsigned char *adjusted_dst_end = dst_end - 19;
2662   /* SRC_BASE remembers the start position in source in each loop.
2663      The loop will be exited when there's not enough source text to
2664      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2665      there's not enough destination area to produce encoded codes
2666      (within macro EMIT_BYTES).  */
2667   const unsigned char *src_base;
2668   int c;
2669   Lisp_Object translation_table;
2670   Lisp_Object safe_chars;
2671
2672   if (coding->flags & CODING_FLAG_ISO_SAFE)
2673     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2674
2675   safe_chars = coding_safe_chars (coding->symbol);
2676
2677   if (NILP (Venable_character_translation))
2678     translation_table = Qnil;
2679   else
2680     {
2681       translation_table = coding->translation_table_for_encode;
2682       if (NILP (translation_table))
2683         translation_table = Vstandard_translation_table_for_encode;
2684     }
2685
2686   coding->consumed_char = 0;
2687   coding->errors = 0;
2688   while (1)
2689     {
2690       src_base = src;
2691
2692       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2693         {
2694           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2695           break;
2696         }
2697
2698       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2699           && CODING_SPEC_ISO_BOL (coding))
2700         {
2701           /* We have to produce designation sequences if any now.  */
2702           dst = encode_designation_at_bol (coding, translation_table,
2703                                            src, src_end, dst);
2704           CODING_SPEC_ISO_BOL (coding) = 0;
2705         }
2706
2707       /* Check composition start and end.  */
2708       if (coding->composing != COMPOSITION_DISABLED
2709           && coding->cmp_data_start < coding->cmp_data->used)
2710         {
2711           struct composition_data *cmp_data = coding->cmp_data;
2712           int *data = cmp_data->data + coding->cmp_data_start;
2713           int this_pos = cmp_data->char_offset + coding->consumed_char;
2714
2715           if (coding->composing == COMPOSITION_RELATIVE)
2716             {
2717               if (this_pos == data[2])
2718                 {
2719                   ENCODE_COMPOSITION_END (coding, data);
2720                   cmp_data = coding->cmp_data;
2721                   data = cmp_data->data + coding->cmp_data_start;
2722                 }
2723             }
2724           else if (COMPOSING_P (coding))
2725             {
2726               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2727               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2728                 /* We have consumed components of the composition.
2729                    What follows in SRC is the composition's base
2730                    text.  */
2731                 ENCODE_COMPOSITION_FAKE_START (coding);
2732               else
2733                 {
2734                   int c = cmp_data->data[coding->cmp_data_index++];
2735                   if (coding->composition_rule_follows)
2736                     {
2737                       ENCODE_COMPOSITION_RULE (c);
2738                       coding->composition_rule_follows = 0;
2739                     }
2740                   else
2741                     {
2742                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2743                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2744                         ENCODE_UNSAFE_CHARACTER (c);
2745                       else
2746                         ENCODE_ISO_CHARACTER (c);
2747                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2748                         coding->composition_rule_follows = 1;
2749                     }
2750                   continue;
2751                 }
2752             }
2753           if (!COMPOSING_P (coding))
2754             {
2755               if (this_pos == data[1])
2756                 {
2757                   ENCODE_COMPOSITION_START (coding, data);
2758                   continue;
2759                 }
2760             }
2761         }
2762
2763       ONE_MORE_CHAR (c);
2764
2765       /* Now encode the character C.  */
2766       if (c < 0x20 || c == 0x7F)
2767         {
2768           if (c == '\r')
2769             {
2770               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2771                 {
2772                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2773                     ENCODE_RESET_PLANE_AND_REGISTER;
2774                   *dst++ = c;
2775                   continue;
2776                 }
2777               /* fall down to treat '\r' as '\n' ...  */
2778               c = '\n';
2779             }
2780           if (c == '\n')
2781             {
2782               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2783                 ENCODE_RESET_PLANE_AND_REGISTER;
2784               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2785                 bcopy (coding->spec.iso2022.initial_designation,
2786                        coding->spec.iso2022.current_designation,
2787                        sizeof coding->spec.iso2022.initial_designation);
2788               if (coding->eol_type == CODING_EOL_LF
2789                   || coding->eol_type == CODING_EOL_UNDECIDED)
2790                 *dst++ = ISO_CODE_LF;
2791               else if (coding->eol_type == CODING_EOL_CRLF)
2792                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2793               else
2794                 *dst++ = ISO_CODE_CR;
2795               CODING_SPEC_ISO_BOL (coding) = 1;
2796             }
2797           else
2798             {
2799               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2800                 ENCODE_RESET_PLANE_AND_REGISTER;
2801               *dst++ = c;
2802             }
2803         }
2804       else if (ASCII_BYTE_P (c))
2805         ENCODE_ISO_CHARACTER (c);
2806       else if (SINGLE_BYTE_CHAR_P (c))
2807         {
2808           *dst++ = c;
2809           coding->errors++;
2810         }
2811       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2812                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2813         ENCODE_UNSAFE_CHARACTER (c);
2814       else
2815         ENCODE_ISO_CHARACTER (c);
2816
2817       coding->consumed_char++;
2818     }
2819
2820  label_end_of_loop:
2821   coding->consumed = src_base - source;
2822   coding->produced = coding->produced_char = dst - destination;
2823 }
2824
2825 \f
2826 /*** 4. SJIS and BIG5 handlers ***/
2827
2828 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2829    quite widely.  So, for the moment, Emacs supports them in the bare
2830    C code.  But, in the future, they may be supported only by CCL.  */
2831
2832 /* SJIS is a coding system encoding three character sets: ASCII, right
2833    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2834    as is.  A character of charset katakana-jisx0201 is encoded by
2835    "position-code + 0x80".  A character of charset japanese-jisx0208
2836    is encoded in 2-byte but two position-codes are divided and shifted
2837    so that it fits in the range below.
2838
2839    --- CODE RANGE of SJIS ---
2840    (character set)      (range)
2841    ASCII                0x00 .. 0x7F
2842    KATAKANA-JISX0201    0xA1 .. 0xDF
2843    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2844             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2845    -------------------------------
2846
2847 */
2848
2849 /* BIG5 is a coding system encoding two character sets: ASCII and
2850    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2851    character set and is encoded in two bytes.
2852
2853    --- CODE RANGE of BIG5 ---
2854    (character set)      (range)
2855    ASCII                0x00 .. 0x7F
2856    Big5 (1st byte)      0xA1 .. 0xFE
2857         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2858    --------------------------
2859
2860    Since the number of characters in Big5 is larger than maximum
2861    characters in Emacs' charset (96x96), it can't be handled as one
2862    charset.  So, in Emacs, Big5 is divided into two: `charset-big5-1'
2863    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2864    contains frequently used characters and the latter contains less
2865    frequently used characters.  */
2866
/* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
   are the 1st and 2nd position-codes of Big5 in BIG5 coding system.
   C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
   format.  CHARSET is `charset_big5_1' or `charset_big5_2'.

   Every macro argument is parenthesized in the expansion, so an
   argument may be an arbitrary expression.  DECODE_BIG5 reads B1 and
   B2 exactly once, before it stores to CHARSET, C1 and C2;
   ENCODE_BIG5 reads CHARSET, C1 and C2 exactly once, before it stores
   to B1 and B2.  Hence the same variables may be passed as both input
   and output.  The output arguments must be modifiable lvalues.  */

/* Number of Big5 characters which have the same code in 1st byte.  */
#define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)

#define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
  do {                                                                  \
    int big5_b1_ = (b1), big5_b2_ = (b2);                               \
    /* Linear index of the code point within the whole Big5 table.  */  \
    unsigned int temp                                                   \
      = ((big5_b1_ - 0xA1) * BIG5_SAME_ROW                              \
         + big5_b2_ - (big5_b2_ < 0x7F ? 0x40 : 0x62));                 \
    if (big5_b1_ < 0xC9)                                                \
      (charset) = charset_big5_1;                                       \
    else                                                                \
      {                                                                 \
        (charset) = charset_big5_2;                                     \
        /* Make the index relative to the start of the 2nd charset.  */ \
        temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
      }                                                                 \
    (c1) = temp / (0xFF - 0xA1) + 0x21;                                 \
    (c2) = temp % (0xFF - 0xA1) + 0x21;                                 \
  } while (0)

#define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
  do {                                                                  \
    unsigned int temp                                                   \
      = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);                  \
    if ((charset) == charset_big5_2)                                    \
      temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
    (b1) = temp / BIG5_SAME_ROW + 0xA1;                                 \
    (b2) = temp % BIG5_SAME_ROW;                                        \
    /* Skip the gap between the two 2nd-byte ranges.  */                \
    (b2) += (b2) < 0x3F ? 0x40 : 0x62;                                  \
  } while (0)
2899
2900 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2901    Check if a text is encoded in SJIS.  If no byte falls outside the
2902    SJIS code ranges, return CODING_CATEGORY_MASK_SJIS, else return 0.  */
2903
2904 static int
2905 detect_coding_sjis (src, src_end, multibytep)
2906      unsigned char *src, *src_end;
2907      int multibytep;
2908 {
2909   int c;
2910   /* Dummy for ONE_MORE_BYTE.  */
2911   struct coding_system dummy_coding;
2912   struct coding_system *coding = &dummy_coding;
2913
2914   while (1)
2915     {
2916       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2917       if (c < 0x80)             /* ASCII range: nothing more to check.  */
2918         continue;
2919       if (c == 0x80 || c == 0xA0 || c > 0xEF)   /* Outside every SJIS range.  */
2920         return 0;
2921       if (c <= 0x9F || c >= 0xE0)       /* 1st byte of a JISX0208 pair.  */
2922         {
2923           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2924           if (c < 0x40 || c == 0x7F || c > 0xFC)    /* Invalid 2nd byte.  */
2925             return 0;
2926         }                       /* Else 0xA1..0xDF: a one-byte code.  */
2927     }
2928  label_end_of_loop:      /* Presumably entered from the fetch macro when SRC runs out (macro not shown here; confirm).  */
2929   return CODING_CATEGORY_MASK_SJIS;
2930 }
2931
2932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2933    Check if a text is encoded in BIG5.  If it is, return
2934    CODING_CATEGORY_MASK_BIG5, else return 0.  */
2935
2936 static int
2937 detect_coding_big5 (src, src_end, multibytep)
2938      unsigned char *src, *src_end;
2939      int multibytep;
2940 {
2941   int c;
2942   /* Dummy for ONE_MORE_BYTE.  */
2943   struct coding_system dummy_coding;
2944   struct coding_system *coding = &dummy_coding;
2945
2946   while (1)
2947     {
2948       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2949       if (c < 0x80)
2950         continue;
2951       if (c < 0xA1 || c > 0xFE)
2952         return 0;
2953       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2954       if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE)
2955         return 0;
2956     }
2957  label_end_of_loop:
2958   return CODING_CATEGORY_MASK_BIG5;
2959 }
2960
2961 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2962    Check if a text is encoded in UTF-8, judging only by the pattern of
2963    leading and extra octets.  Return CODING_CATEGORY_MASK_UTF_8 or 0.  */
2964
2965 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)                /* 0xxxxxxx */
2966 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)      /* 10xxxxxx */
2967 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)      /* 110xxxxx */
2968 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)      /* 1110xxxx */
2969 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)      /* 11110xxx */
2970 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)      /* 111110xx */
2971 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC)      /* 1111110x */
2972
2973 static int
2974 detect_coding_utf_8 (src, src_end, multibytep)
2975      unsigned char *src, *src_end;
2976      int multibytep;
2977 {
2978   unsigned char c;
2979   int seq_maybe_bytes;          /* Extra octets still expected after the leading octet.  */
2980   /* Dummy for ONE_MORE_BYTE.  */
2981   struct coding_system dummy_coding;
2982   struct coding_system *coding = &dummy_coding;
2983
2984   while (1)
2985     {
2986       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2987       if (UTF_8_1_OCTET_P (c))
2988         continue;
2989       else if (UTF_8_2_OCTET_LEADING_P (c))
2990         seq_maybe_bytes = 1;
2991       else if (UTF_8_3_OCTET_LEADING_P (c))
2992         seq_maybe_bytes = 2;
2993       else if (UTF_8_4_OCTET_LEADING_P (c))
2994         seq_maybe_bytes = 3;
2995       else if (UTF_8_5_OCTET_LEADING_P (c))
2996         seq_maybe_bytes = 4;
2997       else if (UTF_8_6_OCTET_LEADING_P (c))
2998         seq_maybe_bytes = 5;
2999       else                      /* A stray extra octet, or 0xFE/0xFF.  */
3000         return 0;
3001
3002       do                        /* Each following octet must be 10xxxxxx.  */
3003         {
3004           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3005           if (!UTF_8_EXTRA_OCTET_P (c))
3006             return 0;
3007           seq_maybe_bytes--;
3008         }
3009       while (seq_maybe_bytes > 0);
3010     }
3011
3012  label_end_of_loop:      /* Presumably entered from the fetch macro when SRC runs out (macro not shown here; confirm).  */
3013   return CODING_CATEGORY_MASK_UTF_8;
3014 }
3015
3016 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3017    Check if a text is encoded in UTF-16 by looking at its first two
3018    bytes: a byte order mark 0xFE 0xFF means Big Endian, 0xFF 0xFE means
3019    Little Endian.  Return CODING_CATEGORY_MASK_UTF_16_BE or
3020    CODING_CATEGORY_MASK_UTF_16_LE accordingly, else return 0.  */
3021
/* Nonzero if VAL is one of the 16-bit values U+FFFE or U+FFFF.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF))

/* Nonzero if the 16-bit code unit VAL is a high (leading) surrogate,
   i.e. lies in 0xD800 .. 0xDBFF.  The top six bits select the range,
   hence the mask 0xFC00.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low (trailing) surrogate,
   i.e. lies in 0xDC00 .. 0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)
3031
3032 static int
3033 detect_coding_utf_16 (src, src_end, multibytep)
3034      unsigned char *src, *src_end;
3035      int multibytep;
3036 {                               /* Only the first two bytes (a possible byte order mark) are examined.  */
3037   unsigned char c1, c2;
3038   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3039   struct coding_system dummy_coding;
3040   struct coding_system *coding = &dummy_coding;
3041
3042   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3043   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3044
3045   if ((c1 == 0xFF) && (c2 == 0xFE))             /* BOM stored low byte first.  */
3046     return CODING_CATEGORY_MASK_UTF_16_LE;
3047   else if ((c1 == 0xFE) && (c2 == 0xFF))        /* BOM stored high byte first.  */
3048     return CODING_CATEGORY_MASK_UTF_16_BE;
3049
3050  label_end_of_loop:      /* Also reached by falling through when no BOM matched; presumably the fetch macro jumps here on short input (confirm).  */
3051   return 0;
3052 }
3053
3054 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3055    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3056
3057 static void
3058 decode_coding_sjis_big5 (coding, source, destination,
3059                          src_bytes, dst_bytes, sjis_p)
3060      struct coding_system *coding;
3061      const unsigned char *source;
3062      unsigned char  *destination;
3063      int src_bytes, dst_bytes;
3064      int sjis_p;
3065 {
3066   const unsigned char *src = source;
3067   const unsigned char *src_end = source + src_bytes;
3068   unsigned char *dst = destination;
3069   unsigned char *dst_end = destination + dst_bytes;
3070   /* SRC_BASE remembers the start position in source in each loop.
3071      The loop will be exited when there's not enough source code
3072      (within macro ONE_MORE_BYTE), or when there's not enough
3073      destination area to produce a character (within macro
3074      EMIT_CHAR).  */
3075   const unsigned char *src_base;
3076   Lisp_Object translation_table;
3077
3078   if (NILP (Venable_character_translation))
3079     translation_table = Qnil;
3080   else
3081     {
3082       translation_table = coding->translation_table_for_decode;
3083       if (NILP (translation_table))
3084         translation_table = Vstandard_translation_table_for_decode;
3085     }
3086
3087   coding->produced_char = 0;
3088   while (1)
3089     {
3090       int c, charset, c1, c2 = 0;
3091
3092       src_base = src;
3093       ONE_MORE_BYTE (c1);
3094
3095       if (c1 < 0x80)
3096         {
3097           charset = CHARSET_ASCII;
3098           if (c1 < 0x20)
3099             {
3100               if (c1 == '\r')
3101                 {
3102                   if (coding->eol_type == CODING_EOL_CRLF)
3103                     {
3104                       ONE_MORE_BYTE (c2);
3105                       if (c2 == '\n')
3106                         c1 = c2;
3107                       else
3108                         /* To process C2 again, SRC is subtracted by 1.  */
3109                         src--;
3110                     }
3111                   else if (coding->eol_type == CODING_EOL_CR)
3112                     c1 = '\n';
3113                 }
3114               else if (c1 == '\n'
3115                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3116                        && (coding->eol_type == CODING_EOL_CR
3117                            || coding->eol_type == CODING_EOL_CRLF))
3118                 {
3119                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3120                   goto label_end_of_loop;
3121                 }
3122             }
3123         }
3124       else
3125         {
3126           if (sjis_p)
3127             {
3128               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3129                 goto label_invalid_code;
3130               if (c1 <= 0x9F || c1 >= 0xE0)
3131                 {
3132                   /* SJIS -> JISX0208 */
3133                   ONE_MORE_BYTE (c2);
3134                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3135                     goto label_invalid_code;
3136                   DECODE_SJIS (c1, c2, c1, c2);
3137                   charset = charset_jisx0208;
3138                 }
3139               else
3140                 /* SJIS -> JISX0201-Kana */
3141                 charset = charset_katakana_jisx0201;
3142             }
3143           else
3144             {
3145               /* BIG5 -> Big5 */
3146               if (c1 < 0xA0 || c1 > 0xFE)
3147                 goto label_invalid_code;
3148               ONE_MORE_BYTE (c2);
3149               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3150                 goto label_invalid_code;
3151               DECODE_BIG5 (c1, c2, charset, c1, c2);
3152             }
3153         }
3154
3155       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3156       EMIT_CHAR (c);
3157       continue;
3158
3159     label_invalid_code:
3160       coding->errors++;
3161       src = src_base;
3162       c = *src++;
3163       EMIT_CHAR (c);
3164     }
3165
3166  label_end_of_loop:
3167   coding->consumed = coding->consumed_char = src_base - source;
3168   coding->produced = dst - destination;
3169   return;
3170 }
3171
3172 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3173    This function can encode charsets `ascii', `katakana-jisx0201',
3174    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We are
3175    sure that all these charsets are registered as official charset (i.e.
3176    do not have extended leading-codes).  Characters of other charsets are
3177    produced as is, or replaced if CODING_MODE_INHIBIT_UNENCODABLE_CHAR is
3178    set.  If SJIS_P is nonzero, encode SJIS text, else encode BIG5 text.  */
3179
3180 static void
3181 encode_coding_sjis_big5 (coding, source, destination,
3182                          src_bytes, dst_bytes, sjis_p)
3183      struct coding_system *coding;
3184      unsigned char *source, *destination;
3185      int src_bytes, dst_bytes;
3186      int sjis_p;
3187 {
3188   unsigned char *src = source;
3189   unsigned char *src_end = source + src_bytes;
3190   unsigned char *dst = destination;
3191   unsigned char *dst_end = destination + dst_bytes;
3192   /* SRC_BASE remembers the start position in source in each loop.
3193      The loop will be exited when there's not enough source text to
3194      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3195      there's not enough destination area to produce encoded codes
3196      (within macro EMIT_BYTES).  */
3197   unsigned char *src_base;
3198   Lisp_Object translation_table;
3199
3200   if (NILP (Venable_character_translation))
3201     translation_table = Qnil;
3202   else
3203     {
3204       translation_table = coding->translation_table_for_encode;
3205       if (NILP (translation_table))
3206         translation_table = Vstandard_translation_table_for_encode;
3207     }
3208
3209   while (1)
3210     {
3211       int c, charset, c1, c2;
3212
3213       src_base = src;
3214       ONE_MORE_CHAR (c);
3215
3216       /* Now encode the character C.  */
3217       if (SINGLE_BYTE_CHAR_P (c))
3218         {
3219           switch (c)
3220             {
3221             case '\r':
3222               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3223                 {
3224                   EMIT_ONE_BYTE (c);
3225                   break;
3226                 }
3227               c = '\n';         /* Selective display: treat '\r' as '\n' and fall through.  */
3228             case '\n':
3229               if (coding->eol_type == CODING_EOL_CRLF)
3230                 {
3231                   EMIT_TWO_BYTES ('\r', c);
3232                   break;
3233                 }
3234               else if (coding->eol_type == CODING_EOL_CR)
3235                 c = '\r';       /* Fall through to emit the single EOL byte.  */
3236             default:
3237               EMIT_ONE_BYTE (c);
3238             }
3239         }
3240       else
3241         {
3242           SPLIT_CHAR (c, charset, c1, c2);
3243           if (sjis_p)
3244             {
3245               if (charset == charset_jisx0208
3246                   || charset == charset_jisx0208_1978)
3247                 {
3248                   ENCODE_SJIS (c1, c2, c1, c2);
3249                   EMIT_TWO_BYTES (c1, c2);
3250                 }
3251               else if (charset == charset_katakana_jisx0201)
3252                 EMIT_ONE_BYTE (c1 | 0x80);      /* "position-code + 0x80".  */
3253               else if (charset == charset_latin_jisx0201)
3254                 EMIT_ONE_BYTE (c1);
3255               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3256                 {
3257                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3258                   if (CHARSET_WIDTH (charset) > 1)      /* A second replacement byte when the width exceeds 1.  */
3259                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3260                 }
3261               else
3262                 /* There's no way other than producing the internal
3263                    codes as is.  */
3264                 EMIT_BYTES (src_base, src);
3265             }
3266           else
3267             {
3268               if (charset == charset_big5_1 || charset == charset_big5_2)
3269                 {
3270                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3271                   EMIT_TWO_BYTES (c1, c2);
3272                 }
3273               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3274                 {
3275                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3276                   if (CHARSET_WIDTH (charset) > 1)      /* A second replacement byte when the width exceeds 1.  */
3277                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3278                 }
3279               else
3280                 /* There's no way other than producing the internal
3281                    codes as is.  */
3282                 EMIT_BYTES (src_base, src);
3283             }
3284         }
3285       coding->consumed_char++;  /* Counted only after the character was fully emitted.  */
3286     }
3287
3288  label_end_of_loop:
3289   coding->consumed = src_base - source;         /* Up to the start of the character last fetched.  */
3290   coding->produced = coding->produced_char = dst - destination;
3291 }
3292
3293 \f
3294 /*** 5. CCL handlers ***/
3295
3296 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3297    Check if a text is encoded in a coding system whose encoder/decoder
3298    is written as a CCL program.  A byte whose entry in that coding
3299    system's valid_codes table is zero makes this return 0; otherwise return CODING_CATEGORY_MASK_CCL.  */
3300
3301 static int
3302 detect_coding_ccl (src, src_end, multibytep)
3303      unsigned char *src, *src_end;
3304      int multibytep;
3305 {
3306   unsigned char *valid;         /* Table indexed by byte value; nonzero means acceptable.  */
3307   int c;
3308   /* Dummy for ONE_MORE_BYTE.  */
3309   struct coding_system dummy_coding;
3310   struct coding_system *coding = &dummy_coding;
3311
3312   /* No coding system is assigned to coding-category-ccl.  */
3313   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3314     return 0;
3315
3316   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3317   while (1)
3318     {
3319       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3320       if (! valid[c])           /* NOTE(review): assumes the macro leaves C within the table's bounds -- confirm.  */
3321         return 0;
3322     }
3323  label_end_of_loop:      /* Presumably entered from the fetch macro when SRC runs out (macro not shown here; confirm).  */
3324   return CODING_CATEGORY_MASK_CCL;
3325 }
3326
3327 \f
3328 /*** 6. End-of-line handlers ***/
3329
3330 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3331    Decode the end-of-line format selected by CODING->eol_type: CRLF or CR becomes LF.  */
3332 static void
3333 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3334      struct coding_system *coding;
3335      const unsigned char *source;
3336      unsigned char *destination;
3337      int src_bytes, dst_bytes;
3338 {
3339   const unsigned char *src = source;
3340   unsigned char *dst = destination;
3341   const unsigned char *src_end = src + src_bytes;
3342   unsigned char *dst_end = dst + dst_bytes;
3343   Lisp_Object translation_table;
3344   /* SRC_BASE remembers the start position in source in each loop.
3345      The loop will be exited when there's not enough source code
3346      (within macro ONE_MORE_BYTE), or when there's not enough
3347      destination area to produce a character (within macro
3348      EMIT_CHAR).  */
3349   const unsigned char *src_base;
3350   int c;
3351
3352   translation_table = Qnil;     /* Not read by any visible statement; presumably referenced inside a macro (confirm).  */
3353   switch (coding->eol_type)
3354     {
3355     case CODING_EOL_CRLF:
3356       while (1)
3357         {
3358           src_base = src;
3359           ONE_MORE_BYTE (c);
3360           if (c == '\r')
3361             {
3362               ONE_MORE_BYTE (c);                /* Look at the byte after the CR.  */
3363               if (c != '\n')
3364                 {
3365                   src--;                        /* Lone CR: unread that byte and emit the CR itself.  */
3366                   c = '\r';
3367                 }
3368             }
3369           else if (c == '\n'                    /* LF without a preceding CR.  */
3370                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3371             {
3372               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3373               goto label_end_of_loop;
3374             }
3375           EMIT_CHAR (c);
3376         }
3377       break;
3378
3379     case CODING_EOL_CR:
3380       while (1)
3381         {
3382           src_base = src;
3383           ONE_MORE_BYTE (c);
3384           if (c == '\n')                        /* LF is unexpected in CR-terminated text.  */
3385             {
3386               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3387                 {
3388                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3389                   goto label_end_of_loop;
3390                 }
3391             }
3392           else if (c == '\r')
3393             c = '\n';
3394           EMIT_CHAR (c);
3395         }
3396       break;
3397
3398     default:                    /* no need for EOL handling */
3399       while (1)
3400         {
3401           src_base = src;
3402           ONE_MORE_BYTE (c);
3403           EMIT_CHAR (c);
3404         }
3405     }
3406
3407  label_end_of_loop:
3408   coding->consumed = coding->consumed_char = src_base - source;        /* Up to the start of the iteration that was cut short.  */
3409   coding->produced = dst - destination;
3410   return;
3411 }
3412
3413 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3414    format of end-of-line according to `coding->eol_type'.  It also
3415    convert multibyte form 8-bit characters to unibyte if
3416    CODING->src_multibyte is nonzero.  If `coding->mode &
3417    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3418    also means end-of-line.  */
3419
3420 static void
3421 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3422      struct coding_system *coding;
3423      const unsigned char *source;
3424      unsigned char *destination;
3425      int src_bytes, dst_bytes;
3426 {
3427   const unsigned char *src = source;
3428   unsigned char *dst = destination;
3429   const unsigned char *src_end = src + src_bytes;
3430   unsigned char *dst_end = dst + dst_bytes;
3431   Lisp_Object translation_table;
3432   /* SRC_BASE remembers the start position in source in each loop.
3433      The loop will be exited when there's not enough source text to
3434      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3435      there's not enough destination area to produce encoded codes
3436      (within macro EMIT_BYTES).  */
3437   const unsigned char *src_base;
3438   unsigned char *tmp;
3439   int c;
3440   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3441
3442   translation_table = Qnil;
3443   if (coding->src_multibyte
3444       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3445     {
3446       src_end--;
3447       src_bytes--;
3448       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3449     }
3450
3451   if (coding->eol_type == CODING_EOL_CRLF)
3452     {
3453       while (src < src_end)
3454         {
3455           src_base = src;
3456           c = *src++;
3457           if (c >= 0x20)
3458             EMIT_ONE_BYTE (c);
3459           else if (c == '\n' || (c == '\r' && selective_display))
3460             EMIT_TWO_BYTES ('\r', '\n');
3461           else
3462             EMIT_ONE_BYTE (c);
3463         }
3464       src_base = src;
3465     label_end_of_loop:
3466       ;
3467     }
3468   else
3469     {
3470       if (!dst_bytes || src_bytes <= dst_bytes)
3471         {
3472           safe_bcopy (src, dst, src_bytes);
3473           src_base = src_end;
3474           dst += src_bytes;
3475         }
3476       else
3477         {
3478           if (coding->src_multibyte
3479               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3480             dst_bytes--;
3481           safe_bcopy (src, dst, dst_bytes);
3482           src_base = src + dst_bytes;
3483           dst = destination + dst_bytes;
3484           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3485         }
3486       if (coding->eol_type == CODING_EOL_CR)
3487         {
3488           for (tmp = destination; tmp < dst; tmp++)
3489             if (*tmp == '\n') *tmp = '\r';
3490         }
3491       else if (selective_display)
3492         {
3493           for (tmp = destination; tmp < dst; tmp++)
3494             if (*tmp == '\r') *tmp = '\n';
3495         }
3496     }
3497   if (coding->src_multibyte)
3498     dst = destination + str_as_unibyte (destination, dst - destination);
3499
3500   coding->consumed = src_base - source;
3501   coding->produced = dst - destination;
3502   coding->produced_char = coding->produced;
3503 }
3504
3505 \f
3506 /*** 7. C library functions ***/
3507
3508 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3509    has a property `coding-system'.  The value of this property is a
3510    vector of length 5 (called the coding-vector).  Among elements of
3511    this vector, the first (element[0]) and the fifth (element[4])
3512    carry important information for decoding/encoding.  Before
3513    decoding/encoding, this information should be set in fields of a
3514    structure of type `coding_system'.
3515
3516    The value of the property `coding-system' can be a symbol of another
3517    subsidiary coding-system.  In that case, Emacs gets coding-vector
3518    from that symbol.
3519
3520    `element[0]' contains information to be set in `coding->type'.  The
3521    value and its meaning is as follows:
3522
3523    0 -- coding_type_emacs_mule
3524    1 -- coding_type_sjis
3525    2 -- coding_type_iso2022
3526    3 -- coding_type_big5
3527    4 -- coding_type_ccl encoder/decoder written in CCL
3528    nil -- coding_type_no_conversion
3529    t -- coding_type_undecided (automatic conversion on decoding,
3530                                no-conversion on encoding)
3531
3532    `element[4]' contains information to be set in `coding->flags' and
3533    `coding->spec'.  The meaning varies by `coding->type'.
3534
3535    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3536    of length 32 (of which the first 13 sub-elements are used now).
3537    Meanings of these sub-elements are:
3538
3539    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3540         If the value is an integer of valid charset, the charset is
3541         assumed to be designated to graphic register N initially.
3542
3543         If the value is minus, it is a minus value of charset which
3544         reserves graphic register N, which means that the charset is
3545         not designated initially but should be designated to graphic
3546         register N just before encoding a character in that charset.
3547
3548         If the value is nil, graphic register N is never used on
3549         encoding.
3550
3551    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3552         Each value takes t or nil.  See the section ISO2022 of
3553         `coding.h' for more information.
3554
3555    If `coding->type' is `coding_type_big5', element[4] is t to denote
3556    BIG5-ETen or nil to denote BIG5-HKU.
3557
3558    If `coding->type' takes the other value, element[4] is ignored.
3559
3560    Emacs Lisp's coding systems also carry information about format of
3561    end-of-line in a value of property `eol-type'.  If the value is
3562    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3563    means CODING_EOL_CR.  If it is not integer, it should be a vector
3564    of subsidiary coding systems of which property `eol-type' has one
3565    of the above values.
3566
3567 */
3568
3569 /* Extract information for decoding/encoding from CODING_SYSTEM_SYMBOL
3570    and set it in CODING.  If CODING_SYSTEM_SYMBOL is invalid, CODING
3571    is setup so that no conversion is necessary and return -1, else
3572    return 0.  */
3573
3574 int
3575 setup_coding_system (coding_system, coding)
3576      Lisp_Object coding_system;
3577      struct coding_system *coding;
3578 {
3579   Lisp_Object coding_spec, coding_type, eol_type, plist;
3580   Lisp_Object val;
3581
3582   /* At first, zero clear all members.  */
3583   bzero (coding, sizeof (struct coding_system));
3584
3585   /* Initialize some fields required for all kinds of coding systems.  */
3586   coding->symbol = coding_system;
3587   coding->heading_ascii = -1;
3588   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3589   coding->composing = COMPOSITION_DISABLED;
3590   coding->cmp_data = NULL;
3591
3592   if (NILP (coding_system))
3593     goto label_invalid_coding_system;
3594
3595   coding_spec = Fget (coding_system, Qcoding_system);
3596
3597   if (!VECTORP (coding_spec)
3598       || XVECTOR (coding_spec)->size != 5
3599       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3600     goto label_invalid_coding_system;
3601
3602   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3603   if (VECTORP (eol_type))
3604     {
3605       coding->eol_type = CODING_EOL_UNDECIDED;
3606       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3607     }
3608   else if (XFASTINT (eol_type) == 1)
3609     {
3610       coding->eol_type = CODING_EOL_CRLF;
3611       coding->common_flags
3612         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3613     }
3614   else if (XFASTINT (eol_type) == 2)
3615     {
3616       coding->eol_type = CODING_EOL_CR;
3617       coding->common_flags
3618         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3619     }
3620   else
3621     coding->eol_type = CODING_EOL_LF;
3622
3623   coding_type = XVECTOR (coding_spec)->contents[0];
3624   /* Try short cut.  */
3625   if (SYMBOLP (coding_type))
3626     {
3627       if (EQ (coding_type, Qt))
3628         {
3629           coding->type = coding_type_undecided;
3630           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3631         }
3632       else
3633         coding->type = coding_type_no_conversion;
3634       /* Initialize this member.  Any thing other than
3635          CODING_CATEGORY_IDX_UTF_16_BE and
3636          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3637          special treatment in detect_eol.  */
3638       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3639
3640       return 0;
3641     }
3642
3643   /* Get values of coding system properties:
3644      `post-read-conversion', `pre-write-conversion',
3645      `translation-table-for-decode', `translation-table-for-encode'.  */
3646   plist = XVECTOR (coding_spec)->contents[3];
3647   /* Pre & post conversion functions should be disabled if
3648      inhibit_eol_conversion is nonzero.  This is the case that a code
3649      conversion function is called while those functions are running.  */
3650   if (! inhibit_pre_post_conversion)
3651     {
3652       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3653       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3654     }
3655   val = Fplist_get (plist, Qtranslation_table_for_decode);
3656   if (SYMBOLP (val))
3657     val = Fget (val, Qtranslation_table_for_decode);
3658   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3659   val = Fplist_get (plist, Qtranslation_table_for_encode);
3660   if (SYMBOLP (val))
3661     val = Fget (val, Qtranslation_table_for_encode);
3662   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3663   val = Fplist_get (plist, Qcoding_category);
3664   if (!NILP (val))
3665     {
3666       val = Fget (val, Qcoding_category_index);
3667       if (INTEGERP (val))
3668         coding->category_idx = XINT (val);
3669       else
3670         goto label_invalid_coding_system;
3671     }
3672   else
3673     goto label_invalid_coding_system;
3674
3675   /* If the coding system has non-nil `composition' property, enable
3676      composition handling.  */
3677   val = Fplist_get (plist, Qcomposition);
3678   if (!NILP (val))
3679     coding->composing = COMPOSITION_NO;
3680
3681   switch (XFASTINT (coding_type))
3682     {
3683     case 0:
3684       coding->type = coding_type_emacs_mule;
3685       coding->common_flags
3686         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3687       if (!NILP (coding->post_read_conversion))
3688         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3689       if (!NILP (coding->pre_write_conversion))
3690         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3691       break;
3692
3693     case 1:
3694       coding->type = coding_type_sjis;
3695       coding->common_flags
3696         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3697       break;
3698
3699     case 2:
3700       coding->type = coding_type_iso2022;
3701       coding->common_flags
3702         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3703       {
3704         Lisp_Object val, temp;
3705         Lisp_Object *flags;
3706         int i, charset, reg_bits = 0;
3707
3708         val = XVECTOR (coding_spec)->contents[4];
3709
3710         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3711           goto label_invalid_coding_system;
3712
3713         flags = XVECTOR (val)->contents;
3714         coding->flags
3715           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3716              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3717              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3718              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3719              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3720              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3721              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3722              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3723              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3724              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3725              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3726              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3727              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3728              );
3729
3730         /* Invoke graphic register 0 to plane 0.  */
3731         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3732         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3733         CODING_SPEC_ISO_INVOCATION (coding, 1)
3734           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3735         /* Not single shifting at first.  */
3736         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3737         /* Beginning of buffer should also be regarded as bol. */
3738         CODING_SPEC_ISO_BOL (coding) = 1;
3739
3740         for (charset = 0; charset <= MAX_CHARSET; charset++)
3741           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3742         val = Vcharset_revision_alist;
3743         while (CONSP (val))
3744           {
3745             charset = get_charset_id (Fcar_safe (XCAR (val)));
3746             if (charset >= 0
3747                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3748                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3749               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3750             val = XCDR (val);
3751           }
3752
3753         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3754            FLAGS[REG] can be one of below:
3755                 integer CHARSET: CHARSET occupies register I,
3756                 t: designate nothing to REG initially, but can be used
3757                   by any charsets,
3758                 list of integer, nil, or t: designate the first
3759                   element (if integer) to REG initially, the remaining
3760                   elements (if integer) is designated to REG on request,
3761                   if an element is t, REG can be used by any charsets,
3762                 nil: REG is never used.  */
3763         for (charset = 0; charset <= MAX_CHARSET; charset++)
3764           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3765             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3766         for (i = 0; i < 4; i++)
3767           {
3768             if ((INTEGERP (flags[i])
3769                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3770                 || (charset = get_charset_id (flags[i])) >= 0)
3771               {
3772                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3773                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3774               }
3775             else if (EQ (flags[i], Qt))
3776               {
3777                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3778                 reg_bits |= 1 << i;
3779                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3780               }
3781             else if (CONSP (flags[i]))
3782               {
3783                 Lisp_Object tail;
3784                 tail = flags[i];
3785
3786                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3787                 if ((INTEGERP (XCAR (tail))
3788                      && (charset = XINT (XCAR (tail)),
3789                          CHARSET_VALID_P (charset)))
3790                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3791                   {
3792                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3793                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3794                   }
3795                 else
3796                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3797                 tail = XCDR (tail);
3798                 while (CONSP (tail))
3799                   {
3800                     if ((INTEGERP (XCAR (tail))
3801                          && (charset = XINT (XCAR (tail)),
3802                              CHARSET_VALID_P (charset)))
3803                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3804                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3805                         = i;
3806                     else if (EQ (XCAR (tail), Qt))
3807                       reg_bits |= 1 << i;
3808                     tail = XCDR (tail);
3809                   }
3810               }
3811             else
3812               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3813
3814             CODING_SPEC_ISO_DESIGNATION (coding, i)
3815               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3816           }
3817
3818         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3819           {
3820             /* REG 1 can be used only by locking shift in 7-bit env.  */
3821             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3822               reg_bits &= ~2;
3823             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3824               /* Without any shifting, only REG 0 and 1 can be used.  */
3825               reg_bits &= 3;
3826           }
3827
3828         if (reg_bits)
3829           for (charset = 0; charset <= MAX_CHARSET; charset++)
3830             {
3831               if (CHARSET_DEFINED_P (charset)
3832                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3833                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3834                 {
3835                   /* There exist some default graphic registers to be
3836                      used by CHARSET.  */
3837
3838                   /* We had better avoid designating a charset of
3839                      CHARS96 to REG 0 as far as possible.  */
3840                   if (CHARSET_CHARS (charset) == 96)
3841                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3842                       = (reg_bits & 2
3843                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3844                   else
3845                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3846                       = (reg_bits & 1
3847                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3848                 }
3849             }
3850       }
3851       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3852       coding->spec.iso2022.last_invalid_designation_register = -1;
3853       break;
3854
3855     case 3:
3856       coding->type = coding_type_big5;
3857       coding->common_flags
3858         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3859       coding->flags
3860         = (NILP (XVECTOR (coding_spec)->contents[4])
3861            ? CODING_FLAG_BIG5_HKU
3862            : CODING_FLAG_BIG5_ETEN);
3863       break;
3864
3865     case 4:
3866       coding->type = coding_type_ccl;
3867       coding->common_flags
3868         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3869       {
3870         val = XVECTOR (coding_spec)->contents[4];
3871         if (! CONSP (val)
3872             || setup_ccl_program (&(coding->spec.ccl.decoder),
3873                                   XCAR (val)) < 0
3874             || setup_ccl_program (&(coding->spec.ccl.encoder),
3875                                   XCDR (val)) < 0)
3876           goto label_invalid_coding_system;
3877
3878         bzero (coding->spec.ccl.valid_codes, 256);
3879         val = Fplist_get (plist, Qvalid_codes);
3880         if (CONSP (val))
3881           {
3882             Lisp_Object this;
3883
3884             for (; CONSP (val); val = XCDR (val))
3885               {
3886                 this = XCAR (val);
3887                 if (INTEGERP (this)
3888                     && XINT (this) >= 0 && XINT (this) < 256)
3889                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3890                 else if (CONSP (this)
3891                          && INTEGERP (XCAR (this))
3892                          && INTEGERP (XCDR (this)))
3893                   {
3894                     int start = XINT (XCAR (this));
3895                     int end = XINT (XCDR (this));
3896
3897                     if (start >= 0 && start <= end && end < 256)
3898                       while (start <= end)
3899                         coding->spec.ccl.valid_codes[start++] = 1;
3900                   }
3901               }
3902           }
3903       }
3904       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3905       coding->spec.ccl.cr_carryover = 0;
3906       coding->spec.ccl.eight_bit_carryover[0] = 0;
3907       break;
3908
3909     case 5:
3910       coding->type = coding_type_raw_text;
3911       break;
3912
3913     default:
3914       goto label_invalid_coding_system;
3915     }
3916   return 0;
3917
3918  label_invalid_coding_system:
3919   coding->type = coding_type_no_conversion;
3920   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3921   coding->common_flags = 0;
3922   coding->eol_type = CODING_EOL_LF;
3923   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3924   return -1;
3925 }
3926
3927 /* Free memory blocks allocated for storing composition information.  */
3928
3929 void
3930 coding_free_composition_data (coding)
3931      struct coding_system *coding;
3932 {
3933   struct composition_data *cmp_data = coding->cmp_data, *next;
3934
3935   if (!cmp_data)
3936     return;
3937   /* Memory blocks are chained.  At first, rewind to the first, then,
3938      free blocks one by one.  */
3939   while (cmp_data->prev)
3940     cmp_data = cmp_data->prev;
3941   while (cmp_data)
3942     {
3943       next = cmp_data->next;
3944       xfree (cmp_data);
3945       cmp_data = next;
3946     }
3947   coding->cmp_data = NULL;
3948 }
3949
3950 /* Set `char_offset' member of all memory blocks pointed by
3951    coding->cmp_data to POS.  */
3952
3953 void
3954 coding_adjust_composition_offset (coding, pos)
3955      struct coding_system *coding;
3956      int pos;
3957 {
3958   struct composition_data *cmp_data;
3959
3960   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3961     cmp_data->char_offset = pos;
3962 }
3963
3964 /* Setup raw-text or one of its subsidiaries in the structure
3965    coding_system CODING according to the already setup value eol_type
3966    in CODING.  CODING should be setup for some coding system in
3967    advance.  */
3968
3969 void
3970 setup_raw_text_coding_system (coding)
3971      struct coding_system *coding;
3972 {
3973   if (coding->type != coding_type_raw_text)
3974     {
3975       coding->symbol = Qraw_text;
3976       coding->type = coding_type_raw_text;
3977       if (coding->eol_type != CODING_EOL_UNDECIDED)
3978         {
3979           Lisp_Object subsidiaries;
3980           subsidiaries = Fget (Qraw_text, Qeol_type);
3981
3982           if (VECTORP (subsidiaries)
3983               && XVECTOR (subsidiaries)->size == 3)
3984             coding->symbol
3985               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3986         }
3987       setup_coding_system (coding->symbol, coding);
3988     }
3989   return;
3990 }
3991
3992 /* Emacs has a mechanism to automatically detect a coding system if it
3993    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3994    it's impossible to distinguish some coding systems accurately
3995    because they use the same range of codes.  So, at first, coding
3996    systems are categorized into the following categories:
3997
3998    o coding-category-emacs-mule
3999
4000         The category for a coding system which has the same code range
4001         as Emacs' internal format.  Assigned the coding-system (Lisp
4002         symbol) `emacs-mule' by default.
4003
4004    o coding-category-sjis
4005
4006         The category for a coding system which has the same code range
4007         as SJIS.  Assigned the coding-system (Lisp
4008         symbol) `japanese-shift-jis' by default.
4009
4010    o coding-category-iso-7
4011
4012         The category for a coding system which has the same code range
4013         as ISO2022 of 7-bit environment.  This doesn't use any locking
4014         shift and single shift functions.  This can encode/decode all
4015         charsets.  Assigned the coding-system (Lisp symbol)
4016         `iso-2022-7bit' by default.
4017
4018    o coding-category-iso-7-tight
4019
4020         Same as coding-category-iso-7 except that this can
4021         encode/decode only the specified charsets.
4022
4023    o coding-category-iso-8-1
4024
4025         The category for a coding system which has the same code range
4026         as ISO2022 of 8-bit environment and graphic plane 1 used only
4027         for DIMENSION1 charset.  This doesn't use any locking shift
4028         and single shift functions.  Assigned the coding-system (Lisp
4029         symbol) `iso-latin-1' by default.
4030
4031    o coding-category-iso-8-2
4032
4033         The category for a coding system which has the same code range
4034         as ISO2022 of 8-bit environment and graphic plane 1 used only
4035         for DIMENSION2 charset.  This doesn't use any locking shift
4036         and single shift functions.  Assigned the coding-system (Lisp
4037         symbol) `japanese-iso-8bit' by default.
4038
4039    o coding-category-iso-7-else
4040
4041         The category for a coding system which has the same code range
4042         as ISO2022 of 7-bit environment but uses locking shift or
4043         single shift functions.  Assigned the coding-system (Lisp
4044         symbol) `iso-2022-7bit-lock' by default.
4045
4046    o coding-category-iso-8-else
4047
4048         The category for a coding system which has the same code range
4049         as ISO2022 of 8-bit environment but uses locking shift or
4050         single shift functions.  Assigned the coding-system (Lisp
4051         symbol) `iso-2022-8bit-ss2' by default.
4052
4053    o coding-category-big5
4054
4055         The category for a coding system which has the same code range
4056         as BIG5.  Assigned the coding-system (Lisp symbol)
4057         `cn-big5' by default.
4058
4059    o coding-category-utf-8
4060
4061         The category for a coding system which has the same code range
4062         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
4063         symbol) `utf-8' by default.
4064
4065    o coding-category-utf-16-be
4066
4067         The category for a coding system in which a text has an
4068         Unicode signature (cf. Unicode Standard) in the order of BIG
4069         endian at the head.  Assigned the coding-system (Lisp symbol)
4070         `utf-16-be' by default.
4071
4072    o coding-category-utf-16-le
4073
4074         The category for a coding system in which a text has an
4075         Unicode signature (cf. Unicode Standard) in the order of
4076         LITTLE endian at the head.  Assigned the coding-system (Lisp
4077         symbol) `utf-16-le' by default.
4078
4079    o coding-category-ccl
4080
4081         The category for a coding system of which encoder/decoder is
4082         written in CCL programs.  The default value is nil, i.e., no
4083         coding system is assigned.
4084
4085    o coding-category-binary
4086
4087         The category for a coding system not categorized in any of the
4088         above.  Assigned the coding-system (Lisp symbol)
4089         `no-conversion' by default.
4090
4091    Each of them is a Lisp symbol and the value is an actual
4092    `coding-system' (this is also a Lisp symbol) assigned by a user.
4093    What Emacs does actually is to detect a category of coding system.
4094    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4095    decide a single possible category, it selects a category of the
4096    highest priority.  Priorities of categories are also specified by a
4097    user in a Lisp variable `coding-category-list'.
4098
4099 */
4100
4101 static
4102 int ascii_skip_code[256];
4103
4104 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4105    If it detects possible coding systems, return an integer in which
4106    appropriate flag bits are set.  Flag bits are defined by macros
4107    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4108    it should point the table `coding_priorities'.  In that case, only
4109    the flag bit for a coding system of the highest priority is set in
4110    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4111    range 0x80..0x9F are in multibyte form.
4112
4113    How many ASCII characters are at the head is returned as *SKIP.  */
4114
4115 static int
4116 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4117      unsigned char *source;
4118      int src_bytes, *priorities, *skip;
4119      int multibytep;
4120 {
4121   register unsigned char c;
4122   unsigned char *src = source, *src_end = source + src_bytes;
4123   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4124   int i;
4125
4126   /* At first, skip all ASCII characters and control characters except
4127      for three ISO2022 specific control characters.  */
4128   ascii_skip_code[ISO_CODE_SO] = 0;
4129   ascii_skip_code[ISO_CODE_SI] = 0;
4130   ascii_skip_code[ISO_CODE_ESC] = 0;
4131
4132  label_loop_detect_coding:
4133   while (src < src_end && ascii_skip_code[*src]) src++;
4134   *skip = src - source;
4135
4136   if (src >= src_end)
4137     /* We found nothing other than ASCII.  There's nothing to do.  */
4138     return 0;
4139
4140   c = *src;
4141   /* The text seems to be encoded in some multilingual coding system.
4142      Now, try to find in which coding system the text is encoded.  */
4143   if (c < 0x80)
4144     {
4145       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4146       /* C is an ISO2022 specific control code of C0.  */
4147       mask = detect_coding_iso2022 (src, src_end, multibytep);
4148       if (mask == 0)
4149         {
4150           /* No valid ISO2022 code follows C.  Try again.  */
4151           src++;
4152           if (c == ISO_CODE_ESC)
4153             ascii_skip_code[ISO_CODE_ESC] = 1;
4154           else
4155             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4156           goto label_loop_detect_coding;
4157         }
4158       if (priorities)
4159         {
4160           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4161             {
4162               if (mask & priorities[i])
4163                 return priorities[i];
4164             }
4165           return CODING_CATEGORY_MASK_RAW_TEXT;
4166         }
4167     }
4168   else
4169     {
4170       int try;
4171
4172       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4173         c = src[1] - 0x20;
4174
4175       if (c < 0xA0)
4176         {
4177           /* C is the first byte of SJIS character code,
4178              or a leading-code of Emacs' internal format (emacs-mule),
4179              or the first byte of UTF-16.  */
4180           try = (CODING_CATEGORY_MASK_SJIS
4181                   | CODING_CATEGORY_MASK_EMACS_MULE
4182                   | CODING_CATEGORY_MASK_UTF_16_BE
4183                   | CODING_CATEGORY_MASK_UTF_16_LE);
4184
4185           /* Or, if C is a special latin extra code,
4186              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4187              or is an ISO2022 control-sequence-introducer (CSI),
4188              we should also consider the possibility of ISO2022 codings.  */
4189           if ((VECTORP (Vlatin_extra_code_table)
4190                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4191               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4192               || (c == ISO_CODE_CSI
4193                   && (src < src_end
4194                       && (*src == ']'
4195                           || ((*src == '0' || *src == '1' || *src == '2')
4196                               && src + 1 < src_end
4197                               && src[1] == ']')))))
4198             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4199                      | CODING_CATEGORY_MASK_ISO_8BIT);
4200         }
4201       else
4202         /* C is a character of ISO2022 in graphic plane right,
4203            or a SJIS's 1-byte character code (i.e. JISX0201),
4204            or the first byte of BIG5's 2-byte code,
4205            or the first byte of UTF-8/16.  */
4206         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4207                 | CODING_CATEGORY_MASK_ISO_8BIT
4208                 | CODING_CATEGORY_MASK_SJIS
4209                 | CODING_CATEGORY_MASK_BIG5
4210                 | CODING_CATEGORY_MASK_UTF_8
4211                 | CODING_CATEGORY_MASK_UTF_16_BE
4212                 | CODING_CATEGORY_MASK_UTF_16_LE);
4213
4214       /* Or, we may have to consider the possibility of CCL.  */
4215       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4216           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4217               ->spec.ccl.valid_codes)[c])
4218         try |= CODING_CATEGORY_MASK_CCL;
4219
4220       mask = 0;
4221       utf16_examined_p = iso2022_examined_p = 0;
4222       if (priorities)
4223         {
4224           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4225             {
4226               if (!iso2022_examined_p
4227                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4228                 {
4229                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4230                   iso2022_examined_p = 1;
4231                 }
4232               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4233                 mask |= detect_coding_sjis (src, src_end, multibytep);
4234               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4235                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4236               else if (!utf16_examined_p
4237                        && (priorities[i] & try &
4238                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4239                 {
4240                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4241                   utf16_examined_p = 1;
4242                 }
4243               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4244                 mask |= detect_coding_big5 (src, src_end, multibytep);
4245               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4246                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4247               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4248                 mask |= detect_coding_ccl (src, src_end, multibytep);
4249               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4250                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4251               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4252                 mask |= CODING_CATEGORY_MASK_BINARY;
4253               if (mask & priorities[i])
4254                 return priorities[i];
4255             }
4256           return CODING_CATEGORY_MASK_RAW_TEXT;
4257         }
4258       if (try & CODING_CATEGORY_MASK_ISO)
4259         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4260       if (try & CODING_CATEGORY_MASK_SJIS)
4261         mask |= detect_coding_sjis (src, src_end, multibytep);
4262       if (try & CODING_CATEGORY_MASK_BIG5)
4263         mask |= detect_coding_big5 (src, src_end, multibytep);
4264       if (try & CODING_CATEGORY_MASK_UTF_8)
4265         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4266       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4267         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4268       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4269         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4270       if (try & CODING_CATEGORY_MASK_CCL)
4271         mask |= detect_coding_ccl (src, src_end, multibytep);
4272     }
4273   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4274 }
4275
4276 /* Guess how the SRC_BYTES bytes of text starting at SRC are encoded
4277    and set CODING up for the coding system that was detected.  */
4278
4279 void
4280 detect_coding (coding, src, src_bytes)
4281      struct coding_system *coding;
4282      const unsigned char *src;
4283      int src_bytes;
4284 {
4285   Lisp_Object coding_symbol = Vcoding_category_list;
4286   unsigned int category = 0;
4287   int category_mask, head_skip;
4288   int saved_src_multibyte, saved_dst_multibyte;
4289
4290   category_mask = detect_coding_mask (src, src_bytes, coding_priorities,
4291                                       &head_skip, coding->src_multibyte);
4292   coding->heading_ascii = head_skip;
4293   if (category_mask == 0)
4294     return;
4295
4296   /* The lowest bit set in CATEGORY_MASK stands for the category of the
4297      highest priority; its position indexes Vcoding_category_table.  */
4298   for (; category_mask != 0 && (category_mask & 1) == 0; category_mask >>= 1)
4299     category++;
4300   if (category_mask == 0)
4301     category = CODING_CATEGORY_IDX_RAW_TEXT;
4302
4303   coding_symbol
4304     = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[category]);
4305
4306   /* When the EOL format is already decided, use the matching element
4307      of the `eol-type' property, provided that property is a vector.  */
4308   if (coding->eol_type != CODING_EOL_UNDECIDED)
4309     {
4310       Lisp_Object eol_variants = Fget (coding_symbol, Qeol_type);
4311
4312       if (VECTORP (eol_variants))
4313         coding_symbol = XVECTOR (eol_variants)->contents[coding->eol_type];
4314     }
4315
4316   /* Set up the new coding system, then restore the preserved slots.  */
4317   saved_src_multibyte = coding->src_multibyte;
4318   saved_dst_multibyte = coding->dst_multibyte;
4319   setup_coding_system (coding_symbol, coding);
4320   coding->src_multibyte = saved_src_multibyte;
4321   coding->dst_multibyte = saved_dst_multibyte;
4322   coding->heading_ascii = head_skip;
4323 }
4324
4325 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4326    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4327    CODING_EOL_CR, CODING_EOL_UNDECIDED (no end-of-line was found) and
        CODING_EOL_INCONSISTENT (end-of-lines of different formats were
        found).  At most MAX_EOL_CHECK_COUNT end-of-lines are examined.
4328
4329    How many non-eol characters are at the head is returned as *SKIP,
        i.e. the offset of the first end-of-line, or SRC_BYTES if there
        is none.  */
4330
4331 #define MAX_EOL_CHECK_COUNT 3
4332
4333 static int
4334 detect_eol_type (source, src_bytes, skip)
4335      unsigned char *source;
4336      int src_bytes, *skip;
4337 {
4338   unsigned char *src = source, *src_end = src + src_bytes;
4339   unsigned char c;
4340   int total = 0;                /* How many end-of-lines are found so far.  */
4341   int eol_type = CODING_EOL_UNDECIDED;
4342   int this_eol_type;
4343
       /* -1 means that no end-of-line has been found yet.  Zero must not
          be used for that, because zero is a valid result when the text
          starts with an end-of-line.  */
4344   *skip = -1;
4345
4346   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4347     {
4348       c = *src++;
4349       if (c == '\n' || c == '\r')
4350         {
               /* Record the offset of the first end-of-line only.  */
4351           if (*skip < 0)
4352             *skip = src - 1 - source;
4353           total++;
4354           if (c == '\n')
4355             this_eol_type = CODING_EOL_LF;
4356           else if (src >= src_end || *src != '\n')
4357             this_eol_type = CODING_EOL_CR;
4358           else
                 /* CR followed by LF; step over the LF as well.  */
4359             this_eol_type = CODING_EOL_CRLF, src++;
4360
4361           if (eol_type == CODING_EOL_UNDECIDED)
4362             /* This is the first end-of-line.  */
4363             eol_type = this_eol_type;
4364           else if (eol_type != this_eol_type)
4365             {
4366               /* The found type is different from what found before.  */
4367               eol_type = CODING_EOL_INCONSISTENT;
4368               break;
4369             }
4370         }
4371     }
4372
       /* No end-of-line at all: the whole text counts as the head.  */
4373   if (*skip < 0)
4374     *skip = src_end - source;
4375   return eol_type;
4376 }
4377
4378 /* Like detect_eol_type, but detect EOL type in 2-octet
4379    big-endian/little-endian format for coding systems utf-16-be and
4380    utf-16-le.  BIG_ENDIAN_P nonzero means the first octet of each
        pair is the most significant one.  SOURCE is scanned two octets
        at a time; *SKIP is set to the byte offset of the first
        end-of-line, or to SRC_BYTES if there is none.  */
4381
4382 static int
4383 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4384      unsigned char *source;
4385      int src_bytes, *skip, big_endian_p;
4386 {
4387   unsigned char *src = source, *src_end = src + src_bytes;
4388   unsigned int c1, c2;
4389   int total = 0;                /* How many end-of-lines are found so far.  */
4390   int eol_type = CODING_EOL_UNDECIDED;
4391   int this_eol_type;
       /* Indices, within a 2-octet unit, of the most and the least
          significant octet.  */
4392   int msb, lsb;
4393
4394   if (big_endian_p)
4395     msb = 0, lsb = 1;
4396   else
4397     msb = 1, lsb = 0;
4398
       /* -1 means that no end-of-line has been found yet.  Zero must not
          be used for that, because zero is a valid result when the text
          starts with an end-of-line.  */
4399   *skip = -1;
4400
4401   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4402     {
4403       c1 = (src[msb] << 8) | (src[lsb]);
4404       src += 2;
4405
4406       if (c1 == '\n' || c1 == '\r')
4407         {
               /* Record the offset of the first end-of-line only.  */
4408           if (*skip < 0)
4409             *skip = src - 2 - source;
4410           total++;
4411           if (c1 == '\n')
4412             {
4413               this_eol_type = CODING_EOL_LF;
4414             }
4415           else
4416             {
                   /* A CR: it is CRLF only if a complete 2-octet LF
                      follows.  */
4417               if ((src + 1) >= src_end)
4418                 {
4419                   this_eol_type = CODING_EOL_CR;
4420                 }
4421               else
4422                 {
4423                   c2 = (src[msb] << 8) | (src[lsb]);
4424                   if (c2 == '\n')
4425                     this_eol_type = CODING_EOL_CRLF, src += 2;
4426                   else
4427                     this_eol_type = CODING_EOL_CR;
4428                 }
4429             }
4430
4431           if (eol_type == CODING_EOL_UNDECIDED)
4432             /* This is the first end-of-line.  */
4433             eol_type = this_eol_type;
4434           else if (eol_type != this_eol_type)
4435             {
4436               /* The found type is different from what found before.  */
4437               eol_type = CODING_EOL_INCONSISTENT;
4438               break;
4439             }
4440         }
4441     }
4442
       /* No end-of-line at all: the whole text counts as the head.  */
4443   if (*skip < 0)
4444     *skip = src_end - source;
4445   return eol_type;
4446 }
4447
4448 /* Determine the end-of-line format of the SRC_BYTES bytes of text at
4449    SRC.  When a format is found and the `eol-type' property of
4450    CODING->symbol is a vector of three elements, set *CODING up again
4451    for the element of that vector which matches the format.  */
4452
4453 void
4454 detect_eol (coding, src, src_bytes)
4455      struct coding_system *coding;
4456      const unsigned char *src;
4457      int src_bytes;
4458 {
4459   Lisp_Object eol_variants;
4460   struct composition_data *saved_cmp_data;
4461   int saved_src_multibyte, saved_dst_multibyte;
4462   int head_skip, detected;
4463
4464   /* The UTF-16 categories are scanned in 2-octet units, anything
4465      else byte by byte.  */
4466   if (coding->category_idx == CODING_CATEGORY_IDX_UTF_16_BE)
4467     detected = detect_eol_type_in_2_octet_form (src, src_bytes,
4468                                                 &head_skip, 1);
4469   else if (coding->category_idx == CODING_CATEGORY_IDX_UTF_16_LE)
4470     detected = detect_eol_type_in_2_octet_form (src, src_bytes,
4471                                                 &head_skip, 0);
4472   else
4473     detected = detect_eol_type (src, src_bytes, &head_skip);
4474
4475   /* Make CODING->heading_ascii and HEAD_SKIP both the smaller of the
4476      two values.  */
4477   if (head_skip < coding->heading_ascii)
4478     coding->heading_ascii = head_skip;
4479   else
4480     head_skip = coding->heading_ascii;
4481
4482   if (detected == CODING_EOL_UNDECIDED)
4483     return;
4484
4485   /* Inconsistent end-of-lines are handled as LF.  (Code which turned
4486      raw-text into no-conversion at this point is disabled until we
4487      find a better way to distinguish raw text file and binary file.)  */
4488   if (detected == CODING_EOL_INCONSISTENT)
4489     detected = CODING_EOL_LF;
4490
4491   eol_variants = Fget (coding->symbol, Qeol_type);
4492   if (! VECTORP (eol_variants) || XVECTOR (eol_variants)->size != 3)
4493     return;
4494
4495   /* Set up the EOL-specific coding system while preserving some slots.  */
4496   saved_src_multibyte = coding->src_multibyte;
4497   saved_dst_multibyte = coding->dst_multibyte;
4498   saved_cmp_data = coding->cmp_data;
4499   setup_coding_system (XVECTOR (eol_variants)->contents[detected], coding);
4500   coding->src_multibyte = saved_src_multibyte;
4501   coding->dst_multibyte = saved_dst_multibyte;
4502   coding->heading_ascii = head_skip;
4503   coding->cmp_data = saved_cmp_data;
4504 }
4514
4515 #define CONVERSION_BUFFER_EXTRA_ROOM 256
4516
     /* Multiplier applied to the source size when estimating the size of
        a decoding buffer for CODING (a pointer to struct coding_system):
        3 for ISO2022, the buf_magnification of the CCL decoder for CCL,
        and 2 for anything else.  CODING may be evaluated more than
        once.  */
4517 #define DECODING_BUFFER_MAG(coding)                     \
4518   ((coding)->type == coding_type_iso2022                \
4519    ? 3                                                  \
4520    : ((coding)->type == coding_type_ccl                 \
4521       ? (coding)->spec.ccl.decoder.buf_magnification    \
4522       : 2))
4523
4524 /* Return an upper bound, in bytes, for the size of a buffer that is
4525    enough for decoding SRC_BYTES of text encoded in CODING.  */
4526
4527 int
4528 decoding_buffer_size (coding, src_bytes)
4529      struct coding_system *coding;
4530      int src_bytes;
4531 {
4532   int magnification = DECODING_BUFFER_MAG (coding);
4533   return src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM;
4534 }
4535
4536 /* Return an upper bound, in bytes, for the size of a buffer that is
4537    enough for encoding SRC_BYTES of text to CODING.  */
4538
4539 int
4540 encoding_buffer_size (coding, src_bytes)
4541      struct coding_system *coding;
4542      int src_bytes;
4543 {
4544   int magnification = 1;
4545
4546   if (coding->type != coding_type_ccl)
4547     {
4548       if (CODING_REQUIRE_ENCODING (coding))
4549         magnification = 3;
4550     }
4551   else
4552     {
4553       /* The CCL encoder states its own factor; it is doubled when
4554          the end-of-line format is CRLF.  */
4555       magnification = coding->spec.ccl.encoder.buf_magnification;
4556       if (coding->eol_type == CODING_EOL_CRLF)
4557         magnification *= 2;
4558     }
       return src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM;
     }
4559
4560 /* Working buffer for code conversion.  */
4561 struct conversion_buffer
4562 {
4563   int size;                     /* Number of bytes allocated for DATA.  */
4564   int on_stack;                 /* 1 if DATA is from alloca, else 0.  */
4565   unsigned char *data;          /* The memory itself.  */
4566 };
4567
4568 /* Allocate LEN bytes of memory for BUF (struct conversion_buffer):
        by alloca if LEN is less than MAX_ALLOCA, otherwise by xmalloc.
        BUF is the structure itself, not a pointer to it.  As this is a
        macro, alloca is called in the function that uses it.  BUF and
        LEN are each evaluated more than once.  */
4569 #define allocate_conversion_buffer(buf, len)            \
4570   do {                                                  \
4571     if ((len) < MAX_ALLOCA)                             \
4572       {                                                 \
4573         (buf).data = (unsigned char *) alloca (len);    \
4574         (buf).on_stack = 1;                             \
4575       }                                                 \
4576     else                                                \
4577       {                                                 \
4578         (buf).data = (unsigned char *) xmalloc (len);   \
4579         (buf).on_stack = 0;                             \
4580       }                                                 \
4581     (buf).size = (len);                                 \
4582   } while (0)
4583
4584 /* Make the memory of *BUF twice as large, keeping its contents.  */
4585 static void
4586 extend_conversion_buffer (buf)
4587      struct conversion_buffer *buf;
4588 {
4589   int doubled = buf->size * 2;
4590
4591   if (! buf->on_stack)
4592     buf->data = (unsigned char *) xrealloc (buf->data, doubled);
4593   else
4594     {
4595       /* Stack memory can't be resized; move the data to the heap.  */
4596       unsigned char *stack_data = buf->data;
4597       buf->data = (unsigned char *) xmalloc (doubled);
4598       bcopy (stack_data, buf->data, buf->size);
4599       buf->on_stack = 0;
4600     }
4601   buf->size = doubled;
     }
4602
4603 /* Release the memory of BUF, unless it was allocated on the stack.  */
4604 static void
4605 free_conversion_buffer (buf)
4606      struct conversion_buffer *buf;
4607 {
4608   if (buf->on_stack == 0)
4609     xfree (buf->data);
4610 }
4611
     /* Run the CCL program of CODING -- the encoder if ENCODEP is
        nonzero, otherwise the decoder -- by calling ccl_driver on the
        SRC_BYTES bytes at SOURCE, with the output going to DESTINATION.
        DST_BYTES is handed on to ccl_driver; a zero value is treated
        specially below (NOTE(review): zero looks like it means that
        SOURCE and DESTINATION are in one buffer -- confirm with the
        callers).

        Bytes left in CODING->spec.ccl.eight_bit_carryover by a previous
        call are put at the head of DESTINATION first.

        CODING->produced, CODING->produced_char, CODING->consumed and
        CODING->result are set; the value of CODING->result is
        returned.  */
4612 int
4613 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4614      struct coding_system *coding;
4615      unsigned char *source, *destination;
4616      int src_bytes, dst_bytes, encodep;
4617 {
4618   struct ccl_program *ccl
4619     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
       /* Where ccl_driver is to start writing; this moves forward when
          carryover bytes are copied below.  */
4620   unsigned char *dst = destination;
4621
4622   ccl->suppress_error = coding->suppress_error;
4623   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4624   if (encodep)
4625     {
4626       /* On encoding, EOL format is converted within ccl_driver.  For
4627          that, setup proper information in the structure CCL.  */
4628       ccl->eol_type = coding->eol_type;
4629       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4630         ccl->eol_type = CODING_EOL_LF;
4631       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4632       ccl->eight_bit_control = coding->dst_multibyte;
4633     }
4634   else
4635     ccl->eight_bit_control = 1;
4636   ccl->multibyte = coding->src_multibyte;
4637   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4638     {
4639       /* Move carryover bytes to DESTINATION.  */
4640       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4641       while (*p)
4642         *dst++ = *p++;
4643       coding->spec.ccl.eight_bit_carryover[0] = 0;
           /* Subtract the room just used, unless DST_BYTES is zero.  */
4644       if (dst_bytes)
4645         dst_bytes -= dst - destination;
4646     }
4647
       /* The carryover bytes already placed at DESTINATION are counted
          in CODING->produced too.  */
4648   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4649                                   &(coding->consumed))
4650                       + dst - destination);
4651
4652   if (encodep)
4653     {
4654       coding->produced_char = coding->produced;
4655       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4656     }
4657   else if (!ccl->eight_bit_control)
4658     {
4659       /* The produced bytes form a valid multibyte sequence.
              (The flag was set to 1 before the call, so it can only be
              zero here if ccl_driver cleared it.)  */
4660       coding->produced_char
4661         = multibyte_chars_in_text (destination, coding->produced);
4662       coding->spec.ccl.eight_bit_carryover[0] = 0;
4663     }
4664   else
4665     {
4666       /* On decoding, the destination should always be multibyte.
4667          But the CCL program might have generated an invalid multibyte
4668          sequence.  Here we make such a sequence valid as
4669          multibyte.  */
4670       int bytes
4671         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4672
4673       if ((coding->consumed < src_bytes
4674            || !ccl->last_block)
4675           && coding->produced >= 1
4676           && destination[coding->produced - 1] >= 0x80)
4677         {
4678           /* We should not convert the trailing 8-bit codes to
4679              multibyte form even if they don't form a valid
4680              multibyte sequence.  They may form a valid sequence in
4681              the next call.  */
4682           int carryover = 0;
4683
               /* Look back over at most three trailing bytes, all of
                  them >= 0x80, for one in the range 0x80..0x9F.  If
                  there is one, the bytes from it to the end are held
                  back.  */
4684           if (destination[coding->produced - 1] < 0xA0)
4685             carryover = 1;
4686           else if (coding->produced >= 2)
4687             {
4688               if (destination[coding->produced - 2] >= 0x80)
4689                 {
4690                   if (destination[coding->produced - 2] < 0xA0)
4691                     carryover = 2;
4692                   else if (coding->produced >= 3
4693                            && destination[coding->produced - 3] >= 0x80
4694                            && destination[coding->produced - 3] < 0xA0)
4695                     carryover = 3;
4696                 }
4697             }
4698           if (carryover > 0)
4699             {
                   /* Save the held-back bytes, terminated by a zero
                      byte, and take them out of the produced count.  */
4700               BCOPY_SHORT (destination + coding->produced - carryover,
4701                            coding->spec.ccl.eight_bit_carryover,
4702                            carryover);
4703               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4704               coding->produced -= carryover;
4705             }
4706         }
4707       coding->produced = str_as_multibyte (destination, bytes,
4708                                            coding->produced,
4709                                            &(coding->produced_char));
4710     }
4711
       /* Translate the status of the CCL program into CODING->result.  */
4712   switch (ccl->status)
4713     {
4714     case CCL_STAT_SUSPEND_BY_SRC:
4715       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4716       break;
4717     case CCL_STAT_SUSPEND_BY_DST:
4718       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4719       break;
4720     case CCL_STAT_QUIT:
4721     case CCL_STAT_INVALID_CMD:
4722       coding->result = CODING_FINISH_INTERRUPT;
4723       break;
4724     default:
4725       coding->result = CODING_FINISH_NORMAL;
4726       break;
4727     }
4728   return coding->result;
4729 }
4730
4731 /* Decode EOL format of the text at PTR of BYTES length destructively
4732    according to CODING->eol_type.  This is called after the CCL
4733    program produced a decoded text at PTR.  If we do CRLF->LF
4734    conversion, update CODING->produced and CODING->produced_char.

        If CODING->eol_type is undecided, it is detected from the text
        first, and CODING->symbol is replaced by the matching element of
        its `eol-type' property when that is a vector of three elements.

        In the CRLF case, a CR at the very end of a block which is not
        the last one is not converted; it is taken out of the produced
        counts and recorded in CODING->spec.ccl.cr_carryover.

        When CODING_MODE_INHIBIT_INCONSISTENT_EOL is set and an
        end-of-line of another format is met, the conversion done so far
        is undone, CODING->eol_type becomes CODING_EOL_LF, and
        CODING->symbol is restored.

        BYTES may be zero.  */
4735
4736 static void
4737 decode_eol_post_ccl (coding, ptr, bytes)
4738      struct coding_system *coding;
4739      unsigned char *ptr;
4740      int bytes;
4741 {
4742   Lisp_Object val, saved_coding_symbol;
4743   unsigned char *pend = ptr + bytes;
4744   int dummy;
4745
4746   /* Remember the current coding system symbol.  We set it back when
4747      an inconsistent EOL is found so that `last-coding-system-used' is
4748      set to the coding system that doesn't specify EOL conversion.  */
4749   saved_coding_symbol = coding->symbol;
4750
4751   coding->spec.ccl.cr_carryover = 0;
4752   if (coding->eol_type == CODING_EOL_UNDECIDED)
4753     {
4754       /* Here, to avoid the call of setup_coding_system, we directly
4755          call detect_eol_type.  */
4756       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4757       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4758         coding->eol_type = CODING_EOL_LF;
4759       if (coding->eol_type != CODING_EOL_UNDECIDED)
4760         {
4761           val = Fget (coding->symbol, Qeol_type);
4762           if (VECTORP (val) && XVECTOR (val)->size == 3)
4763             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4764         }
4765       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4766     }
4767
4768   if (coding->eol_type == CODING_EOL_LF
4769       || coding->eol_type == CODING_EOL_UNDECIDED)
4770     {
4771       /* We have nothing to do.  */
4772       ptr = pend;
4773     }
4774   else if (coding->eol_type == CODING_EOL_CRLF)
4775     {
           /* PTR reads the text, P writes the converted text over it.  */
4776       unsigned char *pstart = ptr, *p = ptr;
4777
           /* The test PTR < PEND keeps us from looking at the byte
              before the text when the text is empty.  */
4778       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4779           && ptr < pend && *(pend - 1) == '\r')
4780         {
4781           /* If the last character is CR, we can't handle it here
4782              because LF will be in the not-yet-decoded source text.
4783              Record that the CR is not yet processed.  */
4784           coding->spec.ccl.cr_carryover = 1;
4785           coding->produced--;
4786           coding->produced_char--;
4787           pend--;
4788         }
4789       while (ptr < pend)
4790         {
4791           if (*ptr == '\r')
4792             {
4793               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4794                 {
4795                   *p++ = '\n';
4796                   ptr += 2;
4797                 }
4798               else
4799                 {
                       /* A CR which is not followed by LF.  */
4800                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4801                     goto undo_eol_conversion;
4802                   *p++ = *ptr++;
4803                 }
4804             }
               /* An LF which is not preceded by CR.  */
4805           else if (*ptr == '\n'
4806                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4807             goto undo_eol_conversion;
4808           else
4809             *p++ = *ptr++;
4810           continue;
4811
4812         undo_eol_conversion:
4813           /* We have faced with inconsistent EOL format at PTR.
4814              Convert all LFs before PTR back to CRLFs.  */
4815           for (p--, ptr--; p >= pstart; p--)
4816             {
4817               if (*p == '\n')
4818                 *ptr-- = '\n', *ptr-- = '\r';
4819               else
4820                 *ptr-- = *p;
4821             }
4822           /*  If carryover is recorded, cancel it because we don't
4823               convert CRLF anymore.  */
4824           if (coding->spec.ccl.cr_carryover)
4825             {
4826               coding->spec.ccl.cr_carryover = 0;
4827               coding->produced++;
4828               coding->produced_char++;
4829               pend++;
4830             }
               /* Setting P and PTR to PEND ends the loop and makes the
                  count adjustment below a no-op.  */
4831           p = ptr = pend;
4832           coding->eol_type = CODING_EOL_LF;
4833           coding->symbol = saved_coding_symbol;
4834         }
4835       if (p < pend)
4836         {
4837           /* As each two-byte sequence CRLF was converted to LF, (PEND
4838              - P) is the number of deleted characters.  */
4839           coding->produced -= pend - p;
4840           coding->produced_char -= pend - p;
4841         }
4842     }
4843   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4844     {
           /* Start of the text, needed for undoing the conversion.  */
4845       unsigned char *p = ptr;
4846
4847       for (; ptr < pend; ptr++)
4848         {
4849           if (*ptr == '\r')
4850             *ptr = '\n';
4851           else if (*ptr == '\n'
4852                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4853             {
                   /* Inconsistent EOL format at PTR.  Turn the LFs
                      before PTR back into CRs and stop.  */
4854               for (; p < ptr; p++)
4855                 {
4856                   if (*p == '\n')
4857                     *p = '\r';
4858                 }
4859               ptr = pend;
4860               coding->eol_type = CODING_EOL_LF;
4861               coding->symbol = saved_coding_symbol;
4862             }
4863         }
4864     }
4865 }
4866
4867 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4868    decoding, it may detect coding system and format of end-of-line if
4869    those are not yet decided.  The source should be unibyte, the
4870    result is multibyte if CODING->dst_multibyte is nonzero, else
4871    unibyte.

        The counters CODING->produced, produced_char, consumed,
        consumed_char and errors are reset to zero before the decoder
        for CODING->type is called.  The return value is
        CODING->result.  */
4872
4873 int
4874 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4875      struct coding_system *coding;
4876      const unsigned char *source;
4877      unsigned char *destination;
4878      int src_bytes, dst_bytes;
4879 {
       /* Nonzero if a CR carried over from the previous call was put at
          the head of DESTINATION (CCL only).  */
4880   int extra = 0;
4881
4882   if (coding->type == coding_type_undecided)
4883     detect_coding (coding, source, src_bytes);
4884
       /* For CCL, the end-of-line format is handled after decoding, by
          decode_eol_post_ccl.  */
4885   if (coding->eol_type == CODING_EOL_UNDECIDED
4886       && coding->type != coding_type_ccl)
4887     {
4888       detect_eol (coding, source, src_bytes);
4889       /* We had better recover the original eol format if we
4890          encounter an inconsistent eol format while decoding.  */
4891       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4892     }
4893
4894   coding->produced = coding->produced_char = 0;
4895   coding->consumed = coding->consumed_char = 0;
4896   coding->errors = 0;
4897   coding->result = CODING_FINISH_NORMAL;
4898
4899   switch (coding->type)
4900     {
4901     case coding_type_sjis:
4902       decode_coding_sjis_big5 (coding, source, destination,
4903                                src_bytes, dst_bytes, 1);
4904       break;
4905
4906     case coding_type_iso2022:
4907       decode_coding_iso2022 (coding, source, destination,
4908                              src_bytes, dst_bytes);
4909       break;
4910
4911     case coding_type_big5:
4912       decode_coding_sjis_big5 (coding, source, destination,
4913                                src_bytes, dst_bytes, 0);
4914       break;
4915
4916     case coding_type_emacs_mule:
4917       decode_coding_emacs_mule (coding, source, destination,
4918                                 src_bytes, dst_bytes);
4919       break;
4920
4921     case coding_type_ccl:
4922       if (coding->spec.ccl.cr_carryover)
4923         {
4924           /* Put the CR which was not processed by the previous call
4925              of decode_eol_post_ccl in DESTINATION.  It will be
4926              decoded together with the following LF by the call to
4927              decode_eol_post_ccl below.  */
               /* NOTE(review): ccl_coding_driver assigns
                  CODING->produced and CODING->produced_char itself, so
                  the two increments here look superseded; EXTRA is
                  added to both again after the call -- confirm.  */
4928           *destination = '\r';
4929           coding->produced++;
4930           coding->produced_char++;
4931           dst_bytes--;
4932           extra = coding->spec.ccl.cr_carryover;
4933         }
4934       ccl_coding_driver (coding, source, destination + extra,
4935                          src_bytes, dst_bytes, 0);
4936       if (coding->eol_type != CODING_EOL_LF)
4937         {
4938           coding->produced += extra;
4939           coding->produced_char += extra;
4940           decode_eol_post_ccl (coding, destination, coding->produced);
4941         }
4942       break;
4943
4944     default:
4945       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4946     }
4947
       /* Running out of source is not a problem when this is the last
          block and all of the source was consumed.  */
4948   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4949       && coding->mode & CODING_MODE_LAST_BLOCK
4950       && coding->consumed == src_bytes)
4951     coding->result = CODING_FINISH_NORMAL;
4952
       /* Otherwise, in the last block, the source bytes left over are
          counted as one error and each of them is stored at the end of
          the output by CHAR_STRING.  */
4953   if (coding->mode & CODING_MODE_LAST_BLOCK
4954       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4955     {
4956       const unsigned char *src = source + coding->consumed;
4957       unsigned char *dst = destination + coding->produced;
4958
4959       src_bytes -= coding->consumed;
4960       coding->errors++;
4961       if (COMPOSING_P (coding))
4962         DECODE_COMPOSITION_END ('1');
4963       while (src_bytes--)
4964         {
4965           int c = *src++;
4966           dst += CHAR_STRING (c, dst);
4967           coding->produced_char++;
4968         }
4969       coding->consumed = coding->consumed_char = src - source;
4970       coding->produced = dst - destination;
4971       coding->result = CODING_FINISH_NORMAL;
4972     }
4973
       /* A unibyte result is wanted: convert the output in place.  */
4974   if (!coding->dst_multibyte)
4975     {
4976       coding->produced = str_as_unibyte (destination, coding->produced);
4977       coding->produced_char = coding->produced;
4978     }
4979
4980   return coding->result;
4981 }
4982
4983 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4984    multibyteness of the source is CODING->src_multibyte, the
4985    multibyteness of the result is always unibyte.

        The counters CODING->produced, produced_char, consumed,
        consumed_char and errors are reset to zero before the encoder
        for CODING->type is called.  The return value is
        CODING->result.  */
4986
4987 int
4988 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4989      struct coding_system *coding;
4990      const unsigned char *source;
4991      unsigned char *destination;
4992      int src_bytes, dst_bytes;
4993 {
4994   coding->produced = coding->produced_char = 0;
4995   coding->consumed = coding->consumed_char = 0;
4996   coding->errors = 0;
4997   coding->result = CODING_FINISH_NORMAL;
4998
4999   switch (coding->type)
5000     {
5001     case coding_type_sjis:
5002       encode_coding_sjis_big5 (coding, source, destination,
5003                                src_bytes, dst_bytes, 1);
5004       break;
5005
5006     case coding_type_iso2022:
5007       encode_coding_iso2022 (coding, source, destination,
5008                              src_bytes, dst_bytes);
5009       break;
5010
5011     case coding_type_big5:
5012       encode_coding_sjis_big5 (coding, source, destination,
5013                                src_bytes, dst_bytes, 0);
5014       break;
5015
5016     case coding_type_emacs_mule:
5017       encode_coding_emacs_mule (coding, source, destination,
5018                                 src_bytes, dst_bytes);
5019       break;
5020
5021     case coding_type_ccl:
5022       ccl_coding_driver (coding, source, destination,
5023                          src_bytes, dst_bytes, 1);
5024       break;
5025
5026     default:
5027       encode_eol (coding, source, destination, src_bytes, dst_bytes);
5028     }
5029
       /* The encoder ran out of source in the last block.  Finish the
          output and copy whatever source is left without encoding it.  */
5030   if (coding->mode & CODING_MODE_LAST_BLOCK
5031       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
5032     {
5033       const unsigned char *src = source + coding->consumed;
5034       unsigned char *dst = destination + coding->produced;
5035
           /* NOTE(review): ENCODE_RESET_PLANE_AND_REGISTER is defined
              elsewhere; presumably it writes through DST -- confirm.  */
5036       if (coding->type == coding_type_iso2022)
5037         ENCODE_RESET_PLANE_AND_REGISTER;
           /* A composition is still open: emit ESC 1.  */
5038       if (COMPOSING_P (coding))
5039         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5040       if (coding->consumed < src_bytes)
5041         {
5042           int len = src_bytes - coding->consumed;
5043
5044           BCOPY_SHORT (src, dst, len);
5045           if (coding->src_multibyte)
5046             len = str_as_unibyte (dst, len);
5047           dst += len;
5048           coding->consumed = src_bytes;
5049         }
5050       coding->produced = coding->produced_char = dst - destination;
5051       coding->result = CODING_FINISH_NORMAL;
5052     }
5053
       /* Running out of source is not a problem when all of the source
          was consumed.  */
5054   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5055       && coding->consumed == src_bytes)
5056     coding->result = CODING_FINISH_NORMAL;
5057
5058   return coding->result;
5059 }
5060
5061 /* Scan text in the region between *BEG and *END (byte positions),
5062    skip characters which we don't have to decode by coding system
5063    CODING at the head and tail, then set *BEG and *END to the region
5064    of the text we actually have to convert.  The caller should move
5065    the gap out of the region in advance if the region is from a
5066    buffer.
5067
5068    If STR is not NULL, *BEG and *END are indices into STR.

        *BEG and *END are left as they are when CODING is of type CCL,
        undecided, no-conversion, raw-text or emacs-mule, when its
        end-of-line format is not LF, when it has a post-read-conversion,
        when composition is not disabled, or when the translation table
        for decoding translates some ASCII character.  */
5069
5070 static void
5071 shrink_decoding_region (beg, end, coding, str)
5072      int *beg, *end;
5073      struct coding_system *coding;
5074      unsigned char *str;
5075 {
5076   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5077   int eol_conversion;
5078   Lisp_Object translation_table;
5079
5080   if (coding->type == coding_type_ccl
5081       || coding->type == coding_type_undecided
5082       || coding->eol_type != CODING_EOL_LF
5083       || !NILP (coding->post_read_conversion)
5084       || coding->composing != COMPOSITION_DISABLED)
5085     {
5086       /* We can't skip any data.  */
5087       return;
5088     }
5089   if (coding->type == coding_type_no_conversion
5090       || coding->type == coding_type_raw_text
5091       || coding->type == coding_type_emacs_mule)
5092     {
5093       /* We need no conversion, but don't have to skip any data here.
5094          Decoding routine handles them effectively anyway.  */
5095       return;
5096     }
5097
5098   translation_table = coding->translation_table_for_decode;
5099   if (NILP (translation_table) && !NILP (Venable_character_translation))
5100     translation_table = Vstandard_translation_table_for_decode;
5101   if (CHAR_TABLE_P (translation_table))
5102     {
5103       int i;
5104       for (i = 0; i < 128; i++)
5105         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5106           break;
5107       if (i < 128)
5108         /* Some ASCII character should be translated.  We give up
5109            shrinking.  */
5110         return;
5111     }
5112
5113   if (coding->heading_ascii >= 0)
5114     /* Detection routine has already found how much we can skip at the
5115        head.  */
5116     *beg += coding->heading_ascii;
5117
       /* BEGP and ENDP are moved toward each other below; the _ORIG
          copies keep the starting points, so that *BEG and *END can be
          adjusted by the differences at the end.  */
5118   if (str)
5119     {
5120       begp_orig = begp = str + *beg;
5121       endp_orig = endp = str + *end;
5122     }
5123   else
5124     {
5125       begp_orig = begp = BYTE_POS_ADDR (*beg);
5126       endp_orig = endp = begp + *end - *beg;
5127     }
5128
       /* NOTE(review): we have returned above unless the end-of-line
          format is CODING_EOL_LF, so this is always zero here.  */
5129   eol_conversion = (coding->eol_type == CODING_EOL_CR
5130                     || coding->eol_type == CODING_EOL_CRLF);
5131
5132   switch (coding->type)
5133     {
5134     case coding_type_sjis:
5135     case coding_type_big5:
5136       /* We can skip all ASCII characters at the head.  */
5137       if (coding->heading_ascii < 0)
5138         {
5139           if (eol_conversion)
5140             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5141           else
5142             while (begp < endp && *begp < 0x80) begp++;
5143         }
5144       /* We can skip all ASCII characters at the tail except for the
5145          second byte of SJIS or BIG5 code.  */
5146       if (eol_conversion)
5147         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5148       else
5149         while (begp < endp && endp[-1] < 0x80) endp--;
5150       /* Do not consider LF as ascii if preceded by CR, since that
5151          confuses eol decoding. */
5152       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5153         endp++;
           /* The byte after an 8-bit byte may be the second byte of a
              2-byte code; keep it in the region.  */
5154       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5155         endp++;
5156       break;
5157
5158     case coding_type_iso2022:
5159       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5160         /* We can't skip any data.  */
5161         break;
5162       if (coding->heading_ascii < 0)
5163         {
5164           /* We can skip all ASCII characters at the head except for a
5165              few control codes.  */
5166           while (begp < endp && (c = *begp) < 0x80
5167                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5168                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5169                  && (!eol_conversion || c != ISO_CODE_LF))
5170             begp++;
5171         }
5172       switch (coding->category_idx)
5173         {
5174         case CODING_CATEGORY_IDX_ISO_8_1:
5175         case CODING_CATEGORY_IDX_ISO_8_2:
5176           /* We can skip all ASCII characters at the tail.  */
5177           if (eol_conversion)
5178             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5179           else
5180             while (begp < endp && endp[-1] < 0x80) endp--;
5181           /* Do not consider LF as ascii if preceded by CR, since that
5182              confuses eol decoding. */
5183           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5184             endp++;
5185           break;
5186
5187         case CODING_CATEGORY_IDX_ISO_7:
5188         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5189           {
5190             /* We can skip all characters at the tail except for 8-bit
5191                codes and ESC and the following 2-byte at the tail.  */
                 /* EIGHT_BIT is the position just after the last 8-bit
                    code in the scanned tail, or NULL if there is
                    none.  */
5192             unsigned char *eight_bit = NULL;
5193
5194             if (eol_conversion)
5195               while (begp < endp
5196                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5197                 {
5198                   if (!eight_bit && c & 0x80) eight_bit = endp;
5199                   endp--;
5200                 }
5201             else
5202               while (begp < endp
5203                      && (c = endp[-1]) != ISO_CODE_ESC)
5204                 {
5205                   if (!eight_bit && c & 0x80) eight_bit = endp;
5206                   endp--;
5207                 }
5208             /* Do not consider LF as ascii if preceded by CR, since that
5209                confuses eol decoding. */
5210             if (begp < endp && endp < endp_orig
5211                 && endp[-1] == '\r' && endp[0] == '\n')
5212               endp++;
5213             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5214               {
                     /* The two bytes after the ESC are ENDP[0] and
                        ENDP[1].  They must be read through the byte
                        pointer ENDP; END is the pointer to the integer
                        position and must not be indexed here.  */
5215                 if (endp + 1 < endp_orig && endp[0] == '(' && endp[1] == 'B')
5216                   /* This is an ASCII designation sequence.  We can
5217                      surely skip the tail.  But, if we have
5218                      encountered an 8-bit code, skip only the codes
5219                      after that.  */
5220                   endp = eight_bit ? eight_bit : endp + 2;
5221                 else
5222                   /* Hmmm, we can't skip the tail.  */
5223                   endp = endp_orig;
5224               }
5225             else if (eight_bit)
5226               endp = eight_bit;
5227           }
5228         }
5229       break;
5230
5231     default:
5232       abort ();
5233     }
       /* Move *BEG and *END by as much as the pointers have moved.  */
5234   *beg += begp - begp_orig;
5235   *end += endp - endp_orig;
5236   return;
5237 }
5238
5239 /* Like shrink_decoding_region but for encoding.  */
5240
5241 static void
5242 shrink_encoding_region (beg, end, coding, str)
5243      int *beg, *end;
5244      struct coding_system *coding;
5245      unsigned char *str;
5246 {
5247   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5248   int eol_conversion;
5249   Lisp_Object translation_table;
5250
5251   if (coding->type == coding_type_ccl
5252       || coding->eol_type == CODING_EOL_CRLF
5253       || coding->eol_type == CODING_EOL_CR
5254       || (coding->cmp_data && coding->cmp_data->used > 0))
5255     {
5256       /* We can't skip any data.  */
5257       return;
5258     }
5259   if (coding->type == coding_type_no_conversion
5260       || coding->type == coding_type_raw_text
5261       || coding->type == coding_type_emacs_mule
5262       || coding->type == coding_type_undecided)
5263     {
5264       /* We need no conversion, but don't have to skip any data here.
5265          Encoding routine handles them effectively anyway.  */
5266       return;
5267     }
5268
5269   translation_table = coding->translation_table_for_encode;
5270   if (NILP (translation_table) && !NILP (Venable_character_translation))
5271     translation_table = Vstandard_translation_table_for_encode;
5272   if (CHAR_TABLE_P (translation_table))
5273     {
5274       int i;
5275       for (i = 0; i < 128; i++)
5276         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5277           break;
5278       if (i < 128)
5279         /* Some ASCII character should be translated.  We give up
5280            shrinking.  */
5281         return;
5282     }
5283
5284   if (str)
5285     {
5286       begp_orig = begp = str + *beg;
5287       endp_orig = endp = str + *end;
5288     }
5289   else
5290     {
5291       begp_orig = begp = BYTE_POS_ADDR (*beg);
5292       endp_orig = endp = begp + *end - *beg;
5293     }
5294
5295   eol_conversion = (coding->eol_type == CODING_EOL_CR
5296                     || coding->eol_type == CODING_EOL_CRLF);
5297
5298   /* Here, we don't have to check coding->pre_write_conversion because
5299      the caller is expected to have handled it already.  */
5300   switch (coding->type)
5301     {
5302     case coding_type_iso2022:
5303       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5304         /* We can't skip any data.  */
5305         break;
5306       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5307         {
5308           unsigned char *bol = begp;
5309           while (begp < endp && *begp < 0x80)
5310             {
5311               begp++;
5312               if (begp[-1] == '\n')
5313                 bol = begp;
5314             }
5315           begp = bol;
5316           goto label_skip_tail;
5317         }
5318       /* fall down ... */
5319
5320     case coding_type_sjis:
5321     case coding_type_big5:
5322       /* We can skip all ASCII characters at the head and tail.  */
5323       if (eol_conversion)
5324         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5325       else
5326         while (begp < endp && *begp < 0x80) begp++;
5327     label_skip_tail:
5328       if (eol_conversion)
5329         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5330       else
5331         while (begp < endp && *(endp - 1) < 0x80) endp--;
5332       break;
5333
5334     default:
5335       abort ();
5336     }
5337
5338   *beg += begp - begp_orig;
5339   *end += endp - endp_orig;
5340   return;
5341 }
5342
/* Shrinking a conversion region carries some overhead of its own, so
   it is attempted only when the length of the region exceeds this
   value.  */
static int shrink_conversion_region_threshhold = 1024;

/* When the region *BEG .. *END is longer than the threshold above,
   narrow it with shrink_encoding_region (ENCODEP nonzero) or with
   shrink_decoding_region (ENCODEP zero).  BEG and END are pointers to
   the region bounds; note that they are evaluated more than once.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do {                                                                  \
    if (shrink_conversion_region_threshhold < *(end) - *(beg))          \
      {                                                                 \
        if (!(encodep))                                                 \
          shrink_decoding_region (beg, end, coding, str);               \
        else                                                            \
          shrink_encoding_region (beg, end, coding, str);               \
      }                                                                 \
  } while (0)
5356
5357 /* ARG is (CODING BUFFER ...) where CODING is what to be set in
5358    Vlast_coding_system_used and the remaining elements are buffers to
5359    kill.  */
5360 static Lisp_Object
5361 code_convert_region_unwind (arg)
5362      Lisp_Object arg;
5363 {
5364   struct gcpro gcpro1;
5365   GCPRO1 (arg);
5366
5367   inhibit_pre_post_conversion = 0;
5368   Vlast_coding_system_used = XCAR (arg);
5369   for (arg = XCDR (arg); ! NILP (arg); arg = XCDR (arg))
5370     Fkill_buffer (XCAR (arg));
5371
5372   UNGCPRO;
5373   return Qnil;
5374 }
5375
5376 /* Store information about all compositions in the range FROM and TO
5377    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5378    buffer or a string, defaults to the current buffer.  */
5379
5380 void
5381 coding_save_composition (coding, from, to, obj)
5382      struct coding_system *coding;
5383      int from, to;
5384      Lisp_Object obj;
5385 {
5386   Lisp_Object prop;
5387   int start, end;
5388
5389   if (coding->composing == COMPOSITION_DISABLED)
5390     return;
5391   if (!coding->cmp_data)
5392     coding_allocate_composition_data (coding, from);
5393   if (!find_composition (from, to, &start, &end, &prop, obj)
5394       || end > to)
5395     return;
5396   if (start < from
5397       && (!find_composition (end, to, &start, &end, &prop, obj)
5398           || end > to))
5399     return;
5400   coding->composing = COMPOSITION_NO;
5401   do
5402     {
5403       if (COMPOSITION_VALID_P (start, end, prop))
5404         {
5405           enum composition_method method = COMPOSITION_METHOD (prop);
5406           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5407               >= COMPOSITION_DATA_SIZE)
5408             coding_allocate_composition_data (coding, from);
5409           /* For relative composition, we remember start and end
5410              positions, for the other compositions, we also remember
5411              components.  */
5412           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5413           if (method != COMPOSITION_RELATIVE)
5414             {
5415               /* We must store a*/
5416               Lisp_Object val, ch;
5417
5418               val = COMPOSITION_COMPONENTS (prop);
5419               if (CONSP (val))
5420                 while (CONSP (val))
5421                   {
5422                     ch = XCAR (val), val = XCDR (val);
5423                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5424                   }
5425               else if (VECTORP (val) || STRINGP (val))
5426                 {
5427                   int len = (VECTORP (val)
5428                              ? XVECTOR (val)->size : SCHARS (val));
5429                   int i;
5430                   for (i = 0; i < len; i++)
5431                     {
5432                       ch = (STRINGP (val)
5433                             ? Faref (val, make_number (i))
5434                             : XVECTOR (val)->contents[i]);
5435                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5436                     }
5437                 }
5438               else              /* INTEGERP (val) */
5439                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5440             }
5441           CODING_ADD_COMPOSITION_END (coding, end - from);
5442         }
5443       start = end;
5444     }
5445   while (start < to
5446          && find_composition (start, to, &start, &end, &prop, obj)
5447          && end <= to);
5448
5449   /* Make coding->cmp_data point to the first memory block.  */
5450   while (coding->cmp_data->prev)
5451     coding->cmp_data = coding->cmp_data->prev;
5452   coding->cmp_data_start = 0;
5453 }
5454
5455 /* Reflect the saved information about compositions to OBJ.
5456    CODING->cmp_data points to a memory block for the information.  OBJ
5457    is a buffer or a string, defaults to the current buffer.  */
5458
5459 void
5460 coding_restore_composition (coding, obj)
5461      struct coding_system *coding;
5462      Lisp_Object obj;
5463 {
5464   struct composition_data *cmp_data = coding->cmp_data;
5465
5466   if (!cmp_data)
5467     return;
5468
5469   while (cmp_data->prev)
5470     cmp_data = cmp_data->prev;
5471
5472   while (cmp_data)
5473     {
5474       int i;
5475
5476       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5477            i += cmp_data->data[i])
5478         {
5479           int *data = cmp_data->data + i;
5480           enum composition_method method = (enum composition_method) data[3];
5481           Lisp_Object components;
5482
5483           if (data[0] < 0 || i + data[0] > cmp_data->used)
5484             /* Invalid composition data.  */
5485             break;
5486
5487           if (method == COMPOSITION_RELATIVE)
5488             components = Qnil;
5489           else
5490             {
5491               int len = data[0] - 4, j;
5492               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5493
5494               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5495                   && len % 2 == 0)
5496                 len --;
5497               if (len < 1)
5498                 /* Invalid composition data.  */
5499                 break;
5500               for (j = 0; j < len; j++)
5501                 args[j] = make_number (data[4 + j]);
5502               components = (method == COMPOSITION_WITH_ALTCHARS
5503                             ? Fstring (len, args)
5504                             : Fvector (len, args));
5505             }
5506           compose_text (data[1], data[2], components, Qnil, obj);
5507         }
5508       cmp_data = cmp_data->next;
5509     }
5510 }
5511
5512 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5513    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5514    coding system CODING, and return the status code of code conversion
5515    (currently, this value has no meaning).
5516
5517    How many characters (and bytes) are converted to how many
5518    characters (and bytes) are recorded in members of the structure
5519    CODING.
5520
5521    If REPLACE is nonzero, we do various things as if the original text
5522    is deleted and a new text is inserted.  See the comments in
5523    replace_range (insdel.c) to know what we are doing.
5524
5525    If REPLACE is zero, it is assumed that the source text is unibyte.
5526    Otherwise, it is assumed that the source text is multibyte.

        Both return statements of this function return 0.  On the normal
        path CODING->consumed, consumed_char, produced and produced_char
        are set from the source region and from the converted text; on
        the early-return path, taken when shrinking leaves nothing to
        convert, only produced and produced_char are set.  */
5527
5528 int
5529 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5530      int from, from_byte, to, to_byte, encodep, replace;
5531      struct coding_system *coding;
5532 {
5533   int len = to - from, len_byte = to_byte - from_byte;
5534   int nchars_del = 0, nbytes_del = 0;
5535   int require, inserted, inserted_byte;
5536   int head_skip, tail_skip, total_skip = 0;
5537   Lisp_Object saved_coding_symbol;
5538   int first = 1;
5539   unsigned char *src, *dst;
5540   Lisp_Object deletion;
5541   int orig_point = PT, orig_len = len;
5542   int prev_Z;
5543   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5544
5545   deletion = Qnil;
5546   saved_coding_symbol = coding->symbol;
5547
       /* If point lies strictly inside the region, park it at FROM and
          remember that as the position to restore at the end.  */
5548   if (from < PT && PT < to)
5549     {
5550       TEMP_SET_PT_BOTH (from, from_byte);
5551       orig_point = from;
5552     }
5553
5554   if (replace)
5555     {
5556       int saved_from = from;
5557       int saved_inhibit_modification_hooks;
5558
5559       prepare_to_modify_buffer (from, to, &from);
           /* prepare_to_modify_buffer may have changed FROM; if so,
              recompute the other bounds from it.  */
5560       if (saved_from != from)
5561         {
5562           to = from + len;
5563           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5564           len_byte = to_byte - from_byte;
5565         }
5566
5567       /* The code conversion routine can not preserve text properties
5568          for now.  So, we must remove all text properties in the
5569          region.  Here, we must suppress all modification hooks.  */
5570       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5571       inhibit_modification_hooks = 1;
5572       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5573       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5574     }
5575
5576   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5577     {
5578       /* We must detect encoding of text and eol format.  */
5579
5580       if (from < GPT && to > GPT)
5581         move_gap_both (from, from_byte);
5582       if (coding->type == coding_type_undecided)
5583         {
5584           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5585           if (coding->type == coding_type_undecided)
5586             {
5587               /* It seems that the text contains only ASCII, but we
5588                  should not leave it undecided because the deeper
5589                  decoding routine (decode_coding) tries to detect the
5590                  encodings again in vain.  */
5591               coding->type = coding_type_emacs_mule;
5592               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5593               /* As emacs-mule decoder will handle composition, we
5594                  need this setting to allocate coding->cmp_data
5595                  later.  */
5596               coding->composing = COMPOSITION_NO;
5597             }
5598         }
5599       if (coding->eol_type == CODING_EOL_UNDECIDED
5600           && coding->type != coding_type_ccl)
5601         {
5602           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5603           if (coding->eol_type == CODING_EOL_UNDECIDED)
5604             coding->eol_type = CODING_EOL_LF;
5605           /* We had better recover the original eol format if we
5606              encounter an inconsistent eol format while decoding.  */
5607           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5608         }
5609     }
5610
5611   /* Now we convert the text.  */
5612
5613   /* For encoding, we must process pre-write-conversion in advance.  */
5614   if (! inhibit_pre_post_conversion
5615       && encodep
5616       && SYMBOLP (coding->pre_write_conversion)
5617       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5618     {
5619       /* The function in pre-write-conversion may put a new text in a
5620          new buffer.  */
5621       struct buffer *prev = current_buffer;
5622       Lisp_Object new;
5623
5624       record_unwind_protect (code_convert_region_unwind,
5625                              Fcons (Vlast_coding_system_used, Qnil));
5626       /* We should not call any more pre-write/post-read-conversion
5627          functions while this pre-write-conversion is running.  */
5628       inhibit_pre_post_conversion = 1;
5629       call2 (coding->pre_write_conversion,
5630              make_number (from), make_number (to));
5631       inhibit_pre_post_conversion = 0;
5632       /* Discard the unwind protect.  */
5633       specpdl_ptr--;
5634
5635       if (current_buffer != prev)
5636         {
               /* The function left us in another buffer: replace the
                  region in the original buffer with the accessible part
                  of that buffer, kill it, and recompute the bounds.  */
5637           len = ZV - BEGV;
5638           new = Fcurrent_buffer ();
5639           set_buffer_internal_1 (prev);
5640           del_range_2 (from, from_byte, to, to_byte, 0);
5641           TEMP_SET_PT_BOTH (from, from_byte);
5642           insert_from_buffer (XBUFFER (new), 1, len, 0);
5643           Fkill_buffer (new);
5644           if (orig_point >= to)
5645             orig_point += len - orig_len;
5646           else if (orig_point > from)
5647             orig_point = from;
5648           orig_len = len;
5649           to = from + len;
5650           from_byte = CHAR_TO_BYTE (from);
5651           to_byte = CHAR_TO_BYTE (to);
5652           len_byte = to_byte - from_byte;
5653           TEMP_SET_PT_BOTH (from, from_byte);
5654         }
5655     }
5656
5657   if (replace)
5658     {
           /* Keep what is being replaced for the later call to
              adjust_after_replace: the text itself unless the undo list
              is t, otherwise only its character and byte counts.  */
5659       if (! EQ (current_buffer->undo_list, Qt))
5660         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5661       else
5662         {
5663           nchars_del = to - from;
5664           nbytes_del = to_byte - from_byte;
5665         }
5666     }
5667
5668   if (coding->composing != COMPOSITION_DISABLED)
5669     {
5670       if (encodep)
5671         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5672       else
5673         coding_allocate_composition_data (coding, from);
5674     }
5675
5676   /* Try to skip the heading and tailing ASCIIs.  We can't skip them
5677      if we must run CCL program or there are compositions to
5678      encode.  */
5679   if (coding->type != coding_type_ccl
5680       && (! coding->cmp_data || coding->cmp_data->used == 0))
5681     {
5682       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5683
5684       if (from < GPT && GPT < to)
5685         move_gap_both (from, from_byte);
5686       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5687       if (from_byte == to_byte
5688           && (encodep || NILP (coding->post_read_conversion))
5689           && ! CODING_REQUIRE_FLUSHING (coding))
5690         {
               /* Nothing is left to convert: the text stays as it is.  */
5691           coding->produced = len_byte;
5692           coding->produced_char = len;
5693           if (!replace)
5694             /* We must record and adjust for this new text now.  */
5695             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5696           coding_free_composition_data (coding);
5697           return 0;
5698         }
5699
           /* The skipped bytes are ASCII, one byte per character, so the
              same counts adjust the character and the byte positions.  */
5700       head_skip = from_byte - from_byte_orig;
5701       tail_skip = to_byte_orig - to_byte;
5702       total_skip = head_skip + tail_skip;
5703       from += head_skip;
5704       to -= tail_skip;
5705       len -= total_skip; len_byte -= total_skip;
5706     }
5707
5708   /* For conversion, we must put the gap before the text in addition to
5709      making the gap larger for efficient decoding.  The required gap
5710      size starts from 2000 which is the magic number used in make_gap.
5711      But, after one batch of conversion, it will be incremented if we
5712      find that it is not enough.  */
5713   require = 2000;
5714
5715   if (GAP_SIZE  < require)
5716     make_gap (require - GAP_SIZE);
5717   move_gap_both (from, from_byte);
5718
5719   inserted = inserted_byte = 0;
5720
       /* Account for the source text as if it were part of the gap: it
          now sits at the end of the gap (see the diagrams below) and the
          converted text is written at the start of the gap.  */
5721   GAP_SIZE += len_byte;
5722   ZV -= len;
5723   Z -= len;
5724   ZV_BYTE -= len_byte;
5725   Z_BYTE -= len_byte;
5726
5727   if (GPT - BEG < BEG_UNCHANGED)
5728     BEG_UNCHANGED = GPT - BEG;
5729   if (Z - GPT < END_UNCHANGED)
5730     END_UNCHANGED = Z - GPT;
5731
5732   if (!encodep && coding->src_multibyte)
5733     {
5734       /* Decoding routines expects that the source text is unibyte.
5735          We must convert 8-bit characters of multibyte form to
5736          unibyte.  */
5737       int len_byte_orig = len_byte;
5738       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
           /* If the text got shorter, slide it so that it still ends at
              GAP_END_ADDR.  */
5739       if (len_byte < len_byte_orig)
5740         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5741                     len_byte);
5742       coding->src_multibyte = 0;
5743     }
5744
5745   for (;;)
5746     {
5747       int result;
5748
5749       /* The buffer memory is now:
5750          +--------+converted-text+---------+-------original-text-------+---+
5751          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5752                   |<---------------------- GAP ----------------------->|  */
5753       src = GAP_END_ADDR - len_byte;
5754       dst = GPT_ADDR + inserted_byte;
5755
5756       if (encodep)
5757         result = encode_coding (coding, src, dst, len_byte, 0);
5758       else
5759         {
5760           if (coding->composing != COMPOSITION_DISABLED)
5761             coding->cmp_data->char_offset = from + inserted;
5762           result = decode_coding (coding, src, dst, len_byte, 0);
5763         }
5764
5765       /* The buffer memory is now:
5766          +--------+-------converted-text----+--+------original-text----+---+
5767          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5768                   |<---------------------- GAP ----------------------->|  */
5769
5770       inserted += coding->produced_char;
5771       inserted_byte += coding->produced;
5772       len_byte -= coding->consumed;
5773
5774       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5775         {
               /* Get another block for composition data and go on with
                  the remaining source.  */
5776           coding_allocate_composition_data (coding, from + inserted);
5777           continue;
5778         }
5779
5780       src += coding->consumed;
5781       dst += coding->produced;
5782
5783       if (result == CODING_FINISH_NORMAL)
5784         {
5785           src += len_byte;
5786           break;
5787         }
5788       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5789         {
5790           unsigned char *pend = dst, *p = pend - inserted_byte;
5791           Lisp_Object eol_type;
5792
5793           /* Encode LFs back to the original eol format (CR or CRLF).  */
5794           if (coding->eol_type == CODING_EOL_CR)
5795             {
5796               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5797             }
5798           else
5799             {
5800               int count = 0;
5801
                   /* Each LF grows into CR LF, so COUNT more bytes are
                      needed between DST and SRC.  */
5802               while (p < pend) if (*p++ == '\n') count++;
5803               if (src - dst < count)
5804                 {
5805                   /* We don't have sufficient room for encoding LFs
5806                      back to CRLF.  We must record converted and
5807                      not-yet-converted text back to the buffer
5808                      content, enlarge the gap, then record them out of
5809                      the buffer contents again.  */
5810                   int add = len_byte + inserted_byte;
5811
5812                   GAP_SIZE -= add;
5813                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5814                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5815                   make_gap (count - GAP_SIZE);
5816                   GAP_SIZE += add;
5817                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5818                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5819                   /* Don't forget to update SRC, DST, and PEND.  */
5820                   src = GAP_END_ADDR - len_byte;
5821                   dst = GPT_ADDR + inserted_byte;
5822                   pend = dst;
5823                 }
5824               inserted += count;
5825               inserted_byte += count;
5826               coding->produced += count;
5827               p = dst = pend + count;
                   /* Copy backward from the end, putting a CR in front
                      of each LF, until every LF has been expanded.  */
5828               while (count)
5829                 {
5830                   *--p = *--pend;
5831                   if (*p == '\n') count--, *--p = '\r';
5832                 }
5833             }
5834
5835           /* Suppress eol-format conversion in the further conversion.  */
5836           coding->eol_type = CODING_EOL_LF;
5837
5838           /* Set the coding system symbol to that for Unix-like EOL.  */
5839           eol_type = Fget (saved_coding_symbol, Qeol_type);
5840           if (VECTORP (eol_type)
5841               && XVECTOR (eol_type)->size == 3
5842               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5843             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5844           else
5845             coding->symbol = saved_coding_symbol;
5846
5847           continue;
5848         }
5849       if (len_byte <= 0)
5850         {
               /* All source is consumed.  A CCL based coding system gets
                  one more round with CODING_MODE_LAST_BLOCK set.  */
5851           if (coding->type != coding_type_ccl
5852               || coding->mode & CODING_MODE_LAST_BLOCK)
5853             break;
5854           coding->mode |= CODING_MODE_LAST_BLOCK;
5855           continue;
5856         }
5857       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5858         {
5859           /* The source text ends in invalid codes.  Let's just
5860              make them valid buffer contents, and finish conversion.  */
5861           if (multibyte_p)
5862             {
5863               unsigned char *start = dst;
5864
5865               inserted += len_byte;
5866               while (len_byte--)
5867                 {
5868                   int c = *src++;
5869                   dst += CHAR_STRING (c, dst);
5870                 }
5871
5872               inserted_byte += dst - start;
5873             }
5874           else
5875             {
5876               inserted += len_byte;
5877               inserted_byte += len_byte;
5878               while (len_byte--)
5879                 *dst++ = *src++;
5880             }
5881           break;
5882         }
5883       if (result == CODING_FINISH_INTERRUPT)
5884         {
5885           /* The conversion procedure was interrupted by a user.  */
5886           break;
5887         }
5888       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5889       if (coding->consumed < 1)
5890         {
5891           /* It's quite strange to require more memory without
5892              consuming any bytes.  Perhaps CCL program bug.  */
5893           break;
5894         }
5895       if (first)
5896         {
5897           /* We have just done the first batch of conversion which was
5898              stopped because of insufficient gap.  Let's reconsider the
5899              required gap size (i.e. SRC - DST) now.
5900
5901              We have converted ORIG bytes (== coding->consumed) into
5902              NEW bytes (coding->produced).  To convert the remaining
5903              LEN bytes, we may need REQUIRE bytes of gap, where:
5904                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5905                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5906              Here, we are sure that NEW >= ORIG.  */
5907
5908           if (coding->produced <= coding->consumed)
5909             {
5910               /* This happens because of CCL-based coding system with
5911                  eol-type CRLF.  */
5912               require = 0;
5913             }
5914           else
5915             {
5916               float ratio = coding->produced - coding->consumed;
5917               ratio /= coding->consumed;
5918               require = len_byte * ratio;
5919             }
5920           first = 0;
5921         }
5922       if ((src - dst) < (require + 2000))
5923         {
5924           /* See the comment above the previous call of make_gap.  */
5925           int add = len_byte + inserted_byte;
5926
5927           GAP_SIZE -= add;
5928           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5929           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5930           make_gap (require + 2000);
5931           GAP_SIZE += add;
5932           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5933           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5934         }
5935     }
5936   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5937
5938   if (encodep && coding->dst_multibyte)
5939     {
5940       /* The output is unibyte.  We must convert 8-bit characters to
5941          multibyte form.  */
5942       if (inserted_byte * 2 > GAP_SIZE)
5943         {
5944           GAP_SIZE -= inserted_byte;
5945           ZV += inserted_byte; Z += inserted_byte;
5946           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5947           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5948           make_gap (inserted_byte - GAP_SIZE);
5949           GAP_SIZE += inserted_byte;
5950           ZV -= inserted_byte; Z -= inserted_byte;
5951           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5952           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5953         }
5954       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5955     }
5956
5957   /* If we shrank the conversion area, adjust it now.  HEAD_SKIP and
          TAIL_SKIP are assigned only in the shrinking branch above, which
          is also the only place where TOTAL_SKIP is changed from 0.  */
5958   if (total_skip > 0)
5959     {
5960       if (tail_skip > 0)
5961         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5962       inserted += total_skip; inserted_byte += total_skip;
5963       GAP_SIZE += total_skip;
5964       GPT -= head_skip; GPT_BYTE -= head_skip;
5965       ZV -= total_skip; ZV_BYTE -= total_skip;
5966       Z -= total_skip; Z_BYTE -= total_skip;
5967       from -= head_skip; from_byte -= head_skip;
5968       to += tail_skip; to_byte += tail_skip;
5969     }
5970
5971   prev_Z = Z;
5972   if (! EQ (current_buffer->undo_list, Qt))
5973     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5974   else
5975     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5976                                  inserted, inserted_byte);
       /* From here on INSERTED is the change of Z made by the adjustment
          just above.  */
5977   inserted = Z - prev_Z;
5978
5979   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5980     coding_restore_composition (coding, Fcurrent_buffer ());
5981   coding_free_composition_data (coding);
5982
5983   if (! inhibit_pre_post_conversion
5984       && ! encodep && ! NILP (coding->post_read_conversion))
5985     {
5986       Lisp_Object val;
5987       Lisp_Object saved_coding_system;
5988
5989       if (from != PT)
5990         TEMP_SET_PT_BOTH (from, from_byte);
5991       prev_Z = Z;
5992       record_unwind_protect (code_convert_region_unwind,
5993                              Fcons (Vlast_coding_system_used, Qnil));
5994       saved_coding_system = Vlast_coding_system_used;
5995       Vlast_coding_system_used = coding->symbol;
5996       /* We should not call any more pre-write/post-read-conversion
5997          functions while this post-read-conversion is running.  */
5998       inhibit_pre_post_conversion = 1;
5999       val = call1 (coding->post_read_conversion, make_number (inserted));
6000       inhibit_pre_post_conversion = 0;
           /* Take over whatever the function left in
              Vlast_coding_system_used, then restore the variable.  */
6001       coding->symbol = Vlast_coding_system_used;
6002       Vlast_coding_system_used = saved_coding_system;
6003       /* Discard the unwind protect.  */
6004       specpdl_ptr--;
6005       CHECK_NUMBER (val);
6006       inserted += Z - prev_Z;
6007     }
6008
       /* Put point back: shifted by the change in length if it was at or
          after the end of the original region, at FROM if it was inside
          the region.  */
6009   if (orig_point >= from)
6010     {
6011       if (orig_point >= from + orig_len)
6012         orig_point += inserted - orig_len;
6013       else
6014         orig_point = from;
6015       TEMP_SET_PT (orig_point);
6016     }
6017
6018   if (replace)
6019     {
6020       signal_after_change (from, to - from, inserted);
6021       update_compositions (from, from + inserted, CHECK_BORDER);
6022     }
6023
       /* Report the totals for the whole region in CODING.  */
6024   {
6025     coding->consumed = to_byte - from_byte;
6026     coding->consumed_char = to - from;
6027     coding->produced = inserted_byte;
6028     coding->produced_char = inserted;
6029   }
6030
6031   return 0;
6032 }
6033
6034 /* Name (or base name) of the work buffer used for code conversion.
        set_conversion_work_buffer hands it to Fget_buffer_create, and to
        Fgenerate_new_buffer_name when that buffer is already current.  */
6035 static Lisp_Object Vcode_conversion_workbuf_name;
6036
6037 /* Set the current buffer to the working buffer prepared for
6038    code-conversion.  MULTIBYTE specifies the multibyteness of the
6039    buffer.  Return the buffer we set if it must be killed after use.
6040    Otherwise return Qnil.

        The work buffer is reset on every call: its overlays are deleted,
        its directory is copied from the buffer that was current on
        entry, read_only and filename are cleared, undo_list is set to
        Qt, any narrowing is removed and the text from BEG to Z is
        deleted.  This function does not switch back; the caller has to
        restore its own current buffer.  */
6041
6042 static Lisp_Object
6043 set_conversion_work_buffer (multibyte)
6044      int multibyte;
6045 {
6046   Lisp_Object buffer, buffer_to_kill;
6047   struct buffer *buf;
6048
6049   buffer = Fget_buffer_create (Vcode_conversion_workbuf_name);
6050   buf = XBUFFER (buffer);
6051   if (buf == current_buffer)
6052     {
6053       /* As we are already in the work buffer, we must generate a new
6054          buffer for the work.  */
6055       Lisp_Object name;
6056
6057       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6058       buffer = buffer_to_kill = Fget_buffer_create (name);
6059       buf = XBUFFER (buffer);
6060     }
6061   else
6062     buffer_to_kill = Qnil;
6063
       /* Reset the state that a previous use of the buffer may have left
          behind.  The directory is taken from the buffer that is still
          current at this point, i.e. the caller's buffer.  */
6064   delete_all_overlays (buf);
6065   buf->directory = current_buffer->directory;
6066   buf->read_only = Qnil;
6067   buf->filename = Qnil;
       /* NOTE(review): Qt in undo_list presumably turns undo recording
          off for the work buffer -- confirm.  */
6068   buf->undo_list = Qt;
6069   eassert (buf->overlays_before == NULL);
6070   eassert (buf->overlays_after == NULL);
6071   set_buffer_internal (buf);
       /* Remove any narrowing first so that the deletion below covers
          the whole buffer.  */
6072   if (BEG != BEGV || Z != ZV)
6073     Fwiden ();
6074   del_range_2 (BEG, BEG_BYTE, Z, Z_BYTE, 0);
       /* Store the requested multibyteness directly in the buffer.  */
6075   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6076   return buffer_to_kill;
6077 }
6078
6079 Lisp_Object
6080 run_pre_post_conversion_on_str (str, coding, encodep)
6081      Lisp_Object str;
6082      struct coding_system *coding;
6083      int encodep;
6084 {
       /* Run a conversion function of CODING on the text of STR and
          return the resulting text as a string.  If ENCODEP is nonzero,
          call CODING->pre_write_conversion with the bounds (BEG and Z)
          of the text; otherwise call CODING->post_read_conversion with
          the number of characters.  The function runs in the conversion
          work buffer with STR inserted there.  The buffer that was
          current on entry is recorded with record_unwind_protect and is
          selected again by the final unbind_to.  */
6085   int count = SPECPDL_INDEX ();
6086   struct gcpro gcpro1, gcpro2;
6087   int multibyte = STRING_MULTIBYTE (str);
6088   Lisp_Object old_deactivate_mark;
6089   Lisp_Object buffer_to_kill;
6090   Lisp_Object unwind_arg;
6091
6092   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
6093   /* It is not crucial to specbind this.  */
6094   old_deactivate_mark = Vdeactivate_mark;
6095   GCPRO2 (str, old_deactivate_mark);
6096
6097   /* We must insert the contents of STR as is without
6098      unibyte<->multibyte conversion.  For that, we adjust the
6099      multibyteness of the working buffer to that of STR.  */
6100   buffer_to_kill = set_conversion_work_buffer (multibyte);
       /* UNWIND_ARG is a list whose car is the current value of
          Vlast_coding_system_used and whose cdr lists the buffers to be
          killed.  Presumably code_convert_region_unwind (defined outside
          this view) restores the former and kills the latter -- confirm
          against its definition.  */
6101   if (NILP (buffer_to_kill))
6102     unwind_arg = Fcons (Vlast_coding_system_used, Qnil);
6103   else
6104     unwind_arg = list2 (Vlast_coding_system_used, buffer_to_kill);
6105   record_unwind_protect (code_convert_region_unwind, unwind_arg);
6106
6107   insert_from_string (str, 0, 0,
6108                       SCHARS (str), SBYTES (str), 0);
6109   UNGCPRO;
       /* Keep the conversion function from triggering further
          pre-write/post-read conversions while it runs.
          NOTE(review): the flag is cleared below by a plain assignment,
          which is skipped on a non-local exit; presumably the unwind
          function clears it in that case -- confirm.  */
6110   inhibit_pre_post_conversion = 1;
6111   if (encodep)
6112     {
6113       struct buffer *prev = current_buffer;
6114
6115       call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6116       if (prev != current_buffer)
6117         /* We must kill the current buffer too.  */
6118         Fsetcdr (unwind_arg, Fcons (Fcurrent_buffer (), XCDR (unwind_arg)));
6119     }
6120   else
6121     {
           /* The post-read function is called with point at the start
              of the text and with Vlast_coding_system_used set to
              CODING's symbol; whatever value it leaves in that variable
              becomes CODING->symbol.  */
6122       Vlast_coding_system_used = coding->symbol;
6123       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6124       call1 (coding->post_read_conversion, make_number (Z - BEG));
6125       coding->symbol = Vlast_coding_system_used;
6126     }
6127   inhibit_pre_post_conversion = 0;
6128   Vdeactivate_mark = old_deactivate_mark;
       /* Take the result from whichever buffer is current now; the
          unbind_to below then undoes the buffer switch.  */
6129   str = make_buffer_string (BEG, Z, 1);
6130   return unbind_to (count, str);
6131 }
6132
6133
6134 /* Run pre-write-conversion function of CODING on NCHARS/NBYTES
6135    text in *STR.  *SIZE is the allocated bytes for STR.  As it
6136    is intended that this function is called from encode_terminal_code,
6137    the pre-write-conversion function is run by safe_call and thus
6138    "Error during redisplay: ..." is logged when an error occurs.
6139
6140    Store the resulting text in *STR and set CODING->produced_char and
6141    CODING->produced to the number of characters and bytes
6142    respectively.  If the size of *STR is too small, enlarge it by
6143    xrealloc and update *STR and *SIZE.

        CODING->src_multibyte is also updated to the multibyteness of the
        buffer that holds the result.  Vdeactivate_mark and
        Vlast_coding_system_used are restored to their values on entry,
        and the buffer that was current on entry is made current again.

        The spelling `conversin' in the function name is historical;
        callers use it as is.  */
6144
6145 void
6146 run_pre_write_conversin_on_c_str (str, size, nchars, nbytes, coding)
6147      unsigned char **str;
6148      int *size, nchars, nbytes;
6149      struct coding_system *coding;
6150 {
6151   struct gcpro gcpro1, gcpro2;
6152   struct buffer *cur = current_buffer;
6153   struct buffer *prev;
6154   Lisp_Object old_deactivate_mark, old_last_coding_system_used;
6155   Lisp_Object args[3];
6156   Lisp_Object buffer_to_kill;
6157
6158   /* It is not crucial to specbind this.  */
6159   old_deactivate_mark = Vdeactivate_mark;
6160   old_last_coding_system_used = Vlast_coding_system_used;
6161   GCPRO2 (old_deactivate_mark, old_last_coding_system_used);
6162
6163   /* We must insert the contents of STR as is without
6164      unibyte<->multibyte conversion.  For that, we give the working
6165      buffer the multibyteness recorded in CODING->src_multibyte.  */
6166   buffer_to_kill = set_conversion_work_buffer (coding->src_multibyte);
6167   insert_1_both (*str, nchars, nbytes, 0, 0, 0);
6168   UNGCPRO;
6169   inhibit_pre_post_conversion = 1;
       /* Remember the work buffer so that we can tell afterwards whether
          the conversion function switched to another buffer.  */
6170   prev = current_buffer;
       /* Call the function on the whole text, from BEG to Z.  */
6171   args[0] = coding->pre_write_conversion;
6172   args[1] = make_number (BEG);
6173   args[2] = make_number (Z);
6174   safe_call (3, args);
6175   inhibit_pre_post_conversion = 0;
6176   Vdeactivate_mark = old_deactivate_mark;
6177   Vlast_coding_system_used = old_last_coding_system_used;
       /* The result is the whole text of the buffer that is current
          now.  */
6178   coding->produced_char = Z - BEG;
6179   coding->produced = Z_BYTE - BEG_BYTE;
6180   if (coding->produced > *size)
6181     {
6182       *size = coding->produced;
6183       *str = xrealloc (*str, *size);
6184     }
       /* When the gap lies strictly inside the text, move it out of the
          way; presumably this makes the text contiguous in memory for
          the single bcopy below -- confirm against BEG_ADDR.  */
6185   if (BEG < GPT && GPT < Z)
6186     move_gap (BEG);
6187   bcopy (BEG_ADDR, *str, coding->produced);
6188   coding->src_multibyte
6189     = ! NILP (current_buffer->enable_multibyte_characters);
       /* Kill the buffer the conversion function switched to, if any,
          then return to the caller's buffer and dispose of the extra
          work buffer when one had to be created.  */
6190   if (prev != current_buffer)
6191     Fkill_buffer (Fcurrent_buffer ());
6192   set_buffer_internal (cur);
6193   if (! NILP (buffer_to_kill))
6194     Fkill_buffer (buffer_to_kill);
6195 }
6196
6197
6198 Lisp_Object
6199 decode_coding_string (str, coding, nocopy)
6200      Lisp_Object str;
6201      struct coding_system *coding;
6202      int nocopy;
6203 {
       /* Decode the text of STR according to CODING and return the
          decoded text as a string.  On return CODING->consumed,
          CODING->consumed_char, CODING->produced and
          CODING->produced_char describe the whole conversion.

          When no decoding is required and CODING has no usable
          post-read-conversion function, STR itself is returned if NOCOPY
          is nonzero (a copy otherwise), possibly reinterpreted as a
          multibyte string.  If CODING->post_read_conversion is a symbol
          with a function definition, it is run on the decoded text and
          its result is returned.  */
6204   int len;
6205   struct conversion_buffer buf;
6206   int from, to_byte;
6207   Lisp_Object saved_coding_symbol;
6208   int result;
6209   int require_decoding;
       /* Number of bytes at the head and tail of STR that are skipped
          by SHRINK_CONVERSION_REGION and copied to the result as is.  */
6210   int shrinked_bytes = 0;
6211   Lisp_Object newstr;
       /* Totals accumulated over all the decode_coding calls below.  */
6212   int consumed, consumed_char, produced, produced_char;
6213
6214   from = 0;
6215   to_byte = SBYTES (str);
6216
6217   saved_coding_symbol = coding->symbol;
6218   coding->src_multibyte = STRING_MULTIBYTE (str);
6219   coding->dst_multibyte = 1;
6220   if (CODING_REQUIRE_DETECTION (coding))
6221     {
6222       /* See the comments in code_convert_region.  */
6223       if (coding->type == coding_type_undecided)
6224         {
6225           detect_coding (coding, SDATA (str), to_byte);
6226           if (coding->type == coding_type_undecided)
6227             {
6228               coding->type = coding_type_emacs_mule;
6229               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6230               /* As emacs-mule decoder will handle composition, we
6231                  need this setting to allocate coding->cmp_data
6232                  later.  */
6233               coding->composing = COMPOSITION_NO;
6234             }
6235         }
6236       if (coding->eol_type == CODING_EOL_UNDECIDED
6237           && coding->type != coding_type_ccl)
6238         {
               /* Remember the symbol as it is before eol detection; it
                  is used below to find the LF variant if the eol format
                  turns out to be inconsistent.  */
6239           saved_coding_symbol = coding->symbol;
6240           detect_eol (coding, SDATA (str), to_byte);
6241           if (coding->eol_type == CODING_EOL_UNDECIDED)
6242             coding->eol_type = CODING_EOL_LF;
6243           /* We had better recover the original eol format if we
6244              encounter an inconsistent eol format while decoding.  */
6245           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6246         }
6247     }
6248
6249   if (coding->type == coding_type_no_conversion
6250       || coding->type == coding_type_raw_text)
6251     coding->dst_multibyte = 0;
6252
6253   require_decoding = CODING_REQUIRE_DECODING (coding);
6254
6255   if (STRING_MULTIBYTE (str))
6256     {
6257       /* Decoding routines expect the source text to be unibyte.  */
6258       str = Fstring_as_unibyte (str);
6259       to_byte = SBYTES (str);
           /* STR is now a string made by Fstring_as_unibyte, so the
              early return below need not copy it again.  */
6260       nocopy = 1;
6261       coding->src_multibyte = 0;
6262     }
6263
6264   /* Try to skip the leading and trailing ASCII bytes.  */
6265   if (require_decoding && coding->type != coding_type_ccl)
6266     {
6267       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6268                                 0);
6269       if (from == to_byte)
6270         require_decoding = 0;
6271       shrinked_bytes = from + (SBYTES (str) - to_byte);
6272     }
6273
       /* Early return: nothing has to be decoded and there is no
          post-read-conversion function that could be called.  */
6274   if (!require_decoding
6275       && !(SYMBOLP (coding->post_read_conversion)
6276            && !NILP (Ffboundp (coding->post_read_conversion))))
6277     {
6278       coding->consumed = SBYTES (str);
6279       coding->consumed_char = SCHARS (str);
6280       if (coding->dst_multibyte)
6281         {
6282           str = Fstring_as_multibyte (str);
6283           nocopy = 1;
6284         }
6285       coding->produced = SBYTES (str);
6286       coding->produced_char = SCHARS (str);
6287       return (nocopy ? str : Fcopy_sequence (str));
6288     }
6289
6290   if (coding->composing != COMPOSITION_DISABLED)
6291     coding_allocate_composition_data (coding, from);
6292   len = decoding_buffer_size (coding, to_byte - from);
6293   allocate_conversion_buffer (buf, len);
6294
6295   consumed = consumed_char = produced = produced_char = 0;
       /* Call decode_coding repeatedly, each time continuing after what
          has been consumed and produced so far, until it finishes or
          stops making progress.  The other result codes are handled
          inside the loop and the decoding is then resumed.  */
6296   while (1)
6297     {
6298       result = decode_coding (coding, SDATA (str) + from + consumed,
6299                               buf.data + produced, to_byte - from - consumed,
6300                               buf.size - produced);
6301       consumed += coding->consumed;
6302       consumed_char += coding->consumed_char;
6303       produced += coding->produced;
6304       produced_char += coding->produced_char;
6305       if (result == CODING_FINISH_NORMAL
6306           || result == CODING_FINISH_INTERRUPT
6307           || (result == CODING_FINISH_INSUFFICIENT_SRC
6308               && coding->consumed == 0))
6309         break;
6310       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6311         coding_allocate_composition_data (coding, from + produced_char);
6312       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6313         extend_conversion_buffer (&buf);
6314       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6315         {
6316           Lisp_Object eol_type;
6317
6318           /* Recover the original EOL format.  */
6319           if (coding->eol_type == CODING_EOL_CR)
6320             {
6321               unsigned char *p;
6322               for (p = buf.data; p < buf.data + produced; p++)
6323                 if (*p == '\n') *p = '\r';
6324             }
6325           else if (coding->eol_type == CODING_EOL_CRLF)
6326             {
                   /* Count the newlines produced so far, make room for
                      one more byte per newline, then expand the text in
                      place.  The copy runs from the end towards the
                      start so that no byte is overwritten before it has
                      been read.  */
6327               int num_eol = 0;
6328               unsigned char *p0, *p1;
6329               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6330                 if (*p0 == '\n') num_eol++;
6331               if (produced + num_eol >= buf.size)
6332                 extend_conversion_buffer (&buf);
6333               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6334                 {
6335                   *--p1 = *--p0;
6336                   if (*p0 == '\n') *--p1 = '\r';
6337                 }
6338               produced += num_eol;
6339               produced_char += num_eol;
6340             }
6341           /* Suppress eol-format conversion in the further conversion.  */
6342           coding->eol_type = CODING_EOL_LF;
6343
6344           /* Set the coding system symbol to that for Unix-like EOL.  */
6345           eol_type = Fget (saved_coding_symbol, Qeol_type);
6346           if (VECTORP (eol_type)
6347               && XVECTOR (eol_type)->size == 3
6348               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6349             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6350           else
6351             coding->symbol = saved_coding_symbol;
6352
6353
6354         }
6355     }
6356
6357   coding->consumed = consumed;
6358   coding->consumed_char = consumed_char;
6359   coding->produced = produced;
6360   coding->produced_char = produced_char;
6361
       /* Build the result: the skipped head of STR, the decoded text,
          and the skipped tail of STR.  */
6362   if (coding->dst_multibyte)
6363     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6364                                            produced + shrinked_bytes);
6365   else
6366     newstr = make_uninit_string (produced + shrinked_bytes);
6367   if (from > 0)
6368     STRING_COPYIN (newstr, 0, SDATA (str), from);
6369   STRING_COPYIN (newstr, from, buf.data, produced);
       /* SHRINKED_BYTES - FROM is the length of the skipped tail.  */
6370   if (shrinked_bytes > from)
6371     STRING_COPYIN (newstr, from + produced,
6372                    SDATA (str) + to_byte,
6373                    shrinked_bytes - from);
6374   free_conversion_buffer (&buf);
6375
       /* The skipped bytes are ASCII, so each of them counts as one byte
          and as one character on both the source and the result side.  */
6376   coding->consumed += shrinked_bytes;
6377   coding->consumed_char += shrinked_bytes;
6378   coding->produced += shrinked_bytes;
6379   coding->produced_char += shrinked_bytes;
6380
6381   if (coding->cmp_data && coding->cmp_data->used)
6382     coding_restore_composition (coding, newstr);
6383   coding_free_composition_data (coding);
6384
6385   if (SYMBOLP (coding->post_read_conversion)
6386       && !NILP (Ffboundp (coding->post_read_conversion)))
6387     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6388
6389   return newstr;
6390 }
6391
6392 Lisp_Object
6393 encode_coding_string (str, coding, nocopy)
6394      Lisp_Object str;
6395      struct coding_system *coding;
6396      int nocopy;
6397 {
       /* Encode the text of STR according to CODING and return the
          encoded text as a unibyte string.  If
          CODING->pre_write_conversion is a symbol with a function
          definition, it is run on STR first and its result is what gets
          encoded.

          When no encoding is needed, STR itself is returned if NOCOPY is
          nonzero (a copy otherwise); a multibyte STR is then turned into
          a unibyte string, in place when NOCOPY is nonzero.

          In the no-encoding case CODING->consumed, consumed_char,
          produced and produced_char are set from the sizes of STR.
          After a real encoding they hold the totals reported by
          encode_coding, which do not include the leading and trailing
          bytes that were skipped and copied as is.  */
6398   int len;
6399   struct conversion_buffer buf;
6400   int from, to, to_byte;
6401   int result;
       /* Number of bytes at the head and tail of STR that are skipped
          by SHRINK_CONVERSION_REGION and copied to the result as is.  */
6402   int shrinked_bytes = 0;
6403   Lisp_Object newstr;
6404   int consumed, consumed_char, produced, produced_char;
6405
6406   if (SYMBOLP (coding->pre_write_conversion)
6407       && !NILP (Ffboundp (coding->pre_write_conversion)))
6408     {
6409       str = run_pre_post_conversion_on_str (str, coding, 1);
6410       /* As STR is just newly generated, we don't have to copy it
6411          anymore.  */
6412       nocopy = 1;
6413     }
6414
6415   from = 0;
6416   to = SCHARS (str);
6417   to_byte = SBYTES (str);
6418
6419   /* Encoding routines determine the multibyteness of the source text
6420      by coding->src_multibyte.  */
6421   coding->src_multibyte = SCHARS (str) < SBYTES (str);
6422   coding->dst_multibyte = 0;
6423   if (! CODING_REQUIRE_ENCODING (coding))
6424     goto no_need_of_encoding;
6425
6426   if (coding->composing != COMPOSITION_DISABLED)
6427     coding_save_composition (coding, from, to, str);
6428
6429   /* Try to skip the leading and trailing ASCII bytes.  We can't skip
6430      them if we must run CCL program or there are compositions to
6431      encode.  */
6432   if (coding->type != coding_type_ccl
6433       && (! coding->cmp_data || coding->cmp_data->used == 0))
6434     {
6435       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6436                                 1);
6437       if (from == to_byte)
6438         {
               /* Everything was skipped.  Release the composition data
                  before leaving through the no-encoding path.  */
6439           coding_free_composition_data (coding);
6440           goto no_need_of_encoding;
6441         }
6442       shrinked_bytes = from + (SBYTES (str) - to_byte);
6443     }
6444
6445   len = encoding_buffer_size (coding, to_byte - from);
6446   allocate_conversion_buffer (buf, len);
6447
6448   consumed = consumed_char = produced = produced_char = 0;
       /* Call encode_coding repeatedly, each time continuing after what
          has been consumed and produced so far, and enlarge the
          destination buffer between the calls.  */
6449   while (1)
6450     {
6451       result = encode_coding (coding, SDATA (str) + from + consumed,
6452                               buf.data + produced, to_byte - from - consumed,
6453                               buf.size - produced);
6454       consumed += coding->consumed;
6455       consumed_char += coding->consumed_char;
6456       produced += coding->produced;
6457       produced_char += coding->produced_char;
6458       if (result == CODING_FINISH_NORMAL
6459           || result == CODING_FINISH_INTERRUPT
6460           || (result == CODING_FINISH_INSUFFICIENT_SRC
6461               && coding->consumed == 0))
6462         break;
6463       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6464       extend_conversion_buffer (&buf);
6465     }
6466
6467   coding->consumed = consumed;
6468   coding->consumed_char = consumed_char;
6469   coding->produced = produced;
6470   coding->produced_char = produced_char;
6471
       /* Build the result: the skipped head of STR, the encoded text,
          and the skipped tail of STR (SHRINKED_BYTES - FROM bytes).  */
6472   newstr = make_uninit_string (produced + shrinked_bytes);
6473   if (from > 0)
6474     STRING_COPYIN (newstr, 0, SDATA (str), from);
6475   STRING_COPYIN (newstr, from, buf.data, produced);
6476   if (shrinked_bytes > from)
6477     STRING_COPYIN (newstr, from + produced,
6478                    SDATA (str) + to_byte,
6479                    shrinked_bytes - from);
6480
6481   free_conversion_buffer (&buf);
6482   coding_free_composition_data (coding);
6483
6484   return newstr;
6485
6486  no_need_of_encoding:
       /* Reached when CODING requires no encoding or when the whole text
          was skipped above.  */
6487   coding->consumed = SBYTES (str);
6488   coding->consumed_char = SCHARS (str);
6489   if (STRING_MULTIBYTE (str))
6490     {
6491       if (nocopy)
6492         /* We are sure that STR doesn't contain a multibyte
6493            character.  */
6494         STRING_SET_UNIBYTE (str);
6495       else
6496         {
6497           str = Fstring_as_unibyte (str);
6498           nocopy = 1;
6499         }
6500     }
6501   coding->produced = SBYTES (str);
6502   coding->produced_char = SCHARS (str);
6503   return (nocopy ? str : Fcopy_sequence (str));
6504 }
6505
6506 \f
6507 #ifdef emacs
6508 /*** 8. Emacs Lisp library functions ***/
6509
6510 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6511        doc: /* Return t if OBJECT is nil or a coding-system.
6512 See the documentation of `make-coding-system' for information
6513 about coding-system objects.  */)
6514      (obj)
6515      Lisp_Object obj;
6516 {
6517   if (NILP (obj))
6518     return Qt;
6519   if (!SYMBOLP (obj))
6520     return Qnil;
       /* A symbol that still has a non-nil Qcoding_system_define_form
          property is accepted as is, without looking at its coding-spec.
          Fcheck_coding_system evaluates and clears that property.  */
6521   if (! NILP (Fget (obj, Qcoding_system_define_form)))
6522     return Qt;
6523   /* Get coding-spec vector for OBJ.  */
6524   obj = Fget (obj, Qcoding_system);
       /* Only a vector of exactly five elements counts as a coding-spec.  */
6525   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6526           ? Qt : Qnil);
6527 }
6528
6529 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6530        Sread_non_nil_coding_system, 1, 1, 0,
6531        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6532      (prompt)
6533      Lisp_Object prompt;
6534 {
6535   Lisp_Object val;
       /* Ask again as long as the answer is the empty string, so that
          the value returned is always a symbol interned from a non-empty
          name.  Completion candidates come from Vcoding_system_alist and
          the history is kept in Qcoding_system_history.  */
6536   do
6537     {
6538       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6539                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6540     }
6541   while (SCHARS (val) == 0);
6542   return (Fintern (val, Qnil));
6543 }
6544
6545 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6546        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6547 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6548      (prompt, default_coding_system)
6549      Lisp_Object prompt, default_coding_system;
6550 {
6551   Lisp_Object val;
       /* The default is handed to Fcompleting_read as a string, so a
          symbol is replaced by its name first.
          NOTE(review): nil presumably satisfies SYMBOLP as well, in
          which case an omitted default becomes the string "nil" --
          confirm that this is intended.  */
6552   if (SYMBOLP (default_coding_system))
6553     default_coding_system = SYMBOL_NAME (default_coding_system);
6554   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6555                           Qt, Qnil, Qcoding_system_history,
6556                           default_coding_system, Qnil);
       /* An empty answer yields nil; anything else is interned.  */
6557   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6558 }
6559
6560 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6561        1, 1, 0,
6562        doc: /* Check validity of CODING-SYSTEM.
6563 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6564 It is valid if it is nil or a symbol with a non-nil `coding-system' property.
6565 The value of this property should be a vector of length 5.  */)
6566      (coding_system)
6567      Lisp_Object coding_system;
6568 {
6569   Lisp_Object define_form;
6570
       /* A non-nil Qcoding_system_define_form property holds a form
          that is evaluated here, before the validity check.  */
6571   define_form = Fget (coding_system, Qcoding_system_define_form);
6572   if (! NILP (define_form))
6573     {
           /* Clear the property before evaluating the form, so that the
              form is not evaluated again for this symbol.  */
6574       Fput (coding_system, Qcoding_system_define_form, Qnil);
6575       safe_eval (define_form);
6576     }
6577   if (!NILP (Fcoding_system_p (coding_system)))
6578     return coding_system;
       /* Not a coding system.  Signal the error, and signal it again
          should Fsignal ever return.  */
6579   while (1)
6580     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6581 }
6582 \f
6583 Lisp_Object
6584 detect_coding_system (src, src_bytes, highest, multibytep)
6585      const unsigned char *src;
6586      int src_bytes, highest;
6587      int multibytep;
6588 {
       /* Detect how the SRC_BYTES bytes at SRC are encoded.  MULTIBYTEP
          is passed on to detect_coding_mask.

          If HIGHEST is zero, return a list of the candidate coding
          systems in the order of Vcoding_category_list.  If HIGHEST is
          nonzero, return only the first candidate, or nil when there is
          none.  When the end-of-line format could be determined, each
          candidate is replaced by its subsidiary coding system for that
          format.  When detect_coding_mask reports no category at all,
          the result is `undecided' or its subsidiary coding system.  */
6589   int coding_mask, eol_type;
6590   Lisp_Object val, tmp;
6591   int dummy;
6592
6593   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6594   eol_type  = detect_eol_type (src, src_bytes, &dummy);
       /* An inconsistent end-of-line format is treated like an unknown
          one.  */
6595   if (eol_type == CODING_EOL_INCONSISTENT)
6596     eol_type = CODING_EOL_UNDECIDED;
6597
6598   if (!coding_mask)
6599     {
6600       val = Qundecided;
6601       if (eol_type != CODING_EOL_UNDECIDED)
6602         {
6603           Lisp_Object val2;
6604           val2 = Fget (Qundecided, Qeol_type);
6605           if (VECTORP (val2))
6606             val = XVECTOR (val2)->contents[eol_type];
6607         }
6608       return (highest ? val : Fcons (val, Qnil));
6609     }
6610
6611   /* At first, gather possible coding systems in VAL.  */
6612   val = Qnil;
6613   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6614     {
6615       Lisp_Object category_val, category_index;
6616
6617       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6618       category_val = Fsymbol_value (XCAR (tmp));
6619       if (!NILP (category_val)
6620           && NATNUMP (category_index)
6621           && (coding_mask & (1 << XFASTINT (category_index))))
6622         {
6623           val = Fcons (category_val, val);
               /* The first match is the one of highest priority.  */
6624           if (highest)
6625             break;
6626         }
6627     }
       /* The list was built by consing onto the front, so put it back
          into priority order.  */
6628   if (!highest)
6629     val = Fnreverse (val);
6630
6631   /* Then, replace the elements with subsidiary coding systems.  */
6632   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6633     {
6634       if (eol_type != CODING_EOL_UNDECIDED
6635           && eol_type != CODING_EOL_INCONSISTENT)
6636         {
6637           Lisp_Object eol;
6638           eol = Fget (XCAR (tmp), Qeol_type);
6639           if (VECTORP (eol))
6640             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6641         }
6642     }
       /* VAL is still nil when no category in Vcoding_category_list has
          both a non-nil value and its bit set in CODING_MASK.  XCAR must
          only be applied to a cons, so return nil in that case.  */
6643   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6644 }
6645
6646 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6647        2, 3, 0,
6648        doc: /* Detect how the byte sequence in the region is encoded.
6649 Return a list of possible coding systems used on decoding a byte
6650 sequence containing the bytes in the region between START and END when
6651 the coding system `undecided' is specified.  The list is ordered by
6652 priority decided in the current language environment.
6653
6654 If only ASCII characters are found, it returns a list of single element
6655 `undecided' or its subsidiary coding system according to a detected
6656 end-of-line format.
6657
6658 If optional argument HIGHEST is non-nil, return the coding system of
6659 highest priority.  */)
6660      (start, end, highest)
6661      Lisp_Object start, end, highest;
6662 {
6663   int from, to;
6664   int from_byte, to_byte;
       /* 1 if the byte following the region is passed to the detector
          too, else 0.  */
6665   int include_anchor_byte = 0;
6666
6667   CHECK_NUMBER_COERCE_MARKER (start);
6668   CHECK_NUMBER_COERCE_MARKER (end);
6669
6670   validate_region (&start, &end);
6671   from = XINT (start), to = XINT (end);
6672   from_byte = CHAR_TO_BYTE (from);
6673   to_byte = CHAR_TO_BYTE (to);
6674
       /* If the region starts before the gap and reaches or spans it,
          move the gap to the end of the region; presumably this makes
          the bytes of the region contiguous in memory -- confirm.  */
6675   if (from < GPT && to >= GPT)
6676     move_gap_both (to, to_byte);
6677   /* If an anchor byte `\0' follows the region, we include it in
6678      the detecting source.  Then code detectors can handle the trailing
6679      byte sequence more accurately.
6680
6681      Fix me: This is not a perfect solution.  It is better that we
6682      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6683   */
6684   if (to == Z || (to == GPT && GAP_SIZE > 0))
6685     include_anchor_byte = 1;
6686   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6687                                to_byte - from_byte + include_anchor_byte,
6688                                !NILP (highest),
6689                                !NILP (current_buffer
6690                                       ->enable_multibyte_characters));
6691 }
6692
6693 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6694        1, 2, 0,
6695        doc: /* Detect how the byte sequence in STRING is encoded.
6696 Return a list of possible coding systems used on decoding a byte
6697 sequence containing the bytes in STRING when the coding system
6698 `undecided' is specified.  The list is ordered by priority decided in
6699 the current language environment.
6700
6701 If only ASCII characters are found, it returns a list of single element
6702 `undecided' or its subsidiary coding system according to a detected
6703 end-of-line format.
6704
6705 If optional argument HIGHEST is non-nil, return the coding system of
6706 highest priority.  */)
6707      (string, highest)
6708      Lisp_Object string, highest;
6709 {
6710   CHECK_STRING (string);
6711
6712   return detect_coding_system (SDATA (string),
6713                                /* "+ 1" is to include the anchor byte
6714                                   `\0'.  With this, code detectors can
6715                                   handle the trailing bytes more
6716                                   accurately.  */
6717                                SBYTES (string) + 1,
6718                                !NILP (highest),
6719                                STRING_MULTIBYTE (string));
6720 }
6721
6722 /*  Subroutine for Ffind_coding_systems_region_internal.
6723
6724     Return a list of coding systems that safely encode the multibyte
6725     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6726     possible coding systems.  If it is nil, no element is examined and
6727     the text is only scanned for a non-ASCII single byte char.
6728
6729     WORK_TABLE is a char-table whose element for a character is set
6730     to t once that character has been looked up.
6731
6732     If a non-ASCII single byte char is found, set
6733     *single_byte_char_found to 1.

         SAFE_CODINGS is modified destructively: an element that cannot
         encode some character is unlinked from the list, and an element
         of the form (CODING . SAFE-CHARS) may be replaced by the longer
         form (CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
         ACCEPT-LATIN-EXTRA).  The value returned is the possibly
         shortened list.  */
6734
6735 static Lisp_Object
6736 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6737      unsigned char *p, *pend;
6738      Lisp_Object safe_codings, work_table;
6739      int *single_byte_char_found;
6740 {
6741   int c, len;
6742   Lisp_Object val, ch;
6743   Lisp_Object prev, tail;
6744
6745   if (NILP (safe_codings))
6746     goto done_safe_codings;
6747   while (p < pend)
6748     {
6749       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6750       p += len;
6751       if (ASCII_BYTE_P (c))
6752         /* We can ignore ASCII characters here.  */
6753         continue;
6754       if (SINGLE_BYTE_CHAR_P (c))
6755         *single_byte_char_found = 1;
6756       /* Check the safe coding systems for C.  */
6757       ch = make_number (c);
6758       val = Faref (work_table, ch);
6759       if (EQ (val, Qt))
6760         /* This element was already checked.  Ignore it.  */
6761         continue;
6762       /* Remember that we checked this element.  */
6763       Faset (work_table, ch, Qt);
6764
           /* Walk the candidates.  PREV is the last cell that was kept;
              it is used to unlink a cell that is not at the head.  */
6765       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6766         {
               /* The three tables below are set only when C is not in
                  SAFE-CHARS, and are read only in that case.  */
6767           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6768           int encodable;
6769
6770           elt = XCAR (tail);
6771           if (CONSP (XCDR (elt)))
6772             {
6773               /* This entry has this format now:
6774                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6775                           ACCEPT-LATIN-EXTRA ) */
6776               val = XCDR (elt);
6777               encodable = ! NILP (Faref (XCAR (val), ch));
6778               if (! encodable)
6779                 {
6780                   val = XCDR (val);
6781                   translation_table = XCAR (val);
6782                   hash_table = XCAR (XCDR (val));
6783                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6784                 }
6785             }
6786           else
6787             {
6788               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6789               encodable = ! NILP (Faref (XCDR (elt), ch));
6790               if (! encodable)
6791                 {
6792                   /* Transform the format to:
6793                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6794                        ACCEPT-LATIN-EXTRA )  */
6795                   val = Fget (XCAR (elt), Qcoding_system);
6796                   translation_table
6797                     = Fplist_get (AREF (val, 3),
6798                                   Qtranslation_table_for_encode);
6799                   if (SYMBOLP (translation_table))
6800                     translation_table = Fget (translation_table,
6801                                               Qtranslation_table);
6802                   hash_table
6803                     = (CHAR_TABLE_P (translation_table)
6804                        ? XCHAR_TABLE (translation_table)->extras[1]
6805                        : Qnil);
                       /* NOTE(review): the type code 2 and the index 16
                          are magic numbers here; presumably they select
                          ISO 2022 coding systems and their
                          "accept latin extra code" flag -- confirm
                          against the coding-spec layout.  */
6806                   accept_latin_extra
6807                     = ((EQ (AREF (val, 0), make_number (2))
6808                         && VECTORP (AREF (val, 4)))
6809                        ? AREF (AREF (val, 4), 16)
6810                        : Qnil);
6811                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6812                                         translation_table, hash_table,
6813                                         accept_latin_extra));
6814                 }
6815             }
6816
               /* A character that is not in SAFE-CHARS is still accepted
                  if the translation table or the hash table has an entry
                  for it, or if it is a single byte char that has an
                  entry in Vlatin_extra_code_table and the coding system
                  accepts such codes.  */
6817           if (! encodable
6818               && ((CHAR_TABLE_P (translation_table)
6819                    && ! NILP (Faref (translation_table, ch)))
6820                   || (HASH_TABLE_P (hash_table)
6821                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6822                   || (SINGLE_BYTE_CHAR_P (c)
6823                       && ! NILP (accept_latin_extra)
6824                       && VECTORP (Vlatin_extra_code_table)
6825                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6826             encodable = 1;
6827           if (encodable)
6828             prev = tail;
6829           else
6830             {
6831               /* Exclude this coding system from SAFE_CODINGS.  */
6832               if (EQ (tail, safe_codings))
6833                 {
6834                   safe_codings = XCDR (safe_codings);
6835                   if (NILP (safe_codings))
6836                     goto done_safe_codings;
6837                 }
6838               else
6839                 XSETCDR (prev, XCDR (tail));
6840             }
6841         }
6842     }
6843
6844  done_safe_codings:
6845   /* If the above loop was terminated before P reaches PEND, it means
6846      SAFE_CODINGS was set to nil.  If we have not yet found a
6847      non-ASCII single-byte char, check it now.  */
6848   if (! *single_byte_char_found)
6849     while (p < pend)
6850       {
6851         c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6852         p += len;
6853         if (! ASCII_BYTE_P (c)
6854             && SINGLE_BYTE_CHAR_P (c))
6855           {
6856             *single_byte_char_found = 1;
6857             break;
6858           }
6859       }
6860   return safe_codings;
6861 }
6862
6863 DEFUN ("find-coding-systems-region-internal",
6864        Ffind_coding_systems_region_internal,
6865        Sfind_coding_systems_region_internal, 2, 2, 0,
6866        doc: /* Internal use only.  */)
6867      (start, end)
6868      Lisp_Object start, end;
6869 {
6870   Lisp_Object work_table, safe_codings;
6871   int non_ascii_p = 0;
6872   int single_byte_char_found = 0;
6873   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6874
6875   if (STRINGP (start))
6876     {
6877       if (!STRING_MULTIBYTE (start))
6878         return Qt;
6879       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6880       p2 = p2end = p1end;
6881       if (SCHARS (start) != SBYTES (start))
6882         non_ascii_p = 1;
6883     }
6884   else
6885     {
6886       int from, to, stop;
6887
6888       CHECK_NUMBER_COERCE_MARKER (start);
6889       CHECK_NUMBER_COERCE_MARKER (end);
6890       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6891         args_out_of_range (start, end);
6892       if (NILP (current_buffer->enable_multibyte_characters))
6893         return Qt;
6894       from = CHAR_TO_BYTE (XINT (start));
6895       to = CHAR_TO_BYTE (XINT (end));
6896       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6897       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6898       if (stop == to)
6899         p2 = p2end = p1end;
6900       else
6901         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6902       if (XINT (end) - XINT (start) != to - from)
6903         non_ascii_p = 1;
6904     }
6905
6906   if (!non_ascii_p)
6907     {
6908       /* We are sure that the text contains no multibyte character.
6909          Check if it contains eight-bit-graphic.  */
6910       p = p1;
6911       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6912       if (p == p1end)
6913         {
6914           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6915           if (p == p2end)
6916             return Qt;
6917         }
6918     }
6919
6920   /* The text contains non-ASCII characters.  */
6921
6922   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6923   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6924
6925   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6926                                     &single_byte_char_found);
6927   if (p2 < p2end)
6928     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6929                                       &single_byte_char_found);
6930   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6931     safe_codings = Qt;
6932   else
6933     {
6934       /* Turn safe_codings to a list of coding systems... */
6935       Lisp_Object val;
6936
6937       if (single_byte_char_found)
6938         /* ... and append these for eight-bit chars.  */
6939         val = Fcons (Qraw_text,
6940                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6941       else
6942         /* ... and append generic coding systems.  */
6943         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6944
6945       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6946         val = Fcons (XCAR (XCAR (safe_codings)), val);
6947       safe_codings = val;
6948     }
6949
6950   return safe_codings;
6951 }
6952
6953
6954 /* Search from position POS for such characters that are unencodable
6955    accoding to SAFE_CHARS, and return a list of their positions.  P
6956    points where in the memory the character at POS exists.  Limit the
6957    search at PEND or when Nth unencodable characters are found.
6958
6959    If SAFE_CHARS is a char table, an element for an unencodable
6960    character is nil.
6961
6962    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6963
6964    Otherwise, SAFE_CHARS is t, and only eight-bit-contrl and
6965    eight-bit-graphic characters are unencodable.  */
6966
6967 static Lisp_Object
6968 unencodable_char_position (safe_chars, pos, p, pend, n)
6969      Lisp_Object safe_chars;
6970      int pos;
6971      unsigned char *p, *pend;
6972      int n;
6973 {
6974   Lisp_Object pos_list;
6975
6976   pos_list = Qnil;
6977   while (p < pend)
6978     {
6979       int len;
6980       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6981
6982       if (c >= 128
6983           && (CHAR_TABLE_P (safe_chars)
6984               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6985               : (NILP (safe_chars) || c < 256)))
6986         {
6987           pos_list = Fcons (make_number (pos), pos_list);
6988           if (--n <= 0)
6989             break;
6990         }
6991       pos++;
6992       p += len;
6993     }
6994   return Fnreverse (pos_list);
6995 }
6996
6997
6998 DEFUN ("unencodable-char-position", Funencodable_char_position,
6999        Sunencodable_char_position, 3, 5, 0,
7000        doc: /*
7001 Return position of first un-encodable character in a region.
7002 START and END specfiy the region and CODING-SYSTEM specifies the
7003 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7004
7005 If optional 4th argument COUNT is non-nil, it specifies at most how
7006 many un-encodable characters to search.  In this case, the value is a
7007 list of positions.
7008
7009 If optional 5th argument STRING is non-nil, it is a string to search
7010 for un-encodable characters.  In that case, START and END are indexes
7011 to the string.  */)
7012      (start, end, coding_system, count, string)
7013      Lisp_Object start, end, coding_system, count, string;
7014 {
7015   int n;
7016   Lisp_Object safe_chars;
7017   struct coding_system coding;
7018   Lisp_Object positions;
7019   int from, to;
7020   unsigned char *p, *pend;
7021
7022   if (NILP (string))
7023     {
7024       validate_region (&start, &end);
7025       from = XINT (start);
7026       to = XINT (end);
7027       if (NILP (current_buffer->enable_multibyte_characters))
7028         return Qnil;
7029       p = CHAR_POS_ADDR (from);
7030       if (to == GPT)
7031         pend = GPT_ADDR;
7032       else
7033         pend = CHAR_POS_ADDR (to);
7034     }
7035   else
7036     {
7037       CHECK_STRING (string);
7038       CHECK_NATNUM (start);
7039       CHECK_NATNUM (end);
7040       from = XINT (start);
7041       to = XINT (end);
7042       if (from > to
7043           || to > SCHARS (string))
7044         args_out_of_range_3 (string, start, end);
7045       if (! STRING_MULTIBYTE (string))
7046         return Qnil;
7047       p = SDATA (string) + string_char_to_byte (string, from);
7048       pend = SDATA (string) + string_char_to_byte (string, to);
7049     }
7050
7051   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7052
7053   if (NILP (count))
7054     n = 1;
7055   else
7056     {
7057       CHECK_NATNUM (count);
7058       n = XINT (count);
7059     }
7060
7061   if (coding.type == coding_type_no_conversion
7062       || coding.type == coding_type_raw_text)
7063     return Qnil;
7064
7065   if (coding.type == coding_type_undecided)
7066     safe_chars = Qnil;
7067   else
7068     safe_chars = coding_safe_chars (coding_system);
7069
7070   if (STRINGP (string)
7071       || from >= GPT || to <= GPT)
7072     positions = unencodable_char_position (safe_chars, from, p, pend, n);
7073   else
7074     {
7075       Lisp_Object args[2];
7076
7077       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
7078       n -= XINT (Flength (args[0]));
7079       if (n <= 0)
7080         positions = args[0];
7081       else
7082         {
7083           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
7084                                                pend, n);
7085           positions = Fappend (2, args);
7086         }
7087     }
7088
7089   return  (NILP (count) ? Fcar (positions) : positions);
7090 }
7091
7092
7093 Lisp_Object
7094 code_convert_region1 (start, end, coding_system, encodep)
7095      Lisp_Object start, end, coding_system;
7096      int encodep;
7097 {
7098   struct coding_system coding;
7099   int from, to;
7100
7101   CHECK_NUMBER_COERCE_MARKER (start);
7102   CHECK_NUMBER_COERCE_MARKER (end);
7103   CHECK_SYMBOL (coding_system);
7104
7105   validate_region (&start, &end);
7106   from = XFASTINT (start);
7107   to = XFASTINT (end);
7108
7109   if (NILP (coding_system))
7110     return make_number (to - from);
7111
7112   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7113     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7114
7115   coding.mode |= CODING_MODE_LAST_BLOCK;
7116   coding.src_multibyte = coding.dst_multibyte
7117     = !NILP (current_buffer->enable_multibyte_characters);
7118   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
7119                        &coding, encodep, 1);
7120   Vlast_coding_system_used = coding.symbol;
7121   return make_number (coding.produced_char);
7122 }
7123
7124 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7125        3, 3, "r\nzCoding system: ",
7126        doc: /* Decode the current region from the specified coding system.
7127 When called from a program, takes three arguments:
7128 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7129 This function sets `last-coding-system-used' to the precise coding system
7130 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7131 not fully specified.)
7132 It returns the length of the decoded text.  */)
7133      (start, end, coding_system)
7134      Lisp_Object start, end, coding_system;
7135 {
7136   return code_convert_region1 (start, end, coding_system, 0);
7137 }
7138
7139 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7140        3, 3, "r\nzCoding system: ",
7141        doc: /* Encode the current region into the specified coding system.
7142 When called from a program, takes three arguments:
7143 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7144 This function sets `last-coding-system-used' to the precise coding system
7145 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7146 not fully specified.)
7147 It returns the length of the encoded text.  */)
7148      (start, end, coding_system)
7149      Lisp_Object start, end, coding_system;
7150 {
7151   return code_convert_region1 (start, end, coding_system, 1);
7152 }
7153
7154 Lisp_Object
7155 code_convert_string1 (string, coding_system, nocopy, encodep)
7156      Lisp_Object string, coding_system, nocopy;
7157      int encodep;
7158 {
7159   struct coding_system coding;
7160
7161   CHECK_STRING (string);
7162   CHECK_SYMBOL (coding_system);
7163
7164   if (NILP (coding_system))
7165     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
7166
7167   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7168     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7169
7170   coding.mode |= CODING_MODE_LAST_BLOCK;
7171   string = (encodep
7172             ? encode_coding_string (string, &coding, !NILP (nocopy))
7173             : decode_coding_string (string, &coding, !NILP (nocopy)));
7174   Vlast_coding_system_used = coding.symbol;
7175
7176   return string;
7177 }
7178
7179 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7180        2, 3, 0,
7181        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7182 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7183 if the decoding operation is trivial.
7184 This function sets `last-coding-system-used' to the precise coding system
7185 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7186 not fully specified.)  */)
7187      (string, coding_system, nocopy)
7188      Lisp_Object string, coding_system, nocopy;
7189 {
7190   return code_convert_string1 (string, coding_system, nocopy, 0);
7191 }
7192
7193 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7194        2, 3, 0,
7195        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7196 Optional arg NOCOPY non-nil means it is OK to return STRING itself
7197 if the encoding operation is trivial.
7198 This function sets `last-coding-system-used' to the precise coding system
7199 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7200 not fully specified.)  */)
7201      (string, coding_system, nocopy)
7202      Lisp_Object string, coding_system, nocopy;
7203 {
7204   return code_convert_string1 (string, coding_system, nocopy, 1);
7205 }
7206
7207 /* Encode or decode STRING according to CODING_SYSTEM.
7208    Do not set Vlast_coding_system_used.
7209
7210    This function is called only from macros DECODE_FILE and
7211    ENCODE_FILE, thus we ignore character composition.  */
7212
7213 Lisp_Object
7214 code_convert_string_norecord (string, coding_system, encodep)
7215      Lisp_Object string, coding_system;
7216      int encodep;
7217 {
7218   struct coding_system coding;
7219
7220   CHECK_STRING (string);
7221   CHECK_SYMBOL (coding_system);
7222
7223   if (NILP (coding_system))
7224     return string;
7225
7226   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7227     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7228
7229   coding.composing = COMPOSITION_DISABLED;
7230   coding.mode |= CODING_MODE_LAST_BLOCK;
7231   return (encodep
7232           ? encode_coding_string (string, &coding, 1)
7233           : decode_coding_string (string, &coding, 1));
7234 }
7235 \f
7236 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7237        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7238 Return the corresponding character.  */)
7239      (code)
7240      Lisp_Object code;
7241 {
7242   unsigned char c1, c2, s1, s2;
7243   Lisp_Object val;
7244
7245   CHECK_NUMBER (code);
7246   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7247   if (s1 == 0)
7248     {
7249       if (s2 < 0x80)
7250         XSETFASTINT (val, s2);
7251       else if (s2 >= 0xA0 || s2 <= 0xDF)
7252         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7253       else
7254         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7255     }
7256   else
7257     {
7258       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7259           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7260         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7261       DECODE_SJIS (s1, s2, c1, c2);
7262       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7263     }
7264   return val;
7265 }
7266
7267 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7268        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7269 Return the corresponding code in SJIS.  */)
7270      (ch)
7271      Lisp_Object ch;
7272 {
7273   int charset, c1, c2, s1, s2;
7274   Lisp_Object val;
7275
7276   CHECK_NUMBER (ch);
7277   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7278   if (charset == CHARSET_ASCII)
7279     {
7280       val = ch;
7281     }
7282   else if (charset == charset_jisx0208
7283            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7284     {
7285       ENCODE_SJIS (c1, c2, s1, s2);
7286       XSETFASTINT (val, (s1 << 8) | s2);
7287     }
7288   else if (charset == charset_katakana_jisx0201
7289            && c1 > 0x20 && c2 < 0xE0)
7290     {
7291       XSETFASTINT (val, c1 | 0x80);
7292     }
7293   else
7294     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7295   return val;
7296 }
7297
7298 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7299        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7300 Return the corresponding character.  */)
7301      (code)
7302      Lisp_Object code;
7303 {
7304   int charset;
7305   unsigned char b1, b2, c1, c2;
7306   Lisp_Object val;
7307
7308   CHECK_NUMBER (code);
7309   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7310   if (b1 == 0)
7311     {
7312       if (b2 >= 0x80)
7313         error ("Invalid BIG5 code: %x", XFASTINT (code));
7314       val = code;
7315     }
7316   else
7317     {
7318       if ((b1 < 0xA1 || b1 > 0xFE)
7319           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7320         error ("Invalid BIG5 code: %x", XFASTINT (code));
7321       DECODE_BIG5 (b1, b2, charset, c1, c2);
7322       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7323     }
7324   return val;
7325 }
7326
7327 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7328        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7329 Return the corresponding character code in Big5.  */)
7330      (ch)
7331      Lisp_Object ch;
7332 {
7333   int charset, c1, c2, b1, b2;
7334   Lisp_Object val;
7335
7336   CHECK_NUMBER (ch);
7337   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7338   if (charset == CHARSET_ASCII)
7339     {
7340       val = ch;
7341     }
7342   else if ((charset == charset_big5_1
7343             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7344            || (charset == charset_big5_2
7345                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7346     {
7347       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7348       XSETFASTINT (val, (b1 << 8) | b2);
7349     }
7350   else
7351     error ("Can't encode to Big5: %d", XFASTINT (ch));
7352   return val;
7353 }
7354 \f
7355 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7356        Sset_terminal_coding_system_internal, 1, 1, 0,
7357        doc: /* Internal use only.  */)
7358      (coding_system)
7359      Lisp_Object coding_system;
7360 {
7361   CHECK_SYMBOL (coding_system);
7362   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7363   /* We had better not send unsafe characters to terminal.  */
7364   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7365   /* Character composition should be disabled.  */
7366   terminal_coding.composing = COMPOSITION_DISABLED;
7367   /* Error notification should be suppressed.  */
7368   terminal_coding.suppress_error = 1;
7369   terminal_coding.src_multibyte = 1;
7370   terminal_coding.dst_multibyte = 0;
7371   return Qnil;
7372 }
7373
7374 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7375        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7376        doc: /* Internal use only.  */)
7377      (coding_system)
7378      Lisp_Object coding_system;
7379 {
7380   CHECK_SYMBOL (coding_system);
7381   setup_coding_system (Fcheck_coding_system (coding_system),
7382                        &safe_terminal_coding);
7383   /* Character composition should be disabled.  */
7384   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7385   /* Error notification should be suppressed.  */
7386   safe_terminal_coding.suppress_error = 1;
7387   safe_terminal_coding.src_multibyte = 1;
7388   safe_terminal_coding.dst_multibyte = 0;
7389   return Qnil;
7390 }
7391
7392 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7393        Sterminal_coding_system, 0, 0, 0,
7394        doc: /* Return coding system specified for terminal output.  */)
7395      ()
7396 {
7397   return terminal_coding.symbol;
7398 }
7399
7400 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7401        Sset_keyboard_coding_system_internal, 1, 1, 0,
7402        doc: /* Internal use only.  */)
7403      (coding_system)
7404      Lisp_Object coding_system;
7405 {
7406   CHECK_SYMBOL (coding_system);
7407   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7408   /* Character composition should be disabled.  */
7409   keyboard_coding.composing = COMPOSITION_DISABLED;
7410   return Qnil;
7411 }
7412
7413 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7414        Skeyboard_coding_system, 0, 0, 0,
7415        doc: /* Return coding system specified for decoding keyboard input.  */)
7416      ()
7417 {
7418   return keyboard_coding.symbol;
7419 }
7420
7421 \f
7422 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7423        Sfind_operation_coding_system,  1, MANY, 0,
7424        doc: /* Choose a coding system for an operation based on the target name.
7425 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7426 DECODING-SYSTEM is the coding system to use for decoding
7427 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7428 for encoding (in case OPERATION does encoding).
7429
7430 The first argument OPERATION specifies an I/O primitive:
7431   For file I/O, `insert-file-contents' or `write-region'.
7432   For process I/O, `call-process', `call-process-region', or `start-process'.
7433   For network I/O, `open-network-stream'.
7434
7435 The remaining arguments should be the same arguments that were passed
7436 to the primitive.  Depending on which primitive, one of those arguments
7437 is selected as the TARGET.  For example, if OPERATION does file I/O,
7438 whichever argument specifies the file name is TARGET.
7439
7440 TARGET has a meaning which depends on OPERATION:
7441   For file I/O, TARGET is a file name.
7442   For process I/O, TARGET is a process name.
7443   For network I/O, TARGET is a service name or a port number
7444
7445 This function looks up what specified for TARGET in,
7446 `file-coding-system-alist', `process-coding-system-alist',
7447 or `network-coding-system-alist' depending on OPERATION.
7448 They may specify a coding system, a cons of coding systems,
7449 or a function symbol to call.
7450 In the last case, we call the function with one argument,
7451 which is a list of all the arguments given to this function.
7452
7453 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7454      (nargs, args)
7455      int nargs;
7456      Lisp_Object *args;
7457 {
7458   Lisp_Object operation, target_idx, target, val;
7459   register Lisp_Object chain;
7460
7461   if (nargs < 2)
7462     error ("Too few arguments");
7463   operation = args[0];
7464   if (!SYMBOLP (operation)
7465       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7466     error ("Invalid first argument");
7467   if (nargs < 1 + XINT (target_idx))
7468     error ("Too few arguments for operation: %s",
7469            SDATA (SYMBOL_NAME (operation)));
7470   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7471      argument to write-region) is string, it must be treated as a
7472      target file name.  */
7473   if (EQ (operation, Qwrite_region)
7474       && nargs > 5
7475       && STRINGP (args[5]))
7476     target_idx = make_number (4);
7477   target = args[XINT (target_idx) + 1];
7478   if (!(STRINGP (target)
7479         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7480     error ("Invalid argument %d", XINT (target_idx) + 1);
7481
7482   chain = ((EQ (operation, Qinsert_file_contents)
7483             || EQ (operation, Qwrite_region))
7484            ? Vfile_coding_system_alist
7485            : (EQ (operation, Qopen_network_stream)
7486               ? Vnetwork_coding_system_alist
7487               : Vprocess_coding_system_alist));
7488   if (NILP (chain))
7489     return Qnil;
7490
7491   for (; CONSP (chain); chain = XCDR (chain))
7492     {
7493       Lisp_Object elt;
7494       elt = XCAR (chain);
7495
7496       if (CONSP (elt)
7497           && ((STRINGP (target)
7498                && STRINGP (XCAR (elt))
7499                && fast_string_match (XCAR (elt), target) >= 0)
7500               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7501         {
7502           val = XCDR (elt);
7503           /* Here, if VAL is both a valid coding system and a valid
7504              function symbol, we return VAL as a coding system.  */
7505           if (CONSP (val))
7506             return val;
7507           if (! SYMBOLP (val))
7508             return Qnil;
7509           if (! NILP (Fcoding_system_p (val)))
7510             return Fcons (val, val);
7511           if (! NILP (Ffboundp (val)))
7512             {
7513               val = call1 (val, Flist (nargs, args));
7514               if (CONSP (val))
7515                 return val;
7516               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7517                 return Fcons (val, val);
7518             }
7519           return Qnil;
7520         }
7521     }
7522   return Qnil;
7523 }
7524
7525 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7526        Supdate_coding_systems_internal, 0, 0, 0,
7527        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7528 When values of any coding categories are changed, you must
7529 call this function.  */)
7530      ()
7531 {
7532   int i;
7533
7534   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7535     {
7536       Lisp_Object val;
7537
7538       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7539       if (!NILP (val))
7540         {
7541           if (! coding_system_table[i])
7542             coding_system_table[i] = ((struct coding_system *)
7543                                       xmalloc (sizeof (struct coding_system)));
7544           setup_coding_system (val, coding_system_table[i]);
7545         }
7546       else if (coding_system_table[i])
7547         {
7548           xfree (coding_system_table[i]);
7549           coding_system_table[i] = NULL;
7550         }
7551     }
7552
7553   return Qnil;
7554 }
7555
7556 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7557        Sset_coding_priority_internal, 0, 0, 0,
7558        doc: /* Update internal database for the current value of `coding-category-list'.
7559 This function is internal use only.  */)
7560      ()
7561 {
7562   int i = 0, idx;
7563   Lisp_Object val;
7564
7565   val = Vcoding_category_list;
7566
7567   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7568     {
7569       if (! SYMBOLP (XCAR (val)))
7570         break;
7571       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7572       if (idx >= CODING_CATEGORY_IDX_MAX)
7573         break;
7574       coding_priorities[i++] = (1 << idx);
7575       val = XCDR (val);
7576     }
7577   /* If coding-category-list is valid and contains all coding
7578      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7579      the following code saves Emacs from crashing.  */
7580   while (i < CODING_CATEGORY_IDX_MAX)
7581     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7582
7583   return Qnil;
7584 }
7585
7586 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7587        Sdefine_coding_system_internal, 1, 1, 0,
7588        doc: /* Register CODING-SYSTEM as a base coding system.
7589 This function is internal use only.  */)
7590      (coding_system)
7591      Lisp_Object coding_system;
7592 {
7593   Lisp_Object safe_chars, slot;
7594
7595   if (NILP (Fcheck_coding_system (coding_system)))
7596     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7597   safe_chars = coding_safe_chars (coding_system);
7598   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7599     error ("No valid safe-chars property for %s",
7600            SDATA (SYMBOL_NAME (coding_system)));
7601   if (EQ (safe_chars, Qt))
7602     {
7603       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7604         XSETCAR (Vcoding_system_safe_chars,
7605                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7606     }
7607   else
7608     {
7609       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7610       if (NILP (slot))
7611         XSETCDR (Vcoding_system_safe_chars,
7612                  nconc2 (XCDR (Vcoding_system_safe_chars),
7613                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7614       else
7615         XSETCDR (slot, safe_chars);
7616     }
7617   return Qnil;
7618 }
7619
7620 #endif /* emacs */
7621
7622 \f
7623 /*** 9. Post-amble ***/
7624
7625 void
7626 init_coding_once ()
7627 {
7628   int i;
7629
7630   /* Emacs' internal format specific initialize routine.  */
7631   for (i = 0; i <= 0x20; i++)
7632     emacs_code_class[i] = EMACS_control_code;
7633   emacs_code_class[0x0A] = EMACS_linefeed_code;
7634   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7635   for (i = 0x21 ; i < 0x7F; i++)
7636     emacs_code_class[i] = EMACS_ascii_code;
7637   emacs_code_class[0x7F] = EMACS_control_code;
7638   for (i = 0x80; i < 0xFF; i++)
7639     emacs_code_class[i] = EMACS_invalid_code;
7640   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7641   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7642   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7643   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7644
7645   /* ISO2022 specific initialize routine.  */
7646   for (i = 0; i < 0x20; i++)
7647     iso_code_class[i] = ISO_control_0;
7648   for (i = 0x21; i < 0x7F; i++)
7649     iso_code_class[i] = ISO_graphic_plane_0;
7650   for (i = 0x80; i < 0xA0; i++)
7651     iso_code_class[i] = ISO_control_1;
7652   for (i = 0xA1; i < 0xFF; i++)
7653     iso_code_class[i] = ISO_graphic_plane_1;
7654   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7655   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7656   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7657   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7658   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7659   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7660   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7661   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7662   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7663   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7664
7665   setup_coding_system (Qnil, &keyboard_coding);
7666   setup_coding_system (Qnil, &terminal_coding);
7667   setup_coding_system (Qnil, &safe_terminal_coding);
7668   setup_coding_system (Qnil, &default_buffer_file_coding);
7669
7670   bzero (coding_system_table, sizeof coding_system_table);
7671
7672   bzero (ascii_skip_code, sizeof ascii_skip_code);
7673   for (i = 0; i < 128; i++)
7674     ascii_skip_code[i] = 1;
7675
7676 #if defined (MSDOS) || defined (WINDOWSNT)
7677   system_eol_type = CODING_EOL_CRLF;
7678 #else
7679   system_eol_type = CODING_EOL_LF;
7680 #endif
7681
7682   inhibit_pre_post_conversion = 0;
7683 }
7684
7685 #ifdef emacs
7686 /* Intern this file's symbols, register its subrs (defsubr) and define its Lisp variables (DEFVAR_*).  */
7687 void
7688 syms_of_coding ()
7689 {
7690   staticpro (&Vcode_conversion_workbuf_name);
7691   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
7692   /* `target-idx' is put below on I/O primitives, holding the 0-based index of their target argument.  */
7693   Qtarget_idx = intern ("target-idx");
7694   staticpro (&Qtarget_idx);
7695
7696   Qcoding_system_history = intern ("coding-system-history");
7697   staticpro (&Qcoding_system_history);
7698   Fset (Qcoding_system_history, Qnil);
7699
7700   /* Target FILENAME is the first argument.  */
7701   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7702   /* Target FILENAME is the third argument.  */
7703   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7704
7705   Qcall_process = intern ("call-process");
7706   staticpro (&Qcall_process);
7707   /* Target PROGRAM is the first argument.  */
7708   Fput (Qcall_process, Qtarget_idx, make_number (0));
7709
7710   Qcall_process_region = intern ("call-process-region");
7711   staticpro (&Qcall_process_region);
7712   /* Target PROGRAM is the third argument.  */
7713   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7714
7715   Qstart_process = intern ("start-process");
7716   staticpro (&Qstart_process);
7717   /* Target PROGRAM is the third argument.  */
7718   Fput (Qstart_process, Qtarget_idx, make_number (2));
7719
7720   Qopen_network_stream = intern ("open-network-stream");
7721   staticpro (&Qopen_network_stream);
7722   /* Target SERVICE is the fourth argument.  */
7723   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7724
7725   Qcoding_system = intern ("coding-system");
7726   staticpro (&Qcoding_system);
7727
7728   Qeol_type = intern ("eol-type");
7729   staticpro (&Qeol_type);
7730
7731   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7732   staticpro (&Qbuffer_file_coding_system);
7733
7734   Qpost_read_conversion = intern ("post-read-conversion");
7735   staticpro (&Qpost_read_conversion);
7736
7737   Qpre_write_conversion = intern ("pre-write-conversion");
7738   staticpro (&Qpre_write_conversion);
7739
7740   Qno_conversion = intern ("no-conversion");
7741   staticpro (&Qno_conversion);
7742
7743   Qundecided = intern ("undecided");
7744   staticpro (&Qundecided);
7745
7746   Qcoding_system_p = intern ("coding-system-p");
7747   staticpro (&Qcoding_system_p);
7748
7749   Qcoding_system_error = intern ("coding-system-error");
7750   staticpro (&Qcoding_system_error);
7751   /* Give `coding-system-error' its condition list (itself, then `error') and its message.  */
7752   Fput (Qcoding_system_error, Qerror_conditions,
7753         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7754   Fput (Qcoding_system_error, Qerror_message,
7755         build_string ("Invalid coding system"));
7756
7757   Qcoding_category = intern ("coding-category");
7758   staticpro (&Qcoding_category);
7759   Qcoding_category_index = intern ("coding-category-index");
7760   staticpro (&Qcoding_category_index);
7761   /* Vector of the coding-category symbols; each symbol also records its own index as a property.  */
7762   Vcoding_category_table
7763     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7764   staticpro (&Vcoding_category_table);
7765   {
7766     int i;
7767     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7768       {
7769         XVECTOR (Vcoding_category_table)->contents[i]
7770           = intern (coding_category_name[i]);
7771         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7772               Qcoding_category_index, make_number (i));
7773       }
7774   }
7775
7776   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7777   staticpro (&Vcoding_system_safe_chars);
7778   /* NOTE(review): the Fput below uses Qchar_table_extra_slots before it is interned further down -- confirm it is set earlier.  */
7779   Qtranslation_table = intern ("translation-table");
7780   staticpro (&Qtranslation_table);
7781   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7782
7783   Qtranslation_table_id = intern ("translation-table-id");
7784   staticpro (&Qtranslation_table_id);
7785
7786   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7787   staticpro (&Qtranslation_table_for_decode);
7788
7789   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7790   staticpro (&Qtranslation_table_for_encode);
7791
7792   Qsafe_chars = intern ("safe-chars");
7793   staticpro (&Qsafe_chars);
7794
7795   Qchar_coding_system = intern ("char-coding-system");
7796   staticpro (&Qchar_coding_system);
7797
7798   /* Intern this now in case it isn't already done.
7799      Setting this variable twice is harmless.
7800      But don't staticpro it here--that is done in alloc.c.  */
7801   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7802   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7803   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7804
7805   Qvalid_codes = intern ("valid-codes");
7806   staticpro (&Qvalid_codes);
7807
7808   Qemacs_mule = intern ("emacs-mule");
7809   staticpro (&Qemacs_mule);
7810
7811   Qraw_text = intern ("raw-text");
7812   staticpro (&Qraw_text);
7813
7814   Qutf_8 = intern ("utf-8");
7815   staticpro (&Qutf_8);
7816
7817   Qcoding_system_define_form = intern ("coding-system-define-form");
7818   staticpro (&Qcoding_system_define_form);
7819   /* Register the Lisp primitives (subrs).  */
7820   defsubr (&Scoding_system_p);
7821   defsubr (&Sread_coding_system);
7822   defsubr (&Sread_non_nil_coding_system);
7823   defsubr (&Scheck_coding_system);
7824   defsubr (&Sdetect_coding_region);
7825   defsubr (&Sdetect_coding_string);
7826   defsubr (&Sfind_coding_systems_region_internal);
7827   defsubr (&Sunencodable_char_position);
7828   defsubr (&Sdecode_coding_region);
7829   defsubr (&Sencode_coding_region);
7830   defsubr (&Sdecode_coding_string);
7831   defsubr (&Sencode_coding_string);
7832   defsubr (&Sdecode_sjis_char);
7833   defsubr (&Sencode_sjis_char);
7834   defsubr (&Sdecode_big5_char);
7835   defsubr (&Sencode_big5_char);
7836   defsubr (&Sset_terminal_coding_system_internal);
7837   defsubr (&Sset_safe_terminal_coding_system_internal);
7838   defsubr (&Sterminal_coding_system);
7839   defsubr (&Sset_keyboard_coding_system_internal);
7840   defsubr (&Skeyboard_coding_system);
7841   defsubr (&Sfind_operation_coding_system);
7842   defsubr (&Supdate_coding_systems_internal);
7843   defsubr (&Sset_coding_priority_internal);
7844   defsubr (&Sdefine_coding_system_internal);
7845   /* Lisp variables: each DEFVAR_* below is followed by the assignment of the variable's initial value.  */
7846   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7847                doc: /* List of coding systems.
7848
7849 Do not alter the value of this variable manually.  This variable should be
7850 updated by the functions `make-coding-system' and
7851 `define-coding-system-alias'.  */);
7852   Vcoding_system_list = Qnil;
7853
7854   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7855                doc: /* Alist of coding system names.
7856 Each element is one element list of coding system name.
7857 This variable is given to `completing-read' as TABLE argument.
7858
7859 Do not alter the value of this variable manually.  This variable should be
7860 updated by the functions `make-coding-system' and
7861 `define-coding-system-alias'.  */);
7862   Vcoding_system_alist = Qnil;
7863
7864   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7865                doc: /* List of coding-categories (symbols) ordered by priority.
7866
7867 On detecting a coding system, Emacs tries code detection algorithms
7868 associated with each coding-category one by one in this order.  When
7869 one algorithm agrees with a byte sequence of source text, the coding
7870 system bound to the corresponding coding-category is selected.
7871
7872 Don't modify this variable directly, but use `set-coding-priority'.  */);
7873   {
7874     int i;
7875     /* Cons from the last index down to 0, so the list ends up in ascending index order.  */
7876     Vcoding_category_list = Qnil;
7877     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7878       Vcoding_category_list
7879         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7880                  Vcoding_category_list);
7881   }
7882
7883   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7884                doc: /* Specify the coding system for read operations.
7885 It is useful to bind this variable with `let', but do not set it globally.
7886 If the value is a coding system, it is used for decoding on read operation.
7887 If not, an appropriate element is used from one of the coding system alists:
7888 There are three such tables, `file-coding-system-alist',
7889 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7890   Vcoding_system_for_read = Qnil;
7891
7892   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7893                doc: /* Specify the coding system for write operations.
7894 Programs bind this variable with `let', but you should not set it globally.
7895 If the value is a coding system, it is used for encoding of output,
7896 when writing it to a file and when sending it to a file or subprocess.
7897
7898 If this does not specify a coding system, an appropriate element
7899 is used from one of the coding system alists:
7900 There are three such tables, `file-coding-system-alist',
7901 `process-coding-system-alist', and `network-coding-system-alist'.
7902 For output to files, if the above procedure does not specify a coding system,
7903 the value of `buffer-file-coding-system' is used.  */);
7904   Vcoding_system_for_write = Qnil;
7905
7906   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7907                doc: /* Coding system used in the latest file or process I/O.
7908 Also set by `encode-coding-region', `decode-coding-region',
7909 `encode-coding-string' and `decode-coding-string'.  */);
7910   Vlast_coding_system_used = Qnil;
7911
7912   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7913                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7914 See info node `Coding Systems' and info node `Text and Binary' concerning
7915 such conversion.  */);
7916   inhibit_eol_conversion = 0;
7917
7918   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7919                doc: /* Non-nil means process buffer inherits coding system of process output.
7920 Bind it to t if the process output is to be treated as if it were a file
7921 read from some filesystem.  */);
7922   inherit_process_coding_system = 0;
7923
7924   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7925                doc: /* Alist to decide a coding system to use for a file I/O operation.
7926 The format is ((PATTERN . VAL) ...),
7927 where PATTERN is a regular expression matching a file name,
7928 VAL is a coding system, a cons of coding systems, or a function symbol.
7929 If VAL is a coding system, it is used for both decoding and encoding
7930 the file contents.
7931 If VAL is a cons of coding systems, the car part is used for decoding,
7932 and the cdr part is used for encoding.
7933 If VAL is a function symbol, the function must return a coding system
7934 or a cons of coding systems which are used as above.  The function gets
7935 the arguments with which `find-operation-coding-system' was called.
7936
7937 See also the function `find-operation-coding-system'
7938 and the variable `auto-coding-alist'.  */);
7939   Vfile_coding_system_alist = Qnil;
7940
7941   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7942     doc: /* Alist to decide a coding system to use for a process I/O operation.
7943 The format is ((PATTERN . VAL) ...),
7944 where PATTERN is a regular expression matching a program name,
7945 VAL is a coding system, a cons of coding systems, or a function symbol.
7946 If VAL is a coding system, it is used for both decoding what received
7947 from the program and encoding what sent to the program.
7948 If VAL is a cons of coding systems, the car part is used for decoding,
7949 and the cdr part is used for encoding.
7950 If VAL is a function symbol, the function must return a coding system
7951 or a cons of coding systems which are used as above.
7952
7953 See also the function `find-operation-coding-system'.  */);
7954   Vprocess_coding_system_alist = Qnil;
7955
7956   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7957     doc: /* Alist to decide a coding system to use for a network I/O operation.
7958 The format is ((PATTERN . VAL) ...),
7959 where PATTERN is a regular expression matching a network service name
7960 or is a port number to connect to,
7961 VAL is a coding system, a cons of coding systems, or a function symbol.
7962 If VAL is a coding system, it is used for both decoding what received
7963 from the network stream and encoding what sent to the network stream.
7964 If VAL is a cons of coding systems, the car part is used for decoding,
7965 and the cdr part is used for encoding.
7966 If VAL is a function symbol, the function must return a coding system
7967 or a cons of coding systems which are used as above.
7968
7969 See also the function `find-operation-coding-system'.  */);
7970   Vnetwork_coding_system_alist = Qnil;
7971
7972   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7973                doc: /* Coding system to use with system messages.
7974 Also used for decoding keyboard input on X Window system.  */);
7975   Vlocale_coding_system = Qnil;
7976
7977   /* The eol mnemonics are reset in startup.el system-dependently.  */
7978   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7979                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7980   eol_mnemonic_unix = build_string (":");
7981
7982   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7983                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7984   eol_mnemonic_dos = build_string ("\\");
7985
7986   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7987                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7988   eol_mnemonic_mac = build_string ("/");
7989
7990   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7991                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7992   eol_mnemonic_undecided = build_string (":");
7993
7994   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7995                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7996   Venable_character_translation = Qt;
7997
7998   DEFVAR_LISP ("standard-translation-table-for-decode",
7999                &Vstandard_translation_table_for_decode,
8000                doc: /* Table for translating characters while decoding.  */);
8001   Vstandard_translation_table_for_decode = Qnil;
8002
8003   DEFVAR_LISP ("standard-translation-table-for-encode",
8004                &Vstandard_translation_table_for_encode,
8005                doc: /* Table for translating characters while encoding.  */);
8006   Vstandard_translation_table_for_encode = Qnil;
8007
8008   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
8009                doc: /* Alist of charsets vs revision numbers.
8010 While encoding, if a charset (car part of an element) is found,
8011 designate it with the escape sequence identifying revision (cdr part of the element).  */);
8012   Vcharset_revision_alist = Qnil;
8013
8014   DEFVAR_LISP ("default-process-coding-system",
8015                &Vdefault_process_coding_system,
8016                doc: /* Cons of coding systems used for process I/O by default.
8017 The car part is used for decoding a process output,
8018 the cdr part is used for encoding a text to be sent to a process.  */);
8019   Vdefault_process_coding_system = Qnil;
8020
8021   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8022                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
8023 This is a vector of length 256.
8024 If Nth element is non-nil, the existence of code N in a file
8025 \(or output of subprocess) doesn't prevent it to be detected as
8026 a coding system of ISO 2022 variant which has a flag
8027 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8028 or reading output of a subprocess.
8029 Only 128th through 159th elements has a meaning.  */);
8030   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8031
8032   DEFVAR_LISP ("select-safe-coding-system-function",
8033                &Vselect_safe_coding_system_function,
8034                doc: /* Function to call to select safe coding system for encoding a text.
8035
8036 If set, this function is called to force a user to select a proper
8037 coding system which can encode the text in the case that a default
8038 coding system used in each operation can't encode the text.
8039
8040 The default value is `select-safe-coding-system' (which see).  */);
8041   Vselect_safe_coding_system_function = Qnil;
8042
8043   DEFVAR_BOOL ("coding-system-require-warning",
8044                &coding_system_require_warning,
8045                doc: /* Internal use only.
8046 If non-nil, on writing a file, `select-safe-coding-system-function' is
8047 called even if `coding-system-for-write' is non-nil.  The command
8048 `universal-coding-system-argument' binds this variable to t temporarily.  */);
8049   coding_system_require_warning = 0;
8050
8051
8052   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8053                &inhibit_iso_escape_detection,
8054                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8055
8056 By default, on reading a file, Emacs tries to detect how the text is
8057 encoded.  This code detection is sensitive to escape sequences.  If
8058 the sequence is valid as ISO2022, the code is determined as one of
8059 the ISO2022 encodings, and the file is decoded by the corresponding
8060 coding system (e.g. `iso-2022-7bit').
8061
8062 However, there may be a case that you want to read escape sequences in
8063 a file as is.  In such a case, you can set this variable to non-nil.
8064 Then, as the code detection ignores any escape sequences, no file is
8065 detected as encoded in some ISO2022 encoding.  The result is that all
8066 escape sequences become visible in a buffer.
8067
8068 The default value is nil, and it is strongly recommended not to change
8069 it.  That is because many Emacs Lisp source files that contain
8070 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8071 in Emacs's distribution, and they won't be decoded correctly on
8072 reading if you suppress escape sequence detection.
8073
8074 The other way to read escape sequences in a file without decoding is
8075 to explicitly specify some coding system that doesn't use ISO2022's
8076 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8077   inhibit_iso_escape_detection = 0;
8078
8079   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
8080                doc: /* Char table for translating self-inserting characters.
8081 This is applied to the result of input methods, not their input.  See also
8082 `keyboard-translate-table'.  */);
8083     Vtranslation_table_for_input = Qnil;
8084 }
8085 /* Return the strerror message for ERROR_NUMBER, converted with Vlocale_coding_system when that is non-nil.  */
8086 char *
8087 emacs_strerror (error_number)
8088      int error_number;
8089 {
8090   char *message;
8091   Lisp_Object decoded;
8092
8093   synchronize_system_messages_locale ();
8094   message = strerror (error_number);
8095
8096   /* Without a locale coding system, hand back the C library's string as is.  */
8097   if (NILP (Vlocale_coding_system))
8098     return message;
8099
8100   /* The final 0 presumably selects decoding (not encoding) -- TODO confirm.  */
8101   decoded = code_convert_string_norecord (build_string (message),
8102                                           Vlocale_coding_system, 0);
8103   return (char *) SDATA (decoded);
8104 }
8105
8106 #endif /* emacs */
8107
8108 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
8109    (do not change this comment) */