src/coding.c

   1 /* Coding system handler (conversion, detection, etc.).
   2    Copyright (C) 1995,97,1998,2002,2003  Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001,2002,2003  Free Software Foundation, Inc.
   5
   6 This file is part of GNU Emacs.
   7
   8 GNU Emacs is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU General Public License as published by
  10 the Free Software Foundation; either version 2, or (at your option)
  11 any later version.
  12
  13 GNU Emacs is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GNU Emacs; see the file COPYING.  If not, write to
  20 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  21 Boston, MA 02111-1307, USA.  */
  22
  23 /*** TABLE OF CONTENTS ***
  24
  25   0. General comments
  26   1. Preamble
  27   2. Emacs' internal format (emacs-mule) handlers
  28   3. ISO2022 handlers
  29   4. Shift-JIS and BIG5 handlers
  30   5. CCL handlers
  31   6. End-of-line handlers
  32   7. C library functions
  33   8. Emacs Lisp library functions
  34   9. Post-amble
  35
  36 */
  37
  38 /*** 0. General comments ***/
  39
  40
  41 /*** GENERAL NOTE on CODING SYSTEMS ***
  42
  43   A coding system is an encoding mechanism for one or more character
  44   sets.  Here's a list of coding systems which Emacs can handle.  When
  45   we say "decode", it means converting some other coding system to
  46   Emacs' internal format (emacs-mule), and when we say "encode",
  47   it means converting the coding system emacs-mule to some other
  48   coding system.
  49
  50   0. Emacs' internal format (emacs-mule)
  51
  52   Emacs itself holds a multi-lingual character in buffers and strings
  53   in a special format.  Details are described in section 2.
  54
  55   1. ISO2022
  56
  57   The most famous coding system for multiple character sets.  X's
  58   Compound Text, various EUCs (Extended Unix Code), and coding
  59   systems used in Internet communication such as ISO-2022-JP are
  60   all variants of ISO2022.  Details are described in section 3.
  61
  62   2. SJIS (or Shift-JIS or MS-Kanji-Code)
  63
  64   A coding system to encode character sets: ASCII, JISX0201, and
  65   JISX0208.  Widely used for PC's in Japan.  Details are described in
  66   section 4.
  67
  68   3. BIG5
  69
  70   A coding system to encode the character sets ASCII and Big5.  Widely
  71   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  72   described in section 4.  In this file, when we write "BIG5"
  73   (all uppercase), we mean the coding system, and when we write
  74   "Big5" (capitalized), we mean the character set.
  75
  76   4. Raw text
  77
  78   A coding system for text containing random 8-bit code.  Emacs does
  79   no code conversion on such text except for end-of-line format.
  80
  81   5. Other
  82
  83   If a user wants to read/write text encoded in a coding system not
  84   listed above, he can supply a decoder and an encoder for it as CCL
  85   (Code Conversion Language) programs.  Emacs executes the CCL program
  86   while reading/writing.
  87
  88   Emacs represents a coding system by a Lisp symbol that has a property
  89   `coding-system'.  But, before actually using the coding system, the
  90   information about it is set in a structure of type `struct
  91   coding_system' for rapid processing.  See section 6 for more details.
  92
  93 */
  94
  95 /*** GENERAL NOTES on END-OF-LINE FORMAT ***
  96
  97   How end-of-line of text is encoded depends on the operating system.
  98   For instance, Unix's format is just one byte of `line-feed' code,
  99   whereas DOS's format is two-byte sequence of `carriage-return' and
 100   `line-feed' codes.  MacOS's format is usually one byte of
 101   `carriage-return'.
 102
 103   Since text character encoding and end-of-line encoding are
 104   independent, any coding system described above can have any
 105   end-of-line format.  So Emacs has information about end-of-line
 106   format in each coding-system.  See section 6 for more details.
 107
 108 */
 109
 110 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 111
 112   These functions check if a text between SRC and SRC_END is encoded
 113   in the coding system category XXX.  Each returns an integer value in
 114   which appropriate flag bits for the category XXX are set.  The flag
 115   bits are defined in macros CODING_CATEGORY_MASK_XXX.  Below is the
 116   template for these functions.  If MULTIBYTEP is nonzero, 8-bit codes
 117   of the range 0x80..0x9F are in multibyte form.  */
 118 #if 0
 119 int
 120 detect_coding_emacs_mule (src, src_end, multibytep)
 121      unsigned char *src, *src_end;
 122      int multibytep;
 123 {
 124   ...
 125 }
 126 #endif
 127
 128 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 129
 130   These functions decode SRC_BYTES length of unibyte text at SOURCE
 131   encoded in CODING to Emacs' internal format.  The resulting
 132   multibyte text goes to a place pointed to by DESTINATION, the length
 133   of which should not exceed DST_BYTES.
 134
 135   These functions set the information about original and decoded texts
 136   in the members `produced', `produced_char', `consumed', and
 137   `consumed_char' of the structure *CODING.  They also set the member
 138   `result' to one of CODING_FINISH_XXX indicating how the decoding
 139   finished.
 140
 141   DST_BYTES zero means that the source area and destination area are
 142   overlapped, which means that we can produce a decoded text until it
 143   reaches the head of the not-yet-decoded source text.
 144
 145   Below is a template for these functions.  */
 146 #if 0
 147 static void
 148 decode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 149      struct coding_system *coding;
 150      unsigned char *source, *destination;
 151      int src_bytes, dst_bytes;
 152 {
 153   ...
 154 }
 155 #endif
 156
 157 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 158
 159   These functions encode SRC_BYTES length text at SOURCE from Emacs'
 160   internal multibyte format to CODING.  The resulting unibyte text
 161   goes to a place pointed to by DESTINATION, the length of which
 162   should not exceed DST_BYTES.
 163
 164   These functions set the information about original and encoded texts
 165   in the members `produced', `produced_char', `consumed', and
 166   `consumed_char' of the structure *CODING.  They also set the member
 167   `result' to one of CODING_FINISH_XXX indicating how the encoding
 168   finished.
 169
 170   DST_BYTES zero means that the source area and destination area are
 171   overlapped, which means that we can produce encoded text until it
  172   reaches the head of the not-yet-encoded source text.
 173
 174   Below is a template for these functions.  */
 175 #if 0
 176 static void
 177 encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes)
 178      struct coding_system *coding;
 179      unsigned char *source, *destination;
 180      int src_bytes, dst_bytes;
 181 {
 182   ...
 183 }
 184 #endif
 185
 186 /*** COMMONLY USED MACROS ***/
 187
  188 /* The following two macros ONE_MORE_BYTE and TWO_MORE_BYTES safely
  189    get one and two bytes from the source text respectively.
 190    If there are not enough bytes in the source, they jump to
 191    `label_end_of_loop'.  The caller should set variables `coding',
 192    `src' and `src_end' to appropriate pointer in advance.  These
 193    macros are called from decoding routines `decode_coding_XXX', thus
 194    it is assumed that the source text is unibyte.  */
 195
 196 #define ONE_MORE_BYTE(c1)                                       \
 197   do {                                                          \
 198     if (src >= src_end)                                         \
 199       {                                                         \
 200         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 201         goto label_end_of_loop;                                 \
 202       }                                                         \
 203     c1 = *src++;                                                \
 204   } while (0)
 205
 206 #define TWO_MORE_BYTES(c1, c2)                                  \
 207   do {                                                          \
 208     if (src + 1 >= src_end)                                     \
 209       {                                                         \
 210         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 211         goto label_end_of_loop;                                 \
 212       }                                                         \
 213     c1 = *src++;                                                \
 214     c2 = *src++;                                                \
 215   } while (0)
 216
 217
 218 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte
 219    form if MULTIBYTEP is nonzero.  */
 220
 221 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep)           \
 222   do {                                                          \
 223     if (src >= src_end)                                         \
 224       {                                                         \
 225         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 226         goto label_end_of_loop;                                 \
 227       }                                                         \
 228     c1 = *src++;                                                \
 229     if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL)         \
 230       c1 = *src++ - 0x20;                                       \
 231   } while (0)
 232
  233 /* Set C to the next character at the source text pointed to by `src'.
  234    If there are not enough characters in the source, jump to
  235    `label_end_of_loop'.  The caller should set variables `coding',
 236    `src', `src_end', and `translation_table' to appropriate pointers
 237    in advance.  This macro is used in encoding routines
 238    `encode_coding_XXX', thus it assumes that the source text is in
 239    multibyte form except for 8-bit characters.  8-bit characters are
 240    in multibyte form if coding->src_multibyte is nonzero, else they
 241    are represented by a single byte.  */
 242
 243 #define ONE_MORE_CHAR(c)                                        \
 244   do {                                                          \
 245     int len = src_end - src;                                    \
 246     int bytes;                                                  \
 247     if (len <= 0)                                               \
 248       {                                                         \
 249         coding->result = CODING_FINISH_INSUFFICIENT_SRC;        \
 250         goto label_end_of_loop;                                 \
 251       }                                                         \
 252     if (coding->src_multibyte                                   \
 253         || UNIBYTE_STR_AS_MULTIBYTE_P (src, len, bytes))        \
 254       c = STRING_CHAR_AND_LENGTH (src, len, bytes);             \
 255     else                                                        \
 256       c = *src, bytes = 1;                                      \
 257     if (!NILP (translation_table))                              \
 258       c = translate_char (translation_table, c, -1, 0, 0);      \
 259     src += bytes;                                               \
 260   } while (0)
 261
 262
 263 /* Produce a multibyte form of character C to `dst'.  Jump to
 264    `label_end_of_loop' if there's not enough space at `dst'.
 265
 266    If we are now in the middle of a composition sequence, the decoded
 267    character may be ALTCHAR (for the current composition).  In that
 268    case, the character goes to coding->cmp_data->data instead of
 269    `dst'.
 270
 271    This macro is used in decoding routines.  */
 272
 273 #define EMIT_CHAR(c)                                                    \
 274   do {                                                                  \
 275     if (! COMPOSING_P (coding)                                          \
 276         || coding->composing == COMPOSITION_RELATIVE                    \
 277         || coding->composing == COMPOSITION_WITH_RULE)                  \
 278       {                                                                 \
 279         int bytes = CHAR_BYTES (c);                                     \
 280         if ((dst + bytes) > (dst_bytes ? dst_end : src))                \
 281           {                                                             \
 282             coding->result = CODING_FINISH_INSUFFICIENT_DST;            \
 283             goto label_end_of_loop;                                     \
 284           }                                                             \
 285         dst += CHAR_STRING (c, dst);                                    \
 286         coding->produced_char++;                                        \
 287       }                                                                 \
 288                                                                         \
 289     if (COMPOSING_P (coding)                                            \
 290         && coding->composing != COMPOSITION_RELATIVE)                   \
 291       {                                                                 \
 292         CODING_ADD_COMPOSITION_COMPONENT (coding, c);                   \
 293         coding->composition_rule_follows                                \
 294           = coding->composing != COMPOSITION_WITH_ALTCHARS;             \
 295       }                                                                 \
 296   } while (0)
 297
 298
 299 #define EMIT_ONE_BYTE(c)                                        \
 300   do {                                                          \
 301     if (dst >= (dst_bytes ? dst_end : src))                     \
 302       {                                                         \
 303         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 304         goto label_end_of_loop;                                 \
 305       }                                                         \
 306     *dst++ = c;                                                 \
 307   } while (0)
 308
 309 #define EMIT_TWO_BYTES(c1, c2)                                  \
 310   do {                                                          \
 311     if (dst + 2 > (dst_bytes ? dst_end : src))                  \
 312       {                                                         \
 313         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 314         goto label_end_of_loop;                                 \
 315       }                                                         \
 316     *dst++ = c1, *dst++ = c2;                                   \
 317   } while (0)
 318
 319 #define EMIT_BYTES(from, to)                                    \
 320   do {                                                          \
 321     if (dst + (to - from) > (dst_bytes ? dst_end : src))        \
 322       {                                                         \
 323         coding->result = CODING_FINISH_INSUFFICIENT_DST;        \
 324         goto label_end_of_loop;                                 \
 325       }                                                         \
 326     while (from < to)                                           \
 327       *dst++ = *from++;                                         \
 328   } while (0)
 329
 330 \f
 331 /*** 1. Preamble ***/
 332
 333 #ifdef emacs
 334 #include <config.h>
 335 #endif
 336
 337 #include <stdio.h>
 338
 339 #ifdef emacs
 340
 341 #include "lisp.h"
 342 #include "buffer.h"
 343 #include "charset.h"
 344 #include "composite.h"
 345 #include "ccl.h"
 346 #include "coding.h"
 347 #include "window.h"
 348
 349 #else  /* not emacs */
 350
 351 #include "mulelib.h"
 352
 353 #endif /* not emacs */
 354
 355 Lisp_Object Qcoding_system, Qeol_type;
 356 Lisp_Object Qbuffer_file_coding_system;
 357 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 358 Lisp_Object Qno_conversion, Qundecided;
 359 Lisp_Object Qcoding_system_history;
 360 Lisp_Object Qsafe_chars;
 361 Lisp_Object Qvalid_codes;
 362
 363 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 364 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 365 Lisp_Object Qstart_process, Qopen_network_stream;
 366 Lisp_Object Qtarget_idx;
 367
 368 Lisp_Object Vselect_safe_coding_system_function;
 369
 370 int coding_system_require_warning;
 371
 372 /* Mnemonic string for each format of end-of-line.  */
 373 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 374 /* Mnemonic string to indicate format of end-of-line is not yet
 375    decided.  */
 376 Lisp_Object eol_mnemonic_undecided;
 377
 378 /* Format of end-of-line decided by system.  This is CODING_EOL_LF on
 379    Unix, CODING_EOL_CRLF on DOS/Windows, and CODING_EOL_CR on Mac.  */
 380 int system_eol_type;
 381
 382 #ifdef emacs
 383
 384 /* Information about which coding system is safe for which chars.
 385    The value has the form (GENERIC-LIST . NON-GENERIC-ALIST).
 386
 387    GENERIC-LIST is a list of generic coding systems which can encode
 388    any characters.
 389
 390    NON-GENERIC-ALIST is an alist of non generic coding systems vs the
 391    corresponding char table that contains safe chars.  */
 392 Lisp_Object Vcoding_system_safe_chars;
 393
 394 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 395
 396 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 397
 398 /* Coding system emacs-mule and raw-text are for converting only
 399    end-of-line format.  */
 400 Lisp_Object Qemacs_mule, Qraw_text;
 401
 402 Lisp_Object Qutf_8;
 403
 404 /* Coding-systems are handed between Emacs Lisp programs and C internal
 405    routines by the following three variables.  */
 406 /* Coding-system for reading files and receiving data from process.  */
 407 Lisp_Object Vcoding_system_for_read;
 408 /* Coding-system for writing files and sending data to process.  */
 409 Lisp_Object Vcoding_system_for_write;
 410 /* Coding-system actually used in the latest I/O.  */
 411 Lisp_Object Vlast_coding_system_used;
 412
 413 /* A vector of length 256 which contains information about special
 414    Latin codes (especially for dealing with Microsoft codes).  */
 415 Lisp_Object Vlatin_extra_code_table;
 416
 417 /* Flag to inhibit code conversion of end-of-line format.  */
 418 int inhibit_eol_conversion;
 419
 420 /* Flag to inhibit ISO2022 escape sequence detection.  */
 421 int inhibit_iso_escape_detection;
 422
 423 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 424 int inherit_process_coding_system;
 425
 426 /* Coding system to be used to encode text for terminal display.  */
 427 struct coding_system terminal_coding;
 428
 429 /* Coding system to be used to encode text for terminal display when
 430    terminal coding system is nil.  */
 431 struct coding_system safe_terminal_coding;
 432
 433 /* Coding system of what is sent from terminal keyboard.  */
 434 struct coding_system keyboard_coding;
 435
 436 /* Default coding system to be used to write a file.  */
 437 struct coding_system default_buffer_file_coding;
 438
 439 Lisp_Object Vfile_coding_system_alist;
 440 Lisp_Object Vprocess_coding_system_alist;
 441 Lisp_Object Vnetwork_coding_system_alist;
 442
 443 Lisp_Object Vlocale_coding_system;
 444
 445 #endif /* emacs */
 446
 447 Lisp_Object Qcoding_category, Qcoding_category_index;
 448
 449 /* List of symbols `coding-category-xxx' ordered by priority.  */
 450 Lisp_Object Vcoding_category_list;
 451
 452 /* Table of coding categories (Lisp symbols).  */
 453 Lisp_Object Vcoding_category_table;
 454
 455 /* Table of names of symbol for each coding-category.  */
 456 char *coding_category_name[CODING_CATEGORY_IDX_MAX] = {
 457   "coding-category-emacs-mule",
 458   "coding-category-sjis",
 459   "coding-category-iso-7",
 460   "coding-category-iso-7-tight",
 461   "coding-category-iso-8-1",
 462   "coding-category-iso-8-2",
 463   "coding-category-iso-7-else",
 464   "coding-category-iso-8-else",
 465   "coding-category-ccl",
 466   "coding-category-big5",
 467   "coding-category-utf-8",
 468   "coding-category-utf-16-be",
 469   "coding-category-utf-16-le",
 470   "coding-category-raw-text",
 471   "coding-category-binary"
 472 };
 473
 474 /* Table of pointers to coding systems corresponding to each coding
 475    categories.  */
 476 struct coding_system *coding_system_table[CODING_CATEGORY_IDX_MAX];
 477
 478 /* Table of coding category masks.  Nth element is a mask for a coding
 479    category of which priority is Nth.  */
 480 static
 481 int coding_priorities[CODING_CATEGORY_IDX_MAX];
 482
 483 /* Flag to tell if we look up translation table on character code
 484    conversion.  */
 485 Lisp_Object Venable_character_translation;
 486 /* Standard translation table to look up on decoding (reading).  */
 487 Lisp_Object Vstandard_translation_table_for_decode;
 488 /* Standard translation table to look up on encoding (writing).  */
 489 Lisp_Object Vstandard_translation_table_for_encode;
 490
 491 Lisp_Object Qtranslation_table;
 492 Lisp_Object Qtranslation_table_id;
 493 Lisp_Object Qtranslation_table_for_decode;
 494 Lisp_Object Qtranslation_table_for_encode;
 495
 496 /* Alist of charsets vs revision number.  */
 497 Lisp_Object Vcharset_revision_alist;
 498
 499 /* Default coding systems used for process I/O.  */
 500 Lisp_Object Vdefault_process_coding_system;
 501
 502 /* Char table for translating Quail and self-inserting input.  */
 503 Lisp_Object Vtranslation_table_for_input;
 504
 505 /* Global flag to tell that we can't call post-read-conversion and
 506    pre-write-conversion functions.  Usually the value is zero, but it
 507    is set to 1 temporarily while such functions are running.  This is
 508    to avoid infinite recursive call.  */
 509 static int inhibit_pre_post_conversion;
 510
 511 Lisp_Object Qchar_coding_system;
 512
 513 /* Return `safe-chars' property of CODING_SYSTEM (symbol).  Don't check
 514    its validity.  */
 515
 516 Lisp_Object
 517 coding_safe_chars (coding_system)
 518      Lisp_Object coding_system;
 519 {
 520   Lisp_Object coding_spec, plist, safe_chars;
 521
 522   coding_spec = Fget (coding_system, Qcoding_system);
 523   plist = XVECTOR (coding_spec)->contents[3];
 524   safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars);
 525   return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt);
 526 }
 527
 528 #define CODING_SAFE_CHAR_P(safe_chars, c) \
 529   (EQ (safe_chars, Qt) || !NILP (CHAR_TABLE_REF (safe_chars, c)))
 530
 531 \f
 532 /*** 2. Emacs internal format (emacs-mule) handlers ***/
 533
 534 /* Emacs' internal format for representation of multiple character
 535    sets is a kind of multi-byte encoding, i.e. characters are
 536    represented by variable-length sequences of one-byte codes.
 537
 538    ASCII characters and control characters (e.g. `tab', `newline') are
 539    represented by one-byte sequences which are their ASCII codes, in
 540    the range 0x00 through 0x7F.
 541
 542    8-bit characters of the range 0x80..0x9F are represented by
 543    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
 544    code + 0x20).
 545
 546    8-bit characters of the range 0xA0..0xFF are represented by
 547    one-byte sequences which are their 8-bit code.
 548
 549    The other characters are represented by a sequence of `base
 550    leading-code', optional `extended leading-code', and one or two
 551    `position-code's.  The length of the sequence is determined by the
 552    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
 553    whereas extended leading-code and position-code take the range 0xA0
 554    through 0xFF.  See `charset.h' for more details about leading-code
 555    and position-code.
 556
 557    --- CODE RANGE of Emacs' internal format ---
 558    character set        range
 559    -------------        -----
 560    ascii                0x00..0x7F
 561    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
  562    eight-bit-graphic    0xA0..0xFF
 563    ELSE                 0x81..0x9D + [0xA0..0xFF]+
 564    ---------------------------------------------
 565
 566    As this is the internal character representation, the format is
 567    usually not used externally (i.e. in a file or in a data sent to a
 568    process).  But, it is possible to have a text externally in this
 569    format (i.e. by encoding by the coding system `emacs-mule').
 570
 571    In that case, a sequence of one-byte codes has a slightly different
 572    form.
 573
 574    Firstly, all characters in eight-bit-control are represented by
 575    one-byte sequences which are their 8-bit code.
 576
 577    Next, character composition data are represented by the byte
 578    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
 579    where,
 580         METHOD is 0xF0 plus one of composition method (enum
 581         composition_method),
 582
 583         BYTES is 0xA0 plus the byte length of these composition data,
 584
 585         CHARS is 0xA0 plus the number of characters composed by these
 586         data,
 587
 588         COMPONENTs are characters of multibyte form or composition
  589         rules encoded by two bytes of ASCII codes.
 590
 591    In addition, for backward compatibility, the following formats are
 592    also recognized as composition data on decoding.
 593
 594    0x80 MSEQ ...
 595    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
 596
 597    Here,
  598         MSEQ is a multibyte form but in this special format:
 599           ASCII: 0xA0 ASCII_CODE+0x80,
 600           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
 601         RULE is a one byte code of the range 0xA0..0xF0 that
 602         represents a composition rule.
 603   */
 604
  605 enum emacs_code_class_type emacs_code_class[256];  /* 256 entries, presumably one per byte value; not initialized in this section -- TODO confirm where it is filled in.  */
 606
 607 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
 608    Check if a text is encoded in Emacs' internal format.  If it is,
 609    return CODING_CATEGORY_MASK_EMACS_MULE, else return 0.  */
 610
 611 static int
 612 detect_coding_emacs_mule (src, src_end, multibytep)
 613       unsigned char *src, *src_end;
 614       int multibytep;
 615 {
 616   unsigned char c;
 617   int composing = 0;
 618   /* Dummy for ONE_MORE_BYTE.  */
 619   struct coding_system dummy_coding;
 620   struct coding_system *coding = &dummy_coding;
 621
 622   while (1)
 623     {
 624       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 625
 626       if (composing)
 627         {
 628           if (c < 0xA0)
 629             composing = 0;
 630           else if (c == 0xA0)
 631             {
 632               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
 633               c &= 0x7F;
 634             }
 635           else
 636             c -= 0x20;
 637         }
 638
 639       if (c < 0x20)
 640         {
 641           if (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
 642             return 0;
 643         }
 644       else if (c >= 0x80 && c < 0xA0)
 645         {
 646           if (c == 0x80)
 647             /* Old leading code for a composite character.  */
 648             composing = 1;
 649           else
 650             {
 651               unsigned char *src_base = src - 1;
 652               int bytes;
 653
 654               if (!UNIBYTE_STR_AS_MULTIBYTE_P (src_base, src_end - src_base,
 655                                                bytes))
 656                 return 0;
 657               src = src_base + bytes;
 658             }
 659         }
 660     }
 661  label_end_of_loop:
 662   return CODING_CATEGORY_MASK_EMACS_MULE;
 663 }
 664
 665
 666 /* Record the starting position START and METHOD of one composition.  */
 667
 668 #define CODING_ADD_COMPOSITION_START(coding, start, method)     \
 669   do {                                                          \
 670     struct composition_data *cmp_data = coding->cmp_data;       \
 671     int *data = cmp_data->data + cmp_data->used;                \
 672     coding->cmp_data_start = cmp_data->used;                    \
 673     data[0] = -1;                                               \
 674     data[1] = cmp_data->char_offset + start;                    \
 675     data[3] = (int) method;                                     \
 676     cmp_data->used += 4;                                        \
 677   } while (0)
 678
 679 /* Record the ending position END of the current composition.  */
 680
 681 #define CODING_ADD_COMPOSITION_END(coding, end)                 \
 682   do {                                                          \
 683     struct composition_data *cmp_data = coding->cmp_data;       \
 684     int *data = cmp_data->data + coding->cmp_data_start;        \
 685     data[0] = cmp_data->used - coding->cmp_data_start;          \
 686     data[2] = cmp_data->char_offset + end;                      \
 687   } while (0)
 688
 689 /* Record one COMPONENT (alternate character or composition rule).  */
 690
 691 #define CODING_ADD_COMPOSITION_COMPONENT(coding, component)             \
 692   do {                                                                  \
 693     coding->cmp_data->data[coding->cmp_data->used++] = component;       \
 694     if (coding->cmp_data->used - coding->cmp_data_start                 \
 695         == COMPOSITION_DATA_MAX_BUNCH_LENGTH)                           \
 696       {                                                                 \
 697         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
 698         coding->composing = COMPOSITION_NO;                             \
 699       }                                                                 \
 700   } while (0)
 701
 702
 703 /* Get one byte from a data pointed by SRC and increment SRC.  If SRC
 704    is not less than SRC_END, return -1 without incrementing Src.  */
 705
 706 #define SAFE_ONE_MORE_BYTE() (src >= src_end ? -1 : *src++)
 707
 708
 709 /* Decode a character represented as a component of composition
 710    sequence of Emacs 20 style at SRC.  Set C to that character, store
 711    its multibyte form sequence at P, and set P to the end of that
 712    sequence.  If no valid character is found, set C to -1.  */
 713
 714 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p)                \
 715   do {                                                          \
 716     int bytes;                                                  \
 717                                                                 \
 718     c = SAFE_ONE_MORE_BYTE ();                                  \
 719     if (c < 0)                                                  \
 720       break;                                                    \
 721     if (CHAR_HEAD_P (c))                                        \
 722       c = -1;                                                   \
 723     else if (c == 0xA0)                                         \
 724       {                                                         \
 725         c = SAFE_ONE_MORE_BYTE ();                              \
 726         if (c < 0xA0)                                           \
 727           c = -1;                                               \
 728         else                                                    \
 729           {                                                     \
 730             c -= 0xA0;                                          \
 731             *p++ = c;                                           \
 732           }                                                     \
 733       }                                                         \
 734     else if (BASE_LEADING_CODE_P (c - 0x20))                    \
 735       {                                                         \
 736         unsigned char *p0 = p;                                  \
 737                                                                 \
 738         c -= 0x20;                                              \
 739         *p++ = c;                                               \
 740         bytes = BYTES_BY_CHAR_HEAD (c);                         \
 741         while (--bytes)                                         \
 742           {                                                     \
 743             c = SAFE_ONE_MORE_BYTE ();                          \
 744             if (c < 0)                                          \
 745               break;                                            \
 746             *p++ = c;                                           \
 747           }                                                     \
 748         if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)      \
 749             || (coding->flags /* We are recovering a file.  */  \
 750                 && p0[0] == LEADING_CODE_8_BIT_CONTROL          \
 751                 && ! CHAR_HEAD_P (p0[1])))                      \
 752           c = STRING_CHAR (p0, bytes);                          \
 753         else                                                    \
 754           c = -1;                                               \
 755       }                                                         \
 756     else                                                        \
 757       c = -1;                                                   \
 758   } while (0)
 759
 760
 761 /* Decode a composition rule represented as a component of composition
 762    sequence of Emacs 20 style at SRC.  Set C to the rule.  If not
 763    valid rule is found, set C to -1.  */
 764
 765 #define DECODE_EMACS_MULE_COMPOSITION_RULE(c)           \
 766   do {                                                  \
 767     c = SAFE_ONE_MORE_BYTE ();                          \
 768     c -= 0xA0;                                          \
 769     if (c < 0 || c >= 81)                               \
 770       c = -1;                                           \
 771     else                                                \
 772       {                                                 \
 773         gref = c / 9, nref = c % 9;                     \
 774         c = COMPOSITION_ENCODE_RULE (gref, nref);       \
 775       }                                                 \
 776   } while (0)
 777
 778
 779 /* Decode composition sequence encoded by `emacs-mule' at the source
 780    pointed by SRC.  SRC_END is the end of source.  Store information
 781    of the composition in CODING->cmp_data.
 782
 783    For backward compatibility, decode also a composition sequence of
 784    Emacs 20 style.  In that case, the composition sequence contains
 785    characters that should be extracted into a buffer or string.  Store
 786    those characters at *DESTINATION in multibyte form.
 787
 788    If we encounter an invalid byte sequence, return 0.
 789    If we encounter an insufficient source or destination, or
 790    insufficient space in CODING->cmp_data, return 1.
 791    Otherwise, return consumed bytes in the source.
 792
 793 */
 794 static INLINE int
 795 decode_composition_emacs_mule (coding, src, src_end,
 796                                destination, dst_end, dst_bytes)
 797      struct coding_system *coding;
 798      unsigned char *src, *src_end, **destination, *dst_end;
 799      int dst_bytes;
 800 {
 801   unsigned char *dst = *destination;
 802   int method, data_len, nchars;
 803   unsigned char *src_base = src++;
 804   /* Store components of composition.  */
 805   int component[COMPOSITION_DATA_MAX_BUNCH_LENGTH];
 806   int ncomponent;
 807   /* Store multibyte form of characters to be composed.  This is for
 808      Emacs 20 style composition sequence.  */
 809   unsigned char buf[MAX_COMPOSITION_COMPONENTS * MAX_MULTIBYTE_LENGTH];
 810   unsigned char *bufp = buf;
 811   int c, i, gref, nref;
 812
 813   if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
 814       >= COMPOSITION_DATA_SIZE)
 815     {
 816       coding->result = CODING_FINISH_INSUFFICIENT_CMP;
 817       return -1;
 818     }
 819
 820   ONE_MORE_BYTE (c);
 821   if (c - 0xF0 >= COMPOSITION_RELATIVE
 822            && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
 823     {
 824       int with_rule;
 825
 826       method = c - 0xF0;
 827       with_rule = (method == COMPOSITION_WITH_RULE
 828                    || method == COMPOSITION_WITH_RULE_ALTCHARS);
 829       ONE_MORE_BYTE (c);
 830       data_len = c - 0xA0;
 831       if (data_len < 4
 832           || src_base + data_len > src_end)
 833         return 0;
 834       ONE_MORE_BYTE (c);
 835       nchars = c - 0xA0;
 836       if (c < 1)
 837         return 0;
 838       for (ncomponent = 0; src < src_base + data_len; ncomponent++)
 839         {
 840           /* If it is longer than this, it can't be valid.  */
 841           if (ncomponent >= COMPOSITION_DATA_MAX_BUNCH_LENGTH)
 842             return 0;
 843
 844           if (ncomponent % 2 && with_rule)
 845             {
 846               ONE_MORE_BYTE (gref);
 847               gref -= 32;
 848               ONE_MORE_BYTE (nref);
 849               nref -= 32;
 850               c = COMPOSITION_ENCODE_RULE (gref, nref);
 851             }
 852           else
 853             {
 854               int bytes;
 855               if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
 856                   || (coding->flags /* We are recovering a file.  */
 857                       && src[0] == LEADING_CODE_8_BIT_CONTROL
 858                       && ! CHAR_HEAD_P (src[1])))
 859                 c = STRING_CHAR (src, bytes);
 860               else
 861                 c = *src, bytes = 1;
 862               src += bytes;
 863             }
 864           component[ncomponent] = c;
 865         }
 866     }
 867   else
 868     {
 869       /* This may be an old Emacs 20 style format.  See the comment at
 870          the section 2 of this file.  */
 871       while (src < src_end && !CHAR_HEAD_P (*src)) src++;
 872       if (src == src_end
 873           && !(coding->mode & CODING_MODE_LAST_BLOCK))
 874         goto label_end_of_loop;
 875
 876       src_end = src;
 877       src = src_base + 1;
 878       if (c < 0xC0)
 879         {
 880           method = COMPOSITION_RELATIVE;
 881           for (ncomponent = 0; ncomponent < MAX_COMPOSITION_COMPONENTS;)
 882             {
 883               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 884               if (c < 0)
 885                 break;
 886               component[ncomponent++] = c;
 887             }
 888           if (ncomponent < 2)
 889             return 0;
 890           nchars = ncomponent;
 891         }
 892       else if (c == 0xFF)
 893         {
 894           method = COMPOSITION_WITH_RULE;
 895           src++;
 896           DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 897           if (c < 0)
 898             return 0;
 899           component[0] = c;
 900           for (ncomponent = 1;
 901                ncomponent < MAX_COMPOSITION_COMPONENTS * 2 - 1;)
 902             {
 903               DECODE_EMACS_MULE_COMPOSITION_RULE (c);
 904               if (c < 0)
 905                 break;
 906               component[ncomponent++] = c;
 907               DECODE_EMACS_MULE_COMPOSITION_CHAR (c, bufp);
 908               if (c < 0)
 909                 break;
 910               component[ncomponent++] = c;
 911             }
 912           if (ncomponent < 3)
 913             return 0;
 914           nchars = (ncomponent + 1) / 2;
 915         }
 916       else
 917         return 0;
 918     }
 919
 920   if (buf == bufp || dst + (bufp - buf) <= (dst_bytes ? dst_end : src))
 921     {
 922       CODING_ADD_COMPOSITION_START (coding, coding->produced_char, method);
 923       for (i = 0; i < ncomponent; i++)
 924         CODING_ADD_COMPOSITION_COMPONENT (coding, component[i]);
 925       CODING_ADD_COMPOSITION_END (coding, coding->produced_char + nchars);
 926       if (buf < bufp)
 927         {
 928           unsigned char *p = buf;
 929           EMIT_BYTES (p, bufp);
 930           *destination += bufp - buf;
 931           coding->produced_char += nchars;
 932         }
 933       return (src - src_base);
 934     }
 935  label_end_of_loop:
 936   return -1;
 937 }
 938
 939 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
 940
 941 static void
 942 decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
 943      struct coding_system *coding;
 944      unsigned char *source, *destination;
 945      int src_bytes, dst_bytes;
 946 {
 947   unsigned char *src = source;
 948   unsigned char *src_end = source + src_bytes;
 949   unsigned char *dst = destination;
 950   unsigned char *dst_end = destination + dst_bytes;
 951   /* SRC_BASE remembers the start position in source in each loop.
 952      The loop will be exited when there's not enough source code, or
 953      when there's not enough destination area to produce a
 954      character.  */
 955   unsigned char *src_base;
 956
 957   coding->produced_char = 0;
 958   while ((src_base = src) < src_end)
 959     {
 960       unsigned char tmp[MAX_MULTIBYTE_LENGTH], *p;
 961       int bytes;
 962
 963       if (*src == '\r')
 964         {
 965           int c = *src++;
 966
 967           if (coding->eol_type == CODING_EOL_CR)
 968             c = '\n';
 969           else if (coding->eol_type == CODING_EOL_CRLF)
 970             {
 971               ONE_MORE_BYTE (c);
 972               if (c != '\n')
 973                 {
 974                   src--;
 975                   c = '\r';
 976                 }
 977             }
 978           *dst++ = c;
 979           coding->produced_char++;
 980           continue;
 981         }
 982       else if (*src == '\n')
 983         {
 984           if ((coding->eol_type == CODING_EOL_CR
 985                || coding->eol_type == CODING_EOL_CRLF)
 986               && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
 987             {
 988               coding->result = CODING_FINISH_INCONSISTENT_EOL;
 989               goto label_end_of_loop;
 990             }
 991           *dst++ = *src++;
 992           coding->produced_char++;
 993           continue;
 994         }
 995       else if (*src == 0x80 && coding->cmp_data)
 996         {
 997           /* Start of composition data.  */
 998           int consumed  = decode_composition_emacs_mule (coding, src, src_end,
 999                                                          &dst, dst_end,
1000                                                          dst_bytes);
1001           if (consumed < 0)
1002             goto label_end_of_loop;
1003           else if (consumed > 0)
1004             {
1005               src += consumed;
1006               continue;
1007             }
1008           bytes = CHAR_STRING (*src, tmp);
1009           p = tmp;
1010           src++;
1011         }
1012       else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)
1013                || (coding->flags /* We are recovering a file.  */
1014                    && src[0] == LEADING_CODE_8_BIT_CONTROL
1015                    && ! CHAR_HEAD_P (src[1])))
1016         {
1017           p = src;
1018           src += bytes;
1019         }
1020       else
1021         {
1022           bytes = CHAR_STRING (*src, tmp);
1023           p = tmp;
1024           src++;
1025         }
1026       if (dst + bytes >= (dst_bytes ? dst_end : src))
1027         {
1028           coding->result = CODING_FINISH_INSUFFICIENT_DST;
1029           break;
1030         }
1031       while (bytes--) *dst++ = *p++;
1032       coding->produced_char++;
1033     }
1034  label_end_of_loop:
1035   coding->consumed = coding->consumed_char = src_base - source;
1036   coding->produced = dst - destination;
1037 }
1038
1039
/* Encode the composition described by DATA as the special byte
   sequence introduced by 0x80 and append it at DST.  DATA[0] is the
   length of the description, DATA[1] and DATA[2] the start and end
   character positions, DATA[3] the composition method, and DATA[4]
   onward the components.

   If the destination cannot hold the sequence, set CODING->result to
   CODING_FINISH_INSUFFICIENT_DST and jump to label_end_of_loop.
   Otherwise advance CODING->cmp_data_start past DATA and, when the
   current CODING->cmp_data is used up and has a successor, switch to
   that successor.

   DST, DST_END, DST_BYTES, SRC and label_end_of_loop are taken from
   the expansion site.  */

#define ENCODE_COMPOSITION_EMACS_MULE(coding, data)                     \
  do {                                                                  \
    unsigned char cmp_buf[1024];                                        \
    unsigned char *cmp_from = cmp_buf;                                  \
    unsigned char *cmp_to = cmp_buf + 4;                                \
    int cmp_len = data[0];                                              \
    int cmp_idx;                                                        \
                                                                        \
    cmp_buf[0] = 0x80;                                                  \
    cmp_buf[1] = 0xF0 + data[3];        /* METHOD */                    \
    cmp_buf[3] = 0xA0 + (data[2] - data[1]); /* COMPOSED-CHARS */       \
    if (data[3] != COMPOSITION_WITH_RULE                                \
        && data[3] != COMPOSITION_WITH_RULE_ALTCHARS)                   \
      {                                                                 \
        /* Components are characters only.  */                          \
        for (cmp_idx = 4; cmp_idx < cmp_len; cmp_idx++)                 \
          cmp_to += CHAR_STRING (data[cmp_idx], cmp_to);                \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* A character, then (rule, character) pairs; each rule is      \
           written as two bytes biased by 0x20.  */                     \
        cmp_to += CHAR_STRING (data[4], cmp_to);                        \
        for (cmp_idx = 5; cmp_idx < cmp_len; cmp_idx += 2)              \
          {                                                             \
            int gref, nref;                                             \
            COMPOSITION_DECODE_RULE (data[cmp_idx], gref, nref);        \
            *cmp_to++ = 0x20 + gref;                                    \
            *cmp_to++ = 0x20 + nref;                                    \
            cmp_to += CHAR_STRING (data[cmp_idx + 1], cmp_to);          \
          }                                                             \
      }                                                                 \
    cmp_buf[2] = 0xA0 + (cmp_to - cmp_buf);     /* COMPONENTS-BYTES */  \
                                                                        \
    if (dst + (cmp_to - cmp_buf) + 4 > (dst_bytes ? dst_end : src))     \
      {                                                                 \
        coding->result = CODING_FINISH_INSUFFICIENT_DST;                \
        goto label_end_of_loop;                                         \
      }                                                                 \
    for (; cmp_from < cmp_to; cmp_from++)                               \
      *dst++ = *cmp_from;                                               \
    coding->cmp_data_start += data[0];                                  \
    if (coding->cmp_data_start == coding->cmp_data->used                \
        && coding->cmp_data->next)                                      \
      {                                                                 \
        coding->cmp_data = coding->cmp_data->next;                      \
        coding->cmp_data_start = 0;                                     \
      }                                                                 \
  } while (0)
1089
1090
1091 static void encode_eol P_ ((struct coding_system *, const unsigned char *,
1092                             unsigned char *, int, int));
1093
1094 static void
1095 encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes)
1096      struct coding_system *coding;
1097      unsigned char *source, *destination;
1098      int src_bytes, dst_bytes;
1099 {
1100   unsigned char *src = source;
1101   unsigned char *src_end = source + src_bytes;
1102   unsigned char *dst = destination;
1103   unsigned char *dst_end = destination + dst_bytes;
1104   unsigned char *src_base;
1105   int c;
1106   int char_offset;
1107   int *data;
1108
1109   Lisp_Object translation_table;
1110
1111   translation_table = Qnil;
1112
1113   /* Optimization for the case that there's no composition.  */
1114   if (!coding->cmp_data || coding->cmp_data->used == 0)
1115     {
1116       encode_eol (coding, source, destination, src_bytes, dst_bytes);
1117       return;
1118     }
1119
1120   char_offset = coding->cmp_data->char_offset;
1121   data = coding->cmp_data->data + coding->cmp_data_start;
1122   while (1)
1123     {
1124       src_base = src;
1125
1126       /* If SRC starts a composition, encode the information about the
1127          composition in advance.  */
1128       if (coding->cmp_data_start < coding->cmp_data->used
1129           && char_offset + coding->consumed_char == data[1])
1130         {
1131           ENCODE_COMPOSITION_EMACS_MULE (coding, data);
1132           char_offset = coding->cmp_data->char_offset;
1133           data = coding->cmp_data->data + coding->cmp_data_start;
1134         }
1135
1136       ONE_MORE_CHAR (c);
1137       if (c == '\n' && (coding->eol_type == CODING_EOL_CRLF
1138                         || coding->eol_type == CODING_EOL_CR))
1139         {
1140           if (coding->eol_type == CODING_EOL_CRLF)
1141             EMIT_TWO_BYTES ('\r', c);
1142           else
1143             EMIT_ONE_BYTE ('\r');
1144         }
1145       else if (SINGLE_BYTE_CHAR_P (c))
1146         {
1147           if (coding->flags && ! ASCII_BYTE_P (c))
1148             {
1149               /* As we are auto saving, retain the multibyte form for
1150                  8-bit chars.  */
1151               unsigned char buf[MAX_MULTIBYTE_LENGTH];
1152               int bytes = CHAR_STRING (c, buf);
1153
1154               if (bytes == 1)
1155                 EMIT_ONE_BYTE (buf[0]);
1156               else
1157                 EMIT_TWO_BYTES (buf[0], buf[1]);
1158             }
1159           else
1160             EMIT_ONE_BYTE (c);
1161         }
1162       else
1163         EMIT_BYTES (src_base, src);
1164       coding->consumed_char++;
1165     }
1166  label_end_of_loop:
1167   coding->consumed = src_base - source;
1168   coding->produced = coding->produced_char = dst - destination;
1169   return;
1170 }
1171
1172 \f
1173 /*** 3. ISO2022 handlers ***/
1174
1175 /* The following note describes the coding system ISO2022 briefly.
1176    Since the intention of this note is to help understand the
1177    functions in this file, some parts are NOT ACCURATE or are OVERLY
1178    SIMPLIFIED.  For thorough understanding, please refer to the
1179    original document of ISO2022.  This is equivalent to the standard
1180    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
1181
1182    ISO2022 provides many mechanisms to encode several character sets
1183    in 7-bit and 8-bit environments.  For 7-bit environments, all text
1184    is encoded using bytes less than 128.  This may make the encoded
1185    text a little bit longer, but the text passes more easily through
1186    several types of gateway, some of which strip off the MSB (Most
1187    Significant Bit).
1188
1189    There are two kinds of character sets: control character sets and
1190    graphic character sets.  The former contain control characters such
1191    as `newline' and `escape' to provide control functions (control
1192    functions are also provided by escape sequences).  The latter
1193    contain graphic characters such as 'A' and '-'.  Emacs recognizes
1194    two control character sets and many graphic character sets.
1195
1196    Graphic character sets are classified into one of the following
1197    four classes, according to the number of bytes (DIMENSION) and
1198    number of characters in one dimension (CHARS) of the set:
1199    - DIMENSION1_CHARS94
1200    - DIMENSION1_CHARS96
1201    - DIMENSION2_CHARS94
1202    - DIMENSION2_CHARS96
1203
1204    In addition, each character set is assigned an identification tag,
1205    unique for each set, called the "final character" (denoted as <F>
1206    hereafter).  The <F> of each character set is decided by ECMA(*)
1207    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
1208    (0x30..0x3F are for private use only).
1209
1210    Note (*): ECMA = European Computer Manufacturers Association
1211
1212    Here are examples of graphic character sets [NAME(<F>)]:
1213         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
1214         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
1215         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
1216         o DIMENSION2_CHARS96 -- none for the moment
1217
1218    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
1219         C0 [0x00..0x1F] -- control character plane 0
1220         GL [0x20..0x7F] -- graphic character plane 0
1221         C1 [0x80..0x9F] -- control character plane 1
1222         GR [0xA0..0xFF] -- graphic character plane 1
1223
1224    A control character set is directly designated and invoked to C0 or
1225    C1 by an escape sequence.  The most common case is that:
1226    - ISO646's  control character set is designated/invoked to C0, and
1227    - ISO6429's control character set is designated/invoked to C1,
1228    and usually these designations/invocations are omitted in encoded
1229    text.  In a 7-bit environment, only C0 can be used, and a control
1230    character for C1 is encoded by an appropriate escape sequence to
1231    fit into the environment.  All control characters for C1 are
1232    defined to have corresponding escape sequences.
1233
1234    A graphic character set is at first designated to one of four
1235    graphic registers (G0 through G3), then these graphic registers are
1236    invoked to GL or GR.  These designations and invocations can be
1237    done independently.  The most common case is that G0 is invoked to
1238    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
1239    these invocations and designations are omitted in encoded text.
1240    In a 7-bit environment, only GL can be used.
1241
1242    When a graphic character set of CHARS94 is invoked to GL, codes
1243    0x20 and 0x7F of the GL area work as control characters SPACE and
1244    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
1245    be used.
1246
1247    There are two ways of invocation: locking-shift and single-shift.
1248    With locking-shift, the invocation lasts until the next different
1249    invocation, whereas with single-shift, the invocation affects the
1250    following character only and doesn't affect the locking-shift
1251    state.  Invocations are done by the following control characters or
1252    escape sequences:
1253
1254    ----------------------------------------------------------------------
1255    abbrev  function                  cntrl escape seq   description
1256    ----------------------------------------------------------------------
1257    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
1258    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
1259    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
1260    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
1261    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
1262    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
1263    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
1264    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
1265    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
1266    ----------------------------------------------------------------------
1267    (*) These are not used by any known coding system.
1268
1269    Control characters for these functions are defined by macros
1270    ISO_CODE_XXX in `coding.h'.
1271
1272    Designations are done by the following escape sequences:
1273    ----------------------------------------------------------------------
1274    escape sequence      description
1275    ----------------------------------------------------------------------
1276    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
1277    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
1278    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
1279    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
1280    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
1281    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
1282    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
1283    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
1284    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
1285    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
1286    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
1287    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
1288    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
1289    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
1290    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
1291    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
1292    ----------------------------------------------------------------------
1293
1294    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
1295    of dimension 1, chars 94, and final character <F>, etc...
1296
1297    Note (*): Although these designations are not allowed in ISO2022,
1298    Emacs accepts them on decoding, and produces them on encoding
1299    CHARS96 character sets in a coding system which is characterized as
1300    7-bit environment, non-locking-shift, and non-single-shift.
1301
1302    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
1303    '(' can be omitted.  We refer to this as "short-form" hereafter.
1304
1305    Now you may notice that there are a lot of ways of encoding the
1306    same multilingual text in ISO2022.  Actually, there exist many
1307    coding systems such as Compound Text (used in X11's inter client
1308    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
1309    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
1310    localized platforms), and all of these are variants of ISO2022.
1311
1312    In addition to the above, Emacs handles two more kinds of escape
1313    sequences: ISO6429's direction specification and Emacs' private
1314    sequence for specifying character composition.
1315
1316    ISO6429's direction specification takes the following form:
1317         o CSI ']'      -- end of the current direction
1318         o CSI '0' ']'  -- end of the current direction
1319         o CSI '1' ']'  -- start of left-to-right text
1320         o CSI '2' ']'  -- start of right-to-left text
1321    The control character CSI (0x9B: control sequence introducer) is
1322    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
1323
1324    Character composition specification takes the following form:
1325         o ESC '0' -- start relative composition
1326         o ESC '1' -- end composition
1327         o ESC '2' -- start rule-base composition (*)
1328         o ESC '3' -- start relative composition with alternate chars  (**)
1329         o ESC '4' -- start rule-base composition with alternate chars  (**)
1330   Since these are not standard escape sequences of any ISO standard,
1331   the use of them with these meanings is restricted to Emacs only.
1332
1333   (*) This form is used only in Emacs 20.5 and older versions,
1334   but the newer versions can safely decode it.
1335   (**) This form is used only in Emacs 21.1 and newer versions,
1336   and the older versions can't decode it.
1337
1338   Here's a list of example usages of these composition escape
1339   sequences (categorized by `enum composition_method').
1340
1341   COMPOSITION_RELATIVE:
1342         ESC 0 CHAR [ CHAR ] ESC 1
1343   COMPOSITION_WITH_RULE:
1344         ESC 2 CHAR [ RULE CHAR ] ESC 1
1345   COMPOSITION_WITH_ALTCHARS:
1346         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
1347   COMPOSITION_WITH_RULE_ALTCHARS:
1348         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
1349
1350 enum iso_code_class_type iso_code_class[256];
1351
1352 #define CHARSET_OK(idx, charset, c)                                     \
1353   (coding_system_table[idx]                                             \
1354    && (charset == CHARSET_ASCII                                         \
1355        || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \
1356            CODING_SAFE_CHAR_P (safe_chars, c)))                         \
1357    && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \
1358                                               charset)                  \
1359        != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
1360
1361 #define SHIFT_OUT_OK(idx) \
1362   (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0)
1363
1364 #define COMPOSITION_OK(idx)     \
1365   (coding_system_table[idx]->composing != COMPOSITION_DISABLED)
1366
1367 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1368    Check if a text is encoded in ISO2022.  If it is, return an
1369    integer in which the appropriate flag bits, any of:
1370         CODING_CATEGORY_MASK_ISO_7
1371         CODING_CATEGORY_MASK_ISO_7_TIGHT
1372         CODING_CATEGORY_MASK_ISO_8_1
1373         CODING_CATEGORY_MASK_ISO_8_2
1374         CODING_CATEGORY_MASK_ISO_7_ELSE
1375         CODING_CATEGORY_MASK_ISO_8_ELSE
1376    are set.  If a code which should never appear in ISO2022 is found,
1377    return 0.  */
1378
1379 static int
1380 detect_coding_iso2022 (src, src_end, multibytep)
1381      unsigned char *src, *src_end;
1382      int multibytep;
1383 {
1384   int mask = CODING_CATEGORY_MASK_ISO;
1385   int mask_found = 0;
1386   int reg[4], shift_out = 0, single_shifting = 0;
1387   int c, c1, charset;
1388   /* Dummy for ONE_MORE_BYTE.  */
1389   struct coding_system dummy_coding;
1390   struct coding_system *coding = &dummy_coding;
1391   Lisp_Object safe_chars;
1392
1393   reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1;
1394   while (mask && src < src_end)
1395     {
1396       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1397     retry:
1398       switch (c)
1399         {
1400         case ISO_CODE_ESC:
1401           if (inhibit_iso_escape_detection)
1402             break;
1403           single_shifting = 0;
1404           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1405           if (c >= '(' && c <= '/')
1406             {
1407               /* Designation sequence for a charset of dimension 1.  */
1408               ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1409               if (c1 < ' ' || c1 >= 0x80
1410                   || (charset = iso_charset_table[0][c >= ','][c1]) < 0)
1411                 /* Invalid designation sequence.  Just ignore.  */
1412                 break;
1413               reg[(c - '(') % 4] = charset;
1414             }
1415           else if (c == '$')
1416             {
1417               /* Designation sequence for a charset of dimension 2.  */
1418               ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1419               if (c >= '@' && c <= 'B')
1420                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
1421                 reg[0] = charset = iso_charset_table[1][0][c];
1422               else if (c >= '(' && c <= '/')
1423                 {
1424                   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
1425                   if (c1 < ' ' || c1 >= 0x80
1426                       || (charset = iso_charset_table[1][c >= ','][c1]) < 0)
1427                     /* Invalid designation sequence.  Just ignore.  */
1428                     break;
1429                   reg[(c - '(') % 4] = charset;
1430                 }
1431               else
1432                 /* Invalid designation sequence.  Just ignore.  */
1433                 break;
1434             }
1435           else if (c == 'N' || c == 'O')
1436             {
1437               /* ESC <Fe> for SS2 or SS3.  */
1438               mask &= CODING_CATEGORY_MASK_ISO_7_ELSE;
1439               break;
1440             }
1441           else if (c >= '0' && c <= '4')
1442             {
1443               /* ESC <Fp> for start/end composition.  */
1444               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7))
1445                 mask_found |= CODING_CATEGORY_MASK_ISO_7;
1446               else
1447                 mask &= ~CODING_CATEGORY_MASK_ISO_7;
1448               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT))
1449                 mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1450               else
1451                 mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1452               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1))
1453                 mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1454               else
1455                 mask &= ~CODING_CATEGORY_MASK_ISO_8_1;
1456               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2))
1457                 mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1458               else
1459                 mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1460               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE))
1461                 mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1462               else
1463                 mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1464               if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE))
1465                 mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1466               else
1467                 mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1468               break;
1469             }
1470           else
1471             /* Invalid escape sequence.  Just ignore.  */
1472             break;
1473
1474           /* We found a valid designation sequence for CHARSET.  */
1475           mask &= ~CODING_CATEGORY_MASK_ISO_8BIT;
1476           c = MAKE_CHAR (charset, 0, 0);
1477           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7, charset, c))
1478             mask_found |= CODING_CATEGORY_MASK_ISO_7;
1479           else
1480             mask &= ~CODING_CATEGORY_MASK_ISO_7;
1481           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT, charset, c))
1482             mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT;
1483           else
1484             mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT;
1485           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_7_ELSE, charset, c))
1486             mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE;
1487           else
1488             mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE;
1489           if (CHARSET_OK (CODING_CATEGORY_IDX_ISO_8_ELSE, charset, c))
1490             mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE;
1491           else
1492             mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE;
1493           break;
1494
1495         case ISO_CODE_SO:
1496           if (inhibit_iso_escape_detection)
1497             break;
1498           single_shifting = 0;
1499           if (shift_out == 0
1500               && (reg[1] >= 0
1501                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)
1502                   || SHIFT_OUT_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)))
1503             {
1504               /* Locking shift out.  */
1505               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1506               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1507             }
1508           break;
1509
1510         case ISO_CODE_SI:
1511           if (inhibit_iso_escape_detection)
1512             break;
1513           single_shifting = 0;
1514           if (shift_out == 1)
1515             {
1516               /* Locking shift in.  */
1517               mask &= ~CODING_CATEGORY_MASK_ISO_7BIT;
1518               mask_found |= CODING_CATEGORY_MASK_ISO_SHIFT;
1519             }
1520           break;
1521
1522         case ISO_CODE_CSI:
1523           single_shifting = 0;
1524         case ISO_CODE_SS2:
1525         case ISO_CODE_SS3:
1526           {
1527             int newmask = CODING_CATEGORY_MASK_ISO_8_ELSE;
1528
1529             if (inhibit_iso_escape_detection)
1530               break;
1531             if (c != ISO_CODE_CSI)
1532               {
1533                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1534                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1535                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1536                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1537                     & CODING_FLAG_ISO_SINGLE_SHIFT)
1538                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1539                 single_shifting = 1;
1540               }
1541             if (VECTORP (Vlatin_extra_code_table)
1542                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1543               {
1544                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1545                     & CODING_FLAG_ISO_LATIN_EXTRA)
1546                   newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1547                 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1548                     & CODING_FLAG_ISO_LATIN_EXTRA)
1549                   newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1550               }
1551             mask &= newmask;
1552             mask_found |= newmask;
1553           }
1554           break;
1555
1556         default:
1557           if (c < 0x80)
1558             {
1559               single_shifting = 0;
1560               break;
1561             }
1562           else if (c < 0xA0)
1563             {
1564               single_shifting = 0;
1565               if (VECTORP (Vlatin_extra_code_table)
1566                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
1567                 {
1568                   int newmask = 0;
1569
1570                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags
1571                       & CODING_FLAG_ISO_LATIN_EXTRA)
1572                     newmask |= CODING_CATEGORY_MASK_ISO_8_1;
1573                   if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags
1574                       & CODING_FLAG_ISO_LATIN_EXTRA)
1575                     newmask |= CODING_CATEGORY_MASK_ISO_8_2;
1576                   mask &= newmask;
1577                   mask_found |= newmask;
1578                 }
1579               else
1580                 return 0;
1581             }
1582           else
1583             {
1584               mask &= ~(CODING_CATEGORY_MASK_ISO_7BIT
1585                         | CODING_CATEGORY_MASK_ISO_7_ELSE);
1586               mask_found |= CODING_CATEGORY_MASK_ISO_8_1;
1587               /* Check the length of succeeding codes of the range
1588                  0xA0..0FF.  If the byte length is odd, we exclude
1589                  CODING_CATEGORY_MASK_ISO_8_2.  We can check this only
1590                  when we are not single shifting.  */
1591               if (!single_shifting
1592                   && mask & CODING_CATEGORY_MASK_ISO_8_2)
1593                 {
1594                   int i = 1;
1595
1596                   c = -1;
1597                   while (src < src_end)
1598                     {
1599                       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
1600                       if (c < 0xA0)
1601                         break;
1602                       i++;
1603                     }
1604
1605                   if (i & 1 && src < src_end)
1606                     mask &= ~CODING_CATEGORY_MASK_ISO_8_2;
1607                   else
1608                     mask_found |= CODING_CATEGORY_MASK_ISO_8_2;
1609                   if (c >= 0)
1610                     /* This means that we have read one extra byte.  */
1611                     goto retry;
1612                 }
1613             }
1614           break;
1615         }
1616     }
1617  label_end_of_loop:
1618   return (mask & mask_found);
1619 }
1620
1621 /* Decode a character of which charset is CHARSET, the 1st position
1622    code is C1, the 2nd position code is C2, and return the decoded
1623    character code.  If the variable `translation_table' is non-nil,
1624    return the translated code instead.

        `translation_table' is not a parameter: it is taken from the scope
        in which the macro is expanded.  When it is nil the result is
        MAKE_CHAR (CHARSET, C1, C2); otherwise it is whatever
        translate_char returns for that table and those codes.  */
1625
1626 #define DECODE_ISO_CHARACTER(charset, c1, c2)   \
1627   (NILP (translation_table)                     \
1628    ? MAKE_CHAR (charset, c1, c2)                \
1629    : translate_char (translation_table, -1, charset, c1, c2))
1630
1631 /* Set designation state into CODING.

        REG is the register being designated to, DIMENSION and CHARS
        select the kind of charset, and FINAL_CHAR is the final byte of
        the designation sequence.  The macro uses `coding', `safe_chars'
        and the label `label_invalid_code' from the scope in which it is
        expanded, and evaluates REG and FINAL_CHAR more than once.

        Control jumps to label_invalid_code when:
          - FINAL_CHAR is below '0' or is 128 or above;
          - no charset is found, or the charset is neither requested for
            REG by CODING nor safe according to `safe_chars' (REG is then
            remembered in last_invalid_designation_register);
          - ASCII is designated to register 0 while
            last_invalid_designation_register is 0 (that field is reset
            to -1 first).
        Otherwise last_invalid_designation_register is reset to -1 and
        the charset is stored as the designation of REG.  If the
        CODING_MODE_DIRECTION bit of CODING->mode is set and the charset
        has a reverse charset, the reverse charset is stored instead.

        NOTE(review): MAKE_CHAR is applied to CHARSET before the
        `charset >= 0' test -- confirm that MAKE_CHAR tolerates a
        negative charset.  */
1632 #define DECODE_DESIGNATION(reg, dimension, chars, final_char)              \
1633   do {                                                                     \
1634     int charset, c;                                                        \
1635                                                                            \
1636     if (final_char < '0' || final_char >= 128)                             \
1637       goto label_invalid_code;                                             \
1638     charset = ISO_CHARSET_TABLE (make_number (dimension),                  \
1639                                  make_number (chars),                      \
1640                                  make_number (final_char));                \
1641     c = MAKE_CHAR (charset, 0, 0);                                         \
1642     if (charset >= 0                                                       \
1643         && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) == reg \
1644             || CODING_SAFE_CHAR_P (safe_chars, c)))                        \
1645       {                                                                    \
1646         if (coding->spec.iso2022.last_invalid_designation_register == 0    \
1647             && reg == 0                                                    \
1648             && charset == CHARSET_ASCII)                                   \
1649           {                                                                \
1650             /* We should insert this designation sequence as is so         \
1651                that it is surely written back to a file.  */               \
1652             coding->spec.iso2022.last_invalid_designation_register = -1;   \
1653             goto label_invalid_code;                                       \
1654           }                                                                \
1655         coding->spec.iso2022.last_invalid_designation_register = -1;       \
1656         if ((coding->mode & CODING_MODE_DIRECTION)                         \
1657             && CHARSET_REVERSE_CHARSET (charset) >= 0)                     \
1658           charset = CHARSET_REVERSE_CHARSET (charset);                     \
1659         CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;               \
1660       }                                                                    \
1661     else                                                                   \
1662       {                                                                    \
1663         coding->spec.iso2022.last_invalid_designation_register = reg;      \
1664         goto label_invalid_code;                                           \
1665       }                                                                    \
1666   } while (0)
1667
1668 /* Allocate a memory block for storing information about compositions.
1669    The block is chained to the already allocated blocks.

        The new block is obtained with xmalloc, records CHAR_OFFSET, and
        starts with nothing used.  It is linked after the block currently
        in CODING->cmp_data (if there is one) and then becomes
        CODING->cmp_data itself; CODING->cmp_data_start is reset to 0.  */
1670
1671 void
1672 coding_allocate_composition_data (coding, char_offset)
1673      struct coding_system *coding;
1674      int char_offset;
1675 {
1676   struct composition_data *cmp_data
1677     = (struct composition_data *) xmalloc (sizeof *cmp_data);
1678
1679   cmp_data->char_offset = char_offset;
1680   cmp_data->used = 0;
       /* Append the new block to the doubly linked chain; the previous
          current block, if any, becomes its predecessor.  */
1681   cmp_data->prev = coding->cmp_data;
1682   cmp_data->next = NULL;
1683   if (coding->cmp_data)
1684     coding->cmp_data->next = cmp_data;
       /* The new block is now the current one.  */
1685   coding->cmp_data = cmp_data;
1686   coding->cmp_data_start = 0;
1687 }
1688
1689 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
1690    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
1691    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
1692    ESC 3 : altchar composition :  ESC 3 ALT ... ESC 0 CHAR ... ESC 1
1693    ESC 4 : alt&rule composition : ESC 4 ALT RULE .. ALT ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC.  The macro uses `coding', `dst'
        and the label `label_end_of_loop' from the scope in which it is
        expanded, and evaluates C1 more than once.

        - If composition is disabled in CODING, ESC and the low 7 bits of
          C1 are stored at DST and produced_char grows by 2.
          NOTE(review): no bound check on DST is visible here; presumably
          the caller guarantees room -- confirm.
        - If no composition is in progress, CODING->composing is set to
          the method selected by C1 and the start of the composition is
          recorded, unless the composition data block is missing or too
          full; in that case CODING->result becomes
          CODING_FINISH_INSUFFICIENT_CMP and control jumps to
          label_end_of_loop.
        - If a composition is already in progress with one of the two
          altchars methods, the method is switched to
          COMPOSITION_RELATIVE; any other method is left untouched.
1694   */
1695
1696 #define DECODE_COMPOSITION_START(c1)                                       \
1697   do {                                                                     \
1698     if (coding->composing == COMPOSITION_DISABLED)                         \
1699       {                                                                    \
1700         *dst++ = ISO_CODE_ESC;                                             \
1701         *dst++ = c1 & 0x7f;                                                \
1702         coding->produced_char += 2;                                        \
1703       }                                                                    \
1704     else if (!COMPOSING_P (coding))                                        \
1705       {                                                                    \
1706         /* This is surely the start of a composition.  We must be sure     \
1707            that coding->cmp_data has enough space to store the             \
1708            information about the composition.  If not, terminate the       \
1709            current decoding loop, allocate one more memory block for       \
1710            coding->cmp_data in the caller, then start the decoding         \
1711            loop again.  We can't allocate memory here directly because     \
1712            it may cause buffer/string relocation.  */                      \
1713         if (!coding->cmp_data                                              \
1714             || (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH \
1715                 >= COMPOSITION_DATA_SIZE))                                 \
1716           {                                                                \
1717             coding->result = CODING_FINISH_INSUFFICIENT_CMP;               \
1718             goto label_end_of_loop;                                        \
1719           }                                                                \
1720         coding->composing = (c1 == '0' ? COMPOSITION_RELATIVE              \
1721                              : c1 == '2' ? COMPOSITION_WITH_RULE           \
1722                              : c1 == '3' ? COMPOSITION_WITH_ALTCHARS       \
1723                              : COMPOSITION_WITH_RULE_ALTCHARS);            \
1724         CODING_ADD_COMPOSITION_START (coding, coding->produced_char,       \
1725                                       coding->composing);                  \
1726         coding->composition_rule_follows = 0;                              \
1727       }                                                                    \
1728     else                                                                   \
1729       {                                                                    \
1730         /* We are already handling a composition.  If the method is        \
1731            the following two, the codes following the current escape       \
1732            sequence are actual characters stored in a buffer.  */          \
1733         if (coding->composing == COMPOSITION_WITH_ALTCHARS                 \
1734             || coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)        \
1735           {                                                                \
1736             coding->composing = COMPOSITION_RELATIVE;                      \
1737             coding->composition_rule_follows = 0;                          \
1738           }                                                                \
1739       }                                                                    \
1740   } while (0)
1741
1742 /* Handle composition end sequence ESC 1.

        C1 is the byte that followed ESC.  The macro uses `coding' and
        `dst' from the scope in which it is expanded.  If no composition
        is in progress, ESC and C1 are stored at DST as they are and
        produced_char grows by 2.  Otherwise the end of the composition
        is recorded at the current produced_char position and
        CODING->composing is reset to COMPOSITION_NO.  */
1743
1744 #define DECODE_COMPOSITION_END(c1)                                      \
1745   do {                                                                  \
1746     if (! COMPOSING_P (coding))                                         \
1747       {                                                                 \
1748         *dst++ = ISO_CODE_ESC;                                          \
1749         *dst++ = c1;                                                    \
1750         coding->produced_char += 2;                                     \
1751       }                                                                 \
1752     else                                                                \
1753       {                                                                 \
1754         CODING_ADD_COMPOSITION_END (coding, coding->produced_char);     \
1755         coding->composing = COMPOSITION_NO;                             \
1756       }                                                                 \
1757   } while (0)
1758
1759 /* Decode a composition rule from the byte C1 (and maybe one more byte
1760    from SRC) and store one encoded composition rule in
1761    coding->cmp_data.

        C1 must be a modifiable variable: 32 is subtracted from it in
        place.  The macro also uses `coding' and a variable `c2' from the
        scope in which it is expanded.

        - Adjusted C1 below 81 (old format): the two reference points are
          C1 / 9 and C1 % 9, and a value of 4 is replaced by 10 in each.
        - Adjusted C1 from 81 through 92 (new format): one more byte is
          read into `c2' with ONE_MORE_BYTE, and the reference points are
          C1 - 81 and c2 - 32.
        - Any other value: the rule stays 0.
        The resulting rule is added as a composition component and
        CODING->composition_rule_follows is cleared.  */
1762
1763 #define DECODE_COMPOSITION_RULE(c1)                                     \
1764   do {                                                                  \
1765     int rule = 0;                                                       \
1766     (c1) -= 32;                                                         \
1767     if (c1 < 81)                /* old format (before ver.21) */        \
1768       {                                                                 \
1769         int gref = (c1) / 9;                                            \
1770         int nref = (c1) % 9;                                            \
1771         if (gref == 4) gref = 10;                                       \
1772         if (nref == 4) nref = 10;                                       \
1773         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
1774       }                                                                 \
1775     else if (c1 < 93)           /* new format (after ver.21) */         \
1776       {                                                                 \
1777         ONE_MORE_BYTE (c2);                                             \
1778         rule = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);              \
1779       }                                                                 \
1780     CODING_ADD_COMPOSITION_COMPONENT (coding, rule);                    \
1781     coding->composition_rule_follows = 0;                               \
1782   } while (0)
1783
1784
1785 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1786
1787 static void
1788 decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
1789      struct coding_system *coding;
1790      unsigned char *source, *destination;
1791      int src_bytes, dst_bytes;
1792 {
1793   unsigned char *src = source;
1794   unsigned char *src_end = source + src_bytes;
1795   unsigned char *dst = destination;
1796   unsigned char *dst_end = destination + dst_bytes;
1797   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
1798   int charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1799   int charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1800   /* SRC_BASE remembers the start position in source in each loop.
1801      The loop will be exited when there's not enough source code
1802      (within macro ONE_MORE_BYTE), or when there's not enough
1803      destination area to produce a character (within macro
1804      EMIT_CHAR).  */
1805   unsigned char *src_base;
1806   int c, charset;
1807   Lisp_Object translation_table;
1808   Lisp_Object safe_chars;
1809
1810   safe_chars = coding_safe_chars (coding->symbol);
1811
1812   if (NILP (Venable_character_translation))
1813     translation_table = Qnil;
1814   else
1815     {
1816       translation_table = coding->translation_table_for_decode;
1817       if (NILP (translation_table))
1818         translation_table = Vstandard_translation_table_for_decode;
1819     }
1820
1821   coding->result = CODING_FINISH_NORMAL;
1822
1823   while (1)
1824     {
1825       int c1, c2;
1826
1827       src_base = src;
1828       ONE_MORE_BYTE (c1);
1829
1830       /* We produce no character or one character.  */
1831       switch (iso_code_class [c1])
1832         {
1833         case ISO_0x20_or_0x7F:
1834           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1835             {
1836               DECODE_COMPOSITION_RULE (c1);
1837               continue;
1838             }
1839           if (charset0 < 0 || CHARSET_CHARS (charset0) == 94)
1840             {
1841               /* This is SPACE or DEL.  */
1842               charset = CHARSET_ASCII;
1843               break;
1844             }
1845           /* This is a graphic character, we fall down ...  */
1846
1847         case ISO_graphic_plane_0:
1848           if (COMPOSING_P (coding) && coding->composition_rule_follows)
1849             {
1850               DECODE_COMPOSITION_RULE (c1);
1851               continue;
1852             }
1853           charset = charset0;
1854           break;
1855
1856         case ISO_0xA0_or_0xFF:
1857           if (charset1 < 0 || CHARSET_CHARS (charset1) == 94
1858               || coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
1859             goto label_invalid_code;
1860           /* This is a graphic character, we fall down ... */
1861
1862         case ISO_graphic_plane_1:
1863           if (charset1 < 0)
1864             goto label_invalid_code;
1865           charset = charset1;
1866           break;
1867
1868         case ISO_control_0:
1869           if (COMPOSING_P (coding))
1870             DECODE_COMPOSITION_END ('1');
1871
1872           /* All ISO2022 control characters in this class have the
1873              same representation in Emacs internal format.  */
1874           if (c1 == '\n'
1875               && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
1876               && (coding->eol_type == CODING_EOL_CR
1877                   || coding->eol_type == CODING_EOL_CRLF))
1878             {
1879               coding->result = CODING_FINISH_INCONSISTENT_EOL;
1880               goto label_end_of_loop;
1881             }
1882           charset = CHARSET_ASCII;
1883           break;
1884
1885         case ISO_control_1:
1886           if (COMPOSING_P (coding))
1887             DECODE_COMPOSITION_END ('1');
1888           goto label_invalid_code;
1889
1890         case ISO_carriage_return:
1891           if (COMPOSING_P (coding))
1892             DECODE_COMPOSITION_END ('1');
1893
1894           if (coding->eol_type == CODING_EOL_CR)
1895             c1 = '\n';
1896           else if (coding->eol_type == CODING_EOL_CRLF)
1897             {
1898               ONE_MORE_BYTE (c1);
1899               if (c1 != ISO_CODE_LF)
1900                 {
1901                   src--;
1902                   c1 = '\r';
1903                 }
1904             }
1905           charset = CHARSET_ASCII;
1906           break;
1907
1908         case ISO_shift_out:
1909           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1910               || CODING_SPEC_ISO_DESIGNATION (coding, 1) < 0)
1911             goto label_invalid_code;
1912           CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;
1913           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1914           continue;
1915
1916         case ISO_shift_in:
1917           if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
1918             goto label_invalid_code;
1919           CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
1920           charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1921           continue;
1922
1923         case ISO_single_shift_2_7:
1924         case ISO_single_shift_2:
1925           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1926             goto label_invalid_code;
1927           /* SS2 is handled as an escape sequence of ESC 'N' */
1928           c1 = 'N';
1929           goto label_escape_sequence;
1930
1931         case ISO_single_shift_3:
1932           if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
1933             goto label_invalid_code;
1934           /* SS3 is handled as an escape sequence of ESC 'O' */
1935           c1 = 'O';
1936           goto label_escape_sequence;
1937
1938         case ISO_control_sequence_introducer:
1939           /* CSI is handled as an escape sequence of ESC '[' ...  */
1940           c1 = '[';
1941           goto label_escape_sequence;
1942
1943         case ISO_escape:
1944           ONE_MORE_BYTE (c1);
1945         label_escape_sequence:
1946           /* Escape sequences handled by Emacs are invocation,
1947              designation, direction specification, and character
1948              composition specification.  */
1949           switch (c1)
1950             {
1951             case '&':           /* revision of following character set */
1952               ONE_MORE_BYTE (c1);
1953               if (!(c1 >= '@' && c1 <= '~'))
1954                 goto label_invalid_code;
1955               ONE_MORE_BYTE (c1);
1956               if (c1 != ISO_CODE_ESC)
1957                 goto label_invalid_code;
1958               ONE_MORE_BYTE (c1);
1959               goto label_escape_sequence;
1960
1961             case '$':           /* designation of 2-byte character set */
1962               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
1963                 goto label_invalid_code;
1964               ONE_MORE_BYTE (c1);
1965               if (c1 >= '@' && c1 <= 'B')
1966                 {       /* designation of JISX0208.1978, GB2312.1980,
1967                            or JISX0208.1980 */
1968                   DECODE_DESIGNATION (0, 2, 94, c1);
1969                 }
1970               else if (c1 >= 0x28 && c1 <= 0x2B)
1971                 {       /* designation of DIMENSION2_CHARS94 character set */
1972                   ONE_MORE_BYTE (c2);
1973                   DECODE_DESIGNATION (c1 - 0x28, 2, 94, c2);
1974                 }
1975               else if (c1 >= 0x2C && c1 <= 0x2F)
1976                 {       /* designation of DIMENSION2_CHARS96 character set */
1977                   ONE_MORE_BYTE (c2);
1978                   DECODE_DESIGNATION (c1 - 0x2C, 2, 96, c2);
1979                 }
1980               else
1981                 goto label_invalid_code;
1982               /* We must update these variables now.  */
1983               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1984               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
1985               continue;
1986
1987             case 'n':           /* invocation of locking-shift-2 */
1988               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1989                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
1990                 goto label_invalid_code;
1991               CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;
1992               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
1993               continue;
1994
1995             case 'o':           /* invocation of locking-shift-3 */
1996               if (! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT)
1997                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
1998                 goto label_invalid_code;
1999               CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;
2000               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2001               continue;
2002
2003             case 'N':           /* invocation of single-shift-2 */
2004               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2005                   || CODING_SPEC_ISO_DESIGNATION (coding, 2) < 0)
2006                 goto label_invalid_code;
2007               charset = CODING_SPEC_ISO_DESIGNATION (coding, 2);
2008               ONE_MORE_BYTE (c1);
2009               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2010                 goto label_invalid_code;
2011               break;
2012
2013             case 'O':           /* invocation of single-shift-3 */
2014               if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2015                   || CODING_SPEC_ISO_DESIGNATION (coding, 3) < 0)
2016                 goto label_invalid_code;
2017               charset = CODING_SPEC_ISO_DESIGNATION (coding, 3);
2018               ONE_MORE_BYTE (c1);
2019               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2020                 goto label_invalid_code;
2021               break;
2022
2023             case '0': case '2': case '3': case '4': /* start composition */
2024               DECODE_COMPOSITION_START (c1);
2025               continue;
2026
2027             case '1':           /* end composition */
2028               DECODE_COMPOSITION_END (c1);
2029               continue;
2030
2031             case '[':           /* specification of direction */
2032               if (coding->flags & CODING_FLAG_ISO_NO_DIRECTION)
2033                 goto label_invalid_code;
2034               /* For the moment, nested direction is not supported.
2035                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2036                  left-to-right, and nonzero means right-to-left.  */
2037               ONE_MORE_BYTE (c1);
2038               switch (c1)
2039                 {
2040                 case ']':       /* end of the current direction */
2041                   coding->mode &= ~CODING_MODE_DIRECTION;
2042
2043                 case '0':       /* end of the current direction */
2044                 case '1':       /* start of left-to-right direction */
2045                   ONE_MORE_BYTE (c1);
2046                   if (c1 == ']')
2047                     coding->mode &= ~CODING_MODE_DIRECTION;
2048                   else
2049                     goto label_invalid_code;
2050                   break;
2051
2052                 case '2':       /* start of right-to-left direction */
2053                   ONE_MORE_BYTE (c1);
2054                   if (c1 == ']')
2055                     coding->mode |= CODING_MODE_DIRECTION;
2056                   else
2057                     goto label_invalid_code;
2058                   break;
2059
2060                 default:
2061                   goto label_invalid_code;
2062                 }
2063               continue;
2064
2065             case '%':
2066               if (COMPOSING_P (coding))
2067                 DECODE_COMPOSITION_END ('1');
2068               ONE_MORE_BYTE (c1);
2069               if (c1 == '/')
2070                 {
2071                   /* CTEXT extended segment:
2072                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
2073                      We keep these bytes as is for the moment.
2074                      They may be decoded by post-read-conversion.  */
2075                   int dim, M, L;
2076                   int size, required;
2077                   int produced_chars;
2078
2079                   ONE_MORE_BYTE (dim);
2080                   ONE_MORE_BYTE (M);
2081                   ONE_MORE_BYTE (L);
2082                   size = ((M - 128) * 128) + (L - 128);
2083                   required = 8 + size * 2;
2084                   if (dst + required > (dst_bytes ? dst_end : src))
2085                     goto label_end_of_loop;
2086                   *dst++ = ISO_CODE_ESC;
2087                   *dst++ = '%';
2088                   *dst++ = '/';
2089                   *dst++ = dim;
2090                   produced_chars = 4;
2091                   dst += CHAR_STRING (M, dst), produced_chars++;
2092                   dst += CHAR_STRING (L, dst), produced_chars++;
2093                   while (size-- > 0)
2094                     {
2095                       ONE_MORE_BYTE (c1);
2096                       dst += CHAR_STRING (c1, dst), produced_chars++;
2097                     }
2098                   coding->produced_char += produced_chars;
2099                 }
2100               else if (c1 == 'G')
2101                 {
2102                   unsigned char *d = dst;
2103                   int produced_chars;
2104
2105                   /* XFree86 extension for embedding UTF-8 in CTEXT:
2106                      ESC % G --UTF-8-BYTES-- ESC % @
2107                      We keep these bytes as is for the moment.
2108                      They may be decoded by post-read-conversion.  */
2109                   if (d + 6 > (dst_bytes ? dst_end : src))
2110                     goto label_end_of_loop;
2111                   *d++ = ISO_CODE_ESC;
2112                   *d++ = '%';
2113                   *d++ = 'G';
2114                   produced_chars = 3;
2115                   while (d + 1 < (dst_bytes ? dst_end : src))
2116                     {
2117                       ONE_MORE_BYTE (c1);
2118                       if (c1 == ISO_CODE_ESC
2119                           && src + 1 < src_end
2120                           && src[0] == '%'
2121                           && src[1] == '@')
2122                         break;
2123                       d += CHAR_STRING (c1, d), produced_chars++;
2124                     }
2125                   if (d + 3 > (dst_bytes ? dst_end : src))
2126                     goto label_end_of_loop;
2127                   *d++ = ISO_CODE_ESC;
2128                   *d++ = '%';
2129                   *d++ = '@';
2130                   dst = d;
2131                   coding->produced_char += produced_chars + 3;
2132                 }
2133               else
2134                 goto label_invalid_code;
2135               continue;
2136
2137             default:
2138               if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION))
2139                 goto label_invalid_code;
2140               if (c1 >= 0x28 && c1 <= 0x2B)
2141                 {       /* designation of DIMENSION1_CHARS94 character set */
2142                   ONE_MORE_BYTE (c2);
2143                   DECODE_DESIGNATION (c1 - 0x28, 1, 94, c2);
2144                 }
2145               else if (c1 >= 0x2C && c1 <= 0x2F)
2146                 {       /* designation of DIMENSION1_CHARS96 character set */
2147                   ONE_MORE_BYTE (c2);
2148                   DECODE_DESIGNATION (c1 - 0x2C, 1, 96, c2);
2149                 }
2150               else
2151                 goto label_invalid_code;
2152               /* We must update these variables now.  */
2153               charset0 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 0);
2154               charset1 = CODING_SPEC_ISO_PLANE_CHARSET (coding, 1);
2155               continue;
2156             }
2157         }
2158
2159       /* Now we know CHARSET and 1st position code C1 of a character.
2160          Produce a multibyte sequence for that character while getting
2161          2nd position code C2 if necessary.  */
2162       if (CHARSET_DIMENSION (charset) == 2)
2163         {
2164           ONE_MORE_BYTE (c2);
2165           if (c1 < 0x80 ? c2 < 0x20 || c2 >= 0x80 : c2 < 0xA0)
2166             /* C2 is not in a valid range.  */
2167             goto label_invalid_code;
2168         }
2169       c = DECODE_ISO_CHARACTER (charset, c1, c2);
2170       EMIT_CHAR (c);
2171       continue;
2172
2173     label_invalid_code:
2174       coding->errors++;
2175       if (COMPOSING_P (coding))
2176         DECODE_COMPOSITION_END ('1');
2177       src = src_base;
2178       c = *src++;
2179       EMIT_CHAR (c);
2180     }
2181
2182  label_end_of_loop:
2183   coding->consumed = coding->consumed_char = src_base - source;
2184   coding->produced = dst - destination;
2185   return;
2186 }
2187
2188
2189 /* ISO2022 encoding stuff.  */
2190
2191 /*
2192    For encoding, it is not enough to say just "ISO2022"; we have to
2193    specify more details.  In Emacs, each ISO2022 coding system
2194    variant has the following specifications:
2195         1. Initial designation to G0 through G3.
2196         2. Allows short-form designation?
2197         3. ASCII should be designated to G0 before control characters?
2198         4. ASCII should be designated to G0 at end of line?
2199         5. 7-bit environment or 8-bit environment?
2200         6. Use locking-shift?
2201         7. Use Single-shift?
2202    And the following two are only for Japanese:
2203         8. Use ASCII in place of JIS0201-1976-Roman?
2204         9. Use JISX0208-1983 in place of JISX0208-1978?
2205    These specifications are encoded in `coding->flags' as flag bits
2206    defined by macros CODING_FLAG_ISO_XXX.  See `coding.h' for more
2207    details.
2208 */
2209
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST past the bytes written, and record the
   new designation in CODING.

   If CODING has a revision number below 255 for CHARSET, the
   sequence is preceded by `ESC & <'@' + revision>'.  The designation
   itself is ESC, then `$' for a charset whose dimension is not 1,
   then an intermediate byte selecting REG (one of "()*+" for a
   94-char set, one of ",-./" for a 96-char set), then the final
   byte of CHARSET.  The intermediate byte is left out (short form)
   only for a multi-dimension 94-char set designated to register 0
   whose final byte is '@', 'A', or 'B', and only when CODING has
   CODING_FLAG_ISO_SHORT_FORM set.

   The macro writes through the variable `dst' found at the expansion
   site and does no bounds checking of its own.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *const regs_94 = "()*+";                                 \
    const char *const regs_96 = ",-./";                                 \
    unsigned char final_byte = CHARSET_ISO_FINAL_CHAR (charset);        \
    int rev = CODING_SPEC_ISO_REVISION_NUMBER (coding, charset);        \
                                                                        \
    /* Announce the revision first when there is one.  */               \
    if (rev < 255)                                                      \
      {                                                                 \
        dst[0] = ISO_CODE_ESC;                                          \
        dst[1] = '&';                                                   \
        dst[2] = '@' + rev;                                             \
        dst += 3;                                                       \
      }                                                                 \
                                                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      *dst++ = '$';                                                     \
                                                                        \
    if (CHARSET_CHARS (charset) != 94)                                  \
      *dst++ = (unsigned char) regs_96[reg];                            \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || ! (coding->flags & CODING_FLAG_ISO_SHORT_FORM)          \
             || reg != 0                                                \
             || final_byte < '@' || final_byte > 'B')                   \
      /* Anything but the short-form case needs the intermediate.  */   \
      *dst++ = (unsigned char) regs_94[reg];                            \
                                                                        \
    *dst++ = final_byte;                                                \
    CODING_SPEC_ISO_DESIGNATION (coding, reg) = charset;                \
  } while (0)
2252
2253 /* The following two macros produce codes (control character or escape
2254    sequence) for ISO2022 single-shift functions (single-shift-2 and
2255    single-shift-3).  */
2256
/* Emit single-shift-2 at `dst': the single byte ISO_CODE_SS2, or
   `ESC N' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set.  Then
   mark `coding' as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_2                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS2;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'N';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2265
/* Emit single-shift-3 at `dst': the single byte ISO_CODE_SS3, or
   `ESC O' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set.  Then
   mark `coding' as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                   \
  do {                                                          \
    if (! (coding->flags & CODING_FLAG_ISO_SEVEN_BITS))         \
      *dst++ = ISO_CODE_SS3;                                    \
    else                                                        \
      {                                                         \
        *dst++ = ISO_CODE_ESC;                                  \
        *dst++ = 'O';                                           \
      }                                                         \
    CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 1;               \
  } while (0)
2274
2275 /* The following four macros produce codes (control character or
2276    escape sequence) for ISO2022 locking-shift functions (shift-in,
2277    shift-out, locking-shift-2, and locking-shift-3).  */
2278
/* Emit ISO_CODE_SI at `dst' and record in `coding' that graphic
   register 0 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;         \
    dst[0] = ISO_CODE_SI;                               \
    dst += 1;                                           \
  } while (0)
2284
/* Emit ISO_CODE_SO at `dst' and record in `coding' that graphic
   register 1 is now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 1;         \
    dst[0] = ISO_CODE_SO;                               \
    dst += 1;                                           \
  } while (0)
2290
/* Emit `ESC n' at `dst' and record in `coding' that graphic
   register 2 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'n';                                       \
    dst += 2;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 2;         \
  } while (0)
2296
/* Emit `ESC o' at `dst' and record in `coding' that graphic
   register 3 is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = 'o';                                       \
    dst += 2;                                           \
    CODING_SPEC_ISO_INVOCATION (coding, 0) = 3;         \
  } while (0)
2302
/* Emit at `dst' the one-byte code for position code C1 of the
   dimension-1 character set CHARSET.

   As long as no single shift is pending and CHARSET is invoked to
   neither graphic plane, encode_invocation_designation is called to
   emit whatever designation and invocation codes are needed.  After
   that the byte is written: with a pending single shift it is 7-bit
   or 8-bit according to CODING_FLAG_ISO_SEVEN_BITS and the pending
   state is cleared; otherwise it is 7-bit if CHARSET is in graphic
   plane 0 and 8-bit if it is in graphic plane 1.  Uses the variables
   `coding' and `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      /* CHARSET is not reachable yet; designate and/or invoke it,      \
         then look again.  */                                           \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          *dst++ = c1 & 0x7F;                                           \
        else                                                            \
          *dst++ = c1 | 0x80;                                           \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      *dst++ = c1 & 0x7F;                                               \
    else                                                                \
      *dst++ = c1 | 0x80;                                               \
  } while (0)
2335
/* Emit at `dst' the two-byte code for position codes C1 and C2 of
   the dimension-2 character set CHARSET.

   As long as no single shift is pending and CHARSET is invoked to
   neither graphic plane, encode_invocation_designation is called to
   emit whatever designation and invocation codes are needed.  After
   that both bytes are written: with a pending single shift they are
   7-bit or 8-bit according to CODING_FLAG_ISO_SEVEN_BITS and the
   pending state is cleared; otherwise they are 7-bit if CHARSET is
   in graphic plane 0 and 8-bit if it is in graphic plane 1.  Uses
   the variables `coding' and `dst' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    while (! CODING_SPEC_ISO_SINGLE_SHIFTING (coding)                   \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 0)      \
           && charset != CODING_SPEC_ISO_PLANE_CHARSET (coding, 1))     \
      /* CHARSET is not reachable yet; designate and/or invoke it,      \
         then look again.  */                                           \
      dst = encode_invocation_designation (charset, coding, dst);       \
                                                                        \
    if (CODING_SPEC_ISO_SINGLE_SHIFTING (coding))                       \
      {                                                                 \
        if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)                 \
          {                                                             \
            *dst++ = c1 & 0x7F;                                         \
            *dst++ = c2 & 0x7F;                                         \
          }                                                             \
        else                                                            \
          {                                                             \
            *dst++ = c1 | 0x80;                                         \
            *dst++ = c2 | 0x80;                                         \
          }                                                             \
        CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;                   \
      }                                                                 \
    else if (charset == CODING_SPEC_ISO_PLANE_CHARSET (coding, 0))      \
      {                                                                 \
        *dst++ = c1 & 0x7F;                                             \
        *dst++ = c2 & 0x7F;                                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        *dst++ = c1 | 0x80;                                             \
        *dst++ = c2 | 0x80;                                             \
      }                                                                 \
  } while (0)
2368
/* Encode character C at `dst'.

   C is split into a charset and position codes with SPLIT_CHAR.  If
   the charset is not defined, the position codes are written as
   they are (the second one only when it is not negative).
   Otherwise the character goes through the dimension-1 or
   dimension-2 encoder, after two charset substitutions controlled
   by `coding->flags': CODING_FLAG_ISO_USE_ROMAN replaces ASCII by
   charset_latin_jisx0201, and CODING_FLAG_ISO_USE_OLDJIS replaces
   charset_jisx0208 by charset_jisx0208_1978.  */
#define ENCODE_ISO_CHARACTER(c)                                         \
  do {                                                                  \
    int charset, c1, c2;                                                \
                                                                        \
    SPLIT_CHAR (c, charset, c1, c2);                                    \
    if (! CHARSET_DEFINED_P (charset))                                  \
      {                                                                 \
        *dst++ = c1;                                                    \
        if (c2 >= 0)                                                    \
          *dst++ = c2;                                                  \
      }                                                                 \
    else if (CHARSET_DIMENSION (charset) == 1)                          \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_ROMAN)                 \
            && charset == CHARSET_ASCII)                                \
          charset = charset_latin_jisx0201;                             \
        ENCODE_ISO_CHARACTER_DIMENSION1 (charset, c1);                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        if ((coding->flags & CODING_FLAG_ISO_USE_OLDJIS)                \
            && charset == charset_jisx0208)                             \
          charset = charset_jisx0208_1978;                              \
        ENCODE_ISO_CHARACTER_DIMENSION2 (charset, c1, c2);              \
      }                                                                 \
  } while (0)
2398
2399
/* Rather than encoding character C itself, encode
   CODING_REPLACEMENT_CHARACTER in its place: once, or twice when the
   width of C's charset is greater than 1.  */

#define ENCODE_UNSAFE_CHARACTER(c)                              \
  do {                                                          \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
    if (CHARSET_WIDTH (CHAR_CHARSET (c)) <= 1)                  \
      break;                                                    \
    /* A wide character gets a second replacement.  */          \
    ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER);        \
  } while (0)
2408
2409
2410 /* Produce designation and invocation codes at a place pointed by DST
2411    to use CHARSET.  The element `spec.iso2022' of *CODING is updated.
2412    Return new DST.  */
2413
2414 unsigned char *
2415 encode_invocation_designation (charset, coding, dst)
2416      int charset;
2417      struct coding_system *coding;
2418      unsigned char *dst;
2419 {
2420   int reg;                      /* graphic register number */
2421
2422   /* At first, check designations.  */
2423   for (reg = 0; reg < 4; reg++)
2424     if (charset == CODING_SPEC_ISO_DESIGNATION (coding, reg))
2425       break;
2426
2427   if (reg >= 4)
2428     {
2429       /* CHARSET is not yet designated to any graphic registers.  */
2430       /* At first check the requested designation.  */
2431       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2432       if (reg == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION)
2433         /* Since CHARSET requests no special designation, designate it
2434            to graphic register 0.  */
2435         reg = 0;
2436
2437       ENCODE_DESIGNATION (charset, reg, coding);
2438     }
2439
2440   if (CODING_SPEC_ISO_INVOCATION (coding, 0) != reg
2441       && CODING_SPEC_ISO_INVOCATION (coding, 1) != reg)
2442     {
2443       /* Since the graphic register REG is not invoked to any graphic
2444          planes, invoke it to graphic plane 0.  */
2445       switch (reg)
2446         {
2447         case 0:                 /* graphic register 0 */
2448           ENCODE_SHIFT_IN;
2449           break;
2450
2451         case 1:                 /* graphic register 1 */
2452           ENCODE_SHIFT_OUT;
2453           break;
2454
2455         case 2:                 /* graphic register 2 */
2456           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2457             ENCODE_SINGLE_SHIFT_2;
2458           else
2459             ENCODE_LOCKING_SHIFT_2;
2460           break;
2461
2462         case 3:                 /* graphic register 3 */
2463           if (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT)
2464             ENCODE_SINGLE_SHIFT_3;
2465           else
2466             ENCODE_LOCKING_SHIFT_3;
2467           break;
2468         }
2469     }
2470
2471   return dst;
2472 }
2473
/* Emit at `dst' the two bytes that stand for the encoded composition
   rule RULE.  RULE is split by COMPOSITION_DECODE_RULE into GREF and
   NREF; the bytes written are 32 + 81 + GREF and 32 + NREF.  */

#define ENCODE_COMPOSITION_RULE(rule)                   \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    COMPOSITION_DECODE_RULE (rule, gref, nref);         \
    dst[0] = 32 + 81 + gref;                            \
    dst[1] = 32 + nref;                                 \
    dst += 2;                                           \
  } while (0)
2483
/* Emit at `dst' the code that starts a composition sequence and set
   up CODING for it.  DATA points to an array of integers describing
   the composition; DATA[3] is stored as the composition method in
   `coding->composing' (see coding.h for the format of DATA).

   The code is `ESC 0' for COMPOSITION_RELATIVE, `ESC 3' for
   COMPOSITION_WITH_ALTCHARS, and `ESC 4' for any other method.  For
   the non-relative methods, the component index of CODING is set to
   four entries past `cmp_data_start' and the "rule follows" flag is
   cleared.  */

#define ENCODE_COMPOSITION_START(coding, data)                          \
  do {                                                                  \
    coding->composing = data[3];                                        \
    *dst++ = ISO_CODE_ESC;                                              \
    if (coding->composing != COMPOSITION_RELATIVE)                      \
      {                                                                 \
        if (coding->composing == COMPOSITION_WITH_ALTCHARS)             \
          *dst++ = '3';                                                 \
        else                                                            \
          *dst++ = '4';                                                 \
        coding->composition_rule_follows = 0;                           \
        coding->cmp_data_index = coding->cmp_data_start + 4;            \
      }                                                                 \
    else                                                                \
      *dst++ = '0';                                                     \
  } while (0)
2503
/* Emit `ESC 1' at `dst' to end the current composition, and advance
   CODING past the composition's data: `cmp_data_start' grows by
   DATA[0] and the composition method becomes COMPOSITION_NO.  If
   that reaches the used size of the current composition data block
   and another block follows, CODING moves on to the start of that
   next block.  */

#define ENCODE_COMPOSITION_END(coding, data)                    \
  do {                                                          \
    dst[0] = ISO_CODE_ESC;                                      \
    dst[1] = '1';                                               \
    dst += 2;                                                   \
    coding->composing = COMPOSITION_NO;                         \
    coding->cmp_data_start += data[0];                          \
    if (coding->cmp_data->next                                  \
        && coding->cmp_data_start == coding->cmp_data->used)    \
      {                                                         \
        coding->cmp_data_start = 0;                             \
        coding->cmp_data = coding->cmp_data->next;              \
      }                                                         \
  } while (0)
2519
/* Emit the composition start sequence `ESC 0' at `dst' and switch
   CODING to COMPOSITION_RELATIVE.  This does not begin a new
   composition: it marks the point where the components (alternate
   chars and composition rules) of the current composition have all
   been produced and the actual text follows in SRC.  */

#define ENCODE_COMPOSITION_FAKE_START(coding)           \
  do {                                                  \
    coding->composing = COMPOSITION_RELATIVE;           \
    dst[0] = ISO_CODE_ESC;                              \
    dst[1] = '0';                                       \
    dst += 2;                                           \
  } while (0)
2531
2532 /* The following three macros produce codes for indicating direction
2533    of text.  */
/* Emit a control sequence introducer at `dst': the 7-bit form
   `ESC [' when `coding' has CODING_FLAG_ISO_SEVEN_BITS set,
   otherwise the single byte ISO_CODE_CSI.

   The flag is tested as a bit rather than compared with the whole
   flag word, so a 7-bit coding system that has other flags set as
   well still gets the 7-bit form.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER              \
  do {                                                  \
    if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)     \
      {                                                 \
        *dst++ = ISO_CODE_ESC;                          \
        *dst++ = '[';                                   \
      }                                                 \
    else                                                \
      *dst++ = ISO_CODE_CSI;                            \
  } while (0)
2541
/* Emit at `dst' the code that starts right-to-left text: a control
   sequence introducer followed by `2' and `]'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is a `do ... while (0)'
   statement macro without parameters, so it cannot be an operand of
   a comma expression; this macro is therefore a statement too and
   must be used as `ENCODE_DIRECTION_R2L;'.  */
#define ENCODE_DIRECTION_R2L                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '2';                               \
    *dst++ = ']';                               \
  } while (0)
2544
/* Emit at `dst' the code that starts left-to-right text: a control
   sequence introducer followed by `0' and `]'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is a `do ... while (0)'
   statement macro without parameters, so it cannot be an operand of
   a comma expression; this macro is therefore a statement too and
   must be used as `ENCODE_DIRECTION_L2R;'.  */
#define ENCODE_DIRECTION_L2R                    \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    *dst++ = '0';                               \
    *dst++ = ']';                               \
  } while (0)
2547
/* Emit at `dst' the codes that bring the graphic planes and
   registers of `coding' back to their initial state: a shift-in if
   graphic plane 0 does not have register 0 invoked, then, for each
   of the four registers that has an initial designation (a value
   that is not negative) different from its current one, the
   designation of that initial charset.  */
#define ENCODE_RESET_PLANE_AND_REGISTER                                     \
  do {                                                                      \
    int reg;                                                                \
                                                                            \
    if (CODING_SPEC_ISO_INVOCATION (coding, 0) != 0)                        \
      ENCODE_SHIFT_IN;                                                      \
    for (reg = 0; reg < 4; reg++)                                           \
      {                                                                     \
        /* Registers without an initial designation are left alone.  */     \
        if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg) < 0)          \
          continue;                                                         \
        if (CODING_SPEC_ISO_DESIGNATION (coding, reg)                       \
            == CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg))           \
          continue;                                                         \
        ENCODE_DESIGNATION                                                  \
          (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, reg), reg, coding); \
      }                                                                     \
  } while (0)
2562
2563 /* Produce designation sequences of charsets in the line started from
2564    SRC to a place pointed by DST, and return updated DST.
2565
2566    If the current block ends before any end-of-line, we may fail to
2567    find all the necessary designations.  */
2568
2569 static unsigned char *
2570 encode_designation_at_bol (coding, translation_table, src, src_end, dst)
2571      struct coding_system *coding;
2572      Lisp_Object translation_table;
2573      unsigned char *src, *src_end, *dst;
2574 {
2575   int charset, c, found = 0, reg;
2576   /* Table of charsets to be designated to each graphic register.  */
2577   int r[4];
2578
2579   for (reg = 0; reg < 4; reg++)
2580     r[reg] = -1;
2581
2582   while (found < 4)
2583     {
2584       ONE_MORE_CHAR (c);
2585       if (c == '\n')
2586         break;
2587
2588       charset = CHAR_CHARSET (c);
2589       reg = CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset);
2590       if (reg != CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION && r[reg] < 0)
2591         {
2592           found++;
2593           r[reg] = charset;
2594         }
2595     }
2596
2597  label_end_of_loop:
2598   if (found)
2599     {
2600       for (reg = 0; reg < 4; reg++)
2601         if (r[reg] >= 0
2602             && CODING_SPEC_ISO_DESIGNATION (coding, reg) != r[reg])
2603           ENCODE_DESIGNATION (r[reg], reg, coding);
2604     }
2605
2606   return dst;
2607 }
2608
2609 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
2610
2611 static void
2612 encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes)
2613      struct coding_system *coding;
2614      unsigned char *source, *destination;
2615      int src_bytes, dst_bytes;
2616 {
2617   unsigned char *src = source;
2618   unsigned char *src_end = source + src_bytes;
2619   unsigned char *dst = destination;
2620   unsigned char *dst_end = destination + dst_bytes;
2621   /* Since the maximum bytes produced by each loop is 20, we subtract 19
2622      from DST_END to assure overflow checking is necessary only at the
2623      head of loop.  */
2624   unsigned char *adjusted_dst_end = dst_end - 19;
2625   /* SRC_BASE remembers the start position in source in each loop.
2626      The loop will be exited when there's not enough source text to
2627      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
2628      there's not enough destination area to produce encoded codes
2629      (within macro EMIT_BYTES).  */
2630   unsigned char *src_base;
2631   int c;
2632   Lisp_Object translation_table;
2633   Lisp_Object safe_chars;
2634
2635   if (coding->flags & CODING_FLAG_ISO_SAFE)
2636     coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
2637
2638   safe_chars = coding_safe_chars (coding->symbol);
2639
2640   if (NILP (Venable_character_translation))
2641     translation_table = Qnil;
2642   else
2643     {
2644       translation_table = coding->translation_table_for_encode;
2645       if (NILP (translation_table))
2646         translation_table = Vstandard_translation_table_for_encode;
2647     }
2648
2649   coding->consumed_char = 0;
2650   coding->errors = 0;
2651   while (1)
2652     {
2653       src_base = src;
2654
2655       if (dst >= (dst_bytes ? adjusted_dst_end : (src - 19)))
2656         {
2657           coding->result = CODING_FINISH_INSUFFICIENT_DST;
2658           break;
2659         }
2660
2661       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL
2662           && CODING_SPEC_ISO_BOL (coding))
2663         {
2664           /* We have to produce designation sequences if any now.  */
2665           dst = encode_designation_at_bol (coding, translation_table,
2666                                            src, src_end, dst);
2667           CODING_SPEC_ISO_BOL (coding) = 0;
2668         }
2669
2670       /* Check composition start and end.  */
2671       if (coding->composing != COMPOSITION_DISABLED
2672           && coding->cmp_data_start < coding->cmp_data->used)
2673         {
2674           struct composition_data *cmp_data = coding->cmp_data;
2675           int *data = cmp_data->data + coding->cmp_data_start;
2676           int this_pos = cmp_data->char_offset + coding->consumed_char;
2677
2678           if (coding->composing == COMPOSITION_RELATIVE)
2679             {
2680               if (this_pos == data[2])
2681                 {
2682                   ENCODE_COMPOSITION_END (coding, data);
2683                   cmp_data = coding->cmp_data;
2684                   data = cmp_data->data + coding->cmp_data_start;
2685                 }
2686             }
2687           else if (COMPOSING_P (coding))
2688             {
2689               /* COMPOSITION_WITH_ALTCHARS or COMPOSITION_WITH_RULE_ALTCHAR  */
2690               if (coding->cmp_data_index == coding->cmp_data_start + data[0])
2691                 /* We have consumed components of the composition.
2692                    What follows in SRC is the composition's base
2693                    text.  */
2694                 ENCODE_COMPOSITION_FAKE_START (coding);
2695               else
2696                 {
2697                   int c = cmp_data->data[coding->cmp_data_index++];
2698                   if (coding->composition_rule_follows)
2699                     {
2700                       ENCODE_COMPOSITION_RULE (c);
2701                       coding->composition_rule_follows = 0;
2702                     }
2703                   else
2704                     {
2705                       if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2706                           && ! CODING_SAFE_CHAR_P (safe_chars, c))
2707                         ENCODE_UNSAFE_CHARACTER (c);
2708                       else
2709                         ENCODE_ISO_CHARACTER (c);
2710                       if (coding->composing == COMPOSITION_WITH_RULE_ALTCHARS)
2711                         coding->composition_rule_follows = 1;
2712                     }
2713                   continue;
2714                 }
2715             }
2716           if (!COMPOSING_P (coding))
2717             {
2718               if (this_pos == data[1])
2719                 {
2720                   ENCODE_COMPOSITION_START (coding, data);
2721                   continue;
2722                 }
2723             }
2724         }
2725
2726       ONE_MORE_CHAR (c);
2727
2728       /* Now encode the character C.  */
2729       if (c < 0x20 || c == 0x7F)
2730         {
2731           if (c == '\r')
2732             {
2733               if (! (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
2734                 {
2735                   if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2736                     ENCODE_RESET_PLANE_AND_REGISTER;
2737                   *dst++ = c;
2738                   continue;
2739                 }
2740               /* fall down to treat '\r' as '\n' ...  */
2741               c = '\n';
2742             }
2743           if (c == '\n')
2744             {
2745               if (coding->flags & CODING_FLAG_ISO_RESET_AT_EOL)
2746                 ENCODE_RESET_PLANE_AND_REGISTER;
2747               if (coding->flags & CODING_FLAG_ISO_INIT_AT_BOL)
2748                 bcopy (coding->spec.iso2022.initial_designation,
2749                        coding->spec.iso2022.current_designation,
2750                        sizeof coding->spec.iso2022.initial_designation);
2751               if (coding->eol_type == CODING_EOL_LF
2752                   || coding->eol_type == CODING_EOL_UNDECIDED)
2753                 *dst++ = ISO_CODE_LF;
2754               else if (coding->eol_type == CODING_EOL_CRLF)
2755                 *dst++ = ISO_CODE_CR, *dst++ = ISO_CODE_LF;
2756               else
2757                 *dst++ = ISO_CODE_CR;
2758               CODING_SPEC_ISO_BOL (coding) = 1;
2759             }
2760           else
2761             {
2762               if (coding->flags & CODING_FLAG_ISO_RESET_AT_CNTL)
2763                 ENCODE_RESET_PLANE_AND_REGISTER;
2764               *dst++ = c;
2765             }
2766         }
2767       else if (ASCII_BYTE_P (c))
2768         ENCODE_ISO_CHARACTER (c);
2769       else if (SINGLE_BYTE_CHAR_P (c))
2770         {
2771           *dst++ = c;
2772           coding->errors++;
2773         }
2774       else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR
2775                && ! CODING_SAFE_CHAR_P (safe_chars, c))
2776         ENCODE_UNSAFE_CHARACTER (c);
2777       else
2778         ENCODE_ISO_CHARACTER (c);
2779
2780       coding->consumed_char++;
2781     }
2782
2783  label_end_of_loop:
2784   coding->consumed = src_base - source;
2785   coding->produced = coding->produced_char = dst - destination;
2786 }
2787
2788 \f
2789 /*** 4. SJIS and BIG5 handlers ***/
2790
2791 /* Although SJIS and BIG5 are not ISO coding systems, they are used
2792    quite widely.  So, for the moment, Emacs supports them in the bare
2793    C code.  But, in the future, they may be supported only by CCL.  */
2794
2795 /* SJIS is a coding system encoding three character sets: ASCII, right
2796    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
2797    as is.  A character of charset katakana-jisx0201 is encoded by
2798    "position-code + 0x80".  A character of charset japanese-jisx0208
2799    is encoded in two bytes, with its two position-codes split and
2800    shifted so that they fit in the ranges below.
2801
2802    --- CODE RANGE of SJIS ---
2803    (character set)      (range)
2804    ASCII                0x00 .. 0x7F
2805    KATAKANA-JISX0201    0xA1 .. 0xDF
2806    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
2807             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
2808    -------------------------------
2809
2810 */
2811
2812 /* BIG5 is a coding system encoding two character sets: ASCII and
2813    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
2814    character set and is encoded in two bytes.
2815
2816    --- CODE RANGE of BIG5 ---
2817    (character set)      (range)
2818    ASCII                0x00 .. 0x7F
2819    Big5 (1st byte)      0xA1 .. 0xFE
2820         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
2821    --------------------------
2822
2823    Since Big5 has more characters than the maximum an Emacs charset
2824    can hold (96x96), it can't be handled as one charset.  So, in
2825    Emacs, Big5 is divided into two: `charset-big5-1'
2826    and `charset-big5-2'.  Both are DIMENSION2 and CHARS94.  The former
2827    contains frequently used characters and the latter contains less
2828    frequently used characters.  */
2829
2830 /* Macros to decode or encode a character of Big5 in BIG5.  B1 and B2
2831    are the 1st and 2nd bytes of the character in BIG5 coding system.
2832    C1 and C2 are the 1st and 2nd position-codes of Emacs' internal
2833    format.  CHARSET is `charset_big5_1' or `charset_big5_2'.  Callers
2834    may pass the same variables as input and output arguments.  */
2835 /* Number of Big5 characters which have the same code in 1st byte.  */
2836 #define BIG5_SAME_ROW (0xFF - 0xA1 + 0x7F - 0x40)
2837
2838 #define DECODE_BIG5(b1, b2, charset, c1, c2)                            \
2839   do {                                                                  \
2840     unsigned int temp                                                   \
2841       = ((b1) - 0xA1) * BIG5_SAME_ROW + (b2) - ((b2) < 0x7F ? 0x40 : 0x62); \
2842     if ((b1) < 0xC9)                                                    \
2843       charset = charset_big5_1;                                         \
2844     else                                                                \
2845       {                                                                 \
2846         charset = charset_big5_2;                                       \
2847         temp -= (0xC9 - 0xA1) * BIG5_SAME_ROW;                          \
2848       }                                                                 \
2849     c1 = temp / (0xFF - 0xA1) + 0x21;                                   \
2850     c2 = temp % (0xFF - 0xA1) + 0x21;                                   \
2851   } while (0)
2852
2853 #define ENCODE_BIG5(charset, c1, c2, b1, b2)                            \
2854   do {                                                                  \
2855     unsigned int temp = ((c1) - 0x21) * (0xFF - 0xA1) + ((c2) - 0x21);  \
2856     if ((charset) == charset_big5_2)                                    \
2857       temp += BIG5_SAME_ROW * (0xC9 - 0xA1);                            \
2858     b1 = temp / BIG5_SAME_ROW + 0xA1;                                   \
2859     b2 = temp % BIG5_SAME_ROW;                                          \
2860     b2 += b2 < 0x3F ? 0x40 : 0x62;                                      \
2861   } while (0)
2862
2863 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2864    Scan the bytes from SRC and check that they fit the SJIS code
2865    ranges.  Return CODING_CATEGORY_MASK_SJIS if so, else return 0.  */
2866
2867 static int
2868 detect_coding_sjis (src, src_end, multibytep)
2869      unsigned char *src, *src_end;
2870      int multibytep;
2871 {
2872   int c;
2873   /* Dummy; presumably the byte-reading macro below refers to `coding'.  */
2874   struct coding_system dummy_coding;
2875   struct coding_system *coding = &dummy_coding;
2876   /* Each pass checks one ASCII byte, 1-byte kana, or 2-byte code.  */
2877   while (1)
2878     {
2879       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2880       if (c < 0x80)             /* ASCII: nothing more to check.  */
2881         continue;
2882       if (c == 0x80 || c == 0xA0 || c > 0xEF)
2883         return 0;               /* Outside every SJIS range.  */
2884       if (c <= 0x9F || c >= 0xE0) /* 1st byte of a 2-byte code.  */
2885         {
2886           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2887           if (c < 0x40 || c == 0x7F || c > 0xFC)
2888             return 0;           /* 2nd byte must be 0x40..0x7E or 0x80..0xFC.  */
2889         }
2890     }
2891  label_end_of_loop:        /* No visible goto; presumably the macro jumps here at end of source.  */
2892   return CODING_CATEGORY_MASK_SJIS;
2893 }
2894
2895 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2896    Check if a text is encoded in BIG5 (1st byte 0xA1..0xFE, 2nd byte
2897    0x40..0x7E or 0xA1..0xFE).  Return CODING_CATEGORY_MASK_BIG5 or 0.  */
2898
2899 static int
2900 detect_coding_big5 (src, src_end, multibytep)
2901      unsigned char *src, *src_end;
2902      int multibytep;
2903 {
2904   int c;
2905   /* Dummy; presumably the byte-reading macro below refers to `coding'.  */
2906   struct coding_system dummy_coding;
2907   struct coding_system *coding = &dummy_coding;
2908
2909   while (1)
2910     {
2911       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2912       if (c < 0x80)             /* ASCII: nothing more to check.  */
2913         continue;
2914       if (c < 0xA1 || c > 0xFE) /* Not a valid 1st byte.  */
2915         return 0;
2916       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2917       if (c < 0x40 || (c > 0x7E && c < 0xA1) || c > 0xFE)
2918         return 0;               /* 0x7F is rejected, as in the BIG5 decoder.  */
2919     }
2920  label_end_of_loop:        /* No visible goto; presumably the macro jumps here at end of source.  */
2921   return CODING_CATEGORY_MASK_BIG5;
2922 }
2923
2924 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2925    Check if a text is encoded in UTF-8: each lead octet must be
2926    followed by the number of extra octets it announces.  If it is,
2927    return CODING_CATEGORY_MASK_UTF_8, else return 0.  */
2928 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)           /* 0xxxxxxx */
2929 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80) /* 10xxxxxx */
2930 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) /* 110xxxxx */
2931 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) /* 1110xxxx */
2932 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) /* 11110xxx */
2933 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) /* 111110xx */
2934 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC) /* 1111110x */
2935
2936 static int
2937 detect_coding_utf_8 (src, src_end, multibytep)
2938      unsigned char *src, *src_end;
2939      int multibytep;
2940 {
2941   unsigned char c;
2942   int seq_maybe_bytes;      /* Extra octets still expected after a lead octet.  */
2943   /* Dummy; presumably the byte-reading macro below refers to `coding'.  */
2944   struct coding_system dummy_coding;
2945   struct coding_system *coding = &dummy_coding;
2946
2947   while (1)
2948     {
2949       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2950       if (UTF_8_1_OCTET_P (c))
2951         continue;
2952       else if (UTF_8_2_OCTET_LEADING_P (c))
2953         seq_maybe_bytes = 1;
2954       else if (UTF_8_3_OCTET_LEADING_P (c))
2955         seq_maybe_bytes = 2;
2956       else if (UTF_8_4_OCTET_LEADING_P (c))
2957         seq_maybe_bytes = 3;
2958       else if (UTF_8_5_OCTET_LEADING_P (c))
2959         seq_maybe_bytes = 4;
2960       else if (UTF_8_6_OCTET_LEADING_P (c))
2961         seq_maybe_bytes = 5;
2962       else
2963         return 0;               /* A stray extra octet, or 0xFE/0xFF.  */
2964       /* SEQ_MAYBE_BYTES is at least 1 here, so do-while is safe.  */
2965       do
2966         {
2967           ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
2968           if (!UTF_8_EXTRA_OCTET_P (c))
2969             return 0;
2970           seq_maybe_bytes--;
2971         }
2972       while (seq_maybe_bytes > 0);
2973     }
2974
2975  label_end_of_loop:        /* No visible goto; presumably the macro jumps here at end of source.  */
2976   return CODING_CATEGORY_MASK_UTF_8;
2977 }
2978
2979 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2980    Check if a text starts with a UTF-16 byte order mark.  If the
2981    first two bytes are FE FF, return CODING_CATEGORY_MASK_UTF_16_BE;
2982    if they are FF FE, return CODING_CATEGORY_MASK_UTF_16_LE;
2983    else return 0.  */
2984
2985 /* Nonzero if VAL is one of the code values 0xFFFE or 0xFFFF.  */
2986 #define UTF_16_INVALID_P(val)   \
2987   (((val) == 0xFFFE) || ((val) == 0xFFFF))
2988 /* Nonzero if VAL is a high (leading) surrogate, 0xD800..0xDBFF.  */
2989 #define UTF_16_HIGH_SURROGATE_P(val) \
2990   (((val) & 0xFC00) == 0xD800)
2991 /* Nonzero if VAL is a low (trailing) surrogate, 0xDC00..0xDFFF.  */
2992 #define UTF_16_LOW_SURROGATE_P(val) \
2993   (((val) & 0xFC00) == 0xDC00)
2994 /* Detect UTF-16 from a leading byte order mark; return 0 without one.  */
2995 static int
2996 detect_coding_utf_16 (src, src_end, multibytep)
2997      unsigned char *src, *src_end;
2998      int multibytep;
2999 {
3000   unsigned char c1, c2;
3001   /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE.  */
3002   struct coding_system dummy_coding;
3003   struct coding_system *coding = &dummy_coding;
3004   /* Only the first two bytes of the text are examined.  */
3005   ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep);
3006   ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep);
3007
3008   if ((c1 == 0xFF) && (c2 == 0xFE))
3009     return CODING_CATEGORY_MASK_UTF_16_LE;      /* BOM stored as FF FE.  */
3010   else if ((c1 == 0xFE) && (c2 == 0xFF))
3011     return CODING_CATEGORY_MASK_UTF_16_BE;      /* BOM stored as FE FF.  */
3012
3013  label_end_of_loop:        /* Also presumably the macro's target when fewer than two bytes exist.  */
3014   return 0;
3015 }
3016
3017 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3018    If SJIS_P is nonzero, decode SJIS text, else decode BIG5 text.  An
3019    invalid code bumps CODING->errors and its first byte is emitted as is.  */
3020 static void
3021 decode_coding_sjis_big5 (coding, source, destination,
3022                          src_bytes, dst_bytes, sjis_p)
3023      struct coding_system *coding;
3024      unsigned char *source, *destination;
3025      int src_bytes, dst_bytes;
3026      int sjis_p;
3027 {
3028   unsigned char *src = source;
3029   unsigned char *src_end = source + src_bytes;
3030   unsigned char *dst = destination;
3031   unsigned char *dst_end = destination + dst_bytes;
3032   /* SRC_BASE remembers the start position in source in each loop.
3033      The loop will be exited when there's not enough source code
3034      (within macro ONE_MORE_BYTE), or when there's not enough
3035      destination area to produce a character (within macro
3036      EMIT_CHAR).  */
3037   unsigned char *src_base;
3038   Lisp_Object translation_table; /* Not read here; presumably used by the macros.  */
3039
3040   if (NILP (Venable_character_translation))
3041     translation_table = Qnil;
3042   else
3043     {
3044       translation_table = coding->translation_table_for_decode;
3045       if (NILP (translation_table))
3046         translation_table = Vstandard_translation_table_for_decode;
3047     }
3048
3049   coding->produced_char = 0;
3050   while (1)
3051     {
3052       int c, charset, c1, c2;
3053
3054       src_base = src;
3055       ONE_MORE_BYTE (c1);
3056
3057       if (c1 < 0x80)
3058         {
3059           charset = CHARSET_ASCII;
3060           if (c1 < 0x20)
3061             {
3062               if (c1 == '\r')
3063                 {
3064                   if (coding->eol_type == CODING_EOL_CRLF)
3065                     {
3066                       ONE_MORE_BYTE (c2);
3067                       if (c2 == '\n')
3068                         c1 = c2;
3069                       else
3070                         /* To process C2 again, SRC is subtracted by 1.  */
3071                         src--;
3072                     }
3073                   else if (coding->eol_type == CODING_EOL_CR)
3074                     c1 = '\n';
3075                 }
3076               else if (c1 == '\n'
3077                        && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3078                        && (coding->eol_type == CODING_EOL_CR
3079                            || coding->eol_type == CODING_EOL_CRLF))
3080                 {
3081                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3082                   goto label_end_of_loop;
3083                 }
3084             }
3085         }
3086       else
3087         {
3088           if (sjis_p)
3089             {
3090               if (c1 == 0x80 || c1 == 0xA0 || c1 > 0xEF)
3091                 goto label_invalid_code;
3092               if (c1 <= 0x9F || c1 >= 0xE0)
3093                 {
3094                   /* SJIS -> JISX0208 */
3095                   ONE_MORE_BYTE (c2);
3096                   if (c2 < 0x40 || c2 == 0x7F || c2 > 0xFC)
3097                     goto label_invalid_code;
3098                   DECODE_SJIS (c1, c2, c1, c2);
3099                   charset = charset_jisx0208;
3100                 }
3101               else
3102                 /* SJIS -> JISX0201-Kana */
3103                 charset = charset_katakana_jisx0201;
3104             }
3105           else
3106             {
3107               /* BIG5 -> Big5.  1st byte is 0xA1..0xFE; 0xA0 would underflow in DECODE_BIG5.  */
3108               if (c1 < 0xA1 || c1 > 0xFE)
3109                 goto label_invalid_code;
3110               ONE_MORE_BYTE (c2);
3111               if (c2 < 0x40 || (c2 > 0x7E && c2 < 0xA1) || c2 > 0xFE)
3112                 goto label_invalid_code;
3113               DECODE_BIG5 (c1, c2, charset, c1, c2);
3114             }
3115         }
3116
3117       c = DECODE_ISO_CHARACTER (charset, c1, c2);
3118       EMIT_CHAR (c);
3119       continue;
3120
3121     label_invalid_code:
3122       coding->errors++;
3123       src = src_base;           /* Rewind, then pass one raw byte through.  */
3124       c = *src++;
3125       EMIT_CHAR (c);
3126     }
3127
3128  label_end_of_loop:
3129   coding->consumed = coding->consumed_char = src_base - source;
3130   coding->produced = dst - destination;
3131   return;
3132 }
3133
3134 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3135    This function can encode charsets `ascii', `katakana-jisx0201',
3136    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3137    are sure that all these charsets are registered as official charset
3138    (i.e. do not have extended leading-codes).  Characters of other
3139    charsets are produced without any encoding, or replaced by
3140    CODING_REPLACEMENT_CHARACTER if CODING_MODE_INHIBIT_UNENCODABLE_CHAR
3141    is set.  If SJIS_P is nonzero, encode SJIS text, else BIG5 text.  */
3142 static void
3143 encode_coding_sjis_big5 (coding, source, destination,
3144                          src_bytes, dst_bytes, sjis_p)
3145      struct coding_system *coding;
3146      unsigned char *source, *destination;
3147      int src_bytes, dst_bytes;
3148      int sjis_p;
3149 {
3150   unsigned char *src = source;
3151   unsigned char *src_end = source + src_bytes;
3152   unsigned char *dst = destination;
3153   unsigned char *dst_end = destination + dst_bytes;
3154   /* SRC_BASE remembers the start position in source in each loop.
3155      The loop will be exited when there's not enough source text to
3156      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3157      there's not enough destination area to produce encoded codes
3158      (within macro EMIT_BYTES).  */
3159   unsigned char *src_base;
3160   Lisp_Object translation_table; /* Not read here; presumably used by the macros.  */
3161
3162   if (NILP (Venable_character_translation))
3163     translation_table = Qnil;
3164   else
3165     {
3166       translation_table = coding->translation_table_for_encode;
3167       if (NILP (translation_table))
3168         translation_table = Vstandard_translation_table_for_encode;
3169     }
3170
3171   while (1)
3172     {
3173       int c, charset, c1, c2;
3174
3175       src_base = src;
3176       ONE_MORE_CHAR (c);
3177
3178       /* Now encode the character C.  */
3179       if (SINGLE_BYTE_CHAR_P (c))
3180         {
3181           switch (c)
3182             {
3183             case '\r':
3184               if (!(coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
3185                 {
3186                   EMIT_ONE_BYTE (c);
3187                   break;
3188                 }
3189               c = '\n';         /* Selective display: treat as '\n'; fall through.  */
3190             case '\n':
3191               if (coding->eol_type == CODING_EOL_CRLF)
3192                 {
3193                   EMIT_TWO_BYTES ('\r', c);
3194                   break;
3195                 }
3196               else if (coding->eol_type == CODING_EOL_CR)
3197                 c = '\r';       /* Fall through to emit C.  */
3198             default:
3199               EMIT_ONE_BYTE (c);
3200             }
3201         }
3202       else
3203         {
3204           SPLIT_CHAR (c, charset, c1, c2);
3205           if (sjis_p)
3206             {
3207               if (charset == charset_jisx0208
3208                   || charset == charset_jisx0208_1978)
3209                 {
3210                   ENCODE_SJIS (c1, c2, c1, c2);
3211                   EMIT_TWO_BYTES (c1, c2);
3212                 }
3213               else if (charset == charset_katakana_jisx0201)
3214                 EMIT_ONE_BYTE (c1 | 0x80);
3215               else if (charset == charset_latin_jisx0201)
3216                 EMIT_ONE_BYTE (c1);
3217               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3218                 {
3219                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3220                   if (CHARSET_WIDTH (charset) > 1) /* One replacement per column.  */
3221                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3222                 }
3223               else
3224                 /* There's no way other than producing the internal
3225                    codes as is.  */
3226                 EMIT_BYTES (src_base, src);
3227             }
3228           else
3229             {
3230               if (charset == charset_big5_1 || charset == charset_big5_2)
3231                 {
3232                   ENCODE_BIG5 (charset, c1, c2, c1, c2);
3233                   EMIT_TWO_BYTES (c1, c2);
3234                 }
3235               else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR)
3236                 {
3237                   EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3238                   if (CHARSET_WIDTH (charset) > 1) /* One replacement per column.  */
3239                     EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER);
3240                 }
3241               else
3242                 /* There's no way other than producing the internal
3243                    codes as is.  */
3244                 EMIT_BYTES (src_base, src);
3245             }
3246         }
3247       coding->consumed_char++;
3248     }
3249
3250  label_end_of_loop:
3251   coding->consumed = src_base - source;
3252   coding->produced = coding->produced_char = dst - destination;
3253 }
3254
3255 \f
3256 /*** 5. CCL handlers ***/
3257
3258 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3259    Check if a text is encoded in a coding system of which
3260    encoder/decoder are written in CCL program.  If it is, return
3261    CODING_CATEGORY_MASK_CCL, else return 0.  */
3262
3263 static int
3264 detect_coding_ccl (src, src_end, multibytep)
3265      unsigned char *src, *src_end;
3266      int multibytep;
3267 {
3268   unsigned char *valid;     /* Indexed by byte value; zero marks a byte that rules CCL out.  */
3269   int c;
3270   /* Dummy; presumably the byte-reading macro below refers to `coding'.  */
3271   struct coding_system dummy_coding;
3272   struct coding_system *coding = &dummy_coding;
3273
3274   /* No coding system is assigned to coding-category-ccl.  */
3275   if (!coding_system_table[CODING_CATEGORY_IDX_CCL])
3276     return 0;
3277
3278   valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes;
3279   while (1)
3280     {
3281       ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep);
3282       if (! valid[c])
3283         return 0;
3284     }
3285  label_end_of_loop:        /* No visible goto; presumably the macro jumps here at end of source.  */
3286   return CODING_CATEGORY_MASK_CCL;
3287 }
3288
3289 \f
3290 /*** 6. End-of-line handlers ***/
3291
3292 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3293    Convert CRLF or CR in SOURCE to '\n' according to CODING->eol_type.  */
3294 static void
3295 decode_eol (coding, source, destination, src_bytes, dst_bytes)
3296      struct coding_system *coding;
3297      unsigned char *source, *destination;
3298      int src_bytes, dst_bytes;
3299 {
3300   unsigned char *src = source;
3301   unsigned char *dst = destination;
3302   unsigned char *src_end = src + src_bytes;
3303   unsigned char *dst_end = dst + dst_bytes;
3304   Lisp_Object translation_table; /* Not read here; presumably used by the macros.  */
3305   /* SRC_BASE remembers the start position in source in each loop.
3306      The loop will be exited when there's not enough source code
3307      (within macro ONE_MORE_BYTE), or when there's not enough
3308      destination area to produce a character (within macro
3309      EMIT_CHAR).  */
3310   unsigned char *src_base;
3311   int c;
3312
3313   translation_table = Qnil;
3314   switch (coding->eol_type)
3315     {
3316     case CODING_EOL_CRLF:
3317       while (1)
3318         {
3319           src_base = src;
3320           ONE_MORE_BYTE (c);
3321           if (c == '\r')
3322             {
3323               ONE_MORE_BYTE (c);
3324               if (c != '\n')
3325                 {
3326                   src--;        /* Lone '\r': keep it and reread the next byte.  */
3327                   c = '\r';
3328                 }
3329             }
3330           else if (c == '\n'
3331                    && (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL))
3332             {
3333               coding->result = CODING_FINISH_INCONSISTENT_EOL;
3334               goto label_end_of_loop;
3335             }
3336           EMIT_CHAR (c);
3337         }
3338       break;
3339
3340     case CODING_EOL_CR:
3341       while (1)
3342         {
3343           src_base = src;
3344           ONE_MORE_BYTE (c);
3345           if (c == '\n')
3346             {
3347               if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
3348                 {
3349                   coding->result = CODING_FINISH_INCONSISTENT_EOL;
3350                   goto label_end_of_loop;
3351                 }
3352             }
3353           else if (c == '\r')
3354             c = '\n';
3355           EMIT_CHAR (c);
3356         }
3357       break;
3358
3359     default:                    /* no need for EOL handling */
3360       while (1)
3361         {
3362           src_base = src;
3363           ONE_MORE_BYTE (c);
3364           EMIT_CHAR (c);
3365         }
3366     }
3367
3368  label_end_of_loop:
3369   coding->consumed = coding->consumed_char = src_base - source;
3370   coding->produced = dst - destination;
3371   return;
3372 }
3373
3374 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  Encode
3375    format of end-of-line according to `coding->eol_type'.  It also
3376    converts multibyte form 8-bit characters to unibyte if
3377    CODING->src_multibyte is nonzero.  If `coding->mode &
3378    CODING_MODE_SELECTIVE_DISPLAY' is nonzero, code '\r' in source text
3379    also means end-of-line.  */
3380
3381 static void
3382 encode_eol (coding, source, destination, src_bytes, dst_bytes)
3383      struct coding_system *coding;
3384      const unsigned char *source;
3385      unsigned char *destination;
3386      int src_bytes, dst_bytes;
3387 {
3388   const unsigned char *src = source;
3389   unsigned char *dst = destination;
3390   const unsigned char *src_end = src + src_bytes;
3391   unsigned char *dst_end = dst + dst_bytes;
3392   Lisp_Object translation_table; /* Not read here; presumably used by the macros.  */
3393   /* SRC_BASE remembers the start position in source in each loop.
3394      The loop will be exited when there's not enough source text to
3395      analyze multi-byte codes (within macro ONE_MORE_CHAR), or when
3396      there's not enough destination area to produce encoded codes
3397      (within macro EMIT_BYTES).  */
3398   const unsigned char *src_base;
3399   unsigned char *tmp;
3400   int c;
3401   int selective_display = coding->mode & CODING_MODE_SELECTIVE_DISPLAY;
3402   /* Leave a trailing leading-code unconsumed; an empty source has no last byte.  */
3403   translation_table = Qnil;
3404   if (coding->src_multibyte && src_bytes > 0
3405       && *(src_end - 1) == LEADING_CODE_8_BIT_CONTROL)
3406     {
3407       src_end--;
3408       src_bytes--;
3409       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
3410     }
3411
3412   if (coding->eol_type == CODING_EOL_CRLF)
3413     {
3414       while (src < src_end)
3415         {
3416           src_base = src;
3417           c = *src++;
3418           if (c >= 0x20)
3419             EMIT_ONE_BYTE (c);
3420           else if (c == '\n' || (c == '\r' && selective_display))
3421             EMIT_TWO_BYTES ('\r', '\n');
3422           else
3423             EMIT_ONE_BYTE (c);
3424         }
3425       src_base = src;
3426     label_end_of_loop:
3427       ;
3428     }
3429   else
3430     {
3431       if (!dst_bytes || src_bytes <= dst_bytes) /* Zero DST_BYTES also copies everything.  */
3432         {
3433           safe_bcopy (src, dst, src_bytes);
3434           src_base = src_end;
3435           dst += src_bytes;
3436         }
3437       else
3438         {
3439           if (coding->src_multibyte
3440               && *(src + dst_bytes - 1) == LEADING_CODE_8_BIT_CONTROL)
3441             dst_bytes--;        /* Don't cut right after a leading-code.  */
3442           safe_bcopy (src, dst, dst_bytes);
3443           src_base = src + dst_bytes;
3444           dst = destination + dst_bytes;
3445           coding->result = CODING_FINISH_INSUFFICIENT_DST;
3446         }
3447       if (coding->eol_type == CODING_EOL_CR)
3448         {
3449           for (tmp = destination; tmp < dst; tmp++)
3450             if (*tmp == '\n') *tmp = '\r';
3451         }
3452       else if (selective_display)
3453         {
3454           for (tmp = destination; tmp < dst; tmp++)
3455             if (*tmp == '\r') *tmp = '\n';
3456         }
3457     }
3458   if (coding->src_multibyte)
3459     dst = destination + str_as_unibyte (destination, dst - destination);
3460
3461   coding->consumed = src_base - source;
3462   coding->produced = dst - destination;
3463   coding->produced_char = coding->produced;
3464 }
3465
3466 \f
3467 /*** 7. C library functions ***/
3468
3469 /* In Emacs Lisp, a coding system is represented by a Lisp symbol which
3470    has a property `coding-system'.  The value of this property is a
3471    vector of length 5 (called the coding-vector).  Among elements of
3472    this vector, the first (element[0]) and the fifth (element[4])
3473    carry important information for decoding/encoding.  Before
3474    decoding/encoding, this information should be set in fields of a
3475    structure of type `coding_system'.
3476
3477    The value of the property `coding-system' can be a symbol of another
3478    subsidiary coding-system.  In that case, Emacs gets coding-vector
3479    from that symbol.
3480
3481    `element[0]' contains information to be set in `coding->type'.  The
3482    values and their meanings are as follows:
3483
3484    0 -- coding_type_emacs_mule
3485    1 -- coding_type_sjis
3486    2 -- coding_type_iso2022
3487    3 -- coding_type_big5
3488    4 -- coding_type_ccl encoder/decoder written in CCL
3489    nil -- coding_type_no_conversion
3490    t -- coding_type_undecided (automatic conversion on decoding,
3491                                no-conversion on encoding)
3492
3493    `element[4]' contains information to be set in `coding->flags' and
3494    `coding->spec'.  The meaning varies by `coding->type'.
3495
3496    If `coding->type' is `coding_type_iso2022', element[4] is a vector
3497    of length 32 (of which the first 13 sub-elements are used now).
3498    Meanings of these sub-elements are:
3499
3500    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso2022'
3501         If the value is an integer of valid charset, the charset is
3502         assumed to be designated to graphic register N initially.
3503
3504         If the value is negative, it is the negation of a charset which
3505         reserves graphic register N, which means that the charset is
3506         not designated initially but should be designated to graphic
3507         register N just before encoding a character in that charset.
3508
3509         If the value is nil, graphic register N is never used on
3510         encoding.
3511
3512    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
3513         Each value takes t or nil.  See the section ISO2022 of
3514         `coding.h' for more information.
3515
3516    If `coding->type' is `coding_type_big5', element[4] is t to denote
3517    BIG5-ETen or nil to denote BIG5-HKU.
3518
3519    If `coding->type' takes any other value, element[4] is ignored.
3520
3521    Emacs Lisp's coding systems also carry information about format of
3522    end-of-line in a value of property `eol-type'.  If the value is
3523    integer, 0 means CODING_EOL_LF, 1 means CODING_EOL_CRLF, and 2
3524    means CODING_EOL_CR.  If it is not integer, it should be a vector
3525    of subsidiary coding systems of which property `eol-type' has one
3526    of the above values.
3527
3528 */
3529
3530 /* Extract information for decoding/encoding from CODING_SYSTEM and
3531    set it in CODING.  If CODING_SYSTEM is invalid, set up CODING so
3532    that no conversion is necessary and return -1, else
3533    return 0.  */
3534
3535 int
3536 setup_coding_system (coding_system, coding)
3537      Lisp_Object coding_system;
3538      struct coding_system *coding;
3539 {
3540   Lisp_Object coding_spec, coding_type, eol_type, plist;
3541   Lisp_Object val;
3542
3543   /* At first, zero clear all members.  */
3544   bzero (coding, sizeof (struct coding_system));
3545
3546   /* Initialize some fields required for all kinds of coding systems.  */
3547   coding->symbol = coding_system;
3548   coding->heading_ascii = -1;
3549   coding->post_read_conversion = coding->pre_write_conversion = Qnil;
3550   coding->composing = COMPOSITION_DISABLED;
3551   coding->cmp_data = NULL;
3552
3553   if (NILP (coding_system))
3554     goto label_invalid_coding_system;
3555
3556   coding_spec = Fget (coding_system, Qcoding_system);
3557
3558   if (!VECTORP (coding_spec)
3559       || XVECTOR (coding_spec)->size != 5
3560       || !CONSP (XVECTOR (coding_spec)->contents[3]))
3561     goto label_invalid_coding_system;
3562
3563   eol_type = inhibit_eol_conversion ? Qnil : Fget (coding_system, Qeol_type);
3564   if (VECTORP (eol_type))
3565     {
3566       coding->eol_type = CODING_EOL_UNDECIDED;
3567       coding->common_flags = CODING_REQUIRE_DETECTION_MASK;
3568     }
3569   else if (XFASTINT (eol_type) == 1)
3570     {
3571       coding->eol_type = CODING_EOL_CRLF;
3572       coding->common_flags
3573         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3574     }
3575   else if (XFASTINT (eol_type) == 2)
3576     {
3577       coding->eol_type = CODING_EOL_CR;
3578       coding->common_flags
3579         = CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3580     }
3581   else
3582     coding->eol_type = CODING_EOL_LF;
3583
3584   coding_type = XVECTOR (coding_spec)->contents[0];
3585   /* Try short cut.  */
3586   if (SYMBOLP (coding_type))
3587     {
3588       if (EQ (coding_type, Qt))
3589         {
3590           coding->type = coding_type_undecided;
3591           coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
3592         }
3593       else
3594         coding->type = coding_type_no_conversion;
3595       /* Initialize this member.  Any thing other than
3596          CODING_CATEGORY_IDX_UTF_16_BE and
3597          CODING_CATEGORY_IDX_UTF_16_LE are ok because they have
3598          special treatment in detect_eol.  */
3599       coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
3600
3601       return 0;
3602     }
3603
3604   /* Get values of coding system properties:
3605      `post-read-conversion', `pre-write-conversion',
3606      `translation-table-for-decode', `translation-table-for-encode'.  */
3607   plist = XVECTOR (coding_spec)->contents[3];
3608   /* Pre & post conversion functions should be disabled if
3609      inhibit_eol_conversion is nonzero.  This is the case that a code
3610      conversion function is called while those functions are running.  */
3611   if (! inhibit_pre_post_conversion)
3612     {
3613       coding->post_read_conversion = Fplist_get (plist, Qpost_read_conversion);
3614       coding->pre_write_conversion = Fplist_get (plist, Qpre_write_conversion);
3615     }
3616   val = Fplist_get (plist, Qtranslation_table_for_decode);
3617   if (SYMBOLP (val))
3618     val = Fget (val, Qtranslation_table_for_decode);
3619   coding->translation_table_for_decode = CHAR_TABLE_P (val) ? val : Qnil;
3620   val = Fplist_get (plist, Qtranslation_table_for_encode);
3621   if (SYMBOLP (val))
3622     val = Fget (val, Qtranslation_table_for_encode);
3623   coding->translation_table_for_encode = CHAR_TABLE_P (val) ? val : Qnil;
3624   val = Fplist_get (plist, Qcoding_category);
3625   if (!NILP (val))
3626     {
3627       val = Fget (val, Qcoding_category_index);
3628       if (INTEGERP (val))
3629         coding->category_idx = XINT (val);
3630       else
3631         goto label_invalid_coding_system;
3632     }
3633   else
3634     goto label_invalid_coding_system;
3635
3636   /* If the coding system has non-nil `composition' property, enable
3637      composition handling.  */
3638   val = Fplist_get (plist, Qcomposition);
3639   if (!NILP (val))
3640     coding->composing = COMPOSITION_NO;
3641
3642   switch (XFASTINT (coding_type))
3643     {
3644     case 0:
3645       coding->type = coding_type_emacs_mule;
3646       coding->common_flags
3647         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3648       if (!NILP (coding->post_read_conversion))
3649         coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
3650       if (!NILP (coding->pre_write_conversion))
3651         coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
3652       break;
3653
3654     case 1:
3655       coding->type = coding_type_sjis;
3656       coding->common_flags
3657         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3658       break;
3659
3660     case 2:
3661       coding->type = coding_type_iso2022;
3662       coding->common_flags
3663         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3664       {
3665         Lisp_Object val, temp;
3666         Lisp_Object *flags;
3667         int i, charset, reg_bits = 0;
3668
3669         val = XVECTOR (coding_spec)->contents[4];
3670
3671         if (!VECTORP (val) || XVECTOR (val)->size != 32)
3672           goto label_invalid_coding_system;
3673
3674         flags = XVECTOR (val)->contents;
3675         coding->flags
3676           = ((NILP (flags[4]) ? 0 : CODING_FLAG_ISO_SHORT_FORM)
3677              | (NILP (flags[5]) ? 0 : CODING_FLAG_ISO_RESET_AT_EOL)
3678              | (NILP (flags[6]) ? 0 : CODING_FLAG_ISO_RESET_AT_CNTL)
3679              | (NILP (flags[7]) ? 0 : CODING_FLAG_ISO_SEVEN_BITS)
3680              | (NILP (flags[8]) ? 0 : CODING_FLAG_ISO_LOCKING_SHIFT)
3681              | (NILP (flags[9]) ? 0 : CODING_FLAG_ISO_SINGLE_SHIFT)
3682              | (NILP (flags[10]) ? 0 : CODING_FLAG_ISO_USE_ROMAN)
3683              | (NILP (flags[11]) ? 0 : CODING_FLAG_ISO_USE_OLDJIS)
3684              | (NILP (flags[12]) ? 0 : CODING_FLAG_ISO_NO_DIRECTION)
3685              | (NILP (flags[13]) ? 0 : CODING_FLAG_ISO_INIT_AT_BOL)
3686              | (NILP (flags[14]) ? 0 : CODING_FLAG_ISO_DESIGNATE_AT_BOL)
3687              | (NILP (flags[15]) ? 0 : CODING_FLAG_ISO_SAFE)
3688              | (NILP (flags[16]) ? 0 : CODING_FLAG_ISO_LATIN_EXTRA)
3689              );
3690
3691         /* Invoke graphic register 0 to plane 0.  */
3692         CODING_SPEC_ISO_INVOCATION (coding, 0) = 0;
3693         /* Invoke graphic register 1 to plane 1 if we can use full 8-bit.  */
3694         CODING_SPEC_ISO_INVOCATION (coding, 1)
3695           = (coding->flags & CODING_FLAG_ISO_SEVEN_BITS ? -1 : 1);
3696         /* Not single shifting at first.  */
3697         CODING_SPEC_ISO_SINGLE_SHIFTING (coding) = 0;
3698         /* Beginning of buffer should also be regarded as bol. */
3699         CODING_SPEC_ISO_BOL (coding) = 1;
3700
3701         for (charset = 0; charset <= MAX_CHARSET; charset++)
3702           CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = 255;
3703         val = Vcharset_revision_alist;
3704         while (CONSP (val))
3705           {
3706             charset = get_charset_id (Fcar_safe (XCAR (val)));
3707             if (charset >= 0
3708                 && (temp = Fcdr_safe (XCAR (val)), INTEGERP (temp))
3709                 && (i = XINT (temp), (i >= 0 && (i + '@') < 128)))
3710               CODING_SPEC_ISO_REVISION_NUMBER (coding, charset) = i;
3711             val = XCDR (val);
3712           }
3713
3714         /* Checks FLAGS[REG] (REG = 0, 1, 2 3) and decide designations.
3715            FLAGS[REG] can be one of below:
3716                 integer CHARSET: CHARSET occupies register I,
3717                 t: designate nothing to REG initially, but can be used
3718                   by any charsets,
3719                 list of integer, nil, or t: designate the first
3720                   element (if integer) to REG initially, the remaining
3721                   elements (if integer) is designated to REG on request,
3722                   if an element is t, REG can be used by any charsets,
3723                 nil: REG is never used.  */
3724         for (charset = 0; charset <= MAX_CHARSET; charset++)
3725           CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3726             = CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION;
3727         for (i = 0; i < 4; i++)
3728           {
3729             if ((INTEGERP (flags[i])
3730                  && (charset = XINT (flags[i]), CHARSET_VALID_P (charset)))
3731                 || (charset = get_charset_id (flags[i])) >= 0)
3732               {
3733                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3734                 CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) = i;
3735               }
3736             else if (EQ (flags[i], Qt))
3737               {
3738                 CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3739                 reg_bits |= 1 << i;
3740                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3741               }
3742             else if (CONSP (flags[i]))
3743               {
3744                 Lisp_Object tail;
3745                 tail = flags[i];
3746
3747                 coding->flags |= CODING_FLAG_ISO_DESIGNATION;
3748                 if ((INTEGERP (XCAR (tail))
3749                      && (charset = XINT (XCAR (tail)),
3750                          CHARSET_VALID_P (charset)))
3751                     || (charset = get_charset_id (XCAR (tail))) >= 0)
3752                   {
3753                     CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = charset;
3754                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset) =i;
3755                   }
3756                 else
3757                   CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3758                 tail = XCDR (tail);
3759                 while (CONSP (tail))
3760                   {
3761                     if ((INTEGERP (XCAR (tail))
3762                          && (charset = XINT (XCAR (tail)),
3763                              CHARSET_VALID_P (charset)))
3764                         || (charset = get_charset_id (XCAR (tail))) >= 0)
3765                       CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3766                         = i;
3767                     else if (EQ (XCAR (tail), Qt))
3768                       reg_bits |= 1 << i;
3769                     tail = XCDR (tail);
3770                   }
3771               }
3772             else
3773               CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i) = -1;
3774
3775             CODING_SPEC_ISO_DESIGNATION (coding, i)
3776               = CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, i);
3777           }
3778
3779         if (reg_bits && ! (coding->flags & CODING_FLAG_ISO_LOCKING_SHIFT))
3780           {
3781             /* REG 1 can be used only by locking shift in 7-bit env.  */
3782             if (coding->flags & CODING_FLAG_ISO_SEVEN_BITS)
3783               reg_bits &= ~2;
3784             if (! (coding->flags & CODING_FLAG_ISO_SINGLE_SHIFT))
3785               /* Without any shifting, only REG 0 and 1 can be used.  */
3786               reg_bits &= 3;
3787           }
3788
3789         if (reg_bits)
3790           for (charset = 0; charset <= MAX_CHARSET; charset++)
3791             {
3792               if (CHARSET_DEFINED_P (charset)
3793                   && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3794                       == CODING_SPEC_ISO_NO_REQUESTED_DESIGNATION))
3795                 {
3796                   /* There exist some default graphic registers to be
3797                      used by CHARSET.  */
3798
3799                   /* We had better avoid designating a charset of
3800                      CHARS96 to REG 0 as far as possible.  */
3801                   if (CHARSET_CHARS (charset) == 96)
3802                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3803                       = (reg_bits & 2
3804                          ? 1 : (reg_bits & 4 ? 2 : (reg_bits & 8 ? 3 : 0)));
3805                   else
3806                     CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding, charset)
3807                       = (reg_bits & 1
3808                          ? 0 : (reg_bits & 2 ? 1 : (reg_bits & 4 ? 2 : 3)));
3809                 }
3810             }
3811       }
3812       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3813       coding->spec.iso2022.last_invalid_designation_register = -1;
3814       break;
3815
3816     case 3:
3817       coding->type = coding_type_big5;
3818       coding->common_flags
3819         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3820       coding->flags
3821         = (NILP (XVECTOR (coding_spec)->contents[4])
3822            ? CODING_FLAG_BIG5_HKU
3823            : CODING_FLAG_BIG5_ETEN);
3824       break;
3825
3826     case 4:
3827       coding->type = coding_type_ccl;
3828       coding->common_flags
3829         |= CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK;
3830       {
3831         val = XVECTOR (coding_spec)->contents[4];
3832         if (! CONSP (val)
3833             || setup_ccl_program (&(coding->spec.ccl.decoder),
3834                                   XCAR (val)) < 0
3835             || setup_ccl_program (&(coding->spec.ccl.encoder),
3836                                   XCDR (val)) < 0)
3837           goto label_invalid_coding_system;
3838
3839         bzero (coding->spec.ccl.valid_codes, 256);
3840         val = Fplist_get (plist, Qvalid_codes);
3841         if (CONSP (val))
3842           {
3843             Lisp_Object this;
3844
3845             for (; CONSP (val); val = XCDR (val))
3846               {
3847                 this = XCAR (val);
3848                 if (INTEGERP (this)
3849                     && XINT (this) >= 0 && XINT (this) < 256)
3850                   coding->spec.ccl.valid_codes[XINT (this)] = 1;
3851                 else if (CONSP (this)
3852                          && INTEGERP (XCAR (this))
3853                          && INTEGERP (XCDR (this)))
3854                   {
3855                     int start = XINT (XCAR (this));
3856                     int end = XINT (XCDR (this));
3857
3858                     if (start >= 0 && start <= end && end < 256)
3859                       while (start <= end)
3860                         coding->spec.ccl.valid_codes[start++] = 1;
3861                   }
3862               }
3863           }
3864       }
3865       coding->common_flags |= CODING_REQUIRE_FLUSHING_MASK;
3866       coding->spec.ccl.cr_carryover = 0;
3867       coding->spec.ccl.eight_bit_carryover[0] = 0;
3868       break;
3869
3870     case 5:
3871       coding->type = coding_type_raw_text;
3872       break;
3873
3874     default:
3875       goto label_invalid_coding_system;
3876     }
3877   return 0;
3878
3879  label_invalid_coding_system:
3880   coding->type = coding_type_no_conversion;
3881   coding->category_idx = CODING_CATEGORY_IDX_BINARY;
3882   coding->common_flags = 0;
3883   coding->eol_type = CODING_EOL_LF;
3884   coding->pre_write_conversion = coding->post_read_conversion = Qnil;
3885   return -1;
3886 }
3887
3888 /* Free memory blocks allocated for storing composition information.  */
3889
3890 void
3891 coding_free_composition_data (coding)
3892      struct coding_system *coding;
3893 {
3894   struct composition_data *cmp_data = coding->cmp_data, *next;
3895
3896   if (!cmp_data)
3897     return;
3898   /* Memory blocks are chained.  At first, rewind to the first, then,
3899      free blocks one by one.  */
3900   while (cmp_data->prev)
3901     cmp_data = cmp_data->prev;
3902   while (cmp_data)
3903     {
3904       next = cmp_data->next;
3905       xfree (cmp_data);
3906       cmp_data = next;
3907     }
3908   coding->cmp_data = NULL;
3909 }
3910
3911 /* Set `char_offset' member of all memory blocks pointed by
3912    coding->cmp_data to POS.  */
3913
3914 void
3915 coding_adjust_composition_offset (coding, pos)
3916      struct coding_system *coding;
3917      int pos;
3918 {
3919   struct composition_data *cmp_data;
3920
3921   for (cmp_data = coding->cmp_data; cmp_data; cmp_data = cmp_data->next)
3922     cmp_data->char_offset = pos;
3923 }
3924
3925 /* Setup raw-text or one of its subsidiaries in the structure
3926    coding_system CODING according to the already setup value eol_type
3927    in CODING.  CODING should be setup for some coding system in
3928    advance.  */
3929
3930 void
3931 setup_raw_text_coding_system (coding)
3932      struct coding_system *coding;
3933 {
3934   if (coding->type != coding_type_raw_text)
3935     {
3936       coding->symbol = Qraw_text;
3937       coding->type = coding_type_raw_text;
3938       if (coding->eol_type != CODING_EOL_UNDECIDED)
3939         {
3940           Lisp_Object subsidiaries;
3941           subsidiaries = Fget (Qraw_text, Qeol_type);
3942
3943           if (VECTORP (subsidiaries)
3944               && XVECTOR (subsidiaries)->size == 3)
3945             coding->symbol
3946               = XVECTOR (subsidiaries)->contents[coding->eol_type];
3947         }
3948       setup_coding_system (coding->symbol, coding);
3949     }
3950   return;
3951 }
3952
3953 /* Emacs has a mechanism to automatically detect a coding system if it
3954    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
3955    it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are categorized into the following categories:
3958
3959    o coding-category-emacs-mule
3960
3961         The category for a coding system which has the same code range
3962         as Emacs' internal format.  Assigned the coding-system (Lisp
3963         symbol) `emacs-mule' by default.
3964
3965    o coding-category-sjis
3966
3967         The category for a coding system which has the same code range
3968         as SJIS.  Assigned the coding-system (Lisp
3969         symbol) `japanese-shift-jis' by default.
3970
3971    o coding-category-iso-7
3972
3973         The category for a coding system which has the same code range
3974         as ISO2022 of 7-bit environment.  This doesn't use any locking
3975         shift and single shift functions.  This can encode/decode all
3976         charsets.  Assigned the coding-system (Lisp symbol)
3977         `iso-2022-7bit' by default.
3978
3979    o coding-category-iso-7-tight
3980
3981         Same as coding-category-iso-7 except that this can
3982         encode/decode only the specified charsets.
3983
3984    o coding-category-iso-8-1
3985
3986         The category for a coding system which has the same code range
3987         as ISO2022 of 8-bit environment and graphic plane 1 used only
3988         for DIMENSION1 charset.  This doesn't use any locking shift
3989         and single shift functions.  Assigned the coding-system (Lisp
3990         symbol) `iso-latin-1' by default.
3991
3992    o coding-category-iso-8-2
3993
3994         The category for a coding system which has the same code range
3995         as ISO2022 of 8-bit environment and graphic plane 1 used only
3996         for DIMENSION2 charset.  This doesn't use any locking shift
3997         and single shift functions.  Assigned the coding-system (Lisp
3998         symbol) `japanese-iso-8bit' by default.
3999
4000    o coding-category-iso-7-else
4001
4002         The category for a coding system which has the same code range
4003         as ISO2022 of 7-bit environment but uses locking shift or
4004         single shift functions.  Assigned the coding-system (Lisp
4005         symbol) `iso-2022-7bit-lock' by default.
4006
4007    o coding-category-iso-8-else
4008
4009         The category for a coding system which has the same code range
4010         as ISO2022 of 8-bit environment but uses locking shift or
4011         single shift functions.  Assigned the coding-system (Lisp
4012         symbol) `iso-2022-8bit-ss2' by default.
4013
4014    o coding-category-big5
4015
4016         The category for a coding system which has the same code range
4017         as BIG5.  Assigned the coding-system (Lisp symbol)
4018         `cn-big5' by default.
4019
4020    o coding-category-utf-8
4021
4022         The category for a coding system which has the same code range
4023         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4024         symbol) `utf-8' by default.
4025
4026    o coding-category-utf-16-be
4027
4028         The category for a coding system in which a text has an
4029         Unicode signature (cf. Unicode Standard) in the order of BIG
4030         endian at the head.  Assigned the coding-system (Lisp symbol)
4031         `utf-16-be' by default.
4032
4033    o coding-category-utf-16-le
4034
4035         The category for a coding system in which a text has an
4036         Unicode signature (cf. Unicode Standard) in the order of
4037         LITTLE endian at the head.  Assigned the coding-system (Lisp
4038         symbol) `utf-16-le' by default.
4039
4040    o coding-category-ccl
4041
4042         The category for a coding system of which encoder/decoder is
4043         written in CCL programs.  The default value is nil, i.e., no
4044         coding system is assigned.
4045
4046    o coding-category-binary
4047
4048         The category for a coding system not categorized in any of the
4049         above.  Assigned the coding-system (Lisp symbol)
4050         `no-conversion' by default.
4051
4052    Each of them is a Lisp symbol and the value is an actual
4053    `coding-system' (this is also a Lisp symbol) assigned by a user.
4054    What Emacs does actually is to detect a category of coding system.
4055    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4056    decide a single possible category, it selects a category of the
4057    highest priority.  Priorities of categories are also specified by a
4058    user in a Lisp variable `coding-category-list'.
4059
4060 */
4061
/* Table indexed by byte value.  A nonzero entry means that
   detect_coding_mask skips that byte while scanning the head of a
   text.  detect_coding_mask itself rewrites the entries for the
   ISO2022 control codes ESC, SO and SI on every call; the other
   entries are presumably filled in by initialization code elsewhere
   in this file.  */
static int ascii_skip_code[256];
4064
4065 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded.
4066    If it detects possible coding systems, return an integer in which
4067    appropriate flag bits are set.  Flag bits are defined by macros
4068    CODING_CATEGORY_MASK_XXX in `coding.h'.  If PRIORITIES is non-NULL,
4069    it should point the table `coding_priorities'.  In that case, only
4070    the flag bit for a coding system of the highest priority is set in
4071    the returned value.  If MULTIBYTEP is nonzero, 8-bit codes of the
4072    range 0x80..0x9F are in multibyte form.
4073
4074    How many ASCII characters are at the head is returned as *SKIP.  */
4075
4076 static int
4077 detect_coding_mask (source, src_bytes, priorities, skip, multibytep)
4078      unsigned char *source;
4079      int src_bytes, *priorities, *skip;
4080      int multibytep;
4081 {
4082   register unsigned char c;
4083   unsigned char *src = source, *src_end = source + src_bytes;
4084   unsigned int mask, utf16_examined_p, iso2022_examined_p;
4085   int i;
4086
4087   /* At first, skip all ASCII characters and control characters except
4088      for three ISO2022 specific control characters.  */
4089   ascii_skip_code[ISO_CODE_SO] = 0;
4090   ascii_skip_code[ISO_CODE_SI] = 0;
4091   ascii_skip_code[ISO_CODE_ESC] = 0;
4092
4093  label_loop_detect_coding:
4094   while (src < src_end && ascii_skip_code[*src]) src++;
4095   *skip = src - source;
4096
4097   if (src >= src_end)
4098     /* We found nothing other than ASCII.  There's nothing to do.  */
4099     return 0;
4100
4101   c = *src;
4102   /* The text seems to be encoded in some multilingual coding system.
4103      Now, try to find in which coding system the text is encoded.  */
4104   if (c < 0x80)
4105     {
4106       /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */
4107       /* C is an ISO2022 specific control code of C0.  */
4108       mask = detect_coding_iso2022 (src, src_end, multibytep);
4109       if (mask == 0)
4110         {
4111           /* No valid ISO2022 code follows C.  Try again.  */
4112           src++;
4113           if (c == ISO_CODE_ESC)
4114             ascii_skip_code[ISO_CODE_ESC] = 1;
4115           else
4116             ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1;
4117           goto label_loop_detect_coding;
4118         }
4119       if (priorities)
4120         {
4121           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4122             {
4123               if (mask & priorities[i])
4124                 return priorities[i];
4125             }
4126           return CODING_CATEGORY_MASK_RAW_TEXT;
4127         }
4128     }
4129   else
4130     {
4131       int try;
4132
4133       if (multibytep && c == LEADING_CODE_8_BIT_CONTROL)
4134         c = src[1] - 0x20;
4135
4136       if (c < 0xA0)
4137         {
4138           /* C is the first byte of SJIS character code,
4139              or a leading-code of Emacs' internal format (emacs-mule),
4140              or the first byte of UTF-16.  */
4141           try = (CODING_CATEGORY_MASK_SJIS
4142                   | CODING_CATEGORY_MASK_EMACS_MULE
4143                   | CODING_CATEGORY_MASK_UTF_16_BE
4144                   | CODING_CATEGORY_MASK_UTF_16_LE);
4145
4146           /* Or, if C is a special latin extra code,
4147              or is an ISO2022 specific control code of C1 (SS2 or SS3),
4148              or is an ISO2022 control-sequence-introducer (CSI),
4149              we should also consider the possibility of ISO2022 codings.  */
4150           if ((VECTORP (Vlatin_extra_code_table)
4151                && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
4152               || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3)
4153               || (c == ISO_CODE_CSI
4154                   && (src < src_end
4155                       && (*src == ']'
4156                           || ((*src == '0' || *src == '1' || *src == '2')
4157                               && src + 1 < src_end
4158                               && src[1] == ']')))))
4159             try |= (CODING_CATEGORY_MASK_ISO_8_ELSE
4160                      | CODING_CATEGORY_MASK_ISO_8BIT);
4161         }
4162       else
4163         /* C is a character of ISO2022 in graphic plane right,
4164            or a SJIS's 1-byte character code (i.e. JISX0201),
4165            or the first byte of BIG5's 2-byte code,
4166            or the first byte of UTF-8/16.  */
4167         try = (CODING_CATEGORY_MASK_ISO_8_ELSE
4168                 | CODING_CATEGORY_MASK_ISO_8BIT
4169                 | CODING_CATEGORY_MASK_SJIS
4170                 | CODING_CATEGORY_MASK_BIG5
4171                 | CODING_CATEGORY_MASK_UTF_8
4172                 | CODING_CATEGORY_MASK_UTF_16_BE
4173                 | CODING_CATEGORY_MASK_UTF_16_LE);
4174
4175       /* Or, we may have to consider the possibility of CCL.  */
4176       if (coding_system_table[CODING_CATEGORY_IDX_CCL]
4177           && (coding_system_table[CODING_CATEGORY_IDX_CCL]
4178               ->spec.ccl.valid_codes)[c])
4179         try |= CODING_CATEGORY_MASK_CCL;
4180
4181       mask = 0;
4182       utf16_examined_p = iso2022_examined_p = 0;
4183       if (priorities)
4184         {
4185           for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
4186             {
4187               if (!iso2022_examined_p
4188                   && (priorities[i] & try & CODING_CATEGORY_MASK_ISO))
4189                 {
4190                   mask |= detect_coding_iso2022 (src, src_end, multibytep);
4191                   iso2022_examined_p = 1;
4192                 }
4193               else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS)
4194                 mask |= detect_coding_sjis (src, src_end, multibytep);
4195               else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8)
4196                 mask |= detect_coding_utf_8 (src, src_end, multibytep);
4197               else if (!utf16_examined_p
4198                        && (priorities[i] & try &
4199                            CODING_CATEGORY_MASK_UTF_16_BE_LE))
4200                 {
4201                   mask |= detect_coding_utf_16 (src, src_end, multibytep);
4202                   utf16_examined_p = 1;
4203                 }
4204               else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5)
4205                 mask |= detect_coding_big5 (src, src_end, multibytep);
4206               else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE)
4207                 mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4208               else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL)
4209                 mask |= detect_coding_ccl (src, src_end, multibytep);
4210               else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT)
4211                 mask |= CODING_CATEGORY_MASK_RAW_TEXT;
4212               else if (priorities[i] & CODING_CATEGORY_MASK_BINARY)
4213                 mask |= CODING_CATEGORY_MASK_BINARY;
4214               if (mask & priorities[i])
4215                 return priorities[i];
4216             }
4217           return CODING_CATEGORY_MASK_RAW_TEXT;
4218         }
4219       if (try & CODING_CATEGORY_MASK_ISO)
4220         mask |= detect_coding_iso2022 (src, src_end, multibytep);
4221       if (try & CODING_CATEGORY_MASK_SJIS)
4222         mask |= detect_coding_sjis (src, src_end, multibytep);
4223       if (try & CODING_CATEGORY_MASK_BIG5)
4224         mask |= detect_coding_big5 (src, src_end, multibytep);
4225       if (try & CODING_CATEGORY_MASK_UTF_8)
4226         mask |= detect_coding_utf_8 (src, src_end, multibytep);
4227       if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE)
4228         mask |= detect_coding_utf_16 (src, src_end, multibytep);
4229       if (try & CODING_CATEGORY_MASK_EMACS_MULE)
4230         mask |= detect_coding_emacs_mule (src, src_end, multibytep);
4231       if (try & CODING_CATEGORY_MASK_CCL)
4232         mask |= detect_coding_ccl (src, src_end, multibytep);
4233     }
4234   return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY);
4235 }
4236
4237 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded.
4238    The information of the detected coding system is set in CODING.  */
4239
4240 void
4241 detect_coding (coding, src, src_bytes)
4242      struct coding_system *coding;
4243      const unsigned char *src;
4244      int src_bytes;
4245 {
4246   unsigned int idx;
4247   int skip, mask;
4248   Lisp_Object val;
4249
4250   val = Vcoding_category_list;
4251   mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip,
4252                              coding->src_multibyte);
4253   coding->heading_ascii = skip;
4254
4255   if (!mask) return;
4256
4257   /* We found a single coding system of the highest priority in MASK.  */
4258   idx = 0;
4259   while (mask && ! (mask & 1)) mask >>= 1, idx++;
4260   if (! mask)
4261     idx = CODING_CATEGORY_IDX_RAW_TEXT;
4262
4263   val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[idx]);
4264
4265   if (coding->eol_type != CODING_EOL_UNDECIDED)
4266     {
4267       Lisp_Object tmp;
4268
4269       tmp = Fget (val, Qeol_type);
4270       if (VECTORP (tmp))
4271         val = XVECTOR (tmp)->contents[coding->eol_type];
4272     }
4273
4274   /* Setup this new coding system while preserving some slots.  */
4275   {
4276     int src_multibyte = coding->src_multibyte;
4277     int dst_multibyte = coding->dst_multibyte;
4278
4279     setup_coding_system (val, coding);
4280     coding->src_multibyte = src_multibyte;
4281     coding->dst_multibyte = dst_multibyte;
4282     coding->heading_ascii = skip;
4283   }
4284 }
4285
4286 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4287    SOURCE is encoded.  Return one of CODING_EOL_LF, CODING_EOL_CRLF,
4288    CODING_EOL_CR, and CODING_EOL_UNDECIDED.
4289
4290    How many non-eol characters are at the head is returned as *SKIP.  */
4291
4292 #define MAX_EOL_CHECK_COUNT 3
4293
4294 static int
4295 detect_eol_type (source, src_bytes, skip)
4296      unsigned char *source;
4297      int src_bytes, *skip;
4298 {
4299   unsigned char *src = source, *src_end = src + src_bytes;
4300   unsigned char c;
4301   int total = 0;                /* How many end-of-lines are found so far.  */
4302   int eol_type = CODING_EOL_UNDECIDED;
4303   int this_eol_type;
4304
4305   *skip = 0;
4306
4307   while (src < src_end && total < MAX_EOL_CHECK_COUNT)
4308     {
4309       c = *src++;
4310       if (c == '\n' || c == '\r')
4311         {
4312           if (*skip == 0)
4313             *skip = src - 1 - source;
4314           total++;
4315           if (c == '\n')
4316             this_eol_type = CODING_EOL_LF;
4317           else if (src >= src_end || *src != '\n')
4318             this_eol_type = CODING_EOL_CR;
4319           else
4320             this_eol_type = CODING_EOL_CRLF, src++;
4321
4322           if (eol_type == CODING_EOL_UNDECIDED)
4323             /* This is the first end-of-line.  */
4324             eol_type = this_eol_type;
4325           else if (eol_type != this_eol_type)
4326             {
4327               /* The found type is different from what found before.  */
4328               eol_type = CODING_EOL_INCONSISTENT;
4329               break;
4330             }
4331         }
4332     }
4333
4334   if (*skip == 0)
4335     *skip = src_end - source;
4336   return eol_type;
4337 }
4338
4339 /* Like detect_eol_type, but detect EOL type in 2-octet
4340    big-endian/little-endian format for coding systems utf-16-be and
4341    utf-16-le.  */
4342
4343 static int
4344 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p)
4345      unsigned char *source;
4346      int src_bytes, *skip, big_endian_p;
4347 {
4348   unsigned char *src = source, *src_end = src + src_bytes;
4349   unsigned int c1, c2;
4350   int total = 0;                /* How many end-of-lines are found so far.  */
4351   int eol_type = CODING_EOL_UNDECIDED;
4352   int this_eol_type;
4353   int msb, lsb;
4354
4355   if (big_endian_p)
4356     msb = 0, lsb = 1;
4357   else
4358     msb = 1, lsb = 0;
4359
4360   *skip = 0;
4361
4362   while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT)
4363     {
4364       c1 = (src[msb] << 8) | (src[lsb]);
4365       src += 2;
4366
4367       if (c1 == '\n' || c1 == '\r')
4368         {
4369           if (*skip == 0)
4370             *skip = src - 2 - source;
4371           total++;
4372           if (c1 == '\n')
4373             {
4374               this_eol_type = CODING_EOL_LF;
4375             }
4376           else
4377             {
4378               if ((src + 1) >= src_end)
4379                 {
4380                   this_eol_type = CODING_EOL_CR;
4381                 }
4382               else
4383                 {
4384                   c2 = (src[msb] << 8) | (src[lsb]);
4385                   if (c2 == '\n')
4386                     this_eol_type = CODING_EOL_CRLF, src += 2;
4387                   else
4388                     this_eol_type = CODING_EOL_CR;
4389                 }
4390             }
4391
4392           if (eol_type == CODING_EOL_UNDECIDED)
4393             /* This is the first end-of-line.  */
4394             eol_type = this_eol_type;
4395           else if (eol_type != this_eol_type)
4396             {
4397               /* The found type is different from what found before.  */
4398               eol_type = CODING_EOL_INCONSISTENT;
4399               break;
4400             }
4401         }
4402     }
4403
4404   if (*skip == 0)
4405     *skip = src_end - source;
4406   return eol_type;
4407 }
4408
4409 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC
4410    is encoded.  If it detects an appropriate format of end-of-line, it
4411    sets the information in *CODING.  */
4412
4413 void
4414 detect_eol (coding, src, src_bytes)
4415      struct coding_system *coding;
4416      const unsigned char *src;
4417      int src_bytes;
4418 {
4419   Lisp_Object val;
4420   int skip;
4421   int eol_type;
4422
4423   switch (coding->category_idx)
4424     {
4425     case CODING_CATEGORY_IDX_UTF_16_BE:
4426       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1);
4427       break;
4428     case CODING_CATEGORY_IDX_UTF_16_LE:
4429       eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0);
4430       break;
4431     default:
4432       eol_type = detect_eol_type (src, src_bytes, &skip);
4433       break;
4434     }
4435
4436   if (coding->heading_ascii > skip)
4437     coding->heading_ascii = skip;
4438   else
4439     skip = coding->heading_ascii;
4440
4441   if (eol_type == CODING_EOL_UNDECIDED)
4442     return;
4443   if (eol_type == CODING_EOL_INCONSISTENT)
4444     {
4445 #if 0
4446       /* This code is suppressed until we find a better way to
4447          distinguish raw text file and binary file.  */
4448
4449       /* If we have already detected that the coding is raw-text, the
4450          coding should actually be no-conversion.  */
4451       if (coding->type == coding_type_raw_text)
4452         {
4453           setup_coding_system (Qno_conversion, coding);
4454           return;
4455         }
4456       /* Else, let's decode only text code anyway.  */
4457 #endif /* 0 */
4458       eol_type = CODING_EOL_LF;
4459     }
4460
4461   val = Fget (coding->symbol, Qeol_type);
4462   if (VECTORP (val) && XVECTOR (val)->size == 3)
4463     {
4464       int src_multibyte = coding->src_multibyte;
4465       int dst_multibyte = coding->dst_multibyte;
4466       struct composition_data *cmp_data = coding->cmp_data;
4467
4468       setup_coding_system (XVECTOR (val)->contents[eol_type], coding);
4469       coding->src_multibyte = src_multibyte;
4470       coding->dst_multibyte = dst_multibyte;
4471       coding->heading_ascii = skip;
4472       coding->cmp_data = cmp_data;
4473     }
4474 }
4475
/* Number of bytes added on top of every buffer size estimated by
   decoding_buffer_size and encoding_buffer_size.  */
#define CONVERSION_BUFFER_EXTRA_ROOM 256

/* Factor by which the source size is multiplied to estimate the size
   of a decoding buffer for CODING (a pointer to struct
   coding_system): 3 for ISO2022, the magnification recorded in the
   CCL decoder for CCL, and 2 otherwise.  CODING is parenthesized so
   that any pointer expression may be given.  */
#define DECODING_BUFFER_MAG(coding)                     \
  ((coding)->type == coding_type_iso2022                \
   ? 3                                                  \
   : ((coding)->type == coding_type_ccl                 \
      ? (coding)->spec.ccl.decoder.buf_magnification    \
      : 2))
4484
4485 /* Return maximum size (bytes) of a buffer enough for decoding
4486    SRC_BYTES of text encoded in CODING.  */
4487
4488 int
4489 decoding_buffer_size (coding, src_bytes)
4490      struct coding_system *coding;
4491      int src_bytes;
4492 {
4493   return (src_bytes * DECODING_BUFFER_MAG (coding)
4494           + CONVERSION_BUFFER_EXTRA_ROOM);
4495 }
4496
4497 /* Return maximum size (bytes) of a buffer enough for encoding
4498    SRC_BYTES of text to CODING.  */
4499
4500 int
4501 encoding_buffer_size (coding, src_bytes)
4502      struct coding_system *coding;
4503      int src_bytes;
4504 {
4505   int magnification;
4506
4507   if (coding->type == coding_type_ccl)
4508     {
4509       magnification = coding->spec.ccl.encoder.buf_magnification;
4510       if (coding->eol_type == CODING_EOL_CRLF)
4511         magnification *= 2;
4512     }
4513   else if (CODING_REQUIRE_ENCODING (coding))
4514     magnification = 3;
4515   else
4516     magnification = 1;
4517
4518   return (src_bytes * magnification + CONVERSION_BUFFER_EXTRA_ROOM);
4519 }
4520
/* Working buffer for code conversion.  */
struct conversion_buffer
{
  int size;                     /* Number of bytes allocated for DATA.  */
  int on_stack;                 /* 1 if DATA was allocated by alloca,
                                   0 if by xmalloc.  */
  unsigned char *data;          /* The allocated memory.  */
};

/* Don't use alloca for allocating memory space larger than this, lest
   we overflow their stack.  The value is parenthesized so that the
   macro can be used safely as an operand of any operator.  */
#define MAX_ALLOCA (16 * 1024)

/* Allocate LEN bytes of memory for BUF (struct conversion_buffer).
   The memory comes from alloca if LEN is less than MAX_ALLOCA, from
   xmalloc otherwise; BUF.on_stack records which.  As alloca is
   called in the expansion, the memory allocated that way belongs to
   the stack frame of the function which uses this macro.  BUF and
   LEN are evaluated more than once.  */
#define allocate_conversion_buffer(buf, len)            \
  do {                                                  \
    if ((len) < MAX_ALLOCA)                             \
      {                                                 \
        (buf).data = (unsigned char *) alloca (len);    \
        (buf).on_stack = 1;                             \
      }                                                 \
    else                                                \
      {                                                 \
        (buf).data = (unsigned char *) xmalloc (len);   \
        (buf).on_stack = 0;                             \
      }                                                 \
    (buf).size = (len);                                 \
  } while (0)
4548
4549 /* Double the allocated memory for *BUF.  */
4550 static void
4551 extend_conversion_buffer (buf)
4552      struct conversion_buffer *buf;
4553 {
4554   if (buf->on_stack)
4555     {
4556       unsigned char *save = buf->data;
4557       buf->data = (unsigned char *) xmalloc (buf->size * 2);
4558       bcopy (save, buf->data, buf->size);
4559       buf->on_stack = 0;
4560     }
4561   else
4562     {
4563       buf->data = (unsigned char *) xrealloc (buf->data, buf->size * 2);
4564     }
4565   buf->size *= 2;
4566 }
4567
4568 /* Free the allocated memory for BUF if it is not on stack.  */
4569 static void
4570 free_conversion_buffer (buf)
4571      struct conversion_buffer *buf;
4572 {
4573   if (!buf->on_stack)
4574     xfree (buf->data);
4575 }
4576
4577 int
4578 ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep)
4579      struct coding_system *coding;
4580      unsigned char *source, *destination;
4581      int src_bytes, dst_bytes, encodep;
4582 {
4583   struct ccl_program *ccl
4584     = encodep ? &coding->spec.ccl.encoder : &coding->spec.ccl.decoder;
4585   unsigned char *dst = destination;
4586
4587   ccl->suppress_error = coding->suppress_error;
4588   ccl->last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4589   if (encodep)
4590     {
4591       /* On encoding, EOL format is converted within ccl_driver.  For
4592          that, setup proper information in the structure CCL.  */
4593       ccl->eol_type = coding->eol_type;
4594       if (ccl->eol_type ==CODING_EOL_UNDECIDED)
4595         ccl->eol_type = CODING_EOL_LF;
4596       ccl->cr_consumed = coding->spec.ccl.cr_carryover;
4597       ccl->eight_bit_control = coding->dst_multibyte;
4598     }
4599   else
4600     ccl->eight_bit_control = 1;
4601   ccl->multibyte = coding->src_multibyte;
4602   if (coding->spec.ccl.eight_bit_carryover[0] != 0)
4603     {
4604       /* Move carryover bytes to DESTINATION.  */
4605       unsigned char *p = coding->spec.ccl.eight_bit_carryover;
4606       while (*p)
4607         *dst++ = *p++;
4608       coding->spec.ccl.eight_bit_carryover[0] = 0;
4609       if (dst_bytes)
4610         dst_bytes -= dst - destination;
4611     }
4612
4613   coding->produced = (ccl_driver (ccl, source, dst, src_bytes, dst_bytes,
4614                                   &(coding->consumed))
4615                       + dst - destination);
4616
4617   if (encodep)
4618     {
4619       coding->produced_char = coding->produced;
4620       coding->spec.ccl.cr_carryover = ccl->cr_consumed;
4621     }
4622   else if (!ccl->eight_bit_control)
4623     {
4624       /* The produced bytes forms a valid multibyte sequence. */
4625       coding->produced_char
4626         = multibyte_chars_in_text (destination, coding->produced);
4627       coding->spec.ccl.eight_bit_carryover[0] = 0;
4628     }
4629   else
4630     {
4631       /* On decoding, the destination should always multibyte.  But,
4632          CCL program might have been generated an invalid multibyte
4633          sequence.  Here we make such a sequence valid as
4634          multibyte.  */
4635       int bytes
4636         = dst_bytes ? dst_bytes : source + coding->consumed - destination;
4637
4638       if ((coding->consumed < src_bytes
4639            || !ccl->last_block)
4640           && coding->produced >= 1
4641           && destination[coding->produced - 1] >= 0x80)
4642         {
4643           /* We should not convert the tailing 8-bit codes to
4644              multibyte form even if they doesn't form a valid
4645              multibyte sequence.  They may form a valid sequence in
4646              the next call.  */
4647           int carryover = 0;
4648
4649           if (destination[coding->produced - 1] < 0xA0)
4650             carryover = 1;
4651           else if (coding->produced >= 2)
4652             {
4653               if (destination[coding->produced - 2] >= 0x80)
4654                 {
4655                   if (destination[coding->produced - 2] < 0xA0)
4656                     carryover = 2;
4657                   else if (coding->produced >= 3
4658                            && destination[coding->produced - 3] >= 0x80
4659                            && destination[coding->produced - 3] < 0xA0)
4660                     carryover = 3;
4661                 }
4662             }
4663           if (carryover > 0)
4664             {
4665               BCOPY_SHORT (destination + coding->produced - carryover,
4666                            coding->spec.ccl.eight_bit_carryover,
4667                            carryover);
4668               coding->spec.ccl.eight_bit_carryover[carryover] = 0;
4669               coding->produced -= carryover;
4670             }
4671         }
4672       coding->produced = str_as_multibyte (destination, bytes,
4673                                            coding->produced,
4674                                            &(coding->produced_char));
4675     }
4676
4677   switch (ccl->status)
4678     {
4679     case CCL_STAT_SUSPEND_BY_SRC:
4680       coding->result = CODING_FINISH_INSUFFICIENT_SRC;
4681       break;
4682     case CCL_STAT_SUSPEND_BY_DST:
4683       coding->result = CODING_FINISH_INSUFFICIENT_DST;
4684       break;
4685     case CCL_STAT_QUIT:
4686     case CCL_STAT_INVALID_CMD:
4687       coding->result = CODING_FINISH_INTERRUPT;
4688       break;
4689     default:
4690       coding->result = CODING_FINISH_NORMAL;
4691       break;
4692     }
4693   return coding->result;
4694 }
4695
4696 /* Decode EOL format of the text at PTR of BYTES length destructively
4697    according to CODING->eol_type.  This is called after the CCL
4698    program produced a decoded text at PTR.  If we do CRLF->LF
4699    conversion, update CODING->produced and CODING->produced_char.  */
4700
4701 static void
4702 decode_eol_post_ccl (coding, ptr, bytes)
4703      struct coding_system *coding;
4704      unsigned char *ptr;
4705      int bytes;
4706 {
4707   Lisp_Object val, saved_coding_symbol;
4708   unsigned char *pend = ptr + bytes;
4709   int dummy;
4710
4711   /* Remember the current coding system symbol.  We set it back when
4712      an inconsistent EOL is found so that `last-coding-system-used' is
4713      set to the coding system that doesn't specify EOL conversion.  */
4714   saved_coding_symbol = coding->symbol;
4715
4716   coding->spec.ccl.cr_carryover = 0;
4717   if (coding->eol_type == CODING_EOL_UNDECIDED)
4718     {
4719       /* Here, to avoid the call of setup_coding_system, we directly
4720          call detect_eol_type.  */
4721       coding->eol_type = detect_eol_type (ptr, bytes, &dummy);
4722       if (coding->eol_type == CODING_EOL_INCONSISTENT)
4723         coding->eol_type = CODING_EOL_LF;
4724       if (coding->eol_type != CODING_EOL_UNDECIDED)
4725         {
4726           val = Fget (coding->symbol, Qeol_type);
4727           if (VECTORP (val) && XVECTOR (val)->size == 3)
4728             coding->symbol = XVECTOR (val)->contents[coding->eol_type];
4729         }
4730       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4731     }
4732
4733   if (coding->eol_type == CODING_EOL_LF
4734       || coding->eol_type == CODING_EOL_UNDECIDED)
4735     {
4736       /* We have nothing to do.  */
4737       ptr = pend;
4738     }
4739   else if (coding->eol_type == CODING_EOL_CRLF)
4740     {
4741       unsigned char *pstart = ptr, *p = ptr;
4742
4743       if (! (coding->mode & CODING_MODE_LAST_BLOCK)
4744           && *(pend - 1) == '\r')
4745         {
4746           /* If the last character is CR, we can't handle it here
4747              because LF will be in the not-yet-decoded source text.
4748              Record that the CR is not yet processed.  */
4749           coding->spec.ccl.cr_carryover = 1;
4750           coding->produced--;
4751           coding->produced_char--;
4752           pend--;
4753         }
4754       while (ptr < pend)
4755         {
4756           if (*ptr == '\r')
4757             {
4758               if (ptr + 1 < pend && *(ptr + 1) == '\n')
4759                 {
4760                   *p++ = '\n';
4761                   ptr += 2;
4762                 }
4763               else
4764                 {
4765                   if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4766                     goto undo_eol_conversion;
4767                   *p++ = *ptr++;
4768                 }
4769             }
4770           else if (*ptr == '\n'
4771                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4772             goto undo_eol_conversion;
4773           else
4774             *p++ = *ptr++;
4775           continue;
4776
4777         undo_eol_conversion:
4778           /* We have faced with inconsistent EOL format at PTR.
4779              Convert all LFs before PTR back to CRLFs.  */
4780           for (p--, ptr--; p >= pstart; p--)
4781             {
4782               if (*p == '\n')
4783                 *ptr-- = '\n', *ptr-- = '\r';
4784               else
4785                 *ptr-- = *p;
4786             }
4787           /*  If carryover is recorded, cancel it because we don't
4788               convert CRLF anymore.  */
4789           if (coding->spec.ccl.cr_carryover)
4790             {
4791               coding->spec.ccl.cr_carryover = 0;
4792               coding->produced++;
4793               coding->produced_char++;
4794               pend++;
4795             }
4796           p = ptr = pend;
4797           coding->eol_type = CODING_EOL_LF;
4798           coding->symbol = saved_coding_symbol;
4799         }
4800       if (p < pend)
4801         {
4802           /* As each two-byte sequence CRLF was converted to LF, (PEND
4803              - P) is the number of deleted characters.  */
4804           coding->produced -= pend - p;
4805           coding->produced_char -= pend - p;
4806         }
4807     }
4808   else                  /* i.e. coding->eol_type == CODING_EOL_CR */
4809     {
4810       unsigned char *p = ptr;
4811
4812       for (; ptr < pend; ptr++)
4813         {
4814           if (*ptr == '\r')
4815             *ptr = '\n';
4816           else if (*ptr == '\n'
4817                    && coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL)
4818             {
4819               for (; p < ptr; p++)
4820                 {
4821                   if (*p == '\n')
4822                     *p = '\r';
4823                 }
4824               ptr = pend;
4825               coding->eol_type = CODING_EOL_LF;
4826               coding->symbol = saved_coding_symbol;
4827             }
4828         }
4829     }
4830 }
4831
4832 /* See "GENERAL NOTES about `decode_coding_XXX ()' functions".  Before
4833    decoding, it may detect coding system and format of end-of-line if
4834    those are not yet decided.  The source should be unibyte, the
4835    result is multibyte if CODING->dst_multibyte is nonzero, else
4836    unibyte.  */
4837
4838 int
4839 decode_coding (coding, source, destination, src_bytes, dst_bytes)
4840      struct coding_system *coding;
4841      const unsigned char *source;
4842      unsigned char *destination;
4843      int src_bytes, dst_bytes;
4844 {
4845   int extra = 0;
4846
4847   if (coding->type == coding_type_undecided)
4848     detect_coding (coding, source, src_bytes);
4849
4850   if (coding->eol_type == CODING_EOL_UNDECIDED
4851       && coding->type != coding_type_ccl)
4852     {
4853       detect_eol (coding, source, src_bytes);
4854       /* We had better recover the original eol format if we
4855          encounter an inconsistent eol format while decoding.  */
4856       coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
4857     }
4858
4859   coding->produced = coding->produced_char = 0;
4860   coding->consumed = coding->consumed_char = 0;
4861   coding->errors = 0;
4862   coding->result = CODING_FINISH_NORMAL;
4863
4864   switch (coding->type)
4865     {
4866     case coding_type_sjis:
4867       decode_coding_sjis_big5 (coding, source, destination,
4868                                src_bytes, dst_bytes, 1);
4869       break;
4870
4871     case coding_type_iso2022:
4872       decode_coding_iso2022 (coding, source, destination,
4873                              src_bytes, dst_bytes);
4874       break;
4875
4876     case coding_type_big5:
4877       decode_coding_sjis_big5 (coding, source, destination,
4878                                src_bytes, dst_bytes, 0);
4879       break;
4880
4881     case coding_type_emacs_mule:
4882       decode_coding_emacs_mule (coding, source, destination,
4883                                 src_bytes, dst_bytes);
4884       break;
4885
4886     case coding_type_ccl:
4887       if (coding->spec.ccl.cr_carryover)
4888         {
4889           /* Put the CR which was not processed by the previous call
4890              of decode_eol_post_ccl in DESTINATION.  It will be
4891              decoded together with the following LF by the call to
4892              decode_eol_post_ccl below.  */
4893           *destination = '\r';
4894           coding->produced++;
4895           coding->produced_char++;
4896           dst_bytes--;
4897           extra = coding->spec.ccl.cr_carryover;
4898         }
4899       ccl_coding_driver (coding, source, destination + extra,
4900                          src_bytes, dst_bytes, 0);
4901       if (coding->eol_type != CODING_EOL_LF)
4902         {
4903           coding->produced += extra;
4904           coding->produced_char += extra;
4905           decode_eol_post_ccl (coding, destination, coding->produced);
4906         }
4907       break;
4908
4909     default:
4910       decode_eol (coding, source, destination, src_bytes, dst_bytes);
4911     }
4912
4913   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
4914       && coding->mode & CODING_MODE_LAST_BLOCK
4915       && coding->consumed == src_bytes)
4916     coding->result = CODING_FINISH_NORMAL;
4917
4918   if (coding->mode & CODING_MODE_LAST_BLOCK
4919       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4920     {
4921       const unsigned char *src = source + coding->consumed;
4922       unsigned char *dst = destination + coding->produced;
4923
4924       src_bytes -= coding->consumed;
4925       coding->errors++;
4926       if (COMPOSING_P (coding))
4927         DECODE_COMPOSITION_END ('1');
4928       while (src_bytes--)
4929         {
4930           int c = *src++;
4931           dst += CHAR_STRING (c, dst);
4932           coding->produced_char++;
4933         }
4934       coding->consumed = coding->consumed_char = src - source;
4935       coding->produced = dst - destination;
4936       coding->result = CODING_FINISH_NORMAL;
4937     }
4938
4939   if (!coding->dst_multibyte)
4940     {
4941       coding->produced = str_as_unibyte (destination, coding->produced);
4942       coding->produced_char = coding->produced;
4943     }
4944
4945   return coding->result;
4946 }
4947
4948 /* See "GENERAL NOTES about `encode_coding_XXX ()' functions".  The
4949    multibyteness of the source is CODING->src_multibyte, the
4950    multibyteness of the result is always unibyte.  */
4951
4952 int
4953 encode_coding (coding, source, destination, src_bytes, dst_bytes)
4954      struct coding_system *coding;
4955      const unsigned char *source;
4956      unsigned char *destination;
4957      int src_bytes, dst_bytes;
4958 {
4959   coding->produced = coding->produced_char = 0;
4960   coding->consumed = coding->consumed_char = 0;
4961   coding->errors = 0;
4962   coding->result = CODING_FINISH_NORMAL;
4963
4964   switch (coding->type)
4965     {
4966     case coding_type_sjis:
4967       encode_coding_sjis_big5 (coding, source, destination,
4968                                src_bytes, dst_bytes, 1);
4969       break;
4970
4971     case coding_type_iso2022:
4972       encode_coding_iso2022 (coding, source, destination,
4973                              src_bytes, dst_bytes);
4974       break;
4975
4976     case coding_type_big5:
4977       encode_coding_sjis_big5 (coding, source, destination,
4978                                src_bytes, dst_bytes, 0);
4979       break;
4980
4981     case coding_type_emacs_mule:
4982       encode_coding_emacs_mule (coding, source, destination,
4983                                 src_bytes, dst_bytes);
4984       break;
4985
4986     case coding_type_ccl:
4987       ccl_coding_driver (coding, source, destination,
4988                          src_bytes, dst_bytes, 1);
4989       break;
4990
4991     default:
4992       encode_eol (coding, source, destination, src_bytes, dst_bytes);
4993     }
4994
4995   if (coding->mode & CODING_MODE_LAST_BLOCK
4996       && coding->result == CODING_FINISH_INSUFFICIENT_SRC)
4997     {
4998       const unsigned char *src = source + coding->consumed;
4999       unsigned char *dst = destination + coding->produced;
5000
5001       if (coding->type == coding_type_iso2022)
5002         ENCODE_RESET_PLANE_AND_REGISTER;
5003       if (COMPOSING_P (coding))
5004         *dst++ = ISO_CODE_ESC, *dst++ = '1';
5005       if (coding->consumed < src_bytes)
5006         {
5007           int len = src_bytes - coding->consumed;
5008
5009           BCOPY_SHORT (src, dst, len);
5010           if (coding->src_multibyte)
5011             len = str_as_unibyte (dst, len);
5012           dst += len;
5013           coding->consumed = src_bytes;
5014         }
5015       coding->produced = coding->produced_char = dst - destination;
5016       coding->result = CODING_FINISH_NORMAL;
5017     }
5018
5019   if (coding->result == CODING_FINISH_INSUFFICIENT_SRC
5020       && coding->consumed == src_bytes)
5021     coding->result = CODING_FINISH_NORMAL;
5022
5023   return coding->result;
5024 }
5025
5026 /* Scan text in the region between *BEG and *END (byte positions),
5027    skip characters which we don't have to decode by coding system
5028    CODING at the head and tail, then set *BEG and *END to the region
5029    of the text we actually have to convert.  The caller should move
5030    the gap out of the region in advance if the region is from a
5031    buffer.
5032
5033    If STR is not NULL, *BEG and *END are indices into STR.  */
5034
5035 static void
5036 shrink_decoding_region (beg, end, coding, str)
5037      int *beg, *end;
5038      struct coding_system *coding;
5039      unsigned char *str;
5040 {
5041   unsigned char *begp_orig, *begp, *endp_orig, *endp, c;
5042   int eol_conversion;
5043   Lisp_Object translation_table;
5044
5045   if (coding->type == coding_type_ccl
5046       || coding->type == coding_type_undecided
5047       || coding->eol_type != CODING_EOL_LF
5048       || !NILP (coding->post_read_conversion)
5049       || coding->composing != COMPOSITION_DISABLED)
5050     {
5051       /* We can't skip any data.  */
5052       return;
5053     }
5054   if (coding->type == coding_type_no_conversion
5055       || coding->type == coding_type_raw_text
5056       || coding->type == coding_type_emacs_mule)
5057     {
5058       /* We need no conversion, but don't have to skip any data here.
5059          Decoding routine handles them effectively anyway.  */
5060       return;
5061     }
5062
5063   translation_table = coding->translation_table_for_decode;
5064   if (NILP (translation_table) && !NILP (Venable_character_translation))
5065     translation_table = Vstandard_translation_table_for_decode;
5066   if (CHAR_TABLE_P (translation_table))
5067     {
5068       int i;
5069       for (i = 0; i < 128; i++)
5070         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5071           break;
5072       if (i < 128)
5073         /* Some ASCII character should be translated.  We give up
5074            shrinking.  */
5075         return;
5076     }
5077
5078   if (coding->heading_ascii >= 0)
5079     /* Detection routine has already found how much we can skip at the
5080        head.  */
5081     *beg += coding->heading_ascii;
5082
5083   if (str)
5084     {
5085       begp_orig = begp = str + *beg;
5086       endp_orig = endp = str + *end;
5087     }
5088   else
5089     {
5090       begp_orig = begp = BYTE_POS_ADDR (*beg);
5091       endp_orig = endp = begp + *end - *beg;
5092     }
5093
5094   eol_conversion = (coding->eol_type == CODING_EOL_CR
5095                     || coding->eol_type == CODING_EOL_CRLF);
5096
5097   switch (coding->type)
5098     {
5099     case coding_type_sjis:
5100     case coding_type_big5:
5101       /* We can skip all ASCII characters at the head.  */
5102       if (coding->heading_ascii < 0)
5103         {
5104           if (eol_conversion)
5105             while (begp < endp && *begp < 0x80 && *begp != '\r') begp++;
5106           else
5107             while (begp < endp && *begp < 0x80) begp++;
5108         }
5109       /* We can skip all ASCII characters at the tail except for the
5110          second byte of SJIS or BIG5 code.  */
5111       if (eol_conversion)
5112         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\r') endp--;
5113       else
5114         while (begp < endp && endp[-1] < 0x80) endp--;
5115       /* Do not consider LF as ascii if preceded by CR, since that
5116          confuses eol decoding. */
5117       if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5118         endp++;
5119       if (begp < endp && endp < endp_orig && endp[-1] >= 0x80)
5120         endp++;
5121       break;
5122
5123     case coding_type_iso2022:
5124       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5125         /* We can't skip any data.  */
5126         break;
5127       if (coding->heading_ascii < 0)
5128         {
5129           /* We can skip all ASCII characters at the head except for a
5130              few control codes.  */
5131           while (begp < endp && (c = *begp) < 0x80
5132                  && c != ISO_CODE_CR && c != ISO_CODE_SO
5133                  && c != ISO_CODE_SI && c != ISO_CODE_ESC
5134                  && (!eol_conversion || c != ISO_CODE_LF))
5135             begp++;
5136         }
5137       switch (coding->category_idx)
5138         {
5139         case CODING_CATEGORY_IDX_ISO_8_1:
5140         case CODING_CATEGORY_IDX_ISO_8_2:
5141           /* We can skip all ASCII characters at the tail.  */
5142           if (eol_conversion)
5143             while (begp < endp && (c = endp[-1]) < 0x80 && c != '\r') endp--;
5144           else
5145             while (begp < endp && endp[-1] < 0x80) endp--;
5146           /* Do not consider LF as ascii if preceded by CR, since that
5147              confuses eol decoding. */
5148           if (begp < endp && endp < endp_orig && endp[-1] == '\r' && endp[0] == '\n')
5149             endp++;
5150           break;
5151
5152         case CODING_CATEGORY_IDX_ISO_7:
5153         case CODING_CATEGORY_IDX_ISO_7_TIGHT:
5154           {
5155             /* We can skip all characters at the tail except for 8-bit
5156                codes and ESC and the following 2-byte at the tail.  */
5157             unsigned char *eight_bit = NULL;
5158
5159             if (eol_conversion)
5160               while (begp < endp
5161                      && (c = endp[-1]) != ISO_CODE_ESC && c != '\r')
5162                 {
5163                   if (!eight_bit && c & 0x80) eight_bit = endp;
5164                   endp--;
5165                 }
5166             else
5167               while (begp < endp
5168                      && (c = endp[-1]) != ISO_CODE_ESC)
5169                 {
5170                   if (!eight_bit && c & 0x80) eight_bit = endp;
5171                   endp--;
5172                 }
5173             /* Do not consider LF as ascii if preceded by CR, since that
5174                confuses eol decoding. */
5175             if (begp < endp && endp < endp_orig
5176                 && endp[-1] == '\r' && endp[0] == '\n')
5177               endp++;
5178             if (begp < endp && endp[-1] == ISO_CODE_ESC)
5179               {
5180                 if (endp + 1 < endp_orig && end[0] == '(' && end[1] == 'B')
5181                   /* This is an ASCII designation sequence.  We can
5182                      surely skip the tail.  But, if we have
5183                      encountered an 8-bit code, skip only the codes
5184                      after that.  */
5185                   endp = eight_bit ? eight_bit : endp + 2;
5186                 else
5187                   /* Hmmm, we can't skip the tail.  */
5188                   endp = endp_orig;
5189               }
5190             else if (eight_bit)
5191               endp = eight_bit;
5192           }
5193         }
5194       break;
5195
5196     default:
5197       abort ();
5198     }
5199   *beg += begp - begp_orig;
5200   *end += endp - endp_orig;
5201   return;
5202 }
5203
5204 /* Like shrink_decoding_region but for encoding.  */
5205
5206 static void
5207 shrink_encoding_region (beg, end, coding, str)
5208      int *beg, *end;
5209      struct coding_system *coding;
5210      unsigned char *str;
5211 {
5212   unsigned char *begp_orig, *begp, *endp_orig, *endp;
5213   int eol_conversion;
5214   Lisp_Object translation_table;
5215
5216   if (coding->type == coding_type_ccl
5217       || coding->eol_type == CODING_EOL_CRLF
5218       || coding->eol_type == CODING_EOL_CR
5219       || (coding->cmp_data && coding->cmp_data->used > 0))
5220     {
5221       /* We can't skip any data.  */
5222       return;
5223     }
5224   if (coding->type == coding_type_no_conversion
5225       || coding->type == coding_type_raw_text
5226       || coding->type == coding_type_emacs_mule
5227       || coding->type == coding_type_undecided)
5228     {
5229       /* We need no conversion, but don't have to skip any data here.
5230          Encoding routine handles them effectively anyway.  */
5231       return;
5232     }
5233
5234   translation_table = coding->translation_table_for_encode;
5235   if (NILP (translation_table) && !NILP (Venable_character_translation))
5236     translation_table = Vstandard_translation_table_for_encode;
5237   if (CHAR_TABLE_P (translation_table))
5238     {
5239       int i;
5240       for (i = 0; i < 128; i++)
5241         if (!NILP (CHAR_TABLE_REF (translation_table, i)))
5242           break;
5243       if (i < 128)
5244         /* Some ASCII character should be translated.  We give up
5245            shrinking.  */
5246         return;
5247     }
5248
5249   if (str)
5250     {
5251       begp_orig = begp = str + *beg;
5252       endp_orig = endp = str + *end;
5253     }
5254   else
5255     {
5256       begp_orig = begp = BYTE_POS_ADDR (*beg);
5257       endp_orig = endp = begp + *end - *beg;
5258     }
5259
5260   eol_conversion = (coding->eol_type == CODING_EOL_CR
5261                     || coding->eol_type == CODING_EOL_CRLF);
5262
5263   /* Here, we don't have to check coding->pre_write_conversion because
5264      the caller is expected to have handled it already.  */
5265   switch (coding->type)
5266     {
5267     case coding_type_iso2022:
5268       if (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding, 0) != CHARSET_ASCII)
5269         /* We can't skip any data.  */
5270         break;
5271       if (coding->flags & CODING_FLAG_ISO_DESIGNATE_AT_BOL)
5272         {
5273           unsigned char *bol = begp;
5274           while (begp < endp && *begp < 0x80)
5275             {
5276               begp++;
5277               if (begp[-1] == '\n')
5278                 bol = begp;
5279             }
5280           begp = bol;
5281           goto label_skip_tail;
5282         }
5283       /* fall down ... */
5284
5285     case coding_type_sjis:
5286     case coding_type_big5:
5287       /* We can skip all ASCII characters at the head and tail.  */
5288       if (eol_conversion)
5289         while (begp < endp && *begp < 0x80 && *begp != '\n') begp++;
5290       else
5291         while (begp < endp && *begp < 0x80) begp++;
5292     label_skip_tail:
5293       if (eol_conversion)
5294         while (begp < endp && endp[-1] < 0x80 && endp[-1] != '\n') endp--;
5295       else
5296         while (begp < endp && *(endp - 1) < 0x80) endp--;
5297       break;
5298
5299     default:
5300       abort ();
5301     }
5302
5303   *beg += begp - begp_orig;
5304   *end += endp - endp_orig;
5305   return;
5306 }
5307
/* Shrinking a conversion region has a cost of its own, so it is
   attempted only when the region is longer (in bytes) than this
   value.  */
static int shrink_conversion_region_threshhold = 1024;

/* Shrink the region *BEG..*END by calling shrink_encoding_region (if
   ENCODEP is nonzero) or shrink_decoding_region (otherwise), but only
   when the region is longer than
   shrink_conversion_region_threshhold.  */
#define SHRINK_CONVERSION_REGION(beg, end, coding, str, encodep)        \
  do                                                                    \
    {                                                                   \
      if (*(end) - *(beg) > shrink_conversion_region_threshhold)        \
        {                                                               \
          if (! (encodep))                                              \
            shrink_decoding_region (beg, end, coding, str);             \
          else                                                          \
            shrink_encoding_region (beg, end, coding, str);             \
        }                                                               \
    }                                                                   \
  while (0)
5321
5322 static Lisp_Object
5323 code_convert_region_unwind (arg)
5324      Lisp_Object arg;
5325 {
5326   inhibit_pre_post_conversion = 0;
5327   Vlast_coding_system_used = arg;
5328   return Qnil;
5329 }
5330
5331 /* Store information about all compositions in the range FROM and TO
5332    of OBJ in memory blocks pointed by CODING->cmp_data.  OBJ is a
5333    buffer or a string, defaults to the current buffer.  */
5334
5335 void
5336 coding_save_composition (coding, from, to, obj)
5337      struct coding_system *coding;
5338      int from, to;
5339      Lisp_Object obj;
5340 {
5341   Lisp_Object prop;
5342   int start, end;
5343
5344   if (coding->composing == COMPOSITION_DISABLED)
5345     return;
5346   if (!coding->cmp_data)
5347     coding_allocate_composition_data (coding, from);
5348   if (!find_composition (from, to, &start, &end, &prop, obj)
5349       || end > to)
5350     return;
5351   if (start < from
5352       && (!find_composition (end, to, &start, &end, &prop, obj)
5353           || end > to))
5354     return;
5355   coding->composing = COMPOSITION_NO;
5356   do
5357     {
5358       if (COMPOSITION_VALID_P (start, end, prop))
5359         {
5360           enum composition_method method = COMPOSITION_METHOD (prop);
5361           if (coding->cmp_data->used + COMPOSITION_DATA_MAX_BUNCH_LENGTH
5362               >= COMPOSITION_DATA_SIZE)
5363             coding_allocate_composition_data (coding, from);
5364           /* For relative composition, we remember start and end
5365              positions, for the other compositions, we also remember
5366              components.  */
5367           CODING_ADD_COMPOSITION_START (coding, start - from, method);
5368           if (method != COMPOSITION_RELATIVE)
5369             {
5370               /* We must store a*/
5371               Lisp_Object val, ch;
5372
5373               val = COMPOSITION_COMPONENTS (prop);
5374               if (CONSP (val))
5375                 while (CONSP (val))
5376                   {
5377                     ch = XCAR (val), val = XCDR (val);
5378                     CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5379                   }
5380               else if (VECTORP (val) || STRINGP (val))
5381                 {
5382                   int len = (VECTORP (val)
5383                              ? XVECTOR (val)->size : SCHARS (val));
5384                   int i;
5385                   for (i = 0; i < len; i++)
5386                     {
5387                       ch = (STRINGP (val)
5388                             ? Faref (val, make_number (i))
5389                             : XVECTOR (val)->contents[i]);
5390                       CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (ch));
5391                     }
5392                 }
5393               else              /* INTEGERP (val) */
5394                 CODING_ADD_COMPOSITION_COMPONENT (coding, XINT (val));
5395             }
5396           CODING_ADD_COMPOSITION_END (coding, end - from);
5397         }
5398       start = end;
5399     }
5400   while (start < to
5401          && find_composition (start, to, &start, &end, &prop, obj)
5402          && end <= to);
5403
5404   /* Make coding->cmp_data point to the first memory block.  */
5405   while (coding->cmp_data->prev)
5406     coding->cmp_data = coding->cmp_data->prev;
5407   coding->cmp_data_start = 0;
5408 }
5409
5410 /* Reflect the saved information about compositions to OBJ.
5411    CODING->cmp_data points to a memory block for the information.  OBJ
5412    is a buffer or a string, defaults to the current buffer.  */
5413
5414 void
5415 coding_restore_composition (coding, obj)
5416      struct coding_system *coding;
5417      Lisp_Object obj;
5418 {
5419   struct composition_data *cmp_data = coding->cmp_data;
5420
5421   if (!cmp_data)
5422     return;
5423
5424   while (cmp_data->prev)
5425     cmp_data = cmp_data->prev;
5426
5427   while (cmp_data)
5428     {
5429       int i;
5430
5431       for (i = 0; i < cmp_data->used && cmp_data->data[i] > 0;
5432            i += cmp_data->data[i])
5433         {
5434           int *data = cmp_data->data + i;
5435           enum composition_method method = (enum composition_method) data[3];
5436           Lisp_Object components;
5437
5438           if (method == COMPOSITION_RELATIVE)
5439             components = Qnil;
5440           else
5441             {
5442               int len = data[0] - 4, j;
5443               Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5444
5445               if (method == COMPOSITION_WITH_RULE_ALTCHARS
5446                   && len % 2 == 0)
5447                 len --;
5448               for (j = 0; j < len; j++)
5449                 args[j] = make_number (data[4 + j]);
5450               components = (method == COMPOSITION_WITH_ALTCHARS
5451                             ? Fstring (len, args) : Fvector (len, args));
5452             }
5453           compose_text (data[1], data[2], components, Qnil, obj);
5454         }
5455       cmp_data = cmp_data->next;
5456     }
5457 }
5458
5459 /* Decode (if ENCODEP is zero) or encode (if ENCODEP is nonzero) the
5460    text from FROM to TO (byte positions are FROM_BYTE and TO_BYTE) by
5461    coding system CODING, and return the status code of code conversion
5462    (currently, this value has no meaning).
5463
5464    How many characters (and bytes) are converted to how many
5465    characters (and bytes) are recorded in members of the structure
5466    CODING.
5467
5468    If REPLACE is nonzero, we do various things as if the original text
5469    is deleted and a new text is inserted.  See the comments in
5470    replace_range (insdel.c) to know what we are doing.
5471
5472    If REPLACE is zero, it is assumed that the source text is unibyte.
5473    Otherwise, it is assumed that the source text is multibyte.  */
5474
5475 int
5476 code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace)
5477      int from, from_byte, to, to_byte, encodep, replace;
5478      struct coding_system *coding;
5479 {
5480   int len = to - from, len_byte = to_byte - from_byte;
5481   int nchars_del = 0, nbytes_del = 0;
5482   int require, inserted, inserted_byte;
5483   int head_skip, tail_skip, total_skip = 0;
5484   Lisp_Object saved_coding_symbol;
5485   int first = 1;
5486   unsigned char *src, *dst;
5487   Lisp_Object deletion;
5488   int orig_point = PT, orig_len = len;
5489   int prev_Z;
5490   int multibyte_p = !NILP (current_buffer->enable_multibyte_characters);
5491
5492   deletion = Qnil;
5493   saved_coding_symbol = coding->symbol;
5494
5495   if (from < PT && PT < to)
5496     {
5497       TEMP_SET_PT_BOTH (from, from_byte);
5498       orig_point = from;
5499     }
5500
5501   if (replace)
5502     {
5503       int saved_from = from;
5504       int saved_inhibit_modification_hooks;
5505
5506       prepare_to_modify_buffer (from, to, &from);
5507       if (saved_from != from)
5508         {
5509           to = from + len;
5510           from_byte = CHAR_TO_BYTE (from), to_byte = CHAR_TO_BYTE (to);
5511           len_byte = to_byte - from_byte;
5512         }
5513
5514       /* The code conversion routine can not preserve text properties
5515          for now.  So, we must remove all text properties in the
5516          region.  Here, we must suppress all modification hooks.  */
5517       saved_inhibit_modification_hooks = inhibit_modification_hooks;
5518       inhibit_modification_hooks = 1;
5519       Fset_text_properties (make_number (from), make_number (to), Qnil, Qnil);
5520       inhibit_modification_hooks = saved_inhibit_modification_hooks;
5521     }
5522
5523   if (! encodep && CODING_REQUIRE_DETECTION (coding))
5524     {
5525       /* We must detect encoding of text and eol format.  */
5526
5527       if (from < GPT && to > GPT)
5528         move_gap_both (from, from_byte);
5529       if (coding->type == coding_type_undecided)
5530         {
5531           detect_coding (coding, BYTE_POS_ADDR (from_byte), len_byte);
5532           if (coding->type == coding_type_undecided)
5533             {
5534               /* It seems that the text contains only ASCII, but we
5535                  should not leave it undecided because the deeper
5536                  decoding routine (decode_coding) tries to detect the
5537                  encodings again in vain.  */
5538               coding->type = coding_type_emacs_mule;
5539               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
5540               /* As emacs-mule decoder will handle composition, we
5541                  need this setting to allocate coding->cmp_data
5542                  later.  */
5543               coding->composing = COMPOSITION_NO;
5544             }
5545         }
5546       if (coding->eol_type == CODING_EOL_UNDECIDED
5547           && coding->type != coding_type_ccl)
5548         {
5549           detect_eol (coding, BYTE_POS_ADDR (from_byte), len_byte);
5550           if (coding->eol_type == CODING_EOL_UNDECIDED)
5551             coding->eol_type = CODING_EOL_LF;
5552           /* We had better recover the original eol format if we
5553              encounter an inconsistent eol format while decoding.  */
5554           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
5555         }
5556     }
5557
5558   /* Now we convert the text.  */
5559
5560   /* For encoding, we must process pre-write-conversion in advance.  */
5561   if (! inhibit_pre_post_conversion
5562       && encodep
5563       && SYMBOLP (coding->pre_write_conversion)
5564       && ! NILP (Ffboundp (coding->pre_write_conversion)))
5565     {
5566       /* The function in pre-write-conversion may put a new text in a
5567          new buffer.  */
5568       struct buffer *prev = current_buffer;
5569       Lisp_Object new;
5570
5571       record_unwind_protect (code_convert_region_unwind,
5572                              Vlast_coding_system_used);
5573       /* We should not call any more pre-write/post-read-conversion
5574          functions while this pre-write-conversion is running.  */
5575       inhibit_pre_post_conversion = 1;
5576       call2 (coding->pre_write_conversion,
5577              make_number (from), make_number (to));
5578       inhibit_pre_post_conversion = 0;
5579       /* Discard the unwind protect.  */
5580       specpdl_ptr--;
5581
5582       if (current_buffer != prev)
5583         {
5584           len = ZV - BEGV;
5585           new = Fcurrent_buffer ();
5586           set_buffer_internal_1 (prev);
5587           del_range_2 (from, from_byte, to, to_byte, 0);
5588           TEMP_SET_PT_BOTH (from, from_byte);
5589           insert_from_buffer (XBUFFER (new), 1, len, 0);
5590           Fkill_buffer (new);
5591           if (orig_point >= to)
5592             orig_point += len - orig_len;
5593           else if (orig_point > from)
5594             orig_point = from;
5595           orig_len = len;
5596           to = from + len;
5597           from_byte = CHAR_TO_BYTE (from);
5598           to_byte = CHAR_TO_BYTE (to);
5599           len_byte = to_byte - from_byte;
5600           TEMP_SET_PT_BOTH (from, from_byte);
5601         }
5602     }
5603
5604   if (replace)
5605     {
5606       if (! EQ (current_buffer->undo_list, Qt))
5607         deletion = make_buffer_string_both (from, from_byte, to, to_byte, 1);
5608       else
5609         {
5610           nchars_del = to - from;
5611           nbytes_del = to_byte - from_byte;
5612         }
5613     }
5614
5615   if (coding->composing != COMPOSITION_DISABLED)
5616     {
5617       if (encodep)
5618         coding_save_composition (coding, from, to, Fcurrent_buffer ());
5619       else
5620         coding_allocate_composition_data (coding, from);
5621     }
5622
5623   /* Try to skip the heading and tailing ASCIIs.  */
5624   if (coding->type != coding_type_ccl)
5625     {
5626       int from_byte_orig = from_byte, to_byte_orig = to_byte;
5627
5628       if (from < GPT && GPT < to)
5629         move_gap_both (from, from_byte);
5630       SHRINK_CONVERSION_REGION (&from_byte, &to_byte, coding, NULL, encodep);
5631       if (from_byte == to_byte
5632           && (encodep || NILP (coding->post_read_conversion))
5633           && ! CODING_REQUIRE_FLUSHING (coding))
5634         {
5635           coding->produced = len_byte;
5636           coding->produced_char = len;
5637           if (!replace)
5638             /* We must record and adjust for this new text now.  */
5639             adjust_after_insert (from, from_byte_orig, to, to_byte_orig, len);
5640           return 0;
5641         }
5642
5643       head_skip = from_byte - from_byte_orig;
5644       tail_skip = to_byte_orig - to_byte;
5645       total_skip = head_skip + tail_skip;
5646       from += head_skip;
5647       to -= tail_skip;
5648       len -= total_skip; len_byte -= total_skip;
5649     }
5650
5651   /* For conversion, we must put the gap before the text in addition to
5652      making the gap larger for efficient decoding.  The required gap
5653      size starts from 2000 which is the magic number used in make_gap.
5654      But, after one batch of conversion, it will be incremented if we
5655      find that it is not enough .  */
5656   require = 2000;
5657
5658   if (GAP_SIZE  < require)
5659     make_gap (require - GAP_SIZE);
5660   move_gap_both (from, from_byte);
5661
5662   inserted = inserted_byte = 0;
5663
5664   GAP_SIZE += len_byte;
5665   ZV -= len;
5666   Z -= len;
5667   ZV_BYTE -= len_byte;
5668   Z_BYTE -= len_byte;
5669
5670   if (GPT - BEG < BEG_UNCHANGED)
5671     BEG_UNCHANGED = GPT - BEG;
5672   if (Z - GPT < END_UNCHANGED)
5673     END_UNCHANGED = Z - GPT;
5674
5675   if (!encodep && coding->src_multibyte)
5676     {
5677       /* Decoding routines expects that the source text is unibyte.
5678          We must convert 8-bit characters of multibyte form to
5679          unibyte.  */
5680       int len_byte_orig = len_byte;
5681       len_byte = str_as_unibyte (GAP_END_ADDR - len_byte, len_byte);
5682       if (len_byte < len_byte_orig)
5683         safe_bcopy (GAP_END_ADDR - len_byte_orig, GAP_END_ADDR - len_byte,
5684                     len_byte);
5685       coding->src_multibyte = 0;
5686     }
5687
5688   for (;;)
5689     {
5690       int result;
5691
5692       /* The buffer memory is now:
5693          +--------+converted-text+---------+-------original-text-------+---+
5694          |<-from->|<--inserted-->|---------|<--------len_byte--------->|---|
5695                   |<---------------------- GAP ----------------------->|  */
5696       src = GAP_END_ADDR - len_byte;
5697       dst = GPT_ADDR + inserted_byte;
5698
5699       if (encodep)
5700         result = encode_coding (coding, src, dst, len_byte, 0);
5701       else
5702         {
5703           if (coding->composing != COMPOSITION_DISABLED)
5704             coding->cmp_data->char_offset = from + inserted;
5705           result = decode_coding (coding, src, dst, len_byte, 0);
5706         }
5707
5708       /* The buffer memory is now:
5709          +--------+-------converted-text----+--+------original-text----+---+
5710          |<-from->|<-inserted->|<-produced->|--|<-(len_byte-consumed)->|---|
5711                   |<---------------------- GAP ----------------------->|  */
5712
5713       inserted += coding->produced_char;
5714       inserted_byte += coding->produced;
5715       len_byte -= coding->consumed;
5716
5717       if (result == CODING_FINISH_INSUFFICIENT_CMP)
5718         {
5719           coding_allocate_composition_data (coding, from + inserted);
5720           continue;
5721         }
5722
5723       src += coding->consumed;
5724       dst += coding->produced;
5725
5726       if (result == CODING_FINISH_NORMAL)
5727         {
5728           src += len_byte;
5729           break;
5730         }
5731       if (! encodep && result == CODING_FINISH_INCONSISTENT_EOL)
5732         {
5733           unsigned char *pend = dst, *p = pend - inserted_byte;
5734           Lisp_Object eol_type;
5735
5736           /* Encode LFs back to the original eol format (CR or CRLF).  */
5737           if (coding->eol_type == CODING_EOL_CR)
5738             {
5739               while (p < pend) if (*p++ == '\n') p[-1] = '\r';
5740             }
5741           else
5742             {
5743               int count = 0;
5744
5745               while (p < pend) if (*p++ == '\n') count++;
5746               if (src - dst < count)
5747                 {
5748                   /* We don't have sufficient room for encoding LFs
5749                      back to CRLF.  We must record converted and
5750                      not-yet-converted text back to the buffer
5751                      content, enlarge the gap, then record them out of
5752                      the buffer contents again.  */
5753                   int add = len_byte + inserted_byte;
5754
5755                   GAP_SIZE -= add;
5756                   ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5757                   GPT += inserted_byte; GPT_BYTE += inserted_byte;
5758                   make_gap (count - GAP_SIZE);
5759                   GAP_SIZE += add;
5760                   ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5761                   GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5762                   /* Don't forget to update SRC, DST, and PEND.  */
5763                   src = GAP_END_ADDR - len_byte;
5764                   dst = GPT_ADDR + inserted_byte;
5765                   pend = dst;
5766                 }
5767               inserted += count;
5768               inserted_byte += count;
5769               coding->produced += count;
5770               p = dst = pend + count;
5771               while (count)
5772                 {
5773                   *--p = *--pend;
5774                   if (*p == '\n') count--, *--p = '\r';
5775                 }
5776             }
5777
5778           /* Suppress eol-format conversion in the further conversion.  */
5779           coding->eol_type = CODING_EOL_LF;
5780
5781           /* Set the coding system symbol to that for Unix-like EOL.  */
5782           eol_type = Fget (saved_coding_symbol, Qeol_type);
5783           if (VECTORP (eol_type)
5784               && XVECTOR (eol_type)->size == 3
5785               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
5786             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
5787           else
5788             coding->symbol = saved_coding_symbol;
5789
5790           continue;
5791         }
5792       if (len_byte <= 0)
5793         {
5794           if (coding->type != coding_type_ccl
5795               || coding->mode & CODING_MODE_LAST_BLOCK)
5796             break;
5797           coding->mode |= CODING_MODE_LAST_BLOCK;
5798           continue;
5799         }
5800       if (result == CODING_FINISH_INSUFFICIENT_SRC)
5801         {
5802           /* The source text ends in invalid codes.  Let's just
5803              make them valid buffer contents, and finish conversion.  */
5804           if (multibyte_p)
5805             {
5806               unsigned char *start = dst;
5807
5808               inserted += len_byte;
5809               while (len_byte--)
5810                 {
5811                   int c = *src++;
5812                   dst += CHAR_STRING (c, dst);
5813                 }
5814
5815               inserted_byte += dst - start;
5816             }
5817           else
5818             {
5819               inserted += len_byte;
5820               inserted_byte += len_byte;
5821               while (len_byte--)
5822                 *dst++ = *src++;
5823             }
5824           break;
5825         }
5826       if (result == CODING_FINISH_INTERRUPT)
5827         {
5828           /* The conversion procedure was interrupted by a user.  */
5829           break;
5830         }
5831       /* Now RESULT == CODING_FINISH_INSUFFICIENT_DST  */
5832       if (coding->consumed < 1)
5833         {
5834           /* It's quite strange to require more memory without
5835              consuming any bytes.  Perhaps CCL program bug.  */
5836           break;
5837         }
5838       if (first)
5839         {
5840           /* We have just done the first batch of conversion which was
5841              stopped because of insufficient gap.  Let's reconsider the
5842              required gap size (i.e. SRT - DST) now.
5843
5844              We have converted ORIG bytes (== coding->consumed) into
5845              NEW bytes (coding->produced).  To convert the remaining
5846              LEN bytes, we may need REQUIRE bytes of gap, where:
5847                 REQUIRE + LEN_BYTE = LEN_BYTE * (NEW / ORIG)
5848                 REQUIRE = LEN_BYTE * (NEW - ORIG) / ORIG
5849              Here, we are sure that NEW >= ORIG.  */
5850           float ratio;
5851
5852           if (coding->produced <= coding->consumed)
5853             {
5854               /* This happens because of CCL-based coding system with
5855                  eol-type CRLF.  */
5856               require = 0;
5857             }
5858           else
5859             {
5860               ratio = (coding->produced - coding->consumed) / coding->consumed;
5861               require = len_byte * ratio;
5862             }
5863           first = 0;
5864         }
5865       if ((src - dst) < (require + 2000))
5866         {
5867           /* See the comment above the previous call of make_gap.  */
5868           int add = len_byte + inserted_byte;
5869
5870           GAP_SIZE -= add;
5871           ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
5872           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5873           make_gap (require + 2000);
5874           GAP_SIZE += add;
5875           ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
5876           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5877         }
5878     }
5879   if (src - dst > 0) *dst = 0; /* Put an anchor.  */
5880
5881   if (encodep && coding->dst_multibyte)
5882     {
5883       /* The output is unibyte.  We must convert 8-bit characters to
5884          multibyte form.  */
5885       if (inserted_byte * 2 > GAP_SIZE)
5886         {
5887           GAP_SIZE -= inserted_byte;
5888           ZV += inserted_byte; Z += inserted_byte;
5889           ZV_BYTE += inserted_byte; Z_BYTE += inserted_byte;
5890           GPT += inserted_byte; GPT_BYTE += inserted_byte;
5891           make_gap (inserted_byte - GAP_SIZE);
5892           GAP_SIZE += inserted_byte;
5893           ZV -= inserted_byte; Z -= inserted_byte;
5894           ZV_BYTE -= inserted_byte; Z_BYTE -= inserted_byte;
5895           GPT -= inserted_byte; GPT_BYTE -= inserted_byte;
5896         }
5897       inserted_byte = str_to_multibyte (GPT_ADDR, GAP_SIZE, inserted_byte);
5898     }
5899
5900   /* If we shrank the conversion area, adjust it now.  */
5901   if (total_skip > 0)
5902     {
5903       if (tail_skip > 0)
5904         safe_bcopy (GAP_END_ADDR, GPT_ADDR + inserted_byte, tail_skip);
5905       inserted += total_skip; inserted_byte += total_skip;
5906       GAP_SIZE += total_skip;
5907       GPT -= head_skip; GPT_BYTE -= head_skip;
5908       ZV -= total_skip; ZV_BYTE -= total_skip;
5909       Z -= total_skip; Z_BYTE -= total_skip;
5910       from -= head_skip; from_byte -= head_skip;
5911       to += tail_skip; to_byte += tail_skip;
5912     }
5913
5914   prev_Z = Z;
5915   if (! EQ (current_buffer->undo_list, Qt))
5916     adjust_after_replace (from, from_byte, deletion, inserted, inserted_byte);
5917   else
5918     adjust_after_replace_noundo (from, from_byte, nchars_del, nbytes_del,
5919                                  inserted, inserted_byte);
5920   inserted = Z - prev_Z;
5921
5922   if (!encodep && coding->cmp_data && coding->cmp_data->used)
5923     coding_restore_composition (coding, Fcurrent_buffer ());
5924   coding_free_composition_data (coding);
5925
5926   if (! inhibit_pre_post_conversion
5927       && ! encodep && ! NILP (coding->post_read_conversion))
5928     {
5929       Lisp_Object val;
5930       Lisp_Object saved_coding_system;
5931
5932       if (from != PT)
5933         TEMP_SET_PT_BOTH (from, from_byte);
5934       prev_Z = Z;
5935       record_unwind_protect (code_convert_region_unwind,
5936                              Vlast_coding_system_used);
5937       saved_coding_system = Vlast_coding_system_used;
5938       Vlast_coding_system_used = coding->symbol;
5939       /* We should not call any more pre-write/post-read-conversion
5940          functions while this post-read-conversion is running.  */
5941       inhibit_pre_post_conversion = 1;
5942       val = call1 (coding->post_read_conversion, make_number (inserted));
5943       inhibit_pre_post_conversion = 0;
5944       coding->symbol = Vlast_coding_system_used;
5945       Vlast_coding_system_used = saved_coding_system;
5946       /* Discard the unwind protect.  */
5947       specpdl_ptr--;
5948       CHECK_NUMBER (val);
5949       inserted += Z - prev_Z;
5950     }
5951
5952   if (orig_point >= from)
5953     {
5954       if (orig_point >= from + orig_len)
5955         orig_point += inserted - orig_len;
5956       else
5957         orig_point = from;
5958       TEMP_SET_PT (orig_point);
5959     }
5960
5961   if (replace)
5962     {
5963       signal_after_change (from, to - from, inserted);
5964       update_compositions (from, from + inserted, CHECK_BORDER);
5965     }
5966
5967   {
5968     coding->consumed = to_byte - from_byte;
5969     coding->consumed_char = to - from;
5970     coding->produced = inserted_byte;
5971     coding->produced_char = inserted;
5972   }
5973
5974   return 0;
5975 }
5976
     /* Apply CODING's pre-write-conversion function (when ENCODEP is
        nonzero) or post-read-conversion function (when ENCODEP is zero)
        to the text of STR, and return the resulting text as a string.

        The text is copied into the work buffer " *code-converting-work*",
        whose multibyteness is set to that of STR, and the function is
        called with that buffer current.  The buffer that was current on
        entry is selected again by the unwind-protect registered here.  */
5977 Lisp_Object
5978 run_pre_post_conversion_on_str (str, coding, encodep)
5979      Lisp_Object str;
5980      struct coding_system *coding;
5981      int encodep;
5982 {
5983   int count = SPECPDL_INDEX ();
5984   struct gcpro gcpro1, gcpro2;
5985   int multibyte = STRING_MULTIBYTE (str);
5986   Lisp_Object buffer;
5987   struct buffer *buf;
5988   Lisp_Object old_deactivate_mark;
5989
       /* Both unwind forms are run by the unbind_to call at the end (and
          on a non-local exit): the first reselects the original buffer,
          the second hands the current Vlast_coding_system_used to
          code_convert_region_unwind.  */
5990   record_unwind_protect (Fset_buffer, Fcurrent_buffer ());
5991   record_unwind_protect (code_convert_region_unwind,
5992                          Vlast_coding_system_used);
5993   /* It is not crucial to specbind this.  */
5994   old_deactivate_mark = Vdeactivate_mark;
5995   GCPRO2 (str, old_deactivate_mark);
5996
5997   buffer = Fget_buffer_create (build_string (" *code-converting-work*"));
5998   buf = XBUFFER (buffer);
5999
       /* Reset the state of the (possibly reused) work buffer: writable,
          not visiting a file, undo recording off (undo_list = t), and no
          overlays.  */
6000   buf->directory = current_buffer->directory;
6001   buf->read_only = Qnil;
6002   buf->filename = Qnil;
6003   buf->undo_list = Qt;
6004   buf->overlays_before = Qnil;
6005   buf->overlays_after = Qnil;
6006
6007   set_buffer_internal (buf);
6008   /* We must insert the contents of STR as is without
6009      unibyte<->multibyte conversion.  For that, we adjust the
6010      multibyteness of the working buffer to that of STR.  */
6011   Ferase_buffer ();
6012   buf->enable_multibyte_characters = multibyte ? Qt : Qnil;
6013
6014   insert_from_string (str, 0, 0,
6015                       SCHARS (str), SBYTES (str), 0);
6016   UNGCPRO;
       /* NOTE(review): presumably this flag keeps conversions performed by
          the called function from running pre/post conversion functions
          recursively -- confirm against the users of the flag.  */
6017   inhibit_pre_post_conversion = 1;
6018   if (encodep)
         /* The pre-write function receives the bounds of the whole work
            buffer.  */
6019     call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z));
6020   else
6021     {
           /* The post-read function is called with point at the beginning
              of the buffer and receives the number of characters in it.
              Whatever it leaves in Vlast_coding_system_used is stored
              back into CODING->symbol.  */
6022       Vlast_coding_system_used = coding->symbol;
6023       TEMP_SET_PT_BOTH (BEG, BEG_BYTE);
6024       call1 (coding->post_read_conversion, make_number (Z - BEG));
6025       coding->symbol = Vlast_coding_system_used;
6026     }
6027   inhibit_pre_post_conversion = 0;
6028   Vdeactivate_mark = old_deactivate_mark;
       /* The result is the whole text now in the work buffer.  */
6029   str = make_buffer_string (BEG, Z, 1);
6030   return unbind_to (count, str);
6031 }
6032
     /* Decode the text of string STR according to CODING and return the
        decoded text as a string.  If no conversion turns out to be
        necessary, STR itself is returned when NOCOPY is nonzero, and a
        copy of it otherwise.

        CODING's consumed, consumed_char, produced and produced_char
        members are set to describe the conversion.  If CODING has an
        fbound symbol as its post_read_conversion, that function is then
        run on the decoded text via run_pre_post_conversion_on_str.  */
6033 Lisp_Object
6034 decode_coding_string (str, coding, nocopy)
6035      Lisp_Object str;
6036      struct coding_system *coding;
6037      int nocopy;
6038 {
6039   int len;
6040   struct conversion_buffer buf;
6041   int from, to_byte;
6042   Lisp_Object saved_coding_symbol;
6043   int result;
6044   int require_decoding;
6045   int shrinked_bytes = 0;
6046   Lisp_Object newstr;
6047   int consumed, consumed_char, produced, produced_char;
6048
       /* FROM and TO_BYTE are byte offsets into STR delimiting the part
          that still has to be decoded.  */
6049   from = 0;
6050   to_byte = SBYTES (str);
6051
6052   saved_coding_symbol = coding->symbol;
6053   coding->src_multibyte = STRING_MULTIBYTE (str);
6054   coding->dst_multibyte = 1;
6055   if (CODING_REQUIRE_DETECTION (coding))
6056     {
6057       /* See the comments in code_convert_region.  */
6058       if (coding->type == coding_type_undecided)
6059         {
6060           detect_coding (coding, SDATA (str), to_byte);
6061           if (coding->type == coding_type_undecided)
6062             {
                   /* Detection did not settle on a type; fall back to
                      emacs-mule.  */
6063               coding->type = coding_type_emacs_mule;
6064               coding->category_idx = CODING_CATEGORY_IDX_EMACS_MULE;
6065               /* As emacs-mule decoder will handle composition, we
6066                  need this setting to allocate coding->cmp_data
6067                  later.  */
6068               coding->composing = COMPOSITION_NO;
6069             }
6070         }
6071       if (coding->eol_type == CODING_EOL_UNDECIDED
6072           && coding->type != coding_type_ccl)
6073         {
6074           saved_coding_symbol = coding->symbol;
6075           detect_eol (coding, SDATA (str), to_byte);
               /* Default to LF when no end-of-line format was detected.  */
6076           if (coding->eol_type == CODING_EOL_UNDECIDED)
6077             coding->eol_type = CODING_EOL_LF;
6078           /* We had better recover the original eol format if we
6079              encounter an inconsistent eol format while decoding.  */
6080           coding->mode |= CODING_MODE_INHIBIT_INCONSISTENT_EOL;
6081         }
6082     }
6083
       /* no-conversion and raw-text produce unibyte text.  */
6084   if (coding->type == coding_type_no_conversion
6085       || coding->type == coding_type_raw_text)
6086     coding->dst_multibyte = 0;
6087
6088   require_decoding = CODING_REQUIRE_DECODING (coding);
6089
6090   if (STRING_MULTIBYTE (str))
6091     {
6092       /* Decoding routines expect the source text to be unibyte.  */
6093       str = Fstring_as_unibyte (str);
6094       to_byte = SBYTES (str);
           /* NOTE(review): NOCOPY is forced on presumably because
              Fstring_as_unibyte has already produced a string distinct
              from the caller's -- confirm.  */
6095       nocopy = 1;
6096       coding->src_multibyte = 0;
6097     }
6098
6099   /* Try to skip the heading and tailing ASCIIs.  */
6100   if (require_decoding && coding->type != coding_type_ccl)
6101     {
6102       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6103                                 0);
6104       if (from == to_byte)
6105         require_decoding = 0;
           /* Bytes skipped at the head (FROM) plus bytes skipped at the
              tail; they are copied to the result verbatim below.  */
6106       shrinked_bytes = from + (SBYTES (str) - to_byte);
6107     }
6108
       /* Shortcut: nothing to decode and no post-read function to run.  */
6109   if (!require_decoding
6110       && !(SYMBOLP (coding->post_read_conversion)
6111            && !NILP (Ffboundp (coding->post_read_conversion))))
6112     {
6113       coding->consumed = SBYTES (str);
6114       coding->consumed_char = SCHARS (str);
6115       if (coding->dst_multibyte)
6116         {
6117           str = Fstring_as_multibyte (str);
6118           nocopy = 1;
6119         }
6120       coding->produced = SBYTES (str);
6121       coding->produced_char = SCHARS (str);
6122       return (nocopy ? str : Fcopy_sequence (str));
6123     }
6124
6125   if (coding->composing != COMPOSITION_DISABLED)
6126     coding_allocate_composition_data (coding, from);
6127   len = decoding_buffer_size (coding, to_byte - from);
6128   allocate_conversion_buffer (buf, len);
6129
       /* Call decode_coding repeatedly, accumulating the totals in the
          local counters, and fix up whatever made a pass stop early
          before trying again.  */
6130   consumed = consumed_char = produced = produced_char = 0;
6131   while (1)
6132     {
6133       result = decode_coding (coding, SDATA (str) + from + consumed,
6134                               buf.data + produced, to_byte - from - consumed,
6135                               buf.size - produced);
6136       consumed += coding->consumed;
6137       consumed_char += coding->consumed_char;
6138       produced += coding->produced;
6139       produced_char += coding->produced_char;
           /* Stop when decoding finished normally, or when the source is
              insufficient and the last pass consumed nothing at all.  */
6140       if (result == CODING_FINISH_NORMAL
6141           || (result == CODING_FINISH_INSUFFICIENT_SRC
6142               && coding->consumed == 0))
6143         break;
6144       if (result == CODING_FINISH_INSUFFICIENT_CMP)
6145         coding_allocate_composition_data (coding, from + produced_char);
6146       else if (result == CODING_FINISH_INSUFFICIENT_DST)
6147         extend_conversion_buffer (&buf);
6148       else if (result == CODING_FINISH_INCONSISTENT_EOL)
6149         {
6150           Lisp_Object eol_type;
6151
6152           /* Recover the original EOL format.  */
6153           if (coding->eol_type == CODING_EOL_CR)
6154             {
                   /* Turn every LF produced so far back into CR.  */
6155               unsigned char *p;
6156               for (p = buf.data; p < buf.data + produced; p++)
6157                 if (*p == '\n') *p = '\r';
6158             }
6159           else if (coding->eol_type == CODING_EOL_CRLF)
6160             {
                   /* Turn every LF produced so far back into CR LF.  The
                      text grows by one byte per LF, so count them first,
                      then expand in place from the end toward the
                      beginning.  */
6161               int num_eol = 0;
6162               unsigned char *p0, *p1;
6163               for (p0 = buf.data, p1 = p0 + produced; p0 < p1; p0++)
6164                 if (*p0 == '\n') num_eol++;
6165               if (produced + num_eol >= buf.size)
6166                 extend_conversion_buffer (&buf);
6167               for (p0 = buf.data + produced, p1 = p0 + num_eol; p0 > buf.data;)
6168                 {
6169                   *--p1 = *--p0;
6170                   if (*p0 == '\n') *--p1 = '\r';
6171                 }
6172               produced += num_eol;
6173               produced_char += num_eol;
6174             }
6175           /* Suppress eol-format conversion in the further conversion.  */
6176           coding->eol_type = CODING_EOL_LF;
6177
6178           /* Set the coding system symbol to that for Unix-like EOL.  */
6179           eol_type = Fget (saved_coding_symbol, Qeol_type);
6180           if (VECTORP (eol_type)
6181               && XVECTOR (eol_type)->size == 3
6182               && SYMBOLP (XVECTOR (eol_type)->contents[CODING_EOL_LF]))
6183             coding->symbol = XVECTOR (eol_type)->contents[CODING_EOL_LF];
6184           else
6185             coding->symbol = saved_coding_symbol;
6186
6187
6188         }
6189     }
6190
       /* Report the accumulated totals, not just those of the last pass.  */
6191   coding->consumed = consumed;
6192   coding->consumed_char = consumed_char;
6193   coding->produced = produced;
6194   coding->produced_char = produced_char;
6195
       /* Build the result: the skipped head of STR, the decoded text, and
          the skipped tail of STR.  */
6196   if (coding->dst_multibyte)
6197     newstr = make_uninit_multibyte_string (produced_char + shrinked_bytes,
6198                                            produced + shrinked_bytes);
6199   else
6200     newstr = make_uninit_string (produced + shrinked_bytes);
6201   if (from > 0)
6202     STRING_COPYIN (newstr, 0, SDATA (str), from);
6203   STRING_COPYIN (newstr, from, buf.data, produced);
6204   if (shrinked_bytes > from)
6205     STRING_COPYIN (newstr, from + produced,
6206                    SDATA (str) + to_byte,
6207                    shrinked_bytes - from);
6208   free_conversion_buffer (&buf);
6209
6210   if (coding->cmp_data && coding->cmp_data->used)
6211     coding_restore_composition (coding, newstr);
6212   coding_free_composition_data (coding);
6213
6214   if (SYMBOLP (coding->post_read_conversion)
6215       && !NILP (Ffboundp (coding->post_read_conversion)))
6216     newstr = run_pre_post_conversion_on_str (newstr, coding, 0);
6217
6218   return newstr;
6219 }
6220
     /* Encode the text of string STR according to CODING and return the
        encoded text as a unibyte string.  If no conversion turns out to
        be necessary, STR itself is returned when NOCOPY is nonzero, and a
        copy of it otherwise.

        If CODING has an fbound symbol as its pre_write_conversion, that
        function is first run on STR via run_pre_post_conversion_on_str.
        When the encoder is actually run, CODING's consumed,
        consumed_char, produced and produced_char members are set to the
        totals of the conversion.  */
6221 Lisp_Object
6222 encode_coding_string (str, coding, nocopy)
6223      Lisp_Object str;
6224      struct coding_system *coding;
6225      int nocopy;
6226 {
6227   int len;
6228   struct conversion_buffer buf;
6229   int from, to, to_byte;
6230   int result;
6231   int shrinked_bytes = 0;
6232   Lisp_Object newstr;
6233   int consumed, consumed_char, produced, produced_char;
6234
6235   if (SYMBOLP (coding->pre_write_conversion)
6236       && !NILP (Ffboundp (coding->pre_write_conversion)))
6237     str = run_pre_post_conversion_on_str (str, coding, 1);
6238
       /* FROM and TO_BYTE are byte offsets into STR delimiting the part
          that still has to be encoded; TO is the length in characters.  */
6239   from = 0;
6240   to = SCHARS (str);
6241   to_byte = SBYTES (str);
6242
6243   /* Encoding routines determine the multibyteness of the source text
6244      by coding->src_multibyte.  */
6245   coding->src_multibyte = STRING_MULTIBYTE (str);
6246   coding->dst_multibyte = 0;
       /* Shortcut: CODING requires no encoding at all.  */
6247   if (! CODING_REQUIRE_ENCODING (coding))
6248     {
6249       coding->consumed = SBYTES (str);
6250       coding->consumed_char = SCHARS (str);
6251       if (STRING_MULTIBYTE (str))
6252         {
6253           str = Fstring_as_unibyte (str);
6254           nocopy = 1;
6255         }
6256       coding->produced = SBYTES (str);
6257       coding->produced_char = SCHARS (str);
6258       return (nocopy ? str : Fcopy_sequence (str));
6259     }
6260
6261   if (coding->composing != COMPOSITION_DISABLED)
6262     coding_save_composition (coding, from, to, str);
6263
6264   /* Try to skip the heading and tailing ASCIIs.  */
6265   if (coding->type != coding_type_ccl)
6266     {
6267       SHRINK_CONVERSION_REGION (&from, &to_byte, coding, SDATA (str),
6268                                 1);
6269       if (from == to_byte)
             {
               /* Nothing is left to encode.  Release the composition data
                  saved above before returning, exactly as the normal exit
                  at the end of this function does; otherwise it would be
                  left allocated.  */
               coding_free_composition_data (coding);
6270           return (nocopy ? str : Fcopy_sequence (str));
             }
           /* Bytes skipped at the head (FROM) plus bytes skipped at the
              tail; they are copied to the result verbatim below.  */
6271       shrinked_bytes = from + (SBYTES (str) - to_byte);
6272     }
6273
6274   len = encoding_buffer_size (coding, to_byte - from);
6275   allocate_conversion_buffer (buf, len);
6276
       /* Call encode_coding repeatedly, accumulating the totals in the
          local counters and enlarging the destination buffer whenever a
          pass stops early.  */
6277   consumed = consumed_char = produced = produced_char = 0;
6278   while (1)
6279     {
6280       result = encode_coding (coding, SDATA (str) + from + consumed,
6281                               buf.data + produced, to_byte - from - consumed,
6282                               buf.size - produced);
6283       consumed += coding->consumed;
6284       consumed_char += coding->consumed_char;
6285       produced += coding->produced;
6286       produced_char += coding->produced_char;
           /* Stop when encoding finished normally, or when the source is
              insufficient and the last pass consumed nothing at all.  */
6287       if (result == CODING_FINISH_NORMAL
6288           || (result == CODING_FINISH_INSUFFICIENT_SRC
6289               && coding->consumed == 0))
6290         break;
6291       /* Now result should be CODING_FINISH_INSUFFICIENT_DST.  */
6292       extend_conversion_buffer (&buf);
6293     }
6294
       /* Report the accumulated totals, not just those of the last pass.  */
6295   coding->consumed = consumed;
6296   coding->consumed_char = consumed_char;
6297   coding->produced = produced;
6298   coding->produced_char = produced_char;
6299
       /* Build the result: the skipped head of STR, the encoded text, and
          the skipped tail of STR.  */
6300   newstr = make_uninit_string (produced + shrinked_bytes);
6301   if (from > 0)
6302     STRING_COPYIN (newstr, 0, SDATA (str), from);
6303   STRING_COPYIN (newstr, from, buf.data, produced);
6304   if (shrinked_bytes > from)
6305     STRING_COPYIN (newstr, from + produced,
6306                    SDATA (str) + to_byte,
6307                    shrinked_bytes - from);
6308
6309   free_conversion_buffer (&buf);
6310   coding_free_composition_data (coding);
6311
6312   return newstr;
6313 }
6314
6315 \f
6316 #ifdef emacs
6317 /*** 8. Emacs Lisp library functions ***/
6318
6319 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6320        doc: /* Return t if OBJECT is nil or a coding-system.
6321 See the documentation of `make-coding-system' for information
6322 about coding-system objects.  */)
6323      (obj)
6324      Lisp_Object obj;
6325 {
       /* nil is accepted as a coding system.  */
6326   if (NILP (obj))
6327     return Qt;
       /* Any other coding system must be named by a symbol.  */
6328   if (!SYMBOLP (obj))
6329     return Qnil;
6330   /* Get coding-spec vector for OBJ.  */
6331   obj = Fget (obj, Qcoding_system);
       /* The symbol names a coding system only if its `coding-system'
          property is a vector of exactly 5 elements.  */
6332   return ((VECTORP (obj) && XVECTOR (obj)->size == 5)
6333           ? Qt : Qnil);
6334 }
6335
6336 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6337        Sread_non_nil_coding_system, 1, 1, 0,
6338        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6339      (prompt)
6340      Lisp_Object prompt;
6341 {
6342   Lisp_Object val;
       /* Keep asking until the user enters a non-empty name.  Completion
          is done against Vcoding_system_alist and the input is recorded
          in the history named by Qcoding_system_history.  */
6343   do
6344     {
6345       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6346                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6347     }
6348   while (SCHARS (val) == 0);
       /* The entered name is returned as an interned symbol.  */
6349   return (Fintern (val, Qnil));
6350 }
6351
6352 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6353        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6354 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6355      (prompt, default_coding_system)
6356      Lisp_Object prompt, default_coding_system;
6357 {
6358   Lisp_Object val;
       /* Fcompleting_read is given the default as a string, so a symbol
          is replaced by its name.
          NOTE(review): nil is itself a symbol, so an omitted
          DEFAULT-CODING-SYSTEM presumably becomes the string "nil" here
          and is interned back to nil below -- confirm.  */
6359   if (SYMBOLP (default_coding_system))
6360     default_coding_system = SYMBOL_NAME (default_coding_system);
6361   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6362                           Qt, Qnil, Qcoding_system_history,
6363                           default_coding_system, Qnil);
       /* An empty result yields nil; anything else is interned.  */
6364   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
6365 }
6366
6367 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6368        1, 1, 0,
6369        doc: /* Check validity of CODING-SYSTEM.
6370 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6371 It is valid if it is a symbol with a non-nil `coding-system' property.
6372 The value of property should be a vector of length 5.  */)
6373      (coding_system)
6374      Lisp_Object coding_system;
6375 {
6376   CHECK_SYMBOL (coding_system);
       /* Validity is decided by Fcoding_system_p, which also accepts
          nil.  */
6377   if (!NILP (Fcoding_system_p (coding_system)))
6378     return coding_system;
       /* Signal the error with CODING-SYSTEM as its data.  The call sits
          in an endless loop, so the error is signaled again should
          Fsignal ever return; control never falls off the end of this
          function.  */
6379   while (1)
6380     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6381 }
6382 \f
6383 Lisp_Object
6384 detect_coding_system (src, src_bytes, highest, multibytep)
6385      const unsigned char *src;
6386      int src_bytes, highest;
6387      int multibytep;
6388 {
       /* Detect how the SRC_BYTES bytes at SRC are encoded.

          If HIGHEST is nonzero, return the single candidate coding system
          of highest priority, or nil if there is no candidate.  Otherwise
          return a list of all candidates, in the order in which their
          categories appear in Vcoding_category_list.  When a definite
          end-of-line format is detected, each candidate is replaced by
          its subsidiary coding system for that format.  MULTIBYTEP is
          passed on to detect_coding_mask.  */
6389   int coding_mask, eol_type;
6390   Lisp_Object val, tmp;
6391   int dummy;
6392
6393   coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep);
6394   eol_type  = detect_eol_type (src, src_bytes, &dummy);
       /* An inconsistent end-of-line format is treated as undecided.  */
6395   if (eol_type == CODING_EOL_INCONSISTENT)
6396     eol_type = CODING_EOL_UNDECIDED;
6397
       /* No category matched: answer `undecided', or its subsidiary for
          the detected end-of-line format.  */
6398   if (!coding_mask)
6399     {
6400       val = Qundecided;
6401       if (eol_type != CODING_EOL_UNDECIDED)
6402         {
6403           Lisp_Object val2;
6404           val2 = Fget (Qundecided, Qeol_type);
6405           if (VECTORP (val2))
6406             val = XVECTOR (val2)->contents[eol_type];
6407         }
6408       return (highest ? val : Fcons (val, Qnil));
6409     }
6410
6411   /* At first, gather possible coding systems in VAL.  */
6412   val = Qnil;
6413   for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp))
6414     {
6415       Lisp_Object category_val, category_index;
6416
           /* A category contributes its value when that value is non-nil
              and the bit for the category's index is set in
              CODING_MASK.  */
6417       category_index = Fget (XCAR (tmp), Qcoding_category_index);
6418       category_val = Fsymbol_value (XCAR (tmp));
6419       if (!NILP (category_val)
6420           && NATNUMP (category_index)
6421           && (coding_mask & (1 << XFASTINT (category_index))))
6422         {
6423           val = Fcons (category_val, val);
               /* Only the first (highest priority) match is wanted.  */
6424           if (highest)
6425             break;
6426         }
6427     }
       /* VAL was built by consing at the front; restore list order.  */
6428   if (!highest)
6429     val = Fnreverse (val);
6430
6431   /* Then, replace the elements with subsidiary coding systems.  */
6432   for (tmp = val; CONSP (tmp); tmp = XCDR (tmp))
6433     {
6434       if (eol_type != CODING_EOL_UNDECIDED
6435           && eol_type != CODING_EOL_INCONSISTENT)
6436         {
6437           Lisp_Object eol;
6438           eol = Fget (XCAR (tmp), Qeol_type);
6439           if (VECTORP (eol))
6440             XSETCAR (tmp, XVECTOR (eol)->contents[eol_type]);
6441         }
6442     }
       /* VAL is still nil when no category in Vcoding_category_list both
          has a non-nil value and matches CODING_MASK.  XCAR must only be
          applied to a cons, so return nil in that case.  */
6443   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
6444 }
6445
6446 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6447        2, 3, 0,
6448        doc: /* Detect how the byte sequence in the region is encoded.
6449 Return a list of possible coding systems used on decoding a byte
6450 sequence containing the bytes in the region between START and END when
6451 the coding system `undecided' is specified.  The list is ordered by
6452 priority decided in the current language environment.
6453
6454 If only ASCII characters are found, it returns a list of single element
6455 `undecided' or its subsidiary coding system according to a detected
6456 end-of-line format.
6457
6458 If optional argument HIGHEST is non-nil, return the coding system of
6459 highest priority.  */)
6460      (start, end, highest)
6461      Lisp_Object start, end, highest;
6462 {
6463   int from, to;
6464   int from_byte, to_byte;
6465   int include_anchor_byte = 0;
6466
6467   CHECK_NUMBER_COERCE_MARKER (start);
6468   CHECK_NUMBER_COERCE_MARKER (end);
6469
6470   validate_region (&start, &end);
6471   from = XINT (start), to = XINT (end);
6472   from_byte = CHAR_TO_BYTE (from);
6473   to_byte = CHAR_TO_BYTE (to);
6474
       /* The detector is handed one contiguous block of bytes.  If the
          region starts before the gap and reaches or crosses it, move the
          gap to the end of the region.  */
6475   if (from < GPT && to >= GPT)
6476     move_gap_both (to, to_byte);
6477   /* If an anchor byte `\0' follows the region, we include it in
6478      the detecting source.  Then code detectors can handle the tailing
6479      byte sequence more accurately.
6480
6481      Fix me: This is not a perfect solution.  It is better that we
6482      add one more argument, say LAST_BLOCK, to all detect_coding_XXX.
6483   */
6484   if (to == Z || (to == GPT && GAP_SIZE > 0))
6485     include_anchor_byte = 1;
6486   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6487                                to_byte - from_byte + include_anchor_byte,
6488                                !NILP (highest),
6489                                !NILP (current_buffer
6490                                       ->enable_multibyte_characters));
6491 }
6492
6493 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6494        1, 2, 0,
6495        doc: /* Detect how the byte sequence in STRING is encoded.
6496 Return a list of possible coding systems used on decoding a byte
6497 sequence containing the bytes in STRING when the coding system
6498 `undecided' is specified.  The list is ordered by priority decided in
6499 the current language environment.
6500
6501 If only ASCII characters are found, it returns a list of single element
6502 `undecided' or its subsidiary coding system according to a detected
6503 end-of-line format.
6504
6505 If optional argument HIGHEST is non-nil, return the coding system of
6506 highest priority.  */)
6507      (string, highest)
6508      Lisp_Object string, highest;
6509 {
6510   CHECK_STRING (string);
6511
       /* The detection itself is done by detect_coding_system on the raw
          bytes of STRING; HIGHEST is reduced to a C boolean and the
          multibyteness of STRING is passed along.  */
6512   return detect_coding_system (SDATA (string),
6513                                /* "+ 1" is to include the anchor byte
6514                                   `\0'.  With this, code detectors can
6515                                   handle the tailing bytes more
6516                                   accurately.  */
6517                                SBYTES (string) + 1,
6518                                !NILP (highest),
6519                                STRING_MULTIBYTE (string));
6520 }
6521
6522 /*  Subroutine for Ffind_coding_systems_region_internal.
6523
6524     Return a list of coding systems that safely encode the multibyte
6525     text between P and PEND.  SAFE_CODINGS, if non-nil, is an alist of
6526     possible coding systems.  If it is nil, it means that we have not
6527     yet found any coding systems.
6528
6529     WORK_TABLE is a copy of the char-table Vchar_coding_system_table.  An
6530     element of WORK_TABLE is set to t once the element is looked up.
6531
6532     If a non-ASCII single byte char is found, set
6533     *single_byte_char_found to 1.  */
6534
6535 static Lisp_Object
6536 find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found)
6537      unsigned char *p, *pend;
6538      Lisp_Object safe_codings, work_table;
6539      int *single_byte_char_found;
6540 {
6541   int c, len;
6542   Lisp_Object val, ch;
6543   Lisp_Object prev, tail;
6544
6545   while (p < pend)
6546     {
           /* Fetch the next character C and step P over its LEN bytes.  */
6547       c = STRING_CHAR_AND_LENGTH (p, pend - p, len);
6548       p += len;
6549       if (ASCII_BYTE_P (c))
6550         /* We can ignore ASCII characters here.  */
6551         continue;
6552       if (SINGLE_BYTE_CHAR_P (c))
6553         *single_byte_char_found = 1;
6554       if (NILP (safe_codings))
6555         /* Already all coding systems are excluded.  But, we can't
6556            terminate the loop here because non-ASCII single-byte char
6557            must be found.  */
6558         continue;
6559       /* Check the safe coding systems for C.  */
6560       ch = make_number (c);
6561       val = Faref (work_table, ch);
6562       if (EQ (val, Qt))
6563         /* This element was already checked.  Ignore it.  */
6564         continue;
6565       /* Remember that we checked this element.  */
6566       Faset (work_table, ch, Qt);
6567
           /* Walk the candidate list and unlink every coding system that
              cannot encode C.  PREV is the last element that was kept.  */
6568       for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail))
6569         {
               /* The three tables below are assigned only on the paths
                  where ENCODABLE is zero, and are read only in that
                  case.  */
6570           Lisp_Object elt, translation_table, hash_table, accept_latin_extra;
6571           int encodable;
6572
6573           elt = XCAR (tail);
6574           if (CONSP (XCDR (elt)))
6575             {
6576               /* This entry has this format now:
6577                  ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6578                           ACCEPT-LATIN-EXTRA ) */
6579               val = XCDR (elt);
6580               encodable = ! NILP (Faref (XCAR (val), ch));
6581               if (! encodable)
6582                 {
6583                   val = XCDR (val);
6584                   translation_table = XCAR (val);
6585                   hash_table = XCAR (XCDR (val));
6586                   accept_latin_extra = XCAR (XCDR (XCDR (val)));
6587                 }
6588             }
6589           else
6590             {
6591               /* This entry has this format now: ( CODING . SAFE-CHARS) */
6592               encodable = ! NILP (Faref (XCDR (elt), ch));
6593               if (! encodable)
6594                 {
6595                   /* Transform the format to:
6596                      ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE
6597                        ACCEPT-LATIN-EXTRA )  */
6598                   val = Fget (XCAR (elt), Qcoding_system);
6599                   translation_table
6600                     = Fplist_get (AREF (val, 3),
6601                                   Qtranslation_table_for_encode);
6602                   if (SYMBOLP (translation_table))
6603                     translation_table = Fget (translation_table,
6604                                               Qtranslation_table);
6605                   hash_table
6606                     = (CHAR_TABLE_P (translation_table)
6607                        ? XCHAR_TABLE (translation_table)->extras[1]
6608                        : Qnil);
6609                   accept_latin_extra
6610                     = ((EQ (AREF (val, 0), make_number (2))
6611                         && VECTORP (AREF (val, 4)))
6612                        ? AREF (AREF (val, 4), 16)
6613                        : Qnil);
6614                   XSETCAR (tail, list5 (XCAR (elt), XCDR (elt),
6615                                         translation_table, hash_table,
6616                                         accept_latin_extra));
6617                 }
6618             }
6619
               /* C is absent from SAFE-CHARS, but it still counts as
                  encodable if the translation table or its hash table has
                  an entry for it, or if it is a single-byte char that the
                  coding system accepts through Vlatin_extra_code_table.  */
6620           if (! encodable
6621               && ((CHAR_TABLE_P (translation_table)
6622                    && ! NILP (Faref (translation_table, ch)))
6623                   || (HASH_TABLE_P (hash_table)
6624                       && ! NILP (Fgethash (ch, hash_table, Qnil)))
6625                   || (SINGLE_BYTE_CHAR_P (c)
6626                       && ! NILP (accept_latin_extra)
6627                       && VECTORP (Vlatin_extra_code_table)
6628                       && ! NILP (AREF (Vlatin_extra_code_table, c)))))
6629             encodable = 1;
6630           if (encodable)
6631             prev = tail;
6632           else
6633             {
6634               /* Exclude this coding system from SAFE_CODINGS.  */
                   /* The head is dropped by advancing SAFE_CODINGS; any
                      other element is spliced out after PREV.  */
6635               if (EQ (tail, safe_codings))
6636                 safe_codings = XCDR (safe_codings);
6637               else
6638                 XSETCDR (prev, XCDR (tail));
6639             }
6640         }
6641     }
6642   return safe_codings;
6643 }
6644
6645 DEFUN ("find-coding-systems-region-internal",
6646        Ffind_coding_systems_region_internal,
6647        Sfind_coding_systems_region_internal, 2, 2, 0,
6648        doc: /* Internal use only.  */)
6649      (start, end)
6650      Lisp_Object start, end;
6651 {
6652   Lisp_Object work_table, safe_codings;
6653   int non_ascii_p = 0;
6654   int single_byte_char_found = 0;
       /* The text to examine is described as two byte ranges,
          [P1, P1END) and [P2, P2END); the second one is empty unless the
          buffer gap lies inside the region.  */
6655   const unsigned char *p1, *p1end, *p2, *p2end, *p;
6656
       /* START is either a string to examine (END is then not used) or
          the start of a region of the current buffer ending at END.  */
6657   if (STRINGP (start))
6658     {
           /* A unibyte string is reported as t.  */
6659       if (!STRING_MULTIBYTE (start))
6660         return Qt;
6661       p1 = SDATA (start), p1end = p1 + SBYTES (start);
6662       p2 = p2end = p1end;
           /* Character and byte counts differ only if some character
              occupies more than one byte.  */
6663       if (SCHARS (start) != SBYTES (start))
6664         non_ascii_p = 1;
6665     }
6666   else
6667     {
6668       int from, to, stop;
6669
6670       CHECK_NUMBER_COERCE_MARKER (start);
6671       CHECK_NUMBER_COERCE_MARKER (end);
6672       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6673         args_out_of_range (start, end);
           /* A unibyte buffer is reported as t.  */
6674       if (NILP (current_buffer->enable_multibyte_characters))
6675         return Qt;
6676       from = CHAR_TO_BYTE (XINT (start));
6677       to = CHAR_TO_BYTE (XINT (end));
           /* STOP is where the first range ends: at the gap if the gap is
              strictly inside the region, at TO otherwise.  */
6678       stop = from < GPT_BYTE && GPT_BYTE < to ? GPT_BYTE : to;
6679       p1 = BYTE_POS_ADDR (from), p1end = p1 + (stop - from);
6680       if (stop == to)
6681         p2 = p2end = p1end;
6682       else
6683         p2 = BYTE_POS_ADDR (stop), p2end = p2 + (to - stop);
6684       if (XINT (end) - XINT (start) != to - from)
6685         non_ascii_p = 1;
6686     }
6687
6688   if (!non_ascii_p)
6689     {
6690       /* We are sure that the text contains no multibyte character.
6691          Check if it contains eight-bit-graphic.  */
           /* Scan both ranges; if every byte is ASCII, the answer is t.  */
6692       p = p1;
6693       for (p = p1; p < p1end && ASCII_BYTE_P (*p); p++);
6694       if (p == p1end)
6695         {
6696           for (p = p2; p < p2end && ASCII_BYTE_P (*p); p++);
6697           if (p == p2end)
6698             return Qt;
6699         }
6700     }
6701
6702   /* The text contains non-ASCII characters.  */
6703
       /* Start from a copy of the candidate alist, so that removing
          elements from it does not alter the list held in
          Vcoding_system_safe_chars, and from a fresh work table.  */
6704   work_table = Fmake_char_table (Qchar_coding_system, Qnil);
6705   safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars));
6706
6707   safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table,
6708                                     &single_byte_char_found);
6709   if (p2 < p2end)
6710     safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table,
6711                                       &single_byte_char_found);
       /* NOTE(review): SAFE_CODINGS descends from the copy made above, so
          this EQ test presumably succeeds only when both lists are nil --
          confirm whether "no candidate was excluded" was intended.  */
6712   if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars)))
6713     safe_codings = Qt;
6714   else
6715     {
6716       /* Turn safe_codings to a list of coding systems... */
6717       Lisp_Object val;
6718
6719       if (single_byte_char_found)
6720         /* ... and append these for eight-bit chars.  */
6721         val = Fcons (Qraw_text,
6722                      Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil)));
6723       else
6724         /* ... and append generic coding systems.  */
6725         val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars));
6726
           /* Each surviving alist element contributes its car, the coding
              system, pushed onto the front of VAL.  */
6727       for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings))
6728         val = Fcons (XCAR (XCAR (safe_codings)), val);
6729       safe_codings = val;
6730     }
6731
6732   return safe_codings;
6733 }
6734
6735
6736 /* Search from position POS for such characters that are unencodable
6737    according to SAFE_CHARS, and return a list of their positions.  P
6738    points where in the memory the character at POS exists.  Limit the
6739    search at PEND or when N unencodable characters are found.
6740
6741    If SAFE_CHARS is a char table, an element for an unencodable
6742    character is nil.
6743
6744    If SAFE_CHARS is nil, all non-ASCII characters are unencodable.
6745
6746    Otherwise, SAFE_CHARS is t, and only eight-bit-control and
6747    eight-bit-graphic characters are unencodable.  */
6748
6749 static Lisp_Object
6750 unencodable_char_position (safe_chars, pos, p, pend, n)
6751      Lisp_Object safe_chars;
6752      int pos;
6753      unsigned char *p, *pend;
6754      int n;
6755 {
6756   Lisp_Object pos_list;
6757
6758   pos_list = Qnil;
6759   while (p < pend)
6760     {
6761       int len;
6762       int c = STRING_CHAR_AND_LENGTH (p, MAX_MULTIBYTE_LENGTH, len);
6763
           /* Characters below 128 are never reported.  For the others, a
              char table decides by its entry; without a char table, nil
              rejects them all and any other value rejects only codes
              below 256.  */
6764       if (c >= 128
6765           && (CHAR_TABLE_P (safe_chars)
6766               ? NILP (CHAR_TABLE_REF (safe_chars, c))
6767               : (NILP (safe_chars) || c < 256)))
6768         {
6769           pos_list = Fcons (make_number (pos), pos_list);
               /* Stop as soon as the requested number has been found.  */
6770           if (--n <= 0)
6771             break;
6772         }
           /* Advance one character: POS counts characters, P bytes.  */
6773       pos++;
6774       p += len;
6775     }
       /* Positions were pushed at the front; return them in increasing
          order.  */
6776   return Fnreverse (pos_list);
6777 }
6778
6779
6780 DEFUN ("unencodable-char-position", Funencodable_char_position,
6781        Sunencodable_char_position, 3, 5, 0,
6782        doc: /*
6783 Return position of first un-encodable character in a region.
6784 START and END specfiy the region and CODING-SYSTEM specifies the
6785 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
6786
6787 If optional 4th argument COUNT is non-nil, it specifies at most how
6788 many un-encodable characters to search.  In this case, the value is a
6789 list of positions.
6790
6791 If optional 5th argument STRING is non-nil, it is a string to search
6792 for un-encodable characters.  In that case, START and END are indexes
6793 to the string.  */)
6794      (start, end, coding_system, count, string)
6795      Lisp_Object start, end, coding_system, count, string;
6796 {
6797   int n;
6798   Lisp_Object safe_chars;
6799   struct coding_system coding;
6800   Lisp_Object positions;
6801   int from, to;
6802   unsigned char *p, *pend;
6803
       /* Set P and PEND to the memory holding the text between FROM and
          TO, taken from the current buffer or from STRING.  */
6804   if (NILP (string))
6805     {
6806       validate_region (&start, &end);
6807       from = XINT (start);
6808       to = XINT (end);
           /* Nothing is reported for a unibyte buffer.  */
6809       if (NILP (current_buffer->enable_multibyte_characters))
6810         return Qnil;
6811       p = CHAR_POS_ADDR (from);
           /* NOTE(review): TO == GPT is special-cased presumably because
              CHAR_POS_ADDR (GPT) would give the address after the gap
              rather than the end of the text before it -- confirm.  */
6812       if (to == GPT)
6813         pend = GPT_ADDR;
6814       else
6815         pend = CHAR_POS_ADDR (to);
6816     }
6817   else
6818     {
6819       CHECK_STRING (string);
6820       CHECK_NATNUM (start);
6821       CHECK_NATNUM (end);
6822       from = XINT (start);
6823       to = XINT (end);
6824       if (from > to
6825           || to > SCHARS (string))
6826         args_out_of_range_3 (string, start, end);
           /* Nothing is reported for a unibyte string.  */
6827       if (! STRING_MULTIBYTE (string))
6828         return Qnil;
6829       p = SDATA (string) + string_char_to_byte (string, from);
6830       pend = SDATA (string) + string_char_to_byte (string, to);
6831     }
6832
6833   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
6834
       /* Without COUNT, look for a single position only.  */
6835   if (NILP (count))
6836     n = 1;
6837   else
6838     {
6839       CHECK_NATNUM (count);
6840       n = XINT (count);
6841     }
6842
       /* no-conversion and raw-text are treated as able to encode
          everything.  */
6843   if (coding.type == coding_type_no_conversion
6844       || coding.type == coding_type_raw_text)
6845     return Qnil;
6846
       /* For `undecided', SAFE_CHARS is nil, which makes
          unencodable_char_position report every non-ASCII character.  */
6847   if (coding.type == coding_type_undecided)
6848     safe_chars = Qnil;
6849   else
6850     safe_chars = coding_safe_chars (coding_system);
6851
       /* The text is contiguous in memory if it comes from a string or
          if the region lies entirely on one side of the gap.  */
6852   if (STRINGP (string)
6853       || from >= GPT || to <= GPT)
6854     positions = unencodable_char_position (safe_chars, from, p, pend, n);
6855   else
6856     {
           /* The region straddles the gap: scan the part before the gap
              first and, only if fewer than N positions were found there,
              the part after it, then join the two lists.  */
6857       Lisp_Object args[2];
6858
6859       args[0] = unencodable_char_position (safe_chars, from, p, GPT_ADDR, n);
6860       n -= XINT (Flength (args[0]));
6861       if (n <= 0)
6862         positions = args[0];
6863       else
6864         {
6865           args[1] = unencodable_char_position (safe_chars, GPT, GAP_END_ADDR,
6866                                                pend, n);
6867           positions = Fappend (2, args);
6868         }
6869     }
6870
       /* Without COUNT the value is the first position (or nil), not a
          list.  */
6871   return  (NILP (count) ? Fcar (positions) : positions);
6872 }
6873
6874
6875 Lisp_Object
6876 code_convert_region1 (start, end, coding_system, encodep)
6877      Lisp_Object start, end, coding_system;
6878      int encodep;
6879 {
6880   struct coding_system coding;
6881   int from, to;
6882
6883   CHECK_NUMBER_COERCE_MARKER (start);
6884   CHECK_NUMBER_COERCE_MARKER (end);
6885   CHECK_SYMBOL (coding_system);
6886
6887   validate_region (&start, &end);
6888   from = XFASTINT (start);
6889   to = XFASTINT (end);
6890
6891   if (NILP (coding_system))
6892     return make_number (to - from);
6893
6894   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6895     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6896
6897   coding.mode |= CODING_MODE_LAST_BLOCK;
6898   coding.src_multibyte = coding.dst_multibyte
6899     = !NILP (current_buffer->enable_multibyte_characters);
6900   code_convert_region (from, CHAR_TO_BYTE (from), to, CHAR_TO_BYTE (to),
6901                        &coding, encodep, 1);
6902   Vlast_coding_system_used = coding.symbol;
6903   return make_number (coding.produced_char);
6904 }
6905
6906 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6907        3, 3, "r\nzCoding system: ",
6908        doc: /* Decode the current region from the specified coding system.
6909 When called from a program, takes three arguments:
6910 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6911 This function sets `last-coding-system-used' to the precise coding system
6912 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6913 not fully specified.)
6914 It returns the length of the decoded text.  */)
6915      (start, end, coding_system)
6916      Lisp_Object start, end, coding_system;
6917 {
6918   return code_convert_region1 (start, end, coding_system, 0);
6919 }
6920
6921 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6922        3, 3, "r\nzCoding system: ",
6923        doc: /* Encode the current region into the specified coding system.
6924 When called from a program, takes three arguments:
6925 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6926 This function sets `last-coding-system-used' to the precise coding system
6927 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6928 not fully specified.)
6929 It returns the length of the encoded text.  */)
6930      (start, end, coding_system)
6931      Lisp_Object start, end, coding_system;
6932 {
6933   return code_convert_region1 (start, end, coding_system, 1);
6934 }
6935
6936 Lisp_Object
6937 code_convert_string1 (string, coding_system, nocopy, encodep)
6938      Lisp_Object string, coding_system, nocopy;
6939      int encodep;
6940 {
6941   struct coding_system coding;
6942
6943   CHECK_STRING (string);
6944   CHECK_SYMBOL (coding_system);
6945
6946   if (NILP (coding_system))
6947     return (NILP (nocopy) ? Fcopy_sequence (string) : string);
6948
6949   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
6950     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
6951
6952   coding.mode |= CODING_MODE_LAST_BLOCK;
6953   string = (encodep
6954             ? encode_coding_string (string, &coding, !NILP (nocopy))
6955             : decode_coding_string (string, &coding, !NILP (nocopy)));
6956   Vlast_coding_system_used = coding.symbol;
6957
6958   return string;
6959 }
6960
6961 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6962        2, 3, 0,
6963        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6964 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6965 if the decoding operation is trivial.
6966 This function sets `last-coding-system-used' to the precise coding system
6967 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6968 not fully specified.)  */)
6969      (string, coding_system, nocopy)
6970      Lisp_Object string, coding_system, nocopy;
6971 {
6972   return code_convert_string1 (string, coding_system, nocopy, 0);
6973 }
6974
6975 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6976        2, 3, 0,
6977        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6978 Optional arg NOCOPY non-nil means it is OK to return STRING itself
6979 if the encoding operation is trivial.
6980 This function sets `last-coding-system-used' to the precise coding system
6981 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6982 not fully specified.)  */)
6983      (string, coding_system, nocopy)
6984      Lisp_Object string, coding_system, nocopy;
6985 {
6986   return code_convert_string1 (string, coding_system, nocopy, 1);
6987 }
6988
6989 /* Encode or decode STRING according to CODING_SYSTEM.
6990    Do not set Vlast_coding_system_used.
6991
6992    This function is called only from macros DECODE_FILE and
6993    ENCODE_FILE, thus we ignore character composition.  */
6994
6995 Lisp_Object
6996 code_convert_string_norecord (string, coding_system, encodep)
6997      Lisp_Object string, coding_system;
6998      int encodep;
6999 {
7000   struct coding_system coding;
7001
7002   CHECK_STRING (string);
7003   CHECK_SYMBOL (coding_system);
7004
7005   if (NILP (coding_system))
7006     return string;
7007
7008   if (setup_coding_system (Fcheck_coding_system (coding_system), &coding) < 0)
7009     error ("Invalid coding system: %s", SDATA (SYMBOL_NAME (coding_system)));
7010
7011   coding.composing = COMPOSITION_DISABLED;
7012   coding.mode |= CODING_MODE_LAST_BLOCK;
7013   return (encodep
7014           ? encode_coding_string (string, &coding, 1)
7015           : decode_coding_string (string, &coding, 1));
7016 }
7017 \f
7018 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7019        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7020 Return the corresponding character.  */)
7021      (code)
7022      Lisp_Object code;
7023 {
7024   unsigned char c1, c2, s1, s2;
7025   Lisp_Object val;
7026
7027   CHECK_NUMBER (code);
7028   s1 = (XFASTINT (code)) >> 8, s2 = (XFASTINT (code)) & 0xFF;
7029   if (s1 == 0)
7030     {
7031       if (s2 < 0x80)
7032         XSETFASTINT (val, s2);
7033       else if (s2 >= 0xA0 || s2 <= 0xDF)
7034         XSETFASTINT (val, MAKE_CHAR (charset_katakana_jisx0201, s2, 0));
7035       else
7036         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7037     }
7038   else
7039     {
7040       if ((s1 < 0x80 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF)
7041           || (s2 < 0x40 || s2 == 0x7F || s2 > 0xFC))
7042         error ("Invalid Shift JIS code: %x", XFASTINT (code));
7043       DECODE_SJIS (s1, s2, c1, c2);
7044       XSETFASTINT (val, MAKE_CHAR (charset_jisx0208, c1, c2));
7045     }
7046   return val;
7047 }
7048
7049 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7050        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7051 Return the corresponding code in SJIS.  */)
7052      (ch)
7053      Lisp_Object ch;
7054 {
7055   int charset, c1, c2, s1, s2;
7056   Lisp_Object val;
7057
7058   CHECK_NUMBER (ch);
7059   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7060   if (charset == CHARSET_ASCII)
7061     {
7062       val = ch;
7063     }
7064   else if (charset == charset_jisx0208
7065            && c1 > 0x20 && c1 < 0x7F && c2 > 0x20 && c2 < 0x7F)
7066     {
7067       ENCODE_SJIS (c1, c2, s1, s2);
7068       XSETFASTINT (val, (s1 << 8) | s2);
7069     }
7070   else if (charset == charset_katakana_jisx0201
7071            && c1 > 0x20 && c2 < 0xE0)
7072     {
7073       XSETFASTINT (val, c1 | 0x80);
7074     }
7075   else
7076     error ("Can't encode to shift_jis: %d", XFASTINT (ch));
7077   return val;
7078 }
7079
7080 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7081        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7082 Return the corresponding character.  */)
7083      (code)
7084      Lisp_Object code;
7085 {
7086   int charset;
7087   unsigned char b1, b2, c1, c2;
7088   Lisp_Object val;
7089
7090   CHECK_NUMBER (code);
7091   b1 = (XFASTINT (code)) >> 8, b2 = (XFASTINT (code)) & 0xFF;
7092   if (b1 == 0)
7093     {
7094       if (b2 >= 0x80)
7095         error ("Invalid BIG5 code: %x", XFASTINT (code));
7096       val = code;
7097     }
7098   else
7099     {
7100       if ((b1 < 0xA1 || b1 > 0xFE)
7101           || (b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE))
7102         error ("Invalid BIG5 code: %x", XFASTINT (code));
7103       DECODE_BIG5 (b1, b2, charset, c1, c2);
7104       XSETFASTINT (val, MAKE_CHAR (charset, c1, c2));
7105     }
7106   return val;
7107 }
7108
7109 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7110        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7111 Return the corresponding character code in Big5.  */)
7112      (ch)
7113      Lisp_Object ch;
7114 {
7115   int charset, c1, c2, b1, b2;
7116   Lisp_Object val;
7117
7118   CHECK_NUMBER (ch);
7119   SPLIT_CHAR (XFASTINT (ch), charset, c1, c2);
7120   if (charset == CHARSET_ASCII)
7121     {
7122       val = ch;
7123     }
7124   else if ((charset == charset_big5_1
7125             && (XFASTINT (ch) >= 0x250a1 && XFASTINT (ch) <= 0x271ec))
7126            || (charset == charset_big5_2
7127                && XFASTINT (ch) >= 0x290a1 && XFASTINT (ch) <= 0x2bdb2))
7128     {
7129       ENCODE_BIG5 (charset, c1, c2, b1, b2);
7130       XSETFASTINT (val, (b1 << 8) | b2);
7131     }
7132   else
7133     error ("Can't encode to Big5: %d", XFASTINT (ch));
7134   return val;
7135 }
7136 \f
7137 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
7138        Sset_terminal_coding_system_internal, 1, 1, 0,
7139        doc: /* Internal use only.  */)
7140      (coding_system)
7141      Lisp_Object coding_system;
7142 {
7143   CHECK_SYMBOL (coding_system);
7144   setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding);
7145   /* We had better not send unsafe characters to terminal.  */
7146   terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR;
7147   /* Character composition should be disabled.  */
7148   terminal_coding.composing = COMPOSITION_DISABLED;
7149   /* Error notification should be suppressed.  */
7150   terminal_coding.suppress_error = 1;
7151   terminal_coding.src_multibyte = 1;
7152   terminal_coding.dst_multibyte = 0;
7153   return Qnil;
7154 }
7155
7156 DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal,
7157        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7158        doc: /* Internal use only.  */)
7159      (coding_system)
7160      Lisp_Object coding_system;
7161 {
7162   CHECK_SYMBOL (coding_system);
7163   setup_coding_system (Fcheck_coding_system (coding_system),
7164                        &safe_terminal_coding);
7165   /* Character composition should be disabled.  */
7166   safe_terminal_coding.composing = COMPOSITION_DISABLED;
7167   /* Error notification should be suppressed.  */
7168   terminal_coding.suppress_error = 1;
7169   safe_terminal_coding.src_multibyte = 1;
7170   safe_terminal_coding.dst_multibyte = 0;
7171   return Qnil;
7172 }
7173
7174 DEFUN ("terminal-coding-system", Fterminal_coding_system,
7175        Sterminal_coding_system, 0, 0, 0,
7176        doc: /* Return coding system specified for terminal output.  */)
7177      ()
7178 {
7179   return terminal_coding.symbol;
7180 }
7181
7182 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
7183        Sset_keyboard_coding_system_internal, 1, 1, 0,
7184        doc: /* Internal use only.  */)
7185      (coding_system)
7186      Lisp_Object coding_system;
7187 {
7188   CHECK_SYMBOL (coding_system);
7189   setup_coding_system (Fcheck_coding_system (coding_system), &keyboard_coding);
7190   /* Character composition should be disabled.  */
7191   keyboard_coding.composing = COMPOSITION_DISABLED;
7192   return Qnil;
7193 }
7194
7195 DEFUN ("keyboard-coding-system", Fkeyboard_coding_system,
7196        Skeyboard_coding_system, 0, 0, 0,
7197        doc: /* Return coding system specified for decoding keyboard input.  */)
7198      ()
7199 {
7200   return keyboard_coding.symbol;
7201 }
7202
7203 \f
7204 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7205        Sfind_operation_coding_system,  1, MANY, 0,
7206        doc: /* Choose a coding system for an operation based on the target name.
7207 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7208 DECODING-SYSTEM is the coding system to use for decoding
7209 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7210 for encoding (in case OPERATION does encoding).
7211
7212 The first argument OPERATION specifies an I/O primitive:
7213   For file I/O, `insert-file-contents' or `write-region'.
7214   For process I/O, `call-process', `call-process-region', or `start-process'.
7215   For network I/O, `open-network-stream'.
7216
7217 The remaining arguments should be the same arguments that were passed
7218 to the primitive.  Depending on which primitive, one of those arguments
7219 is selected as the TARGET.  For example, if OPERATION does file I/O,
7220 whichever argument specifies the file name is TARGET.
7221
7222 TARGET has a meaning which depends on OPERATION:
7223   For file I/O, TARGET is a file name.
7224   For process I/O, TARGET is a process name.
7225   For network I/O, TARGET is a service name or a port number
7226
7227 This function looks up what specified for TARGET in,
7228 `file-coding-system-alist', `process-coding-system-alist',
7229 or `network-coding-system-alist' depending on OPERATION.
7230 They may specify a coding system, a cons of coding systems,
7231 or a function symbol to call.
7232 In the last case, we call the function with one argument,
7233 which is a list of all the arguments given to this function.
7234
7235 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7236      (nargs, args)
7237      int nargs;
7238      Lisp_Object *args;
7239 {
7240   Lisp_Object operation, target_idx, target, val;
7241   register Lisp_Object chain;
7242
7243   if (nargs < 2)
7244     error ("Too few arguments");
7245   operation = args[0];
7246   if (!SYMBOLP (operation)
7247       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7248     error ("Invalid first argument");
7249   if (nargs < 1 + XINT (target_idx))
7250     error ("Too few arguments for operation: %s",
7251            SDATA (SYMBOL_NAME (operation)));
7252   /* For write-region, if the 6th argument (i.e. VISIT, the 5th
7253      argument to write-region) is string, it must be treated as a
7254      target file name.  */
7255   if (EQ (operation, Qwrite_region)
7256       && nargs > 5
7257       && STRINGP (args[5]))
7258     target_idx = make_number (4);
7259   target = args[XINT (target_idx) + 1];
7260   if (!(STRINGP (target)
7261         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7262     error ("Invalid argument %d", XINT (target_idx) + 1);
7263
7264   chain = ((EQ (operation, Qinsert_file_contents)
7265             || EQ (operation, Qwrite_region))
7266            ? Vfile_coding_system_alist
7267            : (EQ (operation, Qopen_network_stream)
7268               ? Vnetwork_coding_system_alist
7269               : Vprocess_coding_system_alist));
7270   if (NILP (chain))
7271     return Qnil;
7272
7273   for (; CONSP (chain); chain = XCDR (chain))
7274     {
7275       Lisp_Object elt;
7276       elt = XCAR (chain);
7277
7278       if (CONSP (elt)
7279           && ((STRINGP (target)
7280                && STRINGP (XCAR (elt))
7281                && fast_string_match (XCAR (elt), target) >= 0)
7282               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7283         {
7284           val = XCDR (elt);
7285           /* Here, if VAL is both a valid coding system and a valid
7286              function symbol, we return VAL as a coding system.  */
7287           if (CONSP (val))
7288             return val;
7289           if (! SYMBOLP (val))
7290             return Qnil;
7291           if (! NILP (Fcoding_system_p (val)))
7292             return Fcons (val, val);
7293           if (! NILP (Ffboundp (val)))
7294             {
7295               val = call1 (val, Flist (nargs, args));
7296               if (CONSP (val))
7297                 return val;
7298               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7299                 return Fcons (val, val);
7300             }
7301           return Qnil;
7302         }
7303     }
7304   return Qnil;
7305 }
7306
7307 DEFUN ("update-coding-systems-internal",  Fupdate_coding_systems_internal,
7308        Supdate_coding_systems_internal, 0, 0, 0,
7309        doc: /* Update internal database for ISO2022 and CCL based coding systems.
7310 When values of any coding categories are changed, you must
7311 call this function.  */)
7312      ()
7313 {
7314   int i;
7315
7316   for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++)
7317     {
7318       Lisp_Object val;
7319
7320       val = SYMBOL_VALUE (XVECTOR (Vcoding_category_table)->contents[i]);
7321       if (!NILP (val))
7322         {
7323           if (! coding_system_table[i])
7324             coding_system_table[i] = ((struct coding_system *)
7325                                       xmalloc (sizeof (struct coding_system)));
7326           setup_coding_system (val, coding_system_table[i]);
7327         }
7328       else if (coding_system_table[i])
7329         {
7330           xfree (coding_system_table[i]);
7331           coding_system_table[i] = NULL;
7332         }
7333     }
7334
7335   return Qnil;
7336 }
7337
7338 DEFUN ("set-coding-priority-internal", Fset_coding_priority_internal,
7339        Sset_coding_priority_internal, 0, 0, 0,
7340        doc: /* Update internal database for the current value of `coding-category-list'.
7341 This function is internal use only.  */)
7342      ()
7343 {
7344   int i = 0, idx;
7345   Lisp_Object val;
7346
7347   val = Vcoding_category_list;
7348
7349   while (CONSP (val) && i < CODING_CATEGORY_IDX_MAX)
7350     {
7351       if (! SYMBOLP (XCAR (val)))
7352         break;
7353       idx = XFASTINT (Fget (XCAR (val), Qcoding_category_index));
7354       if (idx >= CODING_CATEGORY_IDX_MAX)
7355         break;
7356       coding_priorities[i++] = (1 << idx);
7357       val = XCDR (val);
7358     }
7359   /* If coding-category-list is valid and contains all coding
7360      categories, `i' should be CODING_CATEGORY_IDX_MAX now.  If not,
7361      the following code saves Emacs from crashing.  */
7362   while (i < CODING_CATEGORY_IDX_MAX)
7363     coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT;
7364
7365   return Qnil;
7366 }
7367
7368 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7369        Sdefine_coding_system_internal, 1, 1, 0,
7370        doc: /* Register CODING-SYSTEM as a base coding system.
7371 This function is internal use only.  */)
7372      (coding_system)
7373      Lisp_Object coding_system;
7374 {
7375   Lisp_Object safe_chars, slot;
7376
7377   if (NILP (Fcheck_coding_system (coding_system)))
7378     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7379   safe_chars = coding_safe_chars (coding_system);
7380   if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars))
7381     error ("No valid safe-chars property for %s",
7382            SDATA (SYMBOL_NAME (coding_system)));
7383   if (EQ (safe_chars, Qt))
7384     {
7385       if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars))))
7386         XSETCAR (Vcoding_system_safe_chars,
7387                  Fcons (coding_system, XCAR (Vcoding_system_safe_chars)));
7388     }
7389   else
7390     {
7391       slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars));
7392       if (NILP (slot))
7393         XSETCDR (Vcoding_system_safe_chars,
7394                  nconc2 (XCDR (Vcoding_system_safe_chars),
7395                          Fcons (Fcons (coding_system, safe_chars), Qnil)));
7396       else
7397         XSETCDR (slot, safe_chars);
7398     }
7399   return Qnil;
7400 }
7401
7402 #endif /* emacs */
7403
7404 \f
7405 /*** 9. Post-amble ***/
7406
7407 void
7408 init_coding_once ()
7409 {
7410   int i;
7411
7412   /* Emacs' internal format specific initialize routine.  */
7413   for (i = 0; i <= 0x20; i++)
7414     emacs_code_class[i] = EMACS_control_code;
7415   emacs_code_class[0x0A] = EMACS_linefeed_code;
7416   emacs_code_class[0x0D] = EMACS_carriage_return_code;
7417   for (i = 0x21 ; i < 0x7F; i++)
7418     emacs_code_class[i] = EMACS_ascii_code;
7419   emacs_code_class[0x7F] = EMACS_control_code;
7420   for (i = 0x80; i < 0xFF; i++)
7421     emacs_code_class[i] = EMACS_invalid_code;
7422   emacs_code_class[LEADING_CODE_PRIVATE_11] = EMACS_leading_code_3;
7423   emacs_code_class[LEADING_CODE_PRIVATE_12] = EMACS_leading_code_3;
7424   emacs_code_class[LEADING_CODE_PRIVATE_21] = EMACS_leading_code_4;
7425   emacs_code_class[LEADING_CODE_PRIVATE_22] = EMACS_leading_code_4;
7426
7427   /* ISO2022 specific initialize routine.  */
7428   for (i = 0; i < 0x20; i++)
7429     iso_code_class[i] = ISO_control_0;
7430   for (i = 0x21; i < 0x7F; i++)
7431     iso_code_class[i] = ISO_graphic_plane_0;
7432   for (i = 0x80; i < 0xA0; i++)
7433     iso_code_class[i] = ISO_control_1;
7434   for (i = 0xA1; i < 0xFF; i++)
7435     iso_code_class[i] = ISO_graphic_plane_1;
7436   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7437   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7438   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7439   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7440   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7441   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7442   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7443   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7444   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7445   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7446
7447   setup_coding_system (Qnil, &keyboard_coding);
7448   setup_coding_system (Qnil, &terminal_coding);
7449   setup_coding_system (Qnil, &safe_terminal_coding);
7450   setup_coding_system (Qnil, &default_buffer_file_coding);
7451
7452   bzero (coding_system_table, sizeof coding_system_table);
7453
7454   bzero (ascii_skip_code, sizeof ascii_skip_code);
7455   for (i = 0; i < 128; i++)
7456     ascii_skip_code[i] = 1;
7457
7458 #if defined (MSDOS) || defined (WINDOWSNT)
7459   system_eol_type = CODING_EOL_CRLF;
7460 #else
7461   system_eol_type = CODING_EOL_LF;
7462 #endif
7463
7464   inhibit_pre_post_conversion = 0;
7465 }
7466
7467 #ifdef emacs
7468
7469 void
7470 syms_of_coding ()
7471 {
7472   Qtarget_idx = intern ("target-idx");
7473   staticpro (&Qtarget_idx);
7474
7475   Qcoding_system_history = intern ("coding-system-history");
7476   staticpro (&Qcoding_system_history);
7477   Fset (Qcoding_system_history, Qnil);
7478
7479   /* Target FILENAME is the first argument.  */
7480   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7481   /* Target FILENAME is the third argument.  */
7482   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7483
7484   Qcall_process = intern ("call-process");
7485   staticpro (&Qcall_process);
7486   /* Target PROGRAM is the first argument.  */
7487   Fput (Qcall_process, Qtarget_idx, make_number (0));
7488
7489   Qcall_process_region = intern ("call-process-region");
7490   staticpro (&Qcall_process_region);
7491   /* Target PROGRAM is the third argument.  */
7492   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7493
7494   Qstart_process = intern ("start-process");
7495   staticpro (&Qstart_process);
7496   /* Target PROGRAM is the third argument.  */
7497   Fput (Qstart_process, Qtarget_idx, make_number (2));
7498
7499   Qopen_network_stream = intern ("open-network-stream");
7500   staticpro (&Qopen_network_stream);
7501   /* Target SERVICE is the fourth argument.  */
7502   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7503
7504   Qcoding_system = intern ("coding-system");
7505   staticpro (&Qcoding_system);
7506
7507   Qeol_type = intern ("eol-type");
7508   staticpro (&Qeol_type);
7509
7510   Qbuffer_file_coding_system = intern ("buffer-file-coding-system");
7511   staticpro (&Qbuffer_file_coding_system);
7512
7513   Qpost_read_conversion = intern ("post-read-conversion");
7514   staticpro (&Qpost_read_conversion);
7515
7516   Qpre_write_conversion = intern ("pre-write-conversion");
7517   staticpro (&Qpre_write_conversion);
7518
7519   Qno_conversion = intern ("no-conversion");
7520   staticpro (&Qno_conversion);
7521
7522   Qundecided = intern ("undecided");
7523   staticpro (&Qundecided);
7524
7525   Qcoding_system_p = intern ("coding-system-p");
7526   staticpro (&Qcoding_system_p);
7527
7528   Qcoding_system_error = intern ("coding-system-error");
7529   staticpro (&Qcoding_system_error);
7530
7531   Fput (Qcoding_system_error, Qerror_conditions,
7532         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7533   Fput (Qcoding_system_error, Qerror_message,
7534         build_string ("Invalid coding system"));
7535
7536   Qcoding_category = intern ("coding-category");
7537   staticpro (&Qcoding_category);
7538   Qcoding_category_index = intern ("coding-category-index");
7539   staticpro (&Qcoding_category_index);
7540
7541   Vcoding_category_table
7542     = Fmake_vector (make_number (CODING_CATEGORY_IDX_MAX), Qnil);
7543   staticpro (&Vcoding_category_table);
7544   {
7545     int i;
7546     for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++)
7547       {
7548         XVECTOR (Vcoding_category_table)->contents[i]
7549           = intern (coding_category_name[i]);
7550         Fput (XVECTOR (Vcoding_category_table)->contents[i],
7551               Qcoding_category_index, make_number (i));
7552       }
7553   }
7554
7555   Vcoding_system_safe_chars = Fcons (Qnil, Qnil);
7556   staticpro (&Vcoding_system_safe_chars);
7557
7558   Qtranslation_table = intern ("translation-table");
7559   staticpro (&Qtranslation_table);
7560   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
7561
7562   Qtranslation_table_id = intern ("translation-table-id");
7563   staticpro (&Qtranslation_table_id);
7564
7565   Qtranslation_table_for_decode = intern ("translation-table-for-decode");
7566   staticpro (&Qtranslation_table_for_decode);
7567
7568   Qtranslation_table_for_encode = intern ("translation-table-for-encode");
7569   staticpro (&Qtranslation_table_for_encode);
7570
7571   Qsafe_chars = intern ("safe-chars");
7572   staticpro (&Qsafe_chars);
7573
7574   Qchar_coding_system = intern ("char-coding-system");
7575   staticpro (&Qchar_coding_system);
7576
7577   /* Intern this now in case it isn't already done.
7578      Setting this variable twice is harmless.
7579      But don't staticpro it here--that is done in alloc.c.  */
7580   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7581   Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0));
7582   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0));
7583
7584   Qvalid_codes = intern ("valid-codes");
7585   staticpro (&Qvalid_codes);
7586
7587   Qemacs_mule = intern ("emacs-mule");
7588   staticpro (&Qemacs_mule);
7589
7590   Qraw_text = intern ("raw-text");
7591   staticpro (&Qraw_text);
7592
7593   Qutf_8 = intern ("utf-8");
7594   staticpro (&Qutf_8);
7595
7596   defsubr (&Scoding_system_p);
7597   defsubr (&Sread_coding_system);
7598   defsubr (&Sread_non_nil_coding_system);
7599   defsubr (&Scheck_coding_system);
7600   defsubr (&Sdetect_coding_region);
7601   defsubr (&Sdetect_coding_string);
7602   defsubr (&Sfind_coding_systems_region_internal);
7603   defsubr (&Sunencodable_char_position);
7604   defsubr (&Sdecode_coding_region);
7605   defsubr (&Sencode_coding_region);
7606   defsubr (&Sdecode_coding_string);
7607   defsubr (&Sencode_coding_string);
7608   defsubr (&Sdecode_sjis_char);
7609   defsubr (&Sencode_sjis_char);
7610   defsubr (&Sdecode_big5_char);
7611   defsubr (&Sencode_big5_char);
7612   defsubr (&Sset_terminal_coding_system_internal);
7613   defsubr (&Sset_safe_terminal_coding_system_internal);
7614   defsubr (&Sterminal_coding_system);
7615   defsubr (&Sset_keyboard_coding_system_internal);
7616   defsubr (&Skeyboard_coding_system);
7617   defsubr (&Sfind_operation_coding_system);
7618   defsubr (&Supdate_coding_systems_internal);
7619   defsubr (&Sset_coding_priority_internal);
7620   defsubr (&Sdefine_coding_system_internal);
7621
7622   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7623                doc: /* List of coding systems.
7624
7625 Do not alter the value of this variable manually.  This variable should be
7626 updated by the functions `make-coding-system' and
7627 `define-coding-system-alias'.  */);
7628   Vcoding_system_list = Qnil;
7629
7630   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7631                doc: /* Alist of coding system names.
7632 Each element is one element list of coding system name.
7633 This variable is given to `completing-read' as TABLE argument.
7634
7635 Do not alter the value of this variable manually.  This variable should be
7636 updated by the functions `make-coding-system' and
7637 `define-coding-system-alias'.  */);
7638   Vcoding_system_alist = Qnil;
7639
7640   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7641                doc: /* List of coding-categories (symbols) ordered by priority.
7642
7643 On detecting a coding system, Emacs tries code detection algorithms
7644 associated with each coding-category one by one in this order.  When
7645 one algorithm agrees with a byte sequence of source text, the coding
7646 system bound to the corresponding coding-category is selected.  */);
7647   {
7648     int i;
7649
7650     Vcoding_category_list = Qnil;
7651     for (i = CODING_CATEGORY_IDX_MAX - 1; i >= 0; i--)
7652       Vcoding_category_list
7653         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7654                  Vcoding_category_list);
7655   }
7656
7657   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7658                doc: /* Specify the coding system for read operations.
7659 It is useful to bind this variable with `let', but do not set it globally.
7660 If the value is a coding system, it is used for decoding on read operation.
7661 If not, an appropriate element is used from one of the coding system alists:
7662 There are three such tables, `file-coding-system-alist',
7663 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7664   Vcoding_system_for_read = Qnil;
7665
7666   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7667                doc: /* Specify the coding system for write operations.
7668 Programs bind this variable with `let', but you should not set it globally.
7669 If the value is a coding system, it is used for encoding of output,
7670 when writing it to a file and when sending it to a file or subprocess.
7671
7672 If this does not specify a coding system, an appropriate element
7673 is used from one of the coding system alists:
7674 There are three such tables, `file-coding-system-alist',
7675 `process-coding-system-alist', and `network-coding-system-alist'.
7676 For output to files, if the above procedure does not specify a coding system,
7677 the value of `buffer-file-coding-system' is used.  */);
7678   Vcoding_system_for_write = Qnil;
7679
7680   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7681                doc: /* Coding system used in the latest file or process I/O.
7682 Also set by `encode-coding-region', `decode-coding-region',
7683 `encode-coding-string' and `decode-coding-string'.  */);
7684   Vlast_coding_system_used = Qnil;
7685
7686   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7687                doc: /* *Non-nil means always inhibit code conversion of end-of-line format.
7688 See info node `Coding Systems' and info node `Text and Binary' concerning
7689 such conversion.  */);
7690   inhibit_eol_conversion = 0;
7691
7692   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
7693                doc: /* Non-nil means process buffer inherits coding system of process output.
7694 Bind it to t if the process output is to be treated as if it were a file
7695 read from some filesystem.  */);
7696   inherit_process_coding_system = 0;
7697
7698   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
7699                doc: /* Alist to decide a coding system to use for a file I/O operation.
7700 The format is ((PATTERN . VAL) ...),
7701 where PATTERN is a regular expression matching a file name,
7702 VAL is a coding system, a cons of coding systems, or a function symbol.
7703 If VAL is a coding system, it is used for both decoding and encoding
7704 the file contents.
7705 If VAL is a cons of coding systems, the car part is used for decoding,
7706 and the cdr part is used for encoding.
7707 If VAL is a function symbol, the function must return a coding system
7708 or a cons of coding systems which are used as above.  The function gets
7709 the arguments with which `find-operation-coding-system' was called.
7710
7711 See also the function `find-operation-coding-system'
7712 and the variable `auto-coding-alist'.  */);
7713   Vfile_coding_system_alist = Qnil;
7714
7715   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
7716     doc: /* Alist to decide a coding system to use for a process I/O operation.
7717 The format is ((PATTERN . VAL) ...),
7718 where PATTERN is a regular expression matching a program name,
7719 VAL is a coding system, a cons of coding systems, or a function symbol.
7720 If VAL is a coding system, it is used for both decoding what is received
7721 from the program and encoding what is sent to the program.
7722 If VAL is a cons of coding systems, the car part is used for decoding,
7723 and the cdr part is used for encoding.
7724 If VAL is a function symbol, the function must return a coding system
7725 or a cons of coding systems which are used as above.
7726
7727 See also the function `find-operation-coding-system'.  */);
7728   Vprocess_coding_system_alist = Qnil;
7729
7730   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
7731     doc: /* Alist to decide a coding system to use for a network I/O operation.
7732 The format is ((PATTERN . VAL) ...),
7733 where PATTERN is a regular expression matching a network service name
7734 or is a port number to connect to,
7735 VAL is a coding system, a cons of coding systems, or a function symbol.
7736 If VAL is a coding system, it is used for both decoding what is received
7737 from the network stream and encoding what is sent to the network stream.
7738 If VAL is a cons of coding systems, the car part is used for decoding,
7739 and the cdr part is used for encoding.
7740 If VAL is a function symbol, the function must return a coding system
7741 or a cons of coding systems which are used as above.
7742
7743 See also the function `find-operation-coding-system'.  */);
7744   Vnetwork_coding_system_alist = Qnil;
7745
7746   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
7747                doc: /* Coding system to use with system messages.
7748 Also used for decoding keyboard input on X Window system.  */);
7749   Vlocale_coding_system = Qnil;
7750
7751   /* The eol mnemonics are reset in startup.el system-dependently.  */
7752   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
7753                doc: /* *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
7754   eol_mnemonic_unix = build_string (":");
7755
7756   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
7757                doc: /* *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
7758   eol_mnemonic_dos = build_string ("\\");
7759
7760   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
7761                doc: /* *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
7762   eol_mnemonic_mac = build_string ("/");
7763
7764   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
7765                doc: /* *String displayed in mode line when end-of-line format is not yet determined.  */);
7766   eol_mnemonic_undecided = build_string (":");
7767
7768   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
7769                doc: /* *Non-nil enables character translation while encoding and decoding.  */);
7770   Venable_character_translation = Qt;
7771
7772   DEFVAR_LISP ("standard-translation-table-for-decode",
7773                &Vstandard_translation_table_for_decode,
7774                doc: /* Table for translating characters while decoding.  */);
7775   Vstandard_translation_table_for_decode = Qnil;
7776
7777   DEFVAR_LISP ("standard-translation-table-for-encode",
7778                &Vstandard_translation_table_for_encode,
7779                doc: /* Table for translating characters while encoding.  */);
7780   Vstandard_translation_table_for_encode = Qnil;
7781
7782   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_alist,
7783                doc: /* Alist of charsets vs revision numbers.
7784 While encoding, if a charset (car part of an element) is found,
7785 designate it with the escape sequence identifying revision (cdr part of the element).  */);
7786   Vcharset_revision_alist = Qnil;
7787
7788   DEFVAR_LISP ("default-process-coding-system",
7789                &Vdefault_process_coding_system,
7790                doc: /* Cons of coding systems used for process I/O by default.
7791 The car part is used for decoding a process output,
7792 the cdr part is used for encoding a text to be sent to a process.  */);
7793   Vdefault_process_coding_system = Qnil;
7794
7795   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
7796                doc: /* Table of extra Latin codes in the range 128..159 (inclusive).
7797 This is a vector of length 256.
7798 If Nth element is non-nil, the existence of code N in a file
7799 \(or output of subprocess) doesn't prevent it from being detected as
7800 a coding system of ISO 2022 variant which has a flag
7801 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
7802 or reading output of a subprocess.
7803 Only the 128th through 159th elements have a meaning.  */);
7804   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
7805
7806   DEFVAR_LISP ("select-safe-coding-system-function",
7807                &Vselect_safe_coding_system_function,
7808                doc: /* Function to call to select safe coding system for encoding a text.
7809
7810 If set, this function is called to force a user to select a proper
7811 coding system which can encode the text in the case that a default
7812 coding system used in each operation can't encode the text.
7813
7814 The default value is `select-safe-coding-system' (which see).  */);
7815   Vselect_safe_coding_system_function = Qnil;
7816
7817   DEFVAR_BOOL ("coding-system-require-warning",
7818                &coding_system_require_warning,
7819                doc: /* Internal use only.
7820 If non-nil, on writing a file, `select-safe-coding-system-function' is
7821 called even if `coding-system-for-write' is non-nil.  The command
7822 `universal-coding-system-argument' binds this variable to t temporarily.  */);
7823   coding_system_require_warning = 0;
7824
7825
7826   DEFVAR_BOOL ("inhibit-iso-escape-detection",
7827                &inhibit_iso_escape_detection,
7828                doc: /* If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
7829
7830 By default, on reading a file, Emacs tries to detect how the text is
7831 encoded.  This code detection is sensitive to escape sequences.  If
7832 the sequence is valid as ISO2022, the code is determined as one of
7833 the ISO2022 encodings, and the file is decoded by the corresponding
7834 coding system (e.g. `iso-2022-7bit').
7835
7836 However, there may be a case that you want to read escape sequences in
7837 a file as is.  In such a case, you can set this variable to non-nil.
7838 Then, as the code detection ignores any escape sequences, no file is
7839 detected as encoded in some ISO2022 encoding.  The result is that all
7840 escape sequences become visible in a buffer.
7841
7842 The default value is nil, and it is strongly recommended not to change
7843 it.  That is because many Emacs Lisp source files that contain
7844 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
7845 in Emacs's distribution, and they won't be decoded correctly on
7846 reading if you suppress escape sequence detection.
7847
7848 The other way to read escape sequences in a file without decoding is
7849 to explicitly specify some coding system that doesn't use ISO2022's
7850 escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
7851   inhibit_iso_escape_detection = 0;
7852
7853   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
7854                doc: /* Char table for translating self-inserting characters.
7855 This is applied to the result of input methods, not their input.  See also
7856 `keyboard-translate-table'.  */);
7857     Vtranslation_table_for_input = Qnil;
7858 }
7859
7860 char *
7861 emacs_strerror (error_number)
7862      int error_number;
7863 {
7864   char *str;
7865
7866   synchronize_system_messages_locale ();
7867   str = strerror (error_number);
7868
7869   if (! NILP (Vlocale_coding_system))
7870     {
7871       Lisp_Object dec = code_convert_string_norecord (build_string (str),
7872                                                       Vlocale_coding_system,
7873                                                       0);
7874       str = (char *) SDATA (dec);
7875     }
7876
7877   return str;
7878 }
7879
7880 #endif /* emacs */
7881